# Bash snippet (21 lines, 511 B)
# Dataset settings: builds DATA_PATHS (one entry per shard in START..END)
# and DATA_ARGS (the data/tokenizer command-line flags as one string).
#
# NOTE(review): DATA_ARGS is a plain multi-line string, so whatever consumes
# it presumably expands it unquoted to get word-splitting - confirm at the
# call site, which is not part of this snippet.

DATA_DIR=/ssd/yi/converted_data/megatron_phase1

# Inclusive shard index range. Keep these as plain decimal integers without
# leading zeros: bash arithmetic reads a leading 0 as octal.
START=0
END=50

# Accumulate " 1 <prefix>" for every shard. Each entry starts with a space,
# so DATA_PATHS itself begins with one.
# NOTE(review): the literal 1 before each prefix is presumably the per-dataset
# blend weight expected by --data-path - confirm against the trainer's docs.
DATA_PATHS=""
for ((shard = START; shard <= END; shard++)); do
  # Zero-pad to 5 digits with an integer format. The builtin avoids forking
  # seq, and %d (unlike seq's %g) never falls back to scientific notation.
  printf -v idx '%05d' "$shard"
  DATA_PATHS+=" 1 ${DATA_DIR}/phase1_part-${idx}_text_document"
done

DATA_ARGS="
--data-path ${DATA_PATHS}
--split 999,1,0
--tokenizer-type HuggingFaceTokenizer
--tokenizer-model /apps/yi/model_training/data/tokenizer
"

# Disabled tokenizer flags, kept for reference:
# --vocab-file /apps/yi/model_training/data/tokenizer/vocab.json
# --merge-file /apps/yi/model_training/data/tokenizer/merges.txt
# --vocab-size 151936