Files
2026-05-06 15:06:07 +08:00

21 lines
511 B
Plaintext

# Builds DATA_ARGS: the dataset and tokenizer command-line options.
# Nothing in this file consumes DATA_ARGS.
# NOTE(review): presumably this file is sourced by a training launcher that
# expands $DATA_ARGS unquoted -- confirm before changing the string layout.
# Requires bash (uses `+=`-style accumulation via `printf -v` and `((...))`).

# Directory that holds the converted dataset shards.
DATA_DIR=/ssd/yi/converted_data/megatron_phase1

# Inclusive range of shard indices to include (START..END).
START=0
END=50

# Accumulate one " 1 <shard-prefix>" entry per shard index.
# NOTE(review): the leading "1" is presumably the per-shard blend weight that
# --data-path expects -- confirm against the trainer's argument docs.
DATA_PATHS=""
for ((idx = START; idx <= END; idx++)); do
  # Zero-pad the index to 5 digits with the integer format %05d. The builtin
  # printf avoids forking `seq`, and %d (unlike the float format %g) never
  # switches to exponent notation for large indices.
  printf -v DATA_PATHS '%s 1 %s/phase1_part-%05d_text_document' \
    "$DATA_PATHS" "$DATA_DIR" "$idx"
done

# One option per line. DATA_PATHS starts with a space, so the first line
# contains two spaces after --data-path; this is harmless under word-splitting.
# NOTE(review): --split 999,1,0 is presumably the train/valid/test ratio --
# confirm.
DATA_ARGS="
--data-path ${DATA_PATHS}
--split 999,1,0
--tokenizer-type HuggingFaceTokenizer
--tokenizer-model /apps/yi/model_training/data/tokenizer
"

# Currently disabled tokenizer options, kept for reference.
# NOTE(review): presumably superseded by --tokenizer-model above -- confirm.
# --vocab-file /apps/yi/model_training/data/tokenizer/vocab.json
# --merge-file /apps/yi/model_training/data/tokenizer/merges.txt
# --vocab-size 151936