# Phase-1 dataset + tokenizer arguments for Megatron-LM pretraining.
DATA_DIR=/ssd/yi/converted_data/megatron_phase1
START=0
END=50

# Build the blended data-path list: each shard gets sampling weight 1,
# i.e. " 1 <path>" per entry, with zero-padded 5-digit shard ids.
DATA_PATHS=""
for (( shard = START; shard <= END; shard++ )); do
  printf -v shard_id '%05d' "$shard"
  DATA_PATHS+=" 1 ${DATA_DIR}/phase1_part-${shard_id}_text_document"
done

# 999,1,0 split: effectively all data for training, a sliver for
# validation, none for test.
DATA_ARGS="
    --data-path ${DATA_PATHS}
    --split 999,1,0
    --tokenizer-type HuggingFaceTokenizer
    --tokenizer-model /apps/yi/model_training/data/tokenizer
"
# Alternative tokenizer flags (kept for reference, currently unused):
# --vocab-file /apps/yi/model_training/data/tokenizer/vocab.json
# --merge-file /apps/yi/model_training/data/tokenizer/merges.txt
# --vocab-size 151936