Initial Commit
This commit is contained in:
21
scripts/kaiyuan2b-profiling/params/gpt_smoke/data.sh
Normal file
21
scripts/kaiyuan2b-profiling/params/gpt_smoke/data.sh
Normal file
@@ -0,0 +1,21 @@
|
||||
|
||||
# Data parameters for the gpt_smoke profiling run.
# Builds:
#   DATA_PATHS - weighted list of Megatron dataset shards, in the
#                "<weight> <path> <weight> <path> ..." form expected
#                by Megatron's --data-path flag (every shard weight 1).
#   DATA_ARGS  - the data/tokenizer CLI flags for the training launcher.
# Intended to be sourced by a launch script, not executed directly.

DATA_DIR=/ssd/yi/converted_data/megatron_phase1

# Shard index range, zero-padded to 5 digits:
# phase1_part-00000 .. phase1_part-00050 (inclusive, 51 shards).
START=0
END=50

DATA_PATHS=""
for idx in $(seq -f "%05g" "$START" "$END"); do
DATA_PATHS+=" 1 ${DATA_DIR}/phase1_part-${idx}_text_document"
done

DATA_ARGS="
--data-path ${DATA_PATHS}
--split 999,1,0
--tokenizer-type HuggingFaceTokenizer
--tokenizer-model /apps/yi/model_training/data/tokenizer
"

# Alternative tokenizer flags, kept for reference (used with the
# GPT2BPETokenizer-style setup instead of HuggingFaceTokenizer):
# --vocab-file /apps/yi/model_training/data/tokenizer/vocab.json
# --merge-file /apps/yi/model_training/data/tokenizer/merges.txt
# --vocab-size 151936
|
||||
Reference in New Issue
Block a user