diff --git a/scripts/convert_phase_to_megatron.py b/scripts/convert_phase_to_megatron.py index 1de60ea..3e9850b 100644 --- a/scripts/convert_phase_to_megatron.py +++ b/scripts/convert_phase_to_megatron.py @@ -23,16 +23,18 @@ Megatron's IndexedDatasetBuilder directly. Usage: -python /apps/yi/model_training/scripts/convert_phase_to_megatron.py \ - --input-dir /apps/yi/model_training/data/phase1 \ - --output-dir /ssd/yi/converted_data/megatron_phase1 \ - --megatron-dir /apps/yi/model_training/Megatron-LM \ - --tokenizer-model /apps/yi/model_training/data/tokenizer \ +python /ssd1/yi/pretrain_kaiyuan2b/scripts/convert_phase_to_megatron.py \ + --input-dir /ssd1/yi/data/phase1 \ + --output-dir /ssd1/yi/converted_data/phase1 \ + --megatron-dir /ssd1/yi/pretrain_kaiyuan2b/Megatron-LM \ + --tokenizer-model /ssd1/yi/data/tokenizer \ --text-key text \ - --num-shards 4 \ - --workers-per-shard 16 \ - --start 100 \ - --end 220 + --num-shards 16 \ + --workers-per-shard 12 \ + --batch-size 16384 \ + --chunksize 128 \ + --start 0 \ + --end 210 """ _TOKENIZER = None @@ -214,8 +216,8 @@ def parse_args(): parser.add_argument("--output-prefix-prefix", default="phase1") parser.add_argument("--num-shards", type=int, default=1, help="Parallel parquet files.") parser.add_argument("--workers-per-shard", type=int, default=max((os.cpu_count() or 8) // 2, 1)) - parser.add_argument("--batch-size", type=int, default=8192, help="Parquet record batch size.") - parser.add_argument("--chunksize", type=int, default=64, help="Tokenizer pool imap chunk size.") + parser.add_argument("--batch-size", type=int, default=16384, help="Parquet record batch size.") + parser.add_argument("--chunksize", type=int, default=128, help="Tokenizer pool imap chunk size.") parser.add_argument("--log-interval", type=int, default=10000) parser.add_argument("--start", type=int, default=0) parser.add_argument("--end", type=int, default=None) diff --git a/scripts/download_kaiyuan.sh b/scripts/download_kaiyuan.sh index 9af5ad0..9c05907 100644 --- a/scripts/download_kaiyuan.sh +++ b/scripts/download_kaiyuan.sh @@ -1,6 +1,6 @@ #!/bin/bash -source /apps/yi/.venv/bin/activate +# source /apps/yi/.venv/bin/activate export HF_HUB_ENABLE_HF_TRANSFER=1 export HF_HUB_DISABLE_SYMLINKS_WARNING=1 @@ -9,18 +9,18 @@ export HF_HUB_DISABLE_SYMLINKS_WARNING=1 export HF_ENDPOINT="https://hf-mirror.com/" -unset http_proxy -unset https_proxy -export http_proxy=http://10.29.1.201:8888 -export https_proxy=http://10.29.1.201:8888 +# unset http_proxy +# unset https_proxy +# export http_proxy=http://10.29.1.201:8888 +# export https_proxy=http://10.29.1.201:8888 -unset HTTP_PROXY -unset HTTPS_PROXY -export HTTP_PROXY=http://10.29.1.201:8888 -export HTTPS_PROXY=http://10.29.1.201:8888 +# unset HTTP_PROXY +# unset HTTPS_PROXY +# export HTTP_PROXY=http://10.29.1.201:8888 +# export HTTPS_PROXY=http://10.29.1.201:8888 -LOCAL_DIR="/apps/yi/kaiyuan_pretraining/" -PHASE="phase2" +LOCAL_DIR="/ssd1/yi/data/" +PHASE="phase1" echo "Starting download of $PHASE data to $LOCAL_DIR..." diff --git a/scripts/kaiyuan2b-training/params/qwen3_1p7b/data_phase1_smoke.sh b/scripts/kaiyuan2b-training/params/qwen3_1p7b/data_phase1_smoke.sh index 9e9cbd7..a296118 100644 --- a/scripts/kaiyuan2b-training/params/qwen3_1p7b/data_phase1_smoke.sh +++ b/scripts/kaiyuan2b-training/params/qwen3_1p7b/data_phase1_smoke.sh @@ -1,8 +1,8 @@ -DATA_DIR=/ssd/yi/converted_data/megatron_phase1 +DATA_DIR=/ssd1/yi/converted_data/phase1 START=0 -END=210 +END=0 DATA_PATHS="" for idx in $(seq -f "%05g" $START $END); do @@ -13,7 +13,7 @@ DATA_ARGS=" --data-path ${DATA_PATHS} --split 999,1,0 --tokenizer-type HuggingFaceTokenizer - --tokenizer-model /apps/yi/model_training/data/tokenizer + --tokenizer-model /ssd1/yi/data/tokenizer " # --vocab-file /apps/yi/model_training/data/tokenizer/vocab.json diff --git a/scripts/kaiyuan2b-training/params/qwen3_1p7b/hparams.sh b/scripts/kaiyuan2b-training/params/qwen3_1p7b/hparams.sh index b5f6c0a..eb4054a 100644 --- a/scripts/kaiyuan2b-training/params/qwen3_1p7b/hparams.sh +++ b/scripts/kaiyuan2b-training/params/qwen3_1p7b/hparams.sh @@ -1,5 +1,5 @@ HPARAMS=" - --micro-batch-size 16 + --micro-batch-size 4 --global-batch-size 2048 --train-iters 87000 --eval-iters 10 diff --git a/scripts/kaiyuan2b-training/start_training.sh b/scripts/kaiyuan2b-training/start_training.sh index 36c7ce5..88bc986 100755 --- a/scripts/kaiyuan2b-training/start_training.sh +++ b/scripts/kaiyuan2b-training/start_training.sh @@ -2,7 +2,7 @@ set -euo pipefail SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) -ARTIFACT_ROOT=${ARTIFACT_ROOT:-/apps/yi/model_training/artifacts} +ARTIFACT_ROOT=${ARTIFACT_ROOT:-/ssd1/yi/artifacts} RUN_STATE_DIR="${ARTIFACT_ROOT}/run_state" LOG_DIR="${ARTIFACT_ROOT}/logs" diff --git a/scripts/kaiyuan2b-training/training_smoke_qwen3_1p7b.sh b/scripts/kaiyuan2b-training/training_smoke_qwen3_1p7b.sh index bbbaf90..00011d0 100644 --- a/scripts/kaiyuan2b-training/training_smoke_qwen3_1p7b.sh +++ b/scripts/kaiyuan2b-training/training_smoke_qwen3_1p7b.sh @@ -7,9 +7,9 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1 MODE=${1:-qwen3_1p7b_smoke_yi} TRAIN_NAME=${2:-qwen3_1p7b_smoke_yi} -MEGATRON_PATH=/apps/yi/model_training/Megatron-LM -ARTIFACT_ROOT=/apps/yi/model_training/artifacts -SCRIPT_DIR=/apps/yi/model_training/scripts/kaiyuan2b-training +MEGATRON_PATH=/ssd1/yi/pretrain_kaiyuan2b/Megatron-LM +ARTIFACT_ROOT=/ssd1/yi/artifacts +SCRIPT_DIR=/ssd1/yi/pretrain_kaiyuan2b/scripts/kaiyuan2b-training PARAMS_DIR="${SCRIPT_DIR}/params" TB_DIR="${ARTIFACT_ROOT}/tb_logs/${TRAIN_NAME}" CKPT_DIR="${ARTIFACT_ROOT}/checkpoints/${TRAIN_NAME}"