chore: adapt path and training hparams

This commit is contained in:
2026-05-09 21:40:10 +08:00
parent 75eacf00c2
commit 46560f5740
6 changed files with 32 additions and 30 deletions

View File

@@ -23,16 +23,18 @@ Megatron's IndexedDatasetBuilder directly.
Usage: Usage:
python /apps/yi/model_training/scripts/convert_phase_to_megatron.py \ python /ssd1/yi/pretrain_kaiyuan2b/scripts/convert_phase_to_megatron.py \
--input-dir /apps/yi/model_training/data/phase1 \ --input-dir /ssd1/yi/data/phase1 \
--output-dir /ssd/yi/converted_data/megatron_phase1 \ --output-dir /ssd1/yi/converted_data/phase1 \
--megatron-dir /apps/yi/model_training/Megatron-LM \ --megatron-dir /ssd1/yi/pretrain_kaiyuan2b/Megatron-LM \
--tokenizer-model /apps/yi/model_training/data/tokenizer \ --tokenizer-model /ssd1/yi/data/tokenizer \
--text-key text \ --text-key text \
--num-shards 4 \ --num-shards 16 \
--workers-per-shard 16 \ --workers-per-shard 12 \
--start 100 \ --batch-size 16384 \
--end 220 --chunksize 128 \
--start 0 \
--end 210
""" """
_TOKENIZER = None _TOKENIZER = None
@@ -214,8 +216,8 @@ def parse_args():
parser.add_argument("--output-prefix-prefix", default="phase1") parser.add_argument("--output-prefix-prefix", default="phase1")
parser.add_argument("--num-shards", type=int, default=1, help="Parallel parquet files.") parser.add_argument("--num-shards", type=int, default=1, help="Parallel parquet files.")
parser.add_argument("--workers-per-shard", type=int, default=max((os.cpu_count() or 8) // 2, 1)) parser.add_argument("--workers-per-shard", type=int, default=max((os.cpu_count() or 8) // 2, 1))
parser.add_argument("--batch-size", type=int, default=8192, help="Parquet record batch size.") parser.add_argument("--batch-size", type=int, default=16384, help="Parquet record batch size.")
parser.add_argument("--chunksize", type=int, default=64, help="Tokenizer pool imap chunk size.") parser.add_argument("--chunksize", type=int, default=128, help="Tokenizer pool imap chunk size.")
parser.add_argument("--log-interval", type=int, default=10000) parser.add_argument("--log-interval", type=int, default=10000)
parser.add_argument("--start", type=int, default=0) parser.add_argument("--start", type=int, default=0)
parser.add_argument("--end", type=int, default=None) parser.add_argument("--end", type=int, default=None)

View File

@@ -1,6 +1,6 @@
#!/bin/bash #!/bin/bash
source /apps/yi/.venv/bin/activate # source /apps/yi/.venv/bin/activate
export HF_HUB_ENABLE_HF_TRANSFER=1 export HF_HUB_ENABLE_HF_TRANSFER=1
export HF_HUB_DISABLE_SYMLINKS_WARNING=1 export HF_HUB_DISABLE_SYMLINKS_WARNING=1
@@ -9,18 +9,18 @@ export HF_HUB_DISABLE_SYMLINKS_WARNING=1
export HF_ENDPOINT="https://hf-mirror.com/" export HF_ENDPOINT="https://hf-mirror.com/"
unset http_proxy # unset http_proxy
unset https_proxy # unset https_proxy
export http_proxy=http://10.29.1.201:8888 # export http_proxy=http://10.29.1.201:8888
export https_proxy=http://10.29.1.201:8888 # export https_proxy=http://10.29.1.201:8888
unset HTTP_PROXY # unset HTTP_PROXY
unset HTTPS_PROXY # unset HTTPS_PROXY
export HTTP_PROXY=http://10.29.1.201:8888 # export HTTP_PROXY=http://10.29.1.201:8888
export HTTPS_PROXY=http://10.29.1.201:8888 # export HTTPS_PROXY=http://10.29.1.201:8888
LOCAL_DIR="/apps/yi/kaiyuan_pretraining/" LOCAL_DIR="/ssd1/yi/data/"
PHASE="phase2" PHASE="phase1"
echo "Starting download of $PHASE data to $LOCAL_DIR..." echo "Starting download of $PHASE data to $LOCAL_DIR..."

View File

@@ -1,8 +1,8 @@
DATA_DIR=/ssd/yi/converted_data/megatron_phase1 DATA_DIR=/ssd1/yi/converted_data/phase1
START=0 START=0
END=210 END=0
DATA_PATHS="" DATA_PATHS=""
for idx in $(seq -f "%05g" $START $END); do for idx in $(seq -f "%05g" $START $END); do
@@ -13,7 +13,7 @@ DATA_ARGS="
--data-path ${DATA_PATHS} --data-path ${DATA_PATHS}
--split 999,1,0 --split 999,1,0
--tokenizer-type HuggingFaceTokenizer --tokenizer-type HuggingFaceTokenizer
--tokenizer-model /apps/yi/model_training/data/tokenizer --tokenizer-model /ssd1/yi/data/tokenizer
" "
# --vocab-file /apps/yi/model_training/data/tokenizer/vocab.json # --vocab-file /apps/yi/model_training/data/tokenizer/vocab.json

View File

@@ -1,5 +1,5 @@
HPARAMS=" HPARAMS="
--micro-batch-size 16 --micro-batch-size 4
--global-batch-size 2048 --global-batch-size 2048
--train-iters 87000 --train-iters 87000
--eval-iters 10 --eval-iters 10

View File

@@ -2,7 +2,7 @@
set -euo pipefail set -euo pipefail
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
ARTIFACT_ROOT=${ARTIFACT_ROOT:-/apps/yi/model_training/artifacts} ARTIFACT_ROOT=${ARTIFACT_ROOT:-/ssd1/yi/artifacts}
RUN_STATE_DIR="${ARTIFACT_ROOT}/run_state" RUN_STATE_DIR="${ARTIFACT_ROOT}/run_state"
LOG_DIR="${ARTIFACT_ROOT}/logs" LOG_DIR="${ARTIFACT_ROOT}/logs"

View File

@@ -7,9 +7,9 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1
MODE=${1:-qwen3_1p7b_smoke_yi} MODE=${1:-qwen3_1p7b_smoke_yi}
TRAIN_NAME=${2:-qwen3_1p7b_smoke_yi} TRAIN_NAME=${2:-qwen3_1p7b_smoke_yi}
MEGATRON_PATH=/apps/yi/model_training/Megatron-LM MEGATRON_PATH=/ssd1/yi/pretrain_kaiyuan2b/Megatron-LM
ARTIFACT_ROOT=/apps/yi/model_training/artifacts ARTIFACT_ROOT=/ssd1/yi/artifacts
SCRIPT_DIR=/apps/yi/model_training/scripts/kaiyuan2b-training SCRIPT_DIR=/ssd1/yi/pretrain_kaiyuan2b/scripts/kaiyuan2b-training
PARAMS_DIR="${SCRIPT_DIR}/params" PARAMS_DIR="${SCRIPT_DIR}/params"
TB_DIR="${ARTIFACT_ROOT}/tb_logs/${TRAIN_NAME}" TB_DIR="${ARTIFACT_ROOT}/tb_logs/${TRAIN_NAME}"
CKPT_DIR="${ARTIFACT_ROOT}/checkpoints/${TRAIN_NAME}" CKPT_DIR="${ARTIFACT_ROOT}/checkpoints/${TRAIN_NAME}"