chore: adapt path and training hparams
This commit is contained in:
@@ -23,16 +23,18 @@ Megatron's IndexedDatasetBuilder directly.
|
|||||||
|
|
||||||
Usage:
|
Usage:
|
||||||
|
|
||||||
python /apps/yi/model_training/scripts/convert_phase_to_megatron.py \
|
python /ssd1/yi/pretrain_kaiyuan2b/scripts/convert_phase_to_megatron.py \
|
||||||
--input-dir /apps/yi/model_training/data/phase1 \
|
--input-dir /ssd1/yi/data/phase1 \
|
||||||
--output-dir /ssd/yi/converted_data/megatron_phase1 \
|
--output-dir /ssd1/yi/converted_data/phase1 \
|
||||||
--megatron-dir /apps/yi/model_training/Megatron-LM \
|
--megatron-dir /ssd1/yi/pretrain_kaiyuan2b/Megatron-LM \
|
||||||
--tokenizer-model /apps/yi/model_training/data/tokenizer \
|
--tokenizer-model /ssd1/yi/data/tokenizer \
|
||||||
--text-key text \
|
--text-key text \
|
||||||
--num-shards 4 \
|
--num-shards 16 \
|
||||||
--workers-per-shard 16 \
|
--workers-per-shard 12 \
|
||||||
--start 100 \
|
--batch-size 16384 \
|
||||||
--end 220
|
--chunksize 128 \
|
||||||
|
--start 0 \
|
||||||
|
--end 210
|
||||||
"""
|
"""
|
||||||
|
|
||||||
_TOKENIZER = None
|
_TOKENIZER = None
|
||||||
@@ -214,8 +216,8 @@ def parse_args():
|
|||||||
parser.add_argument("--output-prefix-prefix", default="phase1")
|
parser.add_argument("--output-prefix-prefix", default="phase1")
|
||||||
parser.add_argument("--num-shards", type=int, default=1, help="Parallel parquet files.")
|
parser.add_argument("--num-shards", type=int, default=1, help="Parallel parquet files.")
|
||||||
parser.add_argument("--workers-per-shard", type=int, default=max((os.cpu_count() or 8) // 2, 1))
|
parser.add_argument("--workers-per-shard", type=int, default=max((os.cpu_count() or 8) // 2, 1))
|
||||||
parser.add_argument("--batch-size", type=int, default=8192, help="Parquet record batch size.")
|
parser.add_argument("--batch-size", type=int, default=16384, help="Parquet record batch size.")
|
||||||
parser.add_argument("--chunksize", type=int, default=64, help="Tokenizer pool imap chunk size.")
|
parser.add_argument("--chunksize", type=int, default=128, help="Tokenizer pool imap chunk size.")
|
||||||
parser.add_argument("--log-interval", type=int, default=10000)
|
parser.add_argument("--log-interval", type=int, default=10000)
|
||||||
parser.add_argument("--start", type=int, default=0)
|
parser.add_argument("--start", type=int, default=0)
|
||||||
parser.add_argument("--end", type=int, default=None)
|
parser.add_argument("--end", type=int, default=None)
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
|
||||||
source /apps/yi/.venv/bin/activate
|
# source /apps/yi/.venv/bin/activate
|
||||||
|
|
||||||
export HF_HUB_ENABLE_HF_TRANSFER=1
|
export HF_HUB_ENABLE_HF_TRANSFER=1
|
||||||
export HF_HUB_DISABLE_SYMLINKS_WARNING=1
|
export HF_HUB_DISABLE_SYMLINKS_WARNING=1
|
||||||
@@ -9,18 +9,18 @@ export HF_HUB_DISABLE_SYMLINKS_WARNING=1
|
|||||||
|
|
||||||
export HF_ENDPOINT="https://hf-mirror.com/"
|
export HF_ENDPOINT="https://hf-mirror.com/"
|
||||||
|
|
||||||
unset http_proxy
|
# unset http_proxy
|
||||||
unset https_proxy
|
# unset https_proxy
|
||||||
export http_proxy=http://10.29.1.201:8888
|
# export http_proxy=http://10.29.1.201:8888
|
||||||
export https_proxy=http://10.29.1.201:8888
|
# export https_proxy=http://10.29.1.201:8888
|
||||||
|
|
||||||
unset HTTP_PROXY
|
# unset HTTP_PROXY
|
||||||
unset HTTPS_PROXY
|
# unset HTTPS_PROXY
|
||||||
export HTTP_PROXY=http://10.29.1.201:8888
|
# export HTTP_PROXY=http://10.29.1.201:8888
|
||||||
export HTTPS_PROXY=http://10.29.1.201:8888
|
# export HTTPS_PROXY=http://10.29.1.201:8888
|
||||||
|
|
||||||
LOCAL_DIR="/apps/yi/kaiyuan_pretraining/"
|
LOCAL_DIR="/ssd1/yi/data/"
|
||||||
PHASE="phase2"
|
PHASE="phase1"
|
||||||
|
|
||||||
echo "Starting download of $PHASE data to $LOCAL_DIR..."
|
echo "Starting download of $PHASE data to $LOCAL_DIR..."
|
||||||
|
|
||||||
|
|||||||
@@ -1,8 +1,8 @@
|
|||||||
|
|
||||||
DATA_DIR=/ssd/yi/converted_data/megatron_phase1
|
DATA_DIR=/ssd1/yi/converted_data/phase1
|
||||||
|
|
||||||
START=0
|
START=0
|
||||||
END=210
|
END=0
|
||||||
|
|
||||||
DATA_PATHS=""
|
DATA_PATHS=""
|
||||||
for idx in $(seq -f "%05g" $START $END); do
|
for idx in $(seq -f "%05g" $START $END); do
|
||||||
@@ -13,7 +13,7 @@ DATA_ARGS="
|
|||||||
--data-path ${DATA_PATHS}
|
--data-path ${DATA_PATHS}
|
||||||
--split 999,1,0
|
--split 999,1,0
|
||||||
--tokenizer-type HuggingFaceTokenizer
|
--tokenizer-type HuggingFaceTokenizer
|
||||||
--tokenizer-model /apps/yi/model_training/data/tokenizer
|
--tokenizer-model /ssd1/yi/data/tokenizer
|
||||||
"
|
"
|
||||||
|
|
||||||
# --vocab-file /apps/yi/model_training/data/tokenizer/vocab.json
|
# --vocab-file /apps/yi/model_training/data/tokenizer/vocab.json
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
HPARAMS="
|
HPARAMS="
|
||||||
--micro-batch-size 16
|
--micro-batch-size 4
|
||||||
--global-batch-size 2048
|
--global-batch-size 2048
|
||||||
--train-iters 87000
|
--train-iters 87000
|
||||||
--eval-iters 10
|
--eval-iters 10
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||||
ARTIFACT_ROOT=${ARTIFACT_ROOT:-/apps/yi/model_training/artifacts}
|
ARTIFACT_ROOT=${ARTIFACT_ROOT:-/ssd1/yi/artifacts}
|
||||||
RUN_STATE_DIR="${ARTIFACT_ROOT}/run_state"
|
RUN_STATE_DIR="${ARTIFACT_ROOT}/run_state"
|
||||||
LOG_DIR="${ARTIFACT_ROOT}/logs"
|
LOG_DIR="${ARTIFACT_ROOT}/logs"
|
||||||
|
|
||||||
|
|||||||
@@ -7,9 +7,9 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1
|
|||||||
MODE=${1:-qwen3_1p7b_smoke_yi}
|
MODE=${1:-qwen3_1p7b_smoke_yi}
|
||||||
TRAIN_NAME=${2:-qwen3_1p7b_smoke_yi}
|
TRAIN_NAME=${2:-qwen3_1p7b_smoke_yi}
|
||||||
|
|
||||||
MEGATRON_PATH=/apps/yi/model_training/Megatron-LM
|
MEGATRON_PATH=/ssd1/yi/pretrain_kaiyuan2b/Megatron-LM
|
||||||
ARTIFACT_ROOT=/apps/yi/model_training/artifacts
|
ARTIFACT_ROOT=/ssd1/yi/artifacts
|
||||||
SCRIPT_DIR=/apps/yi/model_training/scripts/kaiyuan2b-training
|
SCRIPT_DIR=/ssd1/yi/pretrain_kaiyuan2b/scripts/kaiyuan2b-training
|
||||||
PARAMS_DIR="${SCRIPT_DIR}/params"
|
PARAMS_DIR="${SCRIPT_DIR}/params"
|
||||||
TB_DIR="${ARTIFACT_ROOT}/tb_logs/${TRAIN_NAME}"
|
TB_DIR="${ARTIFACT_ROOT}/tb_logs/${TRAIN_NAME}"
|
||||||
CKPT_DIR="${ARTIFACT_ROOT}/checkpoints/${TRAIN_NAME}"
|
CKPT_DIR="${ARTIFACT_ROOT}/checkpoints/${TRAIN_NAME}"
|
||||||
|
|||||||
Reference in New Issue
Block a user