From f154c1611d06357b2b592cdd54dd9da63993faa5 Mon Sep 17 00:00:00 2001
From: yi_lu
Date: Wed, 6 May 2026 15:06:07 +0800
Subject: [PATCH] Initial Commit

---
 .gitignore                                         |   6 +
 scripts/Dockerfile                                 |  57 ++++++
 scripts/convert_megatron_weight_to_hf.sh           |  10 ++
 scripts/convert_phase_to_megatron.py               | 164 ++++++++++++++++++
 scripts/download_kaiyuan.sh                        |  41 +++++
 scripts/inspect_parquet.py                         |  41 +++++
 .../kaiyuan2b-profiling/eval_smoke_gpt2.sh         |  69 ++++++++
 .../params/data_phase1.sh.back                     |  21 +++
 .../params/gpt_smoke/data.sh                       |  21 +++
 .../params/gpt_smoke/hparams.sh                    |  13 ++
 .../params/gpt_smoke/model.sh                      |  18 ++
 .../params/optim_common.sh                         |  10 ++
 .../params/qwen3_1p7b/data_phase1_smoke.sh         |  21 +++
 .../params/qwen3_1p7b/hparams.sh                   |  13 ++
 .../params/qwen3_1p7b/model.sh                     |  19 ++
 .../training_smoke_gpt2.sh                         |  66 +++++++
 .../training_smoke_qwen3_1p7b.sh                   |  94 ++++++++++
 scripts/kaiyuan2b-training/eval_smoke_gpt2.sh      |  69 ++++++++
 .../params/data_phase1.sh.back                     |  21 +++
 .../params/gpt_smoke/data.sh                       |  21 +++
 .../params/gpt_smoke/hparams.sh                    |  13 ++
 .../params/gpt_smoke/model.sh                      |  18 ++
 .../kaiyuan2b-training/params/optim_common.sh      |  10 ++
 .../params/qwen3_1p7b/data_phase1_smoke.sh         |  21 +++
 .../params/qwen3_1p7b/hparams.sh                   |  13 ++
 .../params/qwen3_1p7b/model.sh                     |  19 ++
 .../kaiyuan2b-training/training_smoke_gpt2.sh      |  66 +++++++
 .../training_smoke_qwen3_1p7b.sh                   |  83 +++++++++
 scripts/toy_model_training.sh                      |  30 ++++
 29 files changed, 1068 insertions(+)
 create mode 100644 scripts/Dockerfile
 create mode 100644 scripts/convert_megatron_weight_to_hf.sh
 create mode 100644 scripts/convert_phase_to_megatron.py
 create mode 100644 scripts/download_kaiyuan.sh
 create mode 100644 scripts/inspect_parquet.py
 create mode 100644 scripts/kaiyuan2b-profiling/eval_smoke_gpt2.sh
 create mode 100644 scripts/kaiyuan2b-profiling/params/data_phase1.sh.back
 create mode 100644 scripts/kaiyuan2b-profiling/params/gpt_smoke/data.sh
 create mode 100644 scripts/kaiyuan2b-profiling/params/gpt_smoke/hparams.sh
 create mode 100644 scripts/kaiyuan2b-profiling/params/gpt_smoke/model.sh
 create mode 100644 scripts/kaiyuan2b-profiling/params/optim_common.sh
 create mode 100644 scripts/kaiyuan2b-profiling/params/qwen3_1p7b/data_phase1_smoke.sh
 create mode 100644 scripts/kaiyuan2b-profiling/params/qwen3_1p7b/hparams.sh
 create mode 100644 scripts/kaiyuan2b-profiling/params/qwen3_1p7b/model.sh
 create mode 100644 scripts/kaiyuan2b-profiling/training_smoke_gpt2.sh
 create mode 100644 scripts/kaiyuan2b-profiling/training_smoke_qwen3_1p7b.sh
 create mode 100644 scripts/kaiyuan2b-training/eval_smoke_gpt2.sh
 create mode 100644 scripts/kaiyuan2b-training/params/data_phase1.sh.back
 create mode 100644 scripts/kaiyuan2b-training/params/gpt_smoke/data.sh
 create mode 100644 scripts/kaiyuan2b-training/params/gpt_smoke/hparams.sh
 create mode 100644 scripts/kaiyuan2b-training/params/gpt_smoke/model.sh
 create mode 100644 scripts/kaiyuan2b-training/params/optim_common.sh
 create mode 100644 scripts/kaiyuan2b-training/params/qwen3_1p7b/data_phase1_smoke.sh
 create mode 100644 scripts/kaiyuan2b-training/params/qwen3_1p7b/hparams.sh
 create mode 100644 scripts/kaiyuan2b-training/params/qwen3_1p7b/model.sh
 create mode 100644 scripts/kaiyuan2b-training/training_smoke_gpt2.sh
 create mode 100644 scripts/kaiyuan2b-training/training_smoke_qwen3_1p7b.sh
 create mode 100644 scripts/toy_model_training.sh

diff --git a/.gitignore b/.gitignore
index 36b13f1..19d9789 100644
--- a/.gitignore
+++ b/.gitignore
@@ -174,3 +174,9 @@ cython_debug/
 
 # PyPI configuration file
 .pypirc
+# custom ignored items
+.venv/
+artifacts/*
+data/*
+*.log
+*.nsys-rep
\ No newline at end of file
diff --git a/scripts/Dockerfile b/scripts/Dockerfile
new file mode 100644
index 0000000..510693d
--- /dev/null
+++ b/scripts/Dockerfile
@@ -0,0 +1,57 @@
+FROM nvcr.io/nvidia/pytorch:25.10-py3
+
+ARG HTTP_PROXY
+ARG HTTPS_PROXY
+ARG http_proxy
+ARG https_proxy
+
+ENV DEBIAN_FRONTEND=noninteractive
+ENV PYTHONUNBUFFERED=1
+ENV PIP_NO_CACHE_DIR=1
+ENV PIP_CONSTRAINT=
+ENV MAX_JOBS=8
+ENV PYTHONPATH=/opt/Megatron-LM:$PYTHONPATH
+
+RUN apt-get update && apt-get install -y \
+    git curl wget vim tmux htop rsync ca-certificates \
+    build-essential ninja-build cmake pkg-config bzip2 \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN python -m pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple && \
+    python -m pip config set global.trusted-host pypi.tuna.tsinghua.edu.cn && \
+    python -m pip install -U pip setuptools wheel packaging
+
+RUN python -m pip install \
+    transformers datasets tokenizers sentencepiece accelerate \
+    numpy pandas pyarrow fastparquet zstandard jsonlines tqdm rich einops regex \
+    tensorboard wandb evaluate lm-eval \
+    omegaconf hydra-core nltk ftfy six psutil pydantic
+
+# NGC PyTorch usually already includes Transformer Engine.
+# Keep this check; do not reinstall TE unless it fails.
+RUN python - <<'PY'
+import torch
+print("torch:", torch.__version__, "cuda:", torch.version.cuda)
+try:
+    import transformer_engine
+    print("transformer_engine: OK")
+except Exception as e:
+    print("transformer_engine import failed:", e)
+PY
+
+RUN git clone https://github.com/NVIDIA/Megatron-LM.git /opt/Megatron-LM && \
+    cd /opt/Megatron-LM && \
+    python -m pip install -U "setuptools<80.0.0,>=77.0.0" packaging && \
+    python -m pip install --no-build-isolation -e .
+
+# Optional Apex: only install if you really need fused optimizers from Apex.
+# Many modern Megatron paths rely more on Transformer Engine / fused kernels.
+RUN git clone https://github.com/NVIDIA/apex.git /opt/apex && \
+    cd /opt/apex && \
+    python -m pip install -v --disable-pip-version-check --no-build-isolation \
+    --config-settings "--build-option=--cpp_ext" \
+    --config-settings "--build-option=--cuda_ext" \
+    .
+
+WORKDIR /workspace
+CMD ["/bin/bash"]
\ No newline at end of file
diff --git a/scripts/convert_megatron_weight_to_hf.sh b/scripts/convert_megatron_weight_to_hf.sh
new file mode 100644
index 0000000..b454991
--- /dev/null
+++ b/scripts/convert_megatron_weight_to_hf.sh
@@ -0,0 +1,10 @@
+CKPT_DIR=/apps/yi/model_training/artifacts/checkpoints/smoke_gpt
+MEGATRON_PATH=/apps/yi/model_training/Megatron-LM
+HF_OUT=/apps/yi/model_training/artifacts/hf_models/smoke_gpt_15000
+
+python $MEGATRON_PATH/tools/checkpoint/convert.py \
+    --model-type GPT \
+    --loader core \
+    --saver core \
+    --load-dir $CKPT_DIR \
+    --save-dir $HF_OUT
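Note: with --loader core --saver core, the convert.py call above re-saves a Megatron-core checkpoint into HF_OUT; despite the variable name, it does not emit a HuggingFace model, so a separate HF export step is still needed. Either way, the checkpoint directory can be sanity-checked first. A minimal sketch, assuming Megatron's usual layout (a latest_checkpointed_iteration.txt tracker next to iter_XXXXXXX/ shard directories); the script name is illustrative and not part of this patch:

# check_ckpt.py -- sanity-check a Megatron checkpoint directory.
# Assumes the conventional layout with a latest_checkpointed_iteration.txt
# tracker file; illustrative sketch only.
from pathlib import Path
import sys

def check_megatron_ckpt(ckpt_dir: str) -> None:
    root = Path(ckpt_dir)
    tracker = root / "latest_checkpointed_iteration.txt"
    if not tracker.exists():
        sys.exit(f"no tracker file in {root}; is this a Megatron checkpoint?")
    tag = tracker.read_text().strip()
    step_dir = root / ("release" if tag == "release" else f"iter_{int(tag):07d}")
    # distributed checkpoints may store .pt or .distcp shards, depending on backend
    shards = sorted(step_dir.rglob("*.pt")) + sorted(step_dir.rglob("*.distcp"))
    print(f"latest checkpoint: {step_dir.name}, shard files found: {len(shards)}")

if __name__ == "__main__":
    check_megatron_ckpt(sys.argv[1])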
diff --git a/scripts/convert_phase_to_megatron.py b/scripts/convert_phase_to_megatron.py
new file mode 100644
index 0000000..ab19170
--- /dev/null
+++ b/scripts/convert_phase_to_megatron.py
@@ -0,0 +1,164 @@
+"""
+Convert Parquet shards to Megatron's indexed format, using JSONL as an
+intermediate format.
+
+Expects a Parquet schema with a single text column (default key: "text").
+
+Usage:
+
+python /apps/yi/model_training/scripts/convert_phase_to_megatron.py \
+    --input-dir /apps/yi/model_training/data/phase1 \
+    --output-dir /ssd/yi/converted_data/megatron_phase1 \
+    --tmp-dir /ssd/yi/converted_data/tmp_jsonl \
+    --megatron-dir /apps/yi/model_training/Megatron-LM \
+    --tokenizer-model /apps/yi/model_training/data/tokenizer \
+    --text-key text \
+    --num-shards 4 \
+    --workers-per-shard 16 \
+    --start 100 \
+    --end 220  # converts parquets 100-219 of the 220 total (end is exclusive)
+"""
+
+import argparse
+import json
+import os
+import subprocess
+from concurrent.futures import ProcessPoolExecutor, as_completed
+from pathlib import Path
+
+import pyarrow.parquet as pq
+from tqdm import tqdm
+
+
+def parquet_to_jsonl(parquet_path: Path, jsonl_path: Path, text_key: str):
+    jsonl_path.parent.mkdir(parents=True, exist_ok=True)
+
+    rows = 0
+    with jsonl_path.open("w", encoding="utf-8") as fout:
+        pf = pq.ParquetFile(parquet_path)
+        for batch in pf.iter_batches(columns=[text_key], batch_size=8192):
+            col = batch.column(0).to_pylist()
+            for text in col:
+                if isinstance(text, str) and text.strip():
+                    fout.write(json.dumps({text_key: text}, ensure_ascii=False) + "\n")
+                    rows += 1
+    return rows
+
+
+def run_one(args_tuple):
+    (
+        parquet_path,
+        output_dir,
+        tmp_dir,
+        text_key,
+        megatron_dir,
+        tokenizer_type,
+        tokenizer_model,
+        workers_per_shard,
+        keep_jsonl,
+        overwrite,
+    ) = args_tuple
+
+    parquet_path = Path(parquet_path)
+    stem = parquet_path.name.replace(".zstd.parquet", "").replace(".parquet", "")
+    jsonl_path = Path(tmp_dir) / f"{stem}.jsonl"
+    output_prefix = Path(output_dir) / f"phase1_{stem}"
+
+    bin_file = Path(str(output_prefix) + f"_{text_key}_document.bin")
+    idx_file = Path(str(output_prefix) + f"_{text_key}_document.idx")
+
+    if not overwrite and bin_file.exists() and idx_file.exists():
+        return f"[SKIP] {parquet_path.name}: existing bin/idx"
+
+    print(f"[START] {parquet_path.name}", flush=True)
+
+    rows = parquet_to_jsonl(parquet_path, jsonl_path, text_key)
+    print(f"[JSONL DONE] {parquet_path.name}: rows={rows}, jsonl={jsonl_path}", flush=True)
+
+    print(f"[MEGATRON START] {parquet_path.name}", flush=True)
+
+    cmd = [
+        "python",
+        str(Path(megatron_dir) / "tools/preprocess_data.py"),
+        "--input", str(jsonl_path),
+        "--output-prefix", str(output_prefix),
+        "--tokenizer-type", tokenizer_type,
+        "--tokenizer-model", tokenizer_model,
+        "--json-keys", text_key,
+        "--workers", str(workers_per_shard),
+        "--append-eod",
+    ]
+
+    env = os.environ.copy()
+    proc = subprocess.run(
+        cmd,
+        cwd=megatron_dir,
+        env=env,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+        text=True,
+    )
+
+    if proc.returncode != 0:
+        return f"[FAIL] {parquet_path.name}\n{proc.stdout[-4000:]}"
+
+    if not keep_jsonl:
+        jsonl_path.unlink(missing_ok=True)
+
+    return f"[OK] {parquet_path.name}: rows={rows}, output_prefix={output_prefix}"
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--input-dir", required=True)
+    parser.add_argument("--output-dir", required=True)
+    parser.add_argument("--tmp-dir", required=True)
+    parser.add_argument("--megatron-dir", default="/apps/yi/model_training/Megatron-LM")
+    parser.add_argument("--tokenizer-type", default="HuggingFaceTokenizer")
+    parser.add_argument("--tokenizer-model", required=True)
+    parser.add_argument("--text-key", default="text")
+    parser.add_argument("--num-shards", type=int, default=1, help="parallel parquet shards")
+    parser.add_argument("--workers-per-shard", type=int, default=8)
+    parser.add_argument("--start", type=int, default=0)
+    parser.add_argument("--end", type=int, default=None)
+    parser.add_argument("--keep-jsonl", action="store_true")
+    parser.add_argument("--overwrite", action="store_true")
+
+    args = parser.parse_args()
+
+    files = sorted(Path(args.input_dir).glob("*.zstd.parquet"))
+    if not files:
+        files = sorted(Path(args.input_dir).glob("*.parquet"))
+
+    files = files[args.start:args.end]
+    print(f"Converting {len(files)} files")
+    print(f"Parallel shards: {args.num_shards}")
+    print(f"Workers per shard: {args.workers_per_shard}")
+
+    Path(args.output_dir).mkdir(parents=True, exist_ok=True)
+    Path(args.tmp_dir).mkdir(parents=True, exist_ok=True)
+
+    tasks = [
+        (
+            str(f),
+            args.output_dir,
+            args.tmp_dir,
+            args.text_key,
+            args.megatron_dir,
+            args.tokenizer_type,
+            args.tokenizer_model,
+            args.workers_per_shard,
+            args.keep_jsonl,
+            args.overwrite,
+        )
+        for f in files
+    ]
+
+    with ProcessPoolExecutor(max_workers=args.num_shards) as ex:
+        futs = [ex.submit(run_one, t) for t in tasks]
+        for fut in tqdm(as_completed(futs), total=len(futs)):
+            print(fut.result(), flush=True)
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
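To verify a converted shard, the .bin/.idx pair can be read back with Megatron's dataset reader. A minimal sketch, assuming Megatron-LM is on PYTHONPATH and that the reader lives at megatron.core.datasets.indexed_dataset.IndexedDataset (true for recent Megatron-core; older trees name it differently):

# verify_megatron_bin.py -- spot-check one converted shard (illustrative).
import sys
from megatron.core.datasets.indexed_dataset import IndexedDataset

def main(prefix: str) -> None:
    # prefix is the output path without the .bin/.idx extension, e.g.
    # /ssd/yi/converted_data/megatron_phase1/phase1_part-00000_text_document
    ds = IndexedDataset(prefix)
    n_docs = len(ds)
    sample = min(n_docs, 100)
    n_tokens = sum(len(ds[i]) for i in range(sample))
    print(f"{prefix}: {n_docs} documents; first {sample} docs hold {n_tokens} tokens")

if __name__ == "__main__":
    main(sys.argv[1])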
diff --git a/scripts/download_kaiyuan.sh b/scripts/download_kaiyuan.sh
new file mode 100644
index 0000000..9af5ad0
--- /dev/null
+++ b/scripts/download_kaiyuan.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+source /apps/yi/.venv/bin/activate
+
+export HF_HUB_ENABLE_HF_TRANSFER=1
+export HF_HUB_DISABLE_SYMLINKS_WARNING=1
+
+# enable the HF mirror if necessary
+
+export HF_ENDPOINT="https://hf-mirror.com/"
+
+unset http_proxy
+unset https_proxy
+export http_proxy=http://10.29.1.201:8888
+export https_proxy=http://10.29.1.201:8888
+
+unset HTTP_PROXY
+unset HTTPS_PROXY
+export HTTP_PROXY=http://10.29.1.201:8888
+export HTTPS_PROXY=http://10.29.1.201:8888
+
+LOCAL_DIR="/apps/yi/kaiyuan_pretraining/"
+PHASE="phase2"
+
+echo "Starting download of $PHASE data to $LOCAL_DIR..."
+
+
+while true; do
+    hf download thu-pacman/PCMind-2.1-Kaiyuan-2B \
+        --repo-type dataset \
+        --local-dir "$LOCAL_DIR" \
+        --include "$PHASE/*"
+
+    if [ $? -eq 0 ]; then
+        echo "Download completed successfully!"
+        break
+    else
+        echo "Download interrupted or failed. Retrying in 10 seconds..."
+        sleep 10
+    fi
+done
\ No newline at end of file
diff --git a/scripts/inspect_parquet.py b/scripts/inspect_parquet.py
new file mode 100644
index 0000000..1e054b0
--- /dev/null
+++ b/scripts/inspect_parquet.py
@@ -0,0 +1,41 @@
+import argparse
+from pathlib import Path
+import pyarrow.parquet as pq
+
+"""
+Inspect the organization of kaiyuan-pretraining data parquets.
+
+Usage:
+python /apps/yi/scripts/inspect_parquet.py \
+    --dir /apps/yi/kaiyuan_pretraining/phase1 \
+    --limit-files 2
+"""
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--dir", default="/apps/yi/kaiyuan_pretraining/phase1")
+parser.add_argument("--limit-files", type=int, default=1)
+args = parser.parse_args()
+
+for path in sorted(Path(args.dir).glob("*.parquet"))[: args.limit_files]:
+    pf = pq.ParquetFile(path)
+
+    print(f"=== Schema ({path.name}) ===")
+    print(pf.schema)
+
+    print("\n=== Columns ===")
+    print(pf.schema.names)
+
+    # read only the first row group -- a very small batch
+    table = pf.read_row_group(0)
+
+    print("\n=== First row (safe preview) ===")
+
+    row = table.slice(0, 1).to_pydict()
+
+    for k, v in row.items():
+        val = v[0]
+
+        if isinstance(val, str):
+            print(f"{k}: {val[:200]}... (len={len(val)})")
+        else:
+            print(f"{k}: type={type(val)}")
\ No newline at end of file
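The train-iters values in the hparams files further below follow from a simple token budget: tokens = train_iters x global_batch_size x seq_length. A quick sketch using the committed numbers (the iteration counts come from the hparams files in this patch; no dataset sizes are measured here):

# token_budget.py -- relate train-iters to total training tokens.
def total_tokens(train_iters: int, global_batch: int = 2048, seq_len: int = 4096) -> int:
    # one optimizer step consumes global_batch * seq_len tokens
    return train_iters * global_batch * seq_len

# gpt_smoke:        15000 iters -> ~125.8B tokens
# qwen3 profiling:  19760 iters -> ~165.8B tokens
# qwen3 training:   87000 iters -> ~729.8B tokens
for iters in (15000, 19760, 87000):
    print(f"{iters:>6} iters -> {total_tokens(iters) / 1e9:.1f}B tokens")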
diff --git a/scripts/kaiyuan2b-profiling/eval_smoke_gpt2.sh b/scripts/kaiyuan2b-profiling/eval_smoke_gpt2.sh
new file mode 100644
index 0000000..fde1793
--- /dev/null
+++ b/scripts/kaiyuan2b-profiling/eval_smoke_gpt2.sh
@@ -0,0 +1,69 @@
+#!/bin/bash
+# Serve the smoke-run GPT checkpoint with Megatron's text generation server.
+DISTRIBUTED_ARGS="--nproc_per_node 1 \
+                  --nnodes 1 \
+                  --node_rank 0 \
+                  --master_addr localhost \
+                  --master_port 6000"
+
+# Megatron checkpoint directory to serve
+CHECKPOINT=/apps/yi/model_training/artifacts/checkpoints/smoke_gpt
+
+# GPT-2 BPE vocab (only needed with GPT2BPETokenizer)
+VOCAB_FILE=/apps/yi/model_training/data/tokenizer/vocab.json
+
+# GPT-2 BPE merges (only needed with GPT2BPETokenizer)
+MERGE_FILE=/apps/yi/model_training/data/tokenizer/merges.txt
+
+# HuggingFace tokenizer directory
+TOKENIZER_PATH=/apps/yi/model_training/data/tokenizer
+
+MEGATRON_PATH=/apps/yi/model_training/Megatron-LM
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+# the server requires: pip install flask-restful
+# earlier fp16/GPT2BPE invocation, kept for reference:
+# torchrun $DISTRIBUTED_ARGS $MEGATRON_PATH/tools/run_text_generation_server.py \
+#        --tensor-model-parallel-size 1 \
+#        --pipeline-model-parallel-size 1 \
+#        --num-layers 12 \
+#        --hidden-size 3072 \
+#        --load ${CHECKPOINT} \
+#        --num-attention-heads 8 \
+#        --num-query-groups 4 \
+#        --max-position-embeddings 4096 \
+#        --fp16 \
+#        --micro-batch-size 1 \
+#        --seq-length 1024 \
+#        --temperature 1.0 \
+
+#        --top_p 0.9 \
+#        --seed 42 \
+#        --tokenizer-type GPT2BPETokenizer
+#        --vocab-file $VOCAB_FILE \
+#        --merge-file $MERGE_FILE \
+
+
+torchrun $DISTRIBUTED_ARGS $MEGATRON_PATH/tools/run_text_generation_server.py \
+       --load $CHECKPOINT \
+       --tensor-model-parallel-size 1 \
+       --pipeline-model-parallel-size 1 \
+       --num-layers 12 \
+       --hidden-size 768 \
+       --ffn-hidden-size 3072 \
+       --num-attention-heads 8 \
+       --num-query-groups 4 \
+       --group-query-attention \
+       --seq-length 4096 \
+       --max-position-embeddings 4096 \
+       --position-embedding-type rope \
+       --rotary-base 10000 \
+       --swiglu \
+       --disable-bias-linear \
+       --normalization RMSNorm \
+       --untie-embeddings-and-output-weights \
+       --tokenizer-type HuggingFaceTokenizer \
+       --tokenizer-model $TOKENIZER_PATH \
+       --bf16 \
+       --micro-batch-size 1
diff --git a/scripts/kaiyuan2b-profiling/params/data_phase1.sh.back b/scripts/kaiyuan2b-profiling/params/data_phase1.sh.back
new file mode 100644
index 0000000..853f7aa
--- /dev/null
+++ b/scripts/kaiyuan2b-profiling/params/data_phase1.sh.back
@@ -0,0 +1,21 @@
+
+DATA_DIR=/ssd/yi/converted_data/megatron_phase1
+
+START=0
+END=50
+
+DATA_PATHS=""
+for idx in $(seq -f "%05g" $START $END); do
+    DATA_PATHS+=" 1 ${DATA_DIR}/phase1_part-${idx}_text_document"
+done
+
+DATA_ARGS="
+    --data-path ${DATA_PATHS}
+    --split 999,1,0
+    --tokenizer-type HuggingFaceTokenizer
+    --tokenizer-model /apps/yi/model_training/data/tokenizer
+"
+
+# --vocab-file /apps/yi/model_training/data/tokenizer/vocab.json
+# --merge-file /apps/yi/model_training/data/tokenizer/merges.txt
+# --vocab-size 151936
\ No newline at end of file
diff --git a/scripts/kaiyuan2b-profiling/params/gpt_smoke/data.sh b/scripts/kaiyuan2b-profiling/params/gpt_smoke/data.sh
new file mode 100644
index 0000000..853f7aa
--- /dev/null
+++ b/scripts/kaiyuan2b-profiling/params/gpt_smoke/data.sh
@@ -0,0 +1,21 @@
+
+DATA_DIR=/ssd/yi/converted_data/megatron_phase1
+
+START=0
+END=50
+
+DATA_PATHS=""
+for idx in $(seq -f "%05g" $START $END); do
+    DATA_PATHS+=" 1 ${DATA_DIR}/phase1_part-${idx}_text_document"
+done
+
+DATA_ARGS="
+    --data-path ${DATA_PATHS}
+    --split 999,1,0
+    --tokenizer-type HuggingFaceTokenizer
+    --tokenizer-model /apps/yi/model_training/data/tokenizer
+"
+
+# --vocab-file /apps/yi/model_training/data/tokenizer/vocab.json
+# --merge-file /apps/yi/model_training/data/tokenizer/merges.txt
+# --vocab-size 151936
\ No newline at end of file
diff --git a/scripts/kaiyuan2b-profiling/params/gpt_smoke/hparams.sh b/scripts/kaiyuan2b-profiling/params/gpt_smoke/hparams.sh
new file mode 100644
index 0000000..800832e
--- /dev/null
+++ b/scripts/kaiyuan2b-profiling/params/gpt_smoke/hparams.sh
@@ -0,0 +1,13 @@
+HPARAM_ARGS="
+    --micro-batch-size 32
+    --global-batch-size 2048
+    --train-iters 15000
+    --eval-iters 10
+    --eval-interval 1000
+    --save-interval 1000
+    --log-interval 1
+    --lr 1e-3
+    --min-lr 1e-3
+    --lr-decay-style constant
+    --lr-warmup-iters 10
+"
\ No newline at end of file
diff --git a/scripts/kaiyuan2b-profiling/params/gpt_smoke/model.sh b/scripts/kaiyuan2b-profiling/params/gpt_smoke/model.sh
new file mode 100644
index 0000000..ba05bea
--- /dev/null
+++ b/scripts/kaiyuan2b-profiling/params/gpt_smoke/model.sh
@@ -0,0 +1,18 @@
+# downscaled qwen3 arch, simulate gpt2-level training
+MODEL_ARGS="
+    --seq-length 4096
+    --hidden-size 768
+    --ffn-hidden-size 3072
+    --num-layers 12
+    --num-attention-heads 8
+    --num-query-groups 4
+    --rotary-base 10000
+    --init-method-std 0.018
+    --group-query-attention
+    --max-position-embeddings 4096
+    --position-embedding-type rope
+    --swiglu
+    --disable-bias-linear
+    --normalization RMSNorm
+    --untie-embeddings-and-output-weights
+"
\ No newline at end of file
diff --git a/scripts/kaiyuan2b-profiling/params/optim_common.sh b/scripts/kaiyuan2b-profiling/params/optim_common.sh
new file mode 100644
index 0000000..6862da5
--- /dev/null
+++ b/scripts/kaiyuan2b-profiling/params/optim_common.sh
@@ -0,0 +1,10 @@
+# note: by default decoupled_weight_decay is True and adam optimizer acts as adamW
+
+OPTIM_ARGS="
+    --optimizer adam
+    --adam-beta1 0.9
+    --adam-beta2 0.95
+    --adam-eps 1e-8
+    --weight-decay 0.1
+    --clip-grad 1.0
+"
\ No newline at end of file
diff --git a/scripts/kaiyuan2b-profiling/params/qwen3_1p7b/data_phase1_smoke.sh b/scripts/kaiyuan2b-profiling/params/qwen3_1p7b/data_phase1_smoke.sh
new file mode 100644
index 0000000..a48a3d9
--- /dev/null
+++ b/scripts/kaiyuan2b-profiling/params/qwen3_1p7b/data_phase1_smoke.sh
@@ -0,0 +1,21 @@
+
+DATA_DIR=/ssd/yi/converted_data/megatron_phase1
+
+START=0
+END=210
+
+DATA_PATHS=""
+for idx in $(seq -f "%05g" $START $END); do
+    DATA_PATHS+=" 1 ${DATA_DIR}/phase1_part-${idx}_text_document"
+done
+
+DATA_ARGS="
+    --data-path ${DATA_PATHS}
+    --split 999,1,0
+    --tokenizer-type HuggingFaceTokenizer
+    --tokenizer-model /apps/yi/model_training/data/tokenizer
+"
+
+# --vocab-file /apps/yi/model_training/data/tokenizer/vocab.json
+# --merge-file /apps/yi/model_training/data/tokenizer/merges.txt
+# --vocab-size 151936
diff --git a/scripts/kaiyuan2b-profiling/params/qwen3_1p7b/hparams.sh b/scripts/kaiyuan2b-profiling/params/qwen3_1p7b/hparams.sh
new file mode 100644
index 0000000..facd81d
--- /dev/null
+++ b/scripts/kaiyuan2b-profiling/params/qwen3_1p7b/hparams.sh
@@ -0,0 +1,13 @@
+HPARAMS="
+    --micro-batch-size 16
+    --global-batch-size 2048
+    --train-iters 19760
+    --eval-iters 10
+    --eval-interval 1000
+    --save-interval 1000
+    --log-interval 1
+    --lr 5e-3
+    --min-lr 5e-3
+    --lr-decay-style constant
+    --lr-warmup-iters 10
+"
diff --git a/scripts/kaiyuan2b-profiling/params/qwen3_1p7b/model.sh b/scripts/kaiyuan2b-profiling/params/qwen3_1p7b/model.sh
new file mode 100644
index 0000000..d275f29
--- /dev/null
+++ b/scripts/kaiyuan2b-profiling/params/qwen3_1p7b/model.sh
@@ -0,0 +1,19 @@
+# note: official qwen3 training uses qk norm while megatron has no official support
+
+MODEL_ARGS="
+    --seq-length 4096
+    --hidden-size 2048
+    --ffn-hidden-size 6144
+    --num-layers 28
+    --num-attention-heads 16
+    --num-query-groups 8
+    --rotary-base 10000
+    --init-method-std 0.018
+    --group-query-attention
+    --max-position-embeddings 4096
+    --position-embedding-type rope
+    --swiglu
+    --disable-bias-linear
+    --normalization RMSNorm
+    --untie-embeddings-and-output-weights
+"
\ No newline at end of file
diff --git a/scripts/kaiyuan2b-profiling/training_smoke_gpt2.sh b/scripts/kaiyuan2b-profiling/training_smoke_gpt2.sh
new file mode 100644
index 0000000..ce3cafc
--- /dev/null
+++ b/scripts/kaiyuan2b-profiling/training_smoke_gpt2.sh
@@ -0,0 +1,66 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+MODE=${1:-smoke}
+TRAIN_NAME=${2:-smoke_gpt}
+
+MEGATRON_PATH=/apps/yi/model_training/Megatron-LM
+ARTIFACT_ROOT=/apps/yi/model_training/artifacts
+TB_DIR="${ARTIFACT_ROOT}/tb_logs/${TRAIN_NAME}"
+CKPT_DIR="${ARTIFACT_ROOT}/checkpoints/${TRAIN_NAME}"
+
+source params/optim_common.sh
+source params/gpt_smoke/model.sh
+source params/gpt_smoke/data.sh
+source params/gpt_smoke/hparams.sh
+
+RUN_ARGS=$HPARAM_ARGS
+
+LOGGING_ARGS="
+    --tensorboard-dir ${TB_DIR}
+    --tensorboard-log-interval 1
+    --log-interval 1
+    --log-timers-to-tensorboard
+    --log-validation-ppl-to-tensorboard
+    --log-memory-to-tensorboard
+    --log-world-size-to-tensorboard
+    --log-num-zeros-in-grad
+    --log-device-memory-used
+    --log-throughput
+    --log-params-norm
+"
+
+PRECISION_ARGS="--bf16"
+PARALLEL_ARGS="
+    --tensor-model-parallel-size 1
+    --pipeline-model-parallel-size 1
+"
+# PARALLEL_ARGS="
+#     --tensor-model-parallel-size 2
+#     --sequence-parallel
+# "
+
+mkdir -p "$CKPT_DIR" "$TB_DIR"
+
+DISTRIBUTED_ARGS="
+    --nproc_per_node 8
+    --nnodes 1
+    --node_rank 0
+    --master_addr localhost
+    --master_port 6000
+"
+
+torchrun $DISTRIBUTED_ARGS \
+    $MEGATRON_PATH/pretrain_gpt.py \
+    $MODEL_ARGS \
+    $OPTIM_ARGS \
+    $PRECISION_ARGS \
+    $PARALLEL_ARGS \
+    $DATA_ARGS \
+    $RUN_ARGS \
+    $LOGGING_ARGS \
+    --save "$CKPT_DIR" \
+    --load "$CKPT_DIR"
\ No newline at end of file
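The profiling launch below wraps torchrun in nsys with --capture-range=cudaProfilerApi, so nothing is traced until the process calls cudaProfilerStart; Megatron's --profile --profile-step-start/--profile-step-end flags issue those calls on the selected ranks. A standalone sketch of the same gating pattern (illustrative, not part of the patch):

# capture_range_demo.py -- how --capture-range=cudaProfilerApi gates tracing.
# Run under: nsys profile -s none -t cuda --capture-range=cudaProfilerApi \
#            --capture-range-end=stop python capture_range_demo.py
import torch

x = torch.randn(4096, 4096, device="cuda")
for step in range(20):
    if step == 10:
        torch.cuda.profiler.start()   # cudaProfilerStart: nsys begins capturing
    y = x @ x                         # work that shows up in the trace
    torch.cuda.synchronize()
    if step == 12:
        torch.cuda.profiler.stop()    # cudaProfilerStop: nsys stops capturing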
diff --git a/scripts/kaiyuan2b-profiling/training_smoke_qwen3_1p7b.sh b/scripts/kaiyuan2b-profiling/training_smoke_qwen3_1p7b.sh
new file mode 100644
index 0000000..db863bd
--- /dev/null
+++ b/scripts/kaiyuan2b-profiling/training_smoke_qwen3_1p7b.sh
@@ -0,0 +1,94 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+#export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+MODE=${1:-qwen3_1p7b_smoke}
+TRAIN_NAME=${2:-qwen3_1p7b_smoke}
+
+MEGATRON_PATH=/apps/yi/model_training/Megatron-LM
+ARTIFACT_ROOT=/apps/yi/model_training/artifacts
+TB_DIR="${ARTIFACT_ROOT}/tb_logs/${TRAIN_NAME}"
+CKPT_DIR="${ARTIFACT_ROOT}/checkpoints/${TRAIN_NAME}"
+
+source params/optim_common.sh
+source params/qwen3_1p7b/model.sh
+source params/qwen3_1p7b/data_phase1_smoke.sh
+source params/qwen3_1p7b/hparams.sh
+
+LOGGING_ARGS="
+    --tensorboard-dir ${TB_DIR}
+    --tensorboard-log-interval 1
+    --log-interval 1
+    --log-timers-to-tensorboard
+    --log-validation-ppl-to-tensorboard
+    --log-memory-to-tensorboard
+    --log-world-size-to-tensorboard
+    --log-num-zeros-in-grad
+    --log-device-memory-used
+    --log-throughput
+    --log-params-norm
+"
+
+PRECISION_ARGS="--bf16"
+PARALLEL_ARGS="
+    --tensor-model-parallel-size 1
+    --pipeline-model-parallel-size 1
+"
+# PARALLEL_ARGS="
+#     --tensor-model-parallel-size 2
+#     --sequence-parallel
+# "
+
+if [ "$MODE" = "qwen3_1p7b_smoke" ]; then
+    source params/qwen3_1p7b/hparams.sh
+    RUN_ARGS="$HPARAMS"
+elif [ "$MODE" = "phase1" ]; then
+    source params/phase1_full.sh
+    RUN_ARGS="$PHASE_ARGS"
+else
+    echo "Unknown mode: $MODE"
+    exit 1
+fi
+
+mkdir -p "$CKPT_DIR" "$TB_DIR"
+
+DISTRIBUTED_ARGS="
+    --nproc_per_node 8
+    --nnodes 1
+    --node_rank 0
+    --master_addr localhost
+    --master_port 6000
+"
+nsys profile \
+    -s none \
+    -t cuda,nvtx,cudnn,cublas \
+    -o megatron_8gpu_%p \
+    --force-overwrite true \
+    --capture-range=cudaProfilerApi \
+    --capture-range-end=stop \
+    --cuda-graph-trace=node \
+torchrun $DISTRIBUTED_ARGS \
+    $MEGATRON_PATH/pretrain_gpt.py \
+    $MODEL_ARGS \
+    $OPTIM_ARGS \
+    $PRECISION_ARGS \
+    $PARALLEL_ARGS \
+    $DATA_ARGS \
+    $RUN_ARGS \
+    $LOGGING_ARGS \
+    --save "$CKPT_DIR" \
+    --load "$CKPT_DIR" \
+    --enable-cuda-graph \
+    --cuda-graph-warmup-steps 3 \
+    --profile \
+    --profile-step-start 10 \
+    --profile-step-end 12 \
+    --profile-ranks 0 1 \
+    --transformer-impl transformer_engine \
+    --cross-entropy-loss-fusion \
+    --cross-entropy-fusion-impl te
+#--use-distributed-optimizer
+#--overlap-grad-reduce \
+# --overlap-param-gather \
diff --git a/scripts/kaiyuan2b-training/eval_smoke_gpt2.sh b/scripts/kaiyuan2b-training/eval_smoke_gpt2.sh
new file mode 100644
index 0000000..fde1793
--- /dev/null
+++ b/scripts/kaiyuan2b-training/eval_smoke_gpt2.sh
@@ -0,0 +1,69 @@
+#!/bin/bash
+# Serve the smoke-run GPT checkpoint with Megatron's text generation server.
+DISTRIBUTED_ARGS="--nproc_per_node 1 \
+                  --nnodes 1 \
+                  --node_rank 0 \
+                  --master_addr localhost \
+                  --master_port 6000"
+
+# Megatron checkpoint directory to serve
+CHECKPOINT=/apps/yi/model_training/artifacts/checkpoints/smoke_gpt
+
+# GPT-2 BPE vocab (only needed with GPT2BPETokenizer)
+VOCAB_FILE=/apps/yi/model_training/data/tokenizer/vocab.json
+
+# GPT-2 BPE merges (only needed with GPT2BPETokenizer)
+MERGE_FILE=/apps/yi/model_training/data/tokenizer/merges.txt
+
+# HuggingFace tokenizer directory
+TOKENIZER_PATH=/apps/yi/model_training/data/tokenizer
+
+MEGATRON_PATH=/apps/yi/model_training/Megatron-LM
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+# the server requires: pip install flask-restful
+# earlier fp16/GPT2BPE invocation, kept for reference:
+# torchrun $DISTRIBUTED_ARGS $MEGATRON_PATH/tools/run_text_generation_server.py \
+#        --tensor-model-parallel-size 1 \
+#        --pipeline-model-parallel-size 1 \
+#        --num-layers 12 \
+#        --hidden-size 3072 \
+#        --load ${CHECKPOINT} \
+#        --num-attention-heads 8 \
+#        --num-query-groups 4 \
+#        --max-position-embeddings 4096 \
+#        --fp16 \
+#        --micro-batch-size 1 \
+#        --seq-length 1024 \
+#        --temperature 1.0 \
+
+#        --top_p 0.9 \
+#        --seed 42 \
+#        --tokenizer-type GPT2BPETokenizer
+#        --vocab-file $VOCAB_FILE \
+#        --merge-file $MERGE_FILE \
+
+
+torchrun $DISTRIBUTED_ARGS $MEGATRON_PATH/tools/run_text_generation_server.py \
+       --load $CHECKPOINT \
+       --tensor-model-parallel-size 1 \
+       --pipeline-model-parallel-size 1 \
+       --num-layers 12 \
+       --hidden-size 768 \
+       --ffn-hidden-size 3072 \
+       --num-attention-heads 8 \
+       --num-query-groups 4 \
+       --group-query-attention \
+       --seq-length 4096 \
+       --max-position-embeddings 4096 \
+       --position-embedding-type rope \
+       --rotary-base 10000 \
+       --swiglu \
+       --disable-bias-linear \
+       --normalization RMSNorm \
+       --untie-embeddings-and-output-weights \
+       --tokenizer-type HuggingFaceTokenizer \
+       --tokenizer-model $TOKENIZER_PATH \
+       --bf16 \
+       --micro-batch-size 1
diff --git a/scripts/kaiyuan2b-training/params/data_phase1.sh.back b/scripts/kaiyuan2b-training/params/data_phase1.sh.back
new file mode 100644
index 0000000..853f7aa
--- /dev/null
+++ b/scripts/kaiyuan2b-training/params/data_phase1.sh.back
@@ -0,0 +1,21 @@
+
+DATA_DIR=/ssd/yi/converted_data/megatron_phase1
+
+START=0
+END=50
+
+DATA_PATHS=""
+for idx in $(seq -f "%05g" $START $END); do
+    DATA_PATHS+=" 1 ${DATA_DIR}/phase1_part-${idx}_text_document"
+done
+
+DATA_ARGS="
+    --data-path ${DATA_PATHS}
+    --split 999,1,0
+    --tokenizer-type HuggingFaceTokenizer
+    --tokenizer-model /apps/yi/model_training/data/tokenizer
+"
+
+# --vocab-file /apps/yi/model_training/data/tokenizer/vocab.json
+# --merge-file /apps/yi/model_training/data/tokenizer/merges.txt
+# --vocab-size 151936
\ No newline at end of file
diff --git a/scripts/kaiyuan2b-training/params/gpt_smoke/data.sh b/scripts/kaiyuan2b-training/params/gpt_smoke/data.sh
new file mode 100644
index 0000000..853f7aa
--- /dev/null
+++ b/scripts/kaiyuan2b-training/params/gpt_smoke/data.sh
@@ -0,0 +1,21 @@
+
+DATA_DIR=/ssd/yi/converted_data/megatron_phase1
+
+START=0
+END=50
+
+DATA_PATHS=""
+for idx in $(seq -f "%05g" $START $END); do
+    DATA_PATHS+=" 1 ${DATA_DIR}/phase1_part-${idx}_text_document"
+done
+
+DATA_ARGS="
+    --data-path ${DATA_PATHS}
+    --split 999,1,0
+    --tokenizer-type HuggingFaceTokenizer
+    --tokenizer-model /apps/yi/model_training/data/tokenizer
+"
+
+# --vocab-file /apps/yi/model_training/data/tokenizer/vocab.json
+# --merge-file /apps/yi/model_training/data/tokenizer/merges.txt
+# --vocab-size 151936
\ No newline at end of file
diff --git a/scripts/kaiyuan2b-training/params/gpt_smoke/hparams.sh b/scripts/kaiyuan2b-training/params/gpt_smoke/hparams.sh
new file mode 100644
index 0000000..800832e
--- /dev/null
+++ b/scripts/kaiyuan2b-training/params/gpt_smoke/hparams.sh
@@ -0,0 +1,13 @@
+HPARAM_ARGS="
+    --micro-batch-size 32
+    --global-batch-size 2048
+    --train-iters 15000
+    --eval-iters 10
+    --eval-interval 1000
+    --save-interval 1000
+    --log-interval 1
+    --lr 1e-3
+    --min-lr 1e-3
+    --lr-decay-style constant
+    --lr-warmup-iters 10
+"
\ No newline at end of file
diff --git a/scripts/kaiyuan2b-training/params/gpt_smoke/model.sh b/scripts/kaiyuan2b-training/params/gpt_smoke/model.sh
new file mode 100644
index 0000000..ba05bea
--- /dev/null
+++ b/scripts/kaiyuan2b-training/params/gpt_smoke/model.sh
@@ -0,0 +1,18 @@
+# downscaled qwen3 arch, simulate gpt2-level training
+MODEL_ARGS="
+    --seq-length 4096
+    --hidden-size 768
+    --ffn-hidden-size 3072
+    --num-layers 12
+    --num-attention-heads 8
+    --num-query-groups 4
+    --rotary-base 10000
+    --init-method-std 0.018
+    --group-query-attention
+    --max-position-embeddings 4096
+    --position-embedding-type rope
+    --swiglu
+    --disable-bias-linear
+    --normalization RMSNorm
+    --untie-embeddings-and-output-weights
+"
\ No newline at end of file
diff --git a/scripts/kaiyuan2b-training/params/optim_common.sh b/scripts/kaiyuan2b-training/params/optim_common.sh
new file mode 100644
index 0000000..6862da5
--- /dev/null
+++ b/scripts/kaiyuan2b-training/params/optim_common.sh
@@ -0,0 +1,10 @@
+# note: by default decoupled_weight_decay is True and adam optimizer acts as adamW
+
+OPTIM_ARGS="
+    --optimizer adam
+    --adam-beta1 0.9
+    --adam-beta2 0.95
+    --adam-eps 1e-8
+    --weight-decay 0.1
+    --clip-grad 1.0
+"
\ No newline at end of file
diff --git a/scripts/kaiyuan2b-training/params/qwen3_1p7b/data_phase1_smoke.sh b/scripts/kaiyuan2b-training/params/qwen3_1p7b/data_phase1_smoke.sh
new file mode 100644
index 0000000..9e9cbd7
--- /dev/null
+++ b/scripts/kaiyuan2b-training/params/qwen3_1p7b/data_phase1_smoke.sh
@@ -0,0 +1,21 @@
+
+DATA_DIR=/ssd/yi/converted_data/megatron_phase1
+
+START=0
+END=210
+
+DATA_PATHS=""
+for idx in $(seq -f "%05g" $START $END); do
+    DATA_PATHS+=" 1 ${DATA_DIR}/phase1_part-${idx}_text_document"
+done
+
+DATA_ARGS="
+    --data-path ${DATA_PATHS}
+    --split 999,1,0
+    --tokenizer-type HuggingFaceTokenizer
+    --tokenizer-model /apps/yi/model_training/data/tokenizer
+"
+
+# --vocab-file /apps/yi/model_training/data/tokenizer/vocab.json
+# --merge-file /apps/yi/model_training/data/tokenizer/merges.txt
+# --vocab-size 151936
\ No newline at end of file
diff --git a/scripts/kaiyuan2b-training/params/qwen3_1p7b/hparams.sh b/scripts/kaiyuan2b-training/params/qwen3_1p7b/hparams.sh
new file mode 100644
index 0000000..b5f6c0a
--- /dev/null
+++ b/scripts/kaiyuan2b-training/params/qwen3_1p7b/hparams.sh
@@ -0,0 +1,13 @@
+HPARAMS="
+    --micro-batch-size 16
+    --global-batch-size 2048
+    --train-iters 87000
+    --eval-iters 10
+    --eval-interval 1000
+    --save-interval 1000
+    --log-interval 1
+    --lr 5e-3
+    --min-lr 5e-3
+    --lr-decay-style constant
+    --lr-warmup-iters 10
+"
\ No newline at end of file
diff --git a/scripts/kaiyuan2b-training/params/qwen3_1p7b/model.sh b/scripts/kaiyuan2b-training/params/qwen3_1p7b/model.sh
new file mode 100644
index 0000000..d275f29
--- /dev/null
+++ b/scripts/kaiyuan2b-training/params/qwen3_1p7b/model.sh
@@ -0,0 +1,19 @@
+# note: official qwen3 training uses qk norm while megatron has no official support
+
+MODEL_ARGS="
+    --seq-length 4096
+    --hidden-size 2048
+    --ffn-hidden-size 6144
+    --num-layers 28
+    --num-attention-heads 16
+    --num-query-groups 8
+    --rotary-base 10000
+    --init-method-std 0.018
+    --group-query-attention
+    --max-position-embeddings 4096
+    --position-embedding-type rope
+    --swiglu
+    --disable-bias-linear
+    --normalization RMSNorm
+    --untie-embeddings-and-output-weights
+"
\ No newline at end of file
diff --git a/scripts/kaiyuan2b-training/training_smoke_gpt2.sh b/scripts/kaiyuan2b-training/training_smoke_gpt2.sh
new file mode 100644
index 0000000..ce3cafc
--- /dev/null
+++ b/scripts/kaiyuan2b-training/training_smoke_gpt2.sh
@@ -0,0 +1,66 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+MODE=${1:-smoke}
+TRAIN_NAME=${2:-smoke_gpt}
+
+MEGATRON_PATH=/apps/yi/model_training/Megatron-LM
+ARTIFACT_ROOT=/apps/yi/model_training/artifacts
+TB_DIR="${ARTIFACT_ROOT}/tb_logs/${TRAIN_NAME}"
+CKPT_DIR="${ARTIFACT_ROOT}/checkpoints/${TRAIN_NAME}"
+
+source params/optim_common.sh
+source params/gpt_smoke/model.sh
+source params/gpt_smoke/data.sh
+source params/gpt_smoke/hparams.sh
+
+RUN_ARGS=$HPARAM_ARGS
+
+LOGGING_ARGS="
+    --tensorboard-dir ${TB_DIR}
+    --tensorboard-log-interval 1
+    --log-interval 1
+    --log-timers-to-tensorboard
+    --log-validation-ppl-to-tensorboard
+    --log-memory-to-tensorboard
+    --log-world-size-to-tensorboard
+    --log-num-zeros-in-grad
+    --log-device-memory-used
+    --log-throughput
+    --log-params-norm
+"
+
+PRECISION_ARGS="--bf16"
+PARALLEL_ARGS="
+    --tensor-model-parallel-size 1
+    --pipeline-model-parallel-size 1
+"
+# PARALLEL_ARGS="
+#     --tensor-model-parallel-size 2
+#     --sequence-parallel
+# "
+
+mkdir -p "$CKPT_DIR" "$TB_DIR"
+
+DISTRIBUTED_ARGS="
+    --nproc_per_node 8
+    --nnodes 1
+    --node_rank 0
+    --master_addr localhost
+    --master_port 6000
+"
+
+torchrun $DISTRIBUTED_ARGS \
+    $MEGATRON_PATH/pretrain_gpt.py \
+    $MODEL_ARGS \
+    $OPTIM_ARGS \
+    $PRECISION_ARGS \
+    $PARALLEL_ARGS \
+    $DATA_ARGS \
+    $RUN_ARGS \
+    $LOGGING_ARGS \
+    --save "$CKPT_DIR" \
+    --load "$CKPT_DIR"
\ No newline at end of file
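For reference, the qwen3_1p7b MODEL_ARGS above work out to roughly 2B parameters, consistent with the kaiyuan2b naming. A back-of-envelope sketch; vocab_size=151936 is an assumption taken from the commented-out --vocab-size in the data files, and norm/rope parameters are ignored as negligible:

# param_count.py -- rough parameter count for the qwen3_1p7b config.
def gqa_transformer_params(hidden=2048, ffn=6144, layers=28,
                           heads=16, kv_groups=8, vocab=151936) -> int:
    head_dim = hidden // heads                   # 128
    attn = hidden * heads * head_dim             # Q projection
    attn += 2 * hidden * kv_groups * head_dim    # K and V projections (GQA)
    attn += heads * head_dim * hidden            # output projection
    mlp = 3 * hidden * ffn                       # SwiGLU: gate, up, down
    emb = 2 * vocab * hidden                     # untied input + output embeddings
    return layers * (attn + mlp) + emb

print(f"~{gqa_transformer_params() / 1e9:.2f}B parameters")  # ~2.03B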
diff --git a/scripts/kaiyuan2b-training/training_smoke_qwen3_1p7b.sh b/scripts/kaiyuan2b-training/training_smoke_qwen3_1p7b.sh
new file mode 100644
index 0000000..38bb903
--- /dev/null
+++ b/scripts/kaiyuan2b-training/training_smoke_qwen3_1p7b.sh
@@ -0,0 +1,83 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+MODE=${1:-qwen3_1p7b_smoke_yi}
+TRAIN_NAME=${2:-qwen3_1p7b_smoke_yi}
+
+MEGATRON_PATH=/apps/yi/model_training/Megatron-LM
+ARTIFACT_ROOT=/apps/yi/model_training/artifacts
+SCRIPT_DIR=/apps/yi/model_training/scripts/kaiyuan2b-training
+PARAMS_DIR="${SCRIPT_DIR}/params"
+TB_DIR="${ARTIFACT_ROOT}/tb_logs/${TRAIN_NAME}"
+CKPT_DIR="${ARTIFACT_ROOT}/checkpoints/${TRAIN_NAME}"
+
+source "${PARAMS_DIR}/optim_common.sh"
+source "${PARAMS_DIR}/qwen3_1p7b/model.sh"
+source "${PARAMS_DIR}/qwen3_1p7b/data_phase1_smoke.sh"
+source "${PARAMS_DIR}/qwen3_1p7b/hparams.sh"
+
+LOGGING_ARGS="
+    --tensorboard-dir ${TB_DIR}
+    --tensorboard-log-interval 1
+    --log-interval 1
+    --log-timers-to-tensorboard
+    --log-validation-ppl-to-tensorboard
+    --log-memory-to-tensorboard
+    --log-world-size-to-tensorboard
+    --log-num-zeros-in-grad
+    --log-device-memory-used
+    --log-throughput
+    --log-params-norm
+"
+
+PRECISION_ARGS="--bf16"
+PARALLEL_ARGS="
+    --tensor-model-parallel-size 1
+    --pipeline-model-parallel-size 1
+"
+# PARALLEL_ARGS="
+#     --tensor-model-parallel-size 2
+#     --sequence-parallel
+# "
+
+if [ "$MODE" = "qwen3_1p7b_smoke_yi" ]; then
+    source "${PARAMS_DIR}/qwen3_1p7b/hparams.sh"
+    RUN_ARGS="$HPARAMS"
+elif [ "$MODE" = "phase1" ]; then
+    source "${PARAMS_DIR}/phase1_full.sh"
+    RUN_ARGS="$PHASE_ARGS"
+else
+    echo "Unknown mode: $MODE"
+    exit 1
+fi
+
+mkdir -p "$CKPT_DIR" "$TB_DIR"
+
+DISTRIBUTED_ARGS="
+    --nproc_per_node 8
+    --nnodes 1
+    --node_rank 0
+    --master_addr localhost
+    --master_port 6000
+"
+
+torchrun $DISTRIBUTED_ARGS \
+    $MEGATRON_PATH/pretrain_gpt.py \
+    $MODEL_ARGS \
+    $OPTIM_ARGS \
+    $PRECISION_ARGS \
+    $PARALLEL_ARGS \
+    $DATA_ARGS \
+    $RUN_ARGS \
+    $LOGGING_ARGS \
+    --save "$CKPT_DIR" \
+    --load "$CKPT_DIR" \
+    --enable-cuda-graph \
+    --cuda-graph-warmup-steps 3 \
+    --transformer-impl transformer_engine \
+    --cross-entropy-loss-fusion \
+    --cross-entropy-fusion-impl te
+
diff --git a/scripts/toy_model_training.sh b/scripts/toy_model_training.sh
new file mode 100644
index 0000000..96f6770
--- /dev/null
+++ b/scripts/toy_model_training.sh
@@ -0,0 +1,30 @@
+torchrun --nproc_per_node=1 pretrain_gpt.py \
+    --num-layers 2 \
+    --hidden-size 256 \
+    --ffn-hidden-size 1024 \
+    --num-attention-heads 4 \
+    --seq-length 512 \
+    --max-position-embeddings 512 \
+    --micro-batch-size 1 \
+    --global-batch-size 8 \
+    --train-iters 200 \
+    --lr 1e-4 \
+    --min-lr 1e-5 \
+    --lr-decay-style cosine \
+    --lr-warmup-iters 2 \
+    --weight-decay 0.01 \
+    --clip-grad 1.0 \
+    --adam-beta1 0.9 \
+    --adam-beta2 0.95 \
+    --init-method-std 0.02 \
+    --tokenizer-type HuggingFaceTokenizer \
+    --tokenizer-model /apps/yi/kaiyuan_pretraining/tokenizer \
+    --data-path 1.0 /apps/yi/kaiyuan_pretraining/megatron_phase1/phase1_part-00000_text_document \
+    --split 949,50,1 \
+    --bf16 \
+    --save /apps/yi/checkpoints/tiny-test-full \
+    --save-interval 20 \
+    --tensorboard-dir /apps/yi/tb_logs/tiny-test-full \
+    --log-interval 1 \
+    --eval-interval 10 \
+    --eval-iters 20
\ No newline at end of file
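Once a smoke run has written TensorBoard logs, loss progression can be checked without the UI via TensorBoard's event accumulator. A minimal sketch; the API below is standard tensorboard, but Megatron's scalar tag names (e.g. variants of "lm loss") differ between versions, so the script lists all tags before filtering:

# check_loss.py -- read loss scalars from a Megatron TensorBoard log dir.
import sys
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

def main(tb_dir: str) -> None:
    acc = EventAccumulator(tb_dir)
    acc.Reload()
    tags = acc.Tags().get("scalars", [])
    print("scalar tags:", tags)
    for tag in tags:
        if "loss" in tag.lower():
            events = acc.Scalars(tag)
            print(f"{tag}: first={events[0].value:.4f} last={events[-1].value:.4f}")

if __name__ == "__main__":
    main(sys.argv[1])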