#!/usr/bin/env bash
set -euo pipefail

# Expose all 8 local GPUs. Megatron-LM expects CUDA_DEVICE_MAX_CONNECTIONS=1
# when overlapping communication with computation.
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export CUDA_DEVICE_MAX_CONNECTIONS=1

# Positional args: run mode and run name (the latter keys the TB/ckpt paths).
MODE=${1:-qwen3_1p7b_smoke_yi}
TRAIN_NAME=${2:-qwen3_1p7b_smoke_yi}

MEGATRON_PATH=/ssd1/yi/pretrain_kaiyuan2b/Megatron-LM
ARTIFACT_ROOT=/ssd1/yi/artifacts
SCRIPT_DIR=/ssd1/yi/pretrain_kaiyuan2b/scripts/kaiyuan2b-training
PARAMS_DIR="${SCRIPT_DIR}/params"
TB_DIR="${ARTIFACT_ROOT}/tb_logs/${TRAIN_NAME}"
CKPT_DIR="${ARTIFACT_ROOT}/checkpoints/${TRAIN_NAME}"

# Checkpoint retention: keep the N most recent iter_* dirs, sweeping every
# CHECKPOINT_CLEANUP_INTERVAL_SECONDS. Both are overridable via env.
CHECKPOINT_KEEP_RECENT=${CHECKPOINT_KEEP_RECENT:-3}
CHECKPOINT_CLEANUP_INTERVAL_SECONDS=${CHECKPOINT_CLEANUP_INTERVAL_SECONDS:-300}
EXTRA_ARGS=${EXTRA_ARGS:-}

# torchrun topology; defaults assume a single 8-GPU node.
NPROC_PER_NODE=${NPROC_PER_NODE:-8}
NNODES=${NNODES:-1}
NODE_RANK=${NODE_RANK:-0}
MASTER_ADDR=${MASTER_ADDR:-localhost}
MASTER_PORT=${MASTER_PORT:-6000}
ZERO_STAGE=${ZERO_STAGE:-0}

# Shared parameter fragments (define MODEL_ARGS, OPTIM_ARGS, DATA_ARGS,
# HPARAMS as whitespace-separated flag strings).
source "${PARAMS_DIR}/optim_common.sh"
source "${PARAMS_DIR}/qwen3_1p7b/model.sh"
source "${PARAMS_DIR}/qwen3_1p7b/data_phase1_smoke.sh"
source "${PARAMS_DIR}/qwen3_1p7b/hparams.sh"

LOGGING_ARGS="
    --tensorboard-dir ${TB_DIR}
    --tensorboard-log-interval 1
    --log-interval 1
    --log-timers-to-tensorboard
    --log-validation-ppl-to-tensorboard
    --log-memory-to-tensorboard
    --log-world-size-to-tensorboard
    --log-num-zeros-in-grad
    --log-device-memory-used
    --log-throughput
    --log-params-norm
"

PRECISION_ARGS="--bf16"

PARALLEL_ARGS="
    --tensor-model-parallel-size 1
    --pipeline-model-parallel-size 1
"
# Alternative: TP=2 with sequence parallelism.
# PARALLEL_ARGS="
#     --tensor-model-parallel-size 2
#     --sequence-parallel
# "

if [ "$MODE" = "qwen3_1p7b_smoke_yi" ]; then
    # hparams.sh is already sourced above; it defines HPARAMS.
    RUN_ARGS="$HPARAMS"
elif [ "$MODE" = "phase1" ]; then
    source "${PARAMS_DIR}/phase1_full.sh"
    RUN_ARGS="$PHASE_ARGS"
else
    echo "Unknown mode: $MODE" >&2
    exit 1
fi

# Map a ZeRO-style stage number onto Megatron-LM's distributed-optimizer
# flags: stage 1 shards optimizer state across DP ranks, stage 2 also
# shards gradients.
case "$ZERO_STAGE" in
    0)
        ZERO_ARGS=""
        ;;
    1)
        ZERO_ARGS="
            --use-distributed-optimizer
            --data-parallel-sharding-strategy optim
        "
        ;;
    2)
        ZERO_ARGS="
            --use-distributed-optimizer
            --data-parallel-sharding-strategy optim_grads
        "
        ;;
    *)
        echo "Unsupported ZERO_STAGE=${ZERO_STAGE}; expected 0, 1, or 2" >&2
        exit 1
        ;;
esac

mkdir -p "$CKPT_DIR" "$TB_DIR"

# Delete the oldest iter_* checkpoint dirs beyond the retention window,
# never touching the one named in latest_checkpointed_iteration.txt.
cleanup_old_checkpoints_once() {
    local ckpt_dir=$1
    local keep=$2
    if ! [[ "$keep" =~ ^[0-9]+$ ]] || [ "$keep" -le 0 ] || [ ! -d "$ckpt_dir" ]; then
        return 0
    fi

    local latest=""
    if [ -f "${ckpt_dir}/latest_checkpointed_iteration.txt" ]; then
        read -r latest < "${ckpt_dir}/latest_checkpointed_iteration.txt" || latest=""
        if [[ "$latest" =~ ^[0-9]+$ ]]; then
            latest=$(printf "iter_%07d" "$latest")
        else
            latest=""
        fi
    fi

    # Zero-padded names make lexicographic sort equal to numeric sort.
    local checkpoints=()
    while IFS= read -r path; do
        checkpoints+=("$path")
    done < <(find "$ckpt_dir" -maxdepth 1 -type d -name 'iter_[0-9][0-9][0-9][0-9][0-9][0-9][0-9]' -print | sort)

    local delete_count=$((${#checkpoints[@]} - keep))
    if [ "$delete_count" -le 0 ]; then
        return 0
    fi

    local i base
    for ((i = 0; i < delete_count; i++)); do
        base=$(basename "${checkpoints[$i]}")
        if [ "$base" = "$latest" ]; then
            continue
        fi
        echo "[checkpoint-cleanup] deleting old checkpoint: ${checkpoints[$i]}"
        rm -rf -- "${checkpoints[$i]}"
    done
}

# Periodically run the cleanup in the background while training runs.
checkpoint_cleanup_loop() {
    local ckpt_dir=$1
    local keep=$2
    local interval=$3
    if ! [[ "$interval" =~ ^[0-9]+$ ]] || [ "$interval" -le 0 ]; then
        return 0
    fi
    while true; do
        sleep "$interval"
        cleanup_old_checkpoints_once "$ckpt_dir" "$keep"
    done
}

checkpoint_cleanup_loop "$CKPT_DIR" "$CHECKPOINT_KEEP_RECENT" "$CHECKPOINT_CLEANUP_INTERVAL_SECONDS" &
CHECKPOINT_CLEANUP_PID=$!
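# A minimal sanity check for the retention logic above, assuming the iter_*
# layout Megatron-LM writes under --save (the /tmp paths are hypothetical;
# run in a shell that has sourced this script's functions):
#
#   mkdir -p /tmp/ckpt_test/iter_000100{0..3}
#   echo 1003 > /tmp/ckpt_test/latest_checkpointed_iteration.txt
#   cleanup_old_checkpoints_once /tmp/ckpt_test 2
#   # expected: iter_0001000 and iter_0001001 deleted, the two newest kept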
# On exit (clean or not), stop the background sweeper and do one final sweep.
trap 'kill "$CHECKPOINT_CLEANUP_PID" 2>/dev/null || true; cleanup_old_checkpoints_once "$CKPT_DIR" "$CHECKPOINT_KEEP_RECENT"' EXIT

DISTRIBUTED_ARGS="
    --nproc_per_node ${NPROC_PER_NODE}
    --nnodes ${NNODES}
    --node_rank ${NODE_RANK}
    --master_addr ${MASTER_ADDR}
    --master_port ${MASTER_PORT}
"

# --save and --load point at the same directory, so a restarted run resumes
# from its own latest checkpoint. The *_ARGS variables are intentionally
# unquoted so they word-split into individual flags.
torchrun $DISTRIBUTED_ARGS \
    $MEGATRON_PATH/pretrain_gpt.py \
    $MODEL_ARGS \
    $OPTIM_ARGS \
    $ZERO_ARGS \
    $PRECISION_ARGS \
    $PARALLEL_ARGS \
    $DATA_ARGS \
    $RUN_ARGS \
    $LOGGING_ARGS \
    --save "$CKPT_DIR" \
    --load "$CKPT_DIR" \
    --enable-cuda-graph \
    --cuda-graph-warmup-steps 3 \
    --transformer-impl transformer_engine \
    --cross-entropy-loss-fusion \
    --cross-entropy-fusion-impl te \
    $EXTRA_ARGS
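# Example invocations (the script name and run names are illustrative; the
# params/ fragments sourced above must define MODEL_ARGS, OPTIM_ARGS,
# DATA_ARGS, and HPARAMS / PHASE_ARGS for these to work):
#
#   # Single-node 8-GPU smoke test with all defaults:
#   bash pretrain.sh
#
#   # Phase-1 run with ZeRO-1-style optimizer-state sharding:
#   ZERO_STAGE=1 bash pretrain.sh phase1 qwen3_1p7b_phase1_run1
#
#   # Node 0 of a two-node run (repeat on node 1 with NODE_RANK=1):
#   NNODES=2 NODE_RANK=0 MASTER_ADDR=10.0.0.1 MASTER_PORT=6000 \
#       bash pretrain.sh phase1 qwen3_1p7b_phase1_run1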