#!/usr/bin/env bash
# Launcher for Qwen3-1.7B Megatron-LM pretraining runs.
# Fail fast: abort on command errors, unset variables, and pipeline failures.
set -euo pipefail

# Expose all eight local GPUs to this job.
export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
# NOTE(review): Megatron-LM typically requires a single kernel queue per
# device for correct comm/compute overlap — confirm against the Megatron docs.
export CUDA_DEVICE_MAX_CONNECTIONS="1"
# --- Run identity (positional overrides) -------------------------------------
MODE="${1:-qwen3_1p7b_smoke_yi}"        # selects which parameter preset to load
TRAIN_NAME="${2:-qwen3_1p7b_smoke_yi}"  # names the tensorboard/checkpoint dirs

# --- Fixed installation paths ------------------------------------------------
MEGATRON_PATH=/apps/yi/model_training/Megatron-LM
ARTIFACT_ROOT=/apps/yi/model_training/artifacts
SCRIPT_DIR=/apps/yi/model_training/scripts/kaiyuan2b-training
PARAMS_DIR="${SCRIPT_DIR}/params"

# --- Per-run output locations ------------------------------------------------
TB_DIR="${ARTIFACT_ROOT}/tb_logs/${TRAIN_NAME}"
CKPT_DIR="${ARTIFACT_ROOT}/checkpoints/${TRAIN_NAME}"

# --- Checkpoint retention (overridable via environment) ----------------------
CHECKPOINT_KEEP_RECENT="${CHECKPOINT_KEEP_RECENT:-3}"
CHECKPOINT_CLEANUP_INTERVAL_SECONDS="${CHECKPOINT_CLEANUP_INTERVAL_SECONDS:-300}"

# --- Launch topology and extras (overridable via environment) ----------------
EXTRA_ARGS="${EXTRA_ARGS:-}"            # appended verbatim to the torchrun call
NPROC_PER_NODE="${NPROC_PER_NODE:-8}"
NNODES="${NNODES:-1}"
NODE_RANK="${NODE_RANK:-0}"
MASTER_ADDR="${MASTER_ADDR:-localhost}"
MASTER_PORT="${MASTER_PORT:-6000}"
ZERO_STAGE="${ZERO_STAGE:-0}"           # 0, 1, or 2 — mapped to ZERO_ARGS below
# Load shared optimizer flags plus the model / data / hyper-parameter presets.
# Order matters: later files may override variables set by earlier ones.
source "${PARAMS_DIR}/optim_common.sh"
source "${PARAMS_DIR}/qwen3_1p7b/model.sh"
source "${PARAMS_DIR}/qwen3_1p7b/data_phase1_smoke.sh"
# NOTE(review): hparams.sh is sourced again inside the MODE branch below, so
# this early source looks redundant for the smoke mode — confirm whether the
# phase1 branch relies on it before removing.
source "${PARAMS_DIR}/qwen3_1p7b/hparams.sh"
# TensorBoard / console logging flags forwarded to pretrain_gpt.py.
# Assembled as an array for readability, then flattened into a single
# whitespace-separated string: it is expanded unquoted at the torchrun call
# site so each flag word-splits into its own argument.
logging_flags=(
  --tensorboard-dir "${TB_DIR}"
  --tensorboard-log-interval 1
  --log-interval 1
  --log-timers-to-tensorboard
  --log-validation-ppl-to-tensorboard
  --log-memory-to-tensorboard
  --log-world-size-to-tensorboard
  --log-num-zeros-in-grad
  --log-device-memory-used
  --log-throughput
  --log-params-norm
)
LOGGING_ARGS="${logging_flags[*]}"

# Train in bfloat16.
PRECISION_ARGS="--bf16"
# Model-parallel layout: one model replica per GPU (TP=1, PP=1); parallelism
# across ranks is pure data parallelism.
PARALLEL_ARGS="
  --tensor-model-parallel-size 1
  --pipeline-model-parallel-size 1
"

# Alternative layout kept for reference: 2-way tensor parallelism with
# sequence parallelism enabled.
# PARALLEL_ARGS="
#   --tensor-model-parallel-size 2
#   --sequence-parallel
# "
# Resolve MODE into the run-specific argument bundle (RUN_ARGS).
case "$MODE" in
  qwen3_1p7b_smoke_yi)
    # Smoke-test preset; hparams.sh is expected to define HPARAMS.
    source "${PARAMS_DIR}/qwen3_1p7b/hparams.sh"
    RUN_ARGS="$HPARAMS"
    ;;
  phase1)
    # Full phase-1 preset; phase1_full.sh is expected to define PHASE_ARGS.
    source "${PARAMS_DIR}/phase1_full.sh"
    RUN_ARGS="$PHASE_ARGS"
    ;;
  *)
    # Fix: report the error on stderr, consistent with the ZERO_STAGE check.
    echo "Unknown mode: $MODE" >&2
    exit 1
    ;;
esac
# Map ZERO_STAGE (ZeRO-style optimizer sharding level) onto Megatron's
# distributed-optimizer flags. Like the other *_ARGS strings, ZERO_ARGS is
# expanded unquoted at the call site so it word-splits into flags.
if [ "$ZERO_STAGE" = "0" ]; then
  # No sharding: plain data-parallel optimizer.
  ZERO_ARGS=""
elif [ "$ZERO_STAGE" = "1" ]; then
  # Shard optimizer state across data-parallel ranks.
  ZERO_ARGS="
    --use-distributed-optimizer
    --data-parallel-sharding-strategy optim
  "
elif [ "$ZERO_STAGE" = "2" ]; then
  # Shard optimizer state and gradients.
  ZERO_ARGS="
    --use-distributed-optimizer
    --data-parallel-sharding-strategy optim_grads
  "
else
  echo "Unsupported ZERO_STAGE=${ZERO_STAGE}; expected 0, 1, or 2" >&2
  exit 1
fi
# Ensure output directories exist before the trainer and cleanup loop start.
mkdir -p "$CKPT_DIR" "$TB_DIR"
#######################################
# Delete the oldest iter_NNNNNNN checkpoint directories, retaining the newest
# `keep` entries plus whichever one latest_checkpointed_iteration.txt names.
# Arguments:
#   $1 - checkpoint directory to prune
#   $2 - number of recent checkpoints to retain (positive integer)
# Outputs: one log line per deleted checkpoint on stdout
# Returns: 0 (silently no-ops on bad arguments or a missing directory)
#######################################
cleanup_old_checkpoints_once() {
  local dir=$1
  local retain=$2
  local marker="${dir}/latest_checkpointed_iteration.txt"

  # Bail out quietly unless retain is a positive integer and dir exists.
  [[ "$retain" =~ ^[0-9]+$ ]] || return 0
  (( retain > 0 )) || return 0
  [[ -d "$dir" ]] || return 0

  # Resolve the iteration the trainer considers current; never delete it.
  local protected=""
  if [[ -f "$marker" ]]; then
    read -r protected < "$marker" || protected=""
    if [[ "$protected" =~ ^[0-9]+$ ]]; then
      protected=$(printf "iter_%07d" "$protected")
    else
      protected=""
    fi
  fi

  # Gather iter_NNNNNNN subdirectories in ascending (oldest-first) order.
  local found=() entry
  while IFS= read -r entry; do
    found+=("$entry")
  done < <(find "$dir" -maxdepth 1 -type d -name 'iter_[0-9][0-9][0-9][0-9][0-9][0-9][0-9]' -print | sort)

  # Nothing to do while we hold at most `retain` checkpoints.
  local excess=$((${#found[@]} - retain))
  (( excess > 0 )) || return 0

  local idx name
  for ((idx = 0; idx < excess; idx++)); do
    name=$(basename "${found[$idx]}")
    if [[ "$name" == "$protected" ]]; then
      continue
    fi
    echo "[checkpoint-cleanup] deleting old checkpoint: ${found[$idx]}"
    rm -rf -- "${found[$idx]}"
  done
}
#######################################
# Periodically prune old checkpoints until killed.
# Arguments:
#   $1 - checkpoint directory
#   $2 - number of recent checkpoints to retain
#   $3 - seconds between passes (0 or non-numeric disables the loop)
# Returns: 0 immediately when the interval is invalid; otherwise never returns.
#######################################
checkpoint_cleanup_loop() {
  local dir=$1
  local retain=$2
  local every=$3

  # A non-positive or malformed interval disables background cleanup.
  case "$every" in
    ''|*[!0-9]*) return 0 ;;
  esac
  (( every > 0 )) || return 0

  # Sleep first so a freshly started run is never pruned immediately.
  while :; do
    sleep "$every"
    cleanup_old_checkpoints_once "$dir" "$retain"
  done
}
# Start the retention loop in the background and remember its PID.
checkpoint_cleanup_loop "$CKPT_DIR" "$CHECKPOINT_KEEP_RECENT" "$CHECKPOINT_CLEANUP_INTERVAL_SECONDS" &
CHECKPOINT_CLEANUP_PID=$!
# On any exit path (success, error under set -e, or signal-driven EXIT):
# stop the background loop, then run one final synchronous cleanup pass so
# the retention policy holds even if the loop never fired.
trap 'kill "$CHECKPOINT_CLEANUP_PID" 2>/dev/null || true; cleanup_old_checkpoints_once "$CKPT_DIR" "$CHECKPOINT_KEEP_RECENT"' EXIT
# torchrun rendezvous/topology flags. Built as an array, then flattened into
# a string that is expanded unquoted below so each flag word-splits into its
# own argument.
launch_flags=(
  --nproc_per_node "${NPROC_PER_NODE}"
  --nnodes "${NNODES}"
  --node_rank "${NODE_RANK}"
  --master_addr "${MASTER_ADDR}"
  --master_port "${MASTER_PORT}"
)
DISTRIBUTED_ARGS="${launch_flags[*]}"
# Launch pretraining. The *_ARGS strings are intentionally expanded unquoted
# so they word-split into individual flags; quoted paths stay single args.
# Fix: the original wrote `$LOGGING_ARGS\` with no space before the line
# continuation — it only worked because the next line began with spaces.
torchrun $DISTRIBUTED_ARGS \
  $MEGATRON_PATH/pretrain_gpt.py \
  $MODEL_ARGS \
  $OPTIM_ARGS \
  $ZERO_ARGS \
  $PRECISION_ARGS \
  $PARALLEL_ARGS \
  $DATA_ARGS \
  $RUN_ARGS \
  $LOGGING_ARGS \
  --save "$CKPT_DIR" \
  --load "$CKPT_DIR" \
  --enable-cuda-graph \
  --cuda-graph-warmup-steps 3 \
  --transformer-impl transformer_engine \
  --cross-entropy-loss-fusion \
  --cross-entropy-fusion-impl te \
  $EXTRA_ARGS