# Files
# pretrain_kaiyuan2b/scripts/kaiyuan2b-training/training_smoke_gpt2.sh
#
# 158 lines
# 3.7 KiB
# Bash
# NOTE(review): the six lines above are viewer residue from a file listing
# that was pasted in above the shebang; they are commented out so the shell
# does not try to execute them (a bare `Bash` line would start a new shell).
# For direct `./script` execution the `#!` line below must be line 1 of the
# file — strip this header entirely when saving.
#!/usr/bin/env bash
# Smoke-test launcher for Megatron-LM GPT pretraining.
#
# Usage: training_smoke_gpt2.sh [MODE] [TRAIN_NAME]
#   MODE       - run mode label (default: smoke). NOTE(review): MODE is not
#                referenced again in this script — confirm it is consumed by
#                the sourced param files, otherwise it is dead.
#   TRAIN_NAME - run name; namespaces the tensorboard and checkpoint dirs.
set -euo pipefail
# Expose all 8 local GPUs. CUDA_DEVICE_MAX_CONNECTIONS=1 serializes each
# device's kernel queue (presumably required by Megatron-LM's communication/
# compute overlap — confirm against the Megatron version in use).
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export CUDA_DEVICE_MAX_CONNECTIONS=1
MODE=${1:-smoke}
TRAIN_NAME=${2:-smoke_gpt}
# Fixed install/output locations on the training host.
MEGATRON_PATH=/apps/yi/model_training/Megatron-LM
ARTIFACT_ROOT=/apps/yi/model_training/artifacts
TB_DIR="${ARTIFACT_ROOT}/tb_logs/${TRAIN_NAME}"
CKPT_DIR="${ARTIFACT_ROOT}/checkpoints/${TRAIN_NAME}"
# Retention policy for the background checkpoint pruner: keep the N newest
# iter_* dirs, sweep every M seconds (non-positive interval disables it).
CHECKPOINT_KEEP_RECENT=${CHECKPOINT_KEEP_RECENT:-3}
CHECKPOINT_CLEANUP_INTERVAL_SECONDS=${CHECKPOINT_CLEANUP_INTERVAL_SECONDS:-300}
# Extra CLI flags appended verbatim (word-split) to the torchrun command.
EXTRA_ARGS=${EXTRA_ARGS:-}
# torchrun topology; defaults give a single-node, 8-rank run.
NPROC_PER_NODE=${NPROC_PER_NODE:-8}
NNODES=${NNODES:-1}
NODE_RANK=${NODE_RANK:-0}
MASTER_ADDR=${MASTER_ADDR:-localhost}
MASTER_PORT=${MASTER_PORT:-6000}
# ZeRO-style sharding stage: 0 = off, 1 = optimizer state, 2 = +gradients.
ZERO_STAGE=${ZERO_STAGE:-0}
# Relative paths: the script assumes it is run from its own directory —
# confirm, or cd "$(dirname "$0")" first. These files are expected to define
# OPTIM_ARGS, MODEL_ARGS, DATA_ARGS, HPARAM_ARGS respectively (inferred from
# the variables consumed below — verify).
source params/optim_common.sh
source params/gpt_smoke/model.sh
source params/gpt_smoke/data.sh
source params/gpt_smoke/hparams.sh
RUN_ARGS=$HPARAM_ARGS
# Tensorboard + stdout logging flags. Everything is logged every step
# (interval 1), appropriate for a short smoke run but noisy for real runs.
LOGGING_ARGS="
--tensorboard-dir ${TB_DIR}
--tensorboard-log-interval 1
--log-interval 1
--log-timers-to-tensorboard
--log-validation-ppl-to-tensorboard
--log-memory-to-tensorboard
--log-world-size-to-tensorboard
--log-num-zeros-in-grad
--log-device-memory-used
--log-throughput
--log-params-norm
"
# Train in bfloat16.
PRECISION_ARGS="--bf16"
# No model parallelism (TP=1, PP=1): all 8 ranks are pure data parallel.
PARALLEL_ARGS="
--tensor-model-parallel-size 1
--pipeline-model-parallel-size 1
"
# Alternative kept for quick switching: 2-way tensor parallel with
# sequence parallelism enabled.
# PARALLEL_ARGS="
# --tensor-model-parallel-size 2
# --sequence-parallel
# "
# Translate the requested ZeRO-style stage into Megatron distributed
# optimizer flags. Stage 0 leaves sharding off; stage 1 shards optimizer
# state across data-parallel ranks; stage 2 additionally shards gradients.
# Any other value is a configuration error and aborts the launch.
if [ "$ZERO_STAGE" = "0" ]; then
  ZERO_ARGS=""
elif [ "$ZERO_STAGE" = "1" ]; then
  ZERO_ARGS="
--use-distributed-optimizer
--data-parallel-sharding-strategy optim
"
elif [ "$ZERO_STAGE" = "2" ]; then
  ZERO_ARGS="
--use-distributed-optimizer
--data-parallel-sharding-strategy optim_grads
"
else
  echo "Unsupported ZERO_STAGE=${ZERO_STAGE}; expected 0, 1, or 2" >&2
  exit 1
fi
# Create the run's artifact directories before launch.
mkdir -p "$CKPT_DIR" "$TB_DIR"
# Prune the oldest iter_* checkpoint directories under a run's checkpoint
# root, retaining the `keep` most recent iterations. The iteration named by
# latest_checkpointed_iteration.txt is never deleted, even if it falls in
# the oldest slice.
#
# Arguments:
#   $1 - checkpoint directory to prune
#   $2 - number of recent checkpoints to retain (positive integer)
# Returns: 0 always; invalid input or a missing directory is a no-op.
cleanup_old_checkpoints_once() {
  local dir=$1
  local retain=$2

  # Guard: no-op unless retain is a positive integer and dir exists.
  if ! [[ "$retain" =~ ^[0-9]+$ ]] || (( retain <= 0 )) || [[ ! -d "$dir" ]]; then
    return 0
  fi

  # Resolve the marker file to a protected "iter_NNNNNNN" name, if valid.
  local protected=""
  local marker="${dir}/latest_checkpointed_iteration.txt"
  if [[ -f "$marker" ]]; then
    local raw=""
    read -r raw < "$marker" || raw=""
    if [[ "$raw" =~ ^[0-9]+$ ]]; then
      protected=$(printf "iter_%07d" "$raw")
    fi
  fi

  # Collect iteration dirs oldest-first; the zero-padded names make lexical
  # sort order equal chronological order.
  local iter_dirs=() entry
  while IFS= read -r entry; do
    iter_dirs+=("$entry")
  done < <(find "$dir" -maxdepth 1 -type d -name 'iter_[0-9][0-9][0-9][0-9][0-9][0-9][0-9]' -print | sort)

  local excess=$(( ${#iter_dirs[@]} - retain ))
  if (( excess <= 0 )); then
    return 0
  fi

  local idx name
  for (( idx = 0; idx < excess; idx++ )); do
    name=$(basename "${iter_dirs[$idx]}")
    if [[ "$name" == "$protected" ]]; then
      continue
    fi
    echo "[checkpoint-cleanup] deleting old checkpoint: ${iter_dirs[$idx]}"
    rm -rf -- "${iter_dirs[$idx]}"
  done
}
# Periodically run cleanup_old_checkpoints_once against a checkpoint
# directory. Loops forever once started, so it is meant to be launched as a
# background job. A non-numeric or non-positive interval disables pruning
# entirely (immediate return 0).
#
# Arguments:
#   $1 - checkpoint directory to prune
#   $2 - number of recent checkpoints to retain
#   $3 - seconds to sleep between pruning passes
checkpoint_cleanup_loop() {
  local dir=$1 retain=$2 period=$3
  # Bail out when the interval does not request periodic cleanup.
  if ! [[ "$period" =~ ^[0-9]+$ ]] || (( period <= 0 )); then
    return 0
  fi
  while :; do
    sleep "$period"
    cleanup_old_checkpoints_once "$dir" "$retain"
  done
}
# Start the periodic checkpoint pruner as a background job and remember its
# PID so the trap below can stop it.
checkpoint_cleanup_loop "$CKPT_DIR" "$CHECKPOINT_KEEP_RECENT" "$CHECKPOINT_CLEANUP_INTERVAL_SECONDS" &
CHECKPOINT_CLEANUP_PID=$!
# On any exit (success, failure, or set -e abort): kill the pruner — ignoring
# failure if it already exited — then run one final pruning pass so the
# retention limit holds after the last checkpoint save.
# NOTE(review): a sleep child forked by the loop may survive the kill until
# its current interval elapses — harmless, but confirm that is acceptable.
trap 'kill "$CHECKPOINT_CLEANUP_PID" 2>/dev/null || true; cleanup_old_checkpoints_once "$CKPT_DIR" "$CHECKPOINT_KEEP_RECENT"' EXIT
# torchrun launcher topology (single- or multi-node, overridable via env).
DISTRIBUTED_ARGS="
--nproc_per_node ${NPROC_PER_NODE}
--nnodes ${NNODES}
--node_rank ${NODE_RANK}
--master_addr ${MASTER_ADDR}
--master_port ${MASTER_PORT}
"
# Launch Megatron-LM GPT pretraining. The *_ARGS variables are deliberately
# left unquoted so the shell word-splits them into individual CLI flags.
# FIX: the original wrote `$LOGGING_ARGS\` with no space before the
# continuation backslash, gluing the literal `--save` directly onto the
# expansion; it only parsed correctly because LOGGING_ARGS happens to end in
# a newline. Every continuation now has an explicit space before `\`.
torchrun $DISTRIBUTED_ARGS \
  $MEGATRON_PATH/pretrain_gpt.py \
  $MODEL_ARGS \
  $OPTIM_ARGS \
  $ZERO_ARGS \
  $PRECISION_ARGS \
  $PARALLEL_ARGS \
  $DATA_ARGS \
  $RUN_ARGS \
  $LOGGING_ARGS \
  --save "$CKPT_DIR" \
  --load "$CKPT_DIR" \
  $EXTRA_ARGS