feat: optimize dataset conversion efficiency, add on-demand training start/stop scripts
This commit is contained in:
92
scripts/kaiyuan2b-training/start_training.sh
Executable file
92
scripts/kaiyuan2b-training/start_training.sh
Executable file
@@ -0,0 +1,92 @@
|
||||
#!/usr/bin/env bash
# Launch a training run in its own session (process group) and record its
# pid/pgid under ${RUN_STATE_DIR} so stop_training.sh can later terminate
# the whole process tree (torchrun + workers) with one signal.
#
# Usage:  bash start_training.sh <model> [mode] [train_name]
# State:  ${RUN_STATE_DIR}/<train_name>.pid and .env; log in ${LOG_DIR}.
set -euo pipefail

SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
ARTIFACT_ROOT=${ARTIFACT_ROOT:-/apps/yi/model_training/artifacts}
RUN_STATE_DIR="${ARTIFACT_ROOT}/run_state"
LOG_DIR="${ARTIFACT_ROOT}/logs"

usage() {
  cat <<'EOF'
Usage:
  bash start_training.sh <model> [mode] [train_name]

Models:
  gpt_smoke
  qwen3_1p7b

Examples:
  bash start_training.sh gpt_smoke smoke smoke_gpt
  bash start_training.sh qwen3_1p7b qwen3_1p7b_smoke_yi qwen3_1p7b_smoke_yi

Environment overrides:
  CHECKPOINT_KEEP_RECENT=3
  CHECKPOINT_CLEANUP_INTERVAL_SECONDS=300
  EXTRA_ARGS="--exit-duration-in-mins 120"
EOF
}

model=${1:-}
mode=${2:-}
train_name=${3:-}

if [ -z "$model" ] || [ "$model" = "-h" ] || [ "$model" = "--help" ]; then
  usage
  exit 0
fi

# Map the model alias to its training script and per-model defaults.
case "$model" in
  gpt_smoke)
    train_script="${SCRIPT_DIR}/training_smoke_gpt2.sh"
    mode=${mode:-smoke}
    train_name=${train_name:-smoke_gpt}
    ;;
  qwen3_1p7b)
    train_script="${SCRIPT_DIR}/training_smoke_qwen3_1p7b.sh"
    mode=${mode:-qwen3_1p7b_smoke_yi}
    train_name=${train_name:-qwen3_1p7b_smoke_yi}
    ;;
  *)
    echo "Unknown model: $model" >&2
    usage >&2
    exit 1
    ;;
esac

mkdir -p "$RUN_STATE_DIR" "$LOG_DIR"

pid_file="${RUN_STATE_DIR}/${train_name}.pid"
meta_file="${RUN_STATE_DIR}/${train_name}.env"
log_file="${LOG_DIR}/${train_name}.log"

# Refuse to double-start; remove state files left behind by a dead run so
# they cannot be mistaken for a live one later.
if [ -f "$pid_file" ]; then
  old_pid=$(cat "$pid_file")
  if [ -n "$old_pid" ] && kill -0 "$old_pid" 2>/dev/null; then
    echo "Training already appears to be running: train_name=${train_name}, pid=${old_pid}" >&2
    exit 1
  fi
  rm -f "$pid_file" "$meta_file"
fi

# Always inject --exit-signal-handler so SIGTERM triggers a graceful
# checkpoint-and-exit; append user EXTRA_ARGS only when non-empty so we do
# not pass a stray trailing space downstream.
combined_extra_args="--exit-signal-handler${EXTRA_ARGS:+ ${EXTRA_ARGS}}"

cd "$SCRIPT_DIR"
# setsid detaches the run into its own session/process group, so the whole
# tree can be signalled as "-pgid" by stop_training.sh.
EXTRA_ARGS="$combined_extra_args" setsid bash "$train_script" "$mode" "$train_name" > "$log_file" 2>&1 &

pid=$!
# pgid lookup is best-effort: the child may already have exited.
pgid=$(ps -o pgid= -p "$pid" | tr -d ' ' || true)
printf '%s\n' "$pid" > "$pid_file"
cat > "$meta_file" <<EOF
MODEL=${model}
MODE=${mode}
TRAIN_NAME=${train_name}
PID=${pid}
PGID=${pgid}
LOG_FILE=${log_file}
TRAIN_SCRIPT=${train_script}
CHECKPOINT_KEEP_RECENT=${CHECKPOINT_KEEP_RECENT:-3}
CHECKPOINT_CLEANUP_INTERVAL_SECONDS=${CHECKPOINT_CLEANUP_INTERVAL_SECONDS:-300}
EOF

echo "Started training: model=${model}, mode=${mode}, train_name=${train_name}, pid=${pid}, pgid=${pgid:-unknown}"
echo "Log: ${log_file}"
echo "Stop: bash ${SCRIPT_DIR}/stop_training.sh ${train_name}"
|
||||
68
scripts/kaiyuan2b-training/stop_training.sh
Executable file
68
scripts/kaiyuan2b-training/stop_training.sh
Executable file
@@ -0,0 +1,68 @@
|
||||
#!/usr/bin/env bash
# Gracefully stop a training run started by start_training.sh: send SIGTERM
# to the run's whole process group, then wait up to GRACE_SECONDS for it
# (including any in-flight checkpoint save) to exit on its own. SIGKILL is
# deliberately left to the operator so checkpoints are never corrupted.
set -euo pipefail

ARTIFACT_ROOT=${ARTIFACT_ROOT:-/apps/yi/model_training/artifacts}
RUN_STATE_DIR="${ARTIFACT_ROOT}/run_state"
GRACE_SECONDS=${GRACE_SECONDS:-300}

usage() {
  cat <<'EOF'
Usage:
  bash stop_training.sh <train_name>

Environment overrides:
  GRACE_SECONDS=300
EOF
}

train_name=${1:-}
if [ -z "$train_name" ] || [ "$train_name" = "-h" ] || [ "$train_name" = "--help" ]; then
  usage
  exit 0
fi

pid_file="${RUN_STATE_DIR}/${train_name}.pid"
meta_file="${RUN_STATE_DIR}/${train_name}.env"

if [ ! -f "$pid_file" ]; then
  echo "PID file not found: ${pid_file}" >&2
  exit 1
fi

pid=$(cat "$pid_file")
if [ -z "$pid" ] || ! kill -0 "$pid" 2>/dev/null; then
  echo "Training is not running for train_name=${train_name}; cleaning stale state."
  rm -f "$pid_file" "$meta_file"
  exit 0
fi

# Prefer the live pgid from ps; fall back to the value recorded at start
# time (first PGID= line only, in case the meta file is malformed).
pgid=$(ps -o pgid= -p "$pid" | tr -d ' ' || true)
if [ -z "$pgid" ] && [ -f "$meta_file" ]; then
  pgid=$(grep '^PGID=' "$meta_file" | head -n 1 | cut -d= -f2- || true)
fi

echo "Sending SIGTERM to training process group: train_name=${train_name}, pid=${pid}, pgid=${pgid:-unknown}"
if [ -n "$pgid" ]; then
  # A negative pid argument signals the whole group; fall back to the
  # single pid if the group is already gone.
  kill -TERM "-${pgid}" 2>/dev/null || kill -TERM "$pid" 2>/dev/null || true
else
  kill -TERM "$pid" 2>/dev/null || true
fi

# Poll until the process exits or the grace period elapses.
deadline=$((SECONDS + GRACE_SECONDS))
while kill -0 "$pid" 2>/dev/null; do
  if [ "$SECONDS" -ge "$deadline" ]; then
    echo "Training did not exit within ${GRACE_SECONDS}s." >&2
    echo "If checkpoint saving is still running, wait and inspect logs before forcing termination." >&2
    if [ -n "$pgid" ]; then
      echo "Force kill manually if needed: kill -KILL -${pgid}" >&2
    else
      echo "Force kill manually if needed: kill -KILL ${pid}" >&2
    fi
    exit 2
  fi
  sleep 5
done

rm -f "$pid_file" "$meta_file"
echo "Stopped training: train_name=${train_name}"
|
||||
@@ -11,6 +11,9 @@ MEGATRON_PATH=/apps/yi/model_training/Megatron-LM
|
||||
ARTIFACT_ROOT=/apps/yi/model_training/artifacts
|
||||
TB_DIR="${ARTIFACT_ROOT}/tb_logs/${TRAIN_NAME}"
|
||||
CKPT_DIR="${ARTIFACT_ROOT}/checkpoints/${TRAIN_NAME}"
|
||||
CHECKPOINT_KEEP_RECENT=${CHECKPOINT_KEEP_RECENT:-3}
|
||||
CHECKPOINT_CLEANUP_INTERVAL_SECONDS=${CHECKPOINT_CLEANUP_INTERVAL_SECONDS:-300}
|
||||
EXTRA_ARGS=${EXTRA_ARGS:-}
|
||||
|
||||
source params/optim_common.sh
|
||||
source params/gpt_smoke/model.sh
|
||||
@@ -45,6 +48,64 @@ PARALLEL_ARGS="
|
||||
|
||||
mkdir -p "$CKPT_DIR" "$TB_DIR"
|
||||
|
||||
# Delete the oldest iter_NNNNNNN checkpoint directories under $1, keeping
# the $2 most recent. The iteration named in latest_checkpointed_iteration.txt
# is never deleted, even when it falls inside the "old" window (so slightly
# more than $2 dirs may survive in that case).
cleanup_old_checkpoints_once() {
  local ckpt_dir=$1
  local keep=$2

  # No-op on a non-numeric/non-positive keep count or a missing directory.
  if ! [[ "$keep" =~ ^[0-9]+$ ]] || [ "$keep" -le 0 ] || [ ! -d "$ckpt_dir" ]; then
    return 0
  fi

  local latest=""
  if [ -f "${ckpt_dir}/latest_checkpointed_iteration.txt" ]; then
    # BUGFIX: `read` returns non-zero at EOF without a trailing newline even
    # though it has populated the variable, so `|| latest=""` used to wipe a
    # valid value and drop the latest-checkpoint protection. Keep whatever
    # was read; the numeric check below rejects garbage either way.
    read -r latest < "${ckpt_dir}/latest_checkpointed_iteration.txt" || true
    if [[ "$latest" =~ ^[0-9]+$ ]]; then
      latest=$(printf "iter_%07d" "$latest")
    else
      latest=""
    fi
  fi

  # Collect checkpoint dirs sorted by name; zero-padded names sort in
  # iteration order.
  local checkpoints=()
  while IFS= read -r path; do
    checkpoints+=("$path")
  done < <(find "$ckpt_dir" -maxdepth 1 -type d -name 'iter_[0-9][0-9][0-9][0-9][0-9][0-9][0-9]' -print | sort)

  local delete_count=$((${#checkpoints[@]} - keep))
  if [ "$delete_count" -le 0 ]; then
    return 0
  fi

  local i base
  for ((i = 0; i < delete_count; i++)); do
    base=$(basename "${checkpoints[$i]}")
    if [ "$base" = "$latest" ]; then
      continue
    fi
    echo "[checkpoint-cleanup] deleting old checkpoint: ${checkpoints[$i]}"
    rm -rf -- "${checkpoints[$i]}"
  done
}
|
||||
|
||||
# Periodically prune old checkpoints: every $3 seconds, keep only the $2
# most recent under $1. A non-numeric or non-positive interval disables
# the loop entirely (returns 0 immediately).
checkpoint_cleanup_loop() {
  local dir=$1
  local retain=$2
  local period=$3

  # Guard clause: bail out successfully when the interval is invalid.
  if ! [[ "$period" =~ ^[0-9]+$ && "$period" -gt 0 ]]; then
    return 0
  fi

  # Sleep first so a freshly started run is never pruned immediately.
  while :; do
    sleep "$period"
    cleanup_old_checkpoints_once "$dir" "$retain"
  done
}
|
||||
|
||||
checkpoint_cleanup_loop "$CKPT_DIR" "$CHECKPOINT_KEEP_RECENT" "$CHECKPOINT_CLEANUP_INTERVAL_SECONDS" &
|
||||
CHECKPOINT_CLEANUP_PID=$!
|
||||
trap 'kill "$CHECKPOINT_CLEANUP_PID" 2>/dev/null || true; cleanup_old_checkpoints_once "$CKPT_DIR" "$CHECKPOINT_KEEP_RECENT"' EXIT
|
||||
|
||||
DISTRIBUTED_ARGS="
|
||||
--nproc_per_node 8
|
||||
--nnodes 1
|
||||
@@ -63,4 +124,5 @@ torchrun $DISTRIBUTED_ARGS \
|
||||
$RUN_ARGS \
|
||||
$LOGGING_ARGS\
|
||||
--save "$CKPT_DIR" \
|
||||
--load "$CKPT_DIR" \
|
||||
--load "$CKPT_DIR" \
|
||||
$EXTRA_ARGS
|
||||
|
||||
@@ -13,6 +13,9 @@ SCRIPT_DIR=/apps/yi/model_training/scripts/kaiyuan2b-training
|
||||
PARAMS_DIR="${SCRIPT_DIR}/params"
|
||||
TB_DIR="${ARTIFACT_ROOT}/tb_logs/${TRAIN_NAME}"
|
||||
CKPT_DIR="${ARTIFACT_ROOT}/checkpoints/${TRAIN_NAME}"
|
||||
CHECKPOINT_KEEP_RECENT=${CHECKPOINT_KEEP_RECENT:-3}
|
||||
CHECKPOINT_CLEANUP_INTERVAL_SECONDS=${CHECKPOINT_CLEANUP_INTERVAL_SECONDS:-300}
|
||||
EXTRA_ARGS=${EXTRA_ARGS:-}
|
||||
|
||||
source "${PARAMS_DIR}/optim_common.sh"
|
||||
source "${PARAMS_DIR}/qwen3_1p7b/model.sh"
|
||||
@@ -56,6 +59,64 @@ fi
|
||||
|
||||
mkdir -p "$CKPT_DIR" "$TB_DIR"
|
||||
|
||||
# Delete the oldest iter_NNNNNNN checkpoint directories under $1, keeping
# the $2 most recent. The iteration named in latest_checkpointed_iteration.txt
# is never deleted, even when it falls inside the "old" window (so slightly
# more than $2 dirs may survive in that case).
cleanup_old_checkpoints_once() {
  local ckpt_dir=$1
  local keep=$2

  # No-op on a non-numeric/non-positive keep count or a missing directory.
  if ! [[ "$keep" =~ ^[0-9]+$ ]] || [ "$keep" -le 0 ] || [ ! -d "$ckpt_dir" ]; then
    return 0
  fi

  local latest=""
  if [ -f "${ckpt_dir}/latest_checkpointed_iteration.txt" ]; then
    # BUGFIX: `read` returns non-zero at EOF without a trailing newline even
    # though it has populated the variable, so `|| latest=""` used to wipe a
    # valid value and drop the latest-checkpoint protection. Keep whatever
    # was read; the numeric check below rejects garbage either way.
    read -r latest < "${ckpt_dir}/latest_checkpointed_iteration.txt" || true
    if [[ "$latest" =~ ^[0-9]+$ ]]; then
      latest=$(printf "iter_%07d" "$latest")
    else
      latest=""
    fi
  fi

  # Collect checkpoint dirs sorted by name; zero-padded names sort in
  # iteration order.
  local checkpoints=()
  while IFS= read -r path; do
    checkpoints+=("$path")
  done < <(find "$ckpt_dir" -maxdepth 1 -type d -name 'iter_[0-9][0-9][0-9][0-9][0-9][0-9][0-9]' -print | sort)

  local delete_count=$((${#checkpoints[@]} - keep))
  if [ "$delete_count" -le 0 ]; then
    return 0
  fi

  local i base
  for ((i = 0; i < delete_count; i++)); do
    base=$(basename "${checkpoints[$i]}")
    if [ "$base" = "$latest" ]; then
      continue
    fi
    echo "[checkpoint-cleanup] deleting old checkpoint: ${checkpoints[$i]}"
    rm -rf -- "${checkpoints[$i]}"
  done
}
|
||||
|
||||
# Periodically prune old checkpoints: every $3 seconds, keep only the $2
# most recent under $1. A non-numeric or non-positive interval disables
# the loop entirely (returns 0 immediately).
checkpoint_cleanup_loop() {
  local dir=$1
  local retain=$2
  local period=$3

  # Guard clause: bail out successfully when the interval is invalid.
  if ! [[ "$period" =~ ^[0-9]+$ && "$period" -gt 0 ]]; then
    return 0
  fi

  # Sleep first so a freshly started run is never pruned immediately.
  while :; do
    sleep "$period"
    cleanup_old_checkpoints_once "$dir" "$retain"
  done
}
|
||||
|
||||
checkpoint_cleanup_loop "$CKPT_DIR" "$CHECKPOINT_KEEP_RECENT" "$CHECKPOINT_CLEANUP_INTERVAL_SECONDS" &
|
||||
CHECKPOINT_CLEANUP_PID=$!
|
||||
trap 'kill "$CHECKPOINT_CLEANUP_PID" 2>/dev/null || true; cleanup_old_checkpoints_once "$CKPT_DIR" "$CHECKPOINT_KEEP_RECENT"' EXIT
|
||||
|
||||
DISTRIBUTED_ARGS="
|
||||
--nproc_per_node 8
|
||||
--nnodes 1
|
||||
@@ -79,5 +140,5 @@ torchrun $DISTRIBUTED_ARGS \
|
||||
--cuda-graph-warmup-steps 3 \
|
||||
--transformer-impl transformer_engine \
|
||||
--cross-entropy-loss-fusion \
|
||||
--cross-entropy-fusion-impl te
|
||||
|
||||
--cross-entropy-fusion-impl te \
|
||||
$EXTRA_ARGS
|
||||
|
||||
Reference in New Issue
Block a user