feat: optimize dataset conversion efficiency, add on-demand training start/stop script

This commit is contained in:
2026-05-06 22:32:18 +08:00
parent 056df3b6ca
commit 0008288964
6 changed files with 607 additions and 114 deletions

View File

@@ -13,6 +13,9 @@ SCRIPT_DIR=/apps/yi/model_training/scripts/kaiyuan2b-training
# Directory containing the parameter files sourced below.
PARAMS_DIR="${SCRIPT_DIR}/params"
# Per-run output locations. ARTIFACT_ROOT and TRAIN_NAME are not defined in
# this excerpt -- presumably set earlier in the script; confirm.
TB_DIR="${ARTIFACT_ROOT}/tb_logs/${TRAIN_NAME}"
CKPT_DIR="${ARTIFACT_ROOT}/checkpoints/${TRAIN_NAME}"
# Checkpoint retention settings, overridable from the environment:
#   CHECKPOINT_KEEP_RECENT              - number of newest iter_* checkpoint
#                                         directories to keep (default 3)
#   CHECKPOINT_CLEANUP_INTERVAL_SECONDS - seconds the background cleanup loop
#                                         sleeps between passes (default 300)
CHECKPOINT_KEEP_RECENT=${CHECKPOINT_KEEP_RECENT:-3}
CHECKPOINT_CLEANUP_INTERVAL_SECONDS=${CHECKPOINT_CLEANUP_INTERVAL_SECONDS:-300}
# Extra arguments appended to the end of the torchrun command line; empty by
# default. It is expanded unquoted there, so it is split on whitespace.
EXTRA_ARGS=${EXTRA_ARGS:-}
# Load settings from the params directory (judging by the file names: common
# optimizer settings and the qwen3_1p7b model settings -- contents not shown).
source "${PARAMS_DIR}/optim_common.sh"
source "${PARAMS_DIR}/qwen3_1p7b/model.sh"
@@ -56,6 +59,64 @@ fi
# Create the checkpoint and tb_logs output directories if they do not exist yet.
mkdir -p "$CKPT_DIR" "$TB_DIR"
#######################################
# Delete the oldest iter_NNNNNNN checkpoint directories directly under a
# directory, keeping the newest <keep> of them.
#
# The checkpoint named by latest_checkpointed_iteration.txt is never deleted,
# even when it falls among the oldest entries.
#
# Arguments:
#   $1 - checkpoint directory to prune
#   $2 - number of newest checkpoints to keep (positive decimal integer)
# Outputs:
#   One "[checkpoint-cleanup] deleting old checkpoint: <path>" line on stdout
#   for every directory that is removed.
# Returns:
#   0 when there is nothing to do (invalid or non-positive keep, missing
#   directory, not more than <keep> checkpoints); otherwise the status of the
#   last step of the deletion loop.
#######################################
cleanup_old_checkpoints_once() {
  local ckpt_dir=$1
  local keep=$2

  # A non-numeric keep count or a missing directory makes the call a no-op.
  if ! [[ "$keep" =~ ^[0-9]+$ ]] || [[ ! -d "$ckpt_dir" ]]; then
    return 0
  fi
  # Force base 10: a zero-padded value such as "08" would otherwise be treated
  # as an (invalid) octal constant by shell arithmetic.
  keep=$((10#$keep))
  if ((keep <= 0)); then
    return 0
  fi

  # Translate the iteration number stored in the tracker file into the
  # directory name that must be protected from deletion.
  local latest=""
  local tracker="${ckpt_dir}/latest_checkpointed_iteration.txt"
  if [[ -f "$tracker" ]]; then
    # `read` returns non-zero when the file does not end with a newline, but it
    # still assigns the text it read. Keep that value instead of discarding
    # it, otherwise the protection below is silently disabled for such files.
    read -r latest < "$tracker" || true
    if [[ "$latest" =~ ^[0-9]+$ ]]; then
      # 10# again avoids octal interpretation of zero-padded numbers.
      printf -v latest 'iter_%07d' "$((10#$latest))"
    else
      # Non-numeric tracker content: nothing to protect.
      latest=""
    fi
  fi

  # Collect checkpoint directories, oldest first. The names are fixed-width
  # digits, so a byte-wise sort is also a numeric sort.
  local checkpoints=()
  local path
  while IFS= read -r path; do
    checkpoints+=("$path")
  done < <(find "$ckpt_dir" -maxdepth 1 -type d \
    -name 'iter_[0-9][0-9][0-9][0-9][0-9][0-9][0-9]' -print | LC_ALL=C sort)

  local delete_count=$((${#checkpoints[@]} - keep))
  if ((delete_count <= 0)); then
    return 0
  fi

  local i base
  for ((i = 0; i < delete_count; i++)); do
    base=${checkpoints[i]##*/}
    # Never remove the checkpoint the tracker file points at.
    if [[ "$base" == "$latest" ]]; then
      continue
    fi
    echo "[checkpoint-cleanup] deleting old checkpoint: ${checkpoints[i]}"
    rm -rf -- "${checkpoints[i]}"
  done
}
#######################################
# Run cleanup_old_checkpoints_once forever, pausing between passes.
#
# Intended to be started as a background job; it only returns when the
# interval is unusable.
#
# Arguments:
#   $1 - checkpoint directory handed to cleanup_old_checkpoints_once
#   $2 - number of checkpoints to keep, handed to cleanup_old_checkpoints_once
#   $3 - pause between passes in seconds (positive integer)
# Returns:
#   0 immediately when $3 is not a positive integer; otherwise never returns.
#######################################
checkpoint_cleanup_loop() {
  local target_dir=$1
  local retain=$2
  local period=$3

  # Guard clauses: anything but a positive integer disables the loop.
  [[ "$period" =~ ^[0-9]+$ ]] || return 0
  if [ "$period" -le 0 ]; then
    return 0
  fi

  # Sleep first so the initial pass happens one full period after start-up.
  while :; do
    sleep "$period"
    cleanup_old_checkpoints_once "$target_dir" "$retain"
  done
}
# Start the periodic checkpoint cleanup as a background job and remember its
# PID so it can be stopped when the script exits.
checkpoint_cleanup_loop "$CKPT_DIR" "$CHECKPOINT_KEEP_RECENT" "$CHECKPOINT_CLEANUP_INTERVAL_SECONDS" &
CHECKPOINT_CLEANUP_PID=$!
# On script exit: stop the background loop (errors are ignored in case it has
# already gone away), then run one final cleanup pass in the foreground.
trap 'kill "$CHECKPOINT_CLEANUP_PID" 2>/dev/null || true; cleanup_old_checkpoints_once "$CKPT_DIR" "$CHECKPOINT_KEEP_RECENT"' EXIT
DISTRIBUTED_ARGS="
--nproc_per_node 8
--nnodes 1
@@ -79,5 +140,5 @@ torchrun $DISTRIBUTED_ARGS \
--cuda-graph-warmup-steps 3 \
--transformer-impl transformer_engine \
--cross-entropy-loss-fusion \
--cross-entropy-fusion-impl te
--cross-entropy-fusion-impl te \
$EXTRA_ARGS