feat: optimize dataset conversion efficiency, add on-demand training start/stop script
This commit is contained in:
@@ -11,6 +11,9 @@ MEGATRON_PATH=/apps/yi/model_training/Megatron-LM
|
||||
# Root directory under which this run's log and checkpoint directories live.
ARTIFACT_ROOT=/apps/yi/model_training/artifacts
|
||||
# Per-run log directory under tb_logs/ (TRAIN_NAME is set outside this excerpt).
TB_DIR="${ARTIFACT_ROOT}/tb_logs/${TRAIN_NAME}"
|
||||
# Per-run checkpoint directory; passed to --save/--load and scanned by the
# checkpoint cleanup functions.
CKPT_DIR="${ARTIFACT_ROOT}/checkpoints/${TRAIN_NAME}"
|
||||
# How many iter_* checkpoint directories the cleanup keeps. Overridable from
# the environment (default 3); a non-numeric or non-positive value makes the
# cleanup a no-op.
CHECKPOINT_KEEP_RECENT=${CHECKPOINT_KEEP_RECENT:-3}
|
||||
# Seconds between background cleanup passes. Overridable from the environment
# (default 300); a non-numeric or non-positive value makes the background loop
# return immediately.
CHECKPOINT_CLEANUP_INTERVAL_SECONDS=${CHECKPOINT_CLEANUP_INTERVAL_SECONDS:-300}
|
||||
# Extra arguments taken from the environment (default empty) and appended to
# the end of the training command.
EXTRA_ARGS=${EXTRA_ARGS:-}
|
||||
|
||||
source params/optim_common.sh
|
||||
source params/gpt_smoke/model.sh
|
||||
@@ -45,6 +48,64 @@ PARALLEL_ARGS="
|
||||
|
||||
# Make sure the checkpoint and log directories exist (creating parents as needed).
mkdir -p "$CKPT_DIR" "$TB_DIR"
|
||||
|
||||
#######################################
# Run one pruning pass over a checkpoint directory: delete the oldest
# iter_NNNNNNN sub-directories so that at most `keep` of them remain.
# The directory named by latest_checkpointed_iteration.txt is never deleted,
# even when it falls among the oldest entries.
# Arguments:
#   $1 - checkpoint directory to scan (only its direct children are examined)
#   $2 - number of checkpoint directories to keep; a non-numeric or
#        non-positive value turns the call into a no-op
# Outputs:
#   one "[checkpoint-cleanup] deleting old checkpoint: PATH" line on stdout
#   per directory removed
# Returns:
#   0 when there is nothing to do; otherwise the status of the last command
#######################################
cleanup_old_checkpoints_once() {
  local ckpt_dir=$1
  local keep=$2

  # Nothing to do for an invalid keep count or a missing directory.
  if ! [[ "$keep" =~ ^[0-9]+$ ]] || [ "$keep" -le 0 ] || [ ! -d "$ckpt_dir" ]; then
    return 0
  fi
  # Force base 10 so values such as "08" are not rejected as invalid octal.
  keep=$((10#$keep))

  local tracker="${ckpt_dir}/latest_checkpointed_iteration.txt"
  local latest=""
  if [ -f "$tracker" ]; then
    # `read` returns non-zero when the line has no trailing newline, but it
    # still stores the text it read. Keep that text: clearing it here would
    # silently disable the "never delete the latest checkpoint" guard below.
    read -r latest < "$tracker" || true
    if [[ "$latest" =~ ^[0-9]+$ ]]; then
      # 10# again avoids octal parsing of zero-padded iteration numbers.
      printf -v latest 'iter_%07d' "$((10#$latest))"
    else
      # Non-numeric tracker content cannot name an iter_* directory.
      latest=""
    fi
  fi

  # Collect checkpoint directories oldest-first. Names are fixed-width and
  # zero-padded, so a byte-wise sort is also a numeric sort.
  local checkpoints=()
  local path
  while IFS= read -r path; do
    checkpoints+=("$path")
  done < <(find "$ckpt_dir" -maxdepth 1 -type d \
    -name 'iter_[0-9][0-9][0-9][0-9][0-9][0-9][0-9]' -print | LC_ALL=C sort)

  local delete_count=$((${#checkpoints[@]} - keep))
  if [ "$delete_count" -le 0 ]; then
    return 0
  fi

  local i base
  for ((i = 0; i < delete_count; i++)); do
    base=${checkpoints[i]##*/}
    # The checkpoint the tracker points at is what a restart would load.
    if [ "$base" = "$latest" ]; then
      continue
    fi
    echo "[checkpoint-cleanup] deleting old checkpoint: ${checkpoints[i]}"
    # :? aborts instead of expanding to an empty path.
    rm -rf -- "${checkpoints[i]:?}"
  done
}
|
||||
|
||||
#######################################
# Call cleanup_old_checkpoints_once every `interval` seconds, forever.
# Meant to be started as a background job (`... &`): it installs a TERM trap
# in the shell it runs in and only stops when that shell is signalled.
# Arguments:
#   $1 - checkpoint directory, forwarded to cleanup_old_checkpoints_once
#   $2 - number of checkpoints to keep, forwarded likewise
#   $3 - seconds to wait between passes; a non-numeric or non-positive value
#        makes the function return immediately without looping
# Returns:
#   0 when the interval is invalid; otherwise does not return
#######################################
checkpoint_cleanup_loop() {
  local ckpt_dir=$1
  local keep=$2
  local interval=$3

  if ! [[ "$interval" =~ ^[0-9]+$ ]] || [ "$interval" -le 0 ]; then
    return 0
  fi

  # A foreground `sleep` would be orphaned when this shell is killed and
  # would keep the script's stdout/stderr open for up to `interval` seconds.
  # Sleep in the background instead, so the TERM trap can interrupt `wait`
  # and take the sleep down with it.
  local sleep_pid=""
  trap 'if [ -n "$sleep_pid" ]; then kill "$sleep_pid" 2>/dev/null; fi; exit 143' TERM

  while true; do
    sleep "$interval" &
    sleep_pid=$!
    # Tolerate a non-zero wait status so a failed sleep does not end the loop.
    wait "$sleep_pid" || true
    sleep_pid=""
    cleanup_old_checkpoints_once "$ckpt_dir" "$keep"
  done
}
|
||||
|
||||
# Start the periodic checkpoint pruning as a background job and remember its
# PID so it can be stopped when the script exits.
checkpoint_cleanup_loop "$CKPT_DIR" "$CHECKPOINT_KEEP_RECENT" "$CHECKPOINT_CLEANUP_INTERVAL_SECONDS" &
|
||||
CHECKPOINT_CLEANUP_PID=$!
|
||||
# On script exit: stop the background loop (ignoring the error if it is
# already gone), then run one final pruning pass in the foreground.
trap 'kill "$CHECKPOINT_CLEANUP_PID" 2>/dev/null || true; cleanup_old_checkpoints_once "$CKPT_DIR" "$CHECKPOINT_KEEP_RECENT"' EXIT
|
||||
|
||||
DISTRIBUTED_ARGS="
|
||||
--nproc_per_node 8
|
||||
--nnodes 1
|
||||
@@ -63,4 +124,5 @@ torchrun $DISTRIBUTED_ARGS \
|
||||
$RUN_ARGS \
|
||||
$LOGGING_ARGS\
|
||||
--save "$CKPT_DIR" \
|
||||
--load "$CKPT_DIR" \
|
||||
--load "$CKPT_DIR" \
|
||||
$EXTRA_ARGS
|
||||
|
||||
Reference in New Issue
Block a user