feat: optimized dataset conversion efficiency, add on-demand training start/stop script
This commit is contained in:
68
scripts/kaiyuan2b-training/stop_training.sh
Executable file
68
scripts/kaiyuan2b-training/stop_training.sh
Executable file
@@ -0,0 +1,68 @@
|
||||
#!/usr/bin/env bash
#
# stop_training.sh - gracefully stop a training run by name.
#
# Looks up the PID recorded in ${ARTIFACT_ROOT}/run_state/<train_name>.pid,
# sends SIGTERM to the training process group and waits up to GRACE_SECONDS
# for the process to exit.
#
# Environment:
#   ARTIFACT_ROOT  root directory that contains run_state/
#                  (default: /apps/yi/model_training/artifacts)
#   GRACE_SECONDS  how long to wait after SIGTERM, in whole seconds
#                  (default: 300)
#
# Exit status:
#   0  training stopped, or nothing was running (stale state removed)
#   1  PID file not found, or GRACE_SECONDS is not a non-negative integer
#   2  process still alive after the grace period
set -euo pipefail

ARTIFACT_ROOT=${ARTIFACT_ROOT:-/apps/yi/model_training/artifacts}
RUN_STATE_DIR="${ARTIFACT_ROOT}/run_state"
GRACE_SECONDS=${GRACE_SECONDS:-300}

# Validate before any signal is sent: GRACE_SECONDS is only used in shell
# arithmetic after SIGTERM has gone out, so a bad value would otherwise abort
# the script midway, without waiting for the process or cleaning up state.
if [[ ! "$GRACE_SECONDS" =~ ^[0-9]+$ ]]; then
  echo "GRACE_SECONDS must be a non-negative integer, got: ${GRACE_SECONDS}" >&2
  exit 1
fi
# Force base 10 so that values with leading zeros (e.g. 08) are not parsed as
# octal by later arithmetic expansions.
GRACE_SECONDS=$((10#${GRACE_SECONDS}))
#######################################
# Print the command-line help text.
# Arguments: none
# Outputs:   writes the usage text to stdout
#######################################
usage() {
  # Quoted delimiter: the text is emitted literally, nothing is expanded.
  cat <<'EOF'
Usage:
  bash stop_training.sh <train_name>

Environment overrides:
  ARTIFACT_ROOT=/apps/yi/model_training/artifacts
  GRACE_SECONDS=300
EOF
}
# First positional argument: name of the training run to stop.
train_name=${1:-}

case "$train_name" in
  -h | --help)
    # Help was requested explicitly: print to stdout and report success.
    usage
    exit 0
    ;;
  "")
    # A missing name is a caller error. Report it on stderr with a non-zero
    # status so that automation passing an empty name does not mistake the
    # usage text for a successful stop.
    echo "Missing required argument: <train_name>" >&2
    usage >&2
    exit 1
    ;;
esac
# Per-run state files under RUN_STATE_DIR:
#   <train_name>.pid  - PID of the training process
#   <train_name>.env  - metadata; consulted later for a PGID=<n> line
pid_file="${RUN_STATE_DIR}/${train_name}.pid"
meta_file="${RUN_STATE_DIR}/${train_name}.env"

# Without a PID file there is nothing to look up, so bail out early.
[[ -f "$pid_file" ]] || {
  echo "PID file not found: ${pid_file}" >&2
  exit 1
}
# Load the recorded PID: first line only, with stray whitespace (spaces, tabs,
# CR) removed.
pid=$(<"$pid_file")
pid=${pid%%$'\n'*}
pid=${pid//[[:space:]]/}

# Refuse anything that is not a positive integer. To kill(1), "0" means the
# caller's own process group and negative values address whole groups (or, for
# -1, every process the user may signal), so a corrupted PID file must never
# reach the kill calls below. The files are left in place for inspection.
if [[ -n "$pid" && ! "$pid" =~ ^[1-9][0-9]*$ ]]; then
  echo "Invalid PID '${pid}' in ${pid_file}; refusing to signal. Inspect and remove the file manually." >&2
  exit 1
fi

# An empty PID file or a PID that cannot be signalled is treated as stale
# state: remove both state files and report success.
if [[ -z "$pid" ]] || ! kill -0 "$pid" 2>/dev/null; then
  echo "Training is not running for train_name=${train_name}; cleaning stale state."
  rm -f -- "$pid_file" "$meta_file"
  exit 0
fi
# Resolve the process group of the training process so that SIGTERM can be
# delivered to the whole group rather than to the single recorded PID.
pgid=$(ps -o pgid= -p "$pid" | tr -d ' ' || true)
if [[ -z "$pgid" && -f "$meta_file" ]]; then
  # Fallback: PGID=<n> line from the metadata file (first match only).
  pgid=$(grep -m1 '^PGID=' "$meta_file" | cut -d= -f2- || true)
fi
pgid=${pgid//[[:space:]]/}

# Only a plain integer greater than 1 is accepted as a group id. Anything else
# (garbage, 0, or 1 - which would turn into `kill -TERM -1`, i.e. every
# process the user may signal) is discarded and only the PID is signalled.
if [[ ! "$pgid" =~ ^[0-9]+$ ]] || ((10#$pgid <= 1)); then
  pgid=""
fi

# If the training process lives in this script's own process group, a group
# signal would also terminate this script before it can wait and clean up.
own_pgid=$(ps -o pgid= -p "$$" | tr -d ' ' || true)
if [[ -n "$pgid" && "$pgid" == "$own_pgid" ]]; then
  echo "Training shares this script's process group (pgid=${pgid}); signalling pid ${pid} only." >&2
  pgid=""
fi

echo "Sending SIGTERM to training process group: train_name=${train_name}, pid=${pid}, pgid=${pgid:-unknown}"
# Failures are tolerated on purpose: the process may exit between the liveness
# check and the signal, and the wait loop that follows decides the outcome.
if [[ -n "$pgid" ]]; then
  kill -TERM -- "-${pgid}" 2>/dev/null || kill -TERM "$pid" 2>/dev/null || true
else
  kill -TERM "$pid" 2>/dev/null || true
fi
# Poll every 5 seconds until the training PID can no longer be signalled.
# Give up with exit status 2 once GRACE_SECONDS have elapsed; the process is
# never force-killed here, the operator is only told how to do it manually.
give_up_at=$((SECONDS + GRACE_SECONDS))
while true; do
  if ! kill -0 "$pid" 2>/dev/null; then
    break
  fi

  if ((SECONDS >= give_up_at)); then
    {
      printf '%s\n' "Training did not exit within ${GRACE_SECONDS}s."
      printf '%s\n' "If checkpoint saving is still running, wait and inspect logs before forcing termination."
      if [[ -n "$pgid" ]]; then
        printf '%s\n' "Force kill manually if needed: kill -KILL -${pgid}"
      else
        printf '%s\n' "Force kill manually if needed: kill -KILL ${pid}"
      fi
    } >&2
    exit 2
  fi

  sleep 5
done

# The process is gone: drop the run-state files and confirm.
rm -f "$pid_file" "$meta_file"
printf '%s\n' "Stopped training: train_name=${train_name}"
Reference in New Issue
Block a user