#!/usr/bin/env bash
#
# Gracefully stop a training run identified by <train_name>.
#
# The run is located through ${RUN_STATE_DIR}/<train_name>.pid (the PID of the
# training process) and, optionally, ${RUN_STATE_DIR}/<train_name>.env (a file
# that may contain a "PGID=<n>" line). SIGTERM is sent to the process group
# when one can be determined safely, otherwise to the PID alone, and the
# script then waits up to GRACE_SECONDS for the PID to disappear.
#
# Environment overrides:
#   ARTIFACT_ROOT  - root directory that contains run_state/
#   GRACE_SECONDS  - seconds to wait after SIGTERM before giving up
#
# Exit status:
#   0 - training stopped, was not running (stale state removed), or usage shown
#   1 - PID file missing or unreadable
#   2 - training still alive after GRACE_SECONDS (nothing is force-killed)
set -euo pipefail

# shellcheck disable=SC2034  # not referenced anywhere in this script
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
ARTIFACT_ROOT=${ARTIFACT_ROOT:-/apps/yi/model_training/artifacts}
RUN_STATE_DIR="${ARTIFACT_ROOT}/run_state"
GRACE_SECONDS=${GRACE_SECONDS:-300}

#######################################
# Print the command-line help.
# Outputs: usage text on stdout
#######################################
usage() {
  cat <<'EOF'
Usage:
  bash stop_training.sh <train_name>

Environment overrides:
  GRACE_SECONDS=300
EOF
}

#######################################
# Test whether a string is a strictly positive decimal integer.
# Arguments: $1 - candidate string
# Returns:   0 if $1 consists only of digits and is greater than zero
#######################################
is_positive_int() {
  # 10# forces base 10 so values with leading zeros are not read as octal.
  [[ "${1:-}" =~ ^[0-9]+$ ]] && ((10#$1 > 0))
}

#######################################
# Determine the process group to signal for a training process.
# Asks ps first; if that yields nothing usable, falls back to the first
# "PGID=" line of the meta file. Values that are not integers greater than 1
# are discarded, because "kill -- -0" targets the caller's own process group
# and "kill -- -1" targets every process the caller may signal.
# Arguments: $1 - PID of the training process
#            $2 - path to the meta (.env) file; may not exist
# Outputs:   the PGID on stdout, or nothing when none is usable
#######################################
resolve_pgid() {
  local pid=$1
  local meta_file=$2
  local candidate

  # "|| true": a vanished process makes ps fail, which is not an error here.
  candidate=$(ps -o pgid= -p "$pid" | tr -d '[:space:]' || true)
  if ! { is_positive_int "$candidate" && ((10#$candidate > 1)); }; then
    candidate=""
    if [[ -f "$meta_file" ]]; then
      # -m1: use only the first PGID= line so the value stays single-line.
      candidate=$(grep -m1 '^PGID=' "$meta_file" | cut -d= -f2- \
        | tr -d '[:space:]' || true)
    fi
  fi

  if is_positive_int "$candidate" && ((10#$candidate > 1)); then
    printf '%s\n' "$candidate"
  fi
}

#######################################
# Stop the training run named by the first argument.
# Globals:   RUN_STATE_DIR (read), GRACE_SECONDS (read), SECONDS (read)
# Arguments: $1 - train_name; empty, -h or --help prints usage
# Outputs:   progress on stdout, diagnostics on stderr
# Returns:   0, 1 or 2 as described in the file header
#######################################
main() {
  local train_name=${1:-}
  if [[ -z "$train_name" || "$train_name" == "-h" || "$train_name" == "--help" ]]; then
    usage
    return 0
  fi

  local pid_file="${RUN_STATE_DIR}/${train_name}.pid"
  local meta_file="${RUN_STATE_DIR}/${train_name}.env"

  if [[ ! -f "$pid_file" ]]; then
    echo "PID file not found: ${pid_file}" >&2
    return 1
  fi

  local pid
  if ! pid=$(tr -d '[:space:]' <"$pid_file"); then
    echo "Failed to read PID file: ${pid_file}" >&2
    return 1
  fi

  # The PID must be a positive integer before it is handed to kill:
  # "kill 0" signals the caller's own process group and "kill -1" signals
  # every process the caller may signal. Anything else in the file is
  # treated as stale state, exactly like a PID that no longer exists.
  if ! is_positive_int "$pid" || ! kill -0 "$pid" 2>/dev/null; then
    echo "Training is not running for train_name=${train_name}; cleaning stale state."
    rm -f -- "$pid_file" "$meta_file"
    return 0
  fi

  local pgid
  pgid=$(resolve_pgid "$pid" "$meta_file")

  echo "Sending SIGTERM to training process group: train_name=${train_name}, pid=${pid}, pgid=${pgid:-unknown}"
  # Best effort: the process may exit between the liveness check and the
  # signal, so delivery failures are ignored and the wait loop decides.
  if [[ -n "$pgid" ]]; then
    kill -TERM -- "-${pgid}" 2>/dev/null || kill -TERM "$pid" 2>/dev/null || true
  else
    kill -TERM "$pid" 2>/dev/null || true
  fi

  local deadline=$((SECONDS + GRACE_SECONDS))
  while kill -0 "$pid" 2>/dev/null; do
    if ((SECONDS >= deadline)); then
      echo "Training did not exit within ${GRACE_SECONDS}s." >&2
      echo "If checkpoint saving is still running, wait and inspect logs before forcing termination." >&2
      if [[ -n "$pgid" ]]; then
        echo "Force kill manually if needed: kill -KILL -${pgid}" >&2
      else
        echo "Force kill manually if needed: kill -KILL ${pid}" >&2
      fi
      # State files are left in place so a later invocation can retry.
      return 2
    fi
    sleep 5
  done

  rm -f -- "$pid_file" "$meta_file"
  echo "Stopped training: train_name=${train_name}"
}

# Last statement on purpose: the script's exit status is main's return value.
main "$@"