#!/usr/bin/env bash
#
# start_training.sh - launch one of the known training scripts in the
# background, detached via setsid, and record its PID so that a second launch
# with the same train_name is refused while the first is still alive.
#
# Usage:
#   bash start_training.sh <model> [mode] [train_name]
#
# Environment:
#   ARTIFACT_ROOT  Root directory holding run_state/ and logs/
#                  (default: /apps/yi/model_training/artifacts).
#   EXTRA_ARGS     Extra arguments; "--exit-signal-handler" is always
#                  prepended and the combined string is passed to the training
#                  script in its environment as EXTRA_ARGS.
#
# Files written (per train_name):
#   ${ARTIFACT_ROOT}/run_state/<train_name>.pid   PID of the launched process
#   ${ARTIFACT_ROOT}/logs/<train_name>.log        stdout+stderr of the run

set -euo pipefail

SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
ARTIFACT_ROOT=${ARTIFACT_ROOT:-/apps/yi/model_training/artifacts}
RUN_STATE_DIR="${ARTIFACT_ROOT}/run_state"
LOG_DIR="${ARTIFACT_ROOT}/logs"

#######################################
# Print the command-line help text.
# Outputs: help text on stdout.
#######################################
usage() {
  cat <<'EOF'
Usage:
  bash start_training.sh <model> [mode] [train_name]

Models:
  gpt_smoke
  qwen3_1p7b

Examples:
  bash start_training.sh gpt_smoke smoke smoke_gpt
  bash start_training.sh qwen3_1p7b qwen3_1p7b_smoke_yi qwen3_1p7b_smoke_yi

Environment overrides:
  CHECKPOINT_KEEP_RECENT=3
  CHECKPOINT_CLEANUP_INTERVAL_SECONDS=300
  EXTRA_ARGS="--exit-duration-in-mins 120"
EOF
}

#######################################
# Print a message to stderr and terminate the script with status 1.
# Arguments: $* - message text
#######################################
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

#######################################
# Resolve the model, refuse to start a duplicate run, then launch training.
# Globals:   SCRIPT_DIR, RUN_STATE_DIR, LOG_DIR, EXTRA_ARGS (all read)
# Arguments: $1 - model name (gpt_smoke | qwen3_1p7b | -h | --help)
#            $2 - mode, optional; defaults depend on the model
#            $3 - train_name, optional; defaults depend on the model
# Returns:   0 after printing help or launching; exits 1 on an unknown model,
#            a still-running previous run, or a missing training script.
#######################################
main() {
  local model=${1:-}
  local mode=${2:-}
  local train_name=${3:-}
  local train_script pid_file meta_file log_file
  local old_pid combined_extra_args pid pgid

  # No model at all is treated like an explicit help request (status 0).
  if [[ -z "$model" || "$model" == "-h" || "$model" == "--help" ]]; then
    usage
    return 0
  fi

  case "$model" in
    gpt_smoke)
      train_script="${SCRIPT_DIR}/training_smoke_gpt2.sh"
      mode=${mode:-smoke}
      train_name=${train_name:-smoke_gpt}
      ;;
    qwen3_1p7b)
      train_script="${SCRIPT_DIR}/training_smoke_qwen3_1p7b.sh"
      mode=${mode:-qwen3_1p7b_smoke_yi}
      train_name=${train_name:-qwen3_1p7b_smoke_yi}
      ;;
    *)
      echo "Unknown model: $model" >&2
      usage >&2
      exit 1
      ;;
  esac

  mkdir -p "$RUN_STATE_DIR" "$LOG_DIR"

  pid_file="${RUN_STATE_DIR}/${train_name}.pid"
  meta_file="${RUN_STATE_DIR}/${train_name}.env"
  log_file="${LOG_DIR}/${train_name}.log"

  # A leftover pid file only blocks the launch when its process still answers
  # signal 0; a stale file is simply overwritten further down.
  if [[ -f "$pid_file" ]]; then
    old_pid=$(<"$pid_file")
    if [[ -n "$old_pid" ]] && kill -0 "$old_pid" 2>/dev/null; then
      die "Training already appears to be running: train_name=${train_name}, pid=${old_pid}"
    fi
  fi

  # Fail before launching: otherwise the error would only show up in the log
  # while a pid file for an already-dead process is left behind.
  [[ -f "$train_script" ]] || die "Training script not found: ${train_script}"

  combined_extra_args="--exit-signal-handler ${EXTRA_ARGS:-}"

  cd "$SCRIPT_DIR"
  # setsid detaches the run into its own session; all output goes to the log.
  EXTRA_ARGS="$combined_extra_args" setsid bash "$train_script" "$mode" "$train_name" \
    > "$log_file" 2>&1 &
  pid=$!

  # Best effort: an empty pgid is tolerated if ps cannot find the process.
  # shellcheck disable=SC2034  # presumably consumed by the metadata statement below
  pgid=$(ps -o pgid= -p "$pid" | tr -d ' ' || true)
  printf '%s\n' "$pid" > "$pid_file"

  # NOTE(review): the statement that writes "$meta_file" is cut off in the
  # reviewed source right after the fragment below. Its heredoc body is not
  # visible, so it is kept verbatim as a comment rather than guessed at.
  # Restore the original statement here.
  #   cat > "$meta_file" <
}

main "$@"