#!/usr/bin/env bash
#
# Launch a training run as a detached background job.
#
# The selected model's training script is started in its own session, with
# its output redirected to a per-run log file. The PID and a small KEY=VALUE
# metadata file are written under the artifact root's run_state directory.
#
# Environment:
#   ARTIFACT_ROOT  Base directory for run state and logs
#                  (default: /ssd1/yi/artifacts).

set -euo pipefail

# Directory containing this script; the per-model training scripts are
# resolved relative to it.
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)

# Root for everything this launcher writes; overridable from the environment.
ARTIFACT_ROOT=${ARTIFACT_ROOT:-/ssd1/yi/artifacts}
RUN_STATE_DIR="${ARTIFACT_ROOT}/run_state"  # <train_name>.pid / <train_name>.env
LOG_DIR="${ARTIFACT_ROOT}/logs"             # <train_name>.log

#######################################
# Print the command-line help text.
# Arguments: none
# Outputs:   writes the help text to stdout (callers redirect to stderr
#            when reporting a usage error)
#######################################
usage() {
  # Quoted delimiter: the text is emitted literally, with no expansion.
  cat <<'EOF'
Usage:
  bash start_training.sh <model> [mode] [train_name]

Models:
  gpt_smoke
  qwen3_1p7b

Examples:
  bash start_training.sh gpt_smoke smoke smoke_gpt
  bash start_training.sh qwen3_1p7b qwen3_1p7b_smoke_yi qwen3_1p7b_smoke_yi

Environment overrides:
  ARTIFACT_ROOT=/ssd1/yi/artifacts
  CHECKPOINT_KEEP_RECENT=3
  CHECKPOINT_CLEANUP_INTERVAL_SECONDS=300
  EXTRA_ARGS="--exit-duration-in-mins 120"
EOF
}

# ---- Argument parsing -------------------------------------------------------
# Positional arguments: <model> [mode] [train_name].
# mode and train_name fall back to per-model defaults chosen below.
model=${1:-}
mode=${2:-}
train_name=${3:-}

# Explicit help request: print the help on stdout and succeed.
if [[ "$model" == "-h" || "$model" == "--help" ]]; then
  usage
  exit 0
fi

# A missing <model> is a usage error: report it on stderr with a non-zero
# status so wrappers and automation do not mistake it for a successful launch.
if [[ -z "$model" ]]; then
  echo "Missing required argument: <model>" >&2
  usage >&2
  exit 1
fi

# Map the model name to its training script and fill in default mode/name.
case "$model" in
  gpt_smoke)
    train_script="${SCRIPT_DIR}/training_smoke_gpt2.sh"
    mode=${mode:-smoke}
    train_name=${train_name:-smoke_gpt}
    ;;
  qwen3_1p7b)
    train_script="${SCRIPT_DIR}/training_smoke_qwen3_1p7b.sh"
    mode=${mode:-qwen3_1p7b_smoke_yi}
    train_name=${train_name:-qwen3_1p7b_smoke_yi}
    ;;
  *)
    echo "Unknown model: $model" >&2
    usage >&2
    exit 1
    ;;
esac

# ---- Run-state bookkeeping --------------------------------------------------
mkdir -p -- "$RUN_STATE_DIR" "$LOG_DIR"

# Per-run files, all keyed by train_name.
pid_file="${RUN_STATE_DIR}/${train_name}.pid"
meta_file="${RUN_STATE_DIR}/${train_name}.env"
log_file="${LOG_DIR}/${train_name}.log"

# Refuse to start a second run under the same train_name while the PID
# recorded by a previous launch is still alive.
if [[ -f "$pid_file" ]]; then
  old_pid=$(<"$pid_file")
  # Only probe values that look like a real PID. "0" or a negative number
  # would make `kill -0` address a whole process group (or every process) and
  # succeed, which would wrongly report a run as active forever.
  # Note: `kill -0` also fails for a process owned by another user, in which
  # case the old run is treated as not running.
  if [[ "$old_pid" =~ ^[1-9][0-9]*$ ]] && kill -0 "$old_pid" 2>/dev/null; then
    echo "Training already appears to be running: train_name=${train_name}, pid=${old_pid}" >&2
    exit 1
  fi
fi

# ---- Launch -----------------------------------------------------------------
# Fail before touching any state if the training script is missing; otherwise
# the background bash would die immediately while this launcher still printed
# "Started training" and left a stale PID file behind.
if [[ ! -f "$train_script" ]]; then
  echo "Training script not found: ${train_script}" >&2
  exit 1
fi

# Always pass --exit-signal-handler, followed by any caller-supplied EXTRA_ARGS.
combined_extra_args="--exit-signal-handler ${EXTRA_ARGS:-}"

cd "$SCRIPT_DIR"

# setsid detaches the job into its own session; stdout and stderr both go to
# the per-run log file (truncated on every launch).
EXTRA_ARGS="$combined_extra_args" setsid bash "$train_script" "$mode" "$train_name" > "$log_file" 2>&1 &

pid=$!

# Look up the job's process group. Right after the fork the child may not have
# created its new session yet, so `ps` can still report this launcher's group.
# Poll briefly (about one second at most) until the group matches the PID.
# An empty result means the process is already gone, so stop polling. If the
# values never match, the last value reported by `ps` is recorded as-is.
pgid=""
for _ in 1 2 3 4 5 6 7 8 9 10; do
  # `|| true`: a failing ps (process already exited) must not abort the
  # launcher under `set -e` / pipefail.
  pgid=$(ps -o pgid= -p "$pid" | tr -d ' ' || true)
  if [[ -z "$pgid" || "$pgid" == "$pid" ]]; then
    break
  fi
  sleep 0.1
done

printf '%s\n' "$pid" > "$pid_file"

# Plain KEY=VALUE metadata describing this run. Values are written unquoted.
cat > "$meta_file" <<EOF
MODEL=${model}
MODE=${mode}
TRAIN_NAME=${train_name}
PID=${pid}
PGID=${pgid}
LOG_FILE=${log_file}
TRAIN_SCRIPT=${train_script}
CHECKPOINT_KEEP_RECENT=${CHECKPOINT_KEEP_RECENT:-3}
CHECKPOINT_CLEANUP_INTERVAL_SECONDS=${CHECKPOINT_CLEANUP_INTERVAL_SECONDS:-300}
EOF

echo "Started training: model=${model}, mode=${mode}, train_name=${train_name}, pid=${pid}, pgid=${pgid:-unknown}"
echo "Log: ${log_file}"
echo "Stop: bash ${SCRIPT_DIR}/stop_training.sh ${train_name}"