#!/usr/bin/env bash
# Launch a Megatron-LM GPT pretraining run (single node, 8 GPUs).
#
# Usage: <script> [MODE] [TRAIN_NAME]
#   MODE       - run profile (default: "smoke").
#                NOTE(review): MODE is currently never referenced below; the
#                params/gpt_smoke/* paths are hard-coded. Confirm whether MODE
#                was intended to select the params directory.
#   TRAIN_NAME - run name; used as the tensorboard/checkpoint subdirectory.

set -euo pipefail

# Expose all 8 local GPUs to the job.
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# Restrict each device to a single hardware work queue.
# NOTE(review): presumably required by Megatron-LM for correct
# communication/computation overlap — confirm against Megatron docs.
export CUDA_DEVICE_MAX_CONNECTIONS=1

MODE=${1:-smoke}
TRAIN_NAME=${2:-smoke_gpt}
# Repository and artifact locations.
MEGATRON_PATH=/apps/yi/model_training/Megatron-LM
ARTIFACT_ROOT=/apps/yi/model_training/artifacts
# Per-run output directories, keyed by TRAIN_NAME.
TB_DIR="${ARTIFACT_ROOT}/tb_logs/${TRAIN_NAME}"
CKPT_DIR="${ARTIFACT_ROOT}/checkpoints/${TRAIN_NAME}"

# Pull in shared optimizer settings plus the smoke-test GPT configuration.
# These are expected to define OPTIM_ARGS, MODEL_ARGS, DATA_ARGS and
# HPARAM_ARGS, all consumed by the torchrun invocation below.
# NOTE(review): relative paths — the script must be run from its own
# directory for these to resolve.
source params/optim_common.sh
source params/gpt_smoke/model.sh
source params/gpt_smoke/data.sh
source params/gpt_smoke/hparams.sh
# Hyper-parameter args come straight from the sourced hparams file.
RUN_ARGS="${HPARAM_ARGS}"

# TensorBoard / console logging flags. Every interval is 1 iteration —
# appropriate for a short smoke run, too verbose for a long production run.
LOGGING_ARGS="
    --tensorboard-dir ${TB_DIR}
    --tensorboard-log-interval 1
    --log-interval 1
    --log-timers-to-tensorboard
    --log-validation-ppl-to-tensorboard
    --log-memory-to-tensorboard
    --log-world-size-to-tensorboard
    --log-num-zeros-in-grad
    --log-device-memory-used
    --log-throughput
    --log-params-norm
"
# Train in bfloat16.
PRECISION_ARGS="--bf16"

# No tensor or pipeline model parallelism (TP=1, PP=1): with 8 local ranks
# this makes the run pure data parallel.
PARALLEL_ARGS="
    --tensor-model-parallel-size 1
    --pipeline-model-parallel-size 1
"

# Alternative layout kept for quick switching: TP=2 with sequence parallelism.
# PARALLEL_ARGS="
#     --tensor-model-parallel-size 2
#     --sequence-parallel
# "
# Create output directories up front so all workers can write immediately.
mkdir -p "$CKPT_DIR" "$TB_DIR"

# torchrun rendezvous settings: single node, 8 local ranks, fixed master port.
DISTRIBUTED_ARGS="
    --nproc_per_node 8
    --nnodes 1
    --node_rank 0
    --master_addr localhost
    --master_port 6000
"
# Launch training. The *_ARGS strings are intentionally left unquoted so
# they word-split into individual CLI flags.
# Fixes vs. original: space added before the line continuation after
# $LOGGING_ARGS, and the dangling trailing '\' after --load removed.
# NOTE(review): the original ended with a trailing '\' — confirm no further
# arguments (e.g. a log tee) were truncated from the script.
# shellcheck disable=SC2086
torchrun $DISTRIBUTED_ARGS \
    "$MEGATRON_PATH/pretrain_gpt.py" \
    $MODEL_ARGS \
    $OPTIM_ARGS \
    $PRECISION_ARGS \
    $PARALLEL_ARGS \
    $DATA_ARGS \
    $RUN_ARGS \
    $LOGGING_ARGS \
    --save "$CKPT_DIR" \
    --load "$CKPT_DIR"