# syntax=docker/dockerfile:1
# Megatron-LM training image on top of NVIDIA's NGC PyTorch container.
# NGC base already ships CUDA, PyTorch, and (usually) Transformer Engine.
FROM nvcr.io/nvidia/pytorch:25.10-py3

# Build-time proxy settings; ARG keeps them out of the final image's env.
ARG HTTP_PROXY
ARG HTTPS_PROXY
ARG http_proxy
ARG https_proxy

# Build-only: silence debconf prompts during apt without baking the
# setting into the runtime environment (ARG, not ENV, on purpose).
ARG DEBIAN_FRONTEND=noninteractive

ENV PYTHONUNBUFFERED=1
ENV PIP_NO_CACHE_DIR=1
# NGC images ship a pip constraint file; clear it so we can install/upgrade
# packages the base image pins.
ENV PIP_CONSTRAINT=
# Cap parallel compile jobs for source builds (Apex/TE) to bound memory use.
ENV MAX_JOBS=8
ENV PYTHONPATH=/opt/Megatron-LM:$PYTHONPATH

# OS tooling for builds plus interactive debugging (dev/training image, so
# vim/tmux/htop are intentional). --no-install-recommends + list cleanup
# keep the layer small; packages sorted alphabetically for diffability.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        bzip2 \
        ca-certificates \
        cmake \
        curl \
        git \
        htop \
        ninja-build \
        pkg-config \
        rsync \
        tmux \
        vim \
        wget \
    && rm -rf /var/lib/apt/lists/*

# Use the Tsinghua PyPI mirror (faster inside CN networks), then refresh
# core build tooling.
RUN python -m pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple && \
    python -m pip config set global.trusted-host pypi.tuna.tsinghua.edu.cn && \
    python -m pip install -U pip setuptools wheel packaging

# Common training/eval/data Python stack.
# NOTE(review): versions are intentionally unpinned for a dev image; pin
# them (requirements.txt + hashes) before using this for reproducible runs.
RUN python -m pip install \
        transformers datasets tokenizers sentencepiece accelerate \
        numpy pandas pyarrow fastparquet zstandard jsonlines tqdm rich einops regex \
        tensorboard wandb evaluate lm-eval \
        omegaconf hydra-core nltk ftfy six psutil pydantic

# NGC PyTorch usually already includes Transformer Engine.
# Keep this check; do not reinstall TE unless it fails. The check only
# prints the import result — it does not fail the build.
RUN python - <<'PY'
import torch
print("torch:", torch.__version__, "cuda:", torch.version.cuda)
try:
    import transformer_engine
    print("transformer_engine: OK")
except Exception as e:
    print("transformer_engine import failed:", e)
PY

# Megatron-LM, editable install so /opt/Megatron-LM sources are importable
# (also on PYTHONPATH above). MEGATRON_REF lets builds pin a tag/branch;
# default matches the previous behavior of cloning the default branch.
ARG MEGATRON_REF=main
RUN git clone --depth 1 --branch "${MEGATRON_REF}" \
        https://github.com/NVIDIA/Megatron-LM.git /opt/Megatron-LM && \
    python -m pip install -U "setuptools<80.0.0,>=77.0.0" packaging && \
    python -m pip install --no-build-isolation -e /opt/Megatron-LM

# Optional Apex: only install if you really need fused optimizers from Apex.
# Many modern Megatron paths rely more on Transformer Engine / fused kernels.
# Compiles CUDA extensions from source — this is the slow layer.
ARG APEX_REF=master
RUN git clone --depth 1 --branch "${APEX_REF}" \
        https://github.com/NVIDIA/apex.git /opt/apex && \
    python -m pip install -v --disable-pip-version-check --no-build-isolation \
        --config-settings "--build-option=--cpp_ext" \
        --config-settings "--build-option=--cuda_ext" \
        /opt/apex

WORKDIR /workspace

# NOTE(review): image runs as root — conventional for HPC/training containers
# where host-mounted volumes and device access expect it; add a USER stage
# if deploying this image as a long-running service.
CMD ["/bin/bash"]