# syntax=docker/dockerfile:1
# gen3c / Dockerfile
# Commit 54bda79: attempt to fix exit code 137 (OOM) by using
# --no-build-isolation for the transformer-engine install.
# Start from a clean NVIDIA CUDA base image. The -devel variant ships nvcc and
# the CUDA headers required to compile transformer-engine later in this file.
FROM nvidia/cuda:12.4.0-devel-ubuntu22.04

# Suppress interactive apt prompts during the build only. Declared as ARG
# (not ENV) so DEBIAN_FRONTEND does not leak into the runtime environment of
# the final image; RUN steps in this stage still see it during the build.
ARG DEBIAN_FRONTEND=noninteractive

# Conda installation prefix, placed first on PATH so `conda` resolves.
ENV CONDA_DIR=/opt/conda
ENV PATH=$CONDA_DIR/bin:$PATH

WORKDIR /app
# System packages: build toolchain for native pip builds, git for source
# checkouts, wget for the Miniconda installer, and the GL/GLib shared
# libraries commonly needed by imaging/vision wheels. The apt list cache is
# removed in the same layer so it never bloats the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        git \
        libgl1-mesa-glx \
        libglib2.0-0 \
        wget \
    && rm -rf /var/lib/apt/lists/*
# Install Miniconda. The installer file name is an ARG so a specific release
# can be pinned for reproducible builds
# (--build-arg MINICONDA_INSTALLER=Miniconda3-py310_24.5.0-0-Linux-x86_64.sh);
# the default keeps the original "latest" behavior.
# NOTE(review): the installer is fetched without checksum verification —
# consider validating against the SHA256 published for the pinned release.
ARG MINICONDA_INSTALLER=Miniconda3-latest-Linux-x86_64.sh
RUN wget --quiet https://repo.anaconda.com/miniconda/${MINICONDA_INSTALLER} -O miniconda.sh && \
    /bin/bash miniconda.sh -b -p $CONDA_DIR && \
    rm miniconda.sh && \
    conda clean --all --yes && \
    conda config --set auto_activate_base false && \
    conda config --add channels conda-forge
# Accept Conda's Terms of Service for the default Anaconda channels so the
# non-interactive `conda env create` below does not abort on the ToS prompt.
RUN . $CONDA_DIR/etc/profile.d/conda.sh && \
    for channel in https://repo.anaconda.com/pkgs/main https://repo.anaconda.com/pkgs/r; do \
        conda tos accept --override-channels --channel "$channel" || exit 1; \
    done
# Copy ONLY the environment spec first, so the expensive `conda env create`
# layer is cached and only rebuilt when the YAML itself changes — previously
# `COPY . /app` came first and any source edit invalidated the env layer.
COPY cosmos-predict1.yaml /app/
# Create the Conda environment named 'cosmos-predict1' from the YAML file.
RUN conda env create -f cosmos-predict1.yaml
# Now bring in the rest of the project files (cheap layer, changes often).
COPY . /app
# Make 'cosmos-predict1' the effective default environment by advertising it
# and putting its bin directory first on PATH.
ENV CONDA_DEFAULT_ENV=cosmos-predict1
ENV PATH=$CONDA_DIR/envs/cosmos-predict1/bin:$PATH
# Install a pinned PyTorch stack from the CUDA 12.1 wheel index. The cu121
# wheels are used on the 12.4 base image here; pip caching is disabled to
# keep the layer small.
RUN . $CONDA_DIR/etc/profile.d/conda.sh && \
    conda activate cosmos-predict1 && \
    pip install --no-cache-dir --index-url https://download.pytorch.org/whl/cu121 \
        torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1
# --- Patch Transformer Engine header lookup by symlinking NVIDIA headers ---
# Must run AFTER conda env creation and BEFORE the transformer-engine install.
# Fixes over the previous version:
#  * `mkdir -p` — `ln` with multiple sources requires the final argument to be
#    an EXISTING directory; $CONDA_PREFIX/include/python3.10 may not exist yet.
#  * the loop skips unmatched glob patterns — /bin/sh passes the literal
#    pattern through when nothing matches, which made `ln` fail outright.
RUN . $CONDA_DIR/etc/profile.d/conda.sh && \
    conda activate cosmos-predict1 && \
    echo "Creating symlinks for NVIDIA headers..." && \
    mkdir -p "$CONDA_PREFIX/include/python3.10" && \
    for hdr in $CONDA_PREFIX/lib/python3.10/site-packages/nvidia/*/include/*; do \
        [ -e "$hdr" ] || continue; \
        ln -sf "$hdr" "$CONDA_PREFIX/include/"; \
        ln -sf "$hdr" "$CONDA_PREFIX/include/python3.10/"; \
    done
# --- END header patch ---
# Install Transformer Engine after PyTorch/cuDNN are in place and the headers
# above are linked. --no-build-isolation makes the source build reuse the
# already-installed torch instead of creating a separate heavyweight build
# environment (which was OOM-killing the build with exit code 137).
# The requirement is quoted so /bin/sh cannot glob-expand the `[pytorch]`
# extras bracket against files in /app; the dead commented-out line that
# previously sat inside this RUN continuation has been removed.
RUN . $CONDA_DIR/etc/profile.d/conda.sh && \
    conda activate cosmos-predict1 && \
    pip install --no-cache-dir --no-build-isolation "transformer-engine[pytorch]==1.12.0"
# --- Verification Steps ---
# Any RUN that exits non-zero fails the whole build, so a separate
# `[ $? -eq 0 ] || echo ...` step is meaningless — each RUN starts a fresh
# shell where $? is always 0. That dead check has been removed.
RUN echo "Verifying Python and Conda installations..." && \
    python --version && \
    conda env list
# Heredoc RUN requires BuildKit (dockerfile syntax >= 1.4).
# The Python body below is properly indented — the previous version had the
# `if`/`else` branches at column 0, which is an IndentationError at build.
# NOTE: GPUs are not visible during `docker build`, so CUDA is expected to
# report as unavailable here; this step only proves torch imports cleanly.
RUN echo "Verifying PyTorch and CUDA availability..."
RUN conda run -n cosmos-predict1 python <<EOF
import torch
print('PyTorch Version: ' + torch.__version__)
print('CUDA Available: ' + str(torch.cuda.is_available()))
if torch.cuda.is_available():
    print('CUDA Device Name: ' + torch.cuda.get_device_name(0))
else:
    print('CUDA Device Name: N/A')
EOF
# --- End Verification Steps ---
# Ensure the entrypoint script is executable (exec-form RUN: no shell needed).
RUN ["chmod", "+x", "/app/start.sh"]
# Default command when the container starts; exec form so start.sh runs as
# PID 1 and receives SIGTERM directly from `docker stop`.
# NOTE(review): the container still runs as root — confirm whether start.sh
# requires it; otherwise consider adding a USER directive before CMD.
CMD ["/app/start.sh"]