Spaces:
Build error
Build error
File size: 5,233 Bytes
9679875 0287daf 5fa8a70 ea133b8 49860eb 9679875 5fa8a70 cbe7167 5fa8a70 cf41009 9679875 a27c594 5fa8a70 e0e9267 5fa8a70 193cb9f 5fa8a70 a27c594 e0e9267 e5064c2 59d6df8 ca59c13 5fa8a70 ea133b8 26ccbe7 5fa8a70 193cb9f 5fa8a70 d6f9440 cbe7167 8da02bd a27c594 8da02bd a27c594 8da02bd a27c594 8da02bd a27c594 fb926f5 a27c594 cf41009 a27c594 9679875 26ccbe7 5fa8a70 598f651 8d604f9 a27c594 193cb9f 5fa8a70 8d604f9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 |
# Base image: NVIDIA CUDA 12.4.1 development image on Ubuntu 22.04 with cuDNN included.
FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04
# Build-time only: stops apt-get from prompting during package installation.
# Declared as ARG (not ENV) so the setting is visible to RUN steps in this stage
# but is not baked into the environment of running containers.
ARG DEBIAN_FRONTEND=noninteractive
# Installation prefix for Miniconda; referenced by later instructions.
ENV CONDA_DIR=/opt/conda
# Relative paths in the following instructions resolve against /app.
WORKDIR /app
# System packages: build toolchain, development headers, media/graphics
# libraries and Git LFS. The package list is sorted alphabetically, one per
# line, so additions and removals produce minimal diffs. The apt package index
# is removed in the same layer, then Git LFS hooks are initialised.
RUN apt-get update -y \
 && apt-get install -qqy \
        build-essential \
        cmake \
        curl \
        ffmpeg \
        git \
        git-lfs \
        libbz2-dev \
        libffi-dev \
        libgl1-mesa-glx \
        libglib2.0-0 \
        liblzma-dev \
        libmagickwand-dev \
        libncursesw5-dev \
        libreadline-dev \
        libsm6 \
        libsqlite3-dev \
        libssl-dev \
        libxext6 \
        libxml2-dev \
        libxmlsec1-dev \
        llvm \
        make \
        rsync \
        tk-dev \
        wget \
        xz-utils \
        zlib1g-dev \
 && rm -rf /var/lib/apt/lists/* \
 && git lfs install
# Miniconda: download the installer, run it in batch mode into $CONDA_DIR,
# delete the installer, then apply the initial conda configuration
# (no auto-activation of base, conda-forge added to the channel list).
# NOTE(review): the "latest" installer URL is unpinned, so rebuilds may pick up
# a different Miniconda release.
RUN wget --quiet -O miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
 && /bin/bash miniconda.sh -b -p "${CONDA_DIR}" \
 && rm miniconda.sh \
 && export PATH="${CONDA_DIR}/bin:${PATH}" \
 && conda clean --all --yes \
 && conda config --set auto_activate_base false \
 && conda config --add channels conda-forge
# Put Conda's base bin directory first on PATH for all later layers and at runtime.
ENV PATH="${CONDA_DIR}/bin:${PATH}"
# Accept the Conda Terms of Service for the default Anaconda channels
# (pkgs/main and pkgs/r); the step stops at the first channel that fails.
RUN . "${CONDA_DIR}/etc/profile.d/conda.sh" \
 && for channel in main r; do \
      conda tos accept --override-channels --channel "https://repo.anaconda.com/pkgs/${channel}" \
        || exit "$?"; \
    done
# Copy the whole build context into the working directory (/app).
# NOTE(review): because the full context is copied before the environment is
# created, any source change invalidates this and every later layer. Copying
# only cosmos-predict1.yaml first would cache better, provided the YAML does
# not reference other local files - confirm before reordering.
COPY . /app
# Create the 'cosmos-predict1' Conda environment from the YAML spec, then drop
# conda's package caches in the same layer so they do not add to the image size.
RUN conda env create -f cosmos-predict1.yaml \
 && conda clean --all --yes
# Mark 'cosmos-predict1' as the default Conda environment and put its bin
# directory ahead of everything else on PATH (later layers and runtime).
ENV CONDA_DEFAULT_ENV=cosmos-predict1 \
    PATH="${CONDA_DIR}/envs/cosmos-predict1/bin:${PATH}"
# Install pinned PyTorch, TorchVision and TorchAudio wheels into the
# cosmos-predict1 environment from the PyTorch cu121 wheel index.
RUN . "${CONDA_DIR}/etc/profile.d/conda.sh" \
 && conda activate cosmos-predict1 \
 && pip install --no-cache-dir \
        --index-url https://download.pytorch.org/whl/cu121 \
        torch==2.3.1 \
        torchaudio==2.3.1 \
        torchvision==0.18.1
# Locate cudnn.h under the environment's pip-installed NVIDIA packages and
# expose it as $CONDA_PREFIX/include/cudnn.h (symlink, falling back to a copy)
# so that the Transformer Engine build can find it.
# The build is aborted when no header is found.
RUN . "${CONDA_DIR}/etc/profile.d/conda.sh" \
 && conda activate cosmos-predict1 \
 && cudnn_header="$(find "${CONDA_PREFIX}/lib/python3.10/site-packages/nvidia/" -name "cudnn.h" | head -n 1)" \
 && if [ ! -f "${cudnn_header}" ]; then \
      echo "Error: cudnn.h not found in any expected location within Conda environment. This will likely cause compilation failures."; \
      exit 1; \
    fi \
 && echo "Found cudnn.h at: ${cudnn_header}" \
 && { mkdir -p "${CONDA_PREFIX}/include" \
      && ln -sf "${cudnn_header}" "${CONDA_PREFIX}/include/cudnn.h" \
      || cp "${cudnn_header}" "${CONDA_PREFIX}/include/cudnn.h"; }
# Symlink fix for Transformer Engine compilation (from INSTALL.md): link the
# headers shipped inside the pip-installed NVIDIA packages into the Conda
# environment's include directories.
# Linking itself is best-effort (a failure only prints a warning), but a failure
# to source conda.sh or to activate the environment now fails the build instead
# of being swallowed by a trailing "|| true".
RUN . "${CONDA_DIR}/etc/profile.d/conda.sh" \
 && conda activate cosmos-predict1 \
 && for dest in "${CONDA_PREFIX}/include" "${CONDA_PREFIX}/include/python3.10"; do \
      ln -sf "${CONDA_PREFIX}"/lib/python3.10/site-packages/nvidia/*/include/* "${dest}/" \
        || echo "Warning: could not link NVIDIA headers into ${dest}" >&2; \
    done
# Build and install Transformer Engine (PyTorch extra) inside the environment.
# CUDA_HOME is pointed at the active Conda prefix for this one command, and
# build isolation is disabled so the build uses the packages already installed.
RUN . "${CONDA_DIR}/etc/profile.d/conda.sh" \
 && conda activate cosmos-predict1 \
 && CUDA_HOME="${CONDA_PREFIX}" pip install \
        --no-cache-dir \
        --no-build-isolation \
        "transformer-engine[pytorch]==1.12.0"
# Apex (for inference): clone the NVIDIA repository into /app/apex and build it
# with the C++ and CUDA extensions enabled, using the Conda prefix as CUDA_HOME.
RUN . "${CONDA_DIR}/etc/profile.d/conda.sh" \
 && conda activate cosmos-predict1 \
 && git clone https://github.com/NVIDIA/apex /app/apex \
 && CUDA_HOME="${CONDA_PREFIX}" pip install \
        -v \
        --disable-pip-version-check \
        --no-cache-dir \
        --no-build-isolation \
        --config-settings "--build-option=--cpp_ext" \
        --config-settings "--build-option=--cuda_ext" \
        /app/apex
# MoGe (for inference): installed straight from the Microsoft GitHub repository
# into the cosmos-predict1 environment.
RUN . "${CONDA_DIR}/etc/profile.d/conda.sh" \
 && conda activate cosmos-predict1 \
 && pip install \
        --no-cache-dir \
        git+https://github.com/microsoft/MoGe.git
# Mark the container start script as executable (exec form: no shell needed).
RUN ["chmod", "+x", "/app/start.sh"]
# Verification steps.
# Print the Python version and the list of Conda environments, then announce
# the PyTorch check. Combined into one RUN so the messages do not each create
# their own image layer.
RUN echo "Verifying Python and Conda installations..." \
 && python --version \
 && conda env list \
 && echo "Verifying PyTorch and CUDA availability..."
# Import PyTorch inside the cosmos-predict1 environment and report whether CUDA
# is visible.
# --no-capture-output lets the heredoc on stdin reach the interpreter and
# streams its output to the build log.
# The failure message is attached to this same command: every RUN starts a
# fresh shell, so testing "$?" in a later RUN can never observe this exit status.
RUN conda run --no-capture-output -n cosmos-predict1 python <<EOF || { echo "PyTorch verification failed. Check dependencies in cosmos-predict1.yaml." >&2; exit 1; }
import torch
print('PyTorch Version: ' + torch.__version__)
print('CUDA Available: ' + str(torch.cuda.is_available()))
if torch.cuda.is_available():
    print('CUDA Device Name: ' + torch.cuda.get_device_name(0))
else:
    print('CUDA Device Name: N/A')
EOF
# Default command when the container starts: run the start script directly
# (exec form, so no shell wraps the process). The stray trailing "|" is removed
# because it made the JSON array invalid.
CMD ["/app/start.sh"]