Spaces:

NTUST-DDRC
/

gen3c

Build error

App Files Files Community

elungky committed on Jul 22

Commit

cf41009

1 Parent(s): f305527

Attempt to fix libcudnn.so.9 error by installing cudnn via conda and transformer_engine separately

Browse files

Files changed (2) hide show

Dockerfile +12 -18
cosmos-predict1.yaml +2 -2

Dockerfile CHANGED Viewed

@@ -1,28 +1,22 @@
 # Start from a clean NVIDIA CUDA base image.
-# Using 12.4.0-devel-ubuntu22.04 to align with the CUDA version specified in your cosmos-predict1.yaml.
 FROM nvidia/cuda:12.4.0-devel-ubuntu22.04
 # Set environment variables for non-interactive installations to prevent prompts during apt-get.
 ENV DEBIAN_FRONTEND=noninteractive
-# Define the base directory for Conda installation.
 ENV CONDA_DIR=/opt/conda
-# Add Conda's binary directory to the system's PATH.
 ENV PATH=$CONDA_DIR/bin:$PATH
-# Set the working directory inside the container. All subsequent commands will run from here.
 WORKDIR /app
-# Install essential system dependencies required for Miniconda and general build processes.
-# This includes wget for downloading, git for cloning (if needed), build-essential for compiling,
-# and libgl1-mesa-glx for graphics-related libraries often used by ML frameworks.
-# NEW: Added libglib2.0-0 for cv2 dependency
 RUN apt-get update && apt-get install -y --no-install-recommends \
     wget \
     git \
     build-essential \
     libgl1-mesa-glx \
-    libglib2.0-0 \
-    # Clean up apt cache to reduce image size
     && rm -rf /var/lib/apt/lists/*
 # Install Miniconda
@@ -34,23 +28,18 @@ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86
     conda config --add channels conda-forge
 # Accept Conda Terms of Service for default channels.
-# We use '. ' (dot space) instead of 'source' as /bin/sh is the default shell.
 RUN . $CONDA_DIR/etc/profile.d/conda.sh && \
     conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main && \
     conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r
 # Copy all local project files into the container's working directory (/app).
-# This includes your cosmos-predict1.yaml, gui/requirements.txt, start.sh, etc.
 COPY . /app
 # Create the Conda environment named 'cosmos-predict1' using the provided YAML file.
-# This step will install all specified Python and pip dependencies (excluding PyTorch/TorchVision).
 RUN conda env create -f cosmos-predict1.yaml
 # Set the default Conda environment to be activated.
 ENV CONDA_DEFAULT_ENV=cosmos-predict1
-# Add the newly created Conda environment's binary directory to the PATH.
-# This ensures that executables (like python, pip, uvicorn) from this environment are found.
 ENV PATH=$CONDA_DIR/envs/cosmos-predict1/bin:$PATH
 # Install PyTorch and TorchVision via pip with specific CUDA index.
@@ -62,12 +51,19 @@ RUN . $CONDA_DIR/etc/profile.d/conda.sh && \
         torchaudio==2.3.1 \
         --index-url https://download.pytorch.org/whl/cu121
 # --- Verification Steps ---
 RUN echo "Verifying Python and Conda installations..."
 RUN python --version
 RUN conda env list
 RUN echo "Verifying PyTorch and CUDA availability..."
-# Use a heredoc for multi-line Python code
 RUN conda run -n cosmos-predict1 python <<EOF
 import torch
 print('PyTorch Version: ' + torch.__version__)
@@ -77,7 +73,6 @@ if torch.cuda.is_available():
 else:
     print('CUDA Device Name: N/A')
 EOF
-# Add the fallback echo if the python command fails
 RUN [ $? -eq 0 ] || echo "PyTorch verification failed. Check dependencies in cosmos-predict1.yaml."
 # --- End Verification Steps ---
@@ -85,5 +80,4 @@ RUN [ $? -eq 0 ] || echo "PyTorch verification failed. Check dependencies in cos
 RUN chmod +x /app/start.sh
 # Set the default command to run when the container starts.
-# This will execute your start.sh script.
 CMD ["/app/start.sh"]

 # Base image: NVIDIA CUDA 12.4.0 development image on Ubuntu 22.04.
 FROM nvidia/cuda:12.4.0-devel-ubuntu22.04
 # Build-time only: keeps apt-get from prompting during RUN steps. Declared as ARG
 # rather than ENV so the setting does not leak into the running container.
 ARG DEBIAN_FRONTEND=noninteractive
 # Conda installation prefix. PATH is extended in a separate ENV instruction because a
 # variable set by an ENV instruction can only be expanded by later instructions.
 ENV CONDA_DIR=/opt/conda
 ENV PATH=$CONDA_DIR/bin:$PATH
 # Working directory for the instructions that follow.
 WORKDIR /app
+# System packages: wget and git for fetching, build-essential for compiling, and the
+# libgl1-mesa-glx / libglib2.0-0 shared libraries.
+# Extra libraries needed by transformer_engine or other deep-learning packages (for
+# example libstdc++6 or libgomp1) can be appended to this list, though they are usually
+# already covered by the base image or build-essential.
 RUN apt-get update \
     && apt-get install -y --no-install-recommends \
         build-essential \
         git \
         libgl1-mesa-glx \
+        libglib2.0-0 \
         wget \
     && rm -rf /var/lib/apt/lists/*
 # Install Miniconda
     conda config --add channels conda-forge
 # Accept the Conda Terms of Service for both default Anaconda channels (main and r).
 # conda.sh is sourced with '.' so the step works under the default /bin/sh.
 RUN . $CONDA_DIR/etc/profile.d/conda.sh \
     && for channel in main r; do \
            conda tos accept --override-channels \
                --channel "https://repo.anaconda.com/pkgs/${channel}" || exit; \
        done
 # Copy the project files (including cosmos-predict1.yaml and start.sh) into /app.
 # NOTE(review): copying the whole build context before the environment is created means
 # any source change invalidates the cached environment layer. Copying only
 # cosmos-predict1.yaml first would avoid that, provided the YAML references no other
 # local files — confirm before reordering.
 COPY . /app
 # Create the Conda environment from the YAML file, then remove Conda's package caches
 # and tarballs in the same layer so they do not end up in the image.
 RUN conda env create -f cosmos-predict1.yaml \
     && conda clean --all --yes
 # Make 'cosmos-predict1' the default environment: record its name and put its bin
 # directory first on PATH so python and pip resolve to the environment's executables.
 ENV CONDA_DEFAULT_ENV=cosmos-predict1
 ENV PATH=$CONDA_DIR/envs/cosmos-predict1/bin:$PATH
 # Install PyTorch and TorchVision via pip with specific CUDA index.
         torchaudio==2.3.1 \
         --index-url https://download.pytorch.org/whl/cu121
+# --- Install Transformer Engine separately, after PyTorch and cuDNN are in place ---
+# Installing it in its own step (rather than through the pip section of
+# cosmos-predict1.yaml) is intended to let it find the CUDA/cuDNN libraries.
+# The requirement is quoted so the shell can never treat "[pytorch]" as a glob pattern.
+# NOTE(review): presumably the conda-provided cuDNN ends up under the environment's lib
+# directory — confirm the dynamic loader can find libcudnn.so.9 there at runtime.
+RUN . $CONDA_DIR/etc/profile.d/conda.sh \
+    && conda activate cosmos-predict1 \
+    && pip install --no-cache-dir "transformer-engine[pytorch]==1.12.0"
+# --- End Transformer Engine install ---
 # --- Verification Steps ---
 # Basic sanity checks run in a single layer: print the Python version and list the
 # Conda environments. `set -e` aborts the build as soon as one of the commands fails.
 # The final echo announces the PyTorch/CUDA check performed by the next instruction.
 RUN set -e; \
     echo "Verifying Python and Conda installations..."; \
     python --version; \
     conda env list; \
     echo "Verifying PyTorch and CUDA availability..."
 RUN conda run -n cosmos-predict1 python <<EOF
 import torch
 print('PyTorch Version: ' + torch.__version__)
 else:
     print('CUDA Device Name: N/A')
 EOF
 # No separate exit-status check is needed here: every RUN starts a fresh shell, so `$?`
 # would always be 0, and a failing verification RUN already aborts the build.
 # If the PyTorch verification fails, check the dependencies in cosmos-predict1.yaml.
 # --- End Verification Steps ---
 # Make the startup script under /app executable.
 RUN chmod +x /app/start.sh
 # Default command when the container starts: run the startup script.
 # Exec (JSON-array) form, so the script is launched directly rather than via `/bin/sh -c`.
 CMD ["/app/start.sh"]

cosmos-predict1.yaml CHANGED Viewed

@@ -15,7 +15,7 @@ dependencies:
   # - torchvision=0.18.1
   # - torchaudio=2.3.1
   # - pytorch-cuda=12.1
-  # - cudnn
   # - libcublas
   - pip:
@@ -31,5 +31,5 @@ dependencies:
       - megatron.core
       - attrs
       - iopath
-      - transformer_engine
       # PyTorch and TorchVision will be installed separately in the Dockerfile

   # - torchvision=0.18.1
   # - torchaudio=2.3.1
   # - pytorch-cuda=12.1
+  - cudnn
   # - libcublas
   - pip:
       - megatron.core
       - attrs
       - iopath
+      # - transformer_engine  (installed separately in the Dockerfile, after PyTorch)
       # PyTorch and TorchVision will be installed separately in the Dockerfile