elungky committed on
Commit
cf41009
·
1 Parent(s): f305527

Attempt to fix libcudnn.so.9 error by installing cudnn via conda and transformer_engine separately

Browse files
Files changed (2) hide show
  1. Dockerfile +12 -18
  2. cosmos-predict1.yaml +2 -2
Dockerfile CHANGED
@@ -1,28 +1,22 @@
1
  # Start from a clean NVIDIA CUDA base image.
2
- # Using 12.4.0-devel-ubuntu22.04 to align with the CUDA version specified in your cosmos-predict1.yaml.
3
  FROM nvidia/cuda:12.4.0-devel-ubuntu22.04
4
 
5
  # Set environment variables for non-interactive installations to prevent prompts during apt-get.
6
  ENV DEBIAN_FRONTEND=noninteractive
7
- # Define the base directory for Conda installation.
8
  ENV CONDA_DIR=/opt/conda
9
- # Add Conda's binary directory to the system's PATH.
10
  ENV PATH=$CONDA_DIR/bin:$PATH
11
 
12
- # Set the working directory inside the container. All subsequent commands will run from here.
13
  WORKDIR /app
14
 
15
- # Install essential system dependencies required for Miniconda and general build processes.
16
- # This includes wget for downloading, git for cloning (if needed), build-essential for compiling,
17
- # and libgl1-mesa-glx for graphics-related libraries often used by ML frameworks.
18
- # NEW: Added libglib2.0-0 for cv2 dependency
19
  RUN apt-get update && apt-get install -y --no-install-recommends \
20
  wget \
21
  git \
22
  build-essential \
23
  libgl1-mesa-glx \
24
- libglib2.0-0 \
25
- # Clean up apt cache to reduce image size
 
26
  && rm -rf /var/lib/apt/lists/*
27
 
28
  # Install Miniconda
@@ -34,23 +28,18 @@ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86
34
  conda config --add channels conda-forge
35
 
36
  # Accept Conda Terms of Service for default channels.
37
- # We use '. ' (dot space) instead of 'source' as /bin/sh is the default shell.
38
  RUN . $CONDA_DIR/etc/profile.d/conda.sh && \
39
  conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main && \
40
  conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r
41
 
42
  # Copy all local project files into the container's working directory (/app).
43
- # This includes your cosmos-predict1.yaml, gui/requirements.txt, start.sh, etc.
44
  COPY . /app
45
 
46
  # Create the Conda environment named 'cosmos-predict1' using the provided YAML file.
47
- # This step will install all specified Python and pip dependencies (excluding PyTorch/TorchVision).
48
  RUN conda env create -f cosmos-predict1.yaml
49
 
50
  # Set the default Conda environment to be activated.
51
  ENV CONDA_DEFAULT_ENV=cosmos-predict1
52
- # Add the newly created Conda environment's binary directory to the PATH.
53
- # This ensures that executables (like python, pip, uvicorn) from this environment are found.
54
  ENV PATH=$CONDA_DIR/envs/cosmos-predict1/bin:$PATH
55
 
56
  # Install PyTorch and TorchVision via pip with specific CUDA index.
@@ -62,12 +51,19 @@ RUN . $CONDA_DIR/etc/profile.d/conda.sh && \
62
  torchaudio==2.3.1 \
63
  --index-url https://download.pytorch.org/whl/cu121
64
 
 
 
 
 
 
 
 
 
65
  # --- Verification Steps ---
66
  RUN echo "Verifying Python and Conda installations..."
67
  RUN python --version
68
  RUN conda env list
69
  RUN echo "Verifying PyTorch and CUDA availability..."
70
- # Use a heredoc for multi-line Python code
71
  RUN conda run -n cosmos-predict1 python <<EOF
72
  import torch
73
  print('PyTorch Version: ' + torch.__version__)
@@ -77,7 +73,6 @@ if torch.cuda.is_available():
77
  else:
78
  print('CUDA Device Name: N/A')
79
  EOF
80
- # Add the fallback echo if the python command fails
81
  RUN [ $? -eq 0 ] || echo "PyTorch verification failed. Check dependencies in cosmos-predict1.yaml."
82
  # --- End Verification Steps ---
83
 
@@ -85,5 +80,4 @@ RUN [ $? -eq 0 ] || echo "PyTorch verification failed. Check dependencies in cos
85
  RUN chmod +x /app/start.sh
86
 
87
  # Set the default command to run when the container starts.
88
- # This will execute your start.sh script.
89
  CMD ["/app/start.sh"]
 
1
  # Start from a clean NVIDIA CUDA base image.
 
2
  FROM nvidia/cuda:12.4.0-devel-ubuntu22.04
3
 
4
  # Set environment variables for non-interactive installations to prevent prompts during apt-get.
5
  ENV DEBIAN_FRONTEND=noninteractive
 
6
  ENV CONDA_DIR=/opt/conda
 
7
  ENV PATH=$CONDA_DIR/bin:$PATH
8
 
 
9
  WORKDIR /app
10
 
11
+ # Install the system packages needed to download and run the Miniconda installer, plus general build tools and the graphics/GLib runtime libraries.
 
 
 
12
  RUN apt-get update && apt-get install -y --no-install-recommends \
13
  wget \
14
  git \
15
  build-essential \
16
  libgl1-mesa-glx \
17
+ libglib2.0-0 \
18
+ # Placeholder: add any further system libraries required by transformer_engine or other deep-learning packages here,
19
+ # e.g. libstdc++6 or libgomp1 (these are usually already provided by the base image or build-essential).
20
  && rm -rf /var/lib/apt/lists/*
21
 
22
  # Install Miniconda
 
28
  conda config --add channels conda-forge
29
 
30
  # Accept Conda Terms of Service for default channels.
 
31
  RUN . $CONDA_DIR/etc/profile.d/conda.sh && \
32
  conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main && \
33
  conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r
34
 
35
  # Copy all local project files into the container's working directory (/app).
 
36
  COPY . /app
37
 
38
  # Create the Conda environment named 'cosmos-predict1' using the provided YAML file.
 
39
  RUN conda env create -f cosmos-predict1.yaml
40
 
41
  # Record 'cosmos-predict1' as the default Conda environment and put its bin directory first on PATH so its executables are found first.
42
  ENV CONDA_DEFAULT_ENV=cosmos-predict1
 
 
43
  ENV PATH=$CONDA_DIR/envs/cosmos-predict1/bin:$PATH
44
 
45
  # Install PyTorch, TorchVision, and TorchAudio via pip from the CUDA 12.1 (cu121) wheel index.
 
51
  torchaudio==2.3.1 \
52
  --index-url https://download.pytorch.org/whl/cu121
53
 
54
+ # --- NEW: Install Transformer Engine separately after PyTorch and cuDNN are in place ---
55
+ # This ensures it can find the necessary CUDA/cuDNN libraries.
56
+ # Note: git is already included in the apt-get install step above, so no further change is needed there.
57
+ # Install Transformer Engine inside the cosmos-predict1 environment; the requirement is quoted so the shell cannot glob-expand the [pytorch] extra.
+ RUN . $CONDA_DIR/etc/profile.d/conda.sh && \
58
+ conda activate cosmos-predict1 && \
59
+ pip install --no-cache-dir "transformer-engine[pytorch]==1.12.0"
60
+ # --- END NEW ---
61
+
62
  # --- Verification Steps ---
63
  RUN echo "Verifying Python and Conda installations..."
64
  RUN python --version
65
  RUN conda env list
66
  RUN echo "Verifying PyTorch and CUDA availability..."
 
67
  RUN conda run -n cosmos-predict1 python <<EOF
68
  import torch
69
  print('PyTorch Version: ' + torch.__version__)
 
73
  else:
74
  print('CUDA Device Name: N/A')
75
  EOF
 
76
  RUN [ $? -eq 0 ] || echo "PyTorch verification failed. Check dependencies in cosmos-predict1.yaml."
77
  # --- End Verification Steps ---
78
 
 
80
  RUN chmod +x /app/start.sh
81
 
82
  # Set the default command to run when the container starts.
 
83
  CMD ["/app/start.sh"]
cosmos-predict1.yaml CHANGED
@@ -15,7 +15,7 @@ dependencies:
15
  # - torchvision=0.18.1
16
  # - torchaudio=2.3.1
17
  # - pytorch-cuda=12.1
18
- # - cudnn
19
  # - libcublas
20
 
21
  - pip:
@@ -31,5 +31,5 @@ dependencies:
31
  - megatron.core
32
  - attrs
33
  - iopath
34
- - transformer_engine
35
  # PyTorch and TorchVision will be installed separately in the Dockerfile
 
15
  # - torchvision=0.18.1
16
  # - torchaudio=2.3.1
17
  # - pytorch-cuda=12.1
18
+ - cudnn
19
  # - libcublas
20
 
21
  - pip:
 
31
  - megatron.core
32
  - attrs
33
  - iopath
34
+ #- transformer_engine
35
  # PyTorch and TorchVision will be installed separately in the Dockerfile