elungky committed on
Commit
a27c594
·
1 Parent(s): e0e9267

Fix cudnn.h not found error by copying to Conda env include path and setting CUDA_HOME

Browse files
Files changed (1) hide show
  1. Dockerfile +26 -15
Dockerfile CHANGED
@@ -20,27 +20,23 @@ RUN apt-get update -y && apt-get install -qqy \
20
  libbz2-dev libreadline-dev libsqlite3-dev curl llvm \
21
  libncursesw5-dev xz-utils tk-dev libxml2-dev libxmlsec1-dev libffi-dev liblzma-dev \
22
  ffmpeg libsm6 libxext6 cmake libmagickwand-dev \
23
- # Ensure git-lfs is installed and initialized
24
  git-lfs \
25
  && rm -rf /var/lib/apt/lists/* \
26
- && git lfs install # Initialize LFS system-wide
27
 
28
  # Install Miniconda
29
  RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh && \
30
  /bin/bash miniconda.sh -b -p $CONDA_DIR && \
31
  rm miniconda.sh && \
32
- # Add Conda to PATH for subsequent commands in this RUN layer
33
  export PATH=$CONDA_DIR/bin:$PATH && \
34
  conda clean --all --yes && \
35
  conda config --set auto_activate_base false && \
36
  conda config --add channels conda-forge
37
 
38
- # NEW: Set the global PATH for Conda's base environment immediately after installation.
39
- # This ensures 'conda' command is available for subsequent RUN commands.
40
  ENV PATH=$CONDA_DIR/bin:$PATH
41
 
42
  # Accept Conda Terms of Service for default channels.
43
- # Now 'conda' command should be found.
44
  RUN . $CONDA_DIR/etc/profile.d/conda.sh && \
45
  conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main && \
46
  conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r
@@ -49,7 +45,6 @@ RUN . $CONDA_DIR/etc/profile.d/conda.sh && \
49
  COPY . /app
50
 
51
  # Create the Conda environment named 'cosmos-predict1' using the provided YAML file.
52
- # Now 'conda' command should be found.
53
  RUN conda env create -f cosmos-predict1.yaml
54
 
55
  # Set the default Conda environment to be activated and update PATH (for subsequent layers and runtime)
@@ -65,15 +60,32 @@ RUN . $CONDA_DIR/etc/profile.d/conda.sh && \
65
  torchaudio==2.3.1 \
66
  --index-url https://download.pytorch.org/whl/cu121
67
 
68
- # IMPORTANT: Symlink fix for Transformer Engine compilation.
69
- ENV CONDA_PREFIX_FIX=/usr/local/cuda
70
- RUN ln -sf $CONDA_PREFIX_FIX/lib/python3.10/site-packages/nvidia/*/include/* $CONDA_PREFIX_FIX/include/ || true && \
71
- ln -sf $CONDA_PREFIX_FIX/lib/python3.10/site-packages/nvidia/*/include/* $CONDA_PREFIX_FIX/include/python3.10 || true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
 
73
- # Install Transformer Engine by attempting to compile it, relying on the robust build environment.
 
74
  RUN . $CONDA_DIR/etc/profile.d/conda.sh && \
75
  conda activate cosmos-predict1 && \
76
- pip install --no-cache-dir --no-build-isolation "transformer-engine[pytorch]==1.12.0"
77
 
78
  # Install Apex for inference.
79
  RUN . $CONDA_DIR/etc/profile.d/conda.sh && \
@@ -89,7 +101,7 @@ RUN . $CONDA_DIR/etc/profile.d/conda.sh && \
89
  # Make the start.sh script executable.
90
  RUN chmod +x /app/start.sh
91
 
92
- # --- Verification Steps ---
93
  RUN echo "Verifying Python and Conda installations..."
94
  RUN python --version
95
  RUN conda env list
@@ -104,7 +116,6 @@ else:
104
  print('CUDA Device Name: N/A')
105
  EOF
106
  RUN [ $? -eq 0 ] || echo "PyTorch verification failed. Check dependencies in cosmos-predict1.yaml."
107
- # --- End Verification Steps ---
108
 
109
  # Set the default command to run when the container starts.
110
  CMD ["/app/start.sh"]
 
20
  libbz2-dev libreadline-dev libsqlite3-dev curl llvm \
21
  libncursesw5-dev xz-utils tk-dev libxml2-dev libxmlsec1-dev libffi-dev liblzma-dev \
22
  ffmpeg libsm6 libxext6 cmake libmagickwand-dev \
 
23
  git-lfs \
24
  && rm -rf /var/lib/apt/lists/* \
25
+ && git lfs install
26
 
27
  # Install Miniconda
28
  RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh && \
29
  /bin/bash miniconda.sh -b -p $CONDA_DIR && \
30
  rm miniconda.sh && \
 
31
  export PATH=$CONDA_DIR/bin:$PATH && \
32
  conda clean --all --yes && \
33
  conda config --set auto_activate_base false && \
34
  conda config --add channels conda-forge
35
 
36
+ # Set the global PATH for Conda's base environment immediately after installation.
 
37
  ENV PATH=$CONDA_DIR/bin:$PATH
38
 
39
  # Accept Conda Terms of Service for default channels.
 
40
  RUN . $CONDA_DIR/etc/profile.d/conda.sh && \
41
  conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main && \
42
  conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r
 
45
  COPY . /app
46
 
47
  # Create the Conda environment named 'cosmos-predict1' using the provided YAML file.
 
48
  RUN conda env create -f cosmos-predict1.yaml
49
 
50
  # Set the default Conda environment to be activated and update PATH (for subsequent layers and runtime)
 
60
  torchaudio==2.3.1 \
61
  --index-url https://download.pytorch.org/whl/cu121
62
 
63
+ # Ensure cudnn.h is available in the activated Conda environment's include path ($CONDA_PREFIX/include).
64
+ # The base image is expected to provide cudnn.h at /usr/local/cuda/include (the cp below fails the build otherwise).
65
+ # Copy it only when the environment does not already have one; the check must use $CONDA_PREFIX, set by 'conda activate'.
66
+ RUN . $CONDA_DIR/etc/profile.d/conda.sh && \
67
+ conda activate cosmos-predict1 && \
68
+ if [ ! -f "$CONDA_PREFIX/include/cudnn.h" ]; then \
69
+ echo "cudnn.h not found in Conda environment, copying from /usr/local/cuda/include"; \
70
+ mkdir -p "$CONDA_PREFIX/include" && \
71
+ cp /usr/local/cuda/include/cudnn.h "$CONDA_PREFIX/include/"; \
72
+ else \
73
+ echo "cudnn.h already present in Conda environment"; \
74
+ fi
75
+
76
+ # IMPORTANT: Symlink fix for Transformer Engine compilation (from INSTALL.md).
77
+ # These symlinks are for other NVIDIA headers that might be in Python site-packages.
78
+ # $CONDA_PREFIX is the current activated Conda environment.
79
+ RUN . $CONDA_DIR/etc/profile.d/conda.sh && \
80
+ conda activate cosmos-predict1 && \
81
+ ln -sf "$CONDA_PREFIX"/lib/python3.10/site-packages/nvidia/*/include/* "$CONDA_PREFIX"/include/ || true && \
82
+ ln -sf "$CONDA_PREFIX"/lib/python3.10/site-packages/nvidia/*/include/* "$CONDA_PREFIX"/include/python3.10 || true
83
 
84
+ # Install Transformer Engine by attempting to compile it.
85
+ # Pass CUDA_HOME explicitly to ensure it looks in the correct Conda environment path.
86
  RUN . $CONDA_DIR/etc/profile.d/conda.sh && \
87
  conda activate cosmos-predict1 && \
88
+ CUDA_HOME=$CONDA_PREFIX pip install --no-cache-dir --no-build-isolation "transformer-engine[pytorch]==1.12.0"
89
 
90
  # Install Apex for inference.
91
  RUN . $CONDA_DIR/etc/profile.d/conda.sh && \
 
101
  # Make the start.sh script executable.
102
  RUN chmod +x /app/start.sh
103
 
104
+ # Verification Steps
105
  RUN echo "Verifying Python and Conda installations..."
106
  RUN python --version
107
  RUN conda env list
 
116
  print('CUDA Device Name: N/A')
117
  EOF
118
  RUN [ $? -eq 0 ] || echo "PyTorch verification failed. Check dependencies in cosmos-predict1.yaml."
 
119
 
120
  # Set the default command to run when the container starts.
121
  CMD ["/app/start.sh"]