elungky committed on
Commit
62d1e04
·
1 Parent(s): cf41009

Fix cudnn.h not found during Transformer Engine build by adding symlinks as per INSTALL.md

Browse files
Files changed (1) hide show
  1. Dockerfile +10 -6
Dockerfile CHANGED
@@ -15,8 +15,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
15
  build-essential \
16
  libgl1-mesa-glx \
17
  libglib2.0-0 \
18
- # Add any other system libraries that might be needed by transformer_engine or other deep learning libs
19
- # For example, libstdc++6, libgomp1, etc., though usually covered by base image or build-essential
20
  && rm -rf /var/lib/apt/lists/*
21
 
22
  # Install Miniconda
@@ -51,13 +49,19 @@ RUN . $CONDA_DIR/etc/profile.d/conda.sh && \
51
  torchaudio==2.3.1 \
52
  --index-url https://download.pytorch.org/whl/cu121
53
 
54
- # --- NEW: Install Transformer Engine separately after PyTorch and cuDNN are in place ---
55
- # This ensures it can find the necessary CUDA/cuDNN libraries.
56
- # Also add git to apt-get install above if not already there.
 
 
 
 
 
 
 
57
  RUN . $CONDA_DIR/etc/profile.d/conda.sh && \
58
  conda activate cosmos-predict1 && \
59
  pip install --no-cache-dir transformer-engine[pytorch]==1.12.0
60
- # --- END NEW ---
61
 
62
  # --- Verification Steps ---
63
  RUN echo "Verifying Python and Conda installations..."
 
15
  build-essential \
16
  libgl1-mesa-glx \
17
  libglib2.0-0 \
 
 
18
  && rm -rf /var/lib/apt/lists/*
19
 
20
  # Install Miniconda
 
49
  torchaudio==2.3.1 \
50
  --index-url https://download.pytorch.org/whl/cu121
51
 
52
+ # --- NEW SECTION: Fix "cudnn.h not found" during the Transformer Engine build by symlinking the pip-installed NVIDIA headers into the conda env's include directories ---
53
+ # These commands must run AFTER conda env creation and BEFORE transformer-engine pip install.
54
+ RUN . $CONDA_DIR/etc/profile.d/conda.sh && \
55
+ conda activate cosmos-predict1 && \
56
+ echo "Creating symlinks for NVIDIA headers..." && \
57
+ ln -sf $CONDA_PREFIX/lib/python3.10/site-packages/nvidia/*/include/* $CONDA_PREFIX/include/ && \
58
+ ln -sf $CONDA_PREFIX/lib/python3.10/site-packages/nvidia/*/include/* $CONDA_PREFIX/include/python3.10/
59
+ # --- END NEW SECTION ---
60
+
61
+ # Install Transformer Engine separately after PyTorch and cuDNN are in place and headers are linked.
62
  RUN . $CONDA_DIR/etc/profile.d/conda.sh && \
63
  conda activate cosmos-predict1 && \
64
  pip install --no-cache-dir transformer-engine[pytorch]==1.12.0
 
65
 
66
  # --- Verification Steps ---
67
  RUN echo "Verifying Python and Conda installations..."