elungky committed
Commit 9679875 · 1 Parent(s): 95d53e3

Merged Dockerfile with robust build environment for transformer-engine compilation
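With this change, transformer-engine is compiled from source during the image build instead of being installed from a pre-built wheel, so the image is built directly from the repository checkout. A minimal sketch of the build invocation, assuming the checkout contains this Dockerfile, cosmos-predict1.yaml, and start.sh (the tag is illustrative, not part of the commit):

# Build from the repository root; transformer-engine and Apex compile inside this step.
docker build -t cosmos-predict1:dev .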

Files changed (1): Dockerfile (+35 -31)
Dockerfile CHANGED
@@ -1,23 +1,31 @@
- # Start from a clean NVIDIA CUDA base image.
- FROM nvidia/cuda:12.4.0-devel-ubuntu22.04
+ # Adopt new base image with cuDNN pre-installed
+ FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04

  # Set environment variables for non-interactive installations to prevent prompts during apt-get.
  ENV DEBIAN_FRONTEND=noninteractive
  ENV CONDA_DIR=/opt/conda
- ENV PATH=$CONDA_DIR/bin:$PATH

  WORKDIR /app

- # Install essential system dependencies required for Miniconda and general build tools.
- RUN apt-get update && apt-get install -y --no-install-recommends \
+ # Install essential system dependencies from both Dockerfiles
+ RUN apt-get update -y && apt-get install -qqy \
  wget \
  git \
  build-essential \
  libgl1-mesa-glx \
  libglib2.0-0 \
- && rm -rf /var/lib/apt/lists/*
+ rsync \
+ make \
+ libssl-dev zlib1g-dev \
+ libbz2-dev libreadline-dev libsqlite3-dev curl llvm \
+ libncursesw5-dev xz-utils tk-dev libxml2-dev libxmlsec1-dev libffi-dev liblzma-dev \
+ ffmpeg libsm6 libxext6 cmake libmagickwand-dev \
+ # Ensure git-lfs is installed and initialized
+ git-lfs \
+ && rm -rf /var/lib/apt/lists/* \
+ && git lfs install # Initialize LFS system-wide

- # Install Miniconda
+ # Install Miniconda (retain our existing approach)
  RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh && \
  /bin/bash miniconda.sh -b -p $CONDA_DIR && \
  rm miniconda.sh && \
@@ -36,7 +44,7 @@ COPY . /app
  # Create the Conda environment named 'cosmos-predict1' using the provided YAML file.
  RUN conda env create -f cosmos-predict1.yaml

- # Set the default Conda environment to be activated.
+ # Set the default Conda environment to be activated and update PATH
  ENV CONDA_DEFAULT_ENV=cosmos-predict1
  ENV PATH=$CONDA_DIR/envs/cosmos-predict1/bin:$PATH

@@ -49,35 +57,31 @@ RUN . $CONDA_DIR/etc/profile.d/conda.sh && \
  torchaudio==2.3.1 \
  --index-url https://download.pytorch.org/whl/cu121

- # Copy the pre-built Transformer Engine wheel into the container
- # Ensure the filename matches your actual wheel file.
- COPY ./transformer_engine.whl /tmp/
+ # IMPORTANT: Symlink fix for Transformer Engine compilation.
+ # The `nvidia/cuda` base images place CUDA libraries and headers in /usr/local/cuda.
+ # We need to ensure that the build system can find cuDNN headers.
+ ENV CONDA_PREFIX_FIX=/usr/local/cuda
+ RUN ln -sf $CONDA_PREFIX_FIX/lib/python3.10/site-packages/nvidia/*/include/* $CONDA_PREFIX_FIX/include/ || true && \
+ ln -sf $CONDA_PREFIX_FIX/lib/python3.10/site-packages/nvidia/*/include/* $CONDA_PREFIX_FIX/include/python3.10 || true

- # Install Transformer Engine using the pre-built wheel
+ # Install Transformer Engine by attempting to compile it, relying on the robust build environment.
  RUN . $CONDA_DIR/etc/profile.d/conda.sh && \
  conda activate cosmos-predict1 && \
- pip install --no-cache-dir /tmp/transformer_engine.whl
+ pip install --no-cache-dir --no-build-isolation "transformer-engine[pytorch]==1.12.0"
+
+ # Install Apex for inference.
+ RUN . $CONDA_DIR/etc/profile.d/conda.sh && \
+ conda activate cosmos-predict1 && \
+ git clone https://github.com/NVIDIA/apex /app/apex && \
+ CUDA_HOME=$CONDA_PREFIX pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" /app/apex
+
+ # Install MoGe for inference.
+ RUN . $CONDA_DIR/etc/profile.d/conda.sh && \
+ conda activate cosmos-predict1 && \
+ pip install --no-cache-dir git+https://github.com/microsoft/MoGe.git

  # Make the start.sh script executable.
- # THIS IS A STANDALONE RUN COMMAND.
  RUN chmod +x /app/start.sh

- # --- Verification Steps ---
- RUN echo "Verifying Python and Conda installations..."
- RUN python --version
- RUN conda env list
- RUN echo "Verifying PyTorch and CUDA availability..."
- RUN conda run -n cosmos-predict1 python <<EOF
- import torch
- print('PyTorch Version: ' + torch.__version__)
- print('CUDA Available: ' + str(torch.cuda.is_available()))
- if torch.cuda.is_available():
- print('CUDA Device Name: ' + torch.cuda.get_device_name(0))
- else:
- print('CUDA Device Name: N/A')
- EOF
- RUN [ $? -eq 0 ] || echo "PyTorch verification failed. Check dependencies in cosmos-predict1.yaml."
- # --- End Verification Steps ---
-
  # Set the default command to run when the container starts.
  CMD ["/app/start.sh"]