File size: 5,233 Bytes
9679875
 
0287daf
5fa8a70
 
 
 
ea133b8
49860eb
9679875
 
5fa8a70
 
cbe7167
5fa8a70
cf41009
9679875
 
 
 
 
 
 
 
a27c594
5fa8a70
e0e9267
5fa8a70
 
 
193cb9f
5fa8a70
 
 
 
a27c594
e0e9267
 
e5064c2
59d6df8
ca59c13
 
 
5fa8a70
ea133b8
26ccbe7
5fa8a70
 
 
193cb9f
5fa8a70
 
 
d6f9440
cbe7167
 
 
 
 
 
 
 
8da02bd
 
a27c594
 
8da02bd
 
 
a27c594
8da02bd
 
a27c594
8da02bd
 
a27c594
 
 
 
 
 
 
 
fb926f5
a27c594
 
cf41009
 
a27c594
9679875
 
 
 
 
 
 
 
 
 
 
26ccbe7
5fa8a70
598f651
8d604f9
a27c594
193cb9f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5fa8a70
8d604f9
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# Base image: NVIDIA CUDA 12.4.1 development image with cuDNN pre-installed (Ubuntu 22.04).
FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04

# Suppress interactive prompts from apt-get during the build. Declared as ARG
# rather than ENV so the setting is visible to RUN instructions in this stage
# but is not baked into the environment of running containers.
ARG DEBIAN_FRONTEND=noninteractive

# Installation prefix for Miniconda; referenced by the later install, PATH and
# activation steps.
ENV CONDA_DIR=/opt/conda

# All following relative paths (and the container's start directory) resolve to /app.
WORKDIR /app

# System packages required to build and run the project, listed one per line in
# alphabetical order. The apt package index is removed in the same layer, and
# Git LFS is initialised once the packages are in place.
RUN apt-get update -y && \
    apt-get install -qq -y \
        build-essential \
        cmake \
        curl \
        ffmpeg \
        git \
        git-lfs \
        libbz2-dev \
        libffi-dev \
        libgl1-mesa-glx \
        libglib2.0-0 \
        liblzma-dev \
        libmagickwand-dev \
        libncursesw5-dev \
        libreadline-dev \
        libsm6 \
        libsqlite3-dev \
        libssl-dev \
        libxext6 \
        libxml2-dev \
        libxmlsec1-dev \
        llvm \
        make \
        rsync \
        tk-dev \
        wget \
        xz-utils \
        zlib1g-dev && \
    rm -rf /var/lib/apt/lists/* && \
    git lfs install

# Miniconda: fetch the installer, run it unattended (-b) into $CONDA_DIR, and
# delete the installer in the same layer. Conda is then configured with its
# package cache cleaned, auto-activation of the base environment disabled, and
# conda-forge added to the channel list.
RUN wget -q -O miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
    && /bin/bash miniconda.sh -b -p "${CONDA_DIR}" \
    && rm miniconda.sh \
    && export PATH="${CONDA_DIR}/bin:${PATH}" \
    && conda clean --all --yes \
    && conda config --set auto_activate_base false \
    && conda config --add channels conda-forge

# Put conda's bin directory on PATH for every later instruction and at runtime.
ENV PATH=${CONDA_DIR}/bin:${PATH}

# Accept the Conda Terms of Service for each of the default Anaconda channels.
# The loop stops (and the build fails) at the first channel that cannot be accepted.
RUN . "${CONDA_DIR}/etc/profile.d/conda.sh" && \
    for tos_channel in \
        https://repo.anaconda.com/pkgs/main \
        https://repo.anaconda.com/pkgs/r; \
    do \
        conda tos accept --override-channels --channel "${tos_channel}" || exit 1; \
    done

# Copy only the environment specification first, so the expensive
# environment-creation layer below is rebuilt only when cosmos-predict1.yaml
# changes rather than on every source edit.
# NOTE(review): assumes cosmos-predict1.yaml does not reference other files from
# the build context (e.g. a requirements file) — confirm; if it does, copy
# those files here as well.
COPY cosmos-predict1.yaml /app/cosmos-predict1.yaml

# Create the Conda environment named 'cosmos-predict1' from the YAML file and
# drop conda's download/package cache in the same layer to keep the layer small.
RUN conda env create -f cosmos-predict1.yaml && \
    conda clean --all --yes

# Copy the remaining local project files into the container's working directory (/app).
COPY . /app

# Mark 'cosmos-predict1' as the default Conda environment and place its bin
# directory at the front of PATH, for later build layers and at container runtime.
ENV CONDA_DEFAULT_ENV=cosmos-predict1 \
    PATH=${CONDA_DIR}/envs/cosmos-predict1/bin:${PATH}

# Install pinned torch / torchvision / torchaudio builds with pip inside the
# activated environment, resolving them from the PyTorch cu121 wheel index.
RUN . "${CONDA_DIR}/etc/profile.d/conda.sh" \
    && conda activate cosmos-predict1 \
    && pip install --no-cache-dir \
        --index-url https://download.pytorch.org/whl/cu121 \
        torch==2.3.1 \
        torchvision==0.18.1 \
        torchaudio==2.3.1

# Locate cudnn.h under the environment's site-packages/nvidia tree and expose it
# as $CONDA_PREFIX/include/cudnn.h (symlink, falling back to a copy) so that
# Transformer Engine can find it during compilation. The build is aborted when
# no header is found.
RUN . "${CONDA_DIR}/etc/profile.d/conda.sh" \
    && conda activate cosmos-predict1 \
    && cudnn_hdr=$(find "${CONDA_PREFIX}/lib/python3.10/site-packages/nvidia/" -name "cudnn.h" | head -n 1) \
    && if [ ! -f "${cudnn_hdr}" ]; then \
        echo "Error: cudnn.h not found in any expected location within Conda environment. This will likely cause compilation failures."; \
        exit 1; \
    else \
        echo "Found cudnn.h at: ${cudnn_hdr}"; \
        mkdir -p "${CONDA_PREFIX}/include" && \
        ln -sf "${cudnn_hdr}" "${CONDA_PREFIX}/include/cudnn.h" || \
        cp "${cudnn_hdr}" "${CONDA_PREFIX}/include/cudnn.h"; \
    fi

# IMPORTANT: Symlink fix for Transformer Engine compilation (from INSTALL.md).
# Links the NVIDIA headers shipped inside Python site-packages into the
# environment's include directories.
# Each link step is best-effort and only prints a warning on failure. The steps
# are grouped in braces so that a failure to source conda.sh or to activate the
# environment still fails the build; an unbraced `a && b || true` chain would
# silently hide such a failure and continue with an empty $CONDA_PREFIX.
RUN . "$CONDA_DIR/etc/profile.d/conda.sh" && \
    conda activate cosmos-predict1 && \
    { ln -sf "$CONDA_PREFIX"/lib/python3.10/site-packages/nvidia/*/include/* "$CONDA_PREFIX"/include/ || \
      echo "Warning: could not link NVIDIA headers into $CONDA_PREFIX/include" >&2; } && \
    { ln -sf "$CONDA_PREFIX"/lib/python3.10/site-packages/nvidia/*/include/* "$CONDA_PREFIX"/include/python3.10 || \
      echo "Warning: could not link NVIDIA headers into $CONDA_PREFIX/include/python3.10" >&2; }

# Build and install Transformer Engine (PyTorch extra) with pip, without build
# isolation. CUDA_HOME is set to the active environment's prefix for this one
# command so the build looks for CUDA inside the Conda environment.
RUN . "${CONDA_DIR}/etc/profile.d/conda.sh" \
    && conda activate cosmos-predict1 \
    && CUDA_HOME="${CONDA_PREFIX}" pip install \
        --no-cache-dir \
        --no-build-isolation \
        "transformer-engine[pytorch]==1.12.0"

# Install Apex for inference.
# The repository is cloned shallowly (--depth 1): only the current tip is built,
# so fetching the full history would just add download time and layer size.
# The C++ and CUDA extensions are enabled through pip's --config-settings, and
# CUDA_HOME points the build at the active Conda environment.
RUN . "$CONDA_DIR/etc/profile.d/conda.sh" && \
    conda activate cosmos-predict1 && \
    git clone --depth 1 https://github.com/NVIDIA/apex /app/apex && \
    CUDA_HOME=$CONDA_PREFIX pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation \
        --config-settings "--build-option=--cpp_ext" \
        --config-settings "--build-option=--cuda_ext" \
        /app/apex

# Install MoGe for inference, straight from its GitHub repository, into the
# activated 'cosmos-predict1' environment.
RUN . "${CONDA_DIR}/etc/profile.d/conda.sh" \
    && conda activate cosmos-predict1 \
    && pip install --no-cache-dir \
        git+https://github.com/microsoft/MoGe.git

# Give /app/start.sh its execute bit (exec form: chmod is run directly, without a shell).
RUN ["chmod", "+x", "/app/start.sh"]

# Verification steps.
# Any command below that exits non-zero aborts the build at that instruction,
# so no separate exit-status check is needed (a `$?` test in a later RUN would
# start in a fresh shell and always see 0).
RUN echo "Verifying Python and Conda installations..." && \
    python --version && \
    conda env list

# Import PyTorch inside the 'cosmos-predict1' environment and report its
# version and CUDA visibility. If this step fails, check the dependencies in
# cosmos-predict1.yaml and the pip-installed PyTorch packages.
# NOTE(review): GPUs are usually not exposed during `docker build`, so
# "CUDA Available: False" here is not necessarily an error — confirm at runtime.
RUN conda run -n cosmos-predict1 python <<EOF
import torch
print('Verifying PyTorch and CUDA availability...')
print('PyTorch Version: ' + torch.__version__)
print('CUDA Available: ' + str(torch.cuda.is_available()))
if torch.cuda.is_available():
    print('CUDA Device Name: ' + torch.cuda.get_device_name(0))
else:
    print('CUDA Device Name: N/A')
EOF

# Default command executed when the container starts: run /app/start.sh.
# Exec (JSON-array) form is used, so the script is launched directly rather than
# through a wrapping `/bin/sh -c`.
CMD ["/app/start.sh"]