Spaces:
Build error
Build error
Attempt to fix libcudnn.so.9 error by installing cudnn via conda and transformer_engine separately
Browse files
- Dockerfile +12 -18
- cosmos-predict1.yaml +2 -2
Dockerfile
CHANGED
@@ -1,28 +1,22 @@
|
|
1 |
# Start from a clean NVIDIA CUDA base image.
|
2 |
-
# Using 12.4.0-devel-ubuntu22.04 to align with the CUDA version specified in your cosmos-predict1.yaml.
|
3 |
FROM nvidia/cuda:12.4.0-devel-ubuntu22.04
|
4 |
|
5 |
# Set environment variables for non-interactive installations to prevent prompts during apt-get.
|
6 |
ENV DEBIAN_FRONTEND=noninteractive
|
7 |
-
# Define the base directory for Conda installation.
|
8 |
ENV CONDA_DIR=/opt/conda
|
9 |
-
# Add Conda's binary directory to the system's PATH.
|
10 |
ENV PATH=$CONDA_DIR/bin:$PATH
|
11 |
|
12 |
-
# Set the working directory inside the container. All subsequent commands will run from here.
|
13 |
WORKDIR /app
|
14 |
|
15 |
-
# Install essential system dependencies required for Miniconda and general build
|
16 |
-
# This includes wget for downloading, git for cloning (if needed), build-essential for compiling,
|
17 |
-
# and libgl1-mesa-glx for graphics-related libraries often used by ML frameworks.
|
18 |
-
# NEW: Added libglib2.0-0 for cv2 dependency
|
19 |
RUN apt-get update && apt-get install -y --no-install-recommends \
|
20 |
wget \
|
21 |
git \
|
22 |
build-essential \
|
23 |
libgl1-mesa-glx \
|
24 |
-
libglib2.0-0 \
|
25 |
-
#
|
|
|
26 |
&& rm -rf /var/lib/apt/lists/*
|
27 |
|
28 |
# Install Miniconda
|
@@ -34,23 +28,18 @@ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86
|
|
34 |
conda config --add channels conda-forge
|
35 |
|
36 |
# Accept Conda Terms of Service for default channels.
|
37 |
-
# We use '. ' (dot space) instead of 'source' as /bin/sh is the default shell.
|
38 |
RUN . $CONDA_DIR/etc/profile.d/conda.sh && \
|
39 |
conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main && \
|
40 |
conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r
|
41 |
|
42 |
# Copy all local project files into the container's working directory (/app).
|
43 |
-
# This includes your cosmos-predict1.yaml, gui/requirements.txt, start.sh, etc.
|
44 |
COPY . /app
|
45 |
|
46 |
# Create the Conda environment named 'cosmos-predict1' using the provided YAML file.
|
47 |
-
# This step will install all specified Python and pip dependencies (excluding PyTorch/TorchVision).
|
48 |
RUN conda env create -f cosmos-predict1.yaml
|
49 |
|
50 |
# Set the default Conda environment to be activated.
|
51 |
ENV CONDA_DEFAULT_ENV=cosmos-predict1
|
52 |
-
# Add the newly created Conda environment's binary directory to the PATH.
|
53 |
-
# This ensures that executables (like python, pip, uvicorn) from this environment are found.
|
54 |
ENV PATH=$CONDA_DIR/envs/cosmos-predict1/bin:$PATH
|
55 |
|
56 |
# Install PyTorch, TorchVision, and TorchAudio via pip from the CUDA 12.1 (cu121) wheel index.
|
@@ -62,12 +51,19 @@ RUN . $CONDA_DIR/etc/profile.d/conda.sh && \
|
|
62 |
torchaudio==2.3.1 \
|
63 |
--index-url https://download.pytorch.org/whl/cu121
|
64 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
65 |
# --- Verification Steps ---
|
66 |
RUN echo "Verifying Python and Conda installations..."
|
67 |
RUN python --version
|
68 |
RUN conda env list
|
69 |
RUN echo "Verifying PyTorch and CUDA availability..."
|
70 |
-
# Use a heredoc for multi-line Python code
|
71 |
RUN conda run -n cosmos-predict1 python <<EOF
|
72 |
import torch
|
73 |
print('PyTorch Version: ' + torch.__version__)
|
@@ -77,7 +73,6 @@ if torch.cuda.is_available():
|
|
77 |
else:
|
78 |
print('CUDA Device Name: N/A')
|
79 |
EOF
|
80 |
-
# Add the fallback echo if the python command fails
|
81 |
# NOTE(review): each RUN executes in a fresh shell, so $? is always 0 here and
# the fallback message below can never be printed. A non-zero exit from the
# preceding verification RUN already aborts the build at that step.
RUN [ $? -eq 0 ] || echo "PyTorch verification failed. Check dependencies in cosmos-predict1.yaml."
|
82 |
# --- End Verification Steps ---
|
83 |
|
@@ -85,5 +80,4 @@ RUN [ $? -eq 0 ] || echo "PyTorch verification failed. Check dependencies in cos
|
|
85 |
RUN chmod +x /app/start.sh
|
86 |
|
87 |
# Set the default command to run when the container starts.
|
88 |
-
# This will execute your start.sh script.
|
89 |
CMD ["/app/start.sh"]
|
|
|
1 |
# Start from a clean NVIDIA CUDA base image.
|
|
|
2 |
FROM nvidia/cuda:12.4.0-devel-ubuntu22.04
|
3 |
|
4 |
# Set environment variables for non-interactive installations to prevent prompts during apt-get.
|
5 |
ENV DEBIAN_FRONTEND=noninteractive
|
|
|
6 |
ENV CONDA_DIR=/opt/conda
|
|
|
7 |
ENV PATH=$CONDA_DIR/bin:$PATH
|
8 |
|
|
|
9 |
WORKDIR /app
|
10 |
|
11 |
+
# Install essential system dependencies required for Miniconda and general build tools.
|
|
|
|
|
|
|
12 |
RUN apt-get update && apt-get install -y --no-install-recommends \
|
13 |
wget \
|
14 |
git \
|
15 |
build-essential \
|
16 |
libgl1-mesa-glx \
|
17 |
+
libglib2.0-0 \
|
18 |
+
# Add any other system libraries that might be needed by transformer_engine or other deep learning libs
|
19 |
+
# For example, libstdc++6, libgomp1, etc., though usually covered by base image or build-essential
|
20 |
&& rm -rf /var/lib/apt/lists/*
|
21 |
|
22 |
# Install Miniconda
|
|
|
28 |
conda config --add channels conda-forge
|
29 |
|
30 |
# Accept Conda Terms of Service for default channels.
|
|
|
31 |
RUN . $CONDA_DIR/etc/profile.d/conda.sh && \
|
32 |
conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main && \
|
33 |
conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r
|
34 |
|
35 |
# Copy all local project files into the container's working directory (/app).
|
|
|
36 |
COPY . /app
|
37 |
|
38 |
# Create the Conda environment named 'cosmos-predict1' using the provided YAML file.
|
|
|
39 |
RUN conda env create -f cosmos-predict1.yaml
|
40 |
|
41 |
# Set the default Conda environment to be activated.
|
42 |
ENV CONDA_DEFAULT_ENV=cosmos-predict1
|
|
|
|
|
43 |
ENV PATH=$CONDA_DIR/envs/cosmos-predict1/bin:$PATH
|
44 |
|
45 |
# Install PyTorch, TorchVision, and TorchAudio via pip from the CUDA 12.1 (cu121) wheel index.
|
|
|
51 |
torchaudio==2.3.1 \
|
52 |
--index-url https://download.pytorch.org/whl/cu121
|
53 |
|
54 |
+
# --- NEW: Install Transformer Engine separately after PyTorch and cuDNN are in place ---
|
55 |
+
# Installing it last is intended to let it find the necessary CUDA/cuDNN libraries.
|
56 |
+
# git (if this install needs it) is already provided by the apt-get install step above.
|
57 |
+
RUN . $CONDA_DIR/etc/profile.d/conda.sh && \
|
58 |
+
conda activate cosmos-predict1 && \
|
59 |
+
pip install --no-cache-dir transformer-engine[pytorch]==1.12.0
|
60 |
+
# --- END NEW ---
|
61 |
+
|
62 |
# --- Verification Steps ---
|
63 |
RUN echo "Verifying Python and Conda installations..."
|
64 |
RUN python --version
|
65 |
RUN conda env list
|
66 |
RUN echo "Verifying PyTorch and CUDA availability..."
|
|
|
67 |
RUN conda run -n cosmos-predict1 python <<EOF
|
68 |
import torch
|
69 |
print('PyTorch Version: ' + torch.__version__)
|
|
|
73 |
else:
|
74 |
print('CUDA Device Name: N/A')
|
75 |
EOF
|
|
|
76 |
# NOTE(review): each RUN executes in a fresh shell, so $? is always 0 here and
# the fallback message below can never be printed. A non-zero exit from the
# preceding verification RUN already aborts the build at that step.
RUN [ $? -eq 0 ] || echo "PyTorch verification failed. Check dependencies in cosmos-predict1.yaml."
|
77 |
# --- End Verification Steps ---
|
78 |
|
|
|
80 |
RUN chmod +x /app/start.sh
|
81 |
|
82 |
# Set the default command to run when the container starts.
|
|
|
83 |
CMD ["/app/start.sh"]
|
cosmos-predict1.yaml
CHANGED
@@ -15,7 +15,7 @@ dependencies:
|
|
15 |
# - torchvision=0.18.1
|
16 |
# - torchaudio=2.3.1
|
17 |
# - pytorch-cuda=12.1
|
18 |
-
|
19 |
# - libcublas
|
20 |
|
21 |
- pip:
|
@@ -31,5 +31,5 @@ dependencies:
|
|
31 |
- megatron.core
|
32 |
- attrs
|
33 |
- iopath
|
34 |
-
|
35 |
# PyTorch and TorchVision will be installed separately in the Dockerfile
|
|
|
15 |
# - torchvision=0.18.1
|
16 |
# - torchaudio=2.3.1
|
17 |
# - pytorch-cuda=12.1
|
18 |
+
- cudnn
|
19 |
# - libcublas
|
20 |
|
21 |
- pip:
|
|
|
31 |
- megatron.core
|
32 |
- attrs
|
33 |
- iopath
|
34 |
+
#- transformer_engine
|
35 |
# PyTorch, TorchVision, TorchAudio, and transformer_engine are installed separately in the Dockerfile
|