elungky committed on
Commit
cbe7167
·
1 Parent(s): 9ec2085

Attempt to fix torchvision::nms error by installing PyTorch via pip with official CUDA index

Browse files
Files changed (2) hide show
  1. Dockerfile +15 -4
  2. cosmos-predict1.yaml +10 -10
Dockerfile CHANGED
@@ -1,5 +1,4 @@
1
  # Start from a clean NVIDIA CUDA base image.
2
- # This provides the necessary CUDA runtime and development tools.
3
  # Using 12.4.0-devel-ubuntu22.04 to align with the CUDA version specified in your cosmos-predict1.yaml.
4
  FROM nvidia/cuda:12.4.0-devel-ubuntu22.04
5
 
@@ -19,7 +18,7 @@ WORKDIR /app
19
  RUN apt-get update && apt-get install -y --no-install-recommends \
20
  wget \
21
  git \
22
- build-essential \
23
  libgl1-mesa-glx \
24
  # Clean up apt cache to reduce image size
25
  && rm -rf /var/lib/apt/lists/*
@@ -43,7 +42,7 @@ RUN . $CONDA_DIR/etc/profile.d/conda.sh && \
43
  COPY . /app
44
 
45
  # Create the Conda environment named 'cosmos-predict1' using the provided YAML file.
46
- # This step will install all specified Python, PyTorch, CUDA, and pip dependencies.
47
  RUN conda env create -f cosmos-predict1.yaml
48
 
49
  # Set the default Conda environment to be activated.
@@ -52,12 +51,24 @@ ENV CONDA_DEFAULT_ENV=cosmos-predict1
52
  # This ensures that executables (like python, pip, uvicorn) from this environment are found.
53
  ENV PATH=$CONDA_DIR/envs/cosmos-predict1/bin:$PATH
54
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  # --- Verification Steps ---
56
  RUN echo "Verifying Python and Conda installations..."
57
  RUN python --version
58
  RUN conda env list
59
  RUN echo "Verifying PyTorch and CUDA availability..."
60
- # REVISED AGAIN: Use a heredoc for multi-line Python code
61
  RUN conda run -n cosmos-predict1 python <<EOF
62
  import torch
63
  print('PyTorch Version: ' + torch.__version__)
 
1
  # Start from a clean NVIDIA CUDA base image.
 
2
  # Using 12.4.0-devel-ubuntu22.04 to align with the CUDA version specified in your cosmos-predict1.yaml.
3
  FROM nvidia/cuda:12.4.0-devel-ubuntu22.04
4
 
 
18
  RUN apt-get update && apt-get install -y --no-install-recommends \
19
  wget \
20
  git \
21
+ build-essential \
22
  libgl1-mesa-glx \
23
  # Clean up apt cache to reduce image size
24
  && rm -rf /var/lib/apt/lists/*
 
42
  COPY . /app
43
 
44
  # Create the Conda environment named 'cosmos-predict1' using the provided YAML file.
45
+ # This step will install all specified Python and pip dependencies (excluding PyTorch/TorchVision).
46
  RUN conda env create -f cosmos-predict1.yaml
47
 
48
  # Set the default Conda environment to be activated.
 
51
  # This ensures that executables (like python, pip, uvicorn) from this environment are found.
52
  ENV PATH=$CONDA_DIR/envs/cosmos-predict1/bin:$PATH
53
 
54
+ # --- NEW: Install PyTorch and TorchVision via pip with specific CUDA index ---
55
+ # This is crucial for ensuring the correct CUDA-enabled builds are installed.
56
+ # We use the CUDA 12.1 (cu121) wheels; they run on any NVIDIA driver that supports CUDA 12.1 or newer, including 12.4.
57
+ RUN . $CONDA_DIR/etc/profile.d/conda.sh && \
58
+ conda activate cosmos-predict1 && \
59
+ pip install --no-cache-dir \
60
+ torch==2.3.1 \
61
+ torchvision==0.18.1 \
62
+ torchaudio==2.3.1 \
63
+ --index-url https://download.pytorch.org/whl/cu121
64
+ # --- END NEW ---
65
+
66
  # --- Verification Steps ---
67
  RUN echo "Verifying Python and Conda installations..."
68
  RUN python --version
69
  RUN conda env list
70
  RUN echo "Verifying PyTorch and CUDA availability..."
71
+ # Use a heredoc for multi-line Python code
72
  RUN conda run -n cosmos-predict1 python <<EOF
73
  import torch
74
  print('PyTorch Version: ' + torch.__version__)
cosmos-predict1.yaml CHANGED
@@ -1,7 +1,6 @@
1
  name: cosmos-predict1
2
  channels:
3
- - pytorch
4
- - nvidia
5
  - conda-forge
6
  - defaults
7
  dependencies:
@@ -11,13 +10,13 @@ dependencies:
11
  - ninja
12
  - gcc=12.4.0
13
  - gxx=12.4.0
14
-
15
- - pytorch=2.3.1
16
- - torchvision=0.18.1
17
- - torchaudio=2.3.1
18
- - pytorch-cuda=12.4
19
- - cudnn
20
- - libcublas
21
 
22
  - pip:
23
  - -r file:///app/gui/requirements.txt
@@ -29,4 +28,5 @@ dependencies:
29
  - diffusers
30
  - megatron
31
  - IPython
32
- - megatron.core
 
 
1
  name: cosmos-predict1
2
  channels:
3
+ # Keep these channels for other dependencies, but PyTorch will be installed via pip
 
4
  - conda-forge
5
  - defaults
6
  dependencies:
 
10
  - ninja
11
  - gcc=12.4.0
12
  - gxx=12.4.0
13
+ # All PyTorch/TorchVision/CUDA-related entries are commented out of the Conda dependencies
14
+ # - pytorch=2.3.1
15
+ # - torchvision=0.18.1
16
+ # - torchaudio=2.3.1
17
+ # - pytorch-cuda=12.1
18
+ # - cudnn
19
+ # - libcublas
20
 
21
  - pip:
22
  - -r file:///app/gui/requirements.txt
 
28
  - diffusers
29
  - megatron
30
  - IPython
31
+ - megatron.core
32
+ # PyTorch and TorchVision will be installed separately in the Dockerfile