Spaces:

roll-ai
/

RealCam-I2V

Runtime error

App Files Files Community

roll-ai commited on 4 days ago

Commit

14964a5

verified ·

1 Parent(s): ee142ac

Upload 3 files

Browse files

Files changed (3) hide show

Dockerfile +60 -0
index.html +529 -0
requirements.txt +33 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,60 @@

+# ==== GPU base image: CUDA 12.1.1 runtime with cuDNN 8 on Ubuntu 22.04 ====
+FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04
+# Build-time only: keeps apt non-interactive during the RUN steps without
+# leaking DEBIAN_FRONTEND into the environment of the running container.
+ARG DEBIAN_FRONTEND=noninteractive
+# Runtime environment:
+#   PYTHONDONTWRITEBYTECODE / PYTHONUNBUFFERED - no .pyc files, unbuffered output
+#   PIP_NO_CACHE_DIR                           - pip keeps no download cache in the image
+#   HF_HOME / TRANSFORMERS_CACHE               - Hugging Face cache paths under /root
+#   GRADIO_SERVER_NAME / GRADIO_SERVER_PORT    - bind address and port (8080, see EXPOSE)
+#   DISPLAY                                    - X display number for headless GL
+# NOTE(review): recent transformers releases deprecate TRANSFORMERS_CACHE in
+# favour of HF_HOME - confirm this variable is still needed.
+ENV PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1 \
+    PIP_NO_CACHE_DIR=1 \
+    HF_HOME=/root/.cache/huggingface \
+    TRANSFORMERS_CACHE=/root/.cache/huggingface/transformers \
+    GRADIO_SERVER_NAME=0.0.0.0 \
+    GRADIO_SERVER_PORT=8080 \
+    DISPLAY=:99
+# ---- OS packages: Python toolchain, git + LFS, ffmpeg, headless GL/X libraries ----
+# Index refresh, install and list cleanup share one layer so the apt lists
+# never persist in the image.
+RUN apt-get update \
+ && apt-get install -y --no-install-recommends \
+      ffmpeg \
+      git \
+      git-lfs \
+      libgl1 \
+      libgl1-mesa-dri \
+      libglib2.0-0 \
+      python3 \
+      python3-pip \
+      python3-venv \
+      xvfb \
+ && git lfs install \
+ && rm -rf /var/lib/apt/lists/*
+# ---- Working directory; copy only the dependency manifest at this point so the
+# ---- pip layer stays cached until requirements.txt itself changes ----
+WORKDIR /app
+COPY requirements.txt ./
+# ---- Python deps (torch stack first, then the rest) ----
+# The torch pre-install is pinned to the same version requirements.txt asks for
+# (torch==2.6). Unpinned, pip would pick the newest torch/torchvision/torchaudio
+# here; the requirements install would then swap torch for 2.6 but leave
+# torchaudio (not listed in requirements.txt) built for a different torch.
+# NOTE(review): confirm the cu121 index publishes wheels for this torch version;
+# if it does not, pip falls back to PyPI and the index URL should be updated.
+RUN python3 -m pip install --upgrade pip \
+ && python3 -m pip install --extra-index-url https://download.pytorch.org/whl/cu121 \
+      torch==2.6 torchvision torchaudio \
+ && python3 -m pip install -r requirements.txt \
+ && python3 -m pip install huggingface_hub imageio-ffmpeg
+# ---- Application source ----
+# The build context is expected to contain gradio_app.py (updated), image_to_video.py, demo/, etc.
+# NOTE(review): CMD launches finetune/gradio_app.py - confirm that path exists in the build context.
+# Destination "." resolves to the current WORKDIR (/app).
+COPY . .
+# ---- Default settings (presumably read by the app; override at `docker run -e`) ----
+# Relative paths resolve against the WORKDIR, /app.
+ENV CAPTION_MODEL_PATH=pretrained/Qwen2.5-VL-7B-Instruct \
+    DEPTH_MODEL_PATH=pretrained/Metric3D/metric_depth_vit_large_800k.pth \
+    CAMERA_POSE_META_PATH=demo/camera_poses.json \
+    EXAMPLE_META_PATH=demo/examples.json \
+    MODEL_META_PATH=demo/models.json \
+    RESULT_DIR=/app/results \
+    SAVE_FPS=16 \
+    DEVICE=cuda
+# ---- Port served by the app (same value as GRADIO_SERVER_PORT) ----
+# NOTE(review): Hugging Face Docker Spaces default to port 7860 unless the Space
+# config sets app_port - confirm it is set to 8080.
+EXPOSE 8080
+# Optional (set at runtime to auto-download via gradio_app.py):
+# ENV REPO_COGVIDEOX=THUDM/CogVideoX1.5-5B-I2V
+# ENV REPO_METRIC3D=JUGGHM/Metric3D
+# ENV REPO_QWEN_VL=Qwen/Qwen2.5-VL-7B-Instruct
+# ENV REPO_REALCAM=MuteApo/RealCam-I2V
+# ENV HF_TOKEN=   # pass at runtime; don't bake secrets
+# ---- Launch under Xvfb so GL code has an X display in the headless container ----
+# -a lets xvfb-run pick a free display number instead of failing when its
+# default one is taken (e.g. a stale lock file after a container restart).
+# exec makes xvfb-run replace the wrapper shell so stop signals reach it directly.
+CMD ["bash", "-lc", "exec xvfb-run -a -s '-screen 0 1280x720x24' python3 finetune/gradio_app.py"]

index.html ADDED Viewed

	@@ -0,0 +1,529 @@

+<!DOCTYPE html>
+<html>
+<head>
+    <meta charset="utf-8">
+    <!-- Meta tags for social media banners; these should be filled in appropriately, as they are your "business card" -->
+    <!-- Replace the content tag with appropriate information -->
+    <meta name="description"
+        content="RealCam-I2V: Real-World Image-to-Video Generation with Interactive Complex Camera Control">
+    <meta property="og:title"
+        content="RealCam-I2V: Real-World Image-to-Video Generation with Interactive Complex Camera Control" />
+    <meta property="og:description"
+        content="RealCam-I2V: Real-World Image-to-Video Generation with Interactive Complex Camera Control" />
+    <!-- Keywords for your paper to be indexed by-->
+    <meta name="keywords" content="RealCam-I2V, Complex Camera Control, Image-to-Video Generation">
+    <meta name="viewport" content="width=device-width, initial-scale=1">
+    <title>RealCam-I2V: Real-World Image-to-Video Generation with Interactive Complex Camera Control</title>
+    <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro" rel="stylesheet">
+    <link href="https://fonts.googleapis.com/css2?family=Playfair+Display:ital,wght@1,400&display=swap"
+        rel="stylesheet">
+    <link rel="stylesheet" href="static/css/bulma.min.css">
+    <link rel="stylesheet" href="static/css/bulma-carousel.min.css">
+    <link rel="stylesheet" href="static/css/bulma-slider.min.css">
+    <link rel="stylesheet" href="static/css/fontawesome.all.min.css">
+    <link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
+    <link rel="stylesheet" href="static/css/index.css">
+    <link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/dreampulse/computer-modern-web-font@master/fonts.css">
+    <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
+    <script src="https://documentcloud.adobe.com/view-sdk/main.js"></script>
+    <script defer src="static/js/fontawesome.all.min.js"></script>
+    <script src="static/js/bulma-carousel.min.js"></script>
+    <script src="static/js/bulma-slider.min.js"></script>
+    <script src="static/js/index.js"></script>
+    <script type="text/javascript" async
+        src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-AMS-MML_SVG"></script>
+    <script type="text/x-mathjax-config">
+        MathJax.Hub.Config({
+            tex2jax: {
+                inlineMath: [['$','$'], ['\\(','\\)']]
+            }
+        });
+    </script>
+    <style>
+        .video-container {
+            display: flex;
+            justify-content: center;
+            gap: 0px;
+        }
+        .italic {
+            font-family: 'Playfair Display';
+            font-style: italic;
+        }
+    </style>
+</head>
+<body>
+    <!-- title and author -->
+    <section class="hero">
+        <div class="hero-body">
+            <div class="container is-max-desktop">
+                <div class="columns is-centered">
+                    <div class="column has-text-centered">
+                        <h1 class="title is-2 publication-title">
+                            RealCam-I2V: Real-World Image-to-Video Generation with Interactive Complex Camera Control
+                        </h1>
+                        <div class="is-size-5 publication-authors">
+                            <span class="author-block">Teng Li<sup>1,2*</sup>,</span>
+                            <span class="author-block">Guangcong Zheng<sup>1,2*</sup>,</span>
+                            <span class="author-block">Rui Jiang<sup>1,2</sup>,</span>
+                            <span class="author-block">Shuigen Zhan<sup>1</sup>,</span>
+                            <span class="author-block">Tao Wu<sup>1</sup>,</span>
+                            <span class="author-block">Yehao Lu<sup>1</sup>,</span>
+                            <span class="author-block">Yining Lin<sup>3</sup>,</span>
+                            <br>
+                            <span class="author-block">Chuanyun Deng<sup>2</sup>,</span>
+                            <span class="author-block">Yepan Xiong<sup>2</sup>,</span>
+                            <span class="author-block">Min Chen<sup>2</sup>,</span>
+                            <span class="author-block">Lin Cheng<sup>2</sup>,</span>
+                            <span class="author-block">Xi Li<sup>1&#9993;</sup></span>
+                        </div>
+                        <div class="is-size-5 publication-authors">
+                            <span class="author-block"><sup>1</sup>Zhejiang University,</span>
+                            <span class="author-block"><sup>2</sup>Huawei,</span>
+                            <span class="author-block"><sup>3</sup>Supremind</span>
+                            <br>
+                            <span class="author-block">ICCV 2025</span>
+                        </div>
+                        <div class="column has-text-centered">
+                            <div class="publication-links">
+                                <span class="link-block">
+                                    <a href="https://arxiv.org/pdf/2502.10059.pdf" target="_blank"
+                                        class="external-link button is-normal is-rounded is-dark">
+                                        <span class="icon">
+                                            <i class="fas fa-file-pdf"></i>
+                                        </span>
+                                        <span>Paper</span>
+                                    </a>
+                                </span>
+                                <span class="link-block">
+                                    <a href="https://arxiv.org/abs/2502.10059" target="_blank"
+                                        class="external-link button is-normal is-rounded is-dark">
+                                        <span class="icon">
+                                            <i class="ai ai-arxiv"></i>
+                                        </span>
+                                        <span>arXiv</span>
+                                    </a>
+                                </span>
+                                <span class="link-block">
+                                    <a href="https://github.com/ZGCTroy/RealCam-I2V" target="_blank"
+                                        class="external-link button is-normal is-rounded is-dark">
+                                        <span class="icon">
+                                            <i class="fab fa-github"></i>
+                                        </span>
+                                        <span>Code</span>
+                                    </a>
+                                </span>
+                                <span class="link-block">
+                                    <a href="https://github.com/ZGCTroy/CamI2V" target="_blank"
+                                        class="external-link button is-normal is-rounded is-dark">
+                                        <span class="icon">
+                                            <i class="fab fa-github"></i>
+                                        </span>
+                                        <span>CamI2V</span>
+                                    </a>
+                                </span>
+                            </div>
+                        </div>
+                    </div>
+                </div>
+            </div>
+        </div>
+    </section>
+    <!-- abstract -->
+    <section class="section hero is-light">
+        <div class="container is-max-desktop">
+            <div class="columns is-centered has-text-centered">
+                <div class="column is-four-fifths">
+                    <h2 class="title is-3">Abstract</h2>
+                    <div class="content has-text-justified">
+                        <p>
+                            Recent advancements in camera-trajectory-guided image-to-video generation offer higher
+                            precision and better support for complex camera control compared to text-based approaches.
+                            However, they also introduce significant usability challenges, as users often struggle to
+                            provide precise camera parameters when working with arbitrary real-world images without
+                            knowledge of their depth or scene scale.
+                            To address these real-world application issues, we propose RealCam-I2V, a novel
+                            diffusion-based video generation framework that integrates monocular metric depth estimation
+                            to establish 3D scene reconstruction in a preprocessing step.
+                            During training, the reconstructed 3D scene enables scaling camera parameters from relative
+                            to metric scales, ensuring compatibility and scale consistency across diverse real-world
+                            images.
+                            In inference, RealCam-I2V offers an intuitive interface where users can precisely draw
+                            camera trajectories by dragging within the 3D scene.
+                            To further enhance precise camera control and scene consistency, we propose
+                            scene-constrained noise shaping, which shapes high-level noise and also allows the framework
+                            to maintain dynamic and coherent video generation in lower noise stages.
+                            RealCam-I2V achieves significant improvements in controllability and video quality on the
+                            RealEstate10K and out-of-domain images. We further enable applications like
+                            camera-controlled looping video generation and generative frame interpolation.
+                        </p>
+                    </div>
+                </div>
+            </div>
+        </div>
+    </section>
+    <section class="section hero">
+        <div class="container has-text-centered">
+            <h2 class="title is-3">Demo</h2>
+            <div class="video-container">
+                <div>
+                    <video autoplay controls muted loop width="80%">
+                        <source src="static/videos/demo/4d_demo.mp4" type="video/mp4" />
+                    </video>
+                </div>
+            </div>
+            <h2 class="subtitle has-text-centered italic">
+                4D Visualization
+            </h2>
+            <br>
+            <div class="video-container">
+                <div>
+                    <video autoplay controls muted loop>
+                        <source src="static/videos/cogvideo1.5/73c3266a-d3e1-41c9-9691-729478a8bf77.mp4"
+                            type="video/mp4" />
+                    </video>
+                </div>
+                <div>
+                    <video autoplay controls muted loop>
+                        <source src="static/videos/cogvideo1.5/79131dea-ca85-49df-b68b-cdb208f164c7.mp4"
+                            type="video/mp4" />
+                    </video>
+                </div>
+                <div>
+                    <video autoplay controls muted loop>
+                        <source src="static/videos/cogvideo1.5/b17050b5-3ed8-44ae-94a4-ec939c57b41f.mp4"
+                            type="video/mp4" />
+                    </video>
+                </div>
+                <div>
+                    <video autoplay controls muted loop>
+                        <source src="static/videos/cogvideo1.5/8ab67ba3-8300-4b82-98b7-8e28403cf6f7.mp4"
+                            type="video/mp4" />
+                    </video>
+                </div>
+            </div>
+            <h2 class="subtitle has-text-centered italic">
+                Aerial View
+            </h2>
+            <br>
+            <div class="video-container">
+                <div>
+                    <video autoplay controls muted loop>
+                        <source src="static/videos/cogvideo1.5/3f962cd6-fbf4-4b8a-b107-1468931c80f4.mp4"
+                            type="video/mp4" />
+                    </video>
+                </div>
+                <div>
+                    <video autoplay controls muted loop>
+                        <source src="static/videos/cogvideo1.5/d4db16a8-3f82-43b3-8432-cc8df007f10c.mp4"
+                            type="video/mp4" />
+                    </video>
+                </div>
+                <div>
+                    <video autoplay controls muted loop>
+                        <source src="static/videos/cogvideo1.5/34614b89-431d-4e31-8d82-89a0f082aaed.mp4"
+                            type="video/mp4" />
+                    </video>
+                </div>
+                <div>
+                    <video autoplay controls muted loop>
+                        <source src="static/videos/cogvideo1.5/4f0b8b62-a278-4b0e-8457-b6e8b099de59.mp4"
+                            type="video/mp4" />
+                    </video>
+                </div>
+            </div>
+            <h2 class="subtitle has-text-centered italic">
+                Urban Exploration
+            </h2>
+            <br>
+            <div class="video-container">
+                <div>
+                    <video autoplay controls muted loop>
+                        <source src="static/videos/cogvideo1.5/f8180809-8e91-4ef8-b19b-9d42e99f5e00.mp4"
+                            type="video/mp4" />
+                    </video>
+                </div>
+                <div>
+                    <video autoplay controls muted loop>
+                        <source src="static/videos/cogvideo1.5/6c23cfd0-9618-4edd-9003-28b6b92c4196.mp4"
+                            type="video/mp4" />
+                    </video>
+                </div>
+                <div>
+                    <video autoplay controls muted loop>
+                        <source src="static/videos/cogvideo1.5/37c7abfa-c442-4df5-ace5-d2a2fa1c23aa.mp4"
+                            type="video/mp4" />
+                    </video>
+                </div>
+                <div>
+                    <video autoplay controls muted loop>
+                        <source src="static/videos/cogvideo1.5/e304abe7-3e5a-4929-9c0d-0dd8fec78b48.mp4"
+                            type="video/mp4" />
+                    </video>
+                </div>
+            </div>
+            <h2 class="subtitle has-text-centered italic">
+                FPV & Sports
+            </h2>
+            <br>
+            <div class="video-container">
+                <div>
+                    <video autoplay controls muted loop>
+                        <source src="static/videos/dynamic/cogvideox_controlnetxs_c52592a0.mp4" type="video/mp4" />
+                    </video>
+                </div>
+                <div>
+                    <video autoplay controls muted loop>
+                        <source src="static/videos/dynamic/cogvideox_controlnetxs_19c3e433.mp4" type="video/mp4" />
+                    </video>
+                </div>
+                <div>
+                    <video autoplay controls muted loop>
+                        <source src="static/videos/dynamic/cogvideox_controlnetxs_43d1ce7d.mp4" type="video/mp4" />
+                    </video>
+                </div>
+                <div>
+                    <video autoplay controls muted loop>
+                        <source src="static/videos/dynamic/cogvideox_controlnetxs_183e7ba2.mp4" type="video/mp4" />
+                    </video>
+                </div>
+            </div>
+            <h2 class="subtitle has-text-centered italic">
+                Complex Trajectories & Scene Dynamics
+            </h2>
+            <br>
+            <div class="video-container">
+                <div>
+                    <video autoplay controls muted loop width="60%">
+                        <source src="static/videos/demo/cogvideox.mp4" type="video/mp4" />
+                    </video>
+                </div>
+            </div>
+            <h2 class="subtitle has-text-centered italic">
+                Various Domains
+            </h2>
+            <br>
+            <div class="video-container">
+                <div>
+                    <video autoplay controls muted loop>
+                        <source src="static/videos/various_types/cartoon.mp4" type="video/mp4" />
+                    </video>
+                    <h2 class="subtitle has-text-centered italic">
+                        Cartoon
+                    </h2>
+                </div>
+                <div>
+                    <video autoplay controls muted loop>
+                        <source src="static/videos/various_types/food.mp4" type="video/mp4" />
+                    </video>
+                    <h2 class="subtitle has-text-centered italic">
+                        Food
+                    </h2>
+                </div>
+            </div>
+            <br>
+            <div class="video-container">
+                <div>
+                    <video autoplay controls muted loop>
+                        <source src="static/videos/various_types/human.mp4" type="video/mp4" />
+                    </video>
+                    <h2 class="subtitle has-text-centered italic">
+                        Human
+                    </h2>
+                </div>
+                <div>
+                    <video autoplay controls muted loop>
+                        <source src="static/videos/various_types/pets.mp4" type="video/mp4" />
+                    </video>
+                    <h2 class="subtitle has-text-centered italic">
+                        Pets
+                    </h2>
+                </div>
+            </div>
+            <br>
+            <div class="video-container">
+                <div>
+                    <video autoplay controls muted loop>
+                        <source src="static/videos/demo/product_demo.mp4" type="video/mp4" />
+                    </video>
+                    <h2 class="subtitle has-text-centered italic">
+                        Product Demo
+                    </h2>
+                </div>
+                <div>
+                    <video autoplay controls muted loop>
+                        <source src="static/videos/demo/chinese_landscape.mp4" type="video/mp4" />
+                    </video>
+                    <h2 class="subtitle has-text-centered italic">
+                        Chinese Antique
+                    </h2>
+                </div>
+            </div>
+        </div>
+    </section>
+    <!-- Method -->
+    <section class="section hero">
+        <div class="container has-text-centered">
+            <h2 class="title is-3">Method</h2>
+            <!-- step 1 -->
+            <div class="container has-text-centered">
+                <h2 class="title has-text-centered is-4 italic">
+                    Step 1 (Training & Inference): Construct 3D point cloud by monocular metric depth estimation.
+                </h2>
+                <div class="video-container" style="gap: 5px;">
+                    <img src="static/images/scene1.jpg" width="25%" />
+                    <img src="static/images/scene2.jpg" width="25%" />
+                    <img src="static/images/scene3.jpg" width="25%" />
+                </div>
+            </div>
+            <br>
+            <!-- step 2 -->
+            <div class="container has-text-centered">
+                <h2 class="title has-text-centered is-4 italic">
+                    Step 2 (Training): Align from relative-scale to metric-scale.
+                </h2>
+                <img src="static/images/align.jpg" width="80%" />
+            </div>
+            <br>
+            <!-- step 3 -->
+            <div class="container has-text-centered">
+                <h2 class="title has-text-centered is-4 italic">
+                    Step 3 (Inference): Render preview video with camera trajectory on the reconstructed 3D scene.
+                </h2>
+                <div class="video-container">
+                    <video autoplay controls muted loop>
+                        <source src="static/videos/preview_video/preview1.mp4" type="video/mp4" />
+                    </video>
+                    <video autoplay controls muted loop>
+                        <source src="static/videos/preview_video/preview2.mp4" type="video/mp4" />
+                    </video>
+                    <video autoplay controls muted loop>
+                        <source src="static/videos/preview_video/preview3.mp4" type="video/mp4" />
+                    </video>
+                </div>
+            </div>
+            <br>
+            <!-- step 4 -->
+            <div class="container has-text-centered">
+                <h2 class="title has-text-centered is-4 italic">
+                    Step 4 (Inference): Scene-constrained noise shaping.
+                </h2>
+                <div class="content has-text-justified">
+                    We paste the visible latents of the preview video into the predicted latents during the generation
+                    process. However, we paste only at high noise levels and allow for dynamics at lower noise
+                    levels; hence the name "noise shaping": it shapes the noise only at the initial high-noise
+                    stage.
+                </div>
+                <div class="video-container">
+                    <div>
+                        <video autoplay controls muted loop>
+                            <source src="static/videos/ablation/ablation1_preview.mp4" type="video/mp4" />
+                        </video>
+                        <h2 class="subtitle has-text-centered is-6 italic">
+                            Preview Video
+                        </h2>
+                    </div>
+                    <div>
+                        <video autoplay controls muted loop>
+                            <source src="static/videos/ablation/ablation1_withNoiseShaping.mp4" type="video/mp4" />
+                        </video>
+                        <h2 class="subtitle has-text-centered is-6 italic">
+                            w. Scene-Constrained Noise Shaping
+                        </h2>
+                    </div>
+                    <div>
+                        <video autoplay controls muted loop>
+                            <source src="static/videos/ablation/ablation1_withoutNoiseShaping.mp4" type="video/mp4" />
+                        </video>
+                        <h2 class="subtitle has-text-centered is-6 italic">
+                            w.o. Scene-Constrained Noise Shaping
+                        </h2>
+                    </div>
+                </div>
+            </div>
+        </div>
+    </section>
+    <section class="section" id="BibTeX">
+        <div class="container is-max-desktop content">
+            <h2 class="title">BibTeX</h2>
+            <pre><code>
+@article{li2025realcam,
+    title={RealCam-I2V: Real-World Image-to-Video Generation with Interactive Complex Camera Control},
+    author={Li, Teng and Zheng, Guangcong and Jiang, Rui and Zhan, Shuigen and Wu, Tao and Lu, Yehao and Lin, Yining and Li, Xi},
+    journal={arXiv preprint arXiv:2502.10059},
+    year={2025},
+}
+            </code></pre>
+        </div>
+    </section>
+</body>
+</html>

requirements.txt ADDED Viewed

	@@ -0,0 +1,33 @@

+# Python dependencies installed by the Dockerfile via `pip install -r requirements.txt`.
+# torch is pinned here; keep the Dockerfile's torch pre-install in sync with it.
+diffusers
+accelerate<1.8
+transformers
+numpy<2
+torch==2.6
+torchvision
+torchcodec==0.0.2
+sentencepiece
+gradio
+imageio
+imageio-ffmpeg
+moviepy
+wandb
+deepspeed<0.17
+peft
+decord
+opencv-python
+tensorboard
+open_clip_torch==2.22.0
+einops
+fairscale
+timm
+pillow
+xfuser
+# NOTE(review): flash-attn usually needs torch importable at build time and a
+# CUDA toolchain when no prebuilt wheel matches - confirm it installs on the
+# runtime-only base image.
+flash-attn<2.8
+xformers
+lightning
+mmcv
+html4vision
+plyfile
+open3d
+pyvirtualdisplay
+# Fixed misspelled distribution name (was "qwen_vl_utls"), which pip cannot resolve.
+qwen_vl_utils[decord]