roll-ai committed on
Commit
14964a5
·
verified ·
1 Parent(s): ee142ac

Upload 3 files

Browse files
Files changed (3) hide show
  1. Dockerfile +60 -0
  2. index.html +529 -0
  3. requirements.txt +33 -0
Dockerfile ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ==== GPU base (CUDA 12.1 + cuDNN) ====
2
+ FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04
3
+
4
+ ENV DEBIAN_FRONTEND=noninteractive \
5
+ PYTHONDONTWRITEBYTECODE=1 \
6
+ PYTHONUNBUFFERED=1 \
7
+ PIP_NO_CACHE_DIR=1 \
8
+ HF_HOME=/root/.cache/huggingface \
9
+ TRANSFORMERS_CACHE=/root/.cache/huggingface/transformers \
10
+ GRADIO_SERVER_NAME=0.0.0.0 \
11
+ GRADIO_SERVER_PORT=8080 \
12
+ DISPLAY=:99
13
+
14
+ # ---- System deps (ffmpeg + headless GL/X) ----
15
+ RUN apt-get update && apt-get install -y --no-install-recommends \
16
+ python3 python3-pip python3-venv \
17
+ git git-lfs \
18
+ ffmpeg \
19
+ libgl1 \
20
+ libgl1-mesa-dri \
21
+ libglib2.0-0 \
22
+ xvfb \
23
+ && git lfs install \
24
+ && rm -rf /var/lib/apt/lists/*
25
+
26
+ # ---- Workdir & copy requirements first (cache-friendly) ----
27
+ WORKDIR /app
28
+ COPY requirements.txt /app/requirements.txt
29
+
30
+ # ---- Python deps (Torch CUDA 12.1 first) ----
31
+ RUN python3 -m pip install --upgrade pip \
32
+ && python3 -m pip install --extra-index-url https://download.pytorch.org/whl/cu121 \
33
+ torch torchvision torchaudio \
34
+ && python3 -m pip install -r requirements.txt \
35
+ && python3 -m pip install huggingface_hub imageio-ffmpeg
36
+
37
+ # ---- App code ----
38
+ # Expect: gradio_app.py (updated), image_to_video.py, demo/, etc.
39
+ COPY . /app
40
+
41
+ # ---- Ports & default envs ----
42
+ EXPOSE 8080
43
+ ENV DEVICE=cuda \
44
+ SAVE_FPS=16 \
45
+ RESULT_DIR=/app/results \
46
+ MODEL_META_PATH=demo/models.json \
47
+ EXAMPLE_META_PATH=demo/examples.json \
48
+ CAMERA_POSE_META_PATH=demo/camera_poses.json \
49
+ DEPTH_MODEL_PATH=pretrained/Metric3D/metric_depth_vit_large_800k.pth \
50
+ CAPTION_MODEL_PATH=pretrained/Qwen2.5-VL-7B-Instruct
51
+
52
+ # Optional (set at runtime to auto-download via gradio_app.py):
53
+ # ENV REPO_COGVIDEOX=THUDM/CogVideoX1.5-5B-I2V
54
+ # ENV REPO_METRIC3D=JUGGHM/Metric3D
55
+ # ENV REPO_QWEN_VL=Qwen/Qwen2.5-VL-7B-Instruct
56
+ # ENV REPO_REALCAM=MuteApo/RealCam-I2V
57
+ # ENV HF_TOKEN= # pass at runtime; don't bake secrets
58
+
59
+ # ---- Launch under Xvfb to satisfy headless GL ----
60
+ CMD ["bash", "-lc", "xvfb-run -s '-screen 0 1280x720x24' python3 finetune/gradio_app.py"]
index.html ADDED
@@ -0,0 +1,529 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html>
3
+
4
+ <head>
5
+ <meta charset="utf-8">
6
+ <!-- Meta tags for social media banners; these should be filled in appropriately, as they are your "business card" -->
7
+ <!-- Replace the content tag with appropriate information -->
8
+ <meta name="description"
9
+ content="RealCam-I2V: Real-World Image-to-Video Generation with Interactive Complex Camera Control">
10
+ <meta property="og:title"
11
+ content="RealCam-I2V: Real-World Image-to-Video Generation with Interactive Complex Camera Control" />
12
+ <meta property="og:description"
13
+ content="RealCam-I2V: Real-World Image-to-Video Generation with Interactive Complex Camera Control" />
14
+
15
+ <!-- Keywords for your paper to be indexed by-->
16
+ <meta name="keywords" content="RealCam-I2V, Complex Camera Control, Image-to-Video Generation">
17
+ <meta name="viewport" content="width=device-width, initial-scale=1">
18
+
19
+ <title>RealCam-I2V: Real-World Image-to-Video Generation with Interactive Complex Camera Control</title>
20
+ <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro" rel="stylesheet">
21
+ <link href="https://fonts.googleapis.com/css2?family=Playfair+Display:ital,wght@1,400&display=swap"
22
+ rel="stylesheet">
23
+
24
+ <link rel="stylesheet" href="static/css/bulma.min.css">
25
+ <link rel="stylesheet" href="static/css/bulma-carousel.min.css">
26
+ <link rel="stylesheet" href="static/css/bulma-slider.min.css">
27
+ <link rel="stylesheet" href="static/css/fontawesome.all.min.css">
28
+ <link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
29
+ <link rel="stylesheet" href="static/css/index.css">
30
+ <link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/dreampulse/computer-modern-web-font@master/fonts.css">
31
+
32
+ <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
33
+ <script src="https://documentcloud.adobe.com/view-sdk/main.js"></script>
34
+ <script defer src="static/js/fontawesome.all.min.js"></script>
35
+ <script src="static/js/bulma-carousel.min.js"></script>
36
+ <script src="static/js/bulma-slider.min.js"></script>
37
+ <script src="static/js/index.js"></script>
38
+
39
+ <script type="text/javascript" async
40
+ src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-AMS-MML_SVG"></script>
41
+ <script type="text/x-mathjax-config">
42
+ MathJax.Hub.Config({
43
+ tex2jax: {
44
+ inlineMath: [['$','$'], ['\\(','\\)']]
45
+ }
46
+ });
47
+ </script>
48
+
49
+ <style>
50
+ .video-container {
51
+ display: flex;
52
+ justify-content: center;
53
+ gap: 0px;
54
+ }
55
+
56
+ .italic {
57
+ font-family: 'Playfair Display';
58
+ font-style: italic;
59
+ }
60
+ </style>
61
+ </head>
62
+
63
+ <body>
64
+ <!-- title and author -->
65
+ <section class="hero">
66
+ <div class="hero-body">
67
+ <div class="container is-max-desktop">
68
+ <div class="columns is-centered">
69
+ <div class="column has-text-centered">
70
+ <h1 class="title is-2 publication-title">
71
+ RealCam-I2V: Real-World Image-to-Video Generation with Interactive Complex Camera Control
72
+ </h1>
73
+
74
+ <div class="is-size-5 publication-authors">
75
+ <span class="author-block">Teng Li<sup>1,2*</sup>,</span>
76
+ <span class="author-block">Guangcong Zheng<sup>1,2*</sup>,</span>
77
+ <span class="author-block">Rui Jiang<sup>1,2</sup>,</span>
78
+ <span class="author-block">Shuigen Zhan<sup>1</sup>,</span>
79
+ <span class="author-block">Tao Wu<sup>1</sup>,</span>
80
+ <span class="author-block">Yehao Lu<sup>1</sup>,</span>
81
+ <span class="author-block">Yining Lin<sup>3</sup>,</span>
82
+ <br>
83
+ <span class="author-block">Chuanyun Deng<sup>2</sup>,</span>
84
+ <span class="author-block">Yepan Xiong<sup>2</sup>,</span>
85
+ <span class="author-block">Min Chen<sup>2</sup>,</span>
86
+ <span class="author-block">Lin Cheng<sup>2</sup>,</span>
87
+ <span class="author-block">Xi Li<sup>1&#9993;</sup></span>
88
+ </div>
89
+
90
+ <div class="is-size-5 publication-authors">
91
+ <span class="author-block"><sup>1</sup>Zhejiang University,</span>
92
+ <span class="author-block"><sup>2</sup>Huawei,</span>
93
+ <span class="author-block"><sup>3</sup>Supremind</span>
94
+ <br>
95
+ <span class="author-block">ICCV 2025</span>
96
+ </div>
97
+
98
+ <div class="column has-text-centered">
99
+ <div class="publication-links">
100
+ <span class="link-block">
101
+ <a href="https://arxiv.org/pdf/2502.10059.pdf" target="_blank"
102
+ class="external-link button is-normal is-rounded is-dark">
103
+ <span class="icon">
104
+ <i class="fas fa-file-pdf"></i>
105
+ </span>
106
+ <span>Paper</span>
107
+ </a>
108
+ </span>
109
+
110
+ <span class="link-block">
111
+ <a href="https://arxiv.org/abs/2502.10059" target="_blank"
112
+ class="external-link button is-normal is-rounded is-dark">
113
+ <span class="icon">
114
+ <i class="ai ai-arxiv"></i>
115
+ </span>
116
+ <span>arXiv</span>
117
+ </a>
118
+ </span>
119
+
120
+ <span class="link-block">
121
+ <a href="https://github.com/ZGCTroy/RealCam-I2V" target="_blank"
122
+ class="external-link button is-normal is-rounded is-dark">
123
+ <span class="icon">
124
+ <i class="fab fa-github"></i>
125
+ </span>
126
+ <span>Code</span>
127
+ </a>
128
+ </span>
129
+
130
+ <span class="link-block">
131
+ <a href="https://github.com/ZGCTroy/CamI2V" target="_blank"
132
+ class="external-link button is-normal is-rounded is-dark">
133
+ <span class="icon">
134
+ <i class="fab fa-github"></i>
135
+ </span>
136
+ <span>CamI2V</span>
137
+ </a>
138
+ </span>
139
+ </div>
140
+ </div>
141
+ </div>
142
+ </div>
143
+ </div>
144
+ </div>
145
+ </section>
146
+
147
+ <!-- abstract -->
148
+ <section class="section hero is-light">
149
+ <div class="container is-max-desktop">
150
+ <div class="columns is-centered has-text-centered">
151
+ <div class="column is-four-fifths">
152
+ <h2 class="title is-3">Abstract</h2>
153
+ <div class="content has-text-justified">
154
+ <p>
155
+ Recent advancements in camera-trajectory-guided image-to-video generation offer higher
156
+ precision and better support for complex camera control compared to text-based approaches.
157
+ However, they also introduce significant usability challenges, as users often struggle to
158
+ provide precise camera parameters when working with arbitrary real-world images without
159
+ knowledge of their depth or scene scale.
160
+ To address these real-world application issues, we propose RealCam-I2V, a novel
161
+ diffusion-based video generation framework that integrates monocular metric depth estimation
162
+ to establish 3D scene reconstruction in a preprocessing step.
163
+ During training, the reconstructed 3D scene enables scaling camera parameters from relative
164
+ to metric scales, ensuring compatibility and scale consistency across diverse real-world
165
+ images.
166
+ In inference, RealCam-I2V offers an intuitive interface where users can precisely draw
167
+ camera trajectories by dragging within the 3D scene.
168
+ To further enhance precise camera control and scene consistency, we propose
169
+ scene-constrained noise shaping, which shapes high-level noise and also allows the framework
170
+ to maintain dynamic and coherent video generation in lower noise stages.
171
+ RealCam-I2V achieves significant improvements in controllability and video quality on the
172
+ RealEstate10K and out-of-domain images. We further enable applications like
173
+ camera-controlled looping video generation and generative frame interpolation.
174
+ </p>
175
+ </div>
176
+ </div>
177
+ </div>
178
+ </div>
179
+ </section>
180
+
181
+ <section class="section hero">
182
+ <div class="container has-text-centered">
183
+ <h2 class="title is-3">Demo</h2>
184
+
185
+ <div class="video-container">
186
+ <div>
187
+ <video autoplay controls muted loop width="80%">
188
+ <source src="static/videos/demo/4d_demo.mp4" type="video/mp4" />
189
+ </video>
190
+ </div>
191
+ </div>
192
+ <h2 class="subtitle has-text-centered italic">
193
+ 4D Visualization
194
+ </h2>
195
+
196
+ <br>
197
+
198
+ <div class="video-container">
199
+ <div>
200
+ <video autoplay controls muted loop>
201
+ <source src="static/videos/cogvideo1.5/73c3266a-d3e1-41c9-9691-729478a8bf77.mp4"
202
+ type="video/mp4" />
203
+ </video>
204
+ </div>
205
+
206
+ <div>
207
+ <video autoplay controls muted loop>
208
+ <source src="static/videos/cogvideo1.5/79131dea-ca85-49df-b68b-cdb208f164c7.mp4"
209
+ type="video/mp4" />
210
+ </video>
211
+ </div>
212
+
213
+ <div>
214
+ <video autoplay controls muted loop>
215
+ <source src="static/videos/cogvideo1.5/b17050b5-3ed8-44ae-94a4-ec939c57b41f.mp4"
216
+ type="video/mp4" />
217
+ </video>
218
+ </div>
219
+
220
+ <div>
221
+ <video autoplay controls muted loop>
222
+ <source src="static/videos/cogvideo1.5/8ab67ba3-8300-4b82-98b7-8e28403cf6f7.mp4"
223
+ type="video/mp4" />
224
+ </video>
225
+ </div>
226
+ </div>
227
+ <h2 class="subtitle has-text-centered italic">
228
+ Aerial View
229
+ </h2>
230
+
231
+ <br>
232
+
233
+ <div class="video-container">
234
+ <div>
235
+ <video autoplay controls muted loop>
236
+ <source src="static/videos/cogvideo1.5/3f962cd6-fbf4-4b8a-b107-1468931c80f4.mp4"
237
+ type="video/mp4" />
238
+ </video>
239
+ </div>
240
+
241
+ <div>
242
+ <video autoplay controls muted loop>
243
+ <source src="static/videos/cogvideo1.5/d4db16a8-3f82-43b3-8432-cc8df007f10c.mp4"
244
+ type="video/mp4" />
245
+ </video>
246
+ </div>
247
+
248
+ <div>
249
+ <video autoplay controls muted loop>
250
+ <source src="static/videos/cogvideo1.5/34614b89-431d-4e31-8d82-89a0f082aaed.mp4"
251
+ type="video/mp4" />
252
+ </video>
253
+ </div>
254
+
255
+ <div>
256
+ <video autoplay controls muted loop>
257
+ <source src="static/videos/cogvideo1.5/4f0b8b62-a278-4b0e-8457-b6e8b099de59.mp4"
258
+ type="video/mp4" />
259
+ </video>
260
+ </div>
261
+ </div>
262
+ <h2 class="subtitle has-text-centered italic">
263
+ Urban Exploration
264
+ </h2>
265
+
266
+ <br>
267
+
268
+ <div class="video-container">
269
+ <div>
270
+ <video autoplay controls muted loop>
271
+ <source src="static/videos/cogvideo1.5/f8180809-8e91-4ef8-b19b-9d42e99f5e00.mp4"
272
+ type="video/mp4" />
273
+ </video>
274
+ </div>
275
+
276
+ <div>
277
+ <video autoplay controls muted loop>
278
+ <source src="static/videos/cogvideo1.5/6c23cfd0-9618-4edd-9003-28b6b92c4196.mp4"
279
+ type="video/mp4" />
280
+ </video>
281
+ </div>
282
+
283
+ <div>
284
+ <video autoplay controls muted loop>
285
+ <source src="static/videos/cogvideo1.5/37c7abfa-c442-4df5-ace5-d2a2fa1c23aa.mp4"
286
+ type="video/mp4" />
287
+ </video>
288
+ </div>
289
+
290
+ <div>
291
+ <video autoplay controls muted loop>
292
+ <source src="static/videos/cogvideo1.5/e304abe7-3e5a-4929-9c0d-0dd8fec78b48.mp4"
293
+ type="video/mp4" />
294
+ </video>
295
+ </div>
296
+ </div>
297
+ <h2 class="subtitle has-text-centered italic">
298
+ FPV & Sports
299
+ </h2>
300
+
301
+ <br>
302
+
303
+ <div class="video-container">
304
+ <div>
305
+ <video autoplay controls muted loop>
306
+ <source src="static/videos/dynamic/cogvideox_controlnetxs_c52592a0.mp4" type="video/mp4" />
307
+ </video>
308
+ </div>
309
+
310
+ <div>
311
+ <video autoplay controls muted loop>
312
+ <source src="static/videos/dynamic/cogvideox_controlnetxs_19c3e433.mp4" type="video/mp4" />
313
+ </video>
314
+ </div>
315
+
316
+ <div>
317
+ <video autoplay controls muted loop>
318
+ <source src="static/videos/dynamic/cogvideox_controlnetxs_43d1ce7d.mp4" type="video/mp4" />
319
+ </video>
320
+ </div>
321
+
322
+ <div>
323
+ <video autoplay controls muted loop>
324
+ <source src="static/videos/dynamic/cogvideox_controlnetxs_183e7ba2.mp4" type="video/mp4" />
325
+ </video>
326
+ </div>
327
+ </div>
328
+ <h2 class="subtitle has-text-centered italic">
329
+ Complex Trajectories & Scene Dynamics
330
+ </h2>
331
+
332
+ <br>
333
+
334
+ <div class="video-container">
335
+ <div>
336
+ <video autoplay controls muted loop width="60%">
337
+ <source src="static/videos/demo/cogvideox.mp4" type="video/mp4" />
338
+ </video>
339
+ </div>
340
+ </div>
341
+ <h2 class="subtitle has-text-centered italic">
342
+ Various Domains
343
+ </h2>
344
+
345
+ <br>
346
+
347
+ <div class="video-container">
348
+ <div>
349
+ <video autoplay controls muted loop>
350
+ <source src="static/videos/various_types/cartoon.mp4" type="video/mp4" />
351
+ </video>
352
+ <h2 class="subtitle has-text-centered italic">
353
+ Cartoon
354
+ </h2>
355
+ </div>
356
+
357
+ <div>
358
+ <video autoplay controls muted loop>
359
+ <source src="static/videos/various_types/food.mp4" type="video/mp4" />
360
+ </video>
361
+ <h2 class="subtitle has-text-centered italic">
362
+ Food
363
+ </h2>
364
+ </div>
365
+ </div>
366
+
367
+ <br>
368
+
369
+ <div class="video-container">
370
+ <div>
371
+ <video autoplay controls muted loop>
372
+ <source src="static/videos/various_types/human.mp4" type="video/mp4" />
373
+ </video>
374
+ <h2 class="subtitle has-text-centered italic">
375
+ Human
376
+ </h2>
377
+ </div>
378
+
379
+ <div>
380
+ <video autoplay controls muted loop>
381
+ <source src="static/videos/various_types/pets.mp4" type="video/mp4" />
382
+ </video>
383
+ <h2 class="subtitle has-text-centered italic">
384
+ Pets
385
+ </h2>
386
+ </div>
387
+ </div>
388
+
389
+ <br>
390
+
391
+ <div class="video-container">
392
+ <div>
393
+ <video autoplay controls muted loop>
394
+ <source src="static/videos/demo/product_demo.mp4" type="video/mp4" />
395
+ </video>
396
+ <h2 class="subtitle has-text-centered italic">
397
+ Product Demo
398
+ </h2>
399
+ </div>
400
+
401
+ <div>
402
+ <video autoplay controls muted loop>
403
+ <source src="static/videos/demo/chinese_landscape.mp4" type="video/mp4" />
404
+ </video>
405
+ <h2 class="subtitle has-text-centered italic">
406
+ Chinese Antique
407
+ </h2>
408
+ </div>
409
+ </div>
410
+
411
+ </div>
412
+
413
+ </section>
414
+
415
+ <!-- Method -->
416
+ <section class="section hero">
417
+ <div class="container has-text-centered">
418
+ <h2 class="title is-3">Method</h2>
419
+
420
+ <!-- step 1 -->
421
+ <div class="container has-text-centered">
422
+ <h2 class="title has-text-centered is-4 italic">
423
+ Step 1 (Training & Inference): Construct 3D point cloud by monocular metric depth estimation.
424
+ </h2>
425
+
426
+ <div class="video-container" style="gap: 5px;">
427
+ <img src="static/images/scene1.jpg" width="25%" />
428
+ <img src="static/images/scene2.jpg" width="25%" />
429
+ <img src="static/images/scene3.jpg" width="25%" />
430
+ </div>
431
+ </div>
432
+
433
+ <br>
434
+
435
+ <!-- step 2 -->
436
+ <div class="container has-text-centered">
437
+ <h2 class="title has-text-centered is-4 italic">
438
+ Step 2 (Training): Align from relative-scale to metric-scale.
439
+ </h2>
440
+
441
+ <img src="static/images/align.jpg" width="80%" />
442
+ </div>
443
+
444
+ <br>
445
+
446
+ <!-- step 3 -->
447
+ <div class="container has-text-centered">
448
+ <h2 class="title has-text-centered is-4 italic">
449
+ Step 3 (Inference): Render preview video with camera trajectory on the reconstructed 3D scene.
450
+ </h2>
451
+
452
+ <div class="video-container">
453
+ <video autoplay controls muted loop>
454
+ <source src="static/videos/preview_video/preview1.mp4" type="video/mp4" />
455
+ </video>
456
+
457
+ <video autoplay controls muted loop>
458
+ <source src="static/videos/preview_video/preview2.mp4" type="video/mp4" />
459
+ </video>
460
+
461
+ <video autoplay controls muted loop>
462
+ <source src="static/videos/preview_video/preview3.mp4" type="video/mp4" />
463
+ </video>
464
+ </div>
465
+ </div>
466
+
467
+ <br>
468
+
469
+ <!-- step 4 -->
470
+ <div class="container has-text-centered">
471
+ <h2 class="title has-text-centered is-4 italic">
472
+ Step 4 (Inference): Scene-constrained noise shaping.
473
+ </h2>
474
+
475
+ <div class="content has-text-justified">
476
+ We paste the visible latents of the preview video into the predicted latent during the generation process.
477
+ However, we only paste at high noise levels and allow for
478
+ dynamics at lower noise levels; hence the name "noise shaping": it only shapes the noise at the
479
+ initial high-noise stage.
480
+ </div>
481
+
482
+ <div class="video-container">
483
+ <div>
484
+ <video autoplay controls muted loop>
485
+ <source src="static/videos/ablation/ablation1_preview.mp4" type="video/mp4" />
486
+ </video>
487
+ <h2 class="subtitle has-text-centered is-6 italic">
488
+ Preview Video
489
+ </h2>
490
+ </div>
491
+
492
+ <div>
493
+ <video autoplay controls muted loop>
494
+ <source src="static/videos/ablation/ablation1_withNoiseShaping.mp4" type="video/mp4" />
495
+ </video>
496
+ <h2 class="subtitle has-text-centered is-6 italic">
497
+ w. Scene-Constrained Noise Shaping
498
+ </h2>
499
+ </div>
500
+
501
+ <div>
502
+ <video autoplay controls muted loop>
503
+ <source src="static/videos/ablation/ablation1_withoutNoiseShaping.mp4" type="video/mp4" />
504
+ </video>
505
+ <h2 class="subtitle has-text-centered is-6 italic">
506
+ w.o. Scene-Constrained Noise Shaping
507
+ </h2>
508
+ </div>
509
+ </div>
510
+ </div>
511
+ </div>
512
+ </section>
513
+
514
+ <section class="section" id="BibTeX">
515
+ <div class="container is-max-desktop content">
516
+ <h2 class="title">BibTeX</h2>
517
+ <pre><code>
518
+ @article{li2025realcam,
519
+ title={RealCam-I2V: Real-World Image-to-Video Generation with Interactive Complex Camera Control},
520
+ author={Li, Teng and Zheng, Guangcong and Jiang, Rui and Zhan, Shuigen and Wu, Tao and Lu, Yehao and Lin, Yining and Li, Xi},
521
+ journal={arXiv preprint arXiv:2502.10059},
522
+ year={2025},
523
+ }
524
+ </code></pre>
525
+ </div>
526
+ </section>
527
+ </body>
528
+
529
+ </html>
requirements.txt ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ diffusers
2
+ accelerate<1.8
3
+ transformers
4
+ numpy<2
5
+ torch==2.6
6
+ torchvision
7
+ torchcodec==0.0.2
8
+ sentencepiece
9
+ gradio
10
+ imageio
11
+ imageio-ffmpeg
12
+ moviepy
13
+ wandb
14
+ deepspeed<0.17
15
+ peft
16
+ decord
17
+ opencv-python
18
+ tensorboard
19
+ open_clip_torch==2.22.0
20
+ einops
21
+ fairscale
22
+ timm
23
+ pillow
24
+ xfuser
25
+ flash-attn<2.8
26
+ xformers
27
+ lightning
28
+ mmcv
29
+ html4vision
30
+ plyfile
31
+ open3d
32
+ pyvirtualdisplay
33
+ qwen_vl_utils[decord]