Fabrice-TIERCELIN committed
Commit e4642a8 (verified)
1 Parent(s): 952536d

Upload 5 files

Files changed (5)
  1. README.md +11 -18
  2. app.py +496 -984
  3. app_endframe.py +0 -0
  4. app_v2v.py +4 -0
  5. requirements.txt +22 -40
README.md CHANGED
@@ -1,21 +1,14 @@
1
  ---
2
- title: SUPIR Image Upscaler
 
 
 
3
  sdk: gradio
4
- emoji: 📷
5
- sdk_version: 4.38.1
6
  app_file: app.py
7
- license: mit
8
- colorFrom: blue
9
- colorTo: pink
10
- tags:
11
- - Upscaling
12
- - Restoring
13
- - Image-to-Image
14
- - Image-2-Image
15
- - Img-to-Img
16
- - Img-2-Img
17
- - language models
18
- - LLMs
19
- short_description: Restore blurred or small images with prompt
20
- suggested_hardware: zero-a10g
21
- ---
 
1
  ---
2
+ title: FramePack F1 + V2V + EF
3
+ emoji: 👽
4
+ colorFrom: pink
5
+ colorTo: gray
6
  sdk: gradio
7
+ sdk_version: 5.29.1
 
8
  app_file: app.py
9
+ pinned: true
10
+ license: apache-2.0
11
+ short_description: fast video generation from images & text
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,1036 +1,548 @@
 
 
1
  import os
 
 
 
2
  import gradio as gr
3
- import argparse
4
- import numpy as np
5
  import torch
 
6
  import einops
7
- import copy
 
8
  import math
9
- import time
10
  import random
11
  import spaces
12
- import re
13
- import uuid
14
 
15
- from gradio_imageslider import ImageSlider
16
  from PIL import Image
17
- from SUPIR.util import HWC3, upscale_image, fix_resize, convert_dtype, create_SUPIR_model, load_QF_ckpt
18
- from huggingface_hub import hf_hub_download
19
- from pillow_heif import register_heif_opener
20
-
21
- register_heif_opener()
22
-
23
- max_64_bit_int = np.iinfo(np.int32).max
24
-
25
- hf_hub_download(repo_id="laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", filename="open_clip_pytorch_model.bin", local_dir="laion_CLIP-ViT-bigG-14-laion2B-39B-b160k")
26
- hf_hub_download(repo_id="camenduru/SUPIR", filename="sd_xl_base_1.0_0.9vae.safetensors", local_dir="yushan777_SUPIR")
27
- hf_hub_download(repo_id="camenduru/SUPIR", filename="SUPIR-v0F.ckpt", local_dir="yushan777_SUPIR")
28
- hf_hub_download(repo_id="camenduru/SUPIR", filename="SUPIR-v0Q.ckpt", local_dir="yushan777_SUPIR")
29
- hf_hub_download(repo_id="RunDiffusion/Juggernaut-XL-Lightning", filename="Juggernaut_RunDiffusionPhoto2_Lightning_4Steps.safetensors", local_dir="RunDiffusion_Juggernaut-XL-Lightning")
30
-
31
- parser = argparse.ArgumentParser()
32
- parser.add_argument("--opt", type=str, default='options/SUPIR_v0.yaml')
33
- parser.add_argument("--ip", type=str, default='127.0.0.1')
34
- parser.add_argument("--port", type=int, default='6688')
35
- parser.add_argument("--no_llava", action='store_true', default=True)#False
36
- parser.add_argument("--use_image_slider", action='store_true', default=False)#False
37
- parser.add_argument("--log_history", action='store_true', default=False)
38
- parser.add_argument("--loading_half_params", action='store_true', default=False)#False
39
- parser.add_argument("--use_tile_vae", action='store_true', default=True)#False
40
- parser.add_argument("--encoder_tile_size", type=int, default=512)
41
- parser.add_argument("--decoder_tile_size", type=int, default=64)
42
- parser.add_argument("--load_8bit_llava", action='store_true', default=False)
43
- args = parser.parse_args()
44
-
45
- if torch.cuda.device_count() > 0:
46
- SUPIR_device = 'cuda:0'
47
-
48
- # Load SUPIR
49
- model, default_setting = create_SUPIR_model(args.opt, SUPIR_sign='Q', load_default_setting=True)
50
- if args.loading_half_params:
51
- model = model.half()
52
- if args.use_tile_vae:
53
- model.init_tile_vae(encoder_tile_size=args.encoder_tile_size, decoder_tile_size=args.decoder_tile_size)
54
- model = model.to(SUPIR_device)
55
- model.first_stage_model.denoise_encoder_s1 = copy.deepcopy(model.first_stage_model.denoise_encoder)
56
- model.current_model = 'v0-Q'
57
- ckpt_Q, ckpt_F = load_QF_ckpt(args.opt)
58
-
59
- def check_upload(input_image):
60
- if input_image is None:
61
- raise gr.Error("Please provide an image to restore.")
62
- return gr.update(visible = True)
63
-
64
- def update_seed(is_randomize_seed, seed):
65
- if is_randomize_seed:
66
- return random.randint(0, max_64_bit_int)
67
- return seed
68
-
69
- def reset():
70
- return [
71
- None,
72
- 0,
73
- None,
74
- None,
75
- "Cinematic, High Contrast, highly detailed, taken using a Canon EOS R camera, hyper detailed photo - realistic maximum detail, 32k, Color Grading, ultra HD, extreme meticulous detailing, skin pore detailing, hyper sharpness, perfect without deformations.",
76
- "painting, oil painting, illustration, drawing, art, sketch, anime, cartoon, CG Style, 3D render, unreal engine, blurring, aliasing, pixel, unsharp, weird textures, ugly, dirty, messy, worst quality, low quality, frames, watermark, signature, jpeg artifacts, deformed, lowres, over-smooth",
77
- 1,
78
- 1024,
79
- 1,
80
- 2,
81
- 50,
82
- -1.0,
83
- 1.,
84
- default_setting.s_cfg_Quality if torch.cuda.device_count() > 0 else 1.0,
85
- True,
86
- random.randint(0, max_64_bit_int),
87
- 5,
88
- 1.003,
89
- "Wavelet",
90
- "fp32",
91
- "fp32",
92
- 1.0,
93
- True,
94
- False,
95
- default_setting.spt_linear_CFG_Quality if torch.cuda.device_count() > 0 else 1.0,
96
- 0.,
97
- "v0-Q",
98
- "input",
99
- 179
100
- ]
101
-
102
- def check_and_update(input_image):
103
- if input_image is None:
104
- raise gr.Error("Please provide an image to restore.")
105
- return gr.update(visible = True)
106
-
107
- @spaces.GPU(duration=420)
108
- def stage1_process(
109
- input_image,
110
- gamma_correction,
111
- diff_dtype,
112
- ae_dtype
113
- ):
114
- print('stage1_process ==>>')
115
- if torch.cuda.device_count() == 0:
116
- gr.Warning('Set this space to GPU config to make it work.')
117
- return None, None
118
- torch.cuda.set_device(SUPIR_device)
119
- LQ = HWC3(np.array(Image.open(input_image)))
120
- LQ = fix_resize(LQ, 512)
121
- # stage1
122
- LQ = np.array(LQ) / 255 * 2 - 1
123
- LQ = torch.tensor(LQ, dtype=torch.float32).permute(2, 0, 1).unsqueeze(0).to(SUPIR_device)[:, :3, :, :]
124
-
125
- model.ae_dtype = convert_dtype(ae_dtype)
126
- model.model.dtype = convert_dtype(diff_dtype)
127
-
128
- LQ = model.batchify_denoise(LQ, is_stage1=True)
129
- LQ = (LQ[0].permute(1, 2, 0) * 127.5 + 127.5).cpu().numpy().round().clip(0, 255).astype(np.uint8)
130
- # gamma correction
131
- LQ = LQ / 255.0
132
- LQ = np.power(LQ, gamma_correction)
133
- LQ *= 255.0
134
- LQ = LQ.round().clip(0, 255).astype(np.uint8)
135
- print('<<== stage1_process')
136
- return LQ, gr.update(visible = True)
137
-
138
- def stage2_process_example(*args, **kwargs):
139
- [result_slider, result_gallery, restore_information, reset_btn] = restore_in_Xmin(*args, **kwargs)
140
- return [result_slider, restore_information, reset_btn]
141
-
142
- def stage2_process(*args, **kwargs):
143
- try:
144
- return restore_in_Xmin(*args, **kwargs)
145
- except Exception as e:
146
- # NO_GPU_MESSAGE_INQUEUE
147
- print("gradio.exceptions.Error 'No GPU is currently available for you after 60s'")
148
- print('str(type(e)): ' + str(type(e))) # <class 'gradio.exceptions.Error'>
149
- print('str(e): ' + str(e)) # You have exceeded your GPU quota...
150
- try:
151
- print('e.message: ' + e.message) # No GPU is currently available for you after 60s
152
- except Exception as e2:
153
- print('Failure')
154
- if str(e).startswith("No GPU is currently available for you after 60s"):
155
- print('Exception identified!!!')
156
- #if str(type(e)) == "<class 'gradio.exceptions.Error'>":
157
- #print('Exception of name ' + type(e).__name__)
158
- raise e
159
-
160
- def restore_in_Xmin(
161
- noisy_image,
162
- rotation,
163
- denoise_image,
164
- prompt,
165
- a_prompt,
166
- n_prompt,
167
- num_samples,
168
- min_size,
169
- downscale,
170
- upscale,
171
- edm_steps,
172
- s_stage1,
173
- s_stage2,
174
- s_cfg,
175
- randomize_seed,
176
- seed,
177
- s_churn,
178
- s_noise,
179
- color_fix_type,
180
- diff_dtype,
181
- ae_dtype,
182
- gamma_correction,
183
- linear_CFG,
184
- linear_s_stage2,
185
- spt_linear_CFG,
186
- spt_linear_s_stage2,
187
- model_select,
188
- output_format,
189
- allocation
190
- ):
191
- print("noisy_image:\n" + str(noisy_image))
192
- print("denoise_image:\n" + str(denoise_image))
193
- print("rotation: " + str(rotation))
194
- print("prompt: " + str(prompt))
195
- print("a_prompt: " + str(a_prompt))
196
- print("n_prompt: " + str(n_prompt))
197
- print("num_samples: " + str(num_samples))
198
- print("min_size: " + str(min_size))
199
- print("downscale: " + str(downscale))
200
- print("upscale: " + str(upscale))
201
- print("edm_steps: " + str(edm_steps))
202
- print("s_stage1: " + str(s_stage1))
203
- print("s_stage2: " + str(s_stage2))
204
- print("s_cfg: " + str(s_cfg))
205
- print("randomize_seed: " + str(randomize_seed))
206
- print("seed: " + str(seed))
207
- print("s_churn: " + str(s_churn))
208
- print("s_noise: " + str(s_noise))
209
- print("color_fix_type: " + str(color_fix_type))
210
- print("diff_dtype: " + str(diff_dtype))
211
- print("ae_dtype: " + str(ae_dtype))
212
- print("gamma_correction: " + str(gamma_correction))
213
- print("linear_CFG: " + str(linear_CFG))
214
- print("linear_s_stage2: " + str(linear_s_stage2))
215
- print("spt_linear_CFG: " + str(spt_linear_CFG))
216
- print("spt_linear_s_stage2: " + str(spt_linear_s_stage2))
217
- print("model_select: " + str(model_select))
218
- print("GPU time allocation: " + str(allocation) + " min")
219
- print("output_format: " + str(output_format))
220
-
221
- input_format = re.sub(r"^.*\.([^\.]+)$", r"\1", noisy_image)
222
-
223
- if input_format not in ['png', 'webp', 'jpg', 'jpeg', 'gif', 'bmp', 'heic']:
224
- gr.Warning('Invalid image format. Please first convert into *.png, *.webp, *.jpg, *.jpeg, *.gif, *.bmp or *.heic.')
225
- return None, None, None, None
226
-
227
- if output_format == "input":
228
- if noisy_image is None:
229
- output_format = "png"
230
  else:
231
- output_format = input_format
232
- print("final output_format: " + str(output_format))
233
 
234
- if prompt is None:
235
- prompt = ""
236
 
237
- if a_prompt is None:
238
- a_prompt = ""
 
 
 
239
 
240
- if n_prompt is None:
241
- n_prompt = ""
242
 
243
- if prompt != "" and a_prompt != "":
244
- a_prompt = prompt + ", " + a_prompt
245
- else:
246
- a_prompt = prompt + a_prompt
247
- print("Final prompt: " + str(a_prompt))
248
 
249
- denoise_image = np.array(Image.open(noisy_image if denoise_image is None else denoise_image))
250
 
251
- if rotation == 90:
252
- denoise_image = np.array(list(zip(*denoise_image[::-1])))
253
- elif rotation == 180:
254
- denoise_image = np.array(list(zip(*denoise_image[::-1])))
255
- denoise_image = np.array(list(zip(*denoise_image[::-1])))
256
- elif rotation == -90:
257
- denoise_image = np.array(list(zip(*denoise_image))[::-1])
258
 
259
- if 1 < downscale:
260
- input_height, input_width, input_channel = denoise_image.shape
261
- denoise_image = np.array(Image.fromarray(denoise_image).resize((input_width // downscale, input_height // downscale), Image.LANCZOS))
262
 
263
- denoise_image = HWC3(denoise_image)
264
 
265
- if torch.cuda.device_count() == 0:
266
- gr.Warning('Set this space to GPU config to make it work.')
267
- return [noisy_image, denoise_image], gr.update(label="Downloadable results in *." + output_format + " format", format = output_format, value = [denoise_image]), None, gr.update(visible=True)
268
 
269
- if model_select != model.current_model:
270
- print('load ' + model_select)
271
- if model_select == 'v0-Q':
272
- model.load_state_dict(ckpt_Q, strict=False)
273
- elif model_select == 'v0-F':
274
- model.load_state_dict(ckpt_F, strict=False)
275
- model.current_model = model_select
276
 
277
- model.ae_dtype = convert_dtype(ae_dtype)
278
- model.model.dtype = convert_dtype(diff_dtype)
279
 
280
- return restore_on_gpu(
281
- noisy_image, denoise_image, prompt, a_prompt, n_prompt, num_samples, min_size, downscale, upscale, edm_steps, s_stage1, s_stage2, s_cfg, randomize_seed, seed, s_churn, s_noise, color_fix_type, diff_dtype, ae_dtype, gamma_correction, linear_CFG, linear_s_stage2, spt_linear_CFG, spt_linear_s_stage2, model_select, output_format, allocation
282
- )
283
 
284
- def get_duration(
285
- noisy_image,
286
- input_image,
287
- prompt,
288
- a_prompt,
289
- n_prompt,
290
- num_samples,
291
- min_size,
292
- downscale,
293
- upscale,
294
- edm_steps,
295
- s_stage1,
296
- s_stage2,
297
- s_cfg,
298
- randomize_seed,
299
- seed,
300
- s_churn,
301
- s_noise,
302
- color_fix_type,
303
- diff_dtype,
304
- ae_dtype,
305
- gamma_correction,
306
- linear_CFG,
307
- linear_s_stage2,
308
- spt_linear_CFG,
309
- spt_linear_s_stage2,
310
- model_select,
311
- output_format,
312
- allocation
313
- ):
314
- return allocation
315
 
316
  @spaces.GPU(duration=get_duration)
317
- def restore_on_gpu(
318
- noisy_image,
319
- input_image,
320
- prompt,
321
- a_prompt,
322
- n_prompt,
323
- num_samples,
324
- min_size,
325
- downscale,
326
- upscale,
327
- edm_steps,
328
- s_stage1,
329
- s_stage2,
330
- s_cfg,
331
- randomize_seed,
332
- seed,
333
- s_churn,
334
- s_noise,
335
- color_fix_type,
336
- diff_dtype,
337
- ae_dtype,
338
- gamma_correction,
339
- linear_CFG,
340
- linear_s_stage2,
341
- spt_linear_CFG,
342
- spt_linear_s_stage2,
343
- model_select,
344
- output_format,
345
- allocation
346
- ):
347
- start = time.time()
348
- print('restore ==>>')
349
-
350
- torch.cuda.set_device(SUPIR_device)
351
-
352
- with torch.no_grad():
353
- input_image = upscale_image(input_image, upscale, unit_resolution=32, min_size=min_size)
354
- LQ = np.array(input_image) / 255.0
355
- LQ = np.power(LQ, gamma_correction)
356
- LQ *= 255.0
357
- LQ = LQ.round().clip(0, 255).astype(np.uint8)
358
- LQ = LQ / 255 * 2 - 1
359
- LQ = torch.tensor(LQ, dtype=torch.float32).permute(2, 0, 1).unsqueeze(0).to(SUPIR_device)[:, :3, :, :]
360
- captions = ['']
361
-
362
- samples = model.batchify_sample(LQ, captions, num_steps=edm_steps, restoration_scale=s_stage1, s_churn=s_churn,
363
- s_noise=s_noise, cfg_scale=s_cfg, control_scale=s_stage2, seed=seed,
364
- num_samples=num_samples, p_p=a_prompt, n_p=n_prompt, color_fix_type=color_fix_type,
365
- use_linear_CFG=linear_CFG, use_linear_control_scale=linear_s_stage2,
366
- cfg_scale_start=spt_linear_CFG, control_scale_start=spt_linear_s_stage2)
367
-
368
- x_samples = (einops.rearrange(samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().round().clip(
369
- 0, 255).astype(np.uint8)
370
- results = [x_samples[i] for i in range(num_samples)]
371
- torch.cuda.empty_cache()
372
-
373
- # All the results have the same size
374
- input_height, input_width, input_channel = np.array(input_image).shape
375
- result_height, result_width, result_channel = np.array(results[0]).shape
376
-
377
- print('<<== restore')
378
- end = time.time()
379
- secondes = int(end - start)
380
- minutes = math.floor(secondes / 60)
381
- secondes = secondes - (minutes * 60)
382
- hours = math.floor(minutes / 60)
383
- minutes = minutes - (hours * 60)
384
- information = ("Start the process again if you want a different result. " if randomize_seed else "") + \
385
- "If you don't get the image you wanted, add more details in the « Image description ». " + \
386
- "The image" + (" has" if len(results) == 1 else "s have") + " been generated in " + \
387
- ((str(hours) + " h, ") if hours != 0 else "") + \
388
- ((str(minutes) + " min, ") if hours != 0 or minutes != 0 else "") + \
389
- str(secondes) + " sec. " + \
390
- "The new image resolution is " + str(result_width) + \
391
- " pixels large and " + str(result_height) + \
392
- " pixels high, so a resolution of " + f'{result_width * result_height:,}' + " pixels."
393
- print(information)
394
- try:
395
- print("Initial resolution: " + f'{input_width * input_height:,}')
396
- print("Final resolution: " + f'{result_width * result_height:,}')
397
- print("edm_steps: " + str(edm_steps))
398
- print("num_samples: " + str(num_samples))
399
- print("downscale: " + str(downscale))
400
- print("Estimated minutes: " + f'{(((result_width * result_height**(1/1.75)) * input_width * input_height * (edm_steps**(1/2)) * (num_samples**(1/2.5)))**(1/2.5)) / 25000:,}')
401
- except Exception as e:
402
- print('Exception of Estimation')
403
-
404
- # Only one image can be shown in the slider
405
- return [noisy_image] + [results[0]], gr.update(label="Downloadable results in *." + output_format + " format", format = output_format, value = results), gr.update(value = information, visible = True), gr.update(visible=True)
406
-
407
- def load_and_reset(param_setting):
408
- print('load_and_reset ==>>')
409
  if torch.cuda.device_count() == 0:
410
  gr.Warning('Set this space to GPU config to make it work.')
411
- return None, None, None, None, None, None, None, None, None, None, None, None, None, None
412
- edm_steps = default_setting.edm_steps
413
- s_stage2 = 1.0
414
- s_stage1 = -1.0
415
- s_churn = 5
416
- s_noise = 1.003
417
- a_prompt = 'Cinematic, High Contrast, highly detailed, taken using a Canon EOS R camera, hyper detailed photo - ' \
418
- 'realistic maximum detail, 32k, Color Grading, ultra HD, extreme meticulous detailing, skin pore ' \
419
- 'detailing, hyper sharpness, perfect without deformations.'
420
- n_prompt = 'painting, oil painting, illustration, drawing, art, sketch, anime, cartoon, CG Style, ' \
421
- '3D render, unreal engine, blurring, dirty, messy, worst quality, low quality, frames, watermark, ' \
422
- 'signature, jpeg artifacts, deformed, lowres, over-smooth'
423
- color_fix_type = 'Wavelet'
424
- spt_linear_s_stage2 = 0.0
425
- linear_s_stage2 = False
426
- linear_CFG = True
427
- if param_setting == "Quality":
428
- s_cfg = default_setting.s_cfg_Quality
429
- spt_linear_CFG = default_setting.spt_linear_CFG_Quality
430
- model_select = "v0-Q"
431
- elif param_setting == "Fidelity":
432
- s_cfg = default_setting.s_cfg_Fidelity
433
- spt_linear_CFG = default_setting.spt_linear_CFG_Fidelity
434
- model_select = "v0-F"
435
- else:
436
- raise NotImplementedError
437
- gr.Info('The parameters are reset.')
438
- print('<<== load_and_reset')
439
- return edm_steps, s_cfg, s_stage2, s_stage1, s_churn, s_noise, a_prompt, n_prompt, color_fix_type, linear_CFG, \
440
- linear_s_stage2, spt_linear_CFG, spt_linear_s_stage2, model_select
441
-
442
- def log_information(result_gallery):
443
- print('log_information')
444
- if result_gallery is not None:
445
- for i, result in enumerate(result_gallery):
446
- print(result[0])
447
-
448
- def on_select_result(result_slider, result_gallery, evt: gr.SelectData):
449
- print('on_select_result')
450
- if result_gallery is not None:
451
- for i, result in enumerate(result_gallery):
452
- print(result[0])
453
- return [result_slider[0], result_gallery[evt.index][0]]
454
-
455
- title_html = """
456
- <h1><center>SUPIR</center></h1>
457
- <big><center>Upscale your images up to x10 freely, without account, without watermark and download it</center></big>
458
- <center><big><big>🤸<big><big><big><big><big><big>🤸</big></big></big></big></big></big></big></big></center>
459
 
460
- <p>This is an online demo of SUPIR, a practicing model scaling for photo-realistic image restoration.
461
- The content added by SUPIR is <b><u>imagination, not real-world information</u></b>.
462
- SUPIR is for beauty and illustration only.
463
- Most of the processes last few minutes.
464
- If you want to upscale AI-generated images, be noticed that <i>PixArt Sigma</i> space can directly generate 5984x5984 images.
465
- Due to Gradio issues, the generated image is slightly less satured than the original.
466
- Please leave a <a href="https://huggingface.co/spaces/Fabrice-TIERCELIN/SUPIR/discussions/new">message in discussion</a> if you encounter issues.
467
- You can also use <a href="https://huggingface.co/spaces/gokaygokay/AuraSR">AuraSR</a> to upscale x4.
468
 
469
- <p><center><a href="https://arxiv.org/abs/2401.13627">Paper</a> &emsp; <a href="http://supir.xpixel.group/">Project Page</a> &emsp; <a href="https://huggingface.co/blog/MonsterMMORPG/supir-sota-image-upscale-better-than-magnific-ai">Local Install Guide</a></center></p>
470
- <p><center><a style="display:inline-block" href='https://github.com/Fanghua-Yu/SUPIR'><img alt="GitHub Repo stars" src="https://img.shields.io/github/stars/Fanghua-Yu/SUPIR?style=social"></a></center></p>
471
- """
472
-
473
-
474
- claim_md = """
475
- ## **Piracy**
476
- The images are not stored but the logs are saved during a month.
477
- ## **How to get SUPIR**
478
- You can get SUPIR on HuggingFace by [duplicating this space](https://huggingface.co/spaces/Fabrice-TIERCELIN/SUPIR?duplicate=true) and set GPU.
479
- You can also install SUPIR on your computer following [this tutorial](https://huggingface.co/blog/MonsterMMORPG/supir-sota-image-upscale-better-than-magnific-ai).
480
- You can install _Pinokio_ on your computer and then install _SUPIR_ into it. It should be quite easy if you have an Nvidia GPU.
481
- ## **Terms of use**
482
- By using this service, users are required to agree to the following terms: The service is a research preview intended for non-commercial use only. It only provides limited safety measures and may generate offensive content. It must not be used for any illegal, harmful, violent, racist, or sexual purposes. The service may collect user dialogue data for future research. Please submit a feedback to us if you get any inappropriate answer! We will collect those to keep improving our models. For an optimal experience, please use desktop computers for this demo, as mobile devices may compromise its quality.
483
- ## **License**
484
- The service is a research preview intended for non-commercial use only, subject to the model [License](https://github.com/Fanghua-Yu/SUPIR) of SUPIR.
485
- """
486
-
487
- # Gradio interface
488
- with gr.Blocks() as interface:
489
  if torch.cuda.device_count() == 0:
490
  with gr.Row():
491
  gr.HTML("""
492
- <p style="background-color: red;"><big><big><big><b>⚠️To use SUPIR, <a href="https://huggingface.co/spaces/Fabrice-TIERCELIN/SUPIR?duplicate=true">duplicate this space</a> and set a GPU with 30 GB VRAM.</b>
493
 
494
- You can't use SUPIR directly here because this space runs on a CPU, which is not enough for SUPIR. Please provide <a href="https://huggingface.co/spaces/Fabrice-TIERCELIN/SUPIR/discussions/new">feedback</a> if you have issues.
495
  </big></big></big></p>
496
  """)
497
- gr.HTML(title_html)
498
-
499
- input_image = gr.Image(label="Input (*.png, *.webp, *.jpeg, *.jpg, *.gif, *.bmp, *.heic)", show_label=True, type="filepath", height=600, elem_id="image-input")
500
- rotation = gr.Radio([["No rotation", 0], ["⤵ Rotate +90°", 90], ["↩ Return 180°", 180], ["⤴ Rotate -90°", -90]], label="Orientation correction", info="Will apply the following rotation before restoring the image; the AI needs a good orientation to understand the content", value=0, interactive=True, visible=False)
501
- with gr.Group():
502
- prompt = gr.Textbox(label="Image description", info="Help the AI understand what the image represents; describe as much as possible, especially the details we can't see on the original image; you can write in any language", value="", placeholder="A 33 years old man, walking, in the street, Santiago, morning, Summer, photorealistic", lines=3)
503
- prompt_hint = gr.HTML("You can use a <a href='"'https://huggingface.co/spaces/badayvedat/LLaVA'"'>LlaVa space</a> to auto-generate the description of your image.")
504
- upscale = gr.Radio([["x1", 1], ["x2", 2], ["x3", 3], ["x4", 4], ["x5", 5], ["x6", 6], ["x7", 7], ["x8", 8], ["x9", 9], ["x10", 10]], label="Upscale factor", info="Resolution x1 to x10", value=2, interactive=True)
505
- output_format = gr.Radio([["As input", "input"], ["*.png", "png"], ["*.webp", "webp"], ["*.jpeg", "jpeg"], ["*.gif", "gif"], ["*.bmp", "bmp"]], label="Image format for result", info="File extention", value="input", interactive=True)
506
- allocation = gr.Slider(label="GPU allocation time (in seconds)", info="lower=May abort run, higher=Quota penalty for next runs; only useful for ZeroGPU", value=179, minimum=59, maximum=320, step=1)
507
-
508
- with gr.Accordion("Pre-denoising (optional)", open=False):
509
- gamma_correction = gr.Slider(label="Gamma Correction", info = "lower=lighter, higher=darker", minimum=0.1, maximum=2.0, value=1.0, step=0.1)
510
- denoise_button = gr.Button(value="Pre-denoise")
511
- denoise_image = gr.Image(label="Denoised image", show_label=True, type="filepath", sources=[], interactive = False, height=600, elem_id="image-s1")
512
- denoise_information = gr.HTML(value="If present, the denoised image will be used for the restoration instead of the input image.", visible=False)
513
-
514
- with gr.Accordion("Advanced options", open=False):
515
- a_prompt = gr.Textbox(label="Additional image description",
516
- info="Completes the main image description",
517
- value='Cinematic, High Contrast, highly detailed, taken using a Canon EOS R '
518
- 'camera, hyper detailed photo - realistic maximum detail, 32k, Color '
519
- 'Grading, ultra HD, extreme meticulous detailing, skin pore detailing, clothing fabric detailing, '
520
- 'hyper sharpness, perfect without deformations.',
521
- lines=3)
522
- n_prompt = gr.Textbox(label="Negative image description",
523
- info="Disambiguate by listing what the image does NOT represent",
524
- value='painting, oil painting, illustration, drawing, art, sketch, anime, '
525
- 'cartoon, CG Style, 3D render, unreal engine, blurring, aliasing, pixel, unsharp, weird textures, ugly, dirty, messy, '
526
- 'worst quality, low quality, frames, watermark, signature, jpeg artifacts, '
527
- 'deformed, lowres, over-smooth',
528
- lines=3)
529
- edm_steps = gr.Slider(label="Steps", info="lower=faster, higher=more details; too many steps create a checker effect", minimum=1, maximum=200, value=default_setting.edm_steps if torch.cuda.device_count() > 0 else 1, step=1)
530
- num_samples = gr.Slider(label="Num Samples", info="Number of generated results", minimum=1, maximum=4 if not args.use_image_slider else 1
531
- , value=1, step=1)
532
- min_size = gr.Slider(label="Minimum size", info="Minimum height, minimum width of the result", minimum=32, maximum=4096, value=1024, step=32)
533
- downscale = gr.Radio([["/1", 1], ["/2", 2], ["/3", 3], ["/4", 4], ["/5", 5], ["/6", 6], ["/7", 7], ["/8", 8], ["/9", 9], ["/10", 10]], label="Pre-downscale factor", info="Reducing blurred image reduce the process time", value=1, interactive=True)
534
- with gr.Row():
535
- with gr.Column():
536
- model_select = gr.Radio([["💃 Quality (v0-Q)", "v0-Q"], ["🎯 Fidelity (v0-F)", "v0-F"]], label="Model Selection", info="Pretrained model", value="v0-Q",
537
- interactive=True)
538
- with gr.Column():
539
- color_fix_type = gr.Radio([["None", "None"], ["AdaIn (improve as a photo)", "AdaIn"], ["Wavelet (for JPEG artifacts)", "Wavelet"]], label="Color-Fix Type", info="AdaIn=Improve following a style, Wavelet=For JPEG artifacts", value="AdaIn",
540
- interactive=True)
541
- s_cfg = gr.Slider(label="Text Guidance Scale", info="lower=follow the image, higher=follow the prompt", minimum=1.0, maximum=15.0,
542
- value=default_setting.s_cfg_Quality if torch.cuda.device_count() > 0 else 1.0, step=0.1)
543
- s_stage2 = gr.Slider(label="Restoring Guidance Strength", minimum=0., maximum=1., value=1., step=0.05)
544
- s_stage1 = gr.Slider(label="Pre-denoising Guidance Strength", minimum=-1.0, maximum=6.0, value=-1.0, step=1.0)
545
- s_churn = gr.Slider(label="S-Churn", minimum=0, maximum=40, value=5, step=1)
546
- s_noise = gr.Slider(label="S-Noise", minimum=1.0, maximum=1.1, value=1.003, step=0.001)
547
- with gr.Row():
548
- with gr.Column():
549
- linear_CFG = gr.Checkbox(label="Linear CFG", value=True)
550
- spt_linear_CFG = gr.Slider(label="CFG Start", minimum=1.0,
551
- maximum=9.0, value=default_setting.spt_linear_CFG_Quality if torch.cuda.device_count() > 0 else 1.0, step=0.5)
552
- with gr.Column():
553
- linear_s_stage2 = gr.Checkbox(label="Linear Restoring Guidance", value=False)
554
- spt_linear_s_stage2 = gr.Slider(label="Guidance Start", minimum=0.,
555
- maximum=1., value=0., step=0.05)
556
- with gr.Column():
557
- diff_dtype = gr.Radio([["fp32 (precision)", "fp32"], ["fp16 (medium)", "fp16"], ["bf16 (speed)", "bf16"]], label="Diffusion Data Type", value="fp32",
558
- interactive=True)
559
- with gr.Column():
560
- ae_dtype = gr.Radio([["fp32 (precision)", "fp32"], ["bf16 (speed)", "bf16"]], label="Auto-Encoder Data Type", value="fp32",
561
- interactive=True)
562
- randomize_seed = gr.Checkbox(label = "\U0001F3B2 Randomize seed", value = True, info = "If checked, result is always different")
563
- seed = gr.Slider(label="Seed", minimum=0, maximum=max_64_bit_int, step=1, randomize=True)
564
- with gr.Group():
565
- param_setting = gr.Radio(["Quality", "Fidelity"], interactive=True, label="Presetting", value = "Quality")
566
- restart_button = gr.Button(value="Apply presetting")
567
-
568
- with gr.Column():
569
- diffusion_button = gr.Button(value="🚀 Upscale/Restore", variant = "primary", elem_id = "process_button")
570
- reset_btn = gr.Button(value="🧹 Reinit page", variant="stop", elem_id="reset_button", visible = False)
571
-
572
- warning = gr.HTML(value = "<center><big>Your computer must <u>not</u> enter into standby mode.</big><br/>On Chrome, you can force to keep a tab alive in <code>chrome://discards/</code></center>", visible = False)
573
- restore_information = gr.HTML(value = "Restart the process to get another result.", visible = False)
574
- result_slider = ImageSlider(label = 'Comparator', show_label = False, interactive = False, elem_id = "slider1", show_download_button = False)
575
- result_gallery = gr.Gallery(label = 'Downloadable results', show_label = True, interactive = False, elem_id = "gallery1")
576
-
577
- gr.Examples(
578
- examples = [
579
- [
580
- "./Examples/Example1.png",
581
- 0,
582
- None,
583
- "Group of people, walking, happy, in the street, photorealistic, 8k, extremely detailled",
584
- "Cinematic, High Contrast, highly detailed, taken using a Canon EOS R camera, hyper detailed photo - realistic maximum detail, 32k, Color Grading, ultra HD, extreme meticulous detailing, skin pore detailing, hyper sharpness, perfect without deformations.",
585
- "painting, oil painting, illustration, drawing, art, sketch, anime, cartoon, CG Style, 3D render, unreal engine, blurring, aliasing, pixel, unsharp, weird textures, ugly, dirty, messy, worst quality, low quality, frames, watermark, signature, jpeg artifacts, deformed, lowres, over-smooth",
586
- 2,
587
- 1024,
588
- 1,
589
- 8,
590
- 100,
591
- -1,
592
- 1,
593
- 7.5,
594
- False,
595
- 42,
596
- 5,
597
- 1.003,
598
- "AdaIn",
599
- "fp16",
600
- "bf16",
601
- 1.0,
602
- True,
603
- 4,
604
- False,
605
- 0.,
606
- "v0-Q",
607
- "input",
608
- 179
609
- ],
610
- [
611
- "./Examples/Example2.jpeg",
612
- 0,
613
- None,
614
- "La cabeza de un gato atigrado, en una casa, fotorrealista, 8k, extremadamente detallada",
615
- "Cinematic, High Contrast, highly detailed, taken using a Canon EOS R camera, hyper detailed photo - realistic maximum detail, 32k, Color Grading, ultra HD, extreme meticulous detailing, skin pore detailing, hyper sharpness, perfect without deformations.",
616
- "painting, oil painting, illustration, drawing, art, sketch, anime, cartoon, CG Style, 3D render, unreal engine, blurring, aliasing, pixel, unsharp, weird textures, ugly, dirty, messy, worst quality, low quality, frames, watermark, signature, jpeg artifacts, deformed, lowres, over-smooth",
617
- 1,
618
- 1024,
619
- 1,
620
- 1,
621
- 200,
622
- -1,
623
- 1,
624
- 7.5,
625
- False,
626
- 42,
627
- 5,
628
- 1.003,
629
- "Wavelet",
630
- "fp16",
631
- "bf16",
632
- 1.0,
633
- True,
634
- 4,
635
- False,
636
- 0.,
637
- "v0-Q",
638
- "input",
639
- 179
640
- ],
641
- [
642
- "./Examples/Example3.webp",
643
- 0,
644
- None,
645
- "A red apple",
646
- "Cinematic, High Contrast, highly detailed, taken using a Canon EOS R camera, hyper detailed photo - realistic maximum detail, 32k, Color Grading, ultra HD, extreme meticulous detailing, skin pore detailing, hyper sharpness, perfect without deformations.",
647
- "painting, oil painting, illustration, drawing, art, sketch, anime, cartoon, CG Style, 3D render, unreal engine, blurring, aliasing, pixel, unsharp, weird textures, ugly, dirty, messy, worst quality, low quality, frames, watermark, signature, jpeg artifacts, deformed, lowres, over-smooth",
648
- 1,
649
- 1024,
650
- 1,
651
- 1,
652
- 200,
653
- -1,
654
- 1,
655
- 7.5,
656
- False,
657
- 42,
658
- 5,
659
- 1.003,
660
- "Wavelet",
661
- "fp16",
662
- "bf16",
663
- 1.0,
664
- True,
665
- 4,
666
- False,
667
- 0.,
668
- "v0-Q",
669
- "input",
670
- 179
671
- ],
672
- [
673
- "./Examples/Example3.webp",
674
- 0,
675
- None,
676
- "A red marble",
677
- "Cinematic, High Contrast, highly detailed, taken using a Canon EOS R camera, hyper detailed photo - realistic maximum detail, 32k, Color Grading, ultra HD, extreme meticulous detailing, skin pore detailing, hyper sharpness, perfect without deformations.",
678
- "painting, oil painting, illustration, drawing, art, sketch, anime, cartoon, CG Style, 3D render, unreal engine, blurring, aliasing, pixel, unsharp, weird textures, ugly, dirty, messy, worst quality, low quality, frames, watermark, signature, jpeg artifacts, deformed, lowres, over-smooth",
679
- 1,
680
- 1024,
681
- 1,
682
- 1,
683
- 200,
684
- -1,
685
- 1,
686
- 7.5,
687
- False,
688
- 42,
689
- 5,
690
- 1.003,
691
- "Wavelet",
692
- "fp16",
693
- "bf16",
694
- 1.0,
695
- True,
696
- 4,
697
- False,
698
- 0.,
699
- "v0-Q",
700
- "input",
701
- 179
702
- ],
703
- ],
704
- run_on_click = True,
705
- fn = stage2_process,
706
- inputs = [
707
- input_image,
708
- rotation,
709
- denoise_image,
710
- prompt,
711
- a_prompt,
712
- n_prompt,
713
- num_samples,
714
- min_size,
715
- downscale,
716
- upscale,
717
- edm_steps,
718
- s_stage1,
719
- s_stage2,
720
- s_cfg,
721
- randomize_seed,
722
- seed,
723
- s_churn,
724
- s_noise,
725
- color_fix_type,
726
- diff_dtype,
727
- ae_dtype,
728
- gamma_correction,
729
- linear_CFG,
730
- linear_s_stage2,
731
- spt_linear_CFG,
732
- spt_linear_s_stage2,
733
- model_select,
734
- output_format,
735
- allocation
736
- ],
737
- outputs = [
738
- result_slider,
739
- result_gallery,
740
- restore_information,
741
- reset_btn
742
- ],
743
- cache_examples = False,
744
- )
745
 
746
  with gr.Row(visible=False):
747
  gr.Examples(
748
  examples = [
749
  [
750
- "./Examples/Example1.png",
751
- 0,
752
- None,
753
- "Group of people, walking, happy, in the street, photorealistic, 8k, extremely detailled",
754
- "Cinematic, High Contrast, highly detailed, taken using a Canon EOS R camera, hyper detailed photo - realistic maximum detail, 32k, Color Grading, ultra HD, extreme meticulous detailing, skin pore detailing, hyper sharpness, perfect without deformations.",
755
- "painting, oil painting, illustration, drawing, art, sketch, anime, cartoon, CG Style, 3D render, unreal engine, blurring, aliasing, pixel, unsharp, weird textures, ugly, dirty, messy, worst quality, low quality, frames, watermark, signature, jpeg artifacts, deformed, lowres, over-smooth",
756
- 2,
757
- 1024,
758
- 1,
759
- 8,
760
- 100,
761
- -1,
762
- 1,
763
- 7.5,
764
- False,
765
- 42,
766
- 5,
767
- 1.003,
768
- "AdaIn",
769
- "fp16",
770
- "bf16",
771
- 1.0,
772
- True,
773
- 4,
774
- False,
775
- 0.,
776
- "v0-Q",
777
- "input",
778
- 179
779
  ],
780
  [
781
- "./Examples/Example2.jpeg",
782
- 0,
783
- None,
784
- "La cabeza de un gato atigrado, en una casa, fotorrealista, 8k, extremadamente detallada",
785
- "Cinematic, High Contrast, highly detailed, taken using a Canon EOS R camera, hyper detailed photo - realistic maximum detail, 32k, Color Grading, ultra HD, extreme meticulous detailing, skin pore detailing, hyper sharpness, perfect without deformations.",
786
- "painting, oil painting, illustration, drawing, art, sketch, anime, cartoon, CG Style, 3D render, unreal engine, blurring, aliasing, pixel, unsharp, weird textures, ugly, dirty, messy, worst quality, low quality, frames, watermark, signature, jpeg artifacts, deformed, lowres, over-smooth",
787
- 1,
788
- 1024,
789
- 1,
790
- 1,
791
- 200,
792
- -1,
793
- 1,
794
- 7.5,
795
- False,
796
- 42,
797
- 5,
798
- 1.003,
799
- "Wavelet",
800
- "fp16",
801
- "bf16",
802
- 1.0,
803
- True,
804
- 4,
805
- False,
806
- 0.,
807
- "v0-Q",
808
- "input",
809
- 179
810
  ],
811
  [
812
- "./Examples/Example3.webp",
813
- 0,
814
- None,
815
- "A red apple",
816
- "Cinematic, High Contrast, highly detailed, taken using a Canon EOS R camera, hyper detailed photo - realistic maximum detail, 32k, Color Grading, ultra HD, extreme meticulous detailing, skin pore detailing, hyper sharpness, perfect without deformations.",
817
- "painting, oil painting, illustration, drawing, art, sketch, anime, cartoon, CG Style, 3D render, unreal engine, blurring, aliasing, pixel, unsharp, weird textures, ugly, dirty, messy, worst quality, low quality, frames, watermark, signature, jpeg artifacts, deformed, lowres, over-smooth",
818
- 1,
819
- 1024,
820
- 1,
821
- 1,
822
- 200,
823
- -1,
824
- 1,
825
- 7.5,
826
- False,
827
- 42,
828
- 5,
829
- 1.003,
830
- "Wavelet",
831
- "fp16",
832
- "bf16",
833
- 1.0,
834
- True,
835
- 4,
836
- False,
837
- 0.,
838
- "v0-Q",
839
- "input",
840
- 179
841
  ],
842
  [
843
- "./Examples/Example3.webp",
844
- 0,
845
- None,
846
- "A red marble",
847
- "Cinematic, High Contrast, highly detailed, taken using a Canon EOS R camera, hyper detailed photo - realistic maximum detail, 32k, Color Grading, ultra HD, extreme meticulous detailing, skin pore detailing, hyper sharpness, perfect without deformations.",
848
- "painting, oil painting, illustration, drawing, art, sketch, anime, cartoon, CG Style, 3D render, unreal engine, blurring, aliasing, pixel, unsharp, weird textures, ugly, dirty, messy, worst quality, low quality, frames, watermark, signature, jpeg artifacts, deformed, lowres, over-smooth",
849
- 1,
850
- 1024,
851
- 1,
852
- 1,
853
- 200,
854
- -1,
855
- 1,
856
- 7.5,
857
- False,
858
- 42,
859
- 5,
860
- 1.003,
861
- "Wavelet",
862
- "fp16",
863
- "bf16",
864
- 1.0,
865
- True,
866
- 4,
867
- False,
868
- 0.,
869
- "v0-Q",
870
- "input",
871
- 179
872
  ],
873
  ],
874
  run_on_click = True,
875
- fn = stage2_process_example,
876
- inputs = [
877
- input_image,
878
- rotation,
879
- denoise_image,
880
- prompt,
881
- a_prompt,
882
- n_prompt,
883
- num_samples,
884
- min_size,
885
- downscale,
886
- upscale,
887
- edm_steps,
888
- s_stage1,
889
- s_stage2,
890
- s_cfg,
891
- randomize_seed,
892
- seed,
893
- s_churn,
894
- s_noise,
895
- color_fix_type,
896
- diff_dtype,
897
- ae_dtype,
898
- gamma_correction,
899
- linear_CFG,
900
- linear_s_stage2,
901
- spt_linear_CFG,
902
- spt_linear_s_stage2,
903
- model_select,
904
- output_format,
905
- allocation
906
- ],
907
- outputs = [
908
- result_slider,
909
- restore_information,
910
- reset_btn
911
- ],
912
- cache_examples = "lazy",
913
  )
 
 
914
 
915
- with gr.Row():
916
- gr.Markdown(claim_md)
917
 
918
- input_image.upload(fn = check_upload, inputs = [
919
- input_image
920
- ], outputs = [
921
- rotation
922
- ], queue = False, show_progress = False)
923
-
924
- denoise_button.click(fn = check_and_update, inputs = [
925
- input_image
926
- ], outputs = [warning], queue = False, show_progress = False).success(fn = stage1_process, inputs = [
927
- input_image,
928
- gamma_correction,
929
- diff_dtype,
930
- ae_dtype
931
- ], outputs=[
932
- denoise_image,
933
- denoise_information
934
- ])
935
-
936
- diffusion_button.click(fn = update_seed, inputs = [
937
- randomize_seed,
938
- seed
939
- ], outputs = [
940
- seed
941
- ], queue = False, show_progress = False).then(fn = check_and_update, inputs = [
942
- input_image
943
- ], outputs = [warning], queue = False, show_progress = False).success(fn=stage2_process, inputs = [
944
- input_image,
945
- rotation,
946
- denoise_image,
947
- prompt,
948
- a_prompt,
949
- n_prompt,
950
- num_samples,
951
- min_size,
952
- downscale,
953
- upscale,
954
- edm_steps,
955
- s_stage1,
956
- s_stage2,
957
- s_cfg,
958
- randomize_seed,
959
- seed,
960
- s_churn,
961
- s_noise,
962
- color_fix_type,
963
- diff_dtype,
964
- ae_dtype,
965
- gamma_correction,
966
- linear_CFG,
967
- linear_s_stage2,
968
- spt_linear_CFG,
969
- spt_linear_s_stage2,
970
- model_select,
971
- output_format,
972
- allocation
973
- ], outputs = [
974
- result_slider,
975
- result_gallery,
976
- restore_information,
977
- reset_btn
978
- ]).success(fn = log_information, inputs = [
979
- result_gallery
980
- ], outputs = [], queue = False, show_progress = False)
981
-
982
- result_gallery.change(on_select_result, [result_slider, result_gallery], result_slider)
983
- result_gallery.select(on_select_result, [result_slider, result_gallery], result_slider)
984
-
985
- restart_button.click(fn = load_and_reset, inputs = [
986
- param_setting
987
- ], outputs = [
988
- edm_steps,
989
- s_cfg,
990
- s_stage2,
991
- s_stage1,
992
- s_churn,
993
- s_noise,
994
- a_prompt,
995
- n_prompt,
996
- color_fix_type,
997
- linear_CFG,
998
- linear_s_stage2,
999
- spt_linear_CFG,
1000
- spt_linear_s_stage2,
1001
- model_select
1002
- ])
1003
-
1004
- reset_btn.click(fn = reset, inputs = [], outputs = [
1005
- input_image,
1006
- rotation,
1007
- denoise_image,
1008
- prompt,
1009
- a_prompt,
1010
- n_prompt,
1011
- num_samples,
1012
- min_size,
1013
- downscale,
1014
- upscale,
1015
- edm_steps,
1016
- s_stage1,
1017
- s_stage2,
1018
- s_cfg,
1019
- randomize_seed,
1020
- seed,
1021
- s_churn,
1022
- s_noise,
1023
- color_fix_type,
1024
- diff_dtype,
1025
- ae_dtype,
1026
- gamma_correction,
1027
- linear_CFG,
1028
- linear_s_stage2,
1029
- spt_linear_CFG,
1030
- spt_linear_s_stage2,
1031
- model_select,
1032
- output_format,
1033
- allocation
1034
- ], queue = False, show_progress = False)
1035
-
1036
- interface.queue(10).launch()
 
1
+ from diffusers_helper.hf_login import login
2
+
3
  import os
4
+
5
+ os.environ['HF_HOME'] = os.path.abspath(os.path.realpath(os.path.join(os.path.dirname(__file__), './hf_download')))
6
+
7
  import gradio as gr
 
 
8
  import torch
9
+ import traceback
10
  import einops
11
+ import safetensors.torch as sf
12
+ import numpy as np
13
  import math
 
14
  import random
15
  import spaces
 
 
16
 
 
17
  from PIL import Image
18
+ from diffusers import AutoencoderKLHunyuanVideo
19
+ from transformers import LlamaModel, CLIPTextModel, LlamaTokenizerFast, CLIPTokenizer
20
+ from diffusers_helper.hunyuan import encode_prompt_conds, vae_decode, vae_encode, vae_decode_fake
21
+ from diffusers_helper.utils import save_bcthw_as_mp4, crop_or_pad_yield_mask, soft_append_bcthw, resize_and_center_crop, state_dict_weighted_merge, state_dict_offset_merge, generate_timestamp
22
+ from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked
23
+ from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan
24
+ from diffusers_helper.memory import cpu, gpu, get_cuda_free_memory_gb, move_model_to_device_with_memory_preservation, offload_model_from_device_for_memory_preservation, fake_diffusers_current_device, DynamicSwapInstaller, unload_complete_models, load_model_as_complete
25
+ from diffusers_helper.thread_utils import AsyncStream, async_run
26
+ from diffusers_helper.gradio.progress_bar import make_progress_bar_css, make_progress_bar_html
27
+ from transformers import SiglipImageProcessor, SiglipVisionModel
28
+ from diffusers_helper.clip_vision import hf_clip_vision_encode
29
+ from diffusers_helper.bucket_tools import find_nearest_bucket
30
+
31
+
32
+ free_mem_gb = get_cuda_free_memory_gb(gpu)
33
+ high_vram = free_mem_gb > 80
34
+
35
+ print(f'Free VRAM {free_mem_gb} GB')
36
+ print(f'High-VRAM Mode: {high_vram}')
37
+
38
+ text_encoder = LlamaModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder', torch_dtype=torch.float16).cpu()
39
+ text_encoder_2 = CLIPTextModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder_2', torch_dtype=torch.float16).cpu()
40
+ tokenizer = LlamaTokenizerFast.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer')
41
+ tokenizer_2 = CLIPTokenizer.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer_2')
42
+ vae = AutoencoderKLHunyuanVideo.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='vae', torch_dtype=torch.float16).cpu()
43
+
44
+ feature_extractor = SiglipImageProcessor.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='feature_extractor')
45
+ image_encoder = SiglipVisionModel.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='image_encoder', torch_dtype=torch.float16).cpu()
46
+
47
+ transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained('lllyasviel/FramePack_F1_I2V_HY_20250503', torch_dtype=torch.bfloat16).cpu()
48
+
49
+ vae.eval()
50
+ text_encoder.eval()
51
+ text_encoder_2.eval()
52
+ image_encoder.eval()
53
+ transformer.eval()
54
+
55
+ if not high_vram:
56
+ vae.enable_slicing()
57
+ vae.enable_tiling()
58
+
59
+ transformer.high_quality_fp32_output_for_inference = True
60
+ print('transformer.high_quality_fp32_output_for_inference = True')
61
+
62
+ transformer.to(dtype=torch.bfloat16)
63
+ vae.to(dtype=torch.float16)
64
+ image_encoder.to(dtype=torch.float16)
65
+ text_encoder.to(dtype=torch.float16)
66
+ text_encoder_2.to(dtype=torch.float16)
67
+
68
+ vae.requires_grad_(False)
69
+ text_encoder.requires_grad_(False)
70
+ text_encoder_2.requires_grad_(False)
71
+ image_encoder.requires_grad_(False)
72
+ transformer.requires_grad_(False)
73
+
74
+ if not high_vram:
75
+ # DynamicSwapInstaller is the same as huggingface's enable_sequential_cpu_offload but about 3x faster
76
+ DynamicSwapInstaller.install_model(transformer, device=gpu)
77
+ DynamicSwapInstaller.install_model(text_encoder, device=gpu)
78
+ else:
79
+ text_encoder.to(gpu)
80
+ text_encoder_2.to(gpu)
81
+ image_encoder.to(gpu)
82
+ vae.to(gpu)
83
+ transformer.to(gpu)
84
+
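For comparison with the comment above, the standard Hugging Face route would be sequential CPU offload. A minimal sketch, assuming an accelerate installation (the repo's DynamicSwapInstaller streams weights in and out more aggressively, which is where the claimed speedup comes from):

    from accelerate import cpu_offload

    def place_models(models, device, high_vram):
        # High-VRAM machines keep everything resident on the GPU; otherwise weights
        # stay on CPU and are streamed to the GPU module by module at call time.
        for model in models:
            if high_vram:
                model.to(device)
            else:
                cpu_offload(model, execution_device=device)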
85
+ stream = AsyncStream()
86
+
87
+ outputs_folder = './outputs/'
88
+ os.makedirs(outputs_folder, exist_ok=True)
89
+
90
+ input_image_debug_value = prompt_debug_value = total_second_length_debug_value = None
91
+
92
+ @torch.no_grad()
93
+ def worker(input_image, prompts, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf):
94
+ def encode_prompt(prompt, n_prompt):
95
+ llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
96
+
97
+ if cfg == 1:
98
+ llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler)
99
  else:
100
+ llama_vec_n, clip_l_pooler_n = encode_prompt_conds(n_prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
 
101
 
102
+ llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
103
+ llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)
104
 
105
+ llama_vec = llama_vec.to(transformer.dtype)
106
+ llama_vec_n = llama_vec_n.to(transformer.dtype)
107
+ clip_l_pooler = clip_l_pooler.to(transformer.dtype)
108
+ clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype)
109
+ return [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n]
110
 
111
+ total_latent_sections = (total_second_length * 30) / (latent_window_size * 4)
112
+ total_latent_sections = int(max(round(total_latent_sections), 1))
113
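As a worked example of the section count above (assuming the UI defaults of a 5-second request and latent_window_size = 9 at 30 fps):

    total_second_length = 5
    latent_window_size = 9
    sections = (total_second_length * 30) / (latent_window_size * 4)   # 150 / 36 ≈ 4.17
    total_latent_sections = int(max(round(sections), 1))               # -> 4 sampling sections
    frames_per_section = latent_window_size * 4 - 3                    # -> 33 pixel frames each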
 
114
+ job_id = generate_timestamp()
 
 
 
 
115
 
116
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
117
 
118
+ try:
119
+ # Clean GPU
120
+ if not high_vram:
121
+ unload_complete_models(
122
+ text_encoder, text_encoder_2, image_encoder, vae, transformer
123
+ )
 
124
 
125
+ # Text encoding
 
 
126
 
127
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Text encoding ...'))))
128
 
129
+ if not high_vram:
130
+ fake_diffusers_current_device(text_encoder, gpu) # we only encode one text, i.e. one model move and one encode, so offloading costs the same time: also one load and one encode.
131
+ load_model_as_complete(text_encoder_2, target_device=gpu)
132
 
133
+ prompt_parameters = []
134
+
135
+ for prompt_part in prompts:
136
+ prompt_parameters.append(encode_prompt(prompt_part, n_prompt))
 
 
 
137
 
138
+ # Processing input image
 
139
 
140
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Image processing ...'))))
141
+
142
+ H, W, C = input_image.shape
143
+ height, width = find_nearest_bucket(H, W, resolution=640)
144
+ input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height)
145
+
146
+ Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}.png'))
147
+
148
+ input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1
149
+ input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None]
150
+
151
+ # VAE encoding
152
+
153
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'VAE encoding ...'))))
154
+
155
+ if not high_vram:
156
+ load_model_as_complete(vae, target_device=gpu)
157
+
158
+ start_latent = vae_encode(input_image_pt, vae)
159
+
160
+ # CLIP Vision
161
+
162
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))
163
+
164
+ if not high_vram:
165
+ load_model_as_complete(image_encoder, target_device=gpu)
166
+
167
+ image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
168
+ image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
169
+
170
+ # Dtype
171
+
172
+ image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
173
+
174
+ # Sampling
175
+
176
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Start sampling ...'))))
177
+
178
+ rnd = torch.Generator("cpu").manual_seed(seed)
179
+
180
+ history_latents = torch.zeros(size=(1, 16, 16 + 2 + 1, height // 8, width // 8), dtype=torch.float32).cpu()
181
+ history_pixels = None
182
+
183
+ history_latents = torch.cat([history_latents, start_latent.to(history_latents)], dim=2)
184
+ total_generated_latent_frames = 1
185
+
186
+ for section_index in range(total_latent_sections):
187
+ if stream.input_queue.top() == 'end':
188
+ stream.output_queue.push(('end', None))
189
+ return
190
+
191
+ print(f'section_index = {section_index}, total_latent_sections = {total_latent_sections}')
192
+
193
+ if len(prompt_parameters) > 0:
194
+ [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n] = prompt_parameters.pop(0)
195
+
196
+ if not high_vram:
197
+ unload_complete_models()
198
+ move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)
199
+
200
+ if use_teacache:
201
+ transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
202
+ else:
203
+ transformer.initialize_teacache(enable_teacache=False)
204
+
205
+ def callback(d):
206
+ preview = d['denoised']
207
+ preview = vae_decode_fake(preview)
208
+
209
+ preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
210
+ preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')
211
+
212
+ if stream.input_queue.top() == 'end':
213
+ stream.output_queue.push(('end', None))
214
+ raise KeyboardInterrupt('User ends the task.')
215
+
216
+ current_step = d['i'] + 1
217
+ percentage = int(100.0 * current_step / steps)
218
+ hint = f'Sampling {current_step}/{steps}'
219
+ desc = f'Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / 30) :.2f} seconds (FPS-30). The video is being extended now ...'
220
+ stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
221
+ return
222
+
223
+ indices = torch.arange(0, sum([1, 16, 2, 1, latent_window_size])).unsqueeze(0)
224
+ clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
225
+ clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
226
+
227
+ clean_latents_4x, clean_latents_2x, clean_latents_1x = history_latents[:, :, -sum([16, 2, 1]):, :, :].split([16, 2, 1], dim=2)
228
+ clean_latents = torch.cat([start_latent.to(history_latents), clean_latents_1x], dim=2)
229
+
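To make the split above concrete, this is the resulting index layout, again assuming the default latent_window_size = 9 (an illustrative sketch, not part of the commit):

    import torch

    latent_window_size = 9
    indices = torch.arange(0, sum([1, 16, 2, 1, latent_window_size])).unsqueeze(0)  # shape (1, 29)
    start, c4x, c2x, c1x, window = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
    # start  -> [[0]]          the encoded input image
    # c4x    -> [[1 .. 16]]    coarse history context
    # c2x    -> [[17, 18]]     medium history context
    # c1x    -> [[19]]         the most recent latent frame
    # window -> [[20 .. 28]]   the 9 new latent frames being denoised this section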
230
+ generated_latents = sample_hunyuan(
231
+ transformer=transformer,
232
+ sampler='unipc',
233
+ width=width,
234
+ height=height,
235
+ frames=latent_window_size * 4 - 3,
236
+ real_guidance_scale=cfg,
237
+ distilled_guidance_scale=gs,
238
+ guidance_rescale=rs,
239
+ # shift=3.0,
240
+ num_inference_steps=steps,
241
+ generator=rnd,
242
+ prompt_embeds=llama_vec,
243
+ prompt_embeds_mask=llama_attention_mask,
244
+ prompt_poolers=clip_l_pooler,
245
+ negative_prompt_embeds=llama_vec_n,
246
+ negative_prompt_embeds_mask=llama_attention_mask_n,
247
+ negative_prompt_poolers=clip_l_pooler_n,
248
+ device=gpu,
249
+ dtype=torch.bfloat16,
250
+ image_embeddings=image_encoder_last_hidden_state,
251
+ latent_indices=latent_indices,
252
+ clean_latents=clean_latents,
253
+ clean_latent_indices=clean_latent_indices,
254
+ clean_latents_2x=clean_latents_2x,
255
+ clean_latent_2x_indices=clean_latent_2x_indices,
256
+ clean_latents_4x=clean_latents_4x,
257
+ clean_latent_4x_indices=clean_latent_4x_indices,
258
+ callback=callback,
259
+ )
260
+
261
+ total_generated_latent_frames += int(generated_latents.shape[2])
262
+ history_latents = torch.cat([history_latents, generated_latents.to(history_latents)], dim=2)
263
+
264
+ if not high_vram:
265
+ offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
266
+ load_model_as_complete(vae, target_device=gpu)
267
+
268
+ real_history_latents = history_latents[:, :, -total_generated_latent_frames:, :, :]
269
+
270
+ if history_pixels is None:
271
+ history_pixels = vae_decode(real_history_latents, vae).cpu()
272
+ else:
273
+ section_latent_frames = latent_window_size * 2
274
+ overlapped_frames = latent_window_size * 4 - 3
275
+
276
+ current_pixels = vae_decode(real_history_latents[:, :, -section_latent_frames:], vae).cpu()
277
+ history_pixels = soft_append_bcthw(history_pixels, current_pixels, overlapped_frames)
278
+
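For context on the blend above (same assumed default, latent_window_size = 9): each section re-decodes the tail of the latent history and cross-fades it into the pixels already decoded, so section boundaries do not pop:

    latent_window_size = 9
    section_latent_frames = latent_window_size * 2    # 18 latent frames re-decoded per section
    overlapped_frames = latent_window_size * 4 - 3    # 33 pixel frames blended by soft_append_bcthw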
279
+ if not high_vram:
280
+ unload_complete_models()
281
+
282
+ output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
283
+
284
+ save_bcthw_as_mp4(history_pixels, output_filename, fps=30, crf=mp4_crf)
285
+
286
+ print(f'Decoded. Current latent shape {real_history_latents.shape}; pixel shape {history_pixels.shape}')
287
+
288
+ stream.output_queue.push(('file', output_filename))
289
+ except:
290
+ traceback.print_exc()
291
+
292
+ if not high_vram:
293
+ unload_complete_models(
294
+ text_encoder, text_encoder_2, image_encoder, vae, transformer
295
+ )
296
+
297
+ stream.output_queue.push(('end', None))
298
+ return
299
+
300
+ def get_duration(input_image, prompt, t2v, n_prompt, randomize_seed, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf):
301
+ global total_second_length_debug_value
302
+
303
+ if total_second_length_debug_value is not None:
304
+ return min(total_second_length_debug_value * 60, 600)
305
+ return total_second_length * 60
306
 
 
  @spaces.GPU(duration=get_duration)
+ def process(input_image, prompt,
+ t2v=False,
+ n_prompt="",
+ randomize_seed=True,
+ seed=31337,
+ total_second_length=5,
+ latent_window_size=9,
+ steps=25,
+ cfg=1.0,
+ gs=10.0,
+ rs=0.0,
+ gpu_memory_preservation=6,
+ use_teacache=True,
+ mp4_crf=16
+ ):
+ global stream, input_image_debug_value, prompt_debug_value, total_second_length_debug_value
+
  if torch.cuda.device_count() == 0:
  gr.Warning('Set this space to GPU config to make it work.')
+ return None, None, None, None, None, None
+
+ if input_image_debug_value is not None or prompt_debug_value is not None or total_second_length_debug_value is not None:
+ print("Debug mode")
+ input_image = input_image_debug_value
+ prompt = prompt_debug_value
+ total_second_length = total_second_length_debug_value
+ input_image_debug_value = prompt_debug_value = total_second_length_debug_value = None
+
+ if randomize_seed:
+ seed = random.randint(0, np.iinfo(np.int32).max)
+
+ prompts = prompt.split(";")
+ # assert input_image is not None, 'No input image!'
+ if t2v:
+ default_height, default_width = 640, 640
+ input_image = np.ones((default_height, default_width, 3), dtype=np.uint8) * 255
+ print("No input image provided. Using a blank white image.")
+ yield None, None, '', '', gr.update(interactive=False), gr.update(interactive=True)
+
+ stream = AsyncStream()
+
+ async_run(worker, input_image, prompts, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf)
+
+ output_filename = None
+
+ while True:
+ flag, data = stream.output_queue.next()
+
+ if flag == 'file':
+ output_filename = data
+ yield output_filename, gr.update(), gr.update(), gr.update(), gr.update(interactive=False), gr.update(interactive=True)
+
+ if flag == 'progress':
+ preview, desc, html = data
+ yield gr.update(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True)
+
+ if flag == 'end':
+ yield output_filename, gr.update(visible=False), gr.update(), '', gr.update(interactive=True), gr.update(interactive=False)
+ break
+
+
+ def end_process():
+ stream.input_queue.push('end')
+
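The worker/process pair above is a producer–consumer pipeline: `async_run` starts `worker` in the background, the worker pushes `('progress', …)`, `('file', …)` and `('end', None)` events onto `stream.output_queue`, and the `process` generator drains that queue and turns each event into a Gradio UI update. A rough, self-contained sketch of the same pattern using only the standard library (the real `AsyncStream`/`async_run` helpers come from the FramePack code base and are not reproduced here):

```python
import queue
import threading

def worker_sketch(out_q: queue.Queue) -> None:
    # Producer: emit a few progress events, then a partial file, then the end marker.
    for step in range(3):
        out_q.put(("progress", f"step {step + 1}/3"))
    out_q.put(("file", "outputs/clip.mp4"))
    out_q.put(("end", None))

def process_sketch():
    # Consumer: mirrors the while-loop in process(), one yielded update per event.
    out_q: queue.Queue = queue.Queue()
    threading.Thread(target=worker_sketch, args=(out_q,), daemon=True).start()
    while True:
        flag, data = out_q.get()
        if flag == "progress":
            yield f"progress: {data}"
        elif flag == "file":
            yield f"partial video ready: {data}"
        elif flag == "end":
            yield "done"
            break

for update in process_sketch():
    print(update)
```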
+ css = make_progress_bar_css()
+ block = gr.Blocks(css=css).queue()
+ with block:
  if torch.cuda.device_count() == 0:
  with gr.Row():
  gr.HTML("""
+ <p style="background-color: red;"><big><big><big><b>⚠️To use FramePack, <a href="https://huggingface.co/spaces/Fabrice-TIERCELIN/SUPIR?duplicate=true">duplicate this space</a> and set a GPU with 30 GB VRAM.</b>
+ You can't use FramePack directly here because this space runs on a CPU, which is not enough for FramePack. Please provide <a href="https://huggingface.co/spaces/Fabrice-TIERCELIN/SUPIR/discussions/new">feedback</a> if you have issues.
  </big></big></big></p>
  """)
+ gr.Markdown('# FramePack-F1')
+ gr.Markdown(f"""### Video diffusion, but feels like image diffusion
+ *FramePack F1 - a FramePack model that only predicts future frames from history frames*
+ ### *beta* FramePack Fill 🖋️ - draw a mask over the input image to inpaint the video output
+ adapted from the official code repo [FramePack](https://github.com/lllyasviel/FramePack) by [lllyasviel](lllyasviel/FramePack_F1_I2V_HY_20250503) and [FramePack Studio](https://github.com/colinurbs/FramePack-Studio) 🙌🏻
+ """)
+ with gr.Row():
+ with gr.Column():
+ input_image = gr.Image(sources='upload', type="numpy", label="Image", height=320)
+ prompt = gr.Textbox(label="Prompt", value='')
+ t2v = gr.Checkbox(label="do text-to-video", value=False)
+
+ with gr.Row():
+ start_button = gr.Button(value="Start Generation", variant="primary")
+ end_button = gr.Button(value="End Generation", variant="stop", interactive=False)
+
+ total_second_length = gr.Slider(label="Generated Video Length (Seconds)", minimum=1, maximum=5, value=2, step=0.1)
+ with gr.Accordion("Advanced settings", open=False):
+ use_teacache = gr.Checkbox(label='Use TeaCache', value=True, info='Faster speed, but often makes hands and fingers slightly worse.')
+
+ n_prompt = gr.Textbox(label="Negative Prompt", value="Missing arm, unrealistic position, blurred, blurry") # Not used
+ randomize_seed = gr.Checkbox(label='Randomize seed', value=True, info='If checked, the seed is always different')
+ seed = gr.Slider(label="Seed", minimum=0, maximum=np.iinfo(np.int32).max, step=1, randomize=True)
+
+ latent_window_size = gr.Slider(label="Latent Window Size", minimum=1, maximum=33, value=9, step=1) # Should not change
+ steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=25, step=1, info='Changing this value is not recommended.')
+
+ cfg = gr.Slider(label="CFG Scale", minimum=1.0, maximum=32.0, value=1.0, step=0.01) # Should not change
+ gs = gr.Slider(label="Distilled CFG Scale", minimum=1.0, maximum=32.0, value=10.0, step=0.01, info='Changing this value is not recommended; around 3 the motion becomes blurred and unsharp, around 10 the motion stays focused.')
+ rs = gr.Slider(label="CFG Re-Scale", minimum=0.0, maximum=1.0, value=0.0, step=0.01) # Should not change
+
+ gpu_memory_preservation = gr.Slider(label="GPU Inference Preserved Memory (GB) (larger means slower)", minimum=6, maximum=128, value=6, step=0.1, info="Set this number to a larger value if you encounter OOM. A larger value causes slower speed.")
+
+ mp4_crf = gr.Slider(label="MP4 Compression", minimum=0, maximum=100, value=16, step=1, info="Lower means better quality. 0 is uncompressed. Change to 16 if you get black outputs.")
+
+ with gr.Accordion("Debug", open=False):
+ input_image_debug = gr.Image(type="numpy", label="Image Debug", height=320)
+ prompt_debug = gr.Textbox(label="Prompt Debug", value='')
+ total_second_length_debug = gr.Slider(label="Additional Video Length to Generate (Seconds) Debug", minimum=1, maximum=120, value=5, step=0.1)
+
+ with gr.Column():
+ preview_image = gr.Image(label="Next Latents", height=200, visible=False)
+ result_video = gr.Video(label="Finished Frames", autoplay=True, show_share_button=False, height=512, loop=True)
+ progress_desc = gr.Markdown('', elem_classes='no-generating-animation')
+ progress_bar = gr.HTML('', elem_classes='no-generating-animation')
+
+ gr.HTML('<div style="text-align:center; margin-top:20px;">Share your results and find ideas at the <a href="https://x.com/search?q=framepack&f=live" target="_blank">FramePack Twitter (X) thread</a></div>')
+
+ ips = [input_image, prompt, t2v, n_prompt, randomize_seed, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf]
+ start_button.click(fn=process, inputs=ips, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button])
+ end_button.click(fn=end_process)
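Each `yield` in `process` is a six-element tuple that lines up positionally with the `outputs` list wired to `start_button.click`: result video, latent preview, progress text, progress HTML, and the interactivity of the Start/End buttons. A stripped-down sketch of that wiring with hypothetical component names (not the ones used above):

```python
import time
import gradio as gr

def fake_process(prompt):
    # Each yield matches the outputs list below, position by position.
    for i in range(3):
        time.sleep(0.1)
        yield gr.update(), f"working on {prompt!r} ({i + 1}/3)", gr.update(interactive=False), gr.update(interactive=True)
    yield "final result", "done", gr.update(interactive=True), gr.update(interactive=False)

with gr.Blocks() as demo:
    prompt = gr.Textbox(label="Prompt")
    result = gr.Textbox(label="Result")
    status = gr.Markdown()
    start = gr.Button("Start")
    stop = gr.Button("Stop", interactive=False)
    # The generator streams one UI update per yield while it runs.
    start.click(fn=fake_process, inputs=[prompt], outputs=[result, status, start, stop])

if __name__ == "__main__":
    demo.launch()
```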
  with gr.Row(visible=False):
  gr.Examples(
  examples = [
  [
+ "./img_examples/Example1.png", # input_image
+ "View of the sea as far as the eye can see, from the seaside, a piece of land is barely visible on the horizon at the middle, the sky is radiant, reflections of the sun in the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
+ False, # t2v
+ "Missing arm, unrealistic position, blurred, blurry", # n_prompt
+ True, # randomize_seed
+ 42, # seed
+ 1, # total_second_length
+ 9, # latent_window_size
+ 25, # steps
+ 1.0, # cfg
+ 10.0, # gs
+ 0.0, # rs
+ 6, # gpu_memory_preservation
+ True, # use_teacache
+ 16 # mp4_crf
  ],
  [
+ "./img_examples/Example1.png", # input_image
+ "A dolphin emerges from the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
+ False, # t2v
+ "Missing arm, unrealistic position, blurred, blurry", # n_prompt
+ True, # randomize_seed
+ 42, # seed
+ 1, # total_second_length
+ 9, # latent_window_size
+ 25, # steps
+ 1.0, # cfg
+ 10.0, # gs
+ 0.0, # rs
+ 6, # gpu_memory_preservation
+ True, # use_teacache
+ 16 # mp4_crf
  ],
  [
+ "./img_examples/Example1.png", # input_image
+ "We are sinking, photorealistic, realistic, intricate details, 8k, insanely detailed",
+ False, # t2v
+ "Missing arm, unrealistic position, blurred, blurry", # n_prompt
+ True, # randomize_seed
+ 42, # seed
+ 1, # total_second_length
+ 9, # latent_window_size
+ 25, # steps
+ 1.0, # cfg
+ 10.0, # gs
+ 0.0, # rs
+ 6, # gpu_memory_preservation
+ True, # use_teacache
+ 16 # mp4_crf
  ],
  [
+ "./img_examples/Example1.png", # input_image
+ "A boat is passing, photorealistic, realistic, intricate details, 8k, insanely detailed",
+ False, # t2v
+ "Missing arm, unrealistic position, blurred, blurry", # n_prompt
+ True, # randomize_seed
+ 42, # seed
+ 1, # total_second_length
+ 9, # latent_window_size
+ 25, # steps
+ 1.0, # cfg
+ 10.0, # gs
+ 0.0, # rs
+ 6, # gpu_memory_preservation
+ True, # use_teacache
+ 16 # mp4_crf
  ],
  ],
  run_on_click = True,
+ fn = process,
+ inputs = ips,
+ outputs = [result_video, preview_image, progress_desc, progress_bar, start_button, end_button],
+ cache_examples = True,
  )
+ gr.Markdown('## Guide')
+ gr.Markdown("Using the Text-to-Video feature is discouraged: generate an image with Flux first and then use Image-to-Video instead; you will save time.")
+ def handle_field_debug_change(input_image_debug_data, prompt_debug_data, total_second_length_debug_data):
+ global input_image_debug_value, prompt_debug_value, total_second_length_debug_value
+ input_image_debug_value = input_image_debug_data
+ prompt_debug_value = prompt_debug_data
+ total_second_length_debug_value = total_second_length_debug_data
+ return []
+
+ input_image_debug.upload(
+ fn=handle_field_debug_change,
+ inputs=[input_image_debug, prompt_debug, total_second_length_debug],
+ outputs=[]
+ )
+
+ prompt_debug.change(
+ fn=handle_field_debug_change,
+ inputs=[input_image_debug, prompt_debug, total_second_length_debug],
+ outputs=[]
+ )
+
+ total_second_length_debug.change(
+ fn=handle_field_debug_change,
+ inputs=[input_image_debug, prompt_debug, total_second_length_debug],
+ outputs=[]
+ )
+
+
+ block.launch(mcp_server=False)
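The Debug accordion works through module-level globals: `handle_field_debug_change` stores the debug image, prompt and duration whenever those fields change, and `get_duration`/`process` read the values once and then clear them. A stripped-down sketch of that override pattern with hypothetical names (not the actual app state):

```python
# Hypothetical, trimmed-down version of the debug-override mechanism.
_debug_prompt = None

def on_debug_prompt_change(value):
    global _debug_prompt
    _debug_prompt = value          # stored whenever the debug textbox changes

def run(prompt):
    global _debug_prompt
    if _debug_prompt is not None:
        prompt = _debug_prompt     # override the regular input once...
        _debug_prompt = None       # ...then clear it so later runs behave normally
    return f"running with prompt: {prompt!r}"

on_debug_prompt_change("debug prompt")
print(run("normal prompt"))        # -> uses the debug override
print(run("normal prompt"))        # -> override cleared, uses the real input
```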
app_endframe.py CHANGED
The diff for this file is too large to render. See raw diff
 
app_v2v.py CHANGED
@@ -545,6 +545,10 @@ def get_duration(input_video, prompt, n_prompt, seed, batch, resolution, total_s
  def process(input_video, prompt, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
  global stream, high_vram, input_video_debug_value, prompt_debug_value, total_second_length_debug_value

+ if torch.cuda.device_count() == 0:
+ gr.Warning('Set this space to GPU config to make it work.')
+ return None, None, None, None, None, None
+
  if input_video_debug_value is not None:
  input_video = input_video_debug_value
  input_video_debug_value = None
requirements.txt CHANGED
@@ -1,41 +1,23 @@
- pydantic==2.10.6
- fastapi==0.115.8
- gradio_imageslider==0.0.20
- gradio_client==1.7.0
- numpy==1.26.4
- requests==2.32.3
- sentencepiece==0.2.0
- tokenizers==0.19.1
- torchvision==0.22.0
- uvicorn==0.30.1
- wandb==0.17.4
- httpx==0.27.0
- transformers==4.42.4
- accelerate==0.32.1
- scikit-learn==1.5.1
- einops==0.8.0
- einops-exts==0.0.4
- timm==1.0.7
- openai-clip==1.0.1
- fsspec==2024.6.1
- kornia==0.7.3
- matplotlib==3.9.1
- ninja==1.11.1.1
- omegaconf==2.3.0
- opencv-python==4.10.0.84
- pandas==2.2.2
- pillow==10.4.0
- pytorch-lightning==2.3.3
- PyYAML==6.0.1
- scipy==1.14.0
- tqdm==4.66.4
- triton==3.3.0
- urllib3==2.2.2
- webdataset==0.2.86
- xformers==0.0.30
- facexlib==0.3.0
- k-diffusion==0.1.1.post1
+ accelerate==1.6.0
  diffusers==0.33.1
- pillow-heif==0.22.0
-
- open-clip-torch==2.24.0
+ transformers==4.46.2
+ sentencepiece==0.2.0
+ pillow==11.1.0
+ av==12.1.0
+ numpy==1.26.2
+ scipy==1.12.0
+ requests==2.31.0
+ torchsde==0.2.6
+ torch>=2.0.0
+ torchvision
+ torchaudio
+ einops
+ opencv-contrib-python
+ safetensors
+ huggingface_hub
+ spaces
+ decord
+ imageio_ffmpeg
+ sageattention
+ xformers
+ bitsandbytes