DeFactOfficial committed
Commit 94c9a29 · verified · 1 Parent(s): 7603095

Change settings UI

Files changed (1)
  1. app.py +46 -22
app.py CHANGED
@@ -17,20 +17,20 @@ from utils.utils import instantiate_from_config
  from scheduler.t2v_turbo_scheduler import T2VTurboScheduler
  from pipeline.t2v_turbo_vc2_pipeline import T2VTurboVC2Pipeline
 
- DESCRIPTION = """# T2V-Turbo 🚀
-
- Our model is distilled from [VideoCrafter2](https://ailab-cvc.github.io/videocrafter2/).
-
- T2V-Turbo learns a LoRA on top of the base model by aligning to the reward feedback from [HPSv2.1](https://github.com/tgxs002/HPSv2/tree/master) and [InternVid2 Stage 2 Model](https://huggingface.co/OpenGVLab/InternVideo2-Stage2_1B-224p-f4).
-
- T2V-Turbo-v2 optimizes the training techniques by finetuning the full base model and further aligns to [CLIPScore](https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K).
-
- T2V-Turbo trains on pure WebVid-10M data, whereas T2V-Turbo-v2 carefully optimizes different learning objectives with a mixture of VidGen-1M and WebVid-10M data.
-
- Moreover, T2V-Turbo-v2 supports distilling motion priors from the training videos.
-
- [Project page for T2V-Turbo](https://t2v-turbo.github.io) 🥳
 
  [Project page for T2V-Turbo-v2](https://t2v-turbo-v2.github.io) 🤓
  """
  if torch.cuda.is_available():
@@ -70,20 +70,20 @@ example_txt = [
      "A musician strums his guitar, serenading the moonlit night.",
  ]
 
- examples = [[i, 7.5, 0.5, 16, 16, 0, True, "bf16"] for i in example_txt]
 
  @spaces.GPU(duration=120)
  @torch.inference_mode()
  def generate(
      prompt: str,
      guidance_scale: float = 7.5,
      percentage: float = 0.5,
      num_inference_steps: int = 4,
      num_frames: int = 16,
      seed: int = 0,
      randomize_seed: bool = False,
      param_dtype="bf16",
-     motion_gs: float = 0.05,
      fps: int = 8,
  ):
 
@@ -167,35 +167,50 @@ if __name__ == "__main__":
  demo = gr.Interface(
      fn=generate,
      inputs=[
-         Textbox(label="", placeholder="Please enter your prompt. \n"),
          gr.Slider(
-             label="Guidance scale",
-             minimum=2,
-             maximum=14,
              step=0.1,
              value=7.5,
          ),
          gr.Slider(
-             label="Percentage of steps to apply motion guidance (v2 w/ MG only)",
              minimum=0.0,
-             maximum=0.5,
              step=0.05,
              value=0.5,
          ),
          gr.Slider(
-             label="Number of inference steps",
-             minimum=4,
-             maximum=50,
              step=1,
              value=16,
          ),
          gr.Slider(
              label="Number of Video Frames",
              minimum=16,
-             maximum=48,
              step=8,
              value=16,
          ),
          gr.Slider(
              label="Seed",
              minimum=0,
@@ -210,8 +225,17 @@ if __name__ == "__main__":
              label="torch.dtype",
              value="bf16",
              interactive=True,
-             info="Dtype for inference. Default is bf16.",
-         )
      ],
      outputs=[
          gr.Video(label="Generated Video", width=512, height=320, interactive=False, autoplay=True),
 
@@ -17,20 +17,20 @@ from utils.utils import instantiate_from_config
  from scheduler.t2v_turbo_scheduler import T2VTurboScheduler
  from pipeline.t2v_turbo_vc2_pipeline import T2VTurboVC2Pipeline
 
+ DESCRIPTION = """# T2V-Turbo-v2 🚀
+ ## A fast and efficient txt2video model that doesn't suck
+
+ This space was forked from the original so that I can fix whatever is causing its API not to work with HuggingChat's tools interface.
+
+ You know, because it would be really cool to combine an LLM with a text2video model that's fast, decent quality, and open source.
+
+ I've also increased the upper bounds of some params and made other params adjustable in the UI that were previously locked. Please read the info text, because some of them are likely not worth messing with, but I like to give users the freedom to explore.
+
+ The TL;DR on this model is that it was distilled from VideoCrafter 2 and ended up beating the parent model on all of the benchmarks, even though it's smaller and MUCH faster.
+
+ Don't get TOO excited, though: the paper claims it beats Kling and Runway Gen-3 on comprehensive benchmark scores, but this ain't Gen-3, it's just not. It's a low-res, high-efficiency txt2video engine that's perfect for recreational use and integration with chatbots, but it won't be winning any Oscars.
+
+ Official Project Page with links to Papers, GitHub Code, and Leaderboard:
  [Project page for T2V-Turbo-v2](https://t2v-turbo-v2.github.io) 🤓
  """
  if torch.cuda.is_available():
 
@@ -70,20 +70,20 @@ example_txt = [
      "A musician strums his guitar, serenading the moonlit night.",
  ]
 
+ examples = [[i, 7.5, 0.5, 0.05, 16, 16, 0, True, "bf16", 8] for i in example_txt]
 
  @spaces.GPU(duration=120)
  @torch.inference_mode()
  def generate(
      prompt: str,
      guidance_scale: float = 7.5,
+     motion_gs: float = 0.05,
      percentage: float = 0.5,
      num_inference_steps: int = 4,
      num_frames: int = 16,
      seed: int = 0,
      randomize_seed: bool = False,
      param_dtype="bf16",
      fps: int = 8,
  ):
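For context on why each example row grew from 8 to 10 values: `gr.Interface` feeds a clicked example row into the `inputs` components positionally, so every row must supply one value per input, in the same order as `generate`'s parameters. A minimal sketch of that mapping (illustrative only, not part of app.py):

```python
# Illustrative sketch: how one example row lines up with generate()'s parameters,
# assuming the new signature shown in this diff.
example_row = ["A musician strums his guitar, serenading the moonlit night.",
               7.5, 0.5, 0.05, 16, 16, 0, True, "bf16", 8]

param_names = [
    "prompt", "guidance_scale", "motion_gs", "percentage",
    "num_inference_steps", "num_frames", "seed", "randomize_seed",
    "param_dtype", "fps",
]

# The row is consumed positionally, i.e. roughly generate(*example_row),
# so its order has to match both the inputs list and the signature.
kwargs = dict(zip(param_names, example_row))
print(kwargs["motion_gs"], kwargs["percentage"])  # -> 0.5 0.05 with the row above
```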
 
 
@@ -167,35 +167,50 @@ if __name__ == "__main__":
  demo = gr.Interface(
      fn=generate,
      inputs=[
+         Textbox(label="", placeholder="Please enter your prompt"),
          gr.Slider(
+             label="CFG Guidance",
+             minimum=1,
+             maximum=21,
              step=0.1,
              value=7.5,
+             info="Behaves like CFG guidance on a txt2img diffusion model... 7.5 does indeed appear to be the sweet spot, but for certain prompts you may wish to adjust"
          ),
          gr.Slider(
+             label="MGS Guidance (Don't Change This)",
+             minimum=0.0,
+             maximum=1.0,
+             step=0.01,
+             value=0.05,
+             info="No idea where they came up with the default of 0.05 or why they're so certain it's optimal, since it's not mentioned in the paper. I've therefore opened it up for experimentation, with very low expectations"
+         ),
+
+         gr.Slider(
+             label="Motion Guidance Percentage (Don't Change This)",
              minimum=0.0,
+             maximum=0.8,
              step=0.05,
              value=0.5,
+             info="The authors specifically say in their paper that it's important to apply MG to only the first N inference steps out of M total steps. But the ideal value of N/M is not mentioned, so it may be worth playing with"
          ),
+
          gr.Slider(
+             label="Inference Steps",
+             minimum=2,
+             maximum=200,
              step=1,
              value=16,
+             info="This is an interesting one, because increasing step count is the equivalent of techniques like CoT that we use to increase test-time compute in LLMs. In general, more steps = lower loss (higher quality). But the relationship is asymptotic and returns quickly diminish... Opened this up in case it's needed for certain use cases; otherwise leave @ 16"
          ),
          gr.Slider(
              label="Number of Video Frames",
              minimum=16,
+             maximum=96,
              step=8,
              value=16,
+             info="Generated video length = number of frames / FPS. The benchmark evals involved 16 frames, to my knowledge. It is unclear how high you can go before consistency falls apart... but it would be lovely to get 96 frames at 24 fps of high-quality video. Probably won't happen, but just in case, feel free to try"
          ),
+
          gr.Slider(
              label="Seed",
              minimum=0,
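One note on how the Frames and FPS settings interact: the info text above gives the relationship video length = number of frames / FPS, so the new upper bounds translate directly into clip durations. A quick arithmetic sketch (illustrative only, not part of app.py):

```python
# Illustrative arithmetic: clip duration implied by the new slider bounds.
def clip_duration_seconds(num_frames: int, fps: int) -> float:
    return num_frames / fps

print(clip_duration_seconds(16, 8))   # 2.0 s  -> the defaults
print(clip_duration_seconds(96, 24))  # 4.0 s  -> max frames at max FPS
print(clip_duration_seconds(96, 8))   # 12.0 s -> max frames at min FPS
```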
 
@@ -210,8 +225,17 @@ if __name__ == "__main__":
              label="torch.dtype",
              value="bf16",
              interactive=True,
+             info="bf16 is fast and high quality. End users should not change this setting",
+         ),
+         gr.Slider(
+             label="Desired Output FPS",
+             minimum=8,
+             maximum=24,
+             step=8,
+             value=8,
+             info="Higher = smoother, lower = longer video; purely a matter of preference"
+         ),
+
      ],
      outputs=[
          gr.Video(label="Generated Video", width=512, height=320, interactive=False, autoplay=True),
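Since the new DESCRIPTION says the point of the fork is to get this Space's API working with HuggingChat's tools interface, here is a hedged sketch of what such a call would typically look like from Python using `gradio_client`. The Space id and the `/predict` endpoint name are assumptions (the latter is the default for a `gr.Interface`), not something this diff confirms, and the positional arguments follow the order of `generate`'s parameters shown earlier:

```python
# Hypothetical client-side call; the Space id and api_name are assumptions,
# not taken from this commit. Requires: pip install gradio_client
from gradio_client import Client

client = Client("DeFactOfficial/t2v-turbo-v2")  # assumed Space id

result = client.predict(
    "A musician strums his guitar, serenading the moonlit night.",  # prompt
    7.5,     # CFG Guidance
    0.05,    # MGS Guidance
    0.5,     # Motion Guidance Percentage
    16,      # Inference Steps
    16,      # Number of Video Frames
    0,       # Seed
    True,    # Randomize seed
    "bf16",  # torch.dtype
    8,       # Desired Output FPS
    api_name="/predict",  # default endpoint name for a gr.Interface
)
print(result)  # path to the generated video file on the client machine
```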