more info + correct model and dataset links
README.md CHANGED

@@ -11,9 +11,11 @@ license: agpl-3.0
 library_name: diffusers
 pipeline_tag: text-to-video
 datasets:
-- TempoFunk/tempofunk-
+- TempoFunk/tempofunk-sdance
+- TempoFunk/tempofunk-m
 models:
 - TempoFunk/makeavid-sd-jax
+- runwayml/stable-diffusion-v1-5
 tags:
 - jax-diffusers-event
 ---
app.py CHANGED

@@ -121,15 +121,31 @@ with gr.Blocks(title = 'Make-A-Video Stable Diffusion JAX', analytics_enabled =
         with gr.Column():
             intro1 = gr.Markdown("""
             # Make-A-Video Stable Diffusion JAX
+
+            We have extended a pretrained LDM inpainting image generation model with temporal convolutions and attention.
+            We take advantage of the extra 5 input channels of the inpainting model to guide the video generation with a hint image and mask.
+            The hint image can be given by the user; otherwise it is generated by a generative image model.
+
+            The temporal convolution and attention layers are a port of [Make-A-Video PyTorch](https://github.com/lucidrains/make-a-video-pytorch/blob/main/make_a_video_pytorch)
+            to FLAX. The convolution is a pseudo-3D convolution that separately convolves across the spatial dimensions in 2D and over the temporal dimension in 1D.
+            The attention is purely self-attention and likewise attends separately over space and time.
+
+            Only the new temporal layers have been fine-tuned on a dataset of videos themed around dance.
+            The model has been trained for 60 epochs on a dataset of 10,000 videos with 120 frames each, randomly selecting a 24-frame range from each sample.
+
+            See model and dataset links in the metadata.
+
+            Model implementation and training code can be found at [https://github.com/lopho/makeavid-sd-tpu](https://github.com/lopho/makeavid-sd-tpu)
+            """)
+        with gr.Column():
+            intro3 = gr.Markdown("""
             **Please be patient. The model might have to compile with current parameters.**
 
             This can take up to 5 minutes on the first run, and 2-3 minutes on later runs.
             The compilation will be cached and consecutive runs with the same parameters
             will be much faster.
-
-            """)
-            intro2 = gr.Markdown("""
-            The following parameters require the model to compile
+
+            Changes to the following parameters require the model to compile
             - Number of frames
             - Width & Height
             - Steps
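The intro text added above says the five extra input channels of the inpainting UNet are reused to guide video generation with a hint image and a mask. As a rough sketch of that idea (not the repository's code; the function, array names, shapes, and channel order here are assumptions), the per-frame UNet input could be assembled like this:

```python
import jax.numpy as jnp

def assemble_unet_input(noise_latents, hint_latents, mask):
    """Illustrative only: build a 9-channel input for an SD-style inpainting UNet.

    noise_latents: (frames, 4, h, w)  latents being denoised, one set per video frame
    hint_latents:  (4, h, w)          VAE-encoded hint image, shared across frames
    mask:          (1, h, w)          guidance mask (the "extra" 4 + 1 = 5 channels)
    """
    frames = noise_latents.shape[0]
    # Broadcast the single hint image and mask over the temporal axis.
    hint = jnp.broadcast_to(hint_latents, (frames,) + hint_latents.shape)
    m = jnp.broadcast_to(mask, (frames,) + mask.shape)
    # 4 noise channels + 4 hint channels + 1 mask channel = 9 input channels.
    # The channel order is a convention of this sketch only.
    return jnp.concatenate([noise_latents, hint, m], axis=1)
```

The same intro describes the new temporal layers as a pseudo-3D convolution (2D over space, then 1D over time) and factorized self-attention (per-frame spatial attention, then per-pixel temporal attention). A minimal Flax sketch of that factorization, with module names, tensor layout, and hyperparameters chosen only for illustration, not taken from the actual model:

```python
import flax.linen as nn

class PseudoConv3d(nn.Module):
    """Pseudo-3D convolution: a 2D spatial conv followed by a 1D temporal conv."""
    features: int

    @nn.compact
    def __call__(self, x):
        # x: (batch, frames, height, width, channels)
        b, f, h, w, c = x.shape
        # Convolve each frame spatially by folding time into the batch axis.
        x = x.reshape(b * f, h, w, c)
        x = nn.Conv(self.features, kernel_size=(3, 3), padding='SAME')(x)
        x = x.reshape(b, f, h, w, self.features)
        # Convolve over time at each spatial position by folding space into the batch axis.
        x = x.transpose(0, 2, 3, 1, 4).reshape(b * h * w, f, self.features)
        x = nn.Conv(self.features, kernel_size=(3,), padding='SAME')(x)
        return x.reshape(b, h, w, f, self.features).transpose(0, 3, 1, 2, 4)

class FactorizedSelfAttention(nn.Module):
    """Self-attention applied separately over space (per frame) and time (per pixel)."""
    num_heads: int = 8  # channels must be divisible by num_heads

    @nn.compact
    def __call__(self, x):
        # x: (batch, frames, height, width, channels)
        b, f, h, w, c = x.shape
        # Spatial self-attention: each frame attends over its own h * w tokens.
        s = nn.SelfAttention(num_heads=self.num_heads)(x.reshape(b * f, h * w, c))
        x = s.reshape(b, f, h, w, c)
        # Temporal self-attention: each spatial position attends over the f frames.
        t = x.transpose(0, 2, 3, 1, 4).reshape(b * h * w, f, c)
        t = nn.SelfAttention(num_heads=self.num_heads)(t)
        return t.reshape(b, h, w, f, c).transpose(0, 3, 1, 2, 4)
```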
@@ -153,7 +169,7 @@ with gr.Blocks(title = 'Make-A-Video Stable Diffusion JAX', analytics_enabled =
             )
             inference_steps_input = gr.Slider(
                     label = 'Steps',
-                    minimum =
+                    minimum = 2,
                     maximum = 100,
                     value = 20,
                     step = 1
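The steps slider above is one of the parameters that, per the intro, forces the model to recompile. A minimal sketch of why, using plain `jax.jit`; the function, shapes, and step counts below are invented for illustration and the Space's real pipeline is much larger:

```python
from functools import partial
import jax
import jax.numpy as jnp

@partial(jax.jit, static_argnames=('num_steps',))
def fake_denoise_loop(latents, num_steps):
    # latents: (frames, height // 8, width // 8, 4); these shapes are baked into
    # the compiled program, so a new resolution or frame count means a new compile.
    for _ in range(num_steps):  # unrolled at trace time because num_steps is static
        latents = latents * 0.99
    return latents

x = jnp.zeros((24, 64, 64, 4))
fake_denoise_loop(x, num_steps=20)                           # first call: trace + XLA compile (slow)
fake_denoise_loop(x, num_steps=20)                           # same shapes and steps: cached, fast
fake_denoise_loop(x, num_steps=30)                           # new step count: recompiles
fake_denoise_loop(jnp.zeros((36, 64, 64, 4)), num_steps=20)  # new frame count: recompiles
```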
@@ -222,6 +238,7 @@ with gr.Blocks(title = 'Make-A-Video Stable Diffusion JAX', analytics_enabled =
     height_input.change(fn = trigger_check_fun, inputs = trigger_inputs, outputs = will_trigger)
     width_input.change(fn = trigger_check_fun, inputs = trigger_inputs, outputs = will_trigger)
     num_frames_input.change(fn = trigger_check_fun, inputs = trigger_inputs, outputs = will_trigger)
+    image_input.change(fn = trigger_check_fun, inputs = trigger_inputs, outputs = will_trigger)
     inference_steps_input.change(fn = trigger_check_fun, inputs = trigger_inputs, outputs = will_trigger)
     will_trigger.value = trigger_check_fun(image_input.value, inference_steps_input.value, height_input.value, width_input.value, num_frames_input.value)
     ev = submit_button.click(
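The hunk above adds the hint image to the set of inputs that refresh the "will trigger compilation" indicator. The body of `trigger_check_fun` is not part of this diff; a hypothetical sketch of what such a check might do, with names and default values invented for illustration:

```python
# Hypothetical: compare the current UI values against the parameters the model was
# last compiled with, and report whether the next run will recompile.
_last_compiled = {'steps': 20, 'height': 512, 'width': 512, 'frames': 24, 'has_hint': False}

def check_will_trigger(image, steps, height, width, num_frames) -> str:
    current = {
        'steps': steps,
        'height': height,
        'width': width,
        'frames': num_frames,
        'has_hint': image is not None,
    }
    return 'will compile (slow first run)' if current != _last_compiled else 'no compilation needed'
```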
@@ -254,6 +271,6 @@ with gr.Blocks(title = 'Make-A-Video Stable Diffusion JAX', analytics_enabled =
     )
     cancel_button.click(fn = lambda: None, cancels = ev)
 
-demo.queue(concurrency_count = 1, max_size =
+demo.queue(concurrency_count = 1, max_size = 32)
 demo.launch()
 
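The final hunk fills in the queue size. A minimal sketch of the same Gradio 3.x queue configuration around a placeholder function (the real app wires the full text-to-video pipeline instead), illustrating the design choice: one worker, since a single compiled model occupies the accelerator, and a bounded queue of waiting requests:

```python
import time
import gradio as gr

def slow_generate(prompt):
    time.sleep(5)  # stand-in for a long TPU generation
    return f'done: {prompt}'

with gr.Blocks() as demo:
    prompt = gr.Textbox(label = 'Prompt')
    result = gr.Textbox(label = 'Result')
    gr.Button('Generate').click(fn = slow_generate, inputs = prompt, outputs = result)

# concurrency_count = 1: only one generation runs at a time;
# max_size = 32: at most 32 requests wait in the queue, new ones are rejected while it is full.
demo.queue(concurrency_count = 1, max_size = 32)
demo.launch()
```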