liuhuadai commited on
Commit
2056f15
·
verified ·
1 Parent(s): 0690e84

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -1
app.py CHANGED
@@ -329,7 +329,33 @@ def synthesize_video_with_audio(video_file, caption):
329
 
330
  # Gradio界面
331
  with gr.Blocks() as demo:
332
- gr.Markdown("# ThinkSound\nupload video and caption(optional), and get video with audio!")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
333
  with gr.Row():
334
  video_input = gr.Video(label="upload video")
335
  caption_input = gr.Textbox(label="caption(optional)", placeholder="can be empty", lines=1)
 
329
 
330
  # Gradio界面
331
  with gr.Blocks() as demo:
332
+ gr.Markdown(
333
+ """
334
+ # ThinkSound\n
335
+ ThinkSound is a unified Any2Audio generation framework with flow matching guided by Chain-of-Thought (CoT) reasoning.
336
+
337
+ Upload video and caption (optional), and get video with audio!
338
+
339
+ [Project page is here](https://huggingface.co/spaces/FunAudioLLM/ThinkSound)
340
+ [Model weights are here](https://huggingface.co/liuhuadai/ThinkSound)
341
+
342
+ ## Citation
343
+
344
+ If you find our work useful, please cite our paper:
345
+
346
+ ```bibtex
347
+ @misc{liu2025thinksoundchainofthoughtreasoningmultimodal,
348
+ title={ThinkSound: Chain-of-Thought Reasoning in Multimodal Large Language Models for Audio Generation and Editing},
349
+ author={Huadai Liu and Jialei Wang and Kaicheng Luo and Wen Wang and Qian Chen and Zhou Zhao and Wei Xue},
350
+ year={2025},
351
+ eprint={2506.21448},
352
+ archivePrefix={arXiv},
353
+ primaryClass={eess.AS},
354
+ url={https://arxiv.org/abs/2506.21448},
355
+ }
356
+ ```
357
+ """
358
+ )
359
  with gr.Row():
360
  video_input = gr.Video(label="upload video")
361
  caption_input = gr.Textbox(label="caption(optional)", placeholder="can be empty", lines=1)