"""Gradio demo: structured video captioning followed by caption fusion.

The UI has two steps:

1. Upload a video and run ``StructCaptioner`` to produce a struct caption,
   shown in a JSON code viewer.
2. Pick a task type (``t2v`` or ``i2v``) and run ``FusionCaptioner`` on the
   struct caption text to produce a fusion caption.
"""

import gradio as gr
import spaces

from fusion_caption import FusionCaptioner
from struct_caption import StructCaptioner

# Both captioners are constructed once at import time and shared by every
# request handled by the callbacks below.
struct_captioner = StructCaptioner("Skywork/SkyCaptioner-V1")
fusion_captioner = FusionCaptioner("Qwen/Qwen3-8B")


# NOTE(review): duration=120 is presumably the per-call GPU time budget in
# seconds for the `spaces` GPU decorator -- confirm against the Spaces docs.
@spaces.GPU(duration=120)
def generate_struct_caption(video):
    """Run the struct captioner on ``video`` and return its result.

    Args:
        video: Value delivered by the ``gr.Video`` input component.

    Returns:
        Whatever ``struct_captioner`` returns; it is displayed in the
        "Struct Caption" code viewer.
    """
    return struct_captioner(video)


@spaces.GPU(duration=120)
def generate_fusion_caption(struct_caption_str, task):
    """Fuse a struct caption into a single caption for the selected task.

    Args:
        struct_caption_str: Current content of the "Struct Caption" viewer.
        task: Selected task type, one of ``"t2v"`` or ``"i2v"``.

    Returns:
        Whatever ``fusion_captioner`` returns; it is displayed in the
        "Fusion Caption" textbox.
    """
    return fusion_captioner(struct_caption_str, task)


with gr.Blocks() as demo:
    gr.Markdown(
        """

SkyCaptioner-V1

""",
        elem_id="header",
    )

    # Step 1: video upload (left) -> struct caption (right).
    with gr.Row():
        # `scale` must be an integer. The original used scale=0.5 next to a
        # default (scale=1) column; 1 and 2 keep the same 1:2 width ratio.
        with gr.Column(scale=1):
            video_input = gr.Video(
                label="Upload Video",
                interactive=True,
                format="mp4",
            )
            btn_struct = gr.Button("Generate Struct Caption")

        with gr.Column(scale=2):
            struct_caption_output = gr.Code(
                label="Struct Caption",
                language="json",
                lines=25,
                interactive=False,
            )

    # Step 2: task selection (left) -> fusion caption (right).
    with gr.Row():
        with gr.Column(scale=1):
            with gr.Row():
                task_input = gr.Radio(
                    label="Task Type",
                    choices=["t2v", "i2v"],
                    value="t2v",
                    interactive=True,
                )
            # NOTE(review): the flattened source does not show whether this
            # button sat inside the nested Row above -- confirm the layout.
            btn_fusion = gr.Button("Generate Fusion Caption")

        with gr.Column(scale=2):
            fusion_caption_output = gr.Textbox(
                label="Fusion Caption",
                value="",
                interactive=False,
            )

    # Event listeners must be registered inside the Blocks context.
    btn_struct.click(
        fn=generate_struct_caption,
        inputs=video_input,
        outputs=struct_caption_output,
    )

    btn_fusion.click(
        fn=generate_fusion_caption,
        inputs=[struct_caption_output, task_input],
        outputs=fusion_caption_output,
    )

    gr.Examples(
        examples=[
            ["./examples/1.mp4"],
            ["./examples/2.mp4"],
            ["./examples/3.mp4"],
            ["./examples/4.mp4"],
        ],
        inputs=video_input,
        label="Example Videos",
    )

demo.launch()