# SkyCaptioner-V1 / app.py
# pinoo's picture
# update
# 755d5b9
import spaces
import gradio as gr
from struct_caption import StructCaptioner
from fusion_caption import FusionCaptioner
# Model identifiers handed to the two captioner constructors.
STRUCT_CAPTIONER_MODEL = "Skywork/SkyCaptioner-V1"
FUSION_CAPTIONER_MODEL = "Qwen/Qwen3-8B"

# Both captioners are constructed once, at module load, and are referenced
# as module-level globals by the Gradio event handlers.
struct_captioner = StructCaptioner(STRUCT_CAPTIONER_MODEL)
fusion_captioner = FusionCaptioner(FUSION_CAPTIONER_MODEL)
with gr.Blocks() as demo:
    # Page title, rendered as centered HTML inside a Markdown component.
    gr.Markdown(
        '\n<h1 style="text-align: center; font-size: 2em;">SkyCaptioner-V1</h1>\n',
        elem_id="header",
    )

    # Row 1: video upload + trigger button on the left, struct caption on
    # the right.
    with gr.Row():
        # Gradio expects integer `scale` values. Using 1 here and 2 on the
        # sibling column keeps the 1:2 width ratio that `scale=0.5` next to
        # the default `scale=1` was expressing.
        with gr.Column(scale=1):
            video_input = gr.Video(
                label="Upload Video",
                interactive=True,
                format="mp4",
            )
            btn_struct = gr.Button("Generate Struct Caption")
        with gr.Column(scale=2):
            struct_caption_output = gr.Code(
                label="Struct Caption",
                language="json",
                lines=25,
                interactive=False,
            )

    # Row 2: task selector + trigger button on the left, fusion caption on
    # the right. Same 1:2 column ratio as row 1.
    with gr.Row():
        with gr.Column(scale=1):
            with gr.Row():
                task_input = gr.Radio(
                    label="Task Type",
                    choices=["t2v", "i2v"],
                    value="t2v",
                    interactive=True,
                )
            # NOTE(review): nesting was reconstructed from a source without
            # indentation; confirm whether this button belongs inside the
            # inner Row (side by side with the radio) or below it as here.
            btn_fusion = gr.Button("Generate Fusion Caption")
        with gr.Column(scale=2):
            fusion_caption_output = gr.Textbox(
                label="Fusion Caption",
                value="",
                interactive=False,
            )

    # `duration=120` is passed to `spaces.GPU`; presumably the per-call GPU
    # time budget in seconds -- see the ZeroGPU documentation.
    @spaces.GPU(duration=120)
    def generate_struct_caption(video):
        """Run the struct captioner on the uploaded video.

        Args:
            video: Value of the "Upload Video" component; falsy when
                nothing has been uploaded.

        Returns:
            Whatever `struct_captioner(video)` returns; it is displayed in
            the "Struct Caption" JSON code viewer.

        Raises:
            gr.Error: If no video was provided.
        """
        # Fail with a readable UI message instead of forwarding an empty
        # input to the model.
        if not video:
            raise gr.Error(
                "Please upload a video before generating a struct caption."
            )
        return struct_captioner(video)

    @spaces.GPU(duration=120)
    def generate_fusion_caption(struct_caption_str, task):
        """Fuse a struct caption into a single caption for the chosen task.

        Args:
            struct_caption_str: Current content of the "Struct Caption"
                viewer; empty or None until a struct caption was generated.
            task: Selected task type, one of the radio choices
                ("t2v" or "i2v").

        Returns:
            Whatever `fusion_captioner(struct_caption_str, task)` returns;
            it is displayed in the "Fusion Caption" textbox.

        Raises:
            gr.Error: If there is no struct caption to fuse.
        """
        # The fusion step consumes the struct caption, so an empty viewer
        # means the first step has not been run yet.
        if not struct_caption_str or not struct_caption_str.strip():
            raise gr.Error(
                "Please generate a struct caption before generating a "
                "fusion caption."
            )
        return fusion_captioner(struct_caption_str, task)

    # Event wiring: each button feeds its handler and writes the result to
    # the matching output component.
    btn_struct.click(
        fn=generate_struct_caption,
        inputs=video_input,
        outputs=struct_caption_output,
    )
    btn_fusion.click(
        fn=generate_fusion_caption,
        inputs=[struct_caption_output, task_input],
        outputs=fusion_caption_output,
    )

    # Clickable sample videos that populate the video input.
    gr.Examples(
        examples=[
            ["./examples/1.mp4"],
            ["./examples/2.mp4"],
            ["./examples/3.mp4"],
            ["./examples/4.mp4"],
        ],
        inputs=video_input,
        label="Example Videos",
    )
# Start serving the Gradio app with default launch settings.
demo.launch()