File size: 2,319 Bytes
0d1c12c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
755d5b9
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import spaces
import gradio as gr

from struct_caption import StructCaptioner
from fusion_caption import FusionCaptioner

# Model identifiers handed to the two captioner wrappers
# (presumably Hugging Face Hub repo ids -- confirm against the wrapper classes).
_STRUCT_MODEL_ID = "Skywork/SkyCaptioner-V1"
_FUSION_MODEL_ID = "Qwen/Qwen3-8B"

# Both captioners are created once at import time and reused by the
# Gradio callbacks defined further down.
struct_captioner = StructCaptioner(_STRUCT_MODEL_ID)
fusion_captioner = FusionCaptioner(_FUSION_MODEL_ID)

# Page layout: a centered title, then one row per captioning stage.
#   Row 1: video upload + button  ->  struct caption (JSON code view)
#   Row 2: task selector + button ->  fusion caption (read-only textbox)
with gr.Blocks() as demo:
    gr.Markdown(
        """
        <h1 style="text-align: center; font-size: 2em;">SkyCaptioner-V1</h1>
        """,
        elem_id="header"
    )

    with gr.Row():
        # `scale` is documented as an integer relative width; a fractional
        # value such as 0.5 is not supported. Using 1 here and 2 on the
        # sibling column keeps the intended 1:2 width ratio.
        with gr.Column(scale=1):
            video_input = gr.Video(
                label="Upload Video",
                interactive=True,
                format="mp4",
            )

            btn_struct = gr.Button("Generate Struct Caption")

        with gr.Column(scale=2):
            # Read-only: filled by the struct-caption callback and later
            # used as an input to the fusion-caption callback.
            struct_caption_output = gr.Code(
                label="Struct Caption",
                language="json",
                lines=25,
                interactive=False
            )

    with gr.Row():
        # Same 1:2 split as the row above.
        with gr.Column(scale=1):
            with gr.Row():
                task_input = gr.Radio(
                    label="Task Type",
                    choices=["t2v", "i2v"],
                    value="t2v",
                    interactive=True
                )
            btn_fusion = gr.Button("Generate Fusion Caption")

        with gr.Column(scale=2):
            fusion_caption_output = gr.Textbox(
                label="Fusion Caption",
                value="",
                interactive=False
            )

    @spaces.GPU(duration=120)
    def generate_struct_caption(video):
        """Run the struct captioner on the uploaded video.

        Args:
            video: Value of the ``video_input`` component as delivered by
                Gradio (presumably a file path string -- confirm against
                the ``gr.Video`` docs and ``StructCaptioner``).

        Returns:
            Whatever ``struct_captioner`` returns; it is displayed in the
            JSON code view ``struct_caption_output``.
        """
        return struct_captioner(video)

    @spaces.GPU(duration=120)
    def generate_fusion_caption(struct_caption_str, task):
        """Turn a struct caption into a fusion caption for the chosen task.

        Args:
            struct_caption_str: Current content of ``struct_caption_output``.
            task: Selected "Task Type" radio value (``"t2v"`` or ``"i2v"``).

        Returns:
            Whatever ``fusion_captioner`` returns; it is displayed in the
            ``fusion_caption_output`` textbox.
        """
        fusion_caption = fusion_captioner(struct_caption_str, task)
        return fusion_caption

    # Stage 1: uploaded video -> struct caption shown in the JSON view.
    btn_struct.click(
        generate_struct_caption,
        inputs=[video_input],
        outputs=[struct_caption_output],
    )

    # Stage 2: struct caption text + task type -> fusion caption textbox.
    btn_fusion.click(
        generate_fusion_caption,
        inputs=[struct_caption_output, task_input],
        outputs=[fusion_caption_output],
    )

    # Bundled sample clips (./examples/1.mp4 .. ./examples/4.mp4); each
    # example row holds a single value destined for the video input.
    gr.Examples(
        examples=[[f"./examples/{idx}.mp4"] for idx in range(1, 5)],
        inputs=video_input,
        label="Example Videos",
    )

    # Start the app. Note this call sits inside the `with gr.Blocks()` body.
    demo.launch()