File size: 3,044 Bytes
fd2651a
9fecce5
 
29bc91e
d342d8b
fd2651a
fc93bc3
 
 
 
 
 
 
9fecce5
fc93bc3
9fecce5
fd2651a
d342d8b
600f2a3
 
 
 
 
 
 
 
29bc91e
 
 
600f2a3
 
 
 
8ead0a1
fc93bc3
 
8ead0a1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fc93bc3
 
9fecce5
 
 
 
fc93bc3
 
 
 
 
 
 
 
 
 
9fecce5
cde52cf
80cd182
 
9fecce5
 
80cd182
 
9fecce5
29bc91e
80cd182
fc93bc3
fd2651a
cde52cf
 
fd2651a
80cd182
cde52cf
80cd182
 
fd2651a
8afd5a6
12c4ba7
 
29bc91e
600f2a3
 
 
 
 
 
 
 
 
 
 
 
29bc91e
fd2651a
fc93bc3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import gradio as gr
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, TextIteratorStreamer
from qwen_vl_utils import process_vision_info
from threading import Thread
import spaces

# Hugging Face Hub repo id of the vision-language model to serve.
file_path = "csfufu/Revisual-R1-final"

# Processor handles both tokenization and image preprocessing.
# min/max_pixels bound the dynamic image resolution Qwen2.5-VL uses
# (28x28 is the patch size; values are pixel-count budgets per image).
processor = AutoProcessor.from_pretrained(
    file_path,
    min_pixels=256*28*28,
    max_pixels=1280*28*28
)
# device_map="auto" shards/places the model on available accelerators;
# torch_dtype="auto" picks the checkpoint's native precision.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    file_path, torch_dtype="auto", device_map="auto"
)

@spaces.GPU
def respond(
    input_dict,
    chat_history,
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """Stream a chat completion from the Qwen2.5-VL model.

    Parameters
    ----------
    input_dict : dict
        Gradio multimodal textbox value: ``{"text": str, "files": [paths]}``.
    chat_history : list
        Gradio ``messages``-format history; each entry has ``"role"`` and
        ``"content"`` (a str for text turns, otherwise an iterable of
        image references — presumably file paths; confirm against Gradio).
    system_message : str
        System prompt prepended to the conversation.
    max_tokens : int
        Maximum number of new tokens to generate.
    temperature, top_p : float
        Sampling controls forwarded to ``model.generate``.

    Yields
    ------
    str
        The accumulated generated text so far (for streaming UI updates).
    """
    text = input_dict["text"]
    files = input_dict["files"]

    # Conversation always starts with the system prompt.
    messages = [{
        "role": "system",
        "content": system_message
    }]

    # Re-encode the Gradio history into Qwen chat-template message dicts.
    for message in chat_history:
        if isinstance(message["content"], str):
            # Plain text turn.
            messages.append({
                "role": message["role"],
                "content": [
                    { "type": "text", "text": message["content"] },
                ]
            })
        else:
            # Multimodal turn: content is an iterable of image references.
            messages.append({
                "role": message["role"],
                "content": [
                    { "type": "image", "image": image }
                    for image in message["content"]
                ]
            })

    # Only add a user text turn when the textbox is non-empty.
    if text:
        messages.append({
            "role": "user",
            "content": [
                { "type": "text", "text": text },
            ]
        })

    # Each attached file becomes its own user image turn.
    for file in files:
        messages.append({
            "role": "user",
            "content": [
                { "type": "image", "image": file }
            ]
        })

    image_inputs, video_inputs = process_vision_info(messages)
    prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[prompt],
        images=image_inputs,
        videos=video_inputs,
        return_tensors="pt",
        padding=True,
    ).to(model.device)

    # skip_prompt=True: only newly generated tokens are streamed back.
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    # BUG FIX: without do_sample=True, generate() defaults to greedy
    # decoding and silently ignores temperature/top_p — the UI sliders
    # previously had no effect on the output.
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=max_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
    )

    # Run generation on a worker thread so this generator can yield
    # partial output while tokens are still being produced.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        yield buffer

# Multimodal chat UI: type='messages' makes the history a list of
# {"role", "content"} dicts, which is the format respond() expects;
# multimodal=True swaps the textbox for one that accepts file uploads.
demo = gr.ChatInterface(
        fn=respond,
        type='messages',
        multimodal=True,
        # Extra controls are passed to respond() positionally, after
        # (input_dict, chat_history), in the order listed here.
        additional_inputs=[
            gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
            gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
            gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
            gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.95,
                step=0.05,
                label="Top-p (nucleus sampling)",
            ),
        ],
    )

# debug=True surfaces server-side tracebacks in the console/UI.
demo.launch(debug=True)