cyrus28214 committed (unverified)
Commit 29bc91e · 1 Parent(s): 80cd182
Files changed (2)
  1. app.py +62 -50
  2. requirements.txt +3 -2
app.py CHANGED
@@ -1,43 +1,58 @@
 import gradio as gr
-import torch
-from PIL import Image
-from threading import Thread
-from transformers import AutoProcessor, AutoModelForVision2Seq, TextIteratorStreamer
+from transformers import AutoProcessor, AutoModelForImageTextToText, TextIteratorStreamer
 from transformers.image_utils import load_image
+from threading import Thread
+import torch
 import spaces
 
-device = "cuda" if torch.cuda.is_available() else "cpu"
-torch_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
-
-MODEL_ID = "HuggingFaceTB/SmolVLM-256M-Instruct"
-processor = AutoProcessor.from_pretrained(MODEL_ID)
-model = AutoModelForVision2Seq.from_pretrained(
+MODEL_ID = "TIGER-Lab/VL-Rethinker-7B"
+processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+model = AutoModelForImageTextToText.from_pretrained(
     MODEL_ID,
-    torch_dtype=torch_dtype,
-    trust_remote_code=True
-).to(device)
-
+    trust_remote_code=True,
+    torch_dtype=torch.bfloat16
+).to("cuda").eval()
 
 @spaces.GPU
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    text = message["text"]
-    files = message["files"]
+def model_inference(input_dict, history):
+    text = input_dict["text"]
+    files = input_dict["files"]
+
+    """
+    Create chat history
+    Example history value:
+    [
+        [('pixel.png',), None],
+        ['ignore this image. just say "hi" and nothing else', 'Hi!'],
+        ['just say "hi" and nothing else', 'Hi!']
+    ]
+    """
     all_images = []
     current_message_images = []
+    messages = []
 
-    messages = [{"role": "system", "content": system_message}]
+    for val in history:
+        if val[0]:
+            if isinstance(val[0], str):
+                messages.append({
+                    "role": "user",
+                    "content": [
+                        *[{"type": "image", "image": image} for image in current_message_images],
+                        {"type": "text", "text": val[0]},
+                    ],
+                })
+                current_message_images = []
 
-    print(message)
-    print(history)
+            else:
+                # Load images. These will be appended to the first user text message that comes after
+                current_message_images = [load_image(image) for image in val[0]]
+                all_images += current_message_images
 
+        if val[1]:
+            messages.append({"role": "assistant", "content": val[1]})
+
     current_message_images = [load_image(image) for image in files]
+    all_images += current_message_images
     messages.append({
         "role": "user",
         "content": [
@@ -45,14 +60,20 @@ def respond(
             {"type": "text", "text": text},
         ],
     })
+
+    #print(messages)
+
+    """
+    Generate and stream text
+    """
     prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     inputs = processor(
         text=[prompt],
-        images=current_message_images if current_message_images else None,
+        images=all_images if all_images else None,
        return_tensors="pt",
         padding=True,
-    ).to(device, torch_dtype)
-
+    ).to("cuda")
+
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
 
@@ -64,24 +85,15 @@
         buffer += new_text
         yield buffer
 
-demo = gr.ChatInterface(
-    respond,
-    multimodal=True,
-    additional_inputs=[
-        gr.Textbox(value="You are a helpful and friendly multimodal assistant. You can analyze images and answer questions about them.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
-    ],
-    title="Chatbot",
-    description="Ask me anything or upload an image. This version uses AutoModel and AutoProcessor directly.",
-)
+with gr.Blocks() as demo:
+    gr.ChatInterface(
+        fn=model_inference,
+        description="# **VL-Rethinker-7B**",
+        fill_height=True,
+        textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple"),
+        stop_btn="Stop Generation",
+        multimodal=True,
+        cache_examples=False,
+    )
 
-if __name__ == "__main__":
-    demo.launch()
+demo.launch(debug=True)
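For reference, the new history-parsing loop in model_inference folds Gradio's multimodal history (image uploads arrive as tuples, text turns as strings) into the chat-template messages list. Below is a minimal standalone sketch of that conversion, not part of the commit: it reuses the example history value from the docstring and substitutes bare filenames for the PIL images that load_image() would return, so it runs with no files or model.

# Standalone sketch of the history -> messages conversion (illustrative only).
# Filenames stand in for the images load_image() would produce in the Space.
history = [
    [("pixel.png",), None],  # image-only turn, no assistant reply yet
    ['ignore this image. just say "hi" and nothing else', "Hi!"],
    ['just say "hi" and nothing else', "Hi!"],
]

messages = []
pending_images = []  # images waiting to be attached to the next text turn
for user_part, assistant_part in history:
    if user_part:
        if isinstance(user_part, str):
            messages.append({
                "role": "user",
                "content": [
                    *[{"type": "image", "image": img} for img in pending_images],
                    {"type": "text", "text": user_part},
                ],
            })
            pending_images = []
        else:
            # A tuple of uploaded files: hold them for the next text message.
            pending_images = list(user_part)
    if assistant_part:
        messages.append({"role": "assistant", "content": assistant_part})

# messages now alternates user/assistant turns, with pixel.png attached to
# the first text message that followed its upload.
print(messages)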
 
requirements.txt CHANGED
@@ -1,2 +1,3 @@
-huggingface_hub==0.33.0
-transformers==4.52.4
+huggingface_hub
+transformers
+torchvision