prithivMLmods committed on
Commit b73b04e · verified · 1 Parent(s): 5f5851f

Update app.py

Files changed (1)
  1. app.py +24 -129
app.py CHANGED
@@ -15,12 +15,9 @@ import cv2
 import requests
 
 from transformers import (
-    Qwen2VLForConditionalGeneration,
-    Qwen2_5_VLForConditionalGeneration,
+    Qwen3VLMoeForConditionalGeneration,
     AutoProcessor,
     TextIteratorStreamer,
-    AutoModel,
-    AutoTokenizer,
 )
 from transformers.image_utils import load_image
 
@@ -48,65 +45,15 @@ print("Using device:", device)
 # To address the warnings, we add `use_fast=False` to ensure we use the
 # processor version the model was originally saved with.
 
-# Load DREX-062225-exp
-MODEL_ID_X = "prithivMLmods/DREX-062225-exp"
-processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True, use_fast=False)
-model_x = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    MODEL_ID_X,
+# Load Qwen3VL
+MODEL_ID_Q3VL = "Qwen/Qwen3-VL-30B-A3B-Instruct"
+processor_q3vl = AutoProcessor.from_pretrained(MODEL_ID_Q3VL, trust_remote_code=True, use_fast=False)
+model_q3vl = Qwen3VLMoeForConditionalGeneration.from_pretrained(
+    MODEL_ID_Q3VL,
     trust_remote_code=True,
     torch_dtype=torch.float16
 ).to(device).eval()
 
-# Load typhoon-ocr-3b
-MODEL_ID_T = "scb10x/typhoon-ocr-3b"
-processor_t = AutoProcessor.from_pretrained(MODEL_ID_T, trust_remote_code=True, use_fast=False)
-model_t = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    MODEL_ID_T,
-    trust_remote_code=True,
-    torch_dtype=torch.float16
-).to(device).eval()
-
-# Load olmOCR-7B-0225-preview
-MODEL_ID_O = "allenai/olmOCR-7B-0225-preview"
-processor_o = AutoProcessor.from_pretrained(MODEL_ID_O, trust_remote_code=True, use_fast=False)
-model_o = Qwen2VLForConditionalGeneration.from_pretrained(
-    MODEL_ID_O,
-    trust_remote_code=True,
-    torch_dtype=torch.float16
-).to(device).eval()
-
-# Load Lumian-VLR-7B-Thinking
-MODEL_ID_J = "prithivMLmods/Lumian-VLR-7B-Thinking"
-SUBFOLDER = "think-preview"
-processor_j = AutoProcessor.from_pretrained(MODEL_ID_J, trust_remote_code=True, subfolder=SUBFOLDER, use_fast=False)
-model_j = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    MODEL_ID_J,
-    trust_remote_code=True,
-    subfolder=SUBFOLDER,
-    torch_dtype=torch.float16
-).to(device).eval()
-
-# Load openbmb/MiniCPM-V-4
-MODEL_ID_V4 = 'openbmb/MiniCPM-V-4'
-model_v4 = AutoModel.from_pretrained(
-    MODEL_ID_V4,
-    trust_remote_code=True,
-    torch_dtype=torch.bfloat16,
-    # Using 'sdpa' can sometimes cause issues in certain environments,
-    # letting transformers choose the default is safer.
-    # attn_implementation='sdpa'
-).eval().to(device)
-tokenizer_v4 = AutoTokenizer.from_pretrained(MODEL_ID_V4, trust_remote_code=True, use_fast=False)
-
-# --- Refactored Model Dictionary ---
-# This simplifies model selection in the generation functions.
-MODELS = {
-    "DREX-062225-7B-exp": (processor_x, model_x),
-    "Typhoon-OCR-3B": (processor_t, model_t),
-    "olmOCR-7B-0225-preview": (processor_o, model_o),
-    "Lumian-VLR-7B-Thinking": (processor_j, model_j),
-}
-
 
 def downsample_video(video_path):
     """
@@ -131,48 +78,28 @@ def downsample_video(video_path):
     return frames
 
 @spaces.GPU
-def generate_image(model_name: str, text: str, image: Image.Image,
+def generate_image(text: str, image: Image.Image,
                    max_new_tokens: int = 1024,
                    temperature: float = 0.6,
                    top_p: float = 0.9,
                    top_k: int = 50,
                    repetition_penalty: float = 1.2):
     """
-    Generates responses using the selected model for image input.
+    Generates responses using the Qwen3-VL model for image input.
     """
     if image is None:
         yield "Please upload an image.", "Please upload an image."
         return
 
-    # Handle MiniCPM-V-4 separately due to its different API
-    if model_name == "openbmb/MiniCPM-V-4":
-        msgs = [{'role': 'user', 'content': [image, text]}]
-        try:
-            answer = model_v4.chat(
-                image=image.convert('RGB'), msgs=msgs, tokenizer=tokenizer_v4,
-                max_new_tokens=max_new_tokens, temperature=temperature,
-                top_p=top_p, repetition_penalty=repetition_penalty,
-            )
-            yield answer, answer
-        except Exception as e:
-            yield f"Error: {e}", f"Error: {e}"
-        return
-
-    # Use the dictionary for other models
-    if model_name not in MODELS:
-        yield "Invalid model selected.", "Invalid model selected."
-        return
-    processor, model = MODELS[model_name]
-
-    messages = [{"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": text}]}]
-    prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    inputs = processor(
+    messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": text}]}]
+    prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    inputs = processor_q3vl(
         text=[prompt_full], images=[image], return_tensors="pt", padding=True,
         truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH
     ).to(device)
-    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+    streamer = TextIteratorStreamer(processor_q3vl, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
-    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread = Thread(target=model_q3vl.generate, kwargs=generation_kwargs)
     thread.start()
     buffer = ""
     for new_text in streamer:
@@ -181,14 +108,14 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     yield buffer, buffer
 
 @spaces.GPU
-def generate_video(model_name: str, text: str, video_path: str,
+def generate_video(text: str, video_path: str,
                    max_new_tokens: int = 1024,
                    temperature: float = 0.6,
                    top_p: float = 0.9,
                    top_k: int = 50,
                    repetition_penalty: float = 1.2):
     """
-    Generates responses using the selected model for video input.
+    Generates responses using the Qwen3-VL model for video input.
    """
     if video_path is None:
         yield "Please upload a video.", "Please upload a video."
@@ -199,49 +126,24 @@ def generate_video(model_name: str, text: str, video_path: str,
         yield "Could not process video.", "Could not process video."
         return
 
-    # Handle MiniCPM-V-4 separately
-    if model_name == "openbmb/MiniCPM-V-4":
-        images = [frame for frame, ts in frames_with_ts]
-        # For video, the prompt includes the text and then all the image frames
-        content = [text] + images
-        msgs = [{'role': 'user', 'content': content}]
-        try:
-            # The .chat API still takes a single image argument, typically the first frame
-            answer = model_v4.chat(
-                image=images[0].convert('RGB'), msgs=msgs, tokenizer=tokenizer_v4,
-                max_new_tokens=max_new_tokens, temperature=temperature,
-                top_p=top_p, repetition_penalty=repetition_penalty,
-            )
-            yield answer, answer
-        except Exception as e:
-            yield f"Error: {e}", f"Error: {e}"
-        return
-
-    # Use the dictionary for other models
-    if model_name not in MODELS:
-        yield "Invalid model selected.", "Invalid model selected."
-        return
-    processor, model = MODELS[model_name]
-
-    # Prepare messages for Qwen-style models
     messages = [{"role": "user", "content": [{"type": "text", "text": text}]}]
     images_for_processor = []
     for frame, timestamp in frames_with_ts:
-        messages[0]["content"].append({"type": "image", "image": frame})
+        messages[0]["content"].append({"type": "image"})
         images_for_processor.append(frame)
 
-    prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    inputs = processor(
+    prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    inputs = processor_q3vl(
         text=[prompt_full], images=images_for_processor, return_tensors="pt", padding=True,
         truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH
     ).to(device)
-    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+    streamer = TextIteratorStreamer(processor_q3vl, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {
         **inputs, "streamer": streamer, "max_new_tokens": max_new_tokens,
         "do_sample": True, "temperature": temperature, "top_p": top_p,
         "top_k": top_k, "repetition_penalty": repetition_penalty,
     }
-    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread = Thread(target=model_q3vl.generate, kwargs=generation_kwargs)
     thread.start()
     buffer = ""
     for new_text in streamer:
@@ -302,25 +204,18 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
             output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=5, show_copy_button=True)
             with gr.Accordion("(Result.md)", open=False):
                 markdown_output = gr.Markdown(label="(Result.Md)")
-            model_choice = gr.Radio(
-                choices=["Lumian-VLR-7B-Thinking", "openbmb/MiniCPM-V-4", "Typhoon-OCR-3B", "DREX-062225-7B-exp", "olmOCR-7B-0225-preview"],
-                label="Select Model",
-                value="Lumian-VLR-7B-Thinking"
-            )
             gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-VLM-Thinking/discussions)")
-            gr.Markdown("> [MiniCPM-V 4.0](https://huggingface.co/openbmb/MiniCPM-V-4) is the latest efficient model in the MiniCPM-V series. The model is built based on SigLIP2-400M and MiniCPM4-3B with a total of 4.1B parameters. It inherits the strong single-image, multi-image and video understanding performance of MiniCPM-V 2.6 with largely improved efficiency. [Lumian-VLR-7B-Thinking](https://huggingface.co/prithivMLmods/Lumian-VLR-7B-Thinking) is a high-fidelity vision-language reasoning model built on Qwen2.5-VL-7B-Instruct, designed for fine-grained multimodal understanding, video reasoning, and document comprehension through explicit grounded reasoning.")
-            gr.Markdown("> [olmOCR-7B-0225-preview](https://huggingface.co/allenai/olmOCR-7B-0225-preview) is a 7B parameter open large model designed for OCR tasks with robust text extraction, especially in complex document layouts. [Typhoon-ocr-3b](https://huggingface.co/scb10x/typhoon-ocr-3b) is a 3B parameter OCR model optimized for efficient and accurate optical character recognition in challenging conditions.")
-            gr.Markdown("> [DREX-062225-exp](https://huggingface.co/prithivMLmods/DREX-062225-exp) is an experimental multimodal model emphasizing strong document reading and extraction capabilities combined with vision-language understanding to support detailed document parsing and reasoning tasks.")
-            gr.Markdown("> ⚠️ Note: Video inference performance can vary significantly between models.")
+            gr.Markdown("> [Qwen/Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct) is a powerful, versatile vision-language model. It excels at understanding and processing both text and visual information, making it suitable for a wide range of multimodal tasks. The model demonstrates strong performance in areas like visual question answering, image captioning, and video analysis.")
+            gr.Markdown("> ⚠️ Note: Video inference performance can vary depending on the complexity and length of the video.")
 
     image_submit.click(
         fn=generate_image,
-        inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
+        inputs=[image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
        outputs=[output, markdown_output]
    )
    video_submit.click(
        fn=generate_video,
-        inputs=[model_choice, video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
+        inputs=[video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
        outputs=[output, markdown_output]
    )
 
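
For reference, the single-model code path this commit introduces can be exercised outside Gradio with a minimal sketch like the one below, condensed from the post-commit app.py. It assumes a transformers release that exports Qwen3VLMoeForConditionalGeneration (Qwen3-VL support) and a GPU with enough memory for the 30B-A3B checkpoint; the stream_caption helper and the example file name are illustrative and not part of the Space.

from threading import Thread

import torch
from PIL import Image
from transformers import (
    AutoProcessor,
    Qwen3VLMoeForConditionalGeneration,
    TextIteratorStreamer,
)

MODEL_ID = "Qwen/Qwen3-VL-30B-A3B-Instruct"
device = "cuda" if torch.cuda.is_available() else "cpu"

processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True, use_fast=False)
model = Qwen3VLMoeForConditionalGeneration.from_pretrained(
    MODEL_ID, trust_remote_code=True, torch_dtype=torch.float16
).to(device).eval()


def stream_caption(image: Image.Image, prompt: str, max_new_tokens: int = 1024):
    """Yield the growing answer string, mirroring generate_image() in app.py."""
    messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt}]}]
    chat = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(text=[chat], images=[image], return_tensors="pt", padding=True).to(device)

    # generate() runs on a worker thread so the streamer can be consumed while tokens
    # arrive; this is the pattern that lets the Gradio textbox update incrementally.
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    Thread(target=model.generate, kwargs={**inputs, "streamer": streamer,
                                          "max_new_tokens": max_new_tokens}).start()
    buffer = ""
    for new_text in streamer:
        buffer += new_text
        yield buffer


# Hypothetical usage:
# for partial in stream_caption(Image.open("sample.png"), "Describe this image."):
#     print(partial)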