prithivMLmods committed on
Commit
8530a0c
·
verified ·
1 Parent(s): 9ec2e5d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +108 -113
app.py CHANGED
@@ -20,6 +20,8 @@ from transformers import (
20
  AutoModelForImageTextToText,
21
  AutoProcessor,
22
  TextIteratorStreamer,
 
 
23
  )
24
  from transformers.image_utils import load_image
25
 
@@ -30,6 +32,8 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
30
 
31
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
32
 
 
 
33
  # Load DREX-062225-exp
34
  MODEL_ID_X = "prithivMLmods/DREX-062225-exp"
35
  processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
@@ -68,14 +72,16 @@ model_j = Qwen2_5_VLForConditionalGeneration.from_pretrained(
68
  torch_dtype=torch.float16
69
  ).to(device).eval()
70
 
71
- # Load LMM-R1-MGT-PerceReason
72
- MODEL_ID_F = "VLM-Reasoner/LMM-R1-MGT-PerceReason"
73
- processor_f = AutoProcessor.from_pretrained(MODEL_ID_F, trust_remote_code=True)
74
- model_f = Qwen2_5_VLForConditionalGeneration.from_pretrained(
75
- MODEL_ID_F,
76
  trust_remote_code=True,
77
- torch_dtype=torch.float16
78
- ).to(device).eval()
 
 
 
79
 
80
  def downsample_video(video_path):
81
  """
@@ -86,7 +92,8 @@ def downsample_video(video_path):
86
  total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
87
  fps = vidcap.get(cv2.CAP_PROP_FPS)
88
  frames = []
89
- frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
 
90
  for i in frame_indices:
91
  vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
92
  success, image = vidcap.read()
@@ -108,44 +115,46 @@ def generate_image(model_name: str, text: str, image: Image.Image,
108
  """
109
  Generates responses using the selected model for image input.
110
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
  if model_name == "DREX-062225-7B-exp":
112
- processor = processor_x
113
- model = model_x
114
  elif model_name == "olmOCR-7B-0225-preview":
115
- processor = processor_o
116
- model = model_o
117
  elif model_name == "Typhoon-OCR-3B":
118
- processor = processor_t
119
- model = model_t
120
  elif model_name == "Lumian-VLR-7B-Thinking":
121
- processor = processor_j
122
- model = model_j
123
- elif model_name == "LMM-R1-MGT-PerceReason":
124
- processor = processor_f
125
- model = model_f
126
  else:
127
  yield "Invalid model selected.", "Invalid model selected."
128
  return
129
 
130
- if image is None:
131
- yield "Please upload an image.", "Please upload an image."
132
- return
133
-
134
- messages = [{
135
- "role": "user",
136
- "content": [
137
- {"type": "image", "image": image},
138
- {"type": "text", "text": text},
139
- ]
140
- }]
141
  prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
142
  inputs = processor(
143
- text=[prompt_full],
144
- images=[image],
145
- return_tensors="pt",
146
- padding=True,
147
- truncation=False,
148
- max_length=MAX_INPUT_TOKEN_LENGTH
149
  ).to(device)
150
  streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
151
  generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
@@ -167,57 +176,64 @@ def generate_video(model_name: str, text: str, video_path: str,
167
  """
168
  Generates responses using the selected model for video input.
169
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
170
  if model_name == "DREX-062225-7B-exp":
171
- processor = processor_x
172
- model = model_x
173
  elif model_name == "olmOCR-7B-0225-preview":
174
- processor = processor_o
175
- model = model_o
176
  elif model_name == "Typhoon-OCR-3B":
177
- processor = processor_t
178
- model = model_t
179
  elif model_name == "Lumian-VLR-7B-Thinking":
180
- processor = processor_j
181
- model = model_j
182
- elif model_name == "LMM-R1-MGT-PerceReason":
183
- processor = processor_f
184
- model = model_f
185
  else:
186
  yield "Invalid model selected.", "Invalid model selected."
187
  return
188
 
189
- if video_path is None:
190
- yield "Please upload a video.", "Please upload a video."
191
- return
 
192
 
193
- frames = downsample_video(video_path)
194
- messages = [
195
- {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
196
- {"role": "user", "content": [{"type": "text", "text": text}]}
197
- ]
198
- for frame in frames:
199
- image, timestamp = frame
200
- messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
201
- messages[1]["content"].append({"type": "image", "image": image})
202
- inputs = processor.apply_chat_template(
203
- messages,
204
- tokenize=True,
205
- add_generation_prompt=True,
206
- return_dict=True,
207
- return_tensors="pt",
208
- truncation=False,
209
- max_length=MAX_INPUT_TOKEN_LENGTH
210
  ).to(device)
211
  streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
212
  generation_kwargs = {
213
- **inputs,
214
- "streamer": streamer,
215
- "max_new_tokens": max_new_tokens,
216
- "do_sample": True,
217
- "temperature": temperature,
218
- "top_p": top_p,
219
- "top_k": top_k,
220
- "repetition_penalty": repetition_penalty,
221
  }
222
  thread = Thread(target=model.generate, kwargs=generation_kwargs)
223
  thread.start()
@@ -228,14 +244,6 @@ def generate_video(model_name: str, text: str, video_path: str,
228
  time.sleep(0.01)
229
  yield buffer, buffer
230
 
231
- def save_to_md(output_text):
232
- """
233
- Saves the output text to a Markdown file and returns the file path for download.
234
- """
235
- file_path = f"result_{uuid.uuid4()}.md"
236
- with open(file_path, "w") as f:
237
- f.write(output_text)
238
- return file_path
239
 
240
  # Define examples for image and video inference
241
  image_examples = [
@@ -254,18 +262,9 @@ video_examples = [
254
 
255
  # Added CSS to style the output area as a "Canvas"
256
  css = """
257
- .submit-btn {
258
- background-color: #2980b9 !important;
259
- color: white !important;
260
- }
261
- .submit-btn:hover {
262
- background-color: #3498db !important;
263
- }
264
- .canvas-output {
265
- border: 2px solid #4682B4;
266
- border-radius: 10px;
267
- padding: 20px;
268
- }
269
  """
270
 
271
  # Create the Gradio Interface
@@ -278,19 +277,13 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
278
  image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
279
  image_upload = gr.Image(type="pil", label="Image")
280
  image_submit = gr.Button("Submit", elem_classes="submit-btn")
281
- gr.Examples(
282
- examples=image_examples,
283
- inputs=[image_query, image_upload]
284
- )
285
  with gr.TabItem("Video Inference"):
286
  video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
287
  video_upload = gr.Video(label="Video")
288
  video_submit = gr.Button("Submit", elem_classes="submit-btn")
289
- gr.Examples(
290
- examples=video_examples,
291
- inputs=[video_query, video_upload]
292
- )
293
-
294
  with gr.Accordion("Advanced options", open=False):
295
  max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
296
  temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
@@ -302,19 +295,21 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
302
  with gr.Column(elem_classes="canvas-output"):
303
  gr.Markdown("## Output")
304
  output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=2, show_copy_button=True)
305
- with gr.Accordion("(Result.md)", open=False):
306
  markdown_output = gr.Markdown(label="(Result.Md)")
307
  model_choice = gr.Radio(
308
- choices=["Lumian-VLR-7B-Thinking", "DREX-062225-7B-exp", "olmOCR-7B-0225-preview", "LMM-R1-MGT-PerceReason", "Typhoon-OCR-3B"],
309
  label="Select Model",
310
  value="Lumian-VLR-7B-Thinking"
311
  )
312
- gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-VLM-Thinking/discussions)")
313
- gr.Markdown("> Lumian-VLR-7B-Thinking is a high-fidelity vision-language reasoning model built on Qwen2.5-VL-7B-Instruct, designed for fine-grained multimodal understanding, enhancing image captioning, video reasoning, and document comprehension through explicit grounded reasoning. It is trained first via supervised fine-tuning (SFT) on visually-grounded reasoning traces and then further refined using GRPO reinforcement learning to boost reasoning accuracy.")
314
- gr.Markdown("> LMM-R1-MGT-PerceReason is a vision-language model focused on advanced reasoning using a multimodal tree search approach enabling progressive visual-textual slow thinking, improving complex spatial and logical reasoning without fine-tuning. OLMOCR-7B-0225-preview is a 7B parameter open large model designed for OCR tasks with robust text extraction, especially in complex document layouts. ")
315
- gr.Markdown("> Typhoon-ocr-3b is a 3B parameter OCR model optimized for efficient and accurate optical character recognition in challenging conditions. DREX-062225-exp is an experimental multimodal model emphasizing strong document reading and extraction capabilities combined with vision-language understanding to support detailed document parsing and reasoning tasks.")
316
- gr.Markdown("> ⚠️ Note: Models in this space may not perform well on video inference tasks.")
317
-
 
 
318
  image_submit.click(
319
  fn=generate_image,
320
  inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
@@ -327,4 +322,4 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
327
  )
328
 
329
  if __name__ == "__main__":
330
- demo.queue(max_size=30).launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)
 
20
  AutoModelForImageTextToText,
21
  AutoProcessor,
22
  TextIteratorStreamer,
23
+ AutoModel,
24
+ AutoTokenizer,
25
  )
26
  from transformers.image_utils import load_image
27
 
 
32
 
33
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
34
 
35
+ # --- Original Models ---
36
+
37
  # Load DREX-062225-exp
38
  MODEL_ID_X = "prithivMLmods/DREX-062225-exp"
39
  processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
 
72
  torch_dtype=torch.float16
73
  ).to(device).eval()
74
 
75
+ # --- Load New Model: openbmb/MiniCPM-V-4 ---
76
+ MODEL_ID_V4 = 'openbmb/MiniCPM-V-4'
77
+ model_v4 = AutoModel.from_pretrained(
78
+ MODEL_ID_V4,
 
79
  trust_remote_code=True,
80
+ torch_dtype=torch.bfloat16,
81
+ attn_implementation='sdpa'
82
+ ).eval().to(device)
83
+ tokenizer_v4 = AutoTokenizer.from_pretrained(MODEL_ID_V4, trust_remote_code=True)
84
+
85
 
86
  def downsample_video(video_path):
87
  """
 
92
  total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
93
  fps = vidcap.get(cv2.CAP_PROP_FPS)
94
  frames = []
95
+ # Use a maximum of 10 frames to avoid excessive memory usage
96
+ frame_indices = np.linspace(0, total_frames - 1, min(total_frames, 10), dtype=int)
97
  for i in frame_indices:
98
  vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
99
  success, image = vidcap.read()
 
115
  """
116
  Generates responses using the selected model for image input.
117
  """
118
+ if image is None:
119
+ yield "Please upload an image.", "Please upload an image."
120
+ return
121
+
122
+ # Handle the new model separately due to its different API
123
+ if model_name == "openbmb/MiniCPM-V-4":
124
+ msgs = [{'role': 'user', 'content': [image, text]}]
125
+ try:
126
+ answer = model_v4.chat(
127
+ image=image.convert('RGB'),
128
+ msgs=msgs,
129
+ tokenizer=tokenizer_v4,
130
+ max_new_tokens=max_new_tokens,
131
+ temperature=temperature,
132
+ top_p=top_p,
133
+ repetition_penalty=repetition_penalty,
134
+ )
135
+ yield answer, answer
136
+ except Exception as e:
137
+ yield f"Error: {e}", f"Error: {e}"
138
+ return
139
+
140
+ # Original model selection logic
141
  if model_name == "DREX-062225-7B-exp":
142
+ processor, model = processor_x, model_x
 
143
  elif model_name == "olmOCR-7B-0225-preview":
144
+ processor, model = processor_o, model_o
 
145
  elif model_name == "Typhoon-OCR-3B":
146
+ processor, model = processor_t, model_t
 
147
  elif model_name == "Lumian-VLR-7B-Thinking":
148
+ processor, model = processor_j, model_j
 
 
 
 
149
  else:
150
  yield "Invalid model selected.", "Invalid model selected."
151
  return
152
 
153
+ messages = [{"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": text}]}]
 
 
 
 
 
 
 
 
 
 
154
  prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
155
  inputs = processor(
156
+ text=[prompt_full], images=[image], return_tensors="pt", padding=True,
157
+ truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH
 
 
 
 
158
  ).to(device)
159
  streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
160
  generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
 
176
  """
177
  Generates responses using the selected model for video input.
178
  """
179
+ if video_path is None:
180
+ yield "Please upload a video.", "Please upload a video."
181
+ return
182
+
183
+ frames_with_ts = downsample_video(video_path)
184
+ if not frames_with_ts:
185
+ yield "Could not process video.", "Could not process video."
186
+ return
187
+
188
+ # Handle the new model separately
189
+ if model_name == "openbmb/MiniCPM-V-4":
190
+ images = [frame for frame, ts in frames_with_ts]
191
+ content = [text] + images
192
+ msgs = [{'role': 'user', 'content': content}]
193
+ try:
194
+ answer = model_v4.chat(
195
+ image=images[0].convert('RGB'),
196
+ msgs=msgs,
197
+ tokenizer=tokenizer_v4,
198
+ max_new_tokens=max_new_tokens,
199
+ temperature=temperature,
200
+ top_p=top_p,
201
+ repetition_penalty=repetition_penalty,
202
+ )
203
+ yield answer, answer
204
+ except Exception as e:
205
+ yield f"Error: {e}", f"Error: {e}"
206
+ return
207
+
208
+ # Original model selection logic
209
  if model_name == "DREX-062225-7B-exp":
210
+ processor, model = processor_x, model_x
 
211
  elif model_name == "olmOCR-7B-0225-preview":
212
+ processor, model = processor_o, model_o
 
213
  elif model_name == "Typhoon-OCR-3B":
214
+ processor, model = processor_t, model_t
 
215
  elif model_name == "Lumian-VLR-7B-Thinking":
216
+ processor, model = processor_j, model_j
 
 
 
 
217
  else:
218
  yield "Invalid model selected.", "Invalid model selected."
219
  return
220
 
221
+ # Prepare messages for Qwen-style models
222
+ messages = [{"role": "user", "content": [{"type": "text", "text": text}]}]
223
+ for frame, timestamp in frames_with_ts:
224
+ messages[0]["content"].append({"type": "image", "image": frame})
225
 
226
+ prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
227
+ images_for_processor = [frame for frame, ts in frames_with_ts]
228
+ inputs = processor(
229
+ text=[prompt_full], images=images_for_processor, return_tensors="pt", padding=True,
230
+ truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH
 
 
 
 
 
 
 
 
 
 
 
 
231
  ).to(device)
232
  streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
233
  generation_kwargs = {
234
+ **inputs, "streamer": streamer, "max_new_tokens": max_new_tokens,
235
+ "do_sample": True, "temperature": temperature, "top_p": top_p,
236
+ "top_k": top_k, "repetition_penalty": repetition_penalty,
 
 
 
 
 
237
  }
238
  thread = Thread(target=model.generate, kwargs=generation_kwargs)
239
  thread.start()
 
244
  time.sleep(0.01)
245
  yield buffer, buffer
246
 
 
 
 
 
 
 
 
 
247
 
248
  # Define examples for image and video inference
249
  image_examples = [
 
262
 
263
  # Added CSS to style the output area as a "Canvas"
264
  css = """
265
+ .submit-btn { background-color: #2980b9 !important; color: white !important; }
266
+ .submit-btn:hover { background-color: #3498db !important; }
267
+ .canvas-output { border: 2px solid #4682B4; border-radius: 10px; padding: 20px; }
 
 
 
 
 
 
 
 
 
268
  """
269
 
270
  # Create the Gradio Interface
 
277
  image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
278
  image_upload = gr.Image(type="pil", label="Image")
279
  image_submit = gr.Button("Submit", elem_classes="submit-btn")
280
+ gr.Examples(examples=image_examples, inputs=[image_query, image_upload])
 
 
 
281
  with gr.TabItem("Video Inference"):
282
  video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
283
  video_upload = gr.Video(label="Video")
284
  video_submit = gr.Button("Submit", elem_classes="submit-btn")
285
+ gr.Examples(examples=video_examples, inputs=[video_query, video_upload])
286
+
 
 
 
287
  with gr.Accordion("Advanced options", open=False):
288
  max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
289
  temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
 
295
  with gr.Column(elem_classes="canvas-output"):
296
  gr.Markdown("## Output")
297
  output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=2, show_copy_button=True)
298
+ with gr.Accordion("(Result.md)", open=False):
299
  markdown_output = gr.Markdown(label="(Result.Md)")
300
  model_choice = gr.Radio(
301
+ choices=[ "openbmb/MiniCPM-V-4", "Lumian-VLR-7B-Thinking", "DREX-062225-7B-exp", "olmOCR-7B-0225-preview", "Typhoon-OCR-3B"],
302
  label="Select Model",
303
  value="Lumian-VLR-7B-Thinking"
304
  )
305
+ gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-VLM-Thinking/discussions)")
306
+ gr.Markdown("> **MiniCPM-V 4** is a powerful open-source multimodal model capable of handling various image and text-based tasks with high accuracy.")
307
+ gr.Markdown("> **Lumian-VLR-7B-Thinking** is a high-fidelity vision-language reasoning model built on Qwen2.5-VL-7B-Instruct, designed for fine-grained multimodal understanding, enhancing image captioning, video reasoning, and document comprehension through explicit grounded reasoning.")
308
+ gr.Markdown("> **olmOCR-7B-0225-preview** is a 7B parameter open large model designed for OCR tasks with robust text extraction, especially in complex document layouts. ")
309
+ gr.Markdown("> **Typhoon-ocr-3b** is a 3B parameter OCR model optimized for efficient and accurate optical character recognition in challenging conditions.")
310
+ gr.Markdown("> **DREX-062225-exp** is an experimental multimodal model emphasizing strong document reading and extraction capabilities combined with vision-language understanding to support detailed document parsing and reasoning tasks.")
311
+ gr.Markdown("> ⚠️ Note: Video inference performance can vary significantly between models.")
312
+
313
  image_submit.click(
314
  fn=generate_image,
315
  inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
 
322
  )
323
 
324
  if __name__ == "__main__":
325
+ demo.queue(max_size=30).launch(share=True, show_error=True)