prithivMLmods committed
Commit 6583df1 · verified · 1 Parent(s): 3f7b37c

Create app.py

Files changed (1)
app.py +365 -0
app.py ADDED
@@ -0,0 +1,365 @@
+ import os
+ import random
+ import uuid
+ import json
+ import time
+ import asyncio
+ from threading import Thread
+
+ import gradio as gr
+ import spaces
+ import torch
+ import numpy as np
+ from PIL import Image
+ import cv2
+
+ from transformers import (
+     Qwen2_5_VLForConditionalGeneration,
+     AutoProcessor,
+     TextIteratorStreamer,
+ )
+ from transformers.image_utils import load_image
+
+ # Constants for text generation
+ MAX_MAX_NEW_TOKENS = 2048
+ DEFAULT_MAX_NEW_TOKENS = 1024
+ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
+
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
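+
+ # All five checkpoints below are loaded with the same Qwen2_5_VLForConditionalGeneration /
+ # AutoProcessor pair, in float16, onto the device selected above.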
+
+ # Load Vision-Matters-7B
+ MODEL_ID_M = "Yuting6/Vision-Matters-7B"
+ processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
+ model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+     MODEL_ID_M, trust_remote_code=True,
+     torch_dtype=torch.float16).to(device).eval()
+
+ # Load ViGaL-7B
+ MODEL_ID_X = "yunfeixie/ViGaL-7B"
+ processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
+ model_x = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+     MODEL_ID_X, trust_remote_code=True,
+     torch_dtype=torch.float16).to(device).eval()
+
+ # Load prithivMLmods/WR30a-Deep-7B-0711
+ MODEL_ID_T = "prithivMLmods/WR30a-Deep-7B-0711"
+ processor_t = AutoProcessor.from_pretrained(MODEL_ID_T, trust_remote_code=True)
+ model_t = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+     MODEL_ID_T, trust_remote_code=True,
+     torch_dtype=torch.float16).to(device).eval()
+
+ # Load Visionary-R1
+ MODEL_ID_O = "maifoundations/Visionary-R1"
+ processor_o = AutoProcessor.from_pretrained(MODEL_ID_O, trust_remote_code=True)
+ model_o = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+     MODEL_ID_O, trust_remote_code=True,
+     torch_dtype=torch.float16).to(device).eval()
+
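+ # Note: every model is loaded eagerly at startup and kept resident in float16; with
+ # four ~7B checkpoints plus MonkeyOCR below, this assumes a large-memory GPU
+ # (e.g. a ZeroGPU / A100-class allocation).
+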
+ #-----------------------------subfolder-----------------------------#
+ # Load MonkeyOCR-pro-1.2B
+ MODEL_ID_W = "echo840/MonkeyOCR-pro-1.2B"
+ SUBFOLDER = "Recognition"
+ processor_w = AutoProcessor.from_pretrained(MODEL_ID_W, trust_remote_code=True, subfolder=SUBFOLDER)
+ model_w = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+     MODEL_ID_W, trust_remote_code=True,
+     subfolder=SUBFOLDER,
+     torch_dtype=torch.float16).to(device).eval()
+ #-----------------------------subfolder-----------------------------#
+
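+ # The MonkeyOCR repo keeps its recognition model under the "Recognition" subfolder,
+ # so both the processor and the weights are loaded with subfolder=SUBFOLDER.
+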
+ # Function to downsample video frames
+ def downsample_video(video_path):
+     """
+     Downsamples the video to evenly spaced frames.
+     Each frame is returned as a PIL image along with its timestamp.
+     """
+     vidcap = cv2.VideoCapture(video_path)
+     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
+     fps = vidcap.get(cv2.CAP_PROP_FPS)
+     frames = []
+     frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
+     for i in frame_indices:
+         vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
+         success, image = vidcap.read()
+         if success:
+             image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+             pil_image = Image.fromarray(image)
+             timestamp = round(i / fps, 2)
+             frames.append((pil_image, timestamp))
+     vidcap.release()
+     return frames
+
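+ # Note: exactly 10 evenly spaced frames are sampled regardless of clip length, and
+ # timestamps are in seconds, derived from the FPS reported by OpenCV.
+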
+ # Function to generate text responses based on image input
+ @spaces.GPU
+ def generate_image(model_name: str,
+                    text: str,
+                    image: Image.Image,
+                    max_new_tokens: int = 1024,
+                    temperature: float = 0.6,
+                    top_p: float = 0.9,
+                    top_k: int = 50,
+                    repetition_penalty: float = 1.2):
+     """
+     Generates responses using the selected model for image input.
+     """
+     if model_name == "Vision-Matters-7B":
+         processor = processor_m
+         model = model_m
+     elif model_name == "ViGaL-7B":
+         processor = processor_x
+         model = model_x
+     elif model_name == "Visionary-R1-3B":
+         processor = processor_o
+         model = model_o
+     elif model_name == "WR30a-Deep-7B-0711":
+         processor = processor_t
+         model = model_t
+     elif model_name == "MonkeyOCR-pro-1.2B":
+         processor = processor_w
+         model = model_w
+     else:
+         yield "Invalid model selected.", "Invalid model selected."
+         return
+
+     if image is None:
+         yield "Please upload an image.", "Please upload an image."
+         return
+
+     messages = [{
+         "role": "user",
+         "content": [
+             {"type": "image", "image": image},
+             {"type": "text", "text": text},
+         ]
+     }]
+     prompt_full = processor.apply_chat_template(messages,
+                                                 tokenize=False,
+                                                 add_generation_prompt=True)
+     inputs = processor(text=[prompt_full],
+                        images=[image],
+                        return_tensors="pt",
+                        padding=True,
+                        truncation=False,
+                        max_length=MAX_INPUT_TOKEN_LENGTH).to(device)
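+     # Generation runs in a background thread while TextIteratorStreamer yields decoded
+     # text incrementally; the same buffer is yielded twice so it can drive both the
+     # raw-output textbox and the Markdown preview.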
+     streamer = TextIteratorStreamer(processor,
+                                     skip_prompt=True,
+                                     skip_special_tokens=True)
+     generation_kwargs = {
+         **inputs,
+         "streamer": streamer,
+         "max_new_tokens": max_new_tokens,
+         "do_sample": True,
+         "temperature": temperature,
+         "top_p": top_p,
+         "top_k": top_k,
+         "repetition_penalty": repetition_penalty,
+     }
+     thread = Thread(target=model.generate, kwargs=generation_kwargs)
+     thread.start()
+     buffer = ""
+     for new_text in streamer:
+         buffer += new_text
+         time.sleep(0.01)
+         yield buffer, buffer
+
+ # Function to generate text responses based on video input
+ @spaces.GPU
+ def generate_video(model_name: str,
+                    text: str,
+                    video_path: str,
+                    max_new_tokens: int = 1024,
+                    temperature: float = 0.6,
+                    top_p: float = 0.9,
+                    top_k: int = 50,
+                    repetition_penalty: float = 1.2):
+     """
+     Generates responses using the selected model for video input.
+     """
+     if model_name == "Vision-Matters-7B":
+         processor = processor_m
+         model = model_m
+     elif model_name == "ViGaL-7B":
+         processor = processor_x
+         model = model_x
+     elif model_name == "Visionary-R1-3B":
+         processor = processor_o
+         model = model_o
+     elif model_name == "WR30a-Deep-7B-0711":
+         processor = processor_t
+         model = model_t
+     elif model_name == "MonkeyOCR-pro-1.2B":
+         processor = processor_w
+         model = model_w
+     else:
+         yield "Invalid model selected.", "Invalid model selected."
+         return
+
+     if video_path is None:
+         yield "Please upload a video.", "Please upload a video."
+         return
+
+     frames = downsample_video(video_path)
+     messages = [{
+         "role": "system",
+         "content": [{"type": "text", "text": "You are a helpful assistant."}]
+     }, {
+         "role": "user",
+         "content": [{"type": "text", "text": text}]
+     }]
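+     # Each sampled frame is appended to the user turn as an image preceded by a
+     # "Frame {timestamp}:" text marker, so the model receives the frames as an
+     # interleaved text/image sequence in temporal order.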
+     for frame in frames:
+         image, timestamp = frame
+         messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
+         messages[1]["content"].append({"type": "image", "image": image})
+     inputs = processor.apply_chat_template(
+         messages,
+         tokenize=True,
+         add_generation_prompt=True,
+         return_dict=True,
+         return_tensors="pt",
+         truncation=False,
+         max_length=MAX_INPUT_TOKEN_LENGTH).to(device)
+     streamer = TextIteratorStreamer(processor,
+                                     skip_prompt=True,
+                                     skip_special_tokens=True)
+     generation_kwargs = {
+         **inputs,
+         "streamer": streamer,
+         "max_new_tokens": max_new_tokens,
+         "do_sample": True,
+         "temperature": temperature,
+         "top_p": top_p,
+         "top_k": top_k,
+         "repetition_penalty": repetition_penalty,
+     }
+     thread = Thread(target=model.generate, kwargs=generation_kwargs)
+     thread.start()
+     buffer = ""
+     for new_text in streamer:
+         buffer += new_text
+         buffer = buffer.replace("<|im_end|>", "")
+         time.sleep(0.01)
+         yield buffer, buffer
+
+ # Define examples for image and video inference
+ image_examples = [
+     ["Extract the content.", "images/7.png"],
+     ["Solve the problem to find the value.", "images/1.jpg"],
+     ["Explain the scene.", "images/6.JPG"],
+     ["Solve the problem step by step.", "images/2.jpg"],
+     ["Find the value of 'X'.", "images/3.jpg"],
+     ["Simplify the expression.", "images/4.jpg"],
+     ["Solve for the value.", "images/5.png"]
+ ]
+
+ video_examples = [
+     ["Explain the video in detail.", "videos/1.mp4"],
+     ["Explain the video in detail.", "videos/2.mp4"]
+ ]
+
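+ # The example prompts reference asset files that are expected to exist under images/
+ # and videos/ in the Space repository.
+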
+ # CSS
+ css = """
+ .submit-btn {
+     background-color: #2980b9 !important;
+     color: white !important;
+ }
+ .submit-btn:hover {
+     background-color: #3498db !important;
+ }
+ .canvas-output {
+     border: 2px solid #4682B4;
+     border-radius: 10px;
+     padding: 20px;
+ }
+ """
+
+ # Create the Gradio Interface
+ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
+     gr.Markdown(
+         "# **[Multimodal VLMs [OCR | VQA]](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**"
+     )
+     with gr.Row():
+         with gr.Column():
+             with gr.Tabs():
+                 with gr.TabItem("Image Inference"):
+                     image_query = gr.Textbox(
+                         label="Query Input",
+                         placeholder="Enter your query here...")
+                     image_upload = gr.Image(type="pil", label="Image")
+                     image_submit = gr.Button("Submit",
+                                              elem_classes="submit-btn")
+                     gr.Examples(examples=image_examples,
+                                 inputs=[image_query, image_upload])
+                 with gr.TabItem("Video Inference"):
+                     video_query = gr.Textbox(
+                         label="Query Input",
+                         placeholder="Enter your query here...")
+                     video_upload = gr.Video(label="Video")
+                     video_submit = gr.Button("Submit",
+                                              elem_classes="submit-btn")
+                     gr.Examples(examples=video_examples,
+                                 inputs=[video_query, video_upload])
+
+             with gr.Accordion("Advanced options", open=False):
+                 max_new_tokens = gr.Slider(label="Max new tokens",
+                                            minimum=1,
+                                            maximum=MAX_MAX_NEW_TOKENS,
+                                            step=1,
+                                            value=DEFAULT_MAX_NEW_TOKENS)
+                 temperature = gr.Slider(label="Temperature",
+                                         minimum=0.1,
+                                         maximum=4.0,
+                                         step=0.1,
+                                         value=0.6)
+                 top_p = gr.Slider(label="Top-p (nucleus sampling)",
+                                   minimum=0.05,
+                                   maximum=1.0,
+                                   step=0.05,
+                                   value=0.9)
+                 top_k = gr.Slider(label="Top-k",
+                                   minimum=1,
+                                   maximum=1000,
+                                   step=1,
+                                   value=50)
+                 repetition_penalty = gr.Slider(label="Repetition penalty",
+                                                minimum=1.0,
+                                                maximum=2.0,
+                                                step=0.05,
+                                                value=1.2)
+
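+         # Right-hand column: streamed raw output, a rendered Markdown view, and the model selector.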
+         with gr.Column():
+             with gr.Column(elem_classes="canvas-output"):
+                 gr.Markdown("## Output")
+                 output = gr.Textbox(label="Raw Output Stream",
+                                     interactive=False,
+                                     lines=2, show_copy_button=True)
+                 with gr.Accordion("(Result.md)", open=False):
+                     markdown_output = gr.Markdown(
+                         label="markup.md")
+             #download_btn = gr.Button("Download Result.md")
+
+             model_choice = gr.Radio(
+                 choices=[
+                     "Vision-Matters-7B", "WR30a-Deep-7B-0711",
+                     "ViGaL-7B", "MonkeyOCR-pro-1.2B", "Visionary-R1-3B"
+                 ],
+                 label="Select Model",
+                 value="Vision-Matters-7B")
+
+             gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-VLMs-5x/discussions)")
+             gr.Markdown("> [WR30a-Deep-7B-0711](https://huggingface.co/prithivMLmods/WR30a-Deep-7B-0711): WR30a-Deep-7B-0711 is a fine-tuned version of Qwen2.5-VL-7B-Instruct, optimized for image captioning, visual analysis, and image reasoning. Built on the Qwen2.5-VL architecture, this experimental model strengthens visual comprehension through focused training on 1.5M image pairs.")
+             gr.Markdown("> [MonkeyOCR-pro-1.2B](https://huggingface.co/echo840/MonkeyOCR-pro-1.2B): MonkeyOCR adopts a structure-recognition-relation (SRR) triplet paradigm, which simplifies the multi-tool pipeline of modular approaches while avoiding the inefficiency of using large multimodal models for full-page document processing.")
+             gr.Markdown("> [Vision-Matters-7B](https://huggingface.co/Yuting6/Vision-Matters-7B): Vision-Matters is a simple visual perturbation framework that can be easily integrated into existing post-training pipelines, including SFT, DPO, and GRPO. The authors' findings highlight the critical role of visual perturbation: better reasoning begins with better seeing.")
+             gr.Markdown("> [ViGaL-7B](https://huggingface.co/yunfeixie/ViGaL-7B): ViGaL-7B shows that training a 7B MLLM on simple games such as Snake with reinforcement learning boosts performance on benchmarks like MathVista and MMMU without needing worked solutions or diagrams, indicating transferable reasoning skills.")
+             gr.Markdown("> [Visionary-R1](https://huggingface.co/maifoundations/Visionary-R1): Visionary-R1 is a framework for training visual language models (VLMs) to perform robust visual reasoning with reinforcement learning (RL). Unlike approaches that rely heavily on supervised fine-tuning (SFT) or chain-of-thought (CoT) annotations, Visionary-R1 uses only visual question-answer pairs and RL, making the process more scalable and accessible.")
+             gr.Markdown("> ⚠️ Note: the models in this Space are not guaranteed to perform well on video inference use cases.")
+
+     # Define the submit button actions
+     image_submit.click(fn=generate_image,
+                        inputs=[
+                            model_choice, image_query, image_upload,
+                            max_new_tokens, temperature, top_p, top_k,
+                            repetition_penalty
+                        ],
+                        outputs=[output, markdown_output])
+     video_submit.click(fn=generate_video,
+                        inputs=[
+                            model_choice, video_query, video_upload,
+                            max_new_tokens, temperature, top_p, top_k,
+                            repetition_penalty
+                        ],
+                        outputs=[output, markdown_output])
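+     # Both click handlers are generator functions, so each yielded buffer streams into
+     # the raw textbox and the Markdown view at the same time.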
+
+ if __name__ == "__main__":
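+     # Note: mcp_server=True and ssr_mode are launch options available in recent Gradio
+     # releases, and queue(max_size=30) caps how many requests can wait in the queue.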
+     demo.queue(max_size=30).launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)