Spaces: Running on Zero

Update app.py

app.py (CHANGED)
@@ -15,7 +15,7 @@ import cv2
 
 from transformers import (
     Qwen2_5_VLForConditionalGeneration,
-
+    AutoModelForCausalLM,
     AutoProcessor,
     TextIteratorStreamer,
 )
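The added import is the one functional change in this hunk: every other checkpoint in the file is loaded through Qwen2_5_VLForConditionalGeneration, but A.X-4.0-VL-Light (loaded below as model_t) goes through the generic AutoModelForCausalLM, presumably because it does not map onto a built-in transformers architecture. A minimal sketch of that loading path, using only IDs and arguments visible in this diff:

import torch
from transformers import AutoModelForCausalLM

# trust_remote_code=True lets transformers import the model class that the
# checkpoint repo itself defines, instead of requiring a built-in architecture.
model_t = AutoModelForCausalLM.from_pretrained(
    "skt/A.X-4.0-VL-Light",
    trust_remote_code=True,
    torch_dtype=torch.float16,
).eval()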
@@ -35,25 +35,33 @@ model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID_M, trust_remote_code=True,
     torch_dtype=torch.float16).to(device).eval()
 
-# Load
-MODEL_ID_X = "
+# Load OCRFlux-3B
+MODEL_ID_X = "ChatDOC/OCRFlux-3B"
 processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
 model_x = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID_X, trust_remote_code=True,
     torch_dtype=torch.float16).to(device).eval()
 
-# Load
-MODEL_ID_T = "
+# Load A.X-4.0-VL-Light
+MODEL_ID_T = "skt/A.X-4.0-VL-Light"
 processor_t = AutoProcessor.from_pretrained(MODEL_ID_T, trust_remote_code=True)
-model_t =
+model_t = AutoModelForCausalLM.from_pretrained(
     MODEL_ID_T, trust_remote_code=True,
     torch_dtype=torch.float16).to(device).eval()
 
-# Load
-MODEL_ID_O = "
-
+# Load MonkeyOCR-pro-1.2B
+MODEL_ID_O = "echo840/MonkeyOCR-pro-1.2B"
+SUBFOLDER = "Recognition"
+processor_o = AutoProcessor.from_pretrained(MODEL_ID_O, trust_remote_code=True, subfolder=SUBFOLDER)
 model_o = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    MODEL_ID_O, trust_remote_code=True,
+    MODEL_ID_O, trust_remote_code=True, subfolder=SUBFOLDER,
+    torch_dtype=torch.float16).to(device).eval()
+
+# Load OpenVLThinker-7B-v1.2
+MODEL_ID_A = "ydeng9/OpenVLThinker-7B-v1.2"
+processor_a = AutoProcessor.from_pretrained(MODEL_ID_A, trust_remote_code=True)
+model_a = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    MODEL_ID_A, trust_remote_code=True,
     torch_dtype=torch.float16).to(device).eval()
 
 # Function to downsample video frames
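The hunk ends at the comment introducing the frame-downsampling helper, whose body is unchanged and therefore not shown. For context, a sketch of what such a helper typically looks like in these Spaces; the function name, the frame count of 10, and the (frame, timestamp) return shape are assumptions, not code from this commit:

import cv2
import numpy as np
from PIL import Image

def downsample_video(video_path, num_frames=10):
    # Sample num_frames evenly spaced frames and tag each with its timestamp.
    vidcap = cv2.VideoCapture(video_path)
    total = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = vidcap.get(cv2.CAP_PROP_FPS) or 30.0
    frames = []
    if total > 0:
        for idx in np.linspace(0, total - 1, num_frames, dtype=int):
            vidcap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
            ok, frame = vidcap.read()
            if ok:
                # OpenCV decodes to BGR; convert to RGB before wrapping as PIL.
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frames.append((Image.fromarray(frame), round(idx / fps, 2)))
    vidcap.release()
    return frames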
@@ -94,15 +102,18 @@ def generate_image(model_name: str,
     if model_name == "Camel-Doc-OCR-080125(v2)":
         processor = processor_m
         model = model_m
-    elif model_name == "
+    elif model_name == "OCRFlux-3B":
         processor = processor_x
         model = model_x
-    elif model_name == "
+    elif model_name == "MonkeyOCR-pro-1.2B":
         processor = processor_o
         model = model_o
-    elif model_name == "
+    elif model_name == "A.X-4.0-VL-Light":
         processor = processor_t
         model = model_t
+    elif model_name == "OpenVLThinker-7B":
+        processor = processor_a
+        model = model_a
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
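Each branch above only selects a (processor, model) pair; the generation code that consumes the pair is unchanged and outside the diff. A sketch of the streaming pattern this family of Spaces builds around TextIteratorStreamer, where the message layout and parameter names are assumptions:

import threading
from transformers import TextIteratorStreamer

def stream_response(processor, model, image, query, max_new_tokens=1024):
    messages = [{"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": query},
    ]}]
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=[prompt], images=[image],
                       return_tensors="pt").to(model.device)
    streamer = TextIteratorStreamer(processor.tokenizer, skip_prompt=True,
                                    skip_special_tokens=True)
    # generate() runs on a worker thread; the streamer yields decoded text
    # incrementally so the UI can update as tokens arrive.
    threading.Thread(target=model.generate,
                     kwargs={**inputs, "streamer": streamer,
                             "max_new_tokens": max_new_tokens}).start()
    buffer = ""
    for new_text in streamer:
        buffer += new_text
        yield buffer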
@@ -158,15 +169,18 @@ def generate_video(model_name: str,
     if model_name == "Camel-Doc-OCR-080125(v2)":
         processor = processor_m
         model = model_m
-    elif model_name == "
+    elif model_name == "OCRFlux-3B":
         processor = processor_x
         model = model_x
-    elif model_name == "
+    elif model_name == "MonkeyOCR-pro-1.2B":
         processor = processor_o
         model = model_o
-    elif model_name == "
+    elif model_name == "A.X-4.0-VL-Light":
         processor = processor_t
         model = model_t
+    elif model_name == "OpenVLThinker-7B":
+        processor = processor_a
+        model = model_a
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
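generate_video repeats the same dispatch; the usual difference is only that the sampled frames are interleaved into the user message before the identical streaming call. Roughly, assuming the downsample_video helper sketched earlier:

def build_video_messages(video_path, query):
    # Pair every sampled frame with a timestamp marker so the model can
    # refer back to specific moments in the clip.
    content = [{"type": "text", "text": query}]
    frames = downsample_video(video_path)
    for image, timestamp in frames:
        content.append({"type": "text", "text": f"Frame at {timestamp}s:"})
        content.append({"type": "image"})
    images = [image for image, _ in frames]
    return [{"role": "user", "content": content}], images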
@@ -249,7 +263,7 @@ css = """
 # Create the Gradio Interface
 with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
     gr.Markdown(
-        "# **[Multimodal
+        "# **[Multimodal OCR Outpost](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**"
     )
     with gr.Row():
         with gr.Column():
@@ -312,20 +326,12 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
     #download_btn = gr.Button("Download Result.md")
 
     model_choice = gr.Radio(choices=[
-        "Camel-Doc-OCR-080125(v2)", "
-        "
+        "Camel-Doc-OCR-080125(v2)", "OCRFlux-3B", "OpenVLThinker-7B",
+        "A.X-4.0-VL-Light", "MonkeyOCR-pro-1.2B"
     ],
         label="Select Model",
         value="Camel-Doc-OCR-080125(v2)")
 
-    gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-VLMs-5x/discussions)")
-    gr.Markdown("> [Camel-Doc-OCR-080125(v2)](https://huggingface.co/prithivMLmods/WR30a-Deep-7B-0711): the camel-doc-ocr-080125 model is a fine-tuned version of qwen2.5-vl-7b-instruct, optimized for document retrieval, content extraction, and analysis recognition. built on top of the qwen2.5-vl architecture, this model enhances document comprehension capabilities.")
-    gr.Markdown("> [MonkeyOCR-pro-1.2B](https://huggingface.co/echo840/MonkeyOCR-pro-1.2B): MonkeyOCR adopts a structure-recognition-relation (SRR) triplet paradigm, which simplifies the multi-tool pipeline of modular approaches while avoiding the inefficiency of using large multimodal models for full-page document processing.")
-    gr.Markdown("> [Vision Matters 7B](https://huggingface.co/Yuting6/Vision-Matters-7B): vision-matters is a simple visual perturbation framework that can be easily integrated into existing post-training pipelines including sft, dpo, and grpo. our findings highlight the critical role of visual perturbation: better reasoning begins with better seeing.")
-    gr.Markdown("> [ViGaL 7B](https://huggingface.co/yunfeixie/ViGaL-7B): vigal-7b shows that training a 7b mllm on simple games like snake using reinforcement learning boosts performance on benchmarks like mathvista and mmmu without needing worked solutions or diagrams indicating transferable reasoning skills.")
-    gr.Markdown("> [Visionary-R1](https://huggingface.co/maifoundations/Visionary-R1): visionary-r1 is a novel framework for training visual language models (vlms) to perform robust visual reasoning using reinforcement learning (rl). unlike traditional approaches that rely heavily on (sft) or (cot) annotations, visionary-r1 leverages only visual question-answer pairs and rl, making the process more scalable and accessible.")
-    gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
-
     # Define the submit button actions
     image_submit.click(fn=generate_image,
                        inputs=[
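The last hunk cuts off inside the image_submit.click(...) call, which the commit leaves untouched (on a ZeroGPU Space the generate_image and generate_video handlers would normally also carry a @spaces.GPU decorator, not shown in this diff). A hypothetical completion of that wiring, in which every component name in inputs and outputs is a placeholder rather than code from the commit:

image_submit.click(
    fn=generate_image,
    inputs=[model_choice, image_query, image_upload, max_new_tokens,
            temperature, top_p, top_k, repetition_penalty],
    outputs=[raw_output, formatted_output],
)

if __name__ == "__main__":
    # Queue requests so the generator-based handlers can stream partial
    # results to the client as they are yielded.
    demo.queue(max_size=30).launch(share=True)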