prithivMLmods commited on
Commit
39d517d
·
verified ·
1 Parent(s): 4a26b1a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -27
app.py CHANGED
@@ -15,7 +15,7 @@ import cv2
15
 
16
  from transformers import (
17
  Qwen2_5_VLForConditionalGeneration,
18
- LlavaOnevisionForConditionalGeneration,
19
  AutoProcessor,
20
  TextIteratorStreamer,
21
  )
@@ -35,25 +35,33 @@ model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
35
  MODEL_ID_M, trust_remote_code=True,
36
  torch_dtype=torch.float16).to(device).eval()
37
 
38
- # Load ViGaL-7B
39
- MODEL_ID_X = "yunfeixie/ViGaL-7B"
40
  processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
41
  model_x = Qwen2_5_VLForConditionalGeneration.from_pretrained(
42
  MODEL_ID_X, trust_remote_code=True,
43
  torch_dtype=torch.float16).to(device).eval()
44
 
45
- # Load prithivMLmods/WR30a-Deep-7B-0711
46
- MODEL_ID_T = "NCSOFT/VARCO-VISION-2.0-1.7B-OCR"
47
  processor_t = AutoProcessor.from_pretrained(MODEL_ID_T, trust_remote_code=True)
48
- model_t = LlavaOnevisionForConditionalGeneration.from_pretrained(
49
  MODEL_ID_T, trust_remote_code=True,
50
  torch_dtype=torch.float16).to(device).eval()
51
 
52
- # Load Visionary-R1
53
- MODEL_ID_O = "maifoundations/Visionary-R1"
54
- processor_o = AutoProcessor.from_pretrained(MODEL_ID_O, trust_remote_code=True)
 
55
  model_o = Qwen2_5_VLForConditionalGeneration.from_pretrained(
56
- MODEL_ID_O, trust_remote_code=True,
 
 
 
 
 
 
 
57
  torch_dtype=torch.float16).to(device).eval()
58
 
59
  # Function to downsample video frames
@@ -94,15 +102,18 @@ def generate_image(model_name: str,
94
  if model_name == "Camel-Doc-OCR-080125(v2)":
95
  processor = processor_m
96
  model = model_m
97
- elif model_name == "ViGaL-7B":
98
  processor = processor_x
99
  model = model_x
100
- elif model_name == "Visionary-R1-3B":
101
  processor = processor_o
102
  model = model_o
103
- elif model_name == "Varco-Vision-2.0-OCR":
104
  processor = processor_t
105
  model = model_t
 
 
 
106
  else:
107
  yield "Invalid model selected.", "Invalid model selected."
108
  return
@@ -158,15 +169,18 @@ def generate_video(model_name: str,
158
  if model_name == "Camel-Doc-OCR-080125(v2)":
159
  processor = processor_m
160
  model = model_m
161
- elif model_name == "ViGaL-7B":
162
  processor = processor_x
163
  model = model_x
164
- elif model_name == "Visionary-R1-3B":
165
  processor = processor_o
166
  model = model_o
167
- elif model_name == "Varco-Vision-2.0-OCR":
168
  processor = processor_t
169
  model = model_t
 
 
 
170
  else:
171
  yield "Invalid model selected.", "Invalid model selected."
172
  return
@@ -249,7 +263,7 @@ css = """
249
  # Create the Gradio Interface
250
  with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
251
  gr.Markdown(
252
- "# **[Multimodal VLM OCR3](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**"
253
  )
254
  with gr.Row():
255
  with gr.Column():
@@ -312,20 +326,12 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
312
  #download_btn = gr.Button("Download Result.md")
313
 
314
  model_choice = gr.Radio(choices=[
315
- "Camel-Doc-OCR-080125(v2)", "Varco-Vision-2.0-OCR",
316
- "ViGaL-7B", "Visionary-R1-3B"
317
  ],
318
  label="Select Model",
319
  value="Camel-Doc-OCR-080125(v2)")
320
 
321
- gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-VLMs-5x/discussions)")
322
- gr.Markdown("> [Camel-Doc-OCR-080125(v2)](https://huggingface.co/prithivMLmods/WR30a-Deep-7B-0711): the camel-doc-ocr-080125 model is a fine-tuned version of qwen2.5-vl-7b-instruct, optimized for document retrieval, content extraction, and analysis recognition. built on top of the qwen2.5-vl architecture, this model enhances document comprehension capabilities.")
323
- gr.Markdown("> [MonkeyOCR-pro-1.2B](https://huggingface.co/echo840/MonkeyOCR-pro-1.2B): MonkeyOCR adopts a structure-recognition-relation (SRR) triplet paradigm, which simplifies the multi-tool pipeline of modular approaches while avoiding the inefficiency of using large multimodal models for full-page document processing.")
324
- gr.Markdown("> [Vision Matters 7B](https://huggingface.co/Yuting6/Vision-Matters-7B): vision-matters is a simple visual perturbation framework that can be easily integrated into existing post-training pipelines including sft, dpo, and grpo. our findings highlight the critical role of visual perturbation: better reasoning begins with better seeing.")
325
- gr.Markdown("> [ViGaL 7B](https://huggingface.co/yunfeixie/ViGaL-7B): vigal-7b shows that training a 7b mllm on simple games like snake using reinforcement learning boosts performance on benchmarks like mathvista and mmmu without needing worked solutions or diagrams indicating transferable reasoning skills.")
326
- gr.Markdown("> [Visionary-R1](https://huggingface.co/maifoundations/Visionary-R1): visionary-r1 is a novel framework for training visual language models (vlms) to perform robust visual reasoning using reinforcement learning (rl). unlike traditional approaches that rely heavily on (sft) or (cot) annotations, visionary-r1 leverages only visual question-answer pairs and rl, making the process more scalable and accessible.")
327
- gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
328
-
329
  # Define the submit button actions
330
  image_submit.click(fn=generate_image,
331
  inputs=[
 
15
 
16
  from transformers import (
17
  Qwen2_5_VLForConditionalGeneration,
18
+ AutoModelForCausalLM,
19
  AutoProcessor,
20
  TextIteratorStreamer,
21
  )
 
35
  MODEL_ID_M, trust_remote_code=True,
36
  torch_dtype=torch.float16).to(device).eval()
37
 
38
+ # Load OCRFlux-3B
39
+ MODEL_ID_X = "ChatDOC/OCRFlux-3B"
40
  processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
41
  model_x = Qwen2_5_VLForConditionalGeneration.from_pretrained(
42
  MODEL_ID_X, trust_remote_code=True,
43
  torch_dtype=torch.float16).to(device).eval()
44
 
45
+ # Load A.X-4.0-VL-Light
46
+ MODEL_ID_T = "skt/A.X-4.0-VL-Light"
47
  processor_t = AutoProcessor.from_pretrained(MODEL_ID_T, trust_remote_code=True)
48
+ model_t = AutoModelForCausalLM.from_pretrained(
49
  MODEL_ID_T, trust_remote_code=True,
50
  torch_dtype=torch.float16).to(device).eval()
51
 
52
+ # Load MonkeyOCR-pro-1.2B
53
+ MODEL_ID_O = "echo840/MonkeyOCR-pro-1.2B"
54
+ SUBFOLDER = "Recognition"
55
+ processor_o = AutoProcessor.from_pretrained(MODEL_ID_O, trust_remote_code=True, subfolder=SUBFOLDER)
56
  model_o = Qwen2_5_VLForConditionalGeneration.from_pretrained(
57
+ MODEL_ID_O, trust_remote_code=True, subfolder=SUBFOLDER,
58
+ torch_dtype=torch.float16).to(device).eval()
59
+
60
+ # Load OpenVLThinker-7B-v1.2
61
+ MODEL_ID_A = "ydeng9/OpenVLThinker-7B-v1.2"
62
+ processor_a = AutoProcessor.from_pretrained(MODEL_ID_A, trust_remote_code=True)
63
+ model_a = Qwen2_5_VLForConditionalGeneration.from_pretrained(
64
+ MODEL_ID_A, trust_remote_code=True,
65
  torch_dtype=torch.float16).to(device).eval()
66
 
67
  # Function to downsample video frames
 
102
  if model_name == "Camel-Doc-OCR-080125(v2)":
103
  processor = processor_m
104
  model = model_m
105
+ elif model_name == "OCRFlux-3B":
106
  processor = processor_x
107
  model = model_x
108
+ elif model_name == "MonkeyOCR-pro-1.2B":
109
  processor = processor_o
110
  model = model_o
111
+ elif model_name == "A.X-4.0-VL-Light":
112
  processor = processor_t
113
  model = model_t
114
+ elif model_name == "OpenVLThinker-7B":
115
+ processor = processor_a
116
+ model = model_a
117
  else:
118
  yield "Invalid model selected.", "Invalid model selected."
119
  return
 
169
  if model_name == "Camel-Doc-OCR-080125(v2)":
170
  processor = processor_m
171
  model = model_m
172
+ elif model_name == "OCRFlux-3B":
173
  processor = processor_x
174
  model = model_x
175
+ elif model_name == "MonkeyOCR-pro-1.2B":
176
  processor = processor_o
177
  model = model_o
178
+ elif model_name == "A.X-4.0-VL-Light":
179
  processor = processor_t
180
  model = model_t
181
+ elif model_name == "OpenVLThinker-7B":
182
+ processor = processor_a
183
+ model = model_a
184
  else:
185
  yield "Invalid model selected.", "Invalid model selected."
186
  return
 
263
  # Create the Gradio Interface
264
  with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
265
  gr.Markdown(
266
+ "# **[Multimodal OCR Outpost](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**"
267
  )
268
  with gr.Row():
269
  with gr.Column():
 
326
  #download_btn = gr.Button("Download Result.md")
327
 
328
  model_choice = gr.Radio(choices=[
329
+ "Camel-Doc-OCR-080125(v2)", "OCRFlux-3B", "OpenVLThinker-7B",
330
+ "A.X-4.0-VL-Light", "MonkeyOCR-pro-1.2B"
331
  ],
332
  label="Select Model",
333
  value="Camel-Doc-OCR-080125(v2)")
334
 
 
 
 
 
 
 
 
 
335
  # Define the submit button actions
336
  image_submit.click(fn=generate_image,
337
  inputs=[