Files changed (8)
  1. .gitattributes +0 -2
  2. README.md +4 -4
  3. app.py +41 -89
  4. images/0.png +0 -0
  5. images/3.jpg +0 -3
  6. images/4.png +0 -3
  7. images/ocr.png +0 -0
  8. requirements.txt +6 -13
.gitattributes CHANGED
@@ -43,5 +43,3 @@ rolm/2.jpeg filter=lfs diff=lfs merge=lfs -text
  images/1.jpg filter=lfs diff=lfs merge=lfs -text
  videos/1.mp4 filter=lfs diff=lfs merge=lfs -text
  videos/2.mp4 filter=lfs diff=lfs merge=lfs -text
- images/4.png filter=lfs diff=lfs merge=lfs -text
- images/3.jpg filter=lfs diff=lfs merge=lfs -text

README.md CHANGED
@@ -1,14 +1,14 @@
  ---
  title: OCR
  emoji: 🍍
- colorFrom: gray
- colorTo: blue
+ colorFrom: indigo
+ colorTo: gray
  sdk: gradio
- sdk_version: 5.42.0
+ sdk_version: 5.34.0
  app_file: app.py
  pinned: true
  license: apache-2.0
- short_description: olmocr / nanonets ocr / qwen2vl ocr / aya vision / rolmocr
+ short_description: image and video understanding
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -29,23 +29,11 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))

  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

- print("CUDA_VISIBLE_DEVICES=", os.environ.get("CUDA_VISIBLE_DEVICES"))
- print("torch.__version__ =", torch.__version__)
- print("torch.version.cuda =", torch.version.cuda)
- print("cuda available:", torch.cuda.is_available())
- print("cuda device count:", torch.cuda.device_count())
- if torch.cuda.is_available():
-     print("current device:", torch.cuda.current_device())
-     print("device name:", torch.cuda.get_device_name(torch.cuda.current_device()))
- 
- print("Using device:", device)
- 
- # --- Model Loading ---
- # Load Nanonets-OCR-s
- MODEL_ID_V = "nanonets/Nanonets-OCR-s"
- processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
- model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-     MODEL_ID_V,
+ # Load RolmOCR
+ MODEL_ID_M = "reducto/RolmOCR"
+ processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
+ model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+     MODEL_ID_M,
      trust_remote_code=True,
      torch_dtype=torch.float16
  ).to(device).eval()
@@ -59,29 +47,20 @@ model_x = Qwen2VLForConditionalGeneration.from_pretrained(
      torch_dtype=torch.float16
  ).to(device).eval()

- # Load Aya-Vision-8b
- MODEL_ID_A = "CohereForAI/aya-vision-8b"
- processor_a = AutoProcessor.from_pretrained(MODEL_ID_A, trust_remote_code=True)
- model_a = AutoModelForImageTextToText.from_pretrained(
-     MODEL_ID_A,
-     trust_remote_code=True,
-     torch_dtype=torch.float16
- ).to(device).eval()
- 
- # Load olmOCR-7B-0725
- MODEL_ID_W = "allenai/olmOCR-7B-0725"
- processor_w = AutoProcessor.from_pretrained(MODEL_ID_W, trust_remote_code=True)
- model_w = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-     MODEL_ID_W,
+ # Load Nanonets-OCR-s
+ MODEL_ID_V = "nanonets/Nanonets-OCR-s"
+ processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
+ model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+     MODEL_ID_V,
      trust_remote_code=True,
      torch_dtype=torch.float16
  ).to(device).eval()

- # Load RolmOCR
- MODEL_ID_M = "reducto/RolmOCR"
- processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
- model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-     MODEL_ID_M,
+ # Load aya-vision-8b
+ MODEL_ID_A = "CohereForAI/aya-vision-8b"
+ processor_a = AutoProcessor.from_pretrained(MODEL_ID_A, trust_remote_code=True)
+ model_a = AutoModelForImageTextToText.from_pretrained(
+     MODEL_ID_A,
      trust_remote_code=True,
      torch_dtype=torch.float16
  ).to(device).eval()
@@ -116,29 +95,25 @@ def generate_image(model_name: str, text: str, image: Image.Image,
                     repetition_penalty: float = 1.2):
      """
      Generates responses using the selected model for image input.
-     Yields raw text and Markdown-formatted text.
      """
-     if model_name == "RolmOCR-7B":
+     if model_name == "RolmOCR":
          processor = processor_m
          model = model_m
-     elif model_name == "Qwen2-VL-OCR-2B":
+     elif model_name == "Qwen2-VL-OCR-2B-Instruct":
          processor = processor_x
          model = model_x
      elif model_name == "Nanonets-OCR-s":
          processor = processor_v
          model = model_v
-     elif model_name == "Aya-Vision-8B":
+     elif model_name == "Aya-Vision":
          processor = processor_a
          model = model_a
-     elif model_name == "olmOCR-7B-0725":
-         processor = processor_w
-         model = model_w
      else:
-         yield "Invalid model selected.", "Invalid model selected."
+         yield "Invalid model selected."
          return

      if image is None:
-         yield "Please upload an image.", "Please upload an image."
+         yield "Please upload an image."
          return

      messages = [{
@@ -166,7 +141,7 @@ def generate_image(model_name: str, text: str, image: Image.Image,
          buffer += new_text
          buffer = buffer.replace("<|im_end|>", "")
          time.sleep(0.01)
-         yield buffer, buffer
+         yield buffer

  @spaces.GPU
  def generate_video(model_name: str, text: str, video_path: str,
@@ -177,29 +152,25 @@ def generate_video(model_name: str, text: str, video_path: str,
                     repetition_penalty: float = 1.2):
      """
      Generates responses using the selected model for video input.
-     Yields raw text and Markdown-formatted text.
      """
-     if model_name == "RolmOCR-7B":
+     if model_name == "RolmOCR":
          processor = processor_m
          model = model_m
-     elif model_name == "Qwen2-VL-OCR-2B":
+     elif model_name == "Qwen2-VL-OCR-2B-Instruct":
          processor = processor_x
          model = model_x
      elif model_name == "Nanonets-OCR-s":
          processor = processor_v
          model = model_v
-     elif model_name == "Aya-Vision-8B":
+     elif model_name == "Aya-Vision":
          processor = processor_a
          model = model_a
-     elif model_name == "olmOCR-7B-0725":
-         processor = processor_w
-         model = model_w
      else:
-         yield "Invalid model selected.", "Invalid model selected."
+         yield "Invalid model selected."
          return

      if video_path is None:
-         yield "Please upload a video.", "Please upload a video."
+         yield "Please upload a video."
          return

      frames = downsample_video(video_path)
@@ -238,21 +209,17 @@
          buffer += new_text
          buffer = buffer.replace("<|im_end|>", "")
          time.sleep(0.01)
-         yield buffer, buffer
+         yield buffer

  # Define examples for image and video inference
  image_examples = [
-     ["Extract the full page.", "images/ocr.png"],
-     ["Extract the content.", "images/4.png"],
-     ["Explain the scene.", "images/3.jpg"],
-     ["Convert this page to doc [table] precisely for markdown.", "images/0.png"],
      ["Perform OCR on the Image.", "images/1.jpg"],
-     ["Extract the table content.", "images/2.png"]
+     ["Extract the table content", "images/2.png"]
  ]

  video_examples = [
-     ["Explain the Ad in Detail.", "videos/1.mp4"],
-     ["Identify the main actions in the cartoon video.", "videos/2.mp4"]
+     ["Explain the Ad in Detail", "videos/1.mp4"],
+     ["Identify the main actions in the cartoon video", "videos/2.mp4"]
  ]

  css = """
@@ -263,16 +230,11 @@
  .submit-btn:hover {
      background-color: #3498db !important;
  }
- .canvas-output {
-     border: 2px solid #4682B4;
-     border-radius: 10px;
-     padding: 20px;
- }
  """

  # Create the Gradio Interface
  with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
-     gr.Markdown("# **[Multimodal OCR hpc/.](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
+     gr.Markdown("# **Multimodal OCR**")
      with gr.Row():
          with gr.Column():
              with gr.Tabs():
@@ -298,39 +260,29 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
                      top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
                      top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
                      repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
- 
          with gr.Column():
-             with gr.Column(elem_classes="canvas-output"):
-                 gr.Markdown("## Output")
-                 output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=2, show_copy_button=True)
- 
-                 with gr.Accordion("(Result.md)", open=False):
-                     markdown_output = gr.Markdown(label="Formatted Result (Result.Md)")
- 
+             output = gr.Textbox(label="Output", interactive=False, lines=2, scale=2)
              model_choice = gr.Radio(
-                 choices=["olmOCR-7B-0725", "Nanonets-OCR-s", "RolmOCR-7B",
-                          "Aya-Vision-8B", "Qwen2-VL-OCR-2B"],
+                 choices=["Nanonets-OCR-s", "Qwen2-VL-OCR-2B-Instruct", "RolmOCR", "Aya-Vision"],
                  label="Select Model",
-                 value="olmOCR-7B-0725"
+                 value="Nanonets-OCR-s"
              )
-             gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR/discussions)")
+ 
+             gr.Markdown("**Model Info**")
+             gr.Markdown("> [Qwen2-VL-OCR-2B-Instruct](https://huggingface.co/prithivMLmods/Qwen2-VL-OCR-2B-Instruct): qwen2-vl-ocr-2b-instruct model is a fine-tuned version of qwen2-vl-2b-instruct, tailored for tasks that involve [messy] optical character recognition (ocr), image-to-text conversion, and math problem solving with latex formatting.")
              gr.Markdown("> [Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s): nanonets-ocr-s is a powerful, state-of-the-art image-to-markdown ocr model that goes far beyond traditional text extraction. it transforms documents into structured markdown with intelligent content recognition and semantic tagging.")
-             gr.Markdown("> [olmOCR-7B-0725](https://huggingface.co/allenai/olmOCR-7B-0725): olmocr-7b-0725 — fine-tuned with olmocr-mix-0225 on top of Qwen2.5-VL-7B-Instruct, pushing the boundaries of OCR technology. high-quality, openly available approach to parsing pdfs and other complex documents optical character recognition.")
-             gr.Markdown("> [Qwen2-VL-OCR-2B](https://huggingface.co/prithivMLmods/Qwen2-VL-OCR-2B-Instruct): qwen2-vl-ocr-2b-instruct model is a fine-tuned version of qwen2-vl-2b-instruct, tailored for tasks that involve [messy] optical character recognition (ocr), image-to-text conversion, and math problem solving with latex formatting.")
-             gr.Markdown("> [RolmOCR](https://huggingface.co/reducto/RolmOCR): rolmocr, high-quality, openly available approach to parsing pdfs and other complex documents optical character recognition. it is designed to handle a wide range of document types, including scanned documents, handwritten text, and complex layouts.")
+             gr.Markdown("> [RolmOCR](https://huggingface.co/reducto/RolmOCR): rolmocr, high-quality, openly available approach to parsing pdfs and other complex documents oprical character recognition. it is designed to handle a wide range of document types, including scanned documents, handwritten text, and complex layouts.")
              gr.Markdown("> [Aya-Vision](https://huggingface.co/CohereLabs/aya-vision-8b): cohere labs aya vision 8b is an open weights research release of an 8-billion parameter model with advanced capabilities optimized for a variety of vision-language use cases, including ocr, captioning, visual reasoning, summarization, question answering, code, and more.")
- 
-             gr.Markdown("> ⚠️ Note: Models in this space may not perform well on video inference tasks.")
- 
+ 
              image_submit.click(
                  fn=generate_image,
                  inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
-                 outputs=[output, markdown_output]
+                 outputs=output
              )
              video_submit.click(
                  fn=generate_video,
                  inputs=[model_choice, video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
-                 outputs=[output, markdown_output]
+                 outputs=output
              )

  if __name__ == "__main__":
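The app.py changes above do two things: they trim the loaded checkpoints to four (RolmOCR, Qwen2-VL-OCR-2B-Instruct, Nanonets-OCR-s, Aya-Vision, dropping olmOCR-7B-0725) and they collapse the old two-output stream (raw text plus Markdown) into a single string that feeds one gr.Textbox. The streamer setup itself sits outside these hunks, so the sketch below is only an illustration of how the surrounding code most plausibly works, assuming the usual TextIteratorStreamer plus background-thread pattern; the function name and helper structure are illustrative, not the Space's exact code.

```python
# Minimal sketch of the single-stream generation path after this change.
# Assumes the common TextIteratorStreamer + background thread pattern;
# the streamer/thread code is not part of the visible hunks.
import threading

import torch
from PIL import Image
from transformers import (AutoProcessor, Qwen2_5_VLForConditionalGeneration,
                          TextIteratorStreamer)

device = "cuda:0" if torch.cuda.is_available() else "cpu"

MODEL_ID = "reducto/RolmOCR"  # any of the four checkpoints wired up above
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID, trust_remote_code=True, torch_dtype=torch.float16
).to(device).eval()


def stream_ocr(text: str, image: Image.Image, max_new_tokens: int = 1024):
    """Yield one growing string, mirroring the new single-Textbox output."""
    messages = [{
        "role": "user",
        "content": [{"type": "image"}, {"type": "text", "text": text}],
    }]
    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = processor(text=[prompt], images=[image], return_tensors="pt").to(device)

    # Stream decoded tokens from a background generate() call.
    streamer = TextIteratorStreamer(
        processor.tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    thread = threading.Thread(
        target=model.generate,
        kwargs=dict(**inputs, streamer=streamer, max_new_tokens=max_new_tokens),
    )
    thread.start()

    buffer = ""
    for new_text in streamer:      # same accumulation loop as in the diff
        buffer += new_text
        buffer = buffer.replace("<|im_end|>", "")
        yield buffer               # one value now feeds the single gr.Textbox
```

Because both generators now yield a single string, the two .click() handlers can share outputs=output, which is exactly what the last hunk wires up in place of outputs=[output, markdown_output].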
images/0.png DELETED
Binary file (86.1 kB)
 
images/3.jpg DELETED

Git LFS Details

  • SHA256: 510714fb3ee4eaddbd24f4b1f36e75bf13611326c39046674db27095c26132cc
  • Pointer size: 131 Bytes
  • Size of remote file: 224 kB
images/4.png DELETED

Git LFS Details

  • SHA256: 8a5736439eea1647b192e13473f9cde9c3c619dc066297e38dee2cf11fe5779d
  • Pointer size: 131 Bytes
  • Size of remote file: 152 kB
images/ocr.png DELETED
Binary file (39.7 kB)
 
requirements.txt CHANGED
@@ -1,19 +1,12 @@
- git+https://github.com/huggingface/transformers.git
- git+https://github.com/huggingface/accelerate.git
- git+https://github.com/huggingface/peft.git
+ gradio
+ transformers
  transformers-stream-generator
- huggingface_hub
- albumentations
- opencv-python
- sentencepiece
  qwen-vl-utils
- docling-core
- safetensors
  torchvision
- requests
+ torch
+ huggingface_hub
  spaces
- gradio
+ accelerate
  pillow
- gradio
- torch
+ opencv-python
  av
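requirements.txt goes from 19 entries (three of them git installs) to 12 plain PyPI releases. A quick, local-only way to confirm the trimmed list still covers every import the Space uses is an import smoke test; the module names below are the standard ones shipped by each package, and this script is a sanity check, not part of the repo.

```python
# Import smoke test for the trimmed requirements.txt (local sanity check only).
import importlib

packages_to_modules = {
    "gradio": "gradio",
    "transformers": "transformers",
    "transformers-stream-generator": "transformers_stream_generator",
    "qwen-vl-utils": "qwen_vl_utils",
    "torchvision": "torchvision",
    "torch": "torch",
    "huggingface_hub": "huggingface_hub",
    "spaces": "spaces",        # Hugging Face helper that provides the @spaces.GPU decorator
    "accelerate": "accelerate",
    "pillow": "PIL",
    "opencv-python": "cv2",
    "av": "av",
}

for package, module in packages_to_modules.items():
    importlib.import_module(module)  # raises ImportError if the dependency is missing
    print(f"ok: {package} -> {module}")
```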