Files changed (8)
  1. .gitattributes +0 -2
  2. README.md +4 -4
  3. app.py +41 -89
  4. images/0.png +0 -0
  5. images/3.jpg +0 -3
  6. images/4.png +0 -3
  7. images/ocr.png +0 -0
  8. requirements.txt +6 -13
.gitattributes CHANGED
@@ -43,5 +43,3 @@ rolm/2.jpeg filter=lfs diff=lfs merge=lfs -text
  images/1.jpg filter=lfs diff=lfs merge=lfs -text
  videos/1.mp4 filter=lfs diff=lfs merge=lfs -text
  videos/2.mp4 filter=lfs diff=lfs merge=lfs -text
- images/4.png filter=lfs diff=lfs merge=lfs -text
- images/3.jpg filter=lfs diff=lfs merge=lfs -text

README.md CHANGED
@@ -1,14 +1,14 @@
  ---
  title: OCR
  emoji: 🍍
- colorFrom: gray
- colorTo: blue
+ colorFrom: indigo
+ colorTo: gray
  sdk: gradio
- sdk_version: 5.42.0
+ sdk_version: 5.34.0
  app_file: app.py
  pinned: true
  license: apache-2.0
- short_description: olmocr / nanonets ocr / qwen2vl ocr / aya vision / rolmocr
+ short_description: image and video understanding
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -29,23 +29,11 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))

  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

- print("CUDA_VISIBLE_DEVICES=", os.environ.get("CUDA_VISIBLE_DEVICES"))
- print("torch.__version__ =", torch.__version__)
- print("torch.version.cuda =", torch.version.cuda)
- print("cuda available:", torch.cuda.is_available())
- print("cuda device count:", torch.cuda.device_count())
- if torch.cuda.is_available():
-     print("current device:", torch.cuda.current_device())
-     print("device name:", torch.cuda.get_device_name(torch.cuda.current_device()))
- 
- print("Using device:", device)
- 
- # --- Model Loading ---
- # Load Nanonets-OCR-s
- MODEL_ID_V = "nanonets/Nanonets-OCR-s"
- processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
- model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-     MODEL_ID_V,
+ # Load RolmOCR
+ MODEL_ID_M = "reducto/RolmOCR"
+ processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
+ model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+     MODEL_ID_M,
      trust_remote_code=True,
      torch_dtype=torch.float16
  ).to(device).eval()
@@ -59,29 +47,20 @@ model_x = Qwen2VLForConditionalGeneration.from_pretrained(
      torch_dtype=torch.float16
  ).to(device).eval()

- # Load Aya-Vision-8b
- MODEL_ID_A = "CohereForAI/aya-vision-8b"
- processor_a = AutoProcessor.from_pretrained(MODEL_ID_A, trust_remote_code=True)
- model_a = AutoModelForImageTextToText.from_pretrained(
-     MODEL_ID_A,
-     trust_remote_code=True,
-     torch_dtype=torch.float16
- ).to(device).eval()
- 
- # Load olmOCR-7B-0725
- MODEL_ID_W = "allenai/olmOCR-7B-0725"
- processor_w = AutoProcessor.from_pretrained(MODEL_ID_W, trust_remote_code=True)
- model_w = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-     MODEL_ID_W,
+ # Load Nanonets-OCR-s
+ MODEL_ID_V = "nanonets/Nanonets-OCR-s"
+ processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
+ model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+     MODEL_ID_V,
      trust_remote_code=True,
      torch_dtype=torch.float16
  ).to(device).eval()

- # Load RolmOCR
- MODEL_ID_M = "reducto/RolmOCR"
- processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
- model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-     MODEL_ID_M,
+ # Load aya-vision-8b
+ MODEL_ID_A = "CohereForAI/aya-vision-8b"
+ processor_a = AutoProcessor.from_pretrained(MODEL_ID_A, trust_remote_code=True)
+ model_a = AutoModelForImageTextToText.from_pretrained(
+     MODEL_ID_A,
      trust_remote_code=True,
      torch_dtype=torch.float16
  ).to(device).eval()
@@ -116,29 +95,25 @@ def generate_image(model_name: str, text: str, image: Image.Image,
                     repetition_penalty: float = 1.2):
      """
      Generates responses using the selected model for image input.
-     Yields raw text and Markdown-formatted text.
      """
-     if model_name == "RolmOCR-7B":
+     if model_name == "RolmOCR":
          processor = processor_m
          model = model_m
-     elif model_name == "Qwen2-VL-OCR-2B":
+     elif model_name == "Qwen2-VL-OCR-2B-Instruct":
          processor = processor_x
          model = model_x
      elif model_name == "Nanonets-OCR-s":
          processor = processor_v
          model = model_v
-     elif model_name == "Aya-Vision-8B":
+     elif model_name == "Aya-Vision":
          processor = processor_a
          model = model_a
-     elif model_name == "olmOCR-7B-0725":
-         processor = processor_w
-         model = model_w
      else:
-         yield "Invalid model selected.", "Invalid model selected."
+         yield "Invalid model selected."
          return

      if image is None:
-         yield "Please upload an image.", "Please upload an image."
+         yield "Please upload an image."
          return

      messages = [{
@@ -166,7 +141,7 @@ def generate_image(model_name: str, text: str, image: Image.Image,
          buffer += new_text
          buffer = buffer.replace("<|im_end|>", "")
          time.sleep(0.01)
-         yield buffer, buffer
+         yield buffer

  @spaces.GPU
  def generate_video(model_name: str, text: str, video_path: str,
@@ -177,29 +152,25 @@ def generate_video(model_name: str, text: str, video_path: str,
                     repetition_penalty: float = 1.2):
      """
      Generates responses using the selected model for video input.
-     Yields raw text and Markdown-formatted text.
      """
-     if model_name == "RolmOCR-7B":
+     if model_name == "RolmOCR":
          processor = processor_m
          model = model_m
-     elif model_name == "Qwen2-VL-OCR-2B":
+     elif model_name == "Qwen2-VL-OCR-2B-Instruct":
          processor = processor_x
          model = model_x
      elif model_name == "Nanonets-OCR-s":
          processor = processor_v
          model = model_v
-     elif model_name == "Aya-Vision-8B":
+     elif model_name == "Aya-Vision":
          processor = processor_a
          model = model_a
-     elif model_name == "olmOCR-7B-0725":
-         processor = processor_w
-         model = model_w
      else:
-         yield "Invalid model selected.", "Invalid model selected."
+         yield "Invalid model selected."
          return

      if video_path is None:
-         yield "Please upload a video.", "Please upload a video."
+         yield "Please upload a video."
          return

      frames = downsample_video(video_path)
@@ -238,21 +209,17 @@
          buffer += new_text
          buffer = buffer.replace("<|im_end|>", "")
          time.sleep(0.01)
-         yield buffer, buffer
+         yield buffer

  # Define examples for image and video inference
  image_examples = [
-     ["Extract the full page.", "images/ocr.png"],
-     ["Extract the content.", "images/4.png"],
-     ["Explain the scene.", "images/3.jpg"],
-     ["Convert this page to doc [table] precisely for markdown.", "images/0.png"],
      ["Perform OCR on the Image.", "images/1.jpg"],
-     ["Extract the table content.", "images/2.png"]
+     ["Extract the table content", "images/2.png"]
  ]

  video_examples = [
-     ["Explain the Ad in Detail.", "videos/1.mp4"],
-     ["Identify the main actions in the cartoon video.", "videos/2.mp4"]
+     ["Explain the Ad in Detail", "videos/1.mp4"],
+     ["Identify the main actions in the cartoon video", "videos/2.mp4"]
  ]

  css = """
@@ -263,16 +230,11 @@
  .submit-btn:hover {
      background-color: #3498db !important;
  }
- .canvas-output {
-     border: 2px solid #4682B4;
-     border-radius: 10px;
-     padding: 20px;
- }
  """

  # Create the Gradio Interface
  with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
-     gr.Markdown("# **[Multimodal OCR hpc/.](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
+     gr.Markdown("# **Multimodal OCR**")
      with gr.Row():
          with gr.Column():
              with gr.Tabs():
@@ -298,39 +260,29 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
                      top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
                      top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
                      repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
- 
          with gr.Column():
-             with gr.Column(elem_classes="canvas-output"):
-                 gr.Markdown("## Output")
-                 output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=2, show_copy_button=True)
- 
-                 with gr.Accordion("(Result.md)", open=False):
-                     markdown_output = gr.Markdown(label="Formatted Result (Result.Md)")
- 
+             output = gr.Textbox(label="Output", interactive=False, lines=2, scale=2)
              model_choice = gr.Radio(
-                 choices=["olmOCR-7B-0725", "Nanonets-OCR-s", "RolmOCR-7B",
-                          "Aya-Vision-8B", "Qwen2-VL-OCR-2B"],
+                 choices=["Nanonets-OCR-s", "Qwen2-VL-OCR-2B-Instruct", "RolmOCR", "Aya-Vision"],
                  label="Select Model",
-                 value="olmOCR-7B-0725"
+                 value="Nanonets-OCR-s"
              )
-             gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR/discussions)")
+ 
+             gr.Markdown("**Model Info**")
+             gr.Markdown("> [Qwen2-VL-OCR-2B-Instruct](https://huggingface.co/prithivMLmods/Qwen2-VL-OCR-2B-Instruct): qwen2-vl-ocr-2b-instruct model is a fine-tuned version of qwen2-vl-2b-instruct, tailored for tasks that involve [messy] optical character recognition (ocr), image-to-text conversion, and math problem solving with latex formatting.")
              gr.Markdown("> [Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s): nanonets-ocr-s is a powerful, state-of-the-art image-to-markdown ocr model that goes far beyond traditional text extraction. it transforms documents into structured markdown with intelligent content recognition and semantic tagging.")
-             gr.Markdown("> [olmOCR-7B-0725](https://huggingface.co/allenai/olmOCR-7B-0725): olmocr-7b-0725 — fine-tuned with olmocr-mix-0225 on top of Qwen2.5-VL-7B-Instruct, pushing the boundaries of OCR technology. high-quality, openly available approach to parsing pdfs and other complex documents optical character recognition.")
-             gr.Markdown("> [Qwen2-VL-OCR-2B](https://huggingface.co/prithivMLmods/Qwen2-VL-OCR-2B-Instruct): qwen2-vl-ocr-2b-instruct model is a fine-tuned version of qwen2-vl-2b-instruct, tailored for tasks that involve [messy] optical character recognition (ocr), image-to-text conversion, and math problem solving with latex formatting.")
-             gr.Markdown("> [RolmOCR](https://huggingface.co/reducto/RolmOCR): rolmocr, high-quality, openly available approach to parsing pdfs and other complex documents optical character recognition. it is designed to handle a wide range of document types, including scanned documents, handwritten text, and complex layouts.")
+             gr.Markdown("> [RolmOCR](https://huggingface.co/reducto/RolmOCR): rolmocr, high-quality, openly available approach to parsing pdfs and other complex documents oprical character recognition. it is designed to handle a wide range of document types, including scanned documents, handwritten text, and complex layouts.")
              gr.Markdown("> [Aya-Vision](https://huggingface.co/CohereLabs/aya-vision-8b): cohere labs aya vision 8b is an open weights research release of an 8-billion parameter model with advanced capabilities optimized for a variety of vision-language use cases, including ocr, captioning, visual reasoning, summarization, question answering, code, and more.")
- 
-             gr.Markdown("> ⚠️ Note: Models in this space may not perform well on video inference tasks.")
- 
+ 
              image_submit.click(
                  fn=generate_image,
                  inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
-                 outputs=[output, markdown_output]
+                 outputs=output
              )
              video_submit.click(
                  fn=generate_video,
                  inputs=[model_choice, video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
-                 outputs=[output, markdown_output]
+                 outputs=output
              )

  if __name__ == "__main__":
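The app.py changes above do two things: they trim the loaded checkpoints to four (RolmOCR, Qwen2-VL-OCR-2B-Instruct, Nanonets-OCR-s, Aya-Vision, dropping olmOCR-7B-0725) and they collapse the old two-output stream (raw text plus Markdown) into a single string that feeds one gr.Textbox. The streamer setup itself sits outside these hunks, so the sketch below is only an illustration of how the surrounding code most plausibly works, assuming the usual TextIteratorStreamer plus background-thread pattern; the function name and helper structure are illustrative, not the Space's exact code.

```python
# Minimal sketch of the single-stream generation path after this change.
# Assumes the common TextIteratorStreamer + background thread pattern;
# the streamer/thread code is not part of the visible hunks.
import threading

import torch
from PIL import Image
from transformers import (AutoProcessor, Qwen2_5_VLForConditionalGeneration,
                          TextIteratorStreamer)

device = "cuda:0" if torch.cuda.is_available() else "cpu"

MODEL_ID = "reducto/RolmOCR"  # any of the four checkpoints wired up above
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID, trust_remote_code=True, torch_dtype=torch.float16
).to(device).eval()


def stream_ocr(text: str, image: Image.Image, max_new_tokens: int = 1024):
    """Yield one growing string, mirroring the new single-Textbox output."""
    messages = [{
        "role": "user",
        "content": [{"type": "image"}, {"type": "text", "text": text}],
    }]
    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = processor(text=[prompt], images=[image], return_tensors="pt").to(device)

    # Stream decoded tokens from a background generate() call.
    streamer = TextIteratorStreamer(
        processor.tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    thread = threading.Thread(
        target=model.generate,
        kwargs=dict(**inputs, streamer=streamer, max_new_tokens=max_new_tokens),
    )
    thread.start()

    buffer = ""
    for new_text in streamer:      # same accumulation loop as in the diff
        buffer += new_text
        buffer = buffer.replace("<|im_end|>", "")
        yield buffer               # one value now feeds the single gr.Textbox
```

Because both generators now yield a single string, the two .click() handlers can share outputs=output, which is exactly what the last hunk wires up in place of outputs=[output, markdown_output].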
images/0.png DELETED
Binary file (86.1 kB)
 
images/3.jpg DELETED

Git LFS Details

  • SHA256: 510714fb3ee4eaddbd24f4b1f36e75bf13611326c39046674db27095c26132cc
  • Pointer size: 131 Bytes
  • Size of remote file: 224 kB
images/4.png DELETED

Git LFS Details

  • SHA256: 8a5736439eea1647b192e13473f9cde9c3c619dc066297e38dee2cf11fe5779d
  • Pointer size: 131 Bytes
  • Size of remote file: 152 kB
images/ocr.png DELETED
Binary file (39.7 kB)
 
requirements.txt CHANGED
@@ -1,19 +1,12 @@
- git+https://github.com/huggingface/transformers.git
- git+https://github.com/huggingface/accelerate.git
- git+https://github.com/huggingface/peft.git
+ gradio
+ transformers
  transformers-stream-generator
- huggingface_hub
- albumentations
- opencv-python
- sentencepiece
  qwen-vl-utils
- docling-core
- safetensors
  torchvision
- requests
+ torch
+ huggingface_hub
  spaces
- gradio
+ accelerate
  pillow
- gradio
- torch
+ opencv-python
  av
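requirements.txt goes from 19 entries (three of them git installs) to 12 plain PyPI releases. A quick, local-only way to confirm the trimmed list still covers every import the Space uses is an import smoke test; the module names below are the standard ones shipped by each package, and this script is a sanity check, not part of the repo.

```python
# Import smoke test for the trimmed requirements.txt (local sanity check only).
import importlib

packages_to_modules = {
    "gradio": "gradio",
    "transformers": "transformers",
    "transformers-stream-generator": "transformers_stream_generator",
    "qwen-vl-utils": "qwen_vl_utils",
    "torchvision": "torchvision",
    "torch": "torch",
    "huggingface_hub": "huggingface_hub",
    "spaces": "spaces",        # Hugging Face helper that provides the @spaces.GPU decorator
    "accelerate": "accelerate",
    "pillow": "PIL",
    "opencv-python": "cv2",
    "av": "av",
}

for package, module in packages_to_modules.items():
    importlib.import_module(module)  # raises ImportError if the dependency is missing
    print(f"ok: {package} -> {module}")
```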