Spaces:

davanstrien
/

ocr-time-machine

Running on Zero

App Files Files Community

davanstrien HF Staff committed on May 22

Commit

e4442f3

1 Parent(s): 5f3165f

Add requirements.in and update requirements.txt with dependencies

Browse files

Files changed (3) hide show

app.py +138 -70
requirements.in +6 -0
requirements.txt +238 -3

app.py CHANGED Viewed

@@ -1,7 +1,25 @@
 import gradio as gr
-from PIL import Image # ImageDraw, ImageFont are no longer needed for overlay
 import xml.etree.ElementTree as ET
 import os
 # --- Helper Functions ---
@@ -34,16 +52,12 @@ def parse_alto_xml_for_text(xml_file_path):
         tree = ET.parse(xml_file_path)
         root = tree.getroot()
-        # Find all TextLine elements
         for text_line in root.findall(f'.//{ns_prefix}TextLine'):
             line_text_parts = []
             for string_element in text_line.findall(f'{ns_prefix}String'):
                 text = string_element.get('CONTENT')
-                if text: # Ensure text is not None
                     line_text_parts.append(text)
-            # Also consider <SP/> (Space) elements if they contribute to word separation
-            # and are not implicitly handled by joining CONTENT attributes.
-            # For now, just joining CONTENT attributes.
             if line_text_parts:
                 full_text_lines.append(" ".join(line_text_parts))
@@ -54,80 +68,148 @@ def parse_alto_xml_for_text(xml_file_path):
     except Exception as e:
         return f"An unexpected error occurred during XML parsing: {e}"
-# The draw_ocr_on_image function is no longer needed.
 # --- Gradio Interface Function ---
-def process_image_and_xml(image_path, xml_path):
     """
     Main function for the Gradio interface.
-    Processes the image and XML to return the image and extracted text.
     """
-    if image_path is None: # If no image is uploaded at all
-        return None, "Please upload an image."
-    try:
-        img_pil = Image.open(image_path).convert("RGB")
-    except Exception as e:
-        return None, f"Error loading image: {e}"
-    if xml_path is None: # If XML is missing, but image is present
-        return img_pil, "Please upload an OCR XML file."
-    # Both image and XML are presumably present
-    extracted_text = parse_alto_xml_for_text(xml_path)
-    return img_pil, extracted_text
 # --- Create Gradio App ---
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# OCR Viewer (ALTO XML) - Text Extractor")
     gr.Markdown(
-        "Upload an image and its corresponding ALTO OCR XML file. "
-        "The app will display the image and extract/show the plain text."
     )
     with gr.Row():
         with gr.Column(scale=1):
             image_input = gr.File(label="Upload Image (PNG, JPG, etc.)", type="filepath")
-            xml_input = gr.File(label="Upload ALTO XML File (.xml)", type="filepath")
-            # show_overlay_checkbox has been removed
-            submit_button = gr.Button("Process Files", variant="primary")
     with gr.Row():
         with gr.Column(scale=1):
-            output_image_orig = gr.Image(label="Uploaded Image", type="pil", interactive=False)
         with gr.Column(scale=1):
-            output_text = gr.Textbox(label="Extracted Plain Text", lines=15, interactive=False)
-    # output_image_overlay has been removed
-    def update_interface(image_filepath, xml_filepath):
-        # image_filepath and xml_filepath are now strings (paths) or None
-        if image_filepath is None and xml_filepath is None:
-            return None, "Please upload an image and an XML file."
-        # process_image_and_xml handles cases where one is None
-        img, text = process_image_and_xml(image_filepath, xml_filepath)
-        return img, text
     submit_button.click(
-        fn=update_interface,
-        inputs=[image_input, xml_input], # show_overlay_checkbox removed
-        outputs=[output_image_orig, output_text] # output_image_overlay removed
     )
-    # The .change event for show_overlay_checkbox has been removed
     gr.Markdown("---")
     gr.Markdown("### Example ALTO XML Snippet (for `String` element extraction):")
     gr.Code(
-        value="""
-<alto xmlns="http://www.loc.gov/standards/alto/v3/alto.xsd">
   <Description>...</Description>
   <Styles>...</Styles>
   <Layout>
@@ -146,28 +228,14 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
       </PrintSpace>
     </Page>
   </Layout>
-</alto>
-        """,
         interactive=False
     )
 if __name__ == "__main__":
-    try:
-        # Create a dummy image for testing
-        img_test = Image.new('RGB', (2394, 3612), color = 'lightgray') # Dimensions from example XML
-        img_test.save("dummy_image.png")
-        print("Created dummy_image.png for testing.")
-        # Ensure the example XML file (189819724.34.xml) exists in the same directory
-        # or provide the correct path if it's elsewhere.
-        example_xml_filename = "189819724.34.xml"
-        if not os.path.exists(example_xml_filename):
-            print(f"WARNING: Example XML '{example_xml_filename}' not found. Please create it (using the content from the prompt) or upload your own.")
-    except ImportError:
-        print("Pillow not installed, can't create dummy image.")
-    except Exception as e:
-        print(f"Error during setup: {e}")
     demo.launch()

 import gradio as gr
+from PIL import Image
 import xml.etree.ElementTree as ET
 import os
+import torch
+from transformers import AutoProcessor, AutoModelForImageTextToText, pipeline
+# --- Global Model and Processor Initialization ---
+# Load the OCR model and processor once when the app starts
+# Any exception raised while loading is caught so the module still imports;
+# HF_PIPE is then None, which run_hf_ocr() reports as "model not available".
+try:
+    HF_PROCESSOR = AutoProcessor.from_pretrained("reducto/RolmOCR")
+    HF_MODEL = AutoModelForImageTextToText.from_pretrained(
+        "reducto/RolmOCR",
+        torch_dtype=torch.bfloat16,
+        # attn_implementation="flash_attention_2", # User had this commented out
+        device_map="auto"
+    )
+    # Wrap model and processor in the single callable that run_hf_ocr() invokes
+    HF_PIPE = pipeline("image-text-to-text", model=HF_MODEL, processor=HF_PROCESSOR)
+    print("Hugging Face OCR model loaded successfully.")
+except Exception as e:
+    print(f"Error loading Hugging Face model: {e}")
+    # NOTE: HF_PROCESSOR / HF_MODEL may be unbound on this path; only HF_PIPE
+    # is guaranteed to exist afterwards, so callers should test HF_PIPE alone.
+    HF_PIPE = None
 # --- Helper Functions ---
         tree = ET.parse(xml_file_path)
         root = tree.getroot()
         for text_line in root.findall(f'.//{ns_prefix}TextLine'):
             line_text_parts = []
             for string_element in text_line.findall(f'{ns_prefix}String'):
                 text = string_element.get('CONTENT')
+                if text:
                     line_text_parts.append(text)
             if line_text_parts:
                 full_text_lines.append(" ".join(line_text_parts))
     except Exception as e:
         return f"An unexpected error occurred during XML parsing: {e}"
+def run_hf_ocr(image_path):
+    """
+    Runs OCR on the provided image using the pre-loaded Hugging Face model.
+    """
+    if HF_PIPE is None:
+        return "Hugging Face OCR model not available."
+    if image_path is None:
+        return "No image provided for OCR."
+    try:
+        # Load the image using PIL, as the pipeline expects an image object or path
+        pil_image = Image.open(image_path).convert("RGB")
+        # The user's example output for the pipeline call was:
+        # [{'generated_text': [{'role': 'user', ...}, {'role': 'assistant', 'content': "TEXT..."}]}]
+        # This suggests the pipeline is returning a conversational style output.
+        # We will try to call the pipeline with the image and prompt directly.
+        ocr_results = HF_PIPE(
+            pil_image,
+            prompt="Return the plain text representation of this document as if you were reading it naturally.\n"
+            # The pipeline should handle formatting this into messages if needed by the model.
+        )
+        # Parse the output based on the user's example structure
+        if isinstance(ocr_results, list) and ocr_results and 'generated_text' in ocr_results[0]:
+            generated_content = ocr_results[0]['generated_text']
+            # Check if generated_content itself is the direct text (some pipelines do this)
+            if isinstance(generated_content, str):
+                return generated_content
+            # Check for the conversational structure
+            # [{'role': 'user', ...}, {'role': 'assistant', 'content': "TEXT..."}]
+            if isinstance(generated_content, list) and generated_content:
+                # The assistant's response is typically the last message in the list
+                # or specifically the one with role 'assistant'.
+                assistant_message = None
+                for msg in reversed(generated_content): # Check from the end
+                    if isinstance(msg, dict) and msg.get('role') == 'assistant' and 'content' in msg:
+                        assistant_message = msg['content']
+                        break
+                if assistant_message:
+                    return assistant_message
+            # Fallback if parsing the complex structure fails but we got some string
+            if isinstance(generated_content, list) and generated_content and isinstance(generated_content[0], dict) and 'content' in generated_content[0]:
+                 # This is a guess if the structure is simpler than expected.
+                 # Or if the first part is the user prompt echo and second is assistant.
+                 if len(generated_content) > 1 and isinstance(generated_content[1], dict) and 'content' in generated_content[1]:
+                    return generated_content[1]['content'] # Assuming second part is assistant
+            print(f"Unexpected OCR output structure from HF model: {ocr_results}")
+            return "Error: Could not parse OCR model output. Please check console for details."
+        else:
+            print(f"Unexpected OCR output structure from HF model: {ocr_results}")
+            return "Error: OCR model did not return expected output. Please check console for details."
+    except Exception as e:
+        print(f"Error during Hugging Face OCR: {e}")
+        return f"Error during Hugging Face OCR: {str(e)}"
 # --- Gradio Interface Function ---
+def process_files(image_path, xml_path):
     """
     Main function for the Gradio interface.
+    Processes the image for display, runs OCR (Hugging Face model),
+    and parses ALTO XML if provided.
     """
+    img_to_display = None
+    alto_text_output = "ALTO XML not provided or not processed."
+    hf_ocr_text_output = "Image not provided or OCR not run."
+    if image_path:
+        try:
+            img_to_display = Image.open(image_path).convert("RGB")
+            hf_ocr_text_output = run_hf_ocr(image_path)
+        except Exception as e:
+            img_to_display = None # Clear image if it failed to load
+            hf_ocr_text_output = f"Error loading image or running HF OCR: {e}"
+    else:
+        hf_ocr_text_output = "Please upload an image to perform OCR."
+    if xml_path:
+        alto_text_output = parse_alto_xml_for_text(xml_path)
+    else:
+        alto_text_output = "No ALTO XML file uploaded."
+    # If only XML is provided without an image
+    if not image_path and xml_path:
+        img_to_display = None # No image to display
+        hf_ocr_text_output = "Upload an image to perform OCR."
+    return img_to_display, alto_text_output, hf_ocr_text_output
 # --- Create Gradio App ---
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# OCR Viewer and Extractor")
     gr.Markdown(
+        "Upload an image to perform OCR using a Hugging Face model. "
+        "Optionally, upload its corresponding ALTO OCR XML file to compare the extracted text."
     )
     with gr.Row():
         with gr.Column(scale=1):
             image_input = gr.File(label="Upload Image (PNG, JPG, etc.)", type="filepath")
+            xml_input = gr.File(label="Upload ALTO XML File (Optional, .xml)", type="filepath")
+            submit_button = gr.Button("Process Image and XML", variant="primary")
     with gr.Row():
         with gr.Column(scale=1):
+            output_image_display = gr.Image(label="Uploaded Image", type="pil", interactive=False)
         with gr.Column(scale=1):
+            hf_ocr_output_textbox = gr.Textbox(
+                label="OCR Output (Hugging Face Model)",
+                lines=15,
+                interactive=False,
+                show_copy_button=True
+            )
+            alto_xml_output_textbox = gr.Textbox(
+                label="Text from ALTO XML",
+                lines=15,
+                interactive=False,
+                show_copy_button=True
+            )
     submit_button.click(
+        fn=process_files,
+        inputs=[image_input, xml_input],
+        outputs=[output_image_display, alto_xml_output_textbox, hf_ocr_output_textbox]
     )
     gr.Markdown("---")
     gr.Markdown("### Example ALTO XML Snippet (for `String` element extraction):")
     gr.Code(
+        value=(
+"""<alto xmlns="http://www.loc.gov/standards/alto/v3/alto.xsd">
   <Description>...</Description>
   <Styles>...</Styles>
   <Layout>
       </PrintSpace>
     </Page>
   </Layout>
+</alto>"""
+        ),
+        language="xml", # Added language for syntax highlighting
         interactive=False
     )
 if __name__ == "__main__":
+    # Removed dummy file creation as it's less relevant for single file focus
+    print("Attempting to launch Gradio demo...")
+    print("If the Hugging Face model is large, initial startup might take some time due to model download/loading.")
     demo.launch()

requirements.in ADDED Viewed

	@@ -0,0 +1,6 @@

+gradio
+Pillow
+lxml
+torch
+transformers
+spaces

requirements.txt CHANGED Viewed

@@ -1,3 +1,238 @@
-gradio
-Pillow
-lxml

+# This file was autogenerated by uv via the following command:
+#    uv pip compile --python-platform linux --python-version 3.10 requirements.in -o requirements.txt
+aiofiles==24.1.0
+    # via gradio
+annotated-types==0.7.0
+    # via pydantic
+anyio==4.9.0
+    # via
+    #   gradio
+    #   httpx
+    #   starlette
+certifi==2025.4.26
+    # via
+    #   httpcore
+    #   httpx
+    #   requests
+charset-normalizer==3.4.2
+    # via requests
+click==8.1.8
+    # via
+    #   typer
+    #   uvicorn
+exceptiongroup==1.3.0
+    # via anyio
+fastapi==0.115.12
+    # via gradio
+ffmpy==0.5.0
+    # via gradio
+filelock==3.18.0
+    # via
+    #   huggingface-hub
+    #   torch
+    #   transformers
+fsspec==2025.5.0
+    # via
+    #   gradio-client
+    #   huggingface-hub
+    #   torch
+gradio==5.30.0
+    # via
+    #   -r requirements.in
+    #   spaces
+gradio-client==1.10.1
+    # via gradio
+groovy==0.1.2
+    # via gradio
+h11==0.16.0
+    # via
+    #   httpcore
+    #   uvicorn
+httpcore==1.0.9
+    # via httpx
+httpx==0.28.1
+    # via
+    #   gradio
+    #   gradio-client
+    #   safehttpx
+    #   spaces
+huggingface-hub==0.31.4
+    # via
+    #   gradio
+    #   gradio-client
+    #   tokenizers
+    #   transformers
+idna==3.10
+    # via
+    #   anyio
+    #   httpx
+    #   requests
+jinja2==3.1.6
+    # via
+    #   gradio
+    #   torch
+lxml==5.4.0
+    # via -r requirements.in
+markdown-it-py==3.0.0
+    # via rich
+markupsafe==3.0.2
+    # via
+    #   gradio
+    #   jinja2
+mdurl==0.1.2
+    # via markdown-it-py
+mpmath==1.3.0
+    # via sympy
+networkx==3.4.2
+    # via torch
+numpy==2.2.6
+    # via
+    #   gradio
+    #   pandas
+    #   transformers
+nvidia-cublas-cu12==12.4.5.8
+    # via
+    #   nvidia-cudnn-cu12
+    #   nvidia-cusolver-cu12
+    #   torch
+nvidia-cuda-cupti-cu12==12.4.127
+    # via torch
+nvidia-cuda-nvrtc-cu12==12.4.127
+    # via torch
+nvidia-cuda-runtime-cu12==12.4.127
+    # via torch
+nvidia-cudnn-cu12==9.1.0.70
+    # via torch
+nvidia-cufft-cu12==11.2.1.3
+    # via torch
+nvidia-curand-cu12==10.3.5.147
+    # via torch
+nvidia-cusolver-cu12==11.6.1.9
+    # via torch
+nvidia-cusparse-cu12==12.3.1.170
+    # via
+    #   nvidia-cusolver-cu12
+    #   torch
+nvidia-cusparselt-cu12==0.6.2
+    # via torch
+nvidia-nccl-cu12==2.21.5
+    # via torch
+nvidia-nvjitlink-cu12==12.4.127
+    # via
+    #   nvidia-cusolver-cu12
+    #   nvidia-cusparse-cu12
+    #   torch
+nvidia-nvtx-cu12==12.4.127
+    # via torch
+orjson==3.10.18
+    # via gradio
+packaging==25.0
+    # via
+    #   gradio
+    #   gradio-client
+    #   huggingface-hub
+    #   spaces
+    #   transformers
+pandas==2.2.3
+    # via gradio
+pillow==11.2.1
+    # via
+    #   -r requirements.in
+    #   gradio
+psutil==5.9.8
+    # via spaces
+pydantic==2.11.4
+    # via
+    #   fastapi
+    #   gradio
+    #   spaces
+pydantic-core==2.33.2
+    # via pydantic
+pydub==0.25.1
+    # via gradio
+pygments==2.19.1
+    # via rich
+python-dateutil==2.9.0.post0
+    # via pandas
+python-multipart==0.0.20
+    # via gradio
+pytz==2025.2
+    # via pandas
+pyyaml==6.0.2
+    # via
+    #   gradio
+    #   huggingface-hub
+    #   transformers
+regex==2024.11.6
+    # via transformers
+requests==2.32.3
+    # via
+    #   huggingface-hub
+    #   spaces
+    #   transformers
+rich==14.0.0
+    # via typer
+ruff==0.11.10
+    # via gradio
+safehttpx==0.1.6
+    # via gradio
+safetensors==0.5.3
+    # via transformers
+semantic-version==2.10.0
+    # via gradio
+shellingham==1.5.4
+    # via typer
+six==1.17.0
+    # via python-dateutil
+sniffio==1.3.1
+    # via anyio
+spaces==0.36.0
+    # via -r requirements.in
+starlette==0.46.2
+    # via
+    #   fastapi
+    #   gradio
+sympy==1.13.1
+    # via torch
+tokenizers==0.21.1
+    # via transformers
+tomlkit==0.13.2
+    # via gradio
+torch==2.6.0
+    # via -r requirements.in
+tqdm==4.67.1
+    # via
+    #   huggingface-hub
+    #   transformers
+transformers==4.52.2
+    # via -r requirements.in
+triton==3.2.0
+    # via torch
+typer==0.15.4
+    # via gradio
+typing-extensions==4.13.2
+    # via
+    #   anyio
+    #   exceptiongroup
+    #   fastapi
+    #   gradio
+    #   gradio-client
+    #   huggingface-hub
+    #   pydantic
+    #   pydantic-core
+    #   rich
+    #   spaces
+    #   torch
+    #   typer
+    #   typing-inspection
+    #   uvicorn
+typing-inspection==0.4.1
+    # via pydantic
+tzdata==2025.2
+    # via pandas
+urllib3==2.4.0
+    # via requests
+uvicorn==0.34.2
+    # via gradio
+websockets==15.0.1
+    # via gradio-client