Spaces:

davanstrien
/

ocr-time-machine

Running on Zero

App Files Files Community

davanstrien HF Staff committed on May 22

Commit

2c499db

1 Parent(s): 275bb85

Refactor XML parsing functions to support both ALTO and PAGE formats, enhancing error handling and output consistency

Browse files

Files changed (1) hide show

app.py +82 -17

app.py CHANGED Viewed

@@ -23,18 +23,65 @@ HF_PIPE = pipeline("image-text-to-text", model=HF_MODEL, processor=HF_PROCESSOR)
 # --- Helper Functions ---
-def get_alto_namespace(xml_file_path):
     """
-    Dynamically gets the ALTO namespace from the XML file.
     """
     try:
         tree = ET.parse(xml_file_path)
         root = tree.getroot()
         if '}' in root.tag:
-            return root.tag.split('}')[0] + '}'
     except ET.ParseError:
         print(f"Error parsing XML to find namespace: {xml_file_path}")
-    return ''
 def parse_alto_xml_for_text(xml_file_path):
     """
@@ -48,7 +95,7 @@ def parse_alto_xml_for_text(xml_file_path):
         return "Error: XML file not provided or does not exist."
     try:
-        ns_prefix = get_alto_namespace(xml_file_path)
         tree = ET.parse(xml_file_path)
         root = tree.getroot()
@@ -68,6 +115,26 @@ def parse_alto_xml_for_text(xml_file_path):
     except Exception as e:
         return f"An unexpected error occurred during XML parsing: {e}"
 @spaces.GPU
 def predict(pil_image):
     """Performs OCR prediction using the Hugging Face model."""
@@ -148,10 +215,10 @@ def process_files(image_path, xml_path):
     """
     Main function for the Gradio interface.
     Processes the image for display, runs OCR (Hugging Face model),
-    and parses ALTO XML if provided.
     """
     img_to_display = None
-    alto_text_output = "ALTO XML not provided or not processed."
     hf_ocr_text_output = "Image not provided or OCR not run."
     if image_path:
@@ -164,19 +231,17 @@ def process_files(image_path, xml_path):
     else:
         hf_ocr_text_output = "Please upload an image to perform OCR."
     if xml_path:
-        alto_text_output = parse_alto_xml_for_text(xml_path)
     else:
-        alto_text_output = "No ALTO XML file uploaded."
     # If only XML is provided without an image
     if not image_path and xml_path:
         img_to_display = None # No image to display
         hf_ocr_text_output = "Upload an image to perform OCR."
-    return img_to_display, alto_text_output, hf_ocr_text_output
 # --- Create Gradio App ---
@@ -185,13 +250,13 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("# OCR Viewer and Extractor")
     gr.Markdown(
         "Upload an image to perform OCR using a Hugging Face model. "
-        "Optionally, upload its corresponding ALTO OCR XML file to compare the extracted text."
     )
     with gr.Row():
         with gr.Column(scale=1):
             image_input = gr.File(label="Upload Image (PNG, JPG, etc.)", type="filepath")
-            xml_input = gr.File(label="Upload ALTO XML File (Optional, .xml)", type="filepath")
             submit_button = gr.Button("Process Image and XML", variant="primary")
     with gr.Row():
@@ -204,8 +269,8 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
                 interactive=False,
                 show_copy_button=True
             )
-            alto_xml_output_textbox = gr.Textbox(
-                label="Text from ALTO XML",
                 lines=15,
                 interactive=False,
                 show_copy_button=True
@@ -214,7 +279,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     submit_button.click(
         fn=process_files,
         inputs=[image_input, xml_input],
-        outputs=[output_image_display, alto_xml_output_textbox, hf_ocr_output_textbox]
     )
     gr.Markdown("---")

 # --- Helper Functions ---
def get_xml_namespace(xml_file_path):
    """
    Dynamically gets the namespace and the format type from the XML file.

    Only the root element is inspected. A root tag containing ``PcGts`` is
    reported as PAGE; a root tag containing ``alto`` (case-insensitive) is
    reported as ALTO. The format check runs whether or not the root carries
    a namespace, so non-namespaced ALTO/PAGE files are recognised too.

    Args:
        xml_file_path: Path of the XML file to inspect.

    Returns:
        A ``(namespace, format)`` tuple. ``namespace`` is the ``'{uri}'``
        prefix of the root tag, or ``''`` when the root has no namespace or
        the file cannot be parsed. ``format`` is ``'PAGE'``, ``'ALTO'`` or
        ``'UNKNOWN'``.
    """
    try:
        root = ET.parse(xml_file_path).getroot()
    except ET.ParseError:
        print(f"Error parsing XML to find namespace: {xml_file_path}")
        return '', 'UNKNOWN'

    tag = root.tag
    # ElementTree spells qualified names as "{uri}local"; keep the braces so
    # the result can be used directly as a prefix in find()/findall() paths.
    ns = tag.split('}')[0] + '}' if '}' in tag else ''

    # Determine format based on root element
    if 'PcGts' in tag:
        return ns, 'PAGE'
    if 'alto' in tag.lower():
        return ns, 'ALTO'
    return ns, 'UNKNOWN'
def parse_page_xml_for_text(xml_file_path):
    """
    Parses a PAGE XML file to extract text content.

    Each ``TextLine`` contributes one output line. The line-level
    ``TextEquiv/Unicode`` text is used when present and non-empty;
    otherwise the ``TextEquiv/Unicode`` texts of the line's ``Word``
    children are joined with single spaces. Lines with no text are skipped.

    Args:
        xml_file_path: Path of the PAGE XML file.

    Returns:
        full_text (str): All extracted lines joined with newlines, or an
        error message string if the file is missing or cannot be parsed.
    """
    if not xml_file_path or not os.path.exists(xml_file_path):
        return "Error: XML file not provided or does not exist."

    try:
        root = ET.parse(xml_file_path).getroot()
        # Take the namespace from the root we just parsed instead of
        # re-reading the file; "{uri}" is the prefix ElementTree expects.
        ns_prefix = root.tag.split('}')[0] + '}' if '}' in root.tag else ''
        unicode_path = f'{ns_prefix}TextEquiv/{ns_prefix}Unicode'

        full_text_lines = []
        # Find all TextLine elements
        for text_line in root.findall(f'.//{ns_prefix}TextLine'):
            # First try to get text from the line's own TextEquiv/Unicode
            line_unicode = text_line.find(unicode_path)
            if line_unicode is not None and line_unicode.text:
                full_text_lines.append(line_unicode.text)
                continue

            # If no line-level text, fall back to the Word elements
            word_nodes = (
                word.find(unicode_path)
                for word in text_line.findall(f'{ns_prefix}Word')
            )
            line_text_parts = [
                node.text for node in word_nodes
                if node is not None and node.text
            ]
            if line_text_parts:
                full_text_lines.append(" ".join(line_text_parts))

        return "\n".join(full_text_lines)
    except ET.ParseError as e:
        return f"Error parsing XML: {e}"
    except Exception as e:
        # UI boundary: report the failure as text rather than raising.
        return f"An unexpected error occurred during XML parsing: {e}"
 def parse_alto_xml_for_text(xml_file_path):
     """
         return "Error: XML file not provided or does not exist."
     try:
+        ns_prefix, _ = get_xml_namespace(xml_file_path)
         tree = ET.parse(xml_file_path)
         root = tree.getroot()
     except Exception as e:
         return f"An unexpected error occurred during XML parsing: {e}"
def parse_xml_for_text(xml_file_path):
    """
    Main function to parse XML files, automatically detecting the format.

    The root element decides the parser: a root tag containing ``PcGts``
    is handled as PAGE, one containing ``alto`` (case-insensitive) as ALTO.
    The check applies to namespaced and non-namespaced roots alike.

    Args:
        xml_file_path: Path of the ALTO or PAGE XML file.

    Returns:
        The extracted text from the format-specific parser, or an error
        message string when the file is missing, is not well-formed XML,
        or is neither ALTO nor PAGE.
    """
    if not xml_file_path or not os.path.exists(xml_file_path):
        return "Error: XML file not provided or does not exist."

    try:
        # Parse here so malformed XML is reported as a parse error rather
        # than being mistaken for an unsupported format.
        root_tag = ET.parse(xml_file_path).getroot().tag
        if 'PcGts' in root_tag:
            return parse_page_xml_for_text(xml_file_path)
        if 'alto' in root_tag.lower():
            return parse_alto_xml_for_text(xml_file_path)
        return "Error: Unsupported XML format. Expected ALTO or PAGE XML."
    except ET.ParseError as e:
        return f"Error parsing XML: {e}"
    except Exception as e:
        # UI boundary: report the failure as text rather than raising.
        return f"Error determining XML format: {e}"
 @spaces.GPU
 def predict(pil_image):
     """Performs OCR prediction using the Hugging Face model."""
     """
     Main function for the Gradio interface.
     Processes the image for display, runs OCR (Hugging Face model),
+    and parses XML if provided.
     """
     img_to_display = None
+    xml_text_output = "XML not provided or not processed."
     hf_ocr_text_output = "Image not provided or OCR not run."
     if image_path:
     else:
         hf_ocr_text_output = "Please upload an image to perform OCR."
     if xml_path:
+        xml_text_output = parse_xml_for_text(xml_path)
     else:
+        xml_text_output = "No XML file uploaded."
     # If only XML is provided without an image
     if not image_path and xml_path:
         img_to_display = None # No image to display
         hf_ocr_text_output = "Upload an image to perform OCR."
+    return img_to_display, xml_text_output, hf_ocr_text_output
 # --- Create Gradio App ---
     gr.Markdown("# OCR Viewer and Extractor")
     gr.Markdown(
         "Upload an image to perform OCR using a Hugging Face model. "
+        "Optionally, upload its corresponding ALTO or PAGE XML file to compare the extracted text."
     )
     with gr.Row():
         with gr.Column(scale=1):
             image_input = gr.File(label="Upload Image (PNG, JPG, etc.)", type="filepath")
+            xml_input = gr.File(label="Upload XML File (Optional, ALTO or PAGE format)", type="filepath")
             submit_button = gr.Button("Process Image and XML", variant="primary")
     with gr.Row():
                 interactive=False,
                 show_copy_button=True
             )
+            xml_output_textbox = gr.Textbox(
+                label="Text from XML",
                 lines=15,
                 interactive=False,
                 show_copy_button=True
     submit_button.click(
         fn=process_files,
         inputs=[image_input, xml_input],
+        outputs=[output_image_display, xml_output_textbox, hf_ocr_output_textbox]
     )
     gr.Markdown("---")