davanstrien HF Staff committed on
Commit
2c499db
·
1 Parent(s): 275bb85

Refactor XML parsing functions to support both ALTO and PAGE formats, enhancing error handling and output consistency

Browse files
Files changed (1) hide show
  1. app.py +82 -17
app.py CHANGED
@@ -23,18 +23,65 @@ HF_PIPE = pipeline("image-text-to-text", model=HF_MODEL, processor=HF_PROCESSOR)
23
 
24
  # --- Helper Functions ---
25
 
26
- def get_alto_namespace(xml_file_path):
27
  """
28
- Dynamically gets the ALTO namespace from the XML file.
 
29
  """
30
  try:
31
  tree = ET.parse(xml_file_path)
32
  root = tree.getroot()
33
  if '}' in root.tag:
34
- return root.tag.split('}')[0] + '}'
 
 
 
 
 
35
  except ET.ParseError:
36
  print(f"Error parsing XML to find namespace: {xml_file_path}")
37
- return ''
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
  def parse_alto_xml_for_text(xml_file_path):
40
  """
@@ -48,7 +95,7 @@ def parse_alto_xml_for_text(xml_file_path):
48
  return "Error: XML file not provided or does not exist."
49
 
50
  try:
51
- ns_prefix = get_alto_namespace(xml_file_path)
52
  tree = ET.parse(xml_file_path)
53
  root = tree.getroot()
54
 
@@ -68,6 +115,26 @@ def parse_alto_xml_for_text(xml_file_path):
68
  except Exception as e:
69
  return f"An unexpected error occurred during XML parsing: {e}"
70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  @spaces.GPU
72
  def predict(pil_image):
73
  """Performs OCR prediction using the Hugging Face model."""
@@ -148,10 +215,10 @@ def process_files(image_path, xml_path):
148
  """
149
  Main function for the Gradio interface.
150
  Processes the image for display, runs OCR (Hugging Face model),
151
- and parses ALTO XML if provided.
152
  """
153
  img_to_display = None
154
- alto_text_output = "ALTO XML not provided or not processed."
155
  hf_ocr_text_output = "Image not provided or OCR not run."
156
 
157
  if image_path:
@@ -164,19 +231,17 @@ def process_files(image_path, xml_path):
164
  else:
165
  hf_ocr_text_output = "Please upload an image to perform OCR."
166
 
167
-
168
  if xml_path:
169
- alto_text_output = parse_alto_xml_for_text(xml_path)
170
  else:
171
- alto_text_output = "No ALTO XML file uploaded."
172
 
173
  # If only XML is provided without an image
174
  if not image_path and xml_path:
175
  img_to_display = None # No image to display
176
  hf_ocr_text_output = "Upload an image to perform OCR."
177
 
178
-
179
- return img_to_display, alto_text_output, hf_ocr_text_output
180
 
181
 
182
  # --- Create Gradio App ---
@@ -185,13 +250,13 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
185
  gr.Markdown("# OCR Viewer and Extractor")
186
  gr.Markdown(
187
  "Upload an image to perform OCR using a Hugging Face model. "
188
- "Optionally, upload its corresponding ALTO OCR XML file to compare the extracted text."
189
  )
190
 
191
  with gr.Row():
192
  with gr.Column(scale=1):
193
  image_input = gr.File(label="Upload Image (PNG, JPG, etc.)", type="filepath")
194
- xml_input = gr.File(label="Upload ALTO XML File (Optional, .xml)", type="filepath")
195
  submit_button = gr.Button("Process Image and XML", variant="primary")
196
 
197
  with gr.Row():
@@ -204,8 +269,8 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
204
  interactive=False,
205
  show_copy_button=True
206
  )
207
- alto_xml_output_textbox = gr.Textbox(
208
- label="Text from ALTO XML",
209
  lines=15,
210
  interactive=False,
211
  show_copy_button=True
@@ -214,7 +279,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
214
  submit_button.click(
215
  fn=process_files,
216
  inputs=[image_input, xml_input],
217
- outputs=[output_image_display, alto_xml_output_textbox, hf_ocr_output_textbox]
218
  )
219
 
220
  gr.Markdown("---")
 
23
 
24
  # --- Helper Functions ---
25
 
26
def get_xml_namespace(xml_file_path):
    """
    Dynamically gets the namespace from the XML file.
    Returns both the namespace and the format type (ALTO or PAGE).

    Args:
        xml_file_path: Path of the XML file to inspect.

    Returns:
        tuple: ``(namespace, xml_format)``.
            - namespace (str): The root element's namespace in ElementTree's
              ``{uri}`` form, or '' when the root element has no namespace.
            - xml_format (str): 'PAGE' when the root tag contains ``PcGts``,
              'ALTO' when it contains ``alto`` (case-insensitive).
            ``('', 'UNKNOWN')`` is returned when the root matches neither
            format or the file is not well-formed XML.
    """
    try:
        root_tag = ET.parse(xml_file_path).getroot().tag
    except ET.ParseError:
        print(f"Error parsing XML to find namespace: {xml_file_path}")
        return '', 'UNKNOWN'

    # The namespace is optional: files exported without an xmlns declaration
    # must still be recognised, so the format check is not nested under the
    # namespace check.
    ns = root_tag.split('}')[0] + '}' if '}' in root_tag else ''

    # Determine format based on root element
    if 'PcGts' in root_tag:
        return ns, 'PAGE'
    if 'alto' in root_tag.lower():
        return ns, 'ALTO'
    return '', 'UNKNOWN'
44
+
45
def parse_page_xml_for_text(xml_file_path):
    """
    Parses a PAGE XML file to extract text content.

    Each ``TextLine`` contributes one output line. The line-level
    ``TextEquiv/Unicode`` text is used when present and non-empty; otherwise
    the texts of the line's ``Word`` children are joined with single spaces.
    Lines that yield no text are omitted.

    Args:
        xml_file_path: Path of the PAGE XML file.

    Returns:
        - full_text (str): All extracted text, one TextLine per line.
          On failure an error message string is returned instead; this
          function does not raise.
    """
    if not xml_file_path or not os.path.exists(xml_file_path):
        return "Error: XML file not provided or does not exist."

    try:
        # Parse once and read the namespace from the parsed root rather than
        # parsing the file a second time just to discover it.
        root = ET.parse(xml_file_path).getroot()
        ns_prefix = root.tag.split('}')[0] + '}' if '}' in root.tag else ''
        unicode_path = f'{ns_prefix}TextEquiv/{ns_prefix}Unicode'

        full_text_lines = []
        for text_line in root.findall(f'.//{ns_prefix}TextLine'):
            # First try to get text from the line's own TextEquiv/Unicode
            text_equiv = text_line.find(unicode_path)
            if text_equiv is not None and text_equiv.text:
                full_text_lines.append(text_equiv.text)
                continue

            # If no line-level text, fall back to the Word elements
            word_nodes = (
                word.find(unicode_path)
                for word in text_line.findall(f'{ns_prefix}Word')
            )
            line_text_parts = [
                node.text for node in word_nodes
                if node is not None and node.text
            ]
            if line_text_parts:
                full_text_lines.append(" ".join(line_text_parts))

        return "\n".join(full_text_lines)

    except ET.ParseError as e:
        return f"Error parsing XML: {e}"
    except Exception as e:
        # Boundary for the UI: report the problem as text instead of raising.
        return f"An unexpected error occurred during XML parsing: {e}"
85
 
86
  def parse_alto_xml_for_text(xml_file_path):
87
  """
 
95
  return "Error: XML file not provided or does not exist."
96
 
97
  try:
98
+ ns_prefix, _ = get_xml_namespace(xml_file_path)
99
  tree = ET.parse(xml_file_path)
100
  root = tree.getroot()
101
 
 
115
  except Exception as e:
116
  return f"An unexpected error occurred during XML parsing: {e}"
117
 
118
def parse_xml_for_text(xml_file_path):
    """
    Main function to parse XML files, automatically detecting the format.

    The format is taken from get_xml_namespace(); 'PAGE' files are handed to
    parse_page_xml_for_text() and 'ALTO' files to parse_alto_xml_for_text().

    Args:
        xml_file_path: Path of the XML file to parse.

    Returns:
        str: The text returned by the format-specific parser, or an error
        message when the path is missing, the format is neither ALTO nor
        PAGE, or format detection raises.
    """
    if not xml_file_path or not os.path.exists(xml_file_path):
        return "Error: XML file not provided or does not exist."

    try:
        _, xml_format = get_xml_namespace(xml_file_path)

        if xml_format == 'PAGE':
            return parse_page_xml_for_text(xml_file_path)
        if xml_format == 'ALTO':
            return parse_alto_xml_for_text(xml_file_path)
        return "Error: Unsupported XML format. Expected ALTO or PAGE XML."

    except Exception as e:
        # Boundary for the UI: report the problem as text instead of raising.
        return f"Error determining XML format: {e}"
137
+
138
  @spaces.GPU
139
  def predict(pil_image):
140
  """Performs OCR prediction using the Hugging Face model."""
 
215
  """
216
  Main function for the Gradio interface.
217
  Processes the image for display, runs OCR (Hugging Face model),
218
+ and parses XML if provided.
219
  """
220
  img_to_display = None
221
+ xml_text_output = "XML not provided or not processed."
222
  hf_ocr_text_output = "Image not provided or OCR not run."
223
 
224
  if image_path:
 
231
  else:
232
  hf_ocr_text_output = "Please upload an image to perform OCR."
233
 
 
234
  if xml_path:
235
+ xml_text_output = parse_xml_for_text(xml_path)
236
  else:
237
+ xml_text_output = "No XML file uploaded."
238
 
239
  # If only XML is provided without an image
240
  if not image_path and xml_path:
241
  img_to_display = None # No image to display
242
  hf_ocr_text_output = "Upload an image to perform OCR."
243
 
244
+ return img_to_display, xml_text_output, hf_ocr_text_output
 
245
 
246
 
247
  # --- Create Gradio App ---
 
250
  gr.Markdown("# OCR Viewer and Extractor")
251
  gr.Markdown(
252
  "Upload an image to perform OCR using a Hugging Face model. "
253
+ "Optionally, upload its corresponding ALTO or PAGE XML file to compare the extracted text."
254
  )
255
 
256
  with gr.Row():
257
  with gr.Column(scale=1):
258
  image_input = gr.File(label="Upload Image (PNG, JPG, etc.)", type="filepath")
259
+ xml_input = gr.File(label="Upload XML File (Optional, ALTO or PAGE format)", type="filepath")
260
  submit_button = gr.Button("Process Image and XML", variant="primary")
261
 
262
  with gr.Row():
 
269
  interactive=False,
270
  show_copy_button=True
271
  )
272
+ xml_output_textbox = gr.Textbox(
273
+ label="Text from XML",
274
  lines=15,
275
  interactive=False,
276
  show_copy_button=True
 
279
  submit_button.click(
280
  fn=process_files,
281
  inputs=[image_input, xml_input],
282
+ outputs=[output_image_display, xml_output_textbox, hf_ocr_output_textbox]
283
  )
284
 
285
  gr.Markdown("---")