davanstrien HF Staff committed on
Commit
beca8ab
·
1 Parent(s): c7a30f7

parse olmOCR output

Browse files
Files changed (1) hide show
  1. app.py +37 -4
app.py CHANGED
@@ -3,6 +3,7 @@ from PIL import Image
3
  import xml.etree.ElementTree as ET
4
  import os
5
  import torch
 
6
  from transformers import AutoProcessor, AutoModelForImageTextToText, pipeline, Qwen2VLForConditionalGeneration
7
  import spaces
8
 
@@ -259,6 +260,14 @@ def run_hf_ocr(image_path, model_name="RolmOCR"):
259
  generated_content = ocr_results[0]["generated_text"]
260
 
261
  if isinstance(generated_content, str):
 
 
 
 
 
 
 
 
262
  return generated_content
263
 
264
  if isinstance(generated_content, list) and generated_content:
@@ -272,6 +281,14 @@ def run_hf_ocr(image_path, model_name="RolmOCR"):
272
  ),
273
  None,
274
  ):
 
 
 
 
 
 
 
 
275
  return assistant_message
276
 
277
  # Fallback if the specific assistant message structure isn't found but there's content
@@ -284,11 +301,27 @@ def run_hf_ocr(image_path, model_name="RolmOCR"):
284
  and isinstance(generated_content[1], dict)
285
  and "content" in generated_content[1]
286
  ):
287
- return generated_content[1][
288
- "content"
289
- ] # Assuming second part is assistant
 
 
 
 
 
 
 
290
  else:
291
- return generated_content[0]["content"]
 
 
 
 
 
 
 
 
 
292
 
293
  print(f"Unexpected OCR output structure from HF model: {ocr_results}")
294
  return "Error: Could not parse OCR model output. Check console."
 
3
  import xml.etree.ElementTree as ET
4
  import os
5
  import torch
6
+ import json
7
  from transformers import AutoProcessor, AutoModelForImageTextToText, pipeline, Qwen2VLForConditionalGeneration
8
  import spaces
9
 
 
260
  generated_content = ocr_results[0]["generated_text"]
261
 
262
  if isinstance(generated_content, str):
263
+ # Check if it's JSON format from olmOCR
264
+ if model_name == "olmOCR":
265
+ try:
266
+ json_data = json.loads(generated_content)
267
+ if "natural_text" in json_data:
268
+ return json_data["natural_text"]
269
+ except (json.JSONDecodeError, KeyError, TypeError):
270
+ pass
271
  return generated_content
272
 
273
  if isinstance(generated_content, list) and generated_content:
 
281
  ),
282
  None,
283
  ):
284
+ # Check if it's JSON format from olmOCR
285
+ if model_name == "olmOCR":
286
+ try:
287
+ json_data = json.loads(assistant_message)
288
+ if "natural_text" in json_data:
289
+ return json_data["natural_text"]
290
+ except (json.JSONDecodeError, KeyError, TypeError):
291
+ pass
292
  return assistant_message
293
 
294
  # Fallback if the specific assistant message structure isn't found but there's content
 
301
  and isinstance(generated_content[1], dict)
302
  and "content" in generated_content[1]
303
  ):
304
+ content = generated_content[1]["content"]
305
+ # Check if it's JSON format from olmOCR
306
+ if model_name == "olmOCR":
307
+ try:
308
+ json_data = json.loads(content)
309
+ if "natural_text" in json_data:
310
+ return json_data["natural_text"]
311
+ except (json.JSONDecodeError, KeyError, TypeError):
312
+ pass
313
+ return content # Assuming second part is assistant
314
  else:
315
+ content = generated_content[0]["content"]
316
+ # Check if it's JSON format from olmOCR
317
+ if model_name == "olmOCR":
318
+ try:
319
+ json_data = json.loads(content)
320
+ if "natural_text" in json_data:
321
+ return json_data["natural_text"]
322
+ except (json.JSONDecodeError, KeyError, TypeError):
323
+ pass
324
+ return content
325
 
326
  print(f"Unexpected OCR output structure from HF model: {ocr_results}")
327
  return "Error: Could not parse OCR model output. Check console."