davanstrien HF Staff committed on
Commit
e1b1045
·
1 Parent(s): bf47208

Refactor OCR model initialization and prediction handling for improved error reporting and message formatting

Browse files
Files changed (1) hide show
  1. app.py +27 -26
app.py CHANGED
@@ -6,11 +6,20 @@ import torch
6
  from transformers import AutoProcessor, AutoModelForImageTextToText, pipeline
7
  import spaces
8
 
9
- # --- Global Model and Processor (initialize as None for lazy loading) ---
10
  HF_PROCESSOR = None
11
  HF_MODEL = None
12
  HF_PIPE = None
13
- MODEL_LOAD_ERROR_MSG = None # To store any error message from loading
 
 
 
 
 
 
 
 
 
14
 
15
  # --- Helper Functions ---
16
 
@@ -59,36 +68,28 @@ def parse_alto_xml_for_text(xml_file_path):
59
  except Exception as e:
60
  return f"An unexpected error occurred during XML parsing: {e}"
61
 
62
- @spaces.GPU # Ensures GPU is available for model loading (on first call) and inference
63
  def predict(pil_image):
64
- """Performs OCR prediction using the Hugging Face model, with lazy loading."""
65
- global HF_PROCESSOR, HF_MODEL, HF_PIPE, MODEL_LOAD_ERROR_MSG
66
-
67
- if HF_PIPE is None and MODEL_LOAD_ERROR_MSG is None:
68
- try:
69
- print("Attempting to load Hugging Face model and processor within @spaces.GPU context...")
70
- HF_PROCESSOR = AutoProcessor.from_pretrained("reducto/RolmOCR")
71
- HF_MODEL = AutoModelForImageTextToText.from_pretrained(
72
- "reducto/RolmOCR",
73
- torch_dtype=torch.bfloat16,
74
- device_map="auto" # Should utilize ZeroGPU correctly here
75
- )
76
- HF_PIPE = pipeline("image-text-to-text", model=HF_MODEL, processor=HF_PROCESSOR)
77
- print("Hugging Face OCR model loaded successfully.")
78
- except Exception as e:
79
- MODEL_LOAD_ERROR_MSG = f"Error loading Hugging Face model: {str(e)}"
80
- print(MODEL_LOAD_ERROR_MSG)
81
- # HF_PIPE remains None, error message is stored
82
 
83
  if HF_PIPE is None:
84
  error_to_report = MODEL_LOAD_ERROR_MSG if MODEL_LOAD_ERROR_MSG else "OCR model could not be initialized."
85
  raise RuntimeError(error_to_report)
86
 
87
- # Proceed with inference if pipe is available
88
- return HF_PIPE(
89
- pil_image,
90
- prompt="Return the plain text representation of this document as if you were reading it naturally.\n",
91
- )
 
 
 
 
 
 
 
 
92
 
93
  def run_hf_ocr(image_path):
94
  """
 
6
  from transformers import AutoProcessor, AutoModelForImageTextToText, pipeline
7
  import spaces
8
 
9
+ # --- Global Model and Processor ---
10
  HF_PROCESSOR = None
11
  HF_MODEL = None
12
  HF_PIPE = None
13
+ MODEL_LOAD_ERROR_MSG = None
14
+
15
+ HF_PROCESSOR = AutoProcessor.from_pretrained("reducto/RolmOCR")
16
+ HF_MODEL = AutoModelForImageTextToText.from_pretrained(
17
+ "reducto/RolmOCR",
18
+ torch_dtype=torch.bfloat16,
19
+ device_map="auto"
20
+ )
21
+ HF_PIPE = pipeline("image-text-to-text", model=HF_MODEL, processor=HF_PROCESSOR)
22
+
23
 
24
  # --- Helper Functions ---
25
 
 
68
  except Exception as e:
69
  return f"An unexpected error occurred during XML parsing: {e}"
70
 
71
+ @spaces.GPU
72
  def predict(pil_image):
73
+ """Performs OCR prediction using the Hugging Face model."""
74
+ global HF_PIPE, MODEL_LOAD_ERROR_MSG
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
  if HF_PIPE is None:
77
  error_to_report = MODEL_LOAD_ERROR_MSG if MODEL_LOAD_ERROR_MSG else "OCR model could not be initialized."
78
  raise RuntimeError(error_to_report)
79
 
80
+ # Format the message in the expected structure
81
+ messages = [
82
+ {
83
+ "role": "user",
84
+ "content": [
85
+ {"type": "image", "image": pil_image},
86
+ {"type": "text", "text": "Return the plain text representation of this document as if you were reading it naturally.\n"}
87
+ ]
88
+ }
89
+ ]
90
+
91
+ # Use the pipeline with the properly formatted messages
92
+ return HF_PIPE(messages)
93
 
94
  def run_hf_ocr(image_path):
95
  """