Spaces:

davanstrien
/

ocr-time-machine

Running on Zero

App Files Files Community

davanstrien HF Staff committed on May 22

Commit

e1b1045

1 Parent(s): bf47208

Refactor OCR model initialization and prediction handling for improved error reporting and message formatting

Browse files

Files changed (1) hide show

app.py +27 -26

app.py CHANGED Viewed

@@ -6,11 +6,20 @@ import torch
 from transformers import AutoProcessor, AutoModelForImageTextToText, pipeline
 import spaces
-# --- Global Model and Processor (initialize as None for lazy loading) ---
 HF_PROCESSOR = None
 HF_MODEL = None
 HF_PIPE = None
-MODEL_LOAD_ERROR_MSG = None # To store any error message from loading
 # --- Helper Functions ---
@@ -59,36 +68,28 @@ def parse_alto_xml_for_text(xml_file_path):
     except Exception as e:
         return f"An unexpected error occurred during XML parsing: {e}"
-@spaces.GPU # Ensures GPU is available for model loading (on first call) and inference
 def predict(pil_image):
-    """Performs OCR prediction using the Hugging Face model, with lazy loading."""
-    global HF_PROCESSOR, HF_MODEL, HF_PIPE, MODEL_LOAD_ERROR_MSG
-    if HF_PIPE is None and MODEL_LOAD_ERROR_MSG is None:
-        try:
-            print("Attempting to load Hugging Face model and processor within @spaces.GPU context...")
-            HF_PROCESSOR = AutoProcessor.from_pretrained("reducto/RolmOCR")
-            HF_MODEL = AutoModelForImageTextToText.from_pretrained(
-                "reducto/RolmOCR",
-                torch_dtype=torch.bfloat16,
-                device_map="auto" # Should utilize ZeroGPU correctly here
-            )
-            HF_PIPE = pipeline("image-text-to-text", model=HF_MODEL, processor=HF_PROCESSOR)
-            print("Hugging Face OCR model loaded successfully.")
-        except Exception as e:
-            MODEL_LOAD_ERROR_MSG = f"Error loading Hugging Face model: {str(e)}"
-            print(MODEL_LOAD_ERROR_MSG)
-            # HF_PIPE remains None, error message is stored
     if HF_PIPE is None:
         error_to_report = MODEL_LOAD_ERROR_MSG if MODEL_LOAD_ERROR_MSG else "OCR model could not be initialized."
         raise RuntimeError(error_to_report)
-    # Proceed with inference if pipe is available
-    return HF_PIPE(
-        pil_image,
-        prompt="Return the plain text representation of this document as if you were reading it naturally.\n",
-    )
 def run_hf_ocr(image_path):
     """

 from transformers import AutoProcessor, AutoModelForImageTextToText, pipeline
 import spaces
+# --- Global Model and Processor ---
 HF_PROCESSOR = None
 HF_MODEL = None
 HF_PIPE = None
+MODEL_LOAD_ERROR_MSG = None
+HF_PROCESSOR = AutoProcessor.from_pretrained("reducto/RolmOCR")
+HF_MODEL = AutoModelForImageTextToText.from_pretrained(
+            "reducto/RolmOCR",
+            torch_dtype=torch.bfloat16,
+            device_map="auto"
+)
+HF_PIPE = pipeline("image-text-to-text", model=HF_MODEL, processor=HF_PROCESSOR)
 # --- Helper Functions ---
     except Exception as e:
         return f"An unexpected error occurred during XML parsing: {e}"
+@spaces.GPU
 def predict(pil_image):
+    """Performs OCR prediction using the Hugging Face model."""
+    global HF_PIPE, MODEL_LOAD_ERROR_MSG
     if HF_PIPE is None:
         error_to_report = MODEL_LOAD_ERROR_MSG if MODEL_LOAD_ERROR_MSG else "OCR model could not be initialized."
         raise RuntimeError(error_to_report)
+    # Format the message in the expected structure
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "image": pil_image},
+                {"type": "text", "text": "Return the plain text representation of this document as if you were reading it naturally.\n"}
+            ]
+        }
+    ]
+    # Use the pipeline with the properly formatted messages
+    return HF_PIPE(messages)
 def run_hf_ocr(image_path):
     """