Spaces:

Svngoku
/

mistral-ocr-demo

Running

App Files Community

Svngoku committed on Mar 6

Commit

86ba735

verified ·

1 Parent(s): 167a8e7

Update app.py

Browse files

Files changed (1) hide show

app.py +85 -146

app.py CHANGED Viewed

@@ -1,26 +1,20 @@
 import os
 import base64
 import gradio as gr
-from mistralai import Mistral, DocumentURLChunk, ImageURLChunk, TextChunk
 from mistralai.models import OCRResponse
-from pathlib import Path
-import pycountry
-import json
-import logging
-from tenacity import retry, stop_after_attempt, wait_exponential
-import tempfile
-from typing import Union, Dict, List, Optional, Tuple
-from contextlib import contextmanager
 import requests
 import shutil
-from concurrent.futures import ThreadPoolExecutor
 import time
 # Constants
-DEFAULT_LANGUAGE = "English"
 SUPPORTED_IMAGE_TYPES = [".jpg", ".png", ".jpeg"]
 SUPPORTED_PDF_TYPES = [".pdf"]
-TEMP_FILE_EXPIRY = 7200  # 2 hours in seconds
 UPLOAD_FOLDER = "./uploads"
 MAX_FILE_SIZE = 50 * 1024 * 1024  # 50MB
 MAX_PDF_PAGES = 50
@@ -36,15 +30,10 @@ logger = logging.getLogger(__name__)
 class OCRProcessor:
     def __init__(self, api_key: str):
-        self.api_key = self._validate_api_key(api_key)
-        self.client = Mistral(api_key=self.api_key)
-        self._validate_client()
-    @staticmethod
-    def _validate_api_key(api_key: str) -> str:
         if not api_key or not isinstance(api_key, str):
             raise ValueError("Valid API key must be provided")
-        return api_key
     def _validate_client(self) -> None:
         try:
@@ -60,21 +49,12 @@ class OCRProcessor:
             size = os.path.getsize(file_input)
         elif hasattr(file_input, 'read'):
             size = len(file_input.read())
-            file_input.seek(0)  # Reset file pointer
         else:
             size = len(file_input)
         if size > MAX_FILE_SIZE:
             raise ValueError(f"File size exceeds {MAX_FILE_SIZE/1024/1024}MB limit")
-    @staticmethod
-    def _encode_image(image_path: str) -> Optional[str]:
-        try:
-            with open(image_path, "rb") as image_file:
-                return base64.b64encode(image_file.read()).decode('utf-8')
-        except Exception as e:
-            logger.error(f"Error encoding image {image_path}: {str(e)}")
-            return None
     @staticmethod
     def _save_uploaded_file(file_input: Union[str, bytes], filename: str) -> str:
         clean_filename = os.path.basename(filename).replace(os.sep, "_")
@@ -102,7 +82,16 @@ class OCRProcessor:
             raise
     @staticmethod
-    def _pdf_to_images(pdf_path: str) -> List[str]:
         try:
             pdf_document = fitz.open(pdf_path)
             if pdf_document.page_count > MAX_PDF_PAGES:
@@ -110,42 +99,40 @@ class OCRProcessor:
                 raise ValueError(f"PDF exceeds maximum page limit of {MAX_PDF_PAGES}")
             with ThreadPoolExecutor() as executor:
-                image_paths = list(executor.map(
                     lambda i: OCRProcessor._convert_page(pdf_path, i),
                     range(pdf_document.page_count)
                 ))
             pdf_document.close()
-            return [path for path in image_paths if path]
         except Exception as e:
             logger.error(f"Error converting PDF to images: {str(e)}")
             return []
     @staticmethod
-    def _convert_page(pdf_path: str, page_num: int) -> Optional[str]:
         try:
             pdf_document = fitz.open(pdf_path)
             page = pdf_document[page_num]
             pix = page.get_pixmap(dpi=150)
             image_path = os.path.join(UPLOAD_FOLDER, f"page_{page_num + 1}_{int(time.time())}.png")
             pix.save(image_path)
             pdf_document.close()
-            return image_path
         except Exception as e:
             logger.error(f"Error converting page {page_num}: {str(e)}")
-            return None
     @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
-    def _call_ocr_api(self, document: Union[DocumentURLChunk, ImageURLChunk]) -> OCRResponse:
         return self.client.ocr.process(
             model="mistral-ocr-latest",
-            document=document,
             include_image_base64=True
         )
-    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
-    def _call_chat_complete(self, model: str, messages: List[Dict], **kwargs) -> Dict:
-        return self.client.chat.complete(model=model, messages=messages, **kwargs)
     def ocr_uploaded_pdf(self, pdf_file: Union[str, bytes]) -> Tuple[str, List[str]]:
         file_name = getattr(pdf_file, 'name', f"pdf_{int(time.time())}.pdf")
         logger.info(f"Processing uploaded PDF: {file_name}")
@@ -157,18 +144,45 @@ class OCRProcessor:
             if not os.path.exists(pdf_path):
                 raise FileNotFoundError(f"Saved PDF not found at: {pdf_path}")
-            image_paths = self._pdf_to_images(pdf_path)
-            with open(pdf_path, "rb") as f:
-                uploaded_file = self.client.files.upload(
-                    file={"file_name": file_name, "content": f},
-                    purpose="ocr"
-                )
-            signed_url = self.client.files.get_signed_url(file_id=uploaded_file.id, expiry=TEMP_FILE_EXPIRY)
-            response = self._call_ocr_api(DocumentURLChunk(document_url=signed_url.url))
-            return self._get_combined_markdown(response), image_paths
         except Exception as e:
-            return self._handle_error("PDF processing", e), []
     def ocr_uploaded_image(self, image_file: Union[str, bytes]) -> Tuple[str, str]:
         file_name = getattr(image_file, 'name', f"image_{int(time.time())}.jpg")
@@ -177,61 +191,11 @@ class OCRProcessor:
             self._check_file_size(image_file)
             image_path = self._save_uploaded_file(image_file, file_name)
             encoded_image = self._encode_image(image_path)
-            if not encoded_image:
-                raise ValueError("Failed to encode image")
-            base64_url = f"data:image/jpeg;base64,{encoded_image}"
-            response = self._call_ocr_api(ImageURLChunk(image_url=base64_url))
             return self._get_combined_markdown(response), image_path
         except Exception as e:
             return self._handle_error("image processing", e), None
-    def document_understanding(self, doc_url: str, question: str) -> str:
-        try:
-            messages = [{"role": "user", "content": [
-                TextChunk(text=question),
-                DocumentURLChunk(document_url=doc_url)
-            ]}]
-            response = self._call_chat_complete(
-                model="mistral-small-latest",
-                messages=messages,
-                temperature=0.1
-            )
-            return response.choices[0].message.content
-        except Exception as e:
-            return self._handle_error("document understanding", e)
-    def structured_ocr(self, image_file: Union[str, bytes]) -> Tuple[str, str]:
-        file_name = getattr(image_file, 'name', f"image_{int(time.time())}.jpg")
-        try:
-            self._check_file_size(image_file)
-            image_path = self._save_uploaded_file(image_file, file_name)
-            encoded_image = self._encode_image(image_path)
-            if not encoded_image:
-                raise ValueError("Failed to encode image")
-            base64_url = f"data:image/jpeg;base64,{encoded_image}"
-            ocr_response = self._call_ocr_api(ImageURLChunk(image_url=base64_url))
-            markdown = self._get_combined_markdown(ocr_response)
-            chat_response = self._call_chat_complete(
-                model="pixtral-12b-latest",
-                messages=[{
-                    "role": "user",
-                    "content": [
-                        ImageURLChunk(image_url=base64_url),
-                        TextChunk(text=(
-                            f"This is image's OCR in markdown:\n<BEGIN_IMAGE_OCR>\n{markdown}\n<END_IMAGE_OCR>.\n"
-                            "Convert this into a structured JSON response with file_name, topics, languages, and ocr_contents fields"
-                        ))
-                    ]
-                }],
-                response_format={"type": "json_object"},
-                temperature=0.1
-            )
-            return self._format_structured_response(image_path, json.loads(chat_response.choices[0].message.content)), image_path
-        except Exception as e:
-            return self._handle_error("structured OCR", e), None
     @staticmethod
     def _get_combined_markdown(response: OCRResponse) -> str:
         return "\n\n".join(
@@ -244,20 +208,6 @@ class OCRProcessor:
         logger.error(f"Error in {context}: {str(error)}")
         return f"**Error in {context}:** {str(error)}"
-    @staticmethod
-    def _format_structured_response(file_path: str, content: Dict) -> str:
-        languages = {lang.alpha_2: lang.name for lang in pycountry.languages if hasattr(lang, 'alpha_2')}
-        content_languages = content.get("languages", [DEFAULT_LANGUAGE])
-        valid_langs = [l for l in content_languages if l in languages.values()] or [DEFAULT_LANGUAGE]
-        response = {
-            "file_name": Path(file_path).name,
-            "topics": content.get("topics", []),
-            "languages": valid_langs,
-            "ocr_contents": content.get("ocr_contents", {})
-        }
-        return f"```json\n{json.dumps(response, indent=2, ensure_ascii=False)}\n```"
 def create_interface():
     css = """
     .output-markdown {font-size: 14px; max-height: 500px; overflow-y: auto;}
@@ -265,7 +215,7 @@ def create_interface():
     """
     with gr.Blocks(title="Mistral OCR App", css=css) as demo:
-        gr.Markdown("# Mistral OCR App\nUpload images or PDFs for OCR processing")
         with gr.Row():
             api_key = gr.Textbox(label="Mistral API Key", type="password", placeholder="Enter your API key")
@@ -310,45 +260,34 @@ def create_interface():
         with gr.Tab("PDF OCR"):
             with gr.Row():
-                pdf_input = gr.File(
-                    label=f"Upload PDF (max {MAX_FILE_SIZE/1024/1024}MB, {MAX_PDF_PAGES} pages)",
-                    file_types=SUPPORTED_PDF_TYPES
-                )
                 pdf_gallery = gr.Gallery(label="PDF Pages", height=300)
             pdf_output = gr.Markdown(label="OCR Result", elem_classes="output-markdown")
             process_pdf_btn = gr.Button("Process PDF", variant="primary")
-            def process_pdf(processor, pdf):
-                if not processor or not pdf:
-                    return "Please set API key and upload a PDF", []
-                return processor.ocr_uploaded_pdf(pdf)
             process_pdf_btn.click(
                 fn=process_pdf,
-                inputs=[processor_state, pdf_input],
                 outputs=[pdf_output, pdf_gallery]
             )
-        with gr.Tab("Structured OCR"):
-            structured_input = gr.File(
-                label=f"Upload Image for Structured OCR (max {MAX_FILE_SIZE/1024/1024}MB)",
-                file_types=SUPPORTED_IMAGE_TYPES
-            )
-            structured_output = gr.Markdown(label="Structured Result", elem_classes="output-markdown")
-            structured_preview = gr.Image(label="Preview", height=300)
-            process_structured_btn = gr.Button("Process Structured OCR", variant="primary")
-            def process_structured(processor, image):
-                if not processor or not image:
-                    return "Please set API key and upload an image", None
-                return processor.structured_ocr(image)
-            process_structured_btn.click(
-                fn=process_structured,
-                inputs=[processor_state, structured_input],
-                outputs=[structured_output, structured_preview]
-            )
     return demo
 if __name__ == "__main__":

 import os
 import base64
 import gradio as gr
+from mistralai import Mistral, ImageURLChunk
 from mistralai.models import OCRResponse
+from typing import Union, List, Tuple
 import requests
 import shutil
 import time
+import pymupdf as fitz
+import logging
+from tenacity import retry, stop_after_attempt, wait_exponential
+from concurrent.futures import ThreadPoolExecutor
 # Constants
 SUPPORTED_IMAGE_TYPES = [".jpg", ".png", ".jpeg"]
 SUPPORTED_PDF_TYPES = [".pdf"]
 UPLOAD_FOLDER = "./uploads"
 MAX_FILE_SIZE = 50 * 1024 * 1024  # 50MB
 MAX_PDF_PAGES = 50
 class OCRProcessor:
     def __init__(self, api_key: str):
         if not api_key or not isinstance(api_key, str):
             raise ValueError("Valid API key must be provided")
+        self.client = Mistral(api_key=api_key)
+        self._validate_client()
     def _validate_client(self) -> None:
         try:
             size = os.path.getsize(file_input)
         elif hasattr(file_input, 'read'):
             size = len(file_input.read())
+            file_input.seek(0)
         else:
             size = len(file_input)
         if size > MAX_FILE_SIZE:
             raise ValueError(f"File size exceeds {MAX_FILE_SIZE/1024/1024}MB limit")
     @staticmethod
     def _save_uploaded_file(file_input: Union[str, bytes], filename: str) -> str:
         clean_filename = os.path.basename(filename).replace(os.sep, "_")
             raise
     @staticmethod
+    def _encode_image(image_path: str) -> str:
+        try:
+            with open(image_path, "rb") as image_file:
+                return base64.b64encode(image_file.read()).decode('utf-8')
+        except Exception as e:
+            logger.error(f"Error encoding image {image_path}: {str(e)}")
+            raise ValueError("Failed to encode image")
+    @staticmethod
+    def _pdf_to_images(pdf_path: str) -> List[Tuple[str, str]]:
         try:
             pdf_document = fitz.open(pdf_path)
             if pdf_document.page_count > MAX_PDF_PAGES:
                 raise ValueError(f"PDF exceeds maximum page limit of {MAX_PDF_PAGES}")
             with ThreadPoolExecutor() as executor:
+                image_data = list(executor.map(
                     lambda i: OCRProcessor._convert_page(pdf_path, i),
                     range(pdf_document.page_count)
                 ))
             pdf_document.close()
+            return [data for data in image_data if data]
         except Exception as e:
             logger.error(f"Error converting PDF to images: {str(e)}")
             return []
     @staticmethod
+    def _convert_page(pdf_path: str, page_num: int) -> Tuple[str, str]:
         try:
             pdf_document = fitz.open(pdf_path)
             page = pdf_document[page_num]
             pix = page.get_pixmap(dpi=150)
             image_path = os.path.join(UPLOAD_FOLDER, f"page_{page_num + 1}_{int(time.time())}.png")
             pix.save(image_path)
+            encoded = OCRProcessor._encode_image(image_path)
             pdf_document.close()
+            return image_path, encoded
         except Exception as e:
             logger.error(f"Error converting page {page_num}: {str(e)}")
+            return None, None
     @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
+    def _call_ocr_api(self, encoded_image: str) -> OCRResponse:
+        base64_url = f"data:image/png;base64,{encoded_image}"
         return self.client.ocr.process(
             model="mistral-ocr-latest",
+            document=ImageURLChunk(image_url=base64_url),
             include_image_base64=True
         )
     def ocr_uploaded_pdf(self, pdf_file: Union[str, bytes]) -> Tuple[str, List[str]]:
         file_name = getattr(pdf_file, 'name', f"pdf_{int(time.time())}.pdf")
         logger.info(f"Processing uploaded PDF: {file_name}")
             if not os.path.exists(pdf_path):
                 raise FileNotFoundError(f"Saved PDF not found at: {pdf_path}")
+            image_data = self._pdf_to_images(pdf_path)
+            if not image_data:
+                raise ValueError("No pages converted from PDF")
+            # Process each page with OCR
+            ocr_results = []
+            for _, encoded in image_data:
+                response = self._call_ocr_api(encoded)
+                markdown = self._get_combined_markdown(response)
+                ocr_results.append(markdown)
+            image_paths = [path for path, _ in image_data]
+            return "\n\n".join(ocr_results), image_paths
+        except Exception as e:
+            return self._handle_error("uploaded PDF processing", e), []
+    def ocr_pdf_url(self, pdf_url: str) -> Tuple[str, List[str]]:
+        logger.info(f"Processing PDF URL: {pdf_url}")
+        try:
+            file_name = pdf_url.split('/')[-1] or f"pdf_{int(time.time())}.pdf"
+            pdf_path = self._save_uploaded_file(pdf_url, file_name)
+            if not os.path.exists(pdf_path):
+                raise FileNotFoundError(f"Saved PDF not found at: {pdf_path}")
+            image_data = self._pdf_to_images(pdf_path)
+            if not image_data:
+                raise ValueError("No pages converted from PDF")
+            ocr_results = []
+            for _, encoded in image_data:
+                response = self._call_ocr_api(encoded)
+                markdown = self._get_combined_markdown(response)
+                ocr_results.append(markdown)
+            image_paths = [path for path, _ in image_data]
+            return "\n\n".join(ocr_results), image_paths
         except Exception as e:
+            return self._handle_error("PDF URL processing", e), []
     def ocr_uploaded_image(self, image_file: Union[str, bytes]) -> Tuple[str, str]:
         file_name = getattr(image_file, 'name', f"image_{int(time.time())}.jpg")
             self._check_file_size(image_file)
             image_path = self._save_uploaded_file(image_file, file_name)
             encoded_image = self._encode_image(image_path)
+            response = self._call_ocr_api(encoded_image)
             return self._get_combined_markdown(response), image_path
         except Exception as e:
             return self._handle_error("image processing", e), None
     @staticmethod
     def _get_combined_markdown(response: OCRResponse) -> str:
         return "\n\n".join(
         logger.error(f"Error in {context}: {str(error)}")
         return f"**Error in {context}:** {str(error)}"
 def create_interface():
     css = """
     .output-markdown {font-size: 14px; max-height: 500px; overflow-y: auto;}
     """
     with gr.Blocks(title="Mistral OCR App", css=css) as demo:
+        gr.Markdown("# Mistral OCR App\nUpload images or PDFs, or provide a PDF URL for OCR processing")
         with gr.Row():
             api_key = gr.Textbox(label="Mistral API Key", type="password", placeholder="Enter your API key")
         with gr.Tab("PDF OCR"):
             with gr.Row():
+                with gr.Column():
+                    pdf_input = gr.File(
+                        label=f"Upload PDF (max {MAX_FILE_SIZE/1024/1024}MB, {MAX_PDF_PAGES} pages)",
+                        file_types=SUPPORTED_PDF_TYPES
+                    )
+                    pdf_url_input = gr.Textbox(
+                        label="Or Enter PDF URL",
+                        placeholder="e.g., https://arxiv.org/pdf/2201.04234.pdf"
+                    )
                 pdf_gallery = gr.Gallery(label="PDF Pages", height=300)
             pdf_output = gr.Markdown(label="OCR Result", elem_classes="output-markdown")
             process_pdf_btn = gr.Button("Process PDF", variant="primary")
+            def process_pdf(processor, pdf_file, pdf_url):
+                if not processor:
+                    return "Please set API key first", []
+                if pdf_file:
+                    return processor.ocr_uploaded_pdf(pdf_file)
+                elif pdf_url:
+                    return processor.ocr_pdf_url(pdf_url)
+                return "Please upload a PDF or provide a URL", []
             process_pdf_btn.click(
                 fn=process_pdf,
+                inputs=[processor_state, pdf_input, pdf_url_input],
                 outputs=[pdf_output, pdf_gallery]
             )
     return demo
 if __name__ == "__main__":