Spaces:

Svngoku
/

mistral-ocr-demo

Running

App Files Files Community

Svngoku commited on Mar 6

Commit

d0b423f

verified ·

1 Parent(s): bee31a6

Update app.py

Browse files

Files changed (1) hide show

app.py +97 -46

app.py CHANGED Viewed

@@ -2,6 +2,12 @@ import os
 import base64
 import gradio as gr
 from mistralai import Mistral
 # Initialize Mistral client with API key
 api_key = os.environ.get("MISTRAL_API_KEY")
@@ -22,37 +28,27 @@ def ocr_pdf_url(pdf_url):
     try:
         ocr_response = client.ocr.process(
             model="mistral-ocr-latest",
-            document={
-                "type": "document_url",
-                "document_url": pdf_url
-            }
         )
-        return str(ocr_response)  # Convert response to string for display
     except Exception as e:
         return f"Error: {str(e)}"
 # OCR with Uploaded PDF
 def ocr_uploaded_pdf(pdf_file):
     try:
-        # Upload the PDF
         uploaded_pdf = client.files.upload(
-            file={
-                "file_name": pdf_file.name,
-                "content": open(pdf_file.name, "rb")
-            },
             purpose="ocr"
         )
-        # Get signed URL
-        signed_url = client.files.get_signed_url(file_id=uploaded_pdf.id)
-        # Process OCR
         ocr_response = client.ocr.process(
             model="mistral-ocr-latest",
-            document={
-                "type": "document_url",
-                "document_url": signed_url.url
-            }
         )
-        return str(ocr_response)
     except Exception as e:
         return f"Error: {str(e)}"
@@ -61,12 +57,9 @@ def ocr_image_url(image_url):
     try:
         ocr_response = client.ocr.process(
             model="mistral-ocr-latest",
-            document={
-                "type": "image_url",
-                "image_url": image_url
-            }
         )
-        return str(ocr_response)
     except Exception as e:
         return f"Error: {str(e)}"
@@ -78,12 +71,9 @@ def ocr_uploaded_image(image_file):
             return base64_image
         ocr_response = client.ocr.process(
             model="mistral-ocr-latest",
-            document={
-                "type": "image_url",
-                "image_url": f"data:image/jpeg;base64,{base64_image}"
-            }
         )
-        return str(ocr_response)
     except Exception as e:
         return f"Error: {str(e)}"
@@ -91,13 +81,10 @@ def ocr_uploaded_image(image_file):
 def document_understanding(doc_url, question):
     try:
         messages = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "text", "text": question},
-                    {"type": "document_url", "document_url": doc_url}
-                ]
-            }
         ]
         chat_response = client.chat.complete(
             model="mistral-small-latest",
@@ -107,32 +94,93 @@ def document_understanding(doc_url, question):
     except Exception as e:
         return f"Error: {str(e)}"
 # Gradio Interface
-with gr.Blocks(title="Mistral OCR & Document Understanding App") as demo:
-    gr.Markdown("# Mistral OCR & Document Understanding App")
-    gr.Markdown("Use this app to extract text from PDFs and images or ask questions about documents!")
     with gr.Tab("OCR with PDF URL"):
         pdf_url_input = gr.Textbox(label="PDF URL", placeholder="e.g., https://arxiv.org/pdf/2201.04234")
-        pdf_url_output = gr.Textbox(label="OCR Result")
         pdf_url_button = gr.Button("Process PDF")
         pdf_url_button.click(ocr_pdf_url, inputs=pdf_url_input, outputs=pdf_url_output)
     with gr.Tab("OCR with Uploaded PDF"):
         pdf_file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
-        pdf_file_output = gr.Textbox(label="OCR Result")
         pdf_file_button = gr.Button("Process Uploaded PDF")
         pdf_file_button.click(ocr_uploaded_pdf, inputs=pdf_file_input, outputs=pdf_file_output)
     with gr.Tab("OCR with Image URL"):
         image_url_input = gr.Textbox(label="Image URL", placeholder="e.g., https://example.com/image.jpg")
-        image_url_output = gr.Textbox(label="OCR Result")
         image_url_button = gr.Button("Process Image")
         image_url_button.click(ocr_image_url, inputs=image_url_input, outputs=image_url_output)
     with gr.Tab("OCR with Uploaded Image"):
         image_file_input = gr.File(label="Upload Image", file_types=[".jpg", ".png"])
-        image_file_output = gr.Textbox(label="OCR Result")
         image_file_button = gr.Button("Process Uploaded Image")
         image_file_button.click(ocr_uploaded_image, inputs=image_file_input, outputs=image_file_output)
@@ -143,8 +191,11 @@ with gr.Blocks(title="Mistral OCR & Document Understanding App") as demo:
         doc_button = gr.Button("Ask Question")
         doc_button.click(document_understanding, inputs=[doc_url_input, question_input], outputs=doc_output)
 # Launch the app
-demo.launch(
-    share=True,
-    debug=True
-)

 import base64
 import gradio as gr
 from mistralai import Mistral
+from mistralai.models import OCRResponse
+from pathlib import Path
+from enum import Enum
+from pydantic import BaseModel
+import pycountry
+import json
 # Initialize Mistral client with API key
 api_key = os.environ.get("MISTRAL_API_KEY")
     try:
         ocr_response = client.ocr.process(
             model="mistral-ocr-latest",
+            document={"type": "document_url", "document_url": pdf_url},
+            include_image_base64=True
         )
+        return ocr_response.pages[0].markdown if ocr_response.pages else str(ocr_response)
     except Exception as e:
         return f"Error: {str(e)}"
 # OCR with Uploaded PDF
 def ocr_uploaded_pdf(pdf_file):
     try:
         uploaded_pdf = client.files.upload(
+            file={"file_name": pdf_file.name, "content": open(pdf_file.name, "rb")},
             purpose="ocr"
         )
+        signed_url = client.files.get_signed_url(file_id=uploaded_pdf.id, expiry=3600)
         ocr_response = client.ocr.process(
             model="mistral-ocr-latest",
+            document={"type": "document_url", "document_url": signed_url.url},
+            include_image_base64=True
         )
+        return ocr_response.pages[0].markdown if ocr_response.pages else str(ocr_response)
     except Exception as e:
         return f"Error: {str(e)}"
     try:
         ocr_response = client.ocr.process(
             model="mistral-ocr-latest",
+            document={"type": "image_url", "image_url": image_url}
         )
+        return ocr_response.pages[0].markdown if ocr_response.pages else str(ocr_response)
     except Exception as e:
         return f"Error: {str(e)}"
             return base64_image
         ocr_response = client.ocr.process(
             model="mistral-ocr-latest",
+            document={"type": "image_url", "image_url": f"data:image/jpeg;base64,{base64_image}"}
         )
+        return ocr_response.pages[0].markdown if ocr_response.pages else str(ocr_response)
     except Exception as e:
         return f"Error: {str(e)}"
 def document_understanding(doc_url, question):
     try:
         messages = [
+            {"role": "user", "content": [
+                {"type": "text", "text": question},
+                {"type": "document_url", "document_url": doc_url}
+            ]}
         ]
         chat_response = client.chat.complete(
             model="mistral-small-latest",
     except Exception as e:
         return f"Error: {str(e)}"
+# Structured OCR Setup
+languages = {lang.alpha_2: lang.name for lang in pycountry.languages if hasattr(lang, 'alpha_2')}
+class LanguageMeta(Enum.__class__):
+    def __new__(metacls, cls, bases, classdict):
+        for code, name in languages.items():
+            classdict[name.upper().replace(' ', '_')] = name
+        return super().__new__(metacls, cls, bases, classdict)
+class Language(Enum, metaclass=LanguageMeta):
+    pass
+class StructuredOCR(BaseModel):
+    file_name: str
+    topics: list[str]
+    languages: list[Language]
+    ocr_contents: dict
+def structured_ocr(image_file):
+    try:
+        image_path = Path(image_file.name)
+        encoded_image = encode_image(image_path)
+        if "Error" in encoded_image:
+            return encoded_image
+        base64_data_url = f"data:image/jpeg;base64,{encoded_image}"
+        # OCR processing
+        image_response = client.ocr.process(
+            document={"type": "image_url", "image_url": base64_data_url},
+            model="mistral-ocr-latest"
+        )
+        image_ocr_markdown = image_response.pages[0].markdown
+        # Structured output with pixtral-12b-latest
+        chat_response = client.chat.complete(
+            model="pixtral-12b-latest",
+            messages=[{
+                "role": "user",
+                "content": [
+                    {"type": "image_url", "image_url": base64_data_url},
+                    {"type": "text", "text": (
+                        f"This is the image's OCR in markdown:\n<BEGIN_IMAGE_OCR>\n{image_ocr_markdown}\n<END_IMAGE_OCR>.\n"
+                        "Convert this into a structured JSON response with the OCR contents in a sensible dictionary."
+                    )}
+                ],
+            }],
+            response_format={"type": "json_object"},
+            temperature=0
+        )
+        response_dict = json.loads(chat_response.choices[0].message.content)
+        structured_response = StructuredOCR.parse_obj({
+            "file_name": image_path.name,
+            "topics": response_dict.get("topics", []),
+            "languages": [Language[l] for l in response_dict.get("languages", ["English"]) if l in languages.values()],
+            "ocr_contents": response_dict.get("ocr_contents", {})
+        })
+        return json.dumps(structured_response.dict(), indent=4)
+    except Exception as e:
+        return f"Error: {str(e)}"
 # Gradio Interface
+with gr.Blocks(title="Mistral OCR & Structured Output App") as demo:
+    gr.Markdown("# Mistral OCR & Structured Output App")
+    gr.Markdown("Extract text from PDFs and images, ask questions about documents, or get structured JSON output!")
     with gr.Tab("OCR with PDF URL"):
         pdf_url_input = gr.Textbox(label="PDF URL", placeholder="e.g., https://arxiv.org/pdf/2201.04234")
+        pdf_url_output = gr.Textbox(label="OCR Result (Markdown)")
         pdf_url_button = gr.Button("Process PDF")
         pdf_url_button.click(ocr_pdf_url, inputs=pdf_url_input, outputs=pdf_url_output)
     with gr.Tab("OCR with Uploaded PDF"):
         pdf_file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
+        pdf_file_output = gr.Textbox(label="OCR Result (Markdown)")
         pdf_file_button = gr.Button("Process Uploaded PDF")
         pdf_file_button.click(ocr_uploaded_pdf, inputs=pdf_file_input, outputs=pdf_file_output)
     with gr.Tab("OCR with Image URL"):
         image_url_input = gr.Textbox(label="Image URL", placeholder="e.g., https://example.com/image.jpg")
+        image_url_output = gr.Textbox(label="OCR Result (Markdown)")
         image_url_button = gr.Button("Process Image")
         image_url_button.click(ocr_image_url, inputs=image_url_input, outputs=image_url_output)
     with gr.Tab("OCR with Uploaded Image"):
         image_file_input = gr.File(label="Upload Image", file_types=[".jpg", ".png"])
+        image_file_output = gr.Textbox(label="OCR Result (Markdown)")
         image_file_button = gr.Button("Process Uploaded Image")
         image_file_button.click(ocr_uploaded_image, inputs=image_file_input, outputs=image_file_output)
         doc_button = gr.Button("Ask Question")
         doc_button.click(document_understanding, inputs=[doc_url_input, question_input], outputs=doc_output)
+    with gr.Tab("Structured OCR"):
+        struct_image_input = gr.File(label="Upload Image", file_types=[".jpg", ".png"])
+        struct_output = gr.Textbox(label="Structured JSON Output")
+        struct_button = gr.Button("Get Structured Output")
+        struct_button.click(structured_ocr, inputs=struct_image_input, outputs=struct_output)
 # Launch the app
+demo.launch(share=True, debug=True)