Update app.py
app.py CHANGED

@@ -2,6 +2,12 @@ import os
 import base64
 import gradio as gr
 from mistralai import Mistral
+from mistralai.models import OCRResponse
+from pathlib import Path
+from enum import Enum
+from pydantic import BaseModel
+import pycountry
+import json
 
 # Initialize Mistral client with API key
 api_key = os.environ.get("MISTRAL_API_KEY")
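Note on the new imports: pycountry and pydantic become runtime dependencies of the Space, so they need to be available in its environment (typically via requirements.txt), and OCRResponse is imported without being referenced anywhere else in this diff. The context lines only show the API key lookup; below is a minimal sketch of a client setup that fails fast on a missing key, assuming the app constructs the client as client = Mistral(api_key=api_key) right after these lines (that code sits outside the hunk):

    import os
    from mistralai import Mistral

    # Read the key from the Space's secrets/environment and fail early with a
    # clear message instead of letting every OCR call return an error string.
    api_key = os.environ.get("MISTRAL_API_KEY")
    if not api_key:
        raise RuntimeError("MISTRAL_API_KEY is not set; add it as a secret for this Space.")
    client = Mistral(api_key=api_key)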
@@ -22,37 +28,27 @@ def ocr_pdf_url(pdf_url):
     try:
         ocr_response = client.ocr.process(
             model="mistral-ocr-latest",
-            document={
-                "type": "document_url",
-                "document_url": pdf_url
-            }
+            document={"type": "document_url", "document_url": pdf_url},
+            include_image_base64=True
         )
-        return str(ocr_response)
+        return ocr_response.pages[0].markdown if ocr_response.pages else str(ocr_response)
     except Exception as e:
         return f"Error: {str(e)}"
 
 # OCR with Uploaded PDF
 def ocr_uploaded_pdf(pdf_file):
     try:
-        # Upload the PDF
         uploaded_pdf = client.files.upload(
-            file={
-                "file_name": pdf_file.name,
-                "content": open(pdf_file.name, "rb")
-            },
+            file={"file_name": pdf_file.name, "content": open(pdf_file.name, "rb")},
             purpose="ocr"
         )
-
-        signed_url = client.files.get_signed_url(file_id=uploaded_pdf.id)
-        # Process OCR
+        signed_url = client.files.get_signed_url(file_id=uploaded_pdf.id, expiry=3600)
         ocr_response = client.ocr.process(
             model="mistral-ocr-latest",
-            document={
-                "type": "document_url",
-                "document_url": signed_url.url
-            }
+            document={"type": "document_url", "document_url": signed_url.url},
+            include_image_base64=True
         )
-        return str(ocr_response)
+        return ocr_response.pages[0].markdown if ocr_response.pages else str(ocr_response)
     except Exception as e:
         return f"Error: {str(e)}"
 
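The reworked return statements here (and in the image functions below) surface only pages[0].markdown, so multi-page PDFs lose everything after the first page. A hedged sketch of a small helper that joins every page's markdown, using only the pages[*].markdown fields the new code already relies on; the helper name is illustrative, not part of the commit:

    def ocr_markdown(ocr_response):
        # Concatenate the markdown of every page returned by client.ocr.process;
        # fall back to the raw response text if no pages came back.
        if not ocr_response.pages:
            return str(ocr_response)
        return "\n\n".join(page.markdown for page in ocr_response.pages)

The same helper could back all four OCR tabs, e.g. return ocr_markdown(ocr_response) in place of the pages[0] expression.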
@@ -61,12 +57,9 @@ def ocr_image_url(image_url):
     try:
         ocr_response = client.ocr.process(
             model="mistral-ocr-latest",
-            document={
-                "type": "image_url",
-                "image_url": image_url
-            }
+            document={"type": "image_url", "image_url": image_url}
         )
-        return str(ocr_response)
+        return ocr_response.pages[0].markdown if ocr_response.pages else str(ocr_response)
     except Exception as e:
         return f"Error: {str(e)}"
 
@@ -78,12 +71,9 @@ def ocr_uploaded_image(image_file):
             return base64_image
         ocr_response = client.ocr.process(
             model="mistral-ocr-latest",
-            document={
-                "type": "image_url",
-                "image_url": f"data:image/jpeg;base64,{base64_image}"
-            }
+            document={"type": "image_url", "image_url": f"data:image/jpeg;base64,{base64_image}"}
         )
-        return str(ocr_response)
+        return ocr_response.pages[0].markdown if ocr_response.pages else str(ocr_response)
     except Exception as e:
         return f"Error: {str(e)}"
 
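The uploaded-image tab (defined later in the diff) accepts both .jpg and .png files, yet this function always builds a data:image/jpeg;base64 URL. A hedged sketch of deriving the MIME type from the file name with the standard library; to_data_url is an illustrative helper, not something in the commit:

    import base64
    import mimetypes

    def to_data_url(path):
        # Guess the MIME type from the file extension; default to JPEG when unknown.
        mime, _ = mimetypes.guess_type(path)
        mime = mime or "image/jpeg"
        with open(path, "rb") as f:
            encoded = base64.b64encode(f.read()).decode("utf-8")
        return f"data:{mime};base64,{encoded}"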
@@ -91,13 +81,10 @@ def ocr_uploaded_image(image_file):
 def document_understanding(doc_url, question):
     try:
         messages = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "text", "text": question},
-                    {"type": "document_url", "document_url": doc_url}
-                ]
-            }
+            {"role": "user", "content": [
+                {"type": "text", "text": question},
+                {"type": "document_url", "document_url": doc_url}
+            ]}
         ]
         chat_response = client.chat.complete(
             model="mistral-small-latest",
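The messages list now uses the content-parts form: one user message carrying a text part and a document_url part together. A standalone sketch of the same call shape, with sample values; the question is illustrative, the URL reuses the placeholder from the PDF tab, and the answer accessor mirrors the chat usage that appears later in this diff:

    # One user message pairing the question with the document reference.
    messages = [
        {"role": "user", "content": [
            {"type": "text", "text": "Summarize the key findings."},
            {"type": "document_url", "document_url": "https://arxiv.org/pdf/2201.04234"},
        ]}
    ]
    chat_response = client.chat.complete(model="mistral-small-latest", messages=messages)
    answer = chat_response.choices[0].message.content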
@@ -107,32 +94,93 @@ def document_understanding(doc_url, question):
     except Exception as e:
         return f"Error: {str(e)}"
 
+# Structured OCR Setup
+languages = {lang.alpha_2: lang.name for lang in pycountry.languages if hasattr(lang, 'alpha_2')}
+
+class LanguageMeta(Enum.__class__):
+    def __new__(metacls, cls, bases, classdict):
+        for code, name in languages.items():
+            classdict[name.upper().replace(' ', '_')] = name
+        return super().__new__(metacls, cls, bases, classdict)
+
+class Language(Enum, metaclass=LanguageMeta):
+    pass
+
+class StructuredOCR(BaseModel):
+    file_name: str
+    topics: list[str]
+    languages: list[Language]
+    ocr_contents: dict
+
+def structured_ocr(image_file):
+    try:
+        image_path = Path(image_file.name)
+        encoded_image = encode_image(image_path)
+        if "Error" in encoded_image:
+            return encoded_image
+        base64_data_url = f"data:image/jpeg;base64,{encoded_image}"
+
+        # OCR processing
+        image_response = client.ocr.process(
+            document={"type": "image_url", "image_url": base64_data_url},
+            model="mistral-ocr-latest"
+        )
+        image_ocr_markdown = image_response.pages[0].markdown
+
+        # Structured output with pixtral-12b-latest
+        chat_response = client.chat.complete(
+            model="pixtral-12b-latest",
+            messages=[{
+                "role": "user",
+                "content": [
+                    {"type": "image_url", "image_url": base64_data_url},
+                    {"type": "text", "text": (
+                        f"This is the image's OCR in markdown:\n<BEGIN_IMAGE_OCR>\n{image_ocr_markdown}\n<END_IMAGE_OCR>.\n"
+                        "Convert this into a structured JSON response with the OCR contents in a sensible dictionary."
+                    )}
+                ],
+            }],
+            response_format={"type": "json_object"},
+            temperature=0
+        )
+
+        response_dict = json.loads(chat_response.choices[0].message.content)
+        structured_response = StructuredOCR.parse_obj({
+            "file_name": image_path.name,
+            "topics": response_dict.get("topics", []),
+            "languages": [Language[l] for l in response_dict.get("languages", ["English"]) if l in languages.values()],
+            "ocr_contents": response_dict.get("ocr_contents", {})
+        })
+        return json.dumps(structured_response.dict(), indent=4)
+    except Exception as e:
+        return f"Error: {str(e)}"
+
 # Gradio Interface
-with gr.Blocks(title="Mistral OCR & Document Understanding App") as demo:
-    gr.Markdown("# Mistral OCR & Document Understanding App")
-    gr.Markdown("
+with gr.Blocks(title="Mistral OCR & Structured Output App") as demo:
+    gr.Markdown("# Mistral OCR & Structured Output App")
+    gr.Markdown("Extract text from PDFs and images, ask questions about documents, or get structured JSON output!")
 
     with gr.Tab("OCR with PDF URL"):
         pdf_url_input = gr.Textbox(label="PDF URL", placeholder="e.g., https://arxiv.org/pdf/2201.04234")
-        pdf_url_output = gr.Textbox(label="OCR Result")
+        pdf_url_output = gr.Textbox(label="OCR Result (Markdown)")
         pdf_url_button = gr.Button("Process PDF")
         pdf_url_button.click(ocr_pdf_url, inputs=pdf_url_input, outputs=pdf_url_output)
 
     with gr.Tab("OCR with Uploaded PDF"):
         pdf_file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
-        pdf_file_output = gr.Textbox(label="OCR Result")
+        pdf_file_output = gr.Textbox(label="OCR Result (Markdown)")
         pdf_file_button = gr.Button("Process Uploaded PDF")
         pdf_file_button.click(ocr_uploaded_pdf, inputs=pdf_file_input, outputs=pdf_file_output)
 
     with gr.Tab("OCR with Image URL"):
         image_url_input = gr.Textbox(label="Image URL", placeholder="e.g., https://example.com/image.jpg")
-        image_url_output = gr.Textbox(label="OCR Result")
+        image_url_output = gr.Textbox(label="OCR Result (Markdown)")
         image_url_button = gr.Button("Process Image")
         image_url_button.click(ocr_image_url, inputs=image_url_input, outputs=image_url_output)
 
     with gr.Tab("OCR with Uploaded Image"):
         image_file_input = gr.File(label="Upload Image", file_types=[".jpg", ".png"])
-        image_file_output = gr.Textbox(label="OCR Result")
+        image_file_output = gr.Textbox(label="OCR Result (Markdown)")
         image_file_button = gr.Button("Process Uploaded Image")
         image_file_button.click(ocr_uploaded_image, inputs=image_file_input, outputs=image_file_output)
 
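One behavioural note on the additions in this hunk: LanguageMeta registers enum members under uppercased, underscore-joined names (e.g. Language.ENGLISH) whose values keep the human-readable spelling (e.g. "English"), while Language[l] in structured_ocr looks members up by member name. Because the filter keeps human-readable names, that lookup appears to raise KeyError for values such as "English"; a value-based lookup matches how the members are defined. Also, parse_obj and .dict() are pydantic v1-style calls; they still run under pydantic v2 but with deprecation warnings (model_validate / model_dump are the v2 names). A self-contained sketch of the name-vs-value distinction, using a tiny stand-in enum rather than the dynamically built one:

    from enum import Enum

    # Minimal stand-in for the dynamically built Language enum: the member name
    # is uppercased, the value keeps the human-readable spelling.
    class LanguageDemo(Enum):
        ENGLISH = "English"

    assert LanguageDemo["ENGLISH"].value == "English"      # lookup by member name
    assert LanguageDemo("English") is LanguageDemo.ENGLISH  # lookup by value
    # LanguageDemo["English"] raises KeyError, which is why the comprehension in
    # structured_ocr may be safer with Language(l) than Language[l].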
@@ -143,8 +191,11 @@ with gr.Blocks(title="Mistral OCR & Document Understanding App") as demo:
         doc_button = gr.Button("Ask Question")
         doc_button.click(document_understanding, inputs=[doc_url_input, question_input], outputs=doc_output)
 
+    with gr.Tab("Structured OCR"):
+        struct_image_input = gr.File(label="Upload Image", file_types=[".jpg", ".png"])
+        struct_output = gr.Textbox(label="Structured JSON Output")
+        struct_button = gr.Button("Get Structured Output")
+        struct_button.click(structured_ocr, inputs=struct_image_input, outputs=struct_output)
+
 # Launch the app
-demo.launch(
-    share=True,
-    debug=True
-)
+demo.launch(share=True, debug=True)
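On the launch call: when the app runs inside a Hugging Face Space, the Space already serves a public URL, so share=True is generally only useful for local runs. A hedged sketch of making that conditional, assuming (as on current Spaces runtimes) that a SPACE_ID environment variable is set inside a Space:

    import os

    # Only request a Gradio share link when running outside a Space.
    if os.environ.get("SPACE_ID"):
        demo.launch()
    else:
        demo.launch(share=True, debug=True)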