Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -31,9 +31,10 @@ def ocr_pdf_url(pdf_url):
|
|
31 |
document={"type": "document_url", "document_url": pdf_url},
|
32 |
include_image_base64=True
|
33 |
)
|
34 |
-
|
|
|
35 |
except Exception as e:
|
36 |
-
return f"Error
|
37 |
|
38 |
# OCR with Uploaded PDF
|
39 |
def ocr_uploaded_pdf(pdf_file):
|
@@ -48,9 +49,10 @@ def ocr_uploaded_pdf(pdf_file):
|
|
48 |
document={"type": "document_url", "document_url": signed_url.url},
|
49 |
include_image_base64=True
|
50 |
)
|
51 |
-
|
|
|
52 |
except Exception as e:
|
53 |
-
return f"Error
|
54 |
|
55 |
# OCR with Image URL
|
56 |
def ocr_image_url(image_url):
|
@@ -59,23 +61,25 @@ def ocr_image_url(image_url):
|
|
59 |
model="mistral-ocr-latest",
|
60 |
document={"type": "image_url", "image_url": image_url}
|
61 |
)
|
62 |
-
|
|
|
63 |
except Exception as e:
|
64 |
-
return f"Error
|
65 |
|
66 |
# OCR with Uploaded Image
|
67 |
def ocr_uploaded_image(image_file):
|
68 |
try:
|
69 |
base64_image = encode_image(image_file.name)
|
70 |
if "Error" in base64_image:
|
71 |
-
return base64_image
|
72 |
ocr_response = client.ocr.process(
|
73 |
model="mistral-ocr-latest",
|
74 |
document={"type": "image_url", "image_url": f"data:image/jpeg;base64,{base64_image}"}
|
75 |
)
|
76 |
-
|
|
|
77 |
except Exception as e:
|
78 |
-
return f"Error
|
79 |
|
80 |
# Document Understanding
|
81 |
def document_understanding(doc_url, question):
|
@@ -90,9 +94,9 @@ def document_understanding(doc_url, question):
|
|
90 |
model="mistral-small-latest",
|
91 |
messages=messages
|
92 |
)
|
93 |
-
return chat_response.choices[0].message.content
|
94 |
except Exception as e:
|
95 |
-
return f"Error
|
96 |
|
97 |
# Structured OCR Setup
|
98 |
languages = {lang.alpha_2: lang.name for lang in pycountry.languages if hasattr(lang, 'alpha_2')}
|
@@ -117,7 +121,7 @@ def structured_ocr(image_file):
|
|
117 |
image_path = Path(image_file.name)
|
118 |
encoded_image = encode_image(image_path)
|
119 |
if "Error" in encoded_image:
|
120 |
-
return encoded_image
|
121 |
base64_data_url = f"data:image/jpeg;base64,{encoded_image}"
|
122 |
|
123 |
# OCR processing
|
@@ -151,49 +155,50 @@ def structured_ocr(image_file):
|
|
151 |
"languages": [Language[l] for l in response_dict.get("languages", ["English"]) if l in languages.values()],
|
152 |
"ocr_contents": response_dict.get("ocr_contents", {})
|
153 |
})
|
154 |
-
|
|
|
155 |
except Exception as e:
|
156 |
-
return f"Error
|
157 |
|
158 |
# Gradio Interface
|
159 |
with gr.Blocks(title="Mistral OCR & Structured Output App") as demo:
|
160 |
gr.Markdown("# Mistral OCR & Structured Output App")
|
161 |
-
gr.Markdown("Extract text from PDFs and images, ask questions about documents, or get structured JSON output!")
|
162 |
|
163 |
with gr.Tab("OCR with PDF URL"):
|
164 |
pdf_url_input = gr.Textbox(label="PDF URL", placeholder="e.g., https://arxiv.org/pdf/2201.04234")
|
165 |
-
pdf_url_output = gr.
|
166 |
pdf_url_button = gr.Button("Process PDF")
|
167 |
pdf_url_button.click(ocr_pdf_url, inputs=pdf_url_input, outputs=pdf_url_output)
|
168 |
|
169 |
with gr.Tab("OCR with Uploaded PDF"):
|
170 |
pdf_file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
|
171 |
-
pdf_file_output = gr.
|
172 |
pdf_file_button = gr.Button("Process Uploaded PDF")
|
173 |
pdf_file_button.click(ocr_uploaded_pdf, inputs=pdf_file_input, outputs=pdf_file_output)
|
174 |
|
175 |
with gr.Tab("OCR with Image URL"):
|
176 |
image_url_input = gr.Textbox(label="Image URL", placeholder="e.g., https://example.com/image.jpg")
|
177 |
-
image_url_output = gr.
|
178 |
image_url_button = gr.Button("Process Image")
|
179 |
image_url_button.click(ocr_image_url, inputs=image_url_input, outputs=image_url_output)
|
180 |
|
181 |
with gr.Tab("OCR with Uploaded Image"):
|
182 |
image_file_input = gr.File(label="Upload Image", file_types=[".jpg", ".png"])
|
183 |
-
image_file_output = gr.
|
184 |
image_file_button = gr.Button("Process Uploaded Image")
|
185 |
image_file_button.click(ocr_uploaded_image, inputs=image_file_input, outputs=image_file_output)
|
186 |
|
187 |
with gr.Tab("Document Understanding"):
|
188 |
doc_url_input = gr.Textbox(label="Document URL", placeholder="e.g., https://arxiv.org/pdf/1805.04770")
|
189 |
question_input = gr.Textbox(label="Question", placeholder="e.g., What is the last sentence?")
|
190 |
-
doc_output = gr.Textbox(label="Answer")
|
191 |
doc_button = gr.Button("Ask Question")
|
192 |
doc_button.click(document_understanding, inputs=[doc_url_input, question_input], outputs=doc_output)
|
193 |
|
194 |
with gr.Tab("Structured OCR"):
|
195 |
struct_image_input = gr.File(label="Upload Image", file_types=[".jpg", ".png"])
|
196 |
-
struct_output = gr.
|
197 |
struct_button = gr.Button("Get Structured Output")
|
198 |
struct_button.click(structured_ocr, inputs=struct_image_input, outputs=struct_output)
|
199 |
|
|
|
31 |
document={"type": "document_url", "document_url": pdf_url},
|
32 |
include_image_base64=True
|
33 |
)
|
34 |
+
markdown = ocr_response.pages[0].markdown if ocr_response.pages else str(ocr_response)
|
35 |
+
return markdown # Return raw markdown for gr.Markdown to render
|
36 |
except Exception as e:
|
37 |
+
return f"**Error:** {str(e)}"
|
38 |
|
39 |
# OCR with Uploaded PDF
|
40 |
def ocr_uploaded_pdf(pdf_file):
|
|
|
49 |
document={"type": "document_url", "document_url": signed_url.url},
|
50 |
include_image_base64=True
|
51 |
)
|
52 |
+
markdown = ocr_response.pages[0].markdown if ocr_response.pages else str(ocr_response)
|
53 |
+
return markdown
|
54 |
except Exception as e:
|
55 |
+
return f"**Error:** {str(e)}"
|
56 |
|
57 |
# OCR with Image URL
|
58 |
def ocr_image_url(image_url):
|
|
|
61 |
model="mistral-ocr-latest",
|
62 |
document={"type": "image_url", "image_url": image_url}
|
63 |
)
|
64 |
+
markdown = ocr_response.pages[0].markdown if ocr_response.pages else str(ocr_response)
|
65 |
+
return markdown
|
66 |
except Exception as e:
|
67 |
+
return f"**Error:** {str(e)}"
|
68 |
|
69 |
# OCR with Uploaded Image
|
70 |
def ocr_uploaded_image(image_file):
|
71 |
try:
|
72 |
base64_image = encode_image(image_file.name)
|
73 |
if "Error" in base64_image:
|
74 |
+
return f"**Error:** {base64_image}"
|
75 |
ocr_response = client.ocr.process(
|
76 |
model="mistral-ocr-latest",
|
77 |
document={"type": "image_url", "image_url": f"data:image/jpeg;base64,{base64_image}"}
|
78 |
)
|
79 |
+
markdown = ocr_response.pages[0].markdown if ocr_response.pages else str(ocr_response)
|
80 |
+
return markdown
|
81 |
except Exception as e:
|
82 |
+
return f"**Error:** {str(e)}"
|
83 |
|
84 |
# Document Understanding
|
85 |
def document_understanding(doc_url, question):
|
|
|
94 |
model="mistral-small-latest",
|
95 |
messages=messages
|
96 |
)
|
97 |
+
return chat_response.choices[0].message.content # Plain text output
|
98 |
except Exception as e:
|
99 |
+
return f"**Error:** {str(e)}"
|
100 |
|
101 |
# Structured OCR Setup
|
102 |
languages = {lang.alpha_2: lang.name for lang in pycountry.languages if hasattr(lang, 'alpha_2')}
|
|
|
121 |
image_path = Path(image_file.name)
|
122 |
encoded_image = encode_image(image_path)
|
123 |
if "Error" in encoded_image:
|
124 |
+
return f"**Error:** {encoded_image}"
|
125 |
base64_data_url = f"data:image/jpeg;base64,{encoded_image}"
|
126 |
|
127 |
# OCR processing
|
|
|
155 |
"languages": [Language[l] for l in response_dict.get("languages", ["English"]) if l in languages.values()],
|
156 |
"ocr_contents": response_dict.get("ocr_contents", {})
|
157 |
})
|
158 |
+
# Return as Markdown code block
|
159 |
+
return f"```json\n{json.dumps(structured_response.dict(), indent=4)}\n```"
|
160 |
except Exception as e:
|
161 |
+
return f"**Error:** {str(e)}"
|
162 |
|
163 |
# Gradio Interface
|
164 |
with gr.Blocks(title="Mistral OCR & Structured Output App") as demo:
|
165 |
gr.Markdown("# Mistral OCR & Structured Output App")
|
166 |
+
gr.Markdown("Extract text from PDFs and images, ask questions about documents, or get structured JSON output in Markdown format!")
|
167 |
|
168 |
with gr.Tab("OCR with PDF URL"):
|
169 |
pdf_url_input = gr.Textbox(label="PDF URL", placeholder="e.g., https://arxiv.org/pdf/2201.04234")
|
170 |
+
pdf_url_output = gr.Markdown(label="OCR Result (Markdown)")
|
171 |
pdf_url_button = gr.Button("Process PDF")
|
172 |
pdf_url_button.click(ocr_pdf_url, inputs=pdf_url_input, outputs=pdf_url_output)
|
173 |
|
174 |
with gr.Tab("OCR with Uploaded PDF"):
|
175 |
pdf_file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
|
176 |
+
pdf_file_output = gr.Markdown(label="OCR Result (Markdown)")
|
177 |
pdf_file_button = gr.Button("Process Uploaded PDF")
|
178 |
pdf_file_button.click(ocr_uploaded_pdf, inputs=pdf_file_input, outputs=pdf_file_output)
|
179 |
|
180 |
with gr.Tab("OCR with Image URL"):
|
181 |
image_url_input = gr.Textbox(label="Image URL", placeholder="e.g., https://example.com/image.jpg")
|
182 |
+
image_url_output = gr.Markdown(label="OCR Result (Markdown)")
|
183 |
image_url_button = gr.Button("Process Image")
|
184 |
image_url_button.click(ocr_image_url, inputs=image_url_input, outputs=image_url_output)
|
185 |
|
186 |
with gr.Tab("OCR with Uploaded Image"):
|
187 |
image_file_input = gr.File(label="Upload Image", file_types=[".jpg", ".png"])
|
188 |
+
image_file_output = gr.Markdown(label="OCR Result (Markdown)")
|
189 |
image_file_button = gr.Button("Process Uploaded Image")
|
190 |
image_file_button.click(ocr_uploaded_image, inputs=image_file_input, outputs=image_file_output)
|
191 |
|
192 |
with gr.Tab("Document Understanding"):
|
193 |
doc_url_input = gr.Textbox(label="Document URL", placeholder="e.g., https://arxiv.org/pdf/1805.04770")
|
194 |
question_input = gr.Textbox(label="Question", placeholder="e.g., What is the last sentence?")
|
195 |
+
doc_output = gr.Textbox(label="Answer") # Keep as Textbox for plain text
|
196 |
doc_button = gr.Button("Ask Question")
|
197 |
doc_button.click(document_understanding, inputs=[doc_url_input, question_input], outputs=doc_output)
|
198 |
|
199 |
with gr.Tab("Structured OCR"):
|
200 |
struct_image_input = gr.File(label="Upload Image", file_types=[".jpg", ".png"])
|
201 |
+
struct_output = gr.Markdown(label="Structured JSON Output (Markdown)")
|
202 |
struct_button = gr.Button("Get Structured Output")
|
203 |
struct_button.click(structured_ocr, inputs=struct_image_input, outputs=struct_output)
|
204 |
|