Svngoku committed on
Commit
d0b423f
·
verified ·
1 Parent(s): bee31a6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +97 -46
app.py CHANGED
@@ -2,6 +2,12 @@ import os
2
  import base64
3
  import gradio as gr
4
  from mistralai import Mistral
 
 
 
 
 
 
5
 
6
  # Initialize Mistral client with API key
7
  api_key = os.environ.get("MISTRAL_API_KEY")
@@ -22,37 +28,27 @@ def ocr_pdf_url(pdf_url):
22
  try:
23
  ocr_response = client.ocr.process(
24
  model="mistral-ocr-latest",
25
- document={
26
- "type": "document_url",
27
- "document_url": pdf_url
28
- }
29
  )
30
- return str(ocr_response) # Convert response to string for display
31
  except Exception as e:
32
  return f"Error: {str(e)}"
33
 
34
  # OCR with Uploaded PDF
35
  def ocr_uploaded_pdf(pdf_file):
36
  try:
37
- # Upload the PDF
38
  uploaded_pdf = client.files.upload(
39
- file={
40
- "file_name": pdf_file.name,
41
- "content": open(pdf_file.name, "rb")
42
- },
43
  purpose="ocr"
44
  )
45
- # Get signed URL
46
- signed_url = client.files.get_signed_url(file_id=uploaded_pdf.id)
47
- # Process OCR
48
  ocr_response = client.ocr.process(
49
  model="mistral-ocr-latest",
50
- document={
51
- "type": "document_url",
52
- "document_url": signed_url.url
53
- }
54
  )
55
- return str(ocr_response)
56
  except Exception as e:
57
  return f"Error: {str(e)}"
58
 
@@ -61,12 +57,9 @@ def ocr_image_url(image_url):
61
  try:
62
  ocr_response = client.ocr.process(
63
  model="mistral-ocr-latest",
64
- document={
65
- "type": "image_url",
66
- "image_url": image_url
67
- }
68
  )
69
- return str(ocr_response)
70
  except Exception as e:
71
  return f"Error: {str(e)}"
72
 
@@ -78,12 +71,9 @@ def ocr_uploaded_image(image_file):
78
  return base64_image
79
  ocr_response = client.ocr.process(
80
  model="mistral-ocr-latest",
81
- document={
82
- "type": "image_url",
83
- "image_url": f"data:image/jpeg;base64,{base64_image}"
84
- }
85
  )
86
- return str(ocr_response)
87
  except Exception as e:
88
  return f"Error: {str(e)}"
89
 
@@ -91,13 +81,10 @@ def ocr_uploaded_image(image_file):
91
  def document_understanding(doc_url, question):
92
  try:
93
  messages = [
94
- {
95
- "role": "user",
96
- "content": [
97
- {"type": "text", "text": question},
98
- {"type": "document_url", "document_url": doc_url}
99
- ]
100
- }
101
  ]
102
  chat_response = client.chat.complete(
103
  model="mistral-small-latest",
@@ -107,32 +94,93 @@ def document_understanding(doc_url, question):
107
  except Exception as e:
108
  return f"Error: {str(e)}"
109
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  # Gradio Interface
111
- with gr.Blocks(title="Mistral OCR & Document Understanding App") as demo:
112
- gr.Markdown("# Mistral OCR & Document Understanding App")
113
- gr.Markdown("Use this app to extract text from PDFs and images or ask questions about documents!")
114
 
115
  with gr.Tab("OCR with PDF URL"):
116
  pdf_url_input = gr.Textbox(label="PDF URL", placeholder="e.g., https://arxiv.org/pdf/2201.04234")
117
- pdf_url_output = gr.Textbox(label="OCR Result")
118
  pdf_url_button = gr.Button("Process PDF")
119
  pdf_url_button.click(ocr_pdf_url, inputs=pdf_url_input, outputs=pdf_url_output)
120
 
121
  with gr.Tab("OCR with Uploaded PDF"):
122
  pdf_file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
123
- pdf_file_output = gr.Textbox(label="OCR Result")
124
  pdf_file_button = gr.Button("Process Uploaded PDF")
125
  pdf_file_button.click(ocr_uploaded_pdf, inputs=pdf_file_input, outputs=pdf_file_output)
126
 
127
  with gr.Tab("OCR with Image URL"):
128
  image_url_input = gr.Textbox(label="Image URL", placeholder="e.g., https://example.com/image.jpg")
129
- image_url_output = gr.Textbox(label="OCR Result")
130
  image_url_button = gr.Button("Process Image")
131
  image_url_button.click(ocr_image_url, inputs=image_url_input, outputs=image_url_output)
132
 
133
  with gr.Tab("OCR with Uploaded Image"):
134
  image_file_input = gr.File(label="Upload Image", file_types=[".jpg", ".png"])
135
- image_file_output = gr.Textbox(label="OCR Result")
136
  image_file_button = gr.Button("Process Uploaded Image")
137
  image_file_button.click(ocr_uploaded_image, inputs=image_file_input, outputs=image_file_output)
138
 
@@ -143,8 +191,11 @@ with gr.Blocks(title="Mistral OCR & Document Understanding App") as demo:
143
  doc_button = gr.Button("Ask Question")
144
  doc_button.click(document_understanding, inputs=[doc_url_input, question_input], outputs=doc_output)
145
 
 
 
 
 
 
 
146
  # Launch the app
147
- demo.launch(
148
- share=True,
149
- debug=True
150
- )
 
2
  import base64
3
  import gradio as gr
4
  from mistralai import Mistral
5
+ from mistralai.models import OCRResponse
6
+ from pathlib import Path
7
+ from enum import Enum
8
+ from pydantic import BaseModel
9
+ import pycountry
10
+ import json
11
 
12
  # Initialize Mistral client with API key
13
  api_key = os.environ.get("MISTRAL_API_KEY")
 
28
  try:
29
  ocr_response = client.ocr.process(
30
  model="mistral-ocr-latest",
31
+ document={"type": "document_url", "document_url": pdf_url},
32
+ include_image_base64=True
 
 
33
  )
34
+ return ocr_response.pages[0].markdown if ocr_response.pages else str(ocr_response)
35
  except Exception as e:
36
  return f"Error: {str(e)}"
37
 
38
  # OCR with Uploaded PDF
39
def ocr_uploaded_pdf(pdf_file):
    """Upload a local PDF to Mistral, run OCR on it, and return markdown text.

    Args:
        pdf_file: Object whose ``name`` attribute is the path of the PDF on
            disk (the value handed over by the "Upload PDF" file input).

    Returns:
        The markdown of the first OCR page, the stringified response when the
        response has no pages, or an ``"Error: ..."`` string on failure.
    """
    # Nothing was uploaded: report it clearly instead of surfacing an
    # AttributeError message from ``pdf_file.name``.
    if pdf_file is None:
        return "Error: no PDF file provided"
    try:
        # Use a context manager so the file handle is closed as soon as the
        # upload call returns (the original left it open).
        with open(pdf_file.name, "rb") as pdf_handle:
            uploaded_pdf = client.files.upload(
                file={"file_name": pdf_file.name, "content": pdf_handle},
                purpose="ocr",
            )
        # The OCR endpoint is given a URL, so request a signed URL for the
        # uploaded file.
        signed_url = client.files.get_signed_url(file_id=uploaded_pdf.id, expiry=3600)
        ocr_response = client.ocr.process(
            model="mistral-ocr-latest",
            document={"type": "document_url", "document_url": signed_url.url},
            include_image_base64=True,
        )
        # NOTE(review): only the first page's markdown is returned; later
        # pages of a multi-page PDF are dropped -- confirm this is intended.
        return ocr_response.pages[0].markdown if ocr_response.pages else str(ocr_response)
    except Exception as e:
        # Errors are returned as text so they show up in the output textbox.
        return f"Error: {str(e)}"
54
 
 
57
  try:
58
  ocr_response = client.ocr.process(
59
  model="mistral-ocr-latest",
60
+ document={"type": "image_url", "image_url": image_url}
 
 
 
61
  )
62
+ return ocr_response.pages[0].markdown if ocr_response.pages else str(ocr_response)
63
  except Exception as e:
64
  return f"Error: {str(e)}"
65
 
 
71
  return base64_image
72
  ocr_response = client.ocr.process(
73
  model="mistral-ocr-latest",
74
+ document={"type": "image_url", "image_url": f"data:image/jpeg;base64,{base64_image}"}
 
 
 
75
  )
76
+ return ocr_response.pages[0].markdown if ocr_response.pages else str(ocr_response)
77
  except Exception as e:
78
  return f"Error: {str(e)}"
79
 
 
81
  def document_understanding(doc_url, question):
82
  try:
83
  messages = [
84
+ {"role": "user", "content": [
85
+ {"type": "text", "text": question},
86
+ {"type": "document_url", "document_url": doc_url}
87
+ ]}
 
 
 
88
  ]
89
  chat_response = client.chat.complete(
90
  model="mistral-small-latest",
 
94
  except Exception as e:
95
  return f"Error: {str(e)}"
96
 
97
+ # Structured OCR Setup
98
# Two-letter (alpha_2) code -> language name, for every pycountry language
# entry that has an alpha_2 attribute.
languages = {
    entry.alpha_2: entry.name
    for entry in pycountry.languages
    if hasattr(entry, 'alpha_2')
}


class LanguageMeta(type(Enum)):
    """Enum metaclass that injects one member per entry of ``languages``."""

    def __new__(metacls, cls, bases, classdict):
        # Member name is the upper-cased language name with spaces replaced
        # by underscores; member value is the language name itself.
        for language_name in languages.values():
            member_name = language_name.upper().replace(' ', '_')
            classdict[member_name] = language_name
        return super().__new__(metacls, cls, bases, classdict)


class Language(Enum, metaclass=LanguageMeta):
    """Enum of language names; its members are generated by ``LanguageMeta``."""
108
+
109
class StructuredOCR(BaseModel):
    """Validated container for the structured OCR result of a single file."""

    # Name of the processed file.
    file_name: str
    # Topics reported for the document.
    topics: list[str]
    # Languages reported for the document, as ``Language`` members.
    languages: list[Language]
    # OCR contents organised as a dictionary.
    ocr_contents: dict
114
+
115
def structured_ocr(image_file):
    """OCR an uploaded image and return the result as a structured JSON string.

    The image is OCR'd with ``mistral-ocr-latest``; the image and its OCR
    markdown are then sent to ``pixtral-12b-latest`` with a JSON response
    format, and the reply is validated through ``StructuredOCR``.

    Args:
        image_file: Object whose ``name`` attribute is the path of the image
            on disk (the value handed over by the "Upload Image" file input).

    Returns:
        A JSON document (4-space indent) with ``file_name``, ``topics``,
        ``languages`` and ``ocr_contents``, or an ``"Error: ..."`` string on
        failure.
    """
    # Nothing was uploaded: report it clearly instead of surfacing an
    # AttributeError message from ``image_file.name``.
    if image_file is None:
        return "Error: no image file provided"
    try:
        import mimetypes

        image_path = Path(image_file.name)
        encoded_image = encode_image(image_path)
        # NOTE(review): this substring test assumes encode_image reports
        # failures as text containing "Error" -- confirm against encode_image.
        if "Error" in encoded_image:
            return encoded_image

        # Label the data URL with the image's real type (.png uploads are
        # accepted too); fall back to JPEG when it cannot be guessed.
        guessed_type, _ = mimetypes.guess_type(image_path.name)
        if guessed_type and guessed_type.startswith("image/"):
            mime_type = guessed_type
        else:
            mime_type = "image/jpeg"
        base64_data_url = f"data:{mime_type};base64,{encoded_image}"

        # OCR processing
        image_response = client.ocr.process(
            document={"type": "image_url", "image_url": base64_data_url},
            model="mistral-ocr-latest",
        )
        image_ocr_markdown = image_response.pages[0].markdown

        # Structured output with pixtral-12b-latest
        chat_response = client.chat.complete(
            model="pixtral-12b-latest",
            messages=[{
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": base64_data_url},
                    {"type": "text", "text": (
                        f"This is the image's OCR in markdown:\n<BEGIN_IMAGE_OCR>\n{image_ocr_markdown}\n<END_IMAGE_OCR>.\n"
                        "Convert this into a structured JSON response with the OCR contents in a sensible dictionary."
                    )},
                ],
            }],
            response_format={"type": "json_object"},
            temperature=0,
        )

        response_dict = json.loads(chat_response.choices[0].message.content)

        # Language members are named in upper case (e.g. ENGLISH) but carry
        # the plain language name as their value, so look them up by value;
        # Language["English"] would raise KeyError.
        detected_languages = [
            Language(name)
            for name in response_dict.get("languages", ["English"])
            if name in languages.values()
        ]
        structured_response = StructuredOCR.parse_obj({
            "file_name": image_path.name,
            "topics": response_dict.get("topics", []),
            "languages": detected_languages,
            "ocr_contents": response_dict.get("ocr_contents", {}),
        })

        # Enum members are not JSON serializable; emit their string values.
        payload = structured_response.dict()
        payload["languages"] = [
            language.value for language in structured_response.languages
        ]
        return json.dumps(payload, indent=4)
    except Exception as e:
        # Errors are returned as text so they show up in the output textbox.
        return f"Error: {str(e)}"
157
+
158
  # Gradio Interface
159
+ with gr.Blocks(title="Mistral OCR & Structured Output App") as demo:
160
+ gr.Markdown("# Mistral OCR & Structured Output App")
161
+ gr.Markdown("Extract text from PDFs and images, ask questions about documents, or get structured JSON output!")
162
 
163
  with gr.Tab("OCR with PDF URL"):
164
  pdf_url_input = gr.Textbox(label="PDF URL", placeholder="e.g., https://arxiv.org/pdf/2201.04234")
165
+ pdf_url_output = gr.Textbox(label="OCR Result (Markdown)")
166
  pdf_url_button = gr.Button("Process PDF")
167
  pdf_url_button.click(ocr_pdf_url, inputs=pdf_url_input, outputs=pdf_url_output)
168
 
169
  with gr.Tab("OCR with Uploaded PDF"):
170
  pdf_file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
171
+ pdf_file_output = gr.Textbox(label="OCR Result (Markdown)")
172
  pdf_file_button = gr.Button("Process Uploaded PDF")
173
  pdf_file_button.click(ocr_uploaded_pdf, inputs=pdf_file_input, outputs=pdf_file_output)
174
 
175
  with gr.Tab("OCR with Image URL"):
176
  image_url_input = gr.Textbox(label="Image URL", placeholder="e.g., https://example.com/image.jpg")
177
+ image_url_output = gr.Textbox(label="OCR Result (Markdown)")
178
  image_url_button = gr.Button("Process Image")
179
  image_url_button.click(ocr_image_url, inputs=image_url_input, outputs=image_url_output)
180
 
181
  with gr.Tab("OCR with Uploaded Image"):
182
  image_file_input = gr.File(label="Upload Image", file_types=[".jpg", ".png"])
183
+ image_file_output = gr.Textbox(label="OCR Result (Markdown)")
184
  image_file_button = gr.Button("Process Uploaded Image")
185
  image_file_button.click(ocr_uploaded_image, inputs=image_file_input, outputs=image_file_output)
186
 
 
191
  doc_button = gr.Button("Ask Question")
192
  doc_button.click(document_understanding, inputs=[doc_url_input, question_input], outputs=doc_output)
193
 
194
+ with gr.Tab("Structured OCR"):
195
+ struct_image_input = gr.File(label="Upload Image", file_types=[".jpg", ".png"])
196
+ struct_output = gr.Textbox(label="Structured JSON Output")
197
+ struct_button = gr.Button("Get Structured Output")
198
+ struct_button.click(structured_ocr, inputs=struct_image_input, outputs=struct_output)
199
+
200
  # Launch the app
201
+ demo.launch(share=True, debug=True)