Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -11,6 +11,8 @@ import pymupdf as fitz
|
|
11 |
import logging
|
12 |
from tenacity import retry, stop_after_attempt, wait_exponential
|
13 |
from concurrent.futures import ThreadPoolExecutor
|
|
|
|
|
14 |
|
15 |
# Constants
|
16 |
SUPPORTED_IMAGE_TYPES = [".jpg", ".png", ".jpeg"]
|
@@ -62,13 +64,16 @@ class OCRProcessor:
|
|
62 |
|
63 |
try:
|
64 |
if isinstance(file_input, str) and file_input.startswith("http"):
|
65 |
-
|
|
|
66 |
response.raise_for_status()
|
67 |
with open(file_path, 'wb') as f:
|
68 |
f.write(response.content)
|
69 |
elif isinstance(file_input, str) and os.path.exists(file_input):
|
|
|
70 |
shutil.copy2(file_input, file_path)
|
71 |
else:
|
|
|
72 |
with open(file_path, 'wb') as f:
|
73 |
if hasattr(file_input, 'read'):
|
74 |
shutil.copyfileobj(file_input, f)
|
@@ -76,6 +81,7 @@ class OCRProcessor:
|
|
76 |
f.write(file_input)
|
77 |
if not os.path.exists(file_path):
|
78 |
raise FileNotFoundError(f"Failed to save file at {file_path}")
|
|
|
79 |
return file_path
|
80 |
except Exception as e:
|
81 |
logger.error(f"Error saving file {filename}: {str(e)}")
|
@@ -127,11 +133,18 @@ class OCRProcessor:
|
|
127 |
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
|
128 |
def _call_ocr_api(self, encoded_image: str) -> OCRResponse:
|
129 |
base64_url = f"data:image/png;base64,{encoded_image}"
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
135 |
|
136 |
def ocr_uploaded_pdf(self, pdf_file: Union[str, bytes]) -> Tuple[str, List[str]]:
|
137 |
file_name = getattr(pdf_file, 'name', f"pdf_{int(time.time())}.pdf")
|
@@ -139,7 +152,6 @@ class OCRProcessor:
|
|
139 |
try:
|
140 |
self._check_file_size(pdf_file)
|
141 |
pdf_path = self._save_uploaded_file(pdf_file, file_name)
|
142 |
-
logger.info(f"Saved PDF to: {pdf_path}")
|
143 |
|
144 |
if not os.path.exists(pdf_path):
|
145 |
raise FileNotFoundError(f"Saved PDF not found at: {pdf_path}")
|
@@ -148,14 +160,13 @@ class OCRProcessor:
|
|
148 |
if not image_data:
|
149 |
raise ValueError("No pages converted from PDF")
|
150 |
|
151 |
-
# Process each page with OCR
|
152 |
ocr_results = []
|
153 |
-
for
|
|
|
154 |
response = self._call_ocr_api(encoded)
|
155 |
-
|
156 |
-
ocr_results.append(
|
157 |
|
158 |
-
image_paths = [path for path, _ in image_data]
|
159 |
return "\n\n".join(ocr_results), image_paths
|
160 |
except Exception as e:
|
161 |
return self._handle_error("uploaded PDF processing", e), []
|
@@ -174,12 +185,12 @@ class OCRProcessor:
|
|
174 |
raise ValueError("No pages converted from PDF")
|
175 |
|
176 |
ocr_results = []
|
177 |
-
for
|
|
|
178 |
response = self._call_ocr_api(encoded)
|
179 |
-
|
180 |
-
ocr_results.append(
|
181 |
|
182 |
-
image_paths = [path for path, _ in image_data]
|
183 |
return "\n\n".join(ocr_results), image_paths
|
184 |
except Exception as e:
|
185 |
return self._handle_error("PDF URL processing", e), []
|
@@ -192,16 +203,51 @@ class OCRProcessor:
|
|
192 |
image_path = self._save_uploaded_file(image_file, file_name)
|
193 |
encoded_image = self._encode_image(image_path)
|
194 |
response = self._call_ocr_api(encoded_image)
|
195 |
-
return self.
|
196 |
except Exception as e:
|
197 |
return self._handle_error("image processing", e), None
|
198 |
|
199 |
@staticmethod
|
200 |
-
def
|
201 |
-
|
202 |
-
|
203 |
-
if page.markdown.strip()
|
204 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
205 |
|
206 |
@staticmethod
|
207 |
def _handle_error(context: str, error: Exception) -> str:
|
@@ -276,11 +322,14 @@ def create_interface():
|
|
276 |
def process_pdf(processor, pdf_file, pdf_url):
|
277 |
if not processor:
|
278 |
return "Please set API key first", []
|
279 |
-
|
|
|
|
|
280 |
return processor.ocr_uploaded_pdf(pdf_file)
|
281 |
-
elif pdf_url:
|
|
|
282 |
return processor.ocr_pdf_url(pdf_url)
|
283 |
-
return "Please upload a PDF or provide a URL", []
|
284 |
|
285 |
process_pdf_btn.click(
|
286 |
fn=process_pdf,
|
|
|
11 |
import logging
|
12 |
from tenacity import retry, stop_after_attempt, wait_exponential
|
13 |
from concurrent.futures import ThreadPoolExecutor
|
14 |
+
import socket
|
15 |
+
from requests.exceptions import ConnectionError, Timeout
|
16 |
|
17 |
# Constants
|
18 |
SUPPORTED_IMAGE_TYPES = [".jpg", ".png", ".jpeg"]
|
|
|
64 |
|
65 |
try:
|
66 |
if isinstance(file_input, str) and file_input.startswith("http"):
|
67 |
+
logger.info(f"Downloading from URL: {file_input}")
|
68 |
+
response = requests.get(file_input, timeout=30)
|
69 |
response.raise_for_status()
|
70 |
with open(file_path, 'wb') as f:
|
71 |
f.write(response.content)
|
72 |
elif isinstance(file_input, str) and os.path.exists(file_input):
|
73 |
+
logger.info(f"Copying local file: {file_input}")
|
74 |
shutil.copy2(file_input, file_path)
|
75 |
else:
|
76 |
+
logger.info(f"Saving file object: {filename}")
|
77 |
with open(file_path, 'wb') as f:
|
78 |
if hasattr(file_input, 'read'):
|
79 |
shutil.copyfileobj(file_input, f)
|
|
|
81 |
f.write(file_input)
|
82 |
if not os.path.exists(file_path):
|
83 |
raise FileNotFoundError(f"Failed to save file at {file_path}")
|
84 |
+
logger.info(f"File saved to: {file_path}")
|
85 |
return file_path
|
86 |
except Exception as e:
|
87 |
logger.error(f"Error saving file {filename}: {str(e)}")
|
|
|
133 |
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
|
134 |
def _call_ocr_api(self, encoded_image: str) -> OCRResponse:
|
135 |
base64_url = f"data:image/png;base64,{encoded_image}"
|
136 |
+
try:
|
137 |
+
logger.info("Calling OCR API")
|
138 |
+
response = self.client.ocr.process(
|
139 |
+
model="mistral-ocr-latest",
|
140 |
+
document=ImageURLChunk(image_url=base64_url),
|
141 |
+
include_image_base64=True
|
142 |
+
)
|
143 |
+
logger.info("OCR API call successful")
|
144 |
+
return response
|
145 |
+
except (ConnectionError, Timeout, socket.error) as e:
|
146 |
+
logger.error(f"Network error during OCR API call: {str(e)}")
|
147 |
+
raise
|
148 |
|
149 |
def ocr_uploaded_pdf(self, pdf_file: Union[str, bytes]) -> Tuple[str, List[str]]:
|
150 |
file_name = getattr(pdf_file, 'name', f"pdf_{int(time.time())}.pdf")
|
|
|
152 |
try:
|
153 |
self._check_file_size(pdf_file)
|
154 |
pdf_path = self._save_uploaded_file(pdf_file, file_name)
|
|
|
155 |
|
156 |
if not os.path.exists(pdf_path):
|
157 |
raise FileNotFoundError(f"Saved PDF not found at: {pdf_path}")
|
|
|
160 |
if not image_data:
|
161 |
raise ValueError("No pages converted from PDF")
|
162 |
|
|
|
163 |
ocr_results = []
|
164 |
+
image_paths = [path for path, _ in image_data]
|
165 |
+
for i, (_, encoded) in enumerate(image_data):
|
166 |
response = self._call_ocr_api(encoded)
|
167 |
+
markdown_with_images = self._get_combined_markdown_with_images(response, image_paths, i)
|
168 |
+
ocr_results.append(markdown_with_images)
|
169 |
|
|
|
170 |
return "\n\n".join(ocr_results), image_paths
|
171 |
except Exception as e:
|
172 |
return self._handle_error("uploaded PDF processing", e), []
|
|
|
185 |
raise ValueError("No pages converted from PDF")
|
186 |
|
187 |
ocr_results = []
|
188 |
+
image_paths = [path for path, _ in image_data]
|
189 |
+
for i, (_, encoded) in enumerate(image_data):
|
190 |
response = self._call_ocr_api(encoded)
|
191 |
+
markdown_with_images = self._get_combined_markdown_with_images(response, image_paths, i)
|
192 |
+
ocr_results.append(markdown_with_images)
|
193 |
|
|
|
194 |
return "\n\n".join(ocr_results), image_paths
|
195 |
except Exception as e:
|
196 |
return self._handle_error("PDF URL processing", e), []
|
|
|
203 |
image_path = self._save_uploaded_file(image_file, file_name)
|
204 |
encoded_image = self._encode_image(image_path)
|
205 |
response = self._call_ocr_api(encoded_image)
|
206 |
+
return self._get_combined_markdown_with_images(response), image_path
|
207 |
except Exception as e:
|
208 |
return self._handle_error("image processing", e), None
|
209 |
|
210 |
@staticmethod
|
211 |
+
def _get_combined_markdown_with_images(response: OCRResponse, image_paths: List[str] = None, page_index: int = None) -> str:
|
212 |
+
markdown_parts = []
|
213 |
+
for i, page in enumerate(response.pages):
|
214 |
+
if page.markdown.strip():
|
215 |
+
markdown = page.markdown
|
216 |
+
logger.info(f"Page {i} markdown: {markdown}")
|
217 |
+
if hasattr(page, 'images') and page.images:
|
218 |
+
logger.info(f"Found {len(page.images)} images in page {i}")
|
219 |
+
for img in page.images:
|
220 |
+
if img.image_base64:
|
221 |
+
logger.info(f"Replacing image {img.id} with base64")
|
222 |
+
markdown = markdown.replace(
|
223 |
+
f"",
|
224 |
+
f""
|
225 |
+
)
|
226 |
+
else:
|
227 |
+
logger.warning(f"No base64 data for image {img.id}")
|
228 |
+
if image_paths and page_index is not None and page_index < len(image_paths):
|
229 |
+
local_encoded = OCRProcessor._encode_image(image_paths[page_index])
|
230 |
+
markdown = markdown.replace(
|
231 |
+
f"",
|
232 |
+
f""
|
233 |
+
)
|
234 |
+
else:
|
235 |
+
logger.warning(f"No images found in page {i}")
|
236 |
+
# Replace known placeholders or append the local image
|
237 |
+
if image_paths and page_index is not None and page_index < len(image_paths):
|
238 |
+
local_encoded = OCRProcessor._encode_image(image_paths[page_index])
|
239 |
+
# Replace placeholders like img-0.jpeg
|
240 |
+
placeholder = f"img-{i}.jpeg"
|
241 |
+
if placeholder in markdown:
|
242 |
+
markdown = markdown.replace(
|
243 |
+
placeholder,
|
244 |
+
f""
|
245 |
+
)
|
246 |
+
else:
|
247 |
+
# Append the image if no placeholder is found
|
248 |
+
markdown += f"\n\n"
|
249 |
+
markdown_parts.append(markdown)
|
250 |
+
return "\n\n".join(markdown_parts) or "No text or images detected"
|
251 |
|
252 |
@staticmethod
|
253 |
def _handle_error(context: str, error: Exception) -> str:
|
|
|
322 |
def process_pdf(processor, pdf_file, pdf_url):
|
323 |
if not processor:
|
324 |
return "Please set API key first", []
|
325 |
+
logger.info(f"Received inputs - PDF file: {pdf_file}, PDF URL: {pdf_url}")
|
326 |
+
if pdf_file is not None and hasattr(pdf_file, 'name'):
|
327 |
+
logger.info(f"Processing as uploaded PDF: {pdf_file.name}")
|
328 |
return processor.ocr_uploaded_pdf(pdf_file)
|
329 |
+
elif pdf_url and pdf_url.strip():
|
330 |
+
logger.info(f"Processing as PDF URL: {pdf_url}")
|
331 |
return processor.ocr_pdf_url(pdf_url)
|
332 |
+
return "Please upload a PDF or provide a valid URL", []
|
333 |
|
334 |
process_pdf_btn.click(
|
335 |
fn=process_pdf,
|