Svngoku committed on
Commit
be2e6ae
·
verified ·
1 Parent(s): 86ba735

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +74 -25
app.py CHANGED
@@ -11,6 +11,8 @@ import pymupdf as fitz
11
  import logging
12
  from tenacity import retry, stop_after_attempt, wait_exponential
13
  from concurrent.futures import ThreadPoolExecutor
 
 
14
 
15
  # Constants
16
  SUPPORTED_IMAGE_TYPES = [".jpg", ".png", ".jpeg"]
@@ -62,13 +64,16 @@ class OCRProcessor:
62
 
63
  try:
64
  if isinstance(file_input, str) and file_input.startswith("http"):
65
- response = requests.get(file_input, timeout=10)
 
66
  response.raise_for_status()
67
  with open(file_path, 'wb') as f:
68
  f.write(response.content)
69
  elif isinstance(file_input, str) and os.path.exists(file_input):
 
70
  shutil.copy2(file_input, file_path)
71
  else:
 
72
  with open(file_path, 'wb') as f:
73
  if hasattr(file_input, 'read'):
74
  shutil.copyfileobj(file_input, f)
@@ -76,6 +81,7 @@ class OCRProcessor:
76
  f.write(file_input)
77
  if not os.path.exists(file_path):
78
  raise FileNotFoundError(f"Failed to save file at {file_path}")
 
79
  return file_path
80
  except Exception as e:
81
  logger.error(f"Error saving file {filename}: {str(e)}")
@@ -127,11 +133,18 @@ class OCRProcessor:
127
  @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
128
  def _call_ocr_api(self, encoded_image: str) -> OCRResponse:
129
  base64_url = f"data:image/png;base64,{encoded_image}"
130
- return self.client.ocr.process(
131
- model="mistral-ocr-latest",
132
- document=ImageURLChunk(image_url=base64_url),
133
- include_image_base64=True
134
- )
 
 
 
 
 
 
 
135
 
136
  def ocr_uploaded_pdf(self, pdf_file: Union[str, bytes]) -> Tuple[str, List[str]]:
137
  file_name = getattr(pdf_file, 'name', f"pdf_{int(time.time())}.pdf")
@@ -139,7 +152,6 @@ class OCRProcessor:
139
  try:
140
  self._check_file_size(pdf_file)
141
  pdf_path = self._save_uploaded_file(pdf_file, file_name)
142
- logger.info(f"Saved PDF to: {pdf_path}")
143
 
144
  if not os.path.exists(pdf_path):
145
  raise FileNotFoundError(f"Saved PDF not found at: {pdf_path}")
@@ -148,14 +160,13 @@ class OCRProcessor:
148
  if not image_data:
149
  raise ValueError("No pages converted from PDF")
150
 
151
- # Process each page with OCR
152
  ocr_results = []
153
- for _, encoded in image_data:
 
154
  response = self._call_ocr_api(encoded)
155
- markdown = self._get_combined_markdown(response)
156
- ocr_results.append(markdown)
157
 
158
- image_paths = [path for path, _ in image_data]
159
  return "\n\n".join(ocr_results), image_paths
160
  except Exception as e:
161
  return self._handle_error("uploaded PDF processing", e), []
@@ -174,12 +185,12 @@ class OCRProcessor:
174
  raise ValueError("No pages converted from PDF")
175
 
176
  ocr_results = []
177
- for _, encoded in image_data:
 
178
  response = self._call_ocr_api(encoded)
179
- markdown = self._get_combined_markdown(response)
180
- ocr_results.append(markdown)
181
 
182
- image_paths = [path for path, _ in image_data]
183
  return "\n\n".join(ocr_results), image_paths
184
  except Exception as e:
185
  return self._handle_error("PDF URL processing", e), []
@@ -192,16 +203,51 @@ class OCRProcessor:
192
  image_path = self._save_uploaded_file(image_file, file_name)
193
  encoded_image = self._encode_image(image_path)
194
  response = self._call_ocr_api(encoded_image)
195
- return self._get_combined_markdown(response), image_path
196
  except Exception as e:
197
  return self._handle_error("image processing", e), None
198
 
199
  @staticmethod
200
- def _get_combined_markdown(response: OCRResponse) -> str:
201
- return "\n\n".join(
202
- page.markdown for page in response.pages
203
- if page.markdown.strip()
204
- ) or "No text detected"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
 
206
  @staticmethod
207
  def _handle_error(context: str, error: Exception) -> str:
@@ -276,11 +322,14 @@ def create_interface():
276
  def process_pdf(processor, pdf_file, pdf_url):
277
  if not processor:
278
  return "Please set API key first", []
279
- if pdf_file:
 
 
280
  return processor.ocr_uploaded_pdf(pdf_file)
281
- elif pdf_url:
 
282
  return processor.ocr_pdf_url(pdf_url)
283
- return "Please upload a PDF or provide a URL", []
284
 
285
  process_pdf_btn.click(
286
  fn=process_pdf,
 
11
  import logging
12
  from tenacity import retry, stop_after_attempt, wait_exponential
13
  from concurrent.futures import ThreadPoolExecutor
14
+ import socket
15
+ from requests.exceptions import ConnectionError, Timeout
16
 
17
  # Constants
18
  SUPPORTED_IMAGE_TYPES = [".jpg", ".png", ".jpeg"]
 
64
 
65
  try:
66
  if isinstance(file_input, str) and file_input.startswith("http"):
67
+ logger.info(f"Downloading from URL: {file_input}")
68
+ response = requests.get(file_input, timeout=30)
69
  response.raise_for_status()
70
  with open(file_path, 'wb') as f:
71
  f.write(response.content)
72
  elif isinstance(file_input, str) and os.path.exists(file_input):
73
+ logger.info(f"Copying local file: {file_input}")
74
  shutil.copy2(file_input, file_path)
75
  else:
76
+ logger.info(f"Saving file object: {filename}")
77
  with open(file_path, 'wb') as f:
78
  if hasattr(file_input, 'read'):
79
  shutil.copyfileobj(file_input, f)
 
81
  f.write(file_input)
82
  if not os.path.exists(file_path):
83
  raise FileNotFoundError(f"Failed to save file at {file_path}")
84
+ logger.info(f"File saved to: {file_path}")
85
  return file_path
86
  except Exception as e:
87
  logger.error(f"Error saving file {filename}: {str(e)}")
 
133
  @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
134
  def _call_ocr_api(self, encoded_image: str) -> OCRResponse:
135
  base64_url = f"data:image/png;base64,{encoded_image}"
136
+ try:
137
+ logger.info("Calling OCR API")
138
+ response = self.client.ocr.process(
139
+ model="mistral-ocr-latest",
140
+ document=ImageURLChunk(image_url=base64_url),
141
+ include_image_base64=True
142
+ )
143
+ logger.info("OCR API call successful")
144
+ return response
145
+ except (ConnectionError, Timeout, socket.error) as e:
146
+ logger.error(f"Network error during OCR API call: {str(e)}")
147
+ raise
148
 
149
  def ocr_uploaded_pdf(self, pdf_file: Union[str, bytes]) -> Tuple[str, List[str]]:
150
  file_name = getattr(pdf_file, 'name', f"pdf_{int(time.time())}.pdf")
 
152
  try:
153
  self._check_file_size(pdf_file)
154
  pdf_path = self._save_uploaded_file(pdf_file, file_name)
 
155
 
156
  if not os.path.exists(pdf_path):
157
  raise FileNotFoundError(f"Saved PDF not found at: {pdf_path}")
 
160
  if not image_data:
161
  raise ValueError("No pages converted from PDF")
162
 
 
163
  ocr_results = []
164
+ image_paths = [path for path, _ in image_data]
165
+ for i, (_, encoded) in enumerate(image_data):
166
  response = self._call_ocr_api(encoded)
167
+ markdown_with_images = self._get_combined_markdown_with_images(response, image_paths, i)
168
+ ocr_results.append(markdown_with_images)
169
 
 
170
  return "\n\n".join(ocr_results), image_paths
171
  except Exception as e:
172
  return self._handle_error("uploaded PDF processing", e), []
 
185
  raise ValueError("No pages converted from PDF")
186
 
187
  ocr_results = []
188
+ image_paths = [path for path, _ in image_data]
189
+ for i, (_, encoded) in enumerate(image_data):
190
  response = self._call_ocr_api(encoded)
191
+ markdown_with_images = self._get_combined_markdown_with_images(response, image_paths, i)
192
+ ocr_results.append(markdown_with_images)
193
 
 
194
  return "\n\n".join(ocr_results), image_paths
195
  except Exception as e:
196
  return self._handle_error("PDF URL processing", e), []
 
203
  image_path = self._save_uploaded_file(image_file, file_name)
204
  encoded_image = self._encode_image(image_path)
205
  response = self._call_ocr_api(encoded_image)
206
+ return self._get_combined_markdown_with_images(response), image_path
207
  except Exception as e:
208
  return self._handle_error("image processing", e), None
209
 
210
  @staticmethod
211
+ def _get_combined_markdown_with_images(response: OCRResponse, image_paths: List[str] = None, page_index: int = None) -> str:
212
+ markdown_parts = []
213
+ for i, page in enumerate(response.pages):
214
+ if page.markdown.strip():
215
+ markdown = page.markdown
216
+ logger.info(f"Page {i} markdown: {markdown}")
217
+ if hasattr(page, 'images') and page.images:
218
+ logger.info(f"Found {len(page.images)} images in page {i}")
219
+ for img in page.images:
220
+ if img.image_base64:
221
+ logger.info(f"Replacing image {img.id} with base64")
222
+ markdown = markdown.replace(
223
+ f"![{img.id}]({img.id})",
224
+ f"![{img.id}](data:image/png;base64,{img.image_base64})"
225
+ )
226
+ else:
227
+ logger.warning(f"No base64 data for image {img.id}")
228
+ if image_paths and page_index is not None and page_index < len(image_paths):
229
+ local_encoded = OCRProcessor._encode_image(image_paths[page_index])
230
+ markdown = markdown.replace(
231
+ f"![{img.id}]({img.id})",
232
+ f"![{img.id}](data:image/png;base64,{local_encoded})"
233
+ )
234
+ else:
235
+ logger.warning(f"No images found in page {i}")
236
+ # Replace known placeholders or append the local image
237
+ if image_paths and page_index is not None and page_index < len(image_paths):
238
+ local_encoded = OCRProcessor._encode_image(image_paths[page_index])
239
+ # Replace placeholders like img-0.jpeg
240
+ placeholder = f"img-{i}.jpeg"
241
+ if placeholder in markdown:
242
+ markdown = markdown.replace(
243
+ placeholder,
244
+ f"![Page {i} Image](data:image/png;base64,{local_encoded})"
245
+ )
246
+ else:
247
+ # Append the image if no placeholder is found
248
+ markdown += f"\n\n![Page {i} Image](data:image/png;base64,{local_encoded})"
249
+ markdown_parts.append(markdown)
250
+ return "\n\n".join(markdown_parts) or "No text or images detected"
251
 
252
  @staticmethod
253
  def _handle_error(context: str, error: Exception) -> str:
 
322
  def process_pdf(processor, pdf_file, pdf_url):
323
  if not processor:
324
  return "Please set API key first", []
325
+ logger.info(f"Received inputs - PDF file: {pdf_file}, PDF URL: {pdf_url}")
326
+ if pdf_file is not None and hasattr(pdf_file, 'name'):
327
+ logger.info(f"Processing as uploaded PDF: {pdf_file.name}")
328
  return processor.ocr_uploaded_pdf(pdf_file)
329
+ elif pdf_url and pdf_url.strip():
330
+ logger.info(f"Processing as PDF URL: {pdf_url}")
331
  return processor.ocr_pdf_url(pdf_url)
332
+ return "Please upload a PDF or provide a valid URL", []
333
 
334
  process_pdf_btn.click(
335
  fn=process_pdf,