Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
import os
|
2 |
import io
|
3 |
-
import re
|
4 |
import logging
|
5 |
import subprocess
|
6 |
from datetime import datetime
|
@@ -19,9 +19,9 @@ from werkzeug.utils import secure_filename
|
|
19 |
import requests # For requests.exceptions.HTTPError
|
20 |
from requests.exceptions import HTTPError as RequestsHTTPError # Specific import for clarity
|
21 |
|
22 |
-
|
23 |
-
import pdf2image
|
24 |
-
from pdf2image import convert_from_path, convert_from_bytes
|
25 |
# from pdf2image.exceptions import ... # If you need to catch specific pdf2image errors
|
26 |
|
27 |
import pytesseract
|
@@ -108,7 +108,6 @@ def upload_image_to_hf_stream(image_pil, filename_base, page_num_for_log=""):
|
|
108 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
|
109 |
repo_filename = f"images/{filename_base}_{page_num_for_log}_{timestamp}.png"
|
110 |
|
111 |
-
# Ensure UPLOAD_FOLDER exists before writing temp file
|
112 |
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
|
113 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".png", dir=app.config['UPLOAD_FOLDER']) as tmp_file:
|
114 |
temp_image_path = tmp_file.name
|
@@ -129,143 +128,86 @@ def upload_image_to_hf_stream(image_pil, filename_base, page_num_for_log=""):
|
|
129 |
try: os.remove(temp_image_path)
|
130 |
except OSError as ose: logger.error(f"Error removing temp image file {temp_image_path}: {ose}")
|
131 |
|
132 |
-
|
133 |
-
def format_page_text_to_markdown_chunk(page_text_content):
|
134 |
-
chunk_md = ""
|
135 |
-
page_text_content = re.sub(r'\n\s*\n+', '\n\n', page_text_content.strip())
|
136 |
-
lines = page_text_content.split('\n')
|
137 |
-
is_in_list = False
|
138 |
-
for line_text in lines:
|
139 |
-
line_stripped = line_text.strip()
|
140 |
-
if not line_stripped:
|
141 |
-
chunk_md += "\n"
|
142 |
-
is_in_list = False
|
143 |
-
continue
|
144 |
-
list_match = re.match(r'^\s*(?:(?:\d+\.)|[*+-])\s+(.*)', line_stripped)
|
145 |
-
is_heading_candidate = line_stripped.isupper() and 5 < len(line_stripped) < 100
|
146 |
-
if is_heading_candidate and not list_match:
|
147 |
-
chunk_md += f"## {line_stripped}\n\n"
|
148 |
-
is_in_list = False
|
149 |
-
elif list_match:
|
150 |
-
list_item_text = list_match.group(1)
|
151 |
-
chunk_md += f"- {list_item_text}\n"
|
152 |
-
is_in_list = True
|
153 |
-
else:
|
154 |
-
if is_in_list: chunk_md += "\n"
|
155 |
-
chunk_md += f"{line_text}\n\n"
|
156 |
-
is_in_list = False
|
157 |
-
return re.sub(r'\n\s*\n+', '\n\n', chunk_md.strip()) + "\n\n"
|
158 |
-
|
159 |
|
160 |
# --- Main PDF Processing Logic (Generator Function for Streaming) ---
|
161 |
|
162 |
def generate_pdf_conversion_stream(pdf_input_source_path_or_url):
|
163 |
try:
|
164 |
-
yield yield_message("markdown_replace", {"content": "# Extracted
|
165 |
time.sleep(0.01)
|
166 |
|
167 |
-
|
168 |
-
|
169 |
|
170 |
source_is_url = isinstance(pdf_input_source_path_or_url, str) and \
|
171 |
pdf_input_source_path_or_url.startswith(('http://', 'https://'))
|
172 |
|
173 |
-
pdf_handle_for_text = None
|
174 |
-
pdf_bytes_for_images = None
|
175 |
-
|
176 |
if source_is_url:
|
|
|
|
|
177 |
try:
|
178 |
response = requests.get(pdf_input_source_path_or_url, stream=False, timeout=60)
|
179 |
response.raise_for_status()
|
180 |
-
|
181 |
-
|
182 |
-
yield yield_message("status", {"message": f"PDF downloaded from URL ({len(
|
183 |
time.sleep(0.01)
|
184 |
except RequestsHTTPError as e:
|
185 |
-
logger.error(f"URL fetch HTTP error
|
186 |
yield yield_message("error", {"message": f"Error fetching PDF from URL (HTTP {e.response.status_code if e.response else 'N/A'}): {e.response.reason if e.response else str(e)}"})
|
187 |
return
|
188 |
except requests.RequestException as e:
|
189 |
-
logger.error(f"URL fetch network error
|
190 |
yield yield_message("error", {"message": f"Network error fetching PDF from URL: {str(e)}"})
|
191 |
return
|
192 |
else:
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
with pdfplumber.open(pdf_handle_for_text) as pdf:
|
198 |
-
total_text_pages = len(pdf.pages)
|
199 |
-
yield yield_message("status", {"message": f"Found {total_text_pages} page(s) for text extraction."})
|
200 |
-
time.sleep(0.01)
|
201 |
|
202 |
-
|
203 |
-
yield yield_message("status", {"message": f"Extracting text from page {i+1}/{total_text_pages}..."})
|
204 |
-
time.sleep(0.01)
|
205 |
-
|
206 |
-
page_text = page.extract_text(layout=True, x_density=1, y_density=1) or ""
|
207 |
-
|
208 |
-
# Removed table extraction logic here
|
209 |
-
# page_tables_md = "" # No longer needed
|
210 |
-
# tables = page.extract_tables() # No longer needed
|
211 |
-
# if tables: # No longer needed
|
212 |
-
# ... (table processing code removed) ...
|
213 |
-
|
214 |
-
formatted_page_text_md = format_page_text_to_markdown_chunk(page_text)
|
215 |
-
|
216 |
-
yield yield_message("markdown_chunk", {"content": formatted_page_text_md})
|
217 |
-
# if page_tables_md: # No longer needed, as page_tables_md is not created
|
218 |
-
# yield yield_message("markdown_chunk", {"content": page_tables_md})
|
219 |
-
time.sleep(0.01)
|
220 |
-
except Exception as e:
|
221 |
-
logger.error(f"Error during PDF text extraction: {str(e)}", exc_info=True) # Updated log message
|
222 |
-
yield yield_message("error", {"message": f"Error during text extraction: {str(e)}"})
|
223 |
|
|
|
224 |
if not check_poppler():
|
225 |
yield yield_message("error", {"message": "Poppler (for image extraction) not found or not working."})
|
226 |
else:
|
227 |
-
yield yield_message("status", {"message": "Starting image extraction..."})
|
228 |
-
|
|
|
229 |
if not HF_TOKEN:
|
230 |
yield yield_message("markdown_chunk", {"content": "**Note:** `HF_TOKEN` not set. Images will be described but not uploaded.\n\n"})
|
231 |
|
232 |
time.sleep(0.01)
|
233 |
-
extracted_pil_images_overall_count = 0
|
234 |
try:
|
235 |
-
|
236 |
-
|
237 |
-
image_source_for_convert = pdf_bytes_for_images
|
238 |
-
logger.info("Using downloaded bytes for image conversion.")
|
239 |
-
elif not source_is_url:
|
240 |
-
image_source_for_convert = pdf_input_source_path_or_url
|
241 |
-
logger.info("Using local file path for image conversion.")
|
242 |
-
|
243 |
-
if image_source_for_convert:
|
244 |
-
try:
|
245 |
pdf_info = None
|
246 |
-
if
|
247 |
-
pdf_info = pdf2image.pdfinfo_from_bytes(
|
248 |
else:
|
249 |
-
pdf_info = pdf2image.pdfinfo_from_path(
|
250 |
|
251 |
num_image_pages = pdf_info.get("Pages", 0)
|
252 |
-
yield yield_message("status", {"message": f"PDF has {num_image_pages} page(s) for
|
253 |
|
254 |
batch_size = 1
|
255 |
for page_idx_start in range(1, num_image_pages + 1, batch_size):
|
256 |
page_idx_end = min(page_idx_start + batch_size - 1, num_image_pages)
|
257 |
-
yield yield_message("status", {"message": f"
|
258 |
time.sleep(0.01)
|
259 |
|
260 |
page_images_pil = []
|
261 |
-
if
|
262 |
-
page_images_pil = convert_from_bytes(
|
263 |
else:
|
264 |
-
page_images_pil = convert_from_path(
|
265 |
|
266 |
for img_idx_in_batch, img_pil in enumerate(page_images_pil):
|
267 |
extracted_pil_images_overall_count += 1
|
268 |
-
current_pdf_page_num = page_idx_start + img_idx_in_batch
|
269 |
page_num_for_log = f"pdfpage_{current_pdf_page_num}"
|
270 |
|
271 |
yield yield_message("status", {"message": f"Processing image {extracted_pil_images_overall_count} (from PDF page {current_pdf_page_num}) (OCR & Upload)..."})
|
@@ -275,15 +217,16 @@ def generate_pdf_conversion_stream(pdf_input_source_path_or_url):
|
|
275 |
try:
|
276 |
ocr_text = pytesseract.image_to_string(img_pil).strip()
|
277 |
if ocr_text: yield yield_message("status", {"message": f" OCR successful for image {extracted_pil_images_overall_count}."})
|
|
|
278 |
except Exception as ocr_e:
|
279 |
logger.error(f"OCR error for image {extracted_pil_images_overall_count}: {str(ocr_e)}")
|
280 |
ocr_text = f"OCR failed: {str(ocr_e)}"
|
281 |
|
282 |
-
image_md_chunk = ""
|
283 |
if HF_TOKEN:
|
284 |
-
image_url_or_error = upload_image_to_hf_stream(img_pil, "
|
285 |
if isinstance(image_url_or_error, str) and not image_url_or_error.startswith("Error"):
|
286 |
-
image_md_chunk += f"![Image {
|
287 |
yield yield_message("status", {"message": f" Image {extracted_pil_images_overall_count} uploaded."})
|
288 |
else:
|
289 |
image_md_chunk += f"**Image {extracted_pil_images_overall_count} (Upload Error):** {str(image_url_or_error)}\n\n"
|
@@ -292,49 +235,54 @@ def generate_pdf_conversion_stream(pdf_input_source_path_or_url):
|
|
292 |
image_md_chunk += f"**Image {extracted_pil_images_overall_count} (not uploaded due to missing HF_TOKEN)**\n"
|
293 |
|
294 |
if ocr_text:
|
295 |
-
image_md_chunk += f"**
|
|
|
|
|
296 |
|
297 |
yield yield_message("image_md", {"content": image_md_chunk})
|
298 |
time.sleep(0.01)
|
299 |
except Exception as e_img_info:
|
300 |
logger.error(f"Could not get PDF info for image batching or during batched conversion: {e_img_info}", exc_info=True)
|
301 |
-
yield yield_message("error", {"message": f"Error preparing for image extraction: {e_img_info}. Trying bulk."})
|
302 |
# Fallback to bulk conversion
|
303 |
bulk_images_pil = []
|
304 |
-
if
|
305 |
-
bulk_images_pil = convert_from_bytes(
|
306 |
else:
|
307 |
-
bulk_images_pil = convert_from_path(
|
308 |
|
309 |
-
yield yield_message("status", {"message": f"Fallback:
|
310 |
for i, img_pil in enumerate(bulk_images_pil):
|
311 |
extracted_pil_images_overall_count +=1
|
312 |
-
page_num_for_log = f"bulk_image_{i+1}"
|
313 |
-
yield yield_message("status", {"message": f"Processing image {extracted_pil_images_overall_count} (bulk) (OCR & Upload)..."})
|
314 |
ocr_text = ""
|
315 |
try: ocr_text = pytesseract.image_to_string(img_pil).strip()
|
316 |
except Exception as e: ocr_text = f"OCR Error: {e}"
|
317 |
|
318 |
-
image_md_chunk = f"
|
319 |
if HF_TOKEN:
|
320 |
-
image_url_or_error = upload_image_to_hf_stream(img_pil, "
|
321 |
if isinstance(image_url_or_error, str) and not image_url_or_error.startswith("Error"):
|
322 |
-
image_md_chunk
|
323 |
else:
|
324 |
image_md_chunk += f"**Upload Error:** {str(image_url_or_error)}\n"
|
325 |
-
|
326 |
-
|
|
|
|
|
|
|
327 |
yield yield_message("image_md", {"content": image_md_chunk})
|
328 |
time.sleep(0.01)
|
329 |
|
330 |
else:
|
331 |
-
yield yield_message("status", {"message": "No valid source for image extraction."})
|
332 |
|
333 |
except Exception as e:
|
334 |
-
logger.error(f"Error during image extraction/processing: {str(e)}", exc_info=True)
|
335 |
-
yield yield_message("error", {"message": f"Error during image extraction: {str(e)}"})
|
336 |
|
337 |
-
yield yield_message("final_status", {"message": "
|
338 |
|
339 |
except Exception as e:
|
340 |
logger.error(f"Unhandled error in PDF conversion stream: {str(e)}", exc_info=True)
|
@@ -366,12 +314,13 @@ def process_pdf_stream():
|
|
366 |
|
367 |
filename = secure_filename(pdf_file.filename)
|
368 |
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
|
|
|
369 |
fd, temp_path = tempfile.mkstemp(suffix=".pdf", prefix="upload_", dir=app.config['UPLOAD_FOLDER'])
|
370 |
-
os.close(fd)
|
371 |
-
pdf_file.save(temp_path)
|
372 |
-
outer_temp_pdf_path = temp_path
|
373 |
logger.info(f"Uploaded PDF saved to temporary path: {outer_temp_pdf_path}")
|
374 |
-
pdf_input_source_for_generator = outer_temp_pdf_path
|
375 |
yield yield_message("status", {"message": f"Processing uploaded PDF: {filename}"})
|
376 |
time.sleep(0.01)
|
377 |
|
@@ -381,7 +330,7 @@ def process_pdf_stream():
|
|
381 |
yield yield_message("error", {"message": "Invalid URL scheme. Must be http or https."})
|
382 |
return
|
383 |
|
384 |
-
pdf_input_source_for_generator = unquoted_url
|
385 |
yield yield_message("status", {"message": f"Preparing to process PDF from URL: {unquoted_url}"})
|
386 |
time.sleep(0.01)
|
387 |
else:
|
@@ -408,6 +357,6 @@ def process_pdf_stream():
|
|
408 |
# --- Main Execution ---
|
409 |
if __name__ == '__main__':
|
410 |
if not check_poppler():
|
411 |
-
logger.warning("Poppler utilities might not be installed correctly.
|
412 |
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
|
413 |
app.run(host='0.0.0.0', port=int(os.getenv("PORT", 7860)), debug=True, threaded=True)
|
|
|
1 |
import os
|
2 |
import io
|
3 |
+
import re # Still needed for some image filename manipulation if any, but not for text formatting
|
4 |
import logging
|
5 |
import subprocess
|
6 |
from datetime import datetime
|
|
|
19 |
import requests # For requests.exceptions.HTTPError
|
20 |
from requests.exceptions import HTTPError as RequestsHTTPError # Specific import for clarity
|
21 |
|
22 |
+
# pdfplumber is no longer needed
|
23 |
+
import pdf2image
|
24 |
+
from pdf2image import convert_from_path, convert_from_bytes
|
25 |
# from pdf2image.exceptions import ... # If you need to catch specific pdf2image errors
|
26 |
|
27 |
import pytesseract
|
|
|
108 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
|
109 |
repo_filename = f"images/{filename_base}_{page_num_for_log}_{timestamp}.png"
|
110 |
|
|
|
111 |
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
|
112 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".png", dir=app.config['UPLOAD_FOLDER']) as tmp_file:
|
113 |
temp_image_path = tmp_file.name
|
|
|
128 |
try: os.remove(temp_image_path)
|
129 |
except OSError as ose: logger.error(f"Error removing temp image file {temp_image_path}: {ose}")
|
130 |
|
131 |
+
# format_page_text_to_markdown_chunk function is removed as it's no longer used.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
132 |
|
133 |
# --- Main PDF Processing Logic (Generator Function for Streaming) ---
|
134 |
|
135 |
def generate_pdf_conversion_stream(pdf_input_source_path_or_url):
|
136 |
try:
|
137 |
+
yield yield_message("markdown_replace", {"content": "# Extracted Images and OCR Text\n\n"})
|
138 |
time.sleep(0.01)
|
139 |
|
140 |
+
actual_pdf_input_for_images = None
|
141 |
+
is_input_bytes = False
|
142 |
|
143 |
source_is_url = isinstance(pdf_input_source_path_or_url, str) and \
|
144 |
pdf_input_source_path_or_url.startswith(('http://', 'https://'))
|
145 |
|
|
|
|
|
|
|
146 |
if source_is_url:
|
147 |
+
yield yield_message("status", {"message": f"Downloading PDF from URL..."})
|
148 |
+
time.sleep(0.01)
|
149 |
try:
|
150 |
response = requests.get(pdf_input_source_path_or_url, stream=False, timeout=60)
|
151 |
response.raise_for_status()
|
152 |
+
actual_pdf_input_for_images = response.content
|
153 |
+
is_input_bytes = True
|
154 |
+
yield yield_message("status", {"message": f"PDF downloaded from URL ({len(actual_pdf_input_for_images)/1024:.2f} KB)."})
|
155 |
time.sleep(0.01)
|
156 |
except RequestsHTTPError as e:
|
157 |
+
logger.error(f"URL fetch HTTP error: {str(e)} (Status: {e.response.status_code if e.response else 'N/A'})", exc_info=True)
|
158 |
yield yield_message("error", {"message": f"Error fetching PDF from URL (HTTP {e.response.status_code if e.response else 'N/A'}): {e.response.reason if e.response else str(e)}"})
|
159 |
return
|
160 |
except requests.RequestException as e:
|
161 |
+
logger.error(f"URL fetch network error: {str(e)}", exc_info=True)
|
162 |
yield yield_message("error", {"message": f"Network error fetching PDF from URL: {str(e)}"})
|
163 |
return
|
164 |
else:
|
165 |
+
actual_pdf_input_for_images = pdf_input_source_path_or_url
|
166 |
+
is_input_bytes = False
|
167 |
+
yield yield_message("status", {"message": f"Processing local PDF file..."})
|
168 |
+
time.sleep(0.01)
|
|
|
|
|
|
|
|
|
169 |
|
170 |
+
# ----- Direct Text Extraction (using pdfplumber) is REMOVED -----
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
171 |
|
172 |
+
# ----- Image Extraction and OCR -----
|
173 |
if not check_poppler():
|
174 |
yield yield_message("error", {"message": "Poppler (for image extraction) not found or not working."})
|
175 |
else:
|
176 |
+
yield yield_message("status", {"message": "Starting image extraction and OCR..."})
|
177 |
+
# The "## Extracted Images" title is now more specific
|
178 |
+
yield yield_message("markdown_chunk", {"content": "## Extracted Images & OCR Text from PDF Pages\n\n"})
|
179 |
if not HF_TOKEN:
|
180 |
yield yield_message("markdown_chunk", {"content": "**Note:** `HF_TOKEN` not set. Images will be described but not uploaded.\n\n"})
|
181 |
|
182 |
time.sleep(0.01)
|
183 |
+
extracted_pil_images_overall_count = 0
|
184 |
try:
|
185 |
+
if actual_pdf_input_for_images:
|
186 |
+
try: # Batched conversion attempt
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
187 |
pdf_info = None
|
188 |
+
if is_input_bytes:
|
189 |
+
pdf_info = pdf2image.pdfinfo_from_bytes(actual_pdf_input_for_images, userpw=None, poppler_path=None)
|
190 |
else:
|
191 |
+
pdf_info = pdf2image.pdfinfo_from_path(actual_pdf_input_for_images, userpw=None, poppler_path=None)
|
192 |
|
193 |
num_image_pages = pdf_info.get("Pages", 0)
|
194 |
+
yield yield_message("status", {"message": f"PDF has {num_image_pages} page(s) for image conversion and OCR."})
|
195 |
|
196 |
batch_size = 1
|
197 |
for page_idx_start in range(1, num_image_pages + 1, batch_size):
|
198 |
page_idx_end = min(page_idx_start + batch_size - 1, num_image_pages)
|
199 |
+
yield yield_message("status", {"message": f"Converting PDF page(s) {page_idx_start}-{page_idx_end} to image(s)..."})
|
200 |
time.sleep(0.01)
|
201 |
|
202 |
page_images_pil = []
|
203 |
+
if is_input_bytes:
|
204 |
+
page_images_pil = convert_from_bytes(actual_pdf_input_for_images, dpi=150, first_page=page_idx_start, last_page=page_idx_end)
|
205 |
else:
|
206 |
+
page_images_pil = convert_from_path(actual_pdf_input_for_images, dpi=150, first_page=page_idx_start, last_page=page_idx_end)
|
207 |
|
208 |
for img_idx_in_batch, img_pil in enumerate(page_images_pil):
|
209 |
extracted_pil_images_overall_count += 1
|
210 |
+
current_pdf_page_num = page_idx_start + img_idx_in_batch
|
211 |
page_num_for_log = f"pdfpage_{current_pdf_page_num}"
|
212 |
|
213 |
yield yield_message("status", {"message": f"Processing image {extracted_pil_images_overall_count} (from PDF page {current_pdf_page_num}) (OCR & Upload)..."})
|
|
|
217 |
try:
|
218 |
ocr_text = pytesseract.image_to_string(img_pil).strip()
|
219 |
if ocr_text: yield yield_message("status", {"message": f" OCR successful for image {extracted_pil_images_overall_count}."})
|
220 |
+
else: yield yield_message("status", {"message": f" OCR complete for image {extracted_pil_images_overall_count} (no text found)."})
|
221 |
except Exception as ocr_e:
|
222 |
logger.error(f"OCR error for image {extracted_pil_images_overall_count}: {str(ocr_e)}")
|
223 |
ocr_text = f"OCR failed: {str(ocr_e)}"
|
224 |
|
225 |
+
image_md_chunk = f"### Image from PDF Page {current_pdf_page_num}\n"
|
226 |
if HF_TOKEN:
|
227 |
+
image_url_or_error = upload_image_to_hf_stream(img_pil, "pdf_page_image", page_num_for_log)
|
228 |
if isinstance(image_url_or_error, str) and not image_url_or_error.startswith("Error"):
|
229 |
+
image_md_chunk += f"![Image {extracted_pil_images_overall_count}]({image_url_or_error})\n"
|
230 |
yield yield_message("status", {"message": f" Image {extracted_pil_images_overall_count} uploaded."})
|
231 |
else:
|
232 |
image_md_chunk += f"**Image {extracted_pil_images_overall_count} (Upload Error):** {str(image_url_or_error)}\n\n"
|
|
|
235 |
image_md_chunk += f"**Image {extracted_pil_images_overall_count} (not uploaded due to missing HF_TOKEN)**\n"
|
236 |
|
237 |
if ocr_text:
|
238 |
+
image_md_chunk += f"**OCR Text (from PDF Page {current_pdf_page_num}):**\n```\n{ocr_text}\n```\n\n"
|
239 |
+
else:
|
240 |
+
image_md_chunk += f"_(No text detected by OCR for image from PDF page {current_pdf_page_num})_\n\n"
|
241 |
|
242 |
yield yield_message("image_md", {"content": image_md_chunk})
|
243 |
time.sleep(0.01)
|
244 |
except Exception as e_img_info:
|
245 |
logger.error(f"Could not get PDF info for image batching or during batched conversion: {e_img_info}", exc_info=True)
|
246 |
+
yield yield_message("error", {"message": f"Error preparing for image extraction: {e_img_info}. Trying bulk conversion."})
|
247 |
# Fallback to bulk conversion
|
248 |
bulk_images_pil = []
|
249 |
+
if is_input_bytes:
|
250 |
+
bulk_images_pil = convert_from_bytes(actual_pdf_input_for_images, dpi=150)
|
251 |
else:
|
252 |
+
bulk_images_pil = convert_from_path(actual_pdf_input_for_images, dpi=150)
|
253 |
|
254 |
+
yield yield_message("status", {"message": f"Fallback: Converted {len(bulk_images_pil)} PDF pages to images in bulk."})
|
255 |
for i, img_pil in enumerate(bulk_images_pil):
|
256 |
extracted_pil_images_overall_count +=1
|
257 |
+
page_num_for_log = f"bulk_image_{i+1}"
|
258 |
+
yield yield_message("status", {"message": f"Processing image {extracted_pil_images_overall_count} (bulk page {i+1}) (OCR & Upload)..."})
|
259 |
ocr_text = ""
|
260 |
try: ocr_text = pytesseract.image_to_string(img_pil).strip()
|
261 |
except Exception as e: ocr_text = f"OCR Error: {e}"
|
262 |
|
263 |
+
image_md_chunk = f"### Image from PDF Page (Bulk {i+1})\n"
|
264 |
if HF_TOKEN:
|
265 |
+
image_url_or_error = upload_image_to_hf_stream(img_pil, "pdf_page_image_fallback", page_num_for_log)
|
266 |
if isinstance(image_url_or_error, str) and not image_url_or_error.startswith("Error"):
|
267 |
+
image_md_chunk += f"![Image {extracted_pil_images_overall_count}]({image_url_or_error})\n"
|
268 |
else:
|
269 |
image_md_chunk += f"**Upload Error:** {str(image_url_or_error)}\n"
|
270 |
+
else:
|
271 |
+
image_md_chunk += f"**Image {extracted_pil_images_overall_count} (Fallback - not uploaded)**\n"
|
272 |
+
|
273 |
+
if ocr_text: image_md_chunk += f"**OCR Text (Bulk Page {i+1}):**\n```\n{ocr_text}\n```\n\n"
|
274 |
+
else: image_md_chunk += f"_(No text detected by OCR for bulk image {i+1})_\n\n"
|
275 |
yield yield_message("image_md", {"content": image_md_chunk})
|
276 |
time.sleep(0.01)
|
277 |
|
278 |
else:
|
279 |
+
yield yield_message("status", {"message": "No valid PDF input source provided for image extraction."})
|
280 |
|
281 |
except Exception as e:
|
282 |
+
logger.error(f"Error during image extraction/OCR processing: {str(e)}", exc_info=True)
|
283 |
+
yield yield_message("error", {"message": f"Error during image extraction/OCR: {str(e)}"})
|
284 |
|
285 |
+
yield yield_message("final_status", {"message": "Image extraction and OCR processing complete."})
|
286 |
|
287 |
except Exception as e:
|
288 |
logger.error(f"Unhandled error in PDF conversion stream: {str(e)}", exc_info=True)
|
|
|
314 |
|
315 |
filename = secure_filename(pdf_file.filename)
|
316 |
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
|
317 |
+
# Save to a temporary file that generate_pdf_conversion_stream can access by path
|
318 |
fd, temp_path = tempfile.mkstemp(suffix=".pdf", prefix="upload_", dir=app.config['UPLOAD_FOLDER'])
|
319 |
+
os.close(fd) # Close the file descriptor from mkstemp
|
320 |
+
pdf_file.save(temp_path) # Save the uploaded file's content to this path
|
321 |
+
outer_temp_pdf_path = temp_path # Store for cleanup
|
322 |
logger.info(f"Uploaded PDF saved to temporary path: {outer_temp_pdf_path}")
|
323 |
+
pdf_input_source_for_generator = outer_temp_pdf_path # Pass the path
|
324 |
yield yield_message("status", {"message": f"Processing uploaded PDF: {filename}"})
|
325 |
time.sleep(0.01)
|
326 |
|
|
|
330 |
yield yield_message("error", {"message": "Invalid URL scheme. Must be http or https."})
|
331 |
return
|
332 |
|
333 |
+
pdf_input_source_for_generator = unquoted_url # Pass the URL string
|
334 |
yield yield_message("status", {"message": f"Preparing to process PDF from URL: {unquoted_url}"})
|
335 |
time.sleep(0.01)
|
336 |
else:
|
|
|
357 |
# --- Main Execution ---
|
358 |
if __name__ == '__main__':
|
359 |
if not check_poppler():
|
360 |
+
logger.warning("Poppler utilities might not be installed correctly. Image processing might fail.")
|
361 |
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
|
362 |
app.run(host='0.0.0.0', port=int(os.getenv("PORT", 7860)), debug=True, threaded=True)
|