broadfield-dev committed on
Commit
35151aa
·
verified ·
1 Parent(s): cf5a0c5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +281 -251
app.py CHANGED
@@ -6,21 +6,29 @@ import subprocess
6
  from datetime import datetime
7
  import urllib.parse
8
  import tempfile
 
 
9
 
10
- from flask import Flask, request, render_template, redirect, url_for
11
- from werkzeug.utils import secure_filename # For secure file handling
 
 
 
 
 
 
12
 
13
  import requests
14
  import pdfplumber
15
  from pdf2image import convert_from_path, convert_from_bytes
16
  import pytesseract
17
  from PIL import Image
18
- from huggingface_hub import HfApi, create_repo
19
 
20
  # --- Flask App Initialization ---
21
  app = Flask(__name__)
22
- app.config['UPLOAD_FOLDER'] = tempfile.gettempdir() # Use system temp dir
23
- app.config['MAX_CONTENT_LENGTH'] = 30 * 1024 * 1024 # 30 MB limit for uploads
24
 
25
  # --- Logging Configuration ---
26
  logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
@@ -28,13 +36,19 @@ logger = logging.getLogger(__name__)
28
 
29
  # --- Hugging Face Configuration ---
30
  HF_TOKEN = os.getenv("HF_TOKEN")
31
- HF_DATASET_REPO_NAME = os.getenv("HF_DATASET_REPO_NAME", "pdf-images-extracted") # Allow override via env var
32
  hf_api = HfApi()
33
 
 
 
 
 
 
34
 
35
- # --- PDF Processing Helper Functions (Adapted from Gradio version) ---
36
 
37
  def check_poppler():
 
38
  try:
39
  result = subprocess.run(["pdftoppm", "-v"], capture_output=True, text=True, check=False)
40
  version_info_log = result.stderr.strip() if result.stderr else result.stdout.strip()
@@ -51,20 +65,37 @@ def check_poppler():
51
  return False
52
 
53
  def ensure_hf_dataset():
 
54
  if not HF_TOKEN:
55
- logger.warning("HF_TOKEN is not set. Cannot ensure Hugging Face dataset. Image uploads will fail.")
56
- return "Error: HF_TOKEN is not set. Please configure it in Space secrets for image uploads."
 
57
  try:
58
  repo_id_obj = create_repo(repo_id=HF_DATASET_REPO_NAME, token=HF_TOKEN, repo_type="dataset", exist_ok=True)
59
  logger.info(f"Dataset repo ensured: {repo_id_obj.repo_id}")
60
  return repo_id_obj.repo_id
61
-
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  except Exception as e:
63
  logger.error(f"Hugging Face dataset error: {str(e)}", exc_info=True)
64
  return f"Error: Failed to access or create dataset '{HF_DATASET_REPO_NAME}': {str(e)}"
65
 
66
 
67
- def upload_image_to_hf(image_pil, filename_base):
 
 
68
  repo_id_or_error = ensure_hf_dataset()
69
  if isinstance(repo_id_or_error, str) and repo_id_or_error.startswith("Error"):
70
  return repo_id_or_error
@@ -73,162 +104,199 @@ def upload_image_to_hf(image_pil, filename_base):
73
  temp_image_path = None
74
  try:
75
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
76
- repo_filename = f"images/{filename_base}_{timestamp}.png" # Path in repo
77
-
78
- # Save PIL image to a temporary file to upload
79
  with tempfile.NamedTemporaryFile(delete=False, suffix=".png", dir=app.config['UPLOAD_FOLDER']) as tmp_file:
80
  temp_image_path = tmp_file.name
81
  image_pil.save(temp_image_path, format="PNG")
82
 
83
  logger.info(f"Attempting to upload {temp_image_path} to {repo_id}/{repo_filename}")
84
  file_url = hf_api.upload_file(
85
- path_or_fileobj=temp_image_path,
86
- path_in_repo=repo_filename,
87
- repo_id=repo_id,
88
- repo_type="dataset",
89
- token=HF_TOKEN
90
  )
91
  logger.info(f"Successfully uploaded image: {file_url}")
92
  return file_url
93
  except Exception as e:
94
- logger.error(f"Image upload error for {filename_base}: {str(e)}", exc_info=True)
95
- return f"Error uploading image {filename_base}: {str(e)}"
96
  finally:
97
  if temp_image_path and os.path.exists(temp_image_path):
98
- try:
99
- os.remove(temp_image_path)
100
- except OSError as ose:
101
- logger.error(f"Error removing temp image file {temp_image_path}: {ose}")
102
-
103
- def extract_text_from_pdf(pdf_input_source): # pdf_input_source is URL string or local file path
104
- try:
105
- pdf_file_like_object = None
106
- if isinstance(pdf_input_source, str) and pdf_input_source.startswith(('http://', 'https://')):
107
- logger.info(f"Fetching PDF from URL for text extraction: {pdf_input_source}")
108
- response = requests.get(pdf_input_source, stream=True, timeout=30)
109
- response.raise_for_status()
110
- pdf_file_like_object = io.BytesIO(response.content)
111
- logger.info("PDF downloaded successfully from URL.")
112
- elif isinstance(pdf_input_source, str) and os.path.exists(pdf_input_source): # Local file path
113
- logger.info(f"Processing local PDF file for text extraction: {pdf_input_source}")
114
- # pdfplumber.open can take a path directly
115
- pdf_file_like_object = pdf_input_source
 
 
 
 
 
 
 
 
 
 
116
  else:
117
- logger.error(f"Invalid pdf_input_source for text extraction: {pdf_input_source}")
118
- return "Error: Invalid input for PDF text extraction (must be URL or valid file path)."
119
-
120
- with pdfplumber.open(pdf_file_like_object) as pdf:
121
- full_text = ""
122
- for i, page in enumerate(pdf.pages):
123
- page_text = page.extract_text(layout=True, x_density=1, y_density=1) or ""
124
- full_text += page_text + "\n\n"
125
- tables = page.extract_tables()
126
- if tables:
127
- for table_data in tables:
128
- if table_data:
129
- header = [" | ".join(str(cell) if cell is not None else "" for cell in table_data[0])]
130
- separator = [" | ".join(["---"] * len(table_data[0]))]
131
- body = [" | ".join(str(cell) if cell is not None else "" for cell in row) for row in table_data[1:]]
132
- table_md_lines = header + separator + body
133
- full_text += f"**Table:**\n" + "\n".join(table_md_lines) + "\n\n"
134
- logger.info("Text and table extraction successful.")
135
- return full_text.strip()
136
- except requests.RequestException as e:
137
- logger.error(f"URL fetch error for text extraction: {str(e)}", exc_info=True)
138
- return f"Error fetching PDF from URL: {str(e)}"
139
- except Exception as e:
140
- logger.error(f"Text extraction error: {str(e)}", exc_info=True)
141
- return f"Error extracting text: {str(e)}"
142
 
143
- def extract_images_from_pdf(pdf_input_source): # pdf_input_source is URL string or local file path
144
- if not check_poppler():
145
- return "Error: poppler-utils not found or not working correctly. Image extraction depends on it."
146
-
147
- images_pil = []
 
 
 
148
  try:
149
- if isinstance(pdf_input_source, str) and pdf_input_source.startswith(('http://', 'https://')):
150
- logger.info(f"Fetching PDF from URL for image extraction: {pdf_input_source}")
151
- response = requests.get(pdf_input_source, stream=True, timeout=30)
152
- response.raise_for_status()
153
- logger.info("PDF downloaded successfully from URL, converting to images.")
154
- images_pil = convert_from_bytes(response.content, dpi=200)
155
- elif isinstance(pdf_input_source, str) and os.path.exists(pdf_input_source): # Local file path
156
- logger.info(f"Processing local PDF file for image extraction: {pdf_input_source}")
157
- images_pil = convert_from_path(pdf_input_source, dpi=200)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
  else:
159
- logger.error(f"Invalid pdf_input_source for image extraction: {pdf_input_source}")
160
- return "Error: Invalid input for PDF image extraction (must be URL or valid file path)."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
 
162
- logger.info(f"Successfully extracted {len(images_pil)} image(s) from PDF.")
163
- return images_pil
164
- except requests.RequestException as e:
165
- logger.error(f"URL fetch error for image extraction: {str(e)}", exc_info=True)
166
- return f"Error fetching PDF from URL for image extraction: {str(e)}"
167
  except Exception as e:
168
- logger.error(f"Image extraction error: {str(e)}", exc_info=True)
169
- return f"Error extracting images: {str(e)}"
170
-
171
-
172
- def format_to_markdown(text_content, images_input):
173
- markdown_output = "# Extracted PDF Content\n\n"
174
- if text_content.startswith("Error"): # If text extraction itself failed
175
- markdown_output += f"**Text Extraction Note:**\n{text_content}\n\n"
176
- else:
177
- text_content = re.sub(r'\n\s*\n+', '\n\n', text_content.strip())
178
- lines = text_content.split('\n')
179
- is_in_list = False
180
- for line_text in lines:
181
- line_stripped = line_text.strip()
182
- if not line_stripped:
183
- markdown_output += "\n"
184
- is_in_list = False
185
- continue
186
- list_match = re.match(r'^\s*(?:(?:\d+\.)|[*+-])\s+(.*)', line_stripped)
187
- is_heading_candidate = line_stripped.isupper() and 5 < len(line_stripped) < 100
188
- if is_heading_candidate and not list_match:
189
- markdown_output += f"## {line_stripped}\n\n"
190
- is_in_list = False
191
- elif list_match:
192
- list_item_text = list_match.group(1)
193
- markdown_output += f"- {list_item_text}\n"
194
- is_in_list = True
195
- else:
196
- if is_in_list: markdown_output += "\n"
197
- markdown_output += f"{line_text}\n\n"
198
- is_in_list = False
199
- markdown_output = re.sub(r'\n\s*\n+', '\n\n', markdown_output.strip()) + "\n\n"
200
-
201
- if isinstance(images_input, list) and images_input:
202
- markdown_output += "## Extracted Images\n\n"
203
- if not HF_TOKEN:
204
- markdown_output += "**Note:** `HF_TOKEN` not set. Images were extracted but not uploaded to Hugging Face Hub.\n\n"
205
-
206
- for i, img_pil in enumerate(images_input):
207
- ocr_text = ""
208
- try:
209
- ocr_text = pytesseract.image_to_string(img_pil).strip()
210
- logger.info(f"OCR for image {i+1} successful.")
211
- except Exception as ocr_e:
212
- logger.error(f"OCR error for image {i+1}: {str(ocr_e)}")
213
- ocr_text = f"OCR failed: {str(ocr_e)}"
214
-
215
- if HF_TOKEN: # Only attempt upload if token is present
216
- image_filename_base = f"extracted_image_{i+1}"
217
- image_url_or_error = upload_image_to_hf(img_pil, image_filename_base)
218
- if isinstance(image_url_or_error, str) and not image_url_or_error.startswith("Error"):
219
- markdown_output += f"![Image {i+1}]({image_url_or_error})\n"
220
- else:
221
- markdown_output += f"**Image {i+1} (Upload Error):** {str(image_url_or_error)}\n\n"
222
- else: # No token, show placeholder or local info if we were saving them locally
223
- markdown_output += f"**Image {i+1} (not uploaded due to missing HF_TOKEN)**\n"
224
-
225
- if ocr_text:
226
- markdown_output += f"**Image {i+1} OCR Text:**\n```\n{ocr_text}\n```\n\n"
227
-
228
- elif isinstance(images_input, str) and images_input.startswith("Error"):
229
- markdown_output += f"## Image Extraction Note\n\n{images_input}\n\n"
230
-
231
- return markdown_output.strip()
232
 
233
  # --- Flask Routes ---
234
 
@@ -236,114 +304,76 @@ def format_to_markdown(text_content, images_input):
236
  def index():
237
  return render_template('index.html')
238
 
239
- @app.route('/process', methods=['POST'])
240
- def process_pdf_route():
241
  pdf_file = request.files.get('pdf_file')
242
  pdf_url = request.form.get('pdf_url', '').strip()
243
 
244
- status_message = "Starting PDF processing..."
245
- error_message = None
246
- markdown_output = None
247
- temp_pdf_path = None
248
- pdf_input_source = None # This will be a URL string or a local file path
249
-
250
- try:
251
- if pdf_file and pdf_file.filename:
252
- if not pdf_file.filename.lower().endswith('.pdf'):
253
- raise ValueError("Uploaded file is not a PDF.")
254
-
255
- filename = secure_filename(pdf_file.filename)
256
- # Save to a temporary file
257
- fd, temp_pdf_path = tempfile.mkstemp(suffix=".pdf", prefix="upload_", dir=app.config['UPLOAD_FOLDER'])
258
- os.close(fd) # close file descriptor from mkstemp
259
- pdf_file.save(temp_pdf_path)
260
- logger.info(f"Uploaded PDF saved to temporary path: {temp_pdf_path}")
261
- pdf_input_source = temp_pdf_path
262
- status_message = f"Processing uploaded PDF: {filename}"
263
-
264
- elif pdf_url:
265
- pdf_url = urllib.parse.unquote(pdf_url)
266
- # Basic URL validation
267
- if not (pdf_url.startswith('http://') or pdf_url.startswith('https://')):
268
- raise ValueError("Invalid URL scheme. Must be http or https.")
269
- if not pdf_url.lower().endswith('.pdf'):
270
- logger.warning(f"URL {pdf_url} does not end with .pdf. Proceeding with caution.")
271
- # Allow proceeding but log warning, actual check is content-type or processing error
272
-
273
- # Quick check with HEAD request (optional, but good practice)
274
- try:
275
- head_resp = requests.head(pdf_url, allow_redirects=True, timeout=10)
276
- head_resp.raise_for_status()
277
- content_type = head_resp.headers.get('content-type', '').lower()
278
- if 'application/pdf' not in content_type:
279
- logger.warning(f"URL {pdf_url} content-type is '{content_type}', not 'application/pdf'.")
280
- # Depending on strictness, could raise ValueError here
281
- except requests.RequestException as re:
282
- logger.error(f"Failed HEAD request for URL {pdf_url}: {re}")
283
- # Proceed, main request in extract functions will handle final failure
284
-
285
- pdf_input_source = pdf_url
286
- status_message = f"Processing PDF from URL: {pdf_url}"
287
- else:
288
- raise ValueError("No PDF file uploaded and no PDF URL provided.")
289
-
290
- # --- Core Processing ---
291
- status_message += "\nExtracting text..."
292
- logger.info(status_message)
293
- extracted_text = extract_text_from_pdf(pdf_input_source)
294
- if isinstance(extracted_text, str) and extracted_text.startswith("Error"):
295
- # Let format_to_markdown handle displaying this error within its structure
296
- logger.error(f"Text extraction resulted in error: {extracted_text}")
297
-
298
- status_message += "\nExtracting images..."
299
- logger.info(status_message)
300
- extracted_images = extract_images_from_pdf(pdf_input_source) # list of PIL images or error string
301
- if isinstance(extracted_images, str) and extracted_images.startswith("Error"):
302
- logger.error(f"Image extraction resulted in error: {extracted_images}")
303
-
304
- status_message += "\nFormatting to Markdown..."
305
- logger.info(status_message)
306
- markdown_output = format_to_markdown(extracted_text, extracted_images)
307
-
308
- status_message = "Processing complete."
309
- if isinstance(extracted_text, str) and extracted_text.startswith("Error"):
310
- status_message += f" (Text extraction issues: {extracted_text.split(':', 1)[1].strip()})"
311
- if isinstance(extracted_images, str) and extracted_images.startswith("Error"):
312
- status_message += f" (Image extraction issues: {extracted_images.split(':', 1)[1].strip()})"
313
- if not HF_TOKEN and isinstance(extracted_images, list) and extracted_images:
314
- status_message += " (Note: HF_TOKEN not set, images not uploaded to Hub)"
315
-
316
-
317
- except ValueError as ve:
318
- logger.error(f"Input validation error: {str(ve)}")
319
- error_message = str(ve)
320
- status_message = "Processing failed."
321
- except Exception as e:
322
- logger.error(f"An unexpected error occurred during processing: {str(e)}", exc_info=True)
323
- error_message = f"An unexpected error occurred: {str(e)}"
324
- status_message = "Processing failed due to an unexpected error."
325
- finally:
326
- if temp_pdf_path and os.path.exists(temp_pdf_path):
327
- try:
328
- os.remove(temp_pdf_path)
329
- logger.info(f"Removed temporary PDF: {temp_pdf_path}")
330
- except OSError as ose:
331
- logger.error(f"Error removing temporary PDF {temp_pdf_path}: {ose}")
332
-
333
- return render_template('index.html',
334
- markdown_output=markdown_output,
335
- status_message=status_message,
336
- error_message=error_message)
337
 
338
 
339
  # --- Main Execution ---
340
  if __name__ == '__main__':
341
- # This is for local development. For Hugging Face Spaces, Gunicorn is used via Dockerfile CMD.
342
- # Poppler check at startup for local dev convenience
343
- if not check_poppler():
344
  logger.warning("Poppler utilities might not be installed correctly. PDF processing might fail.")
345
-
346
- # Ensure UPLOAD_FOLDER exists
347
  os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
348
-
349
- app.run(host='0.0.0.0', port=int(os.getenv("PORT", 7860)), debug=True)
 
 
6
  from datetime import datetime
7
  import urllib.parse
8
  import tempfile
9
+ import json # For streaming JSON messages
10
+ import time # For gevent.sleep
11
 
12
+ from flask import Flask, request, render_template, Response, stream_with_context
13
+ from werkzeug.utils import secure_filename
14
+
15
+ # Ensure gevent is imported and monkey patched if needed for other libraries
16
+ # that might not be gevent-friendly. For built-in libs and requests (with Gunicorn gevent worker),
17
+ # this is often handled by Gunicorn.
18
+ # from gevent import monkey
19
+ # monkey.patch_all() # Apply this early if you suspect issues with other libs
20
 
21
  import requests
22
  import pdfplumber
23
  from pdf2image import convert_from_path, convert_from_bytes
24
  import pytesseract
25
  from PIL import Image
26
+ from huggingface_hub import HfApi, create_repo, HfHubHTTPError
27
 
28
  # --- Flask App Initialization ---
29
  app = Flask(__name__)
30
+ app.config['UPLOAD_FOLDER'] = tempfile.gettempdir()
31
+ app.config['MAX_CONTENT_LENGTH'] = 50 * 1024 * 1024 # 50 MB limit for uploads, adjust as needed
32
 
33
  # --- Logging Configuration ---
34
  logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
 
36
 
37
  # --- Hugging Face Configuration ---
38
  HF_TOKEN = os.getenv("HF_TOKEN")
39
+ HF_DATASET_REPO_NAME = os.getenv("HF_DATASET_REPO_NAME", "pdf-images-extracted")
40
  hf_api = HfApi()
41
 
42
+ # --- Helper to yield messages for streaming ---
def yield_message(type, data):
    """Serialize one streaming message as a newline-terminated JSON line.

    Args:
        type: Message discriminator written under the "type" key (for example
            "status", "error" or "markdown_chunk"). The name shadows the
            builtin but is kept so existing keyword callers keep working.
        data: Mapping of extra fields merged into the message. ``None`` or an
            empty mapping produces a message with only the "type" field.

    Returns:
        The JSON text of the message followed by a single newline, so the
        client can split the stream on line boundaries.
    """
    payload = {"type": type}
    if data:
        payload.update(data)
        # A "type" key inside data must never replace the discriminator.
        payload["type"] = type
    return json.dumps(payload) + "\n"
47
 
48
+ # --- PDF Processing Helper Functions (Adapted for Streaming) ---
49
 
50
  def check_poppler():
51
+ # (Same as before)
52
  try:
53
  result = subprocess.run(["pdftoppm", "-v"], capture_output=True, text=True, check=False)
54
  version_info_log = result.stderr.strip() if result.stderr else result.stdout.strip()
 
65
  return False
66
 
def ensure_hf_dataset():
    """Ensure the configured Hugging Face dataset repo exists and return its id.

    Returns:
        On success, the repo id string: either the one reported by
        ``create_repo`` or, when the Hub answers HTTP 409, an id built from the
        ``whoami`` namespace and ``HF_DATASET_REPO_NAME``.
        On failure, a string starting with "Error" describing the problem;
        callers tell the two apart by that prefix.
    """
    if not HF_TOKEN:
        msg = "HF_TOKEN is not set. Cannot ensure Hugging Face dataset. Image uploads will fail."
        logger.warning(msg)
        return "Error: " + msg
    try:
        repo_id_obj = create_repo(repo_id=HF_DATASET_REPO_NAME, token=HF_TOKEN, repo_type="dataset", exist_ok=True)
        logger.info(f"Dataset repo ensured: {repo_id_obj.repo_id}")
        return repo_id_obj.repo_id
    except HfHubHTTPError as e:
        # The response object may be absent on some errors; do not let the
        # error handler itself raise AttributeError.
        status_code = getattr(getattr(e, "response", None), "status_code", None)
        if status_code == 409:
            logger.info(f"Dataset repo '{HF_DATASET_REPO_NAME}' already exists.")
            if "/" in HF_DATASET_REPO_NAME:
                # Already fully qualified; prepending a namespace would corrupt it.
                return HF_DATASET_REPO_NAME
            # Attempt to construct the full repo_id (namespace/repo_name)
            try:
                user_info = hf_api.whoami(token=HF_TOKEN)
                namespace = user_info.get('name') if user_info else None
                if namespace:
                    return f"{namespace}/{HF_DATASET_REPO_NAME}"
            except Exception as whoami_e:
                logger.error(f"Could not determine namespace for existing repo via whoami: {whoami_e}")
            # Report the failure instead of returning a made-up "hf://..." value:
            # that is not a usable repo id and would only fail later in upload.
            return (
                f"Error: Dataset '{HF_DATASET_REPO_NAME}' already exists but its "
                "namespace could not be determined."
            )
        logger.error(f"Hugging Face dataset error (HTTP {status_code}): {str(e)}")
        return f"Error: Failed to access or create dataset '{HF_DATASET_REPO_NAME}': {str(e)}"
    except Exception as e:
        logger.error(f"Hugging Face dataset error: {str(e)}", exc_info=True)
        return f"Error: Failed to access or create dataset '{HF_DATASET_REPO_NAME}': {str(e)}"
94
 
95
 
96
+ def upload_image_to_hf_stream(image_pil, filename_base, page_num_for_log=""):
97
+ # (Adapted to potentially yield status during this sub-process if it were longer)
98
+ # For now, it's synchronous but part of the larger stream.
99
  repo_id_or_error = ensure_hf_dataset()
100
  if isinstance(repo_id_or_error, str) and repo_id_or_error.startswith("Error"):
101
  return repo_id_or_error
 
104
  temp_image_path = None
105
  try:
106
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
107
+ repo_filename = f"images/{filename_base}_{page_num_for_log}_{timestamp}.png"
108
+
 
109
  with tempfile.NamedTemporaryFile(delete=False, suffix=".png", dir=app.config['UPLOAD_FOLDER']) as tmp_file:
110
  temp_image_path = tmp_file.name
111
  image_pil.save(temp_image_path, format="PNG")
112
 
113
  logger.info(f"Attempting to upload {temp_image_path} to {repo_id}/{repo_filename}")
114
  file_url = hf_api.upload_file(
115
+ path_or_fileobj=temp_image_path, path_in_repo=repo_filename,
116
+ repo_id=repo_id, repo_type="dataset", token=HF_TOKEN
 
 
 
117
  )
118
  logger.info(f"Successfully uploaded image: {file_url}")
119
  return file_url
120
  except Exception as e:
121
+ logger.error(f"Image upload error for {filename_base}{page_num_for_log}: {str(e)}", exc_info=True)
122
+ return f"Error uploading image {filename_base}{page_num_for_log}: {str(e)}"
123
  finally:
124
  if temp_image_path and os.path.exists(temp_image_path):
125
+ try: os.remove(temp_image_path)
126
+ except OSError as ose: logger.error(f"Error removing temp image file {temp_image_path}: {ose}")
127
+
128
+
# Compiled once: these run for every page and every line of every PDF.
_BLANK_RUN_RE = re.compile(r'\n\s*\n+')
_LIST_ITEM_RE = re.compile(r'^\s*(?:(?:\d+\.)|[*+-])\s+(.*)')


def format_page_text_to_markdown_chunk(page_text_content):
    """Format one page's extracted text as a Markdown chunk.

    Rules applied line by line:
      * lines starting with "N." or one of ``* + -`` followed by whitespace
        become "- item" bullets;
      * all-uppercase lines of 6 to 99 characters become "## " headings;
      * every other non-blank line is emitted unchanged as its own paragraph.
    Runs of blank lines are collapsed to a single blank line.

    Args:
        page_text_content: Raw page text. ``None`` or an empty string is
            treated as an empty page.

    Returns:
        The Markdown text, always terminated by exactly one blank line
        (two newline characters); an empty page yields just those.
    """
    normalized_text = _BLANK_RUN_RE.sub('\n\n', (page_text_content or "").strip())

    parts = []
    is_in_list = False
    for line_text in normalized_text.split('\n'):
        line_stripped = line_text.strip()
        if not line_stripped:
            parts.append("\n")
            is_in_list = False
            continue

        list_match = _LIST_ITEM_RE.match(line_stripped)
        if list_match:
            # List detection wins over heading detection for uppercase items.
            parts.append(f"- {list_match.group(1)}\n")
            is_in_list = True
            continue

        if line_stripped.isupper() and 5 < len(line_stripped) < 100:
            parts.append(f"## {line_stripped}\n\n")
        else:
            if is_in_list:
                # A blank line is needed to end the preceding bullet list.
                parts.append("\n")
            # Keep the original line (including leading layout whitespace).
            parts.append(f"{line_text}\n\n")
        is_in_list = False

    return _BLANK_RUN_RE.sub('\n\n', "".join(parts).strip()) + "\n\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
 
159
+
160
+ # --- Main PDF Processing Logic (Generator Function for Streaming) ---
161
+
def _download_pdf_bytes(url, max_bytes=None, timeout=60):
    """Download *url* into memory and return the body as bytes.

    Raises:
        requests.RequestException: on connection problems or a non-2xx status.
        ValueError: if more than *max_bytes* bytes are received (when set).
    """
    buffer = io.BytesIO()
    # The context manager releases the connection even if reading fails midway.
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        for chunk in response.iter_content(chunk_size=64 * 1024):
            buffer.write(chunk)
            if max_bytes and buffer.tell() > max_bytes:
                raise ValueError(f"PDF at URL is larger than the {max_bytes} byte limit.")
    return buffer.getvalue()


def _markdown_table_cell(cell):
    """Render one extracted table cell as single-line, pipe-safe Markdown text."""
    if cell is None:
        return ""
    # Embedded newlines would split the row and raw pipes would be read as
    # column separators, so collapse whitespace and escape pipes.
    return " ".join(str(cell).split()).replace("|", "\\|")


def _table_to_markdown(table_data, page_number):
    """Convert one table (rows of cells, first row used as header) to Markdown."""
    header_row = table_data[0]
    if not header_row:
        return ""

    def render_row(row):
        # Outer pipes keep a single-column table from parsing as a heading.
        return "| " + " | ".join(_markdown_table_cell(cell) for cell in row) + " |"

    table_md_lines = [
        render_row(header_row),
        "| " + " | ".join(["---"] * len(header_row)) + " |",
    ]
    table_md_lines.extend(render_row(row) for row in table_data[1:])
    return f"**Table (Page {page_number}):**\n\n" + "\n".join(table_md_lines) + "\n\n"


def generate_pdf_conversion_stream(pdf_input_source_path_or_url):
    """Process a PDF incrementally, yielding NDJSON status and Markdown messages.

    Stages, in order:
      1. If the source is an http(s) URL, download it into memory (capped at
         the app's MAX_CONTENT_LENGTH when that is configured).
      2. Extract text and tables page by page with pdfplumber.
      3. Rasterize the pages with pdf2image, OCR each image with pytesseract
         and, when HF_TOKEN is set, upload it via upload_image_to_hf_stream.

    A failure in stage 2 is reported and processing continues with stage 3;
    a download failure ends the stream.

    Args:
        pdf_input_source_path_or_url: Local file path or http(s) URL string.

    Yields:
        Strings produced by ``yield_message`` with types "markdown_replace",
        "markdown_chunk", "image_md", "status", "error" and "final_status".
    """
    try:
        # Initial Markdown Title
        yield yield_message("markdown_replace", {"content": "# Extracted PDF Content\n\n"})
        time.sleep(0.01)  # brief pause between messages

        # 1. Text and Table Extraction (Page by Page)
        yield yield_message("status", {"message": "Opening PDF for text extraction..."})
        time.sleep(0.01)

        source_is_url = isinstance(pdf_input_source_path_or_url, str) and \
            pdf_input_source_path_or_url.startswith(('http://', 'https://'))

        pdf_bytes_for_images = None  # downloaded bytes, reused for image extraction

        if source_is_url:
            try:
                pdf_bytes_for_images = _download_pdf_bytes(
                    pdf_input_source_path_or_url,
                    max_bytes=app.config.get('MAX_CONTENT_LENGTH'),
                    timeout=60,
                )
            except (requests.RequestException, ValueError) as e:
                logger.error(f"URL fetch error for PDF processing: {str(e)}", exc_info=True)
                yield yield_message("error", {"message": f"Error fetching PDF from URL: {str(e)}"})
                return  # Stop generation
            pdf_handle_for_text = io.BytesIO(pdf_bytes_for_images)  # pdfplumber accepts file-like objects
            yield yield_message("status", {"message": f"PDF downloaded from URL ({len(pdf_bytes_for_images)/1024:.2f} KB)."})
            time.sleep(0.01)
        else:  # Local file path
            pdf_handle_for_text = pdf_input_source_path_or_url  # pdfplumber takes path

        total_text_pages = 0
        try:
            with pdfplumber.open(pdf_handle_for_text) as pdf:
                total_text_pages = len(pdf.pages)
                yield yield_message("status", {"message": f"Found {total_text_pages} page(s) for text extraction."})
                time.sleep(0.01)

                for i, page in enumerate(pdf.pages):
                    yield yield_message("status", {"message": f"Extracting text from page {i+1}/{total_text_pages}..."})
                    time.sleep(0.01)

                    page_text = page.extract_text(layout=True, x_density=1, y_density=1) or ""

                    page_tables_md = ""
                    for table_idx, table_data in enumerate(page.extract_tables() or []):
                        if not table_data:
                            continue
                        yield yield_message("status", {"message": f"  Processing table {table_idx+1} on page {i+1}..."})
                        page_tables_md += _table_to_markdown(table_data, i + 1)

                    formatted_page_text_md = format_page_text_to_markdown_chunk(page_text)

                    yield yield_message("markdown_chunk", {"content": formatted_page_text_md})
                    if page_tables_md:
                        yield yield_message("markdown_chunk", {"content": page_tables_md})
                    time.sleep(0.01)
        except Exception as e:
            logger.error(f"Error during PDF text/table extraction: {str(e)}", exc_info=True)
            yield yield_message("error", {"message": f"Error during text extraction: {str(e)}"})
            # Deliberately fall through: image extraction may still succeed.

        # 2. Image Extraction and OCR
        if not check_poppler():
            yield yield_message("error", {"message": "Poppler (for image extraction) not found or not working."})
        else:
            yield yield_message("status", {"message": "Starting image extraction..."})
            yield yield_message("markdown_chunk", {"content": "## Extracted Images\n\n"})
            if not HF_TOKEN:
                yield yield_message("markdown_chunk", {"content": "**Note:** `HF_TOKEN` not set. Images will be described but not uploaded.\n\n"})

            time.sleep(0.01)
            extracted_pil_images = []
            try:
                if source_is_url and pdf_bytes_for_images:
                    # Use the already downloaded bytes
                    extracted_pil_images = convert_from_bytes(pdf_bytes_for_images, dpi=150)
                elif not source_is_url:  # local file path
                    extracted_pil_images = convert_from_path(pdf_input_source_path_or_url, dpi=150)

                total_images = len(extracted_pil_images)
                yield yield_message("status", {"message": f"Found {total_images} image(s) in PDF (these are rasterized pages for now)."})
                time.sleep(0.01)

                # TODO: extract embedded images directly instead of whole-page rasters.

                for i, img_pil in enumerate(extracted_pil_images):
                    page_num_for_log = f"page_{i+1}"  # one raster image per page
                    yield yield_message("status", {"message": f"Processing image {i+1}/{total_images} (OCR & Upload)..."})
                    time.sleep(0.01)

                    ocr_text = ""
                    try:
                        ocr_text = pytesseract.image_to_string(img_pil).strip()
                        if ocr_text:
                            yield yield_message("status", {"message": f"  OCR successful for image {i+1}."})
                    except Exception as ocr_e:
                        # OCR failure is reported inline; the image itself is still emitted.
                        logger.error(f"OCR error for image {i+1}: {str(ocr_e)}")
                        ocr_text = f"OCR failed: {str(ocr_e)}"

                    image_md_chunk = ""
                    if HF_TOKEN:
                        image_url_or_error = upload_image_to_hf_stream(img_pil, "pdf_image", page_num_for_log)
                        if isinstance(image_url_or_error, str) and not image_url_or_error.startswith("Error"):
                            image_md_chunk += f"![Image {i+1}]({image_url_or_error})\n"
                            yield yield_message("status", {"message": f"  Image {i+1} uploaded."})
                        else:
                            image_md_chunk += f"**Image {i+1} (Upload Error):** {str(image_url_or_error)}\n\n"
                            yield yield_message("error", {"message": f"Failed to upload image {i+1}: {str(image_url_or_error)}"})
                    else:
                        image_md_chunk += f"**Image {i+1} (not uploaded due to missing HF_TOKEN)**\n"

                    if ocr_text:
                        image_md_chunk += f"**Image {i+1} OCR Text:**\n```\n{ocr_text}\n```\n\n"

                    yield yield_message("image_md", {"content": image_md_chunk})
                    time.sleep(0.01)

            except Exception as e:
                logger.error(f"Error during image extraction/processing: {str(e)}", exc_info=True)
                yield yield_message("error", {"message": f"Error during image extraction: {str(e)}"})

        yield yield_message("final_status", {"message": "All processing stages complete."})

    except Exception as e:
        logger.error(f"Unhandled error in PDF conversion stream: {str(e)}", exc_info=True)
        yield yield_message("error", {"message": f"Critical processing error: {str(e)}"})
299
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
300
 
301
  # --- Flask Routes ---
302
 
 
304
  def index():
305
  return render_template('index.html')
306
 
@app.route('/process-stream', methods=['POST'])
def process_pdf_stream():
    """Handle a PDF conversion request and stream the result as NDJSON.

    Reads an uploaded file from the ``pdf_file`` form field or a URL from the
    ``pdf_url`` field (the upload wins when both are present), then streams
    the messages produced by ``generate_pdf_conversion_stream``.

    Returns:
        A streaming ``Response`` with mimetype ``application/x-ndjson``.
        Validation problems are reported as "error" messages inside the
        stream rather than as HTTP error statuses.
    """
    pdf_file = request.files.get('pdf_file')
    pdf_url = request.form.get('pdf_url', '').strip()

    temp_pdf_path = None  # path of the saved upload, removed once streaming ends
    pdf_input_source_for_generator = None

    def stream_processor():
        nonlocal temp_pdf_path
        nonlocal pdf_input_source_for_generator

        cleanup_message = None
        try:
            if pdf_file and pdf_file.filename:
                if not pdf_file.filename.lower().endswith('.pdf'):
                    yield yield_message("error", {"message": "Uploaded file is not a PDF."})
                    return

                filename = secure_filename(pdf_file.filename)
                # Save to a temporary file (UPLOAD_FOLDER must be writable by the app user)
                os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
                fd, temp_pdf_path = tempfile.mkstemp(suffix=".pdf", prefix="upload_", dir=app.config['UPLOAD_FOLDER'])
                os.close(fd)
                pdf_file.save(temp_pdf_path)
                logger.info(f"Uploaded PDF saved to temporary path: {temp_pdf_path}")
                pdf_input_source_for_generator = temp_pdf_path
                yield yield_message("status", {"message": f"Processing uploaded PDF: {filename}"})
                time.sleep(0.01)

            elif pdf_url:
                # NOTE(review): unquoting a full URL can change its meaning when it
                # contains encoded reserved characters (e.g. %2F, %26) - confirm the
                # client really sends a percent-encoded value.
                # NOTE(review): the URL is user-supplied and fetched server-side with
                # no host restrictions (possible SSRF) - consider an allow-list.
                unquoted_url = urllib.parse.unquote(pdf_url)
                if not unquoted_url.startswith(('http://', 'https://')):
                    yield yield_message("error", {"message": "Invalid URL scheme. Must be http or https."})
                    return

                pdf_input_source_for_generator = unquoted_url
                yield yield_message("status", {"message": f"Preparing to process PDF from URL: {unquoted_url}"})
                time.sleep(0.01)
            else:
                yield yield_message("error", {"message": "No PDF file uploaded and no PDF URL provided."})
                return

            yield from generate_pdf_conversion_stream(pdf_input_source_for_generator)

        except Exception as e:
            logger.error(f"Error setting up stream or in initial validation: {str(e)}", exc_info=True)
            yield yield_message("error", {"message": f"Setup error: {str(e)}"})
        finally:
            # No yields in here: when the client disconnects the generator is
            # closed, and yielding during close raises RuntimeError. Only
            # remember what to report and emit it after the finally block.
            if temp_pdf_path and os.path.exists(temp_pdf_path):
                try:
                    os.remove(temp_pdf_path)
                    logger.info(f"Cleaned up temporary PDF: {temp_pdf_path}")
                    cleanup_message = yield_message("status", {"message": "Cleaned up temporary file."})
                except OSError as ose:
                    logger.error(f"Error removing temporary PDF {temp_pdf_path}: {ose}")
                    cleanup_message = yield_message("error", {"message": f"Could not clean temp file: {ose}"})

        # Reached only on normal completion (the early returns above happen
        # before any temp file exists, so nothing is lost by skipping this).
        if cleanup_message is not None:
            yield cleanup_message

    # stream_with_context keeps the request context alive while the generator runs
    return Response(stream_with_context(stream_processor()), mimetype='application/x-ndjson')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
370
 
371
 
372
  # --- Main Execution ---
if __name__ == '__main__':
    # Local development entry point only.
    if not check_poppler():  # Check Poppler at startup for local dev
        logger.warning("Poppler utilities might not be installed correctly. PDF processing might fail.")
    os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
    # The server binds to all interfaces, so the interactive debugger (which
    # allows arbitrary code execution) must be opt-in, not always on.
    # Set FLASK_DEBUG=1 to enable it for local development.
    debug_enabled = os.getenv("FLASK_DEBUG", "").strip().lower() in {"1", "true", "yes", "on"}
    # threaded=True lets the built-in server serve other requests while one
    # streaming response is in progress.
    app.run(host='0.0.0.0', port=int(os.getenv("PORT", 7860)), debug=debug_enabled, threaded=True)