broadfield-dev commited on
Commit
a074fa0
·
verified ·
1 Parent(s): 90650e1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +117 -402
app.py CHANGED
@@ -1,413 +1,128 @@
 
 
 
1
  import os
2
- import io
3
- import re
4
- import logging
5
- import subprocess
6
- from datetime import datetime
7
- import urllib.parse
8
- import tempfile
9
- import json # For streaming JSON messages
10
- import time # For gevent.sleep
11
 
12
- from flask import Flask, request, render_template, Response, stream_with_context
13
- from werkzeug.utils import secure_filename
14
-
15
- # Ensure gevent is imported and monkey patched if needed for other libraries
16
- # from gevent import monkey
17
- # monkey.patch_all() # Apply this early if you suspect issues with other libs
18
-
19
- import requests # For requests.exceptions.HTTPError
20
- from requests.exceptions import HTTPError as RequestsHTTPError # Specific import for clarity
21
-
22
- import pdfplumber
23
- import pdf2image # <<<<<<<<<<<<<<<< CORRECTED: Added this import
24
- from pdf2image import convert_from_path, convert_from_bytes # Keep these for direct use too
25
- # from pdf2image.exceptions import ... # If you need to catch specific pdf2image errors
26
-
27
- import pytesseract
28
- from PIL import Image
29
- from huggingface_hub import HfApi, create_repo
30
-
31
- # --- Flask App Initialization ---
32
  app = Flask(__name__)
33
- app.config['UPLOAD_FOLDER'] = tempfile.gettempdir()
34
- app.config['MAX_CONTENT_LENGTH'] = 50 * 1024 * 1024 # 50 MB limit for uploads, adjust as needed
35
-
36
- # --- Logging Configuration ---
37
- logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
38
- logger = logging.getLogger(__name__)
39
-
40
- # --- Hugging Face Configuration ---
41
- HF_TOKEN = os.getenv("HF_TOKEN")
42
- HF_DATASET_REPO_NAME = os.getenv("HF_DATASET_REPO_NAME", "pdf-images-extracted")
43
- hf_api = HfApi()
44
-
45
- # --- Helper to yield messages for streaming ---
46
- def yield_message(type, data):
47
- """Helper to format messages as JSON strings for streaming."""
48
- return json.dumps({"type": type, **data}) + "\n"
49
-
50
- # --- PDF Processing Helper Functions (Adapted for Streaming) ---
51
-
52
- def check_poppler():
53
- try:
54
- result = subprocess.run(["pdftoppm", "-v"], capture_output=True, text=True, check=False)
55
- version_info_log = result.stderr.strip() if result.stderr else result.stdout.strip()
56
- if version_info_log:
57
- logger.info(f"Poppler version check: {version_info_log.splitlines()[0] if version_info_log else 'No version output'}")
58
- else:
59
- logger.info("Poppler 'pdftoppm -v' ran. Assuming Poppler is present.")
60
- return True
61
- except FileNotFoundError:
62
- logger.error("Poppler (pdftoppm command) not found. Ensure poppler-utils is installed and in PATH.")
63
- return False
64
- except Exception as e:
65
- logger.error(f"An unexpected error occurred during Poppler check: {str(e)}")
66
- return False
67
-
68
- def ensure_hf_dataset():
69
- if not HF_TOKEN:
70
- msg = "HF_TOKEN is not set. Cannot ensure Hugging Face dataset. Image uploads will fail."
71
- logger.warning(msg)
72
- return "Error: " + msg
73
- try:
74
- repo_id_obj = create_repo(repo_id=HF_DATASET_REPO_NAME, token=HF_TOKEN, repo_type="dataset", exist_ok=True)
75
- logger.info(f"Dataset repo ensured: {repo_id_obj.repo_id}")
76
- return repo_id_obj.repo_id
77
- except RequestsHTTPError as e:
78
- if e.response is not None and e.response.status_code == 409:
79
- logger.info(f"Dataset repo '{HF_DATASET_REPO_NAME}' already exists (HTTP 409).")
80
- try:
81
- user_info = hf_api.whoami(token=HF_TOKEN)
82
- namespace = user_info.get('name') if user_info else None
83
- if namespace:
84
- return f"{namespace}/{HF_DATASET_REPO_NAME}"
85
- else:
86
- logger.warning(f"Could not determine namespace for existing repo '{HF_DATASET_REPO_NAME}'. Using generic ID.")
87
- return HF_DATASET_REPO_NAME
88
- except Exception as whoami_e:
89
- logger.error(f"Could not determine namespace for existing repo via whoami due to: {whoami_e}. Using generic ID.")
90
- return HF_DATASET_REPO_NAME
91
- else:
92
- status_code = e.response.status_code if e.response is not None else "Unknown"
93
- logger.error(f"Hugging Face dataset HTTP error (Status: {status_code}): {str(e)}")
94
- return f"Error: Failed to access or create dataset '{HF_DATASET_REPO_NAME}' due to HTTP error: {str(e)}"
95
- except Exception as e:
96
- logger.error(f"Hugging Face dataset general error: {str(e)}", exc_info=True)
97
- return f"Error: Failed to access or create dataset '{HF_DATASET_REPO_NAME}': {str(e)}"
98
-
99
-
100
- def upload_image_to_hf_stream(image_pil, filename_base, page_num_for_log=""):
101
- repo_id_or_error = ensure_hf_dataset()
102
- if isinstance(repo_id_or_error, str) and repo_id_or_error.startswith("Error"):
103
- return repo_id_or_error
104
-
105
- repo_id = repo_id_or_error
106
- temp_image_path = None
107
- try:
108
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
109
- repo_filename = f"images/{filename_base}_{page_num_for_log}_{timestamp}.png"
110
-
111
- # Ensure UPLOAD_FOLDER exists before writing temp file
112
- os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
113
- with tempfile.NamedTemporaryFile(delete=False, suffix=".png", dir=app.config['UPLOAD_FOLDER']) as tmp_file:
114
- temp_image_path = tmp_file.name
115
- image_pil.save(temp_image_path, format="PNG")
116
-
117
- logger.info(f"Attempting to upload {temp_image_path} to {repo_id}/{repo_filename}")
118
- file_url = hf_api.upload_file(
119
- path_or_fileobj=temp_image_path, path_in_repo=repo_filename,
120
- repo_id=repo_id, repo_type="dataset", token=HF_TOKEN
121
- )
122
- logger.info(f"Successfully uploaded image: {file_url}")
123
- return file_url
124
- except Exception as e:
125
- logger.error(f"Image upload error for {filename_base}{page_num_for_log}: {str(e)}", exc_info=True)
126
- return f"Error uploading image {filename_base}{page_num_for_log}: {str(e)}"
127
- finally:
128
- if temp_image_path and os.path.exists(temp_image_path):
129
- try: os.remove(temp_image_path)
130
- except OSError as ose: logger.error(f"Error removing temp image file {temp_image_path}: {ose}")
131
-
132
-
133
- def format_page_text_to_markdown_chunk(page_text_content):
134
- chunk_md = ""
135
- page_text_content = re.sub(r'\n\s*\n+', '\n\n', page_text_content.strip())
136
- lines = page_text_content.split('\n')
137
- is_in_list = False
138
- for line_text in lines:
139
- line_stripped = line_text.strip()
140
- if not line_stripped:
141
- chunk_md += "\n"
142
- is_in_list = False
143
- continue
144
- list_match = re.match(r'^\s*(?:(?:\d+\.)|[*+-])\s+(.*)', line_stripped)
145
- is_heading_candidate = line_stripped.isupper() and 5 < len(line_stripped) < 100
146
- if is_heading_candidate and not list_match:
147
- chunk_md += f"## {line_stripped}\n\n"
148
- is_in_list = False
149
- elif list_match:
150
- list_item_text = list_match.group(1)
151
- chunk_md += f"- {list_item_text}\n"
152
- is_in_list = True
153
- else:
154
- if is_in_list: chunk_md += "\n"
155
- chunk_md += f"{line_text}\n\n"
156
- is_in_list = False
157
- return re.sub(r'\n\s*\n+', '\n\n', chunk_md.strip()) + "\n\n"
158
-
159
-
160
- # --- Main PDF Processing Logic (Generator Function for Streaming) ---
161
-
162
- def generate_pdf_conversion_stream(pdf_input_source_path_or_url):
163
- try:
164
- yield yield_message("markdown_replace", {"content": "# Extracted PDF Content\n\n"})
165
- time.sleep(0.01)
166
 
167
- yield yield_message("status", {"message": "Opening PDF for text extraction..."})
168
- time.sleep(0.01)
169
 
170
- source_is_url = isinstance(pdf_input_source_path_or_url, str) and \
171
- pdf_input_source_path_or_url.startswith(('http://', 'https://'))
172
-
173
- pdf_handle_for_text = None
174
- pdf_bytes_for_images = None
175
 
176
- if source_is_url:
177
- try:
178
- response = requests.get(pdf_input_source_path_or_url, stream=False, timeout=60)
179
- response.raise_for_status()
180
- pdf_bytes_for_images = response.content
181
- pdf_handle_for_text = io.BytesIO(pdf_bytes_for_images)
182
- yield yield_message("status", {"message": f"PDF downloaded from URL ({len(pdf_bytes_for_images)/1024:.2f} KB)."})
183
- time.sleep(0.01)
184
- except RequestsHTTPError as e:
185
- logger.error(f"URL fetch HTTP error for PDF processing: {str(e)} (Status: {e.response.status_code if e.response else 'N/A'})", exc_info=True)
186
- yield yield_message("error", {"message": f"Error fetching PDF from URL (HTTP {e.response.status_code if e.response else 'N/A'}): {e.response.reason if e.response else str(e)}"})
187
- return
188
- except requests.RequestException as e:
189
- logger.error(f"URL fetch network error for PDF processing: {str(e)}", exc_info=True)
190
- yield yield_message("error", {"message": f"Network error fetching PDF from URL: {str(e)}"})
191
- return
192
- else:
193
- pdf_handle_for_text = pdf_input_source_path_or_url
194
-
195
- total_text_pages = 0
196
- try:
197
- with pdfplumber.open(pdf_handle_for_text) as pdf:
198
- total_text_pages = len(pdf.pages)
199
- yield yield_message("status", {"message": f"Found {total_text_pages} page(s) for text extraction."})
200
- time.sleep(0.01)
201
-
202
- for i, page in enumerate(pdf.pages):
203
- yield yield_message("status", {"message": f"Extracting text from page {i+1}/{total_text_pages}..."})
204
- time.sleep(0.01)
205
-
206
- page_text = page.extract_text(layout=True, x_density=1, y_density=1) or ""
207
-
208
- # Removed table extraction logic here
209
- # page_tables_md = "" # No longer needed
210
- # tables = page.extract_tables() # No longer needed
211
- # if tables: # No longer needed
212
- # ... (table processing code removed) ...
213
-
214
- formatted_page_text_md = format_page_text_to_markdown_chunk(page_text)
215
-
216
- yield yield_message("markdown_chunk", {"content": formatted_page_text_md})
217
- # if page_tables_md: # No longer needed, as page_tables_md is not created
218
- # yield yield_message("markdown_chunk", {"content": page_tables_md})
219
- time.sleep(0.01)
220
- except Exception as e:
221
- logger.error(f"Error during PDF text extraction: {str(e)}", exc_info=True) # Updated log message
222
- yield yield_message("error", {"message": f"Error during text extraction: {str(e)}"})
223
-
224
- if not check_poppler():
225
- yield yield_message("error", {"message": "Poppler (for image extraction) not found or not working."})
226
- else:
227
- yield yield_message("status", {"message": "Starting image extraction..."})
228
- yield yield_message("markdown_chunk", {"content": "## Extracted Images\n\n"})
229
- if not HF_TOKEN:
230
- yield yield_message("markdown_chunk", {"content": "**Note:** `HF_TOKEN` not set. Images will be described but not uploaded.\n\n"})
231
-
232
- time.sleep(0.01)
233
- extracted_pil_images_overall_count = 0 # Keep track of total images processed for numbering
234
- try:
235
- image_source_for_convert = None
236
- if source_is_url and pdf_bytes_for_images:
237
- image_source_for_convert = pdf_bytes_for_images
238
- logger.info("Using downloaded bytes for image conversion.")
239
- elif not source_is_url:
240
- image_source_for_convert = pdf_input_source_path_or_url
241
- logger.info("Using local file path for image conversion.")
242
-
243
- if image_source_for_convert:
244
- try:
245
- pdf_info = None
246
- if isinstance(image_source_for_convert, bytes):
247
- pdf_info = pdf2image.pdfinfo_from_bytes(image_source_for_convert, userpw=None, poppler_path=None)
248
- else:
249
- pdf_info = pdf2image.pdfinfo_from_path(image_source_for_convert, userpw=None, poppler_path=None)
250
-
251
- num_image_pages = pdf_info.get("Pages", 0)
252
- yield yield_message("status", {"message": f"PDF has {num_image_pages} page(s) for potential image extraction."})
253
-
254
- batch_size = 1
255
- for page_idx_start in range(1, num_image_pages + 1, batch_size):
256
- page_idx_end = min(page_idx_start + batch_size - 1, num_image_pages)
257
- yield yield_message("status", {"message": f"Extracting images from PDF page(s) {page_idx_start}-{page_idx_end}..."})
258
- time.sleep(0.01)
259
-
260
- page_images_pil = []
261
- if isinstance(image_source_for_convert, bytes):
262
- page_images_pil = convert_from_bytes(image_source_for_convert, dpi=150, first_page=page_idx_start, last_page=page_idx_end)
263
- else:
264
- page_images_pil = convert_from_path(image_source_for_convert, dpi=150, first_page=page_idx_start, last_page=page_idx_end)
265
-
266
- for img_idx_in_batch, img_pil in enumerate(page_images_pil):
267
- extracted_pil_images_overall_count += 1
268
- current_pdf_page_num = page_idx_start + img_idx_in_batch # Actual PDF page number
269
- page_num_for_log = f"pdfpage_{current_pdf_page_num}"
270
-
271
- yield yield_message("status", {"message": f"Processing image {extracted_pil_images_overall_count} (from PDF page {current_pdf_page_num}) (OCR & Upload)..."})
272
- time.sleep(0.01)
273
-
274
- ocr_text = ""
275
- try:
276
- ocr_text = pytesseract.image_to_string(img_pil).strip()
277
- if ocr_text: yield yield_message("status", {"message": f" OCR successful for image {extracted_pil_images_overall_count}."})
278
- except Exception as ocr_e:
279
- logger.error(f"OCR error for image {extracted_pil_images_overall_count}: {str(ocr_e)}")
280
- ocr_text = f"OCR failed: {str(ocr_e)}"
281
-
282
- image_md_chunk = ""
283
- if HF_TOKEN:
284
- image_url_or_error = upload_image_to_hf_stream(img_pil, "pdf_image", page_num_for_log)
285
- if isinstance(image_url_or_error, str) and not image_url_or_error.startswith("Error"):
286
- image_md_chunk += f"![Image {extracted_pil_images_overall_count}]({image_url_or_error})\n"
287
- yield yield_message("status", {"message": f" Image {extracted_pil_images_overall_count} uploaded."})
288
- else:
289
- image_md_chunk += f"**Image {extracted_pil_images_overall_count} (Upload Error):** {str(image_url_or_error)}\n\n"
290
- yield yield_message("error", {"message": f"Failed to upload image {extracted_pil_images_overall_count}: {str(image_url_or_error)}"})
291
- else:
292
- image_md_chunk += f"**Image {extracted_pil_images_overall_count} (not uploaded due to missing HF_TOKEN)**\n"
293
-
294
- if ocr_text:
295
- image_md_chunk += f"**Image {extracted_pil_images_overall_count} OCR Text:**\n```\n{ocr_text}\n```\n\n"
296
-
297
- yield yield_message("image_md", {"content": image_md_chunk})
298
- time.sleep(0.01)
299
- except Exception as e_img_info:
300
- logger.error(f"Could not get PDF info for image batching or during batched conversion: {e_img_info}", exc_info=True)
301
- yield yield_message("error", {"message": f"Error preparing for image extraction: {e_img_info}. Trying bulk."})
302
- # Fallback to bulk conversion
303
- bulk_images_pil = []
304
- if isinstance(image_source_for_convert, bytes):
305
- bulk_images_pil = convert_from_bytes(image_source_for_convert, dpi=150)
306
- else:
307
- bulk_images_pil = convert_from_path(image_source_for_convert, dpi=150)
308
-
309
- yield yield_message("status", {"message": f"Fallback: Extracted {len(bulk_images_pil)} images in bulk."})
310
- for i, img_pil in enumerate(bulk_images_pil):
311
- extracted_pil_images_overall_count +=1
312
- page_num_for_log = f"bulk_image_{i+1}" # Less precise page info in fallback
313
- yield yield_message("status", {"message": f"Processing image {extracted_pil_images_overall_count} (bulk) (OCR & Upload)..."})
314
- ocr_text = ""
315
- try: ocr_text = pytesseract.image_to_string(img_pil).strip()
316
- except Exception as e: ocr_text = f"OCR Error: {e}"
317
-
318
- image_md_chunk = f"![Image {extracted_pil_images_overall_count} (Fallback)]\n"
319
- if HF_TOKEN:
320
- image_url_or_error = upload_image_to_hf_stream(img_pil, "pdf_image_fallback", page_num_for_log)
321
- if isinstance(image_url_or_error, str) and not image_url_or_error.startswith("Error"):
322
- image_md_chunk = f"![Image {extracted_pil_images_overall_count} (Fallback)]({image_url_or_error})\n"
323
- else:
324
- image_md_chunk += f"**Upload Error:** {str(image_url_or_error)}\n"
325
- if ocr_text: image_md_chunk += f"**OCR Text:**\n```\n{ocr_text}\n```\n\n"
326
- else: image_md_chunk += "\n"
327
- yield yield_message("image_md", {"content": image_md_chunk})
328
- time.sleep(0.01)
329
-
330
- else:
331
- yield yield_message("status", {"message": "No valid source for image extraction."})
332
-
333
- except Exception as e:
334
- logger.error(f"Error during image extraction/processing: {str(e)}", exc_info=True)
335
- yield yield_message("error", {"message": f"Error during image extraction: {str(e)}"})
336
-
337
- yield yield_message("final_status", {"message": "All processing stages complete."})
338
-
339
- except Exception as e:
340
- logger.error(f"Unhandled error in PDF conversion stream: {str(e)}", exc_info=True)
341
- yield yield_message("error", {"message": f"Critical processing error: {str(e)}"})
342
-
343
-
344
- # --- Flask Routes ---
345
-
346
- @app.route('/', methods=['GET'])
347
  def index():
348
- return render_template('index.html')
349
-
350
- @app.route('/process-stream', methods=['POST'])
351
- def process_pdf_stream():
352
- pdf_file = request.files.get('pdf_file')
353
- pdf_url = request.form.get('pdf_url', '').strip()
354
-
355
- outer_temp_pdf_path = None
356
-
357
- def stream_processor():
358
- nonlocal outer_temp_pdf_path
359
- pdf_input_source_for_generator = None
360
 
 
361
  try:
362
- if pdf_file and pdf_file.filename:
363
- if not pdf_file.filename.lower().endswith('.pdf'):
364
- yield yield_message("error", {"message": "Uploaded file is not a PDF."})
365
- return
366
-
367
- filename = secure_filename(pdf_file.filename)
368
- os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
369
- fd, temp_path = tempfile.mkstemp(suffix=".pdf", prefix="upload_", dir=app.config['UPLOAD_FOLDER'])
370
- os.close(fd)
371
- pdf_file.save(temp_path)
372
- outer_temp_pdf_path = temp_path
373
- logger.info(f"Uploaded PDF saved to temporary path: {outer_temp_pdf_path}")
374
- pdf_input_source_for_generator = outer_temp_pdf_path
375
- yield yield_message("status", {"message": f"Processing uploaded PDF: {filename}"})
376
- time.sleep(0.01)
377
-
378
- elif pdf_url:
379
- unquoted_url = urllib.parse.unquote(pdf_url)
380
- if not (unquoted_url.startswith('http://') or unquoted_url.startswith('https://')):
381
- yield yield_message("error", {"message": "Invalid URL scheme. Must be http or https."})
382
- return
383
-
384
- pdf_input_source_for_generator = unquoted_url
385
- yield yield_message("status", {"message": f"Preparing to process PDF from URL: {unquoted_url}"})
386
- time.sleep(0.01)
387
- else:
388
- yield yield_message("error", {"message": "No PDF file uploaded and no PDF URL provided."})
389
- return
390
-
391
- for message_part in generate_pdf_conversion_stream(pdf_input_source_for_generator):
392
- yield message_part
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
393
 
394
  except Exception as e:
395
- logger.error(f"Error setting up stream or in initial validation: {str(e)}", exc_info=True)
396
- yield yield_message("error", {"message": f"Setup error: {str(e)}"})
397
- finally:
398
- if outer_temp_pdf_path and os.path.exists(outer_temp_pdf_path):
399
- try:
400
- os.remove(outer_temp_pdf_path)
401
- logger.info(f"Cleaned up temporary PDF: {outer_temp_pdf_path}")
402
- except OSError as ose:
403
- logger.error(f"Error removing temporary PDF {outer_temp_pdf_path}: {ose}")
404
-
405
- return Response(stream_with_context(stream_processor()), mimetype='application/x-ndjson')
406
-
407
-
408
- # --- Main Execution ---
409
- if __name__ == '__main__':
410
- if not check_poppler():
411
- logger.warning("Poppler utilities might not be installed correctly. PDF processing might fail.")
412
- os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
413
- app.run(host='0.0.0.0', port=int(os.getenv("PORT", 7860)), debug=True, threaded=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from flask import Flask, request, render_template_string, send_file
2
+ import markdown
3
+ import imgkit
4
  import os
5
+ import traceback
6
+ from io import BytesIO
 
 
 
 
 
 
 
7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  app = Flask(__name__)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
+ # Use a directory within the app's working directory to avoid permission issues
11
+ TEMP_DIR = os.path.join(os.getcwd(), "temp")
12
 
13
+ # Create temporary directory if it doesn't exist
14
+ try:
15
+ os.makedirs(TEMP_DIR, exist_ok=True)
16
+ except Exception as e:
17
+ print(f"Error creating temp directory: {e}")
18
 
19
+ @app.route("/", methods=["GET", "POST"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  def index():
21
+ preview_html = None
22
+ download_available = False
23
+ download_type = "png"
24
+ error_message = None
25
+ markdown_text = request.form.get("markdown_text", "") if request.method == "POST" else ""
 
 
 
 
 
 
 
26
 
27
+ if request.method == "POST" and markdown_text:
28
  try:
29
+ # Convert Markdown to HTML
30
+ html_content = markdown.markdown(markdown_text, extensions=['fenced_code', 'tables'])
31
+
32
+ # Prepare HTML with basic styling
33
+ full_html = f"""
34
+ <!DOCTYPE html>
35
+ <html>
36
+ <head>
37
+ <style>
38
+ body {{ font-family: Arial, sans-serif; padding: 20px; }}
39
+ pre, code {{ background: #f4f4f4; padding: 10px; border-radius: 5px; }}
40
+ table {{ border-collapse: collapse; width: 100%; }}
41
+ th, td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
42
+ th {{ background-color: #f2f2f2; }}
43
+ </style>
44
+ </head>
45
+ <body>
46
+ {html_content}
47
+ </body>
48
+ </html>
49
+ """
50
+
51
+ # Save HTML to a temporary file
52
+ html_path = os.path.join(TEMP_DIR, "output.html")
53
+ with open(html_path, "w", encoding="utf-8") as f:
54
+ f.write(full_html)
55
+
56
+ # Generate preview HTML
57
+ preview_html = full_html
58
+ download_available = True
59
+ download_type = request.form.get("download_type", "png")
60
+
61
+ if "download" in request.form:
62
+ if download_type == "html":
63
+ return send_file(
64
+ html_path,
65
+ as_attachment=True,
66
+ download_name="output.html",
67
+ mimetype="text/html"
68
+ )
69
+ else: # PNG
70
+ # Convert HTML to PNG using imgkit
71
+ png_path = os.path.join(TEMP_DIR, "output.png")
72
+ imgkit.from_string(full_html, png_path, options={"quiet": ""})
73
+ return send_file(
74
+ png_path,
75
+ as_attachment=True,
76
+ download_name="output.png",
77
+ mimetype="image/png"
78
+ )
79
 
80
  except Exception as e:
81
+ error_message = f"Error processing request: {str(e)}"
82
+ print(f"Error: {traceback.format_exc()}")
83
+
84
+ return render_template_string("""
85
+ <!DOCTYPE html>
86
+ <html>
87
+ <head>
88
+ <title>Markdown to PNG/HTML Converter</title>
89
+ <style>
90
+ body { font-family: Arial, sans-serif; max-width: 800px; margin: 0 auto; padding: 20px; }
91
+ textarea { width: 100%; height: 300px; margin-bottom: 10px; }
92
+ select, button { padding: 10px; margin: 5px; }
93
+ .preview { border: 1px solid #ddd; padding: 15px; margin-top: 20px; }
94
+ .download-btn { background-color: #4CAF50; color: white; border: none; cursor: pointer; }
95
+ .download-btn:hover { background-color: #45a049; }
96
+ .error { color: red; margin-top: 10px; }
97
+ </style>
98
+ </head>
99
+ <body>
100
+ <h1>Markdown to PNG/HTML Converter</h1>
101
+ <form method="post">
102
+ <textarea name="markdown_text" placeholder="Paste your Markdown here...">{{ markdown_text }}</textarea><br>
103
+ <label for="download_type">Output format:</label>
104
+ <select name="download_type">
105
+ <option value="png" {% if download_type == 'png' %}selected{% endif %}>PNG</option>
106
+ <option value="html" {% if download_type == 'html' %}selected{% endif %}>HTML</option>
107
+ </select><br>
108
+ <button type="submit">Generate Preview</button>
109
+ {% if download_available %}
110
+ <button type="submit" name="download" value="true" class="download-btn">Download {{ download_type.upper() }}</button>
111
+ {% endif %}
112
+ </form>
113
+ {% if error_message %}
114
+ <p class="error">{{ error_message }}</p>
115
+ {% endif %}
116
+ {% if preview_html %}
117
+ <h2>Preview</h2>
118
+ <div class="preview">
119
+ {{ preview_html | safe }}
120
+ </div>
121
+ {% endif %}
122
+ </body>
123
+ </html>
124
+ """, preview_html=preview_html, download_available=download_available,
125
+ download_type=download_type, error_message=error_message, markdown_text=markdown_text)
126
+
127
+ if __name__ == "__main__":
128
+ app.run(host="0.0.0.0", port=int(os.environ.get("PORT", 7860)))