pdf-to-markdown

Sleeping

App Files Files Community

Biifruu committed on Jun 20

Commit

02bbba1

verified ·

1 Parent(s): c283ac8

Update app.py

Browse files

Files changed (1) hide show

app.py +11 -5

app.py CHANGED Viewed

@@ -6,6 +6,7 @@ import pytesseract
 import os
 import numpy as np
 import cv2
 def clean_ocr_text(text):
     lines = text.splitlines()
@@ -67,6 +68,16 @@ def extract_text_markdown(doc, image_paths, page_index, seen_xrefs):
 @spaces.GPU
 def convert(pdf_bytes):
     temp_pdf_path = "/tmp/uploaded_file.pdf"
     with open(temp_pdf_path, "wb") as f:
         f.write(pdf_bytes)
@@ -86,18 +97,15 @@ def convert(pdf_bytes):
             markdown_output += f"\n## Página {page_num + 1}\n\n"
             pix = page.get_pixmap(dpi=300)
             img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
             image_path = f"/tmp/ocr_page_{page_num + 1}.jpg"
             img.save(image_path)
             image_paths.append(image_path)
             markdown_output += f"![imagen_pagina_{page_num + 1}]({image_path})\n"
             try:
                 ocr_text = pytesseract.image_to_string(img)
             except pytesseract.TesseractError:
                 ocr_text = ""
             ocr_text = clean_ocr_text(ocr_text)
             if ocr_text.strip():
                 markdown_output += ocr_text + "\n"
@@ -107,9 +115,7 @@ def convert(pdf_bytes):
                 gray = cv2.cvtColor(img_cv, cv2.COLOR_RGB2GRAY)
                 _, thresh = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY_INV)
                 contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
                 contours = sorted(contours, key=cv2.contourArea, reverse=True)[:5]
                 for i, cnt in enumerate(contours):
                     x, y, w, h = cv2.boundingRect(cnt)
                     area = w * h

 import os
 import numpy as np
 import cv2
+import base64
 def clean_ocr_text(text):
     lines = text.splitlines()
 @spaces.GPU
 def convert(pdf_bytes):
+    # Si no se recibe ningún PDF, usamos uno por defecto.
+    if not pdf_bytes:
+        default_pdf_base64 = (
+            "JVBERi0xLjQKMSAwIG9iago8PC9UeXBlIC9DYXRhbG9nIC9QYWdlcyAyIDAgUgovT3V0cHV0cyA8PC9Qcm9jU2V0Wy9QREZdPj4+"
+            "CmVuZG9iagoKMiAwIG9iago8PC9UeXBlIC9QYWdlcyAvS2lkcyBbMyAwIFJdIC9Db3VudCAxPj4KZW5kb2JqCgozIDAgb2JqCjw8"
+            "L1R5cGUgL1BhZ2UgL1BhZ2VzIDIgMCBSIC9NZWRpYUJveCBbMCAwIDMwMCAxNDNdIC9Db250ZW50cyA0IDAgUiA+PgplbmRvYmoK"
+            "CgoKZW5kb2JqCnhyZWYKMC45NgolJUVPRgo="
+        )
+        pdf_bytes = base64.b64decode(default_pdf_base64)
     temp_pdf_path = "/tmp/uploaded_file.pdf"
     with open(temp_pdf_path, "wb") as f:
         f.write(pdf_bytes)
             markdown_output += f"\n## Página {page_num + 1}\n\n"
             pix = page.get_pixmap(dpi=300)
             img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
             image_path = f"/tmp/ocr_page_{page_num + 1}.jpg"
             img.save(image_path)
             image_paths.append(image_path)
             markdown_output += f"![imagen_pagina_{page_num + 1}]({image_path})\n"
             try:
                 ocr_text = pytesseract.image_to_string(img)
             except pytesseract.TesseractError:
                 ocr_text = ""
             ocr_text = clean_ocr_text(ocr_text)
             if ocr_text.strip():
                 markdown_output += ocr_text + "\n"
                 gray = cv2.cvtColor(img_cv, cv2.COLOR_RGB2GRAY)
                 _, thresh = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY_INV)
                 contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
                 contours = sorted(contours, key=cv2.contourArea, reverse=True)[:5]
                 for i, cnt in enumerate(contours):
                     x, y, w, h = cv2.boundingRect(cnt)
                     area = w * h