pdf-to-markdown

Running

App Files Files Community

Biifruu committed on 10 days ago

Commit

dd8d861

verified ·

1 Parent(s): 095781d

Update app.py

Browse files

Files changed (1) hide show

app.py +26 -7

app.py CHANGED Viewed

@@ -4,6 +4,8 @@ import fitz  # PyMuPDF
 from PIL import Image
 import pytesseract
 import os
 def clean_ocr_text(text):
     lines = text.splitlines()
@@ -33,12 +35,12 @@ def extract_text_markdown(doc, image_paths, page_index, seen_xrefs):
                 if line_text:
                     elements.append((line_y, line_text, max_font_size))
-    # Extraer imágenes únicas (por xref, global)
     images_on_page = page.get_images(full=True)
     for img_index, img in enumerate(images_on_page):
         xref = img[0]
         if xref in seen_xrefs:
-            continue  # ya extraída
         seen_xrefs.add(xref)
         try:
             base_image = page.parent.extract_image(xref)
@@ -53,7 +55,6 @@ def extract_text_markdown(doc, image_paths, page_index, seen_xrefs):
         except Exception as e:
             elements.append((float("inf"), f"[Error imagen: {e}]", 10))
-    # Ordenar por posición
     elements.sort(key=lambda x: x[0])
     previous_y = None
@@ -75,7 +76,7 @@ def convert(pdf_file):
     doc = fitz.open(pdf_file)
     markdown_output = ""
     image_paths = []
-    seen_xrefs = set()  # <<-- GLOBAL para todo el PDF
     for page_num in range(len(doc)):
         page = doc[page_num]
@@ -94,18 +95,36 @@ def convert(pdf_file):
             markdown_output += f"![imagen_pagina_{page_num + 1}]({image_path})\n"
             try:
-                ocr_text = pytesseract.image_to_string(img, lang="spa")
-            except pytesseract.TesseractError:
                 ocr_text = pytesseract.image_to_string(img)
             ocr_text = clean_ocr_text(ocr_text)
             if ocr_text.strip():
                 markdown_output += ocr_text + "\n"
             markdown_output += "\n---\n\n"
-    # Guardar como archivo .md
     markdown_path = "/tmp/resultado.md"
     with open(markdown_path, "w", encoding="utf-8") as f:
         f.write(markdown_output)

 from PIL import Image
 import pytesseract
 import os
+import numpy as np
+import cv2
 def clean_ocr_text(text):
     lines = text.splitlines()
                 if line_text:
                     elements.append((line_y, line_text, max_font_size))
+    # Extraer imágenes únicas (por xref)
     images_on_page = page.get_images(full=True)
     for img_index, img in enumerate(images_on_page):
         xref = img[0]
         if xref in seen_xrefs:
+            continue
         seen_xrefs.add(xref)
         try:
             base_image = page.parent.extract_image(xref)
         except Exception as e:
             elements.append((float("inf"), f"[Error imagen: {e}]", 10))
     elements.sort(key=lambda x: x[0])
     previous_y = None
     doc = fitz.open(pdf_file)
     markdown_output = ""
     image_paths = []
+    seen_xrefs = set()
     for page_num in range(len(doc)):
         page = doc[page_num]
             markdown_output += f"![imagen_pagina_{page_num + 1}]({image_path})\n"
+            # OCR
             try:
                 ocr_text = pytesseract.image_to_string(img)
+            except pytesseract.TesseractError:
+                ocr_text = ""
             ocr_text = clean_ocr_text(ocr_text)
             if ocr_text.strip():
                 markdown_output += ocr_text + "\n"
+            # Detección de imágenes dentro de la imagen completa (por contornos)
+            try:
+                img_cv = np.array(img)
+                gray = cv2.cvtColor(img_cv, cv2.COLOR_RGB2GRAY)
+                _, thresh = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY_INV)
+                contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+                for i, cnt in enumerate(contours):
+                    x, y, w, h = cv2.boundingRect(cnt)
+                    if w > 50 and h > 50:
+                        region = img_cv[y:y+h, x:x+w]
+                        detected_path = f"/tmp/img_detectada_p{page_num + 1}_{i + 1}.jpg"
+                        Image.fromarray(region).save(detected_path)
+                        image_paths.append(detected_path)
+                        markdown_output += f"\n\n![imagen_detectada]({detected_path})\n"
+            except Exception as e:
+                markdown_output += f"\n\n[Error al detectar imágenes embebidas: {e}]\n"
             markdown_output += "\n---\n\n"
     markdown_path = "/tmp/resultado.md"
     with open(markdown_path, "w", encoding="utf-8") as f:
         f.write(markdown_output)