pdf-to-markdown

Running

App Files Files Community

Biifruu committed 10 days ago

Commit

3c39da9

verified ·

1 Parent(s): dd8d861

Update app.py

Browse files

Files changed (1) hide show

app.py +8 -6

app.py CHANGED Viewed

@@ -21,13 +21,13 @@ def extract_text_markdown(doc, image_paths, page_index, seen_xrefs):
     image_counter = 1
     elements = []
-    page = doc[0]  # solo una página en cada llamada
     blocks = page.get_text("dict")["blocks"]
     for b in blocks:
         y = b["bbox"][1]
-        if b["type"] == 0:  # Texto
             for line in b["lines"]:
                 line_y = line["bbox"][1]
                 line_text = " ".join([span["text"] for span in line["spans"]]).strip()
@@ -35,7 +35,6 @@ def extract_text_markdown(doc, image_paths, page_index, seen_xrefs):
                 if line_text:
                     elements.append((line_y, line_text, max_font_size))
-    # Extraer imágenes únicas (por xref)
     images_on_page = page.get_images(full=True)
     for img_index, img in enumerate(images_on_page):
         xref = img[0]
@@ -95,7 +94,6 @@ def convert(pdf_file):
             markdown_output += f"![imagen_pagina_{page_num + 1}]({image_path})\n"
-            # OCR
             try:
                 ocr_text = pytesseract.image_to_string(img)
             except pytesseract.TesseractError:
@@ -105,16 +103,20 @@ def convert(pdf_file):
             if ocr_text.strip():
                 markdown_output += ocr_text + "\n"
-            # Detección de imágenes dentro de la imagen completa (por contornos)
             try:
                 img_cv = np.array(img)
                 gray = cv2.cvtColor(img_cv, cv2.COLOR_RGB2GRAY)
                 _, thresh = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY_INV)
                 contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
                 for i, cnt in enumerate(contours):
                     x, y, w, h = cv2.boundingRect(cnt)
-                    if w > 50 and h > 50:
                         region = img_cv[y:y+h, x:x+w]
                         detected_path = f"/tmp/img_detectada_p{page_num + 1}_{i + 1}.jpg"
                         Image.fromarray(region).save(detected_path)

     image_counter = 1
     elements = []
+    page = doc[0]
     blocks = page.get_text("dict")["blocks"]
     for b in blocks:
         y = b["bbox"][1]
+        if b["type"] == 0:
             for line in b["lines"]:
                 line_y = line["bbox"][1]
                 line_text = " ".join([span["text"] for span in line["spans"]]).strip()
                 if line_text:
                     elements.append((line_y, line_text, max_font_size))
     images_on_page = page.get_images(full=True)
     for img_index, img in enumerate(images_on_page):
         xref = img[0]
             markdown_output += f"![imagen_pagina_{page_num + 1}]({image_path})\n"
             try:
                 ocr_text = pytesseract.image_to_string(img)
             except pytesseract.TesseractError:
             if ocr_text.strip():
                 markdown_output += ocr_text + "\n"
+            # ✅ Detección limitada de imágenes embebidas (hasta 5 contornos grandes)
             try:
                 img_cv = np.array(img)
                 gray = cv2.cvtColor(img_cv, cv2.COLOR_RGB2GRAY)
                 _, thresh = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY_INV)
                 contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+                # Ordenar por área y limitar a 5 regiones grandes
+                contours = sorted(contours, key=cv2.contourArea, reverse=True)[:5]
                 for i, cnt in enumerate(contours):
                     x, y, w, h = cv2.boundingRect(cnt)
+                    area = w * h
+                    if area > 5000:
                         region = img_cv[y:y+h, x:x+w]
                         detected_path = f"/tmp/img_detectada_p{page_num + 1}_{i + 1}.jpg"
                         Image.fromarray(region).save(detected_path)