pdf-to-markdown

Running

App Files Files Community

Biifruu committed 11 days ago

Commit

6e5a37b

verified ·

1 Parent(s): 7d786f3

Update app.py

Browse files

Files changed (1) hide show

app.py +27 -9

app.py CHANGED Viewed

@@ -21,12 +21,13 @@ def extract_text_markdown(doc):
     markdown_output = ""
     image_counter = 1
-    for page in doc:
         blocks = page.get_text("dict")["blocks"]
         elements = []
         for b in blocks:
             y = b["bbox"][1]
             if b["type"] == 0:  # Texto
                 for line in b["lines"]:
                     line_y = line["bbox"][1]
@@ -34,14 +35,19 @@ def extract_text_markdown(doc):
                     max_font_size = max([span.get("size", 10) for span in line["spans"]])
                     if line_text:
                         elements.append((line_y, line_text, max_font_size))
             elif b["type"] == 1:  # Imagen
-                elements.append((y, f"![imagen_{image_counter}](#)", 10))
-                image_counter += 1
         elements.sort(key=lambda x: x[0])
         previous_y = None
-        previous_font = None
         for y, text, font_size in elements:
             is_header = font_size >= 14
@@ -55,7 +61,6 @@ def extract_text_markdown(doc):
                 markdown_output += text.strip() + "\n"
             previous_y = y
-            previous_font = font_size
         markdown_output += "\n---\n\n"
@@ -75,11 +80,23 @@ def convert(pdf_file):
             # Página con texto normal
             markdown_output += extract_text_markdown([page]) + "\n"
         else:
-            # Página sin texto: usar OCR
             pix = page.get_pixmap(dpi=300)
             img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
-            ocr_text = pytesseract.image_to_string(img, lang="spa")
-            markdown_output += clean_ocr_text(ocr_text) + "\n"
         markdown_output += "\n---\n\n"
@@ -90,3 +107,4 @@ gr.Interface(
     inputs=[gr.File(label="Sube tu PDF", type="filepath")],
     outputs=[gr.Text(label="Markdown estructurado"), gr.JSON(label="Metadata")],
 ).launch()

     markdown_output = ""
     image_counter = 1
+    for page_num, page in enumerate(doc):
         blocks = page.get_text("dict")["blocks"]
         elements = []
         for b in blocks:
             y = b["bbox"][1]
             if b["type"] == 0:  # Texto
                 for line in b["lines"]:
                     line_y = line["bbox"][1]
                     max_font_size = max([span.get("size", 10) for span in line["spans"]])
                     if line_text:
                         elements.append((line_y, line_text, max_font_size))
             elif b["type"] == 1:  # Imagen
+                try:
+                    image = page.get_pixmap(matrix=fitz.Matrix(2, 2), clip=b["bbox"])
+                    image_path = f"/tmp/imagen_embebida_{page_num + 1}_{image_counter}.jpg"
+                    image.save(image_path)
+                    elements.append((y, f"![imagen_{image_counter}]({image_path})", 10))
+                    image_counter += 1
+                except Exception as e:
+                    elements.append((y, f"[Error al procesar imagen: {e}]", 10))
         elements.sort(key=lambda x: x[0])
         previous_y = None
         for y, text, font_size in elements:
             is_header = font_size >= 14
                 markdown_output += text.strip() + "\n"
             previous_y = y
         markdown_output += "\n---\n\n"
             # Página con texto normal
             markdown_output += extract_text_markdown([page]) + "\n"
         else:
+            # Página vacía o imagen: hacer OCR
             pix = page.get_pixmap(dpi=300)
             img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+            # Guardar imagen escaneada completa
+            image_path = f"/tmp/ocr_page_{page_num + 1}.jpg"
+            img.save(image_path)
+            markdown_output += f"![imagen_pagina_{page_num + 1}]({image_path})\n"
+            try:
+                ocr_text = pytesseract.image_to_string(img, lang="spa")
+            except pytesseract.TesseractError:
+                ocr_text = pytesseract.image_to_string(img)  # fallback sin lang
+            ocr_text = clean_ocr_text(ocr_text)
+            if ocr_text.strip():
+                markdown_output += ocr_text + "\n"
         markdown_output += "\n---\n\n"
     inputs=[gr.File(label="Sube tu PDF", type="filepath")],
     outputs=[gr.Text(label="Markdown estructurado"), gr.JSON(label="Metadata")],
 ).launch()