pdf-to-markdown

Running

App Files Community

Biifruu committed 11 days ago

Commit

4f878aa

verified ·

1 Parent(s): 584dc82

Update app.py

Browse files

Files changed (1) hide show

app.py +31 -20

app.py CHANGED Viewed

@@ -22,6 +22,7 @@ def extract_text_markdown(doc, image_paths):
         blocks = page.get_text("dict")["blocks"]
         elements = []
         for b in blocks:
             y = b["bbox"][1]
@@ -33,31 +34,38 @@ def extract_text_markdown(doc, image_paths):
                     if line_text:
                         elements.append((line_y, line_text, max_font_size))
-            elif b["type"] == 1:  # Imagen
-                try:
-                    image = page.get_pixmap(matrix=fitz.Matrix(2, 2), clip=b["bbox"])
-                    image_path = f"/tmp/imagen_embebida_{page_num + 1}_{image_counter}.jpg"
-                    image.save(image_path)
-                    image_paths.append(image_path)
-                    elements.append((y, f"![imagen_{image_counter}]({image_path})", 10))
-                    image_counter += 1
-                except Exception as e:
-                    elements.append((y, f"[Error al procesar imagen: {e}]", 10))
         elements.sort(key=lambda x: x[0])
         previous_y = None
         for y, text, font_size in elements:
             is_header = font_size >= 14
             if previous_y is not None and abs(y - previous_y) > 10:
                 markdown_output += "\n"
             if is_header:
                 markdown_output += f"\n### {text.strip()}\n"
             else:
                 markdown_output += text.strip() + "\n"
             previous_y = y
         markdown_output += "\n---\n\n"
@@ -75,14 +83,11 @@ def convert(pdf_file):
         text = page.get_text("text").strip()
         if len(text) > 30:
-            # Página con texto normal
             markdown_output += extract_text_markdown([page], image_paths) + "\n"
         else:
-            # Página vacía o con imagen: hacer OCR
             pix = page.get_pixmap(dpi=300)
             img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
-            # Guardar imagen completa
             image_path = f"/tmp/ocr_page_{page_num + 1}.jpg"
             img.save(image_path)
             image_paths.append(image_path)
@@ -92,7 +97,7 @@ def convert(pdf_file):
             try:
                 ocr_text = pytesseract.image_to_string(img, lang="spa")
             except pytesseract.TesseractError:
-                ocr_text = pytesseract.image_to_string(img)  # fallback sin lang
             ocr_text = clean_ocr_text(ocr_text)
             if ocr_text.strip():
@@ -100,7 +105,12 @@ def convert(pdf_file):
         markdown_output += "\n---\n\n"
-    return markdown_output.strip(), {}, image_paths
 gr.Interface(
     fn=convert,
@@ -108,6 +118,7 @@ gr.Interface(
     outputs=[
         gr.Markdown(label="Markdown estructurado"),
         gr.JSON(label="Metadata"),
-        gr.Gallery(label="Imágenes extraídas", type="file")
     ],
 ).launch()

         blocks = page.get_text("dict")["blocks"]
         elements = []
+        # 🔁 Añadir texto normal (bloques)
         for b in blocks:
             y = b["bbox"][1]
                     if line_text:
                         elements.append((line_y, line_text, max_font_size))
+        # 🖼️ Extraer imágenes reales de la página (xref)
+        images_on_page = page.get_images(full=True)
+        for img_index, img in enumerate(images_on_page):
+            xref = img[0]
+            try:
+                base_image = doc.extract_image(xref)
+                image_bytes = base_image["image"]
+                ext = base_image["ext"]
+                image_path = f"/tmp/imagen_embebida_{page_num + 1}_{img_index + 1}.{ext}"
+                with open(image_path, "wb") as f:
+                    f.write(image_bytes)
+                image_paths.append(image_path)
+                y_pos = 50 + img_index * 10  # Posición estimada para ordenar
+                elements.append((y_pos, f"![imagen_{image_counter}]({image_path})", 10))
+                image_counter += 1
+            except Exception as e:
+                elements.append((50 + img_index * 10, f"[Error imagen: {e}]", 10))
+        # Ordenar y construir Markdown
         elements.sort(key=lambda x: x[0])
         previous_y = None
         for y, text, font_size in elements:
             is_header = font_size >= 14
             if previous_y is not None and abs(y - previous_y) > 10:
                 markdown_output += "\n"
             if is_header:
                 markdown_output += f"\n### {text.strip()}\n"
             else:
                 markdown_output += text.strip() + "\n"
             previous_y = y
         markdown_output += "\n---\n\n"
         text = page.get_text("text").strip()
         if len(text) > 30:
             markdown_output += extract_text_markdown([page], image_paths) + "\n"
         else:
             pix = page.get_pixmap(dpi=300)
             img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
             image_path = f"/tmp/ocr_page_{page_num + 1}.jpg"
             img.save(image_path)
             image_paths.append(image_path)
             try:
                 ocr_text = pytesseract.image_to_string(img, lang="spa")
             except pytesseract.TesseractError:
+                ocr_text = pytesseract.image_to_string(img)
             ocr_text = clean_ocr_text(ocr_text)
             if ocr_text.strip():
         markdown_output += "\n---\n\n"
+    # Guardar como archivo .md
+    markdown_path = "/tmp/resultado.md"
+    with open(markdown_path, "w", encoding="utf-8") as f:
+        f.write(markdown_output)
+    return markdown_output.strip(), {}, image_paths, markdown_path
 gr.Interface(
     fn=convert,
     outputs=[
         gr.Markdown(label="Markdown estructurado"),
         gr.JSON(label="Metadata"),
+        gr.Gallery(label="Imágenes extraídas", type="file"),
+        gr.File(label="Descargar .md")
     ],
 ).launch()