pdf-to-markdown

Running

App Files Community

Biifruu committed 25 days ago

Commit

564947a

verified ·

1 Parent(s): 891d450

Update app.py

Browse files

Files changed (1) hide show

app.py +22 -9

app.py CHANGED Viewed

@@ -3,11 +3,10 @@ import gradio as gr
 import fitz  # PyMuPDF
 import ocrmypdf
 import tempfile
-import os
 def extract_text_markdown(doc):
     markdown_output = ""
-    image_counter = 1  # Contador de imágenes
     for page in doc:
         blocks = page.get_text("dict")["blocks"]
@@ -18,18 +17,24 @@ def extract_text_markdown(doc):
             if b["type"] == 0:  # Texto
                 for line in b["lines"]:
                     line_y = line["bbox"][1]
-                    line_text = " ".join([span["text"] for span in line["spans"]]).strip()
                     if line_text:
                         elements.append((line_y, line_text))
             elif b["type"] == 1:  # Imagen
-                # Añade un enlace con nombre único
                 elements.append((y, f"[imagen_{image_counter}]()"))
                 image_counter += 1
         # Ordenar por posición vertical
         elements.sort(key=lambda x: x[0])
-        # Reconstrucción con saltos lógicos
         previous_y = None
         for y, content in elements:
             if previous_y is not None and abs(y - previous_y) > 10:
@@ -46,7 +51,6 @@ def convert(pdf_file):
     original_doc = fitz.open(pdf_file)
     plain_text = "\n".join([page.get_text() for page in original_doc])
-    # Aplicar OCR solo si el PDF no tiene texto
     if len(plain_text.strip()) < 100:
         ocr_temp_path = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False).name
         ocrmypdf.ocr(pdf_file, ocr_temp_path, force_ocr=True)
@@ -55,11 +59,20 @@ def convert(pdf_file):
         doc = original_doc
     markdown = extract_text_markdown(doc)
-    metadata = {}  # Si necesitas metadatos, se pueden agregar aquí
     return markdown, metadata
 gr.Interface(
     fn=convert,
-    inputs=[gr.File(label="Sube tu PDF", type="filepath")],
-    outputs=[gr.Text(label="Markdown estructurado"), gr.JSON(label="Metadata")],
 ).launch()

 import fitz  # PyMuPDF
 import ocrmypdf
 import tempfile
 def extract_text_markdown(doc):
     markdown_output = ""
+    image_counter = 1
     for page in doc:
         blocks = page.get_text("dict")["blocks"]
             if b["type"] == 0:  # Texto
                 for line in b["lines"]:
                     line_y = line["bbox"][1]
+                    line_spans = line["spans"]
+                    # Si el texto tiene múltiples columnas o alineaciones → tabla simple
+                    if len(line_spans) > 1:
+                        line_text = " | ".join([span["text"].strip() for span in line_spans])
+                    else:
+                        line_text = " ".join([span["text"].strip() for span in line_spans])
                     if line_text:
                         elements.append((line_y, line_text))
             elif b["type"] == 1:  # Imagen
                 elements.append((y, f"[imagen_{image_counter}]()"))
                 image_counter += 1
         # Ordenar por posición vertical
         elements.sort(key=lambda x: x[0])
+        # Reconstrucción
         previous_y = None
         for y, content in elements:
             if previous_y is not None and abs(y - previous_y) > 10:
     original_doc = fitz.open(pdf_file)
     plain_text = "\n".join([page.get_text() for page in original_doc])
     if len(plain_text.strip()) < 100:
         ocr_temp_path = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False).name
         ocrmypdf.ocr(pdf_file, ocr_temp_path, force_ocr=True)
         doc = original_doc
     markdown = extract_text_markdown(doc)
+    metadata = {}  # Si deseas, aquí puedes agregar metadatos
     return markdown, metadata
+# Interfaz Gradio con botón copiar
+markdown_output = gr.Textbox(label="Markdown estructurado", lines=20)
+metadata_output = gr.JSON(label="Metadata")
+copy_button_html = """
+<button onclick="navigator.clipboard.writeText(document.querySelector('textarea').value)">📋 Copiar al portapapeles</button>
+"""
 gr.Interface(
     fn=convert,
+    inputs=gr.File(label="Sube tu PDF", type="filepath"),
+    outputs=[markdown_output, metadata_output, gr.HTML(copy_button_html)],
+    title="Extractor PDF → Markdown con imágenes como enlaces y soporte de tablas",
 ).launch()