pdf-to-markdown

Running

App Files Files Community

Biifruu committed on May 30

Commit

58c62dc

verified ·

1 Parent(s): 564947a

Update app.py

Browse files

Files changed (1) hide show

app.py +20 -20

app.py CHANGED Viewed

@@ -17,13 +17,12 @@ def extract_text_markdown(doc):
             if b["type"] == 0:  # Texto
                 for line in b["lines"]:
                     line_y = line["bbox"][1]
-                    line_spans = line["spans"]
-                    # Si el texto tiene múltiples columnas o alineaciones → tabla simple
-                    if len(line_spans) > 1:
-                        line_text = " | ".join([span["text"].strip() for span in line_spans])
                     else:
-                        line_text = " ".join([span["text"].strip() for span in line_spans])
                     if line_text:
                         elements.append((line_y, line_text))
@@ -31,10 +30,8 @@ def extract_text_markdown(doc):
                 elements.append((y, f"[imagen_{image_counter}]()"))
                 image_counter += 1
-        # Ordenar por posición vertical
         elements.sort(key=lambda x: x[0])
-        # Reconstrucción
         previous_y = None
         for y, content in elements:
             if previous_y is not None and abs(y - previous_y) > 10:
@@ -59,20 +56,23 @@ def convert(pdf_file):
         doc = original_doc
     markdown = extract_text_markdown(doc)
-    metadata = {}  # Si deseas, aquí puedes agregar metadatos
     return markdown, metadata
-# Interfaz Gradio con botón copiar
-markdown_output = gr.Textbox(label="Markdown estructurado", lines=20)
-metadata_output = gr.JSON(label="Metadata")
-copy_button_html = """
-<button onclick="navigator.clipboard.writeText(document.querySelector('textarea').value)">📋 Copiar al portapapeles</button>
-"""
-gr.Interface(
-    fn=convert,
-    inputs=gr.File(label="Sube tu PDF", type="filepath"),
-    outputs=[markdown_output, metadata_output, gr.HTML(copy_button_html)],
-    title="Extractor PDF → Markdown con imágenes como enlaces y soporte de tablas",
-).launch()

             if b["type"] == 0:  # Texto
                 for line in b["lines"]:
                     line_y = line["bbox"][1]
+                    spans = line["spans"]
+                    if len(spans) > 1:
+                        line_text = " | ".join(span["text"].strip() for span in spans)
                     else:
+                        line_text = " ".join(span["text"].strip() for span in spans)
                     if line_text:
                         elements.append((line_y, line_text))
                 elements.append((y, f"[imagen_{image_counter}]()"))
                 image_counter += 1
         elements.sort(key=lambda x: x[0])
         previous_y = None
         for y, content in elements:
             if previous_y is not None and abs(y - previous_y) > 10:
         doc = original_doc
     markdown = extract_text_markdown(doc)
+    metadata = {}  # Añade metadatos si quieres
     return markdown, metadata
+# Gradio Interface
+with gr.Blocks(title="Extractor PDF a Markdown") as demo:
+    gr.Markdown("### PDF → Markdown con imágenes como enlaces y botón de copiar")
+    pdf_input = gr.File(label="Sube tu PDF", type="filepath")
+    markdown_output = gr.Textbox(label="Markdown generado", lines=25, elem_id="md_output")
+    metadata_output = gr.JSON(label="Metadata")
+    with gr.Row():
+        convert_btn = gr.Button("Convertir PDF")
+        copy_btn = gr.HTML("""
+            <button onclick="navigator.clipboard.writeText(document.getElementById('md_output').value)">📋 Copiar Markdown</button>
+        """)
+    convert_btn.click(fn=convert, inputs=pdf_input, outputs=[markdown_output, metadata_output])
+demo.launch()