pdf-to-markdown

Running

App Files Community

Biifruu committed on 18 days ago

Commit

dd29269

verified ·

1 Parent(s): f824c8d

Update app.py

Browse files

Files changed (1) hide show

app.py +13 -33

app.py CHANGED Viewed

@@ -3,10 +3,11 @@ import gradio as gr
 import fitz  # PyMuPDF
 import ocrmypdf
 import tempfile
 def extract_text_markdown(doc):
     markdown_output = ""
-    image_counter = 1
     for page in doc:
         blocks = page.get_text("dict")["blocks"]
@@ -17,16 +18,18 @@ def extract_text_markdown(doc):
             if b["type"] == 0:  # Texto
                 for line in b["lines"]:
                     line_y = line["bbox"][1]
-                    spans = line["spans"]
-                    line_text = " ".join(span["text"].strip() for span in spans)
                     if line_text:
                         elements.append((line_y, line_text))
             elif b["type"] == 1:  # Imagen
                 elements.append((y, f"[imagen_{image_counter}]()"))
                 image_counter += 1
         elements.sort(key=lambda x: x[0])
         previous_y = None
         for y, content in elements:
             if previous_y is not None and abs(y - previous_y) > 10:
@@ -43,6 +46,7 @@ def convert(pdf_file):
     original_doc = fitz.open(pdf_file)
     plain_text = "\n".join([page.get_text() for page in original_doc])
     if len(plain_text.strip()) < 100:
         ocr_temp_path = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False).name
         ocrmypdf.ocr(pdf_file, ocr_temp_path, force_ocr=True)
@@ -51,35 +55,11 @@ def convert(pdf_file):
         doc = original_doc
     markdown = extract_text_markdown(doc)
-    metadata = {}  # Puedes agregar metadata si quieres
     return markdown, metadata
-# Gradio Interface
-with gr.Blocks(title="PDF → Markdown") as demo:
-    gr.Markdown("### PDF → Markdown con enlaces de imagen y botón copiar")
-    pdf_input = gr.File(label="Sube tu PDF", type="filepath")
-    markdown_output = gr.Textbox(label="Markdown generado", lines=25, elem_id="markdown-textbox")
-    metadata_output = gr.JSON(label="Metadata")
-    convert_btn = gr.Button("Convertir PDF")
-    # Botón copiar usando JS válido y accediendo al DOM real
-    gr.HTML("""
-    <button onclick="copyMarkdown()">📋 Copiar Markdown</button>
-    <script>
-        function copyMarkdown() {
-            const textarea = document.querySelector('#markdown-textbox textarea');
-            if (textarea) {
-                textarea.select();
-                document.execCommand('copy');
-            } else {
-                alert('No se pudo copiar el texto');
-            }
-        }
-    </script>
-    """)
-    convert_btn.click(fn=convert, inputs=pdf_input, outputs=[markdown_output, metadata_output])
-demo.launch()

 import fitz  # PyMuPDF
 import ocrmypdf
 import tempfile
+import os
 def extract_text_markdown(doc):
     markdown_output = ""
+    image_counter = 1  # Contador de imágenes
     for page in doc:
         blocks = page.get_text("dict")["blocks"]
             if b["type"] == 0:  # Texto
                 for line in b["lines"]:
                     line_y = line["bbox"][1]
+                    line_text = " ".join([span["text"] for span in line["spans"]]).strip()
                     if line_text:
                         elements.append((line_y, line_text))
             elif b["type"] == 1:  # Imagen
+                # Añade un enlace con nombre único
                 elements.append((y, f"[imagen_{image_counter}]()"))
                 image_counter += 1
+        # Ordenar por posición vertical
         elements.sort(key=lambda x: x[0])
+        # Reconstrucción con saltos lógicos
         previous_y = None
         for y, content in elements:
             if previous_y is not None and abs(y - previous_y) > 10:
     original_doc = fitz.open(pdf_file)
     plain_text = "\n".join([page.get_text() for page in original_doc])
+    # Aplicar OCR solo si el PDF no tiene texto
     if len(plain_text.strip()) < 100:
         ocr_temp_path = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False).name
         ocrmypdf.ocr(pdf_file, ocr_temp_path, force_ocr=True)
         doc = original_doc
     markdown = extract_text_markdown(doc)
+    metadata = {}  # Si necesitas metadatos, se pueden agregar aquí
     return markdown, metadata
+gr.Interface(
+    fn=convert,
+    inputs=[gr.File(label="Sube tu PDF", type="filepath")],
+    outputs=[gr.Text(label="Markdown estructurado"), gr.JSON(label="Metadata")],
+).launch()