pdf-to-markdown

Sleeping

File size: 1,946 Bytes

145d936
 
d4b4544
4337f3a
7b1bb08
 
4337f3a
e62d9f5
3920f3b
c1d7645
e62d9f5
d4b4544
 
145d936
d4b4544
e62d9f5
beb65ba
d4b4544
3e3d3c7
 
 
 
beb65ba
b3fecd4
beb65ba
7b1bb08
d4b4544
 
b3fecd4
3e3d3c7
 
7b1bb08
3e3d3c7
 
 
 
b3fecd4
d4b4544
e62d9f5
 
 
 
 
 
 
7b1bb08
e62d9f5
 
 
 
 
 
4337f3a
e62d9f5
7b1bb08
e62d9f5
145d936
 
e62d9f5
7b1bb08
 
145d936

import spaces
import gradio as gr
import fitz  # PyMuPDF
import ocrmypdf
import tempfile
import os

def extract_text_markdown(doc):
    markdown_output = ""

    for page in doc:
        blocks = page.get_text("dict")["blocks"]
        elements = []

        for b in blocks:
            y = b["bbox"][1]
            if b["type"] == 0:  # Texto
                for line in b["lines"]:
                    line_y = line["bbox"][1]
                    line_text = " ".join([span["text"] for span in line["spans"]]).strip()
                    if line_text:
                        elements.append((line_y, line_text))
            elif b["type"] == 1:  # Imagen
                elements.append((y, "[imagen]()"))  # Enlace vacío

        # Ordenar por posición vertical
        elements.sort(key=lambda x: x[0])

        # Reconstruir con saltos
        previous_y = None
        for y, content in elements:
            if previous_y is not None and abs(y - previous_y) > 10:
                markdown_output += "\n"
            markdown_output += content + "\n"
            previous_y = y

        markdown_output += "\n---\n\n"

    return markdown_output.strip()

@spaces.GPU
def convert(pdf_file):
    original_doc = fitz.open(pdf_file)
    plain_text = "\n".join([page.get_text() for page in original_doc])

    # Aplicar OCR solo si el PDF no tiene texto
    if len(plain_text.strip()) < 100:
        ocr_temp_path = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False).name
        ocrmypdf.ocr(pdf_file, ocr_temp_path, force_ocr=True)
        doc = fitz.open(ocr_temp_path)
    else:
        doc = original_doc

    markdown = extract_text_markdown(doc)
    metadata = {}  # Si necesitas metadatos, se pueden agregar aquí
    return markdown, metadata

gr.Interface(
    fn=convert,
    inputs=[gr.File(label="Sube tu PDF", type="filepath")],
    outputs=[gr.Text(label="Markdown estructurado"), gr.JSON(label="Metadata")],
).launch()