pdf-to-markdown

Sleeping

File size: 2,931 Bytes

145d936
 
d4b4544
4337f3a
7b1bb08
dd29269
4337f3a
e62d9f5
3920f3b
7064c41
c1d7645
e62d9f5
d4b4544
7064c41
d4b4544
145d936
d4b4544
e62d9f5
7064c41
beb65ba
d4b4544
7064c41
 
 
 
 
 
 
 
 
 
 
beb65ba
891d450
 
beb65ba
7064c41
 
 
 
 
 
 
d4b4544
7064c41
 
 
 
 
 
 
 
3e3d3c7
 
7b1bb08
3e3d3c7
 
 
 
b3fecd4
d4b4544
e62d9f5
 
 
 
 
 
 
dd29269
e62d9f5
 
 
 
 
 
4337f3a
e62d9f5
dd29269
e62d9f5
145d936
dd29269

import spaces
import gradio as gr
import fitz  # PyMuPDF
import ocrmypdf
import tempfile
import os

def _markdown_table_row(cells):
    """Render an iterable of cell strings as one Markdown table row.

    Each cell is stripped, and literal ``|`` characters are escaped so they
    are not interpreted as column separators.
    """
    escaped = (cell.strip().replace("|", "\\|") for cell in cells)
    return "| " + " | ".join(escaped) + " |\n"


def extract_text_markdown(doc):
    """Convert every page of *doc* into a rough Markdown rendering.

    Args:
        doc: An iterable of page objects. Each page must provide
            ``get_text("dict")`` returning a mapping with a ``"blocks"``
            list; blocks carry ``"type"`` and ``"bbox"``, and text blocks
            (type 0) carry ``"lines"`` made of ``"spans"`` with ``"text"``.
            (Presumably a PyMuPDF document; only this shape is relied on.)

    Returns:
        A single string. For each page: an optional table built from the
        lines that have more than one span, then the remaining lines and
        image placeholders ordered top to bottom, then a ``---`` separator.
        Leading and trailing whitespace of the whole result is stripped.
    """
    # Vertical distance between consecutive elements above which a blank
    # line is inserted.
    paragraph_gap = 10

    chunks = []  # Collected pieces, joined once at the end.
    image_counter = 1  # Numbered across the whole document, not per page.

    for page in doc:
        multi_span_lines = []  # (y, [span text, ...]) - table row candidates
        elements = []  # (y, markdown line) - loose text and images

        for block in page.get_text("dict")["blocks"]:
            if block["type"] == 0:  # Text block
                for line in block["lines"]:
                    line_y = line["bbox"][1]
                    texts = [span["text"] for span in line["spans"]]
                    if len(texts) > 1:
                        multi_span_lines.append((line_y, texts))
                        continue
                    line_text = " ".join(texts).strip()
                    if line_text:
                        elements.append((line_y, line_text))
            elif block["type"] == 1:  # Image block
                elements.append(
                    (block["bbox"][1], f"[imagen_{image_counter}]()")
                )
                image_counter += 1

        if len(multi_span_lines) >= 2:
            # Rudimentary table detection: the topmost multi-span line is
            # the header, every following one is a body row.
            multi_span_lines.sort(key=lambda item: item[0])
            header = multi_span_lines[0][1]
            chunks.append(_markdown_table_row(header))
            chunks.append(_markdown_table_row(["---"] * len(header)))
            chunks.extend(
                _markdown_table_row(texts)
                for _, texts in multi_span_lines[1:]
            )
            chunks.append("\n")
        else:
            # A lone multi-span line cannot form a table; keep its text as
            # an ordinary line instead of silently discarding it.
            for line_y, texts in multi_span_lines:
                line_text = " ".join(
                    filter(None, (text.strip() for text in texts))
                )
                if line_text:
                    elements.append((line_y, line_text))

        # Emit loose lines top to bottom, separating distant ones.
        elements.sort(key=lambda item: item[0])
        previous_y = None
        for line_y, content in elements:
            if previous_y is not None and abs(line_y - previous_y) > paragraph_gap:
                chunks.append("\n")
            chunks.append(content + "\n")
            previous_y = line_y

        chunks.append("\n---\n\n")

    return "".join(chunks).strip()

@spaces.GPU
def convert(pdf_file):
    """Convert an uploaded PDF into Markdown, running OCR when needed.

    Args:
        pdf_file: Filesystem path of the PDF, or ``None`` when nothing was
            uploaded.

    Returns:
        A ``(markdown, metadata)`` tuple. ``metadata`` is currently always
        an empty dict; ``markdown`` is empty when no file was given.
    """
    metadata = {}  # Placeholder: add document metadata here if needed.

    # Assumes the UI passes None when no file is selected; return empty
    # output rather than failing inside the PDF libraries.
    if not pdf_file:
        return "", metadata

    original_doc = fitz.open(pdf_file)
    try:
        plain_text = "\n".join(page.get_text() for page in original_doc)

        if len(plain_text.strip()) >= 100:
            markdown = extract_text_markdown(original_doc)
        else:
            # Fewer than 100 characters of embedded text: treat the PDF as
            # scanned and OCR it. The OCR output lives in a temporary
            # directory so it is removed once the text has been extracted.
            with tempfile.TemporaryDirectory() as tmp_dir:
                ocr_path = os.path.join(tmp_dir, "ocr.pdf")
                ocrmypdf.ocr(pdf_file, ocr_path, force_ocr=True)
                ocr_doc = fitz.open(ocr_path)
                try:
                    markdown = extract_text_markdown(ocr_doc)
                finally:
                    # Close before the directory is cleaned up so the file
                    # is no longer held open when it gets deleted.
                    ocr_doc.close()
    finally:
        original_doc.close()

    return markdown, metadata

# Web UI: one PDF upload in, the Markdown text and a metadata JSON view out.
pdf_input = gr.File(label="Sube tu PDF", type="filepath")
markdown_view = gr.Text(label="Markdown estructurado")
metadata_view = gr.JSON(label="Metadata")

demo = gr.Interface(
    fn=convert,
    inputs=[pdf_input],
    outputs=[markdown_view, metadata_view],
)
demo.launch()