pdf-to-markdown

Running

File size: 4,157 Bytes

145d936
 
d4b4544
75d0452
 
584dc82
75d0452
 
 
 
 
 
 
 
 
4337f3a
584dc82
3920f3b
c20f519
c1d7645
6e5a37b
d4b4544
 
145d936
4f878aa
d4b4544
e62d9f5
6e5a37b
beb65ba
d4b4544
479e852
 
75d0452
479e852
75d0452
6e5a37b
4f878aa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7064c41
3e3d3c7
75d0452
 
 
7b1bb08
3e3d3c7
75d0452
 
 
 
3e3d3c7
 
b3fecd4
d4b4544
e62d9f5
 
 
 
f79c813
 
9e2e286
f79c813
 
 
 
e62d9f5
f79c813
584dc82
f79c813
 
 
6e5a37b
 
 
9e2e286
 
6e5a37b
 
 
 
 
4f878aa
6e5a37b
 
 
 
f79c813
 
4337f3a
4f878aa
 
 
 
 
 
145d936
dd29269
 
 
565985f
1de2023
565985f
4f878aa
 
565985f
dd29269

import spaces
import gradio as gr
import fitz  # PyMuPDF
from PIL import Image
import pytesseract
import os

def clean_ocr_text(text):
    """Trim every line of OCR output and discard the ones left empty.

    Args:
        text: Raw text as returned by the OCR engine.

    Returns:
        The surviving lines, stripped of surrounding whitespace and joined
        with a single newline.
    """
    trimmed = (raw_line.strip() for raw_line in text.splitlines())
    # A stripped line is either empty or contains visible characters, so a
    # plain truthiness test is enough to drop the blank ones.
    return "\n".join(piece for piece in trimmed if piece)

def extract_text_markdown(doc, image_paths):
    """Render the text and embedded images of PDF pages as Markdown.

    Lines whose largest span font size is 14 or more become ``###`` headings;
    all other lines are emitted as plain text. Every embedded image is written
    under ``/tmp`` and referenced with Markdown image syntax.

    Args:
        doc: An iterable of PyMuPDF pages: either an open document or a plain
            sequence of pages. In the latter case each page's ``parent``
            document is used to extract the image data.
        image_paths: List that is extended in place with the path of every
            image written to disk.

    Returns:
        The Markdown for all pages, each page followed by a ``---`` rule,
        with leading and trailing whitespace stripped.
    """
    header_min_size = 14  # Font size from which a line is rendered as heading.
    default_size = 10  # Size assumed for spans without one and for images.
    paragraph_gap = 10  # Vertical distance that triggers a blank line.

    chunks = []
    image_counter = 1
    # Only a real document can extract images; a bare list of pages cannot.
    doc_extracts_images = hasattr(doc, "extract_image")

    for page_num, page in enumerate(doc):
        elements = []

        # Collect the text lines together with their vertical position.
        for block in page.get_text("dict")["blocks"]:
            if block["type"] != 0:  # Only text blocks carry lines.
                continue
            for line in block["lines"]:
                spans = line["spans"]
                line_text = " ".join(span["text"] for span in spans).strip()
                if not line_text:
                    continue
                max_font_size = max(
                    (span.get("size", default_size) for span in spans),
                    default=default_size,
                )
                elements.append((line["bbox"][1], line_text, max_font_size))

        # When called with a list of pages, reach the owning document through
        # the page itself so that image extraction keeps working.
        owner = doc if doc_extracts_images else getattr(page, "parent", None)
        # Use the page's real index so that files from different pages never
        # overwrite each other when this function is called page by page.
        page_label = getattr(page, "number", page_num) + 1

        # Extract the images embedded in the page (looked up by xref).
        for img_index, img in enumerate(page.get_images(full=True)):
            xref = img[0]
            y_pos = 50 + img_index * 10  # Estimated position, used for sorting.
            try:
                base_image = owner.extract_image(xref)
                image_bytes = base_image["image"]
                ext = base_image["ext"]
                image_path = f"/tmp/imagen_embebida_{page_label}_{img_index + 1}.{ext}"

                with open(image_path, "wb") as f:
                    f.write(image_bytes)

                image_paths.append(image_path)
                elements.append(
                    (y_pos, f"![imagen_{image_counter}]({image_path})", default_size)
                )
                image_counter += 1
            except Exception as e:
                # Best effort: a broken image must not abort the conversion,
                # so the failure is reported inline in the Markdown instead.
                elements.append((y_pos, f"[Error imagen: {e}]", default_size))

        # Sort top to bottom and build the Markdown for this page.
        elements.sort(key=lambda item: item[0])
        previous_y = None

        for y, text, font_size in elements:
            if previous_y is not None and abs(y - previous_y) > paragraph_gap:
                chunks.append("\n")
            if font_size >= header_min_size:
                chunks.append(f"\n### {text.strip()}\n")
            else:
                chunks.append(text.strip() + "\n")
            previous_y = y

        chunks.append("\n---\n\n")

    return "".join(chunks).strip()

@spaces.GPU
def convert(pdf_file):
    """Convert a PDF into Markdown, running OCR on pages without a text layer.

    Pages with more than 30 characters of extractable text are rendered by
    ``extract_text_markdown``. Every other page is rasterised at 300 dpi,
    saved as a JPEG under ``/tmp`` and passed through Tesseract.

    Args:
        pdf_file: Path of the PDF file to convert.

    Returns:
        A 4-tuple ``(markdown, metadata, image_paths, markdown_path)``: the
        stripped Markdown text, an empty metadata dict, the paths of all
        images written to disk, and the path of the saved ``.md`` file.
    """
    min_text_chars = 30  # Less text than this is treated as a scanned page.
    sections = []
    image_paths = []

    # The context manager closes the document even if a page fails.
    with fitz.open(pdf_file) as doc:
        for page_num, page in enumerate(doc):
            text = page.get_text("text").strip()

            if len(text) > min_text_chars:
                # The helper's output already ends with a "---" rule, so only
                # spacing is added here; a second rule would be a duplicate.
                sections.append(extract_text_markdown([page], image_paths) + "\n\n")
                continue

            pix = page.get_pixmap(dpi=300)
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

            image_path = f"/tmp/ocr_page_{page_num + 1}.jpg"
            img.save(image_path)
            image_paths.append(image_path)

            sections.append(f"![imagen_pagina_{page_num + 1}]({image_path})\n")

            try:
                ocr_text = pytesseract.image_to_string(img, lang="spa")
            except pytesseract.TesseractError:
                # Retry with Tesseract's default language when the Spanish
                # run fails.
                ocr_text = pytesseract.image_to_string(img)

            ocr_text = clean_ocr_text(ocr_text)
            if ocr_text.strip():
                sections.append(ocr_text + "\n")

            sections.append("\n---\n\n")

    markdown_output = "".join(sections)

    # Save the result as a .md file so it can be downloaded.
    markdown_path = "/tmp/resultado.md"
    with open(markdown_path, "w", encoding="utf-8") as f:
        f.write(markdown_output)

    return markdown_output.strip(), {}, image_paths, markdown_path

# Build the Gradio UI around `convert` and start serving it as soon as this
# module is executed. The single input hands `convert` the uploaded PDF as a
# file path; the four output components are listed in the same order as the
# four values `convert` returns (markdown text, metadata dict, image paths,
# path of the generated .md file).
gr.Interface(
    fn=convert,
    inputs=[gr.File(label="Sube tu PDF", type="filepath")],
    outputs=[
        gr.Markdown(label="Markdown estructurado"),
        gr.JSON(label="Metadata"),
        # NOTE(review): confirm that type="file" is accepted by the Gallery
        # component of the installed Gradio version.
        gr.Gallery(label="Imágenes extraídas", type="file"),
        gr.File(label="Descargar .md")
    ],
).launch()