pdf-to-markdown

Running

File size: 2,696 Bytes

145d936
 
d4b4544
7b1bb08
dd29269
75d0452
 
 
 
 
 
 
 
 
 
 
 
 
4337f3a
e62d9f5
3920f3b
c20f519
c1d7645
e62d9f5
d4b4544
 
145d936
d4b4544
e62d9f5
beb65ba
d4b4544
479e852
 
75d0452
479e852
75d0452
beb65ba
75d0452
891d450
beb65ba
7064c41
479e852
3e3d3c7
75d0452
 
 
 
 
7b1bb08
3e3d3c7
75d0452
 
 
 
 
 
3e3d3c7
75d0452
3e3d3c7
b3fecd4
d4b4544
e62d9f5
 
 
 
f79c813
 
 
 
 
 
 
e62d9f5
f79c813
 
 
 
75d0452
f79c813
 
 
75d0452
f79c813
 
4337f3a
f79c813
145d936
dd29269

import spaces
import gradio as gr
import fitz  # PyMuPDF
import tempfile
import os
from PIL import Image
import pytesseract

def clean_ocr_text(text):
    """Return *text* with each line trimmed and empty lines removed.

    The input is split into lines, surrounding whitespace is stripped from
    every line, and lines that end up empty are dropped. The surviving
    lines are joined back together with single newlines.
    """
    trimmed = (raw_line.strip() for raw_line in text.splitlines())
    # A stripped line is either empty or contains visible characters, so a
    # plain truthiness test is enough to discard blank lines.
    return "\n".join(piece for piece in trimmed if piece)

def extract_text_markdown(doc, *, header_min_size=14, paragraph_gap=10) -> str:
    """Render the text and image blocks of PDF pages as Markdown.

    For every page, text lines and image placeholders are ordered by the
    top coordinate of their bounding box. A line whose largest span font
    size is at least ``header_min_size`` becomes a ``###`` heading, and a
    blank line is inserted whenever two consecutive elements are more than
    ``paragraph_gap`` page-coordinate units apart vertically. Consecutive
    pages are separated by a ``---`` rule; no rule is emitted after the
    last page, so callers that add their own separator do not get two.

    Args:
        doc: Iterable of page objects exposing ``get_text("dict")`` (for
            example a PyMuPDF document or a list of its pages).
        header_min_size: Minimum font size for a line to be treated as a
            heading.
        paragraph_gap: Vertical distance above which a blank line is
            inserted between two consecutive elements.

    Returns:
        The Markdown text with surrounding whitespace stripped.
    """
    page_separator = "\n---\n\n"
    default_font_size = 10  # used for spans without a size and for images
    page_chunks = []
    image_counter = 1  # numbering continues across the pages of one call

    for page in doc:
        elements = []  # (top_y, markdown_text, font_size)

        for block in page.get_text("dict")["blocks"]:
            if block["type"] == 0:  # text block
                for line in block["lines"]:
                    spans = line["spans"]
                    line_text = " ".join(span["text"] for span in spans).strip()
                    if not line_text:
                        # Also skips lines without spans, for which max()
                        # below would raise ValueError.
                        continue
                    font_size = max(
                        span.get("size", default_font_size) for span in spans
                    )
                    elements.append((line["bbox"][1], line_text, font_size))
            elif block["type"] == 1:  # image block
                placeholder = f"![imagen_{image_counter}](#)"
                elements.append((block["bbox"][1], placeholder, default_font_size))
                image_counter += 1

        # The sort is stable, so elements sharing the same top coordinate
        # keep their extraction order.
        elements.sort(key=lambda element: element[0])

        parts = []
        previous_y = None
        for y, text, font_size in elements:
            if previous_y is not None and abs(y - previous_y) > paragraph_gap:
                parts.append("\n")

            if font_size >= header_min_size:
                parts.append(f"\n### {text}\n")
            else:
                parts.append(text + "\n")

            previous_y = y

        page_chunks.append("".join(parts))

    return page_separator.join(page_chunks).strip()

@spaces.GPU
def convert(pdf_file):
    """Convert a PDF file to Markdown, using OCR for pages without text.

    Pages whose extracted plain text is longer than 30 characters are
    rendered with ``extract_text_markdown``. Any other page is rasterized
    at 300 dpi and passed through Tesseract with the Spanish language
    model, then tidied with ``clean_ocr_text``. A ``---`` rule is appended
    after every page.

    Args:
        pdf_file: Path of the PDF to open (the Gradio file input is
            configured with ``type="filepath"``).

    Returns:
        A ``(markdown, metadata)`` tuple; ``metadata`` is currently always
        an empty dict.
    """
    page_chunks = []

    # The context manager closes the document even if rendering or OCR
    # raises part-way through.
    with fitz.open(pdf_file) as doc:
        for page in doc:
            text = page.get_text("text").strip()

            if len(text) > 30:
                # Page with a regular text layer
                page_markdown = extract_text_markdown([page])
            else:
                # Page with little or no text: fall back to OCR
                pix = page.get_pixmap(dpi=300)
                img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
                ocr_text = pytesseract.image_to_string(img, lang="spa")
                page_markdown = clean_ocr_text(ocr_text)

            page_chunks.append(page_markdown + "\n" + "\n---\n\n")

    return "".join(page_chunks).strip(), {}

# Build the web UI: one PDF upload as input; the generated Markdown and a
# JSON metadata panel as outputs. The app is launched as soon as the
# module is executed.
demo = gr.Interface(
    fn=convert,
    inputs=[
        gr.File(label="Sube tu PDF", type="filepath"),
    ],
    outputs=[
        gr.Text(label="Markdown estructurado"),
        gr.JSON(label="Metadata"),
    ],
)
demo.launch()