pdf-to-markdown

Sleeping

File size: 2,331 Bytes

f79c813
 
145d936
 
d4b4544
4337f3a
7b1bb08
dd29269
4337f3a
e62d9f5
3920f3b
c20f519
c1d7645
e62d9f5
d4b4544
 
145d936
d4b4544
e62d9f5
beb65ba
d4b4544
479e852
 
 
 
beb65ba
891d450
 
beb65ba
7064c41
479e852
3e3d3c7
 
7b1bb08
3e3d3c7
 
 
 
b3fecd4
d4b4544
e62d9f5
 
c20f519
 
 
 
 
e62d9f5
 
f79c813
 
 
 
 
 
 
e62d9f5
f79c813
 
 
 
 
 
 
 
 
 
 
4337f3a
f79c813
145d936
dd29269

from PIL import Image
import pytesseract
import spaces
import gradio as gr
import fitz  # PyMuPDF
import ocrmypdf
import tempfile
import os

def extract_text_markdown(doc):
    """Render the pages of *doc* as loosely structured Markdown text.

    Every page is read through ``page.get_text("dict")``. Text blocks
    (``type == 0``) contribute one entry per non-empty line, built by joining
    the line's span texts with spaces. Image blocks (``type == 1``) contribute
    a numbered ``[imagen_N]()`` placeholder; the numbering keeps running
    across pages. Entries of a page are emitted ordered by the second
    component of their bounding box (the y coordinate used for ordering), and
    an extra blank line is inserted whenever two consecutive entries are more
    than 10 units apart. Each page is closed with a ``---`` rule.

    Args:
        doc: Iterable of page objects exposing ``get_text("dict")``.

    Returns:
        The assembled Markdown with leading/trailing whitespace stripped.
    """
    pieces = []
    next_image = 1

    for page in doc:
        entries = []

        for block in page.get_text("dict")["blocks"]:
            block_top = block["bbox"][1]
            kind = block["type"]

            if kind == 0:  # Text block: one entry per non-empty line
                for line in block["lines"]:
                    top = line["bbox"][1]
                    text = " ".join(span["text"] for span in line["spans"]).strip()
                    if not text:
                        continue
                    entries.append((top, text))
            elif kind == 1:  # Image block: numbered placeholder
                entries.append((block_top, f"[imagen_{next_image}]()"))
                next_image += 1

        # Stable sort on the y coordinate only, so entries sharing the same y
        # keep their original relative order.
        last_top = None
        for top, content in sorted(entries, key=lambda entry: entry[0]):
            # A vertical jump of more than 10 units is treated as a paragraph
            # break and rendered as a blank line.
            if last_top is not None and abs(top - last_top) > 10:
                pieces.append("\n")
            pieces.append(content + "\n")
            last_top = top

        # Page separator.
        pieces.append("\n---\n\n")

    return "".join(pieces).strip()

def needs_ocr(doc):
    """Decide whether *doc* looks like it needs OCR.

    The document is considered to need OCR when the stripped text of all
    pages totals fewer than 500 characters, or when any page reports at least
    one image via ``get_images(full=True)``.

    Args:
        doc: Iterable of page objects exposing ``get_text()`` and
            ``get_images(full=True)``. It is traversed twice.

    Returns:
        ``True`` if OCR is deemed necessary, ``False`` otherwise.
    """
    # First pass: total amount of extractable text.
    char_total = 0
    for page in doc:
        char_total += len(page.get_text().strip())

    # Second pass: total number of image entries.
    image_total = 0
    for page in doc:
        image_total += len(page.get_images(full=True))

    if char_total < 500:
        return True
    return image_total > 0

@spaces.GPU
def convert(pdf_file):
    """Convert a PDF file into Markdown, falling back to OCR per page.

    Pages whose extracted plain text is longer than 30 characters are rendered
    with ``extract_text_markdown``. All other pages are rasterized at 300 dpi
    and passed to Tesseract with the Spanish (``"spa"``) language model. Every
    page is followed by exactly one ``---`` rule.

    Args:
        pdf_file: Path of the PDF to open (the Gradio input is configured
            with ``type="filepath"``).

    Returns:
        A ``(markdown, metadata)`` tuple. ``markdown`` is the stripped
        Markdown text; ``metadata`` is always an empty dict.
    """
    page_chunks = []

    # The context manager closes the document even when extraction or OCR
    # raises, so the file handle is not leaked.
    with fitz.open(pdf_file) as doc:
        for page in doc:
            text = page.get_text("text").strip()

            if len(text) > 30:
                # Page with a regular text layer.
                page_md = extract_text_markdown([page])
                # extract_text_markdown already terminates its output with a
                # "---" rule; drop it so text pages do not end up with two
                # consecutive rules (OCR pages only ever get one).
                if page_md.endswith("---"):
                    page_md = page_md[:-3].rstrip()
            else:
                # Page without usable text: OCR a rendered image of the page.
                pix = page.get_pixmap(dpi=300)
                # NOTE(review): the "RGB" mode assumes the pixmap has no alpha
                # channel (believed to be PyMuPDF's default) — confirm.
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                page_md = pytesseract.image_to_string(img, lang="spa").strip()

            # Single page separator, identical for both branches.
            page_chunks.append(page_md + "\n\n---\n\n")

    return "".join(page_chunks).strip(), {}

# Build the Gradio UI: a single PDF upload (handed to `convert` as a file
# path) producing the Markdown text and a JSON metadata panel, then start it.
pdf_input = gr.File(label="Sube tu PDF", type="filepath")
markdown_view = gr.Text(label="Markdown estructurado")
metadata_view = gr.JSON(label="Metadata")

demo = gr.Interface(
    fn=convert,
    inputs=[pdf_input],
    outputs=[markdown_view, metadata_view],
)
demo.launch()