pdf-to-markdown / app.py
Biifruu's picture
Update app.py
beb65ba verified
raw
history blame
1.7 kB
import spaces
import gradio as gr
import fitz # PyMuPDF
import os
def _save_block_image(block, img_path):
    """Write the image carried by a PyMuPDF image block to *img_path* as PNG.

    The block dict returned by ``page.get_text("dict")`` already contains the
    encoded bytes of the image under the ``"image"`` key, so the picture saved
    is the one that belongs to this block.
    """
    pix = fitz.Pixmap(block["image"])
    # PNG can only hold gray or RGB data. Four or more colour components
    # (CMYK, with or without an alpha channel) must be converted first.
    if pix.n - pix.alpha >= 4:
        pix = fitz.Pixmap(fitz.csRGB, pix)
    pix.save(img_path)


@spaces.GPU
def convert(pdf_file):
    """Convert a PDF into a Markdown string.

    Each text line becomes one paragraph and each image block is written to
    the ``extracted_images`` directory and referenced with Markdown image
    syntax. Within a page, elements are ordered by the top edge of their
    bounding box.

    Args:
        pdf_file: Path of the PDF to convert (as provided by the file input).

    Returns:
        A ``(markdown, metadata)`` tuple; ``metadata`` is always an empty dict.
    """
    if not pdf_file:
        # Nothing uploaded: return an empty result.
        return "", {}

    image_dir = "extracted_images"
    os.makedirs(image_dir, exist_ok=True)
    image_counter = 0
    paragraphs = []

    # The context manager closes the document even if extraction fails.
    with fitz.open(pdf_file) as doc:
        for page in doc:
            elements = []
            for block in page.get_text("dict")["blocks"]:
                if block["type"] == 0:  # text block
                    for line in block["lines"]:
                        # Join the spans of a line so that a font change in
                        # the middle of a line does not split or reorder it.
                        text = "".join(
                            span["text"] for span in line["spans"]
                        ).strip()
                        if text:
                            elements.append((line["bbox"][1], text))
                elif block["type"] == 1:  # image block
                    img_path = os.path.join(
                        image_dir, f"imagen_{image_counter}.png"
                    )
                    _save_block_image(block, img_path)
                    elements.append(
                        (block["bbox"][1], f"![imagen]({img_path})")
                    )
                    image_counter += 1

            # Sort by vertical position (y); the sort is stable, so elements
            # sharing the same y keep their extraction order.
            elements.sort(key=lambda item: item[0])
            paragraphs.extend(content for _, content in elements)

    return "\n\n".join(paragraphs), {}
# Build the web UI: one PDF upload in, Markdown text plus a JSON panel out.
pdf_input = gr.File(label="Upload PDF", type="filepath")
markdown_view = gr.Text(label="Markdown")
metadata_view = gr.JSON(label="Metadata")

demo = gr.Interface(
    fn=convert,
    inputs=[pdf_input],
    outputs=[markdown_view, metadata_view],
)
demo.launch()