Spaces:
Running
Running
import spaces | |
import gradio as gr | |
import fitz # PyMuPDF | |
import os | |
def convert(pdf_file):
    """Convert a PDF into Markdown text, saving embedded images to disk.

    Text spans and image blocks of every page are collected together with the
    top ``y`` coordinate of their bounding box, ordered top-to-bottom within
    the page, and joined with blank lines. Each image block is written as a
    PNG into the ``extracted_images`` directory and referenced from the
    Markdown through an image link.

    Args:
        pdf_file: Path of the PDF to convert. A falsy value (no upload)
            yields an empty result.

    Returns:
        A ``(markdown, metadata)`` tuple; ``metadata`` is always an empty
        dict.
    """
    if not pdf_file:
        # No file was provided: there is nothing to open or extract.
        return "", {}

    image_dir = "extracted_images"
    os.makedirs(image_dir, exist_ok=True)
    image_counter = 0
    sections = []

    # The context manager closes the document even when extraction fails.
    with fitz.open(pdf_file) as doc:
        for page in doc:
            elements = []
            for block in page.get_text("dict")["blocks"]:
                if block["type"] == 0:  # Text block
                    for line in block["lines"]:
                        for span in line["spans"]:
                            text = span["text"].strip()
                            # Whitespace-only spans would only add blank
                            # paragraphs to the output.
                            if text:
                                elements.append((span["bbox"][1], text))
                elif block["type"] == 1:  # Image block
                    img_path = os.path.join(
                        image_dir, f"imagen_{image_counter}.png"
                    )
                    # Decode this block's own image bytes; looking the image
                    # up in the page image list would pick the first image
                    # of the page for every block.
                    pix = fitz.Pixmap(block["image"])
                    if pix.n - pix.alpha > 3:
                        # CMYK (with or without alpha) cannot be written as
                        # PNG, so convert to RGB first.
                        pix = fitz.Pixmap(fitz.csRGB, pix)
                    pix.save(img_path)
                    pix = None  # Release the pixmap memory early.
                    elements.append(
                        (
                            block["bbox"][1],
                            f"![imagen_{image_counter}]({img_path})",
                        )
                    )
                    image_counter += 1

            # Order by vertical position; the sort is stable, so spans that
            # share a line keep their extraction order.
            elements.sort(key=lambda item: item[0])
            sections.extend(content for _, content in elements)

    return "\n\n".join(sections), {}
# Build the web UI: a single PDF upload in, Markdown text plus a JSON
# metadata panel out, then start serving it.
pdf_input = gr.File(label="Upload PDF", type="filepath")
markdown_view = gr.Text(label="Markdown")
metadata_view = gr.JSON(label="Metadata")

demo = gr.Interface(
    fn=convert,
    inputs=[pdf_input],
    outputs=[markdown_view, metadata_view],
)
demo.launch()