# Page header captured with the source (not code): "Spaces: Sleeping".
import spaces | |
import gradio as gr | |
import fitz # PyMuPDF | |
def convert(pdf_file):
    """Extract a PDF's text and image placeholders as Markdown-style text.

    Every text span and image block on a page is collected together with
    the top y-coordinate of its bounding box. The items of each page are
    then ordered top to bottom and emitted as separate paragraphs.

    Args:
        pdf_file: Path of the PDF to read; passed straight to ``fitz.open``.

    Returns:
        A ``(markdown, metadata)`` tuple. ``markdown`` holds the extracted
        items separated by blank lines; ``metadata`` is always an empty
        dict.
    """
    paragraphs = []
    # The context manager closes the document even if extraction raises, so
    # repeated requests do not accumulate open file handles.
    with fitz.open(pdf_file) as doc:
        for page in doc:
            elements = []  # (top y-coordinate, content) pairs for this page
            for block in page.get_text("dict")["blocks"]:
                if block["type"] == 0:  # text block
                    elements.extend(
                        (span["bbox"][1], span["text"])
                        for line in block["lines"]
                        for span in line["spans"]
                    )
                elif block["type"] == 1:  # image block
                    elements.append((block["bbox"][1], "[imagen]()"))
            # Order by vertical position. The sort is stable, so items that
            # share a y-coordinate keep their extraction order.
            elements.sort(key=lambda item: item[0])
            for _, content in elements:
                text = content.strip()
                # Whitespace-only spans would only add empty paragraphs.
                if text:
                    paragraphs.append(text)
    return "\n\n".join(paragraphs), {}
# Build the web UI and start serving it: a single PDF upload (handed to
# `convert` as a file path) in, a Markdown text box and a JSON panel out.
gr.Interface(
    fn=convert,
    inputs=[gr.File(label="Upload PDF", type="filepath")],
    outputs=[gr.Text(label="Markdown"), gr.JSON(label="Metadata")],
).launch()