# Spaces: Running
# (Page-status text captured together with the source; it is not program code.)
import os

# `spaces` is kept first among the third-party imports, as in the original order.
import spaces
import gradio as gr
from pdf2image import convert_from_path
import pytesseract
from PIL import Image
def convert(pdf_file):
    """Run OCR over every page of a PDF and return the text as Markdown.

    Args:
        pdf_file: Filesystem path of the PDF to process. A falsy value
            (``None`` or ``""``) is treated as "nothing to convert"; this
            presumably happens when no file was supplied -- confirm against
            the caller.

    Returns:
        A ``(markdown, metadata)`` tuple. ``markdown`` holds the OCR text of
        the pages, in order, separated by blank lines; a page on which OCR
        finds no text is represented by the placeholder ``[imagen]()``.
        ``metadata`` is currently always an empty dict.
    """
    # Optional: metadata could be extracted with PyMuPDF if desired.
    metadata = {}

    # Nothing to rasterise: return empty results instead of handing a
    # missing path to pdf2image.
    if not pdf_file:
        return "", metadata

    sections = []
    for page_image in convert_from_path(pdf_file):
        text = pytesseract.image_to_string(page_image).strip()
        # If OCR produced no text, insert an empty link as a placeholder.
        sections.append(text or "[imagen]()")

    # Each section is already stripped and non-empty, so joining with a blank
    # line yields the same string as appending "\n\n" per page and stripping.
    return "\n\n".join(sections), metadata
# Web UI: one PDF upload in, the OCR'd Markdown text and a metadata JSON out.
demo = gr.Interface(
    convert,
    inputs=[
        # type="filepath" hands `convert` a path string rather than a file object.
        gr.File(label="Upload PDF", type="filepath"),
    ],
    outputs=[
        gr.Text(label="Markdown"),
        gr.JSON(label="Metadata"),
    ],
)

# Launched unconditionally at module level, exactly as the original did.
demo.launch()