# pdf-to-markdown / app.py
# Biifruu's picture
# Update app.py
# 3920f3b verified
# raw
# history blame
# 885 Bytes
import spaces
import gradio as gr
from pdf2image import convert_from_path
import pytesseract
from PIL import Image
import os
@spaces.GPU
def convert(pdf_file):
    """Convert a PDF to Markdown-style text by running OCR on every page.

    Each page is rendered to an image with ``convert_from_path`` and passed
    to ``pytesseract.image_to_string``. Pages are emitted in document order,
    separated by a blank line. A page whose OCR result is empty or
    whitespace-only is represented by the empty-link placeholder
    ``[imagen]()``.

    Args:
        pdf_file: Path of the PDF to convert (the UI's file input is
            configured with ``type="filepath"``). A falsy value such as
            ``None`` or ``""`` -- presumably what the UI passes when nothing
            was uploaded; confirm against the Gradio version in use --
            produces an empty result instead of an error.

    Returns:
        A ``(markdown, metadata)`` tuple. ``markdown`` is the combined page
        text with no leading or trailing whitespace; ``metadata`` is
        currently always an empty dict.
    """
    # Optional: metadata could be extracted with PyMuPDF if desired.
    metadata = {}

    # Nothing to convert: avoid handing an empty path to the PDF renderer.
    if not pdf_file:
        return "", metadata

    sections = []
    for page_image in convert_from_path(pdf_file):
        # Run OCR on the rendered page.
        text = pytesseract.image_to_string(page_image).strip()
        # If the page has no text, insert an empty link as a placeholder.
        sections.append(text or "[imagen]()")

    # Every section is already stripped and non-empty, so joining with a
    # blank line needs no trailing cleanup.
    return "\n\n".join(sections), metadata
# Build the web UI: a single PDF upload feeds convert(), whose two return
# values are shown as a text box and a JSON viewer respectively.
demo = gr.Interface(
    fn=convert,
    inputs=[gr.File(label="Upload PDF", type="filepath")],
    outputs=[
        gr.Text(label="Markdown"),
        gr.JSON(label="Metadata"),
    ],
)
demo.launch()