Spaces:
Running
Running
File size: 885 Bytes
145d936 3920f3b b697ac0 145d936 312add7 3920f3b c1d7645 3920f3b 145d936 3920f3b 145d936 3920f3b 145d936 c1d7645 145d936 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 |
import spaces
import gradio as gr
from pdf2image import convert_from_path
import pytesseract
from PIL import Image
import os
@spaces.GPU
def convert(pdf_file):
    """Convert a PDF into Markdown-style text by running OCR on every page.

    Each page is rasterized with ``convert_from_path`` and passed to
    ``pytesseract.image_to_string``. Pages are emitted in document order,
    separated by a blank line.

    Args:
        pdf_file: Path of the PDF file to convert.

    Returns:
        A ``(markdown, metadata)`` tuple. ``markdown`` is the recognized
        text of all pages joined by blank lines; a page on which OCR finds
        no text contributes the empty-link placeholder ``[imagen]()``.
        ``metadata`` is currently always an empty dict.
    """
    # Optional: metadata could be extracted with PyMuPDF if desired.
    metadata = {}

    sections = []
    for page_image in convert_from_path(pdf_file):
        # Run OCR once per page and strip surrounding whitespace up front,
        # so the emptiness check and the emitted text use the same value.
        text = pytesseract.image_to_string(page_image).strip()
        # A page without any recognized text is represented by an empty link.
        sections.append(text or "[imagen]()")

    # Every section is already stripped and non-empty, so joining with a
    # blank line yields no leading/trailing whitespace to trim afterwards.
    return "\n\n".join(sections), metadata
# Web UI: a single PDF upload (handed to `convert` as a file path) mapped to
# two outputs, the OCR text and the metadata returned by `convert`.
demo = gr.Interface(
    fn=convert,
    inputs=[gr.File(label="Upload PDF", type="filepath")],
    outputs=[
        gr.Text(label="Markdown"),
        gr.JSON(label="Metadata"),
    ],
)
# Start the app as soon as the module is executed.
demo.launch()
|