File size: 885 Bytes
145d936
 
3920f3b
 
 
 
b697ac0
145d936
312add7
3920f3b
 
 
c1d7645
3920f3b
 
 
145d936
3920f3b
 
 
 
 
145d936
3920f3b
145d936
 
 
c1d7645
 
 
 
 
 
 
145d936
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import spaces
import gradio as gr
from pdf2image import convert_from_path
import pytesseract
from PIL import Image
import os

@spaces.GPU
def convert(pdf_file):
    """Run OCR over every page of a PDF and return the text as Markdown.

    Args:
        pdf_file: Filesystem path of the PDF to process. May be ``None``
            (or empty) when the form is submitted without an upload.

    Returns:
        A ``(markdown, metadata)`` tuple. ``markdown`` contains one block
        per page, separated by a blank line; a page on which OCR finds no
        text is represented by the placeholder ``[imagen]()``.
        ``metadata`` is currently always an empty dict.
    """
    metadata = {}  # Optional: metadata could be extracted with PyMuPDF if desired

    # Nothing was uploaded: return empty results instead of letting the
    # PDF conversion fail on a missing path.
    if not pdf_file:
        return "", metadata

    page_blocks = []
    for page_image in convert_from_path(pdf_file):
        # Run OCR on the page image and drop surrounding whitespace once.
        text = pytesseract.image_to_string(page_image).strip()
        # If there is no text, insert an empty link as a placeholder.
        page_blocks.append(text or "[imagen]()")

    # Every block is already stripped and non-empty, so joining with a blank
    # line yields the same string as appending "\n\n" per page and stripping.
    return "\n\n".join(page_blocks), metadata

# Web UI: a single PDF upload (passed to `convert` as a file path) feeding
# a text box for the Markdown and a JSON viewer for the metadata.
demo = gr.Interface(
    fn=convert,
    inputs=[gr.File(label="Upload PDF", type="filepath")],
    outputs=[gr.Text(label="Markdown"), gr.JSON(label="Metadata")],
)
demo.launch()