|
import gradio as gr |
|
import pytesseract |
|
from pdf2image import convert_from_path |
|
|
|
def extract_text_from_pdf(pdf_file):
    """Run OCR over every page of an uploaded PDF and return the text.

    Args:
        pdf_file: The upload handed over by the file input. Either an object
            exposing the file path as ``.name`` or a plain path string;
            ``None`` when nothing was uploaded.

    Returns:
        The recognized text of all pages, each page's text followed by a
        newline. An empty string when no file was provided.
    """
    if pdf_file is None:
        # Nothing was uploaded; return early instead of failing on `.name`.
        return ''

    # Accept both file-like wrappers (path stored in `.name`) and bare paths.
    pdf_path = getattr(pdf_file, 'name', pdf_file)

    # Rasterize each page to an image; 600 is the render resolution (DPI)
    # passed to pdf2image.
    pages = convert_from_path(pdf_path, 600)

    # OCR page by page and join once rather than growing a string with `+=`.
    return ''.join(pytesseract.image_to_string(page) + '\n' for page in pages)
|
|
|
|
|
# Web UI: one PDF upload as input, the OCR result as plain text output.
iface = gr.Interface(
    fn=extract_text_from_pdf,
    # Use the top-level gr.File component; the legacy `gr.inputs` namespace
    # is deprecated and absent from newer Gradio releases.
    inputs=gr.File(label="Sube tu archivo PDF"),
    outputs="text",
    title="Extractor de Texto de PDF",
    description="Sube un archivo PDF escaneado y extrae el texto usando OCR.",
)

# Start serving the interface.
iface.launch()