Spaces:

GAS17
/

pdfextract

Runtime error

pdfextract / app.py

Create app.py

5b2c368 verified 6 months ago

1.66 kB

	import gradio as gr
	import pytesseract
	from PIL import Image
	import os
	from pdf2image import convert_from_bytes
	import io

	# Point pytesseract at the Tesseract installation expected on Hugging Face
	# Spaces. Both overrides are applied only when that tessdata directory
	# exists, so nothing is changed on machines without this layout.
	tessdata_dir = "/home/user/.apt/usr/share/tesseract-ocr/4.00/tessdata"
	if os.path.exists(tessdata_dir):
	    pytesseract.pytesseract.tesseract_cmd = '/home/user/.apt/usr/bin/tesseract'
	    os.environ["TESSDATA_PREFIX"] = tessdata_dir

	def _read_upload(file):
	    """Return the raw bytes of an upload.

	    Accepts an object with a ``read`` method, raw ``bytes``/``bytearray``,
	    or a filesystem path (anything ``open`` accepts).
	    """
	    if hasattr(file, "read"):
	        return file.read()
	    if isinstance(file, (bytes, bytearray)):
	        return bytes(file)
	    # Anything else is treated as a path on disk.
	    with open(file, "rb") as handle:
	        return handle.read()


	def perform_ocr(file):
	    """Extract text from an uploaded image or PDF using Tesseract.

	    Args:
	        file: The value delivered by the Gradio ``File`` input. This may be
	            a file-like object exposing ``name`` and ``read``, a filesystem
	            path, or ``None`` when nothing was uploaded.

	    Returns:
	        The recognised text. For a PDF, the text of every page is followed
	        by two newlines. When no file was given, or the file could not be
	        opened or converted, a Spanish user-facing message is returned
	        instead of raising.
	    """
	    if file is None:
	        return "Por favor, sube un archivo."

	    # Depending on the Gradio version / ``type`` setting the upload can be a
	    # plain path string, which has no ``.name`` attribute.
	    if isinstance(file, (str, os.PathLike)):
	        filename = os.fspath(file)
	    else:
	        filename = getattr(file, "name", "") or ""

	    # The file type is decided from the extension only.
	    if str(filename).lower().endswith(".pdf"):
	        # Render every PDF page to an image before running OCR on it.
	        try:
	            pages = convert_from_bytes(_read_upload(file))
	        except Exception as e:  # UI boundary: report instead of crashing
	            return f"Error al procesar el PDF: {e}"
	        return "".join(
	            pytesseract.image_to_string(page) + "\n\n" for page in pages
	        )

	    # Everything else is handled as a single image.
	    try:
	        with Image.open(io.BytesIO(_read_upload(file))) as image:
	            return pytesseract.image_to_string(image)
	    except Exception as e:  # UI boundary: report instead of crashing
	        return f"Error al procesar la imagen: {e}"

	# Assemble the Gradio UI: one file upload wired to perform_ocr, with the
	# result shown as plain text.
	_upload_input = gr.File(label="Sube una imagen o PDF")
	_ui_title = "Tesseract OCR para Imágenes y PDFs"
	_ui_description = (
	    "Sube una imagen o un archivo PDF para extraer texto usando Tesseract OCR."
	)
	iface = gr.Interface(
	    fn=perform_ocr,
	    inputs=_upload_input,
	    outputs="text",
	    title=_ui_title,
	    description=_ui_description,
	)

	# Start serving the interface.
	iface.launch()