Spaces:

GAS17
/

pdfextract

Runtime error

pdfextract / app.py

Update app.py

9204aaf verified 8 months ago

1.72 kB

	import gradio as gr
	import io
	import sys

	# doctr is a hard dependency. If it cannot be imported, print interpreter
	# details to help diagnose the environment, then re-raise the ImportError.
	try:
	    from doctr.models import ocr_predictor
	    from doctr.io import DocumentFile
	except ImportError:
	    print("Error: Failed to import doctr. Please ensure it's installed correctly.")
	    for label, value in (
	        ("Python version:", sys.version),
	        ("Python path:", sys.path),
	    ):
	        print(label, value)
	    raise

	# Build the module-level OCR predictor once at start-up, using the
	# 'db_resnet50' detection and 'crnn_vgg16_bn' recognition architectures
	# with pretrained=True. Any failure is reported and then re-raised.
	try:
	    model = ocr_predictor(
	        det_arch='db_resnet50',
	        reco_arch='crnn_vgg16_bn',
	        pretrained=True,
	    )
	except Exception as init_error:
	    print(f"Error initializing OCR model: {init_error}")
	    raise

	def _resolve_upload_path(file):
	    """Return the filesystem path of an uploaded file, or None if absent.

	    Accepts either a path string or an object exposing the path as ``.name``.
	    NOTE(review): which of the two Gradio passes depends on its version and
	    the ``gr.File`` ``type`` setting -- confirm against the deployed version.
	    """
	    if isinstance(file, str):
	        return file or None
	    return getattr(file, "name", None) or None


	def _result_to_text(result):
	    """Flatten an OCR result (pages -> blocks -> lines -> words) into text.

	    Words on a line are joined by single spaces, each line ends with a
	    newline, and each block is followed by one extra newline (a blank line
	    between blocks). Leading/trailing whitespace is stripped from the result.
	    """
	    pieces = []
	    for page in result.pages:
	        for block in page.blocks:
	            for line in block.lines:
	                pieces.append(" ".join(word.value for word in line.words))
	                pieces.append("\n")
	            pieces.append("\n")
	    return "".join(pieces).strip()


	def ocr_process(file):
	    """Run OCR on an uploaded PDF or image and return the extracted text.

	    Args:
	        file: The upload handed over by the Gradio ``File`` input: a path
	            string, an object whose ``.name`` is the path, or ``None`` when
	            nothing was uploaded.

	    Returns:
	        The recognised text, or a message starting with
	        ``"Error processing file: "`` if anything goes wrong. Errors are
	        returned rather than raised so they show up in the output textbox.
	    """
	    try:
	        path = _resolve_upload_path(file)
	        if path is None:
	            return "Error processing file: no file was uploaded"

	        # Files ending in .pdf are parsed as PDFs; anything else is assumed
	        # to be an image. The path is handed to doctr directly: from_images
	        # takes paths or raw bytes, not file-like objects such as BytesIO.
	        if str(path).lower().endswith('.pdf'):
	            doc = DocumentFile.from_pdf(path)
	        else:
	            doc = DocumentFile.from_images(path)

	        return _result_to_text(model(doc))
	    except Exception as e:
	        return f"Error processing file: {e}"

	# Wire ocr_process into a one-input, one-output Gradio UI: a file upload
	# goes in, the extracted text comes out in a textbox.
	iface = gr.Interface(
	    title="OCR with doctr",
	    description="Upload a PDF or image file to extract text using OCR.",
	    fn=ocr_process,
	    inputs=gr.File(label="Upload PDF or Image"),
	    outputs=gr.Textbox(label="Extracted Text"),
	)

	# Start serving the interface.
	iface.launch()