Spaces:

GAS17
/

pdfextract

Runtime error

pdfextract / app.py

Update app.py

966ab7d verified 6 months ago

1.24 kB

	import gradio as gr
	import io
	from doctr.io import DocumentFile
	from doctr.models import ocr_predictor

	# Build the docTR predictor once at import time so every request reuses it:
	# db_resnet50 for text detection, crnn_vgg16_bn for recognition, both loaded
	# with pretrained weights.
	model = ocr_predictor(
	    det_arch='db_resnet50',
	    reco_arch='crnn_vgg16_bn',
	    pretrained=True,
	)

	def ocr_process(file):
	    """Run OCR on an uploaded PDF or image and return the recognised text.

	    Args:
	        file: The value Gradio hands over for the file input. Either a path
	            string or an object whose ``name`` attribute is the path of the
	            uploaded file. ``None`` when nothing was uploaded.

	    Returns:
	        The recognised text with one output line per detected text line and
	        a blank line after each block, stripped of leading/trailing
	        whitespace. An empty string when no file was provided.
	    """
	    # Submitting the form without a file yields None; there is nothing to OCR.
	    if file is None:
	        return ""

	    # Work from the on-disk path in every case: a plain string is already the
	    # path, otherwise the upload object exposes it as ``name``.
	    path = getattr(file, "name", file)

	    if path.lower().endswith('.pdf'):
	        doc = DocumentFile.from_pdf(path)
	    else:
	        # Assume it's an image if not PDF. The path is passed directly rather
	        # than a BytesIO wrapper, which the image loader does not accept.
	        doc = DocumentFile.from_images(path)

	    # Perform OCR
	    result = model(doc)

	    # Collect the pieces and join once instead of growing a string with +=.
	    # Layout is unchanged: every word is followed by a space, every line by
	    # a newline, and every block by an extra newline.
	    pieces = []
	    for page in result.pages:
	        for block in page.blocks:
	            for line in block.lines:
	                pieces.extend(word.value + " " for word in line.words)
	                pieces.append("\n")
	            pieces.append("\n")

	    return "".join(pieces).strip()

	# Wire ocr_process into a single-input, single-output Gradio UI:
	# one file upload in, one text box out.
	iface = gr.Interface(
	    title="OCR with doctr",
	    description="Upload a PDF or image file to extract text using OCR.",
	    fn=ocr_process,
	    inputs=gr.File(label="Upload PDF or Image"),
	    outputs=gr.Textbox(label="Extracted Text"),
	)

	# Start serving the UI.
	iface.launch()