pdfextract / app.py
GAS17's picture
Update app.py
9204aaf verified
raw
history blame
1.72 kB
import gradio as gr
import io
import sys
try:
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
except ImportError:
print("Error: Failed to import doctr. Please ensure it's installed correctly.")
print("Python version:", sys.version)
print("Python path:", sys.path)
raise
# Build the OCR predictor once at import time; ocr_process() reuses this
# module-level instance for every request.
try:
    model = ocr_predictor(
        det_arch='db_resnet50',
        reco_arch='crnn_vgg16_bn',
        pretrained=True,
    )
except Exception as e:
    # Report why construction failed, then re-raise so startup still aborts.
    print(f"Error initializing OCR model: {e}")
    raise
def _result_to_text(result):
    """Flatten an OCR result (pages -> blocks -> lines -> words) into text.

    Words on a line are joined with single spaces, lines within a block with
    a newline, and blocks (across all pages) with a blank line.
    """
    block_texts = (
        "\n".join(
            " ".join(word.value for word in line.words)
            for line in block.lines
        )
        for page in result.pages
        for block in page.blocks
    )
    return "\n\n".join(block_texts).strip()


def ocr_process(file):
    """Run OCR on an uploaded PDF or image and return the recognised text.

    Args:
        file: The uploaded file. Either a filesystem path string or any
            object exposing that path as ``.name``; ``None`` when nothing
            was uploaded.

    Returns:
        The extracted text: one output line per recognised text line, with
        a blank line between blocks. If anything goes wrong, a message
        starting with ``"Error processing file: "`` is returned instead of
        raising, so the UI always has something to display.
    """
    if file is None:
        # Nothing was uploaded; answer clearly instead of surfacing an
        # AttributeError about NoneType.
        return "Error processing file: no file was uploaded."

    try:
        # Accept both a plain path string and a wrapper carrying the path
        # in ``.name``, so no ``.read()`` method is required on the upload.
        path = str(getattr(file, "name", file))

        if path.lower().endswith(".pdf"):
            doc = DocumentFile.from_pdf(path)
        else:
            # Anything that is not a PDF is treated as an image. The path is
            # passed directly: the docTR loader takes a path or raw bytes,
            # not a file-like stream.
            doc = DocumentFile.from_images(path)

        result = model(doc)
        return _result_to_text(result)
    except Exception as e:
        # UI boundary: report the failure as text rather than crashing.
        return f"Error processing file: {e}"
# Expose ocr_process through a Gradio UI: one file upload in, one text box out.
iface = gr.Interface(
    title="OCR with doctr",
    description="Upload a PDF or image file to extract text using OCR.",
    fn=ocr_process,
    inputs=gr.File(label="Upload PDF or Image"),
    outputs=gr.Textbox(label="Extracted Text"),
)

# Start serving the interface.
iface.launch()