# pdfextract / app.py
# GAS17's picture
# Update app.py
# 966ab7d verified
# raw
# history blame
# 1.24 kB
import gradio as gr
import io
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
# Build the OCR predictor once, at import time, with pretrained weights for
# the 'db_resnet50' detection and 'crnn_vgg16_bn' recognition architectures.
model = ocr_predictor(
    det_arch='db_resnet50',
    reco_arch='crnn_vgg16_bn',
    pretrained=True,
)
def ocr_process(file):
    """Run OCR on an uploaded PDF or image and return the recognised text.

    Args:
        file: The value Gradio supplies for the file input. May be a
            filesystem path (``str`` or path-like), an object that exposes
            the path of the uploaded file as ``.name``, or ``None`` when
            nothing was uploaded.

    Returns:
        The extracted text: words of an OCR line joined by single spaces,
        one OCR line per text line, and an empty line after each block.
        Leading/trailing whitespace is stripped. An empty string is
        returned when ``file`` is ``None``.
    """
    import os

    # Nothing uploaded: return empty text instead of failing on ``None``.
    if file is None:
        return ""

    # Resolve the on-disk path whichever shape the upload arrives in.
    if isinstance(file, (str, os.PathLike)):
        path = os.fspath(file)
    else:
        path = file.name

    if path.lower().endswith(".pdf"):
        doc = DocumentFile.from_pdf(path)
    else:
        # Anything that is not a PDF is treated as an image. The path is
        # passed directly: a stream object would be iterated chunk by chunk
        # rather than decoded as a single image.
        doc = DocumentFile.from_images(path)

    # Perform OCR
    result = model(doc)

    # Flatten pages -> blocks -> lines -> words into plain text.
    pieces = []
    for page in result.pages:
        for block in page.blocks:
            for line in block.lines:
                pieces.append(" ".join(word.value for word in line.words))
                pieces.append("\n")
            # Extra newline separates consecutive blocks with a blank line.
            pieces.append("\n")
    return "".join(pieces).strip()
# Describe the UI: a single file-upload input whose contents are passed to
# ocr_process, with the returned text shown in a textbox.
upload_input = gr.File(label="Upload PDF or Image")
text_output = gr.Textbox(label="Extracted Text")

iface = gr.Interface(
    fn=ocr_process,
    inputs=upload_input,
    outputs=text_output,
    title="OCR with doctr",
    description="Upload a PDF or image file to extract text using OCR.",
)

# Start serving the interface.
iface.launch()