Persian-OCR / app.py
suprimedev's picture
Update app.py
1f78813 verified
raw
history blame
1.7 kB
import gradio as gr
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
# Languages that should be recognised simultaneously by every OCR call.
# (The matching language packs must be installed for Tesseract.)
# Tesseract language codes joined with "+": English, Persian (Farsi), Arabic,
# Russian, Spanish and French. The string is passed unchanged as the ``lang``
# argument of ``pytesseract.image_to_string``.
AUTO_LANGS = "eng+fas+ara+rus+spa+fra"
def ocr_auto(input_file):
    """Extract text from a PDF or an image using Tesseract.

    Args:
        input_file: Either a filesystem path (``str``) or an already opened
            ``PIL.Image.Image``. A path ending in ``.pdf`` (any letter case)
            is rasterised page by page with ``convert_from_path``; any other
            path is opened as an image.

    Returns:
        The recognised text with leading/trailing whitespace stripped. For a
        PDF, each page's text is preceded by a ``--- Page N ---`` header.
        Inputs of an unsupported type yield an empty string.
    """
    if isinstance(input_file, str):
        # Compare case-insensitively so "scan.PDF" is treated as a PDF too.
        if input_file.lower().endswith(".pdf"):
            pages = convert_from_path(input_file)
            sections = [
                f"\n--- Page {page_number} ---\n"
                f"{pytesseract.image_to_string(page, lang=AUTO_LANGS)}"
                for page_number, page in enumerate(pages, start=1)
            ]
            return "".join(sections).strip()

        # A non-PDF path is assumed to point at an image; the context manager
        # releases the underlying file handle once OCR has finished.
        with Image.open(input_file) as image:
            return pytesseract.image_to_string(image, lang=AUTO_LANGS).strip()

    if isinstance(input_file, Image.Image):
        return pytesseract.image_to_string(input_file, lang=AUTO_LANGS).strip()

    # Unsupported input type: keep the historical "no text" result.
    return ""
def gradio_interface():
    """Build the Gradio UI for the OCR tool and launch it.

    The interface offers a PDF/Image radio selector, a file upload restricted
    to ``.pdf``, ``.png``, ``.jpg`` and ``.jpeg``, and a read-only textbox
    that shows the extracted text. The call to ``launch()`` starts the app.
    """
    input_type = gr.Radio(["PDF", "Image"], label="Choose Input Type", value="PDF")
    file_input = gr.File(
        label="Upload PDF/Image",
        file_types=[".pdf", ".png", ".jpg", ".jpeg"],
    )
    output_text = gr.Textbox(label="Extracted Text", interactive=False)

    def process(input_type, file):
        """Run OCR on the uploaded file and return the extracted text.

        Args:
            input_type: The radio selection, ``"PDF"`` or ``"Image"``. It is
                only used when the file extension is not recognised.
            file: The uploaded file as handed over by Gradio: either an
                object exposing the path as ``.name`` or a plain path string.
                Falsy when nothing was uploaded.

        Returns:
            The OCR text, or a warning message when no file was provided.
        """
        if not file:
            return "⚠️ Please upload a file first."

        # Accept both a file-like object with ``.name`` and a bare path.
        path = getattr(file, "name", file)

        # Trust the actual extension over the radio button so a mismatched
        # selection neither crashes nor silently yields empty text.
        lowered = str(path).lower()
        if lowered.endswith(".pdf"):
            is_pdf = True
        elif lowered.endswith((".png", ".jpg", ".jpeg")):
            is_pdf = False
        else:
            is_pdf = input_type == "PDF"

        if is_pdf:
            return ocr_auto(path)

        # Close the image file as soon as OCR is done instead of leaking the
        # handle on every request.
        with Image.open(path) as image:
            return ocr_auto(image)

    gr.Interface(
        fn=process,
        inputs=[input_type, file_input],
        outputs=[output_text],
        title="Auto OCR (PDF/Image)",
        description="Upload a PDF or Image. OCR will automatically detect and extract text in multiple languages.",
    ).launch()
# Run: this module-level call builds the Gradio interface and launches it
# whenever the file is executed (or imported).
gradio_interface()