Spaces:
Sleeping
Sleeping
import gradio as gr | |
import pytesseract | |
from pdf2image import convert_from_path | |
from PIL import Image | |
# Languages to support simultaneously in a single OCR pass.
# (The matching language packs must be installed for Tesseract.)
# The codes are joined with "+" and passed as the ``lang`` argument of
# ``pytesseract.image_to_string``.
AUTO_LANGS = "eng+fas+ara+rus+spa+fra"
def ocr_auto(input_file):
    """Extract text from a PDF path or a PIL image using Tesseract.

    Args:
        input_file: Either a filesystem path (``str``) to a PDF file, or an
            already-opened ``PIL.Image.Image``.

    Returns:
        The recognized text with leading/trailing whitespace stripped. For a
        PDF, each page's text is preceded by a ``--- Page N ---`` header.
        Any other kind of input yields an empty string.
    """
    # Compare the extension case-insensitively so "scan.PDF" is not silently
    # treated as unsupported input.
    if isinstance(input_file, str) and input_file.lower().endswith(".pdf"):
        # Rasterize every page, then OCR the pages in order.
        pages = convert_from_path(input_file)
        page_texts = [
            f"\n--- Page {page_number} ---\n"
            f"{pytesseract.image_to_string(page, lang=AUTO_LANGS)}"
            for page_number, page in enumerate(pages, start=1)
        ]
        return "".join(page_texts).strip()

    if isinstance(input_file, Image.Image):
        return pytesseract.image_to_string(input_file, lang=AUTO_LANGS).strip()

    # Unsupported input: keep the existing contract of returning no text.
    return ""
def gradio_interface():
    """Build the OCR web UI and start the Gradio server.

    The interface takes an input-type selector plus an uploaded file and
    shows the extracted text in a read-only textbox. The function ends by
    calling ``launch()`` on the interface and returns nothing.
    """
    input_type = gr.Radio(["PDF", "Image"], label="Choose Input Type", value="PDF")
    file_input = gr.File(
        label="Upload PDF/Image",
        file_types=[".pdf", ".png", ".jpg", ".jpeg"],
    )
    output_text = gr.Textbox(label="Extracted Text", interactive=False)

    def process(input_type, file):
        """Run OCR on the uploaded file and return the extracted text.

        Args:
            input_type: The radio selection, ``"PDF"`` or ``"Image"``. Used
                only when the file extension does not identify the type.
            file: The uploaded file as delivered by ``gr.File``; falsy when
                nothing was uploaded.

        Returns:
            The extracted text, or a warning message when no file was given.
        """
        if not file:
            return "⚠️ Please upload a file first."

        # The upload is read through ``.name``; fall back to the value itself
        # in case a plain path string is delivered instead.
        # NOTE(review): which form arrives depends on the Gradio version - confirm.
        path = str(getattr(file, "name", file))
        lowered = path.lower()

        # Let the actual file extension decide, so a wrong radio selection
        # neither yields empty output nor makes Image.open fail on a PDF.
        if lowered.endswith(".pdf"):
            is_pdf = True
        elif lowered.endswith((".png", ".jpg", ".jpeg")):
            is_pdf = False
        else:
            is_pdf = input_type == "PDF"

        if is_pdf:
            return ocr_auto(path)

        # Close the image file once OCR is done instead of leaking the handle.
        with Image.open(path) as image:
            return ocr_auto(image)

    gr.Interface(
        fn=process,
        inputs=[input_type, file_input],
        outputs=[output_text],
        title="Auto OCR (PDF/Image)",
        description="Upload a PDF or Image. OCR will automatically detect and extract text in multiple languages.",
    ).launch()
# Run: build the interface and start the Gradio server when this file executes.
gradio_interface()