Persian-OCR

Sleeping

Persian-OCR / app.py

Update app.py

1f78813 verified 25 days ago

1.7 kB

	import gradio as gr
	import pytesseract
	from pdf2image import convert_from_path
	from PIL import Image

	# Languages that should be supported simultaneously, written as a single
	# "+"-joined string of language codes. This value is passed as the `lang`
	# argument of pytesseract.image_to_string.
	# (The matching language packs must be installed for Tesseract.)
	AUTO_LANGS = "eng+fas+ara+rus+spa+fra"

	def ocr_auto(input_file):
	    """Run multi-language OCR on a PDF or an image and return the text.

	    Args:
	        input_file: One of
	            * a ``str`` path ending in ``.pdf`` (any letter case): every page
	              is rendered with ``convert_from_path`` and OCR'd in order;
	            * any other ``str`` path: opened with ``Image.open`` and OCR'd as
	              a single image;
	            * an already opened ``PIL.Image.Image``.

	    Returns:
	        The recognized text with leading/trailing whitespace stripped. For
	        PDFs each page's text is preceded by a ``--- Page N ---`` line
	        (1-based). An input of any other type yields an empty string.

	    Raises:
	        Whatever ``convert_from_path``, ``Image.open`` or
	        ``pytesseract.image_to_string`` raise for unreadable input; nothing is
	        caught here.
	    """
	    if isinstance(input_file, Image.Image):
	        return pytesseract.image_to_string(input_file, lang=AUTO_LANGS).strip()

	    if not isinstance(input_file, str):
	        # Unsupported input type: keep the historical "no text" result
	        # instead of raising, so existing callers are not broken.
	        return ""

	    # Compare the suffix case-insensitively so "scan.PDF" is not skipped.
	    if input_file.lower().endswith(".pdf"):
	        pages = convert_from_path(input_file)
	        sections = [
	            f"\n--- Page {page_number} ---\n"
	            f"{pytesseract.image_to_string(page, lang=AUTO_LANGS)}"
	            for page_number, page in enumerate(pages, start=1)
	        ]
	        return "".join(sections).strip()

	    # Any other path is treated as an image file. The context manager makes
	    # sure the underlying file handle is released once OCR has finished.
	    with Image.open(input_file) as image:
	        return pytesseract.image_to_string(image, lang=AUTO_LANGS).strip()

	def gradio_interface():
	    """Build the OCR web UI and launch it.

	    The interface has a PDF/Image radio selector, a file upload restricted to
	    ``.pdf``, ``.png``, ``.jpg`` and ``.jpeg``, and a read-only textbox that
	    receives the extracted text. The function calls ``launch()`` on the
	    interface and returns ``None``.
	    """
	    input_type = gr.Radio(["PDF", "Image"], label="Choose Input Type", value="PDF")
	    file_input = gr.File(label="Upload PDF/Image", file_types=[".pdf", ".png", ".jpg", ".jpeg"])
	    output_text = gr.Textbox(label="Extracted Text", interactive=False)

	    image_suffixes = (".png", ".jpg", ".jpeg")

	    def process(input_type, file):
	        """OCR the uploaded file and return its text (or a prompt to upload)."""
	        if not file:
	            return "⚠️ Please upload a file first."

	        # The upload may arrive as an object exposing ``.name`` or as a plain
	        # path; accept both. NOTE(review): exact type depends on the installed
	        # Gradio version - confirm.
	        path = getattr(file, "name", file)
	        lowered = str(path).lower()

	        # Trust the file extension over the radio button: the selector
	        # defaults to "PDF", so relying on it alone sends uploaded images down
	        # the PDF path (empty result) and PDFs into Image.open (exception).
	        if lowered.endswith(".pdf"):
	            is_pdf = True
	        elif lowered.endswith(image_suffixes):
	            is_pdf = False
	        else:
	            # Unknown extension: fall back to what the user selected.
	            is_pdf = input_type == "PDF"

	        if is_pdf:
	            return ocr_auto(path)

	        # Close the image file as soon as OCR is done.
	        with Image.open(path) as image:
	            return ocr_auto(image)

	    gr.Interface(
	        fn=process,
	        inputs=[input_type, file_input],
	        outputs=[output_text],
	        title="Auto OCR (PDF/Image)",
	        description="Upload a PDF or Image. OCR will automatically detect and extract text in multiple languages."
	    ).launch()

	# Run: build the interface and launch it. This call sits at module level,
	# so it executes whenever the file is run or imported.
	gradio_interface()