tesseract-multi-ocr

Sleeping

App Files Files Community

tesseract-multi-ocr / app_blocks.py

pvanand

Update app_blocks.py

9ad522d verified 4 months ago

raw

history blame contribute delete

3.36 kB

	from typing import List
	import os
	import sys
	import subprocess
	from pathlib import Path

	import pytesseract
	from PIL import Image

	import gradio as gr

	def check_tesseract_installed():
	    """Return True if the ``tesseract`` executable on PATH can be run.

	    Runs ``tesseract --version`` with its output captured.

	    Returns:
	        bool: True when the command exits with status 0; False when it
	        exits non-zero or cannot be started at all.
	    """
	    try:
	        subprocess.run(['tesseract', '--version'], capture_output=True, check=True)
	    except (subprocess.CalledProcessError, OSError):
	        # OSError covers FileNotFoundError (no such binary) as well as
	        # PermissionError (a file named "tesseract" that is not executable),
	        # so a broken install reports False instead of crashing the probe.
	        return False
	    return True

	def setup_tesseract():
	    """Check that Tesseract is runnable and make sure TESSDATA_PREFIX is set.

	    If ``check_tesseract_installed()`` reports False, installation hints are
	    printed and the process exits with status 1.

	    If ``TESSDATA_PREFIX`` is already present in the environment it is left
	    untouched. Otherwise the first existing directory from a list of common
	    tessdata locations is exported as ``TESSDATA_PREFIX``; when none of them
	    exists a warning is printed and the variable stays unset.
	    """
	    if not check_tesseract_installed():
	        print("Tesseract is not installed. Please install it using:")
	        print("sudo apt-get update && sudo apt-get install -y tesseract-ocr")
	        print("For additional languages: sudo apt-get install -y tesseract-ocr-all")
	        sys.exit(1)

	    # Respect an explicit setting made by the user or the deployment.
	    if 'TESSDATA_PREFIX' in os.environ:
	        return

	    # Common tessdata locations, most recent layout first.
	    tessdata_paths = [
	        '/usr/share/tesseract-ocr/5/tessdata',  # Debian/Ubuntu packages, Tesseract 5.x
	        '/usr/share/tesseract-ocr/4.00/tessdata',  # Debian/Ubuntu packages, Tesseract 4.x
	        '/usr/share/tesseract-ocr/tessdata',  # Older versions
	        '/usr/share/tessdata',  # Alternative location
	        '/usr/local/share/tessdata',  # Default prefix of a source build
	    ]

	    for path in tessdata_paths:
	        # is_dir() rather than exists(): a stray regular file is not usable.
	        if Path(path).is_dir():
	            os.environ['TESSDATA_PREFIX'] = path
	            print(f"Set TESSDATA_PREFIX to {path}")
	            break
	    else:
	        # Loop finished without a break: no candidate directory was found.
	        print("Warning: Could not find tessdata directory")
	        print("Please install language data files or set TESSDATA_PREFIX manually")

	def tesseract_ocr(filepath: str, languages: List[str]=None):
	    """Run Tesseract OCR on an image file and return the recognised text.

	    Args:
	        filepath: Path of the image file to open.
	        languages: Tesseract language names to use together; they are
	            joined with ``+`` and passed as ``lang``. ``None`` or an empty
	            list falls back to ``['eng']``.

	    Returns:
	        The value returned by ``pytesseract.image_to_string``.
	    """
	    # An empty selection (e.g. every checkbox cleared) must also fall back
	    # to English; otherwise an empty ``lang`` string would be passed on.
	    if not languages:
	        languages = ['eng']
	    # Context manager closes the underlying file once OCR has finished.
	    with Image.open(filepath) as image:
	        return pytesseract.image_to_string(image=image, lang='+'.join(languages))

	# Static text shown on the demo page.
	title = "Tesseract OCR"
	description = "Gradio demo for Tesseract. Tesseract is an open source text recognition (OCR) Engine."
	# Footer links. The separator is a plain "|": the previous "\|" was an invalid
	# escape sequence and put a literal backslash into the rendered HTML.
	article = "<p style='text-align: center'><a href='https://tesseract-ocr.github.io/' target='_blank'>Tesseract documentation</a> | <a href='https://github.com/tesseract-ocr/tesseract' target='_blank'>Github Repo</a></p>"
	# Example rows: [image path, list of language names].
	examples = [
	    ["examples/weird_unicode_math_symbols.png", []],
	    ["examples/eurotext.png", ["eng"]],
	    ["examples/tesseract_sample.png", ["jpn", "eng"]],
	    ["examples/chi.jpg", ["HanS", "HanT"]],
	]

	# When run as a script, verify the Tesseract install and set TESSDATA_PREFIX
	# *before* the UI is built: the language list below is queried from
	# pytesseract while this module is still executing, so doing the setup only
	# right before launch would be too late to help.
	if __name__ == '__main__':
	    setup_tesseract()

	# NOTE(review): the nesting below was reconstructed from a source whose
	# indentation had been flattened -- confirm the layout against the running app.
	with gr.Blocks(title=title) as demo:
	    gr.Markdown(f'<h1 style="text-align: center; margin-bottom: 1rem;">{title}</h1>')
	    gr.Markdown(description)
	    with gr.Row():
	        # Input column: image, language selection, action buttons.
	        with gr.Column():
	            image = gr.Image(type="filepath", label="Input")
	            language_choices = pytesseract.get_languages()
	            with gr.Accordion("Languages", open=False):
	                languages = gr.CheckboxGroup(language_choices, type="value", value=["eng"], label='language')
	            with gr.Row():
	                btn_clear = gr.ClearButton([image, languages])
	                btn_submit = gr.Button(value="Submit", variant="primary")
	        # Output column: recognised text.
	        with gr.Column():
	            text = gr.Textbox(label="Output")

	    btn_submit.click(tesseract_ocr, inputs=[image, languages], outputs=text, api_name="tesseract-ocr")
	    # The output box is created after the clear button, so register it here.
	    btn_clear.add(text)

	    gr.Examples(
	        examples=examples,
	        inputs=[image, languages],
	    )

	    gr.Markdown(article)

	if __name__ == '__main__':
	    demo.launch()