tesseract-multi-ocr

Sleeping

File size: 3,364 Bytes

2509048
9ad522d
 
 
 
2509048
bbd8f5d
 
 
2509048
 
9ad522d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2509048
9ad522d
 
bbd8f5d
9ad522d
bbd8f5d
 
 
 
 
4c0f403
bbd8f5d
 
 
 
 
 
 
 
 
 
 
1c10827
bbd8f5d
1c10827
bbd8f5d
 
 
 
 
 
7e333d1
bbd8f5d
 
 
 
 
 
 
 
 
 
9ad522d
bbd8f5d

from typing import List
import os
import sys
import subprocess
from pathlib import Path

import pytesseract
from PIL import Image

import gradio as gr

def check_tesseract_installed():
    """Report whether the ``tesseract`` executable can be run.

    Runs ``tesseract --version`` with its output captured and treats a zero
    exit status as "installed".

    Returns:
        bool: True if the command launched and exited successfully; False if
        it exited with a non-zero status or could not be launched at all.
    """
    try:
        subprocess.run(['tesseract', '--version'], capture_output=True, check=True)
    except (subprocess.CalledProcessError, OSError):
        # OSError covers FileNotFoundError (no such binary on PATH) as well as
        # PermissionError (a file named tesseract exists but is not
        # executable); in both cases the tool is unusable.
        return False
    return True

def setup_tesseract():
    """Verify Tesseract is available and make sure TESSDATA_PREFIX is set.

    If ``check_tesseract_installed()`` reports failure, install instructions
    are printed and the process exits with status 1. Otherwise, when
    ``TESSDATA_PREFIX`` is absent from the environment, it is set to the first
    candidate tessdata directory that exists; a warning is printed when none
    of them exists. A ``TESSDATA_PREFIX`` that is already set is left as is.
    """
    # Check if Tesseract is installed
    if not check_tesseract_installed():
        print("Tesseract is not installed. Please install it using:")
        print("sudo apt-get update && sudo apt-get install -y tesseract-ocr")
        print("For additional languages: sudo apt-get install -y tesseract-ocr-all")
        sys.exit(1)

    # Respect a TESSDATA_PREFIX the user has already configured
    if 'TESSDATA_PREFIX' in os.environ:
        return

    # Common locations for tessdata, in order of preference
    tessdata_paths = [
        '/usr/share/tesseract-ocr/4.00/tessdata',  # Tesseract 4.x packages
        '/usr/share/tesseract-ocr/tessdata',       # Older versions
        '/usr/share/tessdata',                     # Alternative location
    ]
    # The version directory is not always "4.00" (Tesseract 5 packages use
    # ".../5/tessdata"), so also probe whatever versioned directories exist.
    # They are appended after the fixed candidates so that any directory the
    # fixed list would have selected is still selected.
    versioned = sorted(
        (str(found) for found in Path('/usr/share/tesseract-ocr').glob('*/tessdata')),
        reverse=True,
    )
    tessdata_paths.extend(found for found in versioned if found not in tessdata_paths)

    for path in tessdata_paths:
        if Path(path).exists():
            os.environ['TESSDATA_PREFIX'] = path
            print(f"Set TESSDATA_PREFIX to {path}")
            break
    else:
        print("Warning: Could not find tessdata directory")
        print("Please install language data files or set TESSDATA_PREFIX manually")

def tesseract_ocr(filepath: str, languages: List[str]=None):
    """Run Tesseract OCR on an image file and return the recognized text.

    Args:
        filepath: Path of the image to read. A falsy value (``None`` or
            ``""``) produces an empty result instead of an error.
        languages: Tesseract language codes; they are joined with ``+`` and
            passed as the ``lang`` argument. ``None`` or an empty list falls
            back to ``['eng']``.

    Returns:
        The string returned by ``pytesseract.image_to_string``, or ``""`` when
        no file path was supplied.
    """
    if not filepath:
        # Nothing to recognize (e.g. the function was triggered without an
        # image being provided), so skip opening a file altogether.
        return ""
    if not languages:
        # An empty selection would otherwise be joined into lang='' rather
        # than falling back to English.
        languages = ['eng']  # Default to English if no language specified
    # Close the image file once OCR has finished instead of leaking the handle.
    with Image.open(filepath) as image:
        return pytesseract.image_to_string(image=image, lang='+'.join(languages))

# Static text rendered on the demo page.
title = "Tesseract OCR"
description = (
    "Gradio demo for Tesseract. "
    "Tesseract is an open source text recognition (OCR) Engine."
)
article = (
    "<p style='text-align: center'>"
    "<a href='https://tesseract-ocr.github.io/' target='_blank'>Tesseract documentation</a>"
    " | "
    "<a href='https://github.com/tesseract-ocr/tesseract' target='_blank'>Github Repo</a>"
    "</p>"
)

# Sample rows for the examples widget: [image path, list of language codes].
examples = [
    ["examples/weird_unicode_math_symbols.png", []],
    ["examples/eurotext.png", ["eng"]],
    ["examples/tesseract_sample.png", ["jpn", "eng"]],
    ["examples/chi.jpg", ["HanS", "HanT"]],
]

# Build the Gradio page: an input column (image, language picker, buttons)
# next to an output column (recognized text), followed by examples and links.
# Components are created in the order they should appear, so statement order
# inside this block matters.
with gr.Blocks(title=title) as demo:
    # Page heading and short description.
    gr.Markdown(f'<h1 style="text-align: center; margin-bottom: 1rem;">{title}</h1>')
    gr.Markdown(description)
    with gr.Row():
        with gr.Column():
            # The image is handed to the callback as a file path.
            image = gr.Image(type="filepath", label="Input")
            # NOTE(review): this query runs while the module is being loaded,
            # i.e. before the setup_tesseract() call in the __main__ guard at
            # the bottom of the file; confirm that ordering is intended.
            language_choices = pytesseract.get_languages()
            with gr.Accordion("Languages", open=False):
                # English is pre-selected.
                languages = gr.CheckboxGroup(language_choices, type="value", value=["eng"], label='language')
            with gr.Row():
                btn_clear = gr.ClearButton([image, languages])
                btn_submit = gr.Button(value="Submit", variant="primary")
        with gr.Column():
            text = gr.Textbox(label="Output")

    # Submit runs OCR on the current image/language selection and writes the
    # result into the output box.
    btn_submit.click(tesseract_ocr, inputs=[image, languages], outputs=text, api_name="tesseract-ocr")
    # The output box is created after the clear button, so it is attached here.
    btn_clear.add(text)

    # Sample inputs that fill the image and language fields.
    gr.Examples(
        examples=examples,
        inputs=[image, languages],
    )

    # Footer links.
    gr.Markdown(article)

# Script entry point: run the Tesseract environment checks, then start the app.
# NOTE(review): the UI definition above already calls
# pytesseract.get_languages() at module load, before this setup step runs;
# confirm that ordering is intended.
if __name__ == '__main__':
    setup_tesseract()
    demo.launch()