File size: 2,205 Bytes
9d34bba
 
1506ae7
010c4d7
9d34bba
 
43938ad
1506ae7
43938ad
 
9d34bba
1506ae7
 
 
 
 
02d986d
9d34bba
 
 
6b6b8dc
02d986d
 
9d34bba
6b6b8dc
9d34bba
 
6b6b8dc
02d986d
 
9d34bba
 
02d986d
1506ae7
 
9d34bba
 
02d986d
9d34bba
 
 
 
 
1506ae7
 
9d34bba
 
5415ed9
 
 
9d34bba
010c4d7
21e7816
 
 
 
02d986d
 
 
 
 
21e7816
 
 
 
9d34bba
21e7816
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import gradio as gr
import tempfile
import tesserocr
import os
import fitz  # PyMuPDF, imported as fitz for backward compatibility reasons
from PIL import Image
import logging
from multiprocessing.pool import Pool

logging.basicConfig(level=logging.INFO)

# One Tesseract handle per selectable language, created once at import time.
# Keys are the language codes that tesseract_ocr() looks up.
#   "pol" - loads its traineddata from the local ./tessdata directory
#           (relative to the current working directory).
#   "eng" - constructed with tesserocr's default arguments.
APIs = dict(
    pol=tesserocr.PyTessBaseAPI(path="./tessdata", lang="pol"),
    eng=tesserocr.PyTessBaseAPI(),
)

def pdf_to_image(pdf_file, path, progress, max_pages):
    """Render the leading pages of a PDF to PNG files using PyMuPDF.

    Args:
        pdf_file: Object whose ``name`` attribute is the filesystem path of
            the PDF to open.
        path: Existing directory that receives the ``page-<n>.png`` files
            (``n`` starts at 1).
        progress: Object exposing ``tqdm(iterable, desc=..., total=...)``,
            used to report rendering progress.
        max_pages: Maximum number of pages to render. ``0``, ``None`` or a
            non-positive value renders every page. Fractional values are
            truncated.

    Returns:
        List of the PNG file paths that were written, in page order.
    """
    # The value may arrive as a float (or None) rather than an int, so
    # normalise it before using it as a page count.
    page_limit = int(max_pages) if max_pages else 0

    fnames = []
    # The context manager closes the document even if rendering a page fails.
    with fitz.open(pdf_file.name) as doc:
        page_count = len(doc)
        # Clamp to the real page count so the progress total is never larger
        # than the number of pages that will actually be rendered.
        total = min(page_count, page_limit) if page_limit > 0 else page_count
        for pno in progress.tqdm(
            range(total), desc="Converting PDF to image", total=total
        ):
            output = f"{path}/page-{pno + 1}.png"
            doc[pno].get_pixmap().save(output)
            fnames.append(output)
    return fnames

def tesseract_ocr(image, language, max_pages, progress=gr.Progress()):
    """Run Tesseract OCR over a PDF and return the path of a text file.

    The PDF is rendered page by page into a temporary directory, each page
    image is passed to the Tesseract handle selected by ``language``, and the
    recognised text of all pages is joined with newlines into a ``.txt`` file.

    Args:
        image: Uploaded PDF file object (its ``name`` attribute is the path).
            Despite the parameter name, this is the PDF, not an image.
        language: Key into the module-level ``APIs`` mapping.
        max_pages: Page limit forwarded to ``pdf_to_image``.
        progress: Gradio progress tracker. The ``gr.Progress()`` default is
            kept as-is because the signature is part of the interface.

    Returns:
        Path of the generated UTF-8 text file. The file is created with
        ``delete=False``, so it outlives this call.

    Raises:
        gr.Error: If no file was provided.
        KeyError: If ``language`` is not a key of ``APIs``.
    """
    if image is None:
        # Fail with a user-facing message instead of an AttributeError later.
        raise gr.Error("No PDF file was provided.")

    # NOTE(review): the handle is a shared module-level object whose state is
    # changed by SetImage(); confirm that concurrent requests for the same
    # language cannot interleave here.
    api = APIs[language]

    with tempfile.TemporaryDirectory() as path:
        page_paths = pdf_to_image(image, path, progress, max_pages)
        text_res = []
        for page_path in progress.tqdm(page_paths, desc="Running OCR"):
            # Keep the image open only while Tesseract needs it.
            with Image.open(page_path) as page_image:
                api.SetImage(page_image)
                text_res.append(api.GetUTF8Text())

    # Explicit UTF-8: the OCR output can contain non-ASCII characters, which
    # the platform default encoding may be unable to represent.
    with tempfile.NamedTemporaryFile(
        mode="w", delete=False, suffix=".txt", encoding="utf-8"
    ) as file:
        file.write("\n".join(text_res))
    return file.name


if __name__ == "__main__":
    logging.info("Starting Tesseract OCR")

    # Input widgets, listed in the positional order of tesseract_ocr's
    # parameters: the PDF, the language code, the page limit.
    input_widgets = [
        gr.File(label="PDF file"),
        gr.Dropdown(["eng", "pol"], label="Language", value="eng"),
        gr.Number(label="Number of pages", value=0),
    ]
    # The function returns a file path, offered to the user as a download.
    result_widget = gr.File(label="Text file", type="file")

    iface = gr.Interface(
        fn=tesseract_ocr,
        inputs=input_widgets,
        outputs=result_widget,
        title="PDF to Text Converter",
        description="Converts a PDF file to text using Tesseract OCR.",
    )
    # Enable the request queue before launching.
    iface = iface.queue(concurrency_count=10)

    # Listen on all interfaces, port 7860.
    iface.launch(server_name="0.0.0.0", server_port=7860)