Spaces:
Runtime error
Runtime error
File size: 2,205 Bytes
9d34bba 1506ae7 010c4d7 9d34bba 43938ad 1506ae7 43938ad 9d34bba 1506ae7 02d986d 9d34bba 6b6b8dc 02d986d 9d34bba 6b6b8dc 9d34bba 6b6b8dc 02d986d 9d34bba 02d986d 1506ae7 9d34bba 02d986d 9d34bba 1506ae7 9d34bba 5415ed9 9d34bba 010c4d7 21e7816 02d986d 21e7816 9d34bba 21e7816 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 |
import gradio as gr
import tempfile
import tesserocr
import os
import fitz # PyMuPDF, imported as fitz for backward compatibility reasons
from PIL import Image
import logging
from multiprocessing.pool import Pool
# Show INFO-level (and more severe) log records from the root logger.
logging.basicConfig(level=logging.INFO)

# Tesseract engines keyed by the language code offered in the UI dropdown.
# They are constructed once, at import time, and looked up per request by
# `tesseract_ocr`. The Polish engine reads its trained data from the local
# "./tessdata" directory; the English engine is created with tesserocr's
# default language and data path.
APIs = {
    "pol": tesserocr.PyTessBaseAPI(lang="pol", path="./tessdata"),
    "eng": tesserocr.PyTessBaseAPI(),
}
def pdf_to_image(pdf_file, path, progress, max_pages):
    """Render the leading pages of a PDF to PNG files using PyMuPDF.

    Args:
        pdf_file: Uploaded-file object; only its ``name`` attribute (the
            path of the PDF on disk) is read.
        path: Existing directory that receives the ``page-<n>.png`` files.
        progress: Progress tracker exposing a ``tqdm``-style wrapper that
            accepts ``desc`` and ``total``.
        max_pages: Maximum number of pages to render. ``0``, ``None`` or a
            negative value renders the whole document. Floats (as produced
            by a numeric input widget) are truncated to an integer.

    Returns:
        The PNG file paths, in page order (pages are numbered from 1).
    """
    limit = int(max_pages or 0)  # None is treated like 0: no limit
    fnames = []
    # The context manager closes the document even if rendering fails.
    with fitz.open(pdf_file.name) as doc:
        page_count = len(doc)
        # Never report or request more pages than the document contains.
        total = page_count if limit <= 0 else min(limit, page_count)
        for idx in progress.tqdm(
            range(total), desc="Converting PDF to image", total=total
        ):
            pix = doc[idx].get_pixmap()
            output = f"{path}/page-{idx + 1}.png"
            pix.save(output)
            fnames.append(output)
    return fnames
def tesseract_ocr(image, language, max_pages, progress=gr.Progress()):
    """OCR an uploaded PDF and return the path of a text file with the result.

    The PDF is rendered page by page into a temporary directory, each page
    image is passed through the Tesseract engine registered for `language`,
    and the per-page texts are joined with newlines into a ``.txt`` file.

    Args:
        image: Uploaded PDF file object (despite the name, not an image).
        language: Key into the module-level ``APIs`` mapping.
        max_pages: Page limit forwarded to ``pdf_to_image``.
        progress: Gradio progress tracker; the default instance is how
            Gradio is told to inject one.

    Returns:
        Path of the generated UTF-8 text file. The file is created with
        ``delete=False``, so it outlives this call.

    Raises:
        gr.Error: If no file was uploaded.
        KeyError: If `language` has no entry in ``APIs``.
    """
    if image is None:
        # Surface a readable message instead of an AttributeError later on.
        raise gr.Error("Please upload a PDF file.")

    # NOTE(review): this engine object is shared by every request for the
    # same language while the app queue allows concurrent workers — confirm
    # that concurrent SetImage/GetUTF8Text calls on one engine are safe.
    api = APIs[language]

    page_texts = []
    # Page images are only needed while OCR runs; the directory and its
    # contents are removed when this block exits.
    with tempfile.TemporaryDirectory() as path:
        images = pdf_to_image(image, path, progress, max_pages)
        for img_path in progress.tqdm(images, desc="Running OCR"):
            with Image.open(img_path) as img:
                img.load()  # force the pixel data to be read before OCR
                api.SetImage(img)
                page_texts.append(api.GetUTF8Text())

    # Explicit UTF-8: the OCR text may contain non-ASCII characters and the
    # platform default encoding is not guaranteed to represent them.
    with tempfile.NamedTemporaryFile(
        mode="w", encoding="utf-8", delete=False, suffix=".txt"
    ) as file:
        file.write("\n".join(page_texts))
    return file.name
# Script entry point: build the Gradio UI around `tesseract_ocr` and serve it.
if __name__ == "__main__":
    logging.info("Starting Tesseract OCR")
    iface = gr.Interface(
        fn=tesseract_ocr,
        inputs=[
            gr.File(label="PDF file"),
            gr.Dropdown(["eng", "pol"], label="Language", value="eng"),
            # precision=0 makes the widget deliver an integer page count
            # instead of a float; 0 is passed through as the page limit.
            gr.Number(label="Number of pages", value=0, precision=0),
        ],
        outputs=gr.File(label="Text file", type="file"),
        title="PDF to Text Converter",
        description="Converts a PDF file to text using Tesseract OCR.",
    ).queue(concurrency_count=10)  # up to 10 queued requests run at once
    # Listen on all interfaces so the app is reachable from outside the host.
    iface.launch(server_port=7860, server_name="0.0.0.0")