Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -2,10 +2,10 @@ import gradio as gr
|
|
2 |
import pytesseract
|
3 |
from pdf2image import convert_from_path
|
4 |
from PIL import Image
|
5 |
-
from langdetect import detect_langs
|
6 |
|
7 |
-
#
|
8 |
-
|
|
|
9 |
|
10 |
def ocr_auto(input_file):
|
11 |
extracted_text = ""
|
@@ -13,20 +13,12 @@ def ocr_auto(input_file):
|
|
13 |
if isinstance(input_file, str) and input_file.endswith('.pdf'):
|
14 |
images = convert_from_path(input_file)
|
15 |
for page_number, image in enumerate(images, start=1):
|
16 |
-
text = pytesseract.image_to_string(image, lang=
|
17 |
extracted_text += f"\n--- Page {page_number} ---\n{text}"
|
18 |
elif isinstance(input_file, Image.Image):
|
19 |
-
extracted_text = pytesseract.image_to_string(input_file, lang=
|
20 |
|
21 |
-
|
22 |
-
try:
|
23 |
-
langs = detect_langs(extracted_text)
|
24 |
-
langs_detected = ", ".join([str(l) for l in langs])
|
25 |
-
extracted_text = f"[Detected Languages: {langs_detected}]\n\n{extracted_text}"
|
26 |
-
except:
|
27 |
-
pass
|
28 |
-
|
29 |
-
return extracted_text
|
30 |
|
31 |
def gradio_interface():
|
32 |
input_type = gr.Radio(["PDF", "Image"], label="Choose Input Type", value="PDF")
|
@@ -47,7 +39,7 @@ def gradio_interface():
|
|
47 |
inputs=[input_type, file_input],
|
48 |
outputs=[output_text],
|
49 |
title="Auto OCR (PDF/Image)",
|
50 |
-
description="Upload a PDF or Image
|
51 |
).launch()
|
52 |
|
53 |
# Run
|
|
|
2 |
import pytesseract
|
3 |
from pdf2image import convert_from_path
|
4 |
from PIL import Image
|
|
|
5 |
|
6 |
+
# List of languages that OCR should support simultaneously
|
7 |
+
# (the corresponding language packs must be installed for Tesseract)
|
8 |
+
AUTO_LANGS = "eng+fas+ara+rus+spa+fra"
|
9 |
|
10 |
def ocr_auto(input_file):
|
11 |
extracted_text = ""
|
|
|
13 |
if isinstance(input_file, str) and input_file.endswith('.pdf'):
|
14 |
images = convert_from_path(input_file)
|
15 |
for page_number, image in enumerate(images, start=1):
|
16 |
+
text = pytesseract.image_to_string(image, lang=AUTO_LANGS)
|
17 |
extracted_text += f"\n--- Page {page_number} ---\n{text}"
|
18 |
elif isinstance(input_file, Image.Image):
|
19 |
+
extracted_text = pytesseract.image_to_string(input_file, lang=AUTO_LANGS)
|
20 |
|
21 |
+
return extracted_text.strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
|
23 |
def gradio_interface():
|
24 |
input_type = gr.Radio(["PDF", "Image"], label="Choose Input Type", value="PDF")
|
|
|
39 |
inputs=[input_type, file_input],
|
40 |
outputs=[output_text],
|
41 |
title="Auto OCR (PDF/Image)",
|
42 |
+
description="Upload a PDF or Image. OCR will automatically detect and extract text in multiple languages."
|
43 |
).launch()
|
44 |
|
45 |
# Run
|