Persian-OCR

Running

App Files Community

suprimedev committed 21 days ago

Commit

1f78813

verified ·

1 Parent(s): 4379353

Update app.py

Browse files

Files changed (1) hide show

app.py +7 -15

app.py CHANGED Viewed

@@ -2,10 +2,10 @@ import gradio as gr
 import pytesseract
 from pdf2image import convert_from_path
 from PIL import Image
-from langdetect import detect_langs
-# چند زبان پرکاربرد رو فعال می‌کنیم (می‌تونی تغییر بدی)
-DEFAULT_LANGS = "eng+fas+ara+rus+fra+spa"
 def ocr_auto(input_file):
     extracted_text = ""
@@ -13,20 +13,12 @@ def ocr_auto(input_file):
     if isinstance(input_file, str) and input_file.endswith('.pdf'):
         images = convert_from_path(input_file)
         for page_number, image in enumerate(images, start=1):
-            text = pytesseract.image_to_string(image, lang=DEFAULT_LANGS)
             extracted_text += f"\n--- Page {page_number} ---\n{text}"
     elif isinstance(input_file, Image.Image):
-        extracted_text = pytesseract.image_to_string(input_file, lang=DEFAULT_LANGS)
-    # تلاش برای تشخیص زبان‌های موجود در متن
-    try:
-        langs = detect_langs(extracted_text)
-        langs_detected = ", ".join([str(l) for l in langs])
-        extracted_text = f"[Detected Languages: {langs_detected}]\n\n{extracted_text}"
-    except:
-        pass
-    return extracted_text
 def gradio_interface():
     input_type = gr.Radio(["PDF", "Image"], label="Choose Input Type", value="PDF")
@@ -47,7 +39,7 @@ def gradio_interface():
         inputs=[input_type, file_input],
         outputs=[output_text],
         title="Auto OCR (PDF/Image)",
-        description="Upload a PDF or Image, OCR will auto-detect and extract text in multiple languages."
     ).launch()
 # Run

 import pytesseract
 from pdf2image import convert_from_path
 from PIL import Image
+# لیست زبان‌هایی که می‌خوای همزمان پشتیبانی بشن
+# (حتماً باید پکیج زبان‌ها روی تسرکت نصب باشن)
+AUTO_LANGS = "eng+fas+ara+rus+spa+fra"
 def ocr_auto(input_file):
     extracted_text = ""
     if isinstance(input_file, str) and input_file.endswith('.pdf'):
         images = convert_from_path(input_file)
         for page_number, image in enumerate(images, start=1):
+            text = pytesseract.image_to_string(image, lang=AUTO_LANGS)
             extracted_text += f"\n--- Page {page_number} ---\n{text}"
     elif isinstance(input_file, Image.Image):
+        extracted_text = pytesseract.image_to_string(input_file, lang=AUTO_LANGS)
+    return extracted_text.strip()
 def gradio_interface():
     input_type = gr.Radio(["PDF", "Image"], label="Choose Input Type", value="PDF")
         inputs=[input_type, file_input],
         outputs=[output_text],
         title="Auto OCR (PDF/Image)",
+        description="Upload a PDF or Image. OCR will automatically detect and extract text in multiple languages."
     ).launch()
 # Run