suprimedev committed on
Commit
b17d86f
·
verified ·
1 Parent(s): 1f78813

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -23
app.py CHANGED
@@ -2,45 +2,69 @@ import gradio as gr
2
  import pytesseract
3
  from pdf2image import convert_from_path
4
  from PIL import Image
 
5
 
6
- # لیست زبان‌هایی که می‌خوای همزمان پشتیبانی بشن
7
- # (حتماً باید پکیج زبان‌ها روی تسرکت نصب باشن)
8
- AUTO_LANGS = "eng+fas+ara+rus+spa+fra"
9
-
10
- def ocr_auto(input_file):
11
  extracted_text = ""
12
-
13
- if isinstance(input_file, str) and input_file.endswith('.pdf'):
 
 
14
  images = convert_from_path(input_file)
15
- for page_number, image in enumerate(images, start=1):
16
- text = pytesseract.image_to_string(image, lang=AUTO_LANGS)
17
- extracted_text += f"\n--- Page {page_number} ---\n{text}"
18
- elif isinstance(input_file, Image.Image):
19
- extracted_text = pytesseract.image_to_string(input_file, lang=AUTO_LANGS)
20
 
21
- return extracted_text.strip()
 
 
 
 
22
 
23
  def gradio_interface():
 
24
  input_type = gr.Radio(["PDF", "Image"], label="Choose Input Type", value="PDF")
25
- file_input = gr.File(label="Upload PDF/Image", file_types=[".pdf", ".png", ".jpg", ".jpeg"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  output_text = gr.Textbox(label="Extracted Text", interactive=False)
27
 
28
- def process(input_type, file):
29
- if not file:
30
- return "⚠️ Please upload a file first."
31
  if input_type == "PDF":
32
- return ocr_auto(file.name)
33
  else:
34
  image = Image.open(file.name)
35
- return ocr_auto(image)
 
36
 
 
37
  gr.Interface(
38
  fn=process,
39
- inputs=[input_type, file_input],
40
  outputs=[output_text],
41
- title="Auto OCR (PDF/Image)",
42
- description="Upload a PDF or Image. OCR will automatically detect and extract text in multiple languages."
43
  ).launch()
44
 
45
- # Run
 
46
  gradio_interface()
 
2
  import pytesseract
3
  from pdf2image import convert_from_path
4
  from PIL import Image
5
+ import os
6
 
7
def ocr(input_file, lang='fas'):
    """Extract text from a PDF path or a PIL image with Tesseract.

    Args:
        input_file: Either a filesystem path (``str``) to a PDF document or
            an already-opened ``PIL.Image.Image``.
        lang: Tesseract language code forwarded to
            ``pytesseract.image_to_string``. Defaults to ``'fas'``
            (Persian / Farsi).

    Returns:
        The recognized text. For a PDF, the text of every page is
        concatenated in page order with no separator added. An empty string
        is returned when ``input_file`` is neither a ``.pdf`` path nor a
        PIL image.
    """
    if isinstance(input_file, str):
        # Compare the suffix case-insensitively so an upload named
        # "SCAN.PDF" is processed instead of silently yielding no text.
        if not input_file.lower().endswith('.pdf'):
            return ""

        # Convert the PDF into one image per page, then OCR each page.
        pages = convert_from_path(input_file)
        return "".join(
            pytesseract.image_to_string(page, lang=lang) for page in pages
        )

    if isinstance(input_file, Image.Image):
        return pytesseract.image_to_string(input_file, lang=lang)

    # Unsupported input type: keep the historical "no text" result.
    return ""
26
 
def gradio_interface():
    """Build the OCR web UI and launch it.

    The form has three inputs (input type, uploaded file, OCR language) and
    one read-only text output. This function calls ``.launch()`` on the
    interface it creates and returns nothing.
    """
    # Input and output components.
    input_type = gr.Radio(["PDF", "Image"], label="Choose Input Type", value="PDF")
    file_input = gr.File(label="Upload PDF/Image")
    language_input = gr.Dropdown(
        label="Select OCR Language",
        choices=[
            ("English", "eng"),
            ("Mandarin Chinese", "chi_sim"),
            ("Hindi", "hin"),
            ("Spanish", "spa"),
            ("French", "fra"),
            ("Standard Arabic", "ara"),
            ("Bengali", "ben"),
            ("Portuguese", "por"),
            ("Russian", "rus"),
            ("Urdu", "urd"),
            ("Persian (Farsi)", "fas"),
        ],
        value="fas",  # Default to Persian
    )
    output_text = gr.Textbox(label="Extracted Text", interactive=False)

    def process(input_type, file, lang):
        """Run OCR on the uploaded file and return the extracted text.

        Args:
            input_type: ``"PDF"`` to treat the upload as a PDF; any other
                value treats it as an image.
            file: The value delivered by the ``gr.File`` component, or a
                falsy value when nothing was uploaded.
            lang: Tesseract language code chosen in the dropdown.

        Returns:
            The extracted text, or a short warning when no file was given.
        """
        # Submitting the form without an upload must not crash the handler.
        if not file:
            return "⚠️ Please upload a file first."

        # Accept both a file-like object exposing ``.name`` and a plain
        # path string.
        # NOTE(review): which of the two gr.File delivers depends on the
        # installed Gradio version — confirm against the deployment.
        path = getattr(file, "name", file)

        if input_type == "PDF":
            return ocr(path, lang)

        # Close the image file as soon as OCR is done instead of leaking
        # one open handle per request.
        with Image.open(path) as image:
            return ocr(image, lang)

    # Create and launch the Gradio interface.
    gr.Interface(
        fn=process,
        inputs=[input_type, file_input, language_input],
        outputs=[output_text],
        title="OCR (PDF/Image)",
        description="Upload a PDF or Image, select the OCR language, and extract the text.",
    ).launch()
67
 
68
+
69
# Build the OCR interface and launch it as soon as this script is executed.
gradio_interface()