Luke
committed on
Commit
·
68e1313
1
Parent(s):
03b6d75
no message
Browse files- .gitignore +2 -0
- Preprocess/preprocessImg.py +59 -0
- app.py +54 -12
- requirements.txt +2 -1
.gitignore
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.idea/*
|
| 2 |
+
*.pyc
|
Preprocess/preprocessImg.py
CHANGED
|
@@ -27,3 +27,62 @@ def preprocess_image002(image):
|
|
| 27 |
gray = cv2.bilateralFilter(gray, 11, 17, 17) # 雙邊濾波去噪
|
| 28 |
edged = cv2.Canny(gray, 30, 200) # 邊緣檢測
|
| 29 |
return Image.fromarray(edged)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
gray = cv2.bilateralFilter(gray, 11, 17, 17) # 雙邊濾波去噪
|
| 28 |
edged = cv2.Canny(gray, 30, 200) # 邊緣檢測
|
| 29 |
return Image.fromarray(edged)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
# Plan 3: adaptive thresholding + morphological opening
def preprocess_image003(image):
    """Binarize *image* with Gaussian adaptive thresholding, then clean it
    with a 3x3 morphological opening.

    Args:
        image: PIL.Image input (any mode).

    Returns:
        PIL.Image: single-channel binary (0/255) image.
    """
    # Force 3-channel RGB first so cvtColor always gets a predictable layout
    # (an RGBA or grayscale PIL image would otherwise make cvtColor raise).
    image_np = np.array(image.convert("RGB"))
    # BUGFIX: np.array(PIL.Image) is RGB, not BGR — use COLOR_RGB2GRAY so the
    # luminance weights hit the correct channels.
    gray = cv2.cvtColor(image_np, cv2.COLOR_RGB2GRAY)
    # Adaptive threshold: per-pixel Gaussian-weighted local mean over an
    # 11x11 window, with constant offset 2.
    adaptive_thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                            cv2.THRESH_BINARY, 11, 2)
    # Morphological opening (erode then dilate) removes small isolated specks.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    morph = cv2.morphologyEx(adaptive_thresh, cv2.MORPH_OPEN, kernel)
    return Image.fromarray(morph)
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
# Plan 4: CLAHE (Contrast-Limited Adaptive Histogram Equalization)
def preprocess_image004(image):
    """Boost local contrast with CLAHE, then binarize with Otsu's threshold.

    Args:
        image: PIL.Image input (any mode).

    Returns:
        PIL.Image: single-channel binary (0/255) image.
    """
    # Force 3-channel RGB first so cvtColor always gets a predictable layout
    # (an RGBA or grayscale PIL image would otherwise make cvtColor raise).
    image_np = np.array(image.convert("RGB"))
    # BUGFIX: np.array(PIL.Image) is RGB, not BGR — use COLOR_RGB2GRAY so the
    # luminance weights hit the correct channels.
    gray = cv2.cvtColor(image_np, cv2.COLOR_RGB2GRAY)
    # CLAHE equalizes contrast per 8x8 tile, clipping the histogram at 2.0
    # to avoid amplifying noise.
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    clahe_image = clahe.apply(gray)
    # Otsu picks the global threshold automatically (the 0 is ignored).
    _, binary = cv2.threshold(clahe_image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return Image.fromarray(binary)
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
# Plan 5: histogram equalization + Gaussian blur
def preprocess_image005(image):
    """Equalize the global histogram, smooth with a Gaussian blur, then
    binarize with Otsu's threshold.

    Args:
        image: PIL.Image input (any mode).

    Returns:
        PIL.Image: single-channel binary (0/255) image.
    """
    # Force 3-channel RGB first so cvtColor always gets a predictable layout
    # (an RGBA or grayscale PIL image would otherwise make cvtColor raise).
    image_np = np.array(image.convert("RGB"))
    # BUGFIX: np.array(PIL.Image) is RGB, not BGR — use COLOR_RGB2GRAY so the
    # luminance weights hit the correct channels.
    gray = cv2.cvtColor(image_np, cv2.COLOR_RGB2GRAY)
    # Global histogram equalization spreads intensities over the full range.
    equalized = cv2.equalizeHist(gray)
    # 5x5 Gaussian blur (sigma auto-derived) suppresses equalization noise.
    blurred = cv2.GaussianBlur(equalized, (5, 5), 0)
    # Otsu picks the global threshold automatically (the 0 is ignored).
    _, binary = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return Image.fromarray(binary)
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
# Plan 6: non-local-means denoising + sharpening
def preprocess_image006(image):
    """Denoise with fast non-local means, sharpen with a Laplacian-style
    kernel, then binarize with Otsu's threshold.

    Args:
        image: PIL.Image input (any mode).

    Returns:
        PIL.Image: single-channel binary (0/255) image.
    """
    # Force 3-channel RGB first so cvtColor always gets a predictable layout
    # (an RGBA or grayscale PIL image would otherwise make cvtColor raise).
    image_np = np.array(image.convert("RGB"))
    # BUGFIX: np.array(PIL.Image) is RGB, not BGR — use COLOR_RGB2GRAY so the
    # luminance weights hit the correct channels.
    gray = cv2.cvtColor(image_np, cv2.COLOR_RGB2GRAY)
    # Non-local means: filter strength h=30, 7x7 template, 21x21 search window.
    denoised = cv2.fastNlMeansDenoising(gray, None, 30, 7, 21)
    # Standard sharpening kernel: center 5, 4-neighbors -1 (identity + Laplacian).
    kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]])
    sharpened = cv2.filter2D(denoised, -1, kernel)
    # Otsu picks the global threshold automatically (the 0 is ignored).
    _, binary = cv2.threshold(sharpened, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return Image.fromarray(binary)
|
app.py
CHANGED
|
@@ -2,7 +2,10 @@ import os
|
|
| 2 |
import gradio as gr
|
| 3 |
from Plan.AiLLM import llm_recognition
|
| 4 |
from Plan.pytesseractOCR import ocr_recognition
|
| 5 |
-
from Preprocess.preprocessImg import
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
# 取得所有語言清單
|
| 8 |
languages = os.popen('tesseract --list-langs').read().split('\n')[1:-1]
|
|
@@ -15,8 +18,18 @@ def preprocess_and_ocr(image, valid_type, language):
|
|
| 15 |
# 方案二
|
| 16 |
pre_img_002 = preprocess_image002(image)
|
| 17 |
ocr_result_002 = ocr_recognition(pre_img_002, valid_type, language)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
-
return pre_img_001, pre_img_002,
|
|
|
|
| 20 |
|
| 21 |
|
| 22 |
def preprocess_and_llm(image, valid_type, language):
|
|
@@ -26,34 +39,63 @@ def preprocess_and_llm(image, valid_type, language):
|
|
| 26 |
# 方案二
|
| 27 |
pre_img_002 = preprocess_image002(image)
|
| 28 |
llm_result_002 = llm_recognition(pre_img_002, valid_type, language)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
-
return pre_img_001, pre_img_002,
|
|
|
|
| 31 |
|
| 32 |
|
| 33 |
with gr.Blocks() as demo:
|
| 34 |
with gr.Row():
|
| 35 |
image_input = gr.Image(type="pil", label="上傳圖片")
|
| 36 |
-
preprocess_output_001 = gr.Image(type="pil", label="預處理後的圖片-方案一")
|
| 37 |
-
preprocess_output_002 = gr.Image(type="pil", label="預處理後的圖片-方案二")
|
| 38 |
-
|
| 39 |
-
with gr.Row():
|
| 40 |
validation_type = gr.Dropdown(choices=["身分證正面", "身分證反面"], label="驗證類別")
|
| 41 |
language_dropdown = gr.Dropdown(choices=languages, value="chi_tra", label="語言")
|
| 42 |
-
# preprocessed_type = gr.Radio(["001", "002"], label="解析方案")
|
| 43 |
|
| 44 |
with gr.Row():
|
| 45 |
ocr_button = gr.Button("使用 OCR")
|
| 46 |
llm_button = gr.Button("使用 AI LLM")
|
| 47 |
|
| 48 |
with gr.Row():
|
|
|
|
| 49 |
ocr_output_001 = gr.JSON(label="OCR-001-解析結果")
|
|
|
|
|
|
|
|
|
|
| 50 |
ocr_output_002 = gr.JSON(label="OCR-002-解析結果")
|
| 51 |
-
|
| 52 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
|
| 54 |
ocr_button.click(preprocess_and_ocr, inputs=[image_input, validation_type, language_dropdown],
|
| 55 |
-
outputs=[
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
llm_button.click(preprocess_and_llm, inputs=[image_input, validation_type, language_dropdown],
|
| 57 |
-
outputs=[
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
|
| 59 |
demo.launch(share=False)
|
|
|
|
| 2 |
import gradio as gr
|
| 3 |
from Plan.AiLLM import llm_recognition
|
| 4 |
from Plan.pytesseractOCR import ocr_recognition
|
| 5 |
+
from Preprocess.preprocessImg import (
|
| 6 |
+
preprocess_image001, preprocess_image002, preprocess_image003,
|
| 7 |
+
preprocess_image004, preprocess_image005
|
| 8 |
+
)
|
| 9 |
|
| 10 |
# 取得所有語言清單
|
| 11 |
languages = os.popen('tesseract --list-langs').read().split('\n')[1:-1]
|
|
|
|
| 18 |
# 方案二
|
| 19 |
pre_img_002 = preprocess_image002(image)
|
| 20 |
ocr_result_002 = ocr_recognition(pre_img_002, valid_type, language)
|
| 21 |
+
# 方案三
|
| 22 |
+
pre_img_003 = preprocess_image003(image)
|
| 23 |
+
ocr_result_003 = ocr_recognition(pre_img_003, valid_type, language)
|
| 24 |
+
# 方案四
|
| 25 |
+
pre_img_004 = preprocess_image004(image)
|
| 26 |
+
ocr_result_004 = ocr_recognition(pre_img_004, valid_type, language)
|
| 27 |
+
# 方案五
|
| 28 |
+
pre_img_005 = preprocess_image005(image)
|
| 29 |
+
ocr_result_005 = ocr_recognition(pre_img_005, valid_type, language)
|
| 30 |
|
| 31 |
+
return (pre_img_001, pre_img_002, pre_img_003, pre_img_004, pre_img_005,
|
| 32 |
+
ocr_result_001, ocr_result_002, ocr_result_003, ocr_result_004, ocr_result_005)
|
| 33 |
|
| 34 |
|
| 35 |
def preprocess_and_llm(image, valid_type, language):
|
|
|
|
| 39 |
# 方案二
|
| 40 |
pre_img_002 = preprocess_image002(image)
|
| 41 |
llm_result_002 = llm_recognition(pre_img_002, valid_type, language)
|
| 42 |
+
# 方案三
|
| 43 |
+
pre_img_003 = preprocess_image003(image)
|
| 44 |
+
llm_result_003 = llm_recognition(pre_img_003, valid_type, language)
|
| 45 |
+
# 方案四
|
| 46 |
+
pre_img_004 = preprocess_image004(image)
|
| 47 |
+
llm_result_004 = llm_recognition(pre_img_004, valid_type, language)
|
| 48 |
+
# 方案五
|
| 49 |
+
pre_img_005 = preprocess_image005(image)
|
| 50 |
+
llm_result_005 = llm_recognition(pre_img_005, valid_type, language)
|
| 51 |
|
| 52 |
+
return (pre_img_001, pre_img_002, pre_img_003, pre_img_004, pre_img_005,
|
| 53 |
+
llm_result_001, llm_result_002, llm_result_003, llm_result_004, llm_result_005)
|
| 54 |
|
| 55 |
|
| 56 |
with gr.Blocks() as demo:
|
| 57 |
with gr.Row():
|
| 58 |
image_input = gr.Image(type="pil", label="上傳圖片")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
validation_type = gr.Dropdown(choices=["身分證正面", "身分證反面"], label="驗證類別")
|
| 60 |
language_dropdown = gr.Dropdown(choices=languages, value="chi_tra", label="語言")
|
|
|
|
| 61 |
|
| 62 |
with gr.Row():
|
| 63 |
ocr_button = gr.Button("使用 OCR")
|
| 64 |
llm_button = gr.Button("使用 AI LLM")
|
| 65 |
|
| 66 |
with gr.Row():
|
| 67 |
+
preprocess_output_001 = gr.Image(type="pil", label="預處理後的圖片-方案一")
|
| 68 |
ocr_output_001 = gr.JSON(label="OCR-001-解析結果")
|
| 69 |
+
llm_output_001 = gr.JSON(label="AiLLM-001-解析結果")
|
| 70 |
+
with gr.Row():
|
| 71 |
+
preprocess_output_002 = gr.Image(type="pil", label="預處理後的圖片-方案二")
|
| 72 |
ocr_output_002 = gr.JSON(label="OCR-002-解析結果")
|
| 73 |
+
llm_output_002 = gr.JSON(label="AiLLM-002-解析結果")
|
| 74 |
+
|
| 75 |
+
with gr.Row():
|
| 76 |
+
preprocess_output_003 = gr.Image(type="pil", label="預處理後的圖片-方案三")
|
| 77 |
+
ocr_output_003 = gr.JSON(label="OCR-003-解析結果")
|
| 78 |
+
llm_output_003 = gr.JSON(label="AiLLM-003-解析結果")
|
| 79 |
+
with gr.Row():
|
| 80 |
+
preprocess_output_004 = gr.Image(type="pil", label="預處理後的圖片-方案四")
|
| 81 |
+
ocr_output_004 = gr.JSON(label="OCR-004-解析結果")
|
| 82 |
+
llm_output_004 = gr.JSON(label="AiLLM-004-解析結果")
|
| 83 |
+
with gr.Row():
|
| 84 |
+
preprocess_output_005 = gr.Image(type="pil", label="預處理後的圖片-方案五")
|
| 85 |
+
ocr_output_005 = gr.JSON(label="OCR-005-解析結果")
|
| 86 |
+
llm_output_005 = gr.JSON(label="AiLLM-005-解析結果")
|
| 87 |
|
| 88 |
ocr_button.click(preprocess_and_ocr, inputs=[image_input, validation_type, language_dropdown],
|
| 89 |
+
outputs=[
|
| 90 |
+
preprocess_output_001, preprocess_output_002, preprocess_output_003, preprocess_output_004,
|
| 91 |
+
preprocess_output_005,
|
| 92 |
+
ocr_output_001, ocr_output_002, ocr_output_003, ocr_output_004, ocr_output_005
|
| 93 |
+
])
|
| 94 |
llm_button.click(preprocess_and_llm, inputs=[image_input, validation_type, language_dropdown],
|
| 95 |
+
outputs=[
|
| 96 |
+
preprocess_output_001, preprocess_output_002, preprocess_output_003, preprocess_output_004,
|
| 97 |
+
preprocess_output_005,
|
| 98 |
+
llm_output_001, llm_output_002, llm_output_003, llm_output_004, llm_output_005
|
| 99 |
+
])
|
| 100 |
|
| 101 |
demo.launch(share=False)
|
requirements.txt
CHANGED
|
@@ -4,4 +4,5 @@ transformers
|
|
| 4 |
Pillow
|
| 5 |
torch
|
| 6 |
huggingface-hub
|
| 7 |
-
opencv-python
|
|
|
|
|
|
| 4 |
Pillow
|
| 5 |
torch
|
| 6 |
huggingface-hub
|
| 7 |
+
opencv-python
|
| 8 |
+
numpy
|