DamLoan committed on
Commit
b83ba6e
·
verified ·
1 Parent(s): 60f3906

Update preprocess.py

Browse files
Files changed (1) hide show
  1. preprocess.py +104 -125
preprocess.py CHANGED
@@ -1,125 +1,104 @@
1
- # %% [markdown]
2
- # ### Step 1: Reading PDF Files
3
-
4
- # %%
5
- pip install pdf2image
6
-
7
- # %%
8
- # Setup directories
9
- pdf_directory = r"F:\Preprocessing"
10
- output_directory = r"F:\Images"
11
- os.makedirs(output_directory, exist_ok=True)
12
-
13
- # Poppler path
14
- poppler_path = r"F:\poppler-24.08.0\Library\bin"
15
-
16
- # %% [markdown]
17
- # ### Step 2: Convert PDF files to Images
18
-
19
- # %%
20
- import os
21
- import cv2
22
- import numpy as np
23
- from pdf2image import convert_from_path
24
- import glob
25
-
26
- # Hàm kiểm tra Poppler
27
- def check_poppler():
28
- return os.path.exists(os.path.join(poppler_path, "pdftoppm.exe"))
29
-
30
- # Hàm chuyển PDF sang ảnh
31
- def pdf_to_images(pdf_path, output_dir, dpi=300):
32
- try:
33
- pages = convert_from_path(pdf_path, dpi=dpi, poppler_path=poppler_path)
34
- for i, page in enumerate(pages):
35
- image_name = f"{os.path.splitext(os.path.basename(pdf_path))[0]}_page_{i+1}.jpg"
36
- image_path = os.path.join(output_dir, image_name)
37
- page.save(image_path, "JPEG", quality=95)
38
- return len(pages) # Trả về số lượng ảnh được tạo
39
- except Exception as e:
40
- print(f"✗ Error processing {pdf_path}: {e}")
41
- return 0
42
-
43
- # Xử toàn bộ file PDF
44
- def process_all_pdfs():
45
- pdf_files = glob.glob(os.path.join(pdf_directory, "*.pdf"))
46
- total_images = 0
47
-
48
- if not pdf_files:
49
- print(f"No PDF files found in {pdf_directory}")
50
- return
51
-
52
- for pdf_file in pdf_files:
53
- num_pages = pdf_to_images(pdf_file, output_directory)
54
- total_images += num_pages
55
-
56
- print(f"\n✓ Tổng số file PDF: {len(pdf_files)}")
57
- print(f" Tổng số ảnh đã chuyển đổi: {total_images}")
58
-
59
- # MAIN EXECUTION
60
- if __name__ == "__main__":
61
- print("PDF TO IMAGES CONVERTER")
62
- print(f"Input directory: {pdf_directory}")
63
- print(f"Output directory: {output_directory}")
64
- print(f"Poppler path: {poppler_path}")
65
- print()
66
-
67
- if not os.path.exists(pdf_directory):
68
- print(f"✗ Input directory does not exist: {pdf_directory}")
69
- exit(1)
70
-
71
- if not check_poppler():
72
- print("\n❌ Please check Poppler installation:")
73
- print(f"1. Make sure pdftoppm.exe exists in: {poppler_path}")
74
- exit(1)
75
-
76
- process_all_pdfs()
77
- print("\n✓ Processing completed!")
78
-
79
- # %% [markdown]
80
- # ### Step 3: Image Preprocessing
81
-
82
- # %%
83
- import os
84
- import cv2
85
- import numpy as np
86
- from PIL import Image
87
-
88
- def preprocess_image(image_path):
89
- pil_img = Image.open(image_path)
90
- img = np.array(pil_img)
91
-
92
- gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
93
- clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
94
- contrast_img = clahe.apply(gray)
95
- _, binary = cv2.threshold(contrast_img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
96
- kernel = np.ones((1, 1), np.uint8)
97
- bold_img = cv2.dilate(binary, kernel, iterations=1)
98
-
99
- return bold_img
100
-
101
- # Thư mục đầu vào và đầu ra
102
- input_folder = r"F:\Images"
103
- output_folder = r"F:\Images_Processed"
104
- os.makedirs(output_folder, exist_ok=True)
105
-
106
- # Duyệt qua tất cả ảnh
107
- for filename in os.listdir(input_folder):
108
- if filename.lower().endswith((".jpg", ".jpeg", ".png", ".bmp")):
109
- input_path = os.path.join(input_folder, filename)
110
- output_path = os.path.join(output_folder, filename)
111
-
112
- try:
113
- processed_img = preprocess_image(input_path)
114
-
115
- # Chuyển ảnh về PIL để lưu với Unicode path
116
- pil_result = Image.fromarray(processed_img)
117
- pil_result.save(output_path)
118
-
119
- except Exception as e:
120
- print(f"❌ Lỗi xử lý {filename}: {e}")
121
-
122
-
123
-
124
-
125
-
 
1
+ # %% [markdown]
2
+ # ### Step 1: Reading PDF Files
3
# %%
# `os` is imported here because this cell runs before the Step 2 import cell.
import os

# Setup directories: where the source PDFs are read from and where the
# rendered page images are written. The output directory is created up front
# so later steps can save into it without checking.
pdf_directory = r"F:\Preprocessing"
output_directory = r"F:\Images"
os.makedirs(output_directory, exist_ok=True)

# NOTE: no conversion happens in this step. The actual rendering call
# (convert_from_path) lives inside pdf_to_images, where pdf_path and dpi
# are defined.
9
+
10
+ # %% [markdown]
11
+ # ### Step 2: Convert PDF files to Images
12
+
13
+ # %%
14
+ import os
15
+ import cv2
16
+ import numpy as np
17
+ from pdf2image import convert_from_path
18
+ import glob
19
+
20
# Convert a PDF into one JPEG image per page.
def pdf_to_images(pdf_path, output_dir, dpi=300):
    """Render each page of a PDF to a JPEG file in ``output_dir``.

    Output files are named ``<pdf stem>_page_<n>.jpg`` with ``n`` starting
    at 1, and are saved with JPEG quality 95.

    Args:
        pdf_path: Path of the PDF to convert.
        output_dir: Directory that receives the JPEG files.
        dpi: Rendering resolution forwarded to ``convert_from_path``.

    Returns:
        The number of pages in the PDF when every page was saved, or 0 if
        any exception was raised. The error is printed rather than
        propagated, and pages saved before the failure stay on disk.
    """
    try:
        rendered = convert_from_path(pdf_path, dpi=dpi)
        # The stem is the same for every page, so compute it once.
        stem = os.path.splitext(os.path.basename(pdf_path))[0]
        for number, page in enumerate(rendered, start=1):
            target = os.path.join(output_dir, f"{stem}_page_{number}.jpg")
            page.save(target, "JPEG", quality=95)
        return len(rendered)  # number of images created
    except Exception as e:
        print(f"✗ Error processing {pdf_path}: {e}")
        return 0
32
+
33
# Process every PDF file in the input directory.
def process_all_pdfs():
    """Convert every ``*.pdf`` file in ``pdf_directory`` and print a summary.

    Each PDF is handed to ``pdf_to_images`` with ``output_directory`` as the
    destination. When no PDF is found, a message is printed and the function
    returns early. Returns ``None`` in both cases.
    """
    pattern = os.path.join(pdf_directory, "*.pdf")
    pdf_files = glob.glob(pattern)

    if not pdf_files:
        print(f"No PDF files found in {pdf_directory}")
        return

    # pdf_to_images returns 0 for a PDF that failed, so failures simply
    # contribute nothing to the total.
    total_images = sum(
        pdf_to_images(path, output_directory) for path in pdf_files
    )

    print(f"\n✓ Tổng số file PDF: {len(pdf_files)}")
    print(f"✓ Tổng số ảnh đã chuyển đổi: {total_images}")
48
+
49
# Main execution: runs only when this file is executed as a script.
if __name__ == "__main__":
    print("PDF TO IMAGES CONVERTER")
    print(f"Input directory: {pdf_directory}")
    print(f"Output directory: {output_directory}")
    print()

    # isdir (not exists) so that a regular file at this path is rejected too.
    if not os.path.isdir(pdf_directory):
        print(f" Input directory does not exist: {pdf_directory}")
        # Raise SystemExit directly: the bare exit() helper is injected by
        # the `site` module for interactive use and is not always available.
        raise SystemExit(1)

    process_all_pdfs()
    print("\n✓ Processing completed!")
62
+
63
+ # %% [markdown]
64
+ # ### Step 3: Image Preprocessing
65
+
66
+ # %%
67
+ import os
68
+ import cv2
69
+ import numpy as np
70
+ from PIL import Image
71
+
72
def preprocess_image(image_path):
    """Load an image and return a binarized, contrast-enhanced version.

    Pipeline: convert to RGB, convert to grayscale, apply CLAHE contrast
    enhancement, apply Otsu thresholding, then dilate.

    Args:
        image_path: Path of the image file to open with PIL.

    Returns:
        The result of ``cv2.dilate`` on the Otsu-thresholded image, a
        single-channel array whose pixels are 0 or 255.
    """
    # Close the file handle deterministically, and normalise the mode first:
    # grayscale, palette and 1-bit images produce a 2-D array that
    # COLOR_RGB2GRAY rejects.
    with Image.open(image_path) as pil_img:
        img = np.array(pil_img.convert("RGB"))

    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)

    # Local contrast enhancement before thresholding.
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    contrast_img = clahe.apply(gray)

    # The threshold argument 0 is ignored; THRESH_OTSU picks it automatically.
    _, binary = cv2.threshold(
        contrast_img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )

    # NOTE(review): a 1x1 kernel makes this dilation a no-op. It is kept
    # as-is to preserve the current output; use a larger kernel if stroke
    # thickening is actually wanted.
    kernel = np.ones((1, 1), np.uint8)
    bold_img = cv2.dilate(binary, kernel, iterations=1)

    return bold_img
84
+
85
# Input and output folders
input_folder = r"F:\Images"
output_folder = r"F:\Images_Processed"
os.makedirs(output_folder, exist_ok=True)

# Walk over every image in the input folder
for filename in os.listdir(input_folder):
    # Skip anything that is not one of the supported image types.
    if not filename.lower().endswith((".jpg", ".jpeg", ".png", ".bmp")):
        continue

    input_path = os.path.join(input_folder, filename)
    output_path = os.path.join(output_folder, filename)

    try:
        processed_img = preprocess_image(input_path)
        # Convert back to a PIL image so the save works with Unicode paths
        Image.fromarray(processed_img).save(output_path)
    except Exception as e:
        # One bad file is reported and skipped; the batch keeps going.
        print(f"❌ Lỗi xử lý {filename}: {e}")