Update preprocess.py
Browse files
- preprocess.py +3 -49
preprocess.py
CHANGED
@@ -1,12 +1,4 @@
|
|
1 |
-
#
|
2 |
-
# Setup directories
|
3 |
-
pdf_directory = r"F:\Preprocessing"
|
4 |
-
output_directory = r"F:\Images"
|
5 |
-
os.makedirs(output_directory, exist_ok=True)
|
6 |
-
|
7 |
-
pages = convert_from_path(pdf_path, dpi=dpi)
|
8 |
-
|
9 |
-
# ### Step 2: Convert PDF files to Images
|
10 |
|
11 |
import os
|
12 |
import cv2
|
@@ -40,24 +32,7 @@ def process_all_pdfs():
|
|
40 |
num_pages = pdf_to_images(pdf_file, output_directory)
|
41 |
total_images += num_pages
|
42 |
|
43 |
-
|
44 |
-
print(f"✓ Tổng số ảnh đã chuyển đổi: {total_images}")
|
45 |
-
|
46 |
-
# MAIN EXECUTION
|
47 |
-
if __name__ == "__main__":
|
48 |
-
print("PDF TO IMAGES CONVERTER")
|
49 |
-
print(f"Input directory: {pdf_directory}")
|
50 |
-
print(f"Output directory: {output_directory}")
|
51 |
-
print()
|
52 |
-
|
53 |
-
if not os.path.exists(pdf_directory):
|
54 |
-
print(f"✗ Input directory does not exist: {pdf_directory}")
|
55 |
-
exit(1)
|
56 |
-
|
57 |
-
process_all_pdfs()
|
58 |
-
print("\n✓ Processing completed!")
|
59 |
-
|
60 |
-
# ### Step 3: Image Preprocessing
|
61 |
|
62 |
import os
|
63 |
import cv2
|
@@ -75,25 +50,4 @@ def preprocess_image(image_path):
|
|
75 |
kernel = np.ones((1, 1), np.uint8)
|
76 |
bold_img = cv2.dilate(binary, kernel, iterations=1)
|
77 |
|
78 |
-
return bold_img
|
79 |
-
|
80 |
-
# Thư mục đầu vào và đầu ra
|
81 |
-
input_folder = r"F:\Images"
|
82 |
-
output_folder = r"F:\Images_Processed"
|
83 |
-
os.makedirs(output_folder, exist_ok=True)
|
84 |
-
|
85 |
-
# Duyệt qua tất cả ảnh
|
86 |
-
for filename in os.listdir(input_folder):
|
87 |
-
if filename.lower().endswith((".jpg", ".jpeg", ".png", ".bmp")):
|
88 |
-
input_path = os.path.join(input_folder, filename)
|
89 |
-
output_path = os.path.join(output_folder, filename)
|
90 |
-
|
91 |
-
try:
|
92 |
-
processed_img = preprocess_image(input_path)
|
93 |
-
|
94 |
-
# Chuyển ảnh về PIL để lưu với Unicode path
|
95 |
-
pil_result = Image.fromarray(processed_img)
|
96 |
-
pil_result.save(output_path)
|
97 |
-
|
98 |
-
except Exception as e:
|
99 |
-
print(f"❌ Lỗi xử lý {filename}: {e}")
|
|
|
1 |
+
# Convert PDF files to Images
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
import os
|
4 |
import cv2
|
|
|
32 |
num_pages = pdf_to_images(pdf_file, output_directory)
|
33 |
total_images += num_pages
|
34 |
|
35 |
+
# Image Preprocessing
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
|
37 |
import os
|
38 |
import cv2
|
|
|
50 |
kernel = np.ones((1, 1), np.uint8)
|
51 |
bold_img = cv2.dilate(binary, kernel, iterations=1)
|
52 |
|
53 |
+
return bold_img
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|