DamLoan committed on
Commit
1f51108
·
verified ·
1 Parent(s): 17e5f34

Update preprocess.py

Browse files
Files changed (1) hide show
  1. preprocess.py +3 -49
preprocess.py CHANGED
@@ -1,12 +1,4 @@
1
- # ### Step 1: Reading PDF Files
2
- # Setup directories
3
- pdf_directory = r"F:\Preprocessing"
4
- output_directory = r"F:\Images"
5
- os.makedirs(output_directory, exist_ok=True)
6
-
7
- pages = convert_from_path(pdf_path, dpi=dpi)
8
-
9
- # ### Step 2: Convert PDF files to Images
10
 
11
  import os
12
  import cv2
@@ -40,24 +32,7 @@ def process_all_pdfs():
40
  num_pages = pdf_to_images(pdf_file, output_directory)
41
  total_images += num_pages
42
 
43
- print(f"\n✓ Tổng số file PDF: {len(pdf_files)}")
44
- print(f"✓ Tổng số ảnh đã chuyển đổi: {total_images}")
45
-
46
- # MAIN EXECUTION
47
- if __name__ == "__main__":
48
- print("PDF TO IMAGES CONVERTER")
49
- print(f"Input directory: {pdf_directory}")
50
- print(f"Output directory: {output_directory}")
51
- print()
52
-
53
- if not os.path.exists(pdf_directory):
54
- print(f"✗ Input directory does not exist: {pdf_directory}")
55
- exit(1)
56
-
57
- process_all_pdfs()
58
- print("\n✓ Processing completed!")
59
-
60
- # ### Step 3: Image Preprocessing
61
 
62
  import os
63
  import cv2
@@ -75,25 +50,4 @@ def preprocess_image(image_path):
75
  kernel = np.ones((1, 1), np.uint8)
76
  bold_img = cv2.dilate(binary, kernel, iterations=1)
77
 
78
- return bold_img
79
-
80
- # Thư mục đầu vào và đầu ra
81
- input_folder = r"F:\Images"
82
- output_folder = r"F:\Images_Processed"
83
- os.makedirs(output_folder, exist_ok=True)
84
-
85
- # Duyệt qua tất cả ảnh
86
- for filename in os.listdir(input_folder):
87
- if filename.lower().endswith((".jpg", ".jpeg", ".png", ".bmp")):
88
- input_path = os.path.join(input_folder, filename)
89
- output_path = os.path.join(output_folder, filename)
90
-
91
- try:
92
- processed_img = preprocess_image(input_path)
93
-
94
- # Chuyển ảnh về PIL để lưu với Unicode path
95
- pil_result = Image.fromarray(processed_img)
96
- pil_result.save(output_path)
97
-
98
- except Exception as e:
99
- print(f"❌ Lỗi xử lý {filename}: {e}")
 
1
+ # Convert PDF files to Images
 
 
 
 
 
 
 
 
2
 
3
  import os
4
  import cv2
 
32
  num_pages = pdf_to_images(pdf_file, output_directory)
33
  total_images += num_pages
34
 
35
+ # Image Preprocessing
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
  import os
38
  import cv2
 
50
  kernel = np.ones((1, 1), np.uint8)
51
  bold_img = cv2.dilate(binary, kernel, iterations=1)
52
 
53
+ return bold_img