AmrElsayeh committed
Commit 695a221 · verified
Parent(s): 1d74c76

Update app.py

Files changed (1)
  1. app.py +190 -208
app.py CHANGED
@@ -1,208 +1,190 @@
- import os
- import logging
- import cv2
- import numpy as np
- from PIL import Image
- from pdf2image import convert_from_path
- from pytesseract import Output, pytesseract
- from scipy.ndimage import rotate
- from surya.ocr import run_ocr
- from surya.model.detection.model import load_model as load_det_model, load_processor as load_det_processor
- from surya.model.recognition.model import load_model as load_rec_model
- from surya.model.recognition.processor import load_processor as load_rec_processor
- import imutils
- import gradio as gr
-
- # Set the Tesseract path (update this path based on your system)
- # pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe' # Windows
- pytesseract.tesseract_cmd = r'/usr/bin/tesseract' # Correct
- # Configure logging
- logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
-
- # Initialize OCR models
- det_processor, det_model = load_det_processor(), load_det_model()
- rec_model, rec_processor = load_rec_model(), load_rec_processor()
-
- class DocumentProcessor:
-     def __init__(self, output_dir: str = "output"):
-         self.output_dir = output_dir
-         self.corrected_images_dir = os.path.join(output_dir, "corrected_images")
-         self.extracted_text_dir = os.path.join(output_dir, "extracted_text")
-         self.detected_text_dir = os.path.join(output_dir, "Detected_Text_Line")
-         self.detected_layout_dir = os.path.join(output_dir, "Detected_layout")
-         self._create_dirs()
-
-     def _create_dirs(self):
-         """Create output directories if they don't exist."""
-         os.makedirs(self.corrected_images_dir, exist_ok=True)
-         os.makedirs(self.extracted_text_dir, exist_ok=True)
-         os.makedirs(self.detected_text_dir, exist_ok=True)
-         os.makedirs(self.detected_layout_dir, exist_ok=True)
-
-     def process_document(self, input_path: str):
-         """
-         Process a PDF or image to:
-         1. Correct image skew and rotation.
-         2. Extract text using OCR.
-         3. Save corrected images, detected images, and extracted text.
-         """
-         try:
-             if input_path.endswith(".pdf"):
-                 images = self._convert_pdf_to_images(input_path)
-             else:
-                 images = [Image.open(input_path)]
-
-             # Run Surya detection and layout
-             self._run_surya_detection(input_path)
-
-             corrected_images = []
-             extracted_texts = []
-
-             for i, image in enumerate(images):
-                 logging.info(f"Processing page {i + 1}")
-                 corrected_image = self._correct_image_rotation(image)
-                 extracted_text = self._extract_text(corrected_image)
-
-                 # Save results
-                 self._save_results(corrected_image, extracted_text, i + 1)
-
-                 corrected_images.append(corrected_image)
-                 extracted_texts.append(extracted_text)
-
-             return corrected_images, extracted_texts
-
-         except Exception as e:
-             logging.error(f"Error processing document: {e}")
-             raise
-
-     def _convert_pdf_to_images(self, pdf_path: str):
-         """Convert PDF to a list of images."""
-         logging.info(f"Converting PDF to images: {pdf_path}")
-         return convert_from_path(pdf_path)
-
-     def _run_surya_detection(self, input_path: str):
-         """Run Surya detection and layout commands."""
-         logging.info("Running Surya detection and layout")
-
-         # Step 1: Run surya_detect
-         os.system(f"surya_detect --results_dir {self.detected_text_dir} --images {input_path}")
-
-         # Extract the PDF name (without extension)
-         pdf_name = os.path.splitext(os.path.basename(input_path))[0]
-
-         # Step 2: Remove column files
-         os.system(f"rm {self.detected_text_dir}/{pdf_name}/*column*")
-
-         # Step 3: Run surya_layout
-         os.system(f"surya_layout --results_dir {self.detected_layout_dir} --images {input_path}")
-
-     def _correct_image_rotation(self, image: Image.Image):
-         """Correct the skew and rotation of the image."""
-         logging.info("Correcting image rotation")
-         if isinstance(image, Image.Image):
-             image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
-
-         # Correct skew
-         corrected_image = self._correct_skew(image)
-
-         # Correct rotation
-         results = pytesseract.image_to_osd(
-             corrected_image,
-             output_type=Output.DICT,
-             config='--dpi 300 --psm 0 -c min_characters_to_try=5 -c tessedit_script_lang=Arabic'
-         )
-         if results["orientation"] != 0:
-             corrected_image = imutils.rotate_bound(corrected_image, angle=results["rotate"])
-
-         return Image.fromarray(cv2.cvtColor(corrected_image, cv2.COLOR_BGR2RGB))
-
-     def _correct_skew(self, image: np.ndarray, delta: float = 0.1, limit: int = 3):
-         """Correct the skew of an image by finding the best angle."""
-         gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
-         thresh = cv2.adaptiveThreshold(
-             gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
-             cv2.THRESH_BINARY_INV, 41, 15
-         )
-
-         scores = []
-         angles = np.arange(-limit, limit + delta, delta)
-         for angle in angles:
-             _, score = self._determine_score(thresh, angle)
-             scores.append(score)
-
-         best_angle = angles[scores.index(max(scores))]
-
-         (h, w) = image.shape[:2]
-         center = (w // 2, h // 2)
-         M = cv2.getRotationMatrix2D(center, best_angle, 1.0)
-         rotated = cv2.warpAffine(
-             image, M, (w, h), flags=cv2.INTER_LINEAR,
-             borderMode=cv2.BORDER_CONSTANT, borderValue=(255, 255, 255)
-         )
-
-         logging.info(f"Detected skew angle: {best_angle} degrees")
-         return rotated
-
-     def _determine_score(self, arr: np.ndarray, angle: float):
-         """Rotate the image and calculate the score based on pixel intensity."""
-         data = rotate(arr, angle, reshape=False, order=0)
-         histogram = np.sum(data, axis=1, dtype=float)
-         score = np.sum((histogram[1:] - histogram[:-1]) ** 2, dtype=float)
-         return histogram, score
-
-     def _extract_text(self, image: Image.Image):
-         """Extract text from the image using OCR."""
-         logging.info("Extracting text")
-         extracted_text_surya = run_ocr([image], [["en"]], det_model, det_processor, rec_model, rec_processor)
-         surya_text = [line.text for line in extracted_text_surya[0].text_lines]
-         return "\n".join(surya_text)
-
-     def _save_results(self, corrected_image: Image.Image, extracted_text: str, page_num: int):
-         """Save corrected images and extracted text."""
-         # Save corrected image
-         corrected_image.save(os.path.join(self.corrected_images_dir, f"page_{page_num}_corrected.png"))
-
-         # Save extracted text
-         with open(os.path.join(self.extracted_text_dir, f"page_{page_num}_text.txt"), "w", encoding="utf-8") as f:
-             f.write(extracted_text)
-         logging.info(f"Saved results for page {page_num}")
-
- # Gradio Interface
- def process_document_interface(file):
-     processor = DocumentProcessor(output_dir="output")
-     corrected_images, extracted_texts = processor.process_document(file.name)
-
-     # Get detected images
-     pdf_name = os.path.splitext(os.path.basename(file.name))[0]
-     detected_text_images = [
-         os.path.join(processor.detected_text_dir, pdf_name, f"{pdf_name}_{i}_bbox.png")
-         for i in range(len(corrected_images))
-     ]
-     detected_layout_images = [
-         os.path.join(processor.detected_layout_dir, pdf_name, f"{pdf_name}_{i}_bbox.png")
-         for i in range(len(corrected_images))
-     ]
-
-     # Prepare outputs
-     outputs = []
-     for i, (corrected_image, extracted_text, detected_text_image, detected_layout_image) in enumerate(zip(corrected_images, extracted_texts, detected_text_images, detected_layout_images)):
-         outputs.append((corrected_image, detected_text_image, detected_layout_image, extracted_text))
-
-     return outputs
-
- # Gradio App
- iface = gr.Interface(
-     fn=process_document_interface,
-     inputs=gr.File(label="Upload PDF or Image"),
-     outputs=[
-         gr.Gallery(label="Corrected Images"),
-         gr.Gallery(label="Detected Text Images"),
-         gr.Gallery(label="Detected Layout Images"),
-         gr.Textbox(label="Extracted Text")
-     ],
-     title="Document Processor",
-     description="Upload a PDF or image to correct skew/rotation, detect text/layout, and extract text using OCR."
- )
-
- if __name__ == "__main__":
-     iface.launch()
+ import os
+ import logging
+ import cv2
+ import numpy as np
+ from pdf2image import convert_from_path
+ from pytesseract import Output, pytesseract
+ from scipy.ndimage import rotate
+ from surya.ocr import run_ocr
+ from surya.model.detection.model import load_model as load_det_model, load_processor as load_det_processor
+ from surya.model.recognition.model import load_model as load_rec_model
+ from surya.model.recognition.processor import load_processor as load_rec_processor
+ import imutils
+ import gradio as gr
+ import subprocess
+ import glob
+ from PIL import Image, ImageDraw
+ from pytesseract import Output
+ import pytesseract
+
+ # Function to correct image skew
+ def correct_skew(image, delta=0.1, limit=3):
+     gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+     thresh = cv2.adaptiveThreshold(
+         gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+         cv2.THRESH_BINARY_INV, 41, 15
+     )
+
+     scores = []
+     angles = np.arange(-limit, limit + delta, delta)
+     for angle in angles:
+         _, score = determine_score(thresh, angle)
+         scores.append(score)
+
+     best_angle = angles[scores.index(max(scores))]
+
+     (h, w) = image.shape[:2]
+     center = (w // 2, h // 2)
+     M = cv2.getRotationMatrix2D(center, best_angle, 1.0)
+     rotated = cv2.warpAffine(
+         image, M, (w, h), flags=cv2.INTER_LINEAR,
+         borderMode=cv2.BORDER_CONSTANT, borderValue=(255, 255, 255)
+     )
+
+     print(f"[INFO] Detected skew angle: {best_angle} degrees")
+     return rotated
+
+ def determine_score(arr, angle):
+     data = rotate(arr, angle, reshape=False, order=0)
+     histogram = np.sum(data, axis=1, dtype=float)
+     score = np.sum((histogram[1:] - histogram[:-1]) ** 2, dtype=float)
+     return histogram, score
+
+ def correct_image_rotation(image):
+     if isinstance(image, Image.Image):
+         original_size = image.size
+         print('image original size is:', original_size)
+         image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
+
+     image_required = image.copy()
+     h, w = image_required.shape[:2]
+     cropped_rotated = cv2.resize(image_required, (w * 4, h * 4))
+
+     results = pytesseract.image_to_osd(
+         cropped_rotated,
+         output_type=Output.DICT,
+         config='--dpi 300 --psm 0 -c min_characters_to_try=5 -c tessedit_script_lang=Arabic'
+     )
+
+     if results["script"] not in ['Bengali', 'Latin', 'Greek', 'Katakana'] and results["orientation"] != 180:
+         print("[INFO] Detected orientation: {}".format(results["orientation"]))
+         print("[INFO] Rotate by {} degrees to correct".format(results["rotate"]))
+         print("[INFO] Detected script: {}".format(results["script"]))
+         rotated = imutils.rotate_bound(image, angle=results['rotate'])
+         if results['rotate'] in [90, 270]:
+             rotated_h, rotated_w = rotated.shape[:2]
+             original_size = (rotated_w, rotated_h)
+             print(f"Rotated dimensions: {rotated_w}x{rotated_h}")
+             if (rotated_w, rotated_h) != (h, w):
+                 rotated = cv2.resize(rotated, (w, h))
+     else:
+         print("[INFO] Major orientation is correct, proceeding to fine-tune...")
+         rotated = image
+
+     final_rotated = correct_skew(rotated)
+     rotated_pil = Image.fromarray(cv2.cvtColor(final_rotated, cv2.COLOR_BGR2RGB))
+
+     print('resize the image to its original size: ', original_size)
+     corrected_image = rotated_pil.resize(original_size, Image.Resampling.LANCZOS)
+     return corrected_image
+
+ # Function to process PDF or image and detect text lines
+ def process_pdf(file_path):
+     # Define the results directories
+     detected_text_dir = "/home/Detected_Text_Line"
+     detected_layout_dir = "/home/Detected_layout"
+     ocr_dir = "/home/OCR"
+
+     # Ensure the results directories exist
+     os.makedirs(detected_text_dir, exist_ok=True)
+     os.makedirs(detected_layout_dir, exist_ok=True)
+     os.makedirs(ocr_dir, exist_ok=True)
+
+     # Extract the PDF name (without extension)
+     pdf_name = os.path.splitext(os.path.basename(file_path))[0]
+
+     # Step 1: Run surya_detect
+     try:
+         subprocess.run(
+             ["surya_detect", "--results_dir", detected_text_dir, "--images", file_path],
+             check=True,
+         )
+         print(f"[INFO] surya_detect completed for {file_path}")
+     except subprocess.CalledProcessError as e:
+         print(f"[ERROR] surya_detect failed: {e}")
+         return None
+
+     # Step 2: Remove column files (if they exist)
+     column_files = glob.glob(f"{detected_text_dir}/{pdf_name}/*column*")
+     if column_files:
+         try:
+             subprocess.run(["rm"] + column_files, check=True)
+             print(f"[INFO] Removed column files for {pdf_name}")
+         except subprocess.CalledProcessError as e:
+             print(f"[ERROR] Failed to remove column files: {e}")
+     else:
+         print(f"[INFO] No column files found for {pdf_name}")
+
+     # Return the path to the directory containing the output images
+     output_dir = os.path.join(detected_text_dir, pdf_name)
+     return output_dir
+
+ # Function to handle the Gradio interface
+ def gradio_interface(file):
+     # Step 1: Correct the skew of the input file
+     corrected_images = []
+     if file.name.endswith('.pdf'):
+         images = convert_from_path(file.name)
+         for i, image in enumerate(images):
+             corrected_image = correct_image_rotation(image)
+             corrected_images.append(corrected_image)
+     else:
+         image = Image.open(file.name)
+         corrected_image = correct_image_rotation(image)
+         corrected_images.append(corrected_image)
+
+     # Save corrected images to a folder
+     corrected_dir = "/home/Corrected_Images"
+     os.makedirs(corrected_dir, exist_ok=True)
+     for i, corrected_image in enumerate(corrected_images):
+         corrected_image.save(os.path.join(corrected_dir, f"corrected_{i}.png"))
+
+     # Step 2: Detect text lines in the corrected images
+     detected_dir = process_pdf(corrected_dir)
+
+     if detected_dir is None:
+         # Return a placeholder image with an error message
+         error_image = Image.new("RGB", (400, 200), color="red")
+         error_draw = ImageDraw.Draw(error_image)
+         error_draw.text((10, 10), "Error detecting text lines. Check the logs for details.", fill="white")
+         return corrected_images, [error_image]
+
+     # Load and return the detected text line images
+     detected_images = []
+     for image_file in sorted(os.listdir(detected_dir)):
+         if image_file.endswith((".png", ".jpg", ".jpeg")):
+             image_path = os.path.join(detected_dir, image_file)
+             detected_images.append(Image.open(image_path))
+
+     if not detected_images:
+         # Return a placeholder image if no output images are found
+         placeholder_image = Image.new("RGB", (400, 200), color="gray")
+         placeholder_draw = ImageDraw.Draw(placeholder_image)
+         placeholder_draw.text((10, 10), "No detected text line images found.", fill="white")
+         return corrected_images, [placeholder_image]
+
+     return corrected_images, detected_images
+
+ # Gradio Interface
+ iface = gr.Interface(
+     fn=gradio_interface,
+     inputs=gr.File(label="Upload PDF or Image"),
+     outputs=[
+         gr.Gallery(label="Corrected Images", columns=[2], height="auto"),
+         gr.Gallery(label="Detected Text Lines", columns=[2], height="auto"),
+     ],
+     title="PDF/Image Skew Correction and Text Line Detection",
+     description="Upload a PDF or image to correct skew and detect text lines.",
+ )
+
+ iface.launch()