# utils_ocr.py

import cv2
import numpy as np
import torch
import torch.nn.functional as F
from PIL import Image
from torchvision import transforms

# --- Image Preprocessing for OCR ---

def load_image_as_grayscale(image_path: str) -> Image.Image:
    """Loads an image from path and converts it to grayscale PIL Image."""
    # Use PIL for robust image loading and conversion to grayscale 'L' mode
    img = Image.open(image_path).convert('L')
    return img

def binarize_image(image_pil: Image.Image) -> Image.Image:
    """Binarizes a grayscale PIL Image (black and white)."""
    # Convert PIL to OpenCV format (numpy array)
    img_np = np.array(image_pil)
    # Apply Otsu's thresholding for adaptive binarization. THRESH_BINARY_INV
    # maps the dark ink to 255 and the light background to 0, i.e. white text
    # on a black background, which many OCR models expect. Check your training
    # data's polarity and drop the _INV flag if it uses dark text on light.
    _, img_bin = cv2.threshold(img_np, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    return Image.fromarray(img_bin)

def resize_image_for_ocr(image_pil: Image.Image, target_height: int) -> Image.Image:
    """
    Resizes a PIL Image to a target height while maintaining aspect ratio.
    Pads width if necessary to avoid distortion.
    """
    original_width, original_height = image_pil.size
    # Calculate new width based on target height and original aspect ratio
    new_width = int(original_width * (target_height / original_height))
    resized_img = image_pil.resize((new_width, target_height), Image.LANCZOS)
    return resized_img

def normalize_image_for_model(image_pil: Image.Image) -> torch.Tensor:
    """
    Converts a PIL Image to a PyTorch Tensor and normalizes pixel values.
    """
    # Convert to tensor (scales to 0-1 automatically)
    tensor_transform = transforms.ToTensor()
    img_tensor = tensor_transform(image_pil)
    # For grayscale images, mean and std are single values.
    # Adjust normalization values if your training data uses different ones.
    img_tensor = transforms.Normalize((0.5,), (0.5,))(img_tensor) # Normalize to [-1, 1]
    return img_tensor

def preprocess_user_image_for_ocr(uploaded_image_pil: Image.Image, target_height: int) -> torch.Tensor:
    """
    Combines all preprocessing steps for a single user-uploaded image
    to prepare it for the OCR model.
    """
    # Ensure it's grayscale
    img_gray = uploaded_image_pil.convert('L')

    # Binarize
    img_bin = binarize_image(img_gray)

    # Resize (maintain aspect ratio)
    img_resized = resize_image_for_ocr(img_bin, target_height)

    # Normalize and convert to tensor
    img_tensor = normalize_image_for_model(img_resized)

    # Add batch dimension: (C, H, W) -> (1, C, H, W)
    img_tensor = img_tensor.unsqueeze(0)

    return img_tensor
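
# Example usage (sketch): the file name below is hypothetical, and a target
# height of 32 is only an assumption; use the height your OCR model was
# trained on.
#
#   img = Image.open("sample_line.png")
#   batch = preprocess_user_image_for_ocr(img, target_height=32)
#   print(batch.shape)  # torch.Size([1, 1, 32, W]), W set by the aspect ratio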

def pad_image_tensor(image_tensor: torch.Tensor, max_width: int) -> torch.Tensor:
    """
    Pads a single image tensor to a max_width with zeros.
    Input tensor shape: (C, H, W)
    Output tensor shape: (C, H, max_width)
    """
    C, H, W = image_tensor.shape
    if W > max_width:
        # If the image is wider than max_width, a more robust solution might
        # split the text line or use a different resizing strategy; for this
        # example we warn and crop.
        print(f"Warning: Image width {W} exceeds max_width {max_width}. Cropping.")
        return image_tensor[:, :, :max_width]
    padding = max_width - W
    # F.pad pads the last dimension first: (0, padding) pads only the right edge.
    padded_tensor = F.pad(image_tensor, (0, padding), mode='constant', value=0)
    return padded_tensor
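

# --- Batching helper (sketch) ---
# The functions above pad one image at a time. The helper below is an assumed
# extension (the name pad_and_stack is hypothetical) showing how
# pad_image_tensor can collate variable-width line images into one batch,
# e.g. as a DataLoader collate_fn.

def pad_and_stack(image_tensors: list) -> torch.Tensor:
    """
    Pads a list of (C, H, W) tensors to the widest image in the list and
    stacks them into a (B, C, H, max_W) batch. Assumes all tensors share the
    same channel count and height (e.g. after resize_image_for_ocr).
    """
    max_width = max(t.shape[-1] for t in image_tensors)
    padded = [pad_image_tensor(t, max_width) for t in image_tensors]
    return torch.stack(padded, dim=0)


if __name__ == "__main__":
    # Minimal smoke test on synthetic data; no image file is required.
    fake_a = torch.rand(1, 32, 100)
    fake_b = torch.rand(1, 32, 140)
    batch = pad_and_stack([fake_a, fake_b])
    print(batch.shape)  # torch.Size([2, 1, 32, 140])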