# utils_ocr.py
import cv2
import numpy as np
from PIL import Image
import torch
import torch.nn.functional as F
from torchvision import transforms

# --- Image Preprocessing for OCR ---
def load_image_as_grayscale(image_path: str) -> Image.Image:
"""Loads an image from path and converts it to grayscale PIL Image."""
# Use PIL for robust image loading and conversion to grayscale 'L' mode
img = Image.open(image_path).convert('L')
return img

def binarize_image(image_pil: Image.Image) -> Image.Image:
"""Binarizes a grayscale PIL Image (black and white)."""
# Convert PIL to OpenCV format (numpy array)
img_np = np.array(image_pil)
# Apply Otsu's thresholding for adaptive binarization
_, img_bin = cv2.threshold(img_np, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    # THRESH_BINARY_INV already maps dark strokes to white (255) on a black
    # background, the polarity many OCR models are trained on, so no extra
    # inversion is needed. If your training data uses dark text on a light
    # background, switch to cv2.THRESH_BINARY instead.
return Image.fromarray(img_bin)

def resize_image_for_ocr(image_pil: Image.Image, target_height: int) -> Image.Image:
    """
    Resizes a PIL Image to a target height while maintaining the aspect
    ratio. Width padding, when needed, is handled later by pad_image_tensor.
    """
original_width, original_height = image_pil.size
# Calculate new width based on target height and original aspect ratio
new_width = int(original_width * (target_height / original_height))
resized_img = image_pil.resize((new_width, target_height), Image.LANCZOS)
return resized_img

def normalize_image_for_model(image_pil: Image.Image) -> torch.Tensor:
"""
Converts a PIL Image to a PyTorch Tensor and normalizes pixel values.
"""
# Convert to tensor (scales to 0-1 automatically)
tensor_transform = transforms.ToTensor()
img_tensor = tensor_transform(image_pil)
# For grayscale images, mean and std are single values.
# Adjust normalization values if your training data uses different ones.
img_tensor = transforms.Normalize((0.5,), (0.5,))(img_tensor) # Normalize to [-1, 1]
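    # Worked mapping: (x - 0.5) / 0.5 sends 0.0 -> -1.0, 0.5 -> 0.0 and
    # 1.0 -> 1.0, i.e. the [0, 1] range is stretched to [-1, 1].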
return img_tensor

def preprocess_user_image_for_ocr(uploaded_image_pil: Image.Image, target_height: int) -> torch.Tensor:
"""
Combines all preprocessing steps for a single user-uploaded image
to prepare it for the OCR model.
"""
# Ensure it's grayscale
img_gray = uploaded_image_pil.convert('L')
# Binarize
img_bin = binarize_image(img_gray)
# Resize (maintain aspect ratio)
img_resized = resize_image_for_ocr(img_bin, target_height)
# Normalize and convert to tensor
img_tensor = normalize_image_for_model(img_resized)
# Add batch dimension: (C, H, W) -> (1, C, H, W)
img_tensor = img_tensor.unsqueeze(0)
return img_tensor
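
# Usage sketch (illustrative only; "page.png" and the 32 px line height are
# assumptions, not values taken from this project):
#
#   img = Image.open("page.png")
#   batch = preprocess_user_image_for_ocr(img, target_height=32)
#   # batch.shape -> (1, 1, 32, W), with W set by the line's aspect ratio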

def pad_image_tensor(image_tensor: torch.Tensor, max_width: int) -> torch.Tensor:
"""
Pads a single image tensor to a max_width with zeros.
Input tensor shape: (C, H, W)
Output tensor shape: (C, H, max_width)
"""
C, H, W = image_tensor.shape
if W > max_width:
# If image is wider than max_width, you might want to crop or resize it.
# For this example, we'll just return a warning or clip.
# A more robust solution might split text lines or use a different resizing strategy.
print(f"Warning: Image width {W} exceeds max_width {max_width}. Cropping.")
return image_tensor[:, :, :max_width] # Simple cropping
padding = max_width - W
    # F.pad pads the last (width) dimension with a (left, right) tuple
    padded_tensor = F.pad(image_tensor, (0, padding), 'constant', 0)
    return padded_tensor
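

# The helper below is an illustrative addition, not part of the original
# file: a minimal sketch of how pad_image_tensor can batch variable-width
# line images into a single tensor for the OCR model.
def collate_ocr_batch(image_tensors: list) -> torch.Tensor:
    """Right-pads (C, H, W) tensors to the widest W and stacks to (B, C, H, W)."""
    max_width = max(t.shape[-1] for t in image_tensors)
    padded = [pad_image_tensor(t, max_width) for t in image_tensors]
    return torch.stack(padded, dim=0)


if __name__ == "__main__":
    # Smoke test with synthetic line images (no files needed): two random
    # 32-px-high "lines" of different widths are padded into one batch.
    a = torch.randn(1, 32, 100)
    b = torch.randn(1, 32, 180)
    print(collate_ocr_batch([a, b]).shape)  # torch.Size([2, 1, 32, 180])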