# utils_ocr.py

import cv2
import numpy as np
import torch
import torch.nn.functional as F
from PIL import Image
from torchvision import transforms

# --- Image Preprocessing for OCR ---

def load_image_as_grayscale(image_path: str) -> Image.Image:
    """Loads an image from path and converts it to grayscale PIL Image."""
    # Use PIL for robust image loading and conversion to grayscale 'L' mode
    img = Image.open(image_path).convert('L')
    return img

def binarize_image(image_pil: Image.Image) -> Image.Image:
    """Binarizes a grayscale PIL Image (black and white)."""
    # Convert PIL to OpenCV format (numpy array)
    img_np = np.array(image_pil)
    # Apply Otsu's thresholding for adaptive binarization. THRESH_BINARY_INV
    # maps the dark ink to 255 and the light background to 0, i.e. white text
    # on a black background, which many OCR models expect. Check your training
    # data's polarity and drop the _INV flag if it uses dark text on light.
    _, img_bin = cv2.threshold(img_np, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    return Image.fromarray(img_bin)

def resize_image_for_ocr(image_pil: Image.Image, target_height: int) -> Image.Image:
    """
    Resizes a PIL Image to a target height while maintaining aspect ratio.
    Pads width if necessary to avoid distortion.
    """
    original_width, original_height = image_pil.size
    # Calculate new width based on target height and original aspect ratio
    new_width = int(original_width * (target_height / original_height))
    resized_img = image_pil.resize((new_width, target_height), Image.LANCZOS)
    return resized_img

def normalize_image_for_model(image_pil: Image.Image) -> torch.Tensor:
    """
    Converts a PIL Image to a PyTorch Tensor and normalizes pixel values.
    """
    # Convert to tensor (scales to 0-1 automatically)
    tensor_transform = transforms.ToTensor()
    img_tensor = tensor_transform(image_pil)
    # For grayscale images, mean and std are single values.
    # Adjust normalization values if your training data uses different ones.
    img_tensor = transforms.Normalize((0.5,), (0.5,))(img_tensor) # Normalize to [-1, 1]
    return img_tensor

def preprocess_user_image_for_ocr(uploaded_image_pil: Image.Image, target_height: int) -> torch.Tensor:
    """
    Combines all preprocessing steps for a single user-uploaded image
    to prepare it for the OCR model.
    """
    # Ensure it's grayscale
    img_gray = uploaded_image_pil.convert('L')

    # Binarize
    img_bin = binarize_image(img_gray)

    # Resize (maintain aspect ratio)
    img_resized = resize_image_for_ocr(img_bin, target_height)

    # Normalize and convert to tensor
    img_tensor = normalize_image_for_model(img_resized)

    # Add batch dimension: (C, H, W) -> (1, C, H, W)
    img_tensor = img_tensor.unsqueeze(0)

    return img_tensor
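
# Example usage (sketch): the file name below is hypothetical, and a target
# height of 32 is only an assumption; use the height your OCR model was
# trained on.
#
#   img = Image.open("sample_line.png")
#   batch = preprocess_user_image_for_ocr(img, target_height=32)
#   print(batch.shape)  # torch.Size([1, 1, 32, W]), W set by the aspect ratio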

def pad_image_tensor(image_tensor: torch.Tensor, max_width: int) -> torch.Tensor:
    """
    Pads a single image tensor to a max_width with zeros.
    Input tensor shape: (C, H, W)
    Output tensor shape: (C, H, max_width)
    """
    C, H, W = image_tensor.shape
    if W > max_width:
        # If the image is wider than max_width, a more robust solution might
        # split the text line or use a different resizing strategy; for this
        # example we warn and crop.
        print(f"Warning: Image width {W} exceeds max_width {max_width}. Cropping.")
        return image_tensor[:, :, :max_width]
    padding = max_width - W
    # F.pad pads the last dimension first: (0, padding) pads only the right edge.
    padded_tensor = F.pad(image_tensor, (0, padding), mode='constant', value=0)
    return padded_tensor
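

# --- Batching helper (sketch) ---
# The functions above pad one image at a time. The helper below is an assumed
# extension (the name pad_and_stack is hypothetical) showing how
# pad_image_tensor can collate variable-width line images into one batch,
# e.g. as a DataLoader collate_fn.

def pad_and_stack(image_tensors: list) -> torch.Tensor:
    """
    Pads a list of (C, H, W) tensors to the widest image in the list and
    stacks them into a (B, C, H, max_W) batch. Assumes all tensors share the
    same channel count and height (e.g. after resize_image_for_ocr).
    """
    max_width = max(t.shape[-1] for t in image_tensors)
    padded = [pad_image_tensor(t, max_width) for t in image_tensors]
    return torch.stack(padded, dim=0)


if __name__ == "__main__":
    # Minimal smoke test on synthetic data; no image file is required.
    fake_a = torch.rand(1, 32, 100)
    fake_b = torch.rand(1, 32, 140)
    batch = pad_and_stack([fake_a, fake_b])
    print(batch.shape)  # torch.Size([2, 1, 32, 140])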