fisherman611 committed (verified)
Commit 324b9ef · Parent(s): 89ae6ce

Upload 3 files

models/can/can_dataloader.py ADDED
@@ -0,0 +1,529 @@
import os
import torch
from torch.utils.data import Dataset, DataLoader, ConcatDataset
import albumentations as A
from albumentations.pytorch import ToTensorV2  # ToTensorV2 lives in the albumentations.pytorch submodule
from PIL import Image
import pandas as pd
import cv2
import numpy as np
from collections import Counter

import json

with open("config.json", "r") as json_file:
    cfg = json.load(json_file)

CAN_CONFIG = cfg["can"]


# Global constants
INPUT_HEIGHT = CAN_CONFIG["input_height"]
INPUT_WIDTH = CAN_CONFIG["input_width"]
BASE_DIR = CAN_CONFIG["base_dir"]
BATCH_SIZE = CAN_CONFIG["batch_size"]
NUM_WORKERS = CAN_CONFIG["num_workers"]


def is_effectively_binary(img, threshold_percentage=0.9):
    dark_pixels = np.sum(img < 20)
    bright_pixels = np.sum(img > 235)
    total_pixels = img.size

    return (dark_pixels + bright_pixels) / total_pixels > threshold_percentage


def before_padding(image):
    # Apply Canny edge detector to find text edges
    edges = cv2.Canny(image, 50, 150)

    # Apply dilation to connect nearby edges
    kernel = np.ones((7, 13), np.uint8)
    dilated = cv2.dilate(edges, kernel, iterations=8)

    # Find connected components
    num_labels, labels, stats, centroids = cv2.connectedComponentsWithStats(
        dilated, connectivity=8
    )

    # Optimize crop rectangle using F1 score
    # Sort components by number of white pixels (excluding background which is label 0)
    sorted_components = sorted(
        range(1, num_labels), key=lambda i: stats[i, cv2.CC_STAT_AREA], reverse=True
    )

    # Initialize with empty crop
    best_f1 = 0
    best_crop = (0, 0, image.shape[1], image.shape[0])
    total_white_pixels = np.sum(dilated > 0)

    current_mask = np.zeros_like(dilated)
    x_min, y_min = image.shape[1], image.shape[0]
    x_max, y_max = 0, 0

    for component_idx in sorted_components:
        # Add this component to our mask
        component_mask = labels == component_idx
        current_mask = np.logical_or(current_mask, component_mask)

        # Update bounding box
        comp_y, comp_x = np.where(component_mask)
        if len(comp_x) > 0 and len(comp_y) > 0:
            x_min = min(x_min, np.min(comp_x))
            y_min = min(y_min, np.min(comp_y))
            x_max = max(x_max, np.max(comp_x))
            y_max = max(y_max, np.max(comp_y))

        # Calculate the current crop
        width = x_max - x_min + 1
        height = y_max - y_min + 1
        crop_area = width * height

        crop_mask = np.zeros_like(dilated)
        crop_mask[y_min : y_max + 1, x_min : x_max + 1] = 1
        white_in_crop = np.sum(np.logical_and(dilated > 0, crop_mask > 0))

        # Calculate F1 score
        precision = white_in_crop / crop_area
        recall = white_in_crop / total_white_pixels
        f1 = 2 * precision * recall / (precision + recall)

        if f1 > best_f1:
            best_f1 = f1
            best_crop = (x_min, y_min, x_max, y_max)

    # Apply the best crop to the original image
    x_min, y_min, x_max, y_max = best_crop
    cropped_image = image[y_min : y_max + 1, x_min : x_max + 1]

    # Apply Gaussian adaptive thresholding
    if is_effectively_binary(cropped_image):
        _, thresh = cv2.threshold(cropped_image, 127, 255, cv2.THRESH_BINARY)
    else:
        thresh = cv2.adaptiveThreshold(
            cropped_image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
        )

    # Ensure background is black
    white = np.sum(thresh == 255)
    black = np.sum(thresh == 0)
    if white > black:
        thresh = 255 - thresh

    # Clean up noise using median filter
    denoised = cv2.medianBlur(thresh, 3)
    for _ in range(3):
        denoised = cv2.medianBlur(denoised, 3)

    # Add padding
    result = cv2.copyMakeBorder(denoised, 5, 5, 5, 5, cv2.BORDER_CONSTANT, value=0)

    return result, best_crop


def process_img(filename, convert_to_rgb=False):
    """
    Load, binarize, ensure black background, resize, and apply padding

    Args:
        filename: Path to the image file
        convert_to_rgb: Whether to convert to RGB

    Returns:
        Processed image and crop information
    """
    image = cv2.imread(filename, cv2.IMREAD_GRAYSCALE)
    if image is None:
        raise ValueError(f"Could not read image file: {filename}")

    bin_img, best_crop = before_padding(image)
    h, w = bin_img.shape
    new_w = int((INPUT_HEIGHT / h) * w)

    if new_w > INPUT_WIDTH:
        resized_img = cv2.resize(
            bin_img, (INPUT_WIDTH, INPUT_HEIGHT), interpolation=cv2.INTER_AREA
        )
    else:
        resized_img = cv2.resize(
            bin_img, (new_w, INPUT_HEIGHT), interpolation=cv2.INTER_AREA
        )
        padded_img = np.zeros((INPUT_HEIGHT, INPUT_WIDTH), dtype=np.uint8)  # Black background
        x_offset = (INPUT_WIDTH - new_w) // 2
        padded_img[:, x_offset : x_offset + new_w] = resized_img
        resized_img = padded_img

    # Convert to BGR/RGB only if necessary
    if convert_to_rgb:
        resized_img = cv2.cvtColor(resized_img, cv2.COLOR_GRAY2BGR)

    return resized_img, best_crop


class HMERDatasetForCAN(Dataset):
    """
    Dataset integrated with the CAN model for HMER
    """

    def __init__(self, data_folder, label_file, vocab, transform=None, max_length=150):
        """
        Initialize the dataset

        data_folder: Directory containing images
        label_file: TSV file with two columns (filename, label), no header
        vocab: Vocabulary object for tokenization
        transform: Image transformations
        max_length: Maximum length of the token sequence
        """
        self.data_folder = data_folder
        self.max_length = max_length
        self.vocab = vocab

        # Read the label file
        df = pd.read_csv(label_file, sep="\t", header=None, names=["filename", "label"])

        # Check image file format
        if os.path.exists(data_folder):
            img_files = os.listdir(data_folder)
            if img_files:
                # Get the extension of the first file
                extension = os.path.splitext(img_files[0])[1]
                # Add extension to filenames if not present
                df["filename"] = df["filename"].apply(
                    lambda x: x if os.path.splitext(x)[1] else x + extension
                )

        self.annotations = dict(zip(df["filename"], df["label"]))
        self.image_paths = list(self.annotations.keys())

        # Default transformation
        if transform is None:
            transform = A.Compose(
                [
                    A.Normalize(
                        mean=[0.0], std=[1.0]
                    ),  # Normalize for single channel (grayscale)
                    ToTensorV2(),
                ]
            )
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        # Get image path and LaTeX expression
        image_path = self.image_paths[idx]
        latex = self.annotations[image_path]

        # Process image
        file_path = os.path.join(self.data_folder, image_path)
        processed_img, _ = process_img(
            file_path, convert_to_rgb=False
        )  # Keep image as grayscale

        # Convert to [C, H, W] format and normalize
        if self.transform:
            # Ensure image has the correct format for albumentations
            processed_img = np.expand_dims(processed_img, axis=-1)  # [H, W, 1]
            image = self.transform(image=processed_img)["image"]
        else:
            # If no transform, manually convert to tensor
            image = torch.from_numpy(processed_img).float() / 255.0
            image = image.unsqueeze(0)  # Add grayscale channel: [1, H, W]

        # Tokenize LaTeX expression
        tokens = self.vocab.tokenize(latex)

        # Add start and end tokens
        tokens = [self.vocab.start_token] + tokens + [self.vocab.end_token]

        # Truncate if exceeding max length
        if len(tokens) > self.max_length:
            tokens = tokens[: self.max_length]

        # Create counting vector for CAN
        count_vector = self.create_count_vector(tokens)

        # Store actual caption length
        caption_length = torch.LongTensor([len(tokens)])

        # Pad to max length
        if len(tokens) < self.max_length:
            tokens = tokens + [self.vocab.pad_token] * (self.max_length - len(tokens))

        # Convert to tensor
        caption = torch.LongTensor(tokens)

        return image, caption, caption_length, count_vector

    def create_count_vector(self, tokens):
        """
        Create counting vector for the CAN model

        Args:
            tokens: List of token IDs

        Returns:
            Tensor counting the occurrence of each symbol
        """
        # Count occurrences of each token
        counter = Counter(tokens)

        # Create counting vector with size equal to vocabulary size
        count_vector = torch.zeros(len(self.vocab))

        # Fill counting vector with counts
        for token_id, count in counter.items():
            if 0 <= token_id < len(count_vector):
                count_vector[token_id] = count

        return count_vector


class Vocabulary:
    """
    Advanced Vocabulary class for tokenization
    """

    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0

        # Add special tokens
        self.add_word("<pad>")  # Padding token
        self.add_word("<start>")  # Start token
        self.add_word("<end>")  # End token
        self.add_word("<unk>")  # Unknown token

        self.pad_token = self.word2idx["<pad>"]
        self.start_token = self.word2idx["<start>"]
        self.end_token = self.word2idx["<end>"]
        self.unk_token = self.word2idx["<unk>"]

    def add_word(self, word):
        if word not in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1

    def __len__(self):
        return len(self.word2idx)

    def tokenize(self, latex):
        """
        Tokenize LaTeX string into indices. Assumes tokens are space-separated.
        """
        tokens = []

        for char in latex.split():
            if char in self.word2idx:
                tokens.append(self.word2idx[char])
            else:
                tokens.append(self.unk_token)

        return tokens

    def build_vocab(self, label_file):
        """
        Build vocabulary from label file
        """
        try:
            df = pd.read_csv(
                label_file, sep="\t", header=None, names=["filename", "label"]
            )
            all_labels_text = " ".join(df["label"].astype(str).tolist())
            tokens = sorted(set(all_labels_text.split()))
            for char in tokens:
                self.add_word(char)
        except Exception as e:
            print(f"Error building vocabulary from {label_file}: {e}")

    def save_vocab(self, path):
        """
        Save vocabulary to file
        """
        data = {"word2idx": self.word2idx, "idx2word": self.idx2word, "idx": self.idx}
        torch.save(data, path)

    def load_vocab(self, path):
        """
        Load vocabulary from file
        """
        data = torch.load(path)
        self.word2idx = data["word2idx"]
        self.idx2word = data["idx2word"]
        self.idx = data["idx"]

        # Update special tokens
        self.pad_token = self.word2idx["<pad>"]
        self.start_token = self.word2idx["<start>"]
        self.end_token = self.word2idx["<end>"]
        self.unk_token = self.word2idx["<unk>"]


def build_unified_vocabulary(base_dir="data/CROHME"):
    """
    Build a unified vocabulary from all caption.txt files

    Args:
        base_dir: Root directory containing CROHME data

    Returns:
        Constructed Vocabulary object
    """
    vocab = Vocabulary()
    # Get all subdirectories
    subdirs = [
        d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))
    ]

    for subdir in subdirs:
        caption_path = os.path.join(base_dir, subdir, "caption.txt")
        if os.path.exists(caption_path):
            vocab.build_vocab(caption_path)
            print(f"Built vocabulary from {caption_path}")

    print(f"Final vocabulary size: {len(vocab)}")
    return vocab


def create_dataloaders_for_can(base_dir="data/CROHME", batch_size=32, num_workers=4):
    """
    Create dataloaders for training the CAN model

    Args:
        base_dir: Root directory containing CROHME data
        batch_size: Batch size
        num_workers: Number of workers for DataLoader

    Returns:
        train_loader, val_loader, test_loader, vocab
    """
    # Build unified vocabulary
    vocab = build_unified_vocabulary(base_dir)

    # Save vocabulary for later use
    os.makedirs("models", exist_ok=True)
    vocab.save_vocab("models/hmer_vocab.pth")

    # Create transform for grayscale data
    transform = A.Compose(
        [
            A.Normalize(
                mean=[0.0], std=[1.0]
            ),  # Normalize for single channel (grayscale)
            ToTensorV2(),
        ]
    )

    # Create datasets
    train_datasets = []

    # Use 'train' and possibly add other datasets to training set
    train_dirs = ["train", "2014"]  # Add other directories if desired
    for train_dir in train_dirs:
        data_folder = os.path.join(base_dir, train_dir, "img")
        label_file = os.path.join(base_dir, train_dir, "caption.txt")

        if os.path.exists(data_folder) and os.path.exists(label_file):
            train_datasets.append(
                HMERDatasetForCAN(
                    data_folder=data_folder,
                    label_file=label_file,
                    vocab=vocab,
                    transform=transform,
                )
            )

    # Combine training datasets
    if train_datasets:
        train_dataset = ConcatDataset(train_datasets)
    else:
        raise ValueError("No training datasets found")

    # Validation dataset
    val_data_folder = os.path.join(base_dir, "val", "img")
    val_label_file = os.path.join(base_dir, "val", "caption.txt")

    if not os.path.exists(val_data_folder) or not os.path.exists(val_label_file):
        # Use '2016' as validation set if 'val' is not available
        val_data_folder = os.path.join(base_dir, "2016", "img")
        val_label_file = os.path.join(base_dir, "2016", "caption.txt")

    val_dataset = HMERDatasetForCAN(
        data_folder=val_data_folder,
        label_file=val_label_file,
        vocab=vocab,
        transform=transform,
    )

    # Test dataset
    test_data_folder = os.path.join(base_dir, "test", "img")
    test_label_file = os.path.join(base_dir, "test", "caption.txt")

    if not os.path.exists(test_data_folder) or not os.path.exists(test_label_file):
        # Use '2019' as test set if 'test' is not available
        test_data_folder = os.path.join(base_dir, "2019", "img")
        test_label_file = os.path.join(base_dir, "2019", "caption.txt")

    test_dataset = HMERDatasetForCAN(
        data_folder=test_data_folder,
        label_file=test_label_file,
        vocab=vocab,
        transform=transform,
    )

    # Create dataloaders
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
        pin_memory=True,
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=True,
    )

    test_loader = DataLoader(
        test_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=True,
    )

    return train_loader, val_loader, test_loader, vocab


# Use functionality integrated with the CAN model
def main():
    # Create dataloader for the CAN model
    train_loader, val_loader, test_loader, vocab = create_dataloaders_for_can(
        base_dir=BASE_DIR, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS
    )

    # Print information
    print(f"Training samples: {len(train_loader.dataset)}")
    print(f"Validation samples: {len(val_loader.dataset)}")
    print(f"Test samples: {len(test_loader.dataset)}")

    # Check dataloader output
    for images, captions, lengths, count_vectors in train_loader:
        print(f"Image batch shape: {images.shape}")
        print(f"Caption batch shape: {captions.shape}")
        print(f"Lengths batch shape: {lengths.shape}")
        print(f"Count vectors batch shape: {count_vectors.shape}")
        break


if __name__ == "__main__":
    main()
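For reference, a minimal usage sketch of this module (a sketch only: it assumes the default "data/CROHME" layout used above and the "models/hmer_vocab.pth" file that create_dataloaders_for_can writes; the batch size and worker count are illustrative):

from models.can.can_dataloader import Vocabulary, create_dataloaders_for_can

# Build the loaders and vocabulary in one call (this also saves models/hmer_vocab.pth)
train_loader, val_loader, test_loader, vocab = create_dataloaders_for_can(
    base_dir="data/CROHME", batch_size=16, num_workers=2
)

# The saved vocabulary can be reloaded later without rebuilding it from caption.txt
vocab = Vocabulary()
vocab.load_vocab("models/hmer_vocab.pth")
token_ids = vocab.tokenize("\\frac { a } { b }")  # LaTeX tokens are space-separated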
models/can/can_eval.py ADDED
@@ -0,0 +1,423 @@
import os
import sys

sys.path.append(os.path.dirname(os.path.abspath(__file__)))

import torch
import pandas as pd
from PIL import Image
import cv2
import albumentations as A
from albumentations.pytorch import ToTensorV2
import matplotlib.pyplot as plt
import numpy as np
from tqdm.auto import tqdm
import json
import torch.nn.functional as F

from models.can.can import CAN, create_can_model
from models.can.can_dataloader import Vocabulary, process_img, INPUT_HEIGHT, INPUT_WIDTH

torch.serialization.add_safe_globals([Vocabulary])

os.environ['QT_QPA_PLATFORM'] = 'offscreen'

with open("config.json", "r") as json_file:
    cfg = json.load(json_file)

CAN_CONFIG = cfg["can"]


# Global constants here
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
MODE = CAN_CONFIG["mode"]  # 'single' or 'evaluate'
BACKBONE_TYPE = CAN_CONFIG["backbone_type"]
PRETRAINED_BACKBONE = CAN_CONFIG["pretrained_backbone"] == 1
CHECKPOINT_PATH = (f'checkpoints/{BACKBONE_TYPE}_can_best.pth'
                   if not PRETRAINED_BACKBONE
                   else f'checkpoints/p_{BACKBONE_TYPE}_can_best.pth')
IMAGE_PATH = f'{CAN_CONFIG["test_folder"]}/{CAN_CONFIG["relative_image_path"]}'
VISUALIZE = CAN_CONFIG["visualize"] == 1
TEST_FOLDER = CAN_CONFIG["test_folder"]
LABEL_FILE = CAN_CONFIG["label_file"]
CLASSIFIER = CAN_CONFIG["classifier"]  # choose between 'frac', 'sum_or_lim', 'long_expr', and 'all'


def filter_formula(formula_tokens, mode):
    if mode == "frac":
        return "\\frac" in formula_tokens
    elif mode == "sum_or_lim":
        return "\\sum" in formula_tokens or "\\limit" in formula_tokens
    elif mode == "long_expr":
        return len(formula_tokens) >= 30
    elif mode == 'short_expr':
        return len(formula_tokens) <= 10
    return True


def levenshtein_distance(lst1, lst2):
    """
    Calculate Levenshtein distance between two lists
    """
    m = len(lst1)
    n = len(lst2)

    prev_row = [j for j in range(n + 1)]
    curr_row = prev_row.copy()  # so an empty lst1 still yields a distance of n
    for i in range(1, m + 1):
        curr_row[0] = i

        for j in range(1, n + 1):
            if lst1[i - 1] == lst2[j - 1]:
                curr_row[j] = prev_row[j - 1]
            else:
                curr_row[j] = 1 + min(
                    curr_row[j - 1],  # insertion
                    prev_row[j],      # deletion
                    prev_row[j - 1]   # substitution
                )

        prev_row = curr_row.copy()
    return curr_row[n]


def load_checkpoint(checkpoint_path, device, pretrained_backbone=True, backbone='densenet'):
    """
    Load checkpoint and return model and vocabulary
    """
    checkpoint = torch.load(checkpoint_path,
                            map_location=device,
                            weights_only=False)

    vocab = checkpoint.get('vocab')
    if vocab is None:
        # Try to load vocab from a separate file if not in checkpoint
        vocab_path = os.path.join(os.path.dirname(checkpoint_path),
                                  'hmer_vocab.pth')
        if os.path.exists(vocab_path):
            vocab_data = torch.load(vocab_path)
            vocab = Vocabulary()
            vocab.word2idx = vocab_data['word2idx']
            vocab.idx2word = vocab_data['idx2word']
            vocab.idx = vocab_data['idx']
            # Update special tokens
            vocab.pad_token = vocab.word2idx['<pad>']
            vocab.start_token = vocab.word2idx['<start>']
            vocab.end_token = vocab.word2idx['<end>']
            vocab.unk_token = vocab.word2idx['<unk>']
        else:
            raise ValueError(
                f"Vocabulary not found in checkpoint and {vocab_path} does not exist"
            )

    # Initialize model with parameters from checkpoint
    hidden_size = checkpoint.get('hidden_size', 256)
    embedding_dim = checkpoint.get('embedding_dim', 256)
    use_coverage = checkpoint.get('use_coverage', True)

    model = create_can_model(num_classes=len(vocab),
                             hidden_size=hidden_size,
                             embedding_dim=embedding_dim,
                             use_coverage=use_coverage,
                             pretrained_backbone=pretrained_backbone,
                             backbone_type=backbone).to(device)

    model.load_state_dict(checkpoint['model'])
    print(f"Loaded model from checkpoint {checkpoint_path}")

    return model, vocab


def recognize_single_image(model,
                           image_path,
                           vocab,
                           device,
                           max_length=150,
                           visualize_attention=False):
    """
    Recognize handwritten mathematical expression from a single image using the CAN model
    """
    # Prepare image transform for grayscale images
    transform = A.Compose([
        A.Normalize(mean=[0.0], std=[1.0]),  # For grayscale
        ToTensorV2()
    ])

    # Load and transform image
    processed_img, best_crop = process_img(image_path, convert_to_rgb=False)

    # Ensure image has the correct format for albumentations
    processed_img = np.expand_dims(processed_img, axis=-1)  # [H, W, 1]
    image_tensor = transform(
        image=processed_img)['image'].unsqueeze(0).to(device)

    model.eval()
    with torch.no_grad():
        # Generate LaTeX using beam search
        predictions, attention_weights = model.recognize(
            image_tensor,
            max_length=max_length,
            start_token=vocab.start_token,
            end_token=vocab.end_token,
            beam_width=5  # Use beam search with width 5
        )

    # Convert indices to LaTeX tokens
    latex_tokens = []
    for idx in predictions:
        if idx == vocab.end_token:
            break
        if idx != vocab.start_token:  # Skip start token
            latex_tokens.append(vocab.idx2word[idx])

    # Join tokens to get complete LaTeX
    latex = ' '.join(latex_tokens)

    # Visualize attention if requested
    if visualize_attention and attention_weights is not None:
        visualize_attention_maps(processed_img, attention_weights,
                                 latex_tokens, best_crop)

    return latex


def visualize_attention_maps(orig_image,
                             attention_weights,
                             latex_tokens,
                             best_crop,
                             max_cols=4):
    """
    Visualize attention maps over the image for CAN model
    """
    # Create PIL image from numpy array (needed for crop() and .size below)
    orig_image = Image.fromarray(orig_image.squeeze(-1))
    orig_image = orig_image.crop(best_crop)
    orig_w, orig_h = orig_image.size
    ratio = INPUT_HEIGHT / INPUT_WIDTH

    num_tokens = len(latex_tokens)
    num_cols = min(max_cols, num_tokens)
    num_rows = int(np.ceil(num_tokens / num_cols))

    fig, axes = plt.subplots(num_rows,
                             num_cols,
                             figsize=(num_cols * 3, int(num_rows * 6 * orig_h / orig_w)))
    axes = np.array(axes).reshape(-1)

    for i, (token, attn) in enumerate(zip(latex_tokens, attention_weights)):
        ax = axes[i]

        attn = attn[0:1].squeeze(0)
        attn_len = attn.shape[0]
        attn_w = int(np.sqrt(attn_len / ratio))
        attn_h = int(np.sqrt(attn_len * ratio))

        # resize to (orig_h, interpolated_w)
        attn = attn.view(1, 1, attn_h, attn_w)
        interp_w = int(orig_h / ratio)

        attn = F.interpolate(attn, size=(orig_h, interp_w), mode='bilinear', align_corners=False)
        attn = attn.squeeze().cpu().numpy()

        # fix aspect ratio mismatch
        if interp_w > orig_w:
            # center crop width
            start = (interp_w - orig_w) // 2
            attn = attn[:, start:start + orig_w]
        elif interp_w < orig_w:
            # stretch to fit width
            attn = cv2.resize(attn, (orig_w, orig_h), interpolation=cv2.INTER_CUBIC)

        ax.imshow(orig_image)
        ax.imshow(attn, cmap='jet', alpha=0.4)
        ax.set_title(f'{token}', fontsize=10 * 8 * orig_h / orig_w)
        ax.axis('off')

    for j in range(i + 1, len(axes)):
        axes[j].axis('off')

    plt.tight_layout()
    plt.savefig('attention_maps_can.png', bbox_inches='tight', dpi=150)
    plt.close()


def evaluate_model(model,
                   test_folder,
                   label_file,
                   vocab,
                   device,
                   max_length=150,
                   batch_size=32):
    """
    Evaluate CAN model on test set
    """
    df = pd.read_csv(label_file,
                     sep='\t',
                     header=None,
                     names=['filename', 'label'])

    # Check image file format
    if os.path.exists(test_folder):
        img_files = os.listdir(test_folder)
        if img_files:
            # Get the extension of the first file
            extension = os.path.splitext(img_files[0])[1]
            # Add extension to filenames if not present
            df['filename'] = df['filename'].apply(
                lambda x: x if os.path.splitext(x)[1] else x + extension)

    annotations = dict(zip(df['filename'], df['label']))

    model.eval()

    correct = 0
    err1 = 0
    err2 = 0
    err3 = 0
    total = 0

    transform = A.Compose([
        A.Normalize(mean=[0.0], std=[1.0]),  # For grayscale
        ToTensorV2()
    ])

    results = {}

    for image_path, gt_latex in tqdm(annotations.items(), desc="Evaluating"):
        gt_latex: str = gt_latex
        if not filter_formula(gt_latex.split(), CLASSIFIER):
            continue
        file_path = os.path.join(test_folder, image_path)

        try:
            processed_img, _ = process_img(file_path, convert_to_rgb=False)

            # Ensure image has the correct format for albumentations
            processed_img = np.expand_dims(processed_img, axis=-1)  # [H, W, 1]
            image_tensor = transform(
                image=processed_img)['image'].unsqueeze(0).to(device)

            with torch.no_grad():
                predictions, _ = model.recognize(
                    image_tensor,
                    max_length=max_length,
                    start_token=vocab.start_token,
                    end_token=vocab.end_token,
                    beam_width=5  # Use beam search
                )

            # Convert indices to LaTeX tokens
            pred_latex_tokens = []
            for idx in predictions:
                if idx == vocab.end_token:
                    break
                if idx != vocab.start_token:  # Skip start token
                    pred_latex_tokens.append(vocab.idx2word[idx])

            pred_latex = ' '.join(pred_latex_tokens)

            gt_latex_tokens = gt_latex.split()
            edit_distance = levenshtein_distance(pred_latex_tokens,
                                                 gt_latex_tokens)

            if edit_distance == 0:
                correct += 1
            elif edit_distance == 1:
                err1 += 1
            elif edit_distance == 2:
                err2 += 1
            elif edit_distance == 3:
                err3 += 1

            total += 1

            # Save result
            results[image_path] = {
                'ground_truth': gt_latex,
                'prediction': pred_latex,
                'edit_distance': edit_distance
            }
        except Exception as e:
            print(f"Error processing {image_path}: {e}")

    # Calculate accuracy metrics
    exprate = round(correct / total, 4) if total > 0 else 0
    exprate_leq1 = round((correct + err1) / total, 4) if total > 0 else 0
    exprate_leq2 = round(
        (correct + err1 + err2) / total, 4) if total > 0 else 0
    exprate_leq3 = round(
        (correct + err1 + err2 + err3) / total, 4) if total > 0 else 0

    print(f"Exact match rate: {exprate:.4f}")
    print(f"Edit distance ≤ 1: {exprate_leq1:.4f}")
    print(f"Edit distance ≤ 2: {exprate_leq2:.4f}")
    print(f"Edit distance ≤ 3: {exprate_leq3:.4f}")

    # Save results to file
    with open('evaluation_results_can.json', 'w', encoding='utf-8') as f:
        json.dump(
            {
                'accuracy': {
                    'exprate': exprate,
                    'exprate_leq1': exprate_leq1,
                    'exprate_leq2': exprate_leq2,
                    'exprate_leq3': exprate_leq3
                },
                'results': results
            },
            f,
            indent=4)

    return {
        'exprate': exprate,
        'exprate_leq1': exprate_leq1,
        'exprate_leq2': exprate_leq2,
        'exprate_leq3': exprate_leq3
    }, results


def main(mode):
    device = DEVICE
    print(f'Using device: {device}')

    checkpoint_path = CHECKPOINT_PATH
    backbone = BACKBONE_TYPE
    pretrained_backbone = PRETRAINED_BACKBONE

    # For single mode
    image_path = IMAGE_PATH
    visualize = VISUALIZE

    # For evaluation mode
    test_folder = TEST_FOLDER
    label_file = LABEL_FILE

    # Load model and vocabulary
    model, vocab = load_checkpoint(checkpoint_path, device,
                                   pretrained_backbone=pretrained_backbone,
                                   backbone=backbone)

    if mode == 'single':
        if image_path is None:
            raise ValueError('Image path is required for single mode')

        latex = recognize_single_image(model,
                                       image_path,
                                       vocab,
                                       device,
                                       visualize_attention=visualize)
        print(f'Recognized LaTeX: {latex}')

    elif mode == 'evaluate':
        if test_folder is None or label_file is None:
            raise ValueError(
                'Test folder and annotation file are required for evaluate mode'
            )

        metrics, results = evaluate_model(model, test_folder, label_file,
                                          vocab, device)
        print(f"##### Score of {CLASSIFIER} expression type: #####")
        print(f'Evaluation metrics: {metrics}')


if __name__ == '__main__':
    # Ensure Vocabulary is safe for serialization
    torch.serialization.add_safe_globals([Vocabulary])

    # Run the main function
    main(MODE)
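A small illustrative check of the two helpers defined at the top of this file (assuming the repository root is on the Python path so models.can.can_eval is importable; the token strings are made-up examples):

from models.can.can_eval import levenshtein_distance, filter_formula

pred = "x ^ { 2 } + 1".split()
gt = "x ^ { 2 } + y".split()

print(levenshtein_distance(pred, gt))    # 1 -> counted in the "edit distance <= 1" bucket
print(filter_formula(gt, "frac"))        # False: no \frac token in the expression
print(filter_formula(gt, "short_expr"))  # True: 7 tokens, i.e. <= 10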
models/can/can_trainer.py ADDED
@@ -0,0 +1,336 @@
import os
import sys

sys.path.append(os.path.dirname(os.path.abspath(__file__)))

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data import DataLoader
import time
import wandb
from datetime import datetime
from tqdm.auto import tqdm

from models.can.can import CAN, create_can_model
from models.can.can_dataloader import create_dataloaders_for_can, Vocabulary

import albumentations as A
from albumentations.pytorch import ToTensorV2
import cv2
import random

import json

with open("config.json", "r") as json_file:
    cfg = json.load(json_file)

CAN_CONFIG = cfg["can"]


# Global constants
BASE_DIR = CAN_CONFIG["base_dir"]
SEED = CAN_CONFIG["seed"]
CHECKPOINT_DIR = CAN_CONFIG["checkpoint_dir"]
PRETRAINED_BACKBONE = CAN_CONFIG["pretrained_backbone"] == 1
BACKBONE_TYPE = CAN_CONFIG["backbone_type"]
CHECKPOINT_NAME = (f'{BACKBONE_TYPE}_can_best.pth'
                   if not PRETRAINED_BACKBONE
                   else f'p_{BACKBONE_TYPE}_can_best.pth')
BATCH_SIZE = CAN_CONFIG["batch_size"]

HIDDEN_SIZE = CAN_CONFIG["hidden_size"]
EMBEDDING_DIM = CAN_CONFIG["embedding_dim"]
USE_COVERAGE = CAN_CONFIG["use_coverage"] == 1
LAMBDA_COUNT = CAN_CONFIG["lambda_count"]

LR = CAN_CONFIG["lr"]
EPOCHS = CAN_CONFIG["epochs"]
GRAD_CLIP = CAN_CONFIG["grad_clip"]
PRINT_FREQ = CAN_CONFIG["print_freq"]

T = CAN_CONFIG["t"]
T_MULT = CAN_CONFIG["t_mult"]

PROJECT_NAME = (f'final-hmer-can-{BACKBONE_TYPE}-pretrained'
                if PRETRAINED_BACKBONE
                else f'final-hmer-can-{BACKBONE_TYPE}')
NUM_WORKERS = cfg["can"]["num_workers"]
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class RandomMorphology(A.ImageOnlyTransform):

    def __init__(self, p=0.5, kernel_size=3):
        super(RandomMorphology, self).__init__(p=p)  # pass p by keyword so it is not taken as always_apply
        self.kernel_size = kernel_size

    def apply(self, img, **params):
        op = random.choice(['erode', 'dilate'])
        kernel = np.ones((self.kernel_size, self.kernel_size), np.uint8)
        if op == 'erode':
            return cv2.erode(img, kernel, iterations=1)
        else:
            return cv2.dilate(img, kernel, iterations=1)


# Custom transforms for CAN model (grayscale images)
train_transforms = A.Compose([
    A.Rotate(limit=5, p=0.25, border_mode=cv2.BORDER_REPLICATE),
    A.ElasticTransform(alpha=100,
                       sigma=7,
                       p=0.5,
                       interpolation=cv2.INTER_CUBIC),
    RandomMorphology(p=0.5, kernel_size=2),
    A.Normalize(mean=[0.0], std=[1.0]),  # For grayscale
    ToTensorV2()
])


def train_epoch(model,
                train_loader,
                optimizer,
                device,
                grad_clip=5.0,
                lambda_count=0.01,
                print_freq=10):
    """
    Train the model for one epoch
    """
    model.train()
    total_loss = 0.0
    total_cls_loss = 0.0
    total_count_loss = 0.0
    batch_count = 0

    for i, (images, captions, caption_lengths,
            count_targets) in tqdm(enumerate(train_loader),
                                   total=len(train_loader)):
        batch_count += 1
        images = images.to(device)
        captions = captions.to(device)
        count_targets = count_targets.to(device)

        # Forward pass
        outputs, count_vectors = model(images,
                                       captions,
                                       teacher_forcing_ratio=0.5)

        # Calculate loss
        loss, cls_loss, counting_loss = model.calculate_loss(
            outputs=outputs,
            targets=captions,
            count_vectors=count_vectors,
            count_targets=count_targets,
            lambda_count=lambda_count)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()

        # Clip gradients
        if grad_clip:
            nn.utils.clip_grad_norm_(model.parameters(), grad_clip)

        # Update weights
        optimizer.step()

        # Track losses
        total_loss += loss.item()
        total_cls_loss += cls_loss.item()
        total_count_loss += counting_loss.item()

        # Print progress
        if i % print_freq == 0 and i > 0:
            print(
                f'Batch {i}/{len(train_loader)}, Loss: {loss.item():.4f}, '
                f'Cls Loss: {cls_loss.item():.4f}, Count Loss: {counting_loss.item():.4f}'
            )

    return total_loss / batch_count, total_cls_loss / batch_count, total_count_loss / batch_count


def validate(model, val_loader, device, lambda_count=0.01):
    """
    Validate the model
    """
    model.eval()
    total_loss = 0.0
    total_cls_loss = 0.0
    total_count_loss = 0.0
    batch_count = 0

    with torch.no_grad():
        for i, (images, captions, caption_lengths,
                count_targets) in tqdm(enumerate(val_loader),
                                       total=len(val_loader)):
            batch_count += 1
            images = images.to(device)
            captions = captions.to(device)
            count_targets = count_targets.to(device)

            # Forward pass
            outputs, count_vectors = model(
                images, captions,
                teacher_forcing_ratio=0.0)  # No teacher forcing in validation

            # Calculate loss
            loss, cls_loss, counting_loss = model.calculate_loss(
                outputs=outputs,
                targets=captions,
                count_vectors=count_vectors,
                count_targets=count_targets,
                lambda_count=lambda_count)

            # Track losses
            total_loss += loss.item()
            total_cls_loss += cls_loss.item()
            total_count_loss += counting_loss.item()

    return total_loss / batch_count, total_cls_loss / batch_count, total_count_loss / batch_count


def main():
    # Configuration
    dataset_dir = BASE_DIR
    seed = SEED
    checkpoints_dir = CHECKPOINT_DIR
    checkpoint_name = CHECKPOINT_NAME
    batch_size = BATCH_SIZE

    # Model parameters
    hidden_size = HIDDEN_SIZE
    embedding_dim = EMBEDDING_DIM
    use_coverage = USE_COVERAGE
    lambda_count = LAMBDA_COUNT

    # Training parameters
    lr = LR
    epochs = EPOCHS
    grad_clip = GRAD_CLIP
    print_freq = PRINT_FREQ

    # Scheduler parameters
    T_0 = T
    T_mult = T_MULT

    # Set random seeds
    torch.manual_seed(seed)
    np.random.seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)

    # Create checkpoint directory
    os.makedirs(checkpoints_dir, exist_ok=True)

    # Set device
    device = DEVICE
    print(f'Using device: {device}')

    # Create dataloaders
    train_loader, val_loader, test_loader, vocab = create_dataloaders_for_can(
        base_dir=dataset_dir, batch_size=batch_size, num_workers=NUM_WORKERS)

    print(f"Training samples: {len(train_loader.dataset)}")
    print(f"Validation samples: {len(val_loader.dataset)}")
    print(f"Test samples: {len(test_loader.dataset)}")
    print(f"Vocabulary size: {len(vocab)}")

    # Create model
    model = create_can_model(num_classes=len(vocab),
                             hidden_size=hidden_size,
                             embedding_dim=embedding_dim,
                             use_coverage=use_coverage,
                             pretrained_backbone=PRETRAINED_BACKBONE,
                             backbone_type=BACKBONE_TYPE).to(device)

    # Create optimizer
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Create learning rate scheduler
    scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer,
                                                               T_0=T_0,
                                                               T_mult=T_mult)

    # Initialize wandb
    run_name = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    wandb.init(project=PROJECT_NAME,
               name=run_name,
               config={
                   'seed': seed,
                   'batch_size': batch_size,
                   'hidden_size': hidden_size,
                   'embedding_dim': embedding_dim,
                   'use_coverage': use_coverage,
                   'lambda_count': lambda_count,
                   'lr': lr,
                   'epochs': epochs,
                   'grad_clip': grad_clip,
                   'T_0': T_0,
                   'T_mult': T_mult
               })

    # Training loop
    best_val_loss = float('inf')

    for epoch in tqdm(range(epochs)):
        curr_lr = scheduler.get_last_lr()[0]
        print(f'Epoch {epoch+1:03}/{epochs:03}')
        t1 = time.time()

        # Train
        train_loss, train_cls_loss, train_count_loss = train_epoch(
            model=model,
            train_loader=train_loader,
            optimizer=optimizer,
            device=device,
            grad_clip=grad_clip,
            lambda_count=lambda_count,
            print_freq=print_freq)

        # Validate
        val_loss, val_cls_loss, val_count_loss = validate(
            model=model,
            val_loader=val_loader,
            device=device,
            lambda_count=lambda_count)

        # Update learning rate
        scheduler.step()
        t2 = time.time()

        # Print stats
        print(
            f'Train - Total Loss: {train_loss:.4f}, Class Loss: {train_cls_loss:.4f}, Count Loss: {train_count_loss:.4f}'
        )
        print(
            f'Val - Total Loss: {val_loss:.4f}, Class Loss: {val_cls_loss:.4f}, Count Loss: {val_count_loss:.4f}'
        )
        print(f'Time: {t2 - t1:.2f}s, Learning Rate: {curr_lr:.6f}')

        # Log metrics to wandb
        wandb.log({
            'train_loss': train_loss,
            'train_cls_loss': train_cls_loss,
            'train_count_loss': train_count_loss,
            'val_loss': val_loss,
            'val_cls_loss': val_cls_loss,
            'val_count_loss': val_count_loss,
            'learning_rate': curr_lr,
            'epoch': epoch
        })

        # Save checkpoint
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            checkpoint = {
                'epoch': epoch,
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'val_loss': val_loss,
                'vocab': vocab
            }
            torch.save(checkpoint, os.path.join(checkpoints_dir,
                                                checkpoint_name))
            print('Model saved!')

    print('Training completed!')


if __name__ == "__main__":
    main()
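All three scripts read their settings from the "can" section of config.json. The sketch below lists the keys they access; the values are illustrative placeholders (not the repository's actual configuration) and simply show one way to generate a compatible file:

import json

# Placeholder values only; every key below is read by the scripts above.
cfg = {
    "can": {
        "input_height": 128, "input_width": 1024,
        "base_dir": "data/CROHME", "batch_size": 32, "num_workers": 4,
        "seed": 42, "checkpoint_dir": "checkpoints",
        "backbone_type": "densenet", "pretrained_backbone": 1,
        "hidden_size": 256, "embedding_dim": 256, "use_coverage": 1,
        "lambda_count": 0.01, "lr": 0.0001, "epochs": 100, "grad_clip": 5.0,
        "print_freq": 10, "t": 10, "t_mult": 2,
        "mode": "evaluate", "visualize": 0, "classifier": "all",
        "test_folder": "data/CROHME/test/img",
        "label_file": "data/CROHME/test/caption.txt",
        "relative_image_path": "example.png"
    }
}

with open("config.json", "w") as f:
    json.dump(cfg, f, indent=4)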