|
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models


"""Custom DenseNet Backbone"""
|
class DenseBlock(nn.Module):
    """
    DenseNet dense block: each layer receives the concatenation of all
    preceding feature maps and contributes growth_rate new channels
    (BN -> ReLU -> 1x1 bottleneck conv -> BN -> ReLU -> 3x3 conv).
    """
    def __init__(self, in_channels, growth_rate, num_layers):
        super(DenseBlock, self).__init__()
        self.layers = nn.ModuleList()
        for i in range(num_layers):
            self.layers.append(self._make_layer(in_channels + i * growth_rate, growth_rate))

    def _make_layer(self, in_channels, growth_rate):
        layer = nn.Sequential(
            nn.BatchNorm2d(in_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels, 4 * growth_rate, kernel_size=1, bias=False),
            nn.BatchNorm2d(4 * growth_rate),
            nn.ReLU(inplace=True),
            nn.Conv2d(4 * growth_rate, growth_rate, kernel_size=3, padding=1, bias=False)
        )
        return layer

    def forward(self, x):
        features = [x]
        for layer in self.layers:
            new_feature = layer(torch.cat(features, dim=1))
            features.append(new_feature)
        return torch.cat(features, dim=1)
|
|
|
|
|
class TransitionLayer(nn.Module):
    """
    Transition layer between DenseBlocks: a 1x1 conv to reduce channels,
    followed by 2x2 average pooling to halve spatial resolution
    """
    def __init__(self, in_channels, out_channels):
        super(TransitionLayer, self).__init__()
        self.transition = nn.Sequential(
            nn.BatchNorm2d(in_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False),
            nn.AvgPool2d(kernel_size=2, stride=2)
        )

    def forward(self, x):
        return self.transition(x)
|
|
|
|
|
class DenseNetBackbone(nn.Module):
    """
    DenseNet backbone for CAN. The default configuration mirrors
    DenseNet-121 (blocks of 6/12/24/16 layers, growth rate 32),
    adapted to single-channel (grayscale) input.
    """
    def __init__(self, growth_rate=32, block_config=(6, 12, 24, 16), num_init_features=64):
        super(DenseNetBackbone, self).__init__()

        # Stem: 7x7 stride-2 conv + 3x3 stride-2 max pool (overall /4 downsampling)
        self.features = nn.Sequential(
            nn.Conv2d(1, num_init_features, kernel_size=7, stride=2, padding=3, bias=False),
            nn.BatchNorm2d(num_init_features),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        )

        # Alternate dense blocks and transition layers
        num_features = num_init_features
        for i, num_layers in enumerate(block_config):
            block = DenseBlock(num_features, growth_rate, num_layers)
            self.features.add_module(f'denseblock{i+1}', block)
            num_features = num_features + growth_rate * num_layers
            if i != len(block_config) - 1:
                trans = TransitionLayer(num_features, num_features // 2)
                self.features.add_module(f'transition{i+1}', trans)
                num_features = num_features // 2

        self.features.add_module('norm5', nn.BatchNorm2d(num_features))
        self.features.add_module('relu5', nn.ReLU(inplace=True))

        self.out_channels = num_features

    def forward(self, x):
        return self.features(x)
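

# Shape sanity check (an illustrative sketch, not part of the model): with the
# default DenseNet-121-style configuration, the backbone downsamples by 32x
# overall (stem /4, three transitions /8) and emits 1024 channels.
def _demo_densenet_backbone():
    backbone = DenseNetBackbone()
    x = torch.randn(1, 1, 64, 256)  # dummy grayscale formula image
    feats = backbone(x)
    assert feats.shape == (1, backbone.out_channels, 2, 8)
    return feats.shape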
|
|
|
|
|
"""Pretrained DenseNet""" |
|
class DenseNetFeatureExtractor(nn.Module): |
|
def __init__(self, densenet_model, out_channels=684): |
|
super().__init__() |
|
|
|
self.conv0 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False) |
|
|
|
self.conv0.weight.data = densenet_model.features.conv0.weight.data.mean(dim=1, keepdim=True) |
|
self.features = densenet_model.features |
|
self.out_channels = out_channels |
|
|
|
self.final_conv = nn.Conv2d(1024, out_channels, kernel_size=1) |
|
self.final_bn = nn.BatchNorm2d(out_channels) |
|
self.final_relu = nn.ReLU(inplace=True) |
|
|
|
def forward(self, x): |
|
x = self.conv0(x) |
|
x = self.features.norm0(x) |
|
x = self.features.relu0(x) |
|
x = self.features.pool0(x) |
|
x = self.features.denseblock1(x) |
|
x = self.features.transition1(x) |
|
x = self.features.denseblock2(x) |
|
x = self.features.transition2(x) |
|
x = self.features.denseblock3(x) |
|
x = self.features.transition3(x) |
|
x = self.features.denseblock4(x) |
|
x = self.features.norm5(x) |
|
x = self.final_conv(x) |
|
x = self.final_bn(x) |
|
x = self.final_relu(x) |
|
return x |
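

# Construction sketch (assumes torchvision >= 0.13 for the `weights` API):
# wrap a pretrained DenseNet-121 so it accepts grayscale input.
def _demo_pretrained_densenet_extractor():
    densenet = models.densenet121(weights=models.DenseNet121_Weights.DEFAULT)
    extractor = DenseNetFeatureExtractor(densenet, out_channels=684)
    x = torch.randn(1, 1, 64, 256)
    return extractor(x).shape  # (1, 684, 2, 8)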
|
|
|
|
|
"""Custom ResNet Backbone""" |
|
class BasicBlock(nn.Module): |
|
""" |
|
Basic ResNet block |
|
""" |
|
expansion = 1 |
|
|
|
def __init__(self, in_channels, out_channels, stride=1): |
|
super(BasicBlock, self).__init__() |
|
self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False) |
|
self.bn1 = nn.BatchNorm2d(out_channels) |
|
self.relu = nn.ReLU(inplace=True) |
|
self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False) |
|
self.bn2 = nn.BatchNorm2d(out_channels) |
|
|
|
self.shortcut = nn.Sequential() |
|
if stride != 1 or in_channels != out_channels * self.expansion: |
|
self.shortcut = nn.Sequential( |
|
nn.Conv2d(in_channels, out_channels * self.expansion, kernel_size=1, stride=stride, bias=False), |
|
nn.BatchNorm2d(out_channels * self.expansion) |
|
) |
|
|
|
def forward(self, x): |
|
identity = x |
|
|
|
out = self.conv1(x) |
|
out = self.bn1(out) |
|
out = self.relu(out) |
|
|
|
out = self.conv2(out) |
|
out = self.bn2(out) |
|
|
|
out += self.shortcut(identity) |
|
out = self.relu(out) |
|
|
|
return out |
|
|
|
|
|
class Bottleneck(nn.Module):
    """
    Bottleneck ResNet block
    """
    expansion = 4

    def __init__(self, in_channels, out_channels, stride=1):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.conv3 = nn.Conv2d(out_channels, out_channels * self.expansion, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(out_channels * self.expansion)
        self.relu = nn.ReLU(inplace=True)

        self.shortcut = nn.Sequential()
        if stride != 1 or in_channels != out_channels * self.expansion:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels * self.expansion, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels * self.expansion)
            )

    def forward(self, x):
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        out += self.shortcut(identity)
        out = self.relu(out)

        return out
|
|
|
|
|
class ResNetBackbone(nn.Module):
    """
    ResNet backbone for the CAN model; a final 1x1 projection matches the
    684-channel feature width used by CAN's DenseNet feature extractor
    """
    def __init__(self, block_type='bottleneck', layers=(3, 4, 6, 3), num_init_features=64):
        super(ResNetBackbone, self).__init__()

        # Stem for grayscale input
        self.conv1 = nn.Conv2d(1, num_init_features, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(num_init_features)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        if block_type == 'basic':
            block = BasicBlock
            expansion = 1
        elif block_type == 'bottleneck':
            block = Bottleneck
            expansion = 4
        else:
            raise ValueError(f"Unknown block type: {block_type}")

        self.layer1 = self._make_layer(block, num_init_features, 64, layers[0], stride=1)
        self.layer2 = self._make_layer(block, 64 * expansion, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 128 * expansion, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 256 * expansion, 512, layers[3], stride=2)

        # 1x1 projection so the output width matches the DenseNet extractor
        self.final_conv = nn.Conv2d(512 * expansion, 684, kernel_size=1)
        self.final_bn = nn.BatchNorm2d(684)
        self.final_relu = nn.ReLU(inplace=True)

        self.out_channels = 684

        self._initialize_weights()

    def _make_layer(self, block, in_channels, out_channels, num_blocks, stride):
        layers = []
        layers.append(block(in_channels, out_channels, stride))
        for _ in range(1, num_blocks):
            layers.append(block(out_channels * block.expansion, out_channels))
        return nn.Sequential(*layers)

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.final_conv(x)
        x = self.final_bn(x)
        x = self.final_relu(x)

        return x
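

# Shape sanity check (an illustrative sketch): like the DenseNet backbone, the
# ResNet-50-style backbone downsamples by 32x and projects to 684 channels.
def _demo_resnet_backbone():
    backbone = ResNetBackbone(block_type='bottleneck', layers=(3, 4, 6, 3))
    x = torch.randn(1, 1, 64, 256)
    feats = backbone(x)
    assert feats.shape == (1, 684, 2, 8)
    return feats.shape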
|
|
|
|
|
|
|
"""Pretrained ResNet""" |
|
class ResNetFeatureExtractor(nn.Module): |
|
def __init__(self, resnet_model, out_channels=684): |
|
super().__init__() |
|
|
|
self.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False) |
|
self.conv1.weight.data = resnet_model.conv1.weight.data.sum(dim=1, keepdim=True) |
|
self.bn1 = resnet_model.bn1 |
|
self.relu = resnet_model.relu |
|
self.maxpool = resnet_model.maxpool |
|
self.layer1 = resnet_model.layer1 |
|
self.layer2 = resnet_model.layer2 |
|
self.layer3 = resnet_model.layer3 |
|
self.layer4 = resnet_model.layer4 |
|
|
|
self.final_conv = nn.Conv2d(2048, out_channels, kernel_size=1) |
|
self.final_bn = nn.BatchNorm2d(out_channels) |
|
self.final_relu = nn.ReLU(inplace=True) |
|
self.out_channels = out_channels |
|
|
|
def forward(self, x): |
|
x = self.conv1(x) |
|
x = self.bn1(x) |
|
x = self.relu(x) |
|
x = self.maxpool(x) |
|
x = self.layer1(x) |
|
x = self.layer2(x) |
|
x = self.layer3(x) |
|
x = self.layer4(x) |
|
x = self.final_conv(x) |
|
x = self.final_bn(x) |
|
x = self.final_relu(x) |
|
return x |
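

# Why summing the RGB kernels works (a minimal numerical check; assumes
# torchvision >= 0.13): a grayscale image through the summed-weight conv
# matches the same image replicated to three channels through the original conv.
def _demo_grayscale_weight_sum():
    resnet = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
    extractor = ResNetFeatureExtractor(resnet)
    x = torch.randn(1, 1, 64, 64)
    with torch.no_grad():
        gray_out = extractor.conv1(x)
        rgb_out = resnet.conv1(x.repeat(1, 3, 1, 1))
    assert torch.allclose(gray_out, rgb_out, atol=1e-4)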
|
|
|
|
|
"""Channel Attention""" |
|
class ChannelAttention(nn.Module): |
|
""" |
|
Channel-wise attention mechanism |
|
""" |
|
def __init__(self, in_channels, ratio=16): |
|
super(ChannelAttention, self).__init__() |
|
self.avg_pool = nn.AdaptiveAvgPool2d(1) |
|
self.max_pool = nn.AdaptiveMaxPool2d(1) |
|
|
|
self.fc = nn.Sequential( |
|
nn.Conv2d(in_channels, in_channels // ratio, kernel_size=1, bias=False), |
|
nn.ReLU(inplace=True), |
|
nn.Conv2d(in_channels // ratio, in_channels, kernel_size=1, bias=False) |
|
) |
|
self.sigmoid = nn.Sigmoid() |
|
|
|
def forward(self, x): |
|
avg_out = self.fc(self.avg_pool(x)) |
|
max_out = self.fc(self.max_pool(x)) |
|
out = avg_out + max_out |
|
return self.sigmoid(out) |
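

# Usage sketch: the module returns a (B, C, 1, 1) gate in [0, 1] that is
# multiplied element-wise (with broadcasting) into the input feature map.
def _demo_channel_attention():
    attn = ChannelAttention(256)
    x = torch.randn(2, 256, 4, 16)
    gate = attn(x)
    assert gate.shape == (2, 256, 1, 1)
    return (x * gate).shape  # broadcasts back to (2, 256, 4, 16)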
|
|
|
|
|
"""Multi-scale Couting Module""" |
|
class MSCM(nn.Module): |
|
""" |
|
Multi-Scale Counting Module |
|
""" |
|
def __init__(self, in_channels, num_classes): |
|
super(MSCM, self).__init__() |
|
|
|
|
|
self.branch1 = nn.Sequential( |
|
nn.Conv2d(in_channels, 256, kernel_size=3, padding=1), |
|
nn.ReLU(inplace=True), |
|
nn.Dropout2d(p=0.2) |
|
) |
|
self.attention1 = ChannelAttention(256) |
|
|
|
|
|
self.branch2 = nn.Sequential( |
|
nn.Conv2d(in_channels, 256, kernel_size=5, padding=2), |
|
nn.ReLU(inplace=True), |
|
nn.Dropout2d(p=0.2) |
|
) |
|
self.attention2 = ChannelAttention(256) |
|
|
|
|
|
self.conv_reduce = nn.Conv2d(512, num_classes, kernel_size=1) |
|
self.sigmoid = nn.Sigmoid() |
|
|
|
def forward(self, x): |
|
|
|
out1 = self.branch1(x) |
|
out1 = out1 * self.attention1(out1) |
|
|
|
|
|
out2 = self.branch2(x) |
|
out2 = out2 * self.attention2(out2) |
|
|
|
|
|
concat_features = torch.cat([out1, out2], dim=1) |
|
|
|
|
|
count_map = self.sigmoid(self.conv_reduce(concat_features)) |
|
|
|
|
|
|
|
count_vector = torch.sum(count_map, dim=(2, 3)) |
|
|
|
return count_map, count_vector |
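

# Shape sketch: for features of (B, C, H', W'), MSCM returns a density map of
# (B, num_classes, H', W') and a count vector of (B, num_classes). The
# vocabulary size 111 here is an arbitrary example value.
def _demo_mscm():
    mscm = MSCM(in_channels=684, num_classes=111)
    feats = torch.randn(2, 684, 2, 8)
    count_map, count_vector = mscm(feats)
    assert count_map.shape == (2, 111, 2, 8)
    assert count_vector.shape == (2, 111)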
|
|
|
|
|
"""Positional Encoding""" |
|
class PositionalEncoding(nn.Module): |
|
""" |
|
Positional encoding for attention decoder |
|
""" |
|
def __init__(self, d_model, max_seq_len=1024): |
|
super(PositionalEncoding, self).__init__() |
|
self.d_model = d_model |
|
|
|
|
|
pe = torch.zeros(max_seq_len, d_model) |
|
position = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1) |
|
div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) |
|
|
|
pe[:, 0::2] = torch.sin(position * div_term) |
|
pe[:, 1::2] = torch.cos(position * div_term) |
|
self.register_buffer('pe', pe) |
|
|
|
def forward(self, x): |
|
|
|
b, h, w, _ = x.shape |
|
|
|
|
|
if h*w > self.pe.size(0): |
|
|
|
device = self.pe.device |
|
extended_pe = torch.zeros(h*w, self.d_model, device=device) |
|
position = torch.arange(0, h*w, dtype=torch.float, device=device).unsqueeze(1) |
|
div_term = torch.exp(torch.arange(0, self.d_model, 2, device=device).float() * (-math.log(10000.0) / self.d_model)) |
|
|
|
extended_pe[:, 0::2] = torch.sin(position * div_term) |
|
extended_pe[:, 1::2] = torch.cos(position * div_term) |
|
|
|
pos_encoding = extended_pe.view(h, w, -1) |
|
else: |
|
|
|
pos_encoding = self.pe[:h*w].view(h, w, -1) |
|
|
|
pos_encoding = pos_encoding.unsqueeze(0).expand(b, -1, -1, -1) |
|
return pos_encoding |
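

# Worked example: the encoding depends only on the flattened position index,
# so every sample in the batch receives the same table.
def _demo_positional_encoding():
    pe = PositionalEncoding(d_model=512)
    x = torch.randn(2, 2, 8, 512)
    pos = pe(x)
    assert pos.shape == (2, 2, 8, 512)
    assert torch.allclose(pos[0], pos[1])  # identical across the batch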
|
|
|
|
|
"""Counting-combined Attentional Decoder""" |
|
class CCAD(nn.Module): |
|
""" |
|
Counting-Combined Attentional Decoder |
|
""" |
|
def __init__(self, input_channels, hidden_size, embedding_dim, num_classes, use_coverage=True): |
|
super(CCAD, self).__init__() |
|
|
|
self.hidden_size = hidden_size |
|
self.embedding_dim = embedding_dim |
|
self.use_coverage = use_coverage |
|
|
|
|
|
self.feature_proj = nn.Conv2d(input_channels, hidden_size * 2, kernel_size=1) |
|
|
|
|
|
self.pos_encoder = PositionalEncoding(hidden_size * 2) |
|
|
|
|
|
self.embedding = nn.Embedding(num_classes, embedding_dim) |
|
|
|
|
|
self.gru = nn.GRUCell(embedding_dim + hidden_size + num_classes, hidden_size) |
|
|
|
|
|
self.attention_w = nn.Linear(hidden_size * 2, hidden_size) |
|
self.attention_v = nn.Linear(hidden_size, 1) |
|
if use_coverage: |
|
self.coverage_proj = nn.Linear(1, hidden_size) |
|
|
|
|
|
self.out = nn.Linear(hidden_size + hidden_size + num_classes, num_classes) |
|
self.dropout = nn.Dropout(p=0.3) |
|
|
|
def forward(self, feature_map, count_vector, target=None, teacher_forcing_ratio=0.5, max_len=200): |
|
batch_size = feature_map.size(0) |
|
device = feature_map.device |
|
|
|
|
|
projected_features = self.feature_proj(feature_map) |
|
H, W = projected_features.size(2), projected_features.size(3) |
|
|
|
|
|
projected_features = projected_features.permute(0, 2, 3, 1).contiguous() |
|
|
|
|
|
pos_encoding = self.pos_encoder(projected_features) |
|
projected_features = projected_features + pos_encoding |
|
|
|
|
|
projected_features = projected_features.view(batch_size, H*W, -1) |
|
|
|
|
|
h_t = torch.zeros(batch_size, self.hidden_size, device=device) |
|
|
|
|
|
if self.use_coverage: |
|
coverage = torch.zeros(batch_size, H*W, 1, device=device) |
|
|
|
|
|
y_t_1 = torch.ones(batch_size, dtype=torch.long, device=device) |
|
|
|
|
|
if target is not None: |
|
max_len = target.size(1) |
|
|
|
|
|
outputs = torch.zeros(batch_size, max_len, self.embedding.num_embeddings, device=device) |
|
|
|
for t in range(max_len): |
|
|
|
embedded = self.embedding(y_t_1) |
|
|
|
|
|
attention_input = self.attention_w(projected_features) |
|
|
|
|
|
if self.use_coverage: |
|
coverage_input = self.coverage_proj(coverage.float()) |
|
attention_input = attention_input + coverage_input |
|
|
|
|
|
h_expanded = h_t.unsqueeze(1).expand(-1, H*W, -1) |
|
attention_input = torch.tanh(attention_input + h_expanded) |
|
|
|
|
|
e_t = self.attention_v(attention_input).squeeze(-1) |
|
alpha_t = F.softmax(e_t, dim=1) |
|
|
|
|
|
if self.use_coverage: |
|
coverage = coverage + alpha_t.unsqueeze(-1) |
|
|
|
|
|
alpha_t = alpha_t.unsqueeze(1) |
|
context = torch.bmm(alpha_t, projected_features).squeeze(1) |
|
context = context[:, :self.hidden_size] |
|
|
|
|
|
gru_input = torch.cat([embedded, context, count_vector], dim=1) |
|
|
|
|
|
h_t = self.gru(gru_input, h_t) |
|
|
|
|
|
output = self.out(torch.cat([h_t, context, count_vector], dim=1)) |
|
outputs[:, t] = output |
|
|
|
|
|
if target is not None and torch.rand(1).item() < teacher_forcing_ratio: |
|
y_t_1 = target[:, t] |
|
else: |
|
|
|
_, y_t_1 = output.max(1) |
|
|
|
return outputs |
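

# Decoder sketch: with a (B, 684, 2, 8) feature map and a matching count
# vector, the decoder returns per-step logits of shape (B, max_len, num_classes).
def _demo_ccad():
    decoder = CCAD(input_channels=684, hidden_size=256, embedding_dim=256, num_classes=111)
    feats = torch.randn(2, 684, 2, 8)
    counts = torch.rand(2, 111)
    logits = decoder(feats, counts, max_len=10)
    assert logits.shape == (2, 10, 111)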
|
|
|
|
|
"""Full model CAN (Counting-Aware Network)""" |
|
class CAN(nn.Module): |
|
""" |
|
Counting-Aware Network for handwritten mathematical expression recognition |
|
""" |
|
def __init__(self, num_classes, backbone=None, hidden_size=256, embedding_dim=256, use_coverage=True): |
|
super(CAN, self).__init__() |
|
|
|
|
|
if backbone is None: |
|
self.backbone = DenseNetBackbone() |
|
else: |
|
self.backbone = backbone |
|
backbone_channels = self.backbone.out_channels |
|
|
|
|
|
self.mscm = MSCM(backbone_channels, num_classes) |
|
|
|
|
|
self.decoder = CCAD( |
|
input_channels=backbone_channels, |
|
hidden_size=hidden_size, |
|
embedding_dim=embedding_dim, |
|
num_classes=num_classes, |
|
use_coverage=use_coverage |
|
) |
|
|
|
|
|
self.hidden_size = hidden_size |
|
self.embedding_dim = embedding_dim |
|
self.num_classes = num_classes |
|
self.use_coverage = use_coverage |
|
|
|
def init_hidden_state(self, visual_features): |
|
""" |
|
Initialize hidden state and cell state for LSTM |
|
|
|
Args: |
|
visual_features: Visual features from backbone |
|
|
|
Returns: |
|
h, c: Initial hidden and cell states |
|
""" |
|
batch_size = visual_features.size(0) |
|
device = visual_features.device |
|
|
|
|
|
h = torch.zeros(1, batch_size, self.hidden_size, device=device) |
|
c = torch.zeros(1, batch_size, self.hidden_size, device=device) |
|
|
|
return h, c |
|
|
|
def forward(self, x, target=None, teacher_forcing_ratio=0.5): |
|
|
|
features = self.backbone(x) |
|
|
|
|
|
count_map, count_vector = self.mscm(features) |
|
|
|
|
|
outputs = self.decoder(features, count_vector, target, teacher_forcing_ratio) |
|
|
|
return outputs, count_vector |
|
|
|
    def calculate_loss(self, outputs, targets, count_vectors, count_targets, lambda_count=0.01):
        """
        Compute the combined loss function for CAN

        Args:
            outputs: Predicted output sequence from the decoder
            targets: Ground-truth target sequence
            count_vectors: Predicted count vector
            count_targets: Ground-truth count vector
            lambda_count: Weight for the counting loss

        Returns:
            Total loss: L = L_cls + λ * L_counting
        """
        # Classification (cross-entropy) loss over all time steps
        L_cls = F.cross_entropy(outputs.view(-1, outputs.size(-1)), targets.view(-1))

        # Counting loss (smooth L1), normalized by the number of classes
        L_counting = F.smooth_l1_loss(count_vectors / self.num_classes, count_targets / self.num_classes)

        total_loss = L_cls + lambda_count * L_counting

        return total_loss, L_cls, L_counting
|
|
|
    def recognize(self, images, max_length=150, start_token=None, end_token=None, beam_width=5):
        """
        Recognize a handwritten expression using beam search (batch_size=1 only).

        Args:
            images: Input image tensor of shape (1, channels, height, width)
            max_length: Maximum length of the output sequence
            start_token: Start token index (required)
            end_token: End token index
            beam_width: Beam width for beam search

        Returns:
            best_sequence: List of token indices
            attention_weights: List of attention weights for visualization
        """
        if images.size(0) != 1:
            raise ValueError("Beam search is implemented only for batch_size=1")
        if start_token is None:
            raise ValueError("start_token must be provided")

        device = images.device

        visual_features = self.backbone(images)

        _, count_vector = self.mscm(visual_features)

        # Project features and add positional encoding, as in CCAD.forward
        projected_features = self.decoder.feature_proj(visual_features)
        H, W = projected_features.size(2), projected_features.size(3)
        projected_features = projected_features.permute(0, 2, 3, 1).contiguous()
        pos_encoding = self.decoder.pos_encoder(projected_features)
        projected_features = projected_features + pos_encoding
        projected_features = projected_features.view(1, H * W, -1)

        # Initialize beams. Only beam 0 starts "live"; the others get -inf so
        # the first top-k step does not select the same token beam_width times.
        beam_sequences = [torch.tensor([start_token], device=device) for _ in range(beam_width)]
        beam_scores = torch.full((beam_width,), float('-inf'), device=device)
        beam_scores[0] = 0.0
        h_t = torch.zeros(beam_width, self.hidden_size, device=device)
        if self.use_coverage:
            coverage = torch.zeros(beam_width, H * W, device=device)

        all_attention_weights = []

        for step in range(max_length):
            current_tokens = torch.tensor([seq[-1] for seq in beam_sequences], device=device)

            embedded = self.decoder.embedding(current_tokens)

            attention_input = self.decoder.attention_w(projected_features.expand(beam_width, -1, -1))
            if self.use_coverage:
                coverage_input = self.decoder.coverage_proj(coverage.unsqueeze(-1))
                attention_input = attention_input + coverage_input

            h_expanded = h_t.unsqueeze(1).expand(-1, H * W, -1)
            attention_input = torch.tanh(attention_input + h_expanded)

            e_t = self.decoder.attention_v(attention_input).squeeze(-1)
            alpha_t = F.softmax(e_t, dim=1)

            all_attention_weights.append(alpha_t.detach())

            if self.use_coverage:
                coverage = coverage + alpha_t

            context = torch.bmm(alpha_t.unsqueeze(1), projected_features.expand(beam_width, -1, -1)).squeeze(1)
            context = context[:, :self.hidden_size]

            count_vector_expanded = count_vector.expand(beam_width, -1)

            gru_input = torch.cat([embedded, context, count_vector_expanded], dim=1)

            h_t = self.decoder.gru(gru_input, h_t)

            output = self.decoder.out(torch.cat([h_t, context, count_vector_expanded], dim=1))
            scores = F.log_softmax(output, dim=1)

            # Accumulate log-probabilities and take the top beam_width candidates
            new_beam_scores = beam_scores.unsqueeze(1) + scores
            new_beam_scores_flat = new_beam_scores.view(-1)

            topk_scores, topk_indices = new_beam_scores_flat.topk(beam_width)

            # Recover which beam and which token each candidate came from
            beam_indices = torch.div(topk_indices, self.num_classes, rounding_mode='floor')
            token_indices = topk_indices % self.num_classes

            new_beam_sequences = []
            new_h_t = []
            if self.use_coverage:
                new_coverage = []
            for i in range(beam_width):
                prev_beam_idx = beam_indices[i].item()
                token = token_indices[i].item()
                new_seq = torch.cat([beam_sequences[prev_beam_idx], torch.tensor([token], device=device)])
                new_beam_sequences.append(new_seq)
                new_h_t.append(h_t[prev_beam_idx])
                if self.use_coverage:
                    new_coverage.append(coverage[prev_beam_idx])

            beam_sequences = new_beam_sequences
            beam_scores = topk_scores
            h_t = torch.stack(new_h_t)
            if self.use_coverage:
                coverage = torch.stack(new_coverage)

        # Pick the highest-scoring beam and strip start/end tokens
        best_idx = beam_scores.argmax()
        best_sequence = beam_sequences[best_idx].tolist()

        if best_sequence[0] == start_token:
            best_sequence = best_sequence[1:]
        if end_token in best_sequence:
            end_idx = best_sequence.index(end_token)
            best_sequence = best_sequence[:end_idx]

        return best_sequence, all_attention_weights
|
|
|
|
|
def create_can_model(num_classes, hidden_size=256, embedding_dim=256, use_coverage=True, pretrained_backbone=False, backbone_type='densenet'):
    """
    Create a CAN model with either a DenseNet or a ResNet backbone

    Args:
        num_classes: Number of symbol classes
        hidden_size: Decoder hidden size
        embedding_dim: Token embedding dimension
        use_coverage: Whether the decoder uses coverage attention
        pretrained_backbone: Whether to use an ImageNet-pretrained backbone
        backbone_type: Type of backbone to use ('densenet' or 'resnet')

    Returns:
        CAN model
    """
    if backbone_type == 'densenet':
        if pretrained_backbone:
            # `weights=` requires torchvision >= 0.13 (older versions used pretrained=True)
            densenet = models.densenet121(weights=models.DenseNet121_Weights.DEFAULT)
            backbone = DenseNetFeatureExtractor(densenet, out_channels=684)
        else:
            backbone = DenseNetBackbone()
    elif backbone_type == 'resnet':
        if pretrained_backbone:
            resnet = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
            backbone = ResNetFeatureExtractor(resnet, out_channels=684)
        else:
            backbone = ResNetBackbone(block_type='bottleneck', layers=(3, 4, 6, 3))
    else:
        raise ValueError(f"Unknown backbone type: {backbone_type}")

    model = CAN(
        num_classes=num_classes,
        backbone=backbone,
        hidden_size=hidden_size,
        embedding_dim=embedding_dim,
        use_coverage=use_coverage
    )

    return model
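

if __name__ == "__main__":
    # Minimal smoke test (an illustrative sketch; 111 is an arbitrary example
    # vocabulary size): build the model, run a forward pass, and compute the
    # combined loss on dummy data.
    model = create_can_model(num_classes=111)
    images = torch.randn(2, 1, 64, 256)       # dummy grayscale batch
    targets = torch.randint(0, 111, (2, 20))  # dummy token sequences
    count_targets = torch.rand(2, 111)        # dummy symbol counts

    outputs, count_vectors = model(images, target=targets)
    print('outputs:', tuple(outputs.shape))        # (2, 20, 111)
    print('counts: ', tuple(count_vectors.shape))  # (2, 111)

    total, L_cls, L_cnt = model.calculate_loss(outputs, targets, count_vectors, count_targets)
    print('loss:', total.item())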