""" | |
Paper: "UTRNet: High-Resolution Urdu Text Recognition In Printed Documents" presented at ICDAR 2023 | |
Authors: Abdur Rahman, Arjun Ghosh, Chetan Arora | |
GitHub Repository: https://github.com/abdur75648/UTRNet-High-Resolution-Urdu-Text-Recognition | |
Project Website: https://abdur75648.github.io/UTRNet/ | |
Copyright (c) 2023-present: This work is licensed under the Creative Commons Attribution-NonCommercial | |
4.0 International License (http://creativecommons.org/licenses/by-nc/4.0/) | |
""" | |
import os
import time
import argparse
import random
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import pytz
import torch
import torch.utils.data
import torch.nn.functional as F
from tqdm import tqdm
from nltk.metrics.distance import edit_distance

from utils import CTCLabelConverter, AttnLabelConverter, Averager, Logger
from dataset import hierarchical_dataset, AlignCollate
from model import Model
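

# validation() computes the loss, word-level accuracy, length-weighted
# normalized edit distance, and per-sample (gt, pred, score) records
# over a data loader.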
def validation(model, criterion, evaluation_loader, converter, opt, device):
    """ validation or evaluation """
    eval_arr = []
    sum_len_gt = 0
    n_correct = 0
    norm_ED = 0
    length_of_data = 0
    infer_time = 0
    valid_loss_avg = Averager()

    for i, (image_tensors, labels) in enumerate(tqdm(evaluation_loader)):
        batch_size = image_tensors.size(0)
        length_of_data = length_of_data + batch_size
        image = image_tensors.to(device)
        # For max-length prediction
        length_for_pred = torch.IntTensor([opt.batch_max_length] * batch_size).to(device)
        text_for_pred = torch.LongTensor(batch_size, opt.batch_max_length + 1).fill_(0).to(device)
        text_for_loss, length_for_loss = converter.encode(labels, batch_max_length=opt.batch_max_length)

        start_time = time.time()
        if 'CTC' in opt.Prediction:
            preds = model(image)
            forward_time = time.time() - start_time
            preds_size = torch.IntTensor([preds.size(1)] * batch_size)
            cost = criterion(preds.log_softmax(2).permute(1, 0, 2), text_for_loss, preds_size, length_for_loss)
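            # torch.nn.CTCLoss expects log-probabilities of shape (T, N, C) =
            # (time steps, batch, classes), hence the log_softmax over the
            # class dim and the permute from the model's (N, T, C) output.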
            _, preds_index = preds.max(2)
            preds_str = converter.decode(preds_index.data, preds_size.data)
        else:
            preds = model(image, text=text_for_pred, is_train=False)
            forward_time = time.time() - start_time
            preds = preds[:, :text_for_loss.shape[1] - 1, :].to(device)
            target = text_for_loss[:, 1:].to(device)  # without the [GO] symbol
            cost = criterion(preds.contiguous().view(-1, preds.shape[-1]), target.contiguous().view(-1))
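            # Predictions are flattened to (batch * seq_len, num_class) and
            # targets to (batch * seq_len,), so cross-entropy is evaluated
            # over every decoding step at once.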
            _, preds_index = preds.max(2)
            preds_str = converter.decode(preds_index, length_for_pred)
            labels = converter.decode(text_for_loss[:, 1:], length_for_loss)

        infer_time += forward_time
        valid_loss_avg.add(cost)

        # Calculate accuracy & confidence score
        preds_prob = F.softmax(preds, dim=2)
        preds_max_prob, _ = preds_prob.max(dim=2)
        confidence_score_list = []
        for gt, pred, pred_max_prob in zip(labels, preds_str, preds_max_prob):
            if 'Attn' in opt.Prediction:
                gt = gt[:gt.find('[s]')]
                pred_EOS = pred.find('[s]')
                pred = pred[:pred_EOS]  # prune after the end-of-sentence token ([s])
                pred_max_prob = pred_max_prob[:pred_EOS]
            if pred == gt:
                n_correct += 1

            # ICDAR 2019 normalized edit distance
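            # The per-sample ED is a similarity score, 1 - distance / max(len(gt), len(pred)),
            # with empty strings scored 0. The totals accumulated below make the
            # final norm_ED a ground-truth-length-weighted corpus average.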
            if len(gt) == 0 or len(pred) == 0:
                ED = 0
            elif len(gt) > len(pred):
                ED = 1 - edit_distance(pred, gt) / len(gt)
            else:
                ED = 1 - edit_distance(pred, gt) / len(pred)
            eval_arr.append([gt, pred, ED])
            sum_len_gt += len(gt)
            norm_ED += ED * len(gt)
            # Calculate confidence score (= product of per-step max probabilities)
            try:
                confidence_score = pred_max_prob.cumprod(dim=0)[-1]
            except IndexError:
                confidence_score = 0  # empty prediction after pruning at the [s] token
            confidence_score_list.append(confidence_score)
            # print(pred, gt, pred == gt, confidence_score)

    accuracy = n_correct / float(length_of_data) * 100
    norm_ED = norm_ED / float(sum_len_gt)
    return valid_loss_avg.val(), accuracy, norm_ED, eval_arr


def test(opt, device):
    opt.device = device
    os.makedirs("test_outputs", exist_ok=True)
    datetime_now = str(datetime.now(pytz.timezone('Asia/Kolkata')).strftime("%Y-%m-%d_%H-%M-%S"))
    logger = Logger(f'test_outputs/{datetime_now}.txt')

    """ model configuration """
    if 'CTC' in opt.Prediction:
        converter = CTCLabelConverter(opt.character)
    else:
        converter = AttnLabelConverter(opt.character)
    opt.num_class = len(converter.character)
    if opt.rgb:
        opt.input_channel = 3
    model = Model(opt)
    logger.log('model input parameters', opt.imgH, opt.imgW, opt.input_channel, opt.output_channel,
               opt.hidden_size, opt.num_class, opt.batch_max_length, opt.FeatureExtraction,
               opt.SequenceModeling, opt.Prediction)
    model = model.to(device)

    # Load the trained weights
    model.load_state_dict(torch.load(opt.saved_model, map_location=device))
    logger.log('Loaded pretrained model from %s' % opt.saved_model)
    # logger.log(model)
""" setup loss """ | |
if 'CTC' in opt.Prediction: | |
criterion = torch.nn.CTCLoss(zero_infinity=True).to(device) | |
else: | |
criterion = torch.nn.CrossEntropyLoss(ignore_index=0).to(device) # ignore [GO] token = ignore index 0 | |
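    # zero_infinity=True zeroes out infinite CTC losses (and their gradients),
    # which arise when an input sequence is too short to align with its target.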
""" evaluation """ | |
model.eval() | |
with torch.no_grad(): | |
AlignCollate_evaluation = AlignCollate(imgH=opt.imgH, imgW=opt.imgW)#, keep_ratio_with_pad=opt.PAD) | |
eval_data, eval_data_log = hierarchical_dataset(root=opt.eval_data, opt=opt, rand_aug=False) | |
logger.log(eval_data_log) | |
evaluation_loader = torch.utils.data.DataLoader( | |
eval_data, batch_size=opt.batch_size, | |
shuffle=False, | |
num_workers=int(opt.workers), | |
collate_fn=AlignCollate_evaluation, pin_memory=True) | |
_, accuracy, norm_ED, eval_arr = validation( model, criterion, evaluation_loader, converter, opt,device) | |
logger.log("="*20) | |
logger.log(f'Accuracy : {accuracy:0.4f}\n') | |
logger.log(f'Norm_ED : {norm_ED:0.4f}\n') | |
logger.log("="*20) | |
if opt.visualize: | |
logger.log("Threshold - ", opt.threshold) | |
logger.log("ED","\t","gt","\t","pred") | |
arr = [] | |
for gt,pred,ED in eval_arr: | |
ED = ED*100.0 | |
arr.append(ED) | |
if ED<=(opt.threshold): | |
logger.log(ED,"\t",gt,"\t",pred) | |
plt.hist(arr, edgecolor="red") | |
plt.savefig('test_outputs/'+str(datetime_now)+".png") | |
plt.close() | |


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--visualize', action='store_true', help='visualize and log badly recognized samples')
    parser.add_argument('--threshold', type=float, default=50.0, help='log samples scoring at or below this threshold in the txt file')
    parser.add_argument('--eval_data', required=True, help='path to evaluation dataset')
    parser.add_argument('--workers', type=int, default=4, help='number of data loading workers')
    parser.add_argument('--batch_size', type=int, default=32, help='input batch size')
    parser.add_argument('--saved_model', required=True, help='path to saved_model for evaluation')
""" Data processing """ | |
parser.add_argument('--batch_max_length', type=int, default=100, help='maximum-label-length') | |
parser.add_argument('--imgH', type=int, default=32, help='the height of the input image') | |
parser.add_argument('--imgW', type=int, default=400, help='the width of the input image') | |
parser.add_argument('--rgb', action='store_true', help='use rgb input') | |
""" Model Architecture """ | |
parser.add_argument('--FeatureExtraction', type=str, default="HRNet", #required=True, | |
help='FeatureExtraction stage VGG|RCNN|ResNet|UNet|HRNet|Densenet|InceptionUnet|ResUnet|AttnUNet|UNet|VGG') | |
parser.add_argument('--SequenceModeling', type=str, default="DBiLSTM", #required=True, | |
help='SequenceModeling stage LSTM|GRU|MDLSTM|BiLSTM|DBiLSTM') | |
parser.add_argument('--Prediction', type=str, default="CTC", #required=True, | |
help='Prediction stage CTC|Attn') | |
parser.add_argument('--input_channel', type=int, default=1, help='the number of input channel of Feature extractor') | |
parser.add_argument('--output_channel', type=int, default=512, help='the number of output channel of Feature extractor') | |
parser.add_argument('--hidden_size', type=int, default=256, help='the size of the LSTM hidden state') | |
""" GPU Selection """ | |
parser.add_argument('--device_id', type=str, default=None, help='cuda device ID') | |
opt = parser.parse_args() | |
if opt.FeatureExtraction == "HRNet": | |
opt.output_channel = 32 | |
# Fix random seeds for both numpy and pytorch | |
seed = 1111 | |
torch.manual_seed(seed) | |
torch.cuda.manual_seed(seed) | |
np.random.seed(seed) | |
random.seed(seed) | |
torch.backends.cudnn.deterministic = True | |
torch.backends.cudnn.benchmark = False | |
""" vocab / character number configuration """ | |
file = open("UrduGlyphs.txt","r",encoding="utf-8") | |
content = file.readlines() | |
content = ''.join([str(elem).strip('\n') for elem in content]) | |
opt.character = content+" " | |
    cuda_str = 'cuda'
    if opt.device_id is not None:
        cuda_str = f'cuda:{opt.device_id}'
    device = torch.device(cuda_str if torch.cuda.is_available() else 'cpu')
    print("Device : ", device)
# opt.eval_data = "/DATA/parseq/val/" | |
# test(opt, device) | |
# opt.eval_data = "/DATA/parseq/IIITH/lmdb_new/" | |
# test(opt, device) | |
# opt.eval_data = "/DATA/public_datasets/UPTI/valid/" | |
# test(opt, device) | |
test(opt, device) |
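
    # Example invocation (paths below are placeholders):
    #   python test.py --eval_data path/to/eval_lmdb --saved_model path/to/best_norm_ED.pth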