Spaces:
Sleeping
Sleeping
| # Copyright (c) OpenMMLab. All rights reserved. | |
| import argparse | |
| import os.path as osp | |
| from mmocr.utils import dump_ocr_data | |
def convert_annotations(root_path, split):
    """Convert original annotations to mmocr format.

    The annotation format of this dataset is as the following:
        word_1.png, "flying"
        word_2.png, "today"
        word_3.png, "means"

    See the format of converted annotation in mmocr.utils.dump_ocr_data.

    The annotations are read from
    ``<root_path>/annotations/Challenge1_<split>_Task3_GT.txt``.
    Blank lines are skipped.

    Args:
        root_path (str): The root path of the dataset
        split (str): The split of dataset. Namely: Train or Test

    Returns:
        list[dict]: One entry per annotation line, each of the form
        ``{'file_name': <image name>, 'anno_info': [{'text': <word>}]}``.

    Raises:
        ValueError: If a non-blank line does not contain the ``, "``
            separator between the image name and the quoted text.
    """
    assert isinstance(root_path, str)
    assert isinstance(split, str)

    anno_path = osp.join(root_path, 'annotations',
                         f'Challenge1_{split}_Task3_GT.txt')
    img_info = []
    # 'utf-8-sig' transparently drops a leading BOM so that it does not end
    # up glued to the first image name.
    with open(anno_path, encoding='utf-8-sig') as f:
        for line_no, anno in enumerate(f, start=1):
            anno = anno.rstrip('\n')
            if not anno.strip():
                continue
            # The text may itself contain commas or quotes, so only the
            # FIRST ', "' separates the image name from the transcription.
            dst_img_name, sep, word = anno.partition(', "')
            if not sep:
                raise ValueError(
                    f'Malformed annotation at {anno_path}:{line_no}: '
                    f'{anno!r}')
            # Drop only the closing quote; inner quotes belong to the text.
            if word.endswith('"'):
                word = word[:-1]
            img_info.append({
                'file_name': dst_img_name,
                'anno_info': [{
                    'text': word
                }]
            })
    return img_info
def parse_args():
    """Parse the command line of the IC11 converter.

    Returns:
        argparse.Namespace: Namespace whose ``root_path`` attribute holds
        the single positional argument given on the command line.
    """
    arg_parser = argparse.ArgumentParser(
        description='Generate training and test set of IC11')
    # The only input is the positional dataset root directory.
    arg_parser.add_argument('root_path', help='Root dir path of IC11')
    return arg_parser.parse_args()
def main():
    """Convert the Train and Test annotations found under the given root.

    For each split the annotations are converted with
    ``convert_annotations`` and passed to ``dump_ocr_data`` together with
    the output path ``<root_path>/<split in lower case>_label.json`` and
    the task name ``'textrecog'``.
    """
    dataset_root = parse_args().root_path
    for split in ('Train', 'Test'):
        label_file = osp.join(dataset_root, f'{split.lower()}_label.json')
        records = convert_annotations(dataset_root, split)
        dump_ocr_data(records, label_file, 'textrecog')
        print(f'{split} split converted.')
# Run the conversion only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == '__main__':
    main()