Spaces:
Sleeping
Sleeping
| # Copyright (c) OpenMMLab. All rights reserved. | |
| import argparse | |
| import os.path as osp | |
| import time | |
| import warnings | |
| from mmengine import Config | |
| from mmocr.datasets.preparers import DatasetPreparer | |
| def parse_args(): | |
| parser = argparse.ArgumentParser( | |
| description='Preparing datasets used in MMOCR.') | |
| parser.add_argument( | |
| 'datasets', | |
| help='A list of the dataset names that would like to prepare.', | |
| nargs='+') | |
| parser.add_argument( | |
| '--nproc', help='Number of processes to run', default=4, type=int) | |
| parser.add_argument( | |
| '--task', | |
| default='textdet', | |
| choices=['textdet', 'textrecog', 'textspotting', 'kie'], | |
| help='Task type. Options are "textdet", "textrecog", "textspotting"' | |
| ' and "kie".') | |
| parser.add_argument( | |
| '--splits', | |
| default=['train', 'test', 'val'], | |
| help='A list of the split that would like to prepare.', | |
| nargs='+') | |
| parser.add_argument( | |
| '--lmdb', | |
| action='store_true', | |
| default=False, | |
| help='Whether to dump the textrecog dataset to LMDB format, It\'s a ' | |
| 'shortcut to force the dataset to be dumped in lmdb format. ' | |
| 'Applicable when --task=textrecog') | |
| parser.add_argument( | |
| '--overwrite-cfg', | |
| action='store_true', | |
| default=False, | |
| help='Whether to overwrite the dataset config file if it already' | |
| ' exists. If not specified, Dataset Preparer will not generate' | |
| ' new config for datasets whose configs are already in base.') | |
| parser.add_argument( | |
| '--dataset-zoo-path', | |
| default='./dataset_zoo', | |
| help='Path to dataset zoo config files.') | |
| args = parser.parse_args() | |
| return args | |
| def parse_meta(task: str, meta_path: str) -> None: | |
| """Parse meta file. | |
| Args: | |
| cfg_path (str): Path to meta file. | |
| """ | |
| try: | |
| meta = Config.fromfile(meta_path) | |
| except FileNotFoundError: | |
| return | |
| assert task in meta['Data']['Tasks'], \ | |
| f'Task {task} not supported!' | |
| # License related | |
| if meta['Data']['License']['Type']: | |
| print(f"\033[1;33;40mDataset Name: {meta['Name']}") | |
| print(f"License Type: {meta['Data']['License']['Type']}") | |
| print(f"License Link: {meta['Data']['License']['Link']}") | |
| print(f"BibTeX: {meta['Paper']['BibTeX']}\033[0m") | |
| print('\033[1;31;43mMMOCR does not own the dataset. Using this ' | |
| 'dataset you must accept the license provided by the owners, ' | |
| 'and cite the corresponding papers appropriately.') | |
| print('If you do not agree with the above license, please cancel ' | |
| 'the progress immediately by pressing ctrl+c. Otherwise, ' | |
| 'you are deemed to accept the terms and conditions.\033[0m') | |
| for i in range(5): | |
| print(f'{5-i}...') | |
| time.sleep(1) | |
| def force_lmdb(cfg): | |
| """Force the dataset to be dumped in lmdb format. | |
| Args: | |
| cfg (Config): Config object. | |
| Returns: | |
| Config: Config object. | |
| """ | |
| for split in ['train', 'val', 'test']: | |
| preparer_cfg = cfg.get(f'{split}_preparer') | |
| if preparer_cfg: | |
| if preparer_cfg.get('dumper') is None: | |
| raise ValueError( | |
| f'{split} split does not come with a dumper, ' | |
| 'so most likely the annotations are MMOCR-ready and do ' | |
| 'not need any adaptation, and it ' | |
| 'cannot be dumped in LMDB format.') | |
| preparer_cfg.dumper['type'] = 'TextRecogLMDBDumper' | |
| cfg.config_generator['dataset_name'] = f'{cfg.dataset_name}_lmdb' | |
| for split in ['train_anns', 'val_anns', 'test_anns']: | |
| if split in cfg.config_generator: | |
| # It can be None when users want to clear out the default | |
| # value | |
| if not cfg.config_generator[split]: | |
| continue | |
| ann_list = cfg.config_generator[split] | |
| for ann_dict in ann_list: | |
| ann_dict['ann_file'] = ( | |
| osp.splitext(ann_dict['ann_file'])[0] + '.lmdb') | |
| else: | |
| if split == 'train_anns': | |
| ann_list = [dict(ann_file='textrecog_train.lmdb')] | |
| elif split == 'test_anns': | |
| ann_list = [dict(ann_file='textrecog_test.lmdb')] | |
| else: | |
| ann_list = [] | |
| cfg.config_generator[split] = ann_list | |
| return cfg | |
| def main(): | |
| args = parse_args() | |
| if args.lmdb and args.task != 'textrecog': | |
| raise ValueError('--lmdb only works with --task=textrecog') | |
| for dataset in args.datasets: | |
| if not osp.isdir(osp.join(args.dataset_zoo_path, dataset)): | |
| warnings.warn(f'{dataset} is not supported yet. Please check ' | |
| 'dataset zoo for supported datasets.') | |
| continue | |
| meta_path = osp.join(args.dataset_zoo_path, dataset, 'metafile.yml') | |
| parse_meta(args.task, meta_path) | |
| cfg_path = osp.join(args.dataset_zoo_path, dataset, args.task + '.py') | |
| cfg = Config.fromfile(cfg_path) | |
| if args.overwrite_cfg and cfg.get('config_generator', | |
| None) is not None: | |
| cfg.config_generator.overwrite_cfg = args.overwrite_cfg | |
| cfg.nproc = args.nproc | |
| cfg.task = args.task | |
| cfg.dataset_name = dataset | |
| if args.lmdb: | |
| cfg = force_lmdb(cfg) | |
| preparer = DatasetPreparer.from_file(cfg) | |
| preparer.run(args.splits) | |
| if __name__ == '__main__': | |
| main() | |