"""Merge InterProScan annotations from one JSON file into another.

The target file maps protein IDs to records that carry a ``sequence`` field;
the source file maps sequences to annotation dictionaries. Records are matched
by exact sequence string.
"""

import argparse
import json

try:
    from tqdm import tqdm
except ImportError:  # the progress bar is cosmetic; merging must still work
    def tqdm(iterable, *args, **kwargs):
        """Fallback used when tqdm is not installed: iterate without a bar."""
        return iterable


def _merge_source_fields(results, source_info):
    """Copy the non-empty fields of ``source_info`` into ``results`` in place.

    ``PFAM`` is stored as ``pfam_id`` and ``GO`` as ``go_id``; every other
    non-empty field is stored under its lower-cased key. Existing entries in
    ``results`` with the same name are overwritten.
    """
    if source_info.get('PFAM'):
        results['pfam_id'] = source_info['PFAM']

    # GO values are copied as-is; their structure is not altered.
    if source_info.get('GO'):
        results['go_id'] = source_info['GO']

    for key, value in source_info.items():
        if key in ('PFAM', 'GO') or not value:
            continue
        # NOTE(review): the original code called ``.replace('_', '_')`` here,
        # which is a no-op. Only lower-casing is applied; confirm whether
        # another character (e.g. '-' or ' ') was meant to become '_'.
        results[key.lower()] = value


def merge_interproscan_data(merge_file1, merge_file2):
    """Merge annotations from ``merge_file2`` into ``merge_file1`` by sequence.

    Args:
        merge_file1: Path to the target JSON file, a mapping of protein ID to
            a record dict. It is read, updated and overwritten in place.
        merge_file2: Path to the source JSON file, a mapping of sequence
            string to an annotation dict.

    For every target record whose ``sequence`` is a key of the source mapping,
    an ``interproscan_results`` dict is created if absent and then updated
    with the source annotations. Records without a ``sequence`` field, or
    whose sequence is not present in the source, are left untouched.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If either file is not valid JSON.
    """
    # Explicit UTF-8: the output is written with ensure_ascii=False, so relying
    # on the platform default encoding could fail or corrupt non-ASCII text.
    with open(merge_file1, 'r', encoding='utf-8') as f:
        target_data = json.load(f)

    with open(merge_file2, 'r', encoding='utf-8') as f:
        source_data = json.load(f)

    for protein_id, protein_info in tqdm(target_data.items()):
        # A record with no sequence can never match; skip it instead of
        # aborting the whole merge with a KeyError.
        sequence = protein_info.get('sequence')
        if sequence is None or sequence not in source_data:
            continue

        results = protein_info.setdefault('interproscan_results', {})
        _merge_source_fields(results, source_data[sequence])

    # Persist the updated records back to the target file.
    with open(merge_file1, 'w', encoding='utf-8') as f:
        json.dump(target_data, f, indent=4, ensure_ascii=False)

    print("数据合并完成!")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--merge_file1', default='data/processed_data/interproscan_info.json', help='合并的文件1')
    parser.add_argument('--merge_file2', default='test_data/interproscan_info.json', help='合并的文件2')
    args = parser.parse_args()

    merge_interproscan_data(args.merge_file1, args.merge_file2)