Spaces:
Runtime error
Runtime error
import json | |
from tqdm import tqdm | |
def merge_interproscan_data(merge_file1, merge_file2):
    """Merge InterProScan annotations from ``merge_file2`` into ``merge_file1``.

    Both files are JSON objects. ``merge_file1`` (the target) maps protein
    ids to records that carry a ``'sequence'`` entry; ``merge_file2`` (the
    source) is looked up by that sequence value. For every target record
    whose sequence is a key of the source, the non-empty source fields are
    copied into the record's ``'interproscan_results'`` dict:

    * ``'PFAM'`` is stored as ``'pfam_id'``,
    * ``'GO'`` is stored as ``'go_id'`` (value structure kept as-is),
    * every other field is stored under its lower-cased name.

    The merged data is written back to ``merge_file1``, overwriting it.

    Args:
        merge_file1: Path of the target JSON file; read, then rewritten.
        merge_file2: Path of the source JSON file; read only.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If either file is not valid JSON.
    """
    # Read and write explicitly as UTF-8: the output is dumped with
    # ensure_ascii=False, so relying on the platform default encoding can
    # fail (or produce an unreadable file) on non-UTF-8 locales.
    with open(merge_file1, 'r', encoding='utf-8') as f:
        target_data = json.load(f)
    with open(merge_file2, 'r', encoding='utf-8') as f:
        source_data = json.load(f)

    # Source fields that are stored under a dedicated name in the target.
    renamed_fields = {'PFAM': 'pfam_id', 'GO': 'go_id'}

    for protein_info in tqdm(target_data.values()):
        # A record without a sequence cannot be matched; skip it instead of
        # aborting the whole merge with a KeyError.
        sequence = protein_info.get('sequence')
        if sequence is None or sequence not in source_data:
            continue
        source_info = source_data[sequence]

        # Create the results dict on first use, otherwise update it in place.
        results = protein_info.setdefault('interproscan_results', {})

        # Renamed fields go first so the key order of the written JSON is
        # pfam_id, go_id, then the remaining fields in source order.
        for source_key, target_key in renamed_fields.items():
            if source_info.get(source_key):
                results[target_key] = source_info[source_key]

        # Copy every other non-empty field under its lower-cased name.
        for key, value in source_info.items():
            if key not in renamed_fields and value:
                results[key.lower()] = value

    # Persist the updated target data.
    with open(merge_file1, 'w', encoding='utf-8') as f:
        json.dump(target_data, f, indent=4, ensure_ascii=False)
    print("数据合并完成!")
if __name__ == "__main__":
    import argparse

    # One (flag, default path, help text) row per JSON input; the first file
    # is the merge target and the second is the annotation source.
    cli_options = (
        ('--merge_file1', 'data/processed_data/interproscan_info.json', '合并的文件1'),
        ('--merge_file2', 'test_data/interproscan_info.json', '合并的文件2'),
    )

    cli = argparse.ArgumentParser()
    for flag, default_path, help_text in cli_options:
        cli.add_argument(flag, default=default_path, help=help_text)
    parsed = cli.parse_args()

    merge_interproscan_data(parsed.merge_file1, parsed.merge_file2)