# Hosting-page header (not part of the program):
# protein_rag/utils/functions.py
# ericzhang1122
# Upload folder using huggingface_hub
# 5c20520 verified
import json
from tqdm import tqdm
def merge_interproscan_data(merge_file1: str, merge_file2: str) -> None:
    """Merge sequence-keyed InterProScan annotations into a protein info file.

    ``merge_file1`` (the target) is a JSON object mapping protein IDs to
    records that each carry a ``'sequence'`` entry.  ``merge_file2`` (the
    source) is a JSON object keyed by sequence whose values map annotation
    names to annotation data.  For every target record whose sequence is a
    key of the source, each non-empty annotation is copied into the record's
    ``'interproscan_results'`` dict (created when absent):

    * ``'PFAM'`` is stored as ``'pfam_id'``;
    * ``'GO'`` is stored as ``'go_id'`` (value structure kept as-is);
    * every other key is stored under its lowercased name.

    Existing entries with the same name are overwritten.  The target file is
    then rewritten in place and a completion message is printed.

    Args:
        merge_file1: Path of the target JSON file; read, updated, rewritten.
        merge_file2: Path of the source JSON file; only read.

    Raises:
        KeyError: If a target record has no ``'sequence'`` entry.
    """
    # Source keys that get a dedicated name in the target; they are written
    # first, in this order, before the remaining annotations.
    renamed_keys = {'PFAM': 'pfam_id', 'GO': 'go_id'}

    # Read both files explicitly as UTF-8 so the result does not depend on
    # the platform's locale encoding.
    with open(merge_file1, 'r', encoding='utf-8') as f:
        target_data = json.load(f)
    with open(merge_file2, 'r', encoding='utf-8') as f:
        source_data = json.load(f)

    for protein_id, protein_info in tqdm(target_data.items()):
        sequence = protein_info['sequence']
        # Only records whose sequence is present in the source are touched.
        if sequence not in source_data:
            continue
        source_info = source_data[sequence]
        results = protein_info.setdefault('interproscan_results', {})

        for source_key, target_key in renamed_keys.items():
            value = source_info.get(source_key)
            if value:
                results[target_key] = value

        # Copy the remaining non-empty annotations.  Names are only
        # lowercased; no other normalization is applied.
        for key, value in source_info.items():
            if key not in renamed_keys and value:
                results[key.lower()] = value

    # ensure_ascii=False emits non-ASCII characters verbatim, so the file
    # must be opened as UTF-8 for the write to succeed on every platform.
    with open(merge_file1, 'w', encoding='utf-8') as f:
        json.dump(target_data, f, indent=4, ensure_ascii=False)
    print("数据合并完成!")
if __name__ == "__main__":
    import argparse

    # Command-line entry point.  Both options are file paths with defaults:
    # --merge_file1 is the file that gets updated in place, --merge_file2 is
    # the file whose annotations are merged into it.
    cli = argparse.ArgumentParser()
    option_specs = (
        ('--merge_file1', 'data/processed_data/interproscan_info.json', '合并的文件1'),
        ('--merge_file2', 'test_data/interproscan_info.json', '合并的文件2'),
    )
    for flag, default_path, help_text in option_specs:
        cli.add_argument(flag, default=default_path, help=help_text)

    options = cli.parse_args()
    merge_interproscan_data(options.merge_file1, options.merge_file2)