Spaces:

ericzhang1122
/

protein_rag

Runtime error

App Files Files Community

protein_rag/utils/functions.py

ericzhang1122

Upload folder using huggingface_hub

5c20520 verified 15 days ago

raw

history blame contribute delete

2.31 kB

	import json
	from tqdm import tqdm

	def merge_interproscan_data(merge_file1, merge_file2):
	    """Merge InterProScan annotations from ``merge_file2`` into ``merge_file1``.

	    ``merge_file1`` (the target) is a JSON object whose values are records
	    containing a ``'sequence'`` key. ``merge_file2`` (the source) is a JSON
	    object keyed by sequence whose values are annotation dicts. For every
	    target record whose sequence appears in the source, the non-empty source
	    fields are copied into the record's ``'interproscan_results'`` dict
	    (created if absent):

	    * ``'PFAM'`` is stored as ``'pfam_id'``,
	    * ``'GO'`` is stored as ``'go_id'``,
	    * every other key is stored under its lower-cased name.

	    Existing entries with the same name are overwritten. The merged data is
	    written back to ``merge_file1`` in place and a completion message is
	    printed.

	    Args:
	        merge_file1: Path of the target JSON file; it is read and then
	            overwritten with the merged result.
	        merge_file2: Path of the source JSON file; it is only read.

	    Returns:
	        None.

	    Raises:
	        KeyError: If a target record has no ``'sequence'`` key.
	        OSError: If either file cannot be opened.
	        json.JSONDecodeError: If either file is not valid JSON.
	    """
	    # Read and write as UTF-8 explicitly: the output is dumped with
	    # ensure_ascii=False, so relying on the locale's default codec can fail
	    # (or mis-decode on the next read) on platforms that are not UTF-8.
	    with open(merge_file1, 'r', encoding='utf-8') as f:
	        target_data = json.load(f)

	    with open(merge_file2, 'r', encoding='utf-8') as f:
	        source_data = json.load(f)

	    # Source keys that are stored under a different name in the target.
	    renamed_fields = (('PFAM', 'pfam_id'), ('GO', 'go_id'))
	    renamed_keys = {source_key for source_key, _ in renamed_fields}

	    # Only the records are needed; their ids are never used for matching.
	    for protein_info in tqdm(target_data.values()):
	        sequence = protein_info['sequence']

	        # Records are matched purely by sequence.
	        if sequence not in source_data:
	            continue
	        source_info = source_data[sequence]

	        # Create the results dict on first use, even if nothing ends up
	        # being copied into it.
	        results = protein_info.setdefault('interproscan_results', {})

	        # Renamed fields go in first so they keep their position ahead of
	        # the remaining fields in a freshly created results dict.
	        for source_key, target_key in renamed_fields:
	            value = source_info.get(source_key)
	            if value:
	                results[target_key] = value

	        # Copy every other non-empty field under its lower-cased name.
	        for key, value in source_info.items():
	            if key not in renamed_keys and value:
	                results[key.lower()] = value

	    # Persist the merged data back into the target file.
	    with open(merge_file1, 'w', encoding='utf-8') as f:
	        json.dump(target_data, f, indent=4, ensure_ascii=False)

	    print("数据合并完成！")

	if __name__ == "__main__":
	    import argparse

	    # Command-line entry point. Both options are optional and fall back to
	    # the default paths listed here; --merge_file1 names the file that is
	    # updated in place, --merge_file2 the file merged into it.
	    cli = argparse.ArgumentParser()
	    for flag, default_path, help_text in (
	        ('--merge_file1', 'data/processed_data/interproscan_info.json', '合并的文件1'),
	        ('--merge_file2', 'test_data/interproscan_info.json', '合并的文件2'),
	    ):
	        cli.add_argument(flag, default=default_path, help=help_text)
	    options = cli.parse_args()

	    # The parsed option names match the function's parameter names, so the
	    # namespace can be forwarded as keyword arguments.
	    merge_interproscan_data(**vars(options))