File size: 2,308 Bytes
5c20520
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import json
from tqdm import tqdm

def merge_interproscan_data(merge_file1, merge_file2):
    """Merge annotations from ``merge_file2`` into ``merge_file1`` by sequence.

    ``merge_file1`` (the target) is read as a JSON object whose values are
    per-protein dicts carrying a ``'sequence'`` entry.  ``merge_file2`` (the
    source) is read as a JSON object keyed by sequence.  For every target
    entry whose sequence appears in the source, the non-empty source fields
    are copied into the entry's ``'interproscan_results'`` dict (created if
    absent): ``PFAM`` is stored as ``pfam_id``, ``GO`` as ``go_id``, and every
    other field under its lower-cased name.  Keys that already exist under
    the same name are overwritten.  The updated target is then written back
    to ``merge_file1`` in place.

    Args:
        merge_file1: Path of the target JSON file; it is overwritten.
        merge_file2: Path of the source JSON file; it is only read.

    Raises:
        KeyError: If a target entry has no ``'sequence'`` key.
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If either file does not contain valid JSON.
    """
    # Pin UTF-8 explicitly: the result is dumped with ensure_ascii=False, so
    # non-ASCII text must not depend on the platform's default encoding.
    with open(merge_file1, 'r', encoding='utf-8') as f:
        target_data = json.load(f)

    with open(merge_file2, 'r', encoding='utf-8') as f:
        source_data = json.load(f)

    # Source fields that are stored under a dedicated name in the target.
    renamed_fields = {'PFAM': 'pfam_id', 'GO': 'go_id'}

    for protein_info in tqdm(target_data.values()):
        sequence = protein_info['sequence']
        if sequence not in source_data:
            continue
        source_info = source_data[sequence]

        # The results dict is created for every matched sequence, even when
        # no field ends up being copied into it.
        results = protein_info.setdefault('interproscan_results', {})

        # Renamed fields go first so that any other source field whose
        # lower-cased name collides with them still wins, as before.
        for source_key, target_key in renamed_fields.items():
            value = source_info.get(source_key)
            if value:
                results[target_key] = value

        # Every remaining non-empty field is copied under its lower-cased name.
        for key, value in source_info.items():
            if key in renamed_fields or not value:
                continue
            results[key.lower()] = value

    # Persist the merged data back into the target file.
    with open(merge_file1, 'w', encoding='utf-8') as f:
        json.dump(target_data, f, indent=4, ensure_ascii=False)

    print("数据合并完成!")

if __name__ == "__main__":
    import argparse

    # Command-line entry point: both file paths are optional flags that fall
    # back to the defaults listed below.
    cli = argparse.ArgumentParser()
    for flag, default_path, help_text in (
        ('--merge_file1', 'data/processed_data/interproscan_info.json', '合并的文件1'),
        ('--merge_file2', 'test_data/interproscan_info.json', '合并的文件2'),
    ):
        cli.add_argument(flag, default=default_path, help=help_text)

    options = cli.parse_args()
    merge_interproscan_data(options.merge_file1, options.merge_file2)