File size: 2,097 Bytes
5c20520
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import json
from collections import Counter
import os

# Module-level caches for the parsed JSON files. Both start as None and are
# filled in by _load_pfam_data() / _load_pfam_descriptions() the first time
# those loaders run, so each file is parsed once while the cache stays set.
_pfam_dict = None  # parsed content of the protein -> Pfam JSON file
_pfam_descriptions = None  # parsed content of the Pfam descriptions JSON file

def _load_pfam_data(protein2pfam_path):
    """
    Load the protein-to-Pfam JSON file into the module-level ``_pfam_dict`` cache.

    The file is parsed only while ``_pfam_dict`` is still ``None``. Once the
    cache is populated, later calls return immediately and the path argument
    is ignored, even if it differs from the path used for the first load.

    Args:
        protein2pfam_path: str - path of the JSON file to parse.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if the file content is not valid UTF-8 encoded JSON
            (``json.JSONDecodeError`` / ``UnicodeDecodeError``).
    """
    global _pfam_dict
    if _pfam_dict is not None:
        return
    # JSON text is UTF-8 by specification. Without an explicit encoding,
    # open() uses the locale's preferred encoding, which can fail or
    # mis-decode non-ASCII text on platforms where that is not UTF-8.
    with open(protein2pfam_path, 'r', encoding='utf-8') as file:
        _pfam_dict = json.load(file)

def _load_pfam_descriptions(pfam_descriptions_path):
    """
    Load the Pfam descriptions JSON file into the module-level
    ``_pfam_descriptions`` cache.

    The file is parsed only while ``_pfam_descriptions`` is still ``None``.
    Once the cache is populated, later calls return immediately and the path
    argument is ignored, even if it differs from the path used for the first
    load.

    Args:
        pfam_descriptions_path: str - path of the JSON file to parse.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if the file content is not valid UTF-8 encoded JSON
            (``json.JSONDecodeError`` / ``UnicodeDecodeError``).
    """
    global _pfam_descriptions
    if _pfam_descriptions is not None:
        return
    # JSON text is UTF-8 by specification. Without an explicit encoding,
    # open() uses the locale's preferred encoding, which can fail or
    # mis-decode non-ASCII description text where that is not UTF-8.
    with open(pfam_descriptions_path, 'r', encoding='utf-8') as file:
        _pfam_descriptions = json.load(file)

def get_motif_pfam(protein_id, protein2pfam_path, pfam_descriptions_path):
    """
    Return the Pfam entries recorded for a protein together with their
    descriptions.

    Args:
        protein_id: str - protein ID to look up.
        protein2pfam_path: str - path to the interproscan_info.json file.
        pfam_descriptions_path: str - path to the Pfam descriptions file.

    Returns:
        dict - mapping of pfam_id to its description, for example
        {"PF04820": "definition content"}. The dict is empty when the
        protein ID is not present in the protein-to-Pfam data. Pfam IDs
        without an entry in the descriptions data are left out.
    """
    # Make sure both module-level caches are populated before reading them.
    _load_pfam_data(protein2pfam_path)
    _load_pfam_descriptions(pfam_descriptions_path)

    if protein_id not in _pfam_dict:
        return {}

    # 'pfam_id' holds a list of dicts whose keys are the Pfam IDs; only the
    # keys are needed here.
    hits = _pfam_dict[protein_id].get('interproscan_results', {}).get('pfam_id', [])
    found_ids = [pfam_id for hit in hits for pfam_id in hit.keys()]

    return {
        pfam_id: _pfam_descriptions[pfam_id]['description']
        for pfam_id in found_ids
        if pfam_id in _pfam_descriptions
    }

if __name__ == "__main__":
    # Command-line entry point: look up one protein and print the resulting
    # pfam_id -> description mapping. Every option is optional and falls back
    # to the default listed below.
    import argparse

    cli_defaults = (
        ("--protein_id", "A8CF74"),
        ("--protein2pfam_path", "data/processed_data/interproscan_info.json"),
        ("--pfam_descriptions_path", "data/raw_data/all_pfam_descriptions.json"),
    )

    parser = argparse.ArgumentParser()
    for flag, fallback in cli_defaults:
        parser.add_argument(flag, type=str, required=False, default=fallback)
    args = parser.parse_args()

    print(
        get_motif_pfam(
            args.protein_id,
            args.protein2pfam_path,
            args.pfam_descriptions_path,
        )
    )