File size: 3,773 Bytes
5c20520
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import json
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from collections import defaultdict

# Module-level caches. Both start as None and are filled on first use by the
# lazy loaders below (_load_go_data and _load_protein_go_dict respectively).
_go_data = None  # parsed content of the GO JSON file, or None if not loaded
_protein_go_dict = None  # maps protein_id -> GO_id value, or None if not loaded

def _load_go_data(go_info_path):
    """Lazily load the GO JSON file into the module-level ``_go_data`` cache.

    The file is only read while ``_go_data`` is ``None``. After one successful
    load, later calls return immediately and ``go_info_path`` is ignored, even
    if it differs from the path used for the first load. A failed load is
    reported on stdout and leaves ``_go_data`` as ``None``, so the next call
    tries again.

    Args:
        go_info_path: Path to the GO JSON file.

    Returns:
        None. The result is stored in the ``_go_data`` global.
    """
    global _go_data
    if _go_data is not None:
        return

    try:
        # Read as UTF-8 explicitly so decoding does not depend on the
        # platform's locale encoding.
        with open(go_info_path, 'r', encoding='utf-8') as f:
            _go_data = json.load(f)
    except Exception as e:
        # Deliberately best-effort: callers detect failure by checking
        # ``_go_data is None`` rather than by catching an exception.
        print(f"加载GO数据文件时发生错误: {str(e)}")
        _go_data = None

def _load_protein_go_dict(protein2gopath):
    """Lazily load the protein -> GO mapping into ``_protein_go_dict``.

    The file is read line by line; every non-blank line must be a JSON object
    with the keys ``protein_id`` and ``GO_id``. The mapping stores the
    ``GO_id`` value under the ``protein_id`` key; a later line for the same
    protein overwrites an earlier one.

    The file is only read while ``_protein_go_dict`` is ``None``. After one
    successful load, later calls return immediately and ``protein2gopath`` is
    ignored. A failed load is reported on stdout and leaves
    ``_protein_go_dict`` as ``None``, so the next call tries again.

    Args:
        protein2gopath: Path to the JSON-lines mapping file.

    Returns:
        None. The result is stored in the ``_protein_go_dict`` global.
    """
    global _protein_go_dict
    if _protein_go_dict is not None:
        return

    try:
        # Build into a local dict and publish it only after the whole file
        # parsed, so the global never holds a half-filled mapping.
        mapping = {}
        with open(protein2gopath, 'r', encoding='utf-8') as f:
            for line in f:
                # Blank lines are not valid JSON; skip them instead of
                # discarding the entire mapping.
                if not line.strip():
                    continue
                data = json.loads(line)
                mapping[data['protein_id']] = data['GO_id']
        _protein_go_dict = mapping
    except Exception as e:
        # Deliberately best-effort: callers detect failure by checking
        # ``_protein_go_dict is None``.
        print(f"加载蛋白质-GO映射数据时发生错误: {str(e)}")
        _protein_go_dict = None

def get_go_definition(go_id, go_info_path):
    """Return the definition text of a GO term, or None if it is unavailable.

    The GO data is loaded on demand through ``_load_go_data``. The term is
    looked up among ``_go_data['graphs'][0]['nodes']`` by comparing each
    node's ``id`` with ``http://purl.obolibrary.org/obo/GO_<number>``, and the
    definition is read from ``node['meta']['definition']['val']``.

    Args:
        go_id: GO identifier as a string. Accepted forms are the bare number
            (``"0008150"``), the underscore form (``"GO_0008150"``) and the
            CURIE form (``"GO:0008150"``).
        go_info_path: Path to the GO JSON file, forwarded to
            ``_load_go_data``.

    Returns:
        The definition string, or None when the GO data could not be loaded,
        no node matches, or the matching node carries no definition.
    """
    _load_go_data(go_info_path)
    if _go_data is None:
        return None

    # Normalise every accepted spelling to the "GO_<number>" form used in the
    # node ids. Without the "GO:" branch a CURIE would become "GO_GO:<number>"
    # and never match.
    if go_id.startswith('GO:'):
        go_id = f"GO_{go_id[3:]}"
    elif not go_id.startswith('GO_'):
        go_id = f"GO_{go_id}"
    full_id = f"http://purl.obolibrary.org/obo/{go_id}"

    # NOTE: this is a linear scan over all nodes on every call.
    for node in _go_data['graphs'][0]['nodes']:
        if node.get('id') != full_id:
            continue
        definition = (node.get('meta') or {}).get('definition')
        if definition is not None:
            return definition['val']
    return None

def analyze_protein_go(protein_id, protein2gopath, go_info_path):
    """Collect the GO annotations of a protein together with their definitions.

    Args:
        protein_id: Protein identifier to look up in the protein -> GO mapping.
        protein2gopath: Path to the protein -> GO mapping file, forwarded to
            ``_load_protein_go_dict``.
        go_info_path: Path to the GO JSON file, forwarded to
            ``get_go_definition``.

    Returns:
        A dict. On failure it has ``"status": "error"`` and a ``"message"``
        (mapping could not be loaded, or the protein is not in it). On success
        it has ``"status": "success"``, ``"protein_id"``, ``"go_annotations"``
        (a list of ``{"go_id": ...}`` dicts, one per GO id) and
        ``"all_related_definitions"`` (GO id -> definition, containing only
        the ids for which a non-empty definition was found).
    """
    _load_protein_go_dict(protein2gopath)
    if _protein_go_dict is None:
        return {
            "status": "error",
            "message": "GO数据加载失败"
        }

    if protein_id not in _protein_go_dict:
        return {
            "status": "error",
            "message": f"未找到蛋白质 {protein_id} 的GO注释"
        }

    go_ids = _protein_go_dict[protein_id]
    # Normalise the stored value to a list: a lone string would otherwise be
    # iterated character by character, and a null value would raise TypeError.
    if go_ids is None:
        go_ids = []
    elif isinstance(go_ids, str):
        go_ids = [go_ids]

    go_info = []
    all_definitions = {}

    for go_id in go_ids:
        definition = get_go_definition(go_id, go_info_path)
        if definition:
            all_definitions[go_id] = definition

        # Every GO id is listed, whether or not a definition was found.
        go_info.append({
            "go_id": go_id
        })

    return {
        "status": "success",
        "protein_id": protein_id,
        "go_annotations": go_info,
        "all_related_definitions": all_definitions
    }

# Example usage: look up one protein from the command line and print its
# GO ids followed by the definitions that were found.
if __name__ == "__main__":
    import argparse

    cli = argparse.ArgumentParser(description='Analyze protein GO annotations')
    # All options are plain strings with a default value.
    for flag, fallback in (
        ('--protein_id', 'A8CF74'),
        ('--protein2gopath', 'data/processed_data/go_integration_final_topk2.json'),
        ('--go_info_path', 'data/raw_data/go.json'),
    ):
        cli.add_argument(flag, type=str, default=fallback)
    opts = cli.parse_args()

    report = analyze_protein_go(opts.protein_id, opts.protein2gopath, opts.go_info_path)

    if report["status"] != "success":
        # Error path: only the message is shown.
        print(report["message"])
    else:
        print(f"\nProtein {report['protein_id']} GO annotations:")

        for entry in report["go_annotations"]:
            print(f"\nGO ID: {entry['go_id']}")

        print("\nAll related GO ID definitions:")
        for term, text in report["all_related_definitions"].items():
            print(f"\nGO:{term}")
            print(f"Definition: {text}")