"""Look up Gene Ontology (GO) annotations and term definitions for a protein.

Two input files are used:

* a protein-to-GO mapping file with one JSON object per line, each holding a
  ``protein_id`` key and a ``GO_id`` key;
* a GO ontology JSON file whose nodes live under ``graphs[0].nodes`` and carry
  their textual definition under ``meta.definition.val``.

Both files are loaded lazily on first use and cached at module level.
"""
import json
import os
import sys
from collections import defaultdict  # noqa: F401  (unused here; kept as in the original file)
from typing import Any, Dict, Optional

# Append the parent of this file's directory to the import search path.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# IRI prefix that node ids carry in the ontology JSON file.
_OBO_PREFIX = "http://purl.obolibrary.org/obo/"

# Module-level caches.  Each cache remembers the path it was loaded from so
# that a call with a different path reloads instead of returning stale data.
_go_data = None
_go_data_path = None
_go_definitions = None  # full node id -> definition text, built from _go_data
_protein_go_dict = None
_protein_go_path = None


def _normalize_go_id(go_id: str) -> str:
    """Return *go_id* in the ``GO_<digits>`` form used inside node ids.

    Accepts a bare accession (``0008150``), the underscore form
    (``GO_0008150``) and the colon form (``GO:0008150``).
    """
    if go_id.startswith('GO_'):
        return go_id
    if go_id.startswith('GO:'):
        return f"GO_{go_id[3:]}"
    return f"GO_{go_id}"


def _load_go_data(go_info_path: str) -> None:
    """Lazily load the GO ontology file and index its term definitions.

    Does nothing when data from the same path is already cached.  On any
    failure an error message is printed and the caches are left as ``None``,
    so the next call tries again.
    """
    global _go_data, _go_data_path, _go_definitions
    if _go_data is not None and _go_data_path == go_info_path:
        return
    try:
        with open(go_info_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        # Build the lookup table once so each later query is a dict access
        # instead of a scan over every node.
        definitions = {}
        for node in data['graphs'][0]['nodes']:
            node_id = node.get('id')
            definition = (node.get('meta') or {}).get('definition') or {}
            if node_id is None or 'val' not in definition:
                continue
            # setdefault keeps the first definition seen for an id, which is
            # what a front-to-back scan of the node list would return.
            definitions.setdefault(node_id, definition['val'])
    except Exception as e:  # best effort: report the problem and carry on
        print(f"加载GO数据文件时发生错误: {str(e)}")
        _go_data = None
        _go_data_path = None
        _go_definitions = None
    else:
        _go_data = data
        _go_data_path = go_info_path
        _go_definitions = definitions


def _load_protein_go_dict(protein2gopath: str) -> None:
    """Lazily load the protein-to-GO mapping file (one JSON object per line).

    Does nothing when the mapping from the same path is already cached.
    Blank lines are skipped.  On any failure an error message is printed and
    the cache is left as ``None``, so the next call tries again.
    """
    global _protein_go_dict, _protein_go_path
    if _protein_go_dict is not None and _protein_go_path == protein2gopath:
        return
    try:
        mapping = {}
        with open(protein2gopath, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                data = json.loads(line)
                mapping[data['protein_id']] = data['GO_id']
    except Exception as e:  # best effort: report the problem and carry on
        print(f"加载蛋白质-GO映射数据时发生错误: {str(e)}")
        _protein_go_dict = None
        _protein_go_path = None
    else:
        _protein_go_dict = mapping
        _protein_go_path = protein2gopath


def get_go_definition(go_id: str, go_info_path: str) -> Optional[str]:
    """Return the textual definition of a GO term.

    Args:
        go_id: GO accession, given as ``0008150``, ``GO_0008150`` or
            ``GO:0008150``.
        go_info_path: Path to the GO ontology JSON file.

    Returns:
        The definition text, or ``None`` when the ontology file could not be
        loaded, the term is not present, or the term has no definition.
    """
    _load_go_data(go_info_path)
    if _go_data is None or _go_definitions is None:
        return None
    return _go_definitions.get(f"{_OBO_PREFIX}{_normalize_go_id(go_id)}")


def analyze_protein_go(protein_id: str, protein2gopath: str,
                       go_info_path: str) -> Dict[str, Any]:
    """Collect the GO annotations of a protein together with their definitions.

    Args:
        protein_id: Protein identifier to look up.
        protein2gopath: Path to the protein-to-GO mapping file.
        go_info_path: Path to the GO ontology JSON file.

    Returns:
        On success, a dict with ``status`` set to ``"success"``, the
        ``protein_id``, ``go_annotations`` (one ``{"go_id": ...}`` entry per
        annotated GO id) and ``all_related_definitions`` (GO id -> definition,
        only for ids that have a non-empty definition).  On failure, a dict
        with ``status`` set to ``"error"`` and a ``message``.
    """
    _load_protein_go_dict(protein2gopath)
    if _protein_go_dict is None:
        return {
            "status": "error",
            "message": "GO数据加载失败"
        }

    if protein_id not in _protein_go_dict:
        return {
            "status": "error",
            "message": f"未找到蛋白质 {protein_id} 的GO注释"
        }

    go_ids = _protein_go_dict[protein_id]
    go_info = []
    all_definitions = {}

    for go_id in go_ids:
        definition = get_go_definition(go_id, go_info_path)
        if definition:
            all_definitions[go_id] = definition
        # Every annotated id is listed, whether or not a definition exists.
        go_info.append({"go_id": go_id})

    return {
        "status": "success",
        "protein_id": protein_id,
        "go_annotations": go_info,
        "all_related_definitions": all_definitions
    }


def main() -> None:
    """Command-line entry point: print the GO annotations of one protein."""
    import argparse

    parser = argparse.ArgumentParser(description='Analyze protein GO annotations')
    parser.add_argument('--protein_id', type=str, default='A8CF74')
    parser.add_argument('--protein2gopath', type=str,
                        default='data/processed_data/go_integration_final_topk2.json')
    parser.add_argument('--go_info_path', type=str, default='data/raw_data/go.json')
    args = parser.parse_args()

    result = analyze_protein_go(args.protein_id, args.protein2gopath, args.go_info_path)

    if result["status"] == "success":
        print(f"\nProtein {result['protein_id']} GO annotations:")
        for anno in result["go_annotations"]:
            print(f"\nGO ID: {anno['go_id']}")

        print("\nAll related GO ID definitions:")
        for go_id, definition in result["all_related_definitions"].items():
            print(f"\nGO:{go_id}")
            print(f"Definition: {definition}")
    else:
        print(result["message"])


# Usage example
if __name__ == "__main__":
    main()