ericzhang1122 committed on
Commit
5c20520
·
verified ·
1 Parent(s): d6a3a76

Upload folder using huggingface_hub

Browse files
.gitignore ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ data/
2
+ output/
3
+ interproscan/
4
+ blast_db/
5
+ *.pyc
6
+
7
+ # other
8
+ __pycache__/*
9
+ .gradio/*
10
+
11
+ install_blast.sh
12
+ blast.py
13
+
14
+ # data
15
+ output/*
16
+
17
+ # link
18
+ interproscan
19
+
20
+ # data
21
+ evolla_test_data/*
22
+ processed_data/*
23
+ downloads/*
24
+ test/*
25
+
26
+ test_interproscan.py
README.md CHANGED
@@ -1,12 +1,6 @@
1
  ---
2
- title: Protein Rag
3
- emoji: 📉
4
- colorFrom: yellow
5
- colorTo: purple
6
  sdk: gradio
7
  sdk_version: 5.35.0
8
- app_file: app.py
9
- pinned: false
10
  ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: protein_rag
3
+ app_file: demo.py
 
 
4
  sdk: gradio
5
  sdk_version: 5.35.0
 
 
6
  ---
 
 
cal_llm_score.py ADDED
@@ -0,0 +1,328 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import os
3
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
4
+ import json
5
+ from pathlib import Path
6
+ from tqdm import tqdm
7
+ import pandas as pd
8
+ import numpy as np
9
+ from utils.openai_access import call_chatgpt
10
+ from utils.mpr import MultipleProcessRunnerSimplifier
11
+ from utils.prompts import LLM_SCORE_PROMPT
12
+ import re
13
+
14
+ qa_data = {}
15
+
16
def load_qa_results_from_dir(results_dir):
    """Load all QA result files from ``results_dir`` into the global ``qa_data``.

    Every ``*.json`` file directly inside ``results_dir`` is parsed. A file is
    kept only when its top-level value is a JSON object containing the keys
    ``index``, ``protein_id``, ``ground_truth`` and ``llm_answer``; it is then
    stored under ``qa_data[data['index']]``. Files that cannot be read or
    parsed are reported on stdout and skipped.

    Args:
        results_dir: Directory holding the per-question result JSON files.

    Returns:
        The rebuilt global ``qa_data`` dict (index -> result record).
    """
    global qa_data
    qa_data = {}

    results_path = Path(results_dir)
    json_files = list(results_path.glob("*.json"))

    print(f"找到 {len(json_files)} 个结果文件")

    required_keys = ('index', 'protein_id', 'ground_truth', 'llm_answer')
    for json_file in tqdm(json_files, desc="加载QA结果"):
        try:
            # Explicit UTF-8: the records may hold non-ASCII text, and the
            # platform default encoding is locale dependent.
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            if isinstance(data, dict) and all(key in data for key in required_keys):
                qa_data[data['index']] = data
        except (OSError, ValueError, TypeError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError;
            # TypeError covers an unhashable 'index' value.
            print(f"加载文件 {json_file} 时出错: {e}")

    print(f"成功加载 {len(qa_data)} 个QA对")
    return qa_data
38
+
39
def _coerce_score(value):
    """Return ``value`` as a number, or None when it is not numeric."""
    if isinstance(value, bool):
        return None
    if isinstance(value, (int, float)):
        return value
    if isinstance(value, str):
        try:
            return float(value.strip())
        except ValueError:
            return None
    return None


def extract_score_from_response(response):
    """Extract a numeric score from an LLM scoring response.

    A dict response yields its ``'score'`` entry. A string response is
    examined in three steps, each one used only if the previous one failed:

    1. the first flat ``{...}`` fragment containing ``"score"`` is parsed as
       JSON and its ``score`` field is used;
    2. a ``"score": <number>`` fragment is matched directly, which still works
       when the surrounding JSON is malformed;
    3. the first number in the text is used, but only if it lies in 0..100.

    Args:
        response: The raw LLM response (str or dict); may be empty or None.

    Returns:
        The score as int/float, or None when no numeric score can be found.
    """
    if not response:
        return None

    if isinstance(response, dict):
        return _coerce_score(response.get('score'))
    if not isinstance(response, str):
        return None

    # Step 1: a well-formed JSON object carrying a "score" key.
    json_match = re.search(r'\{[^}]*"score"[^}]*\}', response)
    if json_match:
        try:
            json_obj = json.loads(json_match.group())
        except json.JSONDecodeError:
            # Malformed JSON must not abort extraction; fall through to the
            # regex-based fallbacks below.
            json_obj = None
        if isinstance(json_obj, dict):
            return _coerce_score(json_obj.get('score'))

    # Step 2: a bare `"score": <number>` fragment.
    score_match = re.search(r'"score":\s*(\d+(?:\.\d+)?)', response)
    if score_match:
        return float(score_match.group(1))

    # Step 3: first number in the text, accepted only inside the 0-100 scale.
    number_match = re.search(r'(\d+(?:\.\d+)?)', response)
    if number_match:
        score = float(number_match.group(1))
        if 0 <= score <= 100:
            return score

    return None
70
+
71
def process_single_scoring(process_id, idx, qa_index, writer, save_dir):
    """Score one QA pair with the LLM and write the result into ``save_dir``.

    The record is looked up in the module-level ``qa_data``. The
    ``{{ground_truth}}`` and ``{{llm_answer}}`` placeholders of
    ``LLM_SCORE_PROMPT`` are filled in, the prompt is sent through
    ``call_chatgpt``, and the parsed score plus the raw response are saved as
    ``score_<protein_id>_<qa_index>.json``.

    Args:
        process_id: Not used by this function; part of the callback signature
            that ``main`` passes to the multi-process runner.
        idx: Not used by this function.
        qa_index: Key of the QA record inside ``qa_data``.
        writer: Not used by this function.
        save_dir: Directory that receives the per-question score file.

    Any exception is printed and swallowed so that one failing item does not
    stop the remaining items handled by the same worker.
    """
    try:
        qa_item = qa_data[qa_index]
        protein_id = qa_item['protein_id']
        question = qa_item.get('question', '')
        ground_truth = qa_item['ground_truth']
        llm_answer = qa_item['llm_answer']

        # Build the scoring prompt.
        scoring_prompt = LLM_SCORE_PROMPT.replace('{{ground_truth}}', str(ground_truth))
        scoring_prompt = scoring_prompt.replace('{{llm_answer}}', str(llm_answer))

        # Ask the LLM for a score and parse it out of the reply.
        score_response = call_chatgpt(scoring_prompt)
        score = extract_score_from_response(score_response)

        result = {
            'index': qa_index,
            'protein_id': protein_id,
            'question': question,
            'ground_truth': ground_truth,
            'llm_answer': llm_answer,
            'score': score,
            'raw_score_response': score_response
        }

        save_path = os.path.join(save_dir, f"score_{protein_id}_{qa_index}.json")
        # ensure_ascii=False emits raw non-ASCII characters, so the file must
        # be opened as UTF-8 instead of the locale-dependent default.
        with open(save_path, 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=2, ensure_ascii=False)

    except Exception as e:
        print(f"处理QA索引 {qa_index} 时出错: {str(e)}")
106
+
107
def get_missing_score_indices(save_dir):
    """Return the QA indices that still lack a usable score file.

    For every key of the module-level ``qa_data`` the expected file
    ``score_<protein_id>_<qa_index>.json`` in ``save_dir`` is checked. An index
    counts as missing when the file does not exist, cannot be read or parsed,
    is not a JSON object, or has no non-null ``score``. Unusable files are
    deleted so that a later scoring round can recreate them.

    Args:
        save_dir: Directory containing the per-question score files.

    Returns:
        A set of QA indices that need (re)scoring.
    """
    all_qa_indices = list(qa_data.keys())
    problem_qa_indices = set()
    save_path = Path(save_dir)

    for qa_index in tqdm(all_qa_indices, desc="检查打分文件"):
        protein_id = qa_data[qa_index]['protein_id']
        json_file = save_path / f"score_{protein_id}_{qa_index}.json"

        if not json_file.exists():
            problem_qa_indices.add(qa_index)
            continue

        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            has_score = isinstance(data, dict) and data.get('score') is not None
        except (OSError, ValueError):
            # Unreadable or corrupt file (ValueError includes JSON and
            # Unicode decoding errors).
            has_score = False

        if has_score:
            continue

        problem_qa_indices.add(qa_index)
        # Delete only after the file handle is closed; removing an open file
        # fails on some platforms.
        try:
            json_file.unlink()
        except OSError:
            # Best effort: the file will simply be overwritten on re-scoring.
            pass

    return problem_qa_indices
136
+
137
def collect_scores_to_json(save_dir, output_json):
    """Merge all per-question score files into one JSON file and a DataFrame.

    Reads every ``score_*.json`` file in ``save_dir``, keeps the fields
    ``index``, ``protein_id``, ``question``, ``ground_truth``, ``llm_answer``
    and ``score``, sorts the records by ``index`` and writes them to
    ``output_json``. Files that cannot be read are reported and skipped.

    Args:
        save_dir: Directory containing the per-question score files.
        output_json: Path of the combined JSON file to write.

    Returns:
        A pandas DataFrame built from the collected records.
    """
    results = []

    save_path = Path(save_dir)
    score_files = list(save_path.glob("score_*.json"))

    for score_file in tqdm(score_files, desc="收集打分结果"):
        try:
            with open(score_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            results.append({
                'index': data.get('index'),
                'protein_id': data.get('protein_id'),
                'question': data.get('question', ''),
                'ground_truth': data.get('ground_truth'),
                'llm_answer': data.get('llm_answer'),
                'score': data.get('score')
            })
        except (OSError, ValueError, AttributeError) as e:
            # AttributeError: the top-level JSON value is not an object.
            print(f"读取文件 {score_file} 时出错: {e}")

    # Sort by index. The 'index' key is always present (possibly None), so a
    # .get() default would never apply; map None to 0 explicitly to avoid a
    # TypeError when comparing None with numbers.
    results.sort(key=lambda record: 0 if record['index'] is None else record['index'])

    with open(output_json, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    print(f"打分结果已保存到: {output_json}")

    # DataFrame form is used by the statistics step.
    df = pd.DataFrame(results)
    return df
171
+
172
def analyze_scores(df):
    """Print and return summary statistics for the scores in ``df``.

    Rows whose ``score`` is null are excluded from the statistics. The score
    histogram uses the buckets 0-20, 21-40, 41-60, 61-80 and 81-100; each
    bucket excludes its lower bound (except the first, which includes 0), so a
    score on a boundary such as 20 or 40 is counted exactly once.

    Args:
        df: DataFrame with at least the columns ``score`` and ``protein_id``.

    Returns:
        A dict with the keys ``basic_stats``, ``distribution`` and
        ``quantiles``, or None when there is no non-null score.
    """
    print("\n=== 打分结果统计分析 ===")

    valid_scores = df[df['score'].notna()]['score']

    if len(valid_scores) == 0:
        print("没有有效的打分结果")
        return None

    total_samples = len(df)
    valid_count = len(valid_scores)
    invalid_count = total_samples - valid_count
    valid_rate = valid_count / total_samples * 100

    print(f"总样本数: {total_samples}")
    print(f"有效打分数: {valid_count}")
    print(f"无效打分数: {invalid_count}")
    print(f"有效率: {valid_rate:.2f}%")

    mean_score = float(valid_scores.mean())
    median_score = float(valid_scores.median())
    std_score = float(valid_scores.std())
    max_score = float(valid_scores.max())
    min_score = float(valid_scores.min())

    print("\n分数统计:")
    print(f"平均分: {mean_score:.2f}")
    print(f"中位数: {median_score:.2f}")
    print(f"标准差: {std_score:.2f}")
    print(f"最高分: {max_score:.2f}")
    print(f"最低分: {min_score:.2f}")

    # Score distribution, computed once and reused for printing and output.
    print("\n分数分布:")
    bins = [0, 20, 40, 60, 80, 100]
    labels = ['0-20', '21-40', '41-60', '61-80', '81-100']

    distribution = {}
    for i, (low, high) in enumerate(zip(bins[:-1], bins[1:])):
        # Only the first bucket includes its lower bound; otherwise boundary
        # scores would be counted in two adjacent buckets.
        above_low = valid_scores >= low if i == 0 else valid_scores > low
        count = int((above_low & (valid_scores <= high)).sum())
        percentage = count / valid_count * 100
        distribution[labels[i]] = {
            "count": count,
            "percentage": percentage
        }
        print(f"{labels[i]}: {count} ({percentage:.1f}%)")

    # Quantiles.
    print("\n分位数:")
    quantile_values = {}
    for q in [0.1, 0.25, 0.5, 0.75, 0.9]:
        value = float(valid_scores.quantile(q))
        quantile_values[f"{int(q*100)}%"] = value
        print(f"{int(q*100)}%分位数: {value:.2f}")

    # Per-protein breakdown, shown only when more than one protein is present.
    if len(df['protein_id'].unique()) > 1:
        print("\n按蛋白质ID分析:")
        protein_stats = df[df['score'].notna()].groupby('protein_id')['score'].agg(['count', 'mean', 'std']).round(2)
        print(protein_stats.head(10))

    stats_result = {
        "basic_stats": {
            "total_samples": total_samples,
            "valid_scores": valid_count,
            "invalid_scores": invalid_count,
            "valid_rate": valid_rate,
            "mean_score": mean_score,
            "median_score": median_score,
            "std_score": std_score,
            "max_score": max_score,
            "min_score": min_score
        },
        "distribution": distribution,
        "quantiles": quantile_values
    }

    return stats_result
248
+
249
def main():
    """Command-line entry point: score QA answers with an LLM and summarize.

    Steps:
      1. load the QA results from ``--results_dir``;
      2. for up to ``--max_iterations`` rounds, score every QA pair that has
         no usable score file yet, using ``--n_process`` workers;
      3. merge the score files into ``--output_json``;
      4. compute statistics and write them to ``--stats_json``;
      5. report how many QA pairs are still unscored.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--results_dir", type=str,
                        default="data/evolla_hard_motif_go",
                        help="包含LLM答案结果的目录")
    parser.add_argument("--n_process", type=int, default=32,
                        help="并行进程数")
    parser.add_argument("--save_dir", type=str,
                        default="data/llm_scores",
                        help="保存打分结果的目录")
    parser.add_argument("--output_json", type=str,
                        default="data/llm_scores_results.json",
                        help="输出JSON文件路径")
    parser.add_argument("--stats_json", type=str,
                        default="data/llm_scores_stats.json",
                        help="统计分析结果JSON文件路径")
    parser.add_argument("--max_iterations", type=int, default=3,
                        help="最大迭代次数")
    args = parser.parse_args()

    # Create the output locations. dirname() is '' for a bare file name, and
    # os.makedirs('') raises, so only create non-empty parent directories.
    os.makedirs(args.save_dir, exist_ok=True)
    for json_path in (args.output_json, args.stats_json):
        parent_dir = os.path.dirname(json_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    # Load the QA results into the module-level qa_data.
    load_qa_results_from_dir(args.results_dir)

    if not qa_data:
        print("没有找到有效的QA结果数据")
        return

    # Score in rounds; each round retries only what is still missing.
    for iteration in range(1, args.max_iterations + 1):
        print(f"\n开始第 {iteration} 轮打分")

        missing_indices = get_missing_score_indices(args.save_dir)

        if not missing_indices:
            print("所有QA对已完成打分!")
            break

        print(f"发现 {len(missing_indices)} 个待打分的QA对")

        missing_indices_list = sorted(missing_indices)

        # Fan the scoring out over several processes.
        mprs = MultipleProcessRunnerSimplifier(
            data=missing_indices_list,
            do=lambda process_id, idx, qa_index, writer: process_single_scoring(process_id, idx, qa_index, writer, args.save_dir),
            n_process=args.n_process,
            split_strategy="static"
        )
        mprs.run()

        print(f"第 {iteration} 轮打分完成")

    # Merge the per-question files into one JSON file.
    df = collect_scores_to_json(args.save_dir, args.output_json)

    # Compute statistics (None when there is no valid score).
    stats_result = analyze_scores(df)

    with open(args.stats_json, 'w', encoding='utf-8') as f:
        json.dump(stats_result, f, indent=2, ensure_ascii=False)
    print(f"统计分析结果已保存到: {args.stats_json}")

    # Final check for anything that is still unscored.
    final_missing = get_missing_score_indices(args.save_dir)
    if final_missing:
        print(f"\n仍有 {len(final_missing)} 个QA对未能成功打分")
    else:
        print(f"\n所有 {len(qa_data)} 个QA对已成功完成打分!")
326
+
327
+ if __name__ == "__main__":
328
+ main()
calculate_ec_accuracy.py ADDED
@@ -0,0 +1,312 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pickle
2
+ import json
3
+ import os
4
+ import sys
5
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
6
+ import re
7
+ from collections import defaultdict
8
+
9
def load_ground_truth(pkl_file):
    """Load the ground-truth EC numbers of every protein from a pickle file.

    The pickle is iterated as a sequence of records. Each record must have a
    ``uniprot_id`` and may have an ``ec`` list whose entries carry the EC
    number under ``entry['reaction']['ecNumber']``. Entries without that path
    are ignored, and a missing or null ``ec`` field yields an empty set.

    Args:
        pkl_file: Path of the pickle file with the ground-truth records.

    Returns:
        Dict mapping each ``uniprot_id`` to the set of its EC numbers.
    """
    # SECURITY NOTE: pickle.load can execute arbitrary code contained in the
    # file. Only use this on ground-truth files from a trusted source.
    with open(pkl_file, 'rb') as f:
        data = pickle.load(f)

    gt_dict = {}
    for item in data:
        # `or []` also covers records that store an explicit None under 'ec'.
        ec_entries = item.get('ec') or []
        # A set removes duplicate EC numbers within one protein.
        gt_dict[item['uniprot_id']] = {
            ec_info['reaction']['ecNumber']
            for ec_info in ec_entries
            if 'reaction' in ec_info and 'ecNumber' in ec_info['reaction']
        }

    return gt_dict
29
+
30
def extract_ec_prediction(json_content):
    """Return the EC numbers that follow the first ``[EC_PREDICTION]`` tag.

    Whitespace after the tag is skipped, then the text up to the next line
    break is scanned. Both complete EC numbers (``1.2.3.4``) and partial ones
    with dashes in the later fields (``1.2.3.-``, ``1.2.-.-``, ``1.-.-.-``)
    are recognized.

    Args:
        json_content: Raw text of a prediction file.

    Returns:
        List of EC number strings in order of appearance; empty when the tag
        is absent or no EC number follows it.
    """
    tag_match = re.search(r'\[EC_PREDICTION\]\s*([^\n\r]*)', json_content)
    if not tag_match:
        return []

    tagged_text = tag_match.group(1).strip()
    # First field must be a number; each later field is a number or '-'.
    return re.findall(r'\b\d+\.(?:\d+|-)\.(?:\d+|-)\.(?:\d+|-)', tagged_text)
45
+
46
def load_predictions(predictions_dir):
    """Load the predicted EC numbers from every ``.json`` file in a directory.

    The file name without its ``.json`` extension is used as the UniProt ID.
    Each file is read as plain text and passed to ``extract_ec_prediction``.
    Files that cannot be read are reported on stdout and skipped.

    Args:
        predictions_dir: Directory containing one prediction file per protein.

    Returns:
        Dict mapping UniProt ID to the list of predicted EC numbers.
    """
    predictions = {}

    for filename in os.listdir(predictions_dir):
        if not filename.endswith('.json'):
            continue

        # Strip only the trailing extension; str.replace would also remove a
        # '.json' that occurs earlier in the name.
        uniprot_id = os.path.splitext(filename)[0]
        filepath = os.path.join(predictions_dir, filename)

        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                content = f.read()
        except (OSError, UnicodeDecodeError) as e:
            print(f"处理文件 {filename} 时出错: {e}")
            continue

        predictions[uniprot_id] = extract_ec_prediction(content)

    return predictions
67
+
68
def calculate_accuracy(ground_truth, predictions, level=4):
    """Compute top-1 EC accuracy at the given EC level.

    Only proteins with a non-empty prediction list are scored. The first
    predicted EC number counts as correct when its first ``level`` fields
    equal those of any ground-truth EC number of that protein.

    Args:
        ground_truth: Dict mapping UniProt ID to an iterable of EC numbers.
        predictions: Dict mapping UniProt ID to a list of predicted EC numbers.
        level: Number of leading EC fields to compare (1-4).

    Returns:
        Tuple ``(accuracy, correct, total)``; accuracy is 0 when no protein
        was scored.
    """
    hits = 0
    scored = 0

    for uniprot_id, gt_ecs in ground_truth.items():
        if uniprot_id not in predictions or not predictions[uniprot_id]:
            continue

        # Only the first prediction is evaluated.
        top_prefix = predictions[uniprot_id][0].split('.')[:level]
        if any(gt_ec.split('.')[:level] == top_prefix for gt_ec in gt_ecs):
            hits += 1
        scored += 1

    accuracy = hits / scored if scored > 0 else 0
    return accuracy, hits, scored
100
+
101
def _truncate_ecs(ecs, level):
    """Return the set of EC numbers in ``ecs`` cut to their first ``level`` fields."""
    return {'.'.join(ec.split('.')[:level]) for ec in ecs}


def calculate_prf1(ground_truth, predictions, level=4):
    """Compute micro-averaged precision, recall and F1 at the given EC level.

    EC numbers are truncated to their first ``level`` fields (for example
    ``1.2.3.4`` becomes ``1.2.3`` at level 3) and compared as sets per
    protein. Proteins with an empty ground-truth set are skipped. A protein
    that is absent from ``predictions`` or has an empty prediction list
    contributes all of its ground-truth EC numbers as false negatives.

    Args:
        ground_truth: Dict mapping UniProt ID to a set of EC numbers.
        predictions: Dict mapping UniProt ID to a list of predicted EC numbers.
        level: Number of leading EC fields to compare (1-4).

    Returns:
        Dict with ``precision``, ``recall``, ``f1_score``, the summed ``tp``,
        ``fp`` and ``fn`` counts, ``evaluated_proteins`` (proteins with
        ground truth that also appear in ``predictions``) and
        ``incorrect_proteins``, which holds the per-protein error details.
        EC lists in the details are sorted so the output is reproducible.
    """
    total_tp = 0
    total_fp = 0
    total_fn = 0
    evaluated_proteins = 0

    incorrect_proteins = {
        'false_positives': [],  # predicted but not in the ground truth
        'false_negatives': [],  # in the ground truth but not predicted
        'no_prediction': [],    # ground truth exists, protein not in predictions
        'zero_prediction': []   # protein in predictions with zero EC numbers
    }

    for uniprot_id, gt_ecs_set in ground_truth.items():
        # Proteins without ground truth are not evaluated.
        if not gt_ecs_set:
            continue

        level_gt = _truncate_ecs(gt_ecs_set, level)

        if uniprot_id not in predictions:
            total_fn += len(level_gt)
            incorrect_proteins['no_prediction'].append({
                'protein_id': uniprot_id,
                'gt_ecs': sorted(level_gt)
            })
            continue

        evaluated_proteins += 1
        level_pred = _truncate_ecs(predictions[uniprot_id], level)

        if not level_pred:
            total_fn += len(level_gt)
            incorrect_proteins['zero_prediction'].append({
                'protein_id': uniprot_id,
                'gt_ecs': sorted(level_gt)
            })
            continue

        fp_ecs = level_pred - level_gt
        fn_ecs = level_gt - level_pred

        total_tp += len(level_pred & level_gt)
        total_fp += len(fp_ecs)
        total_fn += len(fn_ecs)

        if fp_ecs:
            incorrect_proteins['false_positives'].append({
                'protein_id': uniprot_id,
                'predicted_ecs': sorted(fp_ecs),
                'gt_ecs': sorted(level_gt)
            })

        if fn_ecs:
            incorrect_proteins['false_negatives'].append({
                'protein_id': uniprot_id,
                'missed_ecs': sorted(fn_ecs),
                'predicted_ecs': sorted(level_pred)
            })

    # Micro-average: ratios are computed over the summed counts.
    precision = total_tp / (total_tp + total_fp) if (total_tp + total_fp) > 0 else 0
    recall = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return {
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'tp': total_tp,
        'fp': total_fp,
        'fn': total_fn,
        'evaluated_proteins': evaluated_proteins,
        'incorrect_proteins': incorrect_proteins
    }
201
+
202
def main():
    """Command-line entry point: evaluate EC predictions against ground truth.

    Loads the ground truth from ``--pkl_file`` and the predictions from
    ``--predictions_dir``, keeps the proteins present in both that have at
    least one ground-truth EC number, prints precision/recall/F1 and error
    examples for EC levels 1 to 4, prints how many EC numbers each protein
    has, and saves the metrics to ``test_results/ec_accuracy_results.json``.
    """
    import argparse
    parser = argparse.ArgumentParser(description='Calculate EC accuracy')
    parser.add_argument('--pkl_file', type=str, default='data/raw_data/difference_20241122_ec_dict_list.pkl')
    parser.add_argument('--predictions_dir', type=str, default='data/clean_test_results_top2go_deepseek-r1')
    args = parser.parse_args()
    pkl_file = args.pkl_file
    predictions_dir = args.predictions_dir

    print("正在加载ground truth数据...")
    ground_truth = load_ground_truth(pkl_file)
    print(f"加载了 {len(ground_truth)} 个蛋白的ground truth数据")

    print("正在加载预测结果...")
    predictions = load_predictions(predictions_dir)
    print(f"加载了 {len(predictions)} 个蛋白的预测结果")

    # print(f"predictions: {predictions}")
    # print(f"ground_truth: {ground_truth}")

    # Proteins present on both sides; only those with ground-truth ECs count.
    common_ids = set(ground_truth.keys()) & set(predictions.keys())
    valid_ids = {uid for uid in common_ids if ground_truth[uid]}
    print(f"共同且有GT的蛋白数量: {len(valid_ids)}")

    filtered_gt = {uid: ground_truth[uid] for uid in valid_ids}
    filtered_pred = {uid: predictions[uid] for uid in valid_ids}

    # Precision / recall / F1 for each EC level.
    results = {}
    print("\n=== 评估结果 ===")
    for level in [1, 2, 3, 4]:
        metrics = calculate_prf1(filtered_gt, filtered_pred, level=level)
        results[level] = metrics
        print(f"--- EC号前{level}级 ---")
        print(f" Precision: {metrics['precision']:.4f}")
        print(f" Recall: {metrics['recall']:.4f}")
        print(f" F1-Score: {metrics['f1_score']:.4f}")
        print(f" (TP: {metrics['tp']}, FP: {metrics['fp']}, FN: {metrics['fn']})")

        # Show the proteins that were predicted incorrectly.
        incorrect = metrics['incorrect_proteins']

        if incorrect['false_positives']:
            print(f" 假阳性错误 ({len(incorrect['false_positives'])}个蛋白):")
            for item in incorrect['false_positives'][:10]:  # first 10 only
                print(f" {item['protein_id']}: 错误预测了 {item['predicted_ecs']}, GT是 {item['gt_ecs']}")
            if len(incorrect['false_positives']) > 10:
                print(f" ... 还有 {len(incorrect['false_positives']) - 10} 个")

        if incorrect['false_negatives']:
            print(f" 假阴性错误 ({len(incorrect['false_negatives'])}个蛋白):")
            for item in incorrect['false_negatives'][:10]:  # first 10 only
                print(f" {item['protein_id']}: 漏掉了 {item['missed_ecs']}, 预测了 {item['predicted_ecs']}")
            if len(incorrect['false_negatives']) > 10:
                print(f" ... 还有 {len(incorrect['false_negatives']) - 10} 个")

        if incorrect['zero_prediction']:
            print(f" 零预测错误 ({len(incorrect['zero_prediction'])}个蛋白):")
            for item in incorrect['zero_prediction']:
                print(f" {item['protein_id']}: GT是 {item['gt_ecs']}, 但预测了0个EC号")

        if incorrect['no_prediction']:
            print(f" 无预测错误 ({len(incorrect['no_prediction'])}个蛋白):")
            for item in incorrect['no_prediction'][:10]:  # first 10 only
                print(f" {item['protein_id']}: GT是 {item['gt_ecs']}, 但没有预测")
            if len(incorrect['no_prediction']) > 10:
                print(f" ... 还有 {len(incorrect['no_prediction']) - 10} 个")

        print()  # blank line between levels

    print("\n=== 详细统计信息 ===")

    # How many EC numbers each protein has in the ground truth.
    gt_ec_counts = defaultdict(int)
    for ecs in filtered_gt.values():
        gt_ec_counts[len(ecs)] += 1

    print("Ground truth中EC号数量分布:")
    for count, freq in sorted(gt_ec_counts.items()):
        print(f" {count}个EC号: {freq}个蛋白")

    # How many EC numbers each protein has in the predictions.
    pred_ec_counts = defaultdict(int)
    for ecs in filtered_pred.values():
        pred_ec_counts[len(ecs)] += 1

    print("\n预测结果中EC号数量分布:")
    for count, freq in sorted(pred_ec_counts.items()):
        print(f" {count}个EC号: {freq}个蛋白")

    # Save the results. Create the output directory first so that a missing
    # folder does not discard the finished evaluation.
    output_file = 'test_results/ec_accuracy_results.json'
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    # # Save the ground truth
    # with open('test_results/ground_truth.json', 'w', encoding='utf-8') as f:
    #     json.dump(filtered_gt, f, indent=2, ensure_ascii=False)

    # # Save the predictions
    # with open('test_results/predictions.json', 'w', encoding='utf-8') as f:
    #     json.dump(filtered_pred, f, indent=2, ensure_ascii=False)

    print(f"\n结果已保存到 {output_file}")
310
+
311
+ if __name__ == "__main__":
312
+ main()
demo.py ADDED
@@ -0,0 +1,495 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import sys
4
+ import tempfile
5
+ import gradio as gr
6
+ from typing import Dict, List, Optional
7
+ from pathlib import Path
8
+ from Bio import SeqIO
9
+ from io import StringIO
10
+
11
+ # 添加必要的路径
12
+ root_path = os.path.dirname(os.path.abspath(__file__))
13
+ sys.path.append(root_path)
14
+ sys.path.append(os.path.join(root_path, "Models/ProTrek"))
15
+
16
+ # 导入所需模块
17
+ from interproscan import InterproScan
18
+ from Bio.Blast.Applications import NcbiblastpCommandline
19
+ from utils.utils import extract_interproscan_metrics, get_seqnid, extract_blast_metrics, rename_interproscan_keys
20
+ from go_integration_pipeline import GOIntegrationPipeline
21
+ from utils.openai_access import call_chatgpt
22
+ from utils.prompts import FUNCTION_PROMPT
23
+
24
+ def get_prompt_template(selected_info_types=None):
25
+ """
26
+ 获取prompt模板,支持可选的信息类型
27
+
28
+ Args:
29
+ selected_info_types: 需要包含的信息类型列表,如['motif', 'go', 'superfamily', 'panther']
30
+ """
31
+ if selected_info_types is None:
32
+ selected_info_types = ['motif', 'go'] # 默认包含motif和go信息
33
+
34
+ PROMPT_TEMPLATE = FUNCTION_PROMPT + '\n' + """
35
+ input information:
36
+
37
+ {%- if 'motif' in selected_info_types and motif_pfam %}
38
+
39
+ motif:{% for motif_id, motif_info in motif_pfam.items() %}
40
+ {{motif_id}}: {{motif_info}}
41
+ {% endfor %}
42
+ {%- endif %}
43
+
44
+ {%- if 'go' in selected_info_types and go_data.status == 'success' %}
45
+
46
+ GO:{% for go_entry in go_data.go_annotations %}
47
+ ▢ GO term{{loop.index}}: {{go_entry.go_id}}
48
+ • definition: {{ go_data.all_related_definitions.get(go_entry.go_id, 'not found definition') }}
49
+ {% endfor %}
50
+ {%- endif %}
51
+
52
+ {%- for info_type in selected_info_types %}
53
+ {%- if info_type not in ['motif', 'go'] and interpro_descriptions.get(info_type) %}
54
+
55
+ {{info_type}}:{% for ipr_id, ipr_info in interpro_descriptions[info_type].items() %}
56
+ ▢ {{ipr_id}}: {{ipr_info.name}}
57
+ • description: {{ipr_info.abstract}}
58
+ {% endfor %}
59
+ {%- endif %}
60
+ {%- endfor %}
61
+
62
+ question: \n {{question}}
63
+ """
64
+
65
+ return PROMPT_TEMPLATE
66
+
67
+ class ProteinAnalysisDemo:
68
+ def __init__(self):
69
+ """
70
+ 蛋白质分析演示类
71
+ """
72
+ self.blast_database = "uniprot_swissprot"
73
+ self.expect_value = 0.01
74
+ self.interproscan_path = "interproscan/interproscan-5.75-106.0/interproscan.sh"
75
+ self.interproscan_libraries = [
76
+ "PFAM", "PIRSR", "PROSITE_PROFILES", "SUPERFAMILY", "PRINTS",
77
+ "PANTHER", "CDD", "GENE3D", "NCBIFAM", "SFLM", "MOBIDB_LITE",
78
+ "COILS", "PROSITE_PATTERNS", "FUNFAM", "SMART"
79
+ ]
80
+ self.go_topk = 2
81
+ self.selected_info_types = ['motif', 'go']
82
+
83
+ # 文件路径配置
84
+ self.pfam_descriptions_path = 'data/raw_data/all_pfam_descriptions.json'
85
+ self.go_info_path = 'data/raw_data/go.json'
86
+ self.interpro_data_path = 'data/raw_data/interpro_data.json'
87
+
88
+ # 初始化GO整合管道
89
+ self.go_pipeline = GOIntegrationPipeline(topk=self.go_topk)
90
+
91
+ # 初始化InterPro管理器(如果需要)
92
+ self.interpro_manager = None
93
+ other_types = [t for t in self.selected_info_types if t not in ['motif', 'go']]
94
+ if other_types and os.path.exists(self.interpro_data_path):
95
+ try:
96
+ from utils.generate_protein_prompt import get_interpro_manager
97
+ self.interpro_manager = get_interpro_manager(self.interpro_data_path, None)
98
+ except Exception as e:
99
+ print(f"初始化InterPro管理器失败: {str(e)}")
100
+
101
+ def validate_protein_sequence(self, sequence: str) -> bool:
102
+ """
103
+ 验证蛋白质序列格式
104
+ """
105
+ if not sequence:
106
+ return False
107
+
108
+ # 移除空白字符
109
+ sequence = sequence.strip().upper()
110
+
111
+ # 检查是否包含有效的氨基酸字符
112
+ valid_aa = set('ACDEFGHIKLMNPQRSTVWY')
113
+ sequence_chars = set(sequence.replace('\n', '').replace(' ', ''))
114
+
115
+ return sequence_chars.issubset(valid_aa) and len(sequence) > 0
116
+
117
+ def parse_fasta_content(self, fasta_content: str) -> tuple:
118
+ """
119
+ 解析FASTA内容,返回第一个序列
120
+ """
121
+ try:
122
+ fasta_io = StringIO(fasta_content)
123
+ records = list(SeqIO.parse(fasta_io, "fasta"))
124
+
125
+ if not records:
126
+ return None, "FASTA文件中没有找到有效的序列"
127
+
128
+ if len(records) > 1:
129
+ return None, "演示版本只支持单一序列,检测到多个序列"
130
+
131
+ record = records[0]
132
+ return str(record.seq), f"成功解析序列 ID: {record.id}"
133
+
134
+ except Exception as e:
135
+ return None, f"解析FASTA文件出错: {str(e)}"
136
+
137
+ def create_temp_fasta(self, sequence: str, seq_id: str = "demo_protein") -> str:
138
+ """
139
+ 创建临时FASTA文件
140
+ """
141
+ temp_file = tempfile.NamedTemporaryFile(mode='w', suffix='.fasta', delete=False)
142
+ temp_file.write(f">{seq_id}\n{sequence}\n")
143
+ temp_file.close()
144
+ return temp_file.name
145
+
146
+ def run_blast_analysis(self, fasta_file: str, temp_dir: str) -> Dict:
147
+ """
148
+ 运行BLAST分析
149
+ """
150
+ blast_xml = os.path.join(temp_dir, "blast_results.xml")
151
+
152
+ try:
153
+ blast_cmd = NcbiblastpCommandline(
154
+ query=fasta_file,
155
+ db=self.blast_database,
156
+ out=blast_xml,
157
+ outfmt=5, # XML格式
158
+ evalue=self.expect_value
159
+ )
160
+ blast_cmd()
161
+
162
+ # 提取BLAST结果
163
+ blast_results = extract_blast_metrics(blast_xml)
164
+
165
+ # 获取序列字典
166
+ seq_dict = get_seqnid(fasta_file)
167
+
168
+ blast_info = {}
169
+ for uid, info in blast_results.items():
170
+ blast_info[uid] = {"sequence": seq_dict[uid], "blast_results": info}
171
+
172
+ return blast_info
173
+
174
+ except Exception as e:
175
+ print(f"BLAST分析出错: {str(e)}")
176
+ return {}
177
+ finally:
178
+ if os.path.exists(blast_xml):
179
+ os.remove(blast_xml)
180
+
181
+ def run_interproscan_analysis(self, fasta_file: str, temp_dir: str) -> Dict:
182
+ """
183
+ 运行InterProScan分析
184
+ """
185
+ interproscan_json = os.path.join(temp_dir, "interproscan_results.json")
186
+
187
+ try:
188
+ interproscan = InterproScan(self.interproscan_path)
189
+ input_args = {
190
+ "fasta_file": fasta_file,
191
+ "goterms": True,
192
+ "pathways": True,
193
+ "save_dir": interproscan_json
194
+ }
195
+ interproscan.run(**input_args)
196
+
197
+ # 提取InterProScan结果
198
+ interproscan_results = extract_interproscan_metrics(
199
+ interproscan_json,
200
+ librarys=self.interproscan_libraries
201
+ )
202
+
203
+ # 获取序列字典
204
+ seq_dict = get_seqnid(fasta_file)
205
+
206
+ interproscan_info = {}
207
+ for id, seq in seq_dict.items():
208
+ info = interproscan_results[seq]
209
+ info = rename_interproscan_keys(info)
210
+ interproscan_info[id] = {"sequence": seq, "interproscan_results": info}
211
+
212
+ return interproscan_info
213
+
214
+ except Exception as e:
215
+ print(f"InterProScan分析出错: {str(e)}")
216
+ return {}
217
+ finally:
218
+ if os.path.exists(interproscan_json):
219
+ os.remove(interproscan_json)
220
+
221
def generate_prompt(self, protein_id: str, interproscan_info: Dict,
                    protein_go_dict: Dict, question: str) -> str:
    """
    Build the LLM prompt for one protein from in-memory analysis results.

    The prompt combines GO annotations (with their definitions), Pfam motif
    descriptions and optional InterPro descriptions, rendered through a
    Jinja2 template.

    Args:
        protein_id: Key used to look the protein up in both dictionaries.
        interproscan_info: Mapping ``protein_id -> {"interproscan_results":
            {"pfam_id": [{pfam_id: ipr_id}, ...], ...}}``.
        protein_go_dict: Mapping ``protein_id -> list of GO ids``.
        question: The user's question, passed to the template unchanged.

    Returns:
        The rendered prompt. If anything in the main path raises, the error
        is printed and the result of ``_generate_fallback_prompt`` is
        returned instead.
    """
    try:
        from utils.protein_go_analysis import get_go_definition
        from jinja2 import Template

        # --- GO annotations -------------------------------------------
        # ``or []`` also tolerates an explicit None stored for the protein.
        go_ids = protein_go_dict.get(protein_id) or []
        go_annotations = []
        all_related_definitions = {}

        # The GO info path does not change while iterating, so probe the
        # filesystem once instead of once per GO id.
        go_info_available = bool(go_ids) and os.path.exists(self.go_info_path)

        for go_id in go_ids:
            # Normalise "GO:0004497" to the bare numeric id "0004497".
            clean_go_id = go_id.split(":")[-1] if ":" in go_id else go_id
            go_annotations.append({"go_id": clean_go_id})

            if go_info_available:
                definition = get_go_definition(clean_go_id, self.go_info_path)
                if definition:
                    all_related_definitions[clean_go_id] = definition

        # --- Pfam motif descriptions ------------------------------------
        motif_pfam = {}
        if os.path.exists(self.pfam_descriptions_path):
            try:
                # A protein without InterProScan output simply has no motifs;
                # that is not an error worth reporting.
                protein_entry = interproscan_info.get(protein_id, {})
                interproscan_results = protein_entry.get('interproscan_results', {})
                pfam_entries = interproscan_results.get('pfam_id', [])

                # Only read the description table when there is something
                # to look up.
                if pfam_entries:
                    with open(self.pfam_descriptions_path, 'r', encoding='utf-8') as f:
                        pfam_descriptions = json.load(f)

                    # Each entry maps a Pfam id to an InterPro id; only the
                    # Pfam id is needed here.
                    for entry in pfam_entries:
                        for pfam_id in entry:
                            if pfam_id and pfam_id in pfam_descriptions:
                                motif_pfam[pfam_id] = pfam_descriptions[pfam_id]['description']

            except Exception as e:
                # Best effort: a broken description file must not prevent
                # the rest of the prompt from being generated.
                print(f"获取motif信息时出错: {str(e)}")

        # --- InterPro descriptions for the remaining info types ---------
        interpro_descriptions = {}
        other_types = [t for t in self.selected_info_types if t not in ['motif', 'go']]
        if other_types and self.interpro_manager:
            interpro_descriptions = self.interpro_manager.get_description(protein_id, other_types)

        template_data = {
            "protein_id": protein_id,
            "selected_info_types": self.selected_info_types,
            "go_data": {
                "status": "success" if go_annotations else "no_data",
                "go_annotations": go_annotations,
                "all_related_definitions": all_related_definitions
            },
            "motif_pfam": motif_pfam,
            "interpro_descriptions": interpro_descriptions,
            "question": question
        }

        # NOTE(review): get_prompt_template is not imported inside this
        # function; it has to be available at module level. If it is not,
        # the resulting NameError sends every call to the fallback prompt
        # below - confirm against the file's imports.
        PROMPT_TEMPLATE = get_prompt_template(self.selected_info_types)  # demo version does not use lmdb
        template = Template(PROMPT_TEMPLATE)
        return template.render(**template_data)

    except Exception as e:
        print(f"生成prompt时出错 (protein_id: {protein_id}): {str(e)}")
        # Degrade to the simplified prompt rather than failing the request.
        return self._generate_fallback_prompt(protein_id, interproscan_info, protein_go_dict, question)
298
+
299
def _generate_fallback_prompt(self, protein_id: str, interproscan_info: Dict,
                              protein_go_dict: Dict, question: str) -> str:
    """
    Build a simplified prompt, used when the main template path fails.

    Only placeholder text is emitted for motif and GO definitions; no
    description files are read here.

    Args:
        protein_id: Key used to look the protein up in both dictionaries.
        interproscan_info: Mapping ``protein_id -> {"interproscan_results":
            {"pfam_id": [{pfam_id: ipr_id}, ...]}}``.
        protein_go_dict: Mapping ``protein_id -> list of GO ids``.
        question: The user's question, appended at the end of the prompt.

    Returns:
        The prompt sections joined with newlines.
    """
    from utils.prompts import FUNCTION_PROMPT

    prompt_parts = [FUNCTION_PROMPT]
    prompt_parts.append("\ninput information:")

    # Motif section
    if 'motif' in self.selected_info_types:
        # This is the error-recovery path, so a protein missing from the
        # InterProScan results must not raise KeyError here.
        protein_entry = interproscan_info.get(protein_id, {})
        interproscan_results = protein_entry.get('interproscan_results', {})
        pfam_entries = interproscan_results.get('pfam_id', [])

        if pfam_entries:
            prompt_parts.append("\nmotif:")
            for entry in pfam_entries:
                # The label printed is the mapped value of each entry.
                for value in entry.values():
                    if value:
                        prompt_parts.append(f"{value}: motif information")

    # GO section (at most the first ten terms)
    if 'go' in self.selected_info_types:
        go_ids = protein_go_dict.get(protein_id) or []
        if go_ids:
            prompt_parts.append("\nGO:")
            for i, go_id in enumerate(go_ids[:10], 1):
                prompt_parts.append(f"▢ GO term{i}: {go_id}")
                prompt_parts.append("• definition: GO term definition")

    # User question
    prompt_parts.append(f"\nquestion: \n{question}")

    return "\n".join(prompt_parts)
334
+
335
def analyze_protein(self, sequence_input: str, fasta_file, question: str) -> str:
    """
    Analyse a protein sequence and answer the user's question about it.

    An uploaded FASTA file takes precedence over the text box. The sequence
    is run through BLAST and InterProScan, GO terms are merged, a prompt is
    generated and sent to the LLM.

    Args:
        sequence_input: Raw sequence typed by the user (may be empty/None).
        fasta_file: Uploaded FASTA file, or None. Accepted as a path
            (``str`` / ``os.PathLike``), a file-like object with ``read()``,
            or an object exposing the path as ``.name``.
        question: The question to answer; must not be blank.

    Returns:
        A human-readable report. Validation and runtime problems are
        returned as message strings rather than raised.
    """
    if not question or not question.strip():
        return "请输入您的问题"

    # Decide which sequence source to use
    final_sequence = None
    sequence_source = ""

    if fasta_file is not None:
        # An uploaded file wins over the text box.
        try:
            # Gradio's File component may hand over a path string (the
            # default in recent versions) or a file-like object, so all
            # shapes are supported instead of assuming a binary stream.
            if isinstance(fasta_file, (str, os.PathLike)):
                with open(fasta_file, 'r', encoding='utf-8') as handle:
                    fasta_content = handle.read()
            elif hasattr(fasta_file, 'read'):
                raw = fasta_file.read()
                fasta_content = raw.decode('utf-8') if isinstance(raw, bytes) else raw
            else:
                with open(fasta_file.name, 'r', encoding='utf-8') as handle:
                    fasta_content = handle.read()

            final_sequence, parse_msg = self.parse_fasta_content(fasta_content)
            if final_sequence is None:
                return f"文件解析错误: {parse_msg}"
            sequence_source = f"来自上传文件: {parse_msg}"
        except Exception as e:
            return f"读取上传文件出错: {str(e)}"
    elif sequence_input and sequence_input.strip():
        # Sequence typed into the text box
        if self.validate_protein_sequence(sequence_input):
            final_sequence = sequence_input.strip().upper().replace('\n', '').replace(' ', '')
            sequence_source = "来自文本框输入"
        else:
            return "输入的序列格式不正确,请输入有效的蛋白质序列"
    else:
        return "请输入蛋白质序列或上传FASTA文件"

    # Scratch directory for tool output; removed automatically on exit.
    with tempfile.TemporaryDirectory() as temp_dir:
        # Defined before the try so the finally clause can test it directly.
        temp_fasta = None
        try:
            temp_fasta = self.create_temp_fasta(final_sequence, "demo_protein")

            status_msg = f"序列来源: {sequence_source}\n序列长度: {len(final_sequence)} 氨基酸\n\n正在进行分析...\n"

            # Step 1: BLAST
            status_msg += "步骤1: 运行BLAST分析...\n"
            blast_info = self.run_blast_analysis(temp_fasta, temp_dir)

            # Step 2: InterProScan
            status_msg += "步骤2: 运行InterProScan分析...\n"
            interproscan_info = self.run_interproscan_analysis(temp_fasta, temp_dir)

            if not blast_info or not interproscan_info:
                return status_msg + "分析失败: 无法获取BLAST或InterProScan结果"

            # Step 3: merge GO information
            status_msg += "步骤3: 整合GO信息...\n"
            protein_go_dict = self.go_pipeline.first_level_filtering(interproscan_info, blast_info)

            # Step 4: build the prompt
            status_msg += "步骤4: 生成分析prompt...\n"
            protein_id = "demo_protein"
            prompt = self.generate_prompt(protein_id, interproscan_info, protein_go_dict, question)

            # Step 5: ask the LLM
            status_msg += "步骤5: 生成答案...\n"
            llm_response = call_chatgpt(prompt)

            # Summary counters for the report footer
            blast_hits = len(blast_info.get(protein_id, {}).get('blast_results', []))
            pfam_domains = len(
                interproscan_info.get(protein_id, {})
                .get('interproscan_results', {})
                .get('pfam_id', [])
            )
            go_terms = len(protein_go_dict.get(protein_id, []))

            result = (
                f"\n{status_msg}\n\n"
                "=== 分析完成 ===\n\n"
                f"问题: {question}\n\n"
                f"答案: {llm_response}\n\n"
                "=== 分析详情 ===\n"
                f"- BLAST匹配数: {blast_hits}\n"
                f"- InterProScan域数: {pfam_domains}\n"
                f"- GO术语数: {go_terms}\n"
            )

            return result

        except Exception as e:
            return f"分析过程中出错: {str(e)}"
        finally:
            # The temporary FASTA is created by a helper, so remove it
            # explicitly here.
            if temp_fasta and os.path.exists(temp_fasta):
                os.remove(temp_fasta)
422
+
423
def create_demo():
    """
    Assemble the Gradio Blocks UI for the protein analysis demo.

    Returns:
        The ``gr.Blocks`` application, not yet launched.
    """
    backend = ProteinAnalysisDemo()

    # Example rows shown under the form: [sequence, question].
    # NOTE(review): the second sequence ends in "J" and the third is written
    # in nucleotide letters - confirm both pass validate_protein_sequence
    # and are intended.
    example_rows = [
        ["MKALIVLGLVLLSVTVQGKVFERCELARTLKRLGMDGYRGISLANWMCLAKWESGYNTRATNYNAGDRSTDYGIFQINSRYWCNDGKTPGAVNACHLSCSALLQDNIADAVACAKRVVRDPQGIRAWVAWRNRCQNRDVRQYVQGCGV", "这个蛋白质的主要功能是什么?"],
        ["MGSSHHHHHHSSGLVPRGSHMRGPNPTAASLEASAGPFTVRSFTVSRPSGYGAGTVYYPTNAGGTVGAIAIVPGYTARQSSIKWWGPRLASHGFVVITIDTNSTLDQPSSRSSQQMAALRQVASLNGTSSSPIYGKVDTARMGVMGWSMGGGGSLISAANNPSLKAAAPQAPWDSSTNFSSVTVPTLIFACENDSIAPVNSSALPIYDSMSRNAKQFLEINGGSHSCANSGNSNQALIGKKGVAWMKRFPTSREJ", "这个蛋白质可能参与哪些生物学过程?"],
        ["ATGAGTGAACGTCTGAAATCTATCATCACCGTCGACGACGAGAACGTCAAGCTGATCGACAAGATCCTGGCCTCCATCAAGGACCTGAACGAGCTGGTGGACATGATCGACGAGATCAAGAACGTCGACGACGAGCTGATCGACAAGATCCTGGCC", "这个序列编码的蛋白质具有什么结构特征?"],
    ]

    with gr.Blocks(title="蛋白质功能分析演示") as demo:
        gr.Markdown("# 🧬 蛋白质功能分析演示")
        gr.Markdown("输入蛋白质序列和问题,AI将基于BLAST、InterProScan和GO信息为您提供专业分析")

        with gr.Row():
            # Left column: inputs
            with gr.Column(scale=1):
                gr.Markdown("### 📝 序列输入")
                sequence_input = gr.Textbox(
                    label="蛋白质序列",
                    placeholder="请输入蛋白质序列(单字母氨基酸代码)...",
                    lines=5,
                    max_lines=10,
                )

                gr.Markdown("**或者**")

                fasta_file = gr.File(
                    label="上传FASTA文件",
                    file_types=[".fasta", ".fa", ".fas"],
                    file_count="single",
                )

                gr.Markdown("### ❓ 您的问题")
                question_input = gr.Textbox(
                    label="问题",
                    placeholder="请输入关于该蛋白质的问题,例如:这个蛋白质的主要功能是什么?",
                    lines=3,
                )

                analyze_btn = gr.Button("🔍 开始分析", variant="primary", size="lg")

            # Right column: result
            with gr.Column(scale=2):
                gr.Markdown("### 📊 分析结果")
                output = gr.Textbox(
                    label="分析结果",
                    lines=20,
                    max_lines=30,
                    show_copy_button=True,
                )

        # Clickable examples fill the sequence and question boxes.
        gr.Markdown("### 💡 示例")
        gr.Examples(
            examples=example_rows,
            inputs=[sequence_input, question_input],
        )

        # Wire the button to the analysis entry point.
        analyze_btn.click(
            fn=backend.analyze_protein,
            inputs=[sequence_input, fasta_file, question_input],
            outputs=[output],
        )

    return demo
487
+
488
if __name__ == "__main__":
    # Build the UI and serve it on all interfaces, port 30002, with a
    # public Gradio share link and debug mode off.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 30002,
        "share": True,
        "debug": False,
    }
    demo = create_demo()
    demo.launch(**launch_options)
demo.sh ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Launch the protein_rag Gradio demo inside the rag_llm conda environment.
# Usage: bash demo.sh [proxy=<host>]

# Stop early if the project directory or the conda environment is missing,
# so demo.py is never started from the wrong place or interpreter.
cd /zhangjiawei/protein_rag || exit 1
source /root/miniconda3/etc/profile.d/conda.sh || exit 1
conda activate rag_llm || exit 1

# Parse key=value arguments; only "proxy=<host>" is recognised.
for arg in "$@"
do
    case $arg in
        proxy=*) proxy="${arg#*=}" ;;
        *) ;;
    esac
done

# Proxy export is currently disabled; the parsed value is unused until the
# lines below are re-enabled.
# export http_proxy=http://${proxy}:4780 && export https_proxy=http://${proxy}:4780
# echo http_proxy=http://${proxy}:4780
# echo https_proxy=http://${proxy}:4780


# http_proxy=http://=10.16.12.236:4780
python demo.py
example/difference_20241122_ec_dict_list.fasta ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ >A8CF74
2
+ MTAAYLKTAFGDRLSITVVESSRIGTIGVGEATFSDIQHFFQFLNLREQDWMPACNATYKLGIRFENWRHVGHHFYQPFEQIRPVYGFPLTDWWLHDAPTDRFDTDCFVMPNLCEAGRSPRHLDGTLADEDFVEEGDELANRTMSEHQGKSQFPYAYHFEAALLAKFLTGYAVDRGVEHVVDDVLDVRLDQRGWIEHVVTAEHGEIHGDLFVDCTGFRGLLLNKALGVPFVSYQDTLPNDSAVALQVPLDMQRRGIVPNTTATAREAGWIWTIPLFGRVGTGYVYAKDYLSPEEAERTLREFVGPAAADVEANHIRMRIGRSQESWRNNCVAIGLSSGFVEPLESTGIFFIHHAIEQLVKHFPAADWNPKSRDMYNSAVAHVMDGIREFLVIHYRGAARADNQYWRDTKTRPLPDGLAERIECWQTQLPDTETIYPYYHGLPPYSYMCILMGGGAIRTPASAALALTDQGAAQKEFAAVRDRAAQLRDTLPSHYEYLARMRGLDV
3
+ >Q5B027
4
+ MKAFFAISASTLLATVHGHGYLTVPASRTRLGFEAGIDTCPECSILEPVDAWPNVTEAQVGRSGPCGYNARVSVDYNQPGDNWGNEPVVTYKAGDIVEVQWCVDNNGDHGGMFTYGICQDQELVDKFLDPDYLPTEEEKQAAEDCFLQGELKCGDVDGQECEYSPDCGEGEACYRNDWFTCNAFEADSDRGCQGVDGAELNSCKTTIAGGYTVTKKIKIPDYTSEHTLLRFRWNSFQTPQIYLHCADPTIEGGMEVRMRMIVMHGSFGVDTQHSFGHSFGFQGEGVYRAYRYIRGVAIIQMNLNINASLLPQPTLPIRGWSTRNIQHT
5
+ >A0A7H0XJI9
6
+ MRFLKAKAGLVASGAFLLASVPVVAADCALPSTYSWTSTGPLANPKSGWTAIKDFSNVVFNNNHIVYASTTDANGNYGSMNFGTFSDWSGMASASQNKMSFSAVAPTLFYFQPKNIWVLAYQWGSSTFTYRTSNDPTNANGWSSEQALFSGQITGSSTGAIDQTLIGDSTHMYLFFAGDNGKIYRSSMPINNFPGNFGTSSEVVLSDSQNNLFEAVQVYTVKGQNKYLMIVEAIGSQGRYFRSFTATSLGGSWTPQATSESQPFAGKANSGATWTNDISHGDLVRTNPDQTMTIDPCNLQFLYQGKNPSAGGNYNTLPWRPGVLTLKN
7
+ >D9X0I3
8
+ MSANSFDARSTLQVGDESYEIFRLDKVEGSARLPYSLKVLLENLLRTEDGANITADHIRALGGWDSQAQPSQEIQFTPARVIMQDFTGVPCVVDLATMREAVKELGGDPAKINPLAPAELVIDHSVIADKFGTNDAFKQNVELEYGRNKERYQFLRWGQTAFDEFKVVPPGTGIVHQVNIEHLARTVMVRGGQAYPDTLVGTDSHTTMVNGLGVLGWGVGGIEAEAAMLGQPVSMLIPRVVGFKLTGELKPGTTATDLVLTITEMLRGHGVVGKFVEFYGEGVAATSLANRATIGNMSPEFGSTAAIFPIDDETLNYLRLTGRSEQQVALVESYAKEQGLWLDPAAEPDFSEKLELDLSTVVPSIAGPKRPQDRIVLAEAAQQFAKDVLNYVEAPAAQPAASASPVDEASAESFPASDAPAYGSQENGAGAPQHADGTGAAVPSNPVTVTAPDGTSYEIDHGAVTVAAITSCTNTSNPYVMVAAALVAKKAVEKGLTRKPWVKTTLAPGSKVVTDYFEKSGLTPYLDKVGFNLVGYGCTTCIGNSGPLPEEVSKAVNDHDLAVTSVLSGNRNFEGRINPDVKMNYLASPPLVVAYALAGSMKVDITKDALGTDQDGNPVYLKDIWPSEAEVNDVVANAIGEDMFSKSYSDVFAGDAQWQALPIPTGNTFEWDPESTYVRKPPYFEGMEMEPAPVEDIAGARVLAKLGDSVTTDHISPAGAIKADTPAGKYLTEHGVERRDFNSYGSRRGNHEVMIRGTFANIRLRNQIAPGTEGGYTRDFTKDDAPVSFIYDASRNYIEQGIPLVVLAGKEYGSGSSRDWAAKGTALLGVKAVIAESYERIHRSNLIGMGVLPLQFPEGQSAATLGLTGEETFSFSGVTELNNGTTPRTVKVTTDTGVEFDAVVRIDTPGEADYYRNGGIMQYVLRSLIRK
9
+ >Q5MIX2
10
+ MKILLAVVFVLNLTNLAVPQHLITSSPSLPESKPVGRRPTYEEYKQQRESFLQTEDHHLLGANVTLTENEQLVNKFIMQMKLDEMEKGFNDSYNFIPARHIFEVLDRFGQSKVFNVIRRLPKGGVLHAHDMALGSTDLIVNATYLENLWQKGNFGLNHGPEFKFSRERPGKEWSLVSEIRQWMTNEVYDAKVAEVFSLYNADPLNAYKSLDNVWSKFQNLFACLAPLITFAPVWRQYYHDSLKQFYDDHVQYLEFRGVLPEVYDLDGKVYSAEEIVQLYYEETEQFKAKYPDFIGVKFIYAPGRYASDEEFQKLLDTTNRLHKKFPNFLAGFDLVGQEDPGRSLFEFAPALLKLPASINFFFHAGETNWYGMKTDQNLVDAVLLGTKRIGHGFAVLKHPKVLKEIKRRQICIEINPISNQVLKLVQDQRNHPAALLFSDNYPVVVSSDDPSFGRSTPLSHDFYVAFTGIASAKQDWRWLKQLALNSIEYSAMNSEEKTVAKEKWNQAWDHQFSRLAVDFVAGKILENWIMKIV
11
+ >Q9NC65
12
+ MFSQLVVWLLATSTVCLAWDNSWIMDMKYERYSQRRSYYLAEEEDRSVGSDIELTAKEQVVNERLMELKMTELKNGLQDPAGFIPWNHIFDVLYRINSSELFHIIQKMPKGGILHAHDTALCSTDYVISLTYEPNLWQCADPTTGAFQFLFSREAPTNTDTCTWTLVADERAKQGEENYNSALRSQLSMYNTNPIMHNRDVDSIWRQFMGIFGVNGGLLTYAPVWKAYYLQFLKEMFADGVQYLELRTTLPPLYDLDGKTYNEVEIMQIYYDATKEFKKQNPTFIGAKIIYAPVRVVDDAGIPALMAKVRELHEKFPDFMAGFDLVGQEDKGRPLIAFSREILKLPNSIDFYFHAGETNWDGMTDDNLIDAVLLGTKRIGHGYAVLKHPRVLKEVKRNKIAIEVCPASNQVLRLVADYRNHPGSVLLANKEYPVVISSDDPSFWEAKPLSHDFYMAFLGLASSRQDLRLLKQLAINSIKYSAMSPREKLQAMQMWEAEWKKFIDGFNA
13
+ >Q06K61
14
+ MFPRLIVWLLAASAVHAVLDISNIKPKRDYENFLQKYAEYADDEVDRSVGSDITLSLKEKFVNQYLMDLKTEELKAGLKNPSQFIPSNHFFSVLDRINSSEIFKIIRRMPKGAILHAHDTALCSTDYVVSITYRDHLWQCADPKTGALQFRFSKESPKNTDTCQWTPVSEERKNQGEEQYNSKLRSQLSLYNTDPINRSRDVDSIWNDFMGLFGVNFGLLTYAPVWKDYYKQFLKEMMEDGVQYLELRGTLPPLYDLDGKIYNEEQVVEIYYNVTEEFKKENSTFIGAKFIYAPVRFVNATGIKTLTTTVKQLHERFPDFLAGFDLVGQEDKGGPLIGFSRELLELPESINFFFHSGETNWNGMTDDNLIAAVTLGTKRIGHGYALFKHPRVLKQVKKDKIAIEVCPISNQVLRLVADMRNHPGSILLANKKYPMVISSDDPSFWEATPLSHDFYMAFMGLASYHQDLRMLKQLAINSLEYSSMTLEEKTNAMKLWEAEWEKFIKELETEVFSLLE
15
+ >A0A0H2ZM56
16
+ MADKKTVTPEEKKLVAEKHVDELVQKALVALEEMRKLNQEQVDYIVAKASVAALDAHGELALHAFEETGRGVFEDKATKNLFACEHVVNNMRHTKTVGVIEEDDVTGLTLIAEPVGVVCGITPTTNPTSTAIFKSLISLKTRNPIVFAFHPSAQESSAHAARIVRDAAIAAGAPENCVQWITQPSMEATSALMNHEGVATILATGGNAMVKAAYSCGKPALGVGAGNVPAYVEKSANIRQAAHDIVMSKSFDNGMVCASEQAVIIDKEIYDEFVAEFKSYHTYFVNKKEKALLEEFCFGVKANSKNCAGAKLNADIVGKPATWIAEQAGFTVPEGTNILAAECKEVGENEPLTREKLSPVIAVLKSESREDGITKARQMVEFNGLGHSAAIHTADEELTKEFGKAVKAIRVICNSPSTFGGIGDVYNAFLPSLTLGCGSYGRNSVGDNVSAINLLNIKKVGRRRNNMQWMKLPSKTYFERDSIQYLQKCRDVERVMIVTDHAMVELGFLDRIIEQLDLRRNKVVYQIFADVEPDPDITTVNRGTEIMRAFKPDTIIALGGGSPMDAAKVMWLFYEQPEVDFRDLVQKFMDIRKRAFKFPLLGKKTKFIAIPTTSGTGSEVTPFAVISDKANNRKYPIADYSLTPTVAIVDPALVLTVPGFVAADTGMDVLTHATEAYVSQMASDYTDGLALQAIKLVFENLESSVKNADFHSREKMHNASTIAGMAFANAFLGISHSMAHKIGAQFHTIHGRTNAILLPYVIRYNGTRPAKTATWPKYNYYRADEKYQDIARMLGLPASTPEEGVESYAKAVYELGERIGIQMNFRDQGIDEKEWKEHSRELAFLAYEDQCSPANPRLPMVDHMQEIIEDAYYGYKERPGRRK
17
+ >A0A0H2URT2
18
+ MADKKTVTPEEKKLVAEKHVDELVQKALVALEEMRKLDQEQVDYIVAKASVAALDAHGELALHAFEETGRGVFEDKATKNLFACEHVVNNMRHTKTVGVIEEDDVTGLTLIAEPVGVVCGITPTTNPTSTAIFKSLISLKTRNPIVFAFHPSAQESSAHAARIVRDAAIAAGAPENCVQWITQPSMEATSALMNHEGVATILATGGNAMVKAAYSCGKPALGVGAGNVPAYVEKSANIRQAAHDIVMSKSFDNGMVCASEQAVIIDKEIYDEFVAEFKSYHTYFVNKKEKALLEEFCFGVKANSKNCAGAKLNADIVGKPATWIAEQAGFTVPEGTNILAAECKEVGENEPLTREKLSPVIAVLKSESREDGITKARQMVEFNGLGHSAAIHTADEELTKEFGKAVKAIRVICNSPSTFGGIGDVYNAFLPSLTLGCGSYGRNSVGDNVSAINLLNIKKVGRRRNNMQWMKLPSKTYFERDSIQYLQKCRDVERVMIVTDHAMVELGFLDRIIEQLDLRRNKVVYQIFADVEPDPDITTVNRGTEIMRAFKPDTIIALGGGSPMDAAKVMWLFYEQPEVDFRDLVQKFMDIRKRAFKFPLLGKKTKFIAIPTTSGTGSEVTPFAVISDKANNRKYPIADYSLTPTVAIVDPALVLTVPGFVAADTGMDVLTHATEAYVSQMASDYTDGLALQAIKLVFENLESSVKNADFHSREKMHNASTIAGMAFANAFLGISHSMAHKIGAQFHTIHGRTNAILLPYVIRYNGTRPAKTATWPKYNYYRADEKYQDIARMLGLPASTPEEGVESYAKAVYELGERIGIQMNFRDQGIDEKEWKEHSRKLAFLAYEDQCSPANPRLPMVDHMQEIIEDAYYGYKERPGRRK
19
+ >Q8YMD9
20
+ MTSRIRFLMCPPDHYDVDYVINPWMEGNIHKSSRDRAVEQWQGLYQILKEHAIVDLVTPQKGWPDLVFTANAGLVLGDNVVLSRFLHKERQGEEPYFKEWFEGNGYTVYELPKDLPFEGAGDALLDREGRWLWAGYGFRSELDSHPYLAKWLDIEVLSLRLIDERFYHLDTCFCPLANGYLLYYPGAFDSYSNRLIEMRVAPEKRIAIAEADAVNFACNTVNVESIVIMNKASDALKQSLTGVGFQVLETPLTEFLKAGGAAKCLTLRVTEPVRDEVHANVYVESRIIRIEGHLLDSGLINRALDMIVDTGGSFQVLNFNLGEQRQSTSAAEVKVSAPSHEVMEEIISLLIDLGAVDLPQDERDAKLEPVIQDGVAPDDFYVSTIYPTEVRINGQWIKVENQRMDGAIAITQTPNGLLAQCKILRDLKAGEQVIVDVLGIRTIRKTESREQRNTQEFSFMSGGVSSERRVELVVEQVAWELRKIRDAGGKVVVTAGPVVIHTGGGEHLSRLIREGYVQALLGGNAIAVHDIEQNMMGTSLGVDMKRGVAVRGGHRHHLKVINTIRRHGSIAKGVESGIIRSGVMYECVRNQIPFVLAGSIRDDGPLPDTQMDLIKAQEEYAKHLEGAEMILMLSSMLHSIGVGNMTPAGVKMVCVDINPAVVTKLSDRGSIESVGVVTDVGLFLSLLTQQLDKLTSPYVSKVG
example/difference_20241122_ec_dict_list.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e8a013ba26e7f3482ddc072d390d529c8cb5f3c40e8e0f33c2189b7c3d6200e
3
+ size 1056366
example/protein_go_clean.fasta ADDED
The diff for this file is too large to render. See raw diff
 
example/protein_go_clean.json ADDED
@@ -0,0 +1,606 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"protein_id": "A8CF74", "GO_id": ["0004497", "0000166"]}
2
+ {"protein_id": "A0A0L8M630", "GO_id": ["0008839", "0009089"]}
3
+ {"protein_id": "Q8XP19", "GO_id": ["0004565", "0004566", "0030246", "0005975", "0019391"]}
4
+ {"protein_id": "B5LAT8", "GO_id": ["0005737", "0045551", "0008270", "0009820", "0009809"]}
5
+ {"protein_id": "A0A805Z5R7", "GO_id": ["0016787", "0006629"]}
6
+ {"protein_id": "E9EE69", "GO_id": ["0030428", "0005935", "0005886", "0004100", "0006031", "1902404"]}
7
+ {"protein_id": "A2QU15", "GO_id": ["0030428", "0005886", "0004100", "0071555", "0006031"]}
8
+ {"protein_id": "Q704E9", "GO_id": ["0016020", "0016717", "0006633"]}
9
+ {"protein_id": "A0A8A1G1R1", "GO_id": ["0005576", "0030246", "0004553", "0000272"]}
10
+ {"protein_id": "G3UYQ4", "GO_id": ["0005829", "0015630", "0031514", "0031965", "0005654", "0005524", "0016887", "0004550", "0050145"]}
11
+ {"protein_id": "E1CJK0", "GO_id": ["0005576", "0004806", "0016042"]}
12
+ {"protein_id": "Q9P451", "GO_id": ["0005576", "0004806", "0017000", "0016042", "0072330"]}
13
+ {"protein_id": "A0A384JDH0", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
14
+ {"protein_id": "A0A348HAY2", "GO_id": ["0005576", "0052689", "0016042"]}
15
+ {"protein_id": "A0A1G9VRW7", "GO_id": ["0046872", "0010333"]}
16
+ {"protein_id": "Q5B027", "GO_id": ["0005576", "0046872", "0016491", "0000272"]}
17
+ {"protein_id": "Q8KSA6", "GO_id": ["0005576", "0008800", "0030655", "0046677"]}
18
+ {"protein_id": "I1RE72", "GO_id": ["0030428", "0005886", "0004100", "0006031"]}
19
+ {"protein_id": "E1CJK2", "GO_id": ["0005576", "0004806", "0016042"]}
20
+ {"protein_id": "B2ATL7", "GO_id": ["0005576", "0030248", "0046872", "0004497", "0030245"]}
21
+ {"protein_id": "Q9PTF3", "GO_id": ["0036064", "0005737", "0005741", "0005524", "0004550", "0006241", "0007368", "0006281", "0006183", "0001822", "0008053", "0009142", "0106071", "0006228"]}
22
+ {"protein_id": "Q75WF1", "GO_id": ["0005782", "0005777", "0008445", "0047821", "0071949", "1990748", "0019478", "0019740"]}
23
+ {"protein_id": "A0A0E3USC3", "GO_id": ["0051213", "0046872", "0017000"]}
24
+ {"protein_id": "A0A7H0XJI9", "GO_id": ["0005576", "0046556", "0046373", "0045493"]}
25
+ {"protein_id": "H0QPM2", "GO_id": ["0050660", "0016614"]}
26
+ {"protein_id": "G2X5A0", "GO_id": ["0030428", "0005886", "0004100", "0006031", "0031505"]}
27
+ {"protein_id": "Q5K9E2", "GO_id": ["0005730", "0031499", "0005524", "0046872", "1990817", "0043634", "0031123"]}
28
+ {"protein_id": "A0A4S8L6U5", "GO_id": ["0005829", "0070012", "0004252", "0006508"]}
29
+ {"protein_id": "Q9HYK7", "GO_id": ["0004324", "0000166", "0034599", "0042167"]}
30
+ {"protein_id": "A0A7E5WTY7", "GO_id": ["0005576", "0005794", "0008199", "0008198", "0004322", "0006879", "0006826"]}
31
+ {"protein_id": "A8QDF0", "GO_id": ["0005576", "0004806", "0016042"]}
32
+ {"protein_id": "G2R6N0", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
33
+ {"protein_id": "A0A5N6UUF2", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
34
+ {"protein_id": "A0A5N6UV50", "GO_id": ["0005576", "0008810", "0030248", "0046872", "0004497", "0030245"]}
35
+ {"protein_id": "B5I920", "GO_id": ["0005524", "0016301", "0046677"]}
36
+ {"protein_id": "J0MXK0", "GO_id": ["0016740"]}
37
+ {"protein_id": "O87333", "GO_id": ["0005829", "0005886", "0008713", "0009244"]}
38
+ {"protein_id": "A0A3F2YLY8", "GO_id": ["0016020", "0030288", "0005509", "0016614", "0015945"]}
39
+ {"protein_id": "Q5MIX2", "GO_id": ["0005615", "0004000", "0046872", "0006154", "0046103", "0009117"]}
40
+ {"protein_id": "Q06K77", "GO_id": ["0005576", "0005524", "0005509", "0004382", "0090729", "0045134", "0030166"]}
41
+ {"protein_id": "B6EP94", "GO_id": ["0016881", "0016746", "0005524", "0019290"]}
42
+ {"protein_id": "A0A2I2F2I5", "GO_id": ["0005948", "0005739", "0003984", "0050660", "0000287", "0004737", "0030976", "0009097", "0009099"]}
43
+ {"protein_id": "A0A384JZ02", "GO_id": ["0005576", "0008061", "0046872", "0004497", "0030245"]}
44
+ {"protein_id": "A0A2R6W0K6", "GO_id": ["0009570", "0005829", "0010319", "0000287", "0004614", "0005975", "0009590", "0006006", "0009409", "0019252", "0005986"]}
45
+ {"protein_id": "A9CEY7", "GO_id": ["0008726", "0046306"]}
46
+ {"protein_id": "A0A4Y5QZ62", "GO_id": ["0000287", "0010333", "0016102"]}
47
+ {"protein_id": "D9X0I3", "GO_id": ["0047456", "0051539", "0003994", "0046872", "0003723", "0006099"]}
48
+ {"protein_id": "Q9GTP7", "GO_id": ["0005576", "0004050", "0005524", "0005509", "0030899", "0004382", "0017110", "0090729", "0045134", "0002376", "0030166"]}
49
+ {"protein_id": "Q9HY79", "GO_id": ["0005829", "0070288", "0008199", "0004322", "0020037", "0005506", "0140315", "0006879", "0006826"]}
50
+ {"protein_id": "Q9HTH9", "GO_id": ["0047617", "0042413"]}
51
+ {"protein_id": "A0A2R8QP51", "GO_id": ["0005634", "0004177", "0008239", "0008236", "0006508", "0050727"]}
52
+ {"protein_id": "B7JA35", "GO_id": ["0016020", "0030170", "0008483", "0009245", "0000271"]}
53
+ {"protein_id": "G0SGC7", "GO_id": ["0005874", "0005743", "0005758", "0005886", "0005525", "0003924", "0140523", "0008289", "0180020", "0046872", "0008017", "0031623"]}
54
+ {"protein_id": "Q9NC65", "GO_id": ["0005615", "0004000", "0046872", "0006154", "0046103"]}
55
+ {"protein_id": "P73562", "GO_id": ["0008836", "0009089", "0045312", "0008295"]}
56
+ {"protein_id": "A0A2B4RNI3", "GO_id": ["0061501", "0005524", "0005525", "0046872", "0045087"]}
57
+ {"protein_id": "G2XJ47", "GO_id": ["0030428", "0005935", "0005886", "0004100", "0006031", "1902404"]}
58
+ {"protein_id": "I1RHF8", "GO_id": ["0005576", "0016787", "0016042"]}
59
+ {"protein_id": "S7Q0E7", "GO_id": ["0005886", "0098552", "0046872", "0004497", "0030245"]}
60
+ {"protein_id": "B2B403", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
61
+ {"protein_id": "Q8LPU1", "GO_id": ["0048046", "0005773", "0052736", "0006952", "0009620"]}
62
+ {"protein_id": "A0A1L8GLK3", "GO_id": ["0005737", "0016605", "0061630", "0008270", "0036297", "0016567"]}
63
+ {"protein_id": "Q19VG9", "GO_id": ["0005737", "0005507", "0004784"]}
64
+ {"protein_id": "B2RVI8", "GO_id": ["0005829", "0008146", "0008202"]}
65
+ {"protein_id": "Q06K61", "GO_id": ["0005615", "0004000", "0046872", "0006154", "0046103"]}
66
+ {"protein_id": "A0A649V088", "GO_id": ["0005576", "0008800", "0030655", "0046677"]}
67
+ {"protein_id": "Q8I4R4", "GO_id": ["0031410", "0005576", "0020009", "0008061", "0004568", "0008843", "0006032", "0000272"]}
68
+ {"protein_id": "A0A100IM63", "GO_id": ["0005829", "0070330", "0050660", "0010181", "0020037", "0005506", "0003958"]}
69
+ {"protein_id": "A0A0F5HSE8", "GO_id": ["0000166", "0016639", "0006520"]}
70
+ {"protein_id": "W8JNL4", "GO_id": ["0033846"]}
71
+ {"protein_id": "A0A0F6AK91", "GO_id": ["0005829", "0047952", "0051287", "0005975", "0046167", "0046168", "0006650", "0008654"]}
72
+ {"protein_id": "B2AUV0", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
73
+ {"protein_id": "A0A5N6UWA3", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
74
+ {"protein_id": "Q93227", "GO_id": ["0010008", "0005886", "0004842", "0008270", "0000209", "0016567", "0007338"]}
75
+ {"protein_id": "Q0ZQ45", "GO_id": ["0016853", "0017000"]}
76
+ {"protein_id": "Q8PWQ1", "GO_id": ["0051539", "0016829", "0046872", "0008299"]}
77
+ {"protein_id": "O76742", "GO_id": ["0120281", "0000421", "0005938", "0042995", "0031901", "0005770", "0031902", "0005764", "0043025", "0043204", "0045335", "0005886", "0098793", "0045202", "0031982", "0005525", "0003924", "0044877", "0061909", "0007298", "0061883", "0032456", "0006897", "0016197", "0034058", "0008333", "0032510", "1990182", "0006909", "0090385", "0015031", "0032482", "0160156", "0046718", "0033292", "0016192", "0048190"]}
78
+ {"protein_id": "A8CF75", "GO_id": ["0004497", "0000166"]}
79
+ {"protein_id": "Q9W4C3", "GO_id": ["0005829", "0005634", "0004843", "0008270", "0016579", "0006508", "0031647"]}
80
+ {"protein_id": "Q47595", "GO_id": ["0016757", "0009243"]}
81
+ {"protein_id": "Q10L01", "GO_id": ["0012505", "0005783", "0005794", "0005886", "0019706", "1902884", "1901002", "0006612"]}
82
+ {"protein_id": "A0A0H2ZM56", "GO_id": ["0008774", "0004029", "0120542", "0046872", "0006066", "0015976"]}
83
+ {"protein_id": "Q99QC1", "GO_id": ["0030288", "0008800", "0046872", "0000166", "0017001", "0046677"]}
84
+ {"protein_id": "A0A3M6TIF0", "GO_id": ["0061501", "0005524", "0005525", "0046872", "0045087"]}
85
+ {"protein_id": "I7ZK32", "GO_id": ["0005829", "0070330", "0050660", "0010181", "0020037", "0005506", "0003958"]}
86
+ {"protein_id": "O53611", "GO_id": ["0005829", "0005576", "0009274", "0005886", "0004450", "0000287", "0042803", "0006097", "0006102", "0006099"]}
87
+ {"protein_id": "Q5XVM9", "GO_id": ["0005886", "0016787"]}
88
+ {"protein_id": "A7B555", "GO_id": ["0005829", "0008747", "0019262"]}
89
+ {"protein_id": "A0A4Y5QWA6", "GO_id": ["0000287", "0010333", "0016102"]}
90
+ {"protein_id": "A0A1V0QSH2", "GO_id": ["0000287", "0010333", "0016102"]}
91
+ {"protein_id": "A0A0H2URT2", "GO_id": ["0008774", "0004029", "0120542", "0046872", "0006066", "0015976"]}
92
+ {"protein_id": "A3MUS8", "GO_id": ["0005737", "0004067", "0008233", "0006508"]}
93
+ {"protein_id": "Q2U834", "GO_id": ["0030428", "0005886", "0004100", "0071555", "0006031", "0030448"]}
94
+ {"protein_id": "A0A0A2J1Z6", "GO_id": ["0005829", "0070330", "0050660", "0010181", "0020037", "0005506", "0003958", "0043386"]}
95
+ {"protein_id": "F1N206", "GO_id": ["1902493", "0043159", "0160157", "0005759", "0005739", "0031514", "0005634", "0160167", "0045252", "0045254", "0047101", "0004148", "0050660", "0034604", "0006103", "0009083", "0007369", "0006120", "0006508", "0006090", "0042391", "0048240"]}
96
+ {"protein_id": "Q1QYU7", "GO_id": ["0051537", "0005506", "0004497"]}
97
+ {"protein_id": "D4GSD6", "GO_id": ["0042802", "0004454", "0042803", "0006000"]}
98
+ {"protein_id": "A0A5J6BJT1", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
99
+ {"protein_id": "A0A1V0QSG1", "GO_id": ["0009507", "0000287", "0010333", "0016102"]}
100
+ {"protein_id": "C5BK10", "GO_id": ["0005886", "0070566", "0005524", "0016874", "0071766", "0006633"]}
101
+ {"protein_id": "Q8YMD9", "GO_id": ["0016787", "0016829", "0000166"]}
102
+ {"protein_id": "Q9LCC6", "GO_id": ["0005829", "0008797", "0006531", "0006099"]}
103
+ {"protein_id": "Q93F76", "GO_id": ["0008800", "0030655", "0046677"]}
104
+ {"protein_id": "A0A6J1SUS3", "GO_id": ["0061501", "0005524", "0003690", "0005525", "0046872", "0003723", "0051607", "0045087"]}
105
+ {"protein_id": "Q89F89", "GO_id": ["0005737", "0005576", "0009274", "0003884", "0071949", "0019478"]}
106
+ {"protein_id": "Q8DPI6", "GO_id": ["0016779", "0071555", "0070395"]}
107
+ {"protein_id": "Q9AI62", "GO_id": ["0004556", "0009313"]}
108
+ {"protein_id": "A0A1D8PT02", "GO_id": ["0062040", "0016020", "0005886", "0010181", "0003955", "0034599"]}
109
+ {"protein_id": "Q27493", "GO_id": ["0005739", "0005730", "0005736", "0003677", "0003899", "0032549", "0006351"]}
110
+ {"protein_id": "Q47594", "GO_id": ["0016757", "0009243"]}
111
+ {"protein_id": "F4JFR7", "GO_id": ["0005737", "0005634", "0141131", "0046872", "0006281"]}
112
+ {"protein_id": "A0A2I2F2J6", "GO_id": ["0016853", "0009813"]}
113
+ {"protein_id": "A0A0J9VPT0", "GO_id": ["0030428", "0005886", "0004100", "0071555", "0006031"]}
114
+ {"protein_id": "E9EFH8", "GO_id": ["0030428", "0005886", "0004100", "0006031"]}
115
+ {"protein_id": "B0Y6Z7", "GO_id": ["0005789", "0004252", "0016740"]}
116
+ {"protein_id": "A0A0S2GKZ1", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
117
+ {"protein_id": "A0A384JTK5", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
118
+ {"protein_id": "A0A2V0R8Q9", "GO_id": ["0050053", "0009758"]}
119
+ {"protein_id": "O84925", "GO_id": ["0005576", "0016491"]}
120
+ {"protein_id": "Q8ECU3", "GO_id": ["0004497", "0009058"]}
121
+ {"protein_id": "Q5SLF5", "GO_id": ["0016787"]}
122
+ {"protein_id": "Q2G5J4", "GO_id": ["0008726", "0046306"]}
123
+ {"protein_id": "A0A4Y5QVX6", "GO_id": ["0000287", "0010333", "0016102"]}
124
+ {"protein_id": "A0A1V0QSG6", "GO_id": ["0000287", "0010333", "0016102"]}
125
+ {"protein_id": "A0A2V5GUR2", "GO_id": ["0003962", "0016853", "0030170", "0019346"]}
126
+ {"protein_id": "F4KAV2", "GO_id": ["0005737", "0005634", "0141131", "0046872", "0006281", "0010029"]}
127
+ {"protein_id": "P74535", "GO_id": ["0016787", "0016829", "0000166"]}
128
+ {"protein_id": "G2XEK6", "GO_id": ["0030428", "0005886", "0004100", "0071555", "0006031"]}
129
+ {"protein_id": "Q8RQV0", "GO_id": ["0005576", "0030246", "0016757", "0004553", "0005975"]}
130
+ {"protein_id": "A0A101MN42", "GO_id": ["0005829", "0070330", "0050660", "0010181", "0020037", "0005506", "0003958", "0043386"]}
131
+ {"protein_id": "R4LHX8", "GO_id": ["0033846"]}
132
+ {"protein_id": "Q1K7A4", "GO_id": ["0005739", "0003860", "0006574"]}
133
+ {"protein_id": "I3SL57", "GO_id": ["0009507", "0009536", "0004392", "0046872", "0051702", "0042167", "0006788", "0009877", "0015979", "0010024", "0009646", "0010167", "0009609"]}
134
+ {"protein_id": "A8QCV4", "GO_id": ["0005576", "0004806", "0016042"]}
135
+ {"protein_id": "H1AE14", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
136
+ {"protein_id": "G2RGE6", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
137
+ {"protein_id": "Q6YNE9", "GO_id": ["0005576", "0016158", "0003993"]}
138
+ {"protein_id": "A0A1D8PHR5", "GO_id": ["0016020", "0005886", "0010181", "0003955", "0016655", "0034599"]}
139
+ {"protein_id": "Q9HUF5", "GO_id": ["0005829", "0005886", "0008713", "0071968", "0009244"]}
140
+ {"protein_id": "Q9LJH2", "GO_id": ["0005737", "0005634", "0141131", "0046872", "0006281"]}
141
+ {"protein_id": "Q9XDP2", "GO_id": ["0042597", "0004062", "0047686"]}
142
+ {"protein_id": "Q8E0N2", "GO_id": ["0004565", "0004566", "0030246", "0005975", "0019391"]}
143
+ {"protein_id": "Q9RMT4", "GO_id": ["0005576", "0008800", "0030655", "0046677"]}
144
+ {"protein_id": "Q5SKU3", "GO_id": ["0016829", "0006631"]}
145
+ {"protein_id": "A0A0H2ZIF3", "GO_id": ["0051537", "0005506", "0004497"]}
146
+ {"protein_id": "B9SIM3", "GO_id": ["0009507", "0009899", "0000287", "0010333", "0009686"]}
147
+ {"protein_id": "A0A384J8V9", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
148
+ {"protein_id": "P71905", "GO_id": ["0005737", "0005829", "0009274", "0005886", "0046872", "0004540", "0019843", "0006397", "0006364", "0008033"]}
149
+ {"protein_id": "J0MXJ0", "GO_id": ["0004751", "0019316", "0009052"]}
150
+ {"protein_id": "V5TF61", "GO_id": ["0005737", "0046872", "0004659", "0008299"]}
151
+ {"protein_id": "Q7WYA8", "GO_id": ["0042597", "0008800", "0008270", "0017001", "0046677"]}
152
+ {"protein_id": "Q48434", "GO_id": ["0030288", "0008800", "0017001", "0046677"]}
153
+ {"protein_id": "G4NIR3", "GO_id": ["0030428", "0005935", "0005886", "0004100", "0006031", "1902404"]}
154
+ {"protein_id": "Q0SG95", "GO_id": ["0050049", "0000166", "0050175", "0009094"]}
155
+ {"protein_id": "B0Y8Y4", "GO_id": ["0005737", "0005634", "0005524", "0004674", "0007155", "0050684", "0000245"]}
156
+ {"protein_id": "B2B629", "GO_id": ["0005576", "0030248", "0046872", "0004497", "0030245"]}
157
+ {"protein_id": "G2QCJ3", "GO_id": ["0005576", "0030248", "0046872", "0004497", "0030245"]}
158
+ {"protein_id": "Q5AQA6", "GO_id": ["0005576", "0008810", "0030248", "0046872", "0004497", "0030245"]}
159
+ {"protein_id": "Q7SHD9", "GO_id": ["0005576", "0030248", "0046872", "0004497", "0030245"]}
160
+ {"protein_id": "A0A0A2J9B0", "GO_id": ["0008061", "0008843", "0006032", "0000272"]}
161
+ {"protein_id": "G4VJD5", "GO_id": ["0004619", "0061621", "0044542"]}
162
+ {"protein_id": "B0Y8W1", "GO_id": ["0016020", "0004252", "0006508"]}
163
+ {"protein_id": "Q970I2", "GO_id": ["0005524", "0140097", "0003677", "0004386", "0016787", "0006281"]}
164
+ {"protein_id": "Q8I8I2", "GO_id": ["0005737", "0003779", "0004332", "0006096"]}
165
+ {"protein_id": "Q5LIC7", "GO_id": ["0004565", "0005975"]}
166
+ {"protein_id": "P71420", "GO_id": ["0030288", "0008800", "0017001", "0046677"]}
167
+ {"protein_id": "A0A482WD11", "GO_id": ["0061501", "0005524", "0003690", "0005525", "0046872", "0003723", "0051607", "0045087"]}
168
+ {"protein_id": "A2R2G5", "GO_id": ["0030428", "0005935", "0005886", "0004100", "0006031", "1902404"]}
169
+ {"protein_id": "S5S833", "GO_id": ["0005794", "0000139", "0030246", "0030145", "0004653", "0016266", "0018242", "0018243"]}
170
+ {"protein_id": "A0A384JJB6", "GO_id": ["0005576", "0030248", "0046872", "0004497", "0030245"]}
171
+ {"protein_id": "G2RB72", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
172
+ {"protein_id": "Q9LKZ1", "GO_id": ["0005524", "0004707", "0106310", "0009409", "0010225", "0009611"]}
173
+ {"protein_id": "O29889", "GO_id": ["0005524", "0140097", "0003677", "0004386", "0016787", "0006281"]}
174
+ {"protein_id": "V5TDY7", "GO_id": ["0004659"]}
175
+ {"protein_id": "A0A2Z5QL08", "GO_id": ["0042802", "0016210", "0042803", "0009715", "0009813", "0030639"]}
176
+ {"protein_id": "Q54XE5", "GO_id": ["0008721", "0046872", "1990748", "0036088"]}
177
+ {"protein_id": "Q5K9E9", "GO_id": ["0005829", "0005524", "0003921", "0006177"]}
178
+ {"protein_id": "D5SK09", "GO_id": ["0046872", "0010333"]}
179
+ {"protein_id": "A0A1Q3EPF5", "GO_id": ["0005829", "0070012", "0004252", "0006508"]}
180
+ {"protein_id": "A8QCV7", "GO_id": ["0004806", "0016042"]}
181
+ {"protein_id": "G3XAP7", "GO_id": ["0005576", "0008810", "0046872", "0004497", "0030245"]}
182
+ {"protein_id": "J9VRT1", "GO_id": ["0005737", "0016020", "0008445", "0071949", "0019478"]}
183
+ {"protein_id": "A5TY84", "GO_id": ["0005886", "0005524", "0004674", "0080090"]}
184
+ {"protein_id": "A0A7W3N5X5", "GO_id": ["0016832", "0004801", "0005975"]}
185
+ {"protein_id": "Q8ZPP5", "GO_id": ["0005886", "0005524", "0000155"]}
186
+ {"protein_id": "F1QB30", "GO_id": ["0005829", "0005783", "0005789", "0000151", "0043130", "0061630", "0008270", "0030968", "0070936", "0000209", "0034976", "0006511"]}
187
+ {"protein_id": "Q79FW0", "GO_id": ["0005829", "0005886", "0008696", "0030170", "0008483", "0019752", "0046656", "0009252", "0008360", "0046654"]}
188
+ {"protein_id": "A0A0E0RU58", "GO_id": ["0030428", "0005886", "0004100", "0006031"]}
189
+ {"protein_id": "G2WY98", "GO_id": ["0030428", "0045009", "0005886", "0004100", "0030476", "0006031", "0000920"]}
190
+ {"protein_id": "A0A0G4P2K0", "GO_id": ["0005829", "0070330", "0050660", "0010181", "0020037", "0005506", "0003958", "0043386"]}
191
+ {"protein_id": "F1MLX0", "GO_id": ["0005739", "0046872", "0050163", "0006107"]}
192
+ {"protein_id": "Q8RWX4", "GO_id": ["0009507", "0009941", "0009707", "0005886", "0009536", "0010006", "0005524", "0004672", "0004674", "0009704", "1904216", "0045036"]}
193
+ {"protein_id": "D9N4H4", "GO_id": ["0005576", "0004806", "0016042"]}
194
+ {"protein_id": "Q59127", "GO_id": ["0005737", "0071949", "0018535", "0009820", "0019608"]}
195
+ {"protein_id": "Q0HWI9", "GO_id": ["0016881", "0005524", "0019290"]}
196
+ {"protein_id": "Q50LF1", "GO_id": ["0005737", "0046872", "0008115", "0046653"]}
197
+ {"protein_id": "Q47593", "GO_id": ["0005886", "0016757", "0009243"]}
198
+ {"protein_id": "A0A2Z5GDY5", "GO_id": ["0005737", "0005783", "0019897", "0005634", "0046872", "0070006", "0009877", "0006508", "0009609"]}
199
+ {"protein_id": "Q9SSA4", "GO_id": ["0005737", "0005829", "0005524", "0004674", "0004712", "0004713", "0071244", "1902456", "0009637", "0010114", "0001659"]}
200
+ {"protein_id": "A0A0E0S977", "GO_id": ["0030428", "0005886", "0004100", "0006031"]}
201
+ {"protein_id": "A0A0E3JXD9", "GO_id": ["0005737", "0005576", "0009274", "0003884", "0009063"]}
202
+ {"protein_id": "Q2G5J3", "GO_id": ["0005829", "0016491"]}
203
+ {"protein_id": "A0A0H3GN27", "GO_id": ["0009986", "0005576", "0000015", "0000287", "0004634", "0006096"]}
204
+ {"protein_id": "W0W999", "GO_id": ["0033846", "0016787"]}
205
+ {"protein_id": "A0A803NI27", "GO_id": ["0031969", "0005789", "0005778", "0004420", "0015936", "0008299", "0016126"]}
206
+ {"protein_id": "B9SIM2", "GO_id": ["0009507", "0009899", "0000287", "0010333", "0009686"]}
207
+ {"protein_id": "A0A218MJF1", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
208
+ {"protein_id": "D5FKJ2", "GO_id": ["0030170", "0008483", "0006520", "0009058"]}
209
+ {"protein_id": "Q9YA51", "GO_id": ["0051539", "0016829", "0046872", "0008299"]}
210
+ {"protein_id": "Q6PBS3", "GO_id": ["0005737", "0005524", "0004830", "0006436"]}
211
+ {"protein_id": "Q8PWQ0", "GO_id": ["0005737", "0018799", "0046872", "0008299"]}
212
+ {"protein_id": "Q9KJY7", "GO_id": ["0008800", "0030655", "0046677"]}
213
+ {"protein_id": "A0A1S4F2V5", "GO_id": ["0008234", "0006508"]}
214
+ {"protein_id": "A0A6I8RMG7", "GO_id": ["0016020", "0005509", "0016853"]}
215
+ {"protein_id": "Q79F72", "GO_id": ["0016020", "0016491", "0006636"]}
216
+ {"protein_id": "B9LS20", "GO_id": ["0005829", "0047975", "0017061", "0009164"]}
217
+ {"protein_id": "G2RGE5", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
218
+ {"protein_id": "A8NCG7", "GO_id": ["0005576", "0030248", "0046872", "0004497", "0030245"]}
219
+ {"protein_id": "Q93NH5", "GO_id": ["0005737", "0005506", "0018535", "0009820", "0019608"]}
220
+ {"protein_id": "A5U3A3", "GO_id": ["0005886", "0005524", "0004674"]}
221
+ {"protein_id": "D2RW80", "GO_id": ["0043917", "0046523", "0019509", "0019323"]}
222
+ {"protein_id": "C5BK13", "GO_id": ["0016020", "0016717", "0006631"]}
223
+ {"protein_id": "M1T1K4", "GO_id": ["0030170", "0008483", "0006520", "0009058"]}
224
+ {"protein_id": "A0A095C6S0", "GO_id": ["0005782", "0003884", "0071949", "1990748", "0019478", "0019740"]}
225
+ {"protein_id": "S5SC42", "GO_id": ["0016491", "0019380"]}
226
+ {"protein_id": "A0A1V0QSG0", "GO_id": ["0000287", "0010333", "0016102"]}
227
+ {"protein_id": "Q50248", "GO_id": ["0044569", "0009375", "0005886", "0051538", "0051539", "0009055", "0008901", "0046872", "0015948"]}
228
+ {"protein_id": "B5T072", "GO_id": ["0016757", "0009243"]}
229
+ {"protein_id": "A0A2R2JFW7", "GO_id": ["0005829", "0005634", "0004053", "0030145", "0019547", "0000050"]}
230
+ {"protein_id": "O53044", "GO_id": ["0030288", "0008800", "0017001", "0046677"]}
231
+ {"protein_id": "E9ECB5", "GO_id": ["0030428", "0005935", "0045009", "0000131", "0005886", "0005628", "0004100", "0030476", "0006031", "0097271"]}
232
+ {"protein_id": "D9N4H3", "GO_id": ["0005576", "0004806", "0016042"]}
233
+ {"protein_id": "Q7MTZ8", "GO_id": ["0005737", "0005886", "0046872", "0008758", "0009245"]}
234
+ {"protein_id": "Q9VQQ0", "GO_id": ["0005737", "0005634", "0000159", "0005524", "0046872", "0003755", "0008160", "0045175", "0007052", "1904785"]}
235
+ {"protein_id": "D5E1T2", "GO_id": ["0016832", "0004801", "0005975"]}
236
+ {"protein_id": "D5E1S9", "GO_id": ["0005737", "0008736", "0006004"]}
237
+ {"protein_id": "Q50LF0", "GO_id": ["0005737", "0000166", "0008115", "0046653"]}
238
+ {"protein_id": "M1T9N7", "GO_id": ["0008168", "0017000", "0032259"]}
239
+ {"protein_id": "G9MLG2", "GO_id": ["0005829", "0070330", "0050660", "0010181", "0020037", "0005506", "0003958"]}
240
+ {"protein_id": "E9QSG0", "GO_id": ["0044695", "0012505", "0005789", "0061630", "0008270", "0043161", "0016567", "0051603"]}
241
+ {"protein_id": "Q4J8L0", "GO_id": ["0008821", "0003677", "0000287", "0006310", "0006281"]}
242
+ {"protein_id": "E1CJK1", "GO_id": ["0005576", "0004806", "0016042"]}
243
+ {"protein_id": "D9J041", "GO_id": ["0050660", "0016152", "0045340", "0003955", "0050661", "0016668", "0050787"]}
244
+ {"protein_id": "Q6D291", "GO_id": ["0042597", "0016787", "0046677"]}
245
+ {"protein_id": "A0A0X1KHF9", "GO_id": ["0005524", "0016301", "0046677"]}
246
+ {"protein_id": "Q8NQU4", "GO_id": ["0005886", "0015424", "0005524", "0016887"]}
247
+ {"protein_id": "G7XMT1", "GO_id": ["0005829", "0070330", "0050660", "0010181", "0020037", "0005506", "0003958"]}
248
+ {"protein_id": "A0A2V0R8Y2", "GO_id": ["0050053", "0009758"]}
249
+ {"protein_id": "U6BYK3", "GO_id": ["0009507", "0047804", "0030170", "0009086", "0019346"]}
250
+ {"protein_id": "A2VD68", "GO_id": ["0036064", "0005737", "0005741", "0004550", "0006241", "0006281", "0006183", "0008053", "0009142", "0006228"]}
251
+ {"protein_id": "A0A7E6FSU6", "GO_id": ["0005782", "0008445", "0071949", "0019478"]}
252
+ {"protein_id": "O53443", "GO_id": ["0005829", "0005886", "0005524", "0005525", "0016853", "0046872", "0004518"]}
253
+ {"protein_id": "Q5JIW4", "GO_id": ["0004067", "0006520"]}
254
+ {"protein_id": "V4HM83", "GO_id": ["0050660", "0004499", "0050661"]}
255
+ {"protein_id": "I1S0T9", "GO_id": ["0030428", "0005886", "0004100", "0071555", "0006031"]}
256
+ {"protein_id": "A9FDB7", "GO_id": ["0020037", "0005506", "0004497", "0016705"]}
257
+ {"protein_id": "Q79EF1", "GO_id": ["0016020", "0016491", "0006636"]}
258
+ {"protein_id": "A0A0H2ZM62", "GO_id": ["0005886", "0005524", "0000155"]}
259
+ {"protein_id": "E8WYN5", "GO_id": ["0016829", "0046872"]}
260
+ {"protein_id": "J9VPE7", "GO_id": ["0005782", "0003884", "0071949", "0019478", "0098754"]}
261
+ {"protein_id": "A0A143ZZK9", "GO_id": ["0005886", "0005524", "0016887", "0008556", "0008554", "0006874", "0034220", "0006813", "0006814"]}
262
+ {"protein_id": "H6C2T9", "GO_id": ["0030428", "0005886", "0004100", "0071555", "0006031"]}
263
+ {"protein_id": "A0A7C9FSB8", "GO_id": ["0009507", "0004452", "0046872", "0015995", "0050992", "0009240"]}
264
+ {"protein_id": "Q5XTQ4", "GO_id": ["0005576", "0106435", "0016042"]}
265
+ {"protein_id": "A0A5N6V703", "GO_id": ["0005576", "0008810", "0030248", "0046872", "0004497", "0030245"]}
266
+ {"protein_id": "A0A0J9XBC9", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
267
+ {"protein_id": "Q7S9V5", "GO_id": ["0005576", "0016158", "0003993"]}
268
+ {"protein_id": "Q59Y37", "GO_id": ["0005576", "0062040", "0030446", "0016020", "0005886", "0050625", "0010181", "0003955", "0034599"]}
269
+ {"protein_id": "D5E1S7", "GO_id": ["0008911"]}
270
+ {"protein_id": "F8VPU6", "GO_id": ["0004843", "0046872", "0016579", "0016567", "0006508"]}
271
+ {"protein_id": "A0A0H2ZNH9", "GO_id": ["0005886", "0004721", "0000155", "0004673", "0016036", "0006355", "0007165"]}
272
+ {"protein_id": "Q8EG04", "GO_id": ["0019003", "0005525", "0003924"]}
273
+ {"protein_id": "A0A1G9UQQ0", "GO_id": ["0046872", "0010333"]}
274
+ {"protein_id": "Q55406", "GO_id": ["0031676", "0016717", "0006636"]}
275
+ {"protein_id": "A0A100YXA1", "GO_id": ["0005829", "0016829", "0016740"]}
276
+ {"protein_id": "Q79FC3", "GO_id": ["0005576", "0033650", "0005886", "0016491"]}
277
+ {"protein_id": "Q5JGJ6", "GO_id": ["0051539", "0016829", "0046872", "0008299"]}
278
+ {"protein_id": "Q4X241", "GO_id": ["0005886", "0019003", "0005525", "0003924", "0030448", "0007165"]}
279
+ {"protein_id": "A0A2P6MHU9", "GO_id": ["0005737", "0008736", "0006004"]}
280
+ {"protein_id": "M4QN28", "GO_id": ["0005886", "0016757", "0009243"]}
281
+ {"protein_id": "E2RTQ7", "GO_id": ["0005813", "0032133", "0005737", "0097568", "0031514", "0031965", "0005634", "0005819", "0005876", "0051233", "0000922", "0097597", "0005524", "0008092", "0004674", "1902850", "0000281", "0140014", "0007052", "0051726", "0032465"]}
282
+ {"protein_id": "A0A0M3VI47", "GO_id": ["0009507", "0047804", "0030170", "0071266", "0019346"]}
283
+ {"protein_id": "G4Z2L3", "GO_id": ["0005886", "0004100", "0071555", "0006031"]}
284
+ {"protein_id": "A0A0H2ZJB2", "GO_id": ["0051537", "0046872", "0004497"]}
285
+ {"protein_id": "Q2U6T7", "GO_id": ["0005886", "0000293", "0046872", "0015677", "0006879", "0006826"]}
286
+ {"protein_id": "Q8DP70", "GO_id": ["0005576", "0016491"]}
287
+ {"protein_id": "A5U8X0", "GO_id": ["0003677", "0003917", "0046872", "0006265"]}
288
+ {"protein_id": "A0A1V0QSF3", "GO_id": ["0000287", "0010333", "0016102"]}
289
+ {"protein_id": "F1SVF8", "GO_id": ["0005886", "0008901", "0051911", "0016151", "0015948"]}
290
+ {"protein_id": "M9PGC5", "GO_id": ["0005737", "0005524", "0140693", "0019870", "0004672", "0106310", "0004674", "0006884", "0071474", "0035556", "0140694", "0050801", "0010766", "0090263", "1903288", "0035220"]}
291
+ {"protein_id": "Q93Z30", "GO_id": ["0005737", "0005829", "0005634", "0005524", "0004672", "0004712", "0007229", "0010119", "0009637"]}
292
+ {"protein_id": "Q89Y83", "GO_id": ["0005524", "0005525", "0046872", "0016779", "0051607", "0009117"]}
293
+ {"protein_id": "Q5F8J4", "GO_id": ["0004412", "0046872", "0070403", "0050661", "0009086", "0009088"]}
294
+ {"protein_id": "D0LZ73", "GO_id": ["0140737", "0004322", "0046872", "0006879", "0006826"]}
295
+ {"protein_id": "A0A0M0ELU2", "GO_id": ["0005737", "0016787", "0004664", "0009094", "0009372"]}
296
+ {"protein_id": "Q8YT18", "GO_id": ["0016829"]}
297
+ {"protein_id": "A0A1D8PU51", "GO_id": ["0016020", "0005739", "0051539", "0046872", "0008137", "0042773"]}
298
+ {"protein_id": "I6XFS7", "GO_id": ["0005576", "0030430", "0044196", "0008168", "0003676", "0031167"]}
299
+ {"protein_id": "B6QEB3", "GO_id": ["0005737", "0005507", "0004784"]}
300
+ {"protein_id": "G3X8Y1", "GO_id": ["0005737", "0005634", "0042802", "0030674", "0061630", "0008270", "0050904", "0002523", "1905517", "1901224", "0070936"]}
301
+ {"protein_id": "Q2U0G5", "GO_id": ["0030428", "0045009", "0005886", "0004100", "0030476", "0006031", "0000920"]}
302
+ {"protein_id": "A0A5A4WIX0", "GO_id": ["0004450", "0000287", "0051287", "0006097", "0006099"]}
303
+ {"protein_id": "A0A482A9N4", "GO_id": ["0005576", "0004497", "0030245"]}
304
+ {"protein_id": "Q5AZ52", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
305
+ {"protein_id": "A0A5N6UNY1", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
306
+ {"protein_id": "C8CP46", "GO_id": ["0005576", "0016787"]}
307
+ {"protein_id": "E7F654", "GO_id": ["0005730", "0005634", "0005524", "0051731", "0003723", "0000448", "0060216", "0048565", "0031017", "0001889", "0061113", "0006364"]}
308
+ {"protein_id": "A0A2P6MHT4", "GO_id": ["0003872", "0005524", "0046872", "0006002"]}
309
+ {"protein_id": "A0A4Y5QWK5", "GO_id": ["0009507", "0000287", "0010333", "0016102"]}
310
+ {"protein_id": "E9P162", "GO_id": ["0004497", "0000166"]}
311
+ {"protein_id": "Q09HD0", "GO_id": ["0005576", "0008800", "0030655", "0046677"]}
312
+ {"protein_id": "G4YM00", "GO_id": ["0005886", "0004100", "0071555", "0006031"]}
313
+ {"protein_id": "Q0D1W9", "GO_id": ["0005829", "0070330", "0050660", "0010181", "0020037", "0005506", "0003958"]}
314
+ {"protein_id": "Q4D5J7", "GO_id": ["0005789", "0009922", "0034625", "0034626", "0019367", "0030148", "0042761"]}
315
+ {"protein_id": "Q4WW94", "GO_id": ["0005634", "0005524", "0004674", "0007155", "0043484"]}
316
+ {"protein_id": "A0A7J6H013", "GO_id": ["0031969", "0005789", "0005778", "0004420", "0015936", "0008299", "0016126"]}
317
+ {"protein_id": "D8Q364", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
318
+ {"protein_id": "F8G0P2", "GO_id": ["0042597", "0000166", "0016491", "0009820", "0019608"]}
319
+ {"protein_id": "D9XF46", "GO_id": ["0047456", "0051539", "0003994", "0046872", "0017000", "0006099"]}
320
+ {"protein_id": "A1XLE2", "GO_id": ["0005829", "0005634", "0030234", "0016829", "0046872", "0019760", "0080028", "0018969"]}
321
+ {"protein_id": "A0A1V0QSH1", "GO_id": ["0000287", "0010333", "0016102"]}
322
+ {"protein_id": "Q9VQ29", "GO_id": ["0005743", "0005739", "0045275", "0051537", "0046872", "0016491", "0008121", "0006122"]}
323
+ {"protein_id": "Q0P9A8", "GO_id": ["0005829", "0008713", "0009244"]}
324
+ {"protein_id": "V4HJ70", "GO_id": ["0071949", "0016491"]}
325
+ {"protein_id": "H6CA42", "GO_id": ["0030428", "0005886", "0004100", "0006031"]}
326
+ {"protein_id": "A2QNG6", "GO_id": ["0030428", "0045009", "0005886", "0004100", "0030476", "0006031", "0000920"]}
327
+ {"protein_id": "Q1AYM8", "GO_id": ["0005737", "0005576", "0009274", "0003884", "0071949", "0019478"]}
328
+ {"protein_id": "F9VNG5", "GO_id": ["0004412", "0046872", "0050661", "0009086", "0009088"]}
329
+ {"protein_id": "Q9VDA0", "GO_id": ["0005634", "0043138", "0005524", "0000400", "0009378", "0016787", "0006281", "0006302", "0045003", "0035825", "0036297", "0045950", "0000725"]}
330
+ {"protein_id": "A0A1V0QSH7", "GO_id": ["0005737", "0005634", "0004337", "0004161", "0046872", "0006695", "0045337"]}
331
+ {"protein_id": "Q6E7K9", "GO_id": ["0005886", "0070566", "0005524", "0016874", "0071766", "0006633"]}
332
+ {"protein_id": "S7QP81", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
333
+ {"protein_id": "F0NID4", "GO_id": ["0005524", "0003677", "0009378", "0016787", "0006310", "0006281"]}
334
+ {"protein_id": "Q9ZIT6", "GO_id": ["0005886", "0008918", "0046872", "0009244"]}
335
+ {"protein_id": "A0A5R8T042", "GO_id": ["0005576", "0008800", "0030655", "0046677"]}
336
+ {"protein_id": "G4N553", "GO_id": ["0071944", "0030428", "0016020", "0004100", "0006031"]}
337
+ {"protein_id": "A0A3S5XFG0", "GO_id": ["0046872", "0004659", "0046165", "0008299", "0043386"]}
338
+ {"protein_id": "Q79F73", "GO_id": ["0016020", "0016717", "0006636"]}
339
+ {"protein_id": "Q4WQ60", "GO_id": ["0005789", "0016740"]}
340
+ {"protein_id": "Q5U921", "GO_id": ["0016740"]}
341
+ {"protein_id": "O50131", "GO_id": ["0042802", "0030170", "0008483"]}
342
+ {"protein_id": "Q88JH5", "GO_id": ["0016020", "0030288", "0052934", "0005509"]}
343
+ {"protein_id": "J1H1J3", "GO_id": ["0004022", "0046872"]}
344
+ {"protein_id": "V5XKC3", "GO_id": ["0005737", "0140618", "0010106"]}
345
+ {"protein_id": "F2K079", "GO_id": ["0050660", "0004499", "0050661"]}
346
+ {"protein_id": "A0A0H3GPN8", "GO_id": ["0005886", "0005524", "0000155", "0007155"]}
347
+ {"protein_id": "Q8E8S0", "GO_id": ["0005886", "0009055", "0020037", "0046872", "0009061", "0019333"]}
348
+ {"protein_id": "Q7X2D3", "GO_id": ["0005737", "0005576", "0009274", "0003884", "0071949", "0043799", "0019478"]}
349
+ {"protein_id": "B9SIL7", "GO_id": ["0009507", "0009899", "0034280", "0000287", "0010333", "0009686"]}
350
+ {"protein_id": "A0A1C9ZMC3", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
351
+ {"protein_id": "A0A1D8PS71", "GO_id": ["0009986", "0030287", "0005576", "0009277", "0008707", "0003993", "0030448"]}
352
+ {"protein_id": "B9CM12", "GO_id": ["0046872", "0016779", "0032923"]}
353
+ {"protein_id": "Q7CS24", "GO_id": ["0010181", "0004497", "0042602"]}
354
+ {"protein_id": "Q4X1A8", "GO_id": ["0000785", "0005737", "0005634", "0005524", "0106310", "0004674", "1903940", "1900237", "0045944"]}
355
+ {"protein_id": "A0A2C9JXL4", "GO_id": ["0016020", "0016263", "0030145", "0000166", "0016267", "0006486"]}
356
+ {"protein_id": "A0A0J9VGQ5", "GO_id": ["0030428", "0005886", "0004100", "0006031"]}
357
+ {"protein_id": "A0A0C3HJL3", "GO_id": ["0005829", "0070330", "0050660", "0010181", "0020037", "0005506", "0003958"]}
358
+ {"protein_id": "Q704F0", "GO_id": ["0016020", "0016491", "0006636"]}
359
+ {"protein_id": "A0A1L7VFX3", "GO_id": ["0004337", "0004161", "0016829", "0046872", "0046165", "0008299", "0043386"]}
360
+ {"protein_id": "A0A5J6BJT3", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
361
+ {"protein_id": "A0A384JJE6", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
362
+ {"protein_id": "G2QA92", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
363
+ {"protein_id": "C4R4G9", "GO_id": ["0005782", "0005777", "0003884", "0071949", "0046436", "0019478", "0046416", "1902114", "0019740"]}
364
+ {"protein_id": "A0A1D8PJX3", "GO_id": ["0005743", "0005886", "0045275", "0051537", "0046872", "0016491", "0008121", "0006122"]}
365
+ {"protein_id": "A0A1D1VU85", "GO_id": ["0005507", "0042802", "0042803", "0004784", "0008270", "0019430"]}
366
+ {"protein_id": "B8G5D6", "GO_id": ["0010181", "0050661", "0003959"]}
367
+ {"protein_id": "Q39QF4", "GO_id": ["0003995", "0050660"]}
368
+ {"protein_id": "H6C7U6", "GO_id": ["0030428", "0005886", "0004100", "0006031"]}
369
+ {"protein_id": "Q8E8P8", "GO_id": ["0005886", "0046872", "0016653", "0006784"]}
370
+ {"protein_id": "E4Q361", "GO_id": ["0005829", "0004565", "0008422", "0016162", "0031217", "0009044", "0030245", "0009251", "0005990", "0070207", "0045493"]}
371
+ {"protein_id": "J9VUY6", "GO_id": ["0005829", "0005524", "0003922", "0003921", "0006177"]}
372
+ {"protein_id": "A0A223GEC9", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
373
+ {"protein_id": "A0A5J6BJN2", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
374
+ {"protein_id": "A0A1D8PJ73", "GO_id": ["0016020", "0005743", "0005886", "0045271", "0008137"]}
375
+ {"protein_id": "D3ZDM7", "GO_id": ["0005737", "0005829", "0005782", "0005777", "0008445", "0071949", "0006531", "0019478", "0007625", "0042445", "0007320", "0050877", "0010646"]}
376
+ {"protein_id": "Q81BR3", "GO_id": ["0005524", "0008986", "0046677"]}
377
+ {"protein_id": "A0A2P6MHY1", "GO_id": ["0016832", "0008270", "0005975"]}
378
+ {"protein_id": "Q55131", "GO_id": ["0016491", "0008295"]}
379
+ {"protein_id": "A0A1C3YMT2", "GO_id": ["0030428", "0005886", "0004100", "0071555", "0006031"]}
380
+ {"protein_id": "A2QHC2", "GO_id": ["0030428", "0005886", "0004100", "0071555", "0006031"]}
381
+ {"protein_id": "Q1QYW1", "GO_id": ["0005737", "0010181", "0016491"]}
382
+ {"protein_id": "Q55231", "GO_id": ["0016020", "0016491", "0006636"]}
383
+ {"protein_id": "Q9HTF3", "GO_id": ["0051537", "0046872", "0004497", "0016491", "0031457"]}
384
+ {"protein_id": "Q8DN03", "GO_id": ["0005886", "0005524", "0000155"]}
385
+ {"protein_id": "A0A0J9XL55", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
386
+ {"protein_id": "G1FNI6", "GO_id": ["0005829", "0005634", "0030234", "0042802", "0016829", "0046872", "0042803", "0019760", "0080028"]}
387
+ {"protein_id": "A0A2U7R6V5", "GO_id": ["0005576", "0031176", "0045493"]}
388
+ {"protein_id": "Q7SH52", "GO_id": ["0005951", "0005737", "0005759", "0005524", "0004087", "0004088", "0046872", "0006526", "0006221"]}
389
+ {"protein_id": "R0IGL9", "GO_id": ["0005829", "0070330", "0052722", "0050660", "0010181", "0020037", "0005506", "0003958"]}
390
+ {"protein_id": "Q55230", "GO_id": ["0016020", "0016491", "0006636"]}
391
+ {"protein_id": "B2B5J7", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
392
+ {"protein_id": "Q9LKZ2", "GO_id": ["0005524", "0004707", "0010225", "0009611"]}
393
+ {"protein_id": "Q4J8K8", "GO_id": ["0005524", "0016887", "0003677", "0006310", "0006281"]}
394
+ {"protein_id": "A0A3Q0KHE7", "GO_id": ["0005737", "0005634", "0070611", "0042054", "0035242", "1904047", "0006338", "0032259", "0006355", "0048608", "0046500"]}
395
+ {"protein_id": "Q66KP0", "GO_id": ["0036064", "0005737", "0005741", "0004550", "0006241", "0006281", "0006183", "0008053", "0009142", "0006228"]}
396
+ {"protein_id": "C4R6B0", "GO_id": ["0005782", "0008445", "0071949", "0019478", "0046416", "0019740"]}
397
+ {"protein_id": "F8G0P1", "GO_id": ["0042597", "0016491", "0009820", "0019608"]}
398
+ {"protein_id": "J1H1H5", "GO_id": ["0005737", "0008736", "0005996"]}
399
+ {"protein_id": "A0A1V0QSF4", "GO_id": ["0009507", "0000287", "0010333", "0016102"]}
400
+ {"protein_id": "Q9FGB1", "GO_id": ["0005737", "0005829", "0005886", "0009506", "0009536", "0005524", "0004674", "0004712", "0004713", "0071244", "1902456", "0009637", "0010114", "0001659"]}
401
+ {"protein_id": "A5U3S4", "GO_id": ["0005737", "0005576", "0009274", "0003884", "0071949", "0009060", "0019478", "0006546"]}
402
+ {"protein_id": "I1RIF1", "GO_id": ["0005576", "0016787", "0016042"]}
403
+ {"protein_id": "Q7KRU8", "GO_id": ["0005737", "0005576", "0070288", "0045169", "0005794", "0008199", "0008198", "0004322", "0008283", "1990461", "0006879", "0098711", "0009791", "0009620", "0030431"]}
404
+ {"protein_id": "Q1QYU6", "GO_id": ["0051537", "0046872", "0004497"]}
405
+ {"protein_id": "G1SPE9", "GO_id": ["0030054", "0031966", "0005778", "0005777", "0004366", "0016287", "0021587", "0008611", "0006631", "0006650", "0061024", "0030913", "0008654", "0007416", "0019432"]}
406
+ {"protein_id": "A0A5N6UX39", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
407
+ {"protein_id": "A0A1Y2IY60", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
408
+ {"protein_id": "A0A509AF62", "GO_id": ["0020011", "0016740"]}
409
+ {"protein_id": "Q8VQ46", "GO_id": ["0016020", "0008107", "0036065", "0009243", "0006486"]}
410
+ {"protein_id": "A5HKP3", "GO_id": ["0016787", "0006629"]}
411
+ {"protein_id": "C1G1C3", "GO_id": ["0004412", "0046872", "0050661", "0009090", "0009097", "0009086", "0009088"]}
412
+ {"protein_id": "A0A1G9FQX8", "GO_id": ["0033846"]}
413
+ {"protein_id": "Q5U922", "GO_id": ["0008720", "0051287"]}
414
+ {"protein_id": "S7QKE2", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
415
+ {"protein_id": "Q50LF2", "GO_id": ["0005737", "0000166", "0008115", "0046653"]}
416
+ {"protein_id": "Q50227", "GO_id": ["0005886", "0046872", "0016491", "0015948", "0022904"]}
417
+ {"protein_id": "Q58YW1", "GO_id": ["0005886", "0016740", "0009103"]}
418
+ {"protein_id": "A0A0M2HFA3", "GO_id": ["0050660", "0016614"]}
419
+ {"protein_id": "I1S097", "GO_id": ["0030428", "0005886", "0004100", "0071555", "0006031"]}
420
+ {"protein_id": "W8X9R6", "GO_id": ["0005737", "0016491"]}
421
+ {"protein_id": "A0A6C0PI29", "GO_id": ["0005576", "0030600", "0017000", "0006629", "0072330", "0045493"]}
422
+ {"protein_id": "A0A7J6F8C5", "GO_id": ["0009570", "0051539", "0051745", "0046872", "0050992", "0019288"]}
423
+ {"protein_id": "A0A5A4WIZ7", "GO_id": ["0004450", "0046872", "0006097", "0006099"]}
424
+ {"protein_id": "F1QWW8", "GO_id": ["0005789", "0047560", "0070402", "0006666", "0090156", "0030220", "0030148", "0090520", "0006686"]}
425
+ {"protein_id": "A0A6C0M6J9", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
426
+ {"protein_id": "A0A5J6BJQ5", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
427
+ {"protein_id": "Q504M2", "GO_id": ["0005739", "0004741", "0046872", "0042093"]}
428
+ {"protein_id": "A0A2K8FTN3", "GO_id": ["0020011", "0016740"]}
429
+ {"protein_id": "A1E280", "GO_id": ["0004497", "0000166"]}
430
+ {"protein_id": "A0A1D8PDV7", "GO_id": ["0005737", "0005768", "0010008", "0000329", "0005794", "0016020", "0071561", "0005777", "0000407", "0034271", "0034272", "0016303", "0005524", "0004672", "0032120", "0000045", "0006914", "0071470", "0051365", "0009267", "0006897", "0030447", "0036180", "0036170", "0000280", "0030473", "0000425", "0046854", "0036092", "0048015", "0032968", "0006624", "0007034", "0007033", "0016192"]}
431
+ {"protein_id": "Q971U1", "GO_id": ["0005524", "0140097", "0003677", "0004386", "0016787"]}
432
+ {"protein_id": "Q39QF5", "GO_id": ["0003995", "0050660"]}
433
+ {"protein_id": "A3SI50", "GO_id": ["0050660", "0016627"]}
434
+ {"protein_id": "A0A0H2ZLL3", "GO_id": ["0005886", "0005524", "0016887", "0006865", "0015697"]}
435
+ {"protein_id": "B2ADG1", "GO_id": ["0005576", "0030248", "0046872", "0004497", "0030245"]}
436
+ {"protein_id": "Q73MU2", "GO_id": ["0047304", "0046872", "0016779", "0032923", "0019700"]}
437
+ {"protein_id": "A5PJW8", "GO_id": ["0000781", "0005739", "0005665", "0003682", "0003677", "0003899", "0071667", "0016787", "0032549", "0003968", "0008270", "0006366"]}
438
+ {"protein_id": "F8VPZ3", "GO_id": ["0005794", "0000139", "0005509", "0004843", "1904263", "0016579", "0006508"]}
439
+ {"protein_id": "G2WS43", "GO_id": ["0030428", "0005886", "0004100", "0006031"]}
440
+ {"protein_id": "A5U6Z7", "GO_id": ["0005829", "0009295", "0003677", "0016491", "0030527", "0030261", "0006879"]}
441
+ {"protein_id": "A9FRJ0", "GO_id": ["0000166", "0016491"]}
442
+ {"protein_id": "F0NEL5", "GO_id": ["0003677", "0004519", "0046872", "0006310", "0006281"]}
443
+ {"protein_id": "A8Q9M3", "GO_id": ["0005576", "0004806", "0016042"]}
444
+ {"protein_id": "I1REU9", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
445
+ {"protein_id": "J9VH79", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
446
+ {"protein_id": "A0A0H2UQZ4", "GO_id": ["0005576", "0016491"]}
447
+ {"protein_id": "B8Y445", "GO_id": ["0051539", "0046872", "0016491", "0015948"]}
448
+ {"protein_id": "A0A7J6HK32", "GO_id": ["0000287", "0010333", "0016102"]}
449
+ {"protein_id": "E9EDR6", "GO_id": ["0030428", "0045009", "0005886", "0004100", "0030476", "0006031", "0000920"]}
450
+ {"protein_id": "A5U229", "GO_id": ["0051539", "0005524", "0003677", "0003678", "0016818", "0046872", "0006310", "0006281"]}
451
+ {"protein_id": "A0A7J6G7L9", "GO_id": ["0009570", "0030604", "0030145", "0070402", "0051484"]}
452
+ {"protein_id": "A9ES55", "GO_id": ["0004324", "0000166", "0034599", "0042167"]}
453
+ {"protein_id": "A8QCW4", "GO_id": ["0005576", "0004806", "0016042"]}
454
+ {"protein_id": "A0A5J6BJN5", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
455
+ {"protein_id": "C8V4I9", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
456
+ {"protein_id": "Q71RT3", "GO_id": ["0005576", "0050053", "0009758"]}
457
+ {"protein_id": "A0A1D8PT03", "GO_id": ["0005737", "0032126", "0062040", "0016020", "0005886", "0010181", "0003955", "0034599", "0160020"]}
458
+ {"protein_id": "G4MVT6", "GO_id": ["0030428", "0005886", "0004100", "0071555", "0006031"]}
459
+ {"protein_id": "W8X092", "GO_id": ["0005737", "0016491"]}
460
+ {"protein_id": "B0XQY0", "GO_id": ["0044695", "0005789", "0061630", "0008270", "0043161", "0016567"]}
461
+ {"protein_id": "B7JA34", "GO_id": ["0016020", "0000166", "0016491", "0009245"]}
462
+ {"protein_id": "Q7RWN7", "GO_id": ["0005576", "0030248", "0046872", "0004497", "0030245"]}
463
+ {"protein_id": "A0R2V5", "GO_id": ["0005829", "0005524", "0005525", "0016853", "0046872", "0004518"]}
464
+ {"protein_id": "Q8IDQ9", "GO_id": ["0005794", "0000139", "0000234", "0032259", "0006656"]}
465
+ {"protein_id": "A0A0F7G352", "GO_id": ["0009507", "0062116", "0008234", "0050547", "0042802", "0042803", "0009699", "0006508", "0042189"]}
466
+ {"protein_id": "Q54129", "GO_id": ["0016757", "0009243", "0044010"]}
467
+ {"protein_id": "A0A2U7QU15", "GO_id": ["0005576", "0030248", "0031176", "0045493"]}
468
+ {"protein_id": "I1BJ58", "GO_id": ["0030428", "0005886", "0004100", "0071555", "0006031"]}
469
+ {"protein_id": "A2QRA0", "GO_id": ["0030428", "0005886", "0004100", "0006031"]}
470
+ {"protein_id": "Q54795", "GO_id": ["0031676", "0016717", "0006636"]}
471
+ {"protein_id": "Q8T913", "GO_id": ["0043240", "0005634", "0061630", "0008270", "0006281", "0036297", "0006513"]}
472
+ {"protein_id": "A0A7J6HWR9", "GO_id": ["0005737", "0005634", "0004337", "0004161", "0046872", "0006695", "0045337"]}
473
+ {"protein_id": "B9T625", "GO_id": ["0009507", "0009899", "0000287", "0010333", "0009686"]}
474
+ {"protein_id": "S7RK00", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
475
+ {"protein_id": "Q59128", "GO_id": ["0005737", "0051537", "0046872", "0018535", "0009820", "0019608"]}
476
+ {"protein_id": "T2HG31", "GO_id": ["0005782", "0003884", "0071949", "0043799", "0019478", "0048599", "1905939"]}
477
+ {"protein_id": "G1TNM3", "GO_id": ["0022626", "0022627", "0005743", "0005730", "0005819", "0140078", "0003677", "0003723", "0003735", "0006915", "0051301", "0006281", "2001235", "0006417", "0006412"]}
478
+ {"protein_id": "Q50249", "GO_id": ["0005886", "0008901", "0016151", "0015948"]}
479
+ {"protein_id": "H6C4I7", "GO_id": ["0030428", "0005886", "0004100", "0071555", "0006031"]}
480
+ {"protein_id": "O58802", "GO_id": ["0004412", "0046872", "0070403", "0050661", "0009086", "0009088"]}
481
+ {"protein_id": "Q1PX48", "GO_id": ["0044222", "0020037", "0033740", "0042802", "0046872", "0019331", "0006809", "0070207"]}
482
+ {"protein_id": "A0A5N6V3W5", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
483
+ {"protein_id": "B2ARG6", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
484
+ {"protein_id": "Q8IJK4", "GO_id": ["0020011", "0016020", "0005524", "0004808", "0000049", "0103016", "0008033"]}
485
+ {"protein_id": "C0PBF8", "GO_id": ["0005737", "0003884", "0008445", "0019740"]}
486
+ {"protein_id": "Q9SAJ2", "GO_id": ["0005737", "0005524", "0004674", "0004712", "0009734", "0071365", "0010928", "0009733", "0007165"]}
487
+ {"protein_id": "A0A7W3RCJ3", "GO_id": ["0008911"]}
488
+ {"protein_id": "E9E1W3", "GO_id": ["0030428", "0005886", "0004100", "0071555", "0006031"]}
489
+ {"protein_id": "A0A803PDZ0", "GO_id": ["0009507", "0008661", "0046872", "0052865", "0019682", "0016114", "0009228"]}
490
+ {"protein_id": "Q4W1X2", "GO_id": ["0000166", "0008767", "0071555"]}
491
+ {"protein_id": "A0A1C9ZP88", "GO_id": ["0005886", "0098552", "0046872", "0004497", "0030245"]}
492
+ {"protein_id": "A0A095CCB2", "GO_id": ["0005782", "0003884", "0071949", "0019478", "0098754"]}
493
+ {"protein_id": "Q8VYX1", "GO_id": ["0008170", "0000234", "0032259", "0006656"]}
494
+ {"protein_id": "A8BPK8", "GO_id": ["0005930", "0036064", "0097542", "1902671", "0097554", "1902677", "0097560", "1902673", "0097556", "1902675", "0097558", "0097568", "0016020", "0072686", "0005634", "1902672", "0097555", "1902678", "0097561", "1902674", "0097557", "1902676", "0097559", "0097597", "0005524", "0004672", "0004674", "0000278", "1902410", "0140014", "1905504", "1901978", "0031114"]}
495
+ {"protein_id": "O64768", "GO_id": ["0005737", "0005524", "0004674", "0004712", "0009734", "0071365", "0010928", "0009733", "0007165"]}
496
+ {"protein_id": "A0A1S3Z5Y0", "GO_id": ["0005737", "0005634", "0005524", "0004707", "0106310", "0004674", "0004713", "0006952", "0035556"]}
497
+ {"protein_id": "A0A1D8PJA8", "GO_id": ["0009986", "0005576", "0009277", "0030446", "0016798", "0071555"]}
498
+ {"protein_id": "G3XCW3", "GO_id": ["0009276", "0016020", "0098567", "0005886", "0016757", "0008755", "0043165", "0009103", "0009243"]}
499
+ {"protein_id": "G4N1B2", "GO_id": ["0030428", "0045009", "0005886", "0004100", "0030476", "0006031", "0000920"]}
500
+ {"protein_id": "A2QLV1", "GO_id": ["0005829", "0070330", "0050660", "0010181", "0020037", "0005506", "0003958"]}
501
+ {"protein_id": "Q4WSZ0", "GO_id": ["0005789", "0047560", "0070402", "0006666", "0030148"]}
502
+ {"protein_id": "Q2VG90", "GO_id": ["0005576", "0004806", "0016042"]}
503
+ {"protein_id": "A8PX35", "GO_id": ["0005576", "0004806", "0016042"]}
504
+ {"protein_id": "Q4X066", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
505
+ {"protein_id": "K5VN09", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
506
+ {"protein_id": "Q1K8B6", "GO_id": ["0005576", "0005507", "0004497", "0016705", "0019825", "0030245", "0000272"]}
507
+ {"protein_id": "A0A1Y9G8H0", "GO_id": ["0005576", "0020037", "0046872", "0004601", "0006979", "0042311"]}
508
+ {"protein_id": "Q9Y795", "GO_id": ["0030428", "0005886", "0004100", "0006031"]}
509
+ {"protein_id": "Q6E7K8", "GO_id": ["0016020", "0016717", "0006631"]}
510
+ {"protein_id": "Q4WP32", "GO_id": ["0005576", "0004497", "0030245"]}
511
+ {"protein_id": "Q7SHI8", "GO_id": ["0005576", "0030248", "0046872", "0004497", "0030245"]}
512
+ {"protein_id": "Q5B6H0", "GO_id": ["0005576", "0016798", "0046872", "0030245"]}
513
+ {"protein_id": "Q9HGY3", "GO_id": ["0005782", "0003884", "0071949", "0019478", "0019740"]}
514
+ {"protein_id": "Q6FSR7", "GO_id": ["0010008", "0005576", "0000329", "0005794", "0071561", "0005777", "0000407", "0034271", "0034272", "0016303", "0005524", "0004672", "0032120", "0000045", "0051365", "0006897", "0006879", "0034755", "0000425", "0048015", "0048227", "0032968", "0072665"]}
515
+ {"protein_id": "I0JWN7", "GO_id": ["0016020", "0042597", "0052933", "0052934", "0005509", "0015945"]}
516
+ {"protein_id": "A0A0E0RQ52", "GO_id": ["0030428", "0005886", "0004100", "0006031"]}
517
+ {"protein_id": "Q7CPF5", "GO_id": ["0005829", "0005524", "0046872", "0004747", "0006014"]}
518
+ {"protein_id": "I1RDA9", "GO_id": ["0005576", "0016787", "0016042"]}
519
+ {"protein_id": "C8V530", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
520
+ {"protein_id": "A0A1D6NER6", "GO_id": ["0008170", "0000234", "0032259", "0006656"]}
521
+ {"protein_id": "Q8CBC7", "GO_id": ["0005829", "0005634", "1904047", "0106340", "0016423", "0002181", "0022008", "0002130"]}
522
+ {"protein_id": "G2WYP9", "GO_id": ["0030428", "0005935", "0045009", "0000131", "0005886", "0005628", "0004100", "0030476", "0006031", "0097271"]}
523
+ {"protein_id": "Q6K461", "GO_id": ["0005737", "0005634", "0052845", "0016791"]}
524
+ {"protein_id": "Q9HTF4", "GO_id": ["0051537", "0005506", "0004497", "0031457"]}
525
+ {"protein_id": "Q4J834", "GO_id": ["0003677", "0004519", "0046872", "0006310", "0006281"]}
526
+ {"protein_id": "G2QNT0", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
527
+ {"protein_id": "Q5BEI9", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
528
+ {"protein_id": "A0A1D5NVS8", "GO_id": ["0000785", "0072487", "0005634", "0141054", "0140585", "0061630", "0008270", "0006974", "0040029", "0045893", "0006513", "0016567"]}
529
+ {"protein_id": "Q8IBT4", "GO_id": ["0020011", "0031071", "0006534"]}
530
+ {"protein_id": "A4D0H5", "GO_id": ["0004497", "0000166", "0017000"]}
531
+ {"protein_id": "Q50LE9", "GO_id": ["0005737", "0008115", "1901053"]}
532
+ {"protein_id": "A2QTJ1", "GO_id": ["0030428", "0005935", "0045009", "0000131", "0005886", "0005628", "0004100", "0030476", "0006031", "0097271"]}
533
+ {"protein_id": "A0A7J6EK66", "GO_id": ["0009507", "0008661", "0046872", "0052865", "0015995", "0019682", "0016114", "0009228"]}
534
+ {"protein_id": "A0A0K0VEZ4", "GO_id": ["0005576", "0004806", "0016042"]}
535
+ {"protein_id": "A0A384K4U6", "GO_id": ["0005576", "0030248", "0046872", "0004497", "0030245"]}
536
+ {"protein_id": "G2QAB5", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
537
+ {"protein_id": "Q9W2R3", "GO_id": ["0016324", "0005938", "0005694", "0035323", "0031514", "0005634", "0005886", "0016308", "0003383", "0071711", "0030866", "0046843", "0007032", "0001837", "0071963", "0030707", "0007444", "0007310", "0008103", "0048477", "0046854", "0008104", "1903689", "0007286", "0007283"]}
538
+ {"protein_id": "F8D9F4", "GO_id": ["0046872", "0016791", "0044283"]}
539
+ {"protein_id": "A0A0C1E1D0", "GO_id": ["0005829", "0004069", "0004838", "0030170", "0006532"]}
540
+ {"protein_id": "A5A8G0", "GO_id": ["0009507", "0009905", "0034281", "0009899", "0034280", "0000287", "0010333", "0016102", "0033332"]}
541
+ {"protein_id": "A0A2H3EDS0", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
542
+ {"protein_id": "A0A089LI24", "GO_id": ["0030246", "0016757", "0005975"]}
543
+ {"protein_id": "Q50225", "GO_id": ["0044569", "0009375", "0005886", "0051538", "0051539", "0009055", "0008901", "0046872", "0051911", "0015948"]}
544
+ {"protein_id": "G5EJN7", "GO_id": ["0005829", "0070330", "0050660", "0010181", "0020037", "0005506", "0003958"]}
545
+ {"protein_id": "G0R6T8", "GO_id": ["0005576", "0030248", "0046872", "0004497", "0030245"]}
546
+ {"protein_id": "W4KMP1", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
547
+ {"protein_id": "Q5B8T4", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
548
+ {"protein_id": "D6Y851", "GO_id": ["0005829", "0005524", "0005525", "0016853", "0046872", "0004518"]}
549
+ {"protein_id": "O18333", "GO_id": ["0120281", "0000421", "0031410", "0070971", "0005794", "0000139", "0043025", "0043204", "0005886", "0098975", "0048786", "0061175", "0031982", "0005525", "0003924", "0000287", "0061909", "0006888", "0008057", "0009306", "0032482", "0106104", "0099175", "0160156", "0046718", "0033292", "0016192"]}
550
+ {"protein_id": "Q39QF7", "GO_id": ["0008775", "0006083"]}
551
+ {"protein_id": "F8JK18", "GO_id": ["0046872", "0010333"]}
552
+ {"protein_id": "Q5BAP2", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
553
+ {"protein_id": "B2AVF1", "GO_id": ["0005576", "0030248", "0046872", "0004497", "0030245"]}
554
+ {"protein_id": "E1AQY3", "GO_id": ["0004015", "0030170", "0009102", "0009448", "0009699"]}
555
+ {"protein_id": "Q9V3I2", "GO_id": ["0043679", "0005938", "0005829", "0005769", "0061645", "0030139", "0012505", "0032585", "0043025", "0043204", "0005886", "0098830", "0045202", "0008021", "0043195", "0031982", "0030234", "0005525", "0003924", "0048675", "0007409", "0007298", "0007349", "0061883", "0048813", "0046664", "0006897", "0016197", "0034058", "0007032", "0032509", "0035088", "0006887", "0048803", "0006886", "0036258", "0007399", "0006836", "0007220", "0048477", "0006909", "0048015", "0048227", "0032482", "0006898", "0032956", "0050803", "1903186", "0046718", "0048488", "0016189", "0016050", "0016192", "0035220"]}
556
+ {"protein_id": "A0A1S4CGX4", "GO_id": ["0005737", "0005524", "0004708", "0004674", "0006952", "0051707"]}
557
+ {"protein_id": "J1H0Z7", "GO_id": ["0016740"]}
558
+ {"protein_id": "Q87NI7", "GO_id": ["0005886", "0071111"]}
559
+ {"protein_id": "Q9WYJ1", "GO_id": ["0003941", "0030170", "0004794", "0009097", "0006565", "0006567"]}
560
+ {"protein_id": "Q7VNA4", "GO_id": ["0005829", "0008713", "0009244"]}
561
+ {"protein_id": "A0A1D8PHA3", "GO_id": ["0005743", "0005886", "0045275", "0009055", "0020037", "0046872", "0008121", "0006122"]}
562
+ {"protein_id": "A0A0J9XK58", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
563
+ {"protein_id": "Q9X7P6", "GO_id": ["0005737", "0005576", "0009274", "0003884", "0071949", "0019478"]}
564
+ {"protein_id": "A0A0H4K9X4", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
565
+ {"protein_id": "G2RB73", "GO_id": ["0005576", "0030248", "0046872", "0004497", "0030245"]}
566
+ {"protein_id": "A8CEI3", "GO_id": ["0009507", "0047804", "0030170", "0019346"]}
567
+ {"protein_id": "C5B120", "GO_id": ["0016020", "0030288", "0005509", "0016614", "0015945"]}
568
+ {"protein_id": "B1KN81", "GO_id": ["0042597", "0016829"]}
569
+ {"protein_id": "A0A1C9CXI1", "GO_id": ["0005576", "0030248", "0046872", "0004497", "0030245"]}
570
+ {"protein_id": "A0A384JXL6", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
571
+ {"protein_id": "A5U1J9", "GO_id": ["0005737", "0017136", "0070403", "0036054", "0036055", "0008270"]}
572
+ {"protein_id": "A0A4Y5L9K3", "GO_id": ["0000287", "0010333", "0016102"]}
573
+ {"protein_id": "A0A1V0QSF8", "GO_id": ["0009507", "0000287", "0010333", "0016102"]}
574
+ {"protein_id": "A0A2I1D2M7", "GO_id": ["0003962", "0016853", "0030170", "0019346"]}
575
+ {"protein_id": "A9CEY6", "GO_id": ["0005829", "0016491"]}
576
+ {"protein_id": "H3ZR39", "GO_id": ["0042802", "0016853", "0030170", "0008483"]}
577
+ {"protein_id": "Q7S439", "GO_id": ["0005576", "0030248", "0046872", "0004497", "0030245"]}
578
+ {"protein_id": "Q5B467", "GO_id": ["0005789", "0008168", "0032259"]}
579
+ {"protein_id": "A0A193CHJ5", "GO_id": ["0005576", "0005509", "0047498", "0099106", "0005543", "0090729", "0050482", "0016042", "0042130", "0006644"]}
580
+ {"protein_id": "A0A1V0QSF6", "GO_id": ["0000287", "0010333", "0016102"]}
581
+ {"protein_id": "Q9X4C9", "GO_id": ["0005829", "1990077", "0005524", "0016887", "0003677", "0003678", "0042802", "0006269"]}
582
+ {"protein_id": "A0A3G2S5J6", "GO_id": ["0005576", "0120516", "0047372", "0004806", "0016042"]}
583
+ {"protein_id": "A0A4P8PKE4", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
584
+ {"protein_id": "A0A5J6BJT0", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
585
+ {"protein_id": "S5RR11", "GO_id": ["0051537", "0005506", "0016491"]}
586
+ {"protein_id": "A0A7W3RDA3", "GO_id": ["0005737", "0008736", "0006004"]}
587
+ {"protein_id": "E2JKI3", "GO_id": ["0003677", "0000287", "0009036", "0009307"]}
588
+ {"protein_id": "Q8DPL8", "GO_id": ["0016020", "0016787", "0000156", "0000155", "0030295", "0019901", "0007234", "0006355", "0007165"]}
589
+ {"protein_id": "Q4DUK4", "GO_id": ["0005789", "0009922", "0034625", "0034626", "0019367", "0030148", "0042761"]}
590
+ {"protein_id": "A0A478ECY3", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
591
+ {"protein_id": "A0A5J6BJP2", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
592
+ {"protein_id": "Q93TM1", "GO_id": ["0042597", "0050053", "0009758"]}
593
+ {"protein_id": "A0A499UB99", "GO_id": ["0005782", "0003884", "0071949", "0019478", "0046416", "0019740"]}
594
+ {"protein_id": "F8D4I6", "GO_id": ["0005524", "0016301", "0006796"]}
595
+ {"protein_id": "E5Y8P9", "GO_id": ["0016620"]}
596
+ {"protein_id": "A0A1V0QSF9", "GO_id": ["0009507", "0000287", "0010333", "0016102"]}
597
+ {"protein_id": "A0A0J9X285", "GO_id": ["0005829", "0005524", "0008902", "0008972", "0009228", "0009229"]}
598
+ {"protein_id": "A0A1C9CXI0", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
599
+ {"protein_id": "Q1K4Q1", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
600
+ {"protein_id": "Q9W413", "GO_id": ["0030678", "0005739", "0046872", "0004526", "0097745", "0090646", "0001682"]}
601
+ {"protein_id": "Q50250", "GO_id": ["0005886", "0046872", "0016491", "0015948", "0022904"]}
602
+ {"protein_id": "A0A1W7HCY1", "GO_id": ["0005737", "0051537", "0140618", "0052851", "0046872", "0010106"]}
603
+ {"protein_id": "A0A384JLD1", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
604
+ {"protein_id": "G2QZK6", "GO_id": ["0005576", "0030248", "0046872", "0004497", "0030245"]}
605
+ {"protein_id": "C8YTM5", "GO_id": ["0008170", "0000234", "0032259", "0006656"]}
606
+ {"protein_id": "A5U3A6", "GO_id": ["0005886", "0005524", "0004674", "0080090"]}
example/test_nt_seqs.fasta ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ >Bob
2
+ atgaaatataaacgcattgtgtttaaagtgggcaccagcagcctgaccaacgaagatggc
3
+ agcctgagccgcagcaaagtgaaagatattacccagcagctggcgatgctgcatgaagcg
4
+ ggccatgaactgattctggtgagcagcggcgcgattgcggcgggctttggcgcgctgggc
5
+ tttaaaaaacgcccgaccaaaattgcggataaacaggcgagcgcggcggtgggccagggc
6
+ ctgctgctggaagaatataccaccaacctgctgctgcgccagattgtgagcgcgcagatt
7
+ ctgctgacccaggatgattttgtggataaacgccgctataaaaacgcgcatcaggcgctg
8
+ agcgtgctgctgaaccgcggcgcgattccgattattaacgaaaacgatagcgtggtgatt
9
+ gatgaactgaaagtgggcgataacgataccctgagcgcgcaggtggcggcgatggtgcag
10
+ gcggatctgctggtgtttctgaccgatgtggatggcctgtataccggcaacccgaacagc
11
+ gatccgcgcgcgaaacgcctggaacgcattgaaaccattaaccgcgaaattattgatatg
12
+ gcgggcggcgcgggcagcagcaacggcaccggcggcatgctgaccaaaattaaagcggcg
13
+ accattgcgaccgaaagcggcgtgccggtgtatatttgcagcagcctgaaaagcgatagc
14
+ atgattgaagcggcggaagaaaccgaagatggcagctattttgtggcgcaggaaaaaggc
15
+ ctgcgcacccagaaacagtggctggcgttttatgcgcagagccagggcagcatttgggtg
16
+ gataaaggcgcggcggaagcgctgagccagtatggcaaaagcctgctgctgagcggcatt
17
+ gtggaagcggaaggcgtgtttagctatggcgatattgtgaccgtgtttgataaagaaagc
18
+ ggcaaaagcctgggcaaaggccgcgtgcagtttggcgcgagcgcgctggaagatatgctg
19
+ cgcagccagaaagcgaaaggcgtgctgatttatcgcgatgattggattagcattaccccg
20
+ gaaattcagctgctgtttaccgaattt
21
+ >Henry
22
+ atggaagtgaaaggcaaaaaaaaactgaccggcaaaggcaccaaaatgagccaggaaaaa
23
+ agcaaatttcataaaaacaacgatagcggcagcagcaaaacctttccgaaaaaagtggtg
24
+ aaagaaggcggcccgaaaattaccagcaaaaactttgaaaaaaccgcgaccaaaccgggc
25
+ aaaaaaggcgtgaaacagtttaaaaacaaacagcagggcgatcgcattccgaaaaacaaa
26
+ tttcagcaggcgaacaaatttaaccagaaacgcaaatttcagccggatagcaaaagcgat
27
+ gaaagcgcggcgaaaaaaccgaaatgggatgaatttaaaaaaaaaaaaaaagaactgaaa
28
+ cagagccgccagctgagcgataaaaccaactatgatattgtgattcgcgcgaaacagatt
29
+ tgggaaattctgcgccgcaaagattgcgataaagaaaaacgcgtgaaactgatgagcgat
30
+ ctgcagaaactgattcagggcaaaattaaaaccattgcgtttgcgcatgatagcacccgc
31
+ gtgattcagtgctatattcagtttggcaacgaagaacagcgcaaacaggcgtttgaagaa
32
+ ctgcgcggcgatctggtggaactgagcaaagcgaaatatagccgcaacattgtgaaaaaa
33
+ tttctgatgtatggcagcaaagcgcagattgcggaaattattcgcagctttaaaggccat
34
+ gtgcgcaaactgctgcgccatgcggaagcgagcgcgattgtggaatatgcgtataacgat
35
+ aaagcgattctggaacagcgcaacatgctgaccgaagaactgtatggcaacacctttcag
36
+ ctgtataaaagcgcggatcatccgaccctggataaagtgctggaagtgcagccggaaaaa
37
+ ctggaactgattatggatgaaatgaaacagattctgaccccgatggcgcagaaagaagcg
38
+ gtgattaaacatagcctggtgcataaagtgtttctggatttttttacctatgcgccgccg
39
+ aaactgcgcagcgaaatgattgaagcgattcgcgaagcggtggtgtatctggcgcatacc
40
+ catgatggcgcgcgcgtggcgatgtattgcctgtggcatggcaccccgaaagatcgcaaa
41
+ gtgattgtgaaaaccatgaaaacctatattgaaaaagtggcgaacggccagtatagccat
42
+ ctggtgctgctggcggcgtttgattgcattgatgataccaaactggtgaaacagattatt
43
+ attagcgaaattattaacagcctgccgaacattgtgaacgataaatatggccgcaaagtg
44
+ ctgctgtatctgctgagcccgcgcgatccggcgcataccgtgcgcgaaattattgaagtg
45
+ ctgcagaaaggcgatggcaacgcgcatagcaaaaaagataccgaaattcgccgccgcgaa
46
+ ctgctggaaagcattagcccggcgctgctgagctatctgcagggccatgcgcaggaagtg
47
+ gtgctggataaaagcgcgtgcgtgctggtggcggatattctgggcaccgcgaccggcgat
48
+ gtgcagccggcgatggatgcggtggcgagcctggcggcggcggaactgcatccgggcggc
49
+ aaagatggcgaactgcatattgcggaacatccggcgggccatctggtgctgaaatggctg
50
+ attgaacaggataaaaaaatgaaagaacgcggccgcgaaggctgctttgcgaaaaccctg
51
+ attgaacgcgtgggcgtgaaaaacctgaaaagctgggcgagcgtgaaccgcggcgcgatt
52
+ attctgagcagcctgctgcagagcagcgatcaggaagtggcgaacaaagtgaaagcgggc
53
+ ctgaaaagcctgattccggcgctggaaaaaagcaaaaacaccagcaaaggcattgaaatg
54
+ ctgctggaaaaactgaccgcg
55
+ >Wilf
56
+ atggcggcggaagaaggcgtggtgattgcgtgccataacaaagatgaatttgatgcgcag
57
+ atgaccaaagcgaaagaagcgggcaaagtggtgattattgattttaccgcgagctggtgc
58
+ ggcccgtgccgctttattgcgccggtgtttgcggaatatgcgaaaaaatttccgggcgcg
59
+ gtgtttctgaaagtggatgtggatgaactgaaagaagtggcggaaaaatataacgtggaa
60
+ gcgatgccgacctttctgtttattaaagatggcgcggaagcggataaagtggtgggcgcg
61
+ cgcaaagatgatctgcagaacaccattgtgaaacatgtgggcgcgaccgcggcgagcgcg
62
+ agcgcg
63
+ >reverse translation of P22298
64
+ ggccgcggccugcugccguuugugcugcuggcgcugggcauugcgccgugggcgguggaa
65
+ ggcgcggaaaacgcgcugaaaggcggcgcgugcccgccgcgcaaaauugugcagugccug
66
+ cgcuaugaaaaaccgaaaugcaccagcgauuggcagugcccggauaaaaaaaaaugcugc
67
+ cgcgauaccugcgcgauuaaaugccugaacccgguggcgauuaccaacccggugaaagug
68
+ aaaccgggcaaaugcccggugguguauggccagugcaugaugcugaacccgccgaaccau
69
+ ugcaaaaccgauagccagugccugggcgaucugaaaugcugcaaaagcaugugcggcaaa
70
+ gugugccugaccccggugaaagcg
71
+ >ENA|AACH01000027|AACH01000027.2 Saccharomyces mikatae IFO 1815 YM4906-Contig2858, whole genome shotgun sequence.
72
+ ctggtgctgctggcggcgtttgattgcattgatgataccaaactggtgaaacagattatt
73
+ attagcgaaattattaacagcctgccgaacattgtgaacgataaatatggccgcaaagtg
74
+ ctgctgtatctgctgagcccgcgcgatccggcgcataccgtgcgcgaaattattgaagtg
75
+ ctgcagaaaggcgatggcaacgcgcatagcaaaaaagataccgaaattcgccgccgcgaa
76
+ atgaaatataaacgcattgtgtttaaagtgggcaccagcagcctgaccaacgaagatggc
77
+ agcctgagccgcagcaaagtgaaagatattacccagcagctggcgatgctgcatgaagcg
78
+ ggccatgaactgattctggtgagcagcggcgcgattgcggcgggctttggcgcgctgggc
79
+ tttaaaaaacgcccgaccaaaattgcggataaacaggcgagcgcggcggtgggccagggc
80
+ ctgctgctggaagaatataccaccaacctgctgctgcgccagattgtgagcgcgcagatt
81
+ ctgctgacccaggatgattttgtggataaacgccgctataaaaacgcgcatcaggcgctg
82
+ agcgtgctgctgaaccgcggcgcgattccgattattaacgaaaacgatagcgtggtgatt
83
+ gatgaactgaaagtgggcgataacgataccctgagcgcgcaggtggcggcgatggtgcag
84
+ gcggatctgctggtgtttctgaccgatgtggatggcctgtataccggcaacccgaacagc
85
+ gatccgcgcgcgaaacgcctggaacgcattgaaaccattaaccgcgaaattattgatatg
86
+ gcgggcggcgcgggcagcagcaacggcaccggcggcatgctgaccaaaattaaagcggcg
87
+ accattgcgaccgaaagcggcgtgccggtgtatatttgcagcagcctgaaaagcgatagc
88
+ atgattgaagcggcggaagaaaccgaagatggcagctattttgtggcgcaggaaaaaggc
89
+ ctgcgcacccagaaacagtggctggcgttttatgcgcagagccagggcagcatttgggtg
90
+ gataaaggcgcggcggaagcgctgagccagtatggcaaaagcctgctgctgagcggcatt
91
+ gtggaagcggaaggcgtgtttagctatggcgatattgtgaccgtgtttgataaagaaagc
92
+ ggcaaaagcctgggcaaaggccgcgtgcagtttggcgcgagcgcgctggaagatatgctg
93
+ cgcagccagaaagcgaaaggcgtgctgatttatcgcgatgattggattagcattaccccg
94
+ gaaattcagctgctgtttaccgaattt
example/test_proteins.fasta ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ >A0A0H2ZM56
2
+ MADKKTVTPEEKKLVAEKHVDELVQKALVALEEMRKLNQEQVDYIVAKASVAALDAHGELALHAFEETGRGVFEDKATKNLFACEHVVNNMRHTKTVGVIEEDDVTGLTLIAEPVGVVCGITPTTNPTSTAIFKSLISLKTRNPIVFAFHPSAQESSAHAARIVRDAAIAAGAPENCVQWITQPSMEATSALMNHEGVATILATGGNAMVKAAYSCGKPALGVGAGNVPAYVEKSANIRQAAHDIVMSKSFDNGMVCASEQAVIIDKEIYDEFVAEFKSYHTYFVNKKEKALLEEFCFGVKANSKNCAGAKLNADIVGKPATWIAEQAGFTVPEGTNILAAECKEVGENEPLTREKLSPVIAVLKSESREDGITKARQMVEFNGLGHSAAIHTADEELTKEFGKAVKAIRVICNSPSTFGGIGDVYNAFLPSLTLGCGSYGRNSVGDNVSAINLLNIKKVGRRRNNMQWMKLPSKTYFERDSIQYLQKCRDVERVMIVTDHAMVELGFLDRIIEQLDLRRNKVVYQIFADVEPDPDITTVNRGTEIMRAFKPDTIIALGGGSPMDAAKVMWLFYEQPEVDFRDLVQKFMDIRKRAFKFPLLGKKTKFIAIPTTSGTGSEVTPFAVISDKANNRKYPIADYSLTPTVAIVDPALVLTVPGFVAADTGMDVLTHATEAYVSQMASDYTDGLALQAIKLVFENLESSVKNADFHSREKMHNASTIAGMAFANAFLGISHSMAHKIGAQFHTIHGRTNAILLPYVIRYNGTRPAKTATWPKYNYYRADEKYQDIARMLGLPASTPEEGVESYAKAVYELGERIGIQMNFRDQGIDEKEWKEHSRELAFLAYEDQCSPANPRLPMVDHMQEIIEDAYYGYKERPGRRK
3
+ >P05067
4
+ MLPGLALLLLAAWTARALEVPTDGNAGLLAEPQIAMFCGRLNMHMNVQNGKWDSDPSGTKTCIDTKEGILQYCQEVYPELQITNVVEANQPVTIQNWCKRGRKQCKTHPHFVIPYRCLVGEFVSDALLVPDKCKFLHQERMDVCETHLHWHTVAKETCSEKSTNLHDYGMLLPCGIDKFRGVEFVCCPLAEESDNVDSADAEEDDSDVWWGGADTDYADGSEDKVVEVAEEEEVAEVEEEEADDDEDDEDGDEVEEEAEEPYEEATERTTSIATTTTTTTESVEEVVREVCSEQAETGPCRAMISRWYFDVTEGKCAPFFYGGCGGNRNNFDTEEYCMAVCGSAMSQSLLKTTQEPLARDPVKLPTTAASTPDAVDKYLETPGDENEHAHFQKAKERLEAKHRERMSQVMREWEEAERQAKNLPKADKKAVIQHFQEKVESLEQEAANERQQLVETHMARVEAMLNDRRRLALENYITALQAVPPRPRHVFNMLKKYVRAEQKDRQHTLKHFEHVRMVDPKKAAQIRSQVMTHLRVIYERMNQSLSLLYNVPAVAEEIQDEVDELLQKEQNYSDDVLANMISEPRISYGNDALMPSLTETKTTVELLPVNGEFSLDDLQPWHSFGADSVPANTENEVEPVDARPAADRGLTTRPGSGLTNIKTEEISEVKMDAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIATVIVITLVMLKKKQYTSIHHGVVEVDAAVTPEERHLSKMQQNGYENPTYKFFEQMQN
go_integration_pipeline.py ADDED
@@ -0,0 +1,374 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import sys
4
+ import argparse
5
+ from typing import Dict, List, Tuple, Optional
6
+ from collections import defaultdict
7
+ import torch
8
+ from tqdm import tqdm
9
+
10
+ # Make the project root and the bundled ProTrek package importable
11
+ root_path = os.path.dirname((os.path.abspath(__file__)))
12
+ sys.path.append(root_path)
13
+ sys.path.append(os.path.join(root_path, "Models/ProTrek"))
14
+
15
+ from utils.protein_go_analysis import get_go_definition
16
+
17
class GOIntegrationPipeline:
    """Merge GO annotations from InterProScan and BLAST, optionally filtered by ProTrek.

    The first level unions the GO ids reported by InterProScan with the GO ids
    of selected BLAST hits (looked up in a local protein -> GO mapping). The
    optional second level scores every GO definition against the protein
    sequence with ProTrek and keeps the GO ids whose score reaches a threshold.
    """

    def __init__(self,
                 identity_threshold: int = 80,
                 coverage_threshold: int = 80,
                 evalue_threshold: float = 1e-50,
                 topk: int = 2,
                 protrek_threshold: Optional[float] = None,
                 use_protrek: bool = False):
        """
        Configure the pipeline and load the protein -> GO mapping.

        Args:
            identity_threshold: minimum BLAST identity percentage (0-100).
            coverage_threshold: minimum BLAST coverage percentage (0-100).
            evalue_threshold: maximum BLAST E-value.
            topk: when > 0, use the first ``topk`` BLAST results instead of the
                identity/coverage/E-value thresholds.
            protrek_threshold: minimum ProTrek score for the second level.
            use_protrek: whether to run the second-level ProTrek filtering.
        """
        self.identity_threshold = identity_threshold
        self.coverage_threshold = coverage_threshold
        self.evalue_threshold = evalue_threshold
        self.protrek_threshold = protrek_threshold
        self.use_protrek = use_protrek
        self.topk = topk

        # Load the protein -> GO mapping used to annotate BLAST hits.
        self._load_protein_go_dict()

        # The ProTrek model is only loaded when the second level is requested.
        if self.use_protrek:
            self._init_protrek_model()

    def _init_protrek_model(self):
        """Load the ProTrek tri-modal model onto CUDA when available, else CPU."""
        from model.ProTrek.protrek_trimodal_model import ProTrekTrimodalModel

        # NOTE(review): these weight paths are relative to the current working
        # directory, not to this file - confirm the expected launch directory.
        config = {
            "protein_config": "Models/ProTrek/weights/ProTrek_650M_UniRef50/esm2_t33_650M_UR50D",
            "text_config": "Models/ProTrek/weights/ProTrek_650M_UniRef50/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext",
            "structure_config": "Models/ProTrek/weights/ProTrek_650M_UniRef50/foldseek_t30_150M",
            "load_protein_pretrained": False,
            "load_text_pretrained": False,
            "from_checkpoint": "Models/ProTrek/weights/ProTrek_650M_UniRef50/ProTrek_650M_UniRef50.pt"
        }

        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.protrek_model = ProTrekTrimodalModel(**config).to(self.device).eval()
        print(f"ProTrek模型已加载到设备: {self.device}")

    def _load_protein_go_dict(self):
        """
        Load the protein -> GO mapping from ``processed_data/protein_go.json``.

        The file is read as JSON Lines: one object per line with the keys
        ``protein_id`` and ``GO_id``. Blank lines are skipped. On any read or
        parse error the mapping is reset to an empty dict and the error is
        printed, so the pipeline can still run on InterProScan data alone.
        """
        self.protein_go_dict = {}
        try:
            with open('processed_data/protein_go.json', 'r', encoding='utf-8') as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        # A stray blank line must not discard the whole mapping.
                        continue
                    data = json.loads(line)
                    self.protein_go_dict[data['protein_id']] = data['GO_id']
            print(f"成功加载蛋白质-GO映射数据,共{len(self.protein_go_dict)}条记录")
        except (OSError, ValueError, KeyError, TypeError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"加载蛋白质-GO映射数据时发生错误: {str(e)}")
            self.protein_go_dict = {}

    def _get_go_from_uniprot_id(self, uniprot_id: str) -> List[str]:
        """
        Look up the GO ids recorded for a UniProt accession.

        Args:
            uniprot_id: UniProt accession of the protein.

        Returns:
            The GO ids from the loaded mapping, each reduced to the part after
            its last underscore; an empty list when the accession is unknown.
        """
        # str.split returns the whole string when no "_" is present, so ids
        # without a prefix pass through unchanged.
        return [go_id.split("_")[-1]
                for go_id in self.protein_go_dict.get(uniprot_id, [])]

    def extract_blast_go_ids(self, blast_results: List[Dict], protein_id: str) -> List[str]:
        """
        Collect the GO ids of the BLAST hits selected by the configured strategy.

        Args:
            blast_results: list of BLAST hit dicts (keys read here: ``ID``,
                ``Identity%``, ``Coverage%``, ``E-value``).
            protein_id: id of the query protein; hits with this id are skipped
                to avoid self-matches.

        Returns:
            GO ids of the selected hits; duplicates are not removed.
        """
        go_ids = []

        if self.topk > 0:
            # Top-k strategy.
            # NOTE(review): a self-hit inside the first ``topk`` results still
            # consumes one of the k slots - confirm this is intended.
            for result in blast_results[:self.topk]:
                hit_id = result.get('ID', '')
                if hit_id == protein_id:
                    continue
                go_ids.extend(self._get_go_from_uniprot_id(hit_id))
        else:
            # Threshold strategy.
            for result in blast_results:
                identity = float(result.get('Identity%', 0))
                coverage = float(result.get('Coverage%', 0))
                evalue = float(result.get('E-value', 1.0))

                passes = (identity >= self.identity_threshold and
                          coverage >= self.coverage_threshold and
                          evalue <= self.evalue_threshold)
                if not passes:
                    continue

                hit_id = result.get('ID', '')
                if hit_id == protein_id:
                    continue
                go_ids.extend(self._get_go_from_uniprot_id(hit_id))

        return go_ids

    def first_level_filtering(self, interproscan_info: Dict, blast_info: Dict) -> Dict[str, List[str]]:
        """
        First level: union InterProScan GO ids with the selected BLAST GO ids.

        Args:
            interproscan_info: per-protein InterProScan results.
            blast_info: per-protein BLAST results.

        Returns:
            Mapping of protein id to a sorted, de-duplicated list of GO ids.
            Only proteins present in ``interproscan_info`` are included.
        """
        protein_go_dict = {}

        for protein_id in interproscan_info.keys():
            go_ids = set()

            # InterProScan GO ids; the part after the last ":" is kept.
            entry = interproscan_info[protein_id] or {}
            ipr_results = entry.get('interproscan_results') or {}
            for go_id in ipr_results.get('go_id') or []:
                go_ids.add(go_id.split(":")[-1])

            # GO ids of the BLAST hits that pass the configured strategy.
            if protein_id in blast_info:
                blast_results = (blast_info[protein_id] or {}).get('blast_results') or []
                go_ids.update(self.extract_blast_go_ids(blast_results, protein_id))

            # Sorted so that output files are reproducible across runs (set
            # iteration order for strings varies with hash randomisation).
            protein_go_dict[protein_id] = sorted(go_ids)

        return protein_go_dict

    def calculate_protrek_scores(self, protein_sequences: Dict[str, str],
                                 protein_go_dict: Dict[str, List[str]]) -> Dict[str, Dict]:
        """
        Score every GO definition against its protein sequence with ProTrek.

        Args:
            protein_sequences: mapping of protein id to amino-acid sequence.
            protein_go_dict: mapping of protein id to candidate GO ids.

        Returns:
            Mapping of protein id to ``{"protein_id", "GO_id", "Clip_score"}``
            where ``Clip_score`` maps each GO id that has a definition to its
            score. Proteins without a sequence, without any GO definition, or
            whose scoring fails are omitted.
        """
        results = {}

        for protein_id, go_ids in tqdm(protein_go_dict.items(), desc="计算ProTrek分数"):
            protein_seq = protein_sequences.get(protein_id)
            if not protein_seq:
                # Missing or empty sequence: nothing to embed.
                continue

            # Resolve GO definitions; ids without a definition are not scored.
            go_definitions = {}
            for go_id in go_ids:
                definition = get_go_definition(go_id)
                if definition:
                    go_definitions[go_id] = definition

            if not go_definitions:
                continue

            try:
                # Inference only: without no_grad the result may carry autograd
                # state, in which case Tensor.numpy() raises and the broad
                # handler below would silently skip every protein.
                with torch.no_grad():
                    seq_emb = self.protrek_model.get_protein_repr([protein_seq])
                    definitions = list(go_definitions.values())
                    text_embs = self.protrek_model.get_text_repr(definitions)
                    scores = (seq_emb @ text_embs.T) / self.protrek_model.temperature
                scores = scores.detach().cpu().numpy().flatten()

                # Dict order matches ``definitions``, so scores align with keys.
                go_scores = {go_id: float(score)
                             for go_id, score in zip(go_definitions, scores)}

            except Exception as e:
                # Deliberate best-effort: one failing protein must not abort
                # the whole batch.
                print(f"计算 {protein_id} 的ProTrek分数时出错: {str(e)}")
                continue

            results[protein_id] = {
                "protein_id": protein_id,
                "GO_id": go_ids,
                "Clip_score": go_scores
            }

        return results

    def second_level_filtering(self, protrek_results: Dict[str, Dict]) -> Dict[str, List[str]]:
        """
        Second level: keep the GO ids whose ProTrek score reaches the threshold.

        Args:
            protrek_results: output of ``calculate_protrek_scores``.

        Returns:
            Mapping of protein id to the retained GO ids; proteins with no
            retained GO id are omitted.

        Raises:
            ValueError: if ``protrek_threshold`` is not set.
        """
        if self.protrek_threshold is None:
            raise ValueError("第二层筛选需要设置protrek_threshold")

        filtered_results = {}

        for protein_id, data in protrek_results.items():
            clip_scores = data.get('Clip_score', {})
            filtered_gos = [go_id for go_id, score in clip_scores.items()
                            if score >= self.protrek_threshold]

            if filtered_gos:
                filtered_results[protein_id] = filtered_gos

        return filtered_results

    def generate_filename(self, base_name: str, is_intermediate: bool = False) -> str:
        """Build an output file name that encodes the active parameters."""
        if self.topk > 0:
            # Top-k mode: only the k value is encoded.
            params = f"topk{self.topk}"
        else:
            # Threshold mode: encode identity, coverage and E-value.
            params = f"identity{self.identity_threshold}_coverage{self.coverage_threshold}_evalue{self.evalue_threshold:.0e}"

        if self.use_protrek and self.protrek_threshold is not None:
            params += f"_protrek{self.protrek_threshold}"

        if is_intermediate:
            return f"{base_name}_intermediate_{params}.json"
        else:
            return f"{base_name}_final_{params}.json"

    def run(self, interproscan_info: Dict = None, blast_info: Dict = None,
            interproscan_file: str = None, blast_file: str = None,
            output_dir: str = "output"):
        """
        Run the GO integration pipeline and write JSON Lines result files.

        Args:
            interproscan_info: InterProScan results; loaded from
                ``interproscan_file`` when omitted.
            blast_info: BLAST results; loaded from ``blast_file`` when omitted.
            interproscan_file: path of the InterProScan JSON file.
            blast_file: path of the BLAST JSON file.
            output_dir: directory for the result files (created if missing).

        Returns:
            Without ProTrek: the path of the final file.
            With ProTrek and a threshold: ``(final_file, intermediate_file)``.
            With ProTrek but no threshold: the path of the intermediate file.

        Raises:
            ValueError: if either input is missing or empty.
        """
        # Load inputs from disk when they were not passed in directly.
        if interproscan_info is None and interproscan_file:
            with open(interproscan_file, 'r', encoding='utf-8') as f:
                interproscan_info = json.load(f)

        if blast_info is None and blast_file:
            with open(blast_file, 'r', encoding='utf-8') as f:
                blast_info = json.load(f)

        if not interproscan_info or not blast_info:
            raise ValueError("必须提供interproscan_info和blast_info数据或文件路径")

        os.makedirs(output_dir, exist_ok=True)

        print("开始第一层筛选...")
        protein_go_dict = self.first_level_filtering(interproscan_info, blast_info)

        if not self.use_protrek:
            # No second level: the first-level result is the final result.
            output_file = os.path.join(output_dir, self.generate_filename("go_integration"))
            with open(output_file, 'w', encoding='utf-8') as f:
                for protein_id, go_ids in protein_go_dict.items():
                    result = {"protein_id": protein_id, "GO_id": go_ids}
                    f.write(json.dumps(result) + '\n')

            print(f"第一层筛选完成,结果已保存到: {output_file}")
            return output_file

        print("开始第二层筛选...")
        # Sequences are taken from the InterProScan records.
        protein_sequences = {}
        for protein_id, data in interproscan_info.items():
            protein_sequences[protein_id] = data.get('sequence', '')

        protrek_results = self.calculate_protrek_scores(protein_sequences, protein_go_dict)

        # Persist the raw scores so thresholds can be re-tuned without re-scoring.
        intermediate_file = os.path.join(output_dir, self.generate_filename("go_integration", is_intermediate=True))
        with open(intermediate_file, 'w', encoding='utf-8') as f:
            for result in protrek_results.values():
                f.write(json.dumps(result) + '\n')

        print(f"ProTrek分数计算完成,中间结果已保存到: {intermediate_file}")

        if self.protrek_threshold is not None:
            final_results = self.second_level_filtering(protrek_results)

            final_file = os.path.join(output_dir, self.generate_filename("go_integration"))
            with open(final_file, 'w', encoding='utf-8') as f:
                for protein_id, go_ids in final_results.items():
                    result = {"protein_id": protein_id, "GO_id": go_ids}
                    f.write(json.dumps(result) + '\n')

            print(f"第二层筛选完成,最终结果已保存到: {final_file}")
            return final_file, intermediate_file

        return intermediate_file
341
+
342
def main():
    """Parse the command-line options and run the GO integration pipeline once."""
    cli = argparse.ArgumentParser(description="GO信息整合管道")

    # Declared as a table so that every option is visible at a glance; the
    # order is the order shown in --help.
    option_specs = (
        ("--interproscan_file", dict(type=str, default="data/processed_data/interproscan_info.json", help="InterProScan结果文件路径")),
        ("--blast_file", dict(type=str, default="data/processed_data/blast_info.json", help="BLAST结果文件路径")),
        ("--identity", dict(type=int, default=80, help="BLAST identity阈值 (0-100)")),
        ("--coverage", dict(type=int, default=80, help="BLAST coverage阈值 (0-100)")),
        ("--evalue", dict(type=float, default=1e-50, help="BLAST E-value阈值")),
        ("--topk", dict(type=int, default=2, help="BLAST topk结果")),
        ("--protrek_threshold", dict(type=float, help="ProTrek分数阈值")),
        ("--use_protrek", dict(action="store_true", help="是否使用第二层ProTrek筛选")),
        ("--output_dir", dict(type=str, default="data/processed_data/go_integration_results", help="输出目录")),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)

    opts = cli.parse_args()

    # Build the pipeline from the parsed options and run it on the given files.
    GOIntegrationPipeline(
        identity_threshold=opts.identity,
        coverage_threshold=opts.coverage,
        evalue_threshold=opts.evalue,
        topk=opts.topk,
        protrek_threshold=opts.protrek_threshold,
        use_protrek=opts.use_protrek,
    ).run(
        interproscan_file=opts.interproscan_file,
        blast_file=opts.blast_file,
        output_dir=opts.output_dir,
    )


if __name__ == "__main__":
    main()
integrated_pipeline.py ADDED
@@ -0,0 +1,520 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import sys
4
+ import argparse
5
+ from typing import Dict, List, Optional
6
+ from pathlib import Path
7
+ from tqdm import tqdm
8
+
9
+ # 添加必要的路径
10
+ root_path = os.path.dirname(os.path.abspath(__file__))
11
+ print(root_path)
12
+ sys.path.append(root_path)
13
+ sys.path.append(os.path.join(root_path, "Models/ProTrek"))
14
+
15
+ # 导入所需模块
16
+ from interproscan import InterproScan
17
+ from Bio.Blast.Applications import NcbiblastpCommandline
18
+ from utils.utils import extract_interproscan_metrics, get_seqnid, extract_blast_metrics, rename_interproscan_keys
19
+ from go_integration_pipeline import GOIntegrationPipeline
20
+ from utils.generate_protein_prompt import generate_prompt, get_interpro_manager, get_lmdb_connection
21
+ from utils.openai_access import call_chatgpt
22
+
23
+ class IntegratedProteinPipeline:
24
+ def __init__(self,
25
+ blast_database: str = "uniprot_swissprot",
26
+ expect_value: float = 0.01,
27
+ interproscan_path: str = "interproscan/interproscan-5.75-106.0/interproscan.sh",
28
+ interproscan_libraries: List[str] = None,
29
+ go_topk: int = 2,
30
+ selected_info_types: List[str] = None,
31
+ pfam_descriptions_path: str = None,
32
+ go_info_path: str = None,
33
+ interpro_data_path: str = None,
34
+ lmdb_path: str = None,
35
+ args: argparse.Namespace = None):
36
+ """
37
+ 整合蛋白质分析管道
38
+
39
+ Args:
40
+ blast_database: BLAST数据库名称
41
+ expect_value: BLAST E-value阈值
42
+ interproscan_path: InterProScan程序路径
43
+ interproscan_libraries: InterProScan库列表
44
+ go_topk: GO整合的topk参数
45
+ selected_info_types: prompt生成时选择的信息类型
46
+ pfam_descriptions_path: Pfam描述文件路径
47
+ go_info_path: GO信息文件路径
48
+ interpro_data_path: InterPro数据文件路径
49
+ lmdb_path: LMDB数据库路径
50
+ """
51
+ self.blast_database = blast_database
52
+ self.expect_value = expect_value
53
+ self.interproscan_path = interproscan_path
54
+ self.interproscan_libraries = interproscan_libraries or [
55
+ "PFAM", "PIRSR", "PROSITE_PROFILES", "SUPERFAMILY", "PRINTS",
56
+ "PANTHER", "CDD", "GENE3D", "NCBIFAM", "SFLM", "MOBIDB_LITE",
57
+ "COILS", "PROSITE_PATTERNS", "FUNFAM", "SMART"
58
+ ]
59
+ self.go_topk = go_topk
60
+ self.selected_info_types = selected_info_types or ['motif', 'go']
61
+
62
+ # 文件路径配置
63
+ self.pfam_descriptions_path = pfam_descriptions_path
64
+ self.go_info_path = go_info_path
65
+ self.interpro_data_path = interpro_data_path
66
+ self.lmdb_path = lmdb_path
67
+ self.interproscan_info_path = args.interproscan_info_path
68
+ self.blast_info_path = args.blast_info_path
69
+
70
+ # 初始化GO整合管道
71
+ self.go_pipeline = GOIntegrationPipeline(topk=self.go_topk)
72
+
73
+ # 初始化InterPro管理器(如果需要)
74
+ self.interpro_manager = None
75
+ other_types = [t for t in self.selected_info_types if t not in ['motif', 'go']]
76
+ if other_types and self.interpro_data_path:
77
+ self.interpro_manager = get_interpro_manager(self.interpro_data_path, None)
78
+
79
+ def step1_run_blast_and_interproscan(self, input_fasta: str, temp_dir: str = "temp") -> tuple:
80
+ """
81
+ 步骤1: 运行BLAST和InterProScan分析
82
+
83
+ Args:
84
+ input_fasta: 输入FASTA文件路径
85
+ temp_dir: 临时文件目录
86
+
87
+ Returns:
88
+ tuple: (interproscan_info, blast_info)
89
+ """
90
+ print("步骤1: 运行BLAST和InterProScan分析...")
91
+
92
+ # 创建临时目录
93
+ os.makedirs(temp_dir, exist_ok=True)
94
+
95
+ # 获取序列字典
96
+ seq_dict = get_seqnid(input_fasta)
97
+ print(f"读取到 {len(seq_dict)} 个序列")
98
+
99
+ # 运行BLAST
100
+ print("运行BLAST分析...")
101
+ blast_xml = os.path.join(temp_dir, "blast_results.xml")
102
+ blast_cmd = NcbiblastpCommandline(
103
+ query=input_fasta,
104
+ db=self.blast_database,
105
+ out=blast_xml,
106
+ outfmt=5, # XML格式
107
+ evalue=self.expect_value
108
+ )
109
+ blast_cmd()
110
+
111
+ # 提取BLAST结果
112
+ blast_results = extract_blast_metrics(blast_xml)
113
+ blast_info = {}
114
+ for uid, info in blast_results.items():
115
+ blast_info[uid] = {"sequence": seq_dict[uid], "blast_results": info}
116
+
117
+ # 运行InterProScan
118
+ print("运行InterProScan分析...")
119
+ interproscan_json = os.path.join(temp_dir, "interproscan_results.json")
120
+ interproscan = InterproScan(self.interproscan_path)
121
+ input_args = {
122
+ "fasta_file": input_fasta,
123
+ "goterms": True,
124
+ "pathways": True,
125
+ "save_dir": interproscan_json
126
+ }
127
+ interproscan.run(**input_args)
128
+
129
+ # 提取InterProScan结果
130
+ interproscan_results = extract_interproscan_metrics(
131
+ interproscan_json,
132
+ librarys=self.interproscan_libraries
133
+ )
134
+
135
+ interproscan_info = {}
136
+ for id, seq in seq_dict.items():
137
+ info = interproscan_results[seq]
138
+ info = rename_interproscan_keys(info)
139
+ interproscan_info[id] = {"sequence": seq, "interproscan_results": info}
140
+
141
+ # 清理临时文件
142
+ if os.path.exists(blast_xml):
143
+ os.remove(blast_xml)
144
+ if os.path.exists(interproscan_json):
145
+ os.remove(interproscan_json)
146
+
147
+ print(f"步骤1完成: 处理了 {len(interproscan_info)} 个蛋白质")
148
+ return interproscan_info, blast_info
149
+
150
+ def step2_integrate_go_information(self, interproscan_info: Dict, blast_info: Dict) -> Dict:
151
+ """
152
+ 步骤2: 整合GO信息
153
+
154
+ Args:
155
+ interproscan_info: InterProScan结果
156
+ blast_info: BLAST结果
157
+
158
+ Returns:
159
+ Dict: 整合后的GO信息
160
+ """
161
+ print("步骤2: 整合GO信息...")
162
+
163
+ # 使用GO整合管道进行第一层筛选
164
+ protein_go_dict = self.go_pipeline.first_level_filtering(interproscan_info, blast_info)
165
+
166
+ print(f"步骤2完成: 为 {len(protein_go_dict)} 个蛋白质整合了GO信息")
167
+ return protein_go_dict
168
+
169
+ def step3_generate_prompts(self, interproscan_info: Dict, blast_info: Dict,
170
+ protein_go_dict: Dict) -> Dict:
171
+ """
172
+ 步骤3: 生成蛋白质prompt
173
+
174
+ Args:
175
+ interproscan_info: InterProScan结果
176
+ blast_info: BLAST结果
177
+ protein_go_dict: 整合的GO信息
178
+
179
+ Returns:
180
+ Dict: 蛋白质ID到prompt的映射(如果有lmdb则包含QA对)
181
+ """
182
+ print("步骤3: 生成蛋白质prompt...")
183
+
184
+ # 创建临时的GO整合文件格式(用于generate_prompt函数)
185
+ temp_go_data = {}
186
+ for protein_id, go_ids in protein_go_dict.items():
187
+ temp_go_data[protein_id] = go_ids
188
+
189
+ prompts_data = {}
190
+
191
+ if self.lmdb_path:
192
+ # 如果有lmdb路径,处理QA数据
193
+ from utils.generate_protein_prompt import get_qa_data
194
+
195
+ global_index = 0
196
+ for protein_id in tqdm(interproscan_info.keys(), desc="生成prompts"):
197
+ # 获取QA对
198
+ qa_pairs = get_qa_data(protein_id, self.lmdb_path)
199
+
200
+ for qa_pair in qa_pairs:
201
+ question = qa_pair['question']
202
+ ground_truth = qa_pair['ground_truth']
203
+
204
+ # 生成prompt(需要修改generate_prompt函数以支持内存数据)
205
+ prompt = self._generate_prompt_from_memory(
206
+ protein_id, interproscan_info, temp_go_data, question
207
+ )
208
+
209
+ if prompt:
210
+ prompts_data[global_index] = {
211
+ "index": global_index,
212
+ "protein_id": protein_id,
213
+ "prompt": prompt,
214
+ "question": question,
215
+ "ground_truth": ground_truth
216
+ }
217
+ global_index += 1
218
+ else:
219
+ # 如果没有lmdb路径,按原来的方式处理
220
+ for protein_id in tqdm(interproscan_info.keys(), desc="生成prompts"):
221
+ prompt = self._generate_prompt_from_memory(
222
+ protein_id, interproscan_info, temp_go_data
223
+ )
224
+ if prompt:
225
+ prompts_data[protein_id] = prompt
226
+
227
+ print(f"步骤3完成: 生成了 {len(prompts_data)} 个prompt")
228
+ return prompts_data
229
+
230
+ def _generate_prompt_from_memory(self, protein_id: str, interproscan_info: Dict,
231
+ protein_go_dict: Dict, question: str = None) -> str:
232
+ """
233
+ 从内存中的数据生成prompt,包含完整的motif和GO定义
234
+ """
235
+ try:
236
+ from utils.protein_go_analysis import get_go_definition
237
+ from jinja2 import Template
238
+ from utils.generate_protein_prompt import get_prompt_template
239
+
240
+ # 获取GO分析结果
241
+ go_ids = protein_go_dict.get(protein_id, [])
242
+ go_annotations = []
243
+ all_related_definitions = {}
244
+
245
+ if go_ids:
246
+ for go_id in go_ids:
247
+ # 确保GO ID格式正确
248
+ clean_go_id = go_id.split(":")[-1] if ":" in go_id else go_id
249
+ go_annotations.append({"go_id": clean_go_id})
250
+
251
+ # 获取GO定义
252
+ definition = get_go_definition(clean_go_id,self.go_info_path)
253
+ if definition:
254
+ all_related_definitions[clean_go_id] = definition
255
+
256
+ # 获取motif信息
257
+ motif_pfam = {}
258
+ if self.pfam_descriptions_path:
259
+ try:
260
+ # 从interproscan结果中提取pfam信息
261
+ interproscan_results = interproscan_info[protein_id].get('interproscan_results', {})
262
+ pfam_entries = interproscan_results.get('pfam_id', [])
263
+
264
+ # 加载pfam描述
265
+ with open(self.pfam_descriptions_path, 'r') as f:
266
+ pfam_descriptions = json.load(f)
267
+
268
+ # 构建motif_pfam字典
269
+ for entry in pfam_entries:
270
+ for pfam_id, ipr_id in entry.items():
271
+ if pfam_id and pfam_id in pfam_descriptions:
272
+ motif_pfam[pfam_id] = pfam_descriptions[pfam_id]['description']
273
+
274
+ except Exception as e:
275
+ print(f"获取motif信息时出错: {str(e)}")
276
+
277
+ # 获取InterPro描述信息
278
+ interpro_descriptions = {}
279
+ other_types = [t for t in self.selected_info_types if t not in ['motif', 'go']]
280
+ if other_types and self.interpro_manager:
281
+ interpro_descriptions = self.interpro_manager.get_description(protein_id, other_types)
282
+
283
+ # 准备模板数据
284
+ template_data = {
285
+ "protein_id": protein_id,
286
+ "selected_info_types": self.selected_info_types,
287
+ "go_data": {
288
+ "status": "success" if go_annotations else "no_data",
289
+ "go_annotations": go_annotations,
290
+ "all_related_definitions": all_related_definitions
291
+ },
292
+ "motif_pfam": motif_pfam,
293
+ "interpro_descriptions": interpro_descriptions,
294
+ "question": question
295
+ }
296
+
297
+ # 使用模板生成prompt
298
+ PROMPT_TEMPLATE = get_prompt_template(self.selected_info_types, self.lmdb_path)
299
+ template = Template(PROMPT_TEMPLATE)
300
+ return template.render(**template_data)
301
+
302
+ except Exception as e:
303
+ print(f"生成prompt时出错 (protein_id: {protein_id}): {str(e)}")
304
+ # 如果出错,返回简化版本的prompt
305
+ return self._generate_simple_prompt(protein_id, interproscan_info, protein_go_dict, question)
306
+
307
+ def _generate_simple_prompt(self, protein_id: str, interproscan_info: Dict,
308
+ protein_go_dict: Dict, question: str = None) -> str:
309
+ """
310
+ 生成简化版本的prompt(作为备用)
311
+ """
312
+ # 获取蛋白质序列
313
+ sequence = interproscan_info[protein_id].get('sequence', '')
314
+
315
+ # 获取GO信息
316
+ go_ids = protein_go_dict.get(protein_id, [])
317
+
318
+ # 获取motif信息
319
+ interproscan_results = interproscan_info[protein_id].get('interproscan_results', {})
320
+ pfam_entries = interproscan_results.get('pfam_id', [])
321
+
322
+ # 简化的prompt生成逻辑
323
+ prompt_parts = []
324
+
325
+ if self.lmdb_path:
326
+ from utils.prompts import FUNCTION_PROMPT
327
+ prompt_parts.append(FUNCTION_PROMPT)
328
+ else:
329
+ from utils.prompts import ENZYME_PROMPT
330
+ prompt_parts.append(ENZYME_PROMPT)
331
+
332
+ prompt_parts.append("\ninput information:")
333
+
334
+ # 添加motif信息
335
+ if 'motif' in self.selected_info_types and pfam_entries:
336
+ prompt_parts.append("\nmotif:")
337
+ for entry in pfam_entries:
338
+ for key, value in entry.items():
339
+ if value:
340
+ prompt_parts.append(f"{value}: 无详细描述")
341
+
342
+ # 添加GO信息
343
+ if 'go' in self.selected_info_types and go_ids:
344
+ prompt_parts.append("\nGO:")
345
+ for i, go_id in enumerate(go_ids[:10], 1):
346
+ prompt_parts.append(f"▢ GO term{i}: {go_id}")
347
+ prompt_parts.append(f"• definition: 无详细定义")
348
+
349
+ if question:
350
+ prompt_parts.append(f"\nquestion: \n{question}")
351
+
352
+ return "\n".join(prompt_parts)
353
+
354
+ def step4_generate_llm_answers(self, prompts_data: Dict, save_dir: str) -> None:
355
+ """
356
+ 步骤4: 生成LLM答案
357
+
358
+ Args:
359
+ prompts_data: prompt数据
360
+ save_dir: 保存目录
361
+ """
362
+ print("步骤4: 生成LLM答案...")
363
+
364
+ # 创建保存目录
365
+ os.makedirs(save_dir, exist_ok=True)
366
+
367
+ if self.lmdb_path:
368
+ # 如果有lmdb路径,处理QA数据
369
+ for index, qa_item in tqdm(prompts_data.items(), desc="生成LLM答案"):
370
+ try:
371
+ protein_id = qa_item['protein_id']
372
+ prompt = qa_item['prompt']
373
+ question = qa_item['question']
374
+ ground_truth = qa_item['ground_truth']
375
+
376
+ # 调用LLM生成答案
377
+ llm_response = call_chatgpt(prompt)
378
+
379
+ # 构建结果数据
380
+ result = {
381
+ 'protein_id': protein_id,
382
+ 'index': index,
383
+ 'question': question,
384
+ 'ground_truth': ground_truth,
385
+ 'llm_answer': llm_response
386
+ }
387
+
388
+ # 保存文件
389
+ save_path = os.path.join(save_dir, f"{protein_id}_{index}.json")
390
+ with open(save_path, 'w') as f:
391
+ json.dump(result, f, indent=2, ensure_ascii=False)
392
+
393
+ except Exception as e:
394
+ print(f"处理索引 {index} 时出错: {str(e)}")
395
+ else:
396
+ # 如果没有lmdb路径,按原来的方式处理
397
+ for protein_id, prompt in tqdm(prompts_data.items(), desc="生成LLM答案"):
398
+ try:
399
+ # 调用LLM生成答案
400
+ llm_response = call_chatgpt(prompt)
401
+
402
+ # 构建结果数据
403
+ result = {
404
+ 'protein_id': protein_id,
405
+ 'prompt': prompt,
406
+ 'llm_answer': llm_response
407
+ }
408
+
409
+ # 保存文件
410
+ save_path = os.path.join(save_dir, f"{protein_id}.json")
411
+ with open(save_path, 'w') as f:
412
+ json.dump(result, f, indent=2, ensure_ascii=False)
413
+
414
+ except Exception as e:
415
+ print(f"处理蛋白质 {protein_id} 时出错: {str(e)}")
416
+
417
+ print(f"步骤4完成: 结果已保存到 {save_dir}")
418
+
419
+ def run(self, input_fasta: str, output_dir: str, temp_dir: str = "temp"):
420
+ """
421
+ 运行完整的工作流
422
+
423
+ Args:
424
+ input_fasta: 输入FASTA文件路径
425
+ output_dir: 输出目录
426
+ temp_dir: 临时文件目录
427
+ """
428
+ print(f"开始运行整合蛋白质分析管道...")
429
+ print(f"输入文件: {input_fasta}")
430
+ print(f"输出目录: {output_dir}")
431
+
432
+ # 创建输出目录
433
+ os.makedirs(output_dir, exist_ok=True)
434
+
435
+ try:
436
+ # 步骤1: 运行BLAST和InterProScan
437
+ if self.interproscan_info_path is None or self.blast_info_path is None:
438
+ interproscan_info, blast_info = self.step1_run_blast_and_interproscan(
439
+ input_fasta, temp_dir
440
+ )
441
+ else:
442
+ interproscan_info = json.load(open(self.interproscan_info_path))
443
+ blast_info = json.load(open(self.blast_info_path))
444
+
445
+ # 步骤2: 整合GO信息
446
+ protein_go_dict = self.step2_integrate_go_information(
447
+ interproscan_info, blast_info
448
+ )
449
+
450
+ # 步骤3: 生成prompt
451
+ prompts_data = self.step3_generate_prompts(
452
+ interproscan_info, blast_info, protein_go_dict
453
+ )
454
+ print(prompts_data)
455
+
456
+ # 步骤4: 生成LLM答案
457
+ self.step4_generate_llm_answers(prompts_data, output_dir)
458
+
459
+ print("整合管道运行完成!")
460
+
461
+ except Exception as e:
462
+ print(f"管道运行出错: {str(e)}")
463
+ raise
464
+ finally:
465
+ # 清理临时目录
466
+ print(f"清理临时目录: {temp_dir}")
467
+ if os.path.exists(temp_dir):
468
+ import shutil
469
+ shutil.rmtree(temp_dir)
470
+
471
def main():
    """Command-line entry point of the integrated protein analysis pipeline.

    Declares the CLI options, builds an ``IntegratedProteinPipeline`` from
    the parsed values and runs it on the given FASTA file.
    """
    cli = argparse.ArgumentParser(description="整合蛋白质分析管道")

    # (flag, add_argument keyword arguments), registered in display order.
    option_specs = [
        # General input/output.
        ("--input_fasta", dict(type=str, required=True, help="输入FASTA文件路径")),
        ("--output_dir", dict(type=str, required=True, help="输出目录")),
        ("--temp_dir", dict(type=str, default="temp", help="临时文件目录")),
        ('--interproscan_info_path', dict(type=str, default=None, help="InterProScan结果文件路径")),
        ('--blast_info_path', dict(type=str, default=None, help="BLAST结果文件路径")),
        # BLAST options.
        ("--blast_database", dict(type=str, default="uniprot_swissprot", help="BLAST数据库")),
        ("--expect_value", dict(type=float, default=0.01, help="BLAST E-value阈值")),
        # InterProScan options.
        ("--interproscan_path", dict(type=str,
                                     default="interproscan/interproscan-5.75-106.0/interproscan.sh",
                                     help="InterProScan程序路径")),
        # GO integration options.
        ("--go_topk", dict(type=int, default=2, help="GO整合topk参数")),
        # Prompt generation options.
        ("--selected_info_types", dict(type=str, nargs='+',
                                       default=['motif', 'go'], help="选择的信息类型")),
        ("--pfam_descriptions_path", dict(type=str, default='data/raw_data/all_pfam_descriptions.json', help="Pfam描述文件路径")),
        ("--go_info_path", dict(type=str, default='data/raw_data/go.json', help="GO信息文件路径")),
        ("--interpro_data_path", dict(type=str, default='data/raw_data/interpro_data.json', help="InterPro数据文件路径")),
        ("--lmdb_path", dict(type=str, help="LMDB数据库路径")),
    ]
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)

    args = cli.parse_args()

    # Build the pipeline; the namespace itself is passed along because the
    # constructor reads the pre-computed result paths from it.
    protein_pipeline = IntegratedProteinPipeline(
        blast_database=args.blast_database,
        expect_value=args.expect_value,
        interproscan_path=args.interproscan_path,
        go_topk=args.go_topk,
        selected_info_types=args.selected_info_types,
        pfam_descriptions_path=args.pfam_descriptions_path,
        go_info_path=args.go_info_path,
        interpro_data_path=args.interpro_data_path,
        lmdb_path=args.lmdb_path,
        args=args,
    )

    protein_pipeline.run(args.input_fasta, args.output_dir, args.temp_dir)
518
+
519
+ if __name__ == "__main__":
520
+ main()
interproscan.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import datetime
3
+
4
class InterproScan():
    """Thin wrapper that runs the InterProScan command-line tool on a FASTA file."""

    def __init__(self, bash_path):
        """Store the launcher command (path of ``interproscan.sh``)."""
        self.bash_path = bash_path

    def run(self, fasta_file, goterms, pathways, save_dir) -> dict:
        """Run InterProScan and report where the JSON output was written.

        Args:
            fasta_file: Path of the input FASTA file.
            goterms: Add the ``-goterms`` option when truthy.
            pathways: Add the ``-pa`` option when truthy.
            save_dir: Output location passed to ``-o`` (callers in this
                project pass a ``.json`` file path).

        Returns:
            dict: ``{"output_dir": save_dir, "duration": seconds}`` on
            success, ``{"error": message}`` if the tool could not be run or
            produced no output.
        """
        # Local imports keep this class self-contained.
        import shlex
        import subprocess

        start_time = datetime.datetime.now()
        # Scratch directory next to the output; fall back to the current
        # directory so a bare file name does not resolve to "/temp".
        temp_dir = os.path.join(os.path.dirname(save_dir) or ".", "temp")
        os.makedirs(temp_dir, exist_ok=True)

        seqs = self.read_fasta_to_list(fasta_file)
        seqtype = self.is_protein_sequence(seqs)

        # Build an argument list instead of a shell string so that paths
        # with spaces or shell metacharacters are passed through verbatim.
        cmd = [
            *shlex.split(self.bash_path),
            "-i", fasta_file,
            "-o", save_dir,
            "-f", "JSON",
            "-T", temp_dir,
        ]
        if goterms:
            cmd.append("-goterms")
        if pathways:
            cmd.append("-pa")
        cmd += ["-t", "p" if seqtype else "n"]
        print(shlex.join(cmd))
        try:
            subprocess.run(cmd, check=False)
            end_time = datetime.datetime.now()
            spend_time = (end_time - start_time).total_seconds()
            if self._has_output(save_dir):
                print(f"InterproScan successfully completed. Output saved to {save_dir}.")
                return {"output_dir": save_dir, "duration": spend_time}

            raise Exception("InterproScan encountered an error. Please check your inputs and options.")

        except Exception as e:
            return {"error": str(e)}

    @staticmethod
    def _has_output(path):
        """Return True if ``path`` is a non-empty file or a non-empty directory."""
        if os.path.isdir(path):
            return bool(os.listdir(path))
        return os.path.isfile(path) and os.path.getsize(path) > 0

    def is_protein_sequence(self, sequences):
        """Guess whether the sequences are protein (True) or nucleotide (False).

        Heuristic: more than six distinct letters over all sequences is
        treated as protein (nucleotide alphabets such as ACGT / ACGU are
        smaller).
        """
        sequence = "".join(sequences)
        return len(set(sequence.upper())) > 6

    def read_fasta_to_list(self, file_path):
        """Return the sequences of a FASTA file as a list of strings.

        Multi-line records are concatenated; headers are discarded. Text
        before the first header line is ignored.
        """
        sequences = []
        current_header = None
        current_seq = []

        with open(file_path, 'r') as f:
            for line in f:
                line = line.strip()
                if line.startswith(">"):
                    # A new record starts: flush the previous one, if any.
                    if current_header is not None:
                        sequences.append("".join(current_seq))
                    current_header = line[1:]
                    current_seq = []
                else:
                    current_seq.append(line)

        # Flush the last record.
        if current_header is not None:
            sequences.append("".join(current_seq))

        return sequences
73
+
74
+
75
if __name__ == '__main__':
    # Manual smoke test: export the pickled proteins to FASTA and scan them.
    interproscan = InterproScan("interproscan/interproscan-5.75-106.0/interproscan.sh")
    from utils.utils import get_protein_sequence_biopython, tofasta
    import pickle

    # NOTE(review): pickle.load must only be used on trusted files; this
    # path is a hard-coded, developer-local example data set.
    with open("/zhangjiawei/interproscan/example/difference_20241122_ec_dict_list.pkl", "rb") as f:
        datas = pickle.load(f)

    uids = [data["uniprot_id"] for data in datas]
    # The sequences come from the pickle; get_protein_sequence_biopython
    # could be used instead to fetch them per UniProt id.
    seqs = [data["sequence"] for data in datas]

    fasta_file = "example/protein_go_clean.fasta"
    tofasta(fasta_file, uids, seqs)

    interproscan.run(
        fasta_file=fasta_file,
        goterms=True,
        pathways=True,
        save_dir="output/interproscan",
    )
105
+
106
+
107
+
pipeline.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from interproscan import InterproScan
2
+ from Bio.Blast.Applications import NcbiblastpCommandline
3
+ from utils.utils import extract_interproscan_metrics, get_seqnid, extract_blast_metrics, rename_interproscan_keys
4
+ import os
5
+ import json
6
+
7
# Input FASTA file.
input_fasta = "evolla_test/test_hq0704_da_w_plddt_mask_hard_idnseqs.fasta"

#####################################################
# Run BLAST
#####################################################
# Settings
blast_database = "uniprot_swissprot"
expect_value = 0.01

blast_xml = "evolla_test/test_hq0704_da_w_plddt_mask_hard_blast.xml"

# id -> sequence mapping of the input proteins.
seq_dict = get_seqnid(input_fasta)

# Make sure the directory of the BLAST output exists.
output_dir = os.path.dirname(blast_xml)
os.makedirs(output_dir, exist_ok=True)

blast_cmd = NcbiblastpCommandline(
    query=input_fasta,
    db=blast_database,
    out=blast_xml,
    outfmt=5,  # XML format
    evalue=expect_value,
)
blast_cmd()  # execute blastp

blast_results = extract_blast_metrics(blast_xml)
blast_info = {
    uid: {"sequence": seq_dict[uid], "blast_results": info}
    for uid, info in blast_results.items()
}

# Save the BLAST results next to the XML file.
with open(blast_xml.replace(".xml", ".json"), "w") as f:
    json.dump(blast_info, f, indent=4)


#####################################################
# Run InterProScan
#####################################################

# Settings
goterms = True
pathways = True
interproscan_json = "evolla_test/test_hq0704_da_w_plddt_mask_hard_interproscan.json"
interproscan_path = "interproscan/interproscan-5.75-106.0/interproscan.sh"

librarys = ["PFAM", "PIRSR", "PROSITE_PROFILES", "SUPERFAMILY", "PRINTS", "PANTHER", "CDD", "GENE3D", "NCBIFAM", "SFLM", "MOBIDB_LITE", "COILS", "PROSITE_PATTERNS", "FUNFAM", "SMART"]

interproscan = InterproScan(interproscan_path)
input_args = dict(
    fasta_file=input_fasta,
    goterms=goterms,
    pathways=pathways,
    save_dir=interproscan_json,
)
interproscan.run(**input_args)  # execute InterProScan

interproscan_results = extract_interproscan_metrics(interproscan_json,
                                                    librarys=librarys)

# The extracted results are looked up by sequence; re-key them by protein id.
interproscan_info = {}
for protein_id, seq in seq_dict.items():
    info = rename_interproscan_keys(interproscan_results[seq])
    interproscan_info[protein_id] = {"sequence": seq, "interproscan_results": info}

# Save the processed InterProScan results (this overwrites the raw tool output).
with open(interproscan_json, "w") as f:
    json.dump(interproscan_info, f, indent=4)
readme.md ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ProteinAgentQA: 智能蛋白质功能问答引擎
2
+
3
+ `ProteinAgentQA` 是一个基于大型语言模型(LLM)的智能问答系统,旨在弥合海量蛋白质数据与研究人员自然语言提问之间的鸿沟。项目内置两种针对不同应用场景的模式,使用户能够以最高效、最可靠的方式探索或查询蛋白质的功能信息。
4
+
5
+ - **项目状态**: 研发阶段
6
+ - **核心技术**: 生物信息学分析 (BLAST, InterProScan), 深度学习 (ProTrek), 大型语言模型 (LLM)
7
+
8
+ ---
9
+
10
+ ## 核心理念
11
+
12
+ 传统的蛋白质功能研究依赖于研究人员手动操作复杂的生物信息学工具并解读其结果。`ProteinAgentQA` 将这一过程自动化和智能化,通过两种独特的模式服务于蛋白质研究的全周期:从全新蛋白的初步功能探索,到对已知明星蛋白的快速信息检索。
13
+
14
+ ## 两大核心模式
15
+
16
+ 用户可以根据研究对象的性质,选择进入最适合的问答模式。
17
+
18
+ ### 1. 未知探索模式 (Unknown Exploration Mode)
19
+
20
+ **此模式专为功能未知、未经实验验证、或注释信息稀少的新型蛋白质设计。**
21
+
22
+ 当您面对一个全新的蛋白质序列,传统工具的注释结果可能充满噪音或过于笼统。本模式的目标是,通过一个先进的、经过严格验证的生物信息学流程,为该蛋白生成一个**高可信度的功能(GO)与结构域(Motif)集合**,并以此作为LLM回答问题的唯一事实依据。这确保了对未知蛋白功能推断的严谨性和可靠性。
23
+
24
+ #### 核心工作流:
25
+
26
+ 1. **输入**: 用户提供蛋白质的氨基酸序列。
27
+ 2. **后端自动化分析**: 系统自动触发三步分析流程:
28
+ * **第一步:候选功能集生成 (Candidate Generation)**
29
+ * 系统并行使用 **BLASTp** (对Swiss-Prot等高质量数据库) 和 **InterProScan** (整合Pfam, PROSITE等多个特征数据库) 对输入序列进行分析。
30
+ * 此阶段采用宽松策略,合并(Union)所有搜集到的GO Terms,旨在最大化**召回率(Recall)**,确保不遗漏任何潜在的功能线索。
31
+
32
+ * **第二步:AI语义过滤 (AI-Powered Semantic Filtering)**
33
+ * 所有候选GO term都将通过 **ProTrek** 模型进行打分。ProTrek是一个蛋白质-文本相关性评估工具,它能直接计算蛋白质序列与其功能描述文本之间的语义相关性分数。
34
+ * 这一步引入了与序列比对和特征匹配完全**正交(Orthogonal)**的证据,能极其有效地过滤掉由BLAST带来的“同源但功能不相关”或由InterProScan带来的“过于笼统”的假阳性注释。
35
+
36
+ * **第三步:数据驱动的阈值筛选 (Data-Driven Thresholding)**
37
+ * 系统会根据一个预设的ProTrek分数阈值,筛选出最终的“高可信度GO集”。
38
+ * **该阈值并非主观设定**,而是通过对一个包含607个新入库酶蛋白的验证集进行严格测试确定的。我们通过绘制**Precision-Recall曲线**,选取了使**F1-Score达到峰值**的分数作为最佳阈值,实现了准确率(Precision)和召回率(Recall)的最佳平衡。
39
+
40
+ 3. **LLM整合与问答**:
41
+ * 只有通过上述流程筛选出的高可信度GO Terms和Motifs,才会被作为上下文(Context)信息喂给大型语言模型。
42
+ * 用户的所有提问(如“这个蛋白可能的功能是什么?”、“它属于哪个蛋白家族?”、“它可能参与哪些生物学过程?”)都将由LLM**严格基于这份高质量的上下文**来回答,杜绝模型产生幻觉或进行无依据的猜测。
43
+
44
+ ### 2. 已知问答模式 (Known Q&A Mode)
45
+
46
+ **此模式专为已有充分研究和可靠注释的蛋白质(例如Swiss-Prot中的明星蛋白)设计,充当一个高效、精准的“蛋白质知识库私人助手”。**
47
+
48
+ #### 核心工作流:
49
+
50
+ 1. **输入**: 用户提供已知蛋白质的通用ID(如UniProt登录号 `P04637`,或条目名 `P53_HUMAN`)。
51
+ 2. **后端直接查询**: 系统直接访问并解析UniProt/Swiss-Prot等权威数据库,获取该蛋白质的**“金标准”(Ground Truth)**信息,包括但不限于:
52
+ * 官方功能注释
53
+ * 亚细胞定位
54
+ * 序列变体
55
+ * 翻译后修饰
56
+ * 相关通路等
57
+ 3. **LLM整合与问答**:
58
+ * 系统将这些权威、详尽的资料作为上下文提供给LLM。
59
+ * 用户可以围绕该蛋白进行任何提问,获得精准、可靠的答案。例如:“人类P53蛋白的亚细胞定位在哪里?”、“它有哪些主要的翻译后修饰位点?”。
60
+
61
+ ## 系统架构示意图
62
+
63
+ ```mermaid
64
+ graph TD
65
+ A[用户] --> B{ProteinAgentQA};
66
+ B --> C{模式选择};
67
+ C --> D[未知探索模式];
68
+ C --> E[已知问答模式];
69
+
70
+ D --> F[输入: 蛋白序列];
71
+ F --> G[后台分析流程];
72
+ G --> H[1. BLAST + InterProScan <br> (生成候选GO集)];
73
+ H --> I[2. ProTrek语义打分];
74
+ I --> J[3. F1最优阈值筛选];
75
+ J --> K[最终高可信GO/Motif];
76
+ K --> L[LLM上下文];
77
+
78
+ E --> M[输入: 蛋白ID];
79
+ M --> N[查询Swiss-Prot等数据库];
80
+ N --> O[获取Ground Truth信息];
81
+ O --> L[LLM上下文];
82
+
83
+ L --> P[大型语言模型];
84
+ A --> Q[提问];
85
+ Q --> P;
86
+ P --> R[生成回答];
87
+ R --> A;
88
+
89
+ subgraph "后端核心处理"
90
+ G
91
+ H
92
+ I
93
+ J
94
+ N
95
+ end
96
+ ```
97
+
98
+ ## 使用示例
99
+
100
+ ### 未知探索模式
101
+
102
+ > **用户输入**:
103
+ > `> MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVIDB...`
104
+
105
+ > **用户提问**:
106
+ > `这个蛋白的功能是什么?它依赖于什么离子?`
107
+
108
+ > **系统回答**:
109
+ > `基于对其序列的分析,该蛋白极有可能是一种GTP结合蛋白。其预测的分子功能包括“GTP binding (GO:0005525)”和“GTPase activity (GO:0003924)”。分析显示其包含一个P-loop NTPase结构域,该结构域通常需要镁离子(Magnesium ion)作为辅因子。`
110
+
111
+ ### 已知问答模式
112
+
113
+ > **用户输入**:
114
+ > `P04637` (人类P53蛋白的UniProt ID)
115
+
116
+ > **用户提问**:
117
+ > `总结一下这个蛋白在细胞周期调控中的作用。`
118
+
119
+ > **系统回答**:
120
+ > `根据UniProt的记录,细胞肿瘤抗原p53 (TP53) 是一个核心的肿瘤抑制因子。在细胞周期中,它充当一个关键的检查点调控蛋白。当检测到DNA损伤时,p53会被激活并积累,通过转录激活CDKN1A/p21等下游基因,来阻止细胞从G1期进入S期,从而为DNA修复提供时间。如果损伤无法修复,p53可以诱导细胞凋亡。`
setup.sh ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
#
# Bootstrap script: creates the conda environment, installs InterProScan,
# BLAST (plus a SwissProt BLAST database) and the required Python packages.

# Define InterProScan version
IPS_VERSION="5.75-106.0"
CONDA_ENV_NAME="rag_llm"

IPS_DIR="interproscan-${IPS_VERSION}"
IPS_TAR="interproscan-${IPS_VERSION}-64-bit.tar.gz"
IPS_URL="https://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/${IPS_VERSION}/${IPS_TAR}"

# Print an error message to stderr and abort the installation.
die() {
    echo "Error: $*" >&2
    exit 1
}

# Check if conda is available
if ! command -v conda &> /dev/null; then
    echo "Error: conda is not installed or not in PATH"
    echo "Please install Miniconda or Anaconda first"
    exit 1
fi

# Create conda environment with Java 11
echo "Creating conda environment '${CONDA_ENV_NAME}' with OpenJDK 11..."
if ! conda create -y -n "${CONDA_ENV_NAME}" openjdk=11 python; then
    die "Failed to create conda environment"
fi

# Activate conda environment; abort on failure so that nothing below is
# installed into the wrong (e.g. base) environment.
echo "Activating conda environment..."
source "$(conda info --base)/etc/profile.d/conda.sh" || die "Failed to load conda shell functions"
conda activate "${CONDA_ENV_NAME}" || die "Failed to activate conda environment '${CONDA_ENV_NAME}'"

# Create installation directory
echo "Setting up InterProScan ${IPS_VERSION}..."
mkdir -p interproscan
cd interproscan || exit 1

# Download InterProScan and checksum (-nc: keep files that already exist;
# the checksum verification below catches missing or broken downloads)
echo "Downloading InterProScan..."
wget -nc "${IPS_URL}"
wget -nc "${IPS_URL}.md5"

# Verify MD5 checksum
echo "Verifying download integrity..."
if ! md5sum -c "${IPS_TAR}.md5"; then
    echo "ERROR: MD5 checksum verification failed!"
    echo "The downloaded file may be corrupted. Please try downloading again."
    exit 1
fi

# Extract package
echo "Extracting InterProScan..."
tar -xzf "${IPS_TAR}" || die "Failed to extract ${IPS_TAR}"

# Verify Java installation in conda env
echo "Checking Java environment in conda env..."
JAVA_VER=$(java -version 2>&1 | head -n 1 | awk -F '"' '{print $2}')

if [[ "$JAVA_VER" =~ ^11\. ]]; then
    echo "Found compatible Java version in conda env: $JAVA_VER"
else
    echo "Error: Java version in conda env is not 11.x (found: $JAVA_VER)"
    exit 1
fi

# Run setup
echo "Running InterProScan setup..."
cd "${IPS_DIR}" || exit 1
python setup.py -f interproscan.properties || die "InterProScan setup failed"

echo ""
echo "InterProScan installation completed in conda environment '${CONDA_ENV_NAME}'!"
echo "To use InterProScan, first activate the conda environment:"
echo "conda activate ${CONDA_ENV_NAME}"
echo "Then add InterProScan to your PATH:"
echo "export PATH=\$PATH:$(pwd)"
echo "You may also need to set INTERPROSCAN_HOME=$(pwd)"

# Back to the interproscan/ directory; the remaining steps (blast_db/)
# continue from here.
cd ../ || exit 1

# Install Biopython for BLAST
echo "Installing Biopython for BLAST support..."
pip install biopython || die "Failed to install Biopython"

echo "Biopython installation completed."

# Install BLAST from bioconda
echo "Installing BLAST from bioconda..."
conda config --add channels bioconda
conda config --add channels conda-forge
conda install -c bioconda blast=2.16.0 -y || die "Failed to install BLAST"

mkdir -p blast_db
cd blast_db || exit 1

echo "Downloading UniProt SwissProt database..."
wget --quiet --show-progress -N https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz

if [ -f "uniprot_sprot.fasta.gz" ]; then
    echo "Decompressing database..."
    gunzip -k uniprot_sprot.fasta.gz

    if [ -f "uniprot_sprot.fasta" ]; then
        echo "Creating BLAST database..."
        makeblastdb -in uniprot_sprot.fasta -dbtype prot -out uniprot_swissprot -parse_seqids -title "UniProt SwissProt"

        # Verify database creation
        if [ -f "uniprot_swissprot.phr" ]; then
            echo "BLAST database created successfully."
            echo "You can now use it with: blastp -db uniprot_swissprot -query your_file.fasta"
        else
            echo "Error: BLAST database files not created!" >&2
            exit 1
        fi
    else
        echo "Error: Failed to decompress database!" >&2
        exit 1
    fi
else
    echo "Error: Failed to download database!" >&2
    exit 1
fi

# Only effective for this script's own process (and for the calling shell
# when the script is sourced); hence the hint printed below.
export BLASTDB="$(pwd)"
echo "BLASTDB environment variable set to: $BLASTDB"
echo "please add <export BLASTDB=$(pwd)> to your .bashrc or .zshrc file for persistent use."

# Install Python packages
echo "Installing required Python packages..."
pip install openai gradio torch || die "Failed to install required Python packages"
test_data/interproscan_info.json ADDED
The diff for this file is too large to render. See raw diff
 
utils/cal_pr.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import numpy as np
3
+ import matplotlib.pyplot as plt
4
+ from sklearn.metrics import precision_recall_curve, average_precision_score
5
+
6
def load_go_data(file_path):
    """Load per-protein GO annotations from a JSON Lines file.

    Every non-blank line must be a JSON object with a ``protein_id`` key and
    a ``GO_id`` key holding an iterable of GO identifiers.

    Args:
        file_path: Path of the JSON Lines file to read.

    Returns:
        dict mapping each protein id to the set of its GO ids. When a protein
        id occurs on several lines, the last occurrence wins.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If an entry lacks ``protein_id`` or ``GO_id``.
    """
    data = {}
    with open(file_path, 'r') as f:
        for line in f:
            # Blank lines (for example a trailing newline at the end of the
            # file) are not valid JSON documents, so skip them explicitly.
            if not line.strip():
                continue
            entry = json.loads(line)
            data[entry["protein_id"]] = set(entry["GO_id"])
    return data
14
+
15
def calculate_pr_metrics(true_go_file, pred_go_file, scores_file=None,
                         pr_curve_path='pr_curve.png'):
    """Compute precision/recall/F1 for GO predictions and optionally a PR curve.

    Only proteins present in both the ground-truth and the prediction file
    are evaluated.

    Args:
        true_go_file: JSON Lines file with the ground-truth GO terms.
        pred_go_file: JSON Lines file with the predicted GO terms.
        scores_file: Optional JSON Lines file whose entries carry a
            ``protein_id`` and a ``GO_scores`` mapping (GO id -> score). When
            given, a pooled precision-recall curve is computed and plotted.
        pr_curve_path: Where the PR-curve image is written.

    Returns:
        dict with ``average_precision``, ``average_recall`` and
        ``average_f1`` (means of the per-protein values), ``protein_metrics``
        (per-protein precision/recall/f1) and, when scores were supplied and
        usable, a ``pr_curve`` entry with the curve and the best-F1 threshold.
    """
    true_go_data = load_go_data(true_go_file)
    pred_go_data = load_go_data(pred_go_file)

    # Optional per-GO confidence scores, keyed by protein id.
    scores = {}
    if scores_file:
        with open(scores_file, 'r') as f:
            for line in f:
                if not line.strip():
                    continue  # tolerate blank lines in the JSONL file
                entry = json.loads(line)
                scores[entry["protein_id"]] = dict(entry.get("GO_scores", {}))

    all_true = []
    all_scores = []
    protein_metrics = {}

    # Sorted (by string form) so the output ordering is reproducible; plain
    # set iteration order changes between interpreter runs.
    common_proteins = sorted(set(true_go_data) & set(pred_go_data), key=str)

    for protein_id in common_proteins:
        true_gos = true_go_data[protein_id]
        pred_gos = pred_go_data[protein_id]
        n_hits = len(true_gos & pred_gos)

        precision = n_hits / len(pred_gos) if pred_gos else 0.0
        # With no ground-truth terms there is nothing to miss: recall is 1.
        recall = n_hits / len(true_gos) if true_gos else 1.0
        total = precision + recall
        f1 = 2 * precision * recall / total if total > 0 else 0

        protein_metrics[protein_id] = {
            "precision": precision,
            "recall": recall,
            "f1": f1
        }

        if scores_file:
            protein_scores = scores.get(protein_id, {})
            # Every (protein, GO) pair seen in truth or prediction is one
            # sample of the pooled curve; unscored terms default to 0.0.
            for go in sorted(true_gos | pred_gos, key=str):
                all_true.append(1 if go in true_gos else 0)
                all_scores.append(protein_scores.get(go, 0.0))

    def _mean(metric):
        # An empty intersection would make np.mean return NaN (with a
        # warning), which is also not valid JSON; report 0.0 instead.
        values = [m[metric] for m in protein_metrics.values()]
        return float(np.mean(values)) if values else 0.0

    results = {
        "average_precision": _mean("precision"),
        "average_recall": _mean("recall"),
        "average_f1": _mean("f1"),
        "protein_metrics": protein_metrics
    }

    if scores_file and all_true and all_scores:
        all_true = np.array(all_true)
        all_scores = np.array(all_scores)

        precision, recall, thresholds = precision_recall_curve(all_true, all_scores)
        avg_precision = average_precision_score(all_true, all_scores)

        # precision/recall have one more element than thresholds; the extra
        # final point has no threshold, so it is excluded from the F1 search.
        p, r = precision[:-1], recall[:-1]
        denom = p + r
        f1_scores = np.divide(2 * p * r, denom,
                              out=np.zeros_like(denom, dtype=float),
                              where=denom > 0)

        best_f1_idx = np.argmax(f1_scores)
        best_threshold = thresholds[best_f1_idx]
        best_precision = precision[best_f1_idx]
        best_recall = recall[best_f1_idx]
        best_f1 = f1_scores[best_f1_idx]

        plt.figure(figsize=(10, 8))
        plt.plot(recall, precision, label=f'平均精确率 = {avg_precision:.3f}')
        plt.scatter(best_recall, best_precision, color='red',
                    label=f'最佳F1 = {best_f1:.3f} (阈值 = {best_threshold:.3f})')

        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.title('Precision-Recall 曲线')
        plt.legend()
        plt.grid(True)

        plt.savefig(pr_curve_path, dpi=300)
        plt.close()

        results.update({
            "pr_curve": {
                "precision": precision.tolist(),
                "recall": recall.tolist(),
                "thresholds": thresholds.tolist(),
                "best_threshold": float(best_threshold),
                "best_f1": float(best_f1)
            }
        })

    return results
128
+
129
def main():
    """Command-line entry point: evaluate GO predictions and save the metrics.

    Parses ``--true``, ``--pred``, optional ``--scores`` and ``--output``,
    runs ``calculate_pr_metrics`` and writes its result as JSON, then prints
    the averaged metrics.
    """
    import argparse
    import os
    parser = argparse.ArgumentParser(description='计算GO预测的Precision和Recall并绘制PR曲线')
    parser.add_argument('--true', required=True, help='真实GO的JSON文件路径')
    parser.add_argument('--pred', required=True, help='预测GO的JSON文件路径')
    parser.add_argument('--scores', help='GO分数的JSON文件路径(可选)')
    parser.add_argument('--output', default='test_results/pr_results.json', help='输出结果的JSON文件路径')

    args = parser.parse_args()

    results = calculate_pr_metrics(args.true, args.pred, args.scores)

    # The default output lives in a sub-directory that may not exist yet;
    # create it so the metrics are not lost to a FileNotFoundError.
    output_dir = os.path.dirname(args.output)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(args.output, 'w') as f:
        json.dump(results, f, indent=2)

    print(f"平均精确率: {results['average_precision']:.4f}")
    print(f"平均召回率: {results['average_recall']:.4f}")
    print(f"平均F1分数: {results['average_f1']:.4f}")

    if 'pr_curve' in results:
        print(f"最佳F1分数: {results['pr_curve']['best_f1']:.4f} (阈值: {results['pr_curve']['best_threshold']:.4f})")
        print("PR曲线已保存为 pr_curve.png")
152
+
153
+ if __name__ == "__main__":
154
+ main()
utils/functions.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from tqdm import tqdm
3
+
4
def merge_interproscan_data(merge_file1, merge_file2):
    """Merge sequence-keyed InterProScan results into a protein-keyed file.

    ``merge_file1`` (the target) maps protein ids to records containing a
    ``sequence``; ``merge_file2`` (the source) maps sequences to InterProScan
    results. For every target record whose sequence occurs in the source, the
    source fields are copied into the record's ``interproscan_results``:
    ``PFAM`` becomes ``pfam_id``, ``GO`` becomes ``go_id`` and every other
    non-empty field is stored under its lower-cased name. The target file is
    rewritten in place.

    Args:
        merge_file1: Path of the target JSON file (read, then overwritten).
        merge_file2: Path of the source JSON file (read only).
    """
    with open(merge_file1, 'r', encoding='utf-8') as f:
        target_data = json.load(f)

    with open(merge_file2, 'r', encoding='utf-8') as f:
        source_data = json.load(f)

    for protein_info in tqdm(target_data.values()):
        # Records without a sequence cannot be matched; skip them instead of
        # aborting the whole merge with a KeyError.
        sequence = protein_info.get('sequence')
        if sequence is None or sequence not in source_data:
            continue

        source_info = source_data[sequence]
        merged = protein_info.setdefault('interproscan_results', {})

        if source_info.get('PFAM'):
            merged['pfam_id'] = source_info['PFAM']

        if source_info.get('GO'):
            merged['go_id'] = source_info['GO']

        for key, value in source_info.items():
            if key not in ('PFAM', 'GO') and value:
                # NOTE(review): the original also called .replace('_', '_'),
                # which is a no-op; if a separator conversion (e.g. '-' to
                # '_') was intended it needs to be specified.
                merged[key.lower()] = value

    # ensure_ascii=False emits raw non-ASCII characters, so pin the encoding
    # rather than relying on the locale default.
    with open(merge_file1, 'w', encoding='utf-8') as f:
        json.dump(target_data, f, indent=4, ensure_ascii=False)

    print("数据合并完成!")
48
+
49
if __name__ == "__main__":
    # Script entry: merge the second file into the first one in place.
    import argparse

    cli = argparse.ArgumentParser()
    cli_options = (
        ('--merge_file1', 'data/processed_data/interproscan_info.json', '合并的文件1'),
        ('--merge_file2', 'test_data/interproscan_info.json', '合并的文件2'),
    )
    for flag, default_path, help_text in cli_options:
        cli.add_argument(flag, default=default_path, help=help_text)

    options = cli.parse_args()
    merge_interproscan_data(options.merge_file1, options.merge_file2)
56
+
utils/generate_llm_answers.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import os
3
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
4
+ import json
5
+ from pathlib import Path
6
+ from tqdm import tqdm
7
+ from utils.openai_access import call_chatgpt
8
+ from utils.mpr import MultipleProcessRunnerSimplifier
9
+ from utils.generate_protein_prompt import generate_prompt
10
+
11
# Cache of QA items keyed by their ``index`` field; filled on first load.
qa_data = None


def _load_qa_data(prompt_path):
    """Load the QA prompt items from a JSON Lines file (once) and cache them.

    Args:
        prompt_path: Path of the JSONL file; each non-blank line is a JSON
            object with at least an ``index`` key. Ignored once the cache has
            been filled.

    Returns:
        dict mapping each item's ``index`` to the full item.
    """
    global qa_data
    if qa_data is None:
        # Build into a local dict and publish it only after the whole file
        # was read: a failed load must not leave an empty or partial cache
        # behind that later calls would silently reuse.
        loaded = {}
        with open(prompt_path, 'r') as f:
            for line in f:
                stripped = line.strip()
                if stripped:
                    item = json.loads(stripped)
                    loaded[item['index']] = item
        qa_data = loaded
    return qa_data
23
+
24
def process_single_qa(process_id, idx, qa_index, writer, save_dir):
    """Answer one QA item with the LLM and store the result as a JSON file.

    The item is looked up in the module-level ``qa_data`` cache. The output
    file is named ``<protein_id>_<qa_index>.json`` inside ``save_dir``. Any
    exception is reported on stdout and swallowed so one failing item does
    not stop the worker.
    """
    try:
        item = qa_data[qa_index]
        pid = item['protein_id']
        prompt_text = item['prompt']
        question_text = item['question']
        reference = item['ground_truth']

        # Query the LLM with the prepared prompt.
        answer = call_chatgpt(prompt_text)

        record = {
            'protein_id': pid,
            'index': qa_index,
            'question': question_text,
            'ground_truth': reference,
            'llm_answer': answer
        }

        target = os.path.join(save_dir, f"{pid}_{qa_index}.json")
        with open(target, 'w') as out:
            json.dump(record, out, indent=2, ensure_ascii=False)

    except Exception as e:
        print(f"Error processing QA index {qa_index}: {str(e)}")
52
+
53
def get_missing_qa_indices(save_dir):
    """Return the QA indices that still lack a usable result file.

    A result file ``<protein_id>_<qa_index>.json`` counts as usable when it
    parses as a JSON object whose ``llm_answer`` is neither ``None`` nor the
    empty string. Unusable files are deleted so they get regenerated.

    Args:
        save_dir: Directory holding the per-QA result files.

    Returns:
        set of QA indices (keys of the module-level ``qa_data``) that are
        missing or were found unusable.
    """
    problem_qa_indices = set()

    for qa_index in tqdm(list(qa_data.keys()), desc="检查QA数据文件"):
        protein_id = qa_data[qa_index]['protein_id']
        json_file = Path(save_dir) / f"{protein_id}_{qa_index}.json"

        if not json_file.exists():
            problem_qa_indices.add(qa_index)
            continue

        # Read and close the file first; validation and deletion happen
        # afterwards so the file is never unlinked while still open.
        try:
            with open(json_file, 'r') as f:
                data = json.load(f)
        except (OSError, ValueError):
            # Unreadable or not valid JSON (JSONDecodeError and decoding
            # errors are ValueError subclasses): treat as corrupt.
            data = None

        answer = data.get('llm_answer') if isinstance(data, dict) else None
        if answer is None or answer == '':
            problem_qa_indices.add(qa_index)
            try:
                json_file.unlink()
            except OSError:
                # Best effort: the file may already be gone or be locked.
                pass

    return problem_qa_indices
91
+
92
def main():
    """Generate LLM answers for all QA prompts, retrying missing ones.

    Loads the prompts, then repeats up to ``--max_iterations`` rounds of
    "find missing result files, generate them in parallel". The indices that
    are still missing at the end are written to a text file in the save
    directory.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--prompt_path", type=str,
                        default="data/processed_data/prompts@clean_test.jsonl",
                        help="Path to the JSONL file containing QA prompts")
    parser.add_argument("--n_process", type=int, default=64,
                        help="Number of parallel processes")
    parser.add_argument("--save_dir", type=str,
                        default="data/clean_test_results_top2",
                        help="Directory to save results")
    parser.add_argument("--max_iterations", type=int, default=3,
                        help="Maximum number of iterations to try generating all QA pairs")
    args = parser.parse_args()

    out_dir = Path(args.save_dir)
    os.makedirs(args.save_dir, exist_ok=True)

    _load_qa_data(args.prompt_path)
    print(f"已加载 {len(qa_data)} 个QA对")

    def worker(process_id, idx, qa_index, writer):
        # Adapter binding the save directory for the multi-process runner.
        return process_single_qa(process_id, idx, qa_index, writer, args.save_dir)

    def dump_indices(path, indices):
        # One "<protein_id>_<qa_index>" entry per line.
        with open(path, 'w') as f:
            for qa_index in indices:
                protein_id = qa_data[qa_index]['protein_id']
                f.write(f"{protein_id}_{qa_index}\n")

    # Stays 0 when max_iterations is not positive.
    iteration = 0
    for iteration in range(1, args.max_iterations + 1):
        print(f"\n开始第 {iteration} 轮检查和生成")

        missing_qa_indices = get_missing_qa_indices(args.save_dir)
        if not missing_qa_indices:
            print("所有QA数据已成功生成!")
            break

        print(f"发现 {len(missing_qa_indices)} 个缺失的QA数据,准备生成")

        pending = sorted(missing_qa_indices)

        # Record what this round set out to generate.
        dump_indices(out_dir / f"missing_qa_indices_iteration_{iteration}.txt", pending)

        mprs = MultipleProcessRunnerSimplifier(
            data=pending,
            do=worker,
            n_process=args.n_process,
            split_strategy="static"
        )
        mprs.run()

        print(f"第 {iteration} 轮生成完成")

    # Final verification pass.
    final_missing_indices = get_missing_qa_indices(args.save_dir)
    if not final_missing_indices:
        print(f"经过 {iteration} 轮生成,所有QA数据已成功生成!")
    else:
        print(f"经过 {iteration} 轮生成后,仍有 {len(final_missing_indices)} 个QA数据未成功生成")
        final_missing_ids_file = out_dir / "final_missing_qa_indices.txt"
        dump_indices(final_missing_ids_file, sorted(final_missing_indices))
        print(f"最终缺失的QA索引已保存到: {final_missing_ids_file}")
164
+
165
+ if __name__ == "__main__":
166
+ main()
utils/generate_llm_answers4enzyme.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import os
3
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
4
+ import json
5
+ from pathlib import Path
6
+ from tqdm import tqdm
7
+ from utils.openai_access import call_chatgpt
8
+ from utils.mpr import MultipleProcessRunnerSimplifier
9
+ from utils.generate_protein_prompt import generate_prompt
10
+
11
# Cache of the prompt mapping loaded from the prompt file.
prompts = None


def _load_prompts(prompt_path):
    """Load the prompt mapping from a JSON file (once) and cache it.

    Args:
        prompt_path: Path of the JSON file. Ignored once the cache is filled.

    Returns:
        The parsed JSON content; the rest of this module indexes it by
        protein id.
    """
    global prompts
    if prompts is None:
        # Use a context manager so the file handle is closed deterministically
        # instead of being left to the garbage collector.
        with open(prompt_path, 'r') as f:
            prompts = json.load(f)
    return prompts
18
+
19
def read_protein_ids(protein_id_path):
    """Return the non-blank, whitespace-stripped lines of a protein-id file."""
    with open(protein_id_path, 'r') as handle:
        stripped_lines = (raw.strip() for raw in handle)
        return [pid for pid in stripped_lines if pid]
24
+
25
def process_single_protein(process_id, idx, protein_id, writer, save_dir):
    """Send one protein's cached prompt to the LLM and save the response.

    The prompt comes from the module-level ``prompts`` mapping; the response
    is written to ``<save_dir>/<protein_id>.json``. Any exception is printed
    and swallowed so a single failure does not stop the worker.
    """
    try:
        answer = call_chatgpt(prompts[protein_id])
        # Each protein gets its own output file.
        target = os.path.join(save_dir, f"{protein_id}.json")
        with open(target, 'w') as out:
            json.dump(answer, out, indent=2)

    except Exception as e:
        print(f"Error processing protein {protein_id}: {str(e)}")
38
+
39
def get_missing_protein_ids(save_dir):
    """Return the protein ids that still lack a usable result file.

    A result file ``<protein_id>.json`` counts as usable when it parses as
    JSON and its content is a non-empty sized value. Unusable files are
    deleted so they get regenerated.

    Args:
        save_dir: Directory holding the per-protein result files.

    Returns:
        set of protein ids (keys of the module-level ``prompts``) that are
        missing or were found unusable.
    """
    problem_protein_ids = set()

    for protein_id in tqdm(list(prompts.keys()), desc="检查蛋白质数据文件"):
        json_file = Path(save_dir) / f"{protein_id}.json"

        if not json_file.exists():
            problem_protein_ids.add(protein_id)
            continue

        # Read and close the file first; deletion happens afterwards so the
        # file is never unlinked while still open.
        try:
            with open(json_file, 'r') as f:
                data = json.load(f)
        except (OSError, ValueError):
            # Unreadable or not valid JSON: treat as corrupt.
            data = None

        try:
            unusable = data is None or len(data) == 0
        except TypeError:
            # Unsized content (e.g. a bare number) was rejected by the
            # original catch-all handler as well.
            unusable = True

        if unusable:
            problem_protein_ids.add(protein_id)
            try:
                json_file.unlink()
            except OSError:
                # Best effort: the file may already be gone or be locked.
                pass

    return problem_protein_ids
75
+
76
def main():
    """Generate LLM responses for every prompt, retrying missing proteins.

    Loads the prompts, then repeats up to ``--max_iterations`` rounds of
    "find missing result files, generate them in parallel". The protein ids
    that are still missing at the end are written to a text file in the save
    directory.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--prompt_path", type=str,
                        default="data/processed_data/prompts@clean_test.json",
                        help="Path to the file containing prompts")
    parser.add_argument("--n_process", type=int, default=64,
                        help="Number of parallel processes")
    parser.add_argument("--save_dir", type=str,
                        default="data/clean_test_results_top2",
                        help="Directory to save results")
    parser.add_argument("--max_iterations", type=int, default=3,
                        help="Maximum number of iterations to try generating all proteins")
    args = parser.parse_args()

    out_dir = Path(args.save_dir)
    os.makedirs(args.save_dir, exist_ok=True)

    _load_prompts(args.prompt_path)
    print(f"已加载 {len(prompts)} 个提示")

    def worker(process_id, idx, protein_id, writer):
        # Adapter binding the save directory for the multi-process runner.
        return process_single_protein(process_id, idx, protein_id, writer, args.save_dir)

    def dump_ids(path, ids):
        # One protein id per line.
        with open(path, 'w') as f:
            for protein_id in ids:
                f.write(f"{protein_id}\n")

    # Stays 0 when max_iterations is not positive.
    iteration = 0
    for iteration in range(1, args.max_iterations + 1):
        print(f"\n开始第 {iteration} 轮检查和生成")

        missing_protein_ids = get_missing_protein_ids(args.save_dir)
        if not missing_protein_ids:
            print("所有蛋白质数据已成功生成!")
            break

        print(f"发现 {len(missing_protein_ids)} 个缺失的蛋白质数据,准备生成")

        pending = sorted(missing_protein_ids)

        # Record what this round set out to generate.
        dump_ids(out_dir / f"missing_protein_ids_iteration_{iteration}.txt", pending)

        mprs = MultipleProcessRunnerSimplifier(
            data=pending,
            do=worker,
            n_process=args.n_process,
            split_strategy="static"
        )
        mprs.run()

        print(f"第 {iteration} 轮生成完成")

    # Final verification pass.
    final_missing_ids = get_missing_protein_ids(args.save_dir)
    if not final_missing_ids:
        print(f"经过 {iteration} 轮生成,所有蛋白质数据已成功生成!")
    else:
        print(f"经过 {iteration} 轮生成后,仍有 {len(final_missing_ids)} 个蛋白质数据未成功生成")
        final_missing_ids_file = out_dir / "final_missing_protein_ids.txt"
        dump_ids(final_missing_ids_file, sorted(final_missing_ids))
        print(f"最终缺失的蛋白质ID已保存到: {final_missing_ids_file}")
149
+
150
+ if __name__ == "__main__":
151
+ main()
utils/generate_protein_prompt.py ADDED
@@ -0,0 +1,413 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import sys
3
+ import os
4
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
5
+ from jinja2 import Template
6
+ try:
7
+ from utils.protein_go_analysis import analyze_protein_go
8
+ from utils.prompts import ENZYME_PROMPT, RELATION_SEMANTIC_PROMPT, FUNCTION_PROMPT
9
+ from utils.get_motif import get_motif_pfam
10
+ except ImportError:
11
+ from protein_go_analysis import analyze_protein_go
12
+ from prompts import ENZYME_PROMPT, RELATION_SEMANTIC_PROMPT, FUNCTION_PROMPT
13
+ from get_motif import get_motif_pfam
14
+ from tqdm import tqdm
15
+
16
class InterProDescriptionManager:
    """Holds the InterPro lookup tables in memory so files are read only once."""

    def __init__(self, interpro_data_path, interproscan_info_path):
        """
        Remember both file paths and load whichever of them exist.

        Args:
            interpro_data_path: path of interpro_data.json
            interproscan_info_path: path of interproscan_info.json
        """
        self.interpro_data_path = interpro_data_path
        self.interproscan_info_path = interproscan_info_path
        self.interpro_data = None
        self.interproscan_info = None
        self._load_data()

    def _load_data(self):
        """Read each configured JSON file that exists; others stay untouched."""
        sources = (
            ('interpro_data', self.interpro_data_path),
            ('interproscan_info', self.interproscan_info_path),
        )
        for attribute, path in sources:
            if not path or not os.path.exists(path):
                continue
            with open(path, 'r') as f:
                setattr(self, attribute, json.load(f))

    def get_description(self, protein_id, selected_types=None):
        """
        Collect InterPro names and abstracts for one protein.

        Args:
            protein_id: protein identifier
            selected_types: result categories to look up, e.g.
                ['superfamily', 'panther', 'gene3d']

        Returns:
            dict: category -> {ipr_id: {'name': ..., 'abstract': ...}};
            categories without any known entry are left out
        """
        wanted_types = [] if selected_types is None else selected_types

        if not self.interpro_data or not self.interproscan_info:
            return {}

        if protein_id not in self.interproscan_info:
            return {}

        scan_results = self.interproscan_info[protein_id].get('interproscan_results', {})
        descriptions = {}

        for category in wanted_types:
            if category not in scan_results:
                continue

            found = {}
            # Each entry is a mapping whose values are the IPR ids.
            for entry in scan_results[category]:
                for ipr_id in entry.values():
                    if not ipr_id or ipr_id not in self.interpro_data:
                        continue
                    record = self.interpro_data[ipr_id]
                    found[ipr_id] = {
                        'name': record.get('name', ''),
                        'abstract': record.get('abstract', '')
                    }

            if found:
                descriptions[category] = found

        return descriptions
87
+
88
# Module-level caches for the InterProDescriptionManager instance and the LMDB
# environment, so repeated calls neither re-read files nor re-open the database.
_interpro_manager = None
_interpro_manager_paths = None  # (interpro_data_path, interproscan_info_path) of the cached manager
_lmdb_db = None
_lmdb_path = None


def get_interpro_manager(interpro_data_path, interproscan_info_path):
    """Return the cached InterProDescriptionManager, creating it when needed.

    The cache is keyed by the two paths: asking for different files builds a
    fresh manager instead of silently returning one loaded from other files
    (the same policy ``get_lmdb_connection`` applies to its path).

    Args:
        interpro_data_path: path of interpro_data.json
        interproscan_info_path: path of interproscan_info.json

    Returns:
        InterProDescriptionManager loaded from the requested files.
    """
    global _interpro_manager, _interpro_manager_paths
    requested = (interpro_data_path, interproscan_info_path)
    if _interpro_manager is None or _interpro_manager_paths != requested:
        _interpro_manager = InterProDescriptionManager(interpro_data_path, interproscan_info_path)
        _interpro_manager_paths = requested
    return _interpro_manager


def get_lmdb_connection(lmdb_path):
    """Return a cached read-only LMDB environment for ``lmdb_path``.

    A previously opened environment for a different path is closed first.

    Args:
        lmdb_path: Path of the LMDB database; may be ``None``.

    Returns:
        The opened environment, or ``None`` when ``lmdb_path`` is empty or
        does not exist.
    """
    global _lmdb_db, _lmdb_path
    if _lmdb_db is None or _lmdb_path != lmdb_path:
        if _lmdb_db is not None:
            _lmdb_db.close()

        if lmdb_path and os.path.exists(lmdb_path):
            # Imported lazily so the module works without lmdb installed as
            # long as no database is requested.
            import lmdb
            _lmdb_db = lmdb.open(lmdb_path, readonly=True)
            _lmdb_path = lmdb_path
        else:
            _lmdb_db = None
            _lmdb_path = None

    return _lmdb_db
116
+
117
+ def get_prompt_template(selected_info_types=None,lmdb_path=None):
118
+ """
119
+ Return the Jinja2 prompt template text (ENZYME_PROMPT when lmdb_path is None, otherwise FUNCTION_PROMPT plus a question slot), with optional information sections
120
+
121
+ Args:
122
+ selected_info_types: list of information types to include, e.g. ['motif', 'go', 'superfamily', 'panther']
123
+ """
124
+ if selected_info_types is None:
125
+ selected_info_types = ['motif', 'go'] # default: include motif and GO information
126
+ if lmdb_path is None:
127
+ PROMPT_TEMPLATE = ENZYME_PROMPT + "\n"
128
+ else:
129
+ PROMPT_TEMPLATE = FUNCTION_PROMPT + "\n"
130
+ PROMPT_TEMPLATE += """
131
+ input information:
132
+
133
+ {%- if 'motif' in selected_info_types and motif_pfam %}
134
+
135
+ motif:{% for motif_id, motif_info in motif_pfam.items() %}
136
+ {{motif_id}}: {{motif_info}}
137
+ {% endfor %}
138
+ {%- endif %}
139
+
140
+ {%- if 'go' in selected_info_types and go_data.status == 'success' %}
141
+
142
+ GO:{% for go_entry in go_data.go_annotations %}
143
+ ▢ GO term{{loop.index}}: {{go_entry.go_id}}
144
+ • definition: {{ go_data.all_related_definitions.get(go_entry.go_id, 'not found definition') }}
145
+ {% endfor %}
146
+ {%- endif %}
147
+
148
+ {%- for info_type in selected_info_types %}
149
+ {%- if info_type not in ['motif', 'go'] and interpro_descriptions.get(info_type) %}
150
+
151
+ {{info_type}}:{% for ipr_id, ipr_info in interpro_descriptions[info_type].items() %}
152
+ ▢ {{ipr_id}}: {{ipr_info.name}}
153
+ • description: {{ipr_info.abstract}}
154
+ {% endfor %}
155
+ {%- endif %}
156
+ {%- endfor %}
157
+
158
+ """
159
+ if lmdb_path is not None:
160
+ PROMPT_TEMPLATE += "\n" + "question: \n {{question}}"
161
+ return PROMPT_TEMPLATE
162
+
163
def get_qa_data(protein_id, lmdb_path):
    """
    Collect every QA pair stored in the LMDB database for one protein.

    Only records whose key is a decimal number are considered; their value is
    expected to be a JSON list starting with protein id, question and
    ground truth.

    Args:
        protein_id: protein identifier to match
        lmdb_path: path of the LMDB database

    Returns:
        list: dicts with 'question' and 'ground_truth'; empty when the
        database is unavailable
    """
    if not lmdb_path or not os.path.exists(lmdb_path):
        return []

    import json

    matches = []

    try:
        env = get_lmdb_connection(lmdb_path)
        if env is None:
            return []

        with env.begin() as txn:
            # Scan all records, keeping those that belong to protein_id.
            for raw_key, raw_value in txn.cursor():
                try:
                    if not raw_key.decode('utf-8').isdigit():
                        continue
                    record = json.loads(raw_value.decode('utf-8'))
                    if not isinstance(record, list) or len(record) < 3:
                        continue
                    owner, question, ground_truth = record[0], record[1], record[2]
                    if owner == protein_id:
                        matches.append({
                            'question': question,
                            'ground_truth': ground_truth
                        })
                except Exception:
                    # Undecodable or malformed record: skip it.
                    continue
    except Exception as e:
        print(f"Error reading lmdb for protein {protein_id}: {e}")

    return matches
210
+
211
def generate_prompt(protein_id, protein2gopath, protein2pfam_path, pfam_descriptions_path, go_info_path,
                    interpro_data_path=None, interproscan_info_path=None, selected_info_types=None, lmdb_path=None, interpro_manager=None, question=None):
    """
    Render the prompt text for one protein.

    Args:
        selected_info_types: information types to include, e.g.
            ['motif', 'go', 'superfamily', 'panther']
        interpro_data_path: path of interpro_data.json
        interproscan_info_path: path of interproscan_info.json
        interpro_manager: InterProDescriptionManager instance; used in
            preference to the paths when provided
        question: question text for QA tasks
    """
    if selected_info_types is None:
        selected_info_types = ['motif', 'go']

    # GO analysis and Pfam motif descriptions for this protein.
    analysis = analyze_protein_go(protein_id, protein2gopath, go_info_path)
    motif_pfam = get_motif_pfam(protein_id, protein2pfam_path, pfam_descriptions_path)

    # InterPro descriptions are needed only for types beyond motif/go.
    interpro_descriptions = {}
    extra_types = [kind for kind in selected_info_types if kind not in ['motif', 'go']]
    if extra_types:
        if interpro_manager:
            interpro_descriptions = interpro_manager.get_description(protein_id, extra_types)
        elif interpro_data_path and interproscan_info_path:
            # Fall back to the module-level cached manager.
            cached_manager = get_interpro_manager(interpro_data_path, interproscan_info_path)
            interpro_descriptions = cached_manager.get_description(protein_id, extra_types)

    succeeded = analysis["status"] == "success"
    go_data = {
        "status": analysis["status"],
        "go_annotations": analysis["go_annotations"] if succeeded else [],
        "all_related_definitions": analysis["all_related_definitions"] if succeeded else {}
    }

    context = {
        "protein_id": protein_id,
        "selected_info_types": selected_info_types,
        "go_data": go_data,
        "motif_pfam": motif_pfam,
        "interpro_descriptions": interpro_descriptions,
        "question": question
    }

    return Template(get_prompt_template(selected_info_types, lmdb_path)).render(**context)
259
+
260
def save_prompts_parallel(protein_ids, output_path, protein2gopath, protein2pfam_path, pfam_descriptions_path, go_info_path,
                          interpro_data_path=None, interproscan_info_path=None, selected_info_types=None, lmdb_path=None, n_process=8):
    """Generate prompts for many proteins in parallel and save them.

    Workers write one JSON line per prompt to ``output_path + '.tmp'``.
    Without ``lmdb_path`` the lines are merged into a single JSON object
    (protein id -> prompt) at ``output_path``. With ``lmdb_path`` one prompt
    is produced per stored QA pair and the JSON Lines file itself becomes
    ``output_path``; each record carries a running ``index``.

    Args:
        protein_ids: Iterable of protein ids; surrounding whitespace is
            stripped and blank entries are skipped.
        output_path: Destination file.
        protein2gopath, protein2pfam_path, pfam_descriptions_path,
        go_info_path, interpro_data_path, interproscan_info_path:
            Data files forwarded to ``generate_prompt``.
        selected_info_types: Information types to include (default
            ``['motif', 'go']``).
        lmdb_path: Optional LMDB database with QA pairs.
        n_process: Number of worker processes.
    """
    global _lmdb_db, _lmdb_path
    import json
    try:
        from utils.mpr import MultipleProcessRunnerSimplifier
    except ImportError:
        from mpr import MultipleProcessRunnerSimplifier

    if selected_info_types is None:
        selected_info_types = ['motif', 'go']

    # Build the InterPro manager once, before the workers start, so each
    # prompt does not trigger its own file load.
    interpro_manager = None
    other_types = [t for t in selected_info_types if t not in ['motif', 'go']]
    if other_types and interpro_data_path and interproscan_info_path:
        interpro_manager = InterProDescriptionManager(interpro_data_path, interproscan_info_path)

    # Shared counter (plus lock) that hands out the global QA index.
    if lmdb_path:
        import multiprocessing
        global_index = multiprocessing.Value('i', 0)
        index_lock = multiprocessing.Lock()
    else:
        global_index = None
        index_lock = None

    def process_protein(process_id, idx, protein_id, writer):
        protein_id = protein_id.strip()
        if not protein_id:
            # Blank lines from the id file would otherwise yield a prompt
            # for an empty protein id.
            return

        if lmdb_path:
            # One prompt per QA pair stored for this protein.
            for qa_pair in get_qa_data(protein_id, lmdb_path):
                question = qa_pair['question']
                ground_truth = qa_pair['ground_truth']
                prompt = generate_prompt(protein_id, protein2gopath, protein2pfam_path, pfam_descriptions_path, go_info_path,
                                         interpro_data_path, interproscan_info_path, selected_info_types, lmdb_path, interpro_manager, question)
                if prompt == "":
                    continue
                if writer:
                    # Reserve the next global index atomically.
                    with index_lock:
                        current_index = global_index.value
                        global_index.value += 1

                    result = {
                        "index": current_index,
                        "protein_id": protein_id,
                        "prompt": prompt,
                        "question": question,
                        "ground_truth": ground_truth
                    }
                    writer.write(json.dumps(result) + '\n')
        else:
            prompt = generate_prompt(protein_id, protein2gopath, protein2pfam_path, pfam_descriptions_path, go_info_path,
                                     interpro_data_path, interproscan_info_path, selected_info_types, lmdb_path, interpro_manager)
            if prompt == "":
                return
            if writer:
                writer.write(json.dumps({protein_id: prompt}) + '\n')

    tmp_path = output_path + '.tmp'
    runner = MultipleProcessRunnerSimplifier(
        data=protein_ids,
        do=process_protein,
        save_path=tmp_path,
        n_process=n_process,
        split_strategy="static"
    )

    runner.run()

    # Release the cached LMDB environment; clear the remembered path too so
    # the cache state stays consistent.
    if _lmdb_db is not None:
        _lmdb_db.close()
        _lmdb_db = None
        _lmdb_path = None

    if not lmdb_path:
        # Legacy format: merge all lines into one JSON object.
        final_results = {}
        with open(tmp_path, 'r') as f:
            for line in f:
                if line.strip():
                    final_results.update(json.loads(line))

        with open(output_path, 'w') as f:
            json.dump(final_results, f, indent=2)
    else:
        # QA mode: the JSON Lines file is the final output.
        import shutil
        shutil.move(tmp_path, output_path)

    # Remove the temporary file if it is still around.
    if os.path.exists(tmp_path):
        os.remove(tmp_path)
366
+
367
if __name__ == "__main__":
    # Script entry: read protein ids from a file and write their prompts.
    import argparse
    parser = argparse.ArgumentParser(description='Generate protein prompt')
    parser.add_argument('--protein_path', type=str, default='data/raw_data/protein_ids_clean.txt')
    parser.add_argument('--protein2pfam_path', type=str, default='data/processed_data/interproscan_info.json')
    parser.add_argument('--pfam_descriptions_path', type=str, default='data/raw_data/all_pfam_descriptions.json')
    parser.add_argument('--protein2gopath', type=str, default='data/processed_data/go_integration_final_topk2.json')
    parser.add_argument('--go_info_path', type=str, default='data/raw_data/go.json')
    parser.add_argument('--interpro_data_path', type=str, default='data/raw_data/interpro_data.json')
    parser.add_argument('--interproscan_info_path', type=str, default='data/processed_data/interproscan_info.json')
    parser.add_argument('--lmdb_path', type=str, default=None)
    parser.add_argument('--output_path', type=str, default='data/processed_data/prompts@clean_test.json')
    parser.add_argument('--selected_info_types', type=str, nargs='+', default=['motif', 'go'],
                        help='选择要包含的信息类型,如: motif go superfamily panther gene3d')
    parser.add_argument('--n_process', type=int, default=32)
    args = parser.parse_args()

    # Tag the output file name with the selected info types. Only the file
    # extension is touched: str.replace('.json', ...) would also rewrite any
    # '.json' occurring earlier in the path (e.g. in a directory name).
    output_root, output_ext = os.path.splitext(args.output_path)
    args.output_path = output_root + '_' + '_'.join(args.selected_info_types) + output_ext
    print(args)

    with open(args.protein_path, 'r') as file:
        # Drop blank lines so no prompt is generated for an empty id.
        protein_ids = [line for line in file if line.strip()]

    save_prompts_parallel(
        protein_ids=protein_ids,
        output_path=args.output_path,
        n_process=args.n_process,
        protein2gopath=args.protein2gopath,
        protein2pfam_path=args.protein2pfam_path,
        pfam_descriptions_path=args.pfam_descriptions_path,
        go_info_path=args.go_info_path,
        interpro_data_path=args.interpro_data_path,
        interproscan_info_path=args.interproscan_info_path,
        selected_info_types=args.selected_info_types,
        lmdb_path=args.lmdb_path
    )
403
+
404
+ # 测试示例
405
+ # protein_id = 'A8CF74'
406
+ # prompt = generate_prompt(protein_id, 'data/processed_data/go_integration_final_topk2.json',
407
+ # 'data/processed_data/interproscan_info.json', 'data/raw_data/all_pfam_descriptions.json',
408
+ # 'data/raw_data/go.json', 'data/raw_data/interpro_data.json',
409
+ # 'data/processed_data/interproscan_info.json',
410
+ # ['motif', 'go', 'superfamily', 'panther'])
411
+ # print(prompt)
412
+
413
+
utils/get_motif.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from collections import Counter
3
+ import os
4
+
5
# Module-level caches, filled lazily on first use.
_pfam_dict = None
_pfam_descriptions = None
# Path each cache was loaded from, so that a call naming a different file
# reloads instead of silently answering from stale data.
_pfam_dict_path = None
_pfam_descriptions_path = None


def _load_pfam_data(protein2pfam_path):
    """Load the protein -> InterProScan JSON into ``_pfam_dict`` (cached per path)."""
    global _pfam_dict, _pfam_dict_path
    if _pfam_dict is None or _pfam_dict_path != protein2pfam_path:
        with open(protein2pfam_path, 'r', encoding='utf-8') as file:
            _pfam_dict = json.load(file)
        _pfam_dict_path = protein2pfam_path


def _load_pfam_descriptions(pfam_descriptions_path):
    """Load the Pfam description JSON into ``_pfam_descriptions`` (cached per path)."""
    global _pfam_descriptions, _pfam_descriptions_path
    if _pfam_descriptions is None or _pfam_descriptions_path != pfam_descriptions_path:
        with open(pfam_descriptions_path, 'r', encoding='utf-8') as file:
            _pfam_descriptions = json.load(file)
        _pfam_descriptions_path = pfam_descriptions_path


def get_motif_pfam(protein_id, protein2pfam_path, pfam_descriptions_path):
    """
    Return the Pfam entries of a protein together with their descriptions.

    Args:
        protein_id: str - protein identifier to look up.
        protein2pfam_path: str - path to the interproscan_info.json file.
        pfam_descriptions_path: str - path to the Pfam description JSON file.

    Returns:
        dict mapping Pfam id to its description, e.g.
        {"PF04820": "definition content"}. Empty when the protein is unknown.
        Pfam ids without an entry in the description file are omitted.
    """
    _load_pfam_data(protein2pfam_path)
    _load_pfam_descriptions(pfam_descriptions_path)

    if protein_id not in _pfam_dict:
        return {}

    protein_info = _pfam_dict[protein_id]
    # 'pfam_id' holds a list of single-entry dicts keyed by the Pfam accession;
    # only the keys are needed here.
    pfam_entries = protein_info.get('interproscan_results', {}).get('pfam_id', [])

    result = {}
    for entry in pfam_entries:
        for pfam_id in entry:
            if pfam_id in _pfam_descriptions:
                result[pfam_id] = _pfam_descriptions[pfam_id]['description']

    return result
51
+
52
if __name__ == "__main__":
    # Small CLI for looking up the Pfam motifs of a single protein.
    import argparse

    cli = argparse.ArgumentParser()
    cli.add_argument("--protein_id", type=str, default="A8CF74")
    cli.add_argument("--protein2pfam_path", type=str,
                     default="data/processed_data/interproscan_info.json")
    cli.add_argument("--pfam_descriptions_path", type=str,
                     default="data/raw_data/all_pfam_descriptions.json")
    opts = cli.parse_args()

    print(get_motif_pfam(opts.protein_id, opts.protein2pfam_path, opts.pfam_descriptions_path))
61
+
62
+
63
+
utils/mpr.py ADDED
@@ -0,0 +1,433 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import abc
2
+ import os
3
+ import time
4
+ import sys
5
+
6
+
7
+ from tqdm import tqdm
8
+ from math import ceil
9
+
10
+
11
class MultipleProcessRunner:
    """
    Abstract class for running tasks with multiple processes.

    Subclasses implement:
        1. __len__()                          : return the length of data
        2. _target_static() / _target_queue() : worker body for each process
        3. _aggregate()                       : aggregate results from each process

    NOTE: the class does not derive from ``abc.ABC``, so the
    ``@abc.abstractmethod`` markers are informational only; an unimplemented
    hook raises NotImplementedError when it is called.
    """

    def __init__(self,
                 data,
                 save_path=None,
                 n_process=1,
                 verbose=True,
                 total_only=True,
                 log_step=1,
                 start_method='fork',
                 split_strategy="queue"):
        """
        Args:
            data     : data to be processed that can be sliced

            save_path: final output path

            n_process: number of process

            verbose  : if True, display progress bar

            total_only: If True, only total progress bar is displayed

            log_step : For total progress bar, Next log will be printed when ``current iteration`` - ``last log
                       iteration`` >= log_step

            start_method: start method for multiprocessing

            split_strategy: method to split data, can be 'queue', 'static'. If 'queue', data will be put into a
                            queue and each process will get data from the queue. If 'static', data will be split
                            into n_process parts and each process will get one part.
        """
        self.data = data
        self.save_path = save_path
        self.n_process = n_process
        self.verbose = verbose
        self.total_only = total_only
        self.log_step = log_step
        self.start_method = start_method
        self.split_strategy = split_strategy

        assert self.split_strategy in ["queue", "static"], f"Split strategy must be 'queue' or 'static', but got {self.split_strategy}"

        # Terminal width (number of columns) used to pad/shrink the progress
        # line; None when stdout is not attached to a terminal.
        try:
            self.terminal_y = os.get_terminal_size()[0]

        except Exception as e:
            print(e)
            print("Can't get terminal size, set terminal_y = None")
            self.terminal_y = None

    def _s2hms(self, seconds: float):
        """
        Convert a duration in seconds into ``hour:minute:second`` format.
        """
        m, s = divmod(seconds, 60)
        h, m = divmod(m, 60)

        return "%02d:%02d:%02d" % (h, m, s)

    def _display_time(self, st_time, now, total):
        """Format elapsed time, estimated remaining time and throughput."""
        running_time = time.time() - st_time
        if now <= 0 or running_time <= 0:
            # No progress (or no measurable time) yet: an ETA or a rate would
            # require dividing by zero, so show placeholders instead.
            return f' [{self._s2hms(max(running_time, 0))} < ?, ?it/s]'

        rest_time = running_time * (total - now) / now
        iter_sec = f"{now / running_time:.2f}it/s" if now > running_time else f"{running_time / now:.2f}s/it"

        return f' [{self._s2hms(running_time)} < {self._s2hms(rest_time)}, {iter_sec}]'

    def _display_bar(self, now, total, length):
        """Render a ``[###___]`` bar of ``length`` cells for ``now`` out of ``total``."""
        now = now if now <= total else total
        # An empty data set has nothing to fill; avoid dividing by zero.
        num = now * length // total if total > 0 else 0
        progress_bar = '[' + '#' * num + '_' * (length - num) + ']'
        return progress_bar

    def _display_all(self, now, total, desc, st_time):
        """Build the full, colored progress line for one process or the total."""
        length = 50
        percent = int(now / total * 100) if total > 0 else 0
        progress_bar = self._display_bar(now, total, length)
        time_display = self._display_time(st_time, now, total)

        display = f'{desc}{progress_bar} {percent:02d}% {now}/{total}{time_display}'

        # Pad to the terminal width so leftovers of a previous, longer line are
        # overwritten; when the line is too long, shrink the bar instead.
        width = self.terminal_y if self.terminal_y is not None else 100
        num_space = width - len(display)
        if num_space > 0:
            display += ' ' * num_space
        else:
            length += num_space
            progress_bar = self._display_bar(now, total, length)
            display = f'{desc}{progress_bar} {percent:02d}% {now}/{total}{time_display}'

        # Set color (red)
        display = f"\033[31m{display}\033[0m"

        return display

    # Print progress bar at specific position in terminal
    def terminal_progress_bar(self,
                              process_id: int,
                              now: int,
                              total: int,
                              desc: str = ''):
        """
        Record the progress of one worker and refresh the progress display.

        Args:
            process_id: process id
            now: now iteration number
            total: total iteration number
            desc: description
        """
        st_time = self.process_st_time[process_id]

        # Aggregate total information
        self.counts[process_id] = now
        self._total_display(self.process_st_time["total"])

        if not self.total_only:
            process_display = self._display_all(now, total, desc, st_time)
            if self.terminal_y is not None:
                sys.stdout.write(f"\x1b7\x1b[{process_id + 1};{0}f{process_display}\x1b8")
                sys.stdout.flush()
            else:
                print(f"\x1b7\x1b[{process_id + 1};{0}f{process_display}\x1b8", flush=True)

    # Print global information
    def _total_display(self, st_time):
        """Print the overall progress line once at least ``log_step`` new items are done."""
        # ``total_display_callable`` is a best-effort flag, not a real lock: the
        # check and the reset below are separate operations, so two workers
        # can occasionally print at the same time.
        if self.total_display_callable.value == 1:
            self.total_display_callable.value = 0

            cnt = sum([self.counts[i] for i in range(self.n_process)])
            if cnt - self.last_cnt.value >= self.log_step:
                total_display = self._display_all(cnt, self.__len__(), f"Total: ", st_time)
                self.last_cnt.value = cnt

                # Row of the total bar: below the per-process bars when shown.
                x = self.n_process + 1 if not self.total_only else 0
                print(f"\r\x1b7\x1b[{x};{0}f{total_display}\x1b8", flush=True, end="")

            self.total_display_callable.value = 1

    def run(self):
        """
        The function is used to run a multi-process task
        Returns: return the result of function '_aggregate()'
        """

        if self.split_strategy == "static":
            return self.run_static()

        elif self.split_strategy == "queue":
            return self.run_queue()

    def _init_shared_state(self, mp):
        """Create the cross-process counters and flags used by the progress display."""
        # total number of data that is already processed, per process
        self.counts = mp.Manager().dict({i: 0 for i in range(self.n_process)})

        # record start time for each process
        self.process_st_time = {"total": time.time()}

        # flag guarding the total progress display
        self.total_display_callable = mp.Value('d', 1)

        # Save last log iteration number
        self.last_cnt = mp.Value('d', 0)

    def _temp_dir(self):
        """Return the directory holding per-process result files, or None without save_path."""
        if self.save_path is None:
            return None
        file_name, suffix = os.path.splitext(self.save_path)
        return f"{file_name}{suffix}_temp"

    def _sub_path(self, i):
        """Return (creating its directory) the result file path of process ``i``, or None."""
        save_dir = self._temp_dir()
        if save_dir is None:
            return None
        suffix = os.path.splitext(self.save_path)[1]
        os.makedirs(save_dir, exist_ok=True)
        return f"{save_dir}/temp_{i}{suffix}"

    def _run_workers(self, mp, target, extra_args):
        """
        Start ``n_process`` workers, wait for them, then aggregate and clean up.

        ``extra_args(i)`` returns the tuple of arguments placed between the
        process id and the sub path in the call to ``target``.
        """
        process_list = []
        sub_paths = []
        for i in range(self.n_process):
            sub_path = self._sub_path(i)

            self.process_st_time[i] = time.time()
            p = mp.Process(target=target, args=(i, *extra_args(i), sub_path))
            p.start()

            process_list.append(p)
            sub_paths.append(sub_path)

        for p in process_list:
            p.join()

        # aggregate results and remove temporary directory
        results = self._aggregate(self.save_path, sub_paths)
        save_dir = self._temp_dir()
        if save_dir is not None:
            try:
                os.rmdir(save_dir)
            except OSError as e:
                # A leftover or missing temp directory must not discard the
                # results that were already aggregated.
                print(f"Could not remove temporary directory {save_dir}: {e}")

        return results

    def run_static(self):
        """
        Running multi-process task with static data splits
        """

        # import multiprocessing as mp
        import multiprocess as mp
        mp.set_start_method(self.start_method, force=True)

        self._init_shared_state(mp)

        num_per_process = ceil(self.__len__() / self.n_process)

        def data_slice(i):
            st = i * num_per_process
            return (self.data[st: st + num_per_process],)

        return self._run_workers(mp, self._target_static, data_slice)

    def run_queue(self):
        """
        Running multi-process task with shared queue
        """

        # import multiprocessing as mp
        import multiprocess as mp
        mp.set_start_method(self.start_method, force=True)

        # Initialize a queue to input data; it is filled before any worker starts
        self.q = mp.Queue(self.__len__())
        for d in tqdm(self.data, "Input data to queue"):
            self.q.put(d)

        self._init_shared_state(mp)

        return self._run_workers(mp, self._target_queue, lambda i: ())

    @abc.abstractmethod
    def _aggregate(self, final_path: str, sub_paths):
        """
        This function is used to aggregate results from sub processes into a file

        Args:
            final_path: path to save final results
            sub_paths : list of sub paths

        Returns: None or desirable results specified by user

        """
        raise NotImplementedError

    @abc.abstractmethod
    def _target_static(self, process_id, data, sub_path):
        """
        The main body to operate data in one process. This function is used when split_strategy is 'static'.

        Args:
            process_id: process id
            data      : data slice
            sub_path  : sub path to save results
        """
        raise NotImplementedError

    @abc.abstractmethod
    def _target_queue(self, process_id, sub_path):
        """
        The main body to operate data in one process. This function is used when split_strategy is 'queue'.

        Args:
            process_id: process id
            sub_path  : sub path to save results
        """
        raise NotImplementedError

    @abc.abstractmethod
    def __len__(self):
        raise NotImplementedError
339
+
340
+
341
class MultipleProcessRunnerSimplifier(MultipleProcessRunner):
    """
    A simplified version of MultipleProcessRunner.
    User only need to implement the function 'do', then it will be automatically executed
    in every iteration after call the function 'run'.
    If 'save_path' is specified, it will open a file in the 'sub_path' into which
    user can write results, and results will be aggregated into 'save_path'.

    The procedure would be like:
        ...
        with open(sub_path, 'w') as w:
            for i, d in enumerate(data):
                self.do(process_id, i, d, w)  # You can write results into the file.
        ...

    The 'do' function should be like:
        def do(process_id, idx, data, writer):
            ...

    If 'save_path' is None and 'return_results' is False, the argument 'writer'
    will be set to None. If 'return_results' is True, every line written by
    'do' is returned (without its trailing newline) by 'run'.
    """

    def __init__(self, data, do, return_results=False, **kwargs):
        """
        Args:
            data: data to be processed
            do: callable ``do(process_id, idx, data, writer)`` run for every item
            return_results: if True, 'run' returns the lines written by 'do'
            **kwargs: forwarded to MultipleProcessRunner.__init__
        """
        super().__init__(data=data, **kwargs)
        self.do = do
        self.return_results = return_results

    def run(self):
        # The start time makes the names of the fallback temp files unique per run.
        self.start_time = time.time()
        return super().run()

    def _tmp_result_path(self, idx):
        """Temp file used by process ``idx`` when results are returned without a save_path."""
        return f"MultipleProcessRunnerSimplifier_{self.start_time}_{idx}.tmp"

    def _aggregate(self, final_path: str, sub_paths):
        """Concatenate the per-process files into ``final_path`` and/or a result list."""
        results = []

        w = open(final_path, 'w') if final_path is not None else None
        try:
            if self.verbose:
                iterator = tqdm(enumerate(sub_paths), "Aggregating results...")
            else:
                iterator = enumerate(sub_paths)

            for i, sub_path in iterator:
                if sub_path is None and self.return_results:
                    sub_path = self._tmp_result_path(i)

                if sub_path is None:
                    continue

                with open(sub_path, 'r') as r:
                    for line in r:
                        if w is not None:
                            w.write(line)

                        if self.return_results:
                            # Drop only the line terminator; a final line without
                            # one must keep its last character.
                            results.append(line[:-1] if line.endswith('\n') else line)

                os.remove(sub_path)
        finally:
            # Close (and thereby flush) the aggregated file even if a sub file
            # is missing or unreadable.
            if w is not None:
                w.close()

        return results

    def _target_static(self, process_id, data, sub_path):
        """Worker body for the 'static' strategy: process one pre-cut slice of the data."""
        if sub_path is None and self.return_results:
            sub_path = self._tmp_result_path(process_id)

        w = open(sub_path, 'w') if sub_path is not None else None
        try:
            for i, d in enumerate(data):
                self.do(process_id, i, d, w)
                if self.verbose:
                    self.terminal_progress_bar(process_id, i + 1, len(data), f"Process{process_id} running...")
        finally:
            if w is not None:
                w.close()

    def _target_queue(self, process_id, sub_path):
        """Worker body for the 'queue' strategy: pull items until the shared queue is drained."""
        # Local import: the module header does not import the stdlib 'queue' module.
        import queue as queue_module

        if sub_path is None and self.return_results:
            sub_path = self._tmp_result_path(process_id)

        w = open(sub_path, 'w') if sub_path is not None else None
        i = 0
        try:
            while True:
                # Checking empty() and then calling a blocking get() is a race:
                # another worker may take the last item in between, leaving this
                # one blocked forever. The queue is filled completely before the
                # workers start, so a timed-out get() means it has been drained.
                try:
                    d = self.q.get(timeout=1)
                except queue_module.Empty:
                    break

                self.do(process_id, i, d, w)
                if self.verbose:
                    self.terminal_progress_bar(process_id, i + 1, self.__len__(), f"Process{process_id} running...")

                i += 1
        finally:
            if w is not None:
                w.close()

    def __len__(self):
        return len(self.data)
433
+
utils/openai_access.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import openai
2
+ import time
3
+ from openai import OpenAI
4
+ # from zhipuai import ZhipuAI
5
+ import requests
6
+ import re
7
+ # from utils.parse_llm_output import try_parse_json_object
8
+
9
def get_oai_completion(prompt):
    """
    Send ``prompt`` as a single user message to the configured chat model.

    Configuration is read from environment variables so that no credential
    lives in the source tree:
        LLM_API_KEY (or DASHSCOPE_API_KEY) - API key, required
        LLM_BASE_URL - OpenAI-compatible endpoint
                       (default: the DashScope compatible-mode endpoint)
        LLM_MODEL    - model name (default: "deepseek-v3")

    SECURITY: an API key was previously hard-coded here and committed; that
    key must be treated as leaked and revoked.

    Args:
        prompt: str - the user prompt.

    Returns:
        The text of the first completion choice, or None when no key is
        configured, the request times out or the API call fails (the error
        is printed).
    """
    import os  # local import: this module's header does not import os

    api_key = os.environ.get("LLM_API_KEY") or os.environ.get("DASHSCOPE_API_KEY")
    base_url = os.environ.get("LLM_BASE_URL", "https://dashscope.aliyuncs.com/compatible-mode/v1")
    model = os.environ.get("LLM_MODEL", "deepseek-v3")

    if not api_key:
        print("No API key configured: set the LLM_API_KEY (or DASHSCOPE_API_KEY) environment variable.")
        return None

    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]

    if "GLM" in model:
        # The ZhipuAI SDK is optional, so import it only when a GLM model is selected.
        try:
            from zhipuai import ZhipuAI
        except ImportError as e:
            print(f"The 'zhipuai' package is required for GLM models: {e}")
            return None
        client = ZhipuAI(api_key=api_key)
        request_kwargs = {"temperature": 0.1, "top_p": 0.7}
    else:
        client = OpenAI(api_key=api_key, base_url=base_url)
        # Sampling settings; adjust them to the task at hand.
        request_kwargs = {"temperature": 0.8, "max_tokens": 8000}

    try:
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            stream=False,
            **request_kwargs,
        )
        return response.choices[0].message.content

    except (openai.APITimeoutError, requests.exceptions.Timeout):
        # The OpenAI SDK reports timeouts as openai.APITimeoutError; the
        # requests exception is kept for other client implementations.
        print("The API request timed out. Please try again later.")
        return None
    except Exception as e:
        print(e)
        return None
86
+
87
def call_chatgpt(ins):
    """
    Call ``get_oai_completion`` with retries.

    ``get_oai_completion`` reports most failures by returning None rather than
    raising, so a None answer is treated as a failed attempt as well; otherwise
    the retry loop would never retry.

    Args:
        ins: str - the prompt to send.

    Returns:
        The model answer. If every attempt fails, the value of the last
        attempt is returned: None when the call returned None, '' when every
        call raised.
    """
    re_try_count = 5  # number of retries after the first attempt
    ans = ''
    while re_try_count >= 0:
        re_try_count -= 1
        try:
            ans = get_oai_completion(ins)
        except Exception as e:
            print(f"Retry times: {re_try_count}; Error: {e}", flush=True)
        else:
            if ans is not None:
                return ans
            print(f"Retry times: {re_try_count}; Error: no response returned", flush=True)

        # Back off before the next attempt, but not after the final one.
        if re_try_count >= 0:
            time.sleep(5)
    return ans
utils/prompts.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ENZYME_PROMPT = """**You are a senior systems biologist.** Analyze the input information to predict ec number using structured reasoning. Crucially, implement a **self-correct mechanism** with these steps:
2
+
3
+ ### Self-Correct Protocol
4
+ 1. **Enzyme Verification**
5
+ - Discard ANY information contradicting the enzyme nature (catalytic activity).
6
+ - Example: If a GO term implies non-enzymatic function (e.g., structural role), reject it immediately.
7
+
8
+ 2. **Conflict Resolution (Majority Rule)**
9
+ - Identify conflicts between:
10
+ - Motif vs. Motif
11
+ - GO term vs. GO term
12
+ - Motif vs. GO term
13
+ - **Resolution Principle**:
14
+ - If one element (A) conflicts with ≥2 logically consistent elements (B,C,D), discard A.
15
+ - Preserve high-confidence information supported by multiple sources.
16
+ - *Note*: Compatible functions (e.g., catalytic activity + cofactor binding) are NOT conflicts.
17
+
18
+ 3. **Output Filtered Information**
19
+ - Explicitly list retained/discarded items with reasons before analysis.
20
+
21
+ ### Final Output Requirement for EC Number
22
+
23
+ After completing the full biological analysis, you **must** conclude your entire response with a special section for automated parsing. This section must adhere to the following precise logic and format:
24
+
25
+ **Decision Logic:**
26
+
27
+ 1. **Default to a Single EC Number:** Your primary goal is to predict the **single, most likely EC number** for the protein's primary catalytic activity.
28
+ 2. **Handling Ambiguity:** If the evidence suggests a single function but points to several possible EC numbers (e.g., a family motif describes related but distinct activities), you must **commit to one choice**. Select the EC number that is most representative, most common, or best supported by the combined evidence. **Do not list multiple options out of uncertainty.**
29
+ 3. **Exception for Bifunctionality:** You may only predict multiple EC numbers if there is **explicit and strong evidence that a single protein is bifunctional**, meaning it contains distinct domains that perform two or more separate catalytic reactions. This requires clear support, such as a motif description explicitly stating "bifunctional" or the presence of multiple, distinct top-level catalytic GO terms (e.g., both a kinase and a cyclase activity).
30
+
31
+ **Formatting Rules:**
32
+
33
+ 1. The section must begin on a new line with the exact tag: `[EC_PREDICTION]`
34
+ 2. **Single Prediction (Standard Case):** Follow the tag with a single space and the predicted EC number.
35
+ * Example: `[EC_PREDICTION] 1.14.99.54`
36
+ 3. **Bifunctional Prediction (Exceptional Case):** List the EC numbers separated by a comma with no spaces.
37
+ * Example: `[EC_PREDICTION] 2.7.1.1,4.6.1.1`
38
+ 4. Do not add any other text, explanation, or punctuation on this line.
39
+ """
40
+
41
+ RELATION_SEMANTIC_PROMPT = """
42
+ relation semantic:
43
+ • is_a: The is a relation forms the basic structure of GO. If we say A is a B, we mean that node A is a subtype of node B. For example, mitotic cell cycle is a cell cycle, or lyase activity is a catalytic activity.
44
+ • part_of: The part of relation is used to represent part-whole relationships. part of has a specific meaning in GO, and a part of relation would only be added between A and B if B is necessarily part of A: wherever B exists, it is as part of A, and the presence of the B implies the presence of A. However, given the occurrence of A, we cannot say for certain that B exists.
45
+ • has part: The logical complement to the part of relation is has part, which represents a part-whole relationship from the perspective of the parent. As with part of, the GO relation has part is only used in cases where A always has B as a part, i.e. where A necessarily has part B. If A exists, B will always exist; however, if B exists, we cannot say for certain that A exists. i.e. all A have part B; some B part of A.
46
+ • ends during: X ends_during Y iff: ((start(Y) before_or_simultaneous_with end(X)) AND end(X) before_or_simultaneous_with end(Y).
47
+ • happens during: X happens_during Y iff: (start(Y) before_or_simultaneous_with start(X)) AND (end(X) before_or_simultaneous_with end(Y))
48
+ • negatively regulates: p negatively regulates q iff p regulates q, and p decreases the rate or magnitude of execution of q.
49
+ • occurs in: b occurs_in c =def b is a process and c is a material entity or immaterial entity& there exists a spatiotemporal region r and b occupies_spatiotemporal_region r.& forall(t) if b exists_at t then c exists_at t & there exist spatial regions s and s’ where & b spatially_projects_onto s at t& c is occupies_spatial_region s’ at t& s is a proper_continuant_part_of s’ at t
50
+ • positively regulates: p positively regulates q iff p regulates q, and p increases the rate or magnitude of execution of q.
51
+ • regulates: A relation that describes case in which one process directly affects the manifestation of another process or quality, i.e. the former regulates the latter. The target of the regulation may be another process, for e.g., regulation of a pathway or an enzymatic reaction, or it may be a quality, such as cell size or pH. Analogously to part of, this relation is used specifically to mean necessarily regulates: if both A and B are present, B always regulates A, but A may not always be regulated by B., i.e. all B regulate A; some A are regulated by B.
52
+ • subproperty of: is used to establish a hierarchy among properties, indicating that a more specific property inherits characteristics from a more general one.
53
+ • inverse of: is used to define the reverse direction of a relationship between the same pair of individuals.
54
+ """
55
+
56
+ FUNCTION_PROMPT = """**You are a senior systems biologist.** Analyze the input information to answer the given question.
57
+ """
58
+
59
+ LLM_SCORE_PROMPT = """As an expert biologist, you are assigned to check one paragraph is aligned with facts or not. You will receive some facts, and
60
+ one paragraph. Score the paragraph between 0 to 100.
61
+ The score should be the format of {"score": score}
62
+ Here's the facts:
63
+ {{ground_truth}}
64
+ Here's the paragraph:
65
+ {{llm_answer}}
66
+ """
utils/protein_go_analysis.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import sys
3
+ import os
4
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
5
+ from collections import defaultdict
6
+
7
# Module-level caches, filled lazily on first use.
_go_data = None
_protein_go_dict = None
# Path each cache was loaded from, so that a call naming a different file
# reloads instead of answering from stale data.
_go_data_path = None
_protein_go_dict_path = None
# {node IRI: definition text} index over _go_data, built on the first lookup.
_go_definition_index = None


def _load_go_data(go_info_path):
    """Lazily load the GO graph JSON into ``_go_data`` (None if loading fails)."""
    global _go_data, _go_data_path, _go_definition_index
    if _go_data is not None and _go_data_path == go_info_path:
        return

    _go_data_path = go_info_path
    _go_definition_index = None  # any index belongs to the previous data
    try:
        with open(go_info_path, 'r', encoding='utf-8') as f:
            _go_data = json.load(f)
    except Exception as e:
        print(f"加载GO数据文件时发生错误: {str(e)}")
        _go_data = None


def _load_protein_go_dict(protein2gopath):
    """Lazily load the protein -> GO ids JSON-lines file into ``_protein_go_dict``."""
    global _protein_go_dict, _protein_go_dict_path
    if _protein_go_dict is not None and _protein_go_dict_path == protein2gopath:
        return

    _protein_go_dict_path = protein2gopath
    try:
        mapping = {}
        with open(protein2gopath, 'r', encoding='utf-8') as f:
            for line in f:
                # A blank (e.g. trailing) line is not a record and must not
                # abort the whole load.
                if not line.strip():
                    continue
                data = json.loads(line)
                mapping[data['protein_id']] = data['GO_id']
        _protein_go_dict = mapping
    except Exception as e:
        print(f"加载蛋白质-GO映射数据时发生错误: {str(e)}")
        _protein_go_dict = None


def get_go_definition(go_id, go_info_path):
    """
    Return the definition text of a GO term.

    Args:
        go_id: str - GO id, accepted as "0005515", "GO_0005515" or "GO:0005515".
        go_info_path: str - path to the GO graph JSON file.

    Returns:
        The definition string, or None when the GO data cannot be loaded, the
        term is unknown or the term has no definition.
    """
    global _go_definition_index
    _load_go_data(go_info_path)
    if _go_data is None:
        return None

    if _go_definition_index is None:
        # Index the definitions once instead of scanning every node per lookup.
        # setdefault keeps the first defining node for an id, matching the
        # result of a front-to-back scan.
        index = {}
        for node in _go_data['graphs'][0]['nodes']:
            if 'meta' in node and 'definition' in node['meta']:
                index.setdefault(node['id'], node['meta']['definition']['val'])
        _go_definition_index = index

    go_id = go_id.replace(':', '_')
    if not go_id.startswith('GO_'):
        go_id = f"GO_{go_id}"
    full_id = f"http://purl.obolibrary.org/obo/{go_id}"

    return _go_definition_index.get(full_id)
51
+
52
def analyze_protein_go(protein_id, protein2gopath, go_info_path):
    """
    Collect the GO annotations of a protein: its GO ids and their definitions.

    Args:
        protein_id: str - protein identifier.
        protein2gopath: str - path to the protein -> GO mapping file.
        go_info_path: str - path to the GO graph JSON used for definitions.

    Returns:
        dict - on success: "status" == "success", "protein_id",
        "go_annotations" (one {"go_id": ...} entry per annotated id) and
        "all_related_definitions" (GO id -> definition, only for ids that have
        one). On failure: "status" == "error" and a "message".
    """
    _load_protein_go_dict(protein2gopath)

    # Guard clauses: mapping could not be loaded / protein not annotated.
    if _protein_go_dict is None:
        return {"status": "error", "message": "GO数据加载失败"}
    if protein_id not in _protein_go_dict:
        return {"status": "error", "message": f"未找到蛋白质 {protein_id} 的GO注释"}

    annotated_ids = _protein_go_dict[protein_id]

    definitions = {}
    annotations = []
    for gid in annotated_ids:
        text = get_go_definition(gid, go_info_path)
        if text:
            definitions[gid] = text
        annotations.append({"go_id": gid})

    return {
        "status": "success",
        "protein_id": protein_id,
        "go_annotations": annotations,
        "all_related_definitions": definitions,
    }
96
+
97
# Usage example
if __name__ == "__main__":
    import argparse

    cli = argparse.ArgumentParser(description='Analyze protein GO annotations')
    cli.add_argument('--protein_id', type=str, default='A8CF74')
    cli.add_argument('--protein2gopath', type=str,
                     default='data/processed_data/go_integration_final_topk2.json')
    cli.add_argument('--go_info_path', type=str, default='data/raw_data/go.json')
    opts = cli.parse_args()

    report = analyze_protein_go(opts.protein_id, opts.protein2gopath, opts.go_info_path)

    if report["status"] != "success":
        print(report["message"])
    else:
        print(f"\nProtein {report['protein_id']} GO annotations:")

        for annotation in report["go_annotations"]:
            print(f"\nGO ID: {annotation['go_id']}")

        print("\nAll related GO ID definitions:")
        for term_id, term_definition in report["all_related_definitions"].items():
            print(f"\nGO:{term_id}")
            print(f"Definition: {term_definition}")
utils/utils.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from Bio import ExPASy
2
+ from Bio import SeqIO
3
+ import json
4
+ from Bio.Blast import NCBIXML
5
+
6
def get_protein_sequence_biopython(uniprot_id):
    """
    Fetch a protein sequence by UniProt ID using BioPython.

    Args:
        uniprot_id (str): UniProt ID (e.g. P12345).

    Returns:
        str: the protein sequence, or a string of the form "Error: <reason>"
        when anything in the lookup raises.
    """
    try:
        with ExPASy.get_sprot_raw(uniprot_id) as swiss_handle:
            record = SeqIO.read(swiss_handle, "swiss")
            sequence_text = str(record.seq)
        return sequence_text
    except Exception as err:
        # Failures are reported through the return value, not raised.
        return "Error: " + str(err)
22
+
23
+
24
def extract_interproscan_metrics(file_path, librarys="PFAM"):
    """
    Extract per-protein domain hits and GO terms from an InterProScan JSON result.

    Args:
        file_path (str): path of the InterProScan JSON result file.
        librarys (str | list[str]): signature library name(s) to collect.
            A single name may be given as a plain string (the default,
            "PFAM"); it is treated as a one-element list.

    Returns:
        dict: protein sequence -> domain info, where domain info maps each
        requested library to a list of {signature accession: entry accession
        or None} and "GO" to the list of GO ids found (duplicates are kept).
    """
    # A bare string would be iterated character by character when the result
    # keys are created and matched by substring below, which ends in a
    # KeyError on the first hit. Normalise it to a list first.
    if isinstance(librarys, str):
        librarys = [librarys]

    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    protein_info = {}
    for protein in data["results"]:
        sequence = protein["sequence"]
        domain_info = {library: [] for library in librarys}
        domain_info["GO"] = []

        for match in protein["matches"]:
            signature = match["signature"]
            entry = signature["entry"]
            library = signature["signatureLibraryRelease"]["library"]

            if library in librarys:
                # Signatures without an integrated entry are kept with None.
                entry_accession = entry["accession"] if entry else None
                domain_info[library].append({signature["accession"]: entry_accession})

            # GO information.
            # NOTE(review): collected from every match that has an entry, not
            # only from the requested libraries - confirm this is intended.
            if entry:
                for go_xref in entry.get("goXRefs") or []:
                    if go_xref["databaseName"] == "GO":
                        domain_info["GO"].append(go_xref["id"])

        protein_info[sequence] = domain_info

    return protein_info
63
+
64
+
65
def get_seqnid(file_path):
    """
    Parse a FASTA file into a mapping of sequence ID -> sequence.

    The ID is the first whitespace-delimited token after ">". A header that
    has nothing after ">" is stored under the empty string instead of raising
    IndexError. Sequence lines are stripped and concatenated; text before the
    first header is discarded, and a repeated ID overwrites the earlier one.

    Args:
        file_path (str): path of the FASTA file.

    Returns:
        dict: sequence ID -> sequence string.
    """
    seq_dict = {}
    current_header = None
    current_seq = []

    with open(file_path, 'r') as f:
        for line in f:
            line = line.strip()
            if line.startswith(">"):
                # Flush the record collected so far before starting a new one.
                if current_header is not None:
                    seq_dict[current_header] = "".join(current_seq)
                tokens = line[1:].split()
                # Take only the first token; a bare ">" has no tokens at all.
                current_header = tokens[0] if tokens else ""
                current_seq = []
            elif line:
                current_seq.append(line)

    # The last record has no following header to trigger the flush.
    if current_header is not None:
        seq_dict[current_header] = "".join(current_seq)

    return seq_dict
85
+
86
+
87
def tofasta(fasta_path, uids, seqs, *, line_width=None):
    """
    Write sequences in FASTA format to a file.

    Parameters:
    - fasta_path: str, path to the output FASTA file
    - uids: list of str, sequence identifiers (headers)
    - seqs: list of str, corresponding sequences
    - line_width: int or None, maximum number of residues per sequence line;
      None (the default) writes each sequence on a single line

    Raises:
    - ValueError: if uids and seqs differ in length, or line_width is given
      and is smaller than 1
    """
    if len(uids) != len(seqs):
        raise ValueError("Length of uids and seqs must be equal")
    if line_width is not None and line_width < 1:
        raise ValueError("line_width must be a positive integer")

    with open(fasta_path, 'w') as f:
        for uid, seq in zip(uids, seqs):
            # Header line: '>' followed by the uid
            f.write(f">{uid}\n")
            if line_width is None:
                f.write(f"{seq}\n")
                continue
            text = str(seq)
            # An empty sequence still gets one (empty) line, exactly as in
            # the unwrapped output.
            chunks = [
                text[start:start + line_width]
                for start in range(0, len(text), line_width)
            ] or [""]
            f.writelines(chunk + "\n" for chunk in chunks)
105
+
106
+
107
def extract_blast_metrics(xml_file):
    """
    Collect per-HSP metrics from a BLAST XML result file.

    Every HSP of every alignment of a query record yields one dict with:
    - "ID": the second "|"-separated field of the hit id when it contains "|"
      (e.g. sp|A0A0H2ZM56|ADHE_STRP2), otherwise the whole hit id
    - "Identity%": identities / alignment length * 100, rounded to 2 places
    - "Coverage%": alignment length / query length * 100, rounded to 2 places
    - "E-value": a "%.1e"-formatted string when below 0.001, otherwise the
      value rounded to 4 places
    - "Bit Score": bit score rounded to 1 place
    - "Positive%": positives / alignment length * 100, rounded to 2 places

    Args:
        xml_file (str): path of the BLAST XML output.

    Returns:
        dict: query -> list of the metric dicts described above.
    """
    metrics_by_query = {}
    with open(xml_file) as handle:
        # The loop stays inside the with-block so the file is still open
        # while the records are being read.
        for record in NCBIXML.parse(handle):
            query_len = record.query_length
            rows = []

            for alignment in record.alignments:
                for hsp in alignment.hsps:
                    raw_id = alignment.hit_id
                    if "|" in raw_id:
                        uniprot_id = raw_id.split("|")[1]
                    else:
                        uniprot_id = raw_id

                    aligned = hsp.align_length
                    identity_pct = (hsp.identities / aligned) * 100
                    coverage_pct = (aligned / query_len) * 100
                    positive_pct = (hsp.positives / aligned) * 100

                    if hsp.expect < 0.001:
                        e_value = f"{hsp.expect:.1e}"
                    else:
                        e_value = round(hsp.expect, 4)

                    rows.append({
                        "ID": uniprot_id,
                        "Identity%": round(identity_pct, 2),
                        "Coverage%": round(coverage_pct, 2),
                        "E-value": e_value,
                        "Bit Score": round(hsp.bits, 1),
                        "Positive%": round(positive_pct, 2),
                    })

            metrics_by_query[record.query] = rows
    return metrics_by_query
146
+
147
+
148
def rename_interproscan_keys(interproscan_results):
    """
    Return a copy of the mapping with its keys renamed.

    "PFAM" becomes "pfam_id", "GO" becomes "go_id", and every other key is
    lower-cased. Values are carried over unchanged.
    """
    fixed_names = {"PFAM": "pfam_id", "GO": "go_id"}
    return {
        fixed_names.get(name, name.lower()): payload
        for name, payload in interproscan_results.items()
    }