Spaces:

ericzhang1122
/

protein_rag

Runtime error

App Files Files Community

ericzhang1122 committed 13 days ago

Commit

5c20520

verified ·

1 Parent(s): d6a3a76

Upload folder using huggingface_hub

Browse files

Files changed (30) hide show

.gitignore +26 -0
README.md +2 -8
cal_llm_score.py +328 -0
calculate_ec_accuracy.py +312 -0
demo.py +495 -0
demo.sh +19 -0
example/difference_20241122_ec_dict_list.fasta +20 -0
example/difference_20241122_ec_dict_list.pkl +3 -0
example/protein_go_clean.fasta +0 -0
example/protein_go_clean.json +606 -0
example/test_nt_seqs.fasta +94 -0
example/test_proteins.fasta +4 -0
go_integration_pipeline.py +374 -0
integrated_pipeline.py +520 -0
interproscan.py +107 -0
pipeline.py +78 -0
readme.md +120 -0
setup.sh +129 -0
test_data/interproscan_info.json +0 -0
utils/cal_pr.py +154 -0
utils/functions.py +56 -0
utils/generate_llm_answers.py +166 -0
utils/generate_llm_answers4enzyme.py +151 -0
utils/generate_protein_prompt.py +413 -0
utils/get_motif.py +63 -0
utils/mpr.py +433 -0
utils/openai_access.py +99 -0
utils/prompts.py +66 -0
utils/protein_go_analysis.py +119 -0
utils/utils.py +158 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,26 @@

+data/
+output/
+interproscan/
+blast_db/
+*.pyc
+# other
+__pycache__/*
+.gradio/*
+install_blast.sh
+blast.py
+# data
+output/*
+# link
+interproscan
+# data
+evolla_test_data/*
+processed_data/*
+downloads/*
+test/*
+test_interproscan.py

README.md CHANGED Viewed

@@ -1,12 +1,6 @@
 ---
-title: Protein Rag
-emoji: 📉
-colorFrom: yellow
-colorTo: purple
 sdk: gradio
 sdk_version: 5.35.0
-app_file: app.py
-pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: protein_rag
+app_file: demo.py
 sdk: gradio
 sdk_version: 5.35.0
 ---

cal_llm_score.py ADDED Viewed

	@@ -0,0 +1,328 @@

+import sys
+import os
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+import json
+from pathlib import Path
+from tqdm import tqdm
+import pandas as pd
+import numpy as np
+from utils.openai_access import call_chatgpt
+from utils.mpr import MultipleProcessRunnerSimplifier
+from utils.prompts import LLM_SCORE_PROMPT
+import re
qa_data = {}


def load_qa_results_from_dir(results_dir):
    """Load all QA result files from a directory into the global ``qa_data``.

    Every ``*.json`` file directly inside ``results_dir`` is parsed. A record
    is kept only when it contains the keys ``index``, ``protein_id``,
    ``ground_truth`` and ``llm_answer``; it is stored under its ``index``.
    Files that fail to load are reported on stdout and skipped.

    Args:
        results_dir: Directory containing the per-question result files.

    Returns:
        The module-level ``qa_data`` dict, rebuilt from scratch.
    """
    global qa_data
    qa_data = {}
    required_keys = ('index', 'protein_id', 'ground_truth', 'llm_answer')
    json_files = list(Path(results_dir).glob("*.json"))
    print(f"找到 {len(json_files)} 个结果文件")
    for json_file in tqdm(json_files, desc="加载QA结果"):
        try:
            # Decode explicitly as UTF-8 (the JSON default) instead of the
            # platform locale encoding, which fails on non-ASCII answers.
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            if all(key in data for key in required_keys):
                qa_data[data['index']] = data
        except Exception as e:
            print(f"加载文件 {json_file} 时出错: {e}")
    print(f"成功加载 {len(qa_data)} 个QA对")
    return qa_data
def extract_score_from_response(response):
    """Extract the numeric score from an LLM scoring response.

    Strategies are tried in order for string responses:

    1. a ``{... "score": ...}`` JSON object embedded in the text;
    2. a ``"score": <number>`` fragment (used when the JSON is malformed);
    3. the first number in the text, accepted only if it lies in [0, 100].

    Args:
        response: The raw response, either a string or an already-parsed dict.

    Returns:
        The score (a number), or ``None`` when nothing usable is found.
    """
    if not response:
        return None
    if isinstance(response, dict):
        return response.get('score')
    if not isinstance(response, str):
        return None

    json_match = re.search(r'\{[^}]*"score"[^}]*\}', response)
    if json_match:
        try:
            score = json.loads(json_match.group()).get('score')
        except json.JSONDecodeError:
            # Malformed JSON (e.g. a trailing comma): fall through to the
            # regex strategies instead of giving up on the whole response.
            pass
        else:
            if isinstance(score, str):
                # Models sometimes quote the number; normalise it so that
                # downstream statistics receive a numeric value.
                try:
                    return float(score)
                except ValueError:
                    return None
            return score

    score_match = re.search(r'"score":\s*(\d+(?:\.\d+)?)', response)
    if score_match:
        return float(score_match.group(1))

    number_match = re.search(r'(\d+(?:\.\d+)?)', response)
    if number_match:
        score = float(number_match.group(1))
        if 0 <= score <= 100:
            return score
    return None
def process_single_scoring(process_id, idx, qa_index, writer, save_dir):
    """Score one QA pair with the LLM judge and save the result as JSON.

    The QA record is looked up in the global ``qa_data``; the scoring prompt
    is built from ``LLM_SCORE_PROMPT`` by substituting the ground truth and
    the LLM answer. The result is written to
    ``<save_dir>/score_<protein_id>_<qa_index>.json``. Any failure is reported
    on stdout and swallowed so that one bad item does not stop the worker.

    Args:
        process_id: Worker id supplied by the process runner (unused here).
        idx: Position of the item inside the worker's share (unused here).
        qa_index: Key of the QA record in ``qa_data``.
        writer: Writer object supplied by the process runner (unused here).
        save_dir: Directory that receives the score file.
    """
    try:
        qa_item = qa_data[qa_index]
        protein_id = qa_item['protein_id']
        question = qa_item.get('question', '')
        ground_truth = qa_item['ground_truth']
        llm_answer = qa_item['llm_answer']
        # Build the scoring prompt
        scoring_prompt = LLM_SCORE_PROMPT.replace('{{ground_truth}}', str(ground_truth))
        scoring_prompt = scoring_prompt.replace('{{llm_answer}}', str(llm_answer))
        # Ask the LLM for a score
        score_response = call_chatgpt(scoring_prompt)
        score = extract_score_from_response(score_response)
        result = {
            'index': qa_index,
            'protein_id': protein_id,
            'question': question,
            'ground_truth': ground_truth,
            'llm_answer': llm_answer,
            'score': score,
            'raw_score_response': score_response
        }
        save_path = os.path.join(save_dir, f"score_{protein_id}_{qa_index}.json")
        # ensure_ascii=False emits raw non-ASCII characters, so the file must
        # be opened as UTF-8 rather than with the locale default encoding.
        with open(save_path, 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=2, ensure_ascii=False)
    except Exception as e:
        print(f"处理QA索引 {qa_index} 时出错: {str(e)}")
def get_missing_score_indices(save_dir):
    """Return the QA indices that still lack a usable score file.

    For every key in the global ``qa_data`` the expected file
    ``score_<protein_id>_<qa_index>.json`` is checked. An index is reported
    when the file is missing, cannot be parsed, is not a JSON object, or has
    no non-null ``score``. Unusable files are deleted so the next scoring
    round recreates them.

    Args:
        save_dir: Directory holding the score files.

    Returns:
        A set of QA indices that need (re)scoring.
    """
    save_path = Path(save_dir)
    problem_qa_indices = set()
    for qa_index in tqdm(list(qa_data.keys()), desc="检查打分文件"):
        protein_id = qa_data[qa_index]['protein_id']
        json_file = save_path / f"score_{protein_id}_{qa_index}.json"
        if not json_file.exists():
            problem_qa_indices.add(qa_index)
            continue
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            is_valid = isinstance(data, dict) and data.get('score') is not None
        except Exception:
            is_valid = False
        if is_valid:
            continue
        problem_qa_indices.add(qa_index)
        # Delete only after the handle is closed; removing an open file
        # fails on some platforms.
        try:
            json_file.unlink()
        except OSError:
            pass
    return problem_qa_indices
def collect_scores_to_json(save_dir, output_json):
    """Merge all per-item score files into one JSON file and a DataFrame.

    Args:
        save_dir: Directory containing ``score_*.json`` files.
        output_json: Path of the combined JSON file to write.

    Returns:
        A ``pandas.DataFrame`` with one row per score file that could be read
        (columns: index, protein_id, question, ground_truth, llm_answer,
        score). It has no columns when no file was read.
    """
    def index_sort_key(item):
        # Every row has an 'index' key, so a .get() default would never
        # apply; handle None (sorted last) and str/int mixes explicitly to
        # avoid a TypeError while sorting.
        index = item['index']
        if index is None:
            return (True, False, 0)
        return (False, isinstance(index, str), index)

    results = []
    score_files = list(Path(save_dir).glob("score_*.json"))
    for score_file in tqdm(score_files, desc="收集打分结果"):
        try:
            with open(score_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            results.append({
                'index': data.get('index'),
                'protein_id': data.get('protein_id'),
                'question': data.get('question', ''),
                'ground_truth': data.get('ground_truth'),
                'llm_answer': data.get('llm_answer'),
                'score': data.get('score')
            })
        except Exception as e:
            print(f"读取文件 {score_file} 时出错: {e}")
    results.sort(key=index_sort_key)
    with open(output_json, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    print(f"打分结果已保存到: {output_json}")
    # Hand the rows back as a DataFrame for the statistics step
    return pd.DataFrame(results)
def analyze_scores(df):
    """Print and return summary statistics for the collected scores.

    Rows whose ``score`` is missing are ignored for all statistics. Scores
    are bucketed into 0-20, 21-40, 41-60, 61-80 and 81-100; every bucket
    includes its upper bound, and only the first one includes its lower
    bound, so a boundary score such as 20 is counted exactly once.

    Args:
        df: DataFrame with a ``score`` column and, optionally, a
            ``protein_id`` column.

    Returns:
        A JSON-serialisable dict with the keys ``basic_stats``,
        ``distribution`` and ``quantiles``, or ``None`` when there is no
        valid score.
    """
    print("\n=== 打分结果统计分析 ===")
    # A DataFrame built from an empty result list has no columns at all.
    if 'score' not in df.columns:
        print("没有有效的打分结果")
        return
    valid_scores = df.loc[df['score'].notna(), 'score']
    if len(valid_scores) == 0:
        print("没有有效的打分结果")
        return
    total_count = len(df)
    valid_count = len(valid_scores)
    print(f"总样本数: {total_count}")
    print(f"有效打分数: {valid_count}")
    print(f"无效打分数: {total_count - valid_count}")
    print(f"有效率: {valid_count/total_count*100:.2f}%")
    print("\n分数统计:")
    print(f"平均分: {valid_scores.mean():.2f}")
    print(f"中位数: {valid_scores.median():.2f}")
    print(f"标准差: {valid_scores.std():.2f}")
    print(f"最高分: {valid_scores.max():.2f}")
    print(f"最低分: {valid_scores.min():.2f}")

    # Score distribution, computed once and reused for printing and output
    bins = [0, 20, 40, 60, 80, 100]
    labels = ['0-20', '21-40', '41-60', '61-80', '81-100']
    distribution = {}
    for i, (low, high) in enumerate(zip(bins[:-1], bins[1:])):
        above_low = valid_scores >= low if i == 0 else valid_scores > low
        # int() keeps the count a plain Python int so json.dump accepts it.
        count = int((above_low & (valid_scores <= high)).sum())
        distribution[labels[i]] = {
            "count": count,
            "percentage": count / valid_count * 100
        }
    print("\n分数分布:")
    for label, bucket in distribution.items():
        print(f"{label}: {bucket['count']} ({bucket['percentage']:.1f}%)")

    # Quantiles
    print("\n分位数:")
    quantiles = [0.1, 0.25, 0.5, 0.75, 0.9]
    for q in quantiles:
        print(f"{int(q*100)}%分位数: {valid_scores.quantile(q):.2f}")

    # Per-protein breakdown, shown only when there is more than one protein
    if 'protein_id' in df.columns and len(df['protein_id'].unique()) > 1:
        print("\n按蛋白质ID分析:")
        protein_stats = df[df['score'].notna()].groupby('protein_id')['score'].agg(['count', 'mean', 'std']).round(2)
        print(protein_stats.head(10))

    return {
        "basic_stats": {
            "total_samples": total_count,
            "valid_scores": valid_count,
            "invalid_scores": total_count - valid_count,
            "valid_rate": valid_count / total_count * 100,
            "mean_score": float(valid_scores.mean()),
            "median_score": float(valid_scores.median()),
            "std_score": float(valid_scores.std()),
            "max_score": float(valid_scores.max()),
            "min_score": float(valid_scores.min())
        },
        "distribution": distribution,
        "quantiles": {
            f"{int(q*100)}%": float(valid_scores.quantile(q)) for q in quantiles
        }
    }
def main():
    """Score all QA pairs with the LLM judge, then save results and statistics.

    Scoring is repeated for up to ``--max_iterations`` rounds; each round only
    processes the items that still lack a valid score file. Afterwards the
    per-item files are merged into ``--output_json`` and summary statistics
    are written to ``--stats_json``.
    """
    import argparse

    def ensure_parent_dir(file_path):
        # os.path.dirname() is '' for a bare file name and os.makedirs('')
        # raises FileNotFoundError, so only create a parent that exists in
        # the path.
        parent = os.path.dirname(file_path)
        if parent:
            os.makedirs(parent, exist_ok=True)

    parser = argparse.ArgumentParser()
    parser.add_argument("--results_dir", type=str,
                      default="data/evolla_hard_motif_go",
                      help="包含LLM答案结果的目录")
    parser.add_argument("--n_process", type=int, default=32,
                      help="并行进程数")
    parser.add_argument("--save_dir", type=str,
                      default="data/llm_scores",
                      help="保存打分结果的目录")
    parser.add_argument("--output_json", type=str,
                      default="data/llm_scores_results.json",
                      help="输出JSON文件路径")
    parser.add_argument("--stats_json", type=str,
                      default="data/llm_scores_stats.json",
                      help="统计分析结果JSON文件路径")
    parser.add_argument("--max_iterations", type=int, default=3,
                      help="最大迭代次数")
    args = parser.parse_args()
    # Create every output location up front
    os.makedirs(args.save_dir, exist_ok=True)
    ensure_parent_dir(args.output_json)
    ensure_parent_dir(args.stats_json)
    # Load the QA results to be scored
    load_qa_results_from_dir(args.results_dir)
    if not qa_data:
        print("没有找到有效的QA结果数据")
        return
    # Score in rounds until nothing is missing or the round limit is hit
    iteration = 0
    while iteration < args.max_iterations:
        iteration += 1
        print(f"\n开始第 {iteration} 轮打分")
        missing_indices = get_missing_score_indices(args.save_dir)
        if not missing_indices:
            print("所有QA对已完成打分！")
            break
        print(f"发现 {len(missing_indices)} 个待打分的QA对")
        missing_indices_list = sorted(list(missing_indices))
        # Fan the pending items out over worker processes
        mprs = MultipleProcessRunnerSimplifier(
            data=missing_indices_list,
            do=lambda process_id, idx, qa_index, writer: process_single_scoring(process_id, idx, qa_index, writer, args.save_dir),
            n_process=args.n_process,
            split_strategy="static"
        )
        mprs.run()
        print(f"第 {iteration} 轮打分完成")
    # Merge the per-item files
    df = collect_scores_to_json(args.save_dir, args.output_json)
    # Compute statistics (None when there is no valid score)
    stats_result = analyze_scores(df)
    with open(args.stats_json, 'w', encoding='utf-8') as f:
        json.dump(stats_result, f, indent=2, ensure_ascii=False)
    print(f"统计分析结果已保存到: {args.stats_json}")
    # Final completeness check
    final_missing = get_missing_score_indices(args.save_dir)
    if final_missing:
        print(f"\n仍有 {len(final_missing)} 个QA对未能成功打分")
    else:
        print(f"\n所有 {len(qa_data)} 个QA对已成功完成打分！")


if __name__ == "__main__":
    main()

calculate_ec_accuracy.py ADDED Viewed

	@@ -0,0 +1,312 @@

+import pickle
+import json
+import os
+import sys
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+import re
+from collections import defaultdict
def load_ground_truth(pkl_file):
    """Read the ground-truth pickle and map each protein to its EC numbers.

    Each record must provide ``uniprot_id``. EC numbers are taken from
    ``record['ec'][i]['reaction']['ecNumber']`` wherever those keys exist.

    Args:
        pkl_file: Path to the pickled list of records.

    Returns:
        Dict mapping UniProt id to a set of EC number strings (possibly empty).
    """
    # NOTE(review): pickle.load can execute arbitrary code embedded in the
    # file; only use this with pickles from a trusted source.
    with open(pkl_file, 'rb') as handle:
        records = pickle.load(handle)
    gt_dict = {}
    for record in records:
        uniprot_id = record['uniprot_id']
        ec_entries = record['ec'] if 'ec' in record else ()
        # A set removes duplicate EC numbers for the same protein
        gt_dict[uniprot_id] = {
            entry['reaction']['ecNumber']
            for entry in ec_entries
            if 'reaction' in entry and 'ecNumber' in entry['reaction']
        }
    return gt_dict
def extract_ec_prediction(json_content):
    """Return the EC numbers listed on the ``[EC_PREDICTION]`` line.

    Only the text between the first ``[EC_PREDICTION]`` tag and the next real
    line break is searched. Partial EC numbers that use ``-`` for unknown
    levels (``1.2.3.-``, ``1.2.-.-``, ``1.-.-.-``) are accepted.

    Args:
        json_content: Text that may contain the tag.

    Returns:
        List of EC number strings in order of appearance; empty when the tag
        is absent.
    """
    # NOTE(review): load_predictions passes raw JSON file text, where line
    # breaks inside string values are the two characters backslash + n and
    # therefore do not end the captured line - confirm this is intended.
    tag_match = re.search(r'\[EC_PREDICTION\]\s*([^\n\r]*)', json_content)
    if tag_match is None:
        return []
    prediction_line = tag_match.group(1).strip()
    return re.findall(r'\b\d+\.(?:\d+|-)\.(?:\d+|-)\.(?:\d+|-)', prediction_line)
def load_predictions(predictions_dir):
    """Load the predicted EC numbers from every ``.json`` file in a directory.

    The protein id is the file name without its ``.json`` suffix. Each file is
    read as plain text and passed to ``extract_ec_prediction``. Files that
    cannot be read are reported on stdout and skipped.

    Args:
        predictions_dir: Directory containing one prediction file per protein.

    Returns:
        Dict mapping protein id to the list of predicted EC numbers.
    """
    suffix = '.json'
    predictions = {}
    for filename in os.listdir(predictions_dir):
        if not filename.endswith(suffix):
            continue
        # Slice off only the trailing suffix; str.replace would also remove
        # a '.json' that appears earlier in the name.
        uniprot_id = filename[:-len(suffix)]
        filepath = os.path.join(predictions_dir, filename)
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                content = f.read()
            predictions[uniprot_id] = extract_ec_prediction(content)
        except Exception as e:
            print(f"处理文件 {filename} 时出错: {e}")
    return predictions
def calculate_accuracy(ground_truth, predictions, level=4):
    """Compute top-1 EC accuracy at the given hierarchy depth.

    Only proteins that have at least one prediction are counted. A protein is
    correct when the first ``level`` dot-separated fields of its first
    predicted EC number equal those of any ground-truth EC number.

    Args:
        ground_truth: Dict mapping protein id to an iterable of EC numbers.
        predictions: Dict mapping protein id to a list of predicted EC numbers.
        level: Number of leading EC fields to compare (1-4).

    Returns:
        Tuple ``(accuracy, correct, total)``; accuracy is 0 when total is 0.
    """
    correct = 0
    total = 0
    for uniprot_id, gt_ecs in ground_truth.items():
        if uniprot_id not in predictions or not predictions[uniprot_id]:
            continue
        # Only the top-ranked prediction is evaluated
        top_prediction = predictions[uniprot_id][0]
        matched = any(
            gt_ec.split('.')[:level] == top_prediction.split('.')[:level]
            for gt_ec in gt_ecs
        )
        if matched:
            correct += 1
        total += 1
    accuracy = correct / total if total > 0 else 0
    return accuracy, correct, total
def calculate_prf1(ground_truth, predictions, level=4):
    """Compute micro-averaged precision, recall and F1 for EC prediction.

    EC numbers are truncated to their first ``level`` dot-separated fields
    before comparison. Proteins with an empty ground-truth set are skipped.
    A protein with ground truth but no entry in ``predictions``, or with an
    empty prediction list, contributes all of its truncated EC numbers as
    false negatives.

    Args:
        ground_truth: Dict mapping protein id to a set of EC numbers.
        predictions: Dict mapping protein id to a list of predicted EC numbers.
        level: Number of leading EC fields to compare (1-4).

    Returns:
        Dict with ``precision``, ``recall``, ``f1_score``, the ``tp``/``fp``/
        ``fn`` totals, ``evaluated_proteins`` (proteins with non-empty ground
        truth that appear in ``predictions``) and ``incorrect_proteins``
        (per-category lists describing each error).
    """
    def truncate(ec_numbers):
        # '1.2.3.4' -> '1.2.3' for level=3
        return set('.'.join(ec.split('.')[:level]) for ec in ec_numbers)

    total_tp = total_fp = total_fn = 0
    incorrect_proteins = {
        'false_positives': [],  # predicted but not in the ground truth
        'false_negatives': [],  # in the ground truth but not predicted
        'no_prediction': [],    # protein absent from the predictions
        'zero_prediction': []   # protein present with zero predicted ECs
    }
    for uniprot_id, gt_ecs_set in ground_truth.items():
        has_entry = uniprot_id in predictions
        if has_entry:
            pred_ecs_set = set(predictions[uniprot_id])
        # Proteins without ground truth are never evaluated
        if not gt_ecs_set:
            continue
        level_gt = truncate(gt_ecs_set)
        if not has_entry:
            total_fn += len(level_gt)
            incorrect_proteins['no_prediction'].append({
                'protein_id': uniprot_id,
                'gt_ecs': list(level_gt)
            })
            continue
        if not pred_ecs_set:
            total_fn += len(level_gt)
            incorrect_proteins['zero_prediction'].append({
                'protein_id': uniprot_id,
                'gt_ecs': list(level_gt)
            })
            continue
        level_pred = truncate(pred_ecs_set)
        tp = len(level_pred.intersection(level_gt))
        fp = len(level_pred) - tp
        fn = len(level_gt) - tp
        total_tp += tp
        total_fp += fp
        total_fn += fn
        if fp > 0:
            incorrect_proteins['false_positives'].append({
                'protein_id': uniprot_id,
                'predicted_ecs': list(level_pred - level_gt),
                'gt_ecs': list(level_gt)
            })
        if fn > 0:
            incorrect_proteins['false_negatives'].append({
                'protein_id': uniprot_id,
                'missed_ecs': list(level_gt - level_pred),
                'predicted_ecs': list(level_pred)
            })
    # Micro-average over all proteins
    predicted_total = total_tp + total_fp
    actual_total = total_tp + total_fn
    precision = total_tp / predicted_total if predicted_total > 0 else 0
    recall = total_tp / actual_total if actual_total > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    total_proteins_evaluated = sum(
        1 for uid in ground_truth if uid in predictions and ground_truth[uid]
    )
    return {
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'tp': total_tp,
        'fp': total_fp,
        'fn': total_fn,
        'evaluated_proteins': total_proteins_evaluated,
        'incorrect_proteins': incorrect_proteins
    }
def main():
    """Evaluate EC predictions against the ground truth and save the metrics.

    Precision, recall and F1 are computed at EC levels 1-4 for proteins that
    appear in both the ground truth and the predictions and have at least one
    ground-truth EC number. A report is printed and the metrics are written
    to ``test_results/ec_accuracy_results.json``.
    """
    import argparse
    parser = argparse.ArgumentParser(description='Calculate EC accuracy')
    parser.add_argument('--pkl_file', type=str, default='data/raw_data/difference_20241122_ec_dict_list.pkl')
    parser.add_argument('--predictions_dir', type=str, default='data/clean_test_results_top2go_deepseek-r1')
    args = parser.parse_args()
    pkl_file = args.pkl_file
    predictions_dir = args.predictions_dir
    print("正在加载ground truth数据...")
    ground_truth = load_ground_truth(pkl_file)
    print(f"加载了 {len(ground_truth)} 个蛋白的ground truth数据")
    print("正在加载预测结果...")
    predictions = load_predictions(predictions_dir)
    print(f"加载了 {len(predictions)} 个蛋白的预测结果")
    # Keep proteins present on both sides that have ground-truth EC numbers
    common_ids = set(ground_truth.keys()) & set(predictions.keys())
    valid_ids = {uid for uid in common_ids if ground_truth[uid]}
    print(f"共同且有GT的蛋白数量: {len(valid_ids)}")
    filtered_gt = {uid: ground_truth[uid] for uid in valid_ids}
    filtered_pred = {uid: predictions[uid] for uid in valid_ids}
    # Precision / recall / F1 for each EC level
    results = {}
    print("\n=== 评估结果 ===")
    for level in [1, 2, 3, 4]:
        metrics = calculate_prf1(filtered_gt, filtered_pred, level=level)
        results[level] = metrics
        print(f"--- EC号前{level}级 ---")
        print(f"  Precision: {metrics['precision']:.4f}")
        print(f"  Recall:    {metrics['recall']:.4f}")
        print(f"  F1-Score:  {metrics['f1_score']:.4f}")
        print(f"  (TP: {metrics['tp']}, FP: {metrics['fp']}, FN: {metrics['fn']})")
        # List the misclassified proteins; long lists show the first 10 only
        incorrect = metrics['incorrect_proteins']
        if incorrect['false_positives']:
            print(f"  假阳性错误 ({len(incorrect['false_positives'])}个蛋白):")
            for item in incorrect['false_positives'][:10]:
                print(f"    {item['protein_id']}: 错误预测了 {item['predicted_ecs']}, GT是 {item['gt_ecs']}")
            if len(incorrect['false_positives']) > 10:
                print(f"    ... 还有 {len(incorrect['false_positives']) - 10} 个")
        if incorrect['false_negatives']:
            print(f"  假阴性错误 ({len(incorrect['false_negatives'])}个蛋白):")
            for item in incorrect['false_negatives'][:10]:
                print(f"    {item['protein_id']}: 漏掉了 {item['missed_ecs']}, 预测了 {item['predicted_ecs']}")
            if len(incorrect['false_negatives']) > 10:
                print(f"    ... 还有 {len(incorrect['false_negatives']) - 10} 个")
        if incorrect['zero_prediction']:
            print(f"  零预测错误 ({len(incorrect['zero_prediction'])}个蛋白):")
            for item in incorrect['zero_prediction']:
                print(f"    {item['protein_id']}: GT是 {item['gt_ecs']}, 但预测了0个EC号")
        if incorrect['no_prediction']:
            print(f"  无预测错误 ({len(incorrect['no_prediction'])}个蛋白):")
            for item in incorrect['no_prediction'][:10]:
                print(f"    {item['protein_id']}: GT是 {item['gt_ecs']}, 但没有预测")
            if len(incorrect['no_prediction']) > 10:
                print(f"    ... 还有 {len(incorrect['no_prediction']) - 10} 个")
        print()  # blank separator line
    print("\n=== 详细统计信息 ===")
    # How many EC numbers each protein has in the ground truth
    gt_ec_counts = defaultdict(int)
    for ecs in filtered_gt.values():
        gt_ec_counts[len(ecs)] += 1
    print("Ground truth中EC号数量分布:")
    for count, freq in sorted(gt_ec_counts.items()):
        print(f"  {count}个EC号: {freq}个蛋白")
    # How many EC numbers were predicted per protein
    pred_ec_counts = defaultdict(int)
    for ecs in filtered_pred.values():
        pred_ec_counts[len(ecs)] += 1
    print("\n预测结果中EC号数量分布:")
    for count, freq in sorted(pred_ec_counts.items()):
        print(f"  {count}个EC号: {freq}个蛋白")
    # Save the metrics; create the directory first so the run does not fail
    # at the very end when 'test_results/' does not exist yet.
    output_file = 'test_results/ec_accuracy_results.json'
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    print(f"\n结果已保存到 {output_file}")


if __name__ == "__main__":
    main()

demo.py ADDED Viewed

	@@ -0,0 +1,495 @@

+import os
+import json
+import sys
+import tempfile
+import gradio as gr
+from typing import Dict, List, Optional
+from pathlib import Path
+from Bio import SeqIO
+from io import StringIO
+# 添加必要的路径
+root_path = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(root_path)
+sys.path.append(os.path.join(root_path, "Models/ProTrek"))
+# 导入所需模块
+from interproscan import InterproScan
+from Bio.Blast.Applications import NcbiblastpCommandline
+from utils.utils import extract_interproscan_metrics, get_seqnid, extract_blast_metrics, rename_interproscan_keys
+from go_integration_pipeline import GOIntegrationPipeline
+from utils.openai_access import call_chatgpt
+from utils.prompts import FUNCTION_PROMPT
def get_prompt_template(selected_info_types=None):
    """Build the Jinja2 prompt template text.

    The result is ``FUNCTION_PROMPT`` followed by a template with optional
    sections for motifs, GO terms and further InterPro information types, and
    finally the question.

    Args:
        selected_info_types: Information types to include, e.g.
            ['motif', 'go', 'superfamily', 'panther']. The returned text does
            not depend on this argument; the template reads a variable of the
            same name when it is rendered.

    Returns:
        The template source as a string.
    """
    if selected_info_types is None:
        selected_info_types = ['motif', 'go']  # default: motif and GO information
    template_body = """
    input information:
    {%- if 'motif' in selected_info_types and motif_pfam %}
    motif:{% for motif_id, motif_info in motif_pfam.items() %}
    {{motif_id}}: {{motif_info}}
    {% endfor %}
    {%- endif %}
    {%- if 'go' in selected_info_types and go_data.status == 'success' %}
    GO:{% for go_entry in go_data.go_annotations %}
    ▢ GO term{{loop.index}}: {{go_entry.go_id}}
    • definition: {{ go_data.all_related_definitions.get(go_entry.go_id, 'not found definition') }}
    {% endfor %}
    {%- endif %}
    {%- for info_type in selected_info_types %}
    {%- if info_type not in ['motif', 'go'] and interpro_descriptions.get(info_type) %}
    {{info_type}}:{% for ipr_id, ipr_info in interpro_descriptions[info_type].items() %}
    ▢ {{ipr_id}}: {{ipr_info.name}}
    • description: {{ipr_info.abstract}}
    {% endfor %}
    {%- endif %}
    {%- endfor %}
    question: \n {{question}}
    """
    return FUNCTION_PROMPT + '\n' + template_body
+class ProteinAnalysisDemo:
+    def __init__(self):
+        """
+        蛋白质分析演示类
+        """
+        self.blast_database = "uniprot_swissprot"
+        self.expect_value = 0.01
+        self.interproscan_path = "interproscan/interproscan-5.75-106.0/interproscan.sh"
+        self.interproscan_libraries = [
+            "PFAM", "PIRSR", "PROSITE_PROFILES", "SUPERFAMILY", "PRINTS",
+            "PANTHER", "CDD", "GENE3D", "NCBIFAM", "SFLM", "MOBIDB_LITE",
+            "COILS", "PROSITE_PATTERNS", "FUNFAM", "SMART"
+        ]
+        self.go_topk = 2
+        self.selected_info_types = ['motif', 'go']
+        # 文件路径配置
+        self.pfam_descriptions_path = 'data/raw_data/all_pfam_descriptions.json'
+        self.go_info_path = 'data/raw_data/go.json'
+        self.interpro_data_path = 'data/raw_data/interpro_data.json'
+        # 初始化GO整合管道
+        self.go_pipeline = GOIntegrationPipeline(topk=self.go_topk)
+        # 初始化InterPro管理器（如果需要）
+        self.interpro_manager = None
+        other_types = [t for t in self.selected_info_types if t not in ['motif', 'go']]
+        if other_types and os.path.exists(self.interpro_data_path):
+            try:
+                from utils.generate_protein_prompt import get_interpro_manager
+                self.interpro_manager = get_interpro_manager(self.interpro_data_path, None)
+            except Exception as e:
+                print(f"初始化InterPro管理器失败: {str(e)}")
+    def validate_protein_sequence(self, sequence: str) -> bool:
+        """
+        验证蛋白质序列格式
+        """
+        if not sequence:
+            return False
+        # 移除空白字符
+        sequence = sequence.strip().upper()
+        # 检查是否包含有效的氨基酸字符
+        valid_aa = set('ACDEFGHIKLMNPQRSTVWY')
+        sequence_chars = set(sequence.replace('\n', '').replace(' ', ''))
+        return sequence_chars.issubset(valid_aa) and len(sequence) > 0
+    def parse_fasta_content(self, fasta_content: str) -> tuple:
+        """
+        解析FASTA内容，返回第一个序列
+        """
+        try:
+            fasta_io = StringIO(fasta_content)
+            records = list(SeqIO.parse(fasta_io, "fasta"))
+            if not records:
+                return None, "FASTA文件中没有找到有效的序列"
+            if len(records) > 1:
+                return None, "演示版本只支持单一序列，检测到多个序列"
+            record = records[0]
+            return str(record.seq), f"成功解析序列 ID: {record.id}"
+        except Exception as e:
+            return None, f"解析FASTA文件出错: {str(e)}"
+    def create_temp_fasta(self, sequence: str, seq_id: str = "demo_protein") -> str:
+        """
+        创建临时FASTA文件
+        """
+        temp_file = tempfile.NamedTemporaryFile(mode='w', suffix='.fasta', delete=False)
+        temp_file.write(f">{seq_id}\n{sequence}\n")
+        temp_file.close()
+        return temp_file.name
+    def run_blast_analysis(self, fasta_file: str, temp_dir: str) -> Dict:
+        """
+        运行BLAST分析
+        """
+        blast_xml = os.path.join(temp_dir, "blast_results.xml")
+        try:
+            blast_cmd = NcbiblastpCommandline(
+                query=fasta_file,
+                db=self.blast_database,
+                out=blast_xml,
+                outfmt=5,  # XML格式
+                evalue=self.expect_value
+            )
+            blast_cmd()
+            # 提取BLAST结果
+            blast_results = extract_blast_metrics(blast_xml)
+            # 获取序列字典
+            seq_dict = get_seqnid(fasta_file)
+            blast_info = {}
+            for uid, info in blast_results.items():
+                blast_info[uid] = {"sequence": seq_dict[uid], "blast_results": info}
+            return blast_info
+        except Exception as e:
+            print(f"BLAST分析出错: {str(e)}")
+            return {}
+        finally:
+            if os.path.exists(blast_xml):
+                os.remove(blast_xml)
+    def run_interproscan_analysis(self, fasta_file: str, temp_dir: str) -> Dict:
+        """
+        运行InterProScan分析
+        """
+        interproscan_json = os.path.join(temp_dir, "interproscan_results.json")
+        try:
+            interproscan = InterproScan(self.interproscan_path)
+            input_args = {
+                "fasta_file": fasta_file,
+                "goterms": True,
+                "pathways": True,
+                "save_dir": interproscan_json
+            }
+            interproscan.run(**input_args)
+            # 提取InterProScan结果
+            interproscan_results = extract_interproscan_metrics(
+                interproscan_json,
+                librarys=self.interproscan_libraries
+            )
+            # 获取序列字典
+            seq_dict = get_seqnid(fasta_file)
+            interproscan_info = {}
+            for id, seq in seq_dict.items():
+                info = interproscan_results[seq]
+                info = rename_interproscan_keys(info)
+                interproscan_info[id] = {"sequence": seq, "interproscan_results": info}
+            return interproscan_info
+        except Exception as e:
+            print(f"InterProScan分析出错: {str(e)}")
+            return {}
+        finally:
+            if os.path.exists(interproscan_json):
+                os.remove(interproscan_json)
+    def generate_prompt(self, protein_id: str, interproscan_info: Dict,
+                       protein_go_dict: Dict, question: str) -> str:
+        """
+        从内存中的数据生成prompt，包含完整的motif和GO定义
+        """
+        try:
+            from utils.protein_go_analysis import get_go_definition
+            from jinja2 import Template
+            # from utils.generate_protein_prompt import get_prompt_template
+            # 获取GO分析结果
+            go_ids = protein_go_dict.get(protein_id, [])
+            go_annotations = []
+            all_related_definitions = {}
+            if go_ids:
+                for go_id in go_ids:
+                    # 确保GO ID格式正确
+                    clean_go_id = go_id.split(":")[-1] if ":" in go_id else go_id
+                    go_annotations.append({"go_id": clean_go_id})
+                    # 获取GO定义
+                    if os.path.exists(self.go_info_path):
+                        definition = get_go_definition(clean_go_id, self.go_info_path)
+                        if definition:
+                            all_related_definitions[clean_go_id] = definition
+            # 获取motif信息
+            motif_pfam = {}
+            if os.path.exists(self.pfam_descriptions_path):
+                try:
+                    # 从interproscan结果中提取pfam信息
+                    interproscan_results = interproscan_info[protein_id].get('interproscan_results', {})
+                    pfam_entries = interproscan_results.get('pfam_id', [])
+                    # 加载pfam描述
+                    with open(self.pfam_descriptions_path, 'r') as f:
+                        pfam_descriptions = json.load(f)
+                    # 构建motif_pfam字典
+                    for entry in pfam_entries:
+                        for pfam_id, ipr_id in entry.items():
+                            if pfam_id and pfam_id in pfam_descriptions:
+                                motif_pfam[pfam_id] = pfam_descriptions[pfam_id]['description']
+                except Exception as e:
+                    print(f"获取motif信息时出错: {str(e)}")
+            # 获取InterPro描述信息
+            interpro_descriptions = {}
+            other_types = [t for t in self.selected_info_types if t not in ['motif', 'go']]
+            if other_types and self.interpro_manager:
+                interpro_descriptions = self.interpro_manager.get_description(protein_id, other_types)
+            # 准备模板数据
+            template_data = {
+                "protein_id": protein_id,
+                "selected_info_types": self.selected_info_types,
+                "go_data": {
+                    "status": "success" if go_annotations else "no_data",
+                    "go_annotations": go_annotations,
+                    "all_related_definitions": all_related_definitions
+                },
+                "motif_pfam": motif_pfam,
+                "interpro_descriptions": interpro_descriptions,
+                "question": question
+            }
+            # 使用模板生成prompt
+            PROMPT_TEMPLATE = get_prompt_template(self.selected_info_types)  # demo版本不使用lmdb
+            template = Template(PROMPT_TEMPLATE)
+            return template.render(**template_data)
+        except Exception as e:
+            print(f"生成prompt时出错 (protein_id: {protein_id}): {str(e)}")
+            # 如果出错，返回简化版本的prompt
+            return self._generate_fallback_prompt(protein_id, interproscan_info, protein_go_dict, question)
+    def _generate_fallback_prompt(self, protein_id: str, interproscan_info: Dict,
+                                 protein_go_dict: Dict, question: str) -> str:
+        """
+        生成备用prompt（当主要方法失败时使用）
+        """
+        from utils.prompts import FUNCTION_PROMPT
+        prompt_parts = [FUNCTION_PROMPT]
+        prompt_parts.append("\ninput information:")
+        # 添加motif信息
+        if 'motif' in self.selected_info_types:
+            interproscan_results = interproscan_info[protein_id].get('interproscan_results', {})
+            pfam_entries = interproscan_results.get('pfam_id', [])
+            if pfam_entries:
+                prompt_parts.append("\nmotif:")
+                for entry in pfam_entries:
+                    for key, value in entry.items():
+                        if value:
+                            prompt_parts.append(f"{value}: motif information")
+        # 添加GO信息
+        if 'go' in self.selected_info_types:
+            go_ids = protein_go_dict.get(protein_id, [])
+            if go_ids:
+                prompt_parts.append("\nGO:")
+                for i, go_id in enumerate(go_ids[:10], 1):
+                    prompt_parts.append(f"▢ GO term{i}: {go_id}")
+                    prompt_parts.append(f"• definition: GO term definition")
+        # 添加用户问题
+        prompt_parts.append(f"\nquestion: \n{question}")
+        return "\n".join(prompt_parts)
+    def analyze_protein(self, sequence_input: str, fasta_file, question: str) -> str:
+        """
+        分析蛋白质序列并回答问题
+        """
+        if not question.strip():
+            return "请输入您的问题"
+        # 确定使用哪个序列输入
+        final_sequence = None
+        sequence_source = ""
+        if fasta_file is not None:
+            # 优先使用上传的文件
+            try:
+                fasta_content = fasta_file.read().decode('utf-8')
+                final_sequence, parse_msg = self.parse_fasta_content(fasta_content)
+                if final_sequence is None:
+                    return f"文件解析错误: {parse_msg}"
+                sequence_source = f"来自上传文件: {parse_msg}"
+            except Exception as e:
+                return f"读取上传文件出错: {str(e)}"
+        elif sequence_input.strip():
+            # 使用文本框输入的序列
+            if self.validate_protein_sequence(sequence_input):
+                final_sequence = sequence_input.strip().upper().replace('\n', '').replace(' ', '')
+                sequence_source = "来自文本框输入"
+            else:
+                return "输入的序列格式不正确，请输入有效的蛋白质序列"
+        else:
+            return "请输入蛋白质序列或上传FASTA文件"
+        # 创建临时目录和文件
+        with tempfile.TemporaryDirectory() as temp_dir:
+            try:
+                # 创建临时FASTA文件
+                temp_fasta = self.create_temp_fasta(final_sequence, "demo_protein")
+                # 运行分析
+                status_msg = f"序列来源: {sequence_source}\n序列长度: {len(final_sequence)} 氨基酸\n\n正在进行分析...\n"
+                # 步骤1: BLAST和InterProScan分析
+                status_msg += "步骤1: 运行BLAST分析...\n"
+                blast_info = self.run_blast_analysis(temp_fasta, temp_dir)
+                status_msg += "步骤2: 运行InterProScan分析...\n"
+                interproscan_info = self.run_interproscan_analysis(temp_fasta, temp_dir)
+                if not blast_info or not interproscan_info:
+                    return status_msg + "分析失败: 无法获取BLAST或InterProScan结果"
+                # 步骤2: 整合GO信息
+                status_msg += "步骤3: 整合GO信息...\n"
+                protein_go_dict = self.go_pipeline.first_level_filtering(interproscan_info, blast_info)
+                # 步骤3: 生成prompt
+                status_msg += "步骤4: 生成分析prompt...\n"
+                protein_id = "demo_protein"
+                prompt = self.generate_prompt(protein_id, interproscan_info, protein_go_dict, question)
+                # 步骤4: 调用LLM生成答案
+                status_msg += "步骤5: 生成答案...\n"
+                llm_response = call_chatgpt(prompt)
+                # 组织最终结果
+                result = f"""
+{status_msg}
+=== 分析完成 ===
+问题: {question}
+答案: {llm_response}
+=== 分析详情 ===
+- BLAST匹配数: {len(blast_info.get(protein_id, {}).get('blast_results', []))}
+- InterProScan域数: {len(interproscan_info.get(protein_id, {}).get('interproscan_results', {}).get('pfam_id', []))}
+- GO术语数: {len(protein_go_dict.get(protein_id, []))}
+"""
+                return result
+            except Exception as e:
+                return f"分析过程中出错: {str(e)}"
+            finally:
+                # 清理临时文件
+                if 'temp_fasta' in locals() and os.path.exists(temp_fasta):
+                    os.remove(temp_fasta)
+def create_demo():
+    """
+    Build the Gradio demo interface.
+
+    Creates a ``ProteinAnalysisDemo`` analyzer and a ``gr.Blocks`` layout with
+    a sequence text box, a FASTA upload, a question box and a result box, and
+    wires the analyze button to ``analyzer.analyze_protein``.
+
+    Returns:
+        The ``gr.Blocks`` object (not yet launched).
+    """
+    analyzer = ProteinAnalysisDemo()
+    with gr.Blocks(title="蛋白质功能分析演示") as demo:
+        gr.Markdown("# 🧬 蛋白质功能分析演示")
+        gr.Markdown("输入蛋白质序列和问题，AI将基于BLAST、InterProScan和GO信息为您提供专业分析")
+        with gr.Row():
+            # Left column: inputs (sequence text or FASTA upload, plus question)
+            with gr.Column(scale=1):
+                gr.Markdown("### 📝 序列输入")
+                sequence_input = gr.Textbox(
+                    label="蛋白质序列",
+                    placeholder="请输入蛋白质序列（单字母氨基酸代码）...",
+                    lines=5,
+                    max_lines=10
+                )
+                gr.Markdown("**或者**")
+                fasta_file = gr.File(
+                    label="上传FASTA文件",
+                    file_types=[".fasta", ".fa", ".fas"],
+                    file_count="single"
+                )
+                gr.Markdown("### ❓ 您的问题")
+                question_input = gr.Textbox(
+                    label="问题",
+                    placeholder="请输入关于该蛋白质的问题，例如：这个蛋白质的主要功能是什么？",
+                    lines=3
+                )
+                analyze_btn = gr.Button("🔍 开始分析", variant="primary", size="lg")
+            # Right column: analysis output
+            with gr.Column(scale=2):
+                gr.Markdown("### 📊 分析结果")
+                output = gr.Textbox(
+                    label="分析结果",
+                    lines=20,
+                    max_lines=30,
+                    show_copy_button=True
+                )
+        # Examples: each row fills the sequence box and the question box.
+        # NOTE(review): the third example contains only A/C/G/T characters, so
+        # it looks like a nucleotide sequence - confirm it passes validation.
+        gr.Markdown("### 💡 示例")
+        gr.Examples(
+            examples=[
+                ["MKALIVLGLVLLSVTVQGKVFERCELARTLKRLGMDGYRGISLANWMCLAKWESGYNTRATNYNAGDRSTDYGIFQINSRYWCNDGKTPGAVNACHLSCSALLQDNIADAVACAKRVVRDPQGIRAWVAWRNRCQNRDVRQYVQGCGV", "这个蛋白质的主要功能是什么？"],
+                ["MGSSHHHHHHSSGLVPRGSHMRGPNPTAASLEASAGPFTVRSFTVSRPSGYGAGTVYYPTNAGGTVGAIAIVPGYTARQSSIKWWGPRLASHGFVVITIDTNSTLDQPSSRSSQQMAALRQVASLNGTSSSPIYGKVDTARMGVMGWSMGGGGSLISAANNPSLKAAAPQAPWDSSTNFSSVTVPTLIFACENDSIAPVNSSALPIYDSMSRNAKQFLEINGGSHSCANSGNSNQALIGKKGVAWMKRFPTSREJ", "这个蛋白质可能参与哪些生物学过程？"],
+                ["ATGAGTGAACGTCTGAAATCTATCATCACCGTCGACGACGAGAACGTCAAGCTGATCGACAAGATCCTGGCCTCCATCAAGGACCTGAACGAGCTGGTGGACATGATCGACGAGATCAAGAACGTCGACGACGAGCTGATCGACAAGATCCTGGCC", "这个序列编码的蛋白质具有什么结构特征？"]
+            ],
+            inputs=[sequence_input, question_input]
+        )
+        # Run the analysis when the button is clicked and show the report.
+        analyze_btn.click(
+            fn=analyzer.analyze_protein,
+            inputs=[sequence_input, fasta_file, question_input],
+            outputs=[output]
+        )
+    return demo
+if __name__ == "__main__":
+    demo = create_demo()
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=30002,
+        share=True,
+        debug=False
+    )

demo.sh ADDED Viewed

	@@ -0,0 +1,19 @@

+cd /zhangjiawei/protein_rag
+source /root/miniconda3/etc/profile.d/conda.sh
+conda activate rag_llm
+for arg in "$@"
+do
+    case $arg in
+        proxy=*) proxy="${arg#*=}" ;;
+        *) ;;
+    esac
+done
+# export http_proxy=http://${proxy}:4780 && export https_proxy=http://${proxy}:4780
+# echo http_proxy=http://${proxy}:4780
+# echo https_proxy=http://${proxy}:4780
+# http_proxy=http://=10.16.12.236:4780
+python demo.py

example/difference_20241122_ec_dict_list.fasta ADDED Viewed

	@@ -0,0 +1,20 @@

+>A8CF74
+MTAAYLKTAFGDRLSITVVESSRIGTIGVGEATFSDIQHFFQFLNLREQDWMPACNATYKLGIRFENWRHVGHHFYQPFEQIRPVYGFPLTDWWLHDAPTDRFDTDCFVMPNLCEAGRSPRHLDGTLADEDFVEEGDELANRTMSEHQGKSQFPYAYHFEAALLAKFLTGYAVDRGVEHVVDDVLDVRLDQRGWIEHVVTAEHGEIHGDLFVDCTGFRGLLLNKALGVPFVSYQDTLPNDSAVALQVPLDMQRRGIVPNTTATAREAGWIWTIPLFGRVGTGYVYAKDYLSPEEAERTLREFVGPAAADVEANHIRMRIGRSQESWRNNCVAIGLSSGFVEPLESTGIFFIHHAIEQLVKHFPAADWNPKSRDMYNSAVAHVMDGIREFLVIHYRGAARADNQYWRDTKTRPLPDGLAERIECWQTQLPDTETIYPYYHGLPPYSYMCILMGGGAIRTPASAALALTDQGAAQKEFAAVRDRAAQLRDTLPSHYEYLARMRGLDV
+>Q5B027
+MKAFFAISASTLLATVHGHGYLTVPASRTRLGFEAGIDTCPECSILEPVDAWPNVTEAQVGRSGPCGYNARVSVDYNQPGDNWGNEPVVTYKAGDIVEVQWCVDNNGDHGGMFTYGICQDQELVDKFLDPDYLPTEEEKQAAEDCFLQGELKCGDVDGQECEYSPDCGEGEACYRNDWFTCNAFEADSDRGCQGVDGAELNSCKTTIAGGYTVTKKIKIPDYTSEHTLLRFRWNSFQTPQIYLHCADPTIEGGMEVRMRMIVMHGSFGVDTQHSFGHSFGFQGEGVYRAYRYIRGVAIIQMNLNINASLLPQPTLPIRGWSTRNIQHT
+>A0A7H0XJI9
+MRFLKAKAGLVASGAFLLASVPVVAADCALPSTYSWTSTGPLANPKSGWTAIKDFSNVVFNNNHIVYASTTDANGNYGSMNFGTFSDWSGMASASQNKMSFSAVAPTLFYFQPKNIWVLAYQWGSSTFTYRTSNDPTNANGWSSEQALFSGQITGSSTGAIDQTLIGDSTHMYLFFAGDNGKIYRSSMPINNFPGNFGTSSEVVLSDSQNNLFEAVQVYTVKGQNKYLMIVEAIGSQGRYFRSFTATSLGGSWTPQATSESQPFAGKANSGATWTNDISHGDLVRTNPDQTMTIDPCNLQFLYQGKNPSAGGNYNTLPWRPGVLTLKN
+>D9X0I3
+MSANSFDARSTLQVGDESYEIFRLDKVEGSARLPYSLKVLLENLLRTEDGANITADHIRALGGWDSQAQPSQEIQFTPARVIMQDFTGVPCVVDLATMREAVKELGGDPAKINPLAPAELVIDHSVIADKFGTNDAFKQNVELEYGRNKERYQFLRWGQTAFDEFKVVPPGTGIVHQVNIEHLARTVMVRGGQAYPDTLVGTDSHTTMVNGLGVLGWGVGGIEAEAAMLGQPVSMLIPRVVGFKLTGELKPGTTATDLVLTITEMLRGHGVVGKFVEFYGEGVAATSLANRATIGNMSPEFGSTAAIFPIDDETLNYLRLTGRSEQQVALVESYAKEQGLWLDPAAEPDFSEKLELDLSTVVPSIAGPKRPQDRIVLAEAAQQFAKDVLNYVEAPAAQPAASASPVDEASAESFPASDAPAYGSQENGAGAPQHADGTGAAVPSNPVTVTAPDGTSYEIDHGAVTVAAITSCTNTSNPYVMVAAALVAKKAVEKGLTRKPWVKTTLAPGSKVVTDYFEKSGLTPYLDKVGFNLVGYGCTTCIGNSGPLPEEVSKAVNDHDLAVTSVLSGNRNFEGRINPDVKMNYLASPPLVVAYALAGSMKVDITKDALGTDQDGNPVYLKDIWPSEAEVNDVVANAIGEDMFSKSYSDVFAGDAQWQALPIPTGNTFEWDPESTYVRKPPYFEGMEMEPAPVEDIAGARVLAKLGDSVTTDHISPAGAIKADTPAGKYLTEHGVERRDFNSYGSRRGNHEVMIRGTFANIRLRNQIAPGTEGGYTRDFTKDDAPVSFIYDASRNYIEQGIPLVVLAGKEYGSGSSRDWAAKGTALLGVKAVIAESYERIHRSNLIGMGVLPLQFPEGQSAATLGLTGEETFSFSGVTELNNGTTPRTVKVTTDTGVEFDAVVRIDTPGEADYYRNGGIMQYVLRSLIRK
+>Q5MIX2
+MKILLAVVFVLNLTNLAVPQHLITSSPSLPESKPVGRRPTYEEYKQQRESFLQTEDHHLLGANVTLTENEQLVNKFIMQMKLDEMEKGFNDSYNFIPARHIFEVLDRFGQSKVFNVIRRLPKGGVLHAHDMALGSTDLIVNATYLENLWQKGNFGLNHGPEFKFSRERPGKEWSLVSEIRQWMTNEVYDAKVAEVFSLYNADPLNAYKSLDNVWSKFQNLFACLAPLITFAPVWRQYYHDSLKQFYDDHVQYLEFRGVLPEVYDLDGKVYSAEEIVQLYYEETEQFKAKYPDFIGVKFIYAPGRYASDEEFQKLLDTTNRLHKKFPNFLAGFDLVGQEDPGRSLFEFAPALLKLPASINFFFHAGETNWYGMKTDQNLVDAVLLGTKRIGHGFAVLKHPKVLKEIKRRQICIEINPISNQVLKLVQDQRNHPAALLFSDNYPVVVSSDDPSFGRSTPLSHDFYVAFTGIASAKQDWRWLKQLALNSIEYSAMNSEEKTVAKEKWNQAWDHQFSRLAVDFVAGKILENWIMKIV
+>Q9NC65
+MFSQLVVWLLATSTVCLAWDNSWIMDMKYERYSQRRSYYLAEEEDRSVGSDIELTAKEQVVNERLMELKMTELKNGLQDPAGFIPWNHIFDVLYRINSSELFHIIQKMPKGGILHAHDTALCSTDYVISLTYEPNLWQCADPTTGAFQFLFSREAPTNTDTCTWTLVADERAKQGEENYNSALRSQLSMYNTNPIMHNRDVDSIWRQFMGIFGVNGGLLTYAPVWKAYYLQFLKEMFADGVQYLELRTTLPPLYDLDGKTYNEVEIMQIYYDATKEFKKQNPTFIGAKIIYAPVRVVDDAGIPALMAKVRELHEKFPDFMAGFDLVGQEDKGRPLIAFSREILKLPNSIDFYFHAGETNWDGMTDDNLIDAVLLGTKRIGHGYAVLKHPRVLKEVKRNKIAIEVCPASNQVLRLVADYRNHPGSVLLANKEYPVVISSDDPSFWEAKPLSHDFYMAFLGLASSRQDLRLLKQLAINSIKYSAMSPREKLQAMQMWEAEWKKFIDGFNA
+>Q06K61
+MFPRLIVWLLAASAVHAVLDISNIKPKRDYENFLQKYAEYADDEVDRSVGSDITLSLKEKFVNQYLMDLKTEELKAGLKNPSQFIPSNHFFSVLDRINSSEIFKIIRRMPKGAILHAHDTALCSTDYVVSITYRDHLWQCADPKTGALQFRFSKESPKNTDTCQWTPVSEERKNQGEEQYNSKLRSQLSLYNTDPINRSRDVDSIWNDFMGLFGVNFGLLTYAPVWKDYYKQFLKEMMEDGVQYLELRGTLPPLYDLDGKIYNEEQVVEIYYNVTEEFKKENSTFIGAKFIYAPVRFVNATGIKTLTTTVKQLHERFPDFLAGFDLVGQEDKGGPLIGFSRELLELPESINFFFHSGETNWNGMTDDNLIAAVTLGTKRIGHGYALFKHPRVLKQVKKDKIAIEVCPISNQVLRLVADMRNHPGSILLANKKYPMVISSDDPSFWEATPLSHDFYMAFMGLASYHQDLRMLKQLAINSLEYSSMTLEEKTNAMKLWEAEWEKFIKELETEVFSLLE
+>A0A0H2ZM56
+MADKKTVTPEEKKLVAEKHVDELVQKALVALEEMRKLNQEQVDYIVAKASVAALDAHGELALHAFEETGRGVFEDKATKNLFACEHVVNNMRHTKTVGVIEEDDVTGLTLIAEPVGVVCGITPTTNPTSTAIFKSLISLKTRNPIVFAFHPSAQESSAHAARIVRDAAIAAGAPENCVQWITQPSMEATSALMNHEGVATILATGGNAMVKAAYSCGKPALGVGAGNVPAYVEKSANIRQAAHDIVMSKSFDNGMVCASEQAVIIDKEIYDEFVAEFKSYHTYFVNKKEKALLEEFCFGVKANSKNCAGAKLNADIVGKPATWIAEQAGFTVPEGTNILAAECKEVGENEPLTREKLSPVIAVLKSESREDGITKARQMVEFNGLGHSAAIHTADEELTKEFGKAVKAIRVICNSPSTFGGIGDVYNAFLPSLTLGCGSYGRNSVGDNVSAINLLNIKKVGRRRNNMQWMKLPSKTYFERDSIQYLQKCRDVERVMIVTDHAMVELGFLDRIIEQLDLRRNKVVYQIFADVEPDPDITTVNRGTEIMRAFKPDTIIALGGGSPMDAAKVMWLFYEQPEVDFRDLVQKFMDIRKRAFKFPLLGKKTKFIAIPTTSGTGSEVTPFAVISDKANNRKYPIADYSLTPTVAIVDPALVLTVPGFVAADTGMDVLTHATEAYVSQMASDYTDGLALQAIKLVFENLESSVKNADFHSREKMHNASTIAGMAFANAFLGISHSMAHKIGAQFHTIHGRTNAILLPYVIRYNGTRPAKTATWPKYNYYRADEKYQDIARMLGLPASTPEEGVESYAKAVYELGERIGIQMNFRDQGIDEKEWKEHSRELAFLAYEDQCSPANPRLPMVDHMQEIIEDAYYGYKERPGRRK
+>A0A0H2URT2
+MADKKTVTPEEKKLVAEKHVDELVQKALVALEEMRKLDQEQVDYIVAKASVAALDAHGELALHAFEETGRGVFEDKATKNLFACEHVVNNMRHTKTVGVIEEDDVTGLTLIAEPVGVVCGITPTTNPTSTAIFKSLISLKTRNPIVFAFHPSAQESSAHAARIVRDAAIAAGAPENCVQWITQPSMEATSALMNHEGVATILATGGNAMVKAAYSCGKPALGVGAGNVPAYVEKSANIRQAAHDIVMSKSFDNGMVCASEQAVIIDKEIYDEFVAEFKSYHTYFVNKKEKALLEEFCFGVKANSKNCAGAKLNADIVGKPATWIAEQAGFTVPEGTNILAAECKEVGENEPLTREKLSPVIAVLKSESREDGITKARQMVEFNGLGHSAAIHTADEELTKEFGKAVKAIRVICNSPSTFGGIGDVYNAFLPSLTLGCGSYGRNSVGDNVSAINLLNIKKVGRRRNNMQWMKLPSKTYFERDSIQYLQKCRDVERVMIVTDHAMVELGFLDRIIEQLDLRRNKVVYQIFADVEPDPDITTVNRGTEIMRAFKPDTIIALGGGSPMDAAKVMWLFYEQPEVDFRDLVQKFMDIRKRAFKFPLLGKKTKFIAIPTTSGTGSEVTPFAVISDKANNRKYPIADYSLTPTVAIVDPALVLTVPGFVAADTGMDVLTHATEAYVSQMASDYTDGLALQAIKLVFENLESSVKNADFHSREKMHNASTIAGMAFANAFLGISHSMAHKIGAQFHTIHGRTNAILLPYVIRYNGTRPAKTATWPKYNYYRADEKYQDIARMLGLPASTPEEGVESYAKAVYELGERIGIQMNFRDQGIDEKEWKEHSRKLAFLAYEDQCSPANPRLPMVDHMQEIIEDAYYGYKERPGRRK
+>Q8YMD9
+MTSRIRFLMCPPDHYDVDYVINPWMEGNIHKSSRDRAVEQWQGLYQILKEHAIVDLVTPQKGWPDLVFTANAGLVLGDNVVLSRFLHKERQGEEPYFKEWFEGNGYTVYELPKDLPFEGAGDALLDREGRWLWAGYGFRSELDSHPYLAKWLDIEVLSLRLIDERFYHLDTCFCPLANGYLLYYPGAFDSYSNRLIEMRVAPEKRIAIAEADAVNFACNTVNVESIVIMNKASDALKQSLTGVGFQVLETPLTEFLKAGGAAKCLTLRVTEPVRDEVHANVYVESRIIRIEGHLLDSGLINRALDMIVDTGGSFQVLNFNLGEQRQSTSAAEVKVSAPSHEVMEEIISLLIDLGAVDLPQDERDAKLEPVIQDGVAPDDFYVSTIYPTEVRINGQWIKVENQRMDGAIAITQTPNGLLAQCKILRDLKAGEQVIVDVLGIRTIRKTESREQRNTQEFSFMSGGVSSERRVELVVEQVAWELRKIRDAGGKVVVTAGPVVIHTGGGEHLSRLIREGYVQALLGGNAIAVHDIEQNMMGTSLGVDMKRGVAVRGGHRHHLKVINTIRRHGSIAKGVESGIIRSGVMYECVRNQIPFVLAGSIRDDGPLPDTQMDLIKAQEEYAKHLEGAEMILMLSSMLHSIGVGNMTPAGVKMVCVDINPAVVTKLSDRGSIESVGVVTDVGLFLSLLTQQLDKLTSPYVSKVG

example/difference_20241122_ec_dict_list.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0e8a013ba26e7f3482ddc072d390d529c8cb5f3c40e8e0f33c2189b7c3d6200e
+size 1056366

example/protein_go_clean.fasta ADDED Viewed

The diff for this file is too large to render. See raw diff

example/protein_go_clean.json ADDED Viewed

	@@ -0,0 +1,606 @@

+{"protein_id": "A8CF74", "GO_id": ["0004497", "0000166"]}
+{"protein_id": "A0A0L8M630", "GO_id": ["0008839", "0009089"]}
+{"protein_id": "Q8XP19", "GO_id": ["0004565", "0004566", "0030246", "0005975", "0019391"]}
+{"protein_id": "B5LAT8", "GO_id": ["0005737", "0045551", "0008270", "0009820", "0009809"]}
+{"protein_id": "A0A805Z5R7", "GO_id": ["0016787", "0006629"]}
+{"protein_id": "E9EE69", "GO_id": ["0030428", "0005935", "0005886", "0004100", "0006031", "1902404"]}
+{"protein_id": "A2QU15", "GO_id": ["0030428", "0005886", "0004100", "0071555", "0006031"]}
+{"protein_id": "Q704E9", "GO_id": ["0016020", "0016717", "0006633"]}
+{"protein_id": "A0A8A1G1R1", "GO_id": ["0005576", "0030246", "0004553", "0000272"]}
+{"protein_id": "G3UYQ4", "GO_id": ["0005829", "0015630", "0031514", "0031965", "0005654", "0005524", "0016887", "0004550", "0050145"]}
+{"protein_id": "E1CJK0", "GO_id": ["0005576", "0004806", "0016042"]}
+{"protein_id": "Q9P451", "GO_id": ["0005576", "0004806", "0017000", "0016042", "0072330"]}
+{"protein_id": "A0A384JDH0", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "A0A348HAY2", "GO_id": ["0005576", "0052689", "0016042"]}
+{"protein_id": "A0A1G9VRW7", "GO_id": ["0046872", "0010333"]}
+{"protein_id": "Q5B027", "GO_id": ["0005576", "0046872", "0016491", "0000272"]}
+{"protein_id": "Q8KSA6", "GO_id": ["0005576", "0008800", "0030655", "0046677"]}
+{"protein_id": "I1RE72", "GO_id": ["0030428", "0005886", "0004100", "0006031"]}
+{"protein_id": "E1CJK2", "GO_id": ["0005576", "0004806", "0016042"]}
+{"protein_id": "B2ATL7", "GO_id": ["0005576", "0030248", "0046872", "0004497", "0030245"]}
+{"protein_id": "Q9PTF3", "GO_id": ["0036064", "0005737", "0005741", "0005524", "0004550", "0006241", "0007368", "0006281", "0006183", "0001822", "0008053", "0009142", "0106071", "0006228"]}
+{"protein_id": "Q75WF1", "GO_id": ["0005782", "0005777", "0008445", "0047821", "0071949", "1990748", "0019478", "0019740"]}
+{"protein_id": "A0A0E3USC3", "GO_id": ["0051213", "0046872", "0017000"]}
+{"protein_id": "A0A7H0XJI9", "GO_id": ["0005576", "0046556", "0046373", "0045493"]}
+{"protein_id": "H0QPM2", "GO_id": ["0050660", "0016614"]}
+{"protein_id": "G2X5A0", "GO_id": ["0030428", "0005886", "0004100", "0006031", "0031505"]}
+{"protein_id": "Q5K9E2", "GO_id": ["0005730", "0031499", "0005524", "0046872", "1990817", "0043634", "0031123"]}
+{"protein_id": "A0A4S8L6U5", "GO_id": ["0005829", "0070012", "0004252", "0006508"]}
+{"protein_id": "Q9HYK7", "GO_id": ["0004324", "0000166", "0034599", "0042167"]}
+{"protein_id": "A0A7E5WTY7", "GO_id": ["0005576", "0005794", "0008199", "0008198", "0004322", "0006879", "0006826"]}
+{"protein_id": "A8QDF0", "GO_id": ["0005576", "0004806", "0016042"]}
+{"protein_id": "G2R6N0", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "A0A5N6UUF2", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "A0A5N6UV50", "GO_id": ["0005576", "0008810", "0030248", "0046872", "0004497", "0030245"]}
+{"protein_id": "B5I920", "GO_id": ["0005524", "0016301", "0046677"]}
+{"protein_id": "J0MXK0", "GO_id": ["0016740"]}
+{"protein_id": "O87333", "GO_id": ["0005829", "0005886", "0008713", "0009244"]}
+{"protein_id": "A0A3F2YLY8", "GO_id": ["0016020", "0030288", "0005509", "0016614", "0015945"]}
+{"protein_id": "Q5MIX2", "GO_id": ["0005615", "0004000", "0046872", "0006154", "0046103", "0009117"]}
+{"protein_id": "Q06K77", "GO_id": ["0005576", "0005524", "0005509", "0004382", "0090729", "0045134", "0030166"]}
+{"protein_id": "B6EP94", "GO_id": ["0016881", "0016746", "0005524", "0019290"]}
+{"protein_id": "A0A2I2F2I5", "GO_id": ["0005948", "0005739", "0003984", "0050660", "0000287", "0004737", "0030976", "0009097", "0009099"]}
+{"protein_id": "A0A384JZ02", "GO_id": ["0005576", "0008061", "0046872", "0004497", "0030245"]}
+{"protein_id": "A0A2R6W0K6", "GO_id": ["0009570", "0005829", "0010319", "0000287", "0004614", "0005975", "0009590", "0006006", "0009409", "0019252", "0005986"]}
+{"protein_id": "A9CEY7", "GO_id": ["0008726", "0046306"]}
+{"protein_id": "A0A4Y5QZ62", "GO_id": ["0000287", "0010333", "0016102"]}
+{"protein_id": "D9X0I3", "GO_id": ["0047456", "0051539", "0003994", "0046872", "0003723", "0006099"]}
+{"protein_id": "Q9GTP7", "GO_id": ["0005576", "0004050", "0005524", "0005509", "0030899", "0004382", "0017110", "0090729", "0045134", "0002376", "0030166"]}
+{"protein_id": "Q9HY79", "GO_id": ["0005829", "0070288", "0008199", "0004322", "0020037", "0005506", "0140315", "0006879", "0006826"]}
+{"protein_id": "Q9HTH9", "GO_id": ["0047617", "0042413"]}
+{"protein_id": "A0A2R8QP51", "GO_id": ["0005634", "0004177", "0008239", "0008236", "0006508", "0050727"]}
+{"protein_id": "B7JA35", "GO_id": ["0016020", "0030170", "0008483", "0009245", "0000271"]}
+{"protein_id": "G0SGC7", "GO_id": ["0005874", "0005743", "0005758", "0005886", "0005525", "0003924", "0140523", "0008289", "0180020", "0046872", "0008017", "0031623"]}
+{"protein_id": "Q9NC65", "GO_id": ["0005615", "0004000", "0046872", "0006154", "0046103"]}
+{"protein_id": "P73562", "GO_id": ["0008836", "0009089", "0045312", "0008295"]}
+{"protein_id": "A0A2B4RNI3", "GO_id": ["0061501", "0005524", "0005525", "0046872", "0045087"]}
+{"protein_id": "G2XJ47", "GO_id": ["0030428", "0005935", "0005886", "0004100", "0006031", "1902404"]}
+{"protein_id": "I1RHF8", "GO_id": ["0005576", "0016787", "0016042"]}
+{"protein_id": "S7Q0E7", "GO_id": ["0005886", "0098552", "0046872", "0004497", "0030245"]}
+{"protein_id": "B2B403", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "Q8LPU1", "GO_id": ["0048046", "0005773", "0052736", "0006952", "0009620"]}
+{"protein_id": "A0A1L8GLK3", "GO_id": ["0005737", "0016605", "0061630", "0008270", "0036297", "0016567"]}
+{"protein_id": "Q19VG9", "GO_id": ["0005737", "0005507", "0004784"]}
+{"protein_id": "B2RVI8", "GO_id": ["0005829", "0008146", "0008202"]}
+{"protein_id": "Q06K61", "GO_id": ["0005615", "0004000", "0046872", "0006154", "0046103"]}
+{"protein_id": "A0A649V088", "GO_id": ["0005576", "0008800", "0030655", "0046677"]}
+{"protein_id": "Q8I4R4", "GO_id": ["0031410", "0005576", "0020009", "0008061", "0004568", "0008843", "0006032", "0000272"]}
+{"protein_id": "A0A100IM63", "GO_id": ["0005829", "0070330", "0050660", "0010181", "0020037", "0005506", "0003958"]}
+{"protein_id": "A0A0F5HSE8", "GO_id": ["0000166", "0016639", "0006520"]}
+{"protein_id": "W8JNL4", "GO_id": ["0033846"]}
+{"protein_id": "A0A0F6AK91", "GO_id": ["0005829", "0047952", "0051287", "0005975", "0046167", "0046168", "0006650", "0008654"]}
+{"protein_id": "B2AUV0", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "A0A5N6UWA3", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "Q93227", "GO_id": ["0010008", "0005886", "0004842", "0008270", "0000209", "0016567", "0007338"]}
+{"protein_id": "Q0ZQ45", "GO_id": ["0016853", "0017000"]}
+{"protein_id": "Q8PWQ1", "GO_id": ["0051539", "0016829", "0046872", "0008299"]}
+{"protein_id": "O76742", "GO_id": ["0120281", "0000421", "0005938", "0042995", "0031901", "0005770", "0031902", "0005764", "0043025", "0043204", "0045335", "0005886", "0098793", "0045202", "0031982", "0005525", "0003924", "0044877", "0061909", "0007298", "0061883", "0032456", "0006897", "0016197", "0034058", "0008333", "0032510", "1990182", "0006909", "0090385", "0015031", "0032482", "0160156", "0046718", "0033292", "0016192", "0048190"]}
+{"protein_id": "A8CF75", "GO_id": ["0004497", "0000166"]}
+{"protein_id": "Q9W4C3", "GO_id": ["0005829", "0005634", "0004843", "0008270", "0016579", "0006508", "0031647"]}
+{"protein_id": "Q47595", "GO_id": ["0016757", "0009243"]}
+{"protein_id": "Q10L01", "GO_id": ["0012505", "0005783", "0005794", "0005886", "0019706", "1902884", "1901002", "0006612"]}
+{"protein_id": "A0A0H2ZM56", "GO_id": ["0008774", "0004029", "0120542", "0046872", "0006066", "0015976"]}
+{"protein_id": "Q99QC1", "GO_id": ["0030288", "0008800", "0046872", "0000166", "0017001", "0046677"]}
+{"protein_id": "A0A3M6TIF0", "GO_id": ["0061501", "0005524", "0005525", "0046872", "0045087"]}
+{"protein_id": "I7ZK32", "GO_id": ["0005829", "0070330", "0050660", "0010181", "0020037", "0005506", "0003958"]}
+{"protein_id": "O53611", "GO_id": ["0005829", "0005576", "0009274", "0005886", "0004450", "0000287", "0042803", "0006097", "0006102", "0006099"]}
+{"protein_id": "Q5XVM9", "GO_id": ["0005886", "0016787"]}
+{"protein_id": "A7B555", "GO_id": ["0005829", "0008747", "0019262"]}
+{"protein_id": "A0A4Y5QWA6", "GO_id": ["0000287", "0010333", "0016102"]}
+{"protein_id": "A0A1V0QSH2", "GO_id": ["0000287", "0010333", "0016102"]}
+{"protein_id": "A0A0H2URT2", "GO_id": ["0008774", "0004029", "0120542", "0046872", "0006066", "0015976"]}
+{"protein_id": "A3MUS8", "GO_id": ["0005737", "0004067", "0008233", "0006508"]}
+{"protein_id": "Q2U834", "GO_id": ["0030428", "0005886", "0004100", "0071555", "0006031", "0030448"]}
+{"protein_id": "A0A0A2J1Z6", "GO_id": ["0005829", "0070330", "0050660", "0010181", "0020037", "0005506", "0003958", "0043386"]}
+{"protein_id": "F1N206", "GO_id": ["1902493", "0043159", "0160157", "0005759", "0005739", "0031514", "0005634", "0160167", "0045252", "0045254", "0047101", "0004148", "0050660", "0034604", "0006103", "0009083", "0007369", "0006120", "0006508", "0006090", "0042391", "0048240"]}
+{"protein_id": "Q1QYU7", "GO_id": ["0051537", "0005506", "0004497"]}
+{"protein_id": "D4GSD6", "GO_id": ["0042802", "0004454", "0042803", "0006000"]}
+{"protein_id": "A0A5J6BJT1", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "A0A1V0QSG1", "GO_id": ["0009507", "0000287", "0010333", "0016102"]}
+{"protein_id": "C5BK10", "GO_id": ["0005886", "0070566", "0005524", "0016874", "0071766", "0006633"]}
+{"protein_id": "Q8YMD9", "GO_id": ["0016787", "0016829", "0000166"]}
+{"protein_id": "Q9LCC6", "GO_id": ["0005829", "0008797", "0006531", "0006099"]}
+{"protein_id": "Q93F76", "GO_id": ["0008800", "0030655", "0046677"]}
+{"protein_id": "A0A6J1SUS3", "GO_id": ["0061501", "0005524", "0003690", "0005525", "0046872", "0003723", "0051607", "0045087"]}
+{"protein_id": "Q89F89", "GO_id": ["0005737", "0005576", "0009274", "0003884", "0071949", "0019478"]}
+{"protein_id": "Q8DPI6", "GO_id": ["0016779", "0071555", "0070395"]}
+{"protein_id": "Q9AI62", "GO_id": ["0004556", "0009313"]}
+{"protein_id": "A0A1D8PT02", "GO_id": ["0062040", "0016020", "0005886", "0010181", "0003955", "0034599"]}
+{"protein_id": "Q27493", "GO_id": ["0005739", "0005730", "0005736", "0003677", "0003899", "0032549", "0006351"]}
+{"protein_id": "Q47594", "GO_id": ["0016757", "0009243"]}
+{"protein_id": "F4JFR7", "GO_id": ["0005737", "0005634", "0141131", "0046872", "0006281"]}
+{"protein_id": "A0A2I2F2J6", "GO_id": ["0016853", "0009813"]}
+{"protein_id": "A0A0J9VPT0", "GO_id": ["0030428", "0005886", "0004100", "0071555", "0006031"]}
+{"protein_id": "E9EFH8", "GO_id": ["0030428", "0005886", "0004100", "0006031"]}
+{"protein_id": "B0Y6Z7", "GO_id": ["0005789", "0004252", "0016740"]}
+{"protein_id": "A0A0S2GKZ1", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "A0A384JTK5", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "A0A2V0R8Q9", "GO_id": ["0050053", "0009758"]}
+{"protein_id": "O84925", "GO_id": ["0005576", "0016491"]}
+{"protein_id": "Q8ECU3", "GO_id": ["0004497", "0009058"]}
+{"protein_id": "Q5SLF5", "GO_id": ["0016787"]}
+{"protein_id": "Q2G5J4", "GO_id": ["0008726", "0046306"]}
+{"protein_id": "A0A4Y5QVX6", "GO_id": ["0000287", "0010333", "0016102"]}
+{"protein_id": "A0A1V0QSG6", "GO_id": ["0000287", "0010333", "0016102"]}
+{"protein_id": "A0A2V5GUR2", "GO_id": ["0003962", "0016853", "0030170", "0019346"]}
+{"protein_id": "F4KAV2", "GO_id": ["0005737", "0005634", "0141131", "0046872", "0006281", "0010029"]}
+{"protein_id": "P74535", "GO_id": ["0016787", "0016829", "0000166"]}
+{"protein_id": "G2XEK6", "GO_id": ["0030428", "0005886", "0004100", "0071555", "0006031"]}
+{"protein_id": "Q8RQV0", "GO_id": ["0005576", "0030246", "0016757", "0004553", "0005975"]}
+{"protein_id": "A0A101MN42", "GO_id": ["0005829", "0070330", "0050660", "0010181", "0020037", "0005506", "0003958", "0043386"]}
+{"protein_id": "R4LHX8", "GO_id": ["0033846"]}
+{"protein_id": "Q1K7A4", "GO_id": ["0005739", "0003860", "0006574"]}
+{"protein_id": "I3SL57", "GO_id": ["0009507", "0009536", "0004392", "0046872", "0051702", "0042167", "0006788", "0009877", "0015979", "0010024", "0009646", "0010167", "0009609"]}
+{"protein_id": "A8QCV4", "GO_id": ["0005576", "0004806", "0016042"]}
+{"protein_id": "H1AE14", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "G2RGE6", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "Q6YNE9", "GO_id": ["0005576", "0016158", "0003993"]}
+{"protein_id": "A0A1D8PHR5", "GO_id": ["0016020", "0005886", "0010181", "0003955", "0016655", "0034599"]}
+{"protein_id": "Q9HUF5", "GO_id": ["0005829", "0005886", "0008713", "0071968", "0009244"]}
+{"protein_id": "Q9LJH2", "GO_id": ["0005737", "0005634", "0141131", "0046872", "0006281"]}
+{"protein_id": "Q9XDP2", "GO_id": ["0042597", "0004062", "0047686"]}
+{"protein_id": "Q8E0N2", "GO_id": ["0004565", "0004566", "0030246", "0005975", "0019391"]}
+{"protein_id": "Q9RMT4", "GO_id": ["0005576", "0008800", "0030655", "0046677"]}
+{"protein_id": "Q5SKU3", "GO_id": ["0016829", "0006631"]}
+{"protein_id": "A0A0H2ZIF3", "GO_id": ["0051537", "0005506", "0004497"]}
+{"protein_id": "B9SIM3", "GO_id": ["0009507", "0009899", "0000287", "0010333", "0009686"]}
+{"protein_id": "A0A384J8V9", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "P71905", "GO_id": ["0005737", "0005829", "0009274", "0005886", "0046872", "0004540", "0019843", "0006397", "0006364", "0008033"]}
+{"protein_id": "J0MXJ0", "GO_id": ["0004751", "0019316", "0009052"]}
+{"protein_id": "V5TF61", "GO_id": ["0005737", "0046872", "0004659", "0008299"]}
+{"protein_id": "Q7WYA8", "GO_id": ["0042597", "0008800", "0008270", "0017001", "0046677"]}
+{"protein_id": "Q48434", "GO_id": ["0030288", "0008800", "0017001", "0046677"]}
+{"protein_id": "G4NIR3", "GO_id": ["0030428", "0005935", "0005886", "0004100", "0006031", "1902404"]}
+{"protein_id": "Q0SG95", "GO_id": ["0050049", "0000166", "0050175", "0009094"]}
+{"protein_id": "B0Y8Y4", "GO_id": ["0005737", "0005634", "0005524", "0004674", "0007155", "0050684", "0000245"]}
+{"protein_id": "B2B629", "GO_id": ["0005576", "0030248", "0046872", "0004497", "0030245"]}
+{"protein_id": "G2QCJ3", "GO_id": ["0005576", "0030248", "0046872", "0004497", "0030245"]}
+{"protein_id": "Q5AQA6", "GO_id": ["0005576", "0008810", "0030248", "0046872", "0004497", "0030245"]}
+{"protein_id": "Q7SHD9", "GO_id": ["0005576", "0030248", "0046872", "0004497", "0030245"]}
+{"protein_id": "A0A0A2J9B0", "GO_id": ["0008061", "0008843", "0006032", "0000272"]}
+{"protein_id": "G4VJD5", "GO_id": ["0004619", "0061621", "0044542"]}
+{"protein_id": "B0Y8W1", "GO_id": ["0016020", "0004252", "0006508"]}
+{"protein_id": "Q970I2", "GO_id": ["0005524", "0140097", "0003677", "0004386", "0016787", "0006281"]}
+{"protein_id": "Q8I8I2", "GO_id": ["0005737", "0003779", "0004332", "0006096"]}
+{"protein_id": "Q5LIC7", "GO_id": ["0004565", "0005975"]}
+{"protein_id": "P71420", "GO_id": ["0030288", "0008800", "0017001", "0046677"]}
+{"protein_id": "A0A482WD11", "GO_id": ["0061501", "0005524", "0003690", "0005525", "0046872", "0003723", "0051607", "0045087"]}
+{"protein_id": "A2R2G5", "GO_id": ["0030428", "0005935", "0005886", "0004100", "0006031", "1902404"]}
+{"protein_id": "S5S833", "GO_id": ["0005794", "0000139", "0030246", "0030145", "0004653", "0016266", "0018242", "0018243"]}
+{"protein_id": "A0A384JJB6", "GO_id": ["0005576", "0030248", "0046872", "0004497", "0030245"]}
+{"protein_id": "G2RB72", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "Q9LKZ1", "GO_id": ["0005524", "0004707", "0106310", "0009409", "0010225", "0009611"]}
+{"protein_id": "O29889", "GO_id": ["0005524", "0140097", "0003677", "0004386", "0016787", "0006281"]}
+{"protein_id": "V5TDY7", "GO_id": ["0004659"]}
+{"protein_id": "A0A2Z5QL08", "GO_id": ["0042802", "0016210", "0042803", "0009715", "0009813", "0030639"]}
+{"protein_id": "Q54XE5", "GO_id": ["0008721", "0046872", "1990748", "0036088"]}
+{"protein_id": "Q5K9E9", "GO_id": ["0005829", "0005524", "0003921", "0006177"]}
+{"protein_id": "D5SK09", "GO_id": ["0046872", "0010333"]}
+{"protein_id": "A0A1Q3EPF5", "GO_id": ["0005829", "0070012", "0004252", "0006508"]}
+{"protein_id": "A8QCV7", "GO_id": ["0004806", "0016042"]}
+{"protein_id": "G3XAP7", "GO_id": ["0005576", "0008810", "0046872", "0004497", "0030245"]}
+{"protein_id": "J9VRT1", "GO_id": ["0005737", "0016020", "0008445", "0071949", "0019478"]}
+{"protein_id": "A5TY84", "GO_id": ["0005886", "0005524", "0004674", "0080090"]}
+{"protein_id": "A0A7W3N5X5", "GO_id": ["0016832", "0004801", "0005975"]}
+{"protein_id": "Q8ZPP5", "GO_id": ["0005886", "0005524", "0000155"]}
+{"protein_id": "F1QB30", "GO_id": ["0005829", "0005783", "0005789", "0000151", "0043130", "0061630", "0008270", "0030968", "0070936", "0000209", "0034976", "0006511"]}
+{"protein_id": "Q79FW0", "GO_id": ["0005829", "0005886", "0008696", "0030170", "0008483", "0019752", "0046656", "0009252", "0008360", "0046654"]}
+{"protein_id": "A0A0E0RU58", "GO_id": ["0030428", "0005886", "0004100", "0006031"]}
+{"protein_id": "G2WY98", "GO_id": ["0030428", "0045009", "0005886", "0004100", "0030476", "0006031", "0000920"]}
+{"protein_id": "A0A0G4P2K0", "GO_id": ["0005829", "0070330", "0050660", "0010181", "0020037", "0005506", "0003958", "0043386"]}
+{"protein_id": "F1MLX0", "GO_id": ["0005739", "0046872", "0050163", "0006107"]}
+{"protein_id": "Q8RWX4", "GO_id": ["0009507", "0009941", "0009707", "0005886", "0009536", "0010006", "0005524", "0004672", "0004674", "0009704", "1904216", "0045036"]}
+{"protein_id": "D9N4H4", "GO_id": ["0005576", "0004806", "0016042"]}
+{"protein_id": "Q59127", "GO_id": ["0005737", "0071949", "0018535", "0009820", "0019608"]}
+{"protein_id": "Q0HWI9", "GO_id": ["0016881", "0005524", "0019290"]}
+{"protein_id": "Q50LF1", "GO_id": ["0005737", "0046872", "0008115", "0046653"]}
+{"protein_id": "Q47593", "GO_id": ["0005886", "0016757", "0009243"]}
+{"protein_id": "A0A2Z5GDY5", "GO_id": ["0005737", "0005783", "0019897", "0005634", "0046872", "0070006", "0009877", "0006508", "0009609"]}
+{"protein_id": "Q9SSA4", "GO_id": ["0005737", "0005829", "0005524", "0004674", "0004712", "0004713", "0071244", "1902456", "0009637", "0010114", "0001659"]}
+{"protein_id": "A0A0E0S977", "GO_id": ["0030428", "0005886", "0004100", "0006031"]}
+{"protein_id": "A0A0E3JXD9", "GO_id": ["0005737", "0005576", "0009274", "0003884", "0009063"]}
+{"protein_id": "Q2G5J3", "GO_id": ["0005829", "0016491"]}
+{"protein_id": "A0A0H3GN27", "GO_id": ["0009986", "0005576", "0000015", "0000287", "0004634", "0006096"]}
+{"protein_id": "W0W999", "GO_id": ["0033846", "0016787"]}
+{"protein_id": "A0A803NI27", "GO_id": ["0031969", "0005789", "0005778", "0004420", "0015936", "0008299", "0016126"]}
+{"protein_id": "B9SIM2", "GO_id": ["0009507", "0009899", "0000287", "0010333", "0009686"]}
+{"protein_id": "A0A218MJF1", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "D5FKJ2", "GO_id": ["0030170", "0008483", "0006520", "0009058"]}
+{"protein_id": "Q9YA51", "GO_id": ["0051539", "0016829", "0046872", "0008299"]}
+{"protein_id": "Q6PBS3", "GO_id": ["0005737", "0005524", "0004830", "0006436"]}
+{"protein_id": "Q8PWQ0", "GO_id": ["0005737", "0018799", "0046872", "0008299"]}
+{"protein_id": "Q9KJY7", "GO_id": ["0008800", "0030655", "0046677"]}
+{"protein_id": "A0A1S4F2V5", "GO_id": ["0008234", "0006508"]}
+{"protein_id": "A0A6I8RMG7", "GO_id": ["0016020", "0005509", "0016853"]}
+{"protein_id": "Q79F72", "GO_id": ["0016020", "0016491", "0006636"]}
+{"protein_id": "B9LS20", "GO_id": ["0005829", "0047975", "0017061", "0009164"]}
+{"protein_id": "G2RGE5", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "A8NCG7", "GO_id": ["0005576", "0030248", "0046872", "0004497", "0030245"]}
+{"protein_id": "Q93NH5", "GO_id": ["0005737", "0005506", "0018535", "0009820", "0019608"]}
+{"protein_id": "A5U3A3", "GO_id": ["0005886", "0005524", "0004674"]}
+{"protein_id": "D2RW80", "GO_id": ["0043917", "0046523", "0019509", "0019323"]}
+{"protein_id": "C5BK13", "GO_id": ["0016020", "0016717", "0006631"]}
+{"protein_id": "M1T1K4", "GO_id": ["0030170", "0008483", "0006520", "0009058"]}
+{"protein_id": "A0A095C6S0", "GO_id": ["0005782", "0003884", "0071949", "1990748", "0019478", "0019740"]}
+{"protein_id": "S5SC42", "GO_id": ["0016491", "0019380"]}
+{"protein_id": "A0A1V0QSG0", "GO_id": ["0000287", "0010333", "0016102"]}
+{"protein_id": "Q50248", "GO_id": ["0044569", "0009375", "0005886", "0051538", "0051539", "0009055", "0008901", "0046872", "0015948"]}
+{"protein_id": "B5T072", "GO_id": ["0016757", "0009243"]}
+{"protein_id": "A0A2R2JFW7", "GO_id": ["0005829", "0005634", "0004053", "0030145", "0019547", "0000050"]}
+{"protein_id": "O53044", "GO_id": ["0030288", "0008800", "0017001", "0046677"]}
+{"protein_id": "E9ECB5", "GO_id": ["0030428", "0005935", "0045009", "0000131", "0005886", "0005628", "0004100", "0030476", "0006031", "0097271"]}
+{"protein_id": "D9N4H3", "GO_id": ["0005576", "0004806", "0016042"]}
+{"protein_id": "Q7MTZ8", "GO_id": ["0005737", "0005886", "0046872", "0008758", "0009245"]}
+{"protein_id": "Q9VQQ0", "GO_id": ["0005737", "0005634", "0000159", "0005524", "0046872", "0003755", "0008160", "0045175", "0007052", "1904785"]}
+{"protein_id": "D5E1T2", "GO_id": ["0016832", "0004801", "0005975"]}
+{"protein_id": "D5E1S9", "GO_id": ["0005737", "0008736", "0006004"]}
+{"protein_id": "Q50LF0", "GO_id": ["0005737", "0000166", "0008115", "0046653"]}
+{"protein_id": "M1T9N7", "GO_id": ["0008168", "0017000", "0032259"]}
+{"protein_id": "G9MLG2", "GO_id": ["0005829", "0070330", "0050660", "0010181", "0020037", "0005506", "0003958"]}
+{"protein_id": "E9QSG0", "GO_id": ["0044695", "0012505", "0005789", "0061630", "0008270", "0043161", "0016567", "0051603"]}
+{"protein_id": "Q4J8L0", "GO_id": ["0008821", "0003677", "0000287", "0006310", "0006281"]}
+{"protein_id": "E1CJK1", "GO_id": ["0005576", "0004806", "0016042"]}
+{"protein_id": "D9J041", "GO_id": ["0050660", "0016152", "0045340", "0003955", "0050661", "0016668", "0050787"]}
+{"protein_id": "Q6D291", "GO_id": ["0042597", "0016787", "0046677"]}
+{"protein_id": "A0A0X1KHF9", "GO_id": ["0005524", "0016301", "0046677"]}
+{"protein_id": "Q8NQU4", "GO_id": ["0005886", "0015424", "0005524", "0016887"]}
+{"protein_id": "G7XMT1", "GO_id": ["0005829", "0070330", "0050660", "0010181", "0020037", "0005506", "0003958"]}
+{"protein_id": "A0A2V0R8Y2", "GO_id": ["0050053", "0009758"]}
+{"protein_id": "U6BYK3", "GO_id": ["0009507", "0047804", "0030170", "0009086", "0019346"]}
+{"protein_id": "A2VD68", "GO_id": ["0036064", "0005737", "0005741", "0004550", "0006241", "0006281", "0006183", "0008053", "0009142", "0006228"]}
+{"protein_id": "A0A7E6FSU6", "GO_id": ["0005782", "0008445", "0071949", "0019478"]}
+{"protein_id": "O53443", "GO_id": ["0005829", "0005886", "0005524", "0005525", "0016853", "0046872", "0004518"]}
+{"protein_id": "Q5JIW4", "GO_id": ["0004067", "0006520"]}
+{"protein_id": "V4HM83", "GO_id": ["0050660", "0004499", "0050661"]}
+{"protein_id": "I1S0T9", "GO_id": ["0030428", "0005886", "0004100", "0071555", "0006031"]}
+{"protein_id": "A9FDB7", "GO_id": ["0020037", "0005506", "0004497", "0016705"]}
+{"protein_id": "Q79EF1", "GO_id": ["0016020", "0016491", "0006636"]}
+{"protein_id": "A0A0H2ZM62", "GO_id": ["0005886", "0005524", "0000155"]}
+{"protein_id": "E8WYN5", "GO_id": ["0016829", "0046872"]}
+{"protein_id": "J9VPE7", "GO_id": ["0005782", "0003884", "0071949", "0019478", "0098754"]}
+{"protein_id": "A0A143ZZK9", "GO_id": ["0005886", "0005524", "0016887", "0008556", "0008554", "0006874", "0034220", "0006813", "0006814"]}
+{"protein_id": "H6C2T9", "GO_id": ["0030428", "0005886", "0004100", "0071555", "0006031"]}
+{"protein_id": "A0A7C9FSB8", "GO_id": ["0009507", "0004452", "0046872", "0015995", "0050992", "0009240"]}
+{"protein_id": "Q5XTQ4", "GO_id": ["0005576", "0106435", "0016042"]}
+{"protein_id": "A0A5N6V703", "GO_id": ["0005576", "0008810", "0030248", "0046872", "0004497", "0030245"]}
+{"protein_id": "A0A0J9XBC9", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "Q7S9V5", "GO_id": ["0005576", "0016158", "0003993"]}
+{"protein_id": "Q59Y37", "GO_id": ["0005576", "0062040", "0030446", "0016020", "0005886", "0050625", "0010181", "0003955", "0034599"]}
+{"protein_id": "D5E1S7", "GO_id": ["0008911"]}
+{"protein_id": "F8VPU6", "GO_id": ["0004843", "0046872", "0016579", "0016567", "0006508"]}
+{"protein_id": "A0A0H2ZNH9", "GO_id": ["0005886", "0004721", "0000155", "0004673", "0016036", "0006355", "0007165"]}
+{"protein_id": "Q8EG04", "GO_id": ["0019003", "0005525", "0003924"]}
+{"protein_id": "A0A1G9UQQ0", "GO_id": ["0046872", "0010333"]}
+{"protein_id": "Q55406", "GO_id": ["0031676", "0016717", "0006636"]}
+{"protein_id": "A0A100YXA1", "GO_id": ["0005829", "0016829", "0016740"]}
+{"protein_id": "Q79FC3", "GO_id": ["0005576", "0033650", "0005886", "0016491"]}
+{"protein_id": "Q5JGJ6", "GO_id": ["0051539", "0016829", "0046872", "0008299"]}
+{"protein_id": "Q4X241", "GO_id": ["0005886", "0019003", "0005525", "0003924", "0030448", "0007165"]}
+{"protein_id": "A0A2P6MHU9", "GO_id": ["0005737", "0008736", "0006004"]}
+{"protein_id": "M4QN28", "GO_id": ["0005886", "0016757", "0009243"]}
+{"protein_id": "E2RTQ7", "GO_id": ["0005813", "0032133", "0005737", "0097568", "0031514", "0031965", "0005634", "0005819", "0005876", "0051233", "0000922", "0097597", "0005524", "0008092", "0004674", "1902850", "0000281", "0140014", "0007052", "0051726", "0032465"]}
+{"protein_id": "A0A0M3VI47", "GO_id": ["0009507", "0047804", "0030170", "0071266", "0019346"]}
+{"protein_id": "G4Z2L3", "GO_id": ["0005886", "0004100", "0071555", "0006031"]}
+{"protein_id": "A0A0H2ZJB2", "GO_id": ["0051537", "0046872", "0004497"]}
+{"protein_id": "Q2U6T7", "GO_id": ["0005886", "0000293", "0046872", "0015677", "0006879", "0006826"]}
+{"protein_id": "Q8DP70", "GO_id": ["0005576", "0016491"]}
+{"protein_id": "A5U8X0", "GO_id": ["0003677", "0003917", "0046872", "0006265"]}
+{"protein_id": "A0A1V0QSF3", "GO_id": ["0000287", "0010333", "0016102"]}
+{"protein_id": "F1SVF8", "GO_id": ["0005886", "0008901", "0051911", "0016151", "0015948"]}
+{"protein_id": "M9PGC5", "GO_id": ["0005737", "0005524", "0140693", "0019870", "0004672", "0106310", "0004674", "0006884", "0071474", "0035556", "0140694", "0050801", "0010766", "0090263", "1903288", "0035220"]}
+{"protein_id": "Q93Z30", "GO_id": ["0005737", "0005829", "0005634", "0005524", "0004672", "0004712", "0007229", "0010119", "0009637"]}
+{"protein_id": "Q89Y83", "GO_id": ["0005524", "0005525", "0046872", "0016779", "0051607", "0009117"]}
+{"protein_id": "Q5F8J4", "GO_id": ["0004412", "0046872", "0070403", "0050661", "0009086", "0009088"]}
+{"protein_id": "D0LZ73", "GO_id": ["0140737", "0004322", "0046872", "0006879", "0006826"]}
+{"protein_id": "A0A0M0ELU2", "GO_id": ["0005737", "0016787", "0004664", "0009094", "0009372"]}
+{"protein_id": "Q8YT18", "GO_id": ["0016829"]}
+{"protein_id": "A0A1D8PU51", "GO_id": ["0016020", "0005739", "0051539", "0046872", "0008137", "0042773"]}
+{"protein_id": "I6XFS7", "GO_id": ["0005576", "0030430", "0044196", "0008168", "0003676", "0031167"]}
+{"protein_id": "B6QEB3", "GO_id": ["0005737", "0005507", "0004784"]}
+{"protein_id": "G3X8Y1", "GO_id": ["0005737", "0005634", "0042802", "0030674", "0061630", "0008270", "0050904", "0002523", "1905517", "1901224", "0070936"]}
+{"protein_id": "Q2U0G5", "GO_id": ["0030428", "0045009", "0005886", "0004100", "0030476", "0006031", "0000920"]}
+{"protein_id": "A0A5A4WIX0", "GO_id": ["0004450", "0000287", "0051287", "0006097", "0006099"]}
+{"protein_id": "A0A482A9N4", "GO_id": ["0005576", "0004497", "0030245"]}
+{"protein_id": "Q5AZ52", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "A0A5N6UNY1", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "C8CP46", "GO_id": ["0005576", "0016787"]}
+{"protein_id": "E7F654", "GO_id": ["0005730", "0005634", "0005524", "0051731", "0003723", "0000448", "0060216", "0048565", "0031017", "0001889", "0061113", "0006364"]}
+{"protein_id": "A0A2P6MHT4", "GO_id": ["0003872", "0005524", "0046872", "0006002"]}
+{"protein_id": "A0A4Y5QWK5", "GO_id": ["0009507", "0000287", "0010333", "0016102"]}
+{"protein_id": "E9P162", "GO_id": ["0004497", "0000166"]}
+{"protein_id": "Q09HD0", "GO_id": ["0005576", "0008800", "0030655", "0046677"]}
+{"protein_id": "G4YM00", "GO_id": ["0005886", "0004100", "0071555", "0006031"]}
+{"protein_id": "Q0D1W9", "GO_id": ["0005829", "0070330", "0050660", "0010181", "0020037", "0005506", "0003958"]}
+{"protein_id": "Q4D5J7", "GO_id": ["0005789", "0009922", "0034625", "0034626", "0019367", "0030148", "0042761"]}
+{"protein_id": "Q4WW94", "GO_id": ["0005634", "0005524", "0004674", "0007155", "0043484"]}
+{"protein_id": "A0A7J6H013", "GO_id": ["0031969", "0005789", "0005778", "0004420", "0015936", "0008299", "0016126"]}
+{"protein_id": "D8Q364", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "F8G0P2", "GO_id": ["0042597", "0000166", "0016491", "0009820", "0019608"]}
+{"protein_id": "D9XF46", "GO_id": ["0047456", "0051539", "0003994", "0046872", "0017000", "0006099"]}
+{"protein_id": "A1XLE2", "GO_id": ["0005829", "0005634", "0030234", "0016829", "0046872", "0019760", "0080028", "0018969"]}
+{"protein_id": "A0A1V0QSH1", "GO_id": ["0000287", "0010333", "0016102"]}
+{"protein_id": "Q9VQ29", "GO_id": ["0005743", "0005739", "0045275", "0051537", "0046872", "0016491", "0008121", "0006122"]}
+{"protein_id": "Q0P9A8", "GO_id": ["0005829", "0008713", "0009244"]}
+{"protein_id": "V4HJ70", "GO_id": ["0071949", "0016491"]}
+{"protein_id": "H6CA42", "GO_id": ["0030428", "0005886", "0004100", "0006031"]}
+{"protein_id": "A2QNG6", "GO_id": ["0030428", "0045009", "0005886", "0004100", "0030476", "0006031", "0000920"]}
+{"protein_id": "Q1AYM8", "GO_id": ["0005737", "0005576", "0009274", "0003884", "0071949", "0019478"]}
+{"protein_id": "F9VNG5", "GO_id": ["0004412", "0046872", "0050661", "0009086", "0009088"]}
+{"protein_id": "Q9VDA0", "GO_id": ["0005634", "0043138", "0005524", "0000400", "0009378", "0016787", "0006281", "0006302", "0045003", "0035825", "0036297", "0045950", "0000725"]}
+{"protein_id": "A0A1V0QSH7", "GO_id": ["0005737", "0005634", "0004337", "0004161", "0046872", "0006695", "0045337"]}
+{"protein_id": "Q6E7K9", "GO_id": ["0005886", "0070566", "0005524", "0016874", "0071766", "0006633"]}
+{"protein_id": "S7QP81", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "F0NID4", "GO_id": ["0005524", "0003677", "0009378", "0016787", "0006310", "0006281"]}
+{"protein_id": "Q9ZIT6", "GO_id": ["0005886", "0008918", "0046872", "0009244"]}
+{"protein_id": "A0A5R8T042", "GO_id": ["0005576", "0008800", "0030655", "0046677"]}
+{"protein_id": "G4N553", "GO_id": ["0071944", "0030428", "0016020", "0004100", "0006031"]}
+{"protein_id": "A0A3S5XFG0", "GO_id": ["0046872", "0004659", "0046165", "0008299", "0043386"]}
+{"protein_id": "Q79F73", "GO_id": ["0016020", "0016717", "0006636"]}
+{"protein_id": "Q4WQ60", "GO_id": ["0005789", "0016740"]}
+{"protein_id": "Q5U921", "GO_id": ["0016740"]}
+{"protein_id": "O50131", "GO_id": ["0042802", "0030170", "0008483"]}
+{"protein_id": "Q88JH5", "GO_id": ["0016020", "0030288", "0052934", "0005509"]}
+{"protein_id": "J1H1J3", "GO_id": ["0004022", "0046872"]}
+{"protein_id": "V5XKC3", "GO_id": ["0005737", "0140618", "0010106"]}
+{"protein_id": "F2K079", "GO_id": ["0050660", "0004499", "0050661"]}
+{"protein_id": "A0A0H3GPN8", "GO_id": ["0005886", "0005524", "0000155", "0007155"]}
+{"protein_id": "Q8E8S0", "GO_id": ["0005886", "0009055", "0020037", "0046872", "0009061", "0019333"]}
+{"protein_id": "Q7X2D3", "GO_id": ["0005737", "0005576", "0009274", "0003884", "0071949", "0043799", "0019478"]}
+{"protein_id": "B9SIL7", "GO_id": ["0009507", "0009899", "0034280", "0000287", "0010333", "0009686"]}
+{"protein_id": "A0A1C9ZMC3", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "A0A1D8PS71", "GO_id": ["0009986", "0030287", "0005576", "0009277", "0008707", "0003993", "0030448"]}
+{"protein_id": "B9CM12", "GO_id": ["0046872", "0016779", "0032923"]}
+{"protein_id": "Q7CS24", "GO_id": ["0010181", "0004497", "0042602"]}
+{"protein_id": "Q4X1A8", "GO_id": ["0000785", "0005737", "0005634", "0005524", "0106310", "0004674", "1903940", "1900237", "0045944"]}
+{"protein_id": "A0A2C9JXL4", "GO_id": ["0016020", "0016263", "0030145", "0000166", "0016267", "0006486"]}
+{"protein_id": "A0A0J9VGQ5", "GO_id": ["0030428", "0005886", "0004100", "0006031"]}
+{"protein_id": "A0A0C3HJL3", "GO_id": ["0005829", "0070330", "0050660", "0010181", "0020037", "0005506", "0003958"]}
+{"protein_id": "Q704F0", "GO_id": ["0016020", "0016491", "0006636"]}
+{"protein_id": "A0A1L7VFX3", "GO_id": ["0004337", "0004161", "0016829", "0046872", "0046165", "0008299", "0043386"]}
+{"protein_id": "A0A5J6BJT3", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "A0A384JJE6", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "G2QA92", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "C4R4G9", "GO_id": ["0005782", "0005777", "0003884", "0071949", "0046436", "0019478", "0046416", "1902114", "0019740"]}
+{"protein_id": "A0A1D8PJX3", "GO_id": ["0005743", "0005886", "0045275", "0051537", "0046872", "0016491", "0008121", "0006122"]}
+{"protein_id": "A0A1D1VU85", "GO_id": ["0005507", "0042802", "0042803", "0004784", "0008270", "0019430"]}
+{"protein_id": "B8G5D6", "GO_id": ["0010181", "0050661", "0003959"]}
+{"protein_id": "Q39QF4", "GO_id": ["0003995", "0050660"]}
+{"protein_id": "H6C7U6", "GO_id": ["0030428", "0005886", "0004100", "0006031"]}
+{"protein_id": "Q8E8P8", "GO_id": ["0005886", "0046872", "0016653", "0006784"]}
+{"protein_id": "E4Q361", "GO_id": ["0005829", "0004565", "0008422", "0016162", "0031217", "0009044", "0030245", "0009251", "0005990", "0070207", "0045493"]}
+{"protein_id": "J9VUY6", "GO_id": ["0005829", "0005524", "0003922", "0003921", "0006177"]}
+{"protein_id": "A0A223GEC9", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "A0A5J6BJN2", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "A0A1D8PJ73", "GO_id": ["0016020", "0005743", "0005886", "0045271", "0008137"]}
+{"protein_id": "D3ZDM7", "GO_id": ["0005737", "0005829", "0005782", "0005777", "0008445", "0071949", "0006531", "0019478", "0007625", "0042445", "0007320", "0050877", "0010646"]}
+{"protein_id": "Q81BR3", "GO_id": ["0005524", "0008986", "0046677"]}
+{"protein_id": "A0A2P6MHY1", "GO_id": ["0016832", "0008270", "0005975"]}
+{"protein_id": "Q55131", "GO_id": ["0016491", "0008295"]}
+{"protein_id": "A0A1C3YMT2", "GO_id": ["0030428", "0005886", "0004100", "0071555", "0006031"]}
+{"protein_id": "A2QHC2", "GO_id": ["0030428", "0005886", "0004100", "0071555", "0006031"]}
+{"protein_id": "Q1QYW1", "GO_id": ["0005737", "0010181", "0016491"]}
+{"protein_id": "Q55231", "GO_id": ["0016020", "0016491", "0006636"]}
+{"protein_id": "Q9HTF3", "GO_id": ["0051537", "0046872", "0004497", "0016491", "0031457"]}
+{"protein_id": "Q8DN03", "GO_id": ["0005886", "0005524", "0000155"]}
+{"protein_id": "A0A0J9XL55", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "G1FNI6", "GO_id": ["0005829", "0005634", "0030234", "0042802", "0016829", "0046872", "0042803", "0019760", "0080028"]}
+{"protein_id": "A0A2U7R6V5", "GO_id": ["0005576", "0031176", "0045493"]}
+{"protein_id": "Q7SH52", "GO_id": ["0005951", "0005737", "0005759", "0005524", "0004087", "0004088", "0046872", "0006526", "0006221"]}
+{"protein_id": "R0IGL9", "GO_id": ["0005829", "0070330", "0052722", "0050660", "0010181", "0020037", "0005506", "0003958"]}
+{"protein_id": "Q55230", "GO_id": ["0016020", "0016491", "0006636"]}
+{"protein_id": "B2B5J7", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "Q9LKZ2", "GO_id": ["0005524", "0004707", "0010225", "0009611"]}
+{"protein_id": "Q4J8K8", "GO_id": ["0005524", "0016887", "0003677", "0006310", "0006281"]}
+{"protein_id": "A0A3Q0KHE7", "GO_id": ["0005737", "0005634", "0070611", "0042054", "0035242", "1904047", "0006338", "0032259", "0006355", "0048608", "0046500"]}
+{"protein_id": "Q66KP0", "GO_id": ["0036064", "0005737", "0005741", "0004550", "0006241", "0006281", "0006183", "0008053", "0009142", "0006228"]}
+{"protein_id": "C4R6B0", "GO_id": ["0005782", "0008445", "0071949", "0019478", "0046416", "0019740"]}
+{"protein_id": "F8G0P1", "GO_id": ["0042597", "0016491", "0009820", "0019608"]}
+{"protein_id": "J1H1H5", "GO_id": ["0005737", "0008736", "0005996"]}
+{"protein_id": "A0A1V0QSF4", "GO_id": ["0009507", "0000287", "0010333", "0016102"]}
+{"protein_id": "Q9FGB1", "GO_id": ["0005737", "0005829", "0005886", "0009506", "0009536", "0005524", "0004674", "0004712", "0004713", "0071244", "1902456", "0009637", "0010114", "0001659"]}
+{"protein_id": "A5U3S4", "GO_id": ["0005737", "0005576", "0009274", "0003884", "0071949", "0009060", "0019478", "0006546"]}
+{"protein_id": "I1RIF1", "GO_id": ["0005576", "0016787", "0016042"]}
+{"protein_id": "Q7KRU8", "GO_id": ["0005737", "0005576", "0070288", "0045169", "0005794", "0008199", "0008198", "0004322", "0008283", "1990461", "0006879", "0098711", "0009791", "0009620", "0030431"]}
+{"protein_id": "Q1QYU6", "GO_id": ["0051537", "0046872", "0004497"]}
+{"protein_id": "G1SPE9", "GO_id": ["0030054", "0031966", "0005778", "0005777", "0004366", "0016287", "0021587", "0008611", "0006631", "0006650", "0061024", "0030913", "0008654", "0007416", "0019432"]}
+{"protein_id": "A0A5N6UX39", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "A0A1Y2IY60", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "A0A509AF62", "GO_id": ["0020011", "0016740"]}
+{"protein_id": "Q8VQ46", "GO_id": ["0016020", "0008107", "0036065", "0009243", "0006486"]}
+{"protein_id": "A5HKP3", "GO_id": ["0016787", "0006629"]}
+{"protein_id": "C1G1C3", "GO_id": ["0004412", "0046872", "0050661", "0009090", "0009097", "0009086", "0009088"]}
+{"protein_id": "A0A1G9FQX8", "GO_id": ["0033846"]}
+{"protein_id": "Q5U922", "GO_id": ["0008720", "0051287"]}
+{"protein_id": "S7QKE2", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "Q50LF2", "GO_id": ["0005737", "0000166", "0008115", "0046653"]}
+{"protein_id": "Q50227", "GO_id": ["0005886", "0046872", "0016491", "0015948", "0022904"]}
+{"protein_id": "Q58YW1", "GO_id": ["0005886", "0016740", "0009103"]}
+{"protein_id": "A0A0M2HFA3", "GO_id": ["0050660", "0016614"]}
+{"protein_id": "I1S097", "GO_id": ["0030428", "0005886", "0004100", "0071555", "0006031"]}
+{"protein_id": "W8X9R6", "GO_id": ["0005737", "0016491"]}
+{"protein_id": "A0A6C0PI29", "GO_id": ["0005576", "0030600", "0017000", "0006629", "0072330", "0045493"]}
+{"protein_id": "A0A7J6F8C5", "GO_id": ["0009570", "0051539", "0051745", "0046872", "0050992", "0019288"]}
+{"protein_id": "A0A5A4WIZ7", "GO_id": ["0004450", "0046872", "0006097", "0006099"]}
+{"protein_id": "F1QWW8", "GO_id": ["0005789", "0047560", "0070402", "0006666", "0090156", "0030220", "0030148", "0090520", "0006686"]}
+{"protein_id": "A0A6C0M6J9", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "A0A5J6BJQ5", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "Q504M2", "GO_id": ["0005739", "0004741", "0046872", "0042093"]}
+{"protein_id": "A0A2K8FTN3", "GO_id": ["0020011", "0016740"]}
+{"protein_id": "A1E280", "GO_id": ["0004497", "0000166"]}
+{"protein_id": "A0A1D8PDV7", "GO_id": ["0005737", "0005768", "0010008", "0000329", "0005794", "0016020", "0071561", "0005777", "0000407", "0034271", "0034272", "0016303", "0005524", "0004672", "0032120", "0000045", "0006914", "0071470", "0051365", "0009267", "0006897", "0030447", "0036180", "0036170", "0000280", "0030473", "0000425", "0046854", "0036092", "0048015", "0032968", "0006624", "0007034", "0007033", "0016192"]}
+{"protein_id": "Q971U1", "GO_id": ["0005524", "0140097", "0003677", "0004386", "0016787"]}
+{"protein_id": "Q39QF5", "GO_id": ["0003995", "0050660"]}
+{"protein_id": "A3SI50", "GO_id": ["0050660", "0016627"]}
+{"protein_id": "A0A0H2ZLL3", "GO_id": ["0005886", "0005524", "0016887", "0006865", "0015697"]}
+{"protein_id": "B2ADG1", "GO_id": ["0005576", "0030248", "0046872", "0004497", "0030245"]}
+{"protein_id": "Q73MU2", "GO_id": ["0047304", "0046872", "0016779", "0032923", "0019700"]}
+{"protein_id": "A5PJW8", "GO_id": ["0000781", "0005739", "0005665", "0003682", "0003677", "0003899", "0071667", "0016787", "0032549", "0003968", "0008270", "0006366"]}
+{"protein_id": "F8VPZ3", "GO_id": ["0005794", "0000139", "0005509", "0004843", "1904263", "0016579", "0006508"]}
+{"protein_id": "G2WS43", "GO_id": ["0030428", "0005886", "0004100", "0006031"]}
+{"protein_id": "A5U6Z7", "GO_id": ["0005829", "0009295", "0003677", "0016491", "0030527", "0030261", "0006879"]}
+{"protein_id": "A9FRJ0", "GO_id": ["0000166", "0016491"]}
+{"protein_id": "F0NEL5", "GO_id": ["0003677", "0004519", "0046872", "0006310", "0006281"]}
+{"protein_id": "A8Q9M3", "GO_id": ["0005576", "0004806", "0016042"]}
+{"protein_id": "I1REU9", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "J9VH79", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "A0A0H2UQZ4", "GO_id": ["0005576", "0016491"]}
+{"protein_id": "B8Y445", "GO_id": ["0051539", "0046872", "0016491", "0015948"]}
+{"protein_id": "A0A7J6HK32", "GO_id": ["0000287", "0010333", "0016102"]}
+{"protein_id": "E9EDR6", "GO_id": ["0030428", "0045009", "0005886", "0004100", "0030476", "0006031", "0000920"]}
+{"protein_id": "A5U229", "GO_id": ["0051539", "0005524", "0003677", "0003678", "0016818", "0046872", "0006310", "0006281"]}
+{"protein_id": "A0A7J6G7L9", "GO_id": ["0009570", "0030604", "0030145", "0070402", "0051484"]}
+{"protein_id": "A9ES55", "GO_id": ["0004324", "0000166", "0034599", "0042167"]}
+{"protein_id": "A8QCW4", "GO_id": ["0005576", "0004806", "0016042"]}
+{"protein_id": "A0A5J6BJN5", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "C8V4I9", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "Q71RT3", "GO_id": ["0005576", "0050053", "0009758"]}
+{"protein_id": "A0A1D8PT03", "GO_id": ["0005737", "0032126", "0062040", "0016020", "0005886", "0010181", "0003955", "0034599", "0160020"]}
+{"protein_id": "G4MVT6", "GO_id": ["0030428", "0005886", "0004100", "0071555", "0006031"]}
+{"protein_id": "W8X092", "GO_id": ["0005737", "0016491"]}
+{"protein_id": "B0XQY0", "GO_id": ["0044695", "0005789", "0061630", "0008270", "0043161", "0016567"]}
+{"protein_id": "B7JA34", "GO_id": ["0016020", "0000166", "0016491", "0009245"]}
+{"protein_id": "Q7RWN7", "GO_id": ["0005576", "0030248", "0046872", "0004497", "0030245"]}
+{"protein_id": "A0R2V5", "GO_id": ["0005829", "0005524", "0005525", "0016853", "0046872", "0004518"]}
+{"protein_id": "Q8IDQ9", "GO_id": ["0005794", "0000139", "0000234", "0032259", "0006656"]}
+{"protein_id": "A0A0F7G352", "GO_id": ["0009507", "0062116", "0008234", "0050547", "0042802", "0042803", "0009699", "0006508", "0042189"]}
+{"protein_id": "Q54129", "GO_id": ["0016757", "0009243", "0044010"]}
+{"protein_id": "A0A2U7QU15", "GO_id": ["0005576", "0030248", "0031176", "0045493"]}
+{"protein_id": "I1BJ58", "GO_id": ["0030428", "0005886", "0004100", "0071555", "0006031"]}
+{"protein_id": "A2QRA0", "GO_id": ["0030428", "0005886", "0004100", "0006031"]}
+{"protein_id": "Q54795", "GO_id": ["0031676", "0016717", "0006636"]}
+{"protein_id": "Q8T913", "GO_id": ["0043240", "0005634", "0061630", "0008270", "0006281", "0036297", "0006513"]}
+{"protein_id": "A0A7J6HWR9", "GO_id": ["0005737", "0005634", "0004337", "0004161", "0046872", "0006695", "0045337"]}
+{"protein_id": "B9T625", "GO_id": ["0009507", "0009899", "0000287", "0010333", "0009686"]}
+{"protein_id": "S7RK00", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "Q59128", "GO_id": ["0005737", "0051537", "0046872", "0018535", "0009820", "0019608"]}
+{"protein_id": "T2HG31", "GO_id": ["0005782", "0003884", "0071949", "0043799", "0019478", "0048599", "1905939"]}
+{"protein_id": "G1TNM3", "GO_id": ["0022626", "0022627", "0005743", "0005730", "0005819", "0140078", "0003677", "0003723", "0003735", "0006915", "0051301", "0006281", "2001235", "0006417", "0006412"]}
+{"protein_id": "Q50249", "GO_id": ["0005886", "0008901", "0016151", "0015948"]}
+{"protein_id": "H6C4I7", "GO_id": ["0030428", "0005886", "0004100", "0071555", "0006031"]}
+{"protein_id": "O58802", "GO_id": ["0004412", "0046872", "0070403", "0050661", "0009086", "0009088"]}
+{"protein_id": "Q1PX48", "GO_id": ["0044222", "0020037", "0033740", "0042802", "0046872", "0019331", "0006809", "0070207"]}
+{"protein_id": "A0A5N6V3W5", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "B2ARG6", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "Q8IJK4", "GO_id": ["0020011", "0016020", "0005524", "0004808", "0000049", "0103016", "0008033"]}
+{"protein_id": "C0PBF8", "GO_id": ["0005737", "0003884", "0008445", "0019740"]}
+{"protein_id": "Q9SAJ2", "GO_id": ["0005737", "0005524", "0004674", "0004712", "0009734", "0071365", "0010928", "0009733", "0007165"]}
+{"protein_id": "A0A7W3RCJ3", "GO_id": ["0008911"]}
+{"protein_id": "E9E1W3", "GO_id": ["0030428", "0005886", "0004100", "0071555", "0006031"]}
+{"protein_id": "A0A803PDZ0", "GO_id": ["0009507", "0008661", "0046872", "0052865", "0019682", "0016114", "0009228"]}
+{"protein_id": "Q4W1X2", "GO_id": ["0000166", "0008767", "0071555"]}
+{"protein_id": "A0A1C9ZP88", "GO_id": ["0005886", "0098552", "0046872", "0004497", "0030245"]}
+{"protein_id": "A0A095CCB2", "GO_id": ["0005782", "0003884", "0071949", "0019478", "0098754"]}
+{"protein_id": "Q8VYX1", "GO_id": ["0008170", "0000234", "0032259", "0006656"]}
+{"protein_id": "A8BPK8", "GO_id": ["0005930", "0036064", "0097542", "1902671", "0097554", "1902677", "0097560", "1902673", "0097556", "1902675", "0097558", "0097568", "0016020", "0072686", "0005634", "1902672", "0097555", "1902678", "0097561", "1902674", "0097557", "1902676", "0097559", "0097597", "0005524", "0004672", "0004674", "0000278", "1902410", "0140014", "1905504", "1901978", "0031114"]}
+{"protein_id": "O64768", "GO_id": ["0005737", "0005524", "0004674", "0004712", "0009734", "0071365", "0010928", "0009733", "0007165"]}
+{"protein_id": "A0A1S3Z5Y0", "GO_id": ["0005737", "0005634", "0005524", "0004707", "0106310", "0004674", "0004713", "0006952", "0035556"]}
+{"protein_id": "A0A1D8PJA8", "GO_id": ["0009986", "0005576", "0009277", "0030446", "0016798", "0071555"]}
+{"protein_id": "G3XCW3", "GO_id": ["0009276", "0016020", "0098567", "0005886", "0016757", "0008755", "0043165", "0009103", "0009243"]}
+{"protein_id": "G4N1B2", "GO_id": ["0030428", "0045009", "0005886", "0004100", "0030476", "0006031", "0000920"]}
+{"protein_id": "A2QLV1", "GO_id": ["0005829", "0070330", "0050660", "0010181", "0020037", "0005506", "0003958"]}
+{"protein_id": "Q4WSZ0", "GO_id": ["0005789", "0047560", "0070402", "0006666", "0030148"]}
+{"protein_id": "Q2VG90", "GO_id": ["0005576", "0004806", "0016042"]}
+{"protein_id": "A8PX35", "GO_id": ["0005576", "0004806", "0016042"]}
+{"protein_id": "Q4X066", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "K5VN09", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "Q1K8B6", "GO_id": ["0005576", "0005507", "0004497", "0016705", "0019825", "0030245", "0000272"]}
+{"protein_id": "A0A1Y9G8H0", "GO_id": ["0005576", "0020037", "0046872", "0004601", "0006979", "0042311"]}
+{"protein_id": "Q9Y795", "GO_id": ["0030428", "0005886", "0004100", "0006031"]}
+{"protein_id": "Q6E7K8", "GO_id": ["0016020", "0016717", "0006631"]}
+{"protein_id": "Q4WP32", "GO_id": ["0005576", "0004497", "0030245"]}
+{"protein_id": "Q7SHI8", "GO_id": ["0005576", "0030248", "0046872", "0004497", "0030245"]}
+{"protein_id": "Q5B6H0", "GO_id": ["0005576", "0016798", "0046872", "0030245"]}
+{"protein_id": "Q9HGY3", "GO_id": ["0005782", "0003884", "0071949", "0019478", "0019740"]}
+{"protein_id": "Q6FSR7", "GO_id": ["0010008", "0005576", "0000329", "0005794", "0071561", "0005777", "0000407", "0034271", "0034272", "0016303", "0005524", "0004672", "0032120", "0000045", "0051365", "0006897", "0006879", "0034755", "0000425", "0048015", "0048227", "0032968", "0072665"]}
+{"protein_id": "I0JWN7", "GO_id": ["0016020", "0042597", "0052933", "0052934", "0005509", "0015945"]}
+{"protein_id": "A0A0E0RQ52", "GO_id": ["0030428", "0005886", "0004100", "0006031"]}
+{"protein_id": "Q7CPF5", "GO_id": ["0005829", "0005524", "0046872", "0004747", "0006014"]}
+{"protein_id": "I1RDA9", "GO_id": ["0005576", "0016787", "0016042"]}
+{"protein_id": "C8V530", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "A0A1D6NER6", "GO_id": ["0008170", "0000234", "0032259", "0006656"]}
+{"protein_id": "Q8CBC7", "GO_id": ["0005829", "0005634", "1904047", "0106340", "0016423", "0002181", "0022008", "0002130"]}
+{"protein_id": "G2WYP9", "GO_id": ["0030428", "0005935", "0045009", "0000131", "0005886", "0005628", "0004100", "0030476", "0006031", "0097271"]}
+{"protein_id": "Q6K461", "GO_id": ["0005737", "0005634", "0052845", "0016791"]}
+{"protein_id": "Q9HTF4", "GO_id": ["0051537", "0005506", "0004497", "0031457"]}
+{"protein_id": "Q4J834", "GO_id": ["0003677", "0004519", "0046872", "0006310", "0006281"]}
+{"protein_id": "G2QNT0", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "Q5BEI9", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "A0A1D5NVS8", "GO_id": ["0000785", "0072487", "0005634", "0141054", "0140585", "0061630", "0008270", "0006974", "0040029", "0045893", "0006513", "0016567"]}
+{"protein_id": "Q8IBT4", "GO_id": ["0020011", "0031071", "0006534"]}
+{"protein_id": "A4D0H5", "GO_id": ["0004497", "0000166", "0017000"]}
+{"protein_id": "Q50LE9", "GO_id": ["0005737", "0008115", "1901053"]}
+{"protein_id": "A2QTJ1", "GO_id": ["0030428", "0005935", "0045009", "0000131", "0005886", "0005628", "0004100", "0030476", "0006031", "0097271"]}
+{"protein_id": "A0A7J6EK66", "GO_id": ["0009507", "0008661", "0046872", "0052865", "0015995", "0019682", "0016114", "0009228"]}
+{"protein_id": "A0A0K0VEZ4", "GO_id": ["0005576", "0004806", "0016042"]}
+{"protein_id": "A0A384K4U6", "GO_id": ["0005576", "0030248", "0046872", "0004497", "0030245"]}
+{"protein_id": "G2QAB5", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "Q9W2R3", "GO_id": ["0016324", "0005938", "0005694", "0035323", "0031514", "0005634", "0005886", "0016308", "0003383", "0071711", "0030866", "0046843", "0007032", "0001837", "0071963", "0030707", "0007444", "0007310", "0008103", "0048477", "0046854", "0008104", "1903689", "0007286", "0007283"]}
+{"protein_id": "F8D9F4", "GO_id": ["0046872", "0016791", "0044283"]}
+{"protein_id": "A0A0C1E1D0", "GO_id": ["0005829", "0004069", "0004838", "0030170", "0006532"]}
+{"protein_id": "A5A8G0", "GO_id": ["0009507", "0009905", "0034281", "0009899", "0034280", "0000287", "0010333", "0016102", "0033332"]}
+{"protein_id": "A0A2H3EDS0", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "A0A089LI24", "GO_id": ["0030246", "0016757", "0005975"]}
+{"protein_id": "Q50225", "GO_id": ["0044569", "0009375", "0005886", "0051538", "0051539", "0009055", "0008901", "0046872", "0051911", "0015948"]}
+{"protein_id": "G5EJN7", "GO_id": ["0005829", "0070330", "0050660", "0010181", "0020037", "0005506", "0003958"]}
+{"protein_id": "G0R6T8", "GO_id": ["0005576", "0030248", "0046872", "0004497", "0030245"]}
+{"protein_id": "W4KMP1", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "Q5B8T4", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "D6Y851", "GO_id": ["0005829", "0005524", "0005525", "0016853", "0046872", "0004518"]}
+{"protein_id": "O18333", "GO_id": ["0120281", "0000421", "0031410", "0070971", "0005794", "0000139", "0043025", "0043204", "0005886", "0098975", "0048786", "0061175", "0031982", "0005525", "0003924", "0000287", "0061909", "0006888", "0008057", "0009306", "0032482", "0106104", "0099175", "0160156", "0046718", "0033292", "0016192"]}
+{"protein_id": "Q39QF7", "GO_id": ["0008775", "0006083"]}
+{"protein_id": "F8JK18", "GO_id": ["0046872", "0010333"]}
+{"protein_id": "Q5BAP2", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "B2AVF1", "GO_id": ["0005576", "0030248", "0046872", "0004497", "0030245"]}
+{"protein_id": "E1AQY3", "GO_id": ["0004015", "0030170", "0009102", "0009448", "0009699"]}
+{"protein_id": "Q9V3I2", "GO_id": ["0043679", "0005938", "0005829", "0005769", "0061645", "0030139", "0012505", "0032585", "0043025", "0043204", "0005886", "0098830", "0045202", "0008021", "0043195", "0031982", "0030234", "0005525", "0003924", "0048675", "0007409", "0007298", "0007349", "0061883", "0048813", "0046664", "0006897", "0016197", "0034058", "0007032", "0032509", "0035088", "0006887", "0048803", "0006886", "0036258", "0007399", "0006836", "0007220", "0048477", "0006909", "0048015", "0048227", "0032482", "0006898", "0032956", "0050803", "1903186", "0046718", "0048488", "0016189", "0016050", "0016192", "0035220"]}
+{"protein_id": "A0A1S4CGX4", "GO_id": ["0005737", "0005524", "0004708", "0004674", "0006952", "0051707"]}
+{"protein_id": "J1H0Z7", "GO_id": ["0016740"]}
+{"protein_id": "Q87NI7", "GO_id": ["0005886", "0071111"]}
+{"protein_id": "Q9WYJ1", "GO_id": ["0003941", "0030170", "0004794", "0009097", "0006565", "0006567"]}
+{"protein_id": "Q7VNA4", "GO_id": ["0005829", "0008713", "0009244"]}
+{"protein_id": "A0A1D8PHA3", "GO_id": ["0005743", "0005886", "0045275", "0009055", "0020037", "0046872", "0008121", "0006122"]}
+{"protein_id": "A0A0J9XK58", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "Q9X7P6", "GO_id": ["0005737", "0005576", "0009274", "0003884", "0071949", "0019478"]}
+{"protein_id": "A0A0H4K9X4", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "G2RB73", "GO_id": ["0005576", "0030248", "0046872", "0004497", "0030245"]}
+{"protein_id": "A8CEI3", "GO_id": ["0009507", "0047804", "0030170", "0019346"]}
+{"protein_id": "C5B120", "GO_id": ["0016020", "0030288", "0005509", "0016614", "0015945"]}
+{"protein_id": "B1KN81", "GO_id": ["0042597", "0016829"]}
+{"protein_id": "A0A1C9CXI1", "GO_id": ["0005576", "0030248", "0046872", "0004497", "0030245"]}
+{"protein_id": "A0A384JXL6", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "A5U1J9", "GO_id": ["0005737", "0017136", "0070403", "0036054", "0036055", "0008270"]}
+{"protein_id": "A0A4Y5L9K3", "GO_id": ["0000287", "0010333", "0016102"]}
+{"protein_id": "A0A1V0QSF8", "GO_id": ["0009507", "0000287", "0010333", "0016102"]}
+{"protein_id": "A0A2I1D2M7", "GO_id": ["0003962", "0016853", "0030170", "0019346"]}
+{"protein_id": "A9CEY6", "GO_id": ["0005829", "0016491"]}
+{"protein_id": "H3ZR39", "GO_id": ["0042802", "0016853", "0030170", "0008483"]}
+{"protein_id": "Q7S439", "GO_id": ["0005576", "0030248", "0046872", "0004497", "0030245"]}
+{"protein_id": "Q5B467", "GO_id": ["0005789", "0008168", "0032259"]}
+{"protein_id": "A0A193CHJ5", "GO_id": ["0005576", "0005509", "0047498", "0099106", "0005543", "0090729", "0050482", "0016042", "0042130", "0006644"]}
+{"protein_id": "A0A1V0QSF6", "GO_id": ["0000287", "0010333", "0016102"]}
+{"protein_id": "Q9X4C9", "GO_id": ["0005829", "1990077", "0005524", "0016887", "0003677", "0003678", "0042802", "0006269"]}
+{"protein_id": "A0A3G2S5J6", "GO_id": ["0005576", "0120516", "0047372", "0004806", "0016042"]}
+{"protein_id": "A0A4P8PKE4", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "A0A5J6BJT0", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "S5RR11", "GO_id": ["0051537", "0005506", "0016491"]}
+{"protein_id": "A0A7W3RDA3", "GO_id": ["0005737", "0008736", "0006004"]}
+{"protein_id": "E2JKI3", "GO_id": ["0003677", "0000287", "0009036", "0009307"]}
+{"protein_id": "Q8DPL8", "GO_id": ["0016020", "0016787", "0000156", "0000155", "0030295", "0019901", "0007234", "0006355", "0007165"]}
+{"protein_id": "Q4DUK4", "GO_id": ["0005789", "0009922", "0034625", "0034626", "0019367", "0030148", "0042761"]}
+{"protein_id": "A0A478ECY3", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "A0A5J6BJP2", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "Q93TM1", "GO_id": ["0042597", "0050053", "0009758"]}
+{"protein_id": "A0A499UB99", "GO_id": ["0005782", "0003884", "0071949", "0019478", "0046416", "0019740"]}
+{"protein_id": "F8D4I6", "GO_id": ["0005524", "0016301", "0006796"]}
+{"protein_id": "E5Y8P9", "GO_id": ["0016620"]}
+{"protein_id": "A0A1V0QSF9", "GO_id": ["0009507", "0000287", "0010333", "0016102"]}
+{"protein_id": "A0A0J9X285", "GO_id": ["0005829", "0005524", "0008902", "0008972", "0009228", "0009229"]}
+{"protein_id": "A0A1C9CXI0", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "Q1K4Q1", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "Q9W413", "GO_id": ["0030678", "0005739", "0046872", "0004526", "0097745", "0090646", "0001682"]}
+{"protein_id": "Q50250", "GO_id": ["0005886", "0046872", "0016491", "0015948", "0022904"]}
+{"protein_id": "A0A1W7HCY1", "GO_id": ["0005737", "0051537", "0140618", "0052851", "0046872", "0010106"]}
+{"protein_id": "A0A384JLD1", "GO_id": ["0005576", "0046872", "0004497", "0030245"]}
+{"protein_id": "G2QZK6", "GO_id": ["0005576", "0030248", "0046872", "0004497", "0030245"]}
+{"protein_id": "C8YTM5", "GO_id": ["0008170", "0000234", "0032259", "0006656"]}
+{"protein_id": "A5U3A6", "GO_id": ["0005886", "0005524", "0004674", "0080090"]}

example/test_nt_seqs.fasta ADDED Viewed

	@@ -0,0 +1,94 @@

+>Bob
+atgaaatataaacgcattgtgtttaaagtgggcaccagcagcctgaccaacgaagatggc
+agcctgagccgcagcaaagtgaaagatattacccagcagctggcgatgctgcatgaagcg
+ggccatgaactgattctggtgagcagcggcgcgattgcggcgggctttggcgcgctgggc
+tttaaaaaacgcccgaccaaaattgcggataaacaggcgagcgcggcggtgggccagggc
+ctgctgctggaagaatataccaccaacctgctgctgcgccagattgtgagcgcgcagatt
+ctgctgacccaggatgattttgtggataaacgccgctataaaaacgcgcatcaggcgctg
+agcgtgctgctgaaccgcggcgcgattccgattattaacgaaaacgatagcgtggtgatt
+gatgaactgaaagtgggcgataacgataccctgagcgcgcaggtggcggcgatggtgcag
+gcggatctgctggtgtttctgaccgatgtggatggcctgtataccggcaacccgaacagc
+gatccgcgcgcgaaacgcctggaacgcattgaaaccattaaccgcgaaattattgatatg
+gcgggcggcgcgggcagcagcaacggcaccggcggcatgctgaccaaaattaaagcggcg
+accattgcgaccgaaagcggcgtgccggtgtatatttgcagcagcctgaaaagcgatagc
+atgattgaagcggcggaagaaaccgaagatggcagctattttgtggcgcaggaaaaaggc
+ctgcgcacccagaaacagtggctggcgttttatgcgcagagccagggcagcatttgggtg
+gataaaggcgcggcggaagcgctgagccagtatggcaaaagcctgctgctgagcggcatt
+gtggaagcggaaggcgtgtttagctatggcgatattgtgaccgtgtttgataaagaaagc
+ggcaaaagcctgggcaaaggccgcgtgcagtttggcgcgagcgcgctggaagatatgctg
+cgcagccagaaagcgaaaggcgtgctgatttatcgcgatgattggattagcattaccccg
+gaaattcagctgctgtttaccgaattt
+>Henry
+atggaagtgaaaggcaaaaaaaaactgaccggcaaaggcaccaaaatgagccaggaaaaa
+agcaaatttcataaaaacaacgatagcggcagcagcaaaacctttccgaaaaaagtggtg
+aaagaaggcggcccgaaaattaccagcaaaaactttgaaaaaaccgcgaccaaaccgggc
+aaaaaaggcgtgaaacagtttaaaaacaaacagcagggcgatcgcattccgaaaaacaaa
+tttcagcaggcgaacaaatttaaccagaaacgcaaatttcagccggatagcaaaagcgat
+gaaagcgcggcgaaaaaaccgaaatgggatgaatttaaaaaaaaaaaaaaagaactgaaa
+cagagccgccagctgagcgataaaaccaactatgatattgtgattcgcgcgaaacagatt
+tgggaaattctgcgccgcaaagattgcgataaagaaaaacgcgtgaaactgatgagcgat
+ctgcagaaactgattcagggcaaaattaaaaccattgcgtttgcgcatgatagcacccgc
+gtgattcagtgctatattcagtttggcaacgaagaacagcgcaaacaggcgtttgaagaa
+ctgcgcggcgatctggtggaactgagcaaagcgaaatatagccgcaacattgtgaaaaaa
+tttctgatgtatggcagcaaagcgcagattgcggaaattattcgcagctttaaaggccat
+gtgcgcaaactgctgcgccatgcggaagcgagcgcgattgtggaatatgcgtataacgat
+aaagcgattctggaacagcgcaacatgctgaccgaagaactgtatggcaacacctttcag
+ctgtataaaagcgcggatcatccgaccctggataaagtgctggaagtgcagccggaaaaa
+ctggaactgattatggatgaaatgaaacagattctgaccccgatggcgcagaaagaagcg
+gtgattaaacatagcctggtgcataaagtgtttctggatttttttacctatgcgccgccg
+aaactgcgcagcgaaatgattgaagcgattcgcgaagcggtggtgtatctggcgcatacc
+catgatggcgcgcgcgtggcgatgtattgcctgtggcatggcaccccgaaagatcgcaaa
+gtgattgtgaaaaccatgaaaacctatattgaaaaagtggcgaacggccagtatagccat
+ctggtgctgctggcggcgtttgattgcattgatgataccaaactggtgaaacagattatt
+attagcgaaattattaacagcctgccgaacattgtgaacgataaatatggccgcaaagtg
+ctgctgtatctgctgagcccgcgcgatccggcgcataccgtgcgcgaaattattgaagtg
+ctgcagaaaggcgatggcaacgcgcatagcaaaaaagataccgaaattcgccgccgcgaa
+ctgctggaaagcattagcccggcgctgctgagctatctgcagggccatgcgcaggaagtg
+gtgctggataaaagcgcgtgcgtgctggtggcggatattctgggcaccgcgaccggcgat
+gtgcagccggcgatggatgcggtggcgagcctggcggcggcggaactgcatccgggcggc
+aaagatggcgaactgcatattgcggaacatccggcgggccatctggtgctgaaatggctg
+attgaacaggataaaaaaatgaaagaacgcggccgcgaaggctgctttgcgaaaaccctg
+attgaacgcgtgggcgtgaaaaacctgaaaagctgggcgagcgtgaaccgcggcgcgatt
+attctgagcagcctgctgcagagcagcgatcaggaagtggcgaacaaagtgaaagcgggc
+ctgaaaagcctgattccggcgctggaaaaaagcaaaaacaccagcaaaggcattgaaatg
+ctgctggaaaaactgaccgcg
+>Wilf
+atggcggcggaagaaggcgtggtgattgcgtgccataacaaagatgaatttgatgcgcag
+atgaccaaagcgaaagaagcgggcaaagtggtgattattgattttaccgcgagctggtgc
+ggcccgtgccgctttattgcgccggtgtttgcggaatatgcgaaaaaatttccgggcgcg
+gtgtttctgaaagtggatgtggatgaactgaaagaagtggcggaaaaatataacgtggaa
+gcgatgccgacctttctgtttattaaagatggcgcggaagcggataaagtggtgggcgcg
+cgcaaagatgatctgcagaacaccattgtgaaacatgtgggcgcgaccgcggcgagcgcg
+agcgcg
+>reverse translation of P22298
+ggccgcggccugcugccguuugugcugcuggcgcugggcauugcgccgugggcgguggaa
+ggcgcggaaaacgcgcugaaaggcggcgcgugcccgccgcgcaaaauugugcagugccug
+cgcuaugaaaaaccgaaaugcaccagcgauuggcagugcccggauaaaaaaaaaugcugc
+cgcgauaccugcgcgauuaaaugccugaacccgguggcgauuaccaacccggugaaagug
+aaaccgggcaaaugcccggugguguauggccagugcaugaugcugaacccgccgaaccau
+ugcaaaaccgauagccagugccugggcgaucugaaaugcugcaaaagcaugugcggcaaa
+gugugccugaccccggugaaagcg
+>ENA|AACH01000027|AACH01000027.2 Saccharomyces mikatae IFO 1815 YM4906-Contig2858, whole genome shotgun sequence.
+ctggtgctgctggcggcgtttgattgcattgatgataccaaactggtgaaacagattatt
+attagcgaaattattaacagcctgccgaacattgtgaacgataaatatggccgcaaagtg
+ctgctgtatctgctgagcccgcgcgatccggcgcataccgtgcgcgaaattattgaagtg
+ctgcagaaaggcgatggcaacgcgcatagcaaaaaagataccgaaattcgccgccgcgaa
+atgaaatataaacgcattgtgtttaaagtgggcaccagcagcctgaccaacgaagatggc
+agcctgagccgcagcaaagtgaaagatattacccagcagctggcgatgctgcatgaagcg
+ggccatgaactgattctggtgagcagcggcgcgattgcggcgggctttggcgcgctgggc
+tttaaaaaacgcccgaccaaaattgcggataaacaggcgagcgcggcggtgggccagggc
+ctgctgctggaagaatataccaccaacctgctgctgcgccagattgtgagcgcgcagatt
+ctgctgacccaggatgattttgtggataaacgccgctataaaaacgcgcatcaggcgctg
+agcgtgctgctgaaccgcggcgcgattccgattattaacgaaaacgatagcgtggtgatt
+gatgaactgaaagtgggcgataacgataccctgagcgcgcaggtggcggcgatggtgcag
+gcggatctgctggtgtttctgaccgatgtggatggcctgtataccggcaacccgaacagc
+gatccgcgcgcgaaacgcctggaacgcattgaaaccattaaccgcgaaattattgatatg
+gcgggcggcgcgggcagcagcaacggcaccggcggcatgctgaccaaaattaaagcggcg
+accattgcgaccgaaagcggcgtgccggtgtatatttgcagcagcctgaaaagcgatagc
+atgattgaagcggcggaagaaaccgaagatggcagctattttgtggcgcaggaaaaaggc
+ctgcgcacccagaaacagtggctggcgttttatgcgcagagccagggcagcatttgggtg
+gataaaggcgcggcggaagcgctgagccagtatggcaaaagcctgctgctgagcggcatt
+gtggaagcggaaggcgtgtttagctatggcgatattgtgaccgtgtttgataaagaaagc
+ggcaaaagcctgggcaaaggccgcgtgcagtttggcgcgagcgcgctggaagatatgctg
+cgcagccagaaagcgaaaggcgtgctgatttatcgcgatgattggattagcattaccccg
+gaaattcagctgctgtttaccgaattt

example/test_proteins.fasta ADDED Viewed

	@@ -0,0 +1,4 @@

+>A0A0H2ZM56
+MADKKTVTPEEKKLVAEKHVDELVQKALVALEEMRKLNQEQVDYIVAKASVAALDAHGELALHAFEETGRGVFEDKATKNLFACEHVVNNMRHTKTVGVIEEDDVTGLTLIAEPVGVVCGITPTTNPTSTAIFKSLISLKTRNPIVFAFHPSAQESSAHAARIVRDAAIAAGAPENCVQWITQPSMEATSALMNHEGVATILATGGNAMVKAAYSCGKPALGVGAGNVPAYVEKSANIRQAAHDIVMSKSFDNGMVCASEQAVIIDKEIYDEFVAEFKSYHTYFVNKKEKALLEEFCFGVKANSKNCAGAKLNADIVGKPATWIAEQAGFTVPEGTNILAAECKEVGENEPLTREKLSPVIAVLKSESREDGITKARQMVEFNGLGHSAAIHTADEELTKEFGKAVKAIRVICNSPSTFGGIGDVYNAFLPSLTLGCGSYGRNSVGDNVSAINLLNIKKVGRRRNNMQWMKLPSKTYFERDSIQYLQKCRDVERVMIVTDHAMVELGFLDRIIEQLDLRRNKVVYQIFADVEPDPDITTVNRGTEIMRAFKPDTIIALGGGSPMDAAKVMWLFYEQPEVDFRDLVQKFMDIRKRAFKFPLLGKKTKFIAIPTTSGTGSEVTPFAVISDKANNRKYPIADYSLTPTVAIVDPALVLTVPGFVAADTGMDVLTHATEAYVSQMASDYTDGLALQAIKLVFENLESSVKNADFHSREKMHNASTIAGMAFANAFLGISHSMAHKIGAQFHTIHGRTNAILLPYVIRYNGTRPAKTATWPKYNYYRADEKYQDIARMLGLPASTPEEGVESYAKAVYELGERIGIQMNFRDQGIDEKEWKEHSRELAFLAYEDQCSPANPRLPMVDHMQEIIEDAYYGYKERPGRRK
+>P05067
+MLPGLALLLLAAWTARALEVPTDGNAGLLAEPQIAMFCGRLNMHMNVQNGKWDSDPSGTKTCIDTKEGILQYCQEVYPELQITNVVEANQPVTIQNWCKRGRKQCKTHPHFVIPYRCLVGEFVSDALLVPDKCKFLHQERMDVCETHLHWHTVAKETCSEKSTNLHDYGMLLPCGIDKFRGVEFVCCPLAEESDNVDSADAEEDDSDVWWGGADTDYADGSEDKVVEVAEEEEVAEVEEEEADDDEDDEDGDEVEEEAEEPYEEATERTTSIATTTTTTTESVEEVVREVCSEQAETGPCRAMISRWYFDVTEGKCAPFFYGGCGGNRNNFDTEEYCMAVCGSAMSQSLLKTTQEPLARDPVKLPTTAASTPDAVDKYLETPGDENEHAHFQKAKERLEAKHRERMSQVMREWEEAERQAKNLPKADKKAVIQHFQEKVESLEQEAANERQQLVETHMARVEAMLNDRRRLALENYITALQAVPPRPRHVFNMLKKYVRAEQKDRQHTLKHFEHVRMVDPKKAAQIRSQVMTHLRVIYERMNQSLSLLYNVPAVAEEIQDEVDELLQKEQNYSDDVLANMISEPRISYGNDALMPSLTETKTTVELLPVNGEFSLDDLQPWHSFGADSVPANTENEVEPVDARPAADRGLTTRPGSGLTNIKTEEISEVKMDAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIATVIVITLVMLKKKQYTSIHHGVVEVDAAVTPEERHLSKMQQNGYENPTYKFFEQMQN

go_integration_pipeline.py ADDED Viewed

	@@ -0,0 +1,374 @@

+import json
+import os
+import sys
+import argparse
+from typing import Dict, List, Tuple, Optional
+from collections import defaultdict
+import torch
+from tqdm import tqdm
+# 添加路径
+root_path = os.path.dirname((os.path.abspath(__file__)))
+sys.path.append(root_path)
+sys.path.append(os.path.join(root_path, "Models/ProTrek"))
+from utils.protein_go_analysis import get_go_definition
class GOIntegrationPipeline:
    """Integrate GO annotations from InterProScan and BLAST, optionally scored by ProTrek.

    The first level merges the GO IDs reported by InterProScan with the GO IDs
    of selected BLAST hits (looked up in ``processed_data/protein_go.json``).
    The optional second level scores every candidate GO term against the
    protein sequence with a ProTrek model and keeps the terms whose score
    reaches ``protrek_threshold``.
    """

    def __init__(self,
                 identity_threshold: int = 80,
                 coverage_threshold: int = 80,
                 evalue_threshold: float = 1e-50,
                 topk: int = 2,
                 protrek_threshold: Optional[float] = None,
                 use_protrek: bool = False):
        """
        Configure the pipeline and load the protein -> GO mapping.

        Args:
            identity_threshold: BLAST identity threshold (0-100).
            coverage_threshold: BLAST coverage threshold (0-100).
            evalue_threshold: BLAST E-value threshold.
            topk: When > 0, take GO IDs from the first ``topk`` BLAST hits and
                ignore the three thresholds above; when <= 0, use the thresholds.
            protrek_threshold: ProTrek score threshold for the second level.
            use_protrek: Whether to run the second-level ProTrek filtering.
        """
        self.identity_threshold = identity_threshold
        self.coverage_threshold = coverage_threshold
        self.evalue_threshold = evalue_threshold
        self.protrek_threshold = protrek_threshold
        self.use_protrek = use_protrek
        self.topk = topk
        # Load the protein -> GO mapping.
        self._load_protein_go_dict()
        # The ProTrek model is only initialised when the second level is requested.
        if self.use_protrek:
            self._init_protrek_model()

    def _init_protrek_model(self):
        """Initialise the ProTrek model and move it to GPU when one is available."""
        from model.ProTrek.protrek_trimodal_model import ProTrekTrimodalModel
        # NOTE(review): these weight paths are relative to the current working
        # directory, not to this file - confirm the expected launch directory.
        config = {
            "protein_config": "Models/ProTrek/weights/ProTrek_650M_UniRef50/esm2_t33_650M_UR50D",
            "text_config": "Models/ProTrek/weights/ProTrek_650M_UniRef50/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext",
            "structure_config": "Models/ProTrek/weights/ProTrek_650M_UniRef50/foldseek_t30_150M",
            "load_protein_pretrained": False,
            "load_text_pretrained": False,
            "from_checkpoint": "Models/ProTrek/weights/ProTrek_650M_UniRef50/ProTrek_650M_UniRef50.pt"
        }
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.protrek_model = ProTrekTrimodalModel(**config).to(self.device).eval()
        print(f"ProTrek模型已加载到设备: {self.device}")

    def _load_protein_go_dict(self):
        """Load the protein -> GO mapping from ``processed_data/protein_go.json``.

        The file is read as JSON lines, one ``{"protein_id": ..., "GO_id": [...]}``
        object per line. Blank lines are skipped. On any read or parse error the
        problem is printed and the mapping is left empty (best-effort loading).
        """
        self.protein_go_dict = {}
        try:
            with open('processed_data/protein_go.json', 'r', encoding='utf-8') as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        # A stray blank line must not discard the whole mapping.
                        continue
                    data = json.loads(line)
                    self.protein_go_dict[data['protein_id']] = data['GO_id']
            print(f"成功加载蛋白质-GO映射数据，共{len(self.protein_go_dict)}条记录")
        except (OSError, ValueError, KeyError, TypeError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"加载蛋白质-GO映射数据时发生错误: {str(e)}")
            self.protein_go_dict = {}

    def _get_go_from_uniprot_id(self, uniprot_id: str) -> List[str]:
        """
        Look up the GO IDs recorded for a UniProt ID.

        Args:
            uniprot_id: UniProt ID.
        Returns:
            The GO IDs stored for ``uniprot_id`` in the loaded mapping, each
            reduced to the part after its last underscore (unchanged when it
            has none); an empty list when the ID is unknown.
        """
        return [go_id.split("_")[-1]
                for go_id in self.protein_go_dict.get(uniprot_id, [])]

    def _passes_thresholds(self, result: Dict) -> bool:
        """Return True when a BLAST hit meets the identity, coverage and E-value thresholds."""
        identity = float(result.get('Identity%', 0))
        coverage = float(result.get('Coverage%', 0))
        evalue = float(result.get('E-value', 1.0))
        return (identity >= self.identity_threshold and
                coverage >= self.coverage_threshold and
                evalue <= self.evalue_threshold)

    def extract_blast_go_ids(self, blast_results: List[Dict], protein_id: str) -> List[str]:
        """
        Extract the GO IDs of the BLAST hits that qualify.

        Args:
            blast_results: List of BLAST hits.
            protein_id: ID of the query protein (its own hit is skipped).
        Returns:
            GO IDs of the selected hits, in hit order; may contain duplicates.
        """
        if self.topk > 0:
            # Top-k strategy.
            # NOTE(review): the slice is taken before the self-hit is skipped, so
            # fewer than `topk` neighbours are used when the query is among its
            # own top hits - confirm this is intended.
            candidates = blast_results[:self.topk]
        else:
            # Threshold strategy.
            candidates = [result for result in blast_results
                          if self._passes_thresholds(result)]
        go_ids = []
        for result in candidates:
            hit_id = result.get('ID', '')
            if hit_id == protein_id:
                continue
            go_ids.extend(self._get_go_from_uniprot_id(hit_id))
        return go_ids

    def first_level_filtering(self, interproscan_info: Dict, blast_info: Dict) -> Dict[str, List[str]]:
        """
        First level: merge InterProScan GO IDs with the qualifying BLAST GO IDs.

        Args:
            interproscan_info: InterProScan results keyed by protein ID.
            blast_info: BLAST results keyed by protein ID.
        Returns:
            Mapping from protein ID to its de-duplicated GO IDs, sorted so the
            output is reproducible between runs.
        """
        protein_go_dict = {}
        for protein_id in interproscan_info.keys():
            go_ids = set()
            # GO IDs from InterProScan, with any "GO:"-style prefix removed.
            interproscan_gos = interproscan_info[protein_id].get('interproscan_results', {}).get('go_id', [])
            go_ids.update(go_id.split(":")[-1] for go_id in interproscan_gos)
            # GO IDs from the qualifying BLAST hits.
            if protein_id in blast_info:
                blast_results = blast_info[protein_id].get('blast_results', [])
                go_ids.update(self.extract_blast_go_ids(blast_results, protein_id))
            # Set iteration order depends on string hashing; sort for determinism.
            protein_go_dict[protein_id] = sorted(go_ids)
        return protein_go_dict

    def calculate_protrek_scores(self, protein_sequences: Dict[str, str],
                                protein_go_dict: Dict[str, List[str]]) -> Dict[str, Dict]:
        """
        Score each candidate GO term against its protein sequence with ProTrek.

        Args:
            protein_sequences: Mapping from protein ID to sequence.
            protein_go_dict: Mapping from protein ID to candidate GO IDs.
        Returns:
            Mapping from protein ID to ``{"protein_id", "GO_id", "Clip_score"}``,
            where ``Clip_score`` maps each GO ID that has a definition to its
            score. Proteins without a sequence entry, without any GO definition,
            or whose scoring raised are omitted.
        """
        results = {}
        for protein_id, go_ids in tqdm(protein_go_dict.items(), desc="计算ProTrek分数"):
            if protein_id not in protein_sequences:
                continue
            protein_seq = protein_sequences[protein_id]
            go_scores = {}
            # Collect the GO definitions; terms without one cannot be scored.
            go_definitions = {}
            for go_id in go_ids:
                definition = get_go_definition(go_id)
                if definition:
                    go_definitions[go_id] = definition
            if not go_definitions:
                continue
            try:
                definitions = list(go_definitions.values())
                # Inference only: without no_grad the result may require grad,
                # in which case .numpy() raises and the protein would be dropped.
                with torch.no_grad():
                    seq_emb = self.protrek_model.get_protein_repr([protein_seq])
                    text_embs = self.protrek_model.get_text_repr(definitions)
                    scores = (seq_emb @ text_embs.T) / self.protrek_model.temperature
                scores = scores.detach().cpu().numpy().flatten()
                # Map the scores back to their GO IDs (same order as `definitions`).
                for i, go_id in enumerate(go_definitions.keys()):
                    go_scores[go_id] = float(scores[i])
            except Exception as e:
                # Best-effort per protein: report and move on to the next one.
                print(f"计算 {protein_id} 的ProTrek分数时出错: {str(e)}")
                continue
            results[protein_id] = {
                "protein_id": protein_id,
                "GO_id": go_ids,
                "Clip_score": go_scores
            }
        return results

    def second_level_filtering(self, protrek_results: Dict[str, Dict]) -> Dict[str, List[str]]:
        """
        Second level: keep the GO terms whose ProTrek score reaches the threshold.

        Args:
            protrek_results: Output of ``calculate_protrek_scores``.
        Returns:
            Mapping from protein ID to the retained GO IDs; proteins with no
            retained GO ID are omitted.
        """
        filtered_results = {}
        for protein_id, data in protrek_results.items():
            clip_scores = data.get('Clip_score', {})
            filtered_gos = [go_id for go_id, score in clip_scores.items()
                            if score >= self.protrek_threshold]
            if filtered_gos:
                filtered_results[protein_id] = filtered_gos
        return filtered_results

    def generate_filename(self, base_name: str, is_intermediate: bool = False) -> str:
        """Build an output file name that encodes the active parameters."""
        if self.topk > 0:
            # Top-k mode: only the top-k value is encoded.
            params = f"topk{self.topk}"
        else:
            # Threshold mode: encode the three BLAST thresholds.
            params = f"identity{self.identity_threshold}_coverage{self.coverage_threshold}_evalue{self.evalue_threshold:.0e}"
        if self.use_protrek and self.protrek_threshold is not None:
            params += f"_protrek{self.protrek_threshold}"
        if is_intermediate:
            return f"{base_name}_intermediate_{params}.json"
        else:
            return f"{base_name}_final_{params}.json"

    @staticmethod
    def _write_go_jsonl(path: str, protein_go_dict: Dict[str, List[str]]) -> None:
        """Write a protein -> GO mapping as JSON lines to ``path``."""
        with open(path, 'w', encoding='utf-8') as f:
            for protein_id, go_ids in protein_go_dict.items():
                result = {"protein_id": protein_id, "GO_id": go_ids}
                f.write(json.dumps(result) + '\n')

    def run(self, interproscan_info: Dict = None, blast_info: Dict = None,
            interproscan_file: str = None, blast_file: str = None,
            output_dir: str = "output"):
        """
        Run the GO integration pipeline.

        Args:
            interproscan_info: InterProScan results dictionary.
            blast_info: BLAST results dictionary.
            interproscan_file: Path to a JSON file with InterProScan results,
                read only when ``interproscan_info`` is None.
            blast_file: Path to a JSON file with BLAST results, read only when
                ``blast_info`` is None.
            output_dir: Output directory (created if missing).
        Returns:
            - the final file path when ``use_protrek`` is False;
            - ``(final_file, intermediate_file)`` when ProTrek is used and
              ``protrek_threshold`` is set;
            - the intermediate file path when ProTrek is used without a threshold.
        Raises:
            ValueError: If either input is missing or empty after loading.
        """
        # Load the inputs from file when they were not passed directly.
        if interproscan_info is None and interproscan_file:
            with open(interproscan_file, 'r', encoding='utf-8') as f:
                interproscan_info = json.load(f)
        if blast_info is None and blast_file:
            with open(blast_file, 'r', encoding='utf-8') as f:
                blast_info = json.load(f)
        if not interproscan_info or not blast_info:
            raise ValueError("必须提供interproscan_info和blast_info数据或文件路径")
        # Make sure the output directory exists.
        os.makedirs(output_dir, exist_ok=True)
        print("开始第一层筛选...")
        # First level.
        protein_go_dict = self.first_level_filtering(interproscan_info, blast_info)
        if not self.use_protrek:
            # No second level: the first-level result is the final result.
            output_file = os.path.join(output_dir, self.generate_filename("go_integration"))
            self._write_go_jsonl(output_file, protein_go_dict)
            print(f"第一层筛选完成，结果已保存到: {output_file}")
            return output_file
        print("开始第二层筛选...")
        # Sequences are taken from the InterProScan records.
        protein_sequences = {protein_id: data.get('sequence', '')
                             for protein_id, data in interproscan_info.items()}
        # Compute the ProTrek scores.
        protrek_results = self.calculate_protrek_scores(protein_sequences, protein_go_dict)
        # Save the intermediate (scored) results.
        intermediate_file = os.path.join(output_dir, self.generate_filename("go_integration", is_intermediate=True))
        with open(intermediate_file, 'w', encoding='utf-8') as f:
            for result in protrek_results.values():
                f.write(json.dumps(result) + '\n')
        print(f"ProTrek分数计算完成，中间结果已保存到: {intermediate_file}")
        # Second level, only when a threshold was given.
        if self.protrek_threshold is not None:
            final_results = self.second_level_filtering(protrek_results)
            final_file = os.path.join(output_dir, self.generate_filename("go_integration"))
            self._write_go_jsonl(final_file, final_results)
            print(f"第二层筛选完成，最终结果已保存到: {final_file}")
            return final_file, intermediate_file
        return intermediate_file
def main():
    """Command-line entry point: parse the options, build the pipeline and run it."""
    cli = argparse.ArgumentParser(description="GO信息整合管道")
    # One (flag, add_argument keywords) pair per option; the order is kept so
    # that the generated --help text lists the options as before.
    option_specs = [
        ("--interproscan_file", {"type": str, "default": "data/processed_data/interproscan_info.json", "help": "InterProScan结果文件路径"}),
        ("--blast_file", {"type": str, "default": "data/processed_data/blast_info.json", "help": "BLAST结果文件路径"}),
        ("--identity", {"type": int, "default": 80, "help": "BLAST identity阈值 (0-100)"}),
        ("--coverage", {"type": int, "default": 80, "help": "BLAST coverage阈值 (0-100)"}),
        ("--evalue", {"type": float, "default": 1e-50, "help": "BLAST E-value阈值"}),
        ("--topk", {"type": int, "default": 2, "help": "BLAST topk结果"}),
        ("--protrek_threshold", {"type": float, "help": "ProTrek分数阈值"}),
        ("--use_protrek", {"action": "store_true", "help": "是否使用第二层ProTrek筛选"}),
        ("--output_dir", {"type": str, "default": "data/processed_data/go_integration_results", "help": "输出目录"}),
    ]
    for flag, keywords in option_specs:
        cli.add_argument(flag, **keywords)
    opts = cli.parse_args()

    # Build the pipeline from the parsed thresholds and switches.
    go_pipeline = GOIntegrationPipeline(
        identity_threshold=opts.identity,
        coverage_threshold=opts.coverage,
        evalue_threshold=opts.evalue,
        topk=opts.topk,
        protrek_threshold=opts.protrek_threshold,
        use_protrek=opts.use_protrek,
    )
    # Run it on the result files named on the command line.
    go_pipeline.run(
        interproscan_file=opts.interproscan_file,
        blast_file=opts.blast_file,
        output_dir=opts.output_dir,
    )


if __name__ == "__main__":
    main()

integrated_pipeline.py ADDED Viewed

	@@ -0,0 +1,520 @@

+import os
+import json
+import sys
+import argparse
+from typing import Dict, List, Optional
+from pathlib import Path
+from tqdm import tqdm
+# 添加必要的路径
+root_path = os.path.dirname(os.path.abspath(__file__))
+print(root_path)
+sys.path.append(root_path)
+sys.path.append(os.path.join(root_path, "Models/ProTrek"))
+# 导入所需模块
+from interproscan import InterproScan
+from Bio.Blast.Applications import NcbiblastpCommandline
+from utils.utils import extract_interproscan_metrics, get_seqnid, extract_blast_metrics, rename_interproscan_keys
+from go_integration_pipeline import GOIntegrationPipeline
+from utils.generate_protein_prompt import generate_prompt, get_interpro_manager, get_lmdb_connection
+from utils.openai_access import call_chatgpt
+class IntegratedProteinPipeline:
+    def __init__(self,
+                 blast_database: str = "uniprot_swissprot",
+                 expect_value: float = 0.01,
+                 interproscan_path: str = "interproscan/interproscan-5.75-106.0/interproscan.sh",
+                 interproscan_libraries: List[str] = None,
+                 go_topk: int = 2,
+                 selected_info_types: List[str] = None,
+                 pfam_descriptions_path: str = None,
+                 go_info_path: str = None,
+                 interpro_data_path: str = None,
+                 lmdb_path: str = None,
+                 args: argparse.Namespace = None):
+        """
+        整合蛋白质分析管道
+        Args:
+            blast_database: BLAST数据库名称
+            expect_value: BLAST E-value阈值
+            interproscan_path: InterProScan程序路径
+            interproscan_libraries: InterProScan库列表
+            go_topk: GO整合的topk参数
+            selected_info_types: prompt生成时选择的信息类型
+            pfam_descriptions_path: Pfam描述文件路径
+            go_info_path: GO信息文件路径
+            interpro_data_path: InterPro数据文件路径
+            lmdb_path: LMDB数据库路径
+        """
+        self.blast_database = blast_database
+        self.expect_value = expect_value
+        self.interproscan_path = interproscan_path
+        self.interproscan_libraries = interproscan_libraries or [
+            "PFAM", "PIRSR", "PROSITE_PROFILES", "SUPERFAMILY", "PRINTS",
+            "PANTHER", "CDD", "GENE3D", "NCBIFAM", "SFLM", "MOBIDB_LITE",
+            "COILS", "PROSITE_PATTERNS", "FUNFAM", "SMART"
+        ]
+        self.go_topk = go_topk
+        self.selected_info_types = selected_info_types or ['motif', 'go']
+        # 文件路径配置
+        self.pfam_descriptions_path = pfam_descriptions_path
+        self.go_info_path = go_info_path
+        self.interpro_data_path = interpro_data_path
+        self.lmdb_path = lmdb_path
+        self.interproscan_info_path = args.interproscan_info_path
+        self.blast_info_path = args.blast_info_path
+        # 初始化GO整合管道
+        self.go_pipeline = GOIntegrationPipeline(topk=self.go_topk)
+        # 初始化InterPro管理器（如果需要）
+        self.interpro_manager = None
+        other_types = [t for t in self.selected_info_types if t not in ['motif', 'go']]
+        if other_types and self.interpro_data_path:
+            self.interpro_manager = get_interpro_manager(self.interpro_data_path, None)
+    def step1_run_blast_and_interproscan(self, input_fasta: str, temp_dir: str = "temp") -> tuple:
+        """
+        Step 1: run BLAST and InterProScan on the input sequences.
+        Args:
+            input_fasta: Path of the input FASTA file.
+            temp_dir: Directory for intermediate result files (created if missing).
+        Returns:
+            tuple: (interproscan_info, blast_info)
+        """
+        print("步骤1: 运行BLAST和InterProScan分析...")
+        # Create the temporary directory
+        os.makedirs(temp_dir, exist_ok=True)
+        # Load the sequences; presumably a mapping of sequence id -> sequence
+        # (it is indexed by id below) — verify against get_seqnid.
+        seq_dict = get_seqnid(input_fasta)
+        print(f"读取到 {len(seq_dict)} 个序列")
+        # Run BLAST
+        print("运行BLAST分析...")
+        blast_xml = os.path.join(temp_dir, "blast_results.xml")
+        blast_cmd = NcbiblastpCommandline(
+            query=input_fasta,
+            db=self.blast_database,
+            out=blast_xml,
+            outfmt=5,  # XML format
+            evalue=self.expect_value
+        )
+        blast_cmd()
+        # Extract the BLAST results and attach each query's sequence
+        blast_results = extract_blast_metrics(blast_xml)
+        blast_info = {}
+        for uid, info in blast_results.items():
+            blast_info[uid] = {"sequence": seq_dict[uid], "blast_results": info}
+        # Run InterProScan
+        print("运行InterProScan分析...")
+        interproscan_json = os.path.join(temp_dir, "interproscan_results.json")
+        interproscan = InterproScan(self.interproscan_path)
+        input_args = {
+            "fasta_file": input_fasta,
+            "goterms": True,
+            "pathways": True,
+            "save_dir": interproscan_json
+        }
+        # NOTE(review): the value returned by run() is discarded here, so a
+        # failed scan only surfaces when the result file is parsed below.
+        interproscan.run(**input_args)
+        # Extract the InterProScan results
+        interproscan_results = extract_interproscan_metrics(
+            interproscan_json,
+            librarys=self.interproscan_libraries
+        )
+        interproscan_info = {}
+        # The results are looked up by sequence string, not by id.
+        for id, seq in seq_dict.items():
+            info = interproscan_results[seq]
+            info = rename_interproscan_keys(info)
+            interproscan_info[id] = {"sequence": seq, "interproscan_results": info}
+        # Remove the intermediate files (skipped if an exception was raised above)
+        if os.path.exists(blast_xml):
+            os.remove(blast_xml)
+        if os.path.exists(interproscan_json):
+            os.remove(interproscan_json)
+        print(f"步骤1完成: 处理了 {len(interproscan_info)} 个蛋白质")
+        return interproscan_info, blast_info
+    def step2_integrate_go_information(self, interproscan_info: Dict, blast_info: Dict) -> Dict:
+        """
+        步骤2: 整合GO信息
+        Args:
+            interproscan_info: InterProScan结果
+            blast_info: BLAST结果
+        Returns:
+            Dict: 整合后的GO信息
+        """
+        print("步骤2: 整合GO信息...")
+        # 使用GO整合管道进行第一层筛选
+        protein_go_dict = self.go_pipeline.first_level_filtering(interproscan_info, blast_info)
+        print(f"步骤2完成: 为 {len(protein_go_dict)} 个蛋白质整合了GO信息")
+        return protein_go_dict
+    def step3_generate_prompts(self, interproscan_info: Dict, blast_info: Dict,
+                              protein_go_dict: Dict) -> Dict:
+        """
+        步骤3: 生成蛋白质prompt
+        Args:
+            interproscan_info: InterProScan结果
+            blast_info: BLAST结果
+            protein_go_dict: 整合的GO信息
+        Returns:
+            Dict: 蛋白质ID到prompt的映射（如果有lmdb则包含QA对）
+        """
+        print("步骤3: 生成蛋白质prompt...")
+        # 创建临时的GO整合文件格式（用于generate_prompt函数）
+        temp_go_data = {}
+        for protein_id, go_ids in protein_go_dict.items():
+            temp_go_data[protein_id] = go_ids
+        prompts_data = {}
+        if self.lmdb_path:
+            # 如果有lmdb路径，处理QA数据
+            from utils.generate_protein_prompt import get_qa_data
+            global_index = 0
+            for protein_id in tqdm(interproscan_info.keys(), desc="生成prompts"):
+                # 获取QA对
+                qa_pairs = get_qa_data(protein_id, self.lmdb_path)
+                for qa_pair in qa_pairs:
+                    question = qa_pair['question']
+                    ground_truth = qa_pair['ground_truth']
+                    # 生成prompt（需要修改generate_prompt函数以支持内存数据）
+                    prompt = self._generate_prompt_from_memory(
+                        protein_id, interproscan_info, temp_go_data, question
+                    )
+                    if prompt:
+                        prompts_data[global_index] = {
+                            "index": global_index,
+                            "protein_id": protein_id,
+                            "prompt": prompt,
+                            "question": question,
+                            "ground_truth": ground_truth
+                        }
+                        global_index += 1
+        else:
+            # 如果没有lmdb路径，按原来的方式处理
+            for protein_id in tqdm(interproscan_info.keys(), desc="生成prompts"):
+                prompt = self._generate_prompt_from_memory(
+                    protein_id, interproscan_info, temp_go_data
+                )
+                if prompt:
+                    prompts_data[protein_id] = prompt
+        print(f"步骤3完成: 生成了 {len(prompts_data)} 个prompt")
+        return prompts_data
+    def _generate_prompt_from_memory(self, protein_id: str, interproscan_info: Dict,
+                                   protein_go_dict: Dict, question: str = None) -> str:
+        """
+        Render the prompt for one protein from in-memory data, including GO
+        definitions and Pfam motif descriptions. If rendering raises, the
+        result of ``_generate_simple_prompt`` is returned instead.
+        Args:
+            protein_id: Key of the protein in ``interproscan_info`` / ``protein_go_dict``.
+            interproscan_info: InterProScan results per protein.
+            protein_go_dict: GO ids per protein.
+            question: Optional question passed to the template.
+        """
+        try:
+            from utils.protein_go_analysis import get_go_definition
+            from jinja2 import Template
+            from utils.generate_protein_prompt import get_prompt_template
+            # Collect the GO annotations for this protein
+            go_ids = protein_go_dict.get(protein_id, [])
+            go_annotations = []
+            all_related_definitions = {}
+            if go_ids:
+                for go_id in go_ids:
+                    # Normalize the GO id: keep only the part after the last ':'
+                    clean_go_id = go_id.split(":")[-1] if ":" in go_id else go_id
+                    go_annotations.append({"go_id": clean_go_id})
+                    # Look up the GO definition
+                    definition = get_go_definition(clean_go_id,self.go_info_path)
+                    if definition:
+                        all_related_definitions[clean_go_id] = definition
+            # Collect the motif information
+            motif_pfam = {}
+            if self.pfam_descriptions_path:
+                try:
+                    # Pull the Pfam entries out of the InterProScan results
+                    interproscan_results = interproscan_info[protein_id].get('interproscan_results', {})
+                    pfam_entries = interproscan_results.get('pfam_id', [])
+                    # Load the Pfam descriptions (the file is re-read on every call)
+                    with open(self.pfam_descriptions_path, 'r') as f:
+                        pfam_descriptions = json.load(f)
+                    # Build the motif_pfam dict: Pfam id -> description
+                    for entry in pfam_entries:
+                        for pfam_id, ipr_id in entry.items():
+                            if pfam_id and pfam_id in pfam_descriptions:
+                                motif_pfam[pfam_id] = pfam_descriptions[pfam_id]['description']
+                except Exception as e:
+                    # Motif lookup is best-effort: report and continue without motifs
+                    print(f"获取motif信息时出错: {str(e)}")
+            # Collect the InterPro descriptions for the additional info types
+            interpro_descriptions = {}
+            other_types = [t for t in self.selected_info_types if t not in ['motif', 'go']]
+            if other_types and self.interpro_manager:
+                interpro_descriptions = self.interpro_manager.get_description(protein_id, other_types)
+            # Assemble the template data
+            template_data = {
+                "protein_id": protein_id,
+                "selected_info_types": self.selected_info_types,
+                "go_data": {
+                    "status": "success" if go_annotations else "no_data",
+                    "go_annotations": go_annotations,
+                    "all_related_definitions": all_related_definitions
+                },
+                "motif_pfam": motif_pfam,
+                "interpro_descriptions": interpro_descriptions,
+                "question": question
+            }
+            # Render the prompt from the template
+            PROMPT_TEMPLATE = get_prompt_template(self.selected_info_types, self.lmdb_path)
+            template = Template(PROMPT_TEMPLATE)
+            return template.render(**template_data)
+        except Exception as e:
+            print(f"生成prompt时出错 (protein_id: {protein_id}): {str(e)}")
+            # On failure, fall back to the simplified prompt
+            return self._generate_simple_prompt(protein_id, interproscan_info, protein_go_dict, question)
+    def _generate_simple_prompt(self, protein_id: str, interproscan_info: Dict,
+                               protein_go_dict: Dict, question: str = None) -> str:
+        """
+        生成简化版本的prompt（作为备用）
+        """
+        # 获取蛋白质序列
+        sequence = interproscan_info[protein_id].get('sequence', '')
+        # 获取GO信息
+        go_ids = protein_go_dict.get(protein_id, [])
+        # 获取motif信息
+        interproscan_results = interproscan_info[protein_id].get('interproscan_results', {})
+        pfam_entries = interproscan_results.get('pfam_id', [])
+        # 简化的prompt生成逻辑
+        prompt_parts = []
+        if self.lmdb_path:
+            from utils.prompts import FUNCTION_PROMPT
+            prompt_parts.append(FUNCTION_PROMPT)
+        else:
+            from utils.prompts import ENZYME_PROMPT
+            prompt_parts.append(ENZYME_PROMPT)
+        prompt_parts.append("\ninput information:")
+        # 添加motif信息
+        if 'motif' in self.selected_info_types and pfam_entries:
+            prompt_parts.append("\nmotif:")
+            for entry in pfam_entries:
+                for key, value in entry.items():
+                    if value:
+                        prompt_parts.append(f"{value}: 无详细描述")
+        # 添加GO信息
+        if 'go' in self.selected_info_types and go_ids:
+            prompt_parts.append("\nGO:")
+            for i, go_id in enumerate(go_ids[:10], 1):
+                prompt_parts.append(f"▢ GO term{i}: {go_id}")
+                prompt_parts.append(f"• definition: 无详细定义")
+        if question:
+            prompt_parts.append(f"\nquestion: \n{question}")
+        return "\n".join(prompt_parts)
+    def step4_generate_llm_answers(self, prompts_data: Dict, save_dir: str) -> None:
+        """
+        步骤4: 生成LLM答案
+        Args:
+            prompts_data: prompt数据
+            save_dir: 保存目录
+        """
+        print("步骤4: 生成LLM答案...")
+        # 创建保存目录
+        os.makedirs(save_dir, exist_ok=True)
+        if self.lmdb_path:
+            # 如果有lmdb路径，处理QA数据
+            for index, qa_item in tqdm(prompts_data.items(), desc="生成LLM答案"):
+                try:
+                    protein_id = qa_item['protein_id']
+                    prompt = qa_item['prompt']
+                    question = qa_item['question']
+                    ground_truth = qa_item['ground_truth']
+                    # 调用LLM生成答案
+                    llm_response = call_chatgpt(prompt)
+                    # 构建结果数据
+                    result = {
+                        'protein_id': protein_id,
+                        'index': index,
+                        'question': question,
+                        'ground_truth': ground_truth,
+                        'llm_answer': llm_response
+                    }
+                    # 保存文件
+                    save_path = os.path.join(save_dir, f"{protein_id}_{index}.json")
+                    with open(save_path, 'w') as f:
+                        json.dump(result, f, indent=2, ensure_ascii=False)
+                except Exception as e:
+                    print(f"处理索引 {index} 时出错: {str(e)}")
+        else:
+            # 如果没有lmdb路径，按原来的方式处理
+            for protein_id, prompt in tqdm(prompts_data.items(), desc="生成LLM答案"):
+                try:
+                    # 调用LLM生成答案
+                    llm_response = call_chatgpt(prompt)
+                    # 构建结果数据
+                    result = {
+                        'protein_id': protein_id,
+                        'prompt': prompt,
+                        'llm_answer': llm_response
+                    }
+                    # 保存文件
+                    save_path = os.path.join(save_dir, f"{protein_id}.json")
+                    with open(save_path, 'w') as f:
+                        json.dump(result, f, indent=2, ensure_ascii=False)
+                except Exception as e:
+                    print(f"处理蛋白质 {protein_id} 时出错: {str(e)}")
+        print(f"步骤4完成: 结果已保存到 {save_dir}")
+    def run(self, input_fasta: str, output_dir: str, temp_dir: str = "temp"):
+        """
+        运行完整的工作流
+        Args:
+            input_fasta: 输入FASTA文件路径
+            output_dir: 输出目录
+            temp_dir: 临时文件目录
+        """
+        print(f"开始运行整合蛋白质分析管道...")
+        print(f"输入文件: {input_fasta}")
+        print(f"输出目录: {output_dir}")
+        # 创建输出目录
+        os.makedirs(output_dir, exist_ok=True)
+        try:
+            # 步骤1: 运行BLAST和InterProScan
+            if self.interproscan_info_path is None or self.blast_info_path is None:
+                interproscan_info, blast_info = self.step1_run_blast_and_interproscan(
+                    input_fasta, temp_dir
+                )
+            else:
+                interproscan_info = json.load(open(self.interproscan_info_path))
+                blast_info = json.load(open(self.blast_info_path))
+            # 步骤2: 整合GO信息
+            protein_go_dict = self.step2_integrate_go_information(
+                interproscan_info, blast_info
+            )
+            # 步骤3: 生成prompt
+            prompts_data = self.step3_generate_prompts(
+                interproscan_info, blast_info, protein_go_dict
+            )
+            print(prompts_data)
+            # 步骤4: 生成LLM答案
+            self.step4_generate_llm_answers(prompts_data, output_dir)
+            print("整合管道运行完成！")
+        except Exception as e:
+            print(f"管道运行出错: {str(e)}")
+            raise
+        finally:
+            # 清理临时目录
+            print(f"清理临时目录: {temp_dir}")
+            if os.path.exists(temp_dir):
+                import shutil
+                shutil.rmtree(temp_dir)
+def main():
+    parser = argparse.ArgumentParser(description="整合蛋白质分析管道")
+    parser.add_argument("--input_fasta", type=str, required=True, help="输入FASTA文件路径")
+    parser.add_argument("--output_dir", type=str, required=True, help="输出目录")
+    parser.add_argument("--temp_dir", type=str, default="temp", help="临时文件目录")
+    parser.add_argument('--interproscan_info_path', type=str, default=None, help="InterProScan结果文件路径")
+    parser.add_argument('--blast_info_path', type=str, default=None, help="BLAST结果文件路径")
+    # BLAST参数
+    parser.add_argument("--blast_database", type=str, default="uniprot_swissprot", help="BLAST数据库")
+    parser.add_argument("--expect_value", type=float, default=0.01, help="BLAST E-value阈值")
+    # InterProScan参数
+    parser.add_argument("--interproscan_path", type=str,
+                       default="interproscan/interproscan-5.75-106.0/interproscan.sh",
+                       help="InterProScan程序路径")
+    # GO整合参数
+    parser.add_argument("--go_topk", type=int, default=2, help="GO整合topk参数")
+    # Prompt生成参数
+    parser.add_argument("--selected_info_types", type=str, nargs='+',
+                       default=['motif', 'go'], help="选择的信息类型")
+    parser.add_argument("--pfam_descriptions_path", type=str, default='data/raw_data/all_pfam_descriptions.json', help="Pfam描述文件路径")
+    parser.add_argument("--go_info_path", type=str, default='data/raw_data/go.json', help="GO信息文件路径")
+    parser.add_argument("--interpro_data_path", type=str, default='data/raw_data/interpro_data.json', help="InterPro数据文件路径")
+    parser.add_argument("--lmdb_path", type=str, help="LMDB数据库路径")
+    args = parser.parse_args()
+    # 创建管道实例
+    pipeline = IntegratedProteinPipeline(
+        blast_database=args.blast_database,
+        expect_value=args.expect_value,
+        interproscan_path=args.interproscan_path,
+        go_topk=args.go_topk,
+        selected_info_types=args.selected_info_types,
+        pfam_descriptions_path=args.pfam_descriptions_path,
+        go_info_path=args.go_info_path,
+        interpro_data_path=args.interpro_data_path,
+        lmdb_path=args.lmdb_path,
+        args=args
+    )
+    # 运行管道
+    pipeline.run(args.input_fasta, args.output_dir, args.temp_dir)
+if __name__ == "__main__":
+    main()

interproscan.py ADDED Viewed

	@@ -0,0 +1,107 @@

+import os
+import datetime
+class InterproScan():
+    def __init__(self, bash_path):
+        self.bash_path = bash_path
+    def run(self, fasta_file, goterms, pathways, save_dir) -> dict:
+        start_time = datetime.datetime.now()
+        temp_dir = f"{os.path.dirname(save_dir)}/temp"
+        if not os.path.exists(temp_dir):
+            os.makedirs(temp_dir)
+        seqs = self.read_fasta_to_list(fasta_file)
+        seqtype = self.is_protein_sequence(seqs)
+        # Call the InterproScan
+        cmd = f"{self.bash_path} \
+                -i {fasta_file} -o {save_dir} -f JSON"
+        cmd += f" -T {temp_dir}"
+        if goterms:
+            cmd += " -goterms"
+        if pathways:
+            cmd += " -pa"
+        if seqtype:
+            cmd += f" -t p"
+        else:
+            cmd += f" -t n"
+        print(cmd)
+        try:
+            os.system(cmd)
+            end_time = datetime.datetime.now()
+            spend_time = (end_time - start_time).total_seconds()
+            if os.listdir(save_dir):
+                print(f"InterproScan successfully completed. Output saved to {save_dir[len(self.out_dir)+1:]}.")
+                return {"output_dir": save_dir[len(self.out_dir)+1:], "duration": spend_time}
+            else:
+                raise Exception("InterproScan encountered an error. Please check your inputs and options.")
+        except Exception as e:
+            return {"error": str(e)}
+    def is_protein_sequence(self, sequences):
+        sequence = "".join(sequences)
+        # ATCG AUCG
+        if len(set(sequence.upper())) > 6:
+            return True
+        else:
+            return False
+    def read_fasta_to_list(self, file_path):
+        sequences = []
+        current_header = None
+        current_seq = []
+        with open(file_path, 'r') as f:
+            for line in f:
+                line = line.strip()
+                if line.startswith(">"):
+                    if current_header is not None:
+                        sequences.append("".join(current_seq))
+                    current_header = line[1:]
+                    current_seq = []
+                else:
+                    current_seq.append(line)
+            if current_header is not None:
+                sequences.append("".join(current_seq))
+        return sequences
+if __name__ == '__main__':
+    # Manual smoke test: build a FASTA file from a pickled list of records and
+    # run InterProScan on it.
+    # Test
+    interproscan = InterproScan("interproscan/interproscan-5.75-106.0/interproscan.sh")
+    from utils.utils import get_protein_sequence_biopython, tofasta
+    import pickle
+    uids = []
+    seqs = []
+    # NOTE(review): hard-coded absolute path, and pickle.load must only be
+    # used on trusted files — unpickling can execute arbitrary code.
+    with open("/zhangjiawei/interproscan/example/difference_20241122_ec_dict_list.pkl", "rb") as f:
+        datas = pickle.load(f)
+    # Each record supplies a "uniprot_id" and a "sequence".
+    for data in datas:
+        uids.append(data["uniprot_id"])
+        seqs.append(data["sequence"])
+    fasta_file = "example/protein_go_clean.fasta"
+    # seqs = [get_protein_sequence_biopython(uid) for uid in uids]
+    tofasta(fasta_file, uids, seqs)
+    input_args = {
+        "fasta_file": fasta_file,
+        "goterms": True,
+        "pathways": True,
+        "save_dir": "output/interproscan"
+    }
+    interproscan.run(**input_args)

pipeline.py ADDED Viewed

	@@ -0,0 +1,78 @@

+from interproscan import InterproScan
+from Bio.Blast.Applications import NcbiblastpCommandline
+from utils.utils import extract_interproscan_metrics, get_seqnid, extract_blast_metrics, rename_interproscan_keys
+import os
+import json
+# Stand-alone script: run BLAST and InterProScan on one FASTA file and save
+# the processed results of each tool as JSON.
+# input fasta file
+input_fasta = "evolla_test/test_hq0704_da_w_plddt_mask_hard_idnseqs.fasta"
+#####################################################
+# run blast
+#####################################################
+# settings
+blast_database = "uniprot_swissprot"
+expect_value = 0.01
+blast_xml = "evolla_test/test_hq0704_da_w_plddt_mask_hard_blast.xml"
+# presumably a mapping of sequence id -> sequence (indexed by id below) — verify against get_seqnid
+seq_dict = get_seqnid(input_fasta)
+# make sure the directory for the BLAST output exists
+output_dir = os.path.dirname(blast_xml)
+if not os.path.exists(output_dir):
+    os.makedirs(output_dir)
+blast_cmd = NcbiblastpCommandline(
+    query=input_fasta,
+    db=blast_database,
+    out=blast_xml,
+    outfmt=5,  # XML format
+    evalue=expect_value
+)
+blast_cmd()  # run
+# attach each query's sequence to its extracted BLAST metrics
+blast_results = extract_blast_metrics(blast_xml)
+blast_info = {}
+for uid, info in blast_results.items():
+    blast_info[uid] = {"sequence": seq_dict[uid], "blast_results": info}
+# save blast results (next to the XML, with a .json extension)
+with open(blast_xml.replace(".xml", ".json"), "w") as f:
+    json.dump(blast_info, f, indent=4)
+#####################################################
+# run interproscan
+#####################################################
+# settings
+goterms = True
+pathways = True
+interproscan_json = "evolla_test/test_hq0704_da_w_plddt_mask_hard_interproscan.json"
+interproscan_path = "interproscan/interproscan-5.75-106.0/interproscan.sh"
+librarys = ["PFAM", "PIRSR", "PROSITE_PROFILES", "SUPERFAMILY", "PRINTS", "PANTHER", "CDD", "GENE3D", "NCBIFAM", "SFLM", "MOBIDB_LITE", "COILS", "PROSITE_PATTERNS", "FUNFAM", "SMART"]
+interproscan = InterproScan(interproscan_path)
+input_args = {
+        "fasta_file": input_fasta,
+        "goterms": goterms,
+        "pathways": pathways,
+        "save_dir": interproscan_json}
+interproscan.run(**input_args)  # run
+# output_name = input_fasta.split("/")[-1] + ".json"
+interproscan_results = extract_interproscan_metrics(interproscan_json,
+                                                    librarys=librarys)
+interproscan_info = {}
+# results are looked up by sequence string, then stored under the sequence id
+for id, seq in seq_dict.items():
+    info = interproscan_results[seq]
+    info = rename_interproscan_keys(info)
+    interproscan_info[id] = {"sequence":seq, "interproscan_results": info}
+# save interproscan results (this overwrites the raw InterProScan output written to the same path)
+with open(interproscan_json, "w") as f:
+    json.dump(interproscan_info, f, indent=4)

readme.md ADDED Viewed

	@@ -0,0 +1,120 @@

+# ProteinAgentQA: 智能蛋白质功能问答引擎
+`ProteinAgentQA` 是一个基于大型语言模型（LLM）的智能问答系统，旨在弥合海量蛋白质数据与研究人员自然语言提问之间的鸿沟。项目内置两种针对不同应用场景的模式，使用户能够以最高效、最可靠的方式探索或查询蛋白质的功能信息。
+-   **项目状态**: 研发阶段
+-   **核心技术**: 生物信息学分析 (BLAST, InterProScan), 深度学习 (ProTrek), 大型语言模型 (LLM)
+---
+## 核心理念
+传统的蛋白质功能研究依赖于研究人员手动操作复杂的生物信息学工具并解读其结果。`ProteinAgentQA` 将这一过程自动化和智能化，通过两种独特的模式服务于蛋白质研究的全周期：从全新蛋白的初步功能探索，到对已知明星蛋白的快速信息检索。
+## 两大核心模式
+用户可以根据研究对象的性质，选择进入最适合的问答模式。
+### 1. 未知探索模式 (Unknown Exploration Mode)
+**此模式专为功能未知、未经实验验证、或注释信息稀少的新型蛋白质设计。**
+当您面对一个全新的蛋白质序列，传统工具的注释结果可能充满噪音或过于笼统。本模式的目标是，通过一个先进的、经过严格验证的生物信息学流程，为该蛋白生成一个**高可信度的功能(GO)与结构域(Motif)集合**，并以此作为LLM回答问题的唯一事实依据。这确保了对未知蛋白功能推断的严谨性和可靠性。
+#### 核心工作流：
+1.  **输入**: 用户提供蛋白质的氨基酸序列。
+2.  **后端自动化分析**: 系统自动触发三步分析流程：
+    *   **第一步：候选功能集生成 (Candidate Generation)**
+        *   系统并行使用 **BLASTp** (对Swiss-Prot等高质量数据库) 和 **InterProScan** (整合Pfam, PROSITE等多个特征数据库) 对输入序列进行分析。
+        *   此阶段采用宽松策略，合并（Union）所有搜集到的GO Terms，旨在最大化**召回率(Recall)**，确保不遗漏任何潜在的功能线索。
+    *   **第二步：AI语义过滤 (AI-Powered Semantic Filtering)**
+        *   所有候选GO term都将通过 **ProTrek** 模型进行打分。ProTrek是一个蛋白质-文本相关性评估工具，它能直接计算蛋白质序列与其功能描述文本之间的语义相关性分数。
+        *   这一步引入了与序列比对和特征匹配完全**正交**(Orthogonal)的证据，能极其有效地过滤掉由BLAST带来的“同源但功能不相关”或由InterProScan带来的“过于笼统”的假阳性注释。
+    *   **第三步：数据驱动的阈值筛选 (Data-Driven Thresholding)**
+        *   系统会根据一个预设的ProTrek分数阈值，筛选出最终的“高可信度GO集”。
+        *   **该阈值并非主观设定**，而是通过对一个包含607个新入库酶蛋白的验证集进行严格测试确定的。我们通过绘制**Precision-Recall曲线**，选取了使**F1-Score达到峰值**的分数作为最佳阈值，实现了准确率(Precision)和召回率(Recall)的最佳平衡。
+3.  **LLM整合与问答**:
+    *   只有通过上述流程筛选出的高可信度GO Terms和Motifs，才会被作为上下文(Context)信息喂给大型语言模型。
+    *   用户的所有提问（如“这个蛋白可能的功能是什么？”、“它属于哪个蛋白家族？”、“它可能参与哪些生物学过程？”）都将由LLM**严格基于这份高质量的上下文**来回答，杜绝模型产生幻觉或进行无依据的猜测。
+### 2. 已知问答模式 (Known Q&A Mode)
+**此模式专为已有充分研究和可靠注释的蛋白质（例如Swiss-Prot中的明星蛋白）设计，充当一个高效、精准的“蛋白质知识库私人助手”。**
+#### 核心工作流：
+1.  **输入**: 用户提供已知蛋白质的通用ID（如UniProt登录号 `P04637`，或条目名 `P53_HUMAN`）。
+2.  **后端直接查询**: 系统直接访问并解析UniProt/Swiss-Prot等权威数据库，获取该蛋白质的“**金标准 (Ground Truth)**”信息，包括但不限于：
+    *   官方功能注释
+    *   亚细胞定位
+    *   序列变体
+    *   翻译后修饰
+    *   相关通路等
+3.  **LLM整合与问答**:
+    *   系统将这些权威、详尽的资料作为上下文提供给LLM。
+    *   用户可以围绕该蛋白进行任何提问，获得精准、可靠的答案。例如：“人类P53蛋白的亚细胞定位在哪里？”、“它有哪些主要的翻译后修饰位点？”。
+## 系统架构示意图
+```mermaid
+graph TD
+    A[用户] --> B{ProteinAgentQA};
+    B --> C{模式选择};
+    C --> D[未知探索模式];
+    C --> E[已知问答模式];
+    D --> F[输入: 蛋白序列];
+    F --> G[后台分析流程];
+    G --> H["1. BLAST + InterProScan <br> (生成候选GO集)"];
+    H --> I[2. ProTrek语义打分];
+    I --> J[3. F1最优阈值筛选];
+    J --> K[最终高可信GO/Motif];
+    K --> L[LLM上下文];
+    E --> M[输入: 蛋白ID];
+    M --> N[查询Swiss-Prot等数据库];
+    N --> O[获取Ground Truth信息];
+    O --> L[LLM上下文];
+    L --> P[大型语言模型];
+    A --> Q[提问];
+    Q --> P;
+    P --> R[生成回答];
+    R --> A;
+    subgraph "后端核心处理"
+        G
+        H
+        I
+        J
+        N
+    end
+```
+## 使用示例
+### 未知探索模式
+> **用户输入**:
+> `> MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVIDB...`
+> **用户提问**:
+> `这个蛋白的功能是什么？它依赖于什么离子？`
+> **系统回答**:
+> `基于对其序列的分析，该蛋白极有可能是一种GTP结合蛋白。其预测的分子功能包括“GTP binding (GO:0005525)”和“GTPase activity (GO:0003924)”。分析显示其包含一个P-loop NTPase结构域，该结构域通常需要镁离子(Magnesium ion)作为辅因子。`
+### 已知问答模式
+> **用户输入**:
+> `P04637` (人类P53蛋白的UniProt ID)
+> **用户提问**:
+> `总结一下这个蛋白在细胞周期调控中的作用。`
+> **系统回答**:
+> `根据UniProt的记录，细胞肿瘤抗原p53 (TP53) 是一个核心的肿瘤抑制因子。在细胞周期中，它充当一个关键的检查点调控蛋白。当检测到DNA损伤时，p53会被激活并积累，通过转录激活CDKN1A/p21等下游基因，来阻止细胞从G1期进入S期，从而为DNA修复提供时间。如果损伤无法修复，p53可以诱导细胞凋亡。`

setup.sh ADDED Viewed

	@@ -0,0 +1,129 @@

+#!/bin/bash
+# Installs InterProScan, BLAST+ and a Swiss-Prot BLAST database into a
+# dedicated conda environment, then installs the Python dependencies.
+# Define InterProScan version
+IPS_VERSION="5.75-106.0"
+CONDA_ENV_NAME="rag_llm"
+IPS_DIR="interproscan-${IPS_VERSION}"
+IPS_TAR="interproscan-${IPS_VERSION}-64-bit.tar.gz"
+IPS_URL="https://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/${IPS_VERSION}/${IPS_TAR}"
+# Check if conda is available
+if ! command -v conda &> /dev/null; then
+    echo "Error: conda is not installed or not in PATH"
+    echo "Please install Miniconda or Anaconda first"
+    exit 1
+fi
+# Create conda environment with Java 11
+echo "Creating conda environment '${CONDA_ENV_NAME}' with OpenJDK 11..."
+if ! conda create -y -n "${CONDA_ENV_NAME}" openjdk=11 python; then
+    echo "Error: Failed to create conda environment"
+    exit 1
+fi
+# Activate conda environment (quoted so a base path with spaces still works)
+echo "Activating conda environment..."
+source "$(conda info --base)/etc/profile.d/conda.sh"
+if ! conda activate "${CONDA_ENV_NAME}"; then
+    echo "Error: Failed to activate conda environment '${CONDA_ENV_NAME}'"
+    exit 1
+fi
+# Create installation directory
+echo "Setting up InterProScan ${IPS_VERSION}..."
+mkdir -p interproscan
+cd interproscan || exit 1
+# Download InterProScan and checksum
+echo "Downloading InterProScan..."
+wget -nc "${IPS_URL}"
+wget -nc "${IPS_URL}.md5"
+# Verify MD5 checksum (also catches a failed or partial download)
+echo "Verifying download integrity..."
+if ! md5sum -c "${IPS_TAR}.md5"; then
+    echo "ERROR: MD5 checksum verification failed!"
+    echo "The downloaded file may be corrupted. Please try downloading again."
+    exit 1
+fi
+# Extract package
+echo "Extracting InterProScan..."
+if ! tar -xzf "${IPS_TAR}"; then
+    echo "Error: Failed to extract ${IPS_TAR}" >&2
+    exit 1
+fi
+# Verify Java installation in conda env
+echo "Checking Java environment in conda env..."
+JAVA_VER=$(java -version 2>&1 | head -n 1 | awk -F '"' '{print $2}')
+if [[ "$JAVA_VER" =~ ^11\. ]]; then
+    echo "Found compatible Java version in conda env: $JAVA_VER"
+else
+    echo "Error: Java version in conda env is not 11.x (found: $JAVA_VER)"
+    exit 1
+fi
+# Run setup
+echo "Running InterProScan setup..."
+cd "${IPS_DIR}" || exit 1
+if ! python setup.py -f interproscan.properties; then
+    echo "Error: InterProScan setup failed" >&2
+    exit 1
+fi
+echo ""
+echo "InterProScan installation completed in conda environment '${CONDA_ENV_NAME}'!"
+echo "To use InterProScan, first activate the conda environment:"
+echo "conda activate ${CONDA_ENV_NAME}"
+echo "Then add InterProScan to your PATH:"
+echo "export PATH=\$PATH:$(pwd)"
+echo "You may also need to set INTERPROSCAN_HOME=$(pwd)"
+cd ../
+# install biopython for blast
+echo "Installing Biopython for BLAST support..."
+pip install biopython
+echo "Biopython installation completed."
+# Install BLAST from bioconda
+echo "Installing BLAST from bioconda..."
+conda config --add channels bioconda
+conda config --add channels conda-forge
+conda install -c bioconda blast=2.16.0 -y
+mkdir -p blast_db
+cd blast_db || exit 1
+echo "Downloading UniProt SwissProt database..."
+wget --quiet --show-progress -N https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz
+if [ -f "uniprot_sprot.fasta.gz" ]; then
+    echo "Decompressing database..."
+    # -f overwrites a FASTA left over from an earlier run instead of failing
+    gunzip -kf uniprot_sprot.fasta.gz
+    if [ -f "uniprot_sprot.fasta" ]; then
+        echo "Creating BLAST database..."
+        makeblastdb -in uniprot_sprot.fasta -dbtype prot -out uniprot_swissprot -parse_seqids -title "UniProt SwissProt"
+        # Verify database creation
+        if [ -f "uniprot_swissprot.phr" ]; then
+            echo "BLAST database created successfully."
+            echo "You can now use it with: blastp -db uniprot_swissprot -query your_file.fasta"
+        else
+            echo "Error: BLAST database files not created!" >&2
+            exit 1
+        fi
+    else
+        echo "Error: Failed to decompress database!" >&2
+        exit 1
+    fi
+else
+    echo "Error: Failed to download database!" >&2
+    exit 1
+fi
+# This export only lasts for the current shell process; when the script is
+# executed (not sourced) the caller still has to set BLASTDB, see below.
+BLASTDB="$(pwd)"
+export BLASTDB
+echo "BLASTDB environment variable set to: $BLASTDB"
+echo "please add <export BLASTDB=$(pwd)> to your .bashrc or .zshrc file for persistent use."
+# install python packages
+echo "Installing required Python packages..."
+pip install openai gradio torch

test_data/interproscan_info.json ADDED Viewed

The diff for this file is too large to render. See raw diff

utils/cal_pr.py ADDED Viewed

	@@ -0,0 +1,154 @@

+import json
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.metrics import precision_recall_curve, average_precision_score
def load_go_data(file_path):
    """Load per-protein GO annotations from a JSON Lines file.

    Each non-blank line must be a JSON object with a ``protein_id`` key and a
    ``GO_id`` key holding an iterable of GO identifiers.

    Args:
        file_path: Path to the JSON Lines file.

    Returns:
        dict: Maps each protein ID to the set of its GO IDs. When a protein ID
        appears on several lines, the last occurrence wins.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If an entry lacks ``protein_id`` or ``GO_id``.
    """
    data = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # Blank lines (e.g. an extra newline at the end of the file) are
            # not records; json.loads would raise on them.
            if not line:
                continue
            entry = json.loads(line)
            data[entry["protein_id"]] = set(entry["GO_id"])
    return data
def calculate_pr_metrics(true_go_file, pred_go_file, scores_file=None,
                         pr_curve_path='pr_curve.png'):
    """Compute precision, recall and F1 for GO predictions, optionally with a PR curve.

    Only proteins present in both the ground-truth and prediction files are
    evaluated. When ``scores_file`` is given, every (protein, GO) pair from the
    union of true and predicted terms is pooled into one precision-recall
    curve, which is plotted and saved to ``pr_curve_path``.

    Args:
        true_go_file: JSON Lines file with the ground-truth GO terms.
        pred_go_file: JSON Lines file with the predicted GO terms.
        scores_file: Optional JSON Lines file whose entries carry a
            ``protein_id`` and a ``GO_scores`` mapping (GO ID -> score).
            GO terms without a score are treated as score 0.0.
        pr_curve_path: Where the PR-curve image is written (only used when
            ``scores_file`` is given).

    Returns:
        dict: ``average_precision``, ``average_recall``, ``average_f1``
        (means over the evaluated proteins, 0.0 when there are none),
        ``protein_metrics`` (per-protein values) and, when a curve was
        computed, ``pr_curve`` with the curve points and the best-F1 threshold.
    """
    true_go_data = load_go_data(true_go_file)
    pred_go_data = load_go_data(pred_go_file)

    # Optional per-protein GO scores.
    scores = {}
    if scores_file:
        with open(scores_file, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                entry = json.loads(line)
                scores[entry["protein_id"]] = dict(entry.get("GO_scores", {}))

    # Pooled labels/scores for the PR curve.
    all_true = []
    all_scores = []

    common_proteins = set(true_go_data.keys()) & set(pred_go_data.keys())

    protein_metrics = {}
    for protein_id in common_proteins:
        true_gos = true_go_data[protein_id]
        pred_gos = pred_go_data[protein_id]
        n_hits = len(true_gos & pred_gos)

        precision = n_hits / len(pred_gos) if pred_gos else 0.0
        # A protein without any true GO term counts as fully recalled.
        recall = n_hits / len(true_gos) if true_gos else 1.0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

        protein_metrics[protein_id] = {
            "precision": precision,
            "recall": recall,
            "f1": f1
        }

        if scores_file:
            protein_scores = scores.get(protein_id, {})
            for go in true_gos | pred_gos:
                all_true.append(1 if go in true_gos else 0)
                all_scores.append(protein_scores.get(go, 0.0))

    # np.mean of an empty list is NaN (and NaN is not valid JSON), so report
    # 0.0 when no protein could be evaluated.
    if protein_metrics:
        per_protein = list(protein_metrics.values())
        avg_precision = float(np.mean([m["precision"] for m in per_protein]))
        avg_recall = float(np.mean([m["recall"] for m in per_protein]))
        avg_f1 = float(np.mean([m["f1"] for m in per_protein]))
    else:
        avg_precision = avg_recall = avg_f1 = 0.0

    results = {
        "average_precision": avg_precision,
        "average_recall": avg_recall,
        "average_f1": avg_f1,
        "protein_metrics": protein_metrics
    }

    if scores_file and all_true and all_scores:
        y_true = np.array(all_true)
        y_score = np.array(all_scores)

        precision, recall, thresholds = precision_recall_curve(y_true, y_score)
        # Kept separate from the per-protein mean precision reported above.
        pooled_ap = average_precision_score(y_true, y_score)

        # precision/recall have one more element than thresholds; drop the
        # final point so each F1 value lines up with a threshold.
        p_at_thr = precision[:-1]
        r_at_thr = recall[:-1]
        denom = p_at_thr + r_at_thr
        f1_scores = np.divide(
            2 * p_at_thr * r_at_thr,
            denom,
            out=np.zeros_like(denom, dtype=float),
            where=denom > 0,
        )

        best_f1_idx = np.argmax(f1_scores)
        best_threshold = thresholds[best_f1_idx]
        best_precision = precision[best_f1_idx]
        best_recall = recall[best_f1_idx]
        best_f1 = f1_scores[best_f1_idx]

        plt.figure(figsize=(10, 8))
        plt.plot(recall, precision, label=f'平均精确率 = {pooled_ap:.3f}')
        plt.scatter(best_recall, best_precision, color='red',
                   label=f'最佳F1 = {best_f1:.3f} (阈值 = {best_threshold:.3f})')
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.title('Precision-Recall 曲线')
        plt.legend()
        plt.grid(True)

        plt.savefig(pr_curve_path, dpi=300)
        plt.close()

        results.update({
            "pr_curve": {
                "precision": precision.tolist(),
                "recall": recall.tolist(),
                "thresholds": thresholds.tolist(),
                "best_threshold": float(best_threshold),
                "best_f1": float(best_f1)
            }
        })

    return results
def main():
    """Command-line entry point: evaluate GO predictions and save the metrics as JSON.

    Parses ``--true``, ``--pred``, the optional ``--scores`` and ``--output``,
    runs ``calculate_pr_metrics``, writes the result to ``--output`` and
    prints the averaged metrics.
    """
    import argparse
    import os

    parser = argparse.ArgumentParser(description='计算GO预测的Precision和Recall并绘制PR曲线')
    parser.add_argument('--true', required=True, help='真实GO的JSON文件路径')
    parser.add_argument('--pred', required=True, help='预测GO的JSON文件路径')
    parser.add_argument('--scores', help='GO分数的JSON文件路径（可选）')
    parser.add_argument('--output', default='test_results/pr_results.json', help='输出结果的JSON文件路径')
    args = parser.parse_args()

    results = calculate_pr_metrics(args.true, args.pred, args.scores)

    # The default output lives under test_results/, which may not exist yet;
    # open(..., 'w') does not create missing parent directories.
    output_dir = os.path.dirname(args.output)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(args.output, 'w') as f:
        json.dump(results, f, indent=2)

    print(f"平均精确率: {results['average_precision']:.4f}")
    print(f"平均召回率: {results['average_recall']:.4f}")
    print(f"平均F1分数: {results['average_f1']:.4f}")
    if 'pr_curve' in results:
        print(f"最佳F1分数: {results['pr_curve']['best_f1']:.4f} (阈值: {results['pr_curve']['best_threshold']:.4f})")
        print("PR曲线已保存为 pr_curve.png")


if __name__ == "__main__":
    main()

utils/functions.py ADDED Viewed

	@@ -0,0 +1,56 @@

+import json
+from tqdm import tqdm
def merge_interproscan_data(merge_file1, merge_file2):
    """Merge sequence-keyed InterProScan annotations into a protein-keyed JSON file.

    ``merge_file1`` is read as a JSON object whose values are protein records
    with a ``sequence`` field; ``merge_file2`` is read as a JSON object keyed
    by sequence. For every protein whose sequence is a key of ``merge_file2``,
    the non-empty annotation fields are copied into the protein's
    ``interproscan_results``: ``PFAM`` becomes ``pfam_id``, ``GO`` becomes
    ``go_id`` and every other key is lower-cased. ``merge_file1`` is then
    overwritten with the merged data.

    Args:
        merge_file1: Path of the target file (read, updated and rewritten).
        merge_file2: Path of the source file with sequence-keyed annotations.

    Raises:
        KeyError: If a record in ``merge_file1`` has no ``sequence`` field.
    """
    # Explicit UTF-8: the file is written with ensure_ascii=False, so the
    # platform default encoding must not be relied upon.
    with open(merge_file1, 'r', encoding='utf-8') as f:
        target_data = json.load(f)

    with open(merge_file2, 'r', encoding='utf-8') as f:
        source_data = json.load(f)

    for protein_info in tqdm(target_data.values()):
        sequence = protein_info['sequence']
        if sequence not in source_data:
            continue
        source_info = source_data[sequence]

        results = protein_info.setdefault('interproscan_results', {})

        # Fields with a dedicated target name.
        for source_key, target_key in (('PFAM', 'pfam_id'), ('GO', 'go_id')):
            if source_info.get(source_key):
                results[target_key] = source_info[source_key]

        # Every other non-empty field is stored under its lower-cased name.
        for key, value in source_info.items():
            if key not in ('PFAM', 'GO') and value:
                results[key.lower()] = value

    with open(merge_file1, 'w', encoding='utf-8') as f:
        json.dump(target_data, f, indent=4, ensure_ascii=False)

    print("数据合并完成！")
if __name__ == "__main__":
    import argparse

    # Command-line wrapper around merge_interproscan_data.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '--merge_file1',
        default='data/processed_data/interproscan_info.json',
        help='合并的文件1',
    )
    cli.add_argument(
        '--merge_file2',
        default='test_data/interproscan_info.json',
        help='合并的文件2',
    )
    options = cli.parse_args()
    merge_interproscan_data(options.merge_file1, options.merge_file2)

utils/generate_llm_answers.py ADDED Viewed

	@@ -0,0 +1,166 @@

+import sys
+import os
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+import json
+from pathlib import Path
+from tqdm import tqdm
+from utils.openai_access import call_chatgpt
+from utils.mpr import MultipleProcessRunnerSimplifier
+from utils.generate_protein_prompt import generate_prompt
+qa_data = None
+def _load_qa_data(prompt_path):
+    global qa_data
+    if qa_data is None:
+        qa_data = {}
+        with open(prompt_path, 'r') as f:
+            for line in f:
+                if line.strip():
+                    item = json.loads(line.strip())
+                    qa_data[item['index']] = item
+    return qa_data
def process_single_qa(process_id, idx, qa_index, writer, save_dir):
    """Send one cached QA prompt to the LLM and store the answer as JSON.

    The result is written to ``<save_dir>/<protein_id>_<qa_index>.json``.
    Any exception is reported on stdout and swallowed.

    Args:
        process_id: Unused here; part of the runner callback signature.
        idx: Unused here; part of the runner callback signature.
        qa_index: Key of the QA record in the module-level ``qa_data``.
        writer: Unused here; part of the runner callback signature.
        save_dir: Directory that receives the per-question JSON file.
    """
    try:
        item = qa_data[qa_index]
        protein_id = item['protein_id']
        prompt = item['prompt']
        record = {
            'protein_id': protein_id,
            'index': qa_index,
            'question': item['question'],
            'ground_truth': item['ground_truth'],
        }

        # Query the LLM only after all required fields have been read.
        record['llm_answer'] = call_chatgpt(prompt)

        # The file name combines protein_id and the QA index.
        out_file = os.path.join(save_dir, f"{protein_id}_{qa_index}.json")
        with open(out_file, 'w') as out:
            json.dump(record, out, indent=2, ensure_ascii=False)

    except Exception as e:
        print(f"Error processing QA index {qa_index}: {str(e)}")
def get_missing_qa_indices(save_dir):
    """Return the QA indices whose result file is missing, unreadable or incomplete.

    For every index in the module-level ``qa_data`` the file
    ``<save_dir>/<protein_id>_<qa_index>.json`` is checked. A file counts as
    complete only when it holds a JSON object with a non-empty ``llm_answer``.
    Unreadable or incomplete files are deleted.

    Args:
        save_dir: Directory containing the per-question result files.

    Returns:
        set: QA indices that still need to be generated.
    """
    problem_qa_indices = set()

    for qa_index in tqdm(list(qa_data.keys()), desc="检查QA数据文件"):
        protein_id = qa_data[qa_index]['protein_id']
        json_file = Path(save_dir) / f"{protein_id}_{qa_index}.json"

        if not json_file.exists():
            problem_qa_indices.add(qa_index)
            continue

        # Read inside the `with`, validate and delete only after the file has
        # been closed (an open file cannot be removed on every platform).
        try:
            with open(json_file, 'r') as f:
                data = json.load(f)
        except (OSError, ValueError):
            # ValueError covers JSONDecodeError and UnicodeDecodeError.
            data = None

        if isinstance(data, dict) and data.get('llm_answer') not in (None, ''):
            continue

        problem_qa_indices.add(qa_index)
        try:
            json_file.unlink()  # drop the empty, broken or incomplete file
        except OSError:
            # Best effort: the index is already flagged for regeneration and
            # the file is overwritten on the next attempt.
            pass

    return problem_qa_indices
def main():
    """Generate LLM answers for all QA prompts, retrying the missing ones.

    Parses the command line, loads the QA prompts and then repeats up to
    ``--max_iterations`` rounds of "find missing results, generate them in
    parallel". Finally the indices that are still missing are written to
    ``final_missing_qa_indices.txt`` inside the save directory.
    """
    import argparse

    cli = argparse.ArgumentParser()
    cli.add_argument("--prompt_path", type=str,
                     default="data/processed_data/prompts@clean_test.jsonl",
                     help="Path to the JSONL file containing QA prompts")
    cli.add_argument("--n_process", type=int, default=64,
                     help="Number of parallel processes")
    cli.add_argument("--save_dir", type=str,
                     default="data/clean_test_results_top2",
                     help="Directory to save results")
    cli.add_argument("--max_iterations", type=int, default=3,
                     help="Maximum number of iterations to try generating all QA pairs")
    args = cli.parse_args()

    os.makedirs(args.save_dir, exist_ok=True)

    _load_qa_data(args.prompt_path)
    print(f"已加载 {len(qa_data)} 个QA对")

    def run_one(process_id, idx, qa_index, writer):
        # Adapter that binds the save directory to the runner callback.
        return process_single_qa(process_id, idx, qa_index, writer, args.save_dir)

    # Stays 0 when no round is executed at all.
    iteration = 0
    for iteration in range(1, args.max_iterations + 1):
        print(f"\n开始第 {iteration} 轮检查和生成")

        missing = get_missing_qa_indices(args.save_dir)
        if not missing:
            print("所有QA数据已成功生成！")
            break

        print(f"发现 {len(missing)} 个缺失的QA数据，准备生成")
        pending = sorted(missing)

        # Keep a record of what was missing at the start of this round.
        round_log = Path(args.save_dir) / f"missing_qa_indices_iteration_{iteration}.txt"
        with open(round_log, 'w') as f:
            for qa_index in pending:
                f.write(f"{qa_data[qa_index]['protein_id']}_{qa_index}\n")

        runner = MultipleProcessRunnerSimplifier(
            data=pending,
            do=run_one,
            n_process=args.n_process,
            split_strategy="static"
        )
        runner.run()
        print(f"第 {iteration} 轮生成完成")

    # Final verification pass.
    still_missing = get_missing_qa_indices(args.save_dir)
    if not still_missing:
        print(f"经过 {iteration} 轮生成，所有QA数据已成功生成！")
        return

    print(f"经过 {iteration} 轮生成后，仍有 {len(still_missing)} 个QA数据未成功生成")
    final_missing_ids_file = Path(args.save_dir) / "final_missing_qa_indices.txt"
    with open(final_missing_ids_file, 'w') as f:
        for qa_index in sorted(still_missing):
            f.write(f"{qa_data[qa_index]['protein_id']}_{qa_index}\n")
    print(f"最终缺失的QA索引已保存到: {final_missing_ids_file}")


if __name__ == "__main__":
    main()

utils/generate_llm_answers4enzyme.py ADDED Viewed

	@@ -0,0 +1,151 @@

+import sys
+import os
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+import json
+from pathlib import Path
+from tqdm import tqdm
+from utils.openai_access import call_chatgpt
+from utils.mpr import MultipleProcessRunnerSimplifier
+from utils.generate_protein_prompt import generate_prompt
+prompts = None
+def _load_prompts(prompt_path):
+    global prompts
+    if prompts is None:
+        prompts = json.load(open(prompt_path, 'r'))
+    return prompts
def read_protein_ids(protein_id_path):
    """Read protein IDs from a text file, one per line.

    Surrounding whitespace is stripped and blank lines are ignored.

    Args:
        protein_id_path: Path to the text file.

    Returns:
        list: The protein IDs in file order.
    """
    protein_ids = []
    with open(protein_id_path, 'r') as handle:
        for raw_line in handle:
            candidate = raw_line.strip()
            if candidate:
                protein_ids.append(candidate)
    return protein_ids
def process_single_protein(process_id, idx, protein_id, writer, save_dir):
    """Send one protein's cached prompt to the LLM and store the response.

    The response is written to ``<save_dir>/<protein_id>.json``. Any
    exception is reported on stdout and swallowed.

    Args:
        process_id: Unused here; part of the runner callback signature.
        idx: Unused here; part of the runner callback signature.
        protein_id: Key of the prompt in the module-level ``prompts``.
        writer: Unused here; part of the runner callback signature.
        save_dir: Directory that receives the per-protein JSON file.
    """
    try:
        answer = call_chatgpt(prompts[protein_id])
        # One file per protein.
        target = os.path.join(save_dir, f"{protein_id}.json")
        with open(target, 'w') as out:
            json.dump(answer, out, indent=2)
    except Exception as e:
        print(f"Error processing protein {protein_id}: {str(e)}")
def get_missing_protein_ids(save_dir):
    """Return the protein IDs whose result file is missing, unreadable or empty.

    For every key of the module-level ``prompts`` the file
    ``<save_dir>/<protein_id>.json`` is checked. A file counts as complete
    when it parses as JSON and its content is neither ``null`` nor empty.
    Unreadable or empty files are deleted.

    Args:
        save_dir: Directory containing the per-protein result files.

    Returns:
        set: Protein IDs that still need to be generated.
    """
    problem_protein_ids = set()

    for protein_id in tqdm(list(prompts.keys()), desc="检查蛋白质数据文件"):
        json_file = Path(save_dir) / f"{protein_id}.json"

        if not json_file.exists():
            problem_protein_ids.add(protein_id)
            continue

        # Read inside the `with`, delete only after the file has been closed
        # (an open file cannot be removed on every platform).
        try:
            with open(json_file, 'r') as f:
                data = json.load(f)
            is_complete = data is not None and len(data) > 0
        except (OSError, ValueError, TypeError):
            # ValueError: invalid JSON or undecodable bytes;
            # TypeError: a JSON scalar (number/bool) that has no length.
            is_complete = False

        if is_complete:
            continue

        problem_protein_ids.add(protein_id)
        try:
            json_file.unlink()  # drop the empty or broken file
        except OSError:
            # Best effort: the ID is already flagged for regeneration and the
            # file is overwritten on the next attempt.
            pass

    return problem_protein_ids
def main():
    """Generate LLM responses for all protein prompts, retrying the missing ones.

    Parses the command line, loads the prompts and then repeats up to
    ``--max_iterations`` rounds of "find missing results, generate them in
    parallel". Finally the protein IDs that are still missing are written to
    ``final_missing_protein_ids.txt`` inside the save directory.
    """
    import argparse

    cli = argparse.ArgumentParser()
    cli.add_argument("--prompt_path", type=str,
                     default="data/processed_data/prompts@clean_test.json",
                     help="Path to the file containing prompts")
    cli.add_argument("--n_process", type=int, default=64,
                     help="Number of parallel processes")
    cli.add_argument("--save_dir", type=str,
                     default="data/clean_test_results_top2",
                     help="Directory to save results")
    cli.add_argument("--max_iterations", type=int, default=3,
                     help="Maximum number of iterations to try generating all proteins")
    args = cli.parse_args()

    os.makedirs(args.save_dir, exist_ok=True)

    _load_prompts(args.prompt_path)
    print(f"已加载 {len(prompts)} 个提示")

    def run_one(process_id, idx, protein_id, writer):
        # Adapter that binds the save directory to the runner callback.
        return process_single_protein(process_id, idx, protein_id, writer, args.save_dir)

    # Stays 0 when no round is executed at all.
    iteration = 0
    for iteration in range(1, args.max_iterations + 1):
        print(f"\n开始第 {iteration} 轮检查和生成")

        missing = get_missing_protein_ids(args.save_dir)
        if not missing:
            print("所有蛋白质数据已成功生成！")
            break

        print(f"发现 {len(missing)} 个缺失的蛋白质数据，准备生成")
        pending = sorted(missing)

        # Keep a record of what was missing at the start of this round.
        round_log = Path(args.save_dir) / f"missing_protein_ids_iteration_{iteration}.txt"
        with open(round_log, 'w') as f:
            for protein_id in pending:
                f.write(f"{protein_id}\n")

        runner = MultipleProcessRunnerSimplifier(
            data=pending,
            do=run_one,
            n_process=args.n_process,
            split_strategy="static"
        )
        runner.run()
        print(f"第 {iteration} 轮生成完成")

    # Final verification pass.
    still_missing = get_missing_protein_ids(args.save_dir)
    if not still_missing:
        print(f"经过 {iteration} 轮生成，所有蛋白质数据已成功生成！")
        return

    print(f"经过 {iteration} 轮生成后，仍有 {len(still_missing)} 个蛋白质数据未成功生成")
    final_missing_ids_file = Path(args.save_dir) / "final_missing_protein_ids.txt"
    with open(final_missing_ids_file, 'w') as f:
        for protein_id in sorted(still_missing):
            f.write(f"{protein_id}\n")
    print(f"最终缺失的蛋白质ID已保存到: {final_missing_ids_file}")


if __name__ == "__main__":
    main()

utils/generate_protein_prompt.py ADDED Viewed

	@@ -0,0 +1,413 @@

+import json
+import sys
+import os
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from jinja2 import Template
+try:
+    from utils.protein_go_analysis import analyze_protein_go
+    from utils.prompts import ENZYME_PROMPT, RELATION_SEMANTIC_PROMPT, FUNCTION_PROMPT
+    from utils.get_motif import get_motif_pfam
+except ImportError:
+    from protein_go_analysis import analyze_protein_go
+    from prompts import ENZYME_PROMPT, RELATION_SEMANTIC_PROMPT, FUNCTION_PROMPT
+    from get_motif import get_motif_pfam
+from tqdm import tqdm
class InterProDescriptionManager:
    """Holds InterPro entry data and per-protein InterProScan results in memory.

    Both JSON files are read once at construction so that repeated
    description lookups do not touch the disk again.
    """

    def __init__(self, interpro_data_path, interproscan_info_path):
        """Remember both paths and load whichever files exist.

        Args:
            interpro_data_path: Path to interpro_data.json.
            interproscan_info_path: Path to interproscan_info.json.
        """
        self.interpro_data_path = interpro_data_path
        self.interproscan_info_path = interproscan_info_path
        self.interpro_data = None
        self.interproscan_info = None
        self._load_data()

    def _load_data(self):
        """Read each configured JSON file that exists; others stay ``None``."""
        sources = (
            ('interpro_data', self.interpro_data_path),
            ('interproscan_info', self.interproscan_info_path),
        )
        for attribute, path in sources:
            if path and os.path.exists(path):
                with open(path, 'r') as handle:
                    setattr(self, attribute, json.load(handle))

    def get_description(self, protein_id, selected_types=None):
        """Collect InterPro names and abstracts for one protein.

        Args:
            protein_id: Protein ID to look up in the InterProScan results.
            selected_types: Result types to include, e.g.
                ['superfamily', 'panther', 'gene3d']; ``None`` means none.

        Returns:
            dict: Maps each requested type that produced at least one known
            entry to ``{ipr_id: {'name': ..., 'abstract': ...}}``. Empty when
            data is missing or the protein is unknown.
        """
        wanted_types = [] if selected_types is None else selected_types

        if not self.interpro_data or not self.interproscan_info:
            return {}
        if protein_id not in self.interproscan_info:
            return {}

        hits_by_type = self.interproscan_info[protein_id].get('interproscan_results', {})

        described = {}
        for info_type in wanted_types:
            if info_type not in hits_by_type:
                continue

            per_type = {}
            for entry in hits_by_type[info_type]:
                for ipr_id in entry.values():
                    # Skip empty IDs and IDs without InterPro metadata.
                    if not ipr_id or ipr_id not in self.interpro_data:
                        continue
                    meta = self.interpro_data[ipr_id]
                    per_type[ipr_id] = {
                        'name': meta.get('name', ''),
                        'abstract': meta.get('abstract', ''),
                    }

            if per_type:
                described[info_type] = per_type

        return described
# Module-level caches for the InterProDescriptionManager instance and the
# LMDB connection.
_interpro_manager = None
_lmdb_db = None
_lmdb_path = None


def get_interpro_manager(interpro_data_path, interproscan_info_path):
    """Return the cached InterProDescriptionManager, creating it when needed.

    A new manager is built when none is cached yet or when the cached one was
    created for different paths, mirroring how ``get_lmdb_connection``
    re-opens on a path change. Previously the first manager was returned for
    every later call, even if other files were requested.

    Args:
        interpro_data_path: Path to interpro_data.json.
        interproscan_info_path: Path to interproscan_info.json.

    Returns:
        InterProDescriptionManager: Manager loaded from the given paths.
    """
    global _interpro_manager
    is_stale = (
        _interpro_manager is None
        or _interpro_manager.interpro_data_path != interpro_data_path
        or _interpro_manager.interproscan_info_path != interproscan_info_path
    )
    if is_stale:
        _interpro_manager = InterProDescriptionManager(interpro_data_path, interproscan_info_path)
    return _interpro_manager
def get_lmdb_connection(lmdb_path):
    """Return the cached LMDB environment for ``lmdb_path``, opening it if needed.

    An already open environment for the same path is reused. Otherwise any
    open environment is closed first, and a read-only one is opened when the
    path exists.

    Args:
        lmdb_path: Path of the LMDB database; may be empty or ``None``.

    Returns:
        The open LMDB environment, or ``None`` when ``lmdb_path`` is empty or
        does not exist.
    """
    global _lmdb_db, _lmdb_path

    # Fast path: the cached connection already points at this database.
    if _lmdb_db is not None and _lmdb_path == lmdb_path:
        return _lmdb_db

    if _lmdb_db is not None:
        _lmdb_db.close()

    if not lmdb_path or not os.path.exists(lmdb_path):
        _lmdb_db = None
        _lmdb_path = None
        return _lmdb_db

    import lmdb
    _lmdb_db = lmdb.open(lmdb_path, readonly=True)
    _lmdb_path = lmdb_path
    return _lmdb_db
def get_prompt_template(selected_info_types=None,lmdb_path=None):
    """Build the Jinja2 source text of the protein prompt.

    The template starts with ``ENZYME_PROMPT`` when ``lmdb_path`` is ``None``
    and with ``FUNCTION_PROMPT`` otherwise; in the latter case a trailing
    ``question`` section is appended as well.

    Args:
        selected_info_types: Information types such as
            ['motif', 'go', 'superfamily', 'panther']. The returned text does
            not depend on this argument; the template reads the variable of
            the same name at render time.
        lmdb_path: LMDB path; only checked against ``None`` here.

    Returns:
        str: The template source.
    """
    with_question = lmdb_path is not None
    header = FUNCTION_PROMPT if with_question else ENZYME_PROMPT

    sections = """
    input information:
    {%- if 'motif' in selected_info_types and motif_pfam %}
    motif:{% for motif_id, motif_info in motif_pfam.items() %}
    {{motif_id}}: {{motif_info}}
    {% endfor %}
    {%- endif %}
    {%- if 'go' in selected_info_types and go_data.status == 'success' %}
    GO:{% for go_entry in go_data.go_annotations %}
    ▢ GO term{{loop.index}}: {{go_entry.go_id}}
    • definition: {{ go_data.all_related_definitions.get(go_entry.go_id, 'not found definition') }}
    {% endfor %}
    {%- endif %}
    {%- for info_type in selected_info_types %}
    {%- if info_type not in ['motif', 'go'] and interpro_descriptions.get(info_type) %}
    {{info_type}}:{% for ipr_id, ipr_info in interpro_descriptions[info_type].items() %}
    ▢ {{ipr_id}}: {{ipr_info.name}}
    • description: {{ipr_info.abstract}}
    {% endfor %}
    {%- endif %}
    {%- endfor %}
    """

    template_source = header + "\n" + sections
    if with_question:
        template_source = template_source + "\n" + "question: \n {{question}}"
    return template_source
def get_qa_data(protein_id, lmdb_path):
    """Fetch every QA pair stored in the LMDB database for one protein.

    All entries are scanned. Entries whose key is a decimal string and whose
    value is a JSON list of at least three items are read as
    ``[protein_id, question, ground_truth, ...]``. Entries that fail to parse
    are skipped; an error while reading the database is printed.

    Args:
        protein_id: Protein ID to match against the stored records.
        lmdb_path: Path of the LMDB database.

    Returns:
        list: Dicts with ``question`` and ``ground_truth``; empty when the
        path is missing or nothing matches.
    """
    if not lmdb_path or not os.path.exists(lmdb_path):
        return []

    matches = []
    try:
        env = get_lmdb_connection(lmdb_path)
        if env is None:
            return []

        with env.begin() as txn:
            for raw_key, raw_value in txn.cursor():
                try:
                    # Only numerically indexed entries carry QA records.
                    if not raw_key.decode('utf-8').isdigit():
                        continue
                    record = json.loads(raw_value.decode('utf-8'))
                    if not isinstance(record, list) or len(record) < 3:
                        continue
                    owner, question, ground_truth = record[0], record[1], record[2]
                    if owner == protein_id:
                        matches.append({
                            'question': question,
                            'ground_truth': ground_truth
                        })
                except Exception:
                    # Skip entries that cannot be parsed.
                    continue
    except Exception as e:
        print(f"Error reading lmdb for protein {protein_id}: {e}")

    return matches
def generate_prompt(protein_id, protein2gopath, protein2pfam_path, pfam_descriptions_path, go_info_path,
                   interpro_data_path=None, interproscan_info_path=None, selected_info_types=None, lmdb_path=None, interpro_manager=None, question=None):
    """Render the prompt text for one protein.

    Args:
        protein_id: Protein ID the prompt is generated for.
        protein2gopath: Passed to ``analyze_protein_go``.
        protein2pfam_path: Passed to ``get_motif_pfam``.
        pfam_descriptions_path: Passed to ``get_motif_pfam``.
        go_info_path: Passed to ``analyze_protein_go``.
        interpro_data_path: Path to interpro_data.json.
        interproscan_info_path: Path to interproscan_info.json.
        selected_info_types: Information types to include, e.g.
            ['motif', 'go', 'superfamily', 'panther']; defaults to
            ['motif', 'go'].
        lmdb_path: Forwarded to ``get_prompt_template``.
        interpro_manager: InterProDescriptionManager to use; takes precedence
            over the two InterPro paths when provided.
        question: Question text for QA prompts.

    Returns:
        str: The rendered prompt.
    """
    if selected_info_types is None:
        selected_info_types = ['motif', 'go']

    analysis = analyze_protein_go(protein_id, protein2gopath, go_info_path)
    motif_pfam = get_motif_pfam(protein_id, protein2pfam_path, pfam_descriptions_path)

    # InterPro descriptions are only needed for types beyond motif/go.
    interpro_descriptions = {}
    extra_types = [t for t in selected_info_types if t not in ['motif', 'go']]
    if extra_types:
        if interpro_manager:
            description_source = interpro_manager
        elif interpro_data_path and interproscan_info_path:
            # Fall back to the module-level cached manager.
            description_source = get_interpro_manager(interpro_data_path, interproscan_info_path)
        else:
            description_source = None
        if description_source is not None:
            interpro_descriptions = description_source.get_description(protein_id, extra_types)

    go_succeeded = analysis["status"] == "success"
    go_data = {
        "status": analysis["status"],
        "go_annotations": analysis["go_annotations"] if go_succeeded else [],
        "all_related_definitions": analysis["all_related_definitions"] if go_succeeded else {}
    }

    template_source = get_prompt_template(selected_info_types,lmdb_path)
    return Template(template_source).render(
        protein_id=protein_id,
        selected_info_types=selected_info_types,
        go_data=go_data,
        motif_pfam=motif_pfam,
        interpro_descriptions=interpro_descriptions,
        question=question,
    )
def save_prompts_parallel(protein_ids, output_path, protein2gopath, protein2pfam_path, pfam_descriptions_path, go_info_path,
                         interpro_data_path=None, interproscan_info_path=None, selected_info_types=None, lmdb_path=None, n_process=8):
    """Generate protein prompts in parallel and save them to ``output_path``.

    Workers write one JSON line per prompt to ``output_path + '.tmp'``.
    Without ``lmdb_path`` the lines (``{protein_id: prompt}``) are merged
    into a single JSON object. With ``lmdb_path`` one prompt is produced per
    QA pair of each protein and the JSON Lines file is kept as is, every
    record carrying a running ``index``.

    Args:
        protein_ids: Iterable of protein IDs (surrounding whitespace is stripped).
        output_path: Destination file.
        protein2gopath, protein2pfam_path, pfam_descriptions_path,
        go_info_path, interpro_data_path, interproscan_info_path: Forwarded
            to ``generate_prompt``.
        selected_info_types: Information types to include; defaults to
            ['motif', 'go'].
        lmdb_path: LMDB database with QA pairs; switches to QA mode when set.
        n_process: Number of worker processes.
    """
    global _lmdb_db

    import json
    try:
        from utils.mpr import MultipleProcessRunnerSimplifier
    except ImportError:
        from mpr import MultipleProcessRunnerSimplifier

    if selected_info_types is None:
        selected_info_types = ['motif', 'go']

    # Build the InterPro manager once, before the workers start.
    interpro_manager = None
    needs_interpro = any(t not in ['motif', 'go'] for t in selected_info_types)
    if needs_interpro and interpro_data_path and interproscan_info_path:
        interpro_manager = InterProDescriptionManager(interpro_data_path, interproscan_info_path)

    # Counter and lock used to number the QA records.
    global_index = None
    index_lock = None
    if lmdb_path:
        import multiprocessing
        global_index = multiprocessing.Value('i', 0)
        index_lock = multiprocessing.Lock()

    tmp_path = output_path + '.tmp'

    def build_prompt(protein_id, question=None):
        # Single place that forwards the shared arguments to generate_prompt.
        return generate_prompt(protein_id, protein2gopath, protein2pfam_path, pfam_descriptions_path, go_info_path,
                               interpro_data_path, interproscan_info_path, selected_info_types, lmdb_path, interpro_manager, question)

    def process_protein(process_id, idx, protein_id, writer):
        protein_id = protein_id.strip()

        if not lmdb_path:
            # Plain mode: one prompt per protein.
            prompt = build_prompt(protein_id)
            if prompt != "" and writer:
                writer.write(json.dumps({protein_id: prompt}) + '\n')
            return

        # QA mode: make sure the LMDB connection exists, then emit one
        # prompt per QA pair.
        get_lmdb_connection(lmdb_path)
        for qa_pair in get_qa_data(protein_id, lmdb_path):
            question = qa_pair['question']
            ground_truth = qa_pair['ground_truth']
            prompt = build_prompt(protein_id, question)
            if prompt == "" or not writer:
                continue

            # Take the next index under the lock.
            with index_lock:
                current_index = global_index.value
                global_index.value += 1

            record = {
                "index": current_index,
                "protein_id": protein_id,
                "prompt": prompt,
                "question": question,
                "ground_truth": ground_truth
            }
            writer.write(json.dumps(record) + '\n')

    runner = MultipleProcessRunnerSimplifier(
        data=protein_ids,
        do=process_protein,
        save_path=tmp_path,
        n_process=n_process,
        split_strategy="static"
    )
    runner.run()

    # Release the module-level LMDB connection.
    if _lmdb_db is not None:
        _lmdb_db.close()
        _lmdb_db = None

    if lmdb_path:
        # QA mode keeps the JSON Lines output.
        import shutil
        shutil.move(tmp_path, output_path)
    else:
        # Plain mode: merge all lines into one JSON object (legacy format).
        merged = {}
        with open(tmp_path, 'r') as f:
            for line in f:
                if line.strip():
                    merged.update(json.loads(line))
        with open(output_path, 'w') as f:
            json.dump(merged, f, indent=2)

    # Remove the temporary file if it is still around.
    if os.path.exists(tmp_path):
        os.remove(tmp_path)
+if __name__ == "__main__":
+    import argparse
+    parser = argparse.ArgumentParser(description='Generate protein prompt')
+    parser.add_argument('--protein_path', type=str, default='data/raw_data/protein_ids_clean.txt')
+    parser.add_argument('--protein2pfam_path', type=str, default='data/processed_data/interproscan_info.json')
+    parser.add_argument('--pfam_descriptions_path', type=str, default='data/raw_data/all_pfam_descriptions.json')
+    parser.add_argument('--protein2gopath', type=str, default='data/processed_data/go_integration_final_topk2.json')
+    parser.add_argument('--go_info_path', type=str, default='data/raw_data/go.json')
+    parser.add_argument('--interpro_data_path', type=str, default='data/raw_data/interpro_data.json')
+    parser.add_argument('--interproscan_info_path', type=str, default='data/processed_data/interproscan_info.json')
+    parser.add_argument('--lmdb_path', type=str, default=None)
+    parser.add_argument('--output_path', type=str, default='data/processed_data/prompts@clean_test.json')
+    parser.add_argument('--selected_info_types', type=str, nargs='+', default=['motif', 'go'],
+                       help='选择要包含的信息类型，如: motif go superfamily panther gene3d')
+    parser.add_argument('--n_process', type=int, default=32)
+    args = parser.parse_args()
+    #更新output_path，需要包含selected_info_types
+    args.output_path = args.output_path.replace('.json', '_' + '_'.join(args.selected_info_types) + '.json')
+    print(args)
+    with open(args.protein_path, 'r') as file:
+        protein_ids = file.readlines()
+    save_prompts_parallel(
+        protein_ids=protein_ids,
+        output_path=args.output_path,
+        n_process=args.n_process,
+        protein2gopath=args.protein2gopath,
+        protein2pfam_path=args.protein2pfam_path,
+        pfam_descriptions_path=args.pfam_descriptions_path,
+        go_info_path=args.go_info_path,
+        interpro_data_path=args.interpro_data_path,
+        interproscan_info_path=args.interproscan_info_path,
+        selected_info_types=args.selected_info_types,
+        lmdb_path=args.lmdb_path
+    )
+    # 测试示例
+    # protein_id = 'A8CF74'
+    # prompt = generate_prompt(protein_id, 'data/processed_data/go_integration_final_topk2.json',
+    #                         'data/processed_data/interproscan_info.json', 'data/raw_data/all_pfam_descriptions.json',
+    #                         'data/raw_data/go.json', 'data/raw_data/interpro_data.json',
+    #                         'data/processed_data/interproscan_info.json',
+    #                         ['motif', 'go', 'superfamily', 'panther'])
+    # print(prompt)

utils/get_motif.py ADDED Viewed

	@@ -0,0 +1,63 @@

+import json
+from collections import Counter
+import os
+# Module-level caches; each stays None until its _load_* helper reads the file once.
+_pfam_dict = None  # parsed JSON from protein2pfam_path
+_pfam_descriptions = None  # parsed JSON from pfam_descriptions_path
+def _load_pfam_data(protein2pfam_path):
+    global _pfam_dict
+    if _pfam_dict is None:
+        with open(protein2pfam_path, 'r') as file:
+            _pfam_dict = json.load(file)
+def _load_pfam_descriptions(pfam_descriptions_path):
+    global _pfam_descriptions
+    if _pfam_descriptions is None:
+        with open(pfam_descriptions_path, 'r') as file:
+            _pfam_descriptions = json.load(file)
+def get_motif_pfam(protein_id, protein2pfam_path, pfam_descriptions_path):
+    """
+    获取指定蛋白质的pfam信息及其定义
+    参数:
+    protein_id: str - 蛋白质ID
+    protein2pfam_path: str - interproscan_info.json文件路径
+    pfam_descriptions_path: str - pfam描述文件路径
+    返回:
+    dict - pfam_id到定义的映射字典，例如{"PF04820": "definition content"}
+    """
+    _load_pfam_data(protein2pfam_path)
+    _load_pfam_descriptions(pfam_descriptions_path)
+    if protein_id not in _pfam_dict:
+        return {}
+    protein_info = _pfam_dict[protein_id]
+    _pfam_dicts = protein_info.get('interproscan_results', {}).get('pfam_id', [])
+    pfam_ids = []
+    for pfam_dict in _pfam_dicts:
+        for key,value in pfam_dict.items():
+            pfam_ids.append(key)
+    result = {}
+    for pfam_id in pfam_ids:
+        if pfam_id in _pfam_descriptions:
+            result[pfam_id] = _pfam_descriptions[pfam_id]['description']
+    return result
+if __name__ == "__main__":
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--protein_id", type=str, required=False, default="A8CF74")
+    parser.add_argument("--protein2pfam_path", type=str, required=False, default="data/processed_data/interproscan_info.json")
+    parser.add_argument("--pfam_descriptions_path", type=str, required=False, default="data/raw_data/all_pfam_descriptions.json")
+    args = parser.parse_args()
+    result = get_motif_pfam(args.protein_id, args.protein2pfam_path, args.pfam_descriptions_path)
+    print(result)

utils/mpr.py ADDED Viewed

	@@ -0,0 +1,433 @@

+import abc
+import os
+import time
+import sys
+from tqdm import tqdm
+from math import ceil
+class MultipleProcessRunner:
+    """
+    Abstarct class for running tasks with multiple process
+    There are three abstract methods that should be implemented:
+        1. __len__() : return the length of data
+        2. _target() : target function for each process
+        3. _aggregate() : aggregate results from each process
+    """
+    def __init__(self,
+                 data,
+                 save_path=None,
+                 n_process=1,
+                 verbose=True,
+                 total_only=True,
+                 log_step=1,
+                 start_method='fork',
+                 split_strategy="queue"):
+        """
+        Args:
+            data     : data to be processed that can be sliced
+            path     : final output path
+            n_process: number of process
+            verbose  : if True, display progress bar
+            total_only: If True, only total progress bar is displayed
+            log_step : For total progress bar, Next log will be printed when ``current iteration`` - ``last log
+                       iteration`` >= log_step
+            start_method: start method for multiprocessing
+            split_strategy: method to split data, can be 'queue', 'static'. If 'queue', data will be put into a
+                            queue and each process will get data from the queue. If 'static', data will be split
+                            into n_process parts and each process will get one part.
+        """
+        self.data = data
+        self.save_path = save_path
+        self.n_process = n_process
+        self.verbose = verbose
+        self.total_only = total_only
+        self.log_step = log_step
+        self.start_method = start_method
+        self.split_strategy = split_strategy
+        assert self.split_strategy in ["queue", "static"], f"Split strategy must be 'queue' or 'static', but got {self.split_strategy}"
+        # get terminal width to format output
+        try:
+            self.terminal_y = os.get_terminal_size()[0]
+        except Exception as e:
+            print(e)
+            print("Can't get terminal size, set terminal_y = None")
+            self.terminal_y = None
+    def _s2hms(self, seconds: float):
+        """
+        convert second format of time into hour:minute:second format
+        """
+        m, s = divmod(seconds, 60)
+        h, m = divmod(m, 60)
+        return "%02d:%02d:%02d" % (h, m, s)
+    def _display_time(self, st_time, now, total):
+        ed_time = time.time()
+        running_time = ed_time - st_time
+        rest_time = running_time * (total - now) / now
+        iter_sec = f"{now / running_time:.2f}it/s" if now > running_time else f"{running_time / now:.2f}s/it"
+        return f' [{self._s2hms(running_time)} < {self._s2hms(rest_time)}, {iter_sec}]'
+    def _display_bar(self, now, total, length):
+        now = now if now <= total else total
+        num = now * length // total
+        progress_bar = '[' + '#' * num + '_' * (length - num) + ']'
+        return progress_bar
+    def _display_all(self, now, total, desc, st_time):
+        # make a progress bar
+        length = 50
+        progress_bar = self._display_bar(now, total, length)
+        time_display = self._display_time(st_time, now, total)
+        display = f'{desc}{progress_bar} {int(now / total * 100):02d}% {now}/{total}{time_display}'
+        # Clean a line
+        width = self.terminal_y if self.terminal_y is not None else 100
+        num_space = width - len(display)
+        if num_space > 0:
+            display += ' ' * num_space
+        else:
+            length += num_space
+            progress_bar = self._display_bar(now, total, length)
+            display = f'{desc}{progress_bar} {int(now / total * 100):02d}% {now}/{total}{time_display}'
+        # Set color
+        display = f"\033[31m{display}\033[0m"
+        return display
+    # Print progress bar at specific position in terminal
+    def terminal_progress_bar(self,
+                              process_id: int,
+                              now: int,
+                              total: int,
+                              desc: str = ''):
+        """
+        Args:
+            process_id: process id
+            now: now iteration number
+            total: total iteration number
+            desc: description
+        """
+        st_time = self.process_st_time[process_id]
+        # Aggregate total information
+        self.counts[process_id] = now
+        self._total_display(self.process_st_time["total"])
+        if not self.total_only:
+            process_display = self._display_all(now, total, desc, st_time)
+            if self.terminal_y is not None:
+                sys.stdout.write(f"\x1b7\x1b[{process_id + 1};{0}f{process_display}\x1b8")
+                sys.stdout.flush()
+            else:
+                print(f"\x1b7\x1b[{process_id + 1};{0}f{process_display}\x1b8", flush=True)
+    # Print global information
+    def _total_display(self, st_time):
+        if self.total_display_callable.value == 1:
+            self.total_display_callable.value = 0
+            cnt = sum([self.counts[i] for i in range(self.n_process)])
+            if cnt - self.last_cnt.value >= self.log_step:
+                total_display = self._display_all(cnt, self.__len__(), f"Total: ", st_time)
+                self.last_cnt.value = cnt
+                x = self.n_process + 1 if not self.total_only else 0
+                # if self.terminal_y is not None:
+                #     sys.stdout.write(f"\x1b7\x1b[{x};{0}f{total_display}\x1b8")
+                #     sys.stdout.flush()
+                # else:
+                #     print(f"\x1b7\x1b[{x};{0}f{total_display}\x1b8", flush=True)
+                print(f"\r\x1b7\x1b[{x};{0}f{total_display}\x1b8", flush=True, end="")
+            self.total_display_callable.value = 1
+    def run(self):
+        """
+        The function is used to run a multi-process task
+        Returns: return the result of function '_aggregate()'
+        """
+        if self.split_strategy == "static":
+            return self.run_static()
+        elif self.split_strategy == "queue":
+            return self.run_queue()
+    def run_static(self):
+        """
+        Running multi-process task with static data splits
+        """
+        # import multiprocessing as mp
+        import multiprocess as mp
+        mp.set_start_method(self.start_method, force=True)
+        # total number of data that is already processed
+        self.counts = mp.Manager().dict({i: 0 for i in range(self.n_process)})
+        # record start time for each process
+        self.process_st_time = {"total": time.time()}
+        # set a lock to call total number display
+        self.total_display_callable = mp.Value('d', 1)
+        # Save last log iteration number
+        self.last_cnt = mp.Value('d', 0)
+        num_per_process = ceil(self.__len__() / self.n_process)
+        if self.save_path is not None:
+            file_name, suffix = os.path.splitext(self.save_path)
+        process_list = []
+        sub_paths = []
+        for i in range(self.n_process):
+            st = i * num_per_process
+            ed = st + num_per_process
+            # construct slice and sub path for sub process
+            data_slice = self.data[st: ed]
+            sub_path = None
+            # Create a directory to save sub-results
+            if self.save_path is not None:
+                save_dir = f"{file_name}{suffix}_temp"
+                os.makedirs(save_dir, exist_ok=True)
+                sub_path = f"{save_dir}/temp_{i}{suffix}"
+            # construct sub process
+            input_args = (i, data_slice, sub_path)
+            self.process_st_time[i] = time.time()
+            p = mp.Process(target=self._target_static, args=input_args)
+            p.start()
+            process_list.append(p)
+            sub_paths.append(sub_path)
+        for p in process_list:
+            p.join()
+        # aggregate results and remove temporary directory
+        results = self._aggregate(self.save_path, sub_paths)
+        if self.save_path is not None:
+            save_dir = f"{file_name}{suffix}_temp"
+            os.rmdir(save_dir)
+        return results
+    def run_queue(self):
+        """
+        Running multi-process task with shared queue
+        """
+        # import multiprocessing as mp
+        import multiprocess as mp
+        mp.set_start_method(self.start_method, force=True)
+        # total number of data that is already processed
+        self.counts = mp.Manager().dict({i: 0 for i in range(self.n_process)})
+        # Initialize a queue to input data
+        self.q = mp.Queue(self.__len__())
+        for d in tqdm(self.data, "Input data to queue"):
+            self.q.put(d)
+        # record start time for each process
+        self.process_st_time = {"total": time.time()}
+        # set a lock to call total number display
+        self.total_display_callable = mp.Value('d', 1)
+        # Save last log iteration number
+        self.last_cnt = mp.Value('d', 0)
+        if self.save_path is not None:
+            file_name, suffix = os.path.splitext(self.save_path)
+        process_list = []
+        sub_paths = []
+        for i in range(self.n_process):
+            sub_path = None
+            # Create a directory to save sub-results
+            if self.save_path is not None:
+                save_dir = f"{file_name}{suffix}_temp"
+                os.makedirs(save_dir, exist_ok=True)
+                sub_path = f"{save_dir}/temp_{i}{suffix}"
+            # construct sub process
+            input_args = (i, sub_path)
+            self.process_st_time[i] = time.time()
+            p = mp.Process(target=self._target_queue, args=input_args)
+            p.start()
+            process_list.append(p)
+            sub_paths.append(sub_path)
+        for p in process_list:
+            p.join()
+        # aggregate results and remove temporary directory
+        results = self._aggregate(self.save_path, sub_paths)
+        if self.save_path is not None:
+            save_dir = f"{file_name}{suffix}_temp"
+            os.rmdir(save_dir)
+        return results
+    @abc.abstractmethod
+    def _aggregate(self, final_path: str, sub_paths):
+        """
+        This function is used to aggregate results from sub processes into a file
+        Args:
+            final_path: path to save final results
+            sub_paths : list of sub paths
+        Returns: None or desirable results specified by user
+        """
+        raise NotImplementedError
+    @abc.abstractmethod
+    def _target_static(self, process_id, data, sub_path):
+        """
+        The main body to operate data in one process. This function is used when split_strategy is 'static'.
+        Args:
+            i       : process id
+            data    : data slice
+            sub_path: sub path to save results
+        """
+        raise NotImplementedError
+    @abc.abstractmethod
+    def _target_queue(self, process_id, sub_path):
+        """
+        The main body to operate data in one process. This function is used when split_strategy is 'queue'.
+        Args:
+            i       : process id
+            sub_path: sub path to save results
+        """
+        raise NotImplementedError
+    @abc.abstractmethod
+    def __len__(self):
+        raise NotImplementedError
+class MultipleProcessRunnerSimplifier(MultipleProcessRunner):
+    """
+    A simplified version of MultipleProcessRunner.
+    User only need to implement the function 'do', then it will be automatically executed
+    in every iteration after call the function 'run'.
+    If 'save_path' is specified, it will open a file in the 'sub_path' into which
+    user can write results, and results will be aggregated into 'save_path'.
+    The procedure would be like:
+        ...
+        with open(sub_path, 'w') as w:
+            for i, d in enumerate(data):
+                self.do(process_id, i, d, w) # You can write results into the file.
+                ...
+    The 'do' function should be like:
+        def do(process_id, idx, data, writer):
+            ...
+    If 'save_path' is None, the argument 'writer' will be set to None.
+    """
+    def __init__(self, data, do, return_results=False, **kwargs):
+        super().__init__(data=data, **kwargs)
+        self.do = do
+        self.return_results = return_results
+    def run(self):
+        self.start_time = time.time()
+        return super().run()
+    def _aggregate(self, final_path: str, sub_paths):
+        results = []
+        w = open(final_path, 'w') if final_path is not None else None
+        if self.verbose:
+            iterator = tqdm(enumerate(sub_paths), "Aggregating results...")
+        else:
+            iterator = enumerate(sub_paths)
+        for i, sub_path in iterator:
+            if sub_path is None and self.return_results:
+                sub_path = f"MultipleProcessRunnerSimplifier_{self.start_time}_{i}.tmp"
+            if sub_path is not None:
+                with open(sub_path, 'r') as r:
+                    for line in r:
+                        if w is not None:
+                            w.write(line)
+                        if self.return_results:
+                            results.append(line[:-1])
+                os.remove(sub_path)
+        return results
+    def _target_static(self, process_id, data, sub_path):
+        if sub_path is None and self.return_results:
+            sub_path = f"MultipleProcessRunnerSimplifier_{self.start_time}_{process_id}.tmp"
+        w = open(sub_path, 'w') if sub_path is not None else None
+        for i, d in enumerate(data):
+            self.do(process_id, i, d, w)
+            if self.verbose:
+                self.terminal_progress_bar(process_id, i + 1, len(data), f"Process{process_id} running...")
+        if w is not None:
+            w.close()
+    def _target_queue(self, process_id, sub_path):
+        if sub_path is None and self.return_results:
+            sub_path = f"MultipleProcessRunnerSimplifier_{self.start_time}_{process_id}.tmp"
+        w = open(sub_path, 'w') if sub_path is not None else None
+        i = 0
+        while not self.q.empty():
+            d = self.q.get()
+            self.do(process_id, i, d, w)
+            if self.verbose:
+                self.terminal_progress_bar(process_id, i + 1, self.__len__(), f"Process{process_id} running...")
+            i += 1
+        if w is not None:
+            w.close()
+    def __len__(self):
+        return len(self.data)

utils/openai_access.py ADDED Viewed

	@@ -0,0 +1,99 @@

+import openai
+import time
+from openai import OpenAI
+# from zhipuai import ZhipuAI
+import requests
+import re
+# from utils.parse_llm_output import try_parse_json_object
+def get_oai_completion(prompt):
+    # api_pools = [
+    #     ("your_api_key","base_url","model_name"),
+    # ]
+    api_pools = [
+        ("sk-0060332083ea440bb35b676df023ce01","https://dashscope.aliyuncs.com/compatible-mode/v1","deepseek-v3")
+    ]
+    api = api_pools[0]
+    api_key, base_url, model = api
+    if "GLM" in model:
+        client = ZhipuAI(api_key=api_key)
+        # from utils.prompts import GLM_JSON_RESPONSE_PREFIX, GLM_JSON_RESPONSE_SUFFIX, system_prompt
+        # system_prompt = f"{GLM_JSON_RESPONSE_PREFIX}{system_prompt}"
+        # user_prompt = f"{prompt}{GLM_JSON_RESPONSE_SUFFIX}"
+        system_prompt = "You are a helpful assistant."
+        user_prompt = prompt
+    else:
+        client = OpenAI(api_key=api_key, base_url=base_url)
+        system_prompt = "You are a helpful assistant."
+        user_prompt = prompt
+    try:
+        if "GLM" in model:
+            response = client.chat.completions.create(
+                model=model,
+                messages=[
+                    {"role": "system", "content": system_prompt},
+                    {"role": "user", "content": user_prompt},
+                ],
+                # response_format={ "type": "json_object" },
+                temperature=0.1,
+                top_p=0.7,
+                stream=False
+            )
+        else:
+            # print(user_prompt)
+            response = client.chat.completions.create(
+                model=model,
+                messages=[
+                    {"role": "system", "content": system_prompt},
+                    {"role": "user", "content": user_prompt},
+                ],
+                # response_format={ "type": "json_object" },
+                #根据任务的不同来调整
+                temperature=0.8,
+                max_tokens=8000,
+                stream=False
+            )
+        res = response.choices[0].message.content
+        # if "GLM" in model:
+        #     pattern = re.compile(r"```(?:json\s+)?(\{.*?\})\s*```", re.DOTALL)
+        #     match = pattern.search(res)
+        #     if match:
+        #         gpt_output, _ = try_parse_json_object(match.group(1).strip())
+        #     else:
+        #         gpt_output = res
+        # else:
+        #     gpt_output = res
+        pattern = re.compile(r"```(?:json\s+)?(\{.*?\})\s*```", re.DOTALL)
+        match = pattern.search(res)
+        # if match:
+        #     gpt_output, _ = try_parse_json_object(match.group(1).strip())
+        # else:
+        #     gpt_output = res
+        gpt_output = res
+        return gpt_output
+    except requests.exceptions.Timeout:
+        print("The API request timed out. Please try again later.")
+        return None
+    except Exception as e:
+        print(e)
+        return None
+def call_chatgpt(ins):
+    success = False
+    re_try_count = 5
+    ans = ''
+    while not success and re_try_count >= 0:
+        re_try_count -= 1
+        try:
+            ans = get_oai_completion(ins)
+            success = True
+        except Exception as e:
+            print(f"Retry times: {re_try_count}; Error: {e}", flush=True)
+            time.sleep(5)
+    return ans

utils/prompts.py ADDED Viewed

	@@ -0,0 +1,66 @@

+# Instruction prompt for EC-number prediction: it describes a self-correction
+# protocol and requires the reply to end with an `[EC_PREDICTION]` line.
+ENZYME_PROMPT = """**You are a senior systems biologist.** Analyze the input information to predict ec number using structured reasoning. Crucially, implement a **self-correct mechanism** with these steps:
+### Self-Correct Protocol
+1. **Enzyme Verification**
+   - Discard ANY information contradicting the enzyme nature (catalytic activity).
+   - Example: If a GO term implies non-enzymatic function (e.g., structural role), reject it immediately.
+2. **Conflict Resolution (Majority Rule)**
+   - Identify conflicts between:
+     - Motif vs. Motif
+     - GO term vs. GO term
+     - Motif vs. GO term
+   - **Resolution Principle**:
+     - If one element (A) conflicts with ≥2 logically consistent elements (B,C,D), discard A.
+     - Preserve high-confidence information supported by multiple sources.
+   - *Note*: Compatible functions (e.g., catalytic activity + cofactor binding) are NOT conflicts.
+3. **Output Filtered Information**
+   - Explicitly list retained/discarded items with reasons before analysis.
+### Final Output Requirement for EC Number
+After completing the full biological analysis, you **must** conclude your entire response with a special section for automated parsing. This section must adhere to the following precise logic and format:
+**Decision Logic:**
+1.  **Default to a Single EC Number:** Your primary goal is to predict the **single, most likely EC number** for the protein's primary catalytic activity.
+2.  **Handling Ambiguity:** If the evidence suggests a single function but points to several possible EC numbers (e.g., a family motif describes related but distinct activities), you must **commit to one choice**. Select the EC number that is most representative, most common, or best supported by the combined evidence. **Do not list multiple options out of uncertainty.**
+3.  **Exception for Bifunctionality:** You may only predict multiple EC numbers if there is **explicit and strong evidence that a single protein is bifunctional**, meaning it contains distinct domains that perform two or more separate catalytic reactions. This requires clear support, such as a motif description explicitly stating "bifunctional" or the presence of multiple, distinct top-level catalytic GO terms (e.g., both a kinase and a cyclase activity).
+**Formatting Rules:**
+1.  The section must begin on a new line with the exact tag: `[EC_PREDICTION]`
+2.  **Single Prediction (Standard Case):** Follow the tag with a single space and the predicted EC number.
+    *   Example: `[EC_PREDICTION] 1.14.99.54`
+3.  **Bifunctional Prediction (Exceptional Case):** List the EC numbers separated by a comma with no spaces.
+    *   Example: `[EC_PREDICTION] 2.7.1.1,4.6.1.1`
+4.  Do not add any other text, explanation, or punctuation on this line.
+"""
+# Prompt fragment that explains the meaning of the GO relation types
+# (is_a, part_of, has part, regulates, ...).
+RELATION_SEMANTIC_PROMPT = """
+relation semantic:
+• is_a: The is a relation forms the basic structure of GO. If we say A is a B, we mean that node A is a subtype of node B. For example, mitotic cell cycle is a cell cycle, or lyase activity is a catalytic activity.
+• part_of: The part of relation is used to represent part-whole relationships. part of has a specific meaning in GO, and a part of relation would only be added between A and B if B is necessarily part of A: wherever B exists, it is as part of A, and the presence of the B implies the presence of A. However, given the occurrence of A, we cannot say for certain that B exists.
+• has part: The logical complement to the part of relation is has part, which represents a part-whole relationship from the perspective of the parent. As with part of, the GO relation has part is only used in cases where A always has B as a part, i.e. where A necessarily has part B. If A exists, B will always exist; however, if B exists, we cannot say for certain that A exists. i.e. all A have part B; some B part of A.
+• ends during: X ends_during Y iff: ((start(Y) before_or_simultaneous_with end(X)) AND end(X) before_or_simultaneous_with end(Y).
+• happens during: X happens_during Y iff: (start(Y) before_or_simultaneous_with start(X)) AND (end(X) before_or_simultaneous_with end(Y))
+• negatively regulates: p negatively regulates q iff p regulates q, and p decreases the rate or magnitude of execution of q.
+• occurs in: b occurs_in c =def b is a process and c is a material entity or immaterial entity& there exists a spatiotemporal region r and b occupies_spatiotemporal_region r.& forall(t) if b exists_at t then c exists_at t & there exist spatial regions s and s’ where & b spatially_projects_onto s at t& c is occupies_spatial_region s’ at t& s is a proper_continuant_part_of s’ at t
+• positively regulates: p positively regulates q iff p regulates q, and p increases the rate or magnitude of execution of q.
+• regulates: A relation that describes case in which one process directly affects the manifestation of another process or quality, i.e. the former regulates the latter. The target of the regulation may be another process, for e.g., regulation of a pathway or an enzymatic reaction, or it may be a quality, such as cell size or pH. Analogously to part of, this relation is used specifically to mean necessarily regulates: if both A and B are present, B always regulates A, but A may not always be regulated by B., i.e. all B regulate A; some A are regulated by B.
+• subproperty of: is used to establish a hierarchy among properties, indicating that a more specific property inherits characteristics from a more general one.
+• inverse of: is used to define the reverse direction of a relationship between the same pair of individuals.
+"""
+# Instruction prompt for answering a free-form question from the supplied information.
+FUNCTION_PROMPT = """**You are a senior systems biologist.** Analyze the input information to answer the given question.
+"""
+# Grading prompt: asks for a 0-100 score of a paragraph against reference facts.
+# `{{ground_truth}}` and `{{llm_answer}}` are placeholders to be filled by the
+# caller. NOTE(review): the substitution mechanism is not visible here
+# (presumably plain string replacement or a Jinja-style template) - confirm
+# against the caller.
+LLM_SCORE_PROMPT = """As an expert biologist, you are assigned to check one paragraph is aligned with facts or not. You will receive some facts, and
+one paragraph. Score the paragraph between 0 to 100.
+The score should be the format of {"score": score}
+Here's the facts:
+{{ground_truth}}
+Here's the paragraph:
+{{llm_answer}}
+"""

utils/protein_go_analysis.py ADDED Viewed

	@@ -0,0 +1,119 @@

+import json
+import sys
+import os
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from collections import defaultdict
+# Module-level caches; each is None until loaded, and is reset to None if loading fails.
+_go_data = None  # parsed JSON from go_info_path
+_protein_go_dict = None  # maps each record's 'protein_id' to its 'GO_id' value
+def _load_go_data(go_info_path):
+    """懒加载GO数据"""
+    global _go_data
+    if _go_data is None:
+        try:
+            with open(go_info_path, 'r') as f:
+                _go_data = json.load(f)
+        except Exception as e:
+            print(f"加载GO数据文件时发生错误: {str(e)}")
+            _go_data = None
+def _load_protein_go_dict(protein2gopath):
+    """懒加载蛋白质-GO映射数据"""
+    global _protein_go_dict
+    if _protein_go_dict is None:
+        try:
+            _protein_go_dict = {}
+            with open(protein2gopath, 'r') as f:
+                for line in f:
+                    data = json.loads(line)
+                    _protein_go_dict[data['protein_id']] = data['GO_id']
+        except Exception as e:
+            print(f"加载蛋白质-GO映射数据时发生错误: {str(e)}")
+            _protein_go_dict = None
+def get_go_definition(go_id, go_info_path):
+    """获取GO term的定义"""
+    _load_go_data(go_info_path)
+    if _go_data is None:
+        return None
+    if not go_id.startswith('GO_'):
+        go_id = f"GO_{go_id}"
+    full_id = f"http://purl.obolibrary.org/obo/{go_id}"
+    for node in _go_data['graphs'][0]['nodes']:
+        if node['id'] == full_id:
+            if 'meta' in node and 'definition' in node['meta']:
+                return node['meta']['definition']['val']
+    return None
+def analyze_protein_go(protein_id, protein2gopath, go_info_path):
+    """
+    分析蛋白质的GO注释信息，包括GO ID和定义
+    参数：
+    protein_id: str - 蛋白质ID
+    protein2gopath: str - 蛋白质-GO映射文件路径
+    返回：
+    dict - 包含GO信息的字典
+    """
+    _load_protein_go_dict(protein2gopath)
+    if _protein_go_dict is None:
+        return {
+            "status": "error",
+            "message": "GO数据加载失败"
+        }
+    if protein_id not in _protein_go_dict:
+        return {
+            "status": "error",
+            "message": f"未找到蛋白质 {protein_id} 的GO注释"
+        }
+    go_ids = _protein_go_dict[protein_id]
+    go_info = []
+    all_definitions = {}
+    for go_id in go_ids:
+        # 获取GO定义
+        definition = get_go_definition(go_id, go_info_path)
+        if definition:
+            all_definitions[go_id] = definition
+        go_info.append({
+            "go_id": go_id
+        })
+    return {
+        "status": "success",
+        "protein_id": protein_id,
+        "go_annotations": go_info,
+        "all_related_definitions": all_definitions
+    }
# Usage example: look up one protein from the command line and print its GO data.
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description='Analyze protein GO annotations')
    # All three options are plain strings with a built-in default.
    for option, fallback in (
        ('--protein_id', 'A8CF74'),
        ('--protein2gopath', 'data/processed_data/go_integration_final_topk2.json'),
        ('--go_info_path', 'data/raw_data/go.json'),
    ):
        parser.add_argument(option, type=str, default=fallback)
    args = parser.parse_args()
    result = analyze_protein_go(args.protein_id, args.protein2gopath, args.go_info_path)
    if result["status"] != "success":
        # Error results carry only a human-readable message.
        print(result["message"])
    else:
        print(f"\nProtein {result['protein_id']} GO annotations:")
        for anno in result["go_annotations"]:
            print(f"\nGO ID: {anno['go_id']}")
        print("\nAll related GO ID definitions:")
        for go_id, definition in result["all_related_definitions"].items():
            print(f"\nGO:{go_id}")
            print(f"Definition: {definition}")

utils/utils.py ADDED Viewed

	@@ -0,0 +1,158 @@

+from Bio import ExPASy
+from Bio import SeqIO
+import json
+from Bio.Blast import NCBIXML
def get_protein_sequence_biopython(uniprot_id):
    """
    Fetch a protein sequence by UniProt ID using BioPython.
    Args:
        uniprot_id (str): UniProt ID (e.g. P12345)
    Returns:
        str: the protein sequence, or a string starting with "Error: " that
        carries the exception text when anything goes wrong.
    """
    try:
        with ExPASy.get_sprot_raw(uniprot_id) as swiss_handle:
            # The handle is parsed as a single record in "swiss" format.
            record = SeqIO.read(swiss_handle, "swiss")
        return str(record.seq)
    except Exception as exc:
        # Failures are reported through the return value, not raised.
        return "Error: " + str(exc)
def extract_interproscan_metrics(file_path, librarys="PFAM"):
    """
    Extract per-protein domain and GO information from an InterProScan JSON result.
    Args:
        file_path (str): path to the InterProScan JSON result file.
        librarys (str | list[str]): signature library name(s) to collect.
            A single name may be given as a plain string; the default
            "PFAM" is treated as ["PFAM"].
    Returns:
        dict: maps each protein sequence to a dict holding one list per
        requested library (items are ``{signature accession: entry
        accession or None}``) plus a "GO" list with the GO IDs found on the
        entries of all matches, whatever their library.
    """
    # A bare string would be iterated character by character (creating keys
    # "P", "F", "A", "M") and matched by substring, which ends in a KeyError
    # on the first PFAM match - so wrap it into a one-element list.
    if isinstance(librarys, str):
        librarys = [librarys]
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    protein_info = {}
    for protein in data["results"]:
        domain_info = {library: [] for library in librarys}
        domain_info["GO"] = []
        for match in protein["matches"]:
            signature = match["signature"]
            entry = signature["entry"]
            library = signature["signatureLibraryRelease"]["library"]
            if library in librarys:
                # Signatures without an entry are kept, mapped to None.
                entry_accession = entry["accession"] if entry else None
                domain_info[library].append({signature["accession"]: entry_accession})
            # Collect GO cross-references; a missing or empty goXRefs is skipped.
            if entry:
                for go_xref in entry.get("goXRefs") or []:
                    if go_xref["databaseName"] == "GO":
                        domain_info["GO"].append(go_xref["id"])
        # Proteins sharing the same sequence overwrite each other here.
        protein_info[protein["sequence"]] = domain_info
    return protein_info
def get_seqnid(file_path):
    """
    Read a FASTA file into a dict of sequence ID -> sequence.
    Args:
        file_path (str): path to the FASTA file.
    Returns:
        dict: the ID is the first whitespace-separated token after ">";
        the sequence is the concatenation of the stripped lines that follow
        the header. Lines before the first header are discarded, and a
        repeated ID keeps the last sequence.
    """
    records = {}
    name = None
    pieces = []
    with open(file_path, 'r') as fasta:
        for raw in fasta:
            text = raw.strip()
            if not text.startswith(">"):
                pieces.append(text)
                continue
            # A new header closes the record that was being collected.
            if name is not None:
                records[name] = "".join(pieces)
            name = text[1:].split()[0]  # keep only the part before whitespace
            pieces = []
    # Flush the last record, which no following header closes.
    if name is not None:
        records[name] = "".join(pieces)
    return records
def tofasta(fasta_path, uids, seqs):
    """
    Write sequences to a file in FASTA format.
    Parameters:
    - fasta_path: str, path of the FASTA file to create or overwrite
    - uids: list of str, sequence identifiers used as header lines
    - seqs: list of str, sequences matching uids position by position
    Raises:
    - ValueError: if uids and seqs differ in length (checked before the
      file is opened)
    """
    if len(uids) != len(seqs):
        raise ValueError("Length of uids and seqs must be equal")
    with open(fasta_path, 'w') as out:
        # One ">uid" header line, then the whole sequence on a single line.
        out.writelines(f">{name}\n{residues}\n" for name, residues in zip(uids, seqs))
def extract_blast_metrics(xml_file):
    """
    Extract per-HSP metrics from a BLAST XML result file.
    Args:
        xml_file (str): path to the BLAST XML output.
    Returns:
        dict: maps each record's query to a list with one dict per HSP:
        - "ID": second "|"-separated field of the hit ID, or the whole hit
          ID when it contains no "|"
        - "Identity%": identities / alignment length * 100
        - "Coverage%": alignment length / query length * 100
        - "E-value": a "1.0e-05"-style string below 0.001, otherwise the
          value rounded to 4 places
        - "Bit Score": rounded to 1 place
        - "Positive%": positives / alignment length * 100
    """
    metrics_by_query = {}
    with open(xml_file) as handle:
        for record in NCBIXML.parse(handle):
            query_len = record.query_length
            hits = []
            for alignment in record.alignments:
                raw_id = alignment.hit_id
                # Hit IDs such as sp|A0A0H2ZM56|ADHE_STRP2 keep only the middle field.
                accession = raw_id.split("|")[1] if "|" in raw_id else raw_id
                for hsp in alignment.hsps:
                    aligned = hsp.align_length
                    evalue = hsp.expect
                    if evalue < 0.001:
                        shown_evalue = f"{evalue:.1e}"
                    else:
                        shown_evalue = round(evalue, 4)
                    hits.append({
                        "ID": accession,
                        "Identity%": round((hsp.identities / aligned) * 100, 2),
                        "Coverage%": round((aligned / query_len) * 100, 2),
                        "E-value": shown_evalue,
                        "Bit Score": round(hsp.bits, 1),
                        "Positive%": round((hsp.positives / aligned) * 100, 2),
                    })
            metrics_by_query[record.query] = hits
    return metrics_by_query
def rename_interproscan_keys(interproscan_results):
    """
    Return a copy of the results dict with its keys renamed.
    "PFAM" becomes "pfam_id", "GO" becomes "go_id", and every other key is
    lower-cased. Values are carried over unchanged.
    """
    special_names = {"PFAM": "pfam_id", "GO": "go_id"}
    return {
        (special_names[name] if name in special_names else name.lower()): payload
        for name, payload in interproscan_results.items()
    }