# protein_rag/utils/get_motif.py
# ericzhang1122's picture
# Upload folder using huggingface_hub
# 5c20520 verified
import json
from collections import Counter
import os
# Most recently loaded tables. They keep their original names because
# get_motif_pfam() reads them directly after calling the loaders below.
_pfam_dict = None
_pfam_descriptions = None

# Parsed JSON documents keyed by absolute file path: each file is read at most
# once per process, and two different paths never share an entry.
_json_cache = {}


def _read_json_cached(path):
    """
    Return the parsed JSON content of a file, reading it only on first request.

    Parameters:
    path: str - path of the JSON file to load

    Returns:
    object - the value produced by json.load for that file; later calls with
    the same path return the same cached object

    Raises:
    OSError - if the file cannot be opened
    json.JSONDecodeError - if the file is not valid JSON
    """
    # Keying on the absolute path keeps a relative path from aliasing a
    # different file after the working directory changes.
    key = os.path.abspath(path)
    if key not in _json_cache:
        # Decode as UTF-8 explicitly instead of relying on the locale default.
        with open(path, 'r', encoding='utf-8') as file:
            _json_cache[key] = json.load(file)
    return _json_cache[key]


def _load_pfam_data(protein2pfam_path):
    """
    Point the module-level _pfam_dict at the data stored in protein2pfam_path.

    Parameters:
    protein2pfam_path: str - path of the protein-to-Pfam JSON file

    The file is parsed once per path. Requesting a different path switches
    _pfam_dict to that file's content instead of keeping the first one loaded.
    """
    global _pfam_dict
    _pfam_dict = _read_json_cached(protein2pfam_path)


def _load_pfam_descriptions(pfam_descriptions_path):
    """
    Point the module-level _pfam_descriptions at the data stored in
    pfam_descriptions_path.

    Parameters:
    pfam_descriptions_path: str - path of the Pfam descriptions JSON file

    The file is parsed once per path. Requesting a different path switches
    _pfam_descriptions to that file's content instead of keeping the first
    one loaded.
    """
    global _pfam_descriptions
    _pfam_descriptions = _read_json_cached(pfam_descriptions_path)
def get_motif_pfam(protein_id, protein2pfam_path, pfam_descriptions_path):
    """
    Get the Pfam entries recorded for a protein together with their definitions.

    Parameters:
    protein_id: str - protein ID
    protein2pfam_path: str - path of the interproscan_info.json file
    pfam_descriptions_path: str - path of the Pfam descriptions file

    Returns:
    dict - mapping from pfam_id to its definition, e.g.
    {"PF04820": "definition content"}. The dict is empty when the protein is
    not in the protein-to-Pfam file or has no Pfam entries. Pfam ids that are
    missing from the descriptions file are left out.
    """
    _load_pfam_data(protein2pfam_path)
    _load_pfam_descriptions(pfam_descriptions_path)

    protein_info = _pfam_dict.get(protein_id)
    if not protein_info:
        return {}

    # dict.get()'s default only applies to missing keys; `or` also covers keys
    # that are present but null in the JSON.
    interpro_results = protein_info.get('interproscan_results') or {}
    pfam_entries = interpro_results.get('pfam_id') or []

    result = {}
    for entry in pfam_entries:
        # Each entry is a dict keyed by Pfam id; only the keys are used.
        for pfam_id in entry:
            if pfam_id in _pfam_descriptions:
                result[pfam_id] = _pfam_descriptions[pfam_id]['description']
    return result
if __name__ == "__main__":
    import argparse

    # Command-line entry point: look up a single protein and print the
    # resulting pfam_id -> description mapping. Every option has a default,
    # so the script can be run without arguments.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--protein_id",
        type=str,
        required=False,
        default="A8CF74",
    )
    cli.add_argument(
        "--protein2pfam_path",
        type=str,
        required=False,
        default="data/processed_data/interproscan_info.json",
    )
    cli.add_argument(
        "--pfam_descriptions_path",
        type=str,
        required=False,
        default="data/raw_data/all_pfam_descriptions.json",
    )
    options = cli.parse_args()

    print(
        get_motif_pfam(
            options.protein_id,
            options.protein2pfam_path,
            options.pfam_descriptions_path,
        )
    )