Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -6,7 +6,7 @@ import textwrap
|
|
| 6 |
from email.mime.multipart import MIMEMultipart
|
| 7 |
from email.mime.text import MIMEText
|
| 8 |
from email.utils import formatdate, make_msgid
|
| 9 |
-
from functools import cache
|
| 10 |
from math import pi
|
| 11 |
from time import sleep, time
|
| 12 |
from uuid import uuid4
|
|
@@ -25,6 +25,7 @@ import hydra
|
|
| 25 |
import pandas as pd
|
| 26 |
from pandarallel import pandarallel
|
| 27 |
import requests
|
|
|
|
| 28 |
from requests.adapters import HTTPAdapter, Retry
|
| 29 |
from markdown import markdown
|
| 30 |
from rdkit import Chem, DataStructs
|
|
@@ -291,29 +292,24 @@ def check_expiry():
|
|
| 291 |
send_email(job)
|
| 292 |
|
| 293 |
|
| 294 |
-
|
| 295 |
-
def max_tanimoto_similarity(smi, seen_smiles):
|
| 296 |
if smi is None:
|
| 297 |
return 0
|
|
|
|
|
|
|
| 298 |
mol = Chem.MolFromSmiles(smi)
|
| 299 |
if mol is None:
|
| 300 |
return 0
|
| 301 |
mol_ecfp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
mol_seen = Chem.MolFromSmiles(smiles)
|
| 305 |
-
mol_seen_ecfp = AllChem.GetMorganFingerprintAsBitVect(mol_seen, radius=2, nBits=2048)
|
| 306 |
-
sim = DataStructs.TanimotoSimilarity(mol_ecfp, mol_seen_ecfp)
|
| 307 |
-
if sim == 1:
|
| 308 |
-
return 1
|
| 309 |
-
max_sim = max(sim, max_sim)
|
| 310 |
-
return max_sim
|
| 311 |
|
| 312 |
|
| 313 |
-
@cache
|
| 314 |
def max_sequence_identity(seq, seen_fastas):
|
| 315 |
if seq is None:
|
| 316 |
return 0
|
|
|
|
|
|
|
| 317 |
aligner = PairwiseAligner()
|
| 318 |
aligner.mode = 'local'
|
| 319 |
max_id = 0
|
|
@@ -328,16 +324,24 @@ def max_sequence_identity(seq, seen_fastas):
|
|
| 328 |
|
| 329 |
@cache
|
| 330 |
def get_seen_smiles(family, task):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 331 |
seen_smiles = pd.read_csv(
|
| 332 |
-
f'data/benchmarks/seen_compounds/{
|
| 333 |
-
return seen_smiles
|
| 334 |
|
| 335 |
|
| 336 |
@cache
|
| 337 |
def get_seen_fastas(family, task):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 338 |
seen_fastas = pd.read_csv(
|
| 339 |
-
f'data/benchmarks/seen_targets/{
|
| 340 |
-
return seen_fastas
|
| 341 |
|
| 342 |
|
| 343 |
@cache
|
|
@@ -709,7 +713,6 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, state):
|
|
| 709 |
error = None
|
| 710 |
task_file_abbr = {'Compound-Protein Interaction': 'CPI', 'Compound-Protein Binding Affinity': 'CPA'}
|
| 711 |
predictions_file = None
|
| 712 |
-
|
| 713 |
df_training = pd.read_csv(f'data/complete_{TASK_MAP[task].lower()}_dataset.csv')
|
| 714 |
orig_df = pd.read_csv(predict_filepath)
|
| 715 |
alignment_df = get_fasta_family_map()
|
|
@@ -737,12 +740,9 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, state):
|
|
| 737 |
if 'Target Family' not in orig_df.columns:
|
| 738 |
orig_df['Target Family'] = None
|
| 739 |
if orig_df['Target Family'].isna().any():
|
| 740 |
-
orig_df.loc[
|
| 741 |
-
orig_df['Target Family'].isna(), '
|
| 742 |
-
|
| 743 |
-
orig_df['Target Family'].isna(), 'X2'
|
| 744 |
-
].parallel_apply(detect_family)
|
| 745 |
-
|
| 746 |
detect_family.cache_clear()
|
| 747 |
|
| 748 |
orig_df = orig_df.merge(df_training[['X1', 'X2', 'Y']], on=['X1', 'X2'], how='left', indicator=False)
|
|
@@ -783,76 +783,82 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, state):
|
|
| 783 |
prediction_df = pd.concat([prediction_df, predictions])
|
| 784 |
|
| 785 |
else:
|
| 786 |
-
predictions_file = f'{SERVER_DATA_DIR}/{job_id}_{task_file_abbr[task]}
|
| 787 |
task_value = TASK_MAP[task]
|
| 788 |
score = TASK_METRIC_MAP[task]
|
| 789 |
benchmark_df = pd.read_csv(f'data/benchmarks/{task_value}_test_metrics.csv')
|
| 790 |
predict_df = pd.read_csv(predict_filepath)
|
| 791 |
|
| 792 |
for family, subset in predict_df.groupby('Target Family'):
|
| 793 |
-
predict_subset_filepath =
|
|
|
|
|
|
|
| 794 |
subset.to_csv(predict_subset_filepath, index=False, na_rep='')
|
| 795 |
-
seen_compounds = get_seen_smiles(family, task_value)
|
| 796 |
|
|
|
|
| 797 |
if subset['X1'].iloc[0] in seen_compounds:
|
| 798 |
scenario = "Seen Compound"
|
| 799 |
else:
|
| 800 |
scenario = "Unseen Compound"
|
| 801 |
|
| 802 |
filtered_df = benchmark_df[(benchmark_df['Family'] == family.title())
|
| 803 |
-
& (benchmark_df['Scenario'] == scenario)
|
|
|
|
| 804 |
|
| 805 |
-
|
| 806 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 807 |
|
| 808 |
-
|
|
|
|
|
|
|
| 809 |
cfg = hydra.compose(
|
| 810 |
config_name="webserver_inference",
|
| 811 |
overrides=[f"task={task_value}",
|
| 812 |
f"preset={preset_value}",
|
| 813 |
-
# f"ckpt_path=D:/checkpoints/{preset_value}-{task_value}-{target_family}.ckpt",
|
| 814 |
f"ckpt_path=resources/checkpoints/{preset_value}-{task_value}-{target_family}.ckpt",
|
| 815 |
f"data.data_file='{str(predict_subset_filepath)}'"])
|
| 816 |
|
| 817 |
predictions, _ = predict(cfg)
|
| 818 |
predictions = pd.concat([pd.DataFrame(prediction) for prediction in predictions], ignore_index=True)
|
| 819 |
-
predictions['Source'] = f'Predicted ({
|
|
|
|
| 820 |
prediction_df = pd.concat([prediction_df, predictions])
|
| 821 |
|
| 822 |
prediction_df = prediction_df.merge(orig_df, on=['X1', 'X2'], how='left', indicator=False)
|
| 823 |
prediction_df = pd.concat([prediction_df, annotated_df], ignore_index=True)
|
| 824 |
|
| 825 |
-
# prediction_df['Max. Tanimoto Similarity'] = prediction_df.groupby('Target Family')['X1'].apply(
|
| 826 |
-
# lambda group: group.parallel_apply(
|
| 827 |
-
# max_tanimoto_similarity,
|
| 828 |
-
# seen_smiles=tuple(get_seen_smiles(family=group.name, task=task_value))
|
| 829 |
-
# )
|
| 830 |
-
# ).values
|
| 831 |
-
#
|
| 832 |
-
# prediction_df['Max. Sequence Identity'] = prediction_df.groupby('Target Family')['X2'].apply(
|
| 833 |
-
# lambda group: group.parallel_apply(
|
| 834 |
-
# max_sequence_identity,
|
| 835 |
-
# seen_fastas=tuple(get_seen_fastas(family=group.name, task=task_value))
|
| 836 |
-
# )
|
| 837 |
-
# ).values
|
| 838 |
if "Include Max. Tanimoto Similarity" in opts:
|
| 839 |
for family in prediction_df['Target Family'].unique():
|
| 840 |
-
|
| 841 |
-
|
| 842 |
-
|
| 843 |
-
|
| 844 |
-
|
|
|
|
|
|
|
|
|
|
| 845 |
)
|
| 846 |
-
|
|
|
|
| 847 |
if "Include Max. Sequence Identity" in opts:
|
| 848 |
for family in prediction_df['Target Family'].unique():
|
| 849 |
-
|
| 850 |
-
|
| 851 |
-
|
| 852 |
-
|
| 853 |
-
seen_fastas=tuple(get_seen_fastas(family=family, task=task_value))
|
| 854 |
)
|
| 855 |
-
|
|
|
|
| 856 |
prediction_df.drop(['N'], axis=1).to_csv(predictions_file, index=False, na_rep='')
|
| 857 |
status = "COMPLETED"
|
| 858 |
|
|
@@ -1968,9 +1974,8 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
|
|
| 1968 |
return [None, family]
|
| 1969 |
|
| 1970 |
if family == 'General':
|
| 1971 |
-
seen_targets =
|
| 1972 |
-
|
| 1973 |
-
if process_target_fasta(fasta) in seen_targets['X2'].values:
|
| 1974 |
scenario = "Seen Target"
|
| 1975 |
else:
|
| 1976 |
scenario = "Unseen Target"
|
|
@@ -1979,16 +1984,14 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
|
|
| 1979 |
& (benchmark_df['Type'] == 'General')]
|
| 1980 |
|
| 1981 |
else:
|
| 1982 |
-
seen_targets_general =
|
| 1983 |
-
|
| 1984 |
-
if process_target_fasta(fasta) in seen_targets_general['X2'].values:
|
| 1985 |
scenario_general = "Seen Target"
|
| 1986 |
else:
|
| 1987 |
scenario_general = "Unseen Target"
|
| 1988 |
|
| 1989 |
-
seen_targets_family =
|
| 1990 |
-
|
| 1991 |
-
if process_target_fasta(fasta) in seen_targets_family['X2'].values:
|
| 1992 |
scenario_family = "Seen Target"
|
| 1993 |
else:
|
| 1994 |
scenario_family = "Unseen Target"
|
|
@@ -2008,10 +2011,9 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
|
|
| 2008 |
scenario = "Unseen Target (<0.85 sequence identity)"
|
| 2009 |
|
| 2010 |
return {drug_screen_preset:
|
| 2011 |
-
|
| 2012 |
-
|
| 2013 |
-
|
| 2014 |
-
f"on {row['Family']}."),
|
| 2015 |
drug_screen_target_family:
|
| 2016 |
gr.Dropdown(value='General') if row['Type'] == 'General' else gr.Dropdown(value=family)}
|
| 2017 |
|
|
@@ -2569,4 +2571,4 @@ if __name__ == "__main__":
|
|
| 2569 |
hydra.initialize(version_base="1.3", config_path="configs", job_name="webserver_inference")
|
| 2570 |
demo.queue(default_concurrency_limit=None, max_size=10).launch(show_api=False)
|
| 2571 |
scheduler.add_job(check_expiry, 'interval', hours=1)
|
| 2572 |
-
scheduler.start()
|
|
|
|
| 6 |
from email.mime.multipart import MIMEMultipart
|
| 7 |
from email.mime.text import MIMEText
|
| 8 |
from email.utils import formatdate, make_msgid
|
| 9 |
+
from functools import cache, partial
|
| 10 |
from math import pi
|
| 11 |
from time import sleep, time
|
| 12 |
from uuid import uuid4
|
|
|
|
| 25 |
import pandas as pd
|
| 26 |
from pandarallel import pandarallel
|
| 27 |
import requests
|
| 28 |
+
from rdkit.DataStructs import BulkTanimotoSimilarity
|
| 29 |
from requests.adapters import HTTPAdapter, Retry
|
| 30 |
from markdown import markdown
|
| 31 |
from rdkit import Chem, DataStructs
|
|
|
|
| 292 |
send_email(job)
|
| 293 |
|
| 294 |
|
| 295 |
+
def max_tanimoto_similarity(smi, seen_smiles_with_fp):
|
|
|
|
| 296 |
if smi is None:
|
| 297 |
return 0
|
| 298 |
+
if smi in seen_smiles_with_fp['X1'].values:
|
| 299 |
+
return 1
|
| 300 |
mol = Chem.MolFromSmiles(smi)
|
| 301 |
if mol is None:
|
| 302 |
return 0
|
| 303 |
mol_ecfp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
|
| 304 |
+
sims = BulkTanimotoSimilarity(mol_ecfp, seen_smiles_with_fp['FP'])
|
| 305 |
+
return max(sims)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 306 |
|
| 307 |
|
|
|
|
| 308 |
def max_sequence_identity(seq, seen_fastas):
|
| 309 |
if seq is None:
|
| 310 |
return 0
|
| 311 |
+
if seq in seen_fastas:
|
| 312 |
+
return 1
|
| 313 |
aligner = PairwiseAligner()
|
| 314 |
aligner.mode = 'local'
|
| 315 |
max_id = 0
|
|
|
|
| 324 |
|
| 325 |
@cache
|
| 326 |
def get_seen_smiles(family, task):
|
| 327 |
+
if family == 'General':
|
| 328 |
+
family = 'all_families_full'
|
| 329 |
+
else:
|
| 330 |
+
family = TARGET_FAMILY_MAP[family.title()]
|
| 331 |
seen_smiles = pd.read_csv(
|
| 332 |
+
f'data/benchmarks/seen_compounds/{family}_{task.lower()}_random_split.csv')
|
| 333 |
+
return seen_smiles
|
| 334 |
|
| 335 |
|
| 336 |
@cache
|
| 337 |
def get_seen_fastas(family, task):
|
| 338 |
+
if family == 'General':
|
| 339 |
+
family = 'all_families_full'
|
| 340 |
+
else:
|
| 341 |
+
family = TARGET_FAMILY_MAP[family.title()]
|
| 342 |
seen_fastas = pd.read_csv(
|
| 343 |
+
f'data/benchmarks/seen_targets/{family}_{task.lower()}_random_split.csv')
|
| 344 |
+
return seen_fastas
|
| 345 |
|
| 346 |
|
| 347 |
@cache
|
|
|
|
| 713 |
error = None
|
| 714 |
task_file_abbr = {'Compound-Protein Interaction': 'CPI', 'Compound-Protein Binding Affinity': 'CPA'}
|
| 715 |
predictions_file = None
|
|
|
|
| 716 |
df_training = pd.read_csv(f'data/complete_{TASK_MAP[task].lower()}_dataset.csv')
|
| 717 |
orig_df = pd.read_csv(predict_filepath)
|
| 718 |
alignment_df = get_fasta_family_map()
|
|
|
|
| 740 |
if 'Target Family' not in orig_df.columns:
|
| 741 |
orig_df['Target Family'] = None
|
| 742 |
if orig_df['Target Family'].isna().any():
|
| 743 |
+
orig_df.loc[orig_df['Target Family'].isna(), 'Target Family'] = (
|
| 744 |
+
orig_df.loc[orig_df['Target Family'].isna(), 'X2'].parallel_apply(detect_family)
|
| 745 |
+
)
|
|
|
|
|
|
|
|
|
|
| 746 |
detect_family.cache_clear()
|
| 747 |
|
| 748 |
orig_df = orig_df.merge(df_training[['X1', 'X2', 'Y']], on=['X1', 'X2'], how='left', indicator=False)
|
|
|
|
| 783 |
prediction_df = pd.concat([prediction_df, predictions])
|
| 784 |
|
| 785 |
else:
|
| 786 |
+
predictions_file = f'{SERVER_DATA_DIR}/{job_id}_{task_file_abbr[task]}_family-recommended_predictions.csv'
|
| 787 |
task_value = TASK_MAP[task]
|
| 788 |
score = TASK_METRIC_MAP[task]
|
| 789 |
benchmark_df = pd.read_csv(f'data/benchmarks/{task_value}_test_metrics.csv')
|
| 790 |
predict_df = pd.read_csv(predict_filepath)
|
| 791 |
|
| 792 |
for family, subset in predict_df.groupby('Target Family'):
|
| 793 |
+
predict_subset_filepath = os.path.join(
|
| 794 |
+
os.path.dirname(predict_filepath), f'{job_id}_{family}_input.csv'
|
| 795 |
+
)
|
| 796 |
subset.to_csv(predict_subset_filepath, index=False, na_rep='')
|
|
|
|
| 797 |
|
| 798 |
+
seen_compounds = get_seen_smiles(family, task_value)['X1'].values
|
| 799 |
if subset['X1'].iloc[0] in seen_compounds:
|
| 800 |
scenario = "Seen Compound"
|
| 801 |
else:
|
| 802 |
scenario = "Unseen Compound"
|
| 803 |
|
| 804 |
filtered_df = benchmark_df[(benchmark_df['Family'] == family.title())
|
| 805 |
+
& (benchmark_df['Scenario'] == scenario)
|
| 806 |
+
& (benchmark_df['Type'] == 'Family')]
|
| 807 |
|
| 808 |
+
seen_compounds = get_seen_smiles('General', task_value)['X1'].values
|
| 809 |
+
if subset['X1'].iloc[0] in seen_compounds:
|
| 810 |
+
scenario = "Seen Compound"
|
| 811 |
+
else:
|
| 812 |
+
scenario = "Unseen Compound"
|
| 813 |
+
|
| 814 |
+
filtered_df = pd.concat([
|
| 815 |
+
filtered_df,
|
| 816 |
+
benchmark_df[(benchmark_df['Family'] == family.title())
|
| 817 |
+
& (benchmark_df['Scenario'] == scenario)
|
| 818 |
+
& (benchmark_df['Type'] == 'General')]
|
| 819 |
+
])
|
| 820 |
|
| 821 |
+
row = filtered_df.loc[filtered_df[score].idxmax()]
|
| 822 |
+
preset_value = PRESET_MAP[row['Model']]
|
| 823 |
+
target_family = TARGET_FAMILY_MAP[family.title()] if row['Type'] == 'Family' else 'general'
|
| 824 |
cfg = hydra.compose(
|
| 825 |
config_name="webserver_inference",
|
| 826 |
overrides=[f"task={task_value}",
|
| 827 |
f"preset={preset_value}",
|
|
|
|
| 828 |
f"ckpt_path=resources/checkpoints/{preset_value}-{task_value}-{target_family}.ckpt",
|
| 829 |
f"data.data_file='{str(predict_subset_filepath)}'"])
|
| 830 |
|
| 831 |
predictions, _ = predict(cfg)
|
| 832 |
predictions = pd.concat([pd.DataFrame(prediction) for prediction in predictions], ignore_index=True)
|
| 833 |
+
predictions['Source'] = (f'Predicted ({row["Model"]} '
|
| 834 |
+
f'{family.title() if row["Type"] == "Family" else "General"})')
|
| 835 |
prediction_df = pd.concat([prediction_df, predictions])
|
| 836 |
|
| 837 |
prediction_df = prediction_df.merge(orig_df, on=['X1', 'X2'], how='left', indicator=False)
|
| 838 |
prediction_df = pd.concat([prediction_df, annotated_df], ignore_index=True)
|
| 839 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 840 |
if "Include Max. Tanimoto Similarity" in opts:
|
| 841 |
for family in prediction_df['Target Family'].unique():
|
| 842 |
+
family_smiles_df = get_seen_smiles(family=family, task=task_value)
|
| 843 |
+
family_smiles_df['FP'] = family_smiles_df['X1'].parallel_apply(
|
| 844 |
+
lambda smiles: AllChem.GetMorganFingerprintAsBitVect(
|
| 845 |
+
Chem.MolFromSmiles(smiles), radius=2, nBits=2048)
|
| 846 |
+
)
|
| 847 |
+
max_sim = cache(partial(max_tanimoto_similarity, seen_smiles_with_fp=family_smiles_df))
|
| 848 |
+
prediction_df.loc[prediction_df['Target Family'] == family, 'Max. Tanimoto Similarity'] = (
|
| 849 |
+
prediction_df.loc[prediction_df['Target Family'] == family, 'X1'].parallel_apply(max_sim)
|
| 850 |
)
|
| 851 |
+
max_sim.cache_clear()
|
| 852 |
+
|
| 853 |
if "Include Max. Sequence Identity" in opts:
|
| 854 |
for family in prediction_df['Target Family'].unique():
|
| 855 |
+
family_fastas_df = get_seen_fastas(family=family, task=task_value)
|
| 856 |
+
max_id = cache(partial(max_sequence_identity, seen_fastas=family_fastas_df['X2'].values))
|
| 857 |
+
prediction_df.loc[prediction_df['Target Family'] == family, 'Max. Sequence Identity'] = (
|
| 858 |
+
prediction_df.loc[prediction_df['Target Family'] == family, 'X2'].parallel_apply(max_id)
|
|
|
|
| 859 |
)
|
| 860 |
+
max_id.cache_clear()
|
| 861 |
+
|
| 862 |
prediction_df.drop(['N'], axis=1).to_csv(predictions_file, index=False, na_rep='')
|
| 863 |
status = "COMPLETED"
|
| 864 |
|
|
|
|
| 1974 |
return [None, family]
|
| 1975 |
|
| 1976 |
if family == 'General':
|
| 1977 |
+
seen_targets = get_seen_fastas('General', task)['X2'].values
|
| 1978 |
+
if process_target_fasta(fasta) in seen_targets:
|
|
|
|
| 1979 |
scenario = "Seen Target"
|
| 1980 |
else:
|
| 1981 |
scenario = "Unseen Target"
|
|
|
|
| 1984 |
& (benchmark_df['Type'] == 'General')]
|
| 1985 |
|
| 1986 |
else:
|
| 1987 |
+
seen_targets_general = get_seen_fastas('General', task)['X2'].values
|
| 1988 |
+
if process_target_fasta(fasta) in seen_targets_general:
|
|
|
|
| 1989 |
scenario_general = "Seen Target"
|
| 1990 |
else:
|
| 1991 |
scenario_general = "Unseen Target"
|
| 1992 |
|
| 1993 |
+
seen_targets_family = get_seen_fastas(family, task)['X2'].values
|
| 1994 |
+
if process_target_fasta(fasta) in seen_targets_family:
|
|
|
|
| 1995 |
scenario_family = "Seen Target"
|
| 1996 |
else:
|
| 1997 |
scenario_family = "Unseen Target"
|
|
|
|
| 2011 |
scenario = "Unseen Target (<0.85 sequence identity)"
|
| 2012 |
|
| 2013 |
return {drug_screen_preset:
|
| 2014 |
+
gr.Dropdown(value=row['Model'],
|
| 2015 |
+
info=f"Reason: {row['Scenario']} in training; we recommend the {row['Type']}-trained "
|
| 2016 |
+
f"model with the best {score} in the {scenario} scenario on {row['Family']}."),
|
|
|
|
| 2017 |
drug_screen_target_family:
|
| 2018 |
gr.Dropdown(value='General') if row['Type'] == 'General' else gr.Dropdown(value=family)}
|
| 2019 |
|
|
|
|
| 2571 |
hydra.initialize(version_base="1.3", config_path="configs", job_name="webserver_inference")
|
| 2572 |
demo.queue(default_concurrency_limit=None, max_size=10).launch(show_api=False)
|
| 2573 |
scheduler.add_job(check_expiry, 'interval', hours=1)
|
| 2574 |
+
scheduler.start()
|