Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -824,6 +824,93 @@ using the job id. You will also receive an email notification once the job is do
|
|
824 |
raise gr.Error(f'Failed to retrieve job status due to error: {str(e)}')
|
825 |
|
826 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
827 |
def submit_predict(predict_filepath, task, preset, target_family, opts, job_info):
|
828 |
job_id = job_info['id']
|
829 |
status = job_info['status']
|
@@ -968,88 +1055,7 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, job_info
|
|
968 |
df_list = [prediction_df, annotated_df]
|
969 |
prediction_df = pd.concat([df for df in df_list if not df.empty], ignore_index=True)
|
970 |
|
971 |
-
|
972 |
-
if "Calculate Max. Sequence Identity between the Input Target and Targets in the Training Set" in opts:
|
973 |
-
x2 = prediction_df['X2'].iloc[0]
|
974 |
-
|
975 |
-
prediction_df[[
|
976 |
-
'Max. Sequence Identity to Training Targets',
|
977 |
-
'Max. Id. Training Target'
|
978 |
-
]] = pd.Series(max_sequence_identity(x2, df_training))
|
979 |
-
|
980 |
-
if "Calculate Max. Tanimoto Similarity between the Hit Compound and Known Ligands of the Input Target" in opts:
|
981 |
-
x2 = prediction_df['X2'].iloc[0]
|
982 |
-
pos_compounds_df = df_training.loc[(df_training['X2'] == x2) & (df_training['Y'] == 1)].copy()
|
983 |
-
pos_compounds_df['FP'] = pos_compounds_df['X1'].parallel_apply(smiles_to_ecfp)
|
984 |
-
|
985 |
-
@cache
|
986 |
-
def max_sim(smiles):
|
987 |
-
return max_tanimoto_similarity(smiles, seen_smiles_with_fp=pos_compounds_df)
|
988 |
-
|
989 |
-
prediction_df[[
|
990 |
-
'Max. Tanimoto Similarity to Known Ligands',
|
991 |
-
'Max. Sim. Ligand'
|
992 |
-
]] = prediction_df['X1'].parallel_apply(max_sim).apply(pd.Series)
|
993 |
-
|
994 |
-
max_sim.cache_clear()
|
995 |
-
|
996 |
-
if "Calculate Max. Sequence Identity between the Input Target and Known Targets of Hit Compound" in opts:
|
997 |
-
x2 = prediction_df['X2'].iloc[0]
|
998 |
-
prediction_df['X1^'] = prediction_df['X1'].parallel_apply(rdkit_canonicalize)
|
999 |
-
|
1000 |
-
@cache
|
1001 |
-
def max_id(compound):
|
1002 |
-
pos_targets_df = df_training.loc[df_training['X1'] == compound]
|
1003 |
-
return max_sequence_identity(x2, seen_fastas=pos_targets_df)
|
1004 |
-
|
1005 |
-
prediction_df[['Max. Sequence Identity to Known Targets of Hit Compound',
|
1006 |
-
'Max. Id. Target']] = (
|
1007 |
-
prediction_df['X1^'].parallel_apply(max_id).apply(pd.Series)
|
1008 |
-
)
|
1009 |
-
prediction_df.drop(['X1^'], axis=1, inplace=True)
|
1010 |
-
|
1011 |
-
max_id.cache_clear()
|
1012 |
-
|
1013 |
-
# Advanced options for Target Protein Identification
|
1014 |
-
if "Calculate Max. Tanimoto Similarity between the Input Compound and Compounds in the Training Set" in opts:
|
1015 |
-
x1 = rdkit_canonicalize(prediction_df['X1'].iloc[0])
|
1016 |
-
prediction_df['FP'] = prediction_df['X1'].parallel_apply(smiles_to_ecfp)
|
1017 |
-
|
1018 |
-
prediction_df[[
|
1019 |
-
'Max. Tanimoto Similarity to Training Compounds',
|
1020 |
-
'Max. Sim. Training Compound'
|
1021 |
-
]] = pd.Series(max_tanimoto_similarity(x1, df_training))
|
1022 |
-
|
1023 |
-
if "Calculate Max. Sequence Identity between the Identified Target and Known Targets of the Input Compound" in opts:
|
1024 |
-
x1 = rdkit_canonicalize(prediction_df['X1'].iloc[0])
|
1025 |
-
pos_targets_df = df_training.loc[(df_training['X1'] == x1) & (df_training['Y'] == 1)].copy()
|
1026 |
-
|
1027 |
-
@cache
|
1028 |
-
def max_id(fasta):
|
1029 |
-
return max_sequence_identity(fasta, seen_fastas=pos_targets_df)
|
1030 |
-
|
1031 |
-
prediction_df[[
|
1032 |
-
'Max. Sequence Identity to Known Targets of Input Compound',
|
1033 |
-
'Max. Id. Target'
|
1034 |
-
]] = prediction_df['X2'].parallel_apply(max_id).apply(pd.Series)
|
1035 |
-
|
1036 |
-
max_id.cache_clear()
|
1037 |
-
|
1038 |
-
if "Calculate Max. Tanimoto Similarity between the Input Compound and Known Ligands of the Identified Target" in opts:
|
1039 |
-
x1 = rdkit_canonicalize(prediction_df['X1'].iloc[0])
|
1040 |
-
|
1041 |
-
@cache
|
1042 |
-
def max_sim(fasta):
|
1043 |
-
pos_targets_df = df_training.loc[(df_training['X2'] == fasta) & (df_training['Y'] == 1)].copy()
|
1044 |
-
pos_targets_df['FP'] = pos_targets_df['X1'].apply(smiles_to_ecfp)
|
1045 |
-
return max_tanimoto_similarity(x1, seen_smiles_with_fp=pos_targets_df)
|
1046 |
-
|
1047 |
-
prediction_df[[
|
1048 |
-
'Max. Tanimoto Similarity to Known Ligands of Identified Target',
|
1049 |
-
'Max. Sim. Ligand'
|
1050 |
-
]] = prediction_df['X2'].parallel_apply(max_sim).apply(pd.Series)
|
1051 |
-
|
1052 |
-
max_sim.cache_clear()
|
1053 |
|
1054 |
prediction_df.drop(
|
1055 |
[col for col in ['N', 'FP'] if col in prediction_df.columns], axis=1
|
@@ -1087,6 +1093,8 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, job_info
|
|
1087 |
def update_df(file, progress=gr.Progress(track_tqdm=True)):
|
1088 |
if file and Path(file).is_file():
|
1089 |
task = None
|
|
|
|
|
1090 |
if "_CPI_" in str(file):
|
1091 |
task = 'Compound-Protein Interaction'
|
1092 |
elif "_CPA_" in str(file):
|
@@ -1113,11 +1121,33 @@ def update_df(file, progress=gr.Progress(track_tqdm=True)):
|
|
1113 |
if 'Y^' in df.columns:
|
1114 |
df['Y^'] = 10 ** (-df['Y^'])
|
1115 |
|
1116 |
-
|
1117 |
-
|
1118 |
-
|
1119 |
-
|
1120 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1121 |
else:
|
1122 |
return {analyze_btn: gr.Button(interactive=False)}
|
1123 |
|
@@ -1253,6 +1283,21 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t
|
|
1253 |
if unique_df is not None:
|
1254 |
if 'Target FASTA' in unique_df.columns:
|
1255 |
unique_df['Target FASTA'] = unique_df['Target FASTA'].str.replace('\n', '<br>')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1256 |
if any(unique_df.columns.isin(bool_cols)):
|
1257 |
unique_df = unique_df.style.applymap(
|
1258 |
lambda val: f"background-color: {bool_col_colors[val]}", subset=bool_cols)
|
@@ -1268,11 +1313,11 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t
|
|
1268 |
uniprot_id_formatter = HTMLTemplateFormatter(
|
1269 |
template='<% if (value == value) { ' # Check if value is not NaN
|
1270 |
'if (/^[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}$/.test(value)) '
|
1271 |
-
|
1272 |
'{ %><a href="https://www.uniprot.org/uniprotkb/<%= value %>" target="_blank"><%= value %></a><% '
|
1273 |
-
|
1274 |
-
'} else { %><div style="white-space: pre-wrap;"><%= value.match(/.{1,60}/g).join("<br>")
|
1275 |
-
'
|
1276 |
)
|
1277 |
pubchem_id_formatter = HTMLTemplateFormatter(
|
1278 |
template='<% if (value == value) { ' # Check if value is not NaN
|
@@ -1280,6 +1325,9 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t
|
|
1280 |
'target="_blank"><%= value %></a>'
|
1281 |
'<% } else { %><% } %>' # Output empty string if value is NaN
|
1282 |
)
|
|
|
|
|
|
|
1283 |
bool_formatters = {col: BooleanFormatter() for col in bool_cols}
|
1284 |
float_formatters = {col: NumberFormatter(format='0.000') for col in df_html.select_dtypes('floating').columns}
|
1285 |
other_formatters = {
|
@@ -1294,6 +1342,8 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t
|
|
1294 |
'Max. Id. Target': uniprot_id_formatter,
|
1295 |
'Max. Sim. Training Compound': pubchem_id_formatter,
|
1296 |
'Max. Id. Training Target': uniprot_id_formatter,
|
|
|
|
|
1297 |
}
|
1298 |
formatters = {**bool_formatters, **float_formatters, **other_formatters}
|
1299 |
|
@@ -1492,7 +1542,7 @@ def create_pie_chart(df, category, value, top_k):
|
|
1492 |
return p
|
1493 |
|
1494 |
|
1495 |
-
def submit_report(df, score_list, filter_list, task, progress=gr.Progress(track_tqdm=True)):
|
1496 |
df_report = df.copy()
|
1497 |
try:
|
1498 |
for filter_name in filter_list:
|
@@ -1503,6 +1553,10 @@ def submit_report(df, score_list, filter_list, task, progress=gr.Progress(track_
|
|
1503 |
df_report[score_name] = df_report['Compound'].parallel_apply(
|
1504 |
lambda x: SCORE_MAP[score_name](x) if not pd.isna(x) else x)
|
1505 |
|
|
|
|
|
|
|
|
|
1506 |
return (create_html_report(df_report, file=None, task=task), df_report,
|
1507 |
gr.File(visible=False), gr.File(visible=False))
|
1508 |
|
@@ -1726,10 +1780,12 @@ this serves as an additional indicator of the confidence level of the predicted
|
|
1726 |
higher identities usually lead to greater confidence in the predictions.<br>
|
1727 |
""")
|
1728 |
drug_screen_opts = gr.CheckboxGroup(
|
1729 |
-
label="Step 6. Select
|
|
|
1730 |
choices=DRUG_SCRENN_CPI_OPTS,
|
1731 |
-
info="
|
1732 |
-
"See the Help Tip on the right or the Documentation for detailed explanation."
|
|
|
1733 |
)
|
1734 |
with gr.Row():
|
1735 |
with gr.Column():
|
@@ -1845,8 +1901,9 @@ higher similarities usually correspond to greater prediction confidence.<br>
|
|
1845 |
""")
|
1846 |
target_identify_opts = gr.CheckboxGroup(
|
1847 |
choices=TARGET_IDENTIFY_CPI_OPTS,
|
1848 |
-
|
1849 |
-
|
|
|
1850 |
"See the Help Tip on the right or the Documentation for detailed explanation."
|
1851 |
)
|
1852 |
with gr.Row():
|
@@ -2021,8 +2078,11 @@ higher similarities usually correspond to greater prediction confidence.<br>
|
|
2021 |
label='Specify the Task Labels in the Uploaded Dataset')
|
2022 |
with gr.Column(scale=2):
|
2023 |
with gr.Row():
|
2024 |
-
|
2025 |
-
|
|
|
|
|
|
|
2026 |
with gr.Accordion('Report Generate Options', open=True):
|
2027 |
with gr.Row():
|
2028 |
csv_sep = gr.Radio(label='CSV Delimiter',
|
@@ -2784,7 +2844,7 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
|
|
2784 |
)
|
2785 |
|
2786 |
analyze_btn.click(
|
2787 |
-
fn=submit_report, inputs=[raw_df, scores, filters, report_task], outputs=[
|
2788 |
html_report, report_df, csv_download_file, html_download_file]
|
2789 |
).success(
|
2790 |
fn=lambda: [gr.Button(interactive=True)] * 2,
|
@@ -2793,6 +2853,7 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
|
|
2793 |
)
|
2794 |
|
2795 |
|
|
|
2796 |
def create_csv_report_file(df, file_report, task, sep, progress=gr.Progress(track_tqdm=True)):
|
2797 |
csv_sep_map = {
|
2798 |
'Comma': ',',
|
|
|
824 |
raise gr.Error(f'Failed to retrieve job status due to error: {str(e)}')
|
825 |
|
826 |
|
827 |
+
def apply_advanced_opts(prediction_df, opts, df_training):
|
828 |
+
# Advanced options for Drug Hit Screening
|
829 |
+
if "Calculate Max. Sequence Identity between the Input Target and Targets in the Training Set" in opts:
|
830 |
+
x2 = prediction_df['X2'].iloc[0]
|
831 |
+
|
832 |
+
prediction_df[[
|
833 |
+
'Max. Sequence Identity to Training Targets',
|
834 |
+
'Max. Id. Training Target'
|
835 |
+
]] = pd.Series(max_sequence_identity(x2, df_training))
|
836 |
+
|
837 |
+
if "Calculate Max. Tanimoto Similarity between the Hit Compound and Known Ligands of the Input Target" in opts:
|
838 |
+
x2 = prediction_df['X2'].iloc[0]
|
839 |
+
pos_compounds_df = df_training.loc[(df_training['X2'] == x2) & (df_training['Y'] == 1)].copy()
|
840 |
+
pos_compounds_df['FP'] = pos_compounds_df['X1'].parallel_apply(smiles_to_ecfp)
|
841 |
+
|
842 |
+
@cache
|
843 |
+
def max_sim(smiles):
|
844 |
+
return max_tanimoto_similarity(smiles, seen_smiles_with_fp=pos_compounds_df)
|
845 |
+
|
846 |
+
prediction_df[[
|
847 |
+
'Max. Tanimoto Similarity to Known Ligands',
|
848 |
+
'Max. Sim. Ligand'
|
849 |
+
]] = prediction_df['X1'].parallel_apply(max_sim).apply(pd.Series)
|
850 |
+
|
851 |
+
max_sim.cache_clear()
|
852 |
+
|
853 |
+
if "Calculate Max. Sequence Identity between the Input Target and Known Targets of Hit Compound" in opts:
|
854 |
+
x2 = prediction_df['X2'].iloc[0]
|
855 |
+
prediction_df['X1^'] = prediction_df['X1'].parallel_apply(rdkit_canonicalize)
|
856 |
+
|
857 |
+
@cache
|
858 |
+
def max_id(compound):
|
859 |
+
pos_targets_df = df_training.loc[df_training['X1'] == compound]
|
860 |
+
return max_sequence_identity(x2, seen_fastas=pos_targets_df)
|
861 |
+
|
862 |
+
prediction_df[['Max. Sequence Identity to Known Targets of Hit Compound',
|
863 |
+
'Max. Id. Target']] = (
|
864 |
+
prediction_df['X1^'].parallel_apply(max_id).apply(pd.Series)
|
865 |
+
)
|
866 |
+
prediction_df.drop(['X1^'], axis=1, inplace=True)
|
867 |
+
|
868 |
+
max_id.cache_clear()
|
869 |
+
|
870 |
+
# Advanced options for Target Protein Identification
|
871 |
+
if "Calculate Max. Tanimoto Similarity between the Input Compound and Compounds in the Training Set" in opts:
|
872 |
+
x1 = rdkit_canonicalize(prediction_df['X1'].iloc[0])
|
873 |
+
prediction_df['FP'] = prediction_df['X1'].parallel_apply(smiles_to_ecfp)
|
874 |
+
|
875 |
+
prediction_df[[
|
876 |
+
'Max. Tanimoto Similarity to Training Compounds',
|
877 |
+
'Max. Sim. Training Compound'
|
878 |
+
]] = pd.Series(max_tanimoto_similarity(x1, df_training))
|
879 |
+
|
880 |
+
if "Calculate Max. Sequence Identity between the Identified Target and Known Targets of the Input Compound" in opts:
|
881 |
+
x1 = rdkit_canonicalize(prediction_df['X1'].iloc[0])
|
882 |
+
pos_targets_df = df_training.loc[(df_training['X1'] == x1) & (df_training['Y'] == 1)].copy()
|
883 |
+
|
884 |
+
@cache
|
885 |
+
def max_id(fasta):
|
886 |
+
return max_sequence_identity(fasta, seen_fastas=pos_targets_df)
|
887 |
+
|
888 |
+
prediction_df[[
|
889 |
+
'Max. Sequence Identity to Known Targets of Input Compound',
|
890 |
+
'Max. Id. Target'
|
891 |
+
]] = prediction_df['X2'].parallel_apply(max_id).apply(pd.Series)
|
892 |
+
|
893 |
+
max_id.cache_clear()
|
894 |
+
|
895 |
+
if "Calculate Max. Tanimoto Similarity between the Input Compound and Known Ligands of the Identified Target" in opts:
|
896 |
+
x1 = rdkit_canonicalize(prediction_df['X1'].iloc[0])
|
897 |
+
|
898 |
+
@cache
|
899 |
+
def max_sim(fasta):
|
900 |
+
pos_targets_df = df_training.loc[(df_training['X2'] == fasta) & (df_training['Y'] == 1)].copy()
|
901 |
+
pos_targets_df['FP'] = pos_targets_df['X1'].apply(smiles_to_ecfp)
|
902 |
+
return max_tanimoto_similarity(x1, seen_smiles_with_fp=pos_targets_df)
|
903 |
+
|
904 |
+
prediction_df[[
|
905 |
+
'Max. Tanimoto Similarity to Known Ligands of Identified Target',
|
906 |
+
'Max. Sim. Ligand'
|
907 |
+
]] = prediction_df['X2'].parallel_apply(max_sim).apply(pd.Series)
|
908 |
+
|
909 |
+
max_sim.cache_clear()
|
910 |
+
|
911 |
+
return prediction_df
|
912 |
+
|
913 |
+
|
914 |
def submit_predict(predict_filepath, task, preset, target_family, opts, job_info):
|
915 |
job_id = job_info['id']
|
916 |
status = job_info['status']
|
|
|
1055 |
df_list = [prediction_df, annotated_df]
|
1056 |
prediction_df = pd.concat([df for df in df_list if not df.empty], ignore_index=True)
|
1057 |
|
1058 |
+
prediction_df = apply_advanced_opts(prediction_df, opts, df_training)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1059 |
|
1060 |
prediction_df.drop(
|
1061 |
[col for col in ['N', 'FP'] if col in prediction_df.columns], axis=1
|
|
|
1093 |
def update_df(file, progress=gr.Progress(track_tqdm=True)):
|
1094 |
if file and Path(file).is_file():
|
1095 |
task = None
|
1096 |
+
job = None
|
1097 |
+
|
1098 |
if "_CPI_" in str(file):
|
1099 |
task = 'Compound-Protein Interaction'
|
1100 |
elif "_CPA_" in str(file):
|
|
|
1121 |
if 'Y^' in df.columns:
|
1122 |
df['Y^'] = 10 ** (-df['Y^'])
|
1123 |
|
1124 |
+
n_compound = df['X1'].nunique()
|
1125 |
+
n_protein = df['X2'].nunique()
|
1126 |
+
|
1127 |
+
if n_compound == 1 and n_protein >= 2:
|
1128 |
+
job = 'Target Protein Identification'
|
1129 |
+
if task == 'Compound-Protein Interaction':
|
1130 |
+
opts = TARGET_IDENTIFY_CPI_OPTS
|
1131 |
+
elif task == 'Compound-Protein Binding Affinity':
|
1132 |
+
opts = TARGET_IDENTIFY_CPA_OPTS
|
1133 |
+
if n_compound >= 2 and n_protein == 1:
|
1134 |
+
job = 'Drug Hit Screening'
|
1135 |
+
if task == 'Compound-Protein Interaction':
|
1136 |
+
opts = DRUG_SCRENN_CPI_OPTS
|
1137 |
+
elif task == 'Compound-Protein Binding Affinity':
|
1138 |
+
opts = DRUG_SCRENN_CPA_OPTS
|
1139 |
+
|
1140 |
+
return {
|
1141 |
+
html_report: create_html_report(df, file=None, task=task),
|
1142 |
+
raw_df: df,
|
1143 |
+
report_df: df.copy(),
|
1144 |
+
analyze_btn: gr.Button(interactive=True),
|
1145 |
+
report_task: task,
|
1146 |
+
job_opts: gr.CheckboxGroup(
|
1147 |
+
label=f'{job} Advanced Options',
|
1148 |
+
choices=opts,
|
1149 |
+
) if job else gr.CheckboxGroup(visible=False),
|
1150 |
+
}
|
1151 |
else:
|
1152 |
return {analyze_btn: gr.Button(interactive=False)}
|
1153 |
|
|
|
1283 |
if unique_df is not None:
|
1284 |
if 'Target FASTA' in unique_df.columns:
|
1285 |
unique_df['Target FASTA'] = unique_df['Target FASTA'].str.replace('\n', '<br>')
|
1286 |
+
|
1287 |
+
if 'Max. Sequence Identity to Training Targets' in unique_df.columns:
|
1288 |
+
# Add alert emoji for sequence identity below 0.85
|
1289 |
+
if unique_df['Max. Sequence Identity to Training Targets'].iloc[0] < 0.85:
|
1290 |
+
unique_df['Max. Sequence Identity to Training Targets'] = (
|
1291 |
+
f'{unique_df["Max. Sequence Identity to Training Targets"]:.3f} ⚠️'
|
1292 |
+
)
|
1293 |
+
|
1294 |
+
if 'Max. Tanimoto Similarity to Training Compounds' in unique_df.columns:
|
1295 |
+
# Add alert emoji for Tanimoto similarity below 0.85
|
1296 |
+
if unique_df['Max. Tanimoto Similarity to Training Compounds'].iloc[0] < 0.85:
|
1297 |
+
unique_df['Max. Tanimoto Similarity to Training Compounds'] = (
|
1298 |
+
f'{unique_df["Max. Tanimoto Similarity to Training Compounds"]:.3f} ⚠️'
|
1299 |
+
)
|
1300 |
+
|
1301 |
if any(unique_df.columns.isin(bool_cols)):
|
1302 |
unique_df = unique_df.style.applymap(
|
1303 |
lambda val: f"background-color: {bool_col_colors[val]}", subset=bool_cols)
|
|
|
1313 |
uniprot_id_formatter = HTMLTemplateFormatter(
|
1314 |
template='<% if (value == value) { ' # Check if value is not NaN
|
1315 |
'if (/^[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}$/.test(value)) '
|
1316 |
+
# Check if value is a valid UniProt ID
|
1317 |
'{ %><a href="https://www.uniprot.org/uniprotkb/<%= value %>" target="_blank"><%= value %></a><% '
|
1318 |
+
# Else treat it as a sequence or other plain-text string, line-wrapping every 60 characters
|
1319 |
+
'} else { %><div style="white-space: pre-wrap;"><%= value.match(/.{1,60}/g).join("<br>") '
|
1320 |
+
'%></div><% } %><% } else { %><% } %>' # Output empty string if value is NaN
|
1321 |
)
|
1322 |
pubchem_id_formatter = HTMLTemplateFormatter(
|
1323 |
template='<% if (value == value) { ' # Check if value is not NaN
|
|
|
1325 |
'target="_blank"><%= value %></a>'
|
1326 |
'<% } else { %><% } %>' # Output empty string if value is NaN
|
1327 |
)
|
1328 |
+
alert_emoji_formatter = HTMLTemplateFormatter(
|
1329 |
+
template='<% if (value < 0.85) { %><%= value %> ⚠️<% } else { %><%= value %><% } %>'
|
1330 |
+
)
|
1331 |
bool_formatters = {col: BooleanFormatter() for col in bool_cols}
|
1332 |
float_formatters = {col: NumberFormatter(format='0.000') for col in df_html.select_dtypes('floating').columns}
|
1333 |
other_formatters = {
|
|
|
1342 |
'Max. Id. Target': uniprot_id_formatter,
|
1343 |
'Max. Sim. Training Compound': pubchem_id_formatter,
|
1344 |
'Max. Id. Training Target': uniprot_id_formatter,
|
1345 |
+
'Max. Sequence Identity to Training Targets': alert_emoji_formatter,
|
1346 |
+
'Max. Sequence Identity to Known Targets of Hit Compound': alert_emoji_formatter,
|
1347 |
}
|
1348 |
formatters = {**bool_formatters, **float_formatters, **other_formatters}
|
1349 |
|
|
|
1542 |
return p
|
1543 |
|
1544 |
|
1545 |
+
def submit_report(df, score_list, filter_list, opt_list, task, progress=gr.Progress(track_tqdm=True)):
|
1546 |
df_report = df.copy()
|
1547 |
try:
|
1548 |
for filter_name in filter_list:
|
|
|
1553 |
df_report[score_name] = df_report['Compound'].parallel_apply(
|
1554 |
lambda x: SCORE_MAP[score_name](x) if not pd.isna(x) else x)
|
1555 |
|
1556 |
+
if opt_list:
|
1557 |
+
df_training = pd.read_csv(f'data/complete_{TASK_MAP[task].lower()}_dataset.csv')
|
1558 |
+
df_report = apply_advanced_opts(df_report, opt_list, df_training)
|
1559 |
+
|
1560 |
return (create_html_report(df_report, file=None, task=task), df_report,
|
1561 |
gr.File(visible=False), gr.File(visible=False))
|
1562 |
|
|
|
1780 |
higher identities usually lead to greater confidence in the predictions.<br>
|
1781 |
""")
|
1782 |
drug_screen_opts = gr.CheckboxGroup(
|
1783 |
+
label="Step 6. Select Advanced Options",
|
1784 |
+
value=DRUG_SCRENN_CPI_OPTS[0],
|
1785 |
choices=DRUG_SCRENN_CPI_OPTS,
|
1786 |
+
info="Advanced features - may increase the job computation time. "
|
1787 |
+
"See the Help Tip on the right or the Documentation for detailed explanation.",
|
1788 |
+
|
1789 |
)
|
1790 |
with gr.Row():
|
1791 |
with gr.Column():
|
|
|
1901 |
""")
|
1902 |
target_identify_opts = gr.CheckboxGroup(
|
1903 |
choices=TARGET_IDENTIFY_CPI_OPTS,
|
1904 |
+
value=TARGET_IDENTIFY_CPI_OPTS[0],
|
1905 |
+
label='Step 6. Select Advanced Options',
|
1906 |
+
info="Advanced features - may increase the job computation time. "
|
1907 |
"See the Help Tip on the right or the Documentation for detailed explanation."
|
1908 |
)
|
1909 |
with gr.Row():
|
|
|
2078 |
label='Specify the Task Labels in the Uploaded Dataset')
|
2079 |
with gr.Column(scale=2):
|
2080 |
with gr.Row():
|
2081 |
+
with gr.Row():
|
2082 |
+
scores = gr.CheckboxGroup(list(SCORE_MAP.keys()), label='Compound Scores')
|
2083 |
+
filters = gr.CheckboxGroup(list(FILTER_MAP.keys()), label='Compound Filters')
|
2084 |
+
job_opts = gr.CheckboxGroup(visible=False)
|
2085 |
+
|
2086 |
with gr.Accordion('Report Generate Options', open=True):
|
2087 |
with gr.Row():
|
2088 |
csv_sep = gr.Radio(label='CSV Delimiter',
|
|
|
2844 |
)
|
2845 |
|
2846 |
analyze_btn.click(
|
2847 |
+
fn=submit_report, inputs=[raw_df, scores, filters, job_opts, report_task], outputs=[
|
2848 |
html_report, report_df, csv_download_file, html_download_file]
|
2849 |
).success(
|
2850 |
fn=lambda: [gr.Button(interactive=True)] * 2,
|
|
|
2853 |
)
|
2854 |
|
2855 |
|
2856 |
+
|
2857 |
def create_csv_report_file(df, file_report, task, sep, progress=gr.Progress(track_tqdm=True)):
|
2858 |
csv_sep_map = {
|
2859 |
'Comma': ',',
|