Spaces:
Running
on
Zero
Running
on
Zero
File size: 4,616 Bytes
f4a71b5 b50a168 f4a71b5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 |
import gradio as gr
from functools import partial
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
from rdkit import Chem
import dreams.utils.spectra as su
import dreams.utils.io as io
from dreams.utils.spectra import PeakListModifiedCosine
from dreams.utils.data import MSData
from dreams.api import dreams_embeddings
from dreams.definitions import *
def predict(lib_pth, in_pth):
in_pth = Path(in_pth)
# in_pth = Path('DreaMS/data/MSV000086206/peak/mzml/S_N1.mzML') # Example dataset
msdata_lib = MSData.load(lib_pth)
embs_lib = msdata_lib[DREAMS_EMBEDDING]
print('Shape of the library embeddings:', embs_lib.shape)
msdata = MSData.load(in_pth)
embs = dreams_embeddings(msdata)
print('Shape of the query embeddings:', embs.shape)
sims = cosine_similarity(embs, embs_lib)
print('Shape of the similarity matrix:', sims.shape)
k = 5
topk_cands = np.argsort(sims, axis=1)[:, -k:][:, ::-1]
topk_cands.shape
# Construct a DataFrame with the top-k candidates for each spectrum and their corresponding similarities
df = []
cos_sim = su.PeakListModifiedCosine()
for i, topk in enumerate(tqdm(topk_cands)):
for n, j in enumerate(topk):
df.append({
'feature_id': i + 1,
'topk': n + 1,
'library_j': j,
'library_SMILES': msdata_lib.get_smiles(j),
'library_ID': msdata_lib.get_values('IDENTIFIER', j),
'DreaMS_similarity': sims[i, j],
'Modified_cosine_similarity': cos_sim(
spec1=msdata.get_spectra(i),
prec_mz1=msdata.get_prec_mzs(i),
spec2=msdata_lib.get_spectra(j),
prec_mz2=msdata_lib.get_prec_mzs(j),
),
'i': i,
'j': j,
})
df = pd.DataFrame(df)
# TODO Add some (random) name to the output file
df_path = io.append_to_stem(in_pth, 'MassSpecGym_hits').with_suffix('.csv')
df.to_csv(df_path, index=False)
# i = df_top1['i'].iloc[25]
# df_i = df[df['i'] == i]
# for _, row in df_i.iterrows():
# i, j = row['i'], row['j']
# print(f'Library ID: {row["library_ID"]} (top {row["topk"]} hit)')
# print(f'Query precursor m/z: {msdata.get_prec_mzs(i)}, Library precursor m/z: {msdata_lib.get_prec_mzs(j)}')
# print('DreaMS similarity:', row['DreaMS_similarity'])
# print('Modified cosine similarity:', row['Modified_cosine_similarity'])
# su.plot_spectrum(spec=msdata.get_spectra(i), mirror_spec=msdata_lib.get_spectra(j))
# display(Chem.MolFromSmiles(row['library_SMILES']))
# Sort hits by DreaMS similarity
# df_top1 = df[df['topk'] == 1].sort_values('DreaMS_similarity', ascending=False)
# df = df.set_index('feature_id').loc[df_top1['feature_id'].values].reset_index()
# df
return df, str(df_path)
app = gr.Blocks(theme=gr.themes.Default(primary_hue="green", secondary_hue="pink"))
with app:
# Input GUI
gr.Markdown(value="""
# DreaMS
""")
# gr.Image("assets/readme-dimer-close-up.png")
# gr.Markdown(value="""
# TODO Some description
# """)
with gr.Row(equal_height=True):
in_pth = gr.File(
file_count="single",
label=".mzML file (TODO Extend to other formats)"
)
lib_pth = Path('DreaMS/data/MassSpecGym_DreaMS.hdf5') # MassSpecGym library
# Predict GUI
predict_button = gr.Button(value="Run library matching", variant="primary")
# Output GUI
gr.Markdown("## Predictions")
df_file = gr.File(label="Download predictions as .csv", interactive=False, visible=True)
df = gr.Dataframe(
headers=["feature_id", "topk", "library_j", "library_SMILES", "library_ID", "DreaMS_similarity", "Modified_cosine_similarity", "i", "j"],
datatype=["number", "number", "number", "str", "str", "number", "number", "number", "number"],
col_count=(9, "fixed"),
)
# dropdown = gr.Dropdown(interactive=True, visible=False)
# dropdown_choices_to_plot_args = gr.State([])
# plot = gr.HTML()
# Main logic
inputs = [in_pth]
outputs = [df, df_file]
predict = partial(predict, lib_pth)
predict_button.click(predict, inputs=inputs, outputs=outputs)
# Update plot on dropdown change
# dropdown.change(update_plot, inputs=[dropdown, dropdown_choices_to_plot_args], outputs=[plot])
app.launch(allowed_paths=['./assets'])
|