File size: 4,616 Bytes
f4a71b5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b50a168
f4a71b5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import gradio as gr
from functools import partial

import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
from rdkit import Chem
import dreams.utils.spectra as su
import dreams.utils.io as io
from dreams.utils.spectra import PeakListModifiedCosine
from dreams.utils.data import MSData
from dreams.api import dreams_embeddings
from dreams.definitions import *


def predict(lib_pth, in_pth, k=5):
    """Match query spectra against a spectral library via DreaMS embeddings.

    Loads the library and its precomputed embeddings (``DREAMS_EMBEDDING``),
    computes DreaMS embeddings for the query file, and ranks library entries
    per query spectrum by cosine similarity of the embeddings. For every
    retained (query, library) pair the modified cosine similarity of the
    peak lists is computed as well.

    Args:
        lib_pth: Path to the library file, passed to ``MSData.load``.
        in_pth: Path to the query file (``str`` or ``Path``), or a file-like
            upload object exposing the path as ``.name``.
        k: Number of top library candidates kept per query spectrum. If the
            library has fewer than ``k`` entries, all of them are returned.

    Returns:
        A tuple ``(df, df_path)``: ``df`` is a ``pandas.DataFrame`` with one
        row per (query spectrum, candidate) pair, and ``df_path`` is the
        string path of the CSV file the same table was written to.

    Raises:
        ValueError: If ``in_pth`` is ``None`` or ``k`` is smaller than 1.
    """
    # Validate cheap preconditions before any expensive loading happens.
    if in_pth is None:
        raise ValueError('No input file was provided; please upload an .mzML file.')
    if k < 1:
        # `[:, -0:]` would silently select *all* columns, so reject k < 1 explicitly.
        raise ValueError(f'k must be a positive integer, got {k}.')

    # NOTE(review): some Gradio versions appear to pass a tempfile-like object
    # whose `.name` is the path instead of a plain string - confirm against the
    # pinned Gradio version. Path objects are excluded because `Path.name` is
    # only the basename.
    if not isinstance(in_pth, (str, Path)):
        in_pth = getattr(in_pth, 'name', in_pth)
    in_pth = Path(in_pth)

    msdata_lib = MSData.load(lib_pth)
    embs_lib = msdata_lib[DREAMS_EMBEDDING]
    print('Shape of the library embeddings:', embs_lib.shape)

    msdata = MSData.load(in_pth)
    embs = dreams_embeddings(msdata)
    print('Shape of the query embeddings:', embs.shape)

    sims = cosine_similarity(embs, embs_lib)
    print('Shape of the similarity matrix:', sims.shape)

    # argsort is ascending: take the last k columns, then reverse so that the
    # most similar library entry comes first.
    topk_cands = np.argsort(sims, axis=1)[:, -k:][:, ::-1]

    # One record per (query spectrum, candidate) pair.
    columns = [
        'feature_id', 'topk', 'library_j', 'library_SMILES', 'library_ID',
        'DreaMS_similarity', 'Modified_cosine_similarity', 'i', 'j',
    ]
    cos_sim = su.PeakListModifiedCosine()
    rows = []
    for i, topk in enumerate(tqdm(topk_cands)):
        for n, j in enumerate(topk):
            rows.append({
                'feature_id': i + 1,  # 1-based index of the query spectrum
                'topk': n + 1,  # 1-based rank of the candidate
                'library_j': j,
                'library_SMILES': msdata_lib.get_smiles(j),
                'library_ID': msdata_lib.get_values('IDENTIFIER', j),
                'DreaMS_similarity': sims[i, j],
                'Modified_cosine_similarity': cos_sim(
                    spec1=msdata.get_spectra(i),
                    prec_mz1=msdata.get_prec_mzs(i),
                    spec2=msdata_lib.get_spectra(j),
                    prec_mz2=msdata_lib.get_prec_mzs(j),
                ),
                'i': i,
                'j': j,
            })
    # Pinning the columns keeps the table schema stable even when there are no rows.
    df = pd.DataFrame(rows, columns=columns)

    # TODO Add some (random) name to the output file
    df_path = io.append_to_stem(in_pth, 'MassSpecGym_hits').with_suffix('.csv')
    df.to_csv(df_path, index=False)

    return df, str(df_path)


# Library searched against: MassSpecGym spectra with precomputed DreaMS embeddings.
lib_pth = Path('DreaMS/data/MassSpecGym_DreaMS.hdf5')

# Build the Gradio UI. Components appear on the page in the order they are created.
with gr.Blocks(theme=gr.themes.Default(primary_hue="green", secondary_hue="pink")) as app:

    # --- Header ---
    gr.Markdown(value="""
        # DreaMS
    """)
    # TODO Add a banner image and a short description of the tool here.

    # --- Input: a single uploaded spectra file ---
    with gr.Row(equal_height=True):
        in_pth = gr.File(
            label=".mzML file (TODO Extend to other formats)",
            file_count="single",
        )

    # --- Trigger ---
    predict_button = gr.Button(
        value="Run library matching",
        variant="primary",
    )

    # --- Output: downloadable CSV plus an on-page table of the hits ---
    gr.Markdown(value="## Predictions")
    df_file = gr.File(
        label="Download predictions as .csv",
        interactive=False,
        visible=True,
    )
    df = gr.Dataframe(
        headers=[
            "feature_id",
            "topk",
            "library_j",
            "library_SMILES",
            "library_ID",
            "DreaMS_similarity",
            "Modified_cosine_similarity",
            "i",
            "j",
        ],
        datatype=[
            "number",
            "number",
            "number",
            "str",
            "str",
            "number",
            "number",
            "number",
            "number",
        ],
        col_count=(9, "fixed"),
    )
    # TODO A dropdown-driven spectrum plot is planned but not wired up yet.

    # --- Wiring: bind the library path so the handler only takes the uploaded file ---
    inputs = [in_pth]
    outputs = [df, df_file]
    predict = partial(predict, lib_pth)
    predict_button.click(fn=predict, inputs=inputs, outputs=outputs)


# Start the web server; files under ./assets may be served to the browser.
app.launch(allowed_paths=['./assets'])