File size: 12,835 Bytes
f4a71b5
3a7f59d
c53be73
 
fc34019
f4a71b5
bbf2542
 
f4a71b5
 
 
 
 
 
bbf2542
 
 
 
fc34019
 
f4a71b5
fc34019
f4a71b5
 
 
 
 
 
fc34019
bbf2542
 
 
 
 
 
 
 
fc34019
 
bbf2542
 
fc34019
 
bbf2542
 
 
fc34019
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bbf2542
fc34019
bbf2542
fc34019
bbf2542
 
 
 
 
 
 
 
 
 
 
 
fc34019
bbf2542
fc34019
bbf2542
fc34019
bbf2542
fc34019
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bbf2542
 
 
 
 
fc34019
bbf2542
 
 
 
 
926bf18
 
 
c53be73
926bf18
 
c53be73
 
fc34019
6a3bcfb
 
 
 
fc34019
 
 
 
c53be73
926bf18
 
 
 
 
 
c53be73
 
 
3a7f59d
 
 
 
bee03d0
f32e93f
 
b50a168
bbf2542
f4a71b5
fc34019
f4a71b5
 
 
 
fc34019
f4a71b5
fc34019
33fc999
3a7f59d
f4a71b5
 
fc34019
f4a71b5
 
 
fc34019
f4a71b5
 
 
fc34019
 
f4a71b5
fc34019
f4a71b5
 
fc34019
 
 
 
f4a71b5
bbf2542
 
 
f4a71b5
 
fc34019
 
f4a71b5
 
bbf2542
fc34019
bbf2542
33fc999
f4a71b5
 
 
bbf2542
f4a71b5
bbf2542
f4a71b5
 
 
 
33fc999
f4a71b5
 
 
 
1dd40c0
 
f4a71b5
fc34019
 
 
33fc999
 
 
fc34019
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33fc999
 
fc34019
 
 
 
 
 
 
 
f32e93f
fc34019
 
 
f4a71b5
 
 
f32e93f
 
 
 
 
 
 
 
 
c53be73
f32e93f
 
 
 
 
 
 
 
 
 
 
 
 
f4a71b5
 
 
1dd40c0
 
f4a71b5
1dd40c0
fc34019
 
 
f4a71b5
 
 
 
fc34019
f4a71b5
 
bbf2542
6a3bcfb
bbf2542
fc34019
bbf2542
f4a71b5
 
fc34019
f4a71b5
 
 
 
 
fc34019
 
 
 
 
bbf2542
 
fc34019
bbf2542
f4a71b5
 
 
 
 
 
fc34019
f4a71b5
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
import gradio as gr
import spaces
import urllib.request
import os
from datetime import datetime
from functools import partial
import matplotlib.pyplot as plt
import matplotlib
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import rdMolDraw2D
import base64
from io import BytesIO
from PIL import Image
import io
import dreams.utils.spectra as su
import dreams.utils.io as dio
from dreams.utils.spectra import PeakListModifiedCosine
from dreams.utils.data import MSData
from dreams.api import dreams_embeddings
from dreams.definitions import *


def smiles_to_html_img(smiles, img_size=200):
    """
    Convert SMILES to HTML image string for display in Gradio dataframe.

    The molecule is drawn with RDKit on an ``img_size`` x ``img_size`` canvas
    whose background is not cleared, cropped to the bounding box reported by
    PIL, and embedded in an ``<img>`` tag as a base64 PNG data URI.

    Args:
        smiles: SMILES string of the molecule to draw.
        img_size: Width and height (in pixels) of the drawing canvas before cropping.

    Returns:
        An ``<img>`` tag carrying the SMILES as its tooltip on success, or a red
        ``<div>`` with a message if the SMILES cannot be parsed or drawing fails.
        Ordinary exceptions are never propagated; they are reported in the
        returned markup instead.
    """
    import html  # local import: only used to escape text embedded in the markup

    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return "<div style='text-align: center; color: red;'>Invalid SMILES</div>"

        # Use PNG drawing for better control over cropping
        d2d = rdMolDraw2D.MolDraw2DCairo(img_size, img_size)
        opts = d2d.drawOptions()
        opts.clearBackground = False
        opts.padding = 0.05  # Minimal padding
        opts.bondLineWidth = 2.0  # Make bonds more visible
        d2d.DrawMolecule(mol)
        d2d.FinishDrawing()

        # Load the rendered PNG into PIL so it can be cropped
        img = Image.open(io.BytesIO(d2d.GetDrawingText()))
        if img.mode != 'RGBA':
            img = img.convert('RGBA')

        # Crop away the empty margin; getbbox() returns None for a fully empty image
        bbox = img.getbbox()
        if bbox:
            img = img.crop(bbox)

        # Re-encode the cropped image as a base64 data URI
        buffered = io.BytesIO()
        img.save(buffered, format='PNG')
        img_b64 = base64.b64encode(buffered.getvalue()).decode('ascii')
        img_src = f"data:image/png;base64,{img_b64}"

        # The SMILES ends up inside a quoted attribute, so escape it
        title = html.escape(str(smiles))
        return f"<img src='{img_src}' style='max-width: 100%; height: auto;' title='{title}' />"
    except Exception as e:
        # Exception text may contain quotes or angle brackets; escape it before embedding
        return f"<div style='text-align: center; color: red;'>Error: {html.escape(str(e))}</div>"


def spectrum_to_html_img(spec1, spec2, img_size=1500):
    """
    Convert spectrum plot to HTML image string for display in Gradio dataframe.

    ``su.plot_spectrum`` is called with ``spec1`` as the spectrum and ``spec2``
    as the mirror spectrum; the current matplotlib figure is then saved as a
    transparent PNG, cropped to the bounding box reported by PIL, and embedded
    in an ``<img>`` tag as a base64 data URI.

    Args:
        spec1: Spectrum passed to ``su.plot_spectrum`` as ``spec``.
        spec2: Spectrum passed to ``su.plot_spectrum`` as ``mirror_spec``.
        img_size: Currently unused; kept so existing callers keep working.

    Returns:
        An ``<img>`` tag on success, or a red ``<div>`` with the error message
        if plotting or encoding fails. Ordinary exceptions are never
        propagated; they are reported in the returned markup instead.
    """
    import html  # local import: only used to escape text embedded in the markup

    try:
        matplotlib.use('Agg')  # Use non-interactive backend

        buffered = BytesIO()
        try:
            # Create the plot using the existing function
            su.plot_spectrum(spec=spec1, mirror_spec=spec2, figsize=(2, 1))

            # Save the current figure to a buffer with transparent background
            plt.savefig(buffered, format='png', bbox_inches='tight', dpi=100, transparent=True)
        finally:
            # Close the current figure even when plotting/saving fails, so
            # figures do not accumulate across failed calls
            plt.close()
        buffered.seek(0)

        # Convert to PIL Image for cropping
        img = Image.open(buffered)
        if img.mode != 'RGBA':
            img = img.convert('RGBA')

        # Crop away the empty margin; getbbox() returns None for a fully empty image
        bbox = img.getbbox()
        if bbox:
            img = img.crop(bbox)

        # Re-encode the cropped image as a base64 data URI
        buffered_cropped = BytesIO()
        img.save(buffered_cropped, format='PNG')
        img_b64 = base64.b64encode(buffered_cropped.getvalue()).decode('ascii')
        img_src = f"data:image/png;base64,{img_b64}"

        return f"<img src='{img_src}' style='max-width: 100%; height: auto;' title='Spectrum comparison' />"
    except Exception as e:
        # Exception text may contain quotes or angle brackets; escape it before embedding
        return f"<div style='text-align: center; color: red;'>Error: {html.escape(str(e))}</div>"


def _download_if_missing(url, target_path):
    """Download ``url`` to ``target_path`` unless that file already exists.

    The parent directory is created if needed. Data is first written to a
    sibling ``*.part`` file and only moved into place once the download has
    finished, so an interrupted download cannot leave a truncated file that a
    later run would mistake for a complete one.

    Args:
        url: URL to fetch.
        target_path: Destination file path.

    Returns:
        ``target_path`` as a ``Path``.

    Raises:
        Whatever ``urllib.request.urlretrieve`` or the final move raises; the
        partial file is removed before the exception propagates.
    """
    target_path = Path(target_path)
    target_path.parent.mkdir(parents=True, exist_ok=True)
    if target_path.exists():
        return target_path

    tmp_path = target_path.with_name(target_path.name + '.part')
    try:
        urllib.request.urlretrieve(url, tmp_path)
        tmp_path.replace(target_path)
    finally:
        # No-op after a successful move; removes the partial file on failure
        tmp_path.unlink(missing_ok=True)
    return target_path


def setup():
    """Download the files the app needs and run a small embedding job.

    Fetches (only if missing) the MassSpecGym spectral library into
    ``./DreaMS/data`` and the two example ``.mgf`` files into ``./data``, then
    runs ``dreams_embeddings`` on the 5-spectra example as a test and to
    download weights. Prints "Setup complete" when done.

    Raises:
        Any exception raised by the downloads or by ``dreams_embeddings``.
    """
    # Download spectral library
    _download_if_missing(
        'https://huggingface.co/datasets/roman-bushuiev/GeMS/resolve/main/data/auxiliary/MassSpecGym_DreaMS.hdf5',
        Path('./DreaMS/data') / 'MassSpecGym_DreaMS.hdf5',
    )

    # Download example file
    # example_url = 'https://huggingface.co/datasets/titodamiani/PiperNET/resolve/main/lcms/rawfiles/202312_147_P55-Leaf-r2_1uL.mzML'
    # example_path = Path('./data/202312_147_P55-Leaf-r2_1uL.mzML')
    _download_if_missing(
        'https://huggingface.co/datasets/roman-bushuiev/GeMS/resolve/main/data/auxiliary/example_piper_2k_spectra.mgf',
        Path('./data/example_piper_2k_spectra.mgf'),
    )

    # Run simple example as a test and to download weights
    example_path = _download_if_missing(
        'https://raw.githubusercontent.com/pluskal-lab/DreaMS/cc806fa6fea281c1e57dd81fc512f71de9290017/data/examples/example_5_spectra.mgf',
        Path('./data/example_5_spectra.mgf'),
    )
    dreams_embeddings(example_path)
    print("Setup complete")


def _predict_gpu(msdata):
    """Return the result of ``dreams_embeddings`` for ``msdata``."""
    return dreams_embeddings(msdata)

@spaces.GPU
def _predict_core(lib_pth, in_pth, progress):
    """Core prediction function without error handling.

    Loads the library and the query spectra, computes DreaMS embeddings for
    the queries, and pairs every query spectrum with its most similar library
    entry by cosine similarity of the embeddings.

    Args:
        lib_pth: Library path passed to ``MSData.load``. The library is read
            for its ``DREAMS_EMBEDDING`` data, SMILES, spectra, precursor m/z
            values and ``'IDENTIFIER'`` values.
        in_pth: Path of the query file; converted to ``Path`` and passed to
            ``MSData.load``.
        progress: Callable invoked as ``progress(fraction, desc=...)`` to
            report progress.

    Returns:
        A ``(df, csv_path)`` tuple. ``df`` is the display table: one row per
        query whose top hit has a rounded DreaMS similarity of at least 0.75,
        sorted by that similarity (descending), with a 1-based ``Row`` column
        first. ``csv_path`` is the string path of a CSV holding all hits (no
        similarity threshold) without the HTML columns.

    Raises:
        Any exception raised by loading, embedding, similarity computation or
        writing the CSV.
    """
    in_pth = Path(in_pth)
    # # in_pth = Path('DreaMS/data/MSV000086206/peak/mzml/S_N1.mzML')  # Example dataset

    progress(0, desc="Loading library data...")
    msdata_lib = MSData.load(lib_pth)
    embs_lib = msdata_lib[DREAMS_EMBEDDING]
    print('Shape of the library embeddings:', embs_lib.shape)

    progress(0.1, desc="Loading spectra data...")
    msdata = MSData.load(in_pth)

    progress(0.2, desc="Computing DreaMS embeddings...")
    embs = _predict_gpu(msdata)
    print('Shape of the query embeddings:', embs.shape)

    progress(0.4, desc="Computing similarity matrix...")
    sims = cosine_similarity(embs, embs_lib)
    print('Shape of the similarity matrix:', sims.shape)

    # Column indices of the k highest similarities per row, best first
    k = 1
    topk_cands = np.argsort(sims, axis=1)[:, -k:][:, ::-1]

    print(msdata.columns())

    # Construct a DataFrame with the top-k candidates for each spectrum and their corresponding similarities
    progress(0.5, desc="Constructing results table...")
    rows = []
    cos_sim = su.PeakListModifiedCosine()
    total_spectra = len(topk_cands)

    for i, topk in enumerate(topk_cands):
        progress(0.5 + 0.4 * (i / total_spectra), desc=f"Processing hits for spectrum {i+1}/{total_spectra}...")
        # Query-side values do not depend on the library hit, so fetch them once per query
        spec1 = msdata.get_spectra(i)
        prec_mz = msdata.get_prec_mzs(i)
        for n, j in enumerate(topk):
            smiles = msdata_lib.get_smiles(j)
            spec2 = msdata_lib.get_spectra(j)
            rows.append({
                'feature_id': i + 1,
                'precursor_mz': prec_mz,
                # 'RT': msdata.get_values('RTINSECONDS', i),
                'topk': n + 1,
                'library_SMILES': smiles_to_html_img(smiles),
                'library_SMILES_raw': smiles,
                'Spectrum': spectrum_to_html_img(spec1, spec2),
                'Spectrum_raw': su.unpad_peak_list(spec1),
                'library_ID': msdata_lib.get_values('IDENTIFIER', j),
                'DreaMS_similarity': sims[i, j],
                'Modified_cosine_similarity': cos_sim(
                    spec1=spec1,
                    prec_mz1=prec_mz,
                    spec2=spec2,
                    prec_mz2=msdata_lib.get_prec_mzs(j),
                ),
                'DreaMS_embedding': embs[i],
            })
    df = pd.DataFrame(rows)

    # Sort hits by DreaMS similarity: order the features by the similarity of
    # their top-1 hit and reorder the full table accordingly
    df_top1 = df[df['topk'] == 1].sort_values('DreaMS_similarity', ascending=False)
    df = df.set_index('feature_id').loc[df_top1['feature_id'].values].reset_index()

    progress(0.9, desc="Post-processing results...")
    # Round similarity scores and precursor m/z
    df['DreaMS_similarity'] = df['DreaMS_similarity'].astype(float).round(4)
    df['Modified_cosine_similarity'] = df['Modified_cosine_similarity'].astype(float).round(4)
    df['precursor_mz'] = df['precursor_mz'].astype(float).round(4)
    # df['RT'] = df['RT'].round(1)
    df = df.rename(columns={
        'topk': 'Top k',
        'library_ID': 'Library ID',
        "feature_id": "Feature ID",
        "precursor_mz": "Precursor m/z",
        # "RT": "RT",
        "library_SMILES": "Molecule",
        "library_SMILES_raw": "SMILES",
        "Spectrum": "Spectrum",
        "Spectrum_raw": "Input Spectrum",
        "DreaMS_similarity": "DreaMS similarity",
        "Modified_cosine_similarity": "Modified cos similarity",
        "DreaMS_embedding": "DreaMS embedding",
    })

    progress(0.95, desc="Saving results to CSV...")
    # Save full df to .csv (without the HTML image columns)
    # NOTE(review): 'Input Spectrum' and 'DreaMS embedding' cells are presumably
    # NumPy arrays written via their string form; NumPy abbreviates long arrays
    # with '...' when printing - confirm the CSV holds the full values.
    df_path = dio.append_to_stem(in_pth, f"MassSpecGym_hits_{datetime.now().strftime('%Y%m%d_%H%M%S')}").with_suffix('.csv')
    df_to_save = df.drop(columns=['Molecule', 'Spectrum', 'Top k'])
    df_to_save.to_csv(df_path, index=False)

    progress(0.98, desc="Filtering and sorting results...")
    # Postprocess to only show most relevant hits
    df = df.drop(columns=['DreaMS embedding', "SMILES", "Input Spectrum"])
    df = df[df['Top k'] == 1].sort_values('DreaMS similarity', ascending=False)
    df = df.drop(columns=['Top k'])
    df = df[df["DreaMS similarity"] >= 0.75]
    # Add row numbers as first column
    df.insert(0, 'Row', range(1, len(df) + 1))

    progress(1.0, desc=f"Predictions complete! Found {len(df)} high-confidence matches.")

    return df, str(df_path)


def predict(lib_pth, in_pth, progress=gr.Progress(track_tqdm=True)):
    """Wrapper function with error handling.

    Runs ``_predict_core`` and converts any failure into a ``gr.Error``.

    Args:
        lib_pth: Library path forwarded to ``_predict_core``.
        in_pth: Query file path forwarded to ``_predict_core``.
        progress: Progress tracker forwarded to ``_predict_core``.

    Returns:
        Whatever ``_predict_core`` returns (the results table and the CSV path).

    Raises:
        gr.Error: If ``_predict_core`` raises. The message is the text of the
            original exception, or its type name when it has no text.
    """
    try:
        return _predict_core(lib_pth, in_pth, progress)
    except gr.Error:
        # Already a Gradio error: pass it through instead of wrapping it again
        raise
    except Exception as e:
        # gr.Error expects a string message; keep the original exception as the cause
        raise gr.Error(str(e) or type(e).__name__) from e


# Set up: download the library/example files and run a test embedding job
# before the UI is built (runs at import time).
setup()

# Start the Gradio app
# JavaScript passed to gr.Blocks via `js=`: reloads the page with the
# `__theme=light` query parameter unless it is already set.
js_func = """
function refresh() {
    const url = new URL(window.location);

    if (url.searchParams.get('__theme') !== 'light') {
        url.searchParams.set('__theme', 'light');
        window.location.href = url.href;
    }
}
"""
app = gr.Blocks(theme=gr.themes.Default(primary_hue="yellow", secondary_hue="pink"), js=js_func)
with app:

    # Input GUI
    # gr.Markdown(value="""# DreaMS""")
    gr.Image("https://raw.githubusercontent.com/pluskal-lab/DreaMS/cc806fa6fea281c1e57dd81fc512f71de9290017/assets/dreams_background.png", label="DreaMS")
    gr.Markdown(value="""
        DreaMS (Deep Representations Empowering the Annotation of Mass Spectra) is a transformer-based
         neural network designed to interpret tandem mass spectrometry (MS/MS) data (<a href="https://www.nature.com/articles/s41587-025-02663-3">Bushuiev et al., Nature Biotechnology, 2025</a>).
         This website provides an easy access to perform library matching with DreaMS. Please upload
         your MS/MS file and click on the "Run DreaMS" button. Predictions may currently take up to 10 minutes for files with several thousands of spectra.
    """)
    with gr.Row(equal_height=True):
        in_pth = gr.File(
            file_count="single",
            label="Input MS/MS file (.mgf or .mzML)",
        )
    # Same file that setup() downloads into ./DreaMS/data
    lib_pth = Path('DreaMS/data/MassSpecGym_DreaMS.hdf5')  # MassSpecGym library
    examples = gr.Examples(
        examples=["./data/example_5_spectra.mgf", "./data/example_piper_2k_spectra.mgf"],
        inputs=[in_pth],
        label="Examples (click on a file to load as input)",
    )

    # Predict GUI
    predict_button = gr.Button(value="Run DreaMS", variant="primary")

    # Output GUI
    gr.Markdown("## Predictions")
    df_file = gr.File(label="Download predictions as .csv", interactive=False, visible=True)
    # Placeholder headers mirror the columns of the table returned by
    # _predict_core, including its "Modified cos similarity" column name.
    df = gr.Dataframe(
        headers=["Row", "Feature ID", "Precursor m/z", "Molecule", "Spectrum", "Library ID", "DreaMS similarity", "Modified cos similarity"],
        datatype=["number", "number", "number", "html", "html", "str", "number", "number"],
        col_count=(8, "fixed"),
        # wrap=True,
        column_widths=["25px", "25px", "28px", "60px", "60px", "50px", "40px", "40px"],
        max_height=1000,
        show_fullscreen_button=True,
        show_row_numbers=False,
        show_search='filter',
    )

    # Main logic
    inputs = [in_pth]
    outputs = [df, df_file]
    # Bind the fixed library path so the click handler only receives the
    # uploaded file. NOTE: this rebinds the module-level name `predict`.
    predict = partial(predict, lib_pth)
    predict_button.click(predict, inputs=inputs, outputs=outputs, show_progress="first")


app.launch(allowed_paths=['./assets'])