Spaces:
Running
Running
Upload 5 files
Browse files- UniProtKB_id_names.csv +0 -0
- app.py +161 -0
- clinvar_0325.csv.gz +3 -0
- logreg_params.npz +3 -0
- requirements.txt +4 -0
UniProtKB_id_names.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
app.py
ADDED
@@ -0,0 +1,161 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
st.set_page_config(layout="wide")
|
3 |
+
|
4 |
+
import pandas as pd
|
5 |
+
import numpy as np
|
6 |
+
from zipfile import ZipFile
|
7 |
+
|
8 |
+
import plotly.express as px
|
9 |
+
import plotly.graph_objs as go
|
10 |
+
|
11 |
+
# Zip archive that load_LLR() reads the per-protein LLR tables from.
LLR_FILE = 'UniProtKB_human_VESM_llrs.zip'

# One shuffled copy of the id/name table is kept in the session state, so the
# CSV is only read (and shuffled) when the session does not have a copy yet.
if 'shuffled_df' not in st.session_state:
    st.session_state.shuffled_df = pd.read_csv(
        'UniProtKB_id_names.csv', index_col=0
    ).sample(frac=1)
df = st.session_state.shuffled_df

# ClinVar table; plot_interactive() filters it by protein when overlaying markers.
clinvar = pd.read_csv('clinvar_0325.csv.gz', index_col=0)

# Scalar logistic-regression parameters applied to the LLRs in load_LLR().
# The archive is opened as a context manager so its file handle is released.
with np.load("logreg_params.npz") as logreg_params:
    coef = logreg_params["coef"].item()
    intercept = logreg_params["intercept"].item()
|
21 |
+
|
22 |
+
def load_LLR(uniprot_id):
    '''Loads the LLRs for a given uniprot id. Returns a 20xL dataframe.

    Rows are indexed by AA change,
    (AAorder=['K','R','H','E','D','N','Q','T','S','C','G','A','V','L','I','M','P','Y','F','W'])
    Columns indexed by WT_AA+position e.g., "G 12".

    The table is read from the member
    ``<first archive entry> + 'LLRs/' + uniprot_id + '.csv'`` of ``LLR_FILE``.
    When the module-level ``sigmoid`` flag is truthy, every value is mapped to
    ``1 / (1 + exp(-(LLR * coef + intercept)))`` and rounded to 4 decimals,
    keeping the original row and column labels.

    Raises KeyError (from ZipFile.open) when the archive has no such member.

    Usage: load_LLR('P01116') or load_LLR('P01116-2')
    '''
    with ZipFile(LLR_FILE) as myzip:
        member = myzip.namelist()[0] + 'LLRs/' + uniprot_id + '.csv'
        # Parse while both the archive and the member handle are open, then
        # let the context managers close them (the member was never closed before).
        with myzip.open(member) as data:
            LLR = pd.read_csv(data, index_col=0)
    if sigmoid:
        p = 1/(1 + np.exp(-(LLR.values.ravel()*coef + intercept)))
        LLR = pd.DataFrame(p.reshape(LLR.shape), index=LLR.index, columns=LLR.columns).round(4)
    return LLR
|
36 |
+
|
37 |
+
def meltLLR(LLR, gene_prefix=None, ignore_pos=False):
    '''Reshape a wide LLR matrix into one row per variant.

    Each column label has its spaces removed and the row label appended to
    form the variant name (e.g. column "G 12", row "K" -> "G12K"), which
    becomes the index (named 'variant'). The cell value is stored in 'score'.

    Unless ignore_pos is true, a 'pos' column holds the integer parsed from
    the variant name with its first and last characters stripped.
    When gene_prefix is given, every index entry is prefixed with
    gene_prefix + '_'.
    '''
    long_df = LLR.melt(ignore_index=False)

    # After melt the index still carries the row label of the source cell.
    labels = []
    for wt_pos, alt in zip(long_df['variable'], long_df.index):
        labels.append(wt_pos.replace(' ', '') + alt)

    long_df = long_df.assign(variant=labels, score=long_df['value'])
    long_df = long_df.set_index('variant')

    if not ignore_pos:
        long_df['pos'] = [int(name[1:-1]) for name in long_df.index]

    long_df = long_df.drop(columns=['variable', 'value'])

    if gene_prefix is not None:
        long_df.index = gene_prefix + '_' + long_df.index
    return long_df
|
48 |
+
|
49 |
+
|
50 |
+
def plot_interactive(uniprot_id, show_clinvar=False):
    '''Build the interactive heatmap figure for one protein.

    The matrix comes from load_LLR(uniprot_id): its columns are placed on the
    x axis ("Protein sequence") and its rows on the y axis ("Amino acid
    change"). Colour limits, colour scale and colour-bar label depend on the
    module-level ``sigmoid`` flag; the module-level ``selection`` is the title.

    When show_clinvar is true, rows of the module-level ``clinvar`` table for
    this protein with GoldStars > 1 are overlaid as markers: red where
    clinvar_label == 1.0 and white where clinvar_label == 0.0.

    Returns the plotly figure.
    '''
    llr = load_LLR(uniprot_id)

    # Calibrated scores and raw LLRs use different ranges and palettes.
    if sigmoid:
        z_low, z_high, palette, colour_label = 0, 1.09, 'rdbu_r', 'score'
    else:
        z_low, z_high, palette, colour_label = -22, 0, 'Viridis_r', 'LLR'

    fig = px.imshow(
        llr.values,
        x=llr.columns,
        y=llr.index,
        color_continuous_scale=palette,
        zmax=z_high,
        zmin=z_low,
        labels=dict(y="Amino acid change", x="Protein sequence", color=colour_label),
        template='plotly_white',
        title=selection,
    )

    # Initial x window is [0, 99]; the range slider exposes the other columns.
    fig.update_xaxes(tickangle=-90, range=[0, 99], rangeslider=dict(visible=True), dtick=1)
    fig.update_yaxes(dtick=1)

    transparent = 'rgba(0, 0, 0, 0)'
    fig.update_layout(
        plot_bgcolor=transparent,
        paper_bgcolor=transparent,
        font={'family': 'Arial', 'size': 11},
        hoverlabel=dict(font=dict(family='Arial', size=14)),
    )
    fig.update_traces(
        hovertemplate="<b>%{x} %{y}</b> (%{z:.2f})<extra></extra>"
    )

    if show_clinvar:
        annotated = clinvar[clinvar.protein == uniprot_id]
        annotated = annotated[annotated.GoldStars > 1]
        benign_set = set(annotated[annotated.clinvar_label == 0.0].variant.values)
        patho_set = set(annotated[annotated.clinvar_label == 1.0].variant.values)

        # Parallel x / y / value lists for each marker class.
        benign_x, benign_y, benign_val = [], [], []
        patho_x, patho_y, patho_val = [], [], []

        for col in llr.columns:
            # Drop the separator character at index 1 of the column label
            # (e.g. "G 12" -> "G12") before appending the row label.
            stem = col[0] + col[2:]
            for alt in llr.index:
                mut = stem + alt
                # Benign is checked first, so a variant present in both sets
                # is drawn as benign only.
                if mut in benign_set:
                    xs, ys, vals = benign_x, benign_y, benign_val
                elif mut in patho_set:
                    xs, ys, vals = patho_x, patho_y, patho_val
                else:
                    continue
                xs.append(col)
                ys.append(alt)
                vals.append(llr.loc[alt, col])

        def marker_trace(xs, ys, vals, marker_color, label_bg):
            # One scatter layer of fixed-size markers with its own hover label.
            return go.Scatter(
                x=xs, y=ys, customdata=vals,
                mode='markers',
                marker=dict(size=8, color=marker_color),
                showlegend=False,
                hoverlabel=dict(bgcolor=label_bg, font_color='black'),
                hovertemplate="<b>%{x} %{y}</b> (%{customdata:.2f})<extra></extra>",
            )

        # draw pathogenic
        fig.add_trace(marker_trace(patho_x, patho_y, patho_val, 'red', 'crimson'))
        # draw benign
        fig.add_trace(marker_trace(benign_x, benign_y, benign_val, 'white', 'white'))

        # NOTE(review): the source this was recovered from had lost its
        # indentation; this call is assumed to belong to the ClinVar branch.
        fig.update_layout(
            hovermode='closest',
            hoverdistance=10,
        )

    return fig
|
128 |
+
|
129 |
+
|
130 |
+
# Preselect P32245 when it is present in the table, otherwise the first row.
try:
    default_pos = df.index.get_loc('P32245')
except KeyError:
    default_pos = 0

selection = st.selectbox("uniprot_id:", df, index=default_pos)
# Map the chosen display text back to its index label.
uid = df.loc[df.txt == selection].index[0]

# Two side-by-side toggles; both are off by default.
left, right = st.columns(2)
with left:
    sigmoid = st.checkbox(
        "Calibrated VESM predictions (0: benign, 1: pathogenic)",
        value=False,
    )
with right:
    show_clinvar = st.checkbox(
        "Show ClinVar annotations (red: pathogenic, white: benign)",
        value=False,
    )

fig = plot_interactive(uid, show_clinvar=show_clinvar)
fig.update_layout(width=800, height=600, autosize=False)
st.plotly_chart(fig, use_container_width=True)

# The download holds the same matrix in long form (one row per variant).
csv_payload = meltLLR(load_LLR(uid)).to_csv()
st.download_button(
    label="📥 Download as CSV",
    data=csv_payload,
    file_name=f"{selection}.csv",
    mime='text/csv',
)
st.markdown("---")

st.markdown("""
- Bulk download precomputed scores at [VESM Effect Scores](https://huggingface.co/datasets/ntranoslab/vesm_scores) for all UniProt, hg19, and hg38 variants.
- Use VESM locally: Access the source code and installation instructions on [GitHub](https://github.com/ntranoslab/vesm).
""")
|
clinvar_0325.csv.gz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b692c7298c46bcf3397baaca93b89e8f22d32bc891dab0ac2e3af90ac8944c08
|
3 |
+
size 2128878
|
logreg_params.npz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6411c5d1bcab64080217d97be085397974a5f68b2a5a680e9081b7959289c81d
|
3 |
+
size 776
|
requirements.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
altair
|
2 |
+
streamlit
|
3 |
+
plotly
|
4 |
+
protobuf~=3.19.0
|