Update app.py
Browse files
app.py
CHANGED
@@ -1,10 +1,57 @@
|
|
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
import numpy as np
|
3 |
import random
|
4 |
import pandas as pd
|
5 |
import matplotlib.pyplot as plt
|
6 |
-
from io import BytesIO
|
7 |
import base64
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
|
9 |
# 模拟数据 - 实际使用时需要替换为真实数据
|
10 |
species_data = {
|
@@ -12,30 +59,93 @@ species_data = {
|
|
12 |
"mouse": {"codon_table": {}, "trna": {}, "codon_usage": {}},
|
13 |
"virus": {"codon_table": {}, "trna": {}, "codon_usage": {}},
|
14 |
"Escherichia coli": {"codon_table": {}, "trna": {}, "codon_usage": {}},
|
15 |
-
"
|
16 |
"Pichia": {"codon_table": {}, "trna": {}, "codon_usage": {}},
|
17 |
}
|
18 |
|
19 |
-
#
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
|
28 |
def calculate_cds_variants(protein_seq):
|
29 |
-
|
|
|
30 |
aa_count = len(protein_seq)
|
31 |
-
return 2 ** aa_count #
|
32 |
|
33 |
-
def optimize_cds(protein_seq, species, method):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
# 生成20个优化序列示例
|
35 |
results = []
|
36 |
for i in range(20):
|
37 |
-
# 实际应根据优化方法生成序列
|
38 |
seq = ''.join(random.choices("ACGT", k=len(protein_seq)*3))
|
|
|
|
|
|
|
39 |
gc = random.uniform(0.3, 0.7)
|
40 |
trna = random.uniform(0.5, 1.0)
|
41 |
usage = random.uniform(0.6, 0.95)
|
@@ -43,132 +153,473 @@ def optimize_cds(protein_seq, species, method):
|
|
43 |
score = gc*0.25 + trna*0.25 + usage*0.25 + (-mfe/40)*0.25
|
44 |
|
45 |
results.append({
|
46 |
-
"
|
|
|
|
|
47 |
"GC%": f"{gc*100:.1f}%",
|
48 |
"tRNA": f"{trna:.3f}",
|
49 |
"Usage": f"{usage:.3f}",
|
50 |
"MFE": f"{mfe:.1f}",
|
51 |
"Score": f"{score:.3f}"
|
52 |
})
|
53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
|
55 |
-
def design_mrna(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
# 生成20个设计结果示例
|
57 |
designs = []
|
58 |
for i in range(20):
|
59 |
-
utr5 = random.choice(
|
60 |
-
utr3 = random.choice(
|
61 |
full_seq = utr5 + cds_seq + utr3
|
|
|
|
|
|
|
|
|
62 |
mfe = random.uniform(-50, -20)
|
|
|
|
|
63 |
designs.append({
|
|
|
64 |
"Design": f"Design_{i+1}",
|
65 |
-
"5'UTR": utr5[:
|
66 |
-
"3'UTR": utr3[:
|
67 |
"MFE": f"{mfe:.1f}",
|
68 |
-
"
|
|
|
|
|
69 |
})
|
70 |
-
|
71 |
-
|
72 |
-
#
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
173 |
if __name__ == "__main__":
|
174 |
-
|
|
|
|
|
|
1 |
+
import html
|
2 |
+
from typing import Tuple
|
3 |
+
|
4 |
import gradio as gr
|
5 |
import numpy as np
|
6 |
import random
|
7 |
import pandas as pd
|
8 |
import matplotlib.pyplot as plt
|
9 |
+
from io import BytesIO, StringIO
|
10 |
import base64
|
11 |
+
import json
|
12 |
+
# One-letter residue symbols in lower case: the 20 amino acid letters
# followed by "*" and "-".
AA_str = 'ACDEFGHIKLMNPQRSTVWY*-'.lower()

# Codon table keyed by one-letter amino acid code ("*" holds the stop
# codons). Codons are spelled with T, i.e. in the DNA alphabet.
AA_TO_CODONS = {
    "F": ["TTT", "TTC"],
    "L": ["TTA", "TTG", "CTT", "CTC", "CTA", "CTG"],
    "I": ["ATT", "ATC", "ATA"],
    "M": ["ATG"],
    "V": ["GTT", "GTC", "GTA", "GTG"],
    "S": ["TCT", "TCC", "TCA", "TCG", "AGT", "AGC"],
    "P": ["CCT", "CCC", "CCA", "CCG"],
    "T": ["ACT", "ACC", "ACA", "ACG"],
    "A": ["GCT", "GCC", "GCA", "GCG"],
    "Y": ["TAT", "TAC"],
    "H": ["CAT", "CAC"],
    "Q": ["CAA", "CAG"],
    "N": ["AAT", "AAC"],
    "K": ["AAA", "AAG"],
    "D": ["GAT", "GAC"],
    "E": ["GAA", "GAG"],
    "C": ["TGT", "TGC"],
    "W": ["TGG"],
    "R": ["CGT", "CGC", "CGA", "CGG", "AGA", "AGG"],
    "G": ["GGT", "GGC", "GGA", "GGG"],
    "*": ["TAA", "TAG", "TGA"],
}
|
35 |
+
|
36 |
+
|
37 |
+
def reverse_dictionary(dictionary):
    """Invert a one-to-many mapping into a value-to-key lookup.

    Args:
        dictionary: Mapping of ``key -> iterable of values``.

    Returns:
        A dict that maps every individual value back to the key it was
        listed under. If a value appears under several keys, the key that
        is iterated last wins.
    """
    return {
        member: owner
        for owner, members in dictionary.items()
        for member in members
    }
|
53 |
+
|
54 |
+
CODON_TO_AA = reverse_dictionary(AA_TO_CODONS)
|
55 |
|
56 |
# Mock data - replace with real data before actual use
|
57 |
species_data = {
|
|
|
59 |
"mouse": {"codon_table": {}, "trna": {}, "codon_usage": {}},
|
60 |
"virus": {"codon_table": {}, "trna": {}, "codon_usage": {}},
|
61 |
"Escherichia coli": {"codon_table": {}, "trna": {}, "codon_usage": {}},
|
62 |
+
"saccharomyces cerevisiae": {"codon_table": {}, "trna": {}, "codon_usage": {}},
|
63 |
"Pichia": {"codon_table": {}, "trna": {}, "codon_usage": {}},
|
64 |
}
|
65 |
|
66 |
+
# Example data used by the "Load Example" buttons.
EXAMPLE_PROTEIN = "MSFSRRPKITKSDIVDQISLNIRNNNLKLEKKYIRLVIDAFFEELKGNLCLNNVIEFRSFGTFEVRKRKGRLNARNPQTGEYVKVLDHHVAYFRPGKDLKERVWGIKG"

# The CDS is typed as lower-case DNA and converted to upper-case RNA (T -> U).
EXAMPLE_CDS = (
    "atgagctttagccgccgcccgaaaattaccaaaagcgatattgtggatcagattagcctg"
    "aacattcgcaacaacaacctgaaactggaaaaaaaatatattcgcctggtgattgatgcg"
    "ttttttgaagaactgaaaggcaacctgtgcctgaacaacgtgattgaatttcgcagcttt"
    "ggcacctttgaagtgcgcaaacgcaaaggccgcctgaacgcgcgcaacccgcagaccggc"
    "gaatatgtgaaagtgctggatcatcatgtggcgtattttcgcccgggcaaagatctgaaa"
    "gaacgcgtgtggggcattaaaggc"
).upper().replace('T', 'U')

EXAMPLE_UTR5 = "GAAAAGAGCCCCGGAAAGGAUCUAUCCCUUCCUGUUCUGCUGCACGCAAAAGAACAGCCAAGGGGGAGGCCACC"
EXAMPLE_UTR3 = "GCUCGCUUUCUUGCUGUCCAAUUUCUAUUAAAGGUUCCUUUGUUCCCUAAGUCCAACUACUAAACUGGGGGAUAUUAUGAAGGGCCUUGAGCAUCUGGAUUCUGCCUAAUAAAAAACAUUUAUUUUCAUUGCAA"

# Full example transcript: 5'UTR, then CDS, then 3'UTR.
EXAMPLE_MRNA = EXAMPLE_UTR5 + EXAMPLE_CDS + EXAMPLE_UTR3
|
77 |
+
|
78 |
+
|
79 |
+
def find_longest_cds(seq: str) -> Tuple[int, int]:
    """Find the longest coding region (CDS) in an mRNA sequence.

    The three forward reading frames are scanned independently. Within a
    frame, an ORF opens at the first ``ATG`` met while no ORF is open and
    closes at the next in-frame stop codon (``TAA``, ``TAG``, ``TGA``).
    An ORF still open when the frame runs out of codons is accepted too,
    truncated to whole codons so the reported CDS never ends mid-codon.

    ORFs are ranked by coding length (stop codon excluded); on a tie the
    one found first is kept.

    Args:
        seq: Nucleotide sequence. Case-insensitive; ``U`` is read as ``T``.

    Returns:
        ``(start, end)`` such that ``seq[start:end]`` is the CDS. ``end``
        is exclusive and includes the stop codon when one was found.
        ``(-1, -1)`` when the sequence contains no start codon.
    """
    stop_codons = {"TAA", "TAG", "TGA"}
    seq = seq.upper().replace('U', 'T')

    best_start = -1
    best_end = -1
    max_length = 0

    for frame in range(3):
        current_start = -1  # -1 means no ORF is open in this frame

        for pos in range(frame, len(seq) - 2, 3):
            codon = seq[pos:pos + 3]

            if current_start == -1:
                if codon == "ATG":
                    current_start = pos
            elif codon in stop_codons:
                orf_length = pos - current_start
                if orf_length > max_length:
                    max_length = orf_length
                    best_start = current_start
                    best_end = pos + 3  # keep the stop codon inside the CDS
                current_start = -1

        # ORF that reached the end of the sequence without a stop codon:
        # drop the 1-2 trailing bases that do not form a complete codon.
        if current_start != -1:
            orf_length = (len(seq) - current_start) // 3 * 3
            if orf_length > max_length:
                max_length = orf_length
                best_start = current_start
                best_end = current_start + orf_length

    return best_start, best_end
|
125 |
|
126 |
def calculate_cds_variants(protein_seq):
    """Return a capped estimate of the number of CDS variants for a protein.

    The estimate is ``2 ** len(protein_seq)``, limited to ``10**15`` so the
    figure stays displayable.

    Args:
        protein_seq: Protein sequence; may be empty or ``None``.

    Returns:
        ``0`` for an empty/missing sequence, otherwise the capped estimate.
    """
    # Upper bound that keeps the reported number from growing without limit.
    variant_cap = 10**15
    if protein_seq:
        return min(2 ** len(protein_seq), variant_cap)
    return 0
|
131 |
|
132 |
+
def optimize_cds(protein_seq, species, method, status_update):
|
133 |
+
if not protein_seq:
|
134 |
+
status_update("❌ Error: Please enter a protein sequence")
|
135 |
+
return pd.DataFrame(), None
|
136 |
+
|
137 |
+
status_update("🔄 Optimizing CDS sequences...")
|
138 |
+
|
139 |
+
# 计算潜在变异数
|
140 |
+
variants = calculate_cds_variants(protein_seq)
|
141 |
+
|
142 |
# 生成20个优化序列示例
|
143 |
results = []
|
144 |
for i in range(20):
|
|
|
145 |
seq = ''.join(random.choices("ACGT", k=len(protein_seq)*3))
|
146 |
+
# 序列截断显示
|
147 |
+
seq_display = seq[:30] + "..." if len(seq) > 30 else seq
|
148 |
+
|
149 |
gc = random.uniform(0.3, 0.7)
|
150 |
trna = random.uniform(0.5, 1.0)
|
151 |
usage = random.uniform(0.6, 0.95)
|
|
|
153 |
score = gc*0.25 + trna*0.25 + usage*0.25 + (-mfe/40)*0.25
|
154 |
|
155 |
results.append({
|
156 |
+
"Rank": i+1,
|
157 |
+
"Sequence": seq_display,
|
158 |
+
"Full_Sequence": seq, # 完整序列用于下载
|
159 |
"GC%": f"{gc*100:.1f}%",
|
160 |
"tRNA": f"{trna:.3f}",
|
161 |
"Usage": f"{usage:.3f}",
|
162 |
"MFE": f"{mfe:.1f}",
|
163 |
"Score": f"{score:.3f}"
|
164 |
})
|
165 |
+
|
166 |
+
df = pd.DataFrame(results)
|
167 |
+
display_df = df.drop(columns=['Full_Sequence']) # 显示时不包含完整序列
|
168 |
+
|
169 |
+
# 生成图表
|
170 |
+
fig, ax = plt.subplots(figsize=(10, 6))
|
171 |
+
scores = [float(x) for x in df["Score"]]
|
172 |
+
bars = ax.bar(range(1, len(scores)+1), scores, color='skyblue', alpha=0.7)
|
173 |
+
ax.set_xlabel("Sequence Rank")
|
174 |
+
ax.set_ylabel("Composite Score")
|
175 |
+
ax.set_title(f"CDS Optimization Results ({method})")
|
176 |
+
ax.grid(True, alpha=0.3)
|
177 |
+
|
178 |
+
# 高亮前5名
|
179 |
+
for i in range(min(5, len(bars))):
|
180 |
+
bars[i].set_color('orange')
|
181 |
+
|
182 |
+
status_update(f"✅ Successfully generated {len(results)} optimized sequences. Potential variants: {variants:,}")
|
183 |
+
|
184 |
+
return display_df, fig
|
185 |
|
186 |
+
def design_mrna(utr5_file, utr3_file, cds_seq, status_update=None):
    """Build 20 mock mRNA designs by flanking ``cds_seq`` with UTR candidates.

    Every design picks one 5'UTR and one 3'UTR at random from the built-in
    candidate lists and is given randomly drawn MFE and stability figures,
    so the table is placeholder output rather than a real design.

    Args:
        utr5_file: Uploaded 5'UTR candidate file. Currently not read.
        utr3_file: Uploaded 3'UTR candidate file. Currently not read.
        cds_seq: Coding sequence placed between the two UTRs.
        status_update: Optional callable that receives progress messages
            (one string per call). When omitted, messages are discarded,
            so callers that do not track progress can pass three arguments.

    Returns:
        A DataFrame with the columns Rank, Design, 5'UTR, 3'UTR, MFE,
        Stability and Sequence, where long sequences are shortened with
        "...". An empty DataFrame when ``cds_seq`` is empty.
    """
    notify = status_update if status_update is not None else (lambda _message: None)

    if not cds_seq:
        notify("❌ Error: Please enter a CDS sequence")
        return pd.DataFrame()

    notify("🔄 Designing mRNA sequences...")

    def preview(text, limit):
        """Shorten ``text`` to ``limit`` characters followed by an ellipsis."""
        return text[:limit] + "..." if len(text) > limit else text

    # Default UTR candidates.
    # NOTE(review): utr5_file / utr3_file are ignored, and both entries of
    # each list are identical, so every design uses the same UTR pair.
    default_utr5 = ["GGGAAAUAAGAGAGAAAAGAAGAGUAAGAAGAAAUAUAAGAGCCACCAUGG",
                    "GGGAAAUAAGAGAGAAAAGAAGAGUAAGAAGAAAUAUAAGAGCCACCAUGG"]
    default_utr3 = ["AAUAAAGCUUUUGCUUUUGUGGUGAAAUUGUUAAUAAACUAUUUUUUUUUU",
                    "AAUAAAGCUUUUGCUUUUGUGGUGAAAUUGUUAAUAAACUAUUUUUUUUUU"]

    # Generate 20 example designs.
    designs = []
    for i in range(20):
        utr5 = random.choice(default_utr5)
        utr3 = random.choice(default_utr3)
        full_seq = utr5 + cds_seq + utr3

        mfe = random.uniform(-50, -20)
        stability = random.uniform(0.6, 0.9)

        designs.append({
            "Rank": i + 1,
            "Design": f"Design_{i+1}",
            "5'UTR": preview(utr5, 15),
            "3'UTR": preview(utr3, 15),
            "MFE": f"{mfe:.1f}",
            "Stability": f"{stability:.3f}",
            "Sequence": preview(full_seq, 40),
            "Full_Sequence": full_seq,  # complete sequence, kept for export
        })

    df = pd.DataFrame(designs)
    # The full sequence is left out of the table that gets displayed.
    display_df = df.drop(columns=['Full_Sequence'])

    notify(f"✅ Successfully designed {len(designs)} mRNA sequences")

    return display_df
|
229 |
+
|
230 |
+
def download_cds_results(results_df):
    """Export the CDS optimization table to a CSV file.

    Args:
        results_df: DataFrame holding the columns Rank, GC%, tRNA, Usage,
            MFE and Score, or ``None``.

    Returns:
        The name of the CSV file written to the current working directory,
        or ``None`` when there is nothing to export.
    """
    if results_df is None or len(results_df) == 0:
        return None

    # Rebuild the export rows with a full-sequence column.
    # NOTE: "Full_Sequence" is a simulated random 150-nt string, not the
    # sequence that produced the row.
    export_rows = [
        {
            "Rank": record["Rank"],
            "Full_Sequence": ''.join(random.choices("ACGT", k=150)),
            "GC%": record["GC%"],
            "tRNA": record["tRNA"],
            "Usage": record["Usage"],
            "MFE": record["MFE"],
            "Score": record["Score"],
        }
        for _, record in results_df.iterrows()
    ]

    # Serialize to CSV text in memory first.
    csv_buffer = StringIO()
    pd.DataFrame(export_rows).to_csv(csv_buffer, index=False)

    # Write the text to a fixed-name file and hand its name back.
    filename = "cds_optimization_results.csv"
    with open(filename, 'w') as f:
        f.write(csv_buffer.getvalue())

    return filename
|
260 |
+
|
261 |
+
def download_mrna_results(results_df):
    """Export the mRNA design table to a CSV file.

    Args:
        results_df: DataFrame holding the columns Rank, Design, 5'UTR,
            3'UTR, MFE and Stability, or ``None``.

    Returns:
        The name of the CSV file written to the current working directory,
        or ``None`` when there is nothing to export.
    """
    if results_df is None or len(results_df) == 0:
        return None

    # Rebuild the export rows with a full-sequence column.
    # NOTE: "Full_Sequence" is a simulated random 300-nt string, not the
    # sequence that produced the row.
    export_rows = [
        {
            "Rank": record["Rank"],
            "Design": record["Design"],
            "Full_Sequence": ''.join(random.choices("ACGT", k=300)),
            "5'UTR": record["5'UTR"],
            "3'UTR": record["3'UTR"],
            "MFE": record["MFE"],
            "Stability": record["Stability"],
        }
        for _, record in results_df.iterrows()
    ]

    # Serialize to CSV text in memory first.
    csv_buffer = StringIO()
    pd.DataFrame(export_rows).to_csv(csv_buffer, index=False)

    # Write the text to a fixed-name file and hand its name back.
    filename = "mrna_design_results.csv"
    with open(filename, 'w') as f:
        f.write(csv_buffer.getvalue())

    return filename
|
291 |
+
|
292 |
+
|
293 |
+
def validate_dna_sequence(seq):
    """Check that ``seq`` consists only of the characters A, C, G, T and U.

    The comparison is case-sensitive: lower-case letters count as invalid.

    Args:
        seq: Sequence string to check.

    Returns:
        ``(True, "")`` when every character is allowed, otherwise
        ``(False, text)`` where ``text`` is ``str()`` of the set of
        offending characters.
    """
    unexpected = set(seq).difference('ACGTU')
    if unexpected:
        return False, str(unexpected)
    return True, ""
|
297 |
+
|
298 |
+
|
299 |
+
def translate_cds(cds_seq,repeat=1):
    """Translate a coding sequence into one-letter amino acid symbols.

    The sequence is upper-cased, ``U`` is read as ``T``, and it is consumed
    three characters at a time from position 0. Each chunk is looked up in
    ``CODON_TO_AA``; chunks that are not in the table (including a short
    trailing chunk) become ``'-'``.

    Args:
        cds_seq: Nucleotide sequence to translate.
        repeat: How many times each symbol is written in a row.

    Returns:
        The translated string.
    """
    dna = cds_seq.upper().replace('U', 'T')
    return ''.join(
        CODON_TO_AA.get(dna[offset:offset + 3], '-') * repeat
        for offset in range(0, len(dna), 3)
    )
|
307 |
+
|
308 |
+
|
309 |
+
class MaoTaoWeb:
    """Gradio front end of the vaccine design demo.

    Builds a ``gr.Blocks`` app with four tabs (mRNA annotation, CDS
    optimization, mRNA design, resources) and one shared read-only status
    box. The finished app is stored in ``self.app``.
    """

    def __init__(self):
        self.app = self.design_app()

    def design_app(self):
        """Assemble the Blocks layout and return it."""
        with gr.Blocks(title="Vaccine Designer", theme=gr.themes.Soft()) as app:
            gr.Markdown("# 🧬 Vaccine Design Platform")
            gr.Markdown("*Academic Collaboration Platform for mRNA Vaccine Design*")

            # Status line shared by all tabs. Event handlers change it by
            # listing it among their outputs and returning the new text.
            self.status_display = gr.Textbox(
                label="Status",
                value="Ready to start",
                interactive=False,
                container=True
            )

            # Create the individual tabs.
            self.mrna_annotation_tab()
            self.cds_optimization_tab()
            self.mrna_design_tab()
            self.resources_tab()

        return app

    @staticmethod
    def _render_annotation(seq, start, end):
        """Return the colour-coded HTML report for ``seq`` with CDS ``seq[start:end]``.

        Expects an already validated sequence and a range satisfying
        ``0 <= start < end <= len(seq)``.
        """
        line_width = 60  # characters per output line; a multiple of 3

        def wrap(text):
            """Break ``text`` into lines of ``line_width`` and HTML-escape it."""
            pieces = (text[i:i + line_width] for i in range(0, len(text), line_width))
            return html.escape('\n'.join(pieces))

        cds_seq = seq[start:end]
        aa_seq = translate_cds(cds_seq)

        parts = [
            "<div style='font-family: monospace; white-space: pre; margin-left: 15px;'>",
            f"{line_width} nt per line\n\n<span style='font-weight: bold;'>"
            f"CDS ({len(cds_seq)} bp):\n{wrap(cds_seq)}\n\n</span>",
            f"<span style=' font-weight: bold;'>"
            f"Protein ({len(aa_seq)} aa):\n{wrap(aa_seq)}\n\n</span>",
        ]

        # 5'UTR section (dark green).
        utr5 = seq[:start]
        if utr5:
            parts.append(
                f"<span style='color: #006400; font-weight: bold;'>"
                f"5'UTR ({len(utr5)} bp):\n{wrap(utr5)}\n</span>\n"
            )
        else:
            parts.append("<span style='color: #006400; font-weight: bold;'>5'UTR:\nN/A\n</span>\n")

        # CDS section (blue): each codon row is followed by its amino acids.
        if cds_seq:
            parts.append(
                f"<span style='color: blue; font-weight: bold;'>CDS align ({len(cds_seq)} bp):\n"
            )
            for offset in range(0, len(cds_seq), line_width):
                nt_chunk = cds_seq[offset:offset + line_width]
                codons = [nt_chunk[j:j + 3] for j in range(0, len(nt_chunk), 3)]
                parts.append(html.escape(' '.join(codons)) + "\n")

                aa_first = offset // 3
                aa_chunk = aa_seq[aa_first:aa_first + line_width // 3]
                # Codons occupy 4 columns (3 letters + 1 space), so a leading
                # pad plus 3 spaces between residues centres each amino acid
                # under its codon.
                pad = ' ' * (len(codons[0]) // 2)
                parts.append(pad + html.escape('   '.join(aa_chunk)) + "\n")
            parts.append("</span>\n")

        # 3'UTR section (purple).
        utr3 = seq[end:]
        if utr3:
            parts.append(
                f"<span style='color: purple; font-weight: bold;'>"
                f"3'UTR ({len(utr3)} bp):\n{wrap(utr3)}\n</span>"
            )
        else:
            parts.append("<span style='color: purple; font-weight: bold;'>3'UTR: </span>N/A")

        parts.append("</div>")
        return ''.join(parts)

    @staticmethod
    def _annotate_sequence(seq, start=-1, end=-1):
        """Validate the inputs and build the annotation report.

        Args:
            seq: Raw text from the sequence box. Whitespace is removed and
                the text is upper-cased before use.
            start: CDS start index, or -1 (or ``None``) when not given.
            end: CDS end index (exclusive), or -1 (or ``None``) when not given.

        Returns:
            ``(html_text, status_text)`` for the HTML output and status box.
            When both ``start`` and ``end`` are -1 the longest CDS is
            searched for automatically.
        """
        error_status = "❌error"

        # Pasted sequences often contain line breaks and lower-case letters.
        seq = "".join((seq or "").split()).upper()
        if not seq:
            return "<div style='color: red;'>Please enter a sequence</div>", error_status

        # validate_dna_sequence returns (is_valid, offending_chars); the tuple
        # itself is always truthy, so the flag has to be unpacked.
        is_valid, _offending = validate_dna_sequence(seq)
        if not is_valid:
            return ("<div style='color: red;'>Invalid sequence. Only A, C, G, T/U allowed.</div>",
                    error_status)

        # A cleared number field arrives as None.
        start = -1 if start is None else int(start)
        end = -1 if end is None else int(end)

        if start == -1 and end == -1:
            start, end = find_longest_cds(seq)
            if start == -1:
                return "<div style='color: red;'>No CDS found in sequence</div>", error_status
            status_msg = f"✅ Found CDS at position {start} to {end}"
        elif 0 <= start < end <= len(seq):
            status_msg = f"✅ Using user-defined CDS at position {start} to {end}"
        else:
            return ("<div style='color: red;'>Invalid CDS range. CDS Start must be at least 0 "
                    "and smaller than CDS End, and CDS End must not exceed the sequence "
                    "length.</div>", error_status)

        return MaoTaoWeb._render_annotation(seq, start, end), status_msg

    def mrna_annotation_tab(self):
        """Build the tab that splits an mRNA into 5'UTR, CDS and 3'UTR."""
        with gr.Tab("🔬 mRNA Annotation"):
            gr.Markdown("## mRNA Sequence Annotation")
            with gr.Row():
                with gr.Column(scale=3):
                    mrna_input = gr.Textbox(
                        label="mRNA Sequence",
                        placeholder="Enter mRNA sequence here...",
                        lines=5,
                        max_lines=10
                    )
                with gr.Column(scale=1):
                    start_position = gr.Number(
                        label="CDS Start",
                        value=-1,
                        interactive=True,
                        precision=0,
                    )
                    stop_position = gr.Number(
                        label="CDS End",
                        value=-1,
                        interactive=True,
                        precision=0,
                    )
            with gr.Row():
                example_btn = gr.Button("Load Example", variant="secondary")
                annotate_btn = gr.Button("Annotate Regions", variant="primary")
            with gr.Row():
                annotation_output = gr.HTML(
                    label="Sequence Regions",
                    value="<div style='font-family: monospace;'>Results will appear here</div>"
                )

            annotate_btn.click(
                self._annotate_sequence,
                inputs=[mrna_input, start_position, stop_position],
                outputs=[annotation_output, self.status_display]
            )

            example_btn.click(
                lambda: [EXAMPLE_MRNA, -1, -1],
                outputs=[mrna_input, start_position, stop_position]
            )

    def cds_optimization_tab(self):
        """Build the tab that runs ``optimize_cds`` on a protein sequence."""
        with gr.Tab("🧬 CDS Optimization"):
            gr.Markdown("## CDS Sequence Optimization")

            with gr.Row():
                with gr.Column(scale=2):
                    protein_seq = gr.Textbox(
                        label="Protein Sequence",
                        placeholder="Enter protein sequence here...",
                        lines=3
                    )
                    cds_example_btn = gr.Button("Load Example", variant="secondary")

                with gr.Column(scale=1):
                    species = gr.Dropdown(
                        choices=list(species_data.keys()),
                        label="Target Species",
                        value="human"
                    )
                    method = gr.Radio(
                        choices=["Max GC Content", "tRNA Abundance", "Codon Usage", "MFE Optimization"],
                        label="Optimization Method",
                        value="Max GC Content"
                    )

            with gr.Row():
                optimize_btn = gr.Button("🚀 Optimize CDS", variant="primary", scale=2)
                variants_display = gr.Number(
                    label="Potential Variants",
                    value=0,
                    interactive=False,
                    scale=1
                )

            with gr.Row():
                results_table = gr.Dataframe(
                    label="Optimization Results",
                    headers=["Rank", "Sequence", "GC%", "tRNA", "Usage", "MFE", "Score"],
                    datatype=["number", "str", "str", "str", "str", "str", "str"],
                    col_count=(7, "fixed"),
                    wrap=True
                )

                optimization_plot = gr.Plot(label="Score Distribution")

            with gr.Row():
                download_cds_btn = gr.Button("📥 Download CDS Results", variant="secondary")
                cds_download_file = gr.File(label="Download File", visible=False)

            def optimize_and_update(protein_seq, species, method):
                """Run the optimization; return table, plot, variant count, status."""
                # optimize_cds reports progress through a callable; collect
                # its messages so the last one can be shown on failure.
                progress = []
                df, plot = optimize_cds(protein_seq, species, method, progress.append)

                variants = calculate_cds_variants(protein_seq) if protein_seq else 0

                if len(df) == 0:
                    final_status = progress[-1] if progress else "❌error"
                else:
                    final_status = (f"✅ Optimization complete! Generated {len(df)} sequences "
                                    f"with {variants:,} potential variants")

                return df, plot, variants, final_status

            def export_cds(table):
                """Write the CSV and reveal the file component once it exists."""
                path = download_cds_results(table)
                return gr.update(value=path, visible=path is not None)

            optimize_btn.click(
                optimize_and_update,
                inputs=[protein_seq, species, method],
                outputs=[results_table, optimization_plot, variants_display, self.status_display]
            )

            cds_example_btn.click(lambda: EXAMPLE_PROTEIN, outputs=protein_seq)

            download_cds_btn.click(
                export_cds,
                inputs=results_table,
                outputs=cds_download_file
            )

    def mrna_design_tab(self):
        """Build the tab that runs ``design_mrna`` on a CDS."""
        with gr.Tab("🧪 mRNA Design"):
            gr.Markdown("## Full mRNA Sequence Design")

            with gr.Row():
                with gr.Column():
                    utr5_upload = gr.File(
                        label="5'UTR Candidates (Optional)",
                        file_types=[".txt"]
                    )
                    utr3_upload = gr.File(
                        label="3'UTR Candidates (Optional)",
                        file_types=[".txt"]
                    )

                with gr.Column():
                    cds_input = gr.Textbox(
                        label="CDS Sequence",
                        placeholder="Enter CDS sequence here...",
                        lines=4
                    )
                    mrna_example_btn = gr.Button("Load Example", variant="secondary")

            design_btn = gr.Button("🎯 Design mRNA", variant="primary")

            design_results = gr.Dataframe(
                label="mRNA Design Results",
                headers=["Rank", "Design", "5'UTR", "3'UTR", "MFE", "Stability", "Sequence"],
                datatype=["number", "str", "str", "str", "str", "str", "str"],
                col_count=(7, "fixed"),
                wrap=True
            )

            with gr.Row():
                download_mrna_btn = gr.Button("📥 Download mRNA Results", variant="secondary")
                mrna_download_file = gr.File(label="Download File", visible=False)

            def design_and_update(utr5_file, utr3_file, cds_seq):
                """Run the design step; return the table and the status text."""
                progress = []
                df = design_mrna(utr5_file, utr3_file, cds_seq, progress.append)

                if len(df) == 0:
                    final_status = progress[-1] if progress else "❌error"
                else:
                    final_status = f"✅ mRNA design complete! Generated {len(df)} design variants"

                return df, final_status

            def export_mrna(table):
                """Write the CSV and reveal the file component once it exists."""
                path = download_mrna_results(table)
                return gr.update(value=path, visible=path is not None)

            design_btn.click(
                design_and_update,
                inputs=[utr5_upload, utr3_upload, cds_input],
                outputs=[design_results, self.status_display]
            )

            mrna_example_btn.click(lambda: EXAMPLE_CDS, outputs=cds_input)

            download_mrna_btn.click(
                export_mrna,
                inputs=design_results,
                outputs=mrna_download_file
            )

    def resources_tab(self):
        """Build the static tab with links to databases and tools."""
        with gr.Tab("📚 Resources"):
            gr.Markdown("## Bioinformatics Resources")

            with gr.Row():
                with gr.Column():
                    gr.Markdown("### Databases")
                    gr.Markdown("\n".join([
                        "- [NCBI GenBank](https://www.ncbi.nlm.nih.gov/genbank/)",
                        "- [Nucleic Acid Database](https://ngdc.cncb.ac.cn/ncov/)",
                        "- [Codon Usage Database](https://www.kazusa.or.jp/codon/)",
                        "- [ViralZone](https://viralzone.expasy.org/)",
                        "- [bioinformatics tool](https://www.bioinformatics.org/sms2/rev_trans.html)",
                    ]))

                with gr.Column():
                    gr.Markdown("### Tools")
                    gr.Markdown("\n".join([
                        "- [mRNA Designer Platform](https://www.biosino.org/mRNAdesigner/main)",
                        "- [ViennaRNA Package](https://www.tbi.univie.ac.at/RNA/)",
                        "- [BLAST](https://blast.ncbi.nlm.nih.gov/Blast.cgi)",
                        "- [Primer3](https://primer3.org/)",
                    ]))

            gr.Markdown("---")
            gr.Markdown("### Contact Information")
            # NOTE(review): the address below looks like a redaction
            # placeholder rather than a real e-mail address - confirm.
            gr.Markdown("Academic Collaboration Platform | Email: [email protected]")
|
622 |
if __name__ == "__main__":
    # Build the interface, then serve it on all network interfaces.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "debug": True,
    }
    mtao_web = MaoTaoWeb()
    mtao_web.app.launch(**launch_options)
|