julse commited on
Commit
cb2619e
·
verified ·
1 Parent(s): e3e8dae

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +577 -126
app.py CHANGED
@@ -1,10 +1,57 @@
 
 
 
1
  import gradio as gr
2
  import numpy as np
3
  import random
4
  import pandas as pd
5
  import matplotlib.pyplot as plt
6
- from io import BytesIO
7
  import base64
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
  # 模拟数据 - 实际使用时需要替换为真实数据
10
  species_data = {
@@ -12,30 +59,93 @@ species_data = {
12
  "mouse": {"codon_table": {}, "trna": {}, "codon_usage": {}},
13
  "virus": {"codon_table": {}, "trna": {}, "codon_usage": {}},
14
  "Escherichia coli": {"codon_table": {}, "trna": {}, "codon_usage": {}},
15
- "酿酒酵母": {"codon_table": {}, "trna": {}, "codon_usage": {}},
16
  "Pichia": {"codon_table": {}, "trna": {}, "codon_usage": {}},
17
  }
18
 
19
- # 模拟函数 - 实际需要生物信息学算法实现
20
- def find_longest_cds(seq):
21
- # 简化的ORF查找 - 实际应使用生物信息学库
22
- start = seq.find("ATG")
23
- stops = [seq.find("TAA", start), seq.find("TAG", start), seq.find("TGA", start)]
24
- stops = [s for s in stops if s > start]
25
- end = min(stops) + 3 if stops else len(seq)
26
- return start, end
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
  def calculate_cds_variants(protein_seq):
29
- # 简化的计算 - 实际应根据密码子表计算
 
30
  aa_count = len(protein_seq)
31
- return 2 ** aa_count # 示例值
32
 
33
- def optimize_cds(protein_seq, species, method):
 
 
 
 
 
 
 
 
 
34
  # 生成20个优化序列示例
35
  results = []
36
  for i in range(20):
37
- # 实际应根据优化方法生成序列
38
  seq = ''.join(random.choices("ACGT", k=len(protein_seq)*3))
 
 
 
39
  gc = random.uniform(0.3, 0.7)
40
  trna = random.uniform(0.5, 1.0)
41
  usage = random.uniform(0.6, 0.95)
@@ -43,132 +153,473 @@ def optimize_cds(protein_seq, species, method):
43
  score = gc*0.25 + trna*0.25 + usage*0.25 + (-mfe/40)*0.25
44
 
45
  results.append({
46
- "Sequence": seq,
 
 
47
  "GC%": f"{gc*100:.1f}%",
48
  "tRNA": f"{trna:.3f}",
49
  "Usage": f"{usage:.3f}",
50
  "MFE": f"{mfe:.1f}",
51
  "Score": f"{score:.3f}"
52
  })
53
- return pd.DataFrame(results)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
- def design_mrna(utr5_candidates, utr3_candidates, cds_seq):
 
 
 
 
 
 
 
 
 
 
 
 
56
  # 生成20个设计结果示例
57
  designs = []
58
  for i in range(20):
59
- utr5 = random.choice(utr5_candidates)
60
- utr3 = random.choice(utr3_candidates)
61
  full_seq = utr5 + cds_seq + utr3
 
 
 
 
62
  mfe = random.uniform(-50, -20)
 
 
63
  designs.append({
 
64
  "Design": f"Design_{i+1}",
65
- "5'UTR": utr5[:10] + "..." if len(utr5) > 13 else utr5,
66
- "3'UTR": utr3[:10] + "..." if len(utr3) > 13 else utr3,
67
  "MFE": f"{mfe:.1f}",
68
- "Sequence": full_seq
 
 
69
  })
70
- return pd.DataFrame(designs)
71
-
72
- # 标注可视化函数
73
- def visualize_annotation(seq):
74
- start, end = find_longest_cds(seq)
75
- html = f"""
76
- <div style="font-family: monospace; font-size: 14px; line-height: 1.8;">
77
- <div style="background-color: #ffcccc; display: inline-block; padding: 2px;">
78
- 5'UTR: {seq[:start] if start > 0 else 'N/A'}
79
- </div>
80
- <div style="background-color: #ccffcc; display: inline-block; padding: 2px;">
81
- CDS: {seq[start:end] if start >=0 else 'N/A'}
82
- </div>
83
- <div style="background-color: #ccccff; display: inline-block; padding: 2px;">
84
- 3'UTR: {seq[end:] if end < len(seq) else 'N/A'}
85
- </div>
86
- </div>
87
- <p>Annotation Legend:</p>
88
- <div style="display: flex; gap: 10px;">
89
- <div style="background-color: #ffcccc; padding: 5px;">5'UTR</div>
90
- <div style="background-color: #ccffcc; padding: 5px;">CDS</div>
91
- <div style="background-color: #ccccff; padding: 5px;">3'UTR</div>
92
- </div>
93
- """
94
- return html
95
-
96
- # 创建Gradio界面
97
- with gr.Blocks(title="Vaccine Designer", theme=gr.themes.Soft()) as app:
98
- gr.Markdown("# 🧬 Vaccine Design Platform - Academic Collaboration")
99
-
100
- with gr.Tab("mRNA Annotation"):
101
- gr.Markdown("## mRNA Sequence Annotation")
102
- mrna_input = gr.Textbox(label="mRNA Sequence", placeholder="Enter mRNA sequence here...")
103
- annotate_btn = gr.Button("Annotate Regions")
104
- annotation_output = gr.HTML(label="Sequence Annotation")
105
- annotate_btn.click(visualize_annotation, inputs=mrna_input, outputs=annotation_output)
106
-
107
- with gr.Tab("CDS Variants"):
108
- gr.Markdown("## Calculate Potential CDS Variants")
109
- protein_input = gr.Textbox(label="Protein Sequence", placeholder="Enter protein sequence here...")
110
- calc_btn = gr.Button("Calculate Variants")
111
- variants_output = gr.Number(label="Potential CDS Variants")
112
- calc_btn.click(calculate_cds_variants, inputs=protein_input, outputs=variants_output)
113
-
114
- with gr.Tab("CDS Optimization"):
115
- gr.Markdown("## Optimize CDS Sequence")
116
- with gr.Row():
117
- protein_seq = gr.Textbox(label="Protein Sequence")
118
- species = gr.Dropdown(list(species_data.keys()), label="Species", value="human")
119
-
120
- method = gr.Radio(["Max GC Content", "tRNA Abundance", "Codon Usage", "MFE Optimization"],
121
- label="Optimization Method", value="Max GC Content")
122
-
123
- optimize_btn = gr.Button("Generate Optimized Sequences")
124
- results_table = gr.Dataframe(label="Top 20 Optimized Sequences", headers=["Sequence", "GC%", "tRNA", "Usage", "MFE", "Score"])
125
- optimize_btn.click(optimize_cds, inputs=[protein_seq, species, method], outputs=results_table)
126
-
127
- # 评分可视化
128
- plot = gr.Plot(label="Optimization Scores")
129
- def update_plot(df):
130
- if df is None or len(df) == 0:
131
- return None
132
- fig, ax = plt.subplots()
133
- scores = [float(x) for x in df["Score"]]
134
- ax.bar(range(len(scores)), scores, color='skyblue')
135
- ax.set_xlabel("Sequence Rank")
136
- ax.set_ylabel("Composite Score")
137
- ax.set_title("Optimization Scores of Top Sequences")
138
- return fig
139
- results_table.change(update_plot, inputs=results_table, outputs=plot)
140
-
141
- with gr.Tab("Full mRNA Design"):
142
- gr.Markdown("## Design Full mRNA Sequence")
143
- with gr.Row():
144
- utr5_upload = gr.File(label="Upload 5'UTR Candidates (txt)")
145
- utr3_upload = gr.File(label="Upload 3'UTR Candidates (txt)")
146
- cds_input = gr.Textbox(label="CDS Sequence")
147
- design_btn = gr.Button("Design mRNA Sequences")
148
- design_results = gr.Dataframe(label="Top 20 Designs", headers=["Design", "5'UTR", "3'UTR", "MFE", "Sequence"])
149
- design_btn.click(design_mrna, inputs=[utr5_upload, utr3_upload, cds_input], outputs=design_results)
150
-
151
- with gr.Tab("Resources & Links"):
152
- gr.Markdown("## Helpful Resources")
153
- gr.Markdown("""
154
- - [mRNA Designer Platform](https://www.biosino.org/mRNAdesigner/main)
155
- - [Nucleic Acid Database](https://ngdc.cncb.ac.cn/ncov/)
156
- - [NCBI GenBank](https://www.ncbi.nlm.nih.gov/genbank/)
157
- - [ViralZone](https://viralzone.expasy.org/)
158
- - [Codon Usage Database](https://www.kazusa.or.jp/codon/)
159
- """)
160
-
161
- gr.Markdown("## Download All Results")
162
- download_btn = gr.Button("Download Results Package")
163
- download_btn.click(lambda: "results.zip", outputs=gr.File(label="Download Results"))
164
-
165
- gr.Markdown("---")
166
- gr.HTML("""
167
- <div style="text-align: center; padding: 20px; background-color: #f0f0f0; border-radius: 10px;">
168
- <p>Academic Collaboration Platform for Vaccine Design | Contact: bioinfo@university.edu</p>
169
- </div>
170
- """)
171
-
172
- # 运行应用
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
173
  if __name__ == "__main__":
174
- app.launch(server_name="0.0.0.0", server_port=7860)
 
 
 
1
+ import html
2
+ from typing import Tuple
3
+
4
  import gradio as gr
5
  import numpy as np
6
  import random
7
  import pandas as pd
8
  import matplotlib.pyplot as plt
9
+ from io import BytesIO, StringIO
10
  import base64
11
+ import json
12
# One-letter residue alphabet, lower-cased: 20 amino acids plus '*' and '-'.
AA_str = 'ACDEFGHIKLMNPQRSTVWY*-'.lower()

# Amino acid (one-letter code, '*' for stop) -> list of its DNA codons.
AA_TO_CODONS = {
    "F": ["TTT", "TTC"],
    "L": ["TTA", "TTG", "CTT", "CTC", "CTA", "CTG"],
    "I": ["ATT", "ATC", "ATA"],
    "M": ["ATG"],
    "V": ["GTT", "GTC", "GTA", "GTG"],
    "S": ["TCT", "TCC", "TCA", "TCG", "AGT", "AGC"],
    "P": ["CCT", "CCC", "CCA", "CCG"],
    "T": ["ACT", "ACC", "ACA", "ACG"],
    "A": ["GCT", "GCC", "GCA", "GCG"],
    "Y": ["TAT", "TAC"],
    "H": ["CAT", "CAC"],
    "Q": ["CAA", "CAG"],
    "N": ["AAT", "AAC"],
    "K": ["AAA", "AAG"],
    "D": ["GAT", "GAC"],
    "E": ["GAA", "GAG"],
    "C": ["TGT", "TGC"],
    "W": ["TGG"],
    "R": ["CGT", "CGC", "CGA", "CGG", "AGA", "AGG"],
    "G": ["GGT", "GGC", "GGA", "GGG"],
    "*": ["TAA", "TAG", "TGA"],
}
35
+
36
+
37
def reverse_dictionary(dictionary):
    """Invert a one-to-many mapping into a flat value-to-key mapping.

    Args:
        dictionary: dict of {key: [value, ...], ...}.

    Returns:
        dict of {value: key, ...}. When the same value is listed under
        several keys, the key visited last in iteration order wins.
    """
    # Built as a comprehension; no local is named after the function itself,
    # so the function name is never shadowed inside its own body.
    return {
        value: key
        for key, values in dictionary.items()
        for value in values
    }
53
+
54
+ CODON_TO_AA = reverse_dictionary(AA_TO_CODONS)
55
 
56
  # Mock data - must be replaced with real data for actual use
57
  species_data = {
 
59
  "mouse": {"codon_table": {}, "trna": {}, "codon_usage": {}},
60
  "virus": {"codon_table": {}, "trna": {}, "codon_usage": {}},
61
  "Escherichia coli": {"codon_table": {}, "trna": {}, "codon_usage": {}},
62
+ "saccharomyces cerevisiae": {"codon_table": {}, "trna": {}, "codon_usage": {}},
63
  "Pichia": {"codon_table": {}, "trna": {}, "codon_usage": {}},
64
  }
65
 
66
+ # Example data
67
EXAMPLE_PROTEIN = "MSFSRRPKITKSDIVDQISLNIRNNNLKLEKKYIRLVIDAFFEELKGNLCLNNVIEFRSFGTFEVRKRKGRLNARNPQTGEYVKVLDHHVAYFRPGKDLKERVWGIKG"
# Written as adjacent literals (implicit concatenation) rather than a
# backslash-continued single literal, so the value can never pick up stray
# whitespace from the indentation of the continuation lines.
# The DNA text is upper-cased and converted to the RNA alphabet (T -> U).
EXAMPLE_CDS = (
    "atgagctttagccgccgcccgaaaattaccaaaagcgatattgtggatcagattagcctg"
    "aacattcgcaacaacaacctgaaactggaaaaaaaatatattcgcctggtgattgatgcg"
    "ttttttgaagaactgaaaggcaacctgtgcctgaacaacgtgattgaatttcgcagcttt"
    "ggcacctttgaagtgcgcaaacgcaaaggccgcctgaacgcgcgcaacccgcagaccggc"
    "gaatatgtgaaagtgctggatcatcatgtggcgtattttcgcccgggcaaagatctgaaa"
    "gaacgcgtgtggggcattaaaggc"
).upper().replace('T', 'U')
EXAMPLE_UTR5 = "GAAAAGAGCCCCGGAAAGGAUCUAUCCCUUCCUGUUCUGCUGCACGCAAAAGAACAGCCAAGGGGGAGGCCACC"
EXAMPLE_UTR3 = "GCUCGCUUUCUUGCUGUCCAAUUUCUAUUAAAGGUUCCUUUGUUCCCUAAGUCCAACUACUAAACUGGGGGAUAUUAUGAAGGGCCUUGAGCAUCUGGAUUCUGCCUAAUAAAAAACAUUUAUUUUCAUUGCAA"
# Full example transcript: 5'UTR + CDS + 3'UTR.
EXAMPLE_MRNA = EXAMPLE_UTR5 + EXAMPLE_CDS + EXAMPLE_UTR3
77
+
78
+
79
def find_longest_cds(seq: str) -> Tuple[int, int]:
    """Find the longest ATG-initiated open reading frame in a sequence.

    The sequence is upper-cased and U is mapped to T before scanning, so RNA
    or DNA in either case is accepted. All three forward reading frames are
    scanned; within a frame an ORF opens at the first ATG seen while no ORF is
    open and closes at the next in-frame TAA/TAG/TGA.

    Args:
        seq: mRNA (or DNA) sequence.

    Returns:
        (start, end): start index and exclusive end index of the longest ORF.
        For a stop-terminated ORF, ``end`` includes the stop codon. For an ORF
        that runs off the end of the sequence, ``end`` is trimmed to the last
        complete codon. Returns (-1, -1) when no ATG is found.
    """
    stop_codons = ("TAA", "TAG", "TGA")
    seq = seq.upper().replace('U', 'T')
    best_start = -1
    best_end = -1
    max_length = 0

    # Try every forward reading frame.
    for frame in range(3):
        current_start = -1  # -1 means "not currently inside an ORF"

        for pos in range(frame, len(seq) - 2, 3):
            codon = seq[pos:pos + 3]

            if current_start < 0:
                # Only an ATG can open an ORF.
                if codon == "ATG":
                    current_start = pos
            elif codon in stop_codons:
                # Length is measured without the stop codon; the reported
                # end index still includes it.
                orf_length = pos - current_start
                if orf_length > max_length:
                    max_length = orf_length
                    best_start = current_start
                    best_end = pos + 3
                current_start = -1

        # ORF still open at the end of the sequence (no stop codon).
        if current_start >= 0:
            # Count whole codons only: a trailing 1-2 nt fragment is not part
            # of the coding region and would otherwise translate to '-'.
            orf_length = (len(seq) - current_start) // 3 * 3
            if orf_length > max_length:
                max_length = orf_length
                best_start = current_start
                best_end = current_start + orf_length

    return best_start, best_end
125
 
126
def calculate_cds_variants(protein_seq):
    """Count the distinct coding sequences that can encode a protein.

    The count is the product, over all residues, of the number of synonymous
    codons listed for that residue in the module-level ``AA_TO_CODONS`` table.
    Residue letters are matched case-insensitively. Characters that are not
    in the table (whitespace, ambiguity codes, ...) contribute a factor of 1.

    Args:
        protein_seq: protein sequence in one-letter code.

    Returns:
        Number of possible coding sequences as an int, capped at 10**15 so
        the value stays displayable; 0 for an empty or missing sequence.
    """
    if not protein_seq:
        return 0

    cap = 10 ** 15  # upper bound to avoid astronomically large numbers
    variants = 1
    for residue in protein_seq.upper():
        variants *= len(AA_TO_CODONS.get(residue, ())) or 1
        if variants >= cap:
            # The product can only grow, so stop as soon as the cap is hit.
            return cap
    return variants
131
 
132
+ def optimize_cds(protein_seq, species, method, status_update):
133
+ if not protein_seq:
134
+ status_update("❌ Error: Please enter a protein sequence")
135
+ return pd.DataFrame(), None
136
+
137
+ status_update("🔄 Optimizing CDS sequences...")
138
+
139
+ # 计算潜在变异数
140
+ variants = calculate_cds_variants(protein_seq)
141
+
142
  # 生成20个优化序列示例
143
  results = []
144
  for i in range(20):
 
145
  seq = ''.join(random.choices("ACGT", k=len(protein_seq)*3))
146
+ # 序列截断显示
147
+ seq_display = seq[:30] + "..." if len(seq) > 30 else seq
148
+
149
  gc = random.uniform(0.3, 0.7)
150
  trna = random.uniform(0.5, 1.0)
151
  usage = random.uniform(0.6, 0.95)
 
153
  score = gc*0.25 + trna*0.25 + usage*0.25 + (-mfe/40)*0.25
154
 
155
  results.append({
156
+ "Rank": i+1,
157
+ "Sequence": seq_display,
158
+ "Full_Sequence": seq, # 完整序列用于下载
159
  "GC%": f"{gc*100:.1f}%",
160
  "tRNA": f"{trna:.3f}",
161
  "Usage": f"{usage:.3f}",
162
  "MFE": f"{mfe:.1f}",
163
  "Score": f"{score:.3f}"
164
  })
165
+
166
+ df = pd.DataFrame(results)
167
+ display_df = df.drop(columns=['Full_Sequence']) # 显示时不包含完整序列
168
+
169
+ # 生成图表
170
+ fig, ax = plt.subplots(figsize=(10, 6))
171
+ scores = [float(x) for x in df["Score"]]
172
+ bars = ax.bar(range(1, len(scores)+1), scores, color='skyblue', alpha=0.7)
173
+ ax.set_xlabel("Sequence Rank")
174
+ ax.set_ylabel("Composite Score")
175
+ ax.set_title(f"CDS Optimization Results ({method})")
176
+ ax.grid(True, alpha=0.3)
177
+
178
+ # 高亮前5名
179
+ for i in range(min(5, len(bars))):
180
+ bars[i].set_color('orange')
181
+
182
+ status_update(f"✅ Successfully generated {len(results)} optimized sequences. Potential variants: {variants:,}")
183
+
184
+ return display_df, fig
185
 
186
def _read_utr_candidates(upload):
    """Return the non-empty sequence lines of an uploaded text file, or []."""
    if upload is None:
        return []
    # The upload may arrive as a plain path or as an object exposing the path
    # on ``.name``; accept both.
    path = getattr(upload, "name", upload)
    try:
        with open(path, encoding="utf-8") as handle:
            stripped = (line.strip() for line in handle)
            # Assumes one candidate sequence per line; FASTA-style '>' header
            # lines are skipped. TODO confirm the expected upload format.
            return [line for line in stripped if line and not line.startswith(">")]
    except (OSError, TypeError, UnicodeDecodeError):
        # Uploads are optional: an unreadable file falls back to the defaults.
        return []


def design_mrna(utr5_file, utr3_file, cds_seq, status_update):
    """Generate 20 mock full-length mRNA designs around a CDS.

    Each design joins a randomly chosen 5'UTR candidate, the CDS and a
    randomly chosen 3'UTR candidate. The MFE and stability values are random
    placeholders, not computed from the sequence.

    Args:
        utr5_file: optional uploaded text file with 5'UTR candidates; the
            built-in defaults are used when it is missing, unreadable or
            contains no sequences.
        utr3_file: optional uploaded text file with 3'UTR candidates; same
            fallback as ``utr5_file``.
        cds_seq: coding sequence placed between the UTRs.
        status_update: callable taking one status-message string.

    Returns:
        DataFrame with the columns Rank, Design, 5'UTR, 3'UTR, MFE, Stability
        and Sequence (truncated for display); an empty DataFrame when
        ``cds_seq`` is empty.
    """
    if not cds_seq:
        status_update("❌ Error: Please enter a CDS sequence")
        return pd.DataFrame()

    status_update("🔄 Designing mRNA sequences...")

    # Default UTR candidate sequences.
    default_utr5 = ["GGGAAAUAAGAGAGAAAAGAAGAGUAAGAAGAAAUAUAAGAGCCACCAUGG",
                    "GGGAAAUAAGAGAGAAAAGAAGAGUAAGAAGAAAUAUAAGAGCCACCAUGG"]
    default_utr3 = ["AAUAAAGCUUUUGCUUUUGUGGUGAAAUUGUUAAUAAACUAUUUUUUUUUU",
                    "AAUAAAGCUUUUGCUUUUGUGGUGAAAUUGUUAAUAAACUAUUUUUUUUUU"]

    # Prefer the user's uploaded candidates over the defaults.
    utr5_candidates = _read_utr_candidates(utr5_file) or default_utr5
    utr3_candidates = _read_utr_candidates(utr3_file) or default_utr3

    # Generate 20 example designs.
    designs = []
    for i in range(20):
        utr5 = random.choice(utr5_candidates)
        utr3 = random.choice(utr3_candidates)
        full_seq = utr5 + cds_seq + utr3

        # Truncate long sequences for display.
        full_seq_display = full_seq[:40] + "..." if len(full_seq) > 40 else full_seq

        mfe = random.uniform(-50, -20)
        stability = random.uniform(0.6, 0.9)

        designs.append({
            "Rank": i + 1,
            "Design": f"Design_{i+1}",
            "5'UTR": utr5[:15] + "..." if len(utr5) > 15 else utr5,
            "3'UTR": utr3[:15] + "..." if len(utr3) > 15 else utr3,
            "MFE": f"{mfe:.1f}",
            "Stability": f"{stability:.3f}",
            "Sequence": full_seq_display,
            "Full_Sequence": full_seq,  # full sequence, intended for download
        })

    df = pd.DataFrame(designs)
    # The on-screen table omits the full sequence.
    display_df = df.drop(columns=['Full_Sequence'])

    status_update(f"✅ Successfully designed {len(designs)} mRNA sequences")

    return display_df
229
+
230
def download_cds_results(results_df):
    """Export the CDS optimization table to a CSV file and return its path.

    Args:
        results_df: DataFrame with the columns "Rank", "GC%", "tRNA",
            "Usage", "MFE" and "Score" (the table shown in the UI).

    Returns:
        Path of the written CSV file, or None when there is nothing to export.
    """
    # Function-scope imports keep the new dependencies local to this block.
    import os
    import tempfile

    if results_df is None or len(results_df) == 0:
        return None

    download_df = pd.DataFrame([
        {
            "Rank": row["Rank"],
            # Placeholder: a random 150-nt sequence stands in for the real
            # full-length sequence, which the displayed table does not carry.
            "Full_Sequence": ''.join(random.choices("ACGT", k=150)),
            "GC%": row["GC%"],
            "tRNA": row["tRNA"],
            "Usage": row["Usage"],
            "MFE": row["MFE"],
            "Score": row["Score"],
        }
        for row in results_df.to_dict("records")
    ])

    # Each export gets its own temporary directory so concurrent requests
    # cannot overwrite one another's file, while the download keeps a
    # readable file name.
    out_dir = tempfile.mkdtemp(prefix="cds_results_")
    filename = os.path.join(out_dir, "cds_optimization_results.csv")
    download_df.to_csv(filename, index=False, encoding="utf-8")

    return filename
260
+
261
def download_mrna_results(results_df):
    """Export the mRNA design table to a CSV file and return its path.

    Args:
        results_df: DataFrame with the columns "Rank", "Design", "5'UTR",
            "3'UTR", "MFE" and "Stability" (the table shown in the UI).

    Returns:
        Path of the written CSV file, or None when there is nothing to export.
    """
    # Function-scope imports keep the new dependencies local to this block.
    import os
    import tempfile

    if results_df is None or len(results_df) == 0:
        return None

    download_df = pd.DataFrame([
        {
            "Rank": row["Rank"],
            "Design": row["Design"],
            # Placeholder: a random 300-nt sequence stands in for the real
            # full-length sequence, which the displayed table does not carry.
            "Full_Sequence": ''.join(random.choices("ACGT", k=300)),
            "5'UTR": row["5'UTR"],
            "3'UTR": row["3'UTR"],
            "MFE": row["MFE"],
            "Stability": row["Stability"],
        }
        for row in results_df.to_dict("records")
    ])

    # Each export gets its own temporary directory so concurrent requests
    # cannot overwrite one another's file, while the download keeps a
    # readable file name.
    out_dir = tempfile.mkdtemp(prefix="mrna_results_")
    filename = os.path.join(out_dir, "mrna_design_results.csv")
    download_df.to_csv(filename, index=False, encoding="utf-8")

    return filename
291
+
292
+
293
def validate_dna_sequence(seq):
    """Check that a sequence contains only nucleotide letters.

    A, C, G, T and U are accepted in either case.

    Args:
        seq: sequence string to check.

    Returns:
        (is_valid, invalid_chars): ``(True, "")`` when every character is
        allowed, otherwise ``(False, text)`` where ``text`` is the string form
        of the set of offending characters. The result is always a 2-tuple,
        so callers must unpack it rather than test its truthiness.
    """
    invalid = set(seq) - set('ACGTUacgtu')
    if invalid:
        return False, str(invalid)
    return True, ""
297
+
298
+
299
def translate_cds(cds_seq, repeat=1):
    """Translate a coding sequence codon by codon.

    The input is upper-cased and U is mapped to T before lookup in the
    module-level ``CODON_TO_AA`` table. Codons missing from the table,
    including a trailing fragment shorter than three nucleotides, become '-'.

    Args:
        cds_seq: coding sequence (DNA or RNA alphabet).
        repeat: number of times each translated symbol is written.

    Returns:
        The translated sequence as a string.
    """
    dna = cds_seq.upper().replace('U', 'T')
    return ''.join(
        CODON_TO_AA.get(dna[offset:offset + 3], '-') * repeat
        for offset in range(0, len(dna), 3)
    )
307
+
308
+
309
class MaoTaoWeb:
    """Gradio front end for the mRNA vaccine design demo.

    Creating an instance builds the complete ``gr.Blocks`` layout and stores
    it on ``self.app``; call ``self.app.launch(...)`` to serve it.
    """

    def __init__(self):
        self.app = self.design_app()

    def design_app(self):
        """Build the Blocks layout: a shared status box plus one tab per tool."""
        with gr.Blocks(title="Vaccine Designer", theme=gr.themes.Soft()) as app:
            gr.Markdown("# 🧬 Vaccine Design Platform")
            gr.Markdown("*Academic Collaboration Platform for mRNA Vaccine Design*")

            # Global status display, written to by the tab event handlers.
            self.status_display = gr.Textbox(
                label="Status",
                value="Ready to start",
                interactive=False,
                container=True
            )

            # Create the individual tabs.
            self.mrna_annotation_tab()
            self.cds_optimization_tab()
            self.mrna_design_tab()
            self.resources_tab()

        return app

    def mrna_annotation_tab(self):
        """Tab that splits an mRNA into 5'UTR / CDS / 3'UTR and translates the CDS."""
        with gr.Tab("🔬 mRNA Annotation"):
            gr.Markdown("## mRNA Sequence Annotation")
            with gr.Row():
                with gr.Column(scale=3):
                    mrna_input = gr.Textbox(
                        label="mRNA Sequence",
                        placeholder="Enter mRNA sequence here...",
                        lines=5,
                        max_lines=10
                    )
                with gr.Column(scale=1):
                    start_position = gr.Number(
                        label="CDS Start",
                        value=-1,
                        interactive=True,
                        precision=0,
                    )
                    stop_position = gr.Number(
                        label="CDS End",
                        value=-1,
                        interactive=True,
                        precision=0,
                    )
            with gr.Row():
                example_btn = gr.Button("Load Example", variant="secondary")
                annotate_btn = gr.Button("Annotate Regions", variant="primary")
            with gr.Row():
                annotation_output = gr.HTML(
                    label="Sequence Regions",
                    value="<div style='font-family: monospace;'>Results will appear here</div>"
                )

            def annotate_sequence(seq, start=-1, end=-1):
                """Return (annotation HTML, status message) for a sequence.

                ``start``/``end`` of -1 mean "not set"; when both are unset
                the longest ORF found by ``find_longest_cds`` is used.
                """
                # The input box is multi-line: drop line breaks and spaces.
                seq = "".join(seq.split()) if seq else ""
                if not seq:
                    return "<div style='color: red;'>Please enter a sequence</div>", "❌error"

                # validate_dna_sequence returns an (is_valid, details) tuple.
                # A non-empty tuple is always truthy, so it has to be unpacked.
                # Upper-casing first keeps lower-case input acceptable.
                is_valid, _ = validate_dna_sequence(seq.upper())
                if not is_valid:
                    return "<div style='color: red;'>Invalid sequence. Only A, C, G, T/U allowed.</div>", "❌error"

                # A cleared number field arrives as None; treat it as unset.
                start = -1 if start is None else int(start)
                end = -1 if end is None else int(end)

                if start == -1 and end == -1:
                    start, end = find_longest_cds(seq)
                    status_msg = f"✅ Found CDS at position {start} to {end}"
                else:
                    if end == -1:
                        # Only the start was given: the CDS runs to the end.
                        # Slicing with -1 would silently drop the last base.
                        end = len(seq)
                    status_msg = f"✅ Using user-defined CDS at position {start} to {end}"
                if start < 0:
                    return "<div style='color: red;'>No CDS found in sequence</div>", "❌error"

                # Extract the CDS and translate it to amino acids.
                cds_seq = seq[start:end]
                aa_seq = translate_cds(cds_seq)

                frame_lenth = 60  # characters per displayed line

                def wrap(text):
                    """Break text into lines of frame_lenth characters."""
                    return '\n'.join(
                        text[i:i + frame_lenth] for i in range(0, len(text), frame_lenth)
                    )

                # Build the coloured HTML result piece by piece.
                parts = ["<div style='font-family: monospace; white-space: pre; margin-left: 15px;'>"]

                # CDS and protein, plain.
                parts.append(
                    f"{frame_lenth} nt per line\n\n<span style='font-weight: bold;'>"
                    f"CDS ({len(cds_seq)} bp):\n{wrap(cds_seq)}\n\n</span>"
                )
                # Protein length is a residue count, hence "aa" rather than "bp".
                parts.append(
                    f"<span style=' font-weight: bold;'>Protein ({len(aa_seq)} aa):\n{wrap(aa_seq)}\n\n</span>"
                )

                # 5'UTR section (dark green).
                if start > 0:
                    utr5 = seq[:start]
                    parts.append(
                        f"<span style='color: #006400; font-weight: bold;'>5'UTR ({len(utr5)} bp):\n"
                        f"{html.escape(wrap(utr5))}\n</span>\n"
                    )
                else:
                    parts.append("<span style='color: #006400; font-weight: bold;'>5'UTR:\nN/A\n</span>\n")

                if end - start > 0:
                    # CDS section (blue): codons with the amino acid beneath.
                    parts.append(
                        f"<span style='color: blue; font-weight: bold;'>CDS align ({len(cds_seq)} bp):\n"
                    )
                    for i in range(0, len(cds_seq), frame_lenth):
                        # Nucleotides, grouped into space-separated codons.
                        nt_chunk = cds_seq[i:i + frame_lenth]
                        nt_formatted = ' '.join(nt_chunk[j:j + 3] for j in range(0, len(nt_chunk), 3))
                        parts.append(f"{nt_formatted}\n")

                        # Matching amino acids. Each codon occupies four
                        # columns (three letters plus a space), so residues
                        # are separated by three spaces to stay in step.
                        aa_start = i // 3
                        aa_end = min(aa_start + frame_lenth // 3, len(aa_seq))
                        aa_formatted = '   '.join(aa_seq[aa_start:aa_end])
                        # Indent so each residue sits under its codon's centre.
                        alignment = ' ' * (len(nt_formatted.split()[0]) // 2)
                        parts.append(f"{alignment}{aa_formatted}\n")
                    parts.append("</span>\n")

                # 3'UTR section (purple).
                if end != -1 and end < len(seq):
                    utr3 = seq[end:]
                    parts.append(
                        f"<span style='color: purple; font-weight: bold;'>3'UTR ({len(utr3)} bp):\n"
                        f"{html.escape(wrap(utr3))}\n</span>"
                    )
                else:
                    parts.append("<span style='color: purple; font-weight: bold;'>3'UTR: </span>N/A")

                parts.append("</div>")
                return ''.join(parts), status_msg

            annotate_btn.click(
                annotate_sequence,
                inputs=[mrna_input, start_position, stop_position],
                outputs=[annotation_output, self.status_display]
            )

            example_btn.click(
                lambda: [EXAMPLE_MRNA, -1, -1],
                outputs=[mrna_input, start_position, stop_position]
            )

    def cds_optimization_tab(self):
        """Tab that runs ``optimize_cds`` on a protein and shows table, plot and count."""
        with gr.Tab("🧬 CDS Optimization"):
            gr.Markdown("## CDS Sequence Optimization")

            with gr.Row():
                with gr.Column(scale=2):
                    protein_seq = gr.Textbox(
                        label="Protein Sequence",
                        placeholder="Enter protein sequence here...",
                        lines=3
                    )
                    cds_example_btn = gr.Button("Load Example", variant="secondary")

                with gr.Column(scale=1):
                    species = gr.Dropdown(
                        choices=list(species_data.keys()),
                        label="Target Species",
                        value="human"
                    )
                    method = gr.Radio(
                        choices=["Max GC Content", "tRNA Abundance", "Codon Usage", "MFE Optimization"],
                        label="Optimization Method",
                        value="Max GC Content"
                    )

            with gr.Row():
                optimize_btn = gr.Button("🚀 Optimize CDS", variant="primary", scale=2)
                variants_display = gr.Number(
                    label="Potential Variants",
                    value=0,
                    interactive=False,
                    scale=1
                )

            with gr.Row():
                results_table = gr.Dataframe(
                    label="Optimization Results",
                    headers=["Rank", "Sequence", "GC%", "tRNA", "Usage", "MFE", "Score"],
                    datatype=["number", "str", "str", "str", "str", "str", "str"],
                    col_count=(7, "fixed"),
                    wrap=True
                )

            optimization_plot = gr.Plot(label="Score Distribution")

            with gr.Row():
                download_cds_btn = gr.Button("📥 Download CDS Results", variant="secondary")
                cds_download_file = gr.File(label="Download File", visible=False)

            def optimize_and_update(protein_seq, species, method):
                """Run the optimization; return (table, plot, variants, status)."""
                # optimize_cds reports progress through a callable. Collect
                # the messages here and surface the relevant one through the
                # status box, which is wired as an output of this handler.
                status_log = []
                df, plot = optimize_cds(protein_seq, species, method, status_log.append)

                # Number of potential variants.
                variants = calculate_cds_variants(protein_seq) if protein_seq else 0

                if len(df) == 0:
                    # Nothing was generated: show the message optimize_cds gave.
                    final_status = status_log[-1] if status_log else "❌error"
                else:
                    final_status = f"✅ Optimization complete! Generated {len(df)} sequences with {variants:,} potential variants"

                return df, plot, variants, final_status

            def export_cds(results_df):
                """Write the CSV and reveal the (initially hidden) file component."""
                path = download_cds_results(results_df)
                return gr.update(value=path, visible=path is not None)

            optimize_btn.click(
                optimize_and_update,
                inputs=[protein_seq, species, method],
                outputs=[results_table, optimization_plot, variants_display, self.status_display]
            )

            cds_example_btn.click(lambda: EXAMPLE_PROTEIN, outputs=protein_seq)

            download_cds_btn.click(
                export_cds,
                inputs=results_table,
                outputs=cds_download_file
            )

    def mrna_design_tab(self):
        """Tab that runs ``design_mrna`` on a CDS with optional UTR candidate files."""
        with gr.Tab("🧪 mRNA Design"):
            gr.Markdown("## Full mRNA Sequence Design")

            with gr.Row():
                with gr.Column():
                    utr5_upload = gr.File(
                        label="5'UTR Candidates (Optional)",
                        file_types=[".txt"]
                    )
                    utr3_upload = gr.File(
                        label="3'UTR Candidates (Optional)",
                        file_types=[".txt"]
                    )

                with gr.Column():
                    cds_input = gr.Textbox(
                        label="CDS Sequence",
                        placeholder="Enter CDS sequence here...",
                        lines=4
                    )
                    mrna_example_btn = gr.Button("Load Example", variant="secondary")

            design_btn = gr.Button("🎯 Design mRNA", variant="primary")

            design_results = gr.Dataframe(
                label="mRNA Design Results",
                headers=["Rank", "Design", "5'UTR", "3'UTR", "MFE", "Stability", "Sequence"],
                datatype=["number", "str", "str", "str", "str", "str", "str"],
                col_count=(7, "fixed"),
                wrap=True
            )

            with gr.Row():
                download_mrna_btn = gr.Button("📥 Download mRNA Results", variant="secondary")
                mrna_download_file = gr.File(label="Download File", visible=False)

            def design_and_update(utr5_file, utr3_file, cds_seq):
                """Run the design step; return (table, status)."""
                # design_mrna requires a status callback as its fourth
                # argument; collect its messages and report via the output.
                status_log = []
                df = design_mrna(utr5_file, utr3_file, cds_seq, status_log.append)

                if len(df) == 0:
                    # Nothing was generated: show the message design_mrna gave.
                    final_status = status_log[-1] if status_log else "❌error"
                else:
                    final_status = f"✅ mRNA design complete! Generated {len(df)} design variants"

                return df, final_status

            def export_mrna(results_df):
                """Write the CSV and reveal the (initially hidden) file component."""
                path = download_mrna_results(results_df)
                return gr.update(value=path, visible=path is not None)

            design_btn.click(
                design_and_update,
                inputs=[utr5_upload, utr3_upload, cds_input],
                outputs=[design_results, self.status_display]
            )

            mrna_example_btn.click(lambda: EXAMPLE_CDS, outputs=cds_input)

            download_mrna_btn.click(
                export_mrna,
                inputs=design_results,
                outputs=mrna_download_file
            )

    def resources_tab(self):
        """Static tab with links to external databases and tools."""
        with gr.Tab("📚 Resources"):
            gr.Markdown("## Bioinformatics Resources")

            with gr.Row():
                with gr.Column():
                    gr.Markdown("### Databases")
                    # Built from unindented lines so no list item can be
                    # mistaken for an indented Markdown code block.
                    gr.Markdown(
                        "\n"
                        "- [NCBI GenBank](https://www.ncbi.nlm.nih.gov/genbank/)\n"
                        "- [Nucleic Acid Database](https://ngdc.cncb.ac.cn/ncov/)\n"
                        "- [Codon Usage Database](https://www.kazusa.or.jp/codon/)\n"
                        "- [ViralZone](https://viralzone.expasy.org/)\n"
                        "- [bioinformatics tool](https://www.bioinformatics.org/sms2/rev_trans.html)\n"
                    )

                with gr.Column():
                    gr.Markdown("### Tools")
                    gr.Markdown(
                        "\n"
                        "- [mRNA Designer Platform](https://www.biosino.org/mRNAdesigner/main)\n"
                        "- [ViennaRNA Package](https://www.tbi.univie.ac.at/RNA/)\n"
                        "- [BLAST](https://blast.ncbi.nlm.nih.gov/Blast.cgi)\n"
                        "- [Primer3](https://primer3.org/)\n"
                    )

            gr.Markdown("---")
            gr.Markdown("### Contact Information")
            # NOTE(review): the address looks redacted in this copy of the
            # file; confirm the intended contact e-mail.
            gr.Markdown("Academic Collaboration Platform | Email: [email protected]")
622
  if __name__ == "__main__":
623
+ # Instantiate and launch the application
624
+ mtao_web = MaoTaoWeb()
625
+ mtao_web.app.launch(server_name="0.0.0.0", server_port=7860, debug=True)