Jayesh13 committed on
Commit
311e573
·
verified ·
1 Parent(s): 57fe77b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +339 -56
app.py CHANGED
@@ -7,98 +7,381 @@ import xlsxwriter
7
  from io import BytesIO
8
  from collections import Counter
9
  import matplotlib.pyplot as plt # For pie chart
 
10
 
11
- # Set of 20 standard amino acids
12
- AMINO_ACIDS = set("ACDEFGHIKLMNPQRSTVWY")
13
 
14
- st.set_page_config(page_title="Amino Acid Percentage Tool", layout="wide")
15
- st.title("🧬 Amino Acid Percentage Analyzer")
 
 
 
 
16
 
17
- uploaded_file = st.file_uploader("Upload Excel file (with Entry, Protein Name, Sequence)", type=["xlsx"])
 
 
 
 
 
 
 
18
 
19
- if uploaded_file and st.button("Analyze File"):
20
- df = pd.read_excel(uploaded_file)
 
21
 
22
- if len(df.columns) < 3:
23
- st.error("The file must have at least three columns: Entry, Protein Name, Sequence")
24
- else:
25
- entry_col = df.columns[0]
26
- name_col = df.columns[1]
27
- seq_col = df.columns[2]
28
 
29
- all_counts = Counter()
30
- all_length = 0
 
31
 
32
- result_rows = []
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
- for _, row in df.iterrows():
35
- entry = str(row[entry_col])
36
- name = str(row[name_col])
37
- sequence = str(row[seq_col]).replace(" ", "").replace("\"", "").strip().upper()
38
- sequence = ''.join(filter(lambda c: c in AMINO_ACIDS, sequence))
39
- length = len(sequence)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
- if length == 0:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  continue
 
 
 
 
 
 
43
 
44
- count = Counter(sequence)
45
- all_counts.update(count)
46
- all_length += length
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
 
48
- percentage = {aa: round(count[aa] / length * 100, 2) for aa in AMINO_ACIDS}
49
- result_rows.append({"Entry": entry, "Protein Name": name, **percentage})
 
50
 
51
- # Calculate overall percentage
52
- overall_percentage = {aa: round(all_counts[aa] / all_length * 100, 2) for aa in AMINO_ACIDS}
53
- overall_row = {"Entry": "OVERALL", "Protein Name": "ALL SEQUENCES", **overall_percentage}
54
 
55
- # Combine overall row first, then all individual rows
56
- df_result = pd.concat([pd.DataFrame([overall_row]), pd.DataFrame(result_rows)], ignore_index=True)
57
 
58
- st.dataframe(df_result)
 
 
59
 
60
- # 🔵 Pie Chart for Overall Stats
61
- st.subheader("🧁 Overall Amino Acid Composition (Pie Chart)")
 
 
 
62
 
63
- fig, ax = plt.subplots(figsize=(9, 9)) # 50% of typical view size
64
- labels = list(overall_percentage.keys())
65
- sizes = list(overall_percentage.values())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
67
- # Filter out amino acids with 0% to avoid clutter
68
- filtered = [(label, size) for label, size in zip(labels, sizes) if size > 0]
69
- if filtered:
70
- labels, sizes = zip(*filtered)
 
 
 
71
 
72
- ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90, counterclock=False)
73
- ax.axis('equal') # Equal aspect ratio ensures the pie is circular.
74
- st.pyplot(fig)
75
- else:
76
- st.info("No valid amino acids found to display in pie chart.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
- # Export to Excel
79
- def to_excel(df):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  output = BytesIO()
81
  workbook = xlsxwriter.Workbook(output, {'in_memory': True})
82
- worksheet = workbook.add_worksheet("Amino Acid %")
83
 
84
- header_format = workbook.add_format({'bold': True, 'bg_color': '#CDEDF6'})
 
 
85
 
86
  for col_num, col_name in enumerate(df.columns):
87
  worksheet.write(0, col_num, col_name, header_format)
88
 
89
  for row_num, row in enumerate(df.itertuples(index=False), start=1):
90
  for col_num, value in enumerate(row):
91
- worksheet.write(row_num, col_num, value)
 
 
 
 
92
 
93
  workbook.close()
94
  output.seek(0)
95
  return output
96
 
97
- excel_file = to_excel(df_result)
98
 
99
  st.download_button(
100
- label="Download Excel Report",
101
  data=excel_file,
102
- file_name="amino_acid_percentage.xlsx",
103
  mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
104
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  from io import BytesIO
8
  from collections import Counter
9
  import matplotlib.pyplot as plt # For pie chart
10
+ # 🔄 COMBINED STREAMLIT PROTEIN ANALYSIS TOOL WITH COLORED COMPARISON
11
 
12
import os
import subprocess
import sys

# Install the runtime dependencies into the interpreter that is running this
# script. Calling pip through ``sys.executable -m pip`` avoids picking up a
# ``pip`` from PATH that belongs to another Python, and the argument list
# avoids going through a shell. Failures are ignored (check=False), exactly
# like the return code of the previous os.system call was.
# NOTE(review): this still runs on every script execution; declaring these
# packages in requirements.txt would make the call unnecessary.
subprocess.run(
    [
        sys.executable, "-m", "pip", "install",
        "streamlit", "pandas", "xlsxwriter", "openpyxl", "pymongo",
    ],
    check=False,
)
14
 
15
+ import streamlit as st
16
+ import pandas as pd
17
+ import xlsxwriter
18
+ from io import BytesIO
19
+ from collections import defaultdict
20
+ import hashlib
21
 
22
# MongoDB setup: optional cache for computed repeat results.
# ``results_collection`` stays None whenever pymongo is missing or the client
# cannot be created; callers must treat None as "no cache available".
# SECURITY NOTE(review): the fallback URI below embeds database credentials in
# source control. Set the MONGODB_URI environment variable instead, then
# rotate the password and remove the literal.
results_collection = None
try:
    from pymongo import MongoClient
    client = MongoClient(
        os.environ.get(
            "MONGODB_URI",
            "mongodb+srv://dhruvmangroliya:[email protected]/BTP_DB?retryWrites=true&w=majority",
        )
    )
    db = client['BTP_DB']
    results_collection = db['protein_results']
except Exception:
    # Best effort by design: any import/configuration failure simply disables
    # the cache. ``Exception`` (rather than a bare except) lets
    # KeyboardInterrupt and SystemExit propagate.
    results_collection = None
30
 
31
+ # Utility Functions
32
def is_homo_repeat(s):
    """Return True when ``s`` is made of a single repeated character.

    An empty string contains no differing characters, so it also yields True.
    """
    distinct_residues = set(s)
    return len(distinct_residues) <= 1
34
 
35
def hash_sequence(sequence):
    """Return the hexadecimal MD5 digest of ``sequence`` (UTF-8 encoded).

    The digest is used as a lookup key, not for any security purpose.
    """
    digest = hashlib.md5()
    digest.update(sequence.encode())
    return digest.hexdigest()
 
 
 
 
37
 
38
@st.cache_data(show_spinner=False)
def fragment_protein_sequence(sequence, max_length=1000):
    """Cut ``sequence`` into consecutive pieces of at most ``max_length`` characters.

    The final piece holds whatever is left and may be shorter; an empty
    sequence produces an empty list.
    """
    pieces = []
    for start in range(0, len(sequence), max_length):
        pieces.append(sequence[start:start + max_length])
    return pieces
41
 
42
def find_homorepeats(protein):
    """Count runs of two or more identical consecutive residues.

    Args:
        protein: The sequence to scan.

    Returns:
        A ``defaultdict(int)`` mapping each run (for example ``"AAA"``) to the
        number of times a run with exactly that text occurs. Single residues
        are not counted.
    """
    freq = defaultdict(int)
    n = len(protein)
    run_start = 0
    # Walk one position past the end so the final run is flushed by the same
    # branch that handles a change of residue.
    for i in range(1, n + 1):
        if i < n and protein[i] == protein[run_start]:
            continue
        if i - run_start > 1:
            # One slice per run instead of growing a string char by char.
            freq[protein[run_start:i]] += 1
        run_start = i
    return freq
55
 
56
def find_hetero_amino_acid_repeats(sequence):
    """Return every substring of length >= 2 that occurs more than once.

    Occurrences may overlap (``"AAA"`` contains ``"AA"`` twice).

    Args:
        sequence: The text to scan.

    Returns:
        A dict mapping each repeated substring to its number of occurrences,
        ordered by substring length and then by first occurrence.
    """
    repeated = {}
    n = len(sequence)
    for length in range(2, n + 1):
        counts = {}
        for i in range(n - length + 1):
            substring = sequence[i:i + length]
            counts[substring] = counts.get(substring, 0) + 1
        found = {k: v for k, v in counts.items() if v > 1}
        if not found:
            # A repeated substring of length L+1 always starts with a repeated
            # substring of length L, so no longer repeat can exist. Stopping
            # here avoids materialising every longer substring.
            break
        repeated.update(found)
    return repeated
63
+
64
def check_boundary_repeats(fragments, final_repeats, overlap=50):
    """Add repeat counts found around each fragment junction to ``final_repeats``.

    For every pair of neighbouring fragments, the last ``overlap`` characters
    of the left one and the first ``overlap`` characters of the right one are
    joined and scanned with ``find_hetero_amino_acid_repeats``. A repeated
    substring is credited when at least one of its characters occurs in the
    left tail and at least one occurs in the right head.

    ``final_repeats`` is updated in place (it must accept ``+=`` on unseen
    keys) and is also returned.
    """
    for left_fragment, right_fragment in zip(fragments, fragments[1:]):
        left_tail = left_fragment[-overlap:]
        right_head = right_fragment[:overlap]
        junction_counts = find_hetero_amino_acid_repeats(left_tail + right_head)
        for candidate, occurrences in junction_counts.items():
            touches_left = any(residue in left_tail for residue in candidate)
            if not touches_left:
                continue
            touches_right = any(residue in right_head for residue in candidate)
            if touches_right:
                final_repeats[candidate] += occurrences
    return final_repeats
74
+
75
def find_new_boundary_repeats(fragments, final_repeats, overlap=50):
    """Collect junction repeats that are not yet keys of ``final_repeats``.

    The junction window of each neighbouring fragment pair (last ``overlap``
    characters of the left fragment plus first ``overlap`` characters of the
    right one) is scanned with ``find_hetero_amino_acid_repeats``. A repeated
    substring is kept when it shares at least one character with each side of
    the window and is absent from ``final_repeats``.

    Returns a new ``defaultdict(int)``; ``final_repeats`` is only read.
    """
    discovered = defaultdict(int)
    for left_fragment, right_fragment in zip(fragments, fragments[1:]):
        left_tail = left_fragment[-overlap:]
        right_head = right_fragment[:overlap]
        junction_counts = find_hetero_amino_acid_repeats(left_tail + right_head)
        for candidate, occurrences in junction_counts.items():
            if not any(residue in left_tail for residue in candidate):
                continue
            if not any(residue in right_head for residue in candidate):
                continue
            if candidate in final_repeats:
                continue
            discovered[candidate] += occurrences
    return discovered
87
+
88
def _collect_hetero_repeats(fragments, overlap):
    """Return hetero-repeat counts over all fragments and their junctions."""
    counts = defaultdict(int)
    for fragment in fragments:
        for k, v in find_hetero_amino_acid_repeats(fragment).items():
            counts[k] += v
    counts = check_boundary_repeats(fragments, counts, overlap)
    for k, v in find_new_boundary_repeats(fragments, counts, overlap).items():
        counts[k] += v
    # Runs of a single residue belong to the homo-repeat analysis.
    return {k: v for k, v in counts.items() if not is_homo_repeat(k)}


def get_or_process_sequence(sequence, analysis_type, overlap=50):
    """Return the repeat counts for ``sequence``, using the MongoDB cache if present.

    Args:
        sequence: Protein sequence text.
        analysis_type: ``"Homo"``, ``"Hetero"`` or ``"Both"``; any other value
            produces an empty result.
        overlap: Width of the window taken on each side of a fragment junction
            during the hetero analysis.

    Returns:
        A dict mapping each repeat to its count.

    When ``results_collection`` is None the result is still computed; only
    the cache lookup and the cache write are skipped.
    """
    sequence_hash = hash_sequence(f"{sequence}_{analysis_type}")

    if results_collection is not None:
        cached = results_collection.find_one({"_id": sequence_hash})
        if cached:
            return cached["repeats"]

    final_repeats = {}
    if analysis_type in ("Homo", "Both"):
        final_repeats = dict(find_homorepeats(sequence))
    if analysis_type in ("Hetero", "Both"):
        fragments = fragment_protein_sequence(sequence)
        for k, v in _collect_hetero_repeats(fragments, overlap).items():
            final_repeats[k] = final_repeats.get(k, 0) + v

    if results_collection is not None:
        results_collection.insert_one({
            "_id": sequence_hash,
            "sequence": sequence,
            "analysis_type": analysis_type,
            "repeats": final_repeats
        })
    return final_repeats
137
+
138
def process_excel(excel_data, analysis_type):
    """Run the repeat analysis on every sheet of an opened workbook.

    Each sheet must have at least three columns, read by position as
    ID, protein name and sequence. Rows whose sequence cell is empty are
    skipped.

    Args:
        excel_data: Workbook object exposing ``sheet_names`` and ``parse``.
        analysis_type: Passed through to ``get_or_process_sequence``.

    Returns:
        ``(repeats, sequence_data)`` where ``repeats`` is the set of all
        repeat strings seen and ``sequence_data`` is a list of
        ``(entry_id, protein_name, freq)`` tuples, or ``(None, None)`` when a
        sheet has fewer than three columns.
    """
    repeats = set()
    sequence_data = []
    for sheet_name in excel_data.sheet_names:
        df = excel_data.parse(sheet_name)
        if len(df.columns) < 3:
            st.error(f"Error: The sheet '{sheet_name}' must have at least three columns: ID, Protein Name, Sequence")
            return None, None
        # Plain tuples give true positional access regardless of the header
        # labels (integer keys on a label-indexed row are deprecated).
        for row in df.itertuples(index=False, name=None):
            raw_sequence = row[2]
            if pd.isna(raw_sequence):
                # An empty cell would otherwise become the text "nan".
                continue
            sequence = str(raw_sequence).replace('"', '').replace(' ', '').strip()
            if not sequence:
                continue
            entry_id = str(row[0])
            protein_name = str(row[1])
            freq = get_or_process_sequence(sequence, analysis_type)
            sequence_data.append((entry_id, protein_name, freq))
            repeats.update(freq.keys())
    # NOTE(review): reported once per workbook; confirm this matches the
    # intended placement of the notification.
    st.toast(f"{len(sequence_data)} sequences processed.")
    return repeats, sequence_data
159
 
160
def _unique_sheet_name(filename, used_names):
    """Derive a worksheet name from ``filename`` that Excel will accept.

    Characters Excel forbids in sheet names are replaced by ``_``, the name is
    limited to 31 characters, and a numeric suffix is appended when the name
    (compared case-insensitively) was already handed out. ``used_names`` is
    updated with the chosen name.
    """
    cleaned = "".join("_" if ch in "[]:*?/\\" else ch for ch in filename)
    base = cleaned[:31].strip("'") or "Sheet"
    candidate = base
    suffix = 1
    while candidate.lower() in used_names:
        suffix += 1
        tag = f"_{suffix}"
        candidate = base[:31 - len(tag)] + tag
    used_names.add(candidate.lower())
    return candidate


def create_excel(sequences_data, repeats, filenames):
    """Build an in-memory workbook with one sheet of repeat counts per file.

    Args:
        sequences_data: One list of ``(entry_id, protein_name, freq)`` tuples
            per input file.
        repeats: All repeat strings; they become the columns, sorted.
        filenames: Names of the input files, parallel to ``sequences_data``;
            used to name the sheets.

    Returns:
        A ``BytesIO`` positioned at the start of the .xlsx content.
    """
    output = BytesIO()
    workbook = xlsxwriter.Workbook(output, {'in_memory': True})
    # Sort once; the column layout is the same for every sheet and row.
    repeat_columns = sorted(repeats)
    used_names = set()
    for filename, file_data in zip(filenames, sequences_data):
        worksheet = workbook.add_worksheet(_unique_sheet_name(filename, used_names))
        worksheet.write(0, 0, "Entry")
        worksheet.write(0, 1, "Protein Name")
        for col, repeat in enumerate(repeat_columns, start=2):
            worksheet.write(0, col, repeat)
        for row, (entry_id, protein_name, freq) in enumerate(file_data, start=1):
            worksheet.write(row, 0, entry_id)
            worksheet.write(row, 1, protein_name)
            for col, repeat in enumerate(repeat_columns, start=2):
                worksheet.write(row, col, freq.get(repeat, 0))
    workbook.close()
    output.seek(0)
    return output
184
 
185
# Streamlit UI
st.set_page_config(page_title="Protein Tool", layout="wide")
st.title("🧬 Protein Analysis Toolkit")

app_choice = st.radio("Choose an option", ["🔁 Protein Repeat Finder", "📊 Protein Comparator", "🧪 Amino Acid Percentage Analyzer"])

# --- Repeat Finder: analyse uploaded workbooks and offer the result as Excel ---
if app_choice == "🔁 Protein Repeat Finder":
    analysis_type = st.radio("Select analysis type:", ["Homo", "Hetero", "Both"], index=2)
    uploaded_files = st.file_uploader("Upload Excel files", accept_multiple_files=True, type=["xlsx"])

    # Results live in session state so they survive the rerun triggered by
    # the download button and the checkbox below.
    if 'all_sequences_data' not in st.session_state:
        st.session_state.all_sequences_data = []
        st.session_state.all_repeats = set()
        st.session_state.filenames = []
        st.session_state.excel_file = None

    if uploaded_files and st.button("Process Files"):
        st.session_state.all_repeats = set()
        st.session_state.all_sequences_data = []
        st.session_state.filenames = []
        # Drop the previous export too, so a failed run cannot offer stale data.
        st.session_state.excel_file = None
        for file in uploaded_files:
            # The context manager releases the workbook reader after parsing.
            with pd.ExcelFile(file) as excel_data:
                repeats, sequence_data = process_excel(excel_data, analysis_type)
            if repeats is not None:
                st.session_state.all_repeats.update(repeats)
                st.session_state.all_sequences_data.append(sequence_data)
                st.session_state.filenames.append(file.name)
        if st.session_state.all_sequences_data:
            # Report only the files that were actually processed.
            st.toast(f"Processed {len(st.session_state.filenames)} file(s) successfully.")
            st.session_state.excel_file = create_excel(
                st.session_state.all_sequences_data,
                st.session_state.all_repeats,
                st.session_state.filenames
            )

    if st.session_state.excel_file:
        st.download_button(
            label="Download Excel file",
            data=st.session_state.excel_file,
            file_name="protein_repeat_results.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        )

        if st.checkbox("Show Results Table"):
            repeat_columns = sorted(st.session_state.all_repeats)
            rows = []
            for filename, file_data in zip(st.session_state.filenames, st.session_state.all_sequences_data):
                for entry_id, protein_name, freq in file_data:
                    row = {"Filename": filename, "Entry": entry_id, "Protein Name": protein_name}
                    row.update({repeat: freq.get(repeat, 0) for repeat in repeat_columns})
                    rows.append(row)
            result_df = pd.DataFrame(rows)
            st.dataframe(result_df)
239
+
240
# --- Comparator: row-by-row percentage change between two result workbooks ---
# A plain ``if`` is equivalent to the former ``elif`` because the radio button
# yields exactly one of the mutually exclusive choices.
if app_choice == "📊 Protein Comparator":
    st.write("Upload two Excel files with protein data to compare repeat frequencies.")

    file1 = st.file_uploader("Upload First Excel File", type=["xlsx"], key="comp1")
    file2 = st.file_uploader("Upload Second Excel File", type=["xlsx"], key="comp2")

    def _to_number(value):
        """Return ``value`` as a float, treating blanks and non-numbers as 0."""
        try:
            number = float(value)
        except (TypeError, ValueError):
            return 0.0
        return 0.0 if pd.isna(number) else number

    def _percent_change(old, new):
        """Percentage change from ``old`` to ``new``; from zero it is 100 or 0."""
        if old != 0:
            return (new - old) / old * 100
        return 100.0 if new > 0 else 0.0

    def to_excel_with_colors(df):
        """Write ``df`` to an in-memory workbook, colouring changes by sign.

        The first two columns are written as-is (blank for missing values);
        every other column is written as ``"<value>%"`` text, green when
        positive and red when negative.
        """
        output = BytesIO()
        workbook = xlsxwriter.Workbook(output, {'in_memory': True})
        worksheet = workbook.add_worksheet('Comparison')

        green_format = workbook.add_format({'font_color': 'green'})
        red_format = workbook.add_format({'font_color': 'red'})
        header_format = workbook.add_format({'bold': True, 'bg_color': '#D7E4BC'})

        for col_num, col_name in enumerate(df.columns):
            worksheet.write(0, col_num, col_name, header_format)

        for row_num, row in enumerate(df.itertuples(index=False), start=1):
            for col_num, value in enumerate(row):
                if col_num < 2:
                    # The writer rejects NaN, so missing labels become blanks.
                    worksheet.write(row_num, col_num, "" if pd.isna(value) else value)
                else:
                    fmt = green_format if value > 0 else red_format if value < 0 else None
                    worksheet.write(row_num, col_num, f"{value:.2f}%", fmt)

        workbook.close()
        output.seek(0)
        return output

    if file1 and file2:
        df1 = pd.read_excel(file1)
        df2 = pd.read_excel(file2)

        df1.columns = df1.columns.astype(str)
        df2.columns = df2.columns.astype(str)

        if len(df1.columns) < 2:
            st.error("The first file must have at least two columns: Entry, Protein Name")
        else:
            id_col = df1.columns[0]
            name_col = df1.columns[1]
            repeat_columns = df1.columns[2:]

            # Rows are paired by position; extra rows in the longer file are ignored.
            diff_data = []
            for i in range(min(len(df1), len(df2))):
                row1 = df1.iloc[i]
                row2 = df2.iloc[i]
                diff_row = {"Entry": row1[id_col], "Protein Name": row1[name_col]}
                for repeat in repeat_columns:
                    val1 = _to_number(row1.get(repeat, 0))
                    val2 = _to_number(row2.get(repeat, 0))
                    diff_row[repeat] = _percent_change(val1, val2)
                diff_data.append(diff_row)

            result_df = pd.DataFrame(diff_data)
            # Format only the repeat columns: a numeric Entry or Protein Name
            # must not be rendered as a percentage.
            percent_cols = [col for col in repeat_columns if col in result_df.columns]
            st.dataframe(result_df.style.format({col: "{:.2f}%" for col in percent_cols}))

            excel_file = to_excel_with_colors(result_df)

            st.download_button(
                label="Download Colored Comparison Excel",
                data=excel_file,
                file_name="comparison_result_colored.xlsx",
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
            )
305
+
306
# --- Amino Acid Percentage Analyzer: per-protein and overall composition ---
# A plain ``if`` is equivalent to the former ``elif`` because the radio button
# yields exactly one of the mutually exclusive choices.
if app_choice == "🧪 Amino Acid Percentage Analyzer":
    from collections import Counter
    import matplotlib.pyplot as plt  # Needed for pie chart

    # The ordered string fixes the column order of the report; the set is
    # used for fast membership tests while filtering sequences.
    AMINO_ACID_ORDER = "ACDEFGHIKLMNPQRSTVWY"
    AMINO_ACIDS = set(AMINO_ACID_ORDER)

    def to_excel(df):
        """Write ``df`` to an in-memory workbook with a highlighted header row."""
        output = BytesIO()
        workbook = xlsxwriter.Workbook(output, {'in_memory': True})
        worksheet = workbook.add_worksheet("Amino Acid %")
        header_format = workbook.add_format({'bold': True, 'bg_color': '#CDEDF6'})
        for col_num, col_name in enumerate(df.columns):
            worksheet.write(0, col_num, col_name, header_format)
        for row_num, row in enumerate(df.itertuples(index=False), start=1):
            for col_num, value in enumerate(row):
                worksheet.write(row_num, col_num, value)
        workbook.close()
        output.seek(0)
        return output

    uploaded_file = st.file_uploader("Upload Excel file (with Entry, Protein Name, Sequence)", type=["xlsx"])

    if uploaded_file and st.button("Analyze File"):
        df = pd.read_excel(uploaded_file)

        if len(df.columns) < 3:
            st.error("The file must have at least three columns: Entry, Protein Name, Sequence")
        else:
            entry_col = df.columns[0]
            name_col = df.columns[1]
            seq_col = df.columns[2]

            all_counts = Counter()
            all_length = 0
            result_rows = []

            for _, row in df.iterrows():
                raw_sequence = row[seq_col]
                if pd.isna(raw_sequence):
                    # str(NaN) is "nan", which would be counted as N, A, N.
                    continue
                entry = str(row[entry_col])
                name = str(row[name_col])
                sequence = str(raw_sequence).replace(" ", "").replace("\"", "").strip().upper()
                sequence = ''.join(c for c in sequence if c in AMINO_ACIDS)
                length = len(sequence)

                if length == 0:
                    continue

                count = Counter(sequence)
                all_counts.update(count)
                all_length += length
                percentage = {aa: round(count[aa] / length * 100, 2) for aa in AMINO_ACID_ORDER}
                result_rows.append({"Entry": entry, "Protein Name": name, **percentage})

            if all_length == 0:
                # Without this guard the overall percentages divide by zero.
                st.warning("No valid sequences found in the uploaded file.")
            else:
                overall_percentage = {aa: round(all_counts[aa] / all_length * 100, 2) for aa in AMINO_ACID_ORDER}
                overall_row = {"Entry": "OVERALL", "Protein Name": "ALL SEQUENCES", **overall_percentage}
                df_result = pd.concat([pd.DataFrame([overall_row]), pd.DataFrame(result_rows)], ignore_index=True)

                st.dataframe(df_result)

                # 🔵 Pie Chart
                st.subheader("🧁 Overall Amino Acid Composition (Pie Chart)")
                # Residues at 0% are left out to avoid cluttering the chart.
                filtered = [(label, size) for label, size in overall_percentage.items() if size > 0]

                if filtered:
                    labels, sizes = zip(*filtered)
                    fig, ax = plt.subplots(figsize=(9, 9))
                    ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90, counterclock=False)
                    ax.axis('equal')
                    st.pyplot(fig)
                    # Release the figure; pyplot otherwise keeps it alive
                    # across script reruns.
                    plt.close(fig)
                else:
                    st.info("No valid amino acids found to display in pie chart.")

                # Excel Export
                excel_file = to_excel(df_result)

                st.download_button(
                    label="Download Excel Report",
                    data=excel_file,
                    file_name="amino_acid_percentage.xlsx",
                    mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
                )