Jayesh13 committed on
Commit
3069101
·
verified ·
1 Parent(s): a9cb000

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +282 -0
app.py ADDED
@@ -0,0 +1,282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ os.system("pip install streamlit pandas xlsxwriter openpyxl pymongo")
3
+
4
+ import streamlit as st
5
+ import pandas as pd
6
+ import xlsxwriter
7
+ from io import BytesIO
8
+ from collections import defaultdict
9
+ import hashlib
10
+
11
# Optional MongoDB-backed result cache for the Repeats functionality.
# ``results_collection`` is left as ``None`` whenever the driver is missing
# or the server cannot be reached; callers must compare it against ``None``
# (PyMongo collections do not support truth-value testing).
#
# SECURITY NOTE(review): the fallback URI below embeds credentials in source
# control. Prefer supplying the connection string through the MONGODB_URI
# environment variable and rotate the committed password.
results_collection = None
try:
    from pymongo import MongoClient

    client = MongoClient(
        os.environ.get(
            "MONGODB_URI",
            "mongodb+srv://dhruvmangroliya:[email protected]/BTP_DB?retryWrites=true&w=majority",
        ),
        # Fail fast instead of waiting out the driver's default selection timeout.
        serverSelectionTimeoutMS=5000,
    )
    # MongoClient connects lazily, so force one round trip here; otherwise an
    # unreachable server would only surface on the first query.
    client.admin.command("ping")
    db = client['BTP_DB']
    results_collection = db['protein_results']
except Exception:
    # ``Exception`` rather than a bare ``except`` so KeyboardInterrupt and
    # SystemExit still propagate; any driver/network failure disables the cache.
    results_collection = None
19
+
20
# Page chrome: wide layout plus the toolkit heading.
st.set_page_config(layout="wide", page_title="Protein Tool")
st.title("🧬 Protein Analysis Toolkit")

# The label picked here is compared further down to decide which tool renders.
_tool_labels = ["🔁 Protein Repeat Finder", "📊 Protein Comparator"]
app_choice = st.radio("Choose an option", _tool_labels)
24
+
25
+ # ------------------- REPEATS FUNCTIONALITY -------------------
26
+ if app_choice == "🔁 Protein Repeat Finder":
27
def is_homo_repeat(s):
    """Return True when every character of ``s`` is the same.

    An empty string is also reported as a homo repeat.
    """
    distinct_residues = set(s)
    return len(distinct_residues) <= 1
29
+
30
+ def hash_sequence(sequence):
31
+ return hashlib.md5(sequence.encode()).hexdigest()
32
+
33
@st.cache_data(show_spinner=False)
def fragment_protein_sequence(sequence, max_length=1000):
    """Split ``sequence`` into consecutive chunks of at most ``max_length`` characters.

    The last chunk may be shorter; an empty sequence yields an empty list.
    """
    offsets = range(0, len(sequence), max_length)
    return [sequence[offset:offset + max_length] for offset in offsets]
36
+
37
def find_homorepeats(protein):
    """Count maximal runs of one repeated character in ``protein``.

    Returns a ``defaultdict(int)`` mapping each run of length >= 2
    (e.g. ``"AAA"``) to the number of times a run with exactly that text occurs.
    """
    counts = defaultdict(int)
    total = len(protein)
    start = 0
    while start < total:
        # Advance ``end`` to the first position that breaks the current run.
        end = start + 1
        while end < total and protein[end] == protein[start]:
            end += 1
        if end - start > 1:
            counts[protein[start:end]] += 1
        start = end
    return counts
50
+
51
def find_hetero_amino_acid_repeats(sequence):
    """Return every substring of length >= 2 that occurs more than once.

    Occurrences may overlap. The result maps substring -> occurrence count.
    Every window of every length is enumerated, so the cost grows
    quadratically with ``len(sequence)``.
    """
    total = len(sequence)
    occurrences = defaultdict(int)
    window = 2
    while window <= total:
        for start in range(total - window + 1):
            occurrences[sequence[start:start + window]] += 1
        window += 1

    repeated = {}
    for fragment, seen in occurrences.items():
        if seen > 1:
            repeated[fragment] = seen
    return repeated
58
+
59
def check_boundary_repeats(fragments, final_repeats, overlap=50):
    """Add repeats found around each fragment boundary to ``final_repeats``.

    For every pair of neighbouring fragments, the last ``overlap`` characters
    of the left one and the first ``overlap`` characters of the right one are
    joined and scanned for repeats. A repeat is added when at least one of its
    characters appears in the left slice and at least one appears in the right
    slice (a per-character membership test, not a positional one).

    ``final_repeats`` is updated in place and also returned; it must accept
    ``+=`` on unseen keys (callers pass a ``defaultdict(int)``).
    """
    for left_fragment, right_fragment in zip(fragments, fragments[1:]):
        tail = left_fragment[-overlap:]
        head = right_fragment[:overlap]
        found = find_hetero_amino_acid_repeats(tail + head)
        for candidate, occurrences in found.items():
            touches_tail = any(residue in tail for residue in candidate)
            touches_head = any(residue in head for residue in candidate)
            if touches_tail and touches_head:
                final_repeats[candidate] += occurrences
    return final_repeats
69
+
70
def find_new_boundary_repeats(fragments, final_repeats, overlap=50):
    """Collect boundary repeats that are not yet keys of ``final_repeats``.

    Uses the same boundary window and per-character membership test as
    ``check_boundary_repeats`` but leaves ``final_repeats`` untouched and
    returns the newly seen repeats in a fresh ``defaultdict(int)``.
    """
    discovered = defaultdict(int)
    for left_fragment, right_fragment in zip(fragments, fragments[1:]):
        tail = left_fragment[-overlap:]
        head = right_fragment[:overlap]
        found = find_hetero_amino_acid_repeats(tail + head)
        for candidate, occurrences in found.items():
            if candidate in final_repeats:
                continue
            touches_tail = any(residue in tail for residue in candidate)
            touches_head = any(residue in head for residue in candidate)
            if touches_tail and touches_head:
                discovered[candidate] += occurrences
    return discovered
82
+
83
def _hetero_repeats(fragments, overlap):
    """Count hetero repeats inside each fragment and across fragment boundaries."""
    counts = defaultdict(int)
    for fragment in fragments:
        for repeat, occurrences in find_hetero_amino_acid_repeats(fragment).items():
            counts[repeat] += occurrences
    counts = check_boundary_repeats(fragments, counts, overlap)
    new_repeats = find_new_boundary_repeats(fragments, counts, overlap)
    for repeat, occurrences in new_repeats.items():
        counts[repeat] += occurrences
    # Single-residue runs are reported by the homo analysis, not here.
    return {
        repeat: occurrences
        for repeat, occurrences in counts.items()
        if not is_homo_repeat(repeat)
    }


def get_or_process_sequence(sequence, analysis_type, overlap=50):
    """Return repeat counts for ``sequence``, using the MongoDB cache if present.

    Args:
        sequence: Protein sequence string.
        analysis_type: ``"Homo"``, ``"Hetero"`` or ``"Both"``; any other value
            yields an empty result.
        overlap: Number of characters taken from each side of a fragment
            boundary when looking for boundary-spanning hetero repeats.

    Returns:
        A mapping of repeat substring -> count. On a cache hit this is the
        stored ``"repeats"`` document field.

    The cache is optional: when ``results_collection`` is ``None`` the repeats
    are computed and returned without any database access.
    """
    # PyMongo collections raise NotImplementedError on truth-value testing,
    # so the availability check must be an explicit ``is not None``.
    cache_available = results_collection is not None

    # NOTE(review): the cache key ignores ``overlap``; a result cached with a
    # different overlap would be returned unchanged.
    sequence_hash = hash_sequence(f"{sequence}_{analysis_type}")

    if cache_available:
        cached = results_collection.find_one({"_id": sequence_hash})
        if cached:
            return cached["repeats"]

    if analysis_type == "Hetero":
        final_repeats = _hetero_repeats(fragment_protein_sequence(sequence), overlap)
    elif analysis_type == "Homo":
        final_repeats = find_homorepeats(sequence)
    elif analysis_type == "Both":
        final_repeats = find_homorepeats(sequence)
        hetero = _hetero_repeats(fragment_protein_sequence(sequence), overlap)
        for repeat, occurrences in hetero.items():
            final_repeats[repeat] += occurrences
    else:
        final_repeats = defaultdict(int)

    if cache_available:
        results_collection.insert_one({
            "_id": sequence_hash,
            "sequence": sequence,
            "analysis_type": analysis_type,
            "repeats": dict(final_repeats)
        })

    return final_repeats
135
+
136
def process_excel(excel_data, analysis_type):
    """Analyse every sheet of an opened Excel workbook.

    Each sheet must provide at least three columns, read by position as
    ID, name and sequence. Rows whose sequence cell is blank are skipped.

    Args:
        excel_data: A ``pandas.ExcelFile``.
        analysis_type: Passed through to ``get_or_process_sequence``.

    Returns:
        ``(repeats, sequence_data)`` where ``repeats`` is the set of all repeat
        strings seen and ``sequence_data`` is a list of
        ``(entry_id, protein_name, freq)`` tuples; ``(None, None)`` when a
        sheet has fewer than three columns (an error is shown in the UI).
    """
    repeats = set()
    sequence_data = []
    count = 0
    for sheet_name in excel_data.sheet_names:
        df = excel_data.parse(sheet_name)
        if len(df.columns) < 3:
            st.error(f"Error: Sheet '{sheet_name}' must have at least 3 columns: ID, Name, Sequence.")
            return None, None
        # Select the first three columns by position explicitly; integer keys
        # on a label-indexed row (``row[0]``) rely on a deprecated fallback.
        first_three = df.iloc[:, :3]
        for raw_id, raw_name, raw_sequence in first_three.itertuples(index=False, name=None):
            # A blank cell is NaN and would otherwise be analysed as "nan".
            if pd.isna(raw_sequence):
                continue
            sequence = str(raw_sequence).replace('"', '').replace(' ', '').strip()
            if not sequence:
                continue
            count += 1
            freq = get_or_process_sequence(sequence, analysis_type)
            sequence_data.append((str(raw_id), str(raw_name), freq))
            repeats.update(freq.keys())
    st.toast(f"{count} sequences processed.")
    return repeats, sequence_data
157
+
158
def _safe_sheet_name(filename, used_names):
    """Return a worksheet name that is valid for Excel and unused in this workbook."""
    # Excel forbids these characters in sheet names and limits names to 31 chars.
    cleaned = filename.translate(str.maketrans({ch: "_" for ch in "[]:*?/\\"}))
    base = cleaned[:31]
    candidate = base
    suffix = 1
    # Sheet names must be unique case-insensitively.
    while candidate.lower() in used_names:
        tag = f"_{suffix}"
        candidate = base[:31 - len(tag)] + tag
        suffix += 1
    used_names.add(candidate.lower())
    return candidate


def create_excel(sequences_data, repeats, filenames):
    """Build an in-memory workbook with one sheet per processed file.

    Args:
        sequences_data: One list per file of ``(entry_id, protein_name, freq)``.
        repeats: Iterable of all repeat strings; they become the sorted
            columns after "Entry" and "Protein Name".
        filenames: File names aligned with ``sequences_data``; used as sheet
            names (sanitised, truncated to 31 characters and de-duplicated).

    Returns:
        A ``BytesIO`` positioned at the start of the finished workbook.
    """
    output = BytesIO()
    workbook = xlsxwriter.Workbook(output, {'in_memory': True})
    # Sort once; the column order is the same for every sheet and row.
    ordered_repeats = sorted(repeats)
    used_names = set()
    for file_index, file_data in enumerate(sequences_data):
        sheet_name = _safe_sheet_name(filenames[file_index], used_names)
        worksheet = workbook.add_worksheet(sheet_name)
        worksheet.write(0, 0, "Entry")
        worksheet.write(0, 1, "Protein Name")
        for col, repeat in enumerate(ordered_repeats, start=2):
            worksheet.write(0, col, repeat)
        for row, (entry_id, protein_name, freq) in enumerate(file_data, start=1):
            worksheet.write(row, 0, entry_id)
            worksheet.write(row, 1, protein_name)
            for col, repeat in enumerate(ordered_repeats, start=2):
                worksheet.write(row, col, freq.get(repeat, 0))
    workbook.close()
    output.seek(0)
    return output
182
+
183
# Repeat-finder UI. The guard is restated explicitly so this section only runs
# when the repeat finder is the selected tool.
if app_choice == "🔁 Protein Repeat Finder":
    analysis_type = st.radio("Select analysis type:", ["Homo", "Hetero", "Both"], index=2)
    uploaded_files = st.file_uploader("Upload Excel files", accept_multiple_files=True, type=["xlsx"])

    # Seed session state once so results survive Streamlit reruns.
    if 'all_sequences_data' not in st.session_state:
        st.session_state.all_sequences_data = []
        st.session_state.all_repeats = set()
        st.session_state.filenames = []
        st.session_state.excel_file = None

    if uploaded_files and st.button("Process Files"):
        st.session_state.all_repeats = set()
        st.session_state.all_sequences_data = []
        st.session_state.filenames = []
        # Drop the previous workbook so a failed run cannot offer stale results.
        st.session_state.excel_file = None
        for file in uploaded_files:
            excel_data = pd.ExcelFile(file)
            repeats, sequence_data = process_excel(excel_data, analysis_type)
            # ``None`` signals an invalid workbook; skip it but keep the
            # data/filename lists aligned index by index.
            if repeats is not None:
                st.session_state.all_repeats.update(repeats)
                st.session_state.all_sequences_data.append(sequence_data)
                st.session_state.filenames.append(file.name)
        if st.session_state.all_sequences_data:
            # Report the files that were actually processed, not all uploads.
            st.toast(f"Processed {len(st.session_state.filenames)} file(s) successfully.")
            st.session_state.excel_file = create_excel(
                st.session_state.all_sequences_data,
                st.session_state.all_repeats,
                st.session_state.filenames
            )

    if st.session_state.excel_file:
        st.download_button(
            label="Download Excel file",
            data=st.session_state.excel_file,
            file_name="protein_repeat_results.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        )

        if st.checkbox("Show Results Table"):
            # Sort the repeat columns once instead of once per row.
            ordered_repeats = sorted(st.session_state.all_repeats)
            rows = []
            for file_index, file_data in enumerate(st.session_state.all_sequences_data):
                filename = st.session_state.filenames[file_index]
                for entry_id, protein_name, freq in file_data:
                    row = {"Filename": filename, "Entry": entry_id, "Protein Name": protein_name}
                    row.update({repeat: freq.get(repeat, 0) for repeat in ordered_repeats})
                    rows.append(row)
            result_df = pd.DataFrame(rows)
            st.dataframe(result_df)
229
+
230
+ # ------------------- COMPARATOR FUNCTIONALITY -------------------
231
+ elif app_choice == "📊 Protein Comparator":
232
+ st.write("Upload two Excel files with protein data to compare repeat frequencies.")
233
+
234
+ file1 = st.file_uploader("Upload First Excel File", type=["xlsx"], key="comp1")
235
+ file2 = st.file_uploader("Upload Second Excel File", type=["xlsx"], key="comp2")
236
+
237
+ if file1 and file2:
238
+ df1 = pd.read_excel(file1, header=0)
239
+ df2 = pd.read_excel(file2, header=0)
240
+
241
+ df1.columns = df1.columns.astype(str)
242
+ df2.columns = df2.columns.astype(str)
243
+
244
+ id_col = df1.columns[0]
245
+ name_col = df1.columns[1]
246
+ repeat_columns = df1.columns[2:]
247
+
248
+ differences = []
249
+
250
+ for i in range(len(df1)):
251
+ row1 = df1.iloc[i]
252
+ row2 = df2.iloc[i] if i < len(df2) else None
253
+ if row2 is not None:
254
+ diff_row = {
255
+ "Entry": row1[id_col],
256
+ "Protein Name": row1[name_col]
257
+ }
258
+ for repeat in repeat_columns:
259
+ val1 = row1.get(repeat, 0)
260
+ val2 = row2.get(repeat, 0)
261
+ diff_row[repeat] = abs(val1 - val2)
262
+ differences.append(diff_row)
263
+
264
+ result_df = pd.DataFrame(differences)
265
+ st.dataframe(result_df)
266
+
267
+ def to_excel(df):
268
+ output = BytesIO()
269
+ writer = pd.ExcelWriter(output, engine='xlsxwriter')
270
+ df.to_excel(writer, index=False, sheet_name='Comparison')
271
+ writer.close()
272
+ output.seek(0)
273
+ return output
274
+
275
+ excel_file = to_excel(result_df)
276
+
277
+ st.download_button(
278
+ label="Download Comparison Excel",
279
+ data=excel_file,
280
+ file_name="comparison_result.xlsx",
281
+ mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
282
+ )