Jayesh13 committed on
Commit
4675cde
·
verified ·
1 Parent(s): 4edb22c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +220 -254
app.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  import os
2
  os.system("pip install streamlit pandas xlsxwriter openpyxl pymongo")
3
 
@@ -8,7 +10,7 @@ from io import BytesIO
8
  from collections import defaultdict
9
  import hashlib
10
 
11
- # Optional for Repeats Functionality
12
  try:
13
  from pymongo import MongoClient
14
  client = MongoClient("mongodb+srv://dhruvmangroliya:[email protected]/BTP_DB?retryWrites=true&w=majority")
@@ -17,169 +19,167 @@ try:
17
  except:
18
  results_collection = None
19
 
20
- st.set_page_config(page_title="Protein Tool", layout="wide")
21
- st.title("🧬 Protein Analysis Toolkit")
22
-
23
- app_choice = st.radio("Choose an option", ["πŸ” Protein Repeat Finder", "πŸ“Š Protein Comparator"])
24
-
25
- # ------------------- REPEATS FUNCTIONALITY -------------------
26
- if app_choice == "πŸ” Protein Repeat Finder":
27
- def is_homo_repeat(s):
28
- return all(c == s[0] for c in s)
29
-
30
- def hash_sequence(sequence):
31
- return hashlib.md5(sequence.encode()).hexdigest()
32
-
33
- @st.cache_data(show_spinner=False)
34
- def fragment_protein_sequence(sequence, max_length=1000):
35
- return [sequence[i:i+max_length] for i in range(0, len(sequence), max_length)]
36
-
37
- def find_homorepeats(protein):
38
- n = len(protein)
39
- freq = defaultdict(int)
40
- i = 0
41
- while i < n:
42
- curr = protein[i]
43
- repeat = ""
44
- while i < n and curr == protein[i]:
45
- repeat += protein[i]
46
- i += 1
47
- if len(repeat) > 1:
48
- freq[repeat] += 1
49
- return freq
50
-
51
- def find_hetero_amino_acid_repeats(sequence):
52
- repeat_counts = defaultdict(int)
53
- for length in range(2, len(sequence) + 1):
54
- for i in range(len(sequence) - length + 1):
55
- substring = sequence[i:i+length]
56
- repeat_counts[substring] += 1
57
- return {k: v for k, v in repeat_counts.items() if v > 1}
58
-
59
- def check_boundary_repeats(fragments, final_repeats, overlap=50):
60
- for i in range(len(fragments) - 1):
61
- left_overlap = fragments[i][-overlap:]
62
- right_overlap = fragments[i + 1][:overlap]
63
- overlap_region = left_overlap + right_overlap
64
- boundary_repeats = find_hetero_amino_acid_repeats(overlap_region)
65
- for substring, count in boundary_repeats.items():
66
- if any(aa in left_overlap for aa in substring) and any(aa in right_overlap for aa in substring):
67
- final_repeats[substring] += count
68
- return final_repeats
69
-
70
- def find_new_boundary_repeats(fragments, final_repeats, overlap=50):
71
- new_repeats = defaultdict(int)
72
- for i in range(len(fragments) - 1):
73
- left_overlap = fragments[i][-overlap:]
74
- right_overlap = fragments[i + 1][:overlap]
75
- overlap_region = left_overlap + right_overlap
76
- boundary_repeats = find_hetero_amino_acid_repeats(overlap_region)
77
- for substring, count in boundary_repeats.items():
78
- if any(aa in left_overlap for aa in substring) and any(aa in right_overlap for aa in substring):
79
- if substring not in final_repeats:
80
- new_repeats[substring] += count
81
- return new_repeats
82
-
83
- def get_or_process_sequence(sequence, analysis_type, overlap=50):
84
- if results_collection is None:
85
- return {}
86
-
87
- hash_input = f"{sequence}_{analysis_type}"
88
- sequence_hash = hash_sequence(hash_input)
89
- cached = results_collection.find_one({"_id": sequence_hash})
90
- if cached:
91
- return cached["repeats"]
92
-
93
- fragments = fragment_protein_sequence(sequence)
94
- final_repeats = defaultdict(int)
95
-
96
- if analysis_type == "Hetero":
97
- for fragment in fragments:
98
- fragment_repeats = find_hetero_amino_acid_repeats(fragment)
99
- for k, v in fragment_repeats.items():
100
- final_repeats[k] += v
101
- final_repeats = check_boundary_repeats(fragments, final_repeats, overlap)
102
- new_repeats = find_new_boundary_repeats(fragments, final_repeats, overlap)
103
- for k, v in new_repeats.items():
104
  final_repeats[k] += v
105
- final_repeats = {k: v for k, v in final_repeats.items() if not is_homo_repeat(k)}
106
-
107
- elif analysis_type == "Homo":
108
- final_repeats = find_homorepeats(sequence)
109
-
110
- elif analysis_type == "Both":
111
- hetero_repeats = defaultdict(int)
112
- for fragment in fragments:
113
- fragment_repeats = find_hetero_amino_acid_repeats(fragment)
114
- for k, v in fragment_repeats.items():
115
- hetero_repeats[k] += v
116
- hetero_repeats = check_boundary_repeats(fragments, hetero_repeats, overlap)
117
- new_repeats = find_new_boundary_repeats(fragments, hetero_repeats, overlap)
118
- for k, v in new_repeats.items():
119
  hetero_repeats[k] += v
120
- hetero_repeats = {k: v for k, v in hetero_repeats.items() if not is_homo_repeat(k)}
121
-
122
- homo_repeats = find_homorepeats(sequence)
123
- final_repeats = homo_repeats.copy()
124
- for k, v in hetero_repeats.items():
125
- final_repeats[k] += v
126
-
127
- results_collection.insert_one({
128
- "_id": sequence_hash,
129
- "sequence": sequence,
130
- "analysis_type": analysis_type,
131
- "repeats": dict(final_repeats)
132
- })
133
-
134
- return final_repeats
135
-
136
- def process_excel(excel_data, analysis_type):
137
- repeats = set()
138
- sequence_data = []
139
- count = 0
140
- for sheet_name in excel_data.sheet_names:
141
- df = excel_data.parse(sheet_name)
142
- if len(df.columns) < 3:
143
- st.error(f"Error: Sheet '{sheet_name}' must have at least 3 columns: ID, Name, Sequence.")
144
- return None, None
145
- for _, row in df.iterrows():
146
- entry_id = str(row[0])
147
- protein_name = str(row[1])
148
- sequence = str(row[2]).replace('"', '').replace(' ', '').strip()
149
- if not sequence:
150
- continue
151
- count += 1
152
- freq = get_or_process_sequence(sequence, analysis_type)
153
- sequence_data.append((entry_id, protein_name, freq))
154
- repeats.update(freq.keys())
155
- st.toast(f"{count} sequences processed.")
156
- return repeats, sequence_data
157
-
158
- def create_excel(sequences_data, repeats, filenames):
159
- output = BytesIO()
160
- workbook = xlsxwriter.Workbook(output, {'in_memory': True})
161
- for file_index, file_data in enumerate(sequences_data):
162
- filename = filenames[file_index]
163
- worksheet = workbook.add_worksheet(filename[:31])
164
- worksheet.write(0, 0, "Entry")
165
- worksheet.write(0, 1, "Protein Name")
 
 
 
 
 
 
 
 
 
 
166
  col = 2
167
  for repeat in sorted(repeats):
168
- worksheet.write(0, col, repeat)
169
  col += 1
170
- row = 1
171
- for entry_id, protein_name, freq in file_data:
172
- worksheet.write(row, 0, entry_id)
173
- worksheet.write(row, 1, protein_name)
174
- col = 2
175
- for repeat in sorted(repeats):
176
- worksheet.write(row, col, freq.get(repeat, 0))
177
- col += 1
178
- row += 1
179
- workbook.close()
180
- output.seek(0)
181
- return output
182
 
 
 
 
 
 
 
 
183
  analysis_type = st.radio("Select analysis type:", ["Homo", "Hetero", "Both"], index=2)
184
  uploaded_files = st.file_uploader("Upload Excel files", accept_multiple_files=True, type=["xlsx"])
185
 
@@ -227,101 +227,67 @@ if app_choice == "πŸ” Protein Repeat Finder":
227
  result_df = pd.DataFrame(rows)
228
  st.dataframe(result_df)
229
 
230
- # ------------------- COMPARATOR FUNCTIONALITY -------------------
231
- # ------------------- COMPARATOR FUNCTIONALITY -------------------
232
  elif app_choice == "πŸ“Š Protein Comparator":
233
- # st.set_page_config(page_title="Protein Repeat Comparator", layout="centered")
234
- st.title("🧬 Protein Repeat Comparator")
235
- st.write("Upload two Excel files with protein data. Frequency values should start from the first row (header).")
236
-
237
- uploaded_file1 = st.file_uploader("Upload First Excel File", type=["xlsx"], key="comp1")
238
- uploaded_file2 = st.file_uploader("Upload Second Excel File", type=["xlsx"], key="comp2")
239
-
240
- if uploaded_file1 and uploaded_file2:
241
- try:
242
- df1 = pd.read_excel(uploaded_file1, header=0)
243
- df2 = pd.read_excel(uploaded_file2, header=0)
244
-
245
- df1.columns = df1.columns.astype(str)
246
- df2.columns = df2.columns.astype(str)
247
-
248
- id_col = df1.columns[0]
249
- name_col = df1.columns[1]
250
- repeat_columns = df1.columns[2:]
251
-
252
- differences = []
253
-
254
- for _, row1 in df1.iterrows():
255
- entry_id = row1[id_col]
256
- protein_name = row1[name_col]
257
-
258
- row2_match = df2[(df2[id_col] == entry_id) & (df2[name_col] == protein_name)]
259
- if row2_match.empty:
260
- continue
261
-
262
- row2 = row2_match.iloc[0]
263
-
264
- for repeat_col in repeat_columns:
265
- freq1 = row1[repeat_col]
266
- freq2 = row2[repeat_col]
267
-
268
- if pd.isna(freq1) or pd.isna(freq2):
269
- continue
270
-
271
- if freq1 != freq2:
272
- if freq1 == 0:
273
- pct_change = "Infinity"
274
- else:
275
- pct_change = ((freq2 - freq1) / freq1) * 100
276
- pct_change = round(pct_change, 2)
277
-
278
- diff = abs(freq1 - freq2)
279
- differences.append({
280
- id_col: entry_id,
281
- name_col: protein_name,
282
- "Repeat": repeat_col,
283
- "Frequency File 1": freq1,
284
- "Frequency File 2": freq2,
285
- "Difference": diff,
286
- "%age Change": pct_change
287
- })
288
-
289
- if differences:
290
- result_df = pd.DataFrame(differences)
291
- result_df = result_df.sort_values(by="Difference", ascending=False)
292
-
293
- # Show DataFrame in Streamlit app
294
- st.subheader("πŸ” View Changed Repeats")
295
- st.dataframe(result_df, use_container_width=True)
296
-
297
- # Apply styling
298
- def color_pct(val):
299
- if isinstance(val, str) and val == "Infinity":
300
- return 'color: green'
301
- elif isinstance(val, (int, float)):
302
- if val > 0:
303
- return 'color: green'
304
- elif val < 0:
305
- return 'color: red'
306
- return ''
307
-
308
- styled_df = result_df.style.applymap(color_pct, subset=["%age Change"])
309
-
310
- # Save styled output
311
- output = BytesIO()
312
- with pd.ExcelWriter(output, engine='openpyxl') as writer:
313
- styled_df.to_excel(writer, index=False, sheet_name="Changed Repeats")
314
- output.seek(0)
315
-
316
- st.download_button(
317
- label="πŸ“₯ Download Excel File",
318
- data=output,
319
- file_name="changed_repeats_with_percentage.xlsx",
320
- mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
321
- )
322
- else:
323
- st.info("No changes in repeat frequencies were found.")
324
-
325
- except Exception as e:
326
- st.error(f"⚠ Error: {e}")
327
 
 
 
 
 
 
 
 
1
+ # πŸ”„ COMBINED STREAMLIT PROTEIN ANALYSIS TOOL WITH COLORED COMPARISON
2
+
3
  import os
4
  os.system("pip install streamlit pandas xlsxwriter openpyxl pymongo")
5
 
 
10
  from collections import defaultdict
11
  import hashlib
12
 
13
+ # MongoDB Setup
14
  try:
15
  from pymongo import MongoClient
16
  client = MongoClient("mongodb+srv://dhruvmangroliya:[email protected]/BTP_DB?retryWrites=true&w=majority")
 
19
  except:
20
  results_collection = None
21
 
22
+ # Utility Functions
23
def is_homo_repeat(s):
    """Return True when every character of ``s`` equals its first character.

    An empty string yields True because there is no character to disagree.
    """
    for residue in s:
        if residue != s[0]:
            return False
    return True
25
+
26
def hash_sequence(sequence):
    """Return the hexadecimal MD5 digest of ``sequence`` (encoded with ``str.encode()``)."""
    digest = hashlib.md5()
    digest.update(sequence.encode())
    return digest.hexdigest()
28
+
29
@st.cache_data(show_spinner=False)
def fragment_protein_sequence(sequence, max_length=1000):
    """Split ``sequence`` into consecutive, non-overlapping pieces.

    Every piece has ``max_length`` characters except possibly the last one,
    which holds whatever remains. An empty sequence gives an empty list.
    """
    pieces = []
    for start in range(0, len(sequence), max_length):
        pieces.append(sequence[start:start + max_length])
    return pieces
32
+
33
def find_homorepeats(protein):
    """Count maximal runs of a single repeated residue in ``protein``.

    A run is a maximal stretch of identical consecutive characters; only runs
    of length two or more are recorded. The key is the run itself (for example
    ``"AAA"``) and the value is how many separate times a run of exactly that
    residue and length occurs.

    Args:
        protein: The sequence to scan.

    Returns:
        A ``defaultdict(int)`` mapping each run string to its occurrence count,
        in order of first appearance.
    """
    # Function-scope import keeps this block usable without touching the
    # file's import header.
    from itertools import groupby

    freq = defaultdict(int)
    # groupby yields one group per maximal run, so each run is measured once
    # instead of being rebuilt character by character with string +=.
    for residue, run in groupby(protein):
        run_length = sum(1 for _ in run)
        if run_length > 1:
            freq[residue * run_length] += 1
    return freq
46
+
47
def find_hetero_amino_acid_repeats(sequence):
    """Count every substring of length >= 2 that occurs more than once.

    Occurrences may overlap (``"AAAA"`` contains ``"AA"`` three times).
    Homopolymer substrings are included here as well; no filtering by
    composition is done in this function.

    Args:
        sequence: The string to scan.

    Returns:
        A dict mapping each repeated substring to its occurrence count,
        ordered by substring length and then by first position.
    """
    repeats = {}
    total = len(sequence)
    for length in range(2, total + 1):
        counts = defaultdict(int)
        for start in range(total - length + 1):
            counts[sequence[start:start + length]] += 1
        repeated = {sub: seen for sub, seen in counts.items() if seen > 1}
        if not repeated:
            # Any substring of length L+1 that occurs twice has a length-L
            # prefix that also occurs twice, so once a length has no repeats
            # no longer length can have any. Stopping here returns exactly
            # the same result without enumerating the remaining substrings.
            break
        repeats.update(repeated)
    return repeats
54
+
55
def check_boundary_repeats(fragments, final_repeats, overlap=50):
    """Add repeats found around each fragment boundary to ``final_repeats``.

    For every pair of neighbouring fragments, the last ``overlap`` characters
    of the left fragment are joined to the first ``overlap`` characters of the
    right fragment and scanned with ``find_hetero_amino_acid_repeats``.

    A repeat is added when at least one of its characters appears somewhere in
    the left tail and at least one appears somewhere in the right head. The
    test is character-level; it does not check that an occurrence of the
    repeat physically spans the boundary.
    NOTE(review): confirm that this is the intended criterion.

    ``final_repeats`` is updated in place and also returned.
    """
    for left_fragment, right_fragment in zip(fragments, fragments[1:]):
        tail = left_fragment[-overlap:]
        head = right_fragment[:overlap]
        window_repeats = find_hetero_amino_acid_repeats(tail + head)
        for candidate, occurrences in window_repeats.items():
            touches_tail = any(residue in tail for residue in candidate)
            touches_head = any(residue in head for residue in candidate)
            if not (touches_tail and touches_head):
                continue
            final_repeats[candidate] += occurrences
    return final_repeats
65
+
66
def find_new_boundary_repeats(fragments, final_repeats, overlap=50):
    """Collect boundary repeats that are not yet keys of ``final_repeats``.

    The boundary windows and the character-level acceptance test are the same
    as in ``check_boundary_repeats``: the last ``overlap`` characters of one
    fragment joined to the first ``overlap`` characters of the next, with a
    repeat accepted when some character of it appears in each half.

    ``final_repeats`` is only read. The result is a new ``defaultdict(int)``
    mapping each previously unseen repeat to its summed count.
    """
    new_repeats = defaultdict(int)
    for left_fragment, right_fragment in zip(fragments, fragments[1:]):
        tail = left_fragment[-overlap:]
        head = right_fragment[:overlap]
        window_repeats = find_hetero_amino_acid_repeats(tail + head)
        for candidate, occurrences in window_repeats.items():
            if candidate in final_repeats:
                continue
            touches_tail = any(residue in tail for residue in candidate)
            touches_head = any(residue in head for residue in candidate)
            if touches_tail and touches_head:
                new_repeats[candidate] += occurrences
    return new_repeats
78
+
79
def _collect_hetero_repeats(fragments, overlap):
    """Return non-homopolymer repeat counts from ``fragments`` and their boundaries."""
    counts = defaultdict(int)
    for fragment in fragments:
        for repeat, seen in find_hetero_amino_acid_repeats(fragment).items():
            counts[repeat] += seen
    counts = check_boundary_repeats(fragments, counts, overlap)
    # find_new_boundary_repeats is deliberately not called here. It applies the
    # same acceptance test to the same windows and keeps only repeats that are
    # not already keys of `counts`; check_boundary_repeats has just inserted
    # every accepted repeat, so that call could only ever return nothing.
    return {repeat: seen for repeat, seen in counts.items() if not is_homo_repeat(repeat)}


def get_or_process_sequence(sequence, analysis_type, overlap=50):
    """Return repeat counts for ``sequence``, using MongoDB as an optional cache.

    Args:
        sequence: The protein sequence to analyse.
        analysis_type: ``"Homo"``, ``"Hetero"`` or ``"Both"``. Any other value
            produces an empty result.
        overlap: Number of characters taken from each side of a fragment
            boundary when looking for boundary repeats.

    Returns:
        A mapping from repeat string to count. A cached document's ``repeats``
        field is returned as-is when one exists for this sequence and
        analysis type.

    When ``results_collection`` is ``None`` the analysis still runs; only the
    cache lookup and the cache write are skipped. Previously this case
    returned ``{}`` without analysing anything.
    """
    sequence_hash = hash_sequence(f"{sequence}_{analysis_type}")

    if results_collection is not None:
        cached = results_collection.find_one({"_id": sequence_hash})
        if cached:
            return cached["repeats"]

    final_repeats = defaultdict(int)
    if analysis_type == "Homo":
        final_repeats = find_homorepeats(sequence)
    elif analysis_type in ("Hetero", "Both"):
        fragments = fragment_protein_sequence(sequence)
        hetero_repeats = _collect_hetero_repeats(fragments, overlap)
        if analysis_type == "Hetero":
            final_repeats = hetero_repeats
        else:
            # Homopolymer runs come from the whole sequence, not the fragments,
            # so runs crossing a fragment boundary are counted once.
            final_repeats = find_homorepeats(sequence).copy()
            for repeat, seen in hetero_repeats.items():
                final_repeats[repeat] += seen

    if results_collection is not None:
        results_collection.insert_one({
            "_id": sequence_hash,
            "sequence": sequence,
            "analysis_type": analysis_type,
            "repeats": dict(final_repeats)
        })
    return final_repeats
128
+
129
def process_excel(excel_data, analysis_type):
    """Analyse every sequence in every sheet of an opened Excel workbook.

    Each sheet must have at least three columns, read by position: ID,
    protein name, sequence. Double quotes and spaces are removed from the
    sequence before analysis. Rows whose sequence cell is blank are skipped.

    Args:
        excel_data: Object exposing ``sheet_names`` and ``parse(sheet_name)``
            (presumably a ``pandas.ExcelFile`` - confirm against the caller).
        analysis_type: Passed through to ``get_or_process_sequence``.

    Returns:
        ``(repeats, sequence_data)`` where ``repeats`` is the set of all repeat
        strings seen and ``sequence_data`` is a list of
        ``(entry_id, protein_name, freq)`` tuples; or ``(None, None)`` after
        reporting an error for a sheet with fewer than three columns.
    """
    repeats = set()
    sequence_data = []
    count = 0
    for sheet_name in excel_data.sheet_names:
        df = excel_data.parse(sheet_name)
        if len(df.columns) < 3:
            st.error(f"Error: The sheet '{sheet_name}' must have at least three columns: ID, Protein Name, Sequence")
            return None, None
        for _, row in df.iterrows():
            # Use .iloc for positional access: row[0] on a label-indexed
            # Series is deprecated positional fallback in pandas.
            raw_sequence = row.iloc[2]
            if pd.isna(raw_sequence):
                # A blank cell would otherwise become the literal text "nan"
                # and be analysed as if it were a protein.
                continue
            entry_id = str(row.iloc[0])
            protein_name = str(row.iloc[1])
            sequence = str(raw_sequence).replace('"', '').replace(' ', '').strip()
            if not sequence:
                continue
            count += 1
            freq = get_or_process_sequence(sequence, analysis_type)
            sequence_data.append((entry_id, protein_name, freq))
            repeats.update(freq.keys())
        # The running total is reported once per sheet.
        st.toast(f"{count} sequences processed.")
    return repeats, sequence_data
150
+
151
def create_excel(sequences_data, repeats, filenames):
    """Build an in-memory workbook with one worksheet per input file.

    Each worksheet has the columns ``Entry``, ``Protein Name`` and then one
    column per repeat in sorted order; a repeat missing from a protein's
    counts is written as 0.

    Args:
        sequences_data: One list per file of ``(entry_id, protein_name, freq)``
            tuples, where ``freq`` maps repeat string to count.
        repeats: Iterable of all repeat strings to use as columns.
        filenames: Names matching ``sequences_data`` by index; each is
            truncated to 31 characters for use as the worksheet name.

    Returns:
        A ``BytesIO`` positioned at the start of the finished workbook.
    """
    output = BytesIO()
    workbook = xlsxwriter.Workbook(output, {'in_memory': True})
    # Sort once: the column order is the same for every sheet and every row,
    # so re-sorting per row only repeated identical work.
    ordered_repeats = sorted(repeats)
    for file_index, file_data in enumerate(sequences_data):
        filename = filenames[file_index]
        worksheet = workbook.add_worksheet(filename[:31])
        worksheet.write(0, 0, "Entry")
        worksheet.write(0, 1, "Protein Name")
        for col, repeat in enumerate(ordered_repeats, start=2):
            worksheet.write(0, col, repeat)
        for row, (entry_id, protein_name, freq) in enumerate(file_data, start=1):
            worksheet.write(row, 0, entry_id)
            worksheet.write(row, 1, protein_name)
            for col, repeat in enumerate(ordered_repeats, start=2):
                worksheet.write(row, col, freq.get(repeat, 0))
    workbook.close()
    output.seek(0)
    return output
 
 
 
 
 
 
 
 
175
 
176
+ # Streamlit UI
177
+ st.set_page_config(page_title="Protein Tool", layout="wide")
178
+ st.title("🧬 Protein Analysis Toolkit")
179
+
180
+ app_choice = st.radio("Choose an option", ["πŸ” Protein Repeat Finder", "πŸ“Š Protein Comparator"])
181
+
182
+ if app_choice == "πŸ” Protein Repeat Finder":
183
  analysis_type = st.radio("Select analysis type:", ["Homo", "Hetero", "Both"], index=2)
184
  uploaded_files = st.file_uploader("Upload Excel files", accept_multiple_files=True, type=["xlsx"])
185
 
 
227
  result_df = pd.DataFrame(rows)
228
  st.dataframe(result_df)
229
 
 
 
230
  elif app_choice == "πŸ“Š Protein Comparator":
231
+ st.write("Upload two Excel files with protein data to compare repeat frequencies.")
232
+
233
+ file1 = st.file_uploader("Upload First Excel File", type=["xlsx"], key="comp1")
234
+ file2 = st.file_uploader("Upload Second Excel File", type=["xlsx"], key="comp2")
235
+
236
if file1 and file2:
    df1 = pd.read_excel(file1)
    df2 = pd.read_excel(file2)

    # Headers are converted to strings so column lookups use one key type in
    # both frames.
    df1.columns = df1.columns.astype(str)
    df2.columns = df2.columns.astype(str)

    # The layout is taken from the first file: column 0 is the ID, column 1
    # the protein name, and every remaining column a repeat frequency.
    id_col = df1.columns[0]
    name_col = df1.columns[1]
    repeat_columns = df1.columns[2:]

    # NOTE(review): rows are paired by position, not by ID, and labelled with
    # the first file's ID/name. Confirm both files always list the same
    # proteins in the same order.
    diff_data = []
    for (_, row1), (_, row2) in zip(df1.iterrows(), df2.iterrows()):
        diff_row = {"Entry": row1[id_col], "Protein Name": row1[name_col]}
        for repeat in repeat_columns:
            val1 = row1.get(repeat, 0)
            val2 = row2.get(repeat, 0)
            # Percentage change relative to file 1. With a zero baseline the
            # change is reported as 100 when the repeat appears, else 0.
            change = ((val2 - val1) / val1 * 100) if val1 != 0 else (100 if val2 > 0 else 0)
            diff_row[repeat] = change
        diff_data.append(diff_row)

    result_df = pd.DataFrame(diff_data)
    # Format only the percentage columns. Without `subset` the float format is
    # also applied to the Entry / Protein Name columns and raises ValueError
    # on string values when the table is rendered.
    percent_columns = list(result_df.columns[2:])
    st.dataframe(result_df.style.format("{:.2f}%", subset=percent_columns))

    def to_excel_with_colors(df):
        """Write ``df`` to an in-memory workbook with coloured percentage cells.

        The first two columns are written unchanged. Every other value is
        written as text with two decimals and a ``%`` suffix, in green when
        positive, red when negative and unformatted otherwise.

        Returns:
            A ``BytesIO`` positioned at the start of the finished workbook.
        """
        output = BytesIO()
        workbook = xlsxwriter.Workbook(output, {'in_memory': True})
        worksheet = workbook.add_worksheet('Comparison')

        green_format = workbook.add_format({'font_color': 'green'})
        red_format = workbook.add_format({'font_color': 'red'})
        header_format = workbook.add_format({'bold': True, 'bg_color': '#D7E4BC'})

        for col_num, col_name in enumerate(df.columns):
            worksheet.write(0, col_num, col_name, header_format)

        for row_num, row in enumerate(df.itertuples(index=False), start=1):
            for col_num, value in enumerate(row):
                if col_num < 2:
                    worksheet.write(row_num, col_num, value)
                else:
                    fmt = green_format if value > 0 else red_format if value < 0 else None
                    worksheet.write(row_num, col_num, f"{value:.2f}%", fmt)

        workbook.close()
        output.seek(0)
        return output

    excel_file = to_excel_with_colors(result_df)

    st.download_button(
        label="Download Colored Comparison Excel",
        data=excel_file,
        file_name="comparison_result_colored.xlsx",
        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    )