Jayesh13 committed on
Commit
03aaa04
·
verified ·
1 Parent(s): 78b154d

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +198 -0
app.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ os.system("pip install streamlit pandas xlsxwriter openpyxl pymongo")
3
+
4
+ import streamlit as st
5
+ import pandas as pd
6
+ import xlsxwriter
7
+ from io import BytesIO
8
+ from collections import defaultdict
9
+ from pymongo import MongoClient
10
+ import hashlib
11
+
12
# MongoDB setup
# The connection string carries credentials, so it is read from the
# MONGODB_URI environment variable instead of being committed with the source.
_mongodb_uri = os.environ.get("MONGODB_URI")
if not _mongodb_uri:
    raise RuntimeError(
        "Set the MONGODB_URI environment variable to the MongoDB connection string."
    )
client = MongoClient(_mongodb_uri)
db = client['BTP_DB']
# Collection used as a persistent cache of per-sequence repeat counts.
results_collection = db['protein_results']
16
+
17
+ # Utility
18
def is_homo_repeat(s):
    """Return True when every character of ``s`` is the same.

    An empty string contains no differing characters, so it also yields True.
    """
    return len(set(s)) <= 1
20
+
21
def hash_sequence(sequence, analysis_type, overlap):
    """Build the cache key for one (sequence, analysis type, overlap) request.

    The three values are concatenated without a separator and the MD5 hex
    digest of the resulting text is returned.
    """
    parts = (sequence, analysis_type, str(overlap))
    digest = hashlib.md5("".join(parts).encode())
    return digest.hexdigest()
24
+
25
@st.cache_data(show_spinner=False)
def fragment_protein_sequence(sequence, max_length=1000):
    """Split ``sequence`` into consecutive pieces of at most ``max_length`` characters.

    The pieces are returned in order; only the last one may be shorter than
    ``max_length``. An empty sequence produces an empty list.
    """
    pieces = []
    for start in range(0, len(sequence), max_length):
        pieces.append(sequence[start:start + max_length])
    return pieces
28
+
29
def find_homorepeats(protein):
    """Count maximal runs of one repeated character in ``protein``.

    Each run of two or more identical consecutive characters is used as a key
    (the run text itself, e.g. ``"AAA"``); the value is how many times a run
    with exactly that text occurs. Returns a ``defaultdict(int)``.
    """
    counts = defaultdict(int)
    total = len(protein)
    start = 0
    while start < total:
        # Advance ``end`` to the first position that differs from the run's character.
        end = start + 1
        while end < total and protein[end] == protein[start]:
            end += 1
        if end - start > 1:
            counts[protein[start:end]] += 1
        start = end
    return counts
42
+
43
def find_hetero_amino_acid_repeats(sequence):
    """Return every substring of length >= 2 that occurs more than once.

    Occurrences may overlap. The result maps each repeated substring to its
    number of occurrences, ordered by substring length and then by first
    position in ``sequence``.

    Substrings are counted one length at a time and the scan stops at the
    first length that has no repeated substring: a repeated substring of
    length ``L + 1`` always implies a repeated prefix of length ``L``, so no
    longer repeat can exist. This avoids materialising every substring of
    every length.
    """
    repeats = {}
    n = len(sequence)
    for length in range(2, n + 1):
        counts = defaultdict(int)
        for start in range(n - length + 1):
            counts[sequence[start:start + length]] += 1
        repeated = {sub: count for sub, count in counts.items() if count > 1}
        if not repeated:
            break
        repeats.update(repeated)
    return repeats
50
+
51
def _iter_boundary_repeats(fragments, overlap):
    """Yield ``(substring, count)`` for repeats found around each fragment boundary.

    For every pair of adjacent fragments, the last ``overlap`` characters of
    the left fragment and the first ``overlap`` characters of the right
    fragment are joined and scanned with ``find_hetero_amino_acid_repeats``.
    """
    for left, right in zip(fragments, fragments[1:]):
        # Slice from an explicit start index: ``left[-overlap:]`` would return
        # the whole fragment when ``overlap`` is 0.
        left_overlap = left[max(len(left) - overlap, 0):]
        right_overlap = right[:overlap]
        left_chars = set(left_overlap)
        right_chars = set(right_overlap)
        boundary_repeats = find_hetero_amino_acid_repeats(left_overlap + right_overlap)
        for substring, count in boundary_repeats.items():
            # NOTE(review): this keeps a substring when it shares at least one
            # character with each side; it does not verify that an occurrence
            # actually spans the boundary. Confirm this is the intended filter.
            if not left_chars.isdisjoint(substring) and not right_chars.isdisjoint(substring):
                yield substring, count


def check_boundary_repeats(fragments, final_repeats, overlap=50):
    """Add the repeat counts found around fragment boundaries to ``final_repeats``.

    Args:
        fragments: consecutive pieces of one sequence, in order.
        final_repeats: mapping of substring -> count; it is updated in place
            and must support ``+=`` on keys that are not present yet
            (e.g. a ``defaultdict(int)``).
        overlap: number of characters taken from each side of a boundary.

    Returns:
        The same ``final_repeats`` object.
    """
    for substring, count in _iter_boundary_repeats(fragments, overlap):
        final_repeats[substring] += count
    return final_repeats


def find_new_boundary_repeats(fragments, final_repeats, overlap=50):
    """Collect boundary repeats whose substring is not yet a key of ``final_repeats``.

    Args:
        fragments: consecutive pieces of one sequence, in order.
        final_repeats: mapping of already known substrings; it is not modified.
        overlap: number of characters taken from each side of a boundary.

    Returns:
        A ``defaultdict(int)`` mapping each new substring to its summed count.
    """
    new_repeats = defaultdict(int)
    for substring, count in _iter_boundary_repeats(fragments, overlap):
        if substring not in final_repeats:
            new_repeats[substring] += count
    return new_repeats
74
+
75
def _collect_hetero_repeats(fragments, overlap):
    """Return the non-homo repeat counts for a fragmented sequence."""
    totals = defaultdict(int)
    for fragment in fragments:
        for substring, count in find_hetero_amino_acid_repeats(fragment).items():
            totals[substring] += count
    totals = check_boundary_repeats(fragments, totals, overlap)
    # NOTE(review): check_boundary_repeats has just added every qualifying
    # boundary substring to ``totals``, so this second pass appears to find
    # nothing new. It is kept to preserve the existing pipeline.
    for substring, count in find_new_boundary_repeats(fragments, totals, overlap).items():
        totals[substring] += count
    return {k: v for k, v in totals.items() if not is_homo_repeat(k)}


def get_or_process_sequence(sequence, analysis_type, overlap=50):
    """Return the repeat counts for ``sequence``, using MongoDB as a cache.

    Args:
        sequence: the protein sequence text.
        analysis_type: ``"Homo"``, ``"Hetero"`` or ``"Both"``; any other value
            produces (and caches) an empty result.
        overlap: boundary width passed to the fragment-boundary helpers.

    Returns:
        A mapping of repeat substring -> count. A cache hit returns the stored
        ``"repeats"`` document field as-is.
    """
    sequence_hash = hash_sequence(sequence, analysis_type, overlap)
    cached = results_collection.find_one({"_id": sequence_hash})
    if cached:
        return cached["repeats"]

    final_repeats = defaultdict(int)
    if analysis_type == "Hetero":
        final_repeats = _collect_hetero_repeats(fragment_protein_sequence(sequence), overlap)
    elif analysis_type == "Homo":
        final_repeats = find_homorepeats(sequence)
    elif analysis_type == "Both":
        hetero_repeats = _collect_hetero_repeats(fragment_protein_sequence(sequence), overlap)
        final_repeats = defaultdict(int, find_homorepeats(sequence))
        for k, v in hetero_repeats.items():
            final_repeats[k] += v

    # Save to DB for caching. An upsert is used instead of insert_one so that
    # two sessions computing the same sequence at the same time cannot fail
    # with a duplicate-key error.
    results_collection.replace_one(
        {"_id": sequence_hash},
        {"_id": sequence_hash, "repeats": dict(final_repeats)},
        upsert=True,
    )
    return final_repeats
121
+
122
def process_excel(excel_data, analysis_type):
    """Analyse every row of every sheet in an opened Excel workbook.

    The first three columns of each sheet are read, by position, as entry id,
    protein name and sequence. Double quotes and spaces are removed from the
    sequence before it is analysed.

    Args:
        excel_data: object exposing ``sheet_names`` and ``parse(sheet_name)``
            (the caller passes a ``pandas.ExcelFile``).
        analysis_type: forwarded to ``get_or_process_sequence``.

    Returns:
        ``(repeats, sequence_data)`` where ``repeats`` is the set of all repeat
        substrings seen and ``sequence_data`` is a list of
        ``(entry_id, protein_name, freq)`` tuples; ``(None, None)`` if any
        sheet has fewer than three columns (an error is shown in the UI).
    """
    repeats = set()
    sequence_data = []
    for sheet_name in excel_data.sheet_names:
        df = excel_data.parse(sheet_name)
        if len(df.columns) < 3:
            st.error(f"Error: The sheet '{sheet_name}' must have at least three columns: ID, Protein Name, Sequence")
            return None, None
        # Plain tuples give true positional access; integer indexing on a
        # label-indexed row Series (``row[0]``) is deprecated in pandas.
        for row in df.itertuples(index=False, name=None):
            entry_id = str(row[0])
            protein_name = str(row[1])
            sequence = str(row[2]).replace('"', '').replace(' ', '')
            freq = get_or_process_sequence(sequence, analysis_type)
            sequence_data.append((entry_id, protein_name, freq))
            repeats.update(freq.keys())
    return repeats, sequence_data
138
+
139
def _unique_sheet_name(filename, used_names):
    """Derive a worksheet name from ``filename`` that xlsxwriter will accept.

    Characters that are not allowed in sheet names are replaced with ``_``,
    the name is limited to 31 characters, leading/trailing apostrophes are
    removed, and a numeric suffix is appended when the name (compared
    case-insensitively) has already been used. ``used_names`` is updated.
    """
    cleaned = "".join("_" if ch in "[]:*?/\\" else ch for ch in filename)
    base = cleaned[:31].strip("'") or "Sheet"
    candidate = base
    suffix = 1
    while candidate.lower() in used_names:
        tag = f"_{suffix}"
        candidate = base[:31 - len(tag)] + tag
        suffix += 1
    used_names.add(candidate.lower())
    return candidate


def create_excel(sequences_data, repeats, filenames):
    """Write the analysis results to an in-memory ``.xlsx`` workbook.

    One worksheet is created per input file. Row 0 holds the headers
    ``Entry``, ``Protein Name`` and then one column per repeat in sorted
    order; each following row holds one sequence's counts (0 when the repeat
    is absent).

    Args:
        sequences_data: one list per file of ``(entry_id, protein_name, freq)``.
        repeats: iterable of all repeat substrings to use as columns.
        filenames: file names, parallel to ``sequences_data``; used (after
            sanitising and de-duplicating) as worksheet names.

    Returns:
        A ``BytesIO`` positioned at the start of the workbook bytes.
    """
    output = BytesIO()
    workbook = xlsxwriter.Workbook(output, {'in_memory': True})
    # Sort once; the same column order is used for every sheet and row.
    ordered_repeats = sorted(repeats)
    used_names = set()
    for filename, file_data in zip(filenames, sequences_data):
        worksheet = workbook.add_worksheet(_unique_sheet_name(filename, used_names))
        worksheet.write(0, 0, "Entry")
        worksheet.write(0, 1, "Protein Name")
        for col, repeat in enumerate(ordered_repeats, start=2):
            worksheet.write(0, col, repeat)
        for row, (entry_id, protein_name, freq) in enumerate(file_data, start=1):
            worksheet.write(row, 0, entry_id)
            worksheet.write(row, 1, protein_name)
            for col, repeat in enumerate(ordered_repeats, start=2):
                worksheet.write(row, col, freq.get(repeat, 0))
    workbook.close()
    output.seek(0)
    return output
163
+
164
# Streamlit UI
st.title("Protein Repeat Analysis with Caching")
analysis_type = st.radio("Select analysis type:", ["Homo", "Hetero", "Both"], index=2)
uploaded_files = st.file_uploader("Upload Excel files", accept_multiple_files=True, type=["xlsx"])

if uploaded_files:
    all_repeats = set()
    all_sequences_data = []
    filenames = []
    for file in uploaded_files:
        excel_data = pd.ExcelFile(file)
        repeats, sequence_data = process_excel(excel_data, analysis_type)
        # process_excel returns (None, None) for a workbook it rejected.
        if repeats is not None:
            all_repeats.update(repeats)
            all_sequences_data.append(sequence_data)
            filenames.append(file.name)
    if all_sequences_data:
        # Report the number of files that were actually processed, not the
        # number uploaded (rejected workbooks are skipped above).
        st.success(f"Processed {len(all_sequences_data)} files successfully!")
        excel_file = create_excel(all_sequences_data, all_repeats, filenames)
        st.download_button(
            label="Download Excel file",
            data=excel_file,
            file_name="protein_repeat_results.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        )
        if st.checkbox("Show Results Table"):
            # Sort once instead of once per displayed row.
            sorted_repeats = sorted(all_repeats)
            rows = []
            for filename, file_data in zip(filenames, all_sequences_data):
                for entry_id, protein_name, freq in file_data:
                    row = {"Filename": filename, "Entry": entry_id, "Protein Name": protein_name}
                    row.update({repeat: freq.get(repeat, 0) for repeat in sorted_repeats})
                    rows.append(row)
            result_df = pd.DataFrame(rows)
            st.dataframe(result_df)