Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -14,7 +14,7 @@ client = MongoClient("mongodb+srv://dhruvmangroliya:[email protected]
|
|
14 |
db = client['BTP_DB']
|
15 |
results_collection = db['protein_results']
|
16 |
|
17 |
-
# Utility
|
18 |
def is_homo_repeat(s):
|
19 |
return all(c == s[0] for c in s)
|
20 |
|
@@ -72,10 +72,8 @@ def find_new_boundary_repeats(fragments, final_repeats, overlap=50):
|
|
72 |
return new_repeats
|
73 |
|
74 |
def get_or_process_sequence(sequence, analysis_type, overlap=50):
|
75 |
-
# Combine sequence and analysis_type to generate a unique hash
|
76 |
hash_input = f"{sequence}_{analysis_type}"
|
77 |
sequence_hash = hash_sequence(hash_input)
|
78 |
-
|
79 |
cached = results_collection.find_one({"_id": sequence_hash})
|
80 |
if cached:
|
81 |
return cached["repeats"]
|
@@ -114,7 +112,6 @@ def get_or_process_sequence(sequence, analysis_type, overlap=50):
|
|
114 |
for k, v in hetero_repeats.items():
|
115 |
final_repeats[k] += v
|
116 |
|
117 |
-
# Store result in MongoDB using combined hash
|
118 |
results_collection.insert_one({
|
119 |
"_id": sequence_hash,
|
120 |
"sequence": sequence,
|
@@ -124,7 +121,6 @@ def get_or_process_sequence(sequence, analysis_type, overlap=50):
|
|
124 |
|
125 |
return final_repeats
|
126 |
|
127 |
-
|
128 |
def process_excel(excel_data, analysis_type):
|
129 |
repeats = set()
|
130 |
sequence_data = []
|
@@ -138,7 +134,7 @@ def process_excel(excel_data, analysis_type):
|
|
138 |
entry_id = str(row[0])
|
139 |
protein_name = str(row[1])
|
140 |
sequence = str(row[2]).replace('"', '').replace(' ', '').strip()
|
141 |
-
if not sequence:
|
142 |
continue
|
143 |
count += 1
|
144 |
freq = get_or_process_sequence(sequence, analysis_type)
|
@@ -177,33 +173,50 @@ st.title("Protein Repeat Analysis with Caching")
|
|
177 |
analysis_type = st.radio("Select analysis type:", ["Homo", "Hetero", "Both"], index=2)
|
178 |
uploaded_files = st.file_uploader("Upload Excel files", accept_multiple_files=True, type=["xlsx"])
|
179 |
|
180 |
-
|
181 |
-
|
182 |
-
all_sequences_data = []
|
183 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
184 |
for file in uploaded_files:
|
185 |
excel_data = pd.ExcelFile(file)
|
186 |
repeats, sequence_data = process_excel(excel_data, analysis_type)
|
187 |
if repeats is not None:
|
188 |
-
all_repeats.update(repeats)
|
189 |
-
all_sequences_data.append(sequence_data)
|
190 |
-
filenames.append(file.name)
|
191 |
-
if all_sequences_data:
|
192 |
st.toast(f"Processed {len(uploaded_files)} file(s) successfully.")
|
193 |
-
excel_file = create_excel(
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
file_name="protein_repeat_results.xlsx",
|
198 |
-
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
199 |
)
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
db = client['BTP_DB']
|
15 |
results_collection = db['protein_results']
|
16 |
|
17 |
+
# Utility functions
|
18 |
def is_homo_repeat(s):
|
19 |
return all(c == s[0] for c in s)
|
20 |
|
|
|
72 |
return new_repeats
|
73 |
|
74 |
def get_or_process_sequence(sequence, analysis_type, overlap=50):
|
|
|
75 |
hash_input = f"{sequence}_{analysis_type}"
|
76 |
sequence_hash = hash_sequence(hash_input)
|
|
|
77 |
cached = results_collection.find_one({"_id": sequence_hash})
|
78 |
if cached:
|
79 |
return cached["repeats"]
|
|
|
112 |
for k, v in hetero_repeats.items():
|
113 |
final_repeats[k] += v
|
114 |
|
|
|
115 |
results_collection.insert_one({
|
116 |
"_id": sequence_hash,
|
117 |
"sequence": sequence,
|
|
|
121 |
|
122 |
return final_repeats
|
123 |
|
|
|
124 |
def process_excel(excel_data, analysis_type):
|
125 |
repeats = set()
|
126 |
sequence_data = []
|
|
|
134 |
entry_id = str(row[0])
|
135 |
protein_name = str(row[1])
|
136 |
sequence = str(row[2]).replace('"', '').replace(' ', '').strip()
|
137 |
+
if not sequence:
|
138 |
continue
|
139 |
count += 1
|
140 |
freq = get_or_process_sequence(sequence, analysis_type)
|
|
|
173 |
analysis_type = st.radio("Select analysis type:", ["Homo", "Hetero", "Both"], index=2)
|
174 |
uploaded_files = st.file_uploader("Upload Excel files", accept_multiple_files=True, type=["xlsx"])
|
175 |
|
176 |
+
# Initialize session state
|
177 |
+
if 'all_sequences_data' not in st.session_state:
|
178 |
+
st.session_state.all_sequences_data = []
|
179 |
+
if 'all_repeats' not in st.session_state:
|
180 |
+
st.session_state.all_repeats = set()
|
181 |
+
if 'filenames' not in st.session_state:
|
182 |
+
st.session_state.filenames = []
|
183 |
+
if 'excel_file' not in st.session_state:
|
184 |
+
st.session_state.excel_file = None
|
185 |
+
|
186 |
+
if uploaded_files and st.button("Process Files"):
|
187 |
+
st.session_state.all_repeats = set()
|
188 |
+
st.session_state.all_sequences_data = []
|
189 |
+
st.session_state.filenames = []
|
190 |
for file in uploaded_files:
|
191 |
excel_data = pd.ExcelFile(file)
|
192 |
repeats, sequence_data = process_excel(excel_data, analysis_type)
|
193 |
if repeats is not None:
|
194 |
+
st.session_state.all_repeats.update(repeats)
|
195 |
+
st.session_state.all_sequences_data.append(sequence_data)
|
196 |
+
st.session_state.filenames.append(file.name)
|
197 |
+
if st.session_state.all_sequences_data:
|
198 |
st.toast(f"Processed {len(uploaded_files)} file(s) successfully.")
|
199 |
+
st.session_state.excel_file = create_excel(
|
200 |
+
st.session_state.all_sequences_data,
|
201 |
+
st.session_state.all_repeats,
|
202 |
+
st.session_state.filenames
|
|
|
|
|
203 |
)
|
204 |
+
|
205 |
+
if st.session_state.excel_file:
|
206 |
+
st.download_button(
|
207 |
+
label="Download Excel file",
|
208 |
+
data=st.session_state.excel_file,
|
209 |
+
file_name="protein_repeat_results.xlsx",
|
210 |
+
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
211 |
+
)
|
212 |
+
|
213 |
+
if st.checkbox("Show Results Table"):
|
214 |
+
rows = []
|
215 |
+
for file_index, file_data in enumerate(st.session_state.all_sequences_data):
|
216 |
+
filename = st.session_state.filenames[file_index]
|
217 |
+
for entry_id, protein_name, freq in file_data:
|
218 |
+
row = {"Filename": filename, "Entry": entry_id, "Protein Name": protein_name}
|
219 |
+
row.update({repeat: freq.get(repeat, 0) for repeat in sorted(st.session_state.all_repeats)})
|
220 |
+
rows.append(row)
|
221 |
+
result_df = pd.DataFrame(rows)
|
222 |
+
st.dataframe(result_df)
|