Jayesh13 committed on
Commit
2b8cf16
·
verified ·
1 Parent(s): 9410fc6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -30
app.py CHANGED
@@ -14,7 +14,7 @@ client = MongoClient("mongodb+srv://dhruvmangroliya:[email protected]
14
  db = client['BTP_DB']
15
  results_collection = db['protein_results']
16
 
17
- # Utility
18
  def is_homo_repeat(s):
19
  return all(c == s[0] for c in s)
20
 
@@ -72,10 +72,8 @@ def find_new_boundary_repeats(fragments, final_repeats, overlap=50):
72
  return new_repeats
73
 
74
  def get_or_process_sequence(sequence, analysis_type, overlap=50):
75
- # Combine sequence and analysis_type to generate a unique hash
76
  hash_input = f"{sequence}_{analysis_type}"
77
  sequence_hash = hash_sequence(hash_input)
78
-
79
  cached = results_collection.find_one({"_id": sequence_hash})
80
  if cached:
81
  return cached["repeats"]
@@ -114,7 +112,6 @@ def get_or_process_sequence(sequence, analysis_type, overlap=50):
114
  for k, v in hetero_repeats.items():
115
  final_repeats[k] += v
116
 
117
- # Store result in MongoDB using combined hash
118
  results_collection.insert_one({
119
  "_id": sequence_hash,
120
  "sequence": sequence,
@@ -124,7 +121,6 @@ def get_or_process_sequence(sequence, analysis_type, overlap=50):
124
 
125
  return final_repeats
126
 
127
-
128
  def process_excel(excel_data, analysis_type):
129
  repeats = set()
130
  sequence_data = []
@@ -138,7 +134,7 @@ def process_excel(excel_data, analysis_type):
138
  entry_id = str(row[0])
139
  protein_name = str(row[1])
140
  sequence = str(row[2]).replace('"', '').replace(' ', '').strip()
141
- if not sequence: # Skip empty sequence
142
  continue
143
  count += 1
144
  freq = get_or_process_sequence(sequence, analysis_type)
@@ -177,33 +173,50 @@ st.title("Protein Repeat Analysis with Caching")
177
  analysis_type = st.radio("Select analysis type:", ["Homo", "Hetero", "Both"], index=2)
178
  uploaded_files = st.file_uploader("Upload Excel files", accept_multiple_files=True, type=["xlsx"])
179
 
180
- if uploaded_files:
181
- all_repeats = set()
182
- all_sequences_data = []
183
- filenames = []
 
 
 
 
 
 
 
 
 
 
184
  for file in uploaded_files:
185
  excel_data = pd.ExcelFile(file)
186
  repeats, sequence_data = process_excel(excel_data, analysis_type)
187
  if repeats is not None:
188
- all_repeats.update(repeats)
189
- all_sequences_data.append(sequence_data)
190
- filenames.append(file.name)
191
- if all_sequences_data:
192
  st.toast(f"Processed {len(uploaded_files)} file(s) successfully.")
193
- excel_file = create_excel(all_sequences_data, all_repeats, filenames)
194
- st.download_button(
195
- label="Download Excel file",
196
- data=excel_file,
197
- file_name="protein_repeat_results.xlsx",
198
- mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
199
  )
200
- if st.checkbox("Show Results Table"):
201
- rows = []
202
- for file_index, file_data in enumerate(all_sequences_data):
203
- filename = filenames[file_index]
204
- for entry_id, protein_name, freq in file_data:
205
- row = {"Filename": filename, "Entry": entry_id, "Protein Name": protein_name}
206
- row.update({repeat: freq.get(repeat, 0) for repeat in sorted(all_repeats)})
207
- rows.append(row)
208
- result_df = pd.DataFrame(rows)
209
- st.dataframe(result_df)
 
 
 
 
 
 
 
 
 
 
14
  db = client['BTP_DB']
15
  results_collection = db['protein_results']
16
 
17
+ # Utility functions
18
  def is_homo_repeat(s):
19
  return all(c == s[0] for c in s)
20
 
 
72
  return new_repeats
73
 
74
  def get_or_process_sequence(sequence, analysis_type, overlap=50):
 
75
  hash_input = f"{sequence}_{analysis_type}"
76
  sequence_hash = hash_sequence(hash_input)
 
77
  cached = results_collection.find_one({"_id": sequence_hash})
78
  if cached:
79
  return cached["repeats"]
 
112
  for k, v in hetero_repeats.items():
113
  final_repeats[k] += v
114
 
 
115
  results_collection.insert_one({
116
  "_id": sequence_hash,
117
  "sequence": sequence,
 
121
 
122
  return final_repeats
123
 
 
124
  def process_excel(excel_data, analysis_type):
125
  repeats = set()
126
  sequence_data = []
 
134
  entry_id = str(row[0])
135
  protein_name = str(row[1])
136
  sequence = str(row[2]).replace('"', '').replace(' ', '').strip()
137
+ if not sequence:
138
  continue
139
  count += 1
140
  freq = get_or_process_sequence(sequence, analysis_type)
 
173
  analysis_type = st.radio("Select analysis type:", ["Homo", "Hetero", "Both"], index=2)
174
  uploaded_files = st.file_uploader("Upload Excel files", accept_multiple_files=True, type=["xlsx"])
175
 
176
+ # Initialize session state
177
+ if 'all_sequences_data' not in st.session_state:
178
+ st.session_state.all_sequences_data = []
179
+ if 'all_repeats' not in st.session_state:
180
+ st.session_state.all_repeats = set()
181
+ if 'filenames' not in st.session_state:
182
+ st.session_state.filenames = []
183
+ if 'excel_file' not in st.session_state:
184
+ st.session_state.excel_file = None
185
+
186
+ if uploaded_files and st.button("Process Files"):
187
+ st.session_state.all_repeats = set()
188
+ st.session_state.all_sequences_data = []
189
+ st.session_state.filenames = []
190
  for file in uploaded_files:
191
  excel_data = pd.ExcelFile(file)
192
  repeats, sequence_data = process_excel(excel_data, analysis_type)
193
  if repeats is not None:
194
+ st.session_state.all_repeats.update(repeats)
195
+ st.session_state.all_sequences_data.append(sequence_data)
196
+ st.session_state.filenames.append(file.name)
197
+ if st.session_state.all_sequences_data:
198
  st.toast(f"Processed {len(uploaded_files)} file(s) successfully.")
199
+ st.session_state.excel_file = create_excel(
200
+ st.session_state.all_sequences_data,
201
+ st.session_state.all_repeats,
202
+ st.session_state.filenames
 
 
203
  )
204
+
205
+ if st.session_state.excel_file:
206
+ st.download_button(
207
+ label="Download Excel file",
208
+ data=st.session_state.excel_file,
209
+ file_name="protein_repeat_results.xlsx",
210
+ mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
211
+ )
212
+
213
+ if st.checkbox("Show Results Table"):
214
+ rows = []
215
+ for file_index, file_data in enumerate(st.session_state.all_sequences_data):
216
+ filename = st.session_state.filenames[file_index]
217
+ for entry_id, protein_name, freq in file_data:
218
+ row = {"Filename": filename, "Entry": entry_id, "Protein Name": protein_name}
219
+ row.update({repeat: freq.get(repeat, 0) for repeat in sorted(st.session_state.all_repeats)})
220
+ rows.append(row)
221
+ result_df = pd.DataFrame(rows)
222
+ st.dataframe(result_df)