Jayesh13 committed on
Commit
781ebc0
·
verified ·
1 Parent(s): c3138dd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -38
app.py CHANGED
@@ -1,16 +1,3 @@
1
- import os
2
- os.system("pip install streamlit pandas xlsxwriter openpyxl matplotlib seaborn")
3
-
4
- import streamlit as st
5
- import pandas as pd
6
- import xlsxwriter
7
- from io import BytesIO
8
- from collections import Counter
9
- import matplotlib.pyplot as plt
10
- import seaborn as sns
11
- # For pie chart
12
- # 🔄 COMBINED STREAMLIT PROTEIN ANALYSIS TOOL WITH COLORED COMPARISON
13
-
14
  import os
15
  os.system("pip install streamlit pandas xlsxwriter openpyxl pymongo")
16
 
@@ -20,6 +7,8 @@ import xlsxwriter
20
  from io import BytesIO
21
  from collections import defaultdict
22
  import hashlib
 
 
23
 
24
  # MongoDB Setup
25
  try:
@@ -190,7 +179,6 @@ st.title("🧬 Protein Analysis Toolkit")
190
 
191
  app_choice = st.radio("Choose an option", ["πŸ” Protein Repeat Finder", "πŸ“Š Protein Comparator", "πŸ§ͺ Amino Acid Percentage Analyzer"])
192
 
193
-
194
  if app_choice == "πŸ” Protein Repeat Finder":
195
  analysis_type = st.radio("Select analysis type:", ["Homo", "Hetero", "Both"], index=2)
196
  uploaded_files = st.file_uploader("Upload Excel files", accept_multiple_files=True, type=["xlsx"])
@@ -224,10 +212,11 @@ if app_choice == "πŸ” Protein Repeat Finder":
224
  st.download_button(
225
  label="Download Excel file",
226
  data=st.session_state.excel_file,
227
- file_name="protein_repeat_results.xlsx",
228
  mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
229
  )
230
 
 
231
  if st.checkbox("Show Results Table"):
232
  rows = []
233
  for file_index, file_data in enumerate(st.session_state.all_sequences_data):
@@ -238,29 +227,29 @@ if app_choice == "πŸ” Protein Repeat Finder":
238
  rows.append(row)
239
  result_df = pd.DataFrame(rows)
240
  st.dataframe(result_df)
241
-
242
- # if st.checkbox("Repeat Cluster Visualization"):
243
- # repeat_counts = defaultdict(int)
244
- # for seq_data in st.session_state.all_sequences_data:
245
- # for _, _, freq_dict in seq_data:
246
- # for repeat, count in freq_dict.items():
247
- # repeat_counts[repeat] += count
248
-
249
- # if repeat_counts:
250
- # sorted_repeats = sorted(repeat_counts.items(), key=lambda x: x[1], reverse=True)
251
- # top_n = st.slider("Select number of top repeats to visualize", min_value=5, max_value=50, value=20)
252
- # top_repeats = sorted_repeats[:top_n]
253
- # repeats, counts = zip(*top_repeats)
254
-
255
- # plt.figure(figsize=(12, 6))
256
- # sns.barplot(x=list(repeats), y=list(counts), palette="viridis")
257
- # plt.xticks(rotation=45, ha='right')
258
- # plt.xlabel("Repeats")
259
- # plt.ylabel("Total Frequency")
260
- # plt.title("Top Repeat Clusters Across All Sequences")
261
- # st.pyplot(plt.gcf())
262
- # else:
263
- # st.warning("No repeat data available to visualize. Please upload files first.")
264
 
265
 
266
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  os.system("pip install streamlit pandas xlsxwriter openpyxl pymongo")
3
 
 
7
  from io import BytesIO
8
  from collections import defaultdict
9
  import hashlib
10
+ import matplotlib.pyplot as plt
11
+ import seaborn as sns
12
 
13
  # MongoDB Setup
14
  try:
 
179
 
180
  app_choice = st.radio("Choose an option", ["πŸ” Protein Repeat Finder", "πŸ“Š Protein Comparator", "πŸ§ͺ Amino Acid Percentage Analyzer"])
181
 
 
182
  if app_choice == "πŸ” Protein Repeat Finder":
183
  analysis_type = st.radio("Select analysis type:", ["Homo", "Hetero", "Both"], index=2)
184
  uploaded_files = st.file_uploader("Upload Excel files", accept_multiple_files=True, type=["xlsx"])
 
212
  st.download_button(
213
  label="Download Excel file",
214
  data=st.session_state.excel_file,
215
+ file_name="Protein_Repeats_Analysis.xlsx",
216
  mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
217
  )
218
 
219
+ # Display results table and repeat cluster visualization
220
  if st.checkbox("Show Results Table"):
221
  rows = []
222
  for file_index, file_data in enumerate(st.session_state.all_sequences_data):
 
227
  rows.append(row)
228
  result_df = pd.DataFrame(rows)
229
  st.dataframe(result_df)
230
+
231
+ # Repeat Cluster Visualization
232
+ repeat_counts = defaultdict(int)
233
+ for seq_data in st.session_state.all_sequences_data:
234
+ for _, _, freq_dict in seq_data:
235
+ for repeat, count in freq_dict.items():
236
+ repeat_counts[repeat] += count
237
+
238
+ if repeat_counts:
239
+ sorted_repeats = sorted(repeat_counts.items(), key=lambda x: x[1], reverse=True)
240
+ top_n = st.slider("Select number of top repeats to visualize", min_value=5, max_value=50, value=20)
241
+ top_repeats = sorted_repeats[:top_n]
242
+ repeats, counts = zip(*top_repeats)
243
+
244
+ plt.figure(figsize=(12, 6))
245
+ sns.barplot(x=list(repeats), y=list(counts), palette="viridis")
246
+ plt.xticks(rotation=45, ha='right')
247
+ plt.xlabel("Repeats")
248
+ plt.ylabel("Total Frequency")
249
+ plt.title("Top Repeat Clusters Across All Sequences")
250
+ st.pyplot(plt.gcf())
251
+ else:
252
+ st.warning("No repeat data available to visualize. Please upload files first.")
253
 
254
 
255