Spaces:

Jayesh13
/

Homo_hetero_caching

Sleeping

App Files Files Community

Jayesh13 committed on Apr 13

Commit

2b8cf16

verified ·

1 Parent(s): 9410fc6

Update app.py

Browse files

Files changed (1) hide show

app.py +43 -30

app.py CHANGED Viewed

@@ -14,7 +14,7 @@ client = MongoClient("mongodb+srv://dhruvmangroliya:[email protected]
 db = client['BTP_DB']
 results_collection = db['protein_results']
-# Utility
 def is_homo_repeat(s):
     return all(c == s[0] for c in s)
@@ -72,10 +72,8 @@ def find_new_boundary_repeats(fragments, final_repeats, overlap=50):
     return new_repeats
 def get_or_process_sequence(sequence, analysis_type, overlap=50):
-    # Combine sequence and analysis_type to generate a unique hash
     hash_input = f"{sequence}_{analysis_type}"
     sequence_hash = hash_sequence(hash_input)
     cached = results_collection.find_one({"_id": sequence_hash})
     if cached:
         return cached["repeats"]
@@ -114,7 +112,6 @@ def get_or_process_sequence(sequence, analysis_type, overlap=50):
         for k, v in hetero_repeats.items():
             final_repeats[k] += v
-    # Store result in MongoDB using combined hash
     results_collection.insert_one({
         "_id": sequence_hash,
         "sequence": sequence,
@@ -124,7 +121,6 @@ def get_or_process_sequence(sequence, analysis_type, overlap=50):
     return final_repeats
 def process_excel(excel_data, analysis_type):
     repeats = set()
     sequence_data = []
@@ -138,7 +134,7 @@ def process_excel(excel_data, analysis_type):
             entry_id = str(row[0])
             protein_name = str(row[1])
             sequence = str(row[2]).replace('"', '').replace(' ', '').strip()
-            if not sequence:  # Skip empty sequence
                 continue
             count += 1
             freq = get_or_process_sequence(sequence, analysis_type)
@@ -177,33 +173,50 @@ st.title("Protein Repeat Analysis with Caching")
 analysis_type = st.radio("Select analysis type:", ["Homo", "Hetero", "Both"], index=2)
 uploaded_files = st.file_uploader("Upload Excel files", accept_multiple_files=True, type=["xlsx"])
-if uploaded_files:
-    all_repeats = set()
-    all_sequences_data = []
-    filenames = []
     for file in uploaded_files:
         excel_data = pd.ExcelFile(file)
         repeats, sequence_data = process_excel(excel_data, analysis_type)
         if repeats is not None:
-            all_repeats.update(repeats)
-            all_sequences_data.append(sequence_data)
-            filenames.append(file.name)
-    if all_sequences_data:
         st.toast(f"Processed {len(uploaded_files)} file(s) successfully.")
-        excel_file = create_excel(all_sequences_data, all_repeats, filenames)
-        st.download_button(
-            label="Download Excel file",
-            data=excel_file,
-            file_name="protein_repeat_results.xlsx",
-            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
         )
-        if st.checkbox("Show Results Table"):
-            rows = []
-            for file_index, file_data in enumerate(all_sequences_data):
-                filename = filenames[file_index]
-                for entry_id, protein_name, freq in file_data:
-                    row = {"Filename": filename, "Entry": entry_id, "Protein Name": protein_name}
-                    row.update({repeat: freq.get(repeat, 0) for repeat in sorted(all_repeats)})
-                    rows.append(row)
-            result_df = pd.DataFrame(rows)
-            st.dataframe(result_df)

 db = client['BTP_DB']
 results_collection = db['protein_results']
+# Utility functions
 def is_homo_repeat(s):
     return all(c == s[0] for c in s)
     return new_repeats
 def get_or_process_sequence(sequence, analysis_type, overlap=50):
     hash_input = f"{sequence}_{analysis_type}"
     sequence_hash = hash_sequence(hash_input)
     cached = results_collection.find_one({"_id": sequence_hash})
     if cached:
         return cached["repeats"]
         for k, v in hetero_repeats.items():
             final_repeats[k] += v
     results_collection.insert_one({
         "_id": sequence_hash,
         "sequence": sequence,
     return final_repeats
 def process_excel(excel_data, analysis_type):
     repeats = set()
     sequence_data = []
             entry_id = str(row[0])
             protein_name = str(row[1])
             sequence = str(row[2]).replace('"', '').replace(' ', '').strip()
+            if not sequence:
                 continue
             count += 1
             freq = get_or_process_sequence(sequence, analysis_type)
 analysis_type = st.radio("Select analysis type:", ["Homo", "Hetero", "Both"], index=2)
 uploaded_files = st.file_uploader("Upload Excel files", accept_multiple_files=True, type=["xlsx"])
+# Initialize session state
+if 'all_sequences_data' not in st.session_state:
+    st.session_state.all_sequences_data = []
+if 'all_repeats' not in st.session_state:
+    st.session_state.all_repeats = set()
+if 'filenames' not in st.session_state:
+    st.session_state.filenames = []
+if 'excel_file' not in st.session_state:
+    st.session_state.excel_file = None
+if uploaded_files and st.button("Process Files"):
+    st.session_state.all_repeats = set()
+    st.session_state.all_sequences_data = []
+    st.session_state.filenames = []
     for file in uploaded_files:
         excel_data = pd.ExcelFile(file)
         repeats, sequence_data = process_excel(excel_data, analysis_type)
         if repeats is not None:
+            st.session_state.all_repeats.update(repeats)
+            st.session_state.all_sequences_data.append(sequence_data)
+            st.session_state.filenames.append(file.name)
+    if st.session_state.all_sequences_data:
         st.toast(f"Processed {len(uploaded_files)} file(s) successfully.")
+        st.session_state.excel_file = create_excel(
+            st.session_state.all_sequences_data,
+            st.session_state.all_repeats,
+            st.session_state.filenames
         )
+if st.session_state.excel_file:
+    st.download_button(
+        label="Download Excel file",
+        data=st.session_state.excel_file,
+        file_name="protein_repeat_results.xlsx",
+        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+    )
+if st.checkbox("Show Results Table"):
+    rows = []
+    for file_index, file_data in enumerate(st.session_state.all_sequences_data):
+        filename = st.session_state.filenames[file_index]
+        for entry_id, protein_name, freq in file_data:
+            row = {"Filename": filename, "Entry": entry_id, "Protein Name": protein_name}
+            row.update({repeat: freq.get(repeat, 0) for repeat in sorted(st.session_state.all_repeats)})
+            rows.append(row)
+    result_df = pd.DataFrame(rows)
+    st.dataframe(result_df)