Spaces:

Jayesh13
/

Homo_hetero

Sleeping

App Files Files Community

Jayesh13 committed on Apr 12

Commit

2dc6b65

verified ·

1 Parent(s): db9028b

Update app.py

Browse files

Files changed (1) hide show

app.py +58 -42

app.py CHANGED Viewed

@@ -1,22 +1,17 @@
 import os
-os.system("pip install streamlit pandas xlsxwriter openpyxl pymongo")
 import random
 from collections import defaultdict
 from pymongo import MongoClient
 # MongoDB connection string (replace with your actual password)
 # NOTE(review): the URI carries the account name and password inline in source;
 # presumably these should be supplied via an environment variable or secret
 # store instead -- confirm how the deployment provides credentials.
 client = MongoClient("mongodb+srv://dhruvmangroliya:[email protected]/BTP_DB?retryWrites=true&w=majority")
-# Access the BTP_DB database and protein_results collection
 # Module-level handles: the 'BTP_DB' database and its 'protein_results' collection.
 db = client['BTP_DB']
 results_collection = db['protein_results']
-# Function to generate a random protein sequence of given length by sampling (with replacement) from the 20 standard amino-acid letters
-def generate_protein_sequence(length):
-    amino_acids = "ACDEFGHIKLMNPQRSTVWY"  # 20 standard amino acids
-    return ''.join(random.choices(amino_acids, k=length))
 # Function to fragment the protein sequence into chunks of max length 1000
 def fragment_protein_sequence(sequence, max_length=1000):
     """Split ``sequence`` into consecutive, non-overlapping chunks.
     Every chunk holds ``max_length`` characters except possibly the last,
     which holds whatever remains. An empty ``sequence`` yields ``[]``.
     """
     chunk_starts = range(0, len(sequence), max_length)
     return [sequence[start:start + max_length] for start in chunk_starts]
@@ -24,14 +19,10 @@ def fragment_protein_sequence(sequence, max_length=1000):
 # Function to find repeating amino acid sequences
 def find_hetero_amino_acid_repeats(sequence):
     """Return every substring of length >= 2 that occurs more than once.
     Occurrences may overlap (e.g. "AA" occurs three times in "AAAA").
     Args:
         sequence: The string to scan.
     Returns:
         A dict mapping each repeated substring to its occurrence count;
         empty when nothing repeats or the sequence has fewer than 2 characters.
     """
     repeats = {}
     for length in range(2, len(sequence) + 1):
         # Count one length at a time so unrepeated substrings can be dropped
         # immediately rather than kept alive until the end.
         counts = defaultdict(int)
         for start in range(len(sequence) - length + 1):
             counts[sequence[start:start + length]] += 1
         repeated = {sub: n for sub, n in counts.items() if n > 1}
         if not repeated:
             # Any repeat of length L + 1 contains a repeat of length L, so once
             # a length has no repeats no longer substring can have one either.
             break
         repeats.update(repeated)
     return repeats
 # Function to check and update repeats at boundaries
@@ -40,70 +31,95 @@ def check_boundary_repeats(fragments, final_repeats, overlap=50):
         left_overlap = fragments[i][-overlap:] if len(fragments[i]) >= overlap else fragments[i]
         right_overlap = fragments[i + 1][:overlap] if len(fragments[i + 1]) >= overlap else fragments[i + 1]
         overlap_region = left_overlap + right_overlap
         boundary_repeats = find_hetero_amino_acid_repeats(overlap_region)
         for substring, count in boundary_repeats.items():
             if any(aa in left_overlap for aa in substring) and any(aa in right_overlap for aa in substring):
-                final_repeats[substring] += count  # Only add if spanning both fragments
     return final_repeats
 # Function to find new repeats that only appear at fragmentation points
 def find_new_boundary_repeats(fragments, final_repeats, overlap=50):
     """Collect repeats found around fragment junctions that are not yet known.
     For each pair of adjacent fragments, the last ``overlap`` characters of the
     left one and the first ``overlap`` characters of the right one (or the whole
     fragment when it is shorter than ``overlap``) are joined and scanned with
     ``find_hetero_amino_acid_repeats``. A repeated substring is recorded when it
     is not a key of ``final_repeats`` and shares at least one character with
     each side of the junction.
     Returns a ``defaultdict(int)`` of substring -> summed count.
     """
     fresh = defaultdict(int)
     for left, right in zip(fragments, fragments[1:]):
         tail = left if len(left) < overlap else left[-overlap:]
         head = right if len(right) < overlap else right[:overlap]
         window_repeats = find_hetero_amino_acid_repeats(tail + head)
         for candidate, occurrences in window_repeats.items():
             if candidate in final_repeats:
                 continue
             # NOTE(review): these tests look at individual characters, not at
             # whether an occurrence crosses the junction -- confirm the intent.
             touches_tail = any(residue in tail for residue in candidate)
             touches_head = any(residue in head for residue in candidate)
             if touches_tail and touches_head:
                 fresh[candidate] += occurrences
     return fresh
 # Main function to process the protein sequence.
 # Splits `sequence` with fragment_protein_sequence (default chunk size; `overlap`
 # is not used for the split), sums the per-fragment repeat counts from
 # find_hetero_amino_acid_repeats, then passes the fragments and the running
 # counts to check_boundary_repeats and find_new_boundary_repeats (both receive
 # `overlap`) and adds the counts from the latter into the result, which is returned.
 # NOTE(review): a substring can be counted inside a fragment and again in a
 # boundary window, so the totals are presumably not exact whole-sequence
 # occurrence counts -- confirm the intended semantics.
 # NOTE(review): if check_boundary_repeats inserts boundary substrings into
 # final_repeats (it appears to), the `not in final_repeats` filter inside
 # find_new_boundary_repeats would leave new_repeats empty -- verify.
 def process_protein_sequence(sequence, overlap=50):
     fragments = fragment_protein_sequence(sequence)
-    # Step 1: Find repeats in each fragment
     final_repeats = defaultdict(int)
     for fragment in fragments:
         fragment_repeats = find_hetero_amino_acid_repeats(fragment)
         for k, v in fragment_repeats.items():
             final_repeats[k] += v
-    # Step 2: Check and update repeats at boundaries
     final_repeats = check_boundary_repeats(fragments, final_repeats, overlap)
-    # Step 3: Find new repeats emerging at boundaries
     new_repeats = find_new_boundary_repeats(fragments, final_repeats, overlap)
-    # Step 4: Merge new repeats into final dictionary
     for k, v in new_repeats.items():
         final_repeats[k] += v
     return final_repeats
-# Example run (executes at module import): generate a random protein sequence of length 3000
-protein_sequence = generate_protein_sequence(3000)
-# Process the protein sequence (fragment it and count repeated substrings)
-calculated_repeats = process_protein_sequence(protein_sequence)
-# Prepare the document to insert into MongoDB: the sequence and its repeat counts
-data_to_insert = {
-    "protein_sequence": protein_sequence,
-    "calculated_repeats": calculated_repeats
-}
-# Insert the results into the MongoDB collection and keep the generated document ID
-inserted_id = results_collection.insert_one(data_to_insert).inserted_id
-# Print out the inserted document's ID
-print(f"Data successfully inserted with ID: {inserted_id}")

 import os
+os.system("pip install streamlit pandas xlsxwriter openpyxl")
+import pandas as pd
 import random
 from collections import defaultdict
 from pymongo import MongoClient
+import streamlit as st
 # MongoDB connection string (replace with your actual password)
 # NOTE(review): the URI carries the account name and password inline in source;
 # presumably these should be supplied via an environment variable or secret
 # store instead -- confirm how the deployment provides credentials.
 client = MongoClient("mongodb+srv://dhruvmangroliya:[email protected]/BTP_DB?retryWrites=true&w=majority")
 # Module-level handles: the 'BTP_DB' database and its 'protein_results' collection.
 db = client['BTP_DB']
 results_collection = db['protein_results']
 # Function to fragment the protein sequence into chunks of max length 1000
 def fragment_protein_sequence(sequence, max_length=1000):
     """Split ``sequence`` into consecutive, non-overlapping chunks.
     Every chunk holds ``max_length`` characters except possibly the last,
     which holds whatever remains. An empty ``sequence`` yields ``[]``.
     """
     chunk_starts = range(0, len(sequence), max_length)
     return [sequence[start:start + max_length] for start in chunk_starts]
 # Function to find repeating amino acid sequences
 def find_hetero_amino_acid_repeats(sequence):
     """Return every substring of length >= 2 that occurs more than once.
     Occurrences may overlap (e.g. "AA" occurs three times in "AAAA").
     Args:
         sequence: The string to scan.
     Returns:
         A dict mapping each repeated substring to its occurrence count;
         empty when nothing repeats or the sequence has fewer than 2 characters.
     """
     repeats = {}
     for length in range(2, len(sequence) + 1):
         # Count one length at a time so unrepeated substrings can be dropped
         # immediately rather than kept alive until the end.
         counts = defaultdict(int)
         for start in range(len(sequence) - length + 1):
             counts[sequence[start:start + length]] += 1
         repeated = {sub: n for sub, n in counts.items() if n > 1}
         if not repeated:
             # Any repeat of length L + 1 contains a repeat of length L, so once
             # a length has no repeats no longer substring can have one either.
             break
         repeats.update(repeated)
     return repeats
 # Function to check and update repeats at boundaries
         left_overlap = fragments[i][-overlap:] if len(fragments[i]) >= overlap else fragments[i]
         right_overlap = fragments[i + 1][:overlap] if len(fragments[i + 1]) >= overlap else fragments[i + 1]
         overlap_region = left_overlap + right_overlap
         boundary_repeats = find_hetero_amino_acid_repeats(overlap_region)
         for substring, count in boundary_repeats.items():
             if any(aa in left_overlap for aa in substring) and any(aa in right_overlap for aa in substring):
+                final_repeats[substring] += count
     return final_repeats
 # Function to find new repeats that only appear at fragmentation points
 def find_new_boundary_repeats(fragments, final_repeats, overlap=50):
     """Collect repeats found around fragment junctions that are not yet known.
     For each pair of adjacent fragments, the last ``overlap`` characters of the
     left one and the first ``overlap`` characters of the right one (or the whole
     fragment when it is shorter than ``overlap``) are joined and scanned with
     ``find_hetero_amino_acid_repeats``. A repeated substring is recorded when it
     is not a key of ``final_repeats`` and shares at least one character with
     each side of the junction.
     Returns a ``defaultdict(int)`` of substring -> summed count.
     """
     fresh = defaultdict(int)
     for left, right in zip(fragments, fragments[1:]):
         tail = left if len(left) < overlap else left[-overlap:]
         head = right if len(right) < overlap else right[:overlap]
         window_repeats = find_hetero_amino_acid_repeats(tail + head)
         for candidate, occurrences in window_repeats.items():
             if candidate in final_repeats:
                 continue
             # NOTE(review): these tests look at individual characters, not at
             # whether an occurrence crosses the junction -- confirm the intent.
             touches_tail = any(residue in tail for residue in candidate)
             touches_head = any(residue in head for residue in candidate)
             if touches_tail and touches_head:
                 fresh[candidate] += occurrences
     return fresh
 # Main function to process the protein sequence.
 # Splits `sequence` with fragment_protein_sequence (default chunk size; `overlap`
 # is not used for the split), sums the per-fragment repeat counts from
 # find_hetero_amino_acid_repeats, then passes the fragments and the running
 # counts to check_boundary_repeats and find_new_boundary_repeats (both receive
 # `overlap`) and adds the counts from the latter into the result, which is returned.
 # NOTE(review): a substring can be counted inside a fragment and again in a
 # boundary window, so the totals are presumably not exact whole-sequence
 # occurrence counts -- confirm the intended semantics.
 # NOTE(review): if check_boundary_repeats inserts boundary substrings into
 # final_repeats (it appears to), the `not in final_repeats` filter inside
 # find_new_boundary_repeats would leave new_repeats empty -- verify.
 def process_protein_sequence(sequence, overlap=50):
     fragments = fragment_protein_sequence(sequence)
     final_repeats = defaultdict(int)
+    # Find repeats in each fragment
     for fragment in fragments:
         fragment_repeats = find_hetero_amino_acid_repeats(fragment)
         for k, v in fragment_repeats.items():
             final_repeats[k] += v
+    # Check and update repeats at boundaries
     final_repeats = check_boundary_repeats(fragments, final_repeats, overlap)
+    # Find new repeats emerging at boundaries
     new_repeats = find_new_boundary_repeats(fragments, final_repeats, overlap)
+    # Merge new repeats into final dictionary
     for k, v in new_repeats.items():
         final_repeats[k] += v
     return final_repeats
+# Streamlit UI for uploading and processing the Excel file
 # Flow: upload an .xlsx file, preview its first rows, then on button click run
 # process_protein_sequence on every row, store one MongoDB document per row and
 # show the collected results.
+st.title("Protein Sequence Repeat Finder from Excel")
+# Step 1: Upload the Excel file
+uploaded_file = st.file_uploader("Upload Excel file containing Protein Sequences", type=["xlsx"])
+if uploaded_file is not None:
+    # Step 2: Read the Excel file using Pandas
+    df = pd.read_excel(uploaded_file)
+    # Show the first few rows of the uploaded data for preview
+    st.write("Preview of Uploaded Data:")
+    st.write(df.head())
+    # Step 3: Process each protein sequence
+    if st.button("Process Protein Sequences"):
+        results = []
         # The columns 'Protein_ID', 'Protein_Name' and 'Protein_Sequence' are read
         # by name below.
         # NOTE(review): nothing checks that these columns exist or that each
         # sequence cell holds a string -- confirm whether validation is needed.
+        for index, row in df.iterrows():
+            protein_id = row["Protein_ID"]
+            protein_name = row["Protein_Name"]
+            sequence = row["Protein_Sequence"]  # Assuming the protein sequence is in a column named 'Protein_Sequence'
+            # Process the protein sequence
+            repeats = process_protein_sequence(sequence)
+            # Prepare data for MongoDB
             # NOTE(review): values taken from the row may be pandas/numpy scalar
             # types; verify that the MongoDB driver accepts them as-is.
+            result_data = {
+                "protein_id": protein_id,
+                "protein_name": protein_name,
+                "protein_sequence": sequence,
+                "calculated_repeats": repeats
+            }
+            # Insert results into MongoDB
             # One document is written per row, as soon as that row is processed.
+            results_collection.insert_one(result_data)
+            # Add results to display
+            results.append({
+                "Protein ID": protein_id,
+                "Protein Name": protein_name,
+                "Repeats": repeats
+            })
+        # Step 4: Display the results
+        st.subheader("Protein Sequences Processed")
+        st.write(results)
+        st.success("Protein sequences processed and results stored in MongoDB.")