Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,22 +1,17 @@
|
|
1 |
import os
|
2 |
-
os.system("pip install streamlit pandas xlsxwriter openpyxl
|
3 |
|
|
|
4 |
import random
|
5 |
from collections import defaultdict
|
6 |
from pymongo import MongoClient
|
|
|
7 |
|
8 |
# MongoDB connection string is read from the MONGODB_URI environment variable
# so that credentials are never committed together with the source code.
_mongodb_uri = os.environ.get("MONGODB_URI")
if not _mongodb_uri:
    raise RuntimeError(
        "MONGODB_URI is not set; define it with the full mongodb+srv:// connection string."
    )
client = MongoClient(_mongodb_uri)

# Access the BTP_DB database and protein_results collection
db = client['BTP_DB']
results_collection = db['protein_results']
|
14 |
|
15 |
-
# Function to generate a random protein sequence of given length
|
16 |
-
def generate_protein_sequence(length):
    """Return a random protein sequence made of *length* residues.

    Residues are drawn with replacement from the 20 one-letter codes listed
    in ``alphabet`` using ``random.choices``.
    """
    alphabet = "ACDEFGHIKLMNPQRSTVWY"  # 20 standard amino acids
    residues = random.choices(alphabet, k=length)
    return ''.join(residues)
|
19 |
-
|
20 |
# Function to fragment the protein sequence into chunks of max length 1000
|
21 |
def fragment_protein_sequence(sequence, max_length=1000):
    """Split *sequence* into consecutive chunks of at most *max_length* items.

    Args:
        sequence: The sequence to split (any sliceable object with a length).
        max_length: Maximum size of each chunk; must be at least 1.

    Returns:
        A list of slices in their original order. Only the last chunk may be
        shorter than *max_length*; an empty *sequence* yields an empty list.

    Raises:
        ValueError: If *max_length* is smaller than 1.
    """
    # A negative step would make range() empty and silently drop the whole
    # sequence, so reject every non-positive size explicitly.
    if max_length < 1:
        raise ValueError("max_length must be a positive integer")
    return [
        sequence[start:start + max_length]
        for start in range(0, len(sequence), max_length)
    ]
|
@@ -24,14 +19,10 @@ def fragment_protein_sequence(sequence, max_length=1000):
|
|
24 |
# Function to find repeating amino acid sequences
|
25 |
def find_hetero_amino_acid_repeats(sequence):
    """Count every substring of length >= 2 that occurs more than once.

    Occurrences may overlap (for example "AA" is found twice in "AAA").
    No filtering by composition is applied: single-residue runs such as
    "AA" are counted just like mixed substrings.

    Args:
        sequence: The string to scan.

    Returns:
        A dict mapping each repeated substring to its number of occurrences.
        It is empty when nothing repeats or the input is shorter than 2.
    """
    repeats = {}
    total = len(sequence)

    for length in range(2, total + 1):
        window_counts = defaultdict(int)
        for start in range(total - length + 1):
            window_counts[sequence[start:start + length]] += 1

        repeated_here = {
            substring: count
            for substring, count in window_counts.items()
            if count > 1
        }
        if not repeated_here:
            # Any repeated substring of length L + 1 has a repeated prefix of
            # length L, so once one length has no repeats no longer one can.
            break
        repeats.update(repeated_here)

    return repeats
|
36 |
|
37 |
# Function to check and update repeats at boundaries
|
@@ -40,70 +31,95 @@ def check_boundary_repeats(fragments, final_repeats, overlap=50):
|
|
40 |
left_overlap = fragments[i][-overlap:] if len(fragments[i]) >= overlap else fragments[i]
|
41 |
right_overlap = fragments[i + 1][:overlap] if len(fragments[i + 1]) >= overlap else fragments[i + 1]
|
42 |
overlap_region = left_overlap + right_overlap
|
43 |
-
|
44 |
boundary_repeats = find_hetero_amino_acid_repeats(overlap_region)
|
45 |
-
|
46 |
for substring, count in boundary_repeats.items():
|
47 |
if any(aa in left_overlap for aa in substring) and any(aa in right_overlap for aa in substring):
|
48 |
-
final_repeats[substring] += count
|
49 |
-
|
50 |
return final_repeats
|
51 |
|
52 |
# Function to find new repeats that only appear at fragmentation points
|
53 |
def find_new_boundary_repeats(fragments, final_repeats, overlap=50):
    """Collect junction repeats that are not yet present in *final_repeats*.

    For every pair of adjacent fragments, the last *overlap* characters of the
    left fragment are joined with the first *overlap* characters of the right
    fragment (a fragment shorter than *overlap* is used whole). Repeated
    substrings of that window come from find_hetero_amino_acid_repeats.

    Returns:
        A defaultdict(int) mapping substring -> count summed over all
        junctions. *final_repeats* is only read, never modified.
    """
    fresh = defaultdict(int)

    for left_frag, right_frag in zip(fragments, fragments[1:]):
        if len(left_frag) >= overlap:
            left_tail = left_frag[-overlap:]
        else:
            left_tail = left_frag
        if len(right_frag) >= overlap:
            right_head = right_frag[:overlap]
        else:
            right_head = right_frag

        junction_repeats = find_hetero_amino_acid_repeats(left_tail + right_head)

        for candidate, occurrences in junction_repeats.items():
            if candidate in final_repeats:
                continue
            # NOTE(review): this test is per character - it passes when at
            # least one character of the candidate occurs somewhere in each
            # side. It does not check that an occurrence actually straddles
            # the junction; confirm this is the intended condition.
            residues = set(candidate)
            if residues.isdisjoint(left_tail) or residues.isdisjoint(right_head):
                continue
            fresh[candidate] += occurrences

    return fresh
|
69 |
|
70 |
# Main function to process the protein sequence
|
71 |
def process_protein_sequence(sequence, overlap=50):
    """Run the repeat-finding pipeline on one protein sequence.

    The sequence is split with fragment_protein_sequence, repeats are counted
    per fragment, the totals are passed through check_boundary_repeats, and
    finally whatever find_new_boundary_repeats reports is added on top.

    Args:
        sequence: The protein sequence to analyse.
        overlap: Window size forwarded to both boundary helpers.

    Returns:
        The mapping of repeated substring -> accumulated count.
    """
    def _merge(target, counts):
        # Add every count of *counts* onto the running totals in *target*.
        for repeat, occurrences in counts.items():
            target[repeat] += occurrences

    pieces = fragment_protein_sequence(sequence)
    totals = defaultdict(int)

    # Repeats found inside each individual fragment.
    for piece in pieces:
        _merge(totals, find_hetero_amino_acid_repeats(piece))

    # Let the boundary check update the totals.
    totals = check_boundary_repeats(pieces, totals, overlap)

    # Add repeats reported only at the fragment boundaries.
    boundary_only = find_new_boundary_repeats(pieces, totals, overlap)
    _merge(totals, boundary_only)

    return totals
|
92 |
|
93 |
-
#
|
94 |
-
|
95 |
-
|
96 |
-
#
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import os
|
2 |
+
os.system("pip install streamlit pandas xlsxwriter openpyxl")
|
3 |
|
4 |
+
import pandas as pd
|
5 |
import random
|
6 |
from collections import defaultdict
|
7 |
from pymongo import MongoClient
|
8 |
+
import streamlit as st
|
9 |
|
10 |
# MongoDB connection string is read from the MONGODB_URI environment variable
# so that credentials are never committed together with the source code.
_mongodb_uri = os.environ.get("MONGODB_URI")
if not _mongodb_uri:
    raise RuntimeError(
        "MONGODB_URI is not set; define it with the full mongodb+srv:// connection string."
    )
client = MongoClient(_mongodb_uri)

# Access the BTP_DB database and protein_results collection
db = client['BTP_DB']
results_collection = db['protein_results']
|
14 |
|
|
|
|
|
|
|
|
|
|
|
15 |
# Function to fragment the protein sequence into chunks of max length 1000
|
16 |
def fragment_protein_sequence(sequence, max_length=1000):
    """Split *sequence* into consecutive chunks of at most *max_length* items.

    Args:
        sequence: The sequence to split (any sliceable object with a length).
        max_length: Maximum size of each chunk; must be at least 1.

    Returns:
        A list of slices in their original order. Only the last chunk may be
        shorter than *max_length*; an empty *sequence* yields an empty list.

    Raises:
        ValueError: If *max_length* is smaller than 1.
    """
    # A negative step would make range() empty and silently drop the whole
    # sequence, so reject every non-positive size explicitly.
    if max_length < 1:
        raise ValueError("max_length must be a positive integer")
    return [
        sequence[start:start + max_length]
        for start in range(0, len(sequence), max_length)
    ]
|
|
|
19 |
# Function to find repeating amino acid sequences
|
20 |
def find_hetero_amino_acid_repeats(sequence):
    """Count every substring of length >= 2 that occurs more than once.

    Occurrences may overlap (for example "AA" is found twice in "AAA").
    No filtering by composition is applied: single-residue runs such as
    "AA" are counted just like mixed substrings.

    Args:
        sequence: The string to scan.

    Returns:
        A dict mapping each repeated substring to its number of occurrences.
        It is empty when nothing repeats or the input is shorter than 2.
    """
    repeats = {}
    total = len(sequence)

    for length in range(2, total + 1):
        window_counts = defaultdict(int)
        for start in range(total - length + 1):
            window_counts[sequence[start:start + length]] += 1

        repeated_here = {
            substring: count
            for substring, count in window_counts.items()
            if count > 1
        }
        if not repeated_here:
            # Any repeated substring of length L + 1 has a repeated prefix of
            # length L, so once one length has no repeats no longer one can.
            break
        repeats.update(repeated_here)

    return repeats
|
27 |
|
28 |
# Function to check and update repeats at boundaries
|
|
|
31 |
left_overlap = fragments[i][-overlap:] if len(fragments[i]) >= overlap else fragments[i]
|
32 |
right_overlap = fragments[i + 1][:overlap] if len(fragments[i + 1]) >= overlap else fragments[i + 1]
|
33 |
overlap_region = left_overlap + right_overlap
|
|
|
34 |
boundary_repeats = find_hetero_amino_acid_repeats(overlap_region)
|
|
|
35 |
for substring, count in boundary_repeats.items():
|
36 |
if any(aa in left_overlap for aa in substring) and any(aa in right_overlap for aa in substring):
|
37 |
+
final_repeats[substring] += count
|
|
|
38 |
return final_repeats
|
39 |
|
40 |
# Function to find new repeats that only appear at fragmentation points
|
41 |
def find_new_boundary_repeats(fragments, final_repeats, overlap=50):
    """Collect junction repeats that are not yet present in *final_repeats*.

    For every pair of adjacent fragments, the last *overlap* characters of the
    left fragment are joined with the first *overlap* characters of the right
    fragment (a fragment shorter than *overlap* is used whole). Repeated
    substrings of that window come from find_hetero_amino_acid_repeats.

    Returns:
        A defaultdict(int) mapping substring -> count summed over all
        junctions. *final_repeats* is only read, never modified.
    """
    fresh = defaultdict(int)

    for left_frag, right_frag in zip(fragments, fragments[1:]):
        if len(left_frag) >= overlap:
            left_tail = left_frag[-overlap:]
        else:
            left_tail = left_frag
        if len(right_frag) >= overlap:
            right_head = right_frag[:overlap]
        else:
            right_head = right_frag

        junction_repeats = find_hetero_amino_acid_repeats(left_tail + right_head)

        for candidate, occurrences in junction_repeats.items():
            if candidate in final_repeats:
                continue
            # NOTE(review): this test is per character - it passes when at
            # least one character of the candidate occurs somewhere in each
            # side. It does not check that an occurrence actually straddles
            # the junction; confirm this is the intended condition.
            residues = set(candidate)
            if residues.isdisjoint(left_tail) or residues.isdisjoint(right_head):
                continue
            fresh[candidate] += occurrences

    return fresh
|
53 |
|
54 |
# Main function to process the protein sequence
|
55 |
def process_protein_sequence(sequence, overlap=50):
    """Run the repeat-finding pipeline on one protein sequence.

    The sequence is split with fragment_protein_sequence, repeats are counted
    per fragment, the totals are passed through check_boundary_repeats, and
    finally whatever find_new_boundary_repeats reports is added on top.

    Args:
        sequence: The protein sequence to analyse.
        overlap: Window size forwarded to both boundary helpers.

    Returns:
        The mapping of repeated substring -> accumulated count.
    """
    def _merge(target, counts):
        # Add every count of *counts* onto the running totals in *target*.
        for repeat, occurrences in counts.items():
            target[repeat] += occurrences

    pieces = fragment_protein_sequence(sequence)
    totals = defaultdict(int)

    # Find repeats in each fragment
    for piece in pieces:
        _merge(totals, find_hetero_amino_acid_repeats(piece))

    # Check and update repeats at boundaries
    totals = check_boundary_repeats(pieces, totals, overlap)

    # Find new repeats emerging at boundaries and merge them into the totals
    boundary_only = find_new_boundary_repeats(pieces, totals, overlap)
    _merge(totals, boundary_only)

    return totals
|
76 |
|
77 |
+
# Streamlit UI for uploading and processing the Excel file
st.title("Protein Sequence Repeat Finder from Excel")

# Columns that the processing loop below reads by name from every row.
REQUIRED_COLUMNS = ("Protein_ID", "Protein_Name", "Protein_Sequence")

# Step 1: Upload the Excel file
uploaded_file = st.file_uploader("Upload Excel file containing Protein Sequences", type=["xlsx"])

if uploaded_file is not None:
    # Step 2: Read the Excel file using Pandas
    df = pd.read_excel(uploaded_file)

    # Show the first few rows of the uploaded data for preview
    st.write("Preview of Uploaded Data:")
    st.write(df.head())

    # Fail early with a readable message instead of a KeyError mid-run.
    missing_columns = [name for name in REQUIRED_COLUMNS if name not in df.columns]
    if missing_columns:
        st.error(f"Uploaded file is missing required column(s): {', '.join(missing_columns)}")

    # Step 3: Process each protein sequence
    elif st.button("Process Protein Sequences"):
        results = []
        skipped_ids = []

        for _, row in df.iterrows():
            protein_id = row["Protein_ID"]
            protein_name = row["Protein_Name"]
            sequence = row["Protein_Sequence"]

            # Empty Excel cells are read by pandas as NaN (a float), which the
            # repeat finder cannot take the length of; skip such rows.
            if not isinstance(sequence, str) or not sequence:
                skipped_ids.append(protein_id)
                continue

            # Process the protein sequence
            repeats = process_protein_sequence(sequence)

            # Prepare data for MongoDB
            result_data = {
                "protein_id": protein_id,
                "protein_name": protein_name,
                "protein_sequence": sequence,
                "calculated_repeats": repeats,
            }

            # Insert results into MongoDB
            results_collection.insert_one(result_data)

            # Add results to display
            results.append({
                "Protein ID": protein_id,
                "Protein Name": protein_name,
                "Repeats": repeats,
            })

        if skipped_ids:
            st.warning(f"Skipped {len(skipped_ids)} row(s) without a text sequence: {skipped_ids}")

        # Step 4: Display the results
        st.subheader("Protein Sequences Processed")
        st.write(results)

        st.success("Protein sequences processed and results stored in MongoDB.")
|