Jayesh13 committed on
Commit
2dc6b65
·
verified ·
1 Parent(s): db9028b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +58 -42
app.py CHANGED
@@ -1,22 +1,17 @@
1
  import os
2
- os.system("pip install streamlit pandas xlsxwriter openpyxl pymongo")
3
 
 
4
  import random
5
  from collections import defaultdict
6
  from pymongo import MongoClient
 
7
 
# MongoDB connection string. Set the MONGODB_URI environment variable to supply
# the real credentials at deploy time instead of editing this file; the literal
# below is only used as a fallback when the variable is not set.
MONGODB_URI = os.environ.get(
    "MONGODB_URI",
    "mongodb+srv://dhruvmangroliya:[email protected]/BTP_DB?retryWrites=true&w=majority",
)
client = MongoClient(MONGODB_URI)

# Access the BTP_DB database and protein_results collection
db = client['BTP_DB']
results_collection = db['protein_results']
14
 
# Function to generate a random protein sequence of given length
def generate_protein_sequence(length):
    """Return a random string of ``length`` one-letter amino-acid codes.

    Each position is drawn independently (with replacement) from the
    alphabet below using the module-level ``random`` generator.
    """
    residues = "ACDEFGHIKLMNPQRSTVWY"  # 20 standard amino acids
    picks = random.choices(residues, k=length)
    return ''.join(picks)
19
-
# Function to fragment the protein sequence into chunks of max length 1000
def fragment_protein_sequence(sequence, max_length=1000):
    """Split ``sequence`` into consecutive chunks of at most ``max_length``.

    Args:
        sequence: The sequence (a string) to split.
        max_length: Maximum chunk size; must be a positive integer.

    Returns:
        The chunks in order. Every chunk except possibly the last has exactly
        ``max_length`` characters. An empty ``sequence`` yields an empty list.

    Raises:
        ValueError: If ``max_length`` is less than 1.
    """
    # A negative step would silently produce no fragments (dropping the whole
    # sequence), and a zero step fails with an unrelated range() message.
    if max_length < 1:
        raise ValueError(f"max_length must be a positive integer, got {max_length!r}")
    return [sequence[start:start + max_length]
            for start in range(0, len(sequence), max_length)]
@@ -24,14 +19,10 @@ def fragment_protein_sequence(sequence, max_length=1000):
# Function to find repeating amino acid sequences
def find_hetero_amino_acid_repeats(sequence):
    """Count every substring of length >= 2 that occurs more than once.

    Occurrences may overlap (``"AAAA"`` contains ``"AA"`` three times).

    Args:
        sequence: The string to scan.

    Returns:
        A dict mapping each repeated substring to its number of occurrences
        in ``sequence``. Empty when nothing repeats.
    """
    repeats = {}
    for length in range(2, len(sequence) + 1):
        counts = defaultdict(int)
        for start in range(len(sequence) - length + 1):
            counts[sequence[start:start + length]] += 1

        repeated = {sub: n for sub, n in counts.items() if n > 1}
        # Every repeated substring of length L+1 begins with a repeated
        # substring of length L, so once one length has no repeats no longer
        # length can have any. Stopping here yields the same result without
        # materialising all O(n^2) substrings of the input.
        if not repeated:
            break
        repeats.update(repeated)
    return repeats
36
 
37
  # Function to check and update repeats at boundaries
@@ -40,70 +31,95 @@ def check_boundary_repeats(fragments, final_repeats, overlap=50):
40
  left_overlap = fragments[i][-overlap:] if len(fragments[i]) >= overlap else fragments[i]
41
  right_overlap = fragments[i + 1][:overlap] if len(fragments[i + 1]) >= overlap else fragments[i + 1]
42
  overlap_region = left_overlap + right_overlap
43
-
44
  boundary_repeats = find_hetero_amino_acid_repeats(overlap_region)
45
-
46
  for substring, count in boundary_repeats.items():
47
  if any(aa in left_overlap for aa in substring) and any(aa in right_overlap for aa in substring):
48
- final_repeats[substring] += count # Only add if spanning both fragments
49
-
50
  return final_repeats
51
 
# Function to find new repeats that only appear at fragmentation points
def find_new_boundary_repeats(fragments, final_repeats, overlap=50):
    """Collect junction-spanning repeats that are not yet in ``final_repeats``.

    For each pair of adjacent fragments, the last ``overlap`` characters of
    the left fragment and the first ``overlap`` characters of the right one
    (or the whole fragment when it is shorter) are joined into a window, and
    the repeats inside that window are computed with
    ``find_hetero_amino_acid_repeats``.

    Args:
        fragments: Ordered list of sequence fragments.
        final_repeats: Mapping of already-known repeats; only read here.
        overlap: Number of characters taken from each side of a junction.

    Returns:
        A ``defaultdict(int)`` mapping each new repeat to the sum of its
        window counts over all junctions where it straddles the junction.
    """
    new_repeats = defaultdict(int)

    for left, right in zip(fragments, fragments[1:]):
        left_overlap = left[-overlap:] if len(left) >= overlap else left
        right_overlap = right[:overlap] if len(right) >= overlap else right
        window = left_overlap + right_overlap
        split = len(left_overlap)  # index of the first right-hand character

        for substring, count in find_hetero_amino_acid_repeats(window).items():
            if substring in final_repeats:
                continue
            # The previous test only checked that some *character* of the
            # substring occurred on each side, which is almost always true.
            # Require a real occurrence that starts left of the junction and
            # ends right of it.
            first_start = max(0, split - len(substring) + 1)
            if any(window.startswith(substring, start)
                   for start in range(first_start, split)):
                new_repeats[substring] += count

    return new_repeats
69
 
# Main function to process the protein sequence
def process_protein_sequence(sequence, overlap=50):
    """Fragment ``sequence`` and accumulate repeat counts over the fragments.

    The sequence is split with ``fragment_protein_sequence`` (default chunk
    size), repeats are counted per fragment, boundary repeats are folded in
    by ``check_boundary_repeats``, and whatever ``find_new_boundary_repeats``
    reports is added on top.

    Args:
        sequence: The protein sequence to analyse.
        overlap: Window size forwarded to both boundary helpers.

    Returns:
        The mapping of repeat -> accumulated count returned by
        ``check_boundary_repeats`` after the final merge.
    """
    pieces = fragment_protein_sequence(sequence)
    totals = defaultdict(int)

    # Repeats found inside each individual fragment
    for piece in pieces:
        for repeat, occurrences in find_hetero_amino_acid_repeats(piece).items():
            totals[repeat] += occurrences

    # Repeats around the fragment junctions
    totals = check_boundary_repeats(pieces, totals, overlap)

    # NOTE(review): check_boundary_repeats appears to insert boundary repeats
    # into the totals already, so this lookup for repeats missing from the
    # totals may never return anything -- confirm the intended order.
    extra = find_new_boundary_repeats(pieces, totals, overlap)
    for repeat, occurrences in extra.items():
        totals[repeat] += occurrences

    return totals
92
 
# Example run: build a random 3000-residue sequence and analyse it
protein_sequence = generate_protein_sequence(3000)
calculated_repeats = process_protein_sequence(protein_sequence)

# Document stored in MongoDB: the sequence together with its repeat counts
data_to_insert = dict(
    protein_sequence=protein_sequence,
    calculated_repeats=calculated_repeats,
)

# Insert the document and report the ID MongoDB assigned to it
insert_result = results_collection.insert_one(data_to_insert)
inserted_id = insert_result.inserted_id
print(f"Data successfully inserted with ID: {inserted_id}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
+ os.system("pip install streamlit pandas xlsxwriter openpyxl")
3
 
4
+ import pandas as pd
5
  import random
6
  from collections import defaultdict
7
  from pymongo import MongoClient
8
+ import streamlit as st
9
 
# MongoDB connection string. Set the MONGODB_URI environment variable to supply
# the real credentials at deploy time instead of editing this file; the literal
# below is only used as a fallback when the variable is not set.
MONGODB_URI = os.environ.get(
    "MONGODB_URI",
    "mongodb+srv://dhruvmangroliya:[email protected]/BTP_DB?retryWrites=true&w=majority",
)
client = MongoClient(MONGODB_URI)

# Handles to the BTP_DB database and its protein_results collection
db = client['BTP_DB']
results_collection = db['protein_results']
14
 
 
 
 
 
 
# Function to fragment the protein sequence into chunks of max length 1000
def fragment_protein_sequence(sequence, max_length=1000):
    """Split ``sequence`` into consecutive chunks of at most ``max_length``.

    Args:
        sequence: The sequence (a string) to split.
        max_length: Maximum chunk size; must be a positive integer.

    Returns:
        The chunks in order. Every chunk except possibly the last has exactly
        ``max_length`` characters. An empty ``sequence`` yields an empty list.

    Raises:
        ValueError: If ``max_length`` is less than 1.
    """
    # A negative step would silently produce no fragments (dropping the whole
    # sequence), and a zero step fails with an unrelated range() message.
    if max_length < 1:
        raise ValueError(f"max_length must be a positive integer, got {max_length!r}")
    return [sequence[start:start + max_length]
            for start in range(0, len(sequence), max_length)]
 
# Function to find repeating amino acid sequences
def find_hetero_amino_acid_repeats(sequence):
    """Count every substring of length >= 2 that occurs more than once.

    Occurrences may overlap (``"AAAA"`` contains ``"AA"`` three times).

    Args:
        sequence: The string to scan.

    Returns:
        A dict mapping each repeated substring to its number of occurrences
        in ``sequence``. Empty when nothing repeats.
    """
    repeats = {}
    for length in range(2, len(sequence) + 1):
        counts = defaultdict(int)
        for start in range(len(sequence) - length + 1):
            counts[sequence[start:start + length]] += 1

        repeated = {sub: n for sub, n in counts.items() if n > 1}
        # Every repeated substring of length L+1 begins with a repeated
        # substring of length L, so once one length has no repeats no longer
        # length can have any. Stopping here yields the same result without
        # materialising all O(n^2) substrings of the input.
        if not repeated:
            break
        repeats.update(repeated)
    return repeats
27
 
28
  # Function to check and update repeats at boundaries
 
31
  left_overlap = fragments[i][-overlap:] if len(fragments[i]) >= overlap else fragments[i]
32
  right_overlap = fragments[i + 1][:overlap] if len(fragments[i + 1]) >= overlap else fragments[i + 1]
33
  overlap_region = left_overlap + right_overlap
 
34
  boundary_repeats = find_hetero_amino_acid_repeats(overlap_region)
 
35
  for substring, count in boundary_repeats.items():
36
  if any(aa in left_overlap for aa in substring) and any(aa in right_overlap for aa in substring):
37
+ final_repeats[substring] += count
 
38
  return final_repeats
39
 
# Function to find new repeats that only appear at fragmentation points
def find_new_boundary_repeats(fragments, final_repeats, overlap=50):
    """Collect junction-spanning repeats that are not yet in ``final_repeats``.

    For each pair of adjacent fragments, the last ``overlap`` characters of
    the left fragment and the first ``overlap`` characters of the right one
    (or the whole fragment when it is shorter) are joined into a window, and
    the repeats inside that window are computed with
    ``find_hetero_amino_acid_repeats``.

    Args:
        fragments: Ordered list of sequence fragments.
        final_repeats: Mapping of already-known repeats; only read here.
        overlap: Number of characters taken from each side of a junction.

    Returns:
        A ``defaultdict(int)`` mapping each new repeat to the sum of its
        window counts over all junctions where it straddles the junction.
    """
    new_repeats = defaultdict(int)

    for left, right in zip(fragments, fragments[1:]):
        left_overlap = left[-overlap:] if len(left) >= overlap else left
        right_overlap = right[:overlap] if len(right) >= overlap else right
        window = left_overlap + right_overlap
        split = len(left_overlap)  # index of the first right-hand character

        for substring, count in find_hetero_amino_acid_repeats(window).items():
            if substring in final_repeats:
                continue
            # The previous test only checked that some *character* of the
            # substring occurred on each side, which is almost always true.
            # Require a real occurrence that starts left of the junction and
            # ends right of it.
            first_start = max(0, split - len(substring) + 1)
            if any(window.startswith(substring, start)
                   for start in range(first_start, split)):
                new_repeats[substring] += count

    return new_repeats
53
 
# Main function to process the protein sequence
def process_protein_sequence(sequence, overlap=50):
    """Fragment ``sequence`` and accumulate repeat counts over the fragments.

    The sequence is split with ``fragment_protein_sequence`` (default chunk
    size), repeats are counted per fragment, boundary repeats are folded in
    by ``check_boundary_repeats``, and whatever ``find_new_boundary_repeats``
    reports is added on top.

    Args:
        sequence: The protein sequence to analyse.
        overlap: Window size forwarded to both boundary helpers.

    Returns:
        The mapping of repeat -> accumulated count returned by
        ``check_boundary_repeats`` after the final merge.
    """
    pieces = fragment_protein_sequence(sequence)
    totals = defaultdict(int)

    # Repeats found inside each individual fragment
    for piece in pieces:
        for repeat, occurrences in find_hetero_amino_acid_repeats(piece).items():
            totals[repeat] += occurrences

    # Repeats around the fragment junctions
    totals = check_boundary_repeats(pieces, totals, overlap)

    # NOTE(review): check_boundary_repeats appears to insert boundary repeats
    # into the totals already, so this lookup for repeats missing from the
    # totals may never return anything -- confirm the intended order.
    extra = find_new_boundary_repeats(pieces, totals, overlap)
    for repeat, occurrences in extra.items():
        totals[repeat] += occurrences

    return totals
76
 
# Streamlit UI for uploading and processing the Excel file
st.title("Protein Sequence Repeat Finder from Excel")

# Columns every uploaded sheet must provide; they are read by name below.
REQUIRED_COLUMNS = ("Protein_ID", "Protein_Name", "Protein_Sequence")

# Step 1: Upload the Excel file
uploaded_file = st.file_uploader("Upload Excel file containing Protein Sequences", type=["xlsx"])

if uploaded_file is not None:
    # Step 2: Read the Excel file using Pandas
    df = pd.read_excel(uploaded_file)

    # Show the first few rows of the uploaded data for preview
    st.write("Preview of Uploaded Data:")
    st.write(df.head())

    missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
    if missing_columns:
        # Report the problem in the UI instead of failing with a KeyError
        # halfway through the processing loop.
        st.error(f"Missing required column(s): {', '.join(missing_columns)}")

    # Step 3: Process each protein sequence
    elif st.button("Process Protein Sequences"):
        results = []
        skipped_rows = []

        for index, row in df.iterrows():
            protein_id = row["Protein_ID"]
            protein_name = row["Protein_Name"]
            sequence = row["Protein_Sequence"]

            # Empty cells are read as NaN (a float), which cannot be sliced
            # into fragments; skip such rows rather than abort the whole run
            # after some documents have already been inserted.
            if not isinstance(sequence, str) or not sequence.strip():
                skipped_rows.append(index)
                continue

            # Process the protein sequence
            repeats = process_protein_sequence(sequence)

            # Prepare data for MongoDB
            result_data = {
                "protein_id": protein_id,
                "protein_name": protein_name,
                "protein_sequence": sequence,
                "calculated_repeats": repeats,
            }

            # Insert results into MongoDB
            results_collection.insert_one(result_data)

            # Add results to display
            results.append({
                "Protein ID": protein_id,
                "Protein Name": protein_name,
                "Repeats": repeats,
            })

        # Step 4: Display the results
        st.subheader("Protein Sequences Processed")
        st.write(results)

        if skipped_rows:
            st.warning(
                f"Skipped {len(skipped_rows)} row(s) without a valid protein sequence: {skipped_rows}"
            )

        st.success("Protein sequences processed and results stored in MongoDB.")