# NOTE: residue from the web file viewer this source was copied from
# ("Spaces: Sleeping", "File size: 4,422 Bytes"), folded into a comment so the
# module can be parsed. The viewer's gutter listed the revision ids 444a81b,
# 19a6e11, 5a986b1 and 02632f9 and the line numbers 1-109.
import os
# Shells out to pip each time this module is executed, before the remaining
# imports run. NOTE(review): none of streamlit, pandas, xlsxwriter or openpyxl
# is imported in the visible code, while pymongo (imported below) is not in the
# list - confirm the package list is still what this script needs.
os.system("pip install streamlit pandas xlsxwriter openpyxl")
import random
from collections import defaultdict
from pymongo import MongoClient
# MongoDB connection. The URI is read from the MONGODB_URI environment variable
# when it is set, so credentials do not have to live in source control. The
# original hard-coded URI is kept as the fallback so existing setups keep
# behaving as before.
# NOTE(review): the fallback contains the text "[email protected]" where the
# password and host would normally be - this looks like redaction; confirm the
# real value before relying on the fallback.
MONGODB_URI = os.environ.get("MONGODB_URI") or (
    "mongodb+srv://dhruvmangroliya:[email protected]/BTP_DB?retryWrites=true&w=majority"
)
client = MongoClient(MONGODB_URI)
# Access the BTP_DB database and protein_results collection
db = client['BTP_DB']
results_collection = db['protein_results']
# Function to generate a random protein sequence of given length
def generate_protein_sequence(length, alphabet="ACDEFGHIKLMNPQRSTVWY"):
    """Return a random sequence of ``length`` symbols drawn from ``alphabet``.

    Symbols are sampled independently and with replacement using the
    module-level ``random`` generator, so results are reproducible only if
    the caller seeds ``random`` beforehand.

    Args:
        length: Number of symbols to generate.
        alphabet: Symbols to draw from. Defaults to the 20 one-letter codes
            of the standard amino acids.

    Returns:
        The generated sequence as a ``str``; empty when ``length`` is 0.
    """
    return ''.join(random.choices(alphabet, k=length))
# Function to fragment the protein sequence into chunks of max length 1000
def fragment_protein_sequence(sequence, max_length=1000):
    """Split ``sequence`` into consecutive, non-overlapping chunks.

    Args:
        sequence: The sequence to split.
        max_length: Maximum size of each chunk; only the last chunk may be
            shorter.

    Returns:
        A list of chunks in their original order. An empty sequence yields
        an empty list.

    Raises:
        ValueError: If ``max_length`` is not positive.
    """
    # A negative step would make ``range`` yield nothing and silently drop the
    # whole sequence, so reject it the same way a zero step is rejected.
    if max_length <= 0:
        raise ValueError("max_length must be a positive integer")
    return [
        sequence[start:start + max_length]
        for start in range(0, len(sequence), max_length)
    ]
# Function to find repeating amino acid sequences
def find_hetero_amino_acid_repeats(sequence):
    """Count every substring of length >= 2 that occurs more than once.

    Occurrences may overlap (``"AAA"`` contains ``"AA"`` twice). Despite the
    function name, no filtering by composition is applied: single-residue
    runs such as ``"AA"`` are counted like any other substring.

    Args:
        sequence: The string to scan.

    Returns:
        A dict mapping each repeated substring to its number of occurrences,
        ordered by substring length and then by first occurrence.
    """
    repeats = {}
    total = len(sequence)
    for length in range(2, total + 1):
        counts = defaultdict(int)
        for start in range(total - length + 1):
            counts[sequence[start:start + length]] += 1
        repeated = {sub: seen for sub, seen in counts.items() if seen > 1}
        # Any repeated substring of length L + 1 has a repeated prefix of
        # length L, so once a length has no repeats no longer one can either.
        if not repeated:
            break
        repeats.update(repeated)
    return repeats
# Function to check and update repeats at boundaries
def check_boundary_repeats(fragments, final_repeats, overlap=50):
    """Add repeat occurrences that straddle a fragment boundary.

    For every pair of neighbouring fragments a window is built from the last
    ``overlap`` characters of the left fragment and the first ``overlap``
    characters of the right one. Substrings repeated inside that window are
    looked up with ``find_hetero_amino_acid_repeats``; for each of them only
    the occurrences that start in the left part and end in the right part are
    added, because occurrences lying wholly inside one fragment belong to
    that fragment's own tally.

    Args:
        fragments: Ordered list of sequence fragments.
        final_repeats: Mapping of substring -> count; updated in place.
        overlap: Number of characters taken from each side of a boundary.
            Values <= 0 disable the boundary check.

    Returns:
        The same ``final_repeats`` object that was passed in.
    """
    for left_fragment, right_fragment in zip(fragments, fragments[1:]):
        # ``text[-0:]`` is the whole string, so a zero overlap needs a guard.
        left_overlap = left_fragment[-overlap:] if overlap > 0 else ""
        right_overlap = right_fragment[:overlap] if overlap > 0 else ""
        junction = len(left_overlap)
        overlap_region = left_overlap + right_overlap
        boundary_repeats = find_hetero_amino_acid_repeats(overlap_region)
        for substring in boundary_repeats:
            size = len(substring)
            # Start offsets for which the occurrence begins left of the
            # junction and ends right of it.
            first_start = max(0, junction - size + 1)
            last_start = min(junction - 1, len(overlap_region) - size)
            spanning = sum(
                overlap_region.startswith(substring, start)
                for start in range(first_start, last_start + 1)
            )
            if spanning:
                # ``get`` keeps this working for plain dicts as well.
                final_repeats[substring] = final_repeats.get(substring, 0) + spanning
    return final_repeats
# Function to find new repeats that only appear at fragmentation points
def find_new_boundary_repeats(fragments, final_repeats, overlap=50):
    """Collect repeats that are only visible across a fragment boundary.

    For every pair of neighbouring fragments a window is built from the last
    ``overlap`` characters of the left fragment and the first ``overlap``
    characters of the right one. A substring repeated inside that window is
    reported when it is not already a key of ``final_repeats`` and its
    occurrences reach into both fragments (the first one starts left of the
    junction and the last one ends right of it).

    Args:
        fragments: Ordered list of sequence fragments.
        final_repeats: Mapping of already known repeats; only read.
        overlap: Number of characters taken from each side of a boundary.
            Values <= 0 disable the boundary check.

    Returns:
        A ``defaultdict(int)`` mapping each newly found substring to its
        number of occurrences inside the boundary windows.
    """
    new_repeats = defaultdict(int)
    for left_fragment, right_fragment in zip(fragments, fragments[1:]):
        # ``text[-0:]`` is the whole string, so a zero overlap needs a guard.
        left_overlap = left_fragment[-overlap:] if overlap > 0 else ""
        right_overlap = right_fragment[:overlap] if overlap > 0 else ""
        junction = len(left_overlap)
        overlap_region = left_overlap + right_overlap
        boundary_repeats = find_hetero_amino_acid_repeats(overlap_region)
        for substring, count in boundary_repeats.items():
            if substring in final_repeats:
                continue
            # Positional test: membership of individual characters says
            # nothing about where the substring actually lies.
            starts_in_left = overlap_region.find(substring) < junction
            ends_in_right = (
                overlap_region.rfind(substring) + len(substring) > junction
            )
            if starts_in_left and ends_in_right:
                new_repeats[substring] += count
    return new_repeats
# Main function to process the protein sequence
def process_protein_sequence(sequence, overlap=50, max_length=1000):
    """Find repeated substrings in ``sequence`` by analysing it in fragments.

    The sequence is split into fragments, repeats are counted inside each
    fragment, and the regions around fragment boundaries are then examined
    for repeats the per-fragment pass could not see.

    Note that counting is per fragment: a substring that occurs only once in
    each of two fragments, away from any boundary window, is not reported.

    Args:
        sequence: The full sequence to analyse.
        overlap: Number of characters taken from each side of a fragment
            boundary when looking for boundary repeats.
        max_length: Maximum fragment size passed to
            ``fragment_protein_sequence``.

    Returns:
        A ``defaultdict(int)`` mapping each repeated substring to its count.
    """
    fragments = fragment_protein_sequence(sequence, max_length)

    # Step 1: tally the repeats found inside each individual fragment.
    final_repeats = defaultdict(int)
    for fragment in fragments:
        for substring, count in find_hetero_amino_acid_repeats(fragment).items():
            final_repeats[substring] += count

    # Step 2: update the tally with repeats found at fragment boundaries.
    final_repeats = check_boundary_repeats(fragments, final_repeats, overlap)

    # Step 3: look for repeats at the boundaries that are still unknown after
    # step 2 (the lookup sees the tally as updated by step 2).
    new_repeats = find_new_boundary_repeats(fragments, final_repeats, overlap)

    # Step 4: merge the newly found repeats into the final tally.
    for substring, count in new_repeats.items():
        final_repeats[substring] += count
    return final_repeats
# Demo run: build a random 3000-residue sequence and analyse it.
protein_sequence = generate_protein_sequence(3000)
calculated_repeats = process_protein_sequence(protein_sequence)

# Document to store: the raw sequence plus the substring -> count mapping.
data_to_insert = {
    "protein_sequence": protein_sequence,
    "calculated_repeats": calculated_repeats,
}

# Insert the results into the MongoDB collection and report the new document id.
inserted_id = results_collection.insert_one(data_to_insert).inserted_id
print(f"Data successfully inserted with ID: {inserted_id}")