File size: 4,422 Bytes
444a81b
 
 
19a6e11
444a81b
19a6e11
444a81b
19a6e11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
444a81b
 
19a6e11
 
444a81b
 
 
 
 
19a6e11
 
444a81b
19a6e11
444a81b
 
19a6e11
 
444a81b
19a6e11
444a81b
19a6e11
444a81b
 
19a6e11
 
444a81b
 
19a6e11
444a81b
 
19a6e11
444a81b
19a6e11
 
444a81b
19a6e11
444a81b
19a6e11
444a81b
 
 
 
19a6e11
444a81b
 
19a6e11
 
444a81b
5a986b1
19a6e11
 
 
 
 
02632f9
5a986b1
19a6e11
 
 
 
 
 
 
 
 
 
444a81b
 
19a6e11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import os
import subprocess
import sys

# Install the extra packages at start-up. pip is invoked as ``python -m pip``
# through sys.executable so the packages are installed for the interpreter
# that is running this script; a bare ``pip`` looked up on PATH may belong to
# a different Python installation. check=False keeps the install best-effort:
# a non-zero pip exit status is ignored, as it was with os.system.
subprocess.run(
    [
        sys.executable, "-m", "pip", "install",
        "streamlit", "pandas", "xlsxwriter", "openpyxl",
    ],
    check=False,
)

import random
from collections import defaultdict
from pymongo import MongoClient

# MongoDB connection string. Set the MONGODB_URI environment variable to
# supply the real URI (including credentials) without editing this file; the
# literal below is only the fallback used when the variable is unset.
# NOTE(review): the fallback keeps a username/password in source control, and
# its "[email protected]" portion looks like a redaction placeholder rather
# than a real host -- confirm before relying on it.
MONGODB_URI = os.environ.get(
    "MONGODB_URI",
    "mongodb+srv://dhruvmangroliya:[email protected]/BTP_DB?retryWrites=true&w=majority",
)
client = MongoClient(MONGODB_URI)

# Access the BTP_DB database and protein_results collection
db = client['BTP_DB']
results_collection = db['protein_results']

# Function to generate a random protein sequence of given length
def generate_protein_sequence(length):
    """Return a random protein sequence containing ``length`` residues.

    Every residue is drawn independently, with replacement, from the
    one-letter codes of the 20 standard amino acids.
    """
    residue_alphabet = "ACDEFGHIKLMNPQRSTVWY"
    picks = random.choices(residue_alphabet, k=length)
    return "".join(picks)

# Function to fragment the protein sequence into chunks of max length 1000
def fragment_protein_sequence(sequence, max_length=1000):
    """Split ``sequence`` into consecutive pieces of at most ``max_length``.

    Pieces are returned in their original order. Every piece holds exactly
    ``max_length`` items except possibly the last, which holds the remainder.
    An empty sequence produces an empty list.
    """
    pieces = []
    for offset in range(0, len(sequence), max_length):
        pieces.append(sequence[offset:offset + max_length])
    return pieces

# Function to find repeating amino acid sequences
def find_hetero_amino_acid_repeats(sequence):
    """Count every substring of length >= 2 that occurs more than once.

    Occurrences are counted at every start position, so they may overlap
    (``"AAAA"`` contains ``"AA"`` three times).

    Args:
        sequence: The sequence to scan.

    Returns:
        A dict mapping each repeated substring to its number of occurrences.
        Substrings that occur only once are omitted. Keys are ordered by
        length, then by position of first occurrence.
    """
    repeated = {}

    for length in range(2, len(sequence) + 1):
        # Tally the substrings of this one length only, so memory stays
        # proportional to a single pass instead of all lengths at once.
        counts = defaultdict(int)
        for start in range(len(sequence) - length + 1):
            counts[sequence[start:start + length]] += 1

        found = {sub: n for sub, n in counts.items() if n > 1}
        if not found:
            # A repeated substring of length L + 1 always starts with a
            # repeated substring of length L, so once one length has no
            # repeats no longer length can have any either.
            break
        repeated.update(found)

    return repeated

# Function to check and update repeats at boundaries
def check_boundary_repeats(fragments, final_repeats, overlap=50):
    """Add boundary-crossing occurrences to the counts of known repeats.

    A per-fragment scan cannot see an occurrence that starts in one fragment
    and ends in the next. For every pair of adjacent fragments this function
    joins the trailing ``overlap`` items of the left fragment to the leading
    ``overlap`` items of the right fragment, and adds 1 to
    ``final_repeats[s]`` for each occurrence of a key ``s`` that crosses the
    junction between the two.

    Only keys already present in ``final_repeats`` are updated, and only
    junction-crossing occurrences are added: occurrences lying wholly inside
    one fragment are not counted again. Substrings that are not yet keys are
    left alone so that find_new_boundary_repeats can still report them.

    Args:
        fragments: Consecutive pieces of one sequence, in order.
        final_repeats: Mapping of substring -> count; updated in place.
        overlap: Window size taken from each side of a junction. Values
            <= 0 mean no window, so nothing is updated.

    Returns:
        The same ``final_repeats`` object that was passed in.
    """
    if overlap <= 0:
        return final_repeats

    for left_fragment, right_fragment in zip(fragments, fragments[1:]):
        left_window = left_fragment[-overlap:]
        right_window = right_fragment[:overlap]
        window = left_window + right_window
        junction = len(left_window)

        # start < junction < end is exactly the set of slices that take at
        # least one item from each side, i.e. the junction-crossing ones.
        for start in range(junction):
            for end in range(junction + 1, len(window) + 1):
                candidate = window[start:end]
                if candidate in final_repeats:
                    final_repeats[candidate] += 1

    return final_repeats

# Function to find new repeats that only appear at fragmentation points
def find_new_boundary_repeats(fragments, final_repeats, overlap=50):
    """Collect boundary-window repeats that ``final_repeats`` does not hold.

    For each pair of adjacent fragments the trailing window
    ``left[-overlap:]`` and the leading window ``right[:overlap]`` are
    joined and scanned with find_hetero_amino_acid_repeats. A repeat found
    there is kept when it is not a key of ``final_repeats`` and at least one
    of its characters occurs in the trailing window and at least one occurs
    in the leading window. This is a character-membership test; it does not
    verify that an occurrence actually crosses the join.

    Returns:
        A ``defaultdict(int)`` mapping each kept substring to its count in
        the joined window, summed over all boundaries. ``final_repeats`` is
        only read.
    """
    discovered = defaultdict(int)

    for left_fragment, right_fragment in zip(fragments, fragments[1:]):
        tail = left_fragment[-overlap:]
        head = right_fragment[:overlap]
        window_repeats = find_hetero_amino_acid_repeats(tail + head)

        for candidate, occurrences in window_repeats.items():
            if candidate in final_repeats:
                continue
            in_tail = any(residue in tail for residue in candidate)
            in_head = any(residue in head for residue in candidate)
            if in_tail and in_head:
                discovered[candidate] += occurrences

    return discovered

# Main function to process the protein sequence
def process_protein_sequence(sequence, overlap=50, max_length=1000):
    """Count repeated substrings of ``sequence`` fragment by fragment.

    The sequence is cut into fragments of at most ``max_length`` items, each
    fragment is scanned on its own, and the fragment boundaries are then
    examined using windows of ``overlap`` items on each side.

    Args:
        sequence: The protein sequence to analyse.
        overlap: Window size handed to the two boundary helpers.
        max_length: Maximum fragment length. The default of 1000 matches the
            default of fragment_protein_sequence, so existing callers see no
            change; smaller values let short sequences exercise the boundary
            handling.

    Returns:
        The mapping returned by check_boundary_repeats (substring -> count),
        with the counts from find_new_boundary_repeats merged in.
    """
    fragments = fragment_protein_sequence(sequence, max_length=max_length)

    # Step 1: accumulate the repeats found inside each fragment
    final_repeats = defaultdict(int)
    for fragment in fragments:
        for substring, count in find_hetero_amino_acid_repeats(fragment).items():
            final_repeats[substring] += count

    # Step 2: update the counts at fragment boundaries
    final_repeats = check_boundary_repeats(fragments, final_repeats, overlap)

    # Step 3: look for repeats that only show up at the boundaries
    new_repeats = find_new_boundary_repeats(fragments, final_repeats, overlap)

    # Step 4: merge the boundary-only repeats into the result
    for substring, count in new_repeats.items():
        final_repeats[substring] += count

    return final_repeats

# Example run: build a random 3000-residue sequence and analyse it
protein_sequence = generate_protein_sequence(3000)
calculated_repeats = process_protein_sequence(protein_sequence)

# Document to store: the sequence together with its repeat counts
data_to_insert = dict(
    protein_sequence=protein_sequence,
    calculated_repeats=calculated_repeats,
)

# Insert the document into the MongoDB collection and report its ID
insert_result = results_collection.insert_one(data_to_insert)
inserted_id = insert_result.inserted_id
print(f"Data successfully inserted with ID: {inserted_id}")