File size: 4,022 Bytes
c7c7b6a
5a1e198
c7c7b6a
 
5e885f1
 
 
 
 
 
c7c7b6a
 
5e885f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
# Install the third-party packages that the imports below rely on.
# This is a top-level statement, so the command is handed to the shell every
# time this file is executed; the exit status returned by os.system is
# discarded, so a failed install is not reported here.
# NOTE(review): `pip` is resolved through the shell and may not belong to the
# interpreter running this script — confirm, or declare these dependencies in
# a requirements file instead of installing at import time.
import os
os.system("pip install streamlit pandas xlsxwriter")

import streamlit as st
import pandas as pd
import xlsxwriter
from io import BytesIO
from collections import defaultdict



# Function to find repeated amino acids in the protein sequence
def find_homorepeats(protein):
    """Count every run of two or more identical consecutive residues.

    Each maximal run is keyed by its full text (for example ``"AAA"``), so
    runs of the same residue with different lengths are tallied separately.

    Args:
        protein: Sequence string to scan.

    Returns:
        A ``defaultdict(int)`` mapping each run string to the number of
        times a run with exactly that text occurs in ``protein``.
    """
    counts = defaultdict(int)
    total = len(protein)
    start = 0

    while start < total:
        residue = protein[start]

        # Advance `end` to the first position that no longer matches.
        end = start + 1
        while end < total and residue == protein[end]:
            end += 1

        run_length = end - start
        if run_length > 1:  # a single residue is not a repeat
            counts[residue * run_length] += 1

        # Resume scanning at the first residue after this run.
        start = end

    return counts

# Function to process a single CSV file and return its analysis
def process_csv(file):
    """Read one CSV of proteins and tally the homorepeats in each sequence.

    The first three columns are taken, by position, as entry ID, protein
    name and sequence; any further columns are ignored. Double quotes and
    spaces are stripped from each sequence before it is analysed.

    Args:
        file: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        A ``(homorepeats, sequence_data)`` tuple. ``homorepeats`` is the set
        of distinct repeat strings found in the file and ``sequence_data``
        is a list of ``(entry_id, protein_name, freq)`` tuples, one per row,
        where ``freq`` is the mapping returned by ``find_homorepeats``.
        When the file has fewer than three columns an error is shown with
        ``st.error`` and ``(None, None)`` is returned, so a caller that
        unpacks two values does not raise.
    """
    df = pd.read_csv(file)
    if len(df.columns) < 3:
        st.error("Error: The file must have at least three columns: ID, Protein Name, Sequence")
        # Return a pair (not a bare None) so two-value unpacking still works.
        return None, None

    homorepeats = set()
    sequence_data = []

    # Select the columns by position. Indexing a row Series with integer keys
    # (row[0]) is label-based in pandas and only works through a deprecated
    # positional fallback when the CSV header supplies string labels.
    first_three = df.iloc[:, :3]
    for entry_id, protein_name, raw_sequence in first_three.itertuples(index=False, name=None):
        sequence = str(raw_sequence).replace('"', '').replace(' ', '')
        freq = find_homorepeats(sequence)
        homorepeats.update(freq)  # collect the unique repeat strings
        sequence_data.append((str(entry_id), str(protein_name), freq))

    return homorepeats, sequence_data

# Function to generate and download Excel workbook
def create_excel(sequences_data, homorepeats):
    """Build an in-memory Excel workbook of homorepeat counts per protein.

    The sheet has one header row ("Entry ID", "Protein Name", then one
    column per repeat in sorted order) followed by one row per entry in
    ``sequences_data``. Repeats missing from an entry are written as 0.

    Args:
        sequences_data: Iterable of ``(entry_id, protein_name, freq)``
            tuples, where ``freq`` maps repeat strings to counts.
        homorepeats: Collection of repeat strings to use as columns.

    Returns:
        A ``BytesIO`` holding the finished workbook, positioned at offset 0.
    """
    # Sort once so the header and every data row share the same column
    # order, instead of re-sorting the collection for each row.
    repeat_columns = sorted(homorepeats)

    output = BytesIO()
    workbook = xlsxwriter.Workbook(output, {'in_memory': True})
    worksheet = workbook.add_worksheet()

    # Header row
    worksheet.write(0, 0, "Entry ID")
    worksheet.write(0, 1, "Protein Name")
    for col, repeat in enumerate(repeat_columns, start=2):
        worksheet.write(0, col, repeat)

    # One data row per sequence, starting below the header
    for row, (entry_id, protein_name, freq) in enumerate(sequences_data, start=1):
        worksheet.write(row, 0, entry_id)
        worksheet.write(row, 1, protein_name)
        for col, repeat in enumerate(repeat_columns, start=2):
            worksheet.write(row, col, freq.get(repeat, 0))

    # Closing the workbook writes the file contents into `output`.
    workbook.close()
    output.seek(0)
    return output

# Streamlit UI components
st.title("Protein Homorepeat Analysis")

# Step 1: Upload CSV Files
uploaded_files = st.file_uploader("Upload CSV files", accept_multiple_files=True, type=["csv"])

# Step 2: Process files and display results
if uploaded_files:
    all_homorepeats = set()
    all_sequences_data = []
    processed_count = 0  # files that actually passed validation

    for file in uploaded_files:
        result = process_csv(file)
        # A rejected file yields no usable result. Check before unpacking so
        # one bad upload does not abort the whole run with a TypeError.
        if result is None:
            continue
        homorepeats, sequence_data = result
        if homorepeats is None:
            continue

        all_homorepeats.update(homorepeats)
        all_sequences_data.extend(sequence_data)
        processed_count += 1

    if all_sequences_data:
        # Report only the files that were processed, not every upload.
        st.success(f"Processed {processed_count} files successfully!")

        # Step 3: Generate and download the Excel report
        excel_file = create_excel(all_sequences_data, all_homorepeats)

        # Download the Excel file
        st.download_button(
            label="Download Excel file",
            data=excel_file,
            file_name="protein_homorepeat_results.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        )

        # Step 4: Display summary table
        if st.checkbox("Show Results Table"):
            # Sort the repeat columns once rather than once per row.
            repeat_columns = sorted(all_homorepeats)

            # Convert the sequences data into a DataFrame for easy display
            rows = []
            for entry_id, protein_name, freq in all_sequences_data:
                row = {"Entry ID": entry_id, "Protein Name": protein_name}
                row.update({repeat: freq.get(repeat, 0) for repeat in repeat_columns})
                rows.append(row)

            result_df = pd.DataFrame(rows)
            st.dataframe(result_df)