File size: 3,851 Bytes
3f486f6
cdc95c3
3f486f6
 
 
 
 
 
d4bc620
3f486f6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0c90078
 
 
3f486f6
 
 
cdc95c3
 
 
d4bc620
cdc95c3
 
 
d4bc620
cdc95c3
d4bc620
 
 
 
 
 
 
 
cdc95c3
3f486f6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cdc95c3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import os
import subprocess
import sys

# Best-effort dependency bootstrap. Running pip as `sys.executable -m pip`
# installs into the interpreter that is executing this script (a bare `pip`
# found on PATH may belong to a different Python), and passing an argument
# list avoids going through a shell. check=False keeps the original behaviour
# of carrying on when the install fails; the imports that follow will surface
# any package that is genuinely missing.
subprocess.run(
    [
        sys.executable, "-m", "pip", "install",
        "streamlit", "pandas", "xlsxwriter", "openpyxl", "matplotlib",
    ],
    check=False,
)

import streamlit as st
import pandas as pd
import xlsxwriter
from io import BytesIO
from collections import Counter
import matplotlib.pyplot as plt  # For pie chart

# One-letter codes of the 20 standard amino acids; anything outside this set
# is discarded when sequences are cleaned.
AMINO_ACIDS = {
    "A", "C", "D", "E", "F", "G", "H", "I", "K", "L",
    "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y",
}

# Page-level configuration followed by the visible heading.
st.set_page_config(
    page_title="Amino Acid Percentage Tool",
    layout="wide",
)
st.title("🧬 Amino Acid Percentage Analyzer")

# Only .xlsx uploads are accepted by the widget.
uploaded_file = st.file_uploader(
    "Upload Excel file (with Entry, Protein Name, Sequence)",
    type=["xlsx"],
)

def to_excel(df):
    """Write *df* to an in-memory ``.xlsx`` workbook.

    Row 0 of the single worksheet ("Amino Acid %") holds the column names in
    a bold, shaded header format; the DataFrame rows follow in order.

    Args:
        df: DataFrame to export.

    Returns:
        A ``BytesIO`` rewound to offset 0 that contains the workbook bytes.
    """
    output = BytesIO()
    workbook = xlsxwriter.Workbook(output, {'in_memory': True})
    worksheet = workbook.add_worksheet("Amino Acid %")

    header_format = workbook.add_format({'bold': True, 'bg_color': '#CDEDF6'})

    for col_num, col_name in enumerate(df.columns):
        worksheet.write(0, col_num, col_name, header_format)

    for row_num, row in enumerate(df.itertuples(index=False), start=1):
        for col_num, value in enumerate(row):
            worksheet.write(row_num, col_num, value)

    # The workbook is only written into `output` when it is closed.
    workbook.close()
    output.seek(0)
    return output


# Main flow: read the uploaded workbook, compute per-protein and overall
# amino-acid percentages, then show a table, a pie chart and an Excel export.
if uploaded_file and st.button("Analyze File"):
    df = pd.read_excel(uploaded_file)

    if len(df.columns) < 3:
        st.error("The file must have at least three columns: Entry, Protein Name, Sequence")
    else:
        # AMINO_ACIDS is a set, whose iteration order is arbitrary; fix one
        # order up front so the result columns are the same on every run.
        amino_acid_order = sorted(AMINO_ACIDS)

        all_counts = Counter()
        all_length = 0
        result_rows = []

        # The first three columns are read by position (Entry, Protein Name,
        # Sequence), so duplicate or unexpected header names cannot break the
        # lookup.
        first_three = df.iloc[:, :3]
        for entry, name, raw_sequence in first_three.itertuples(index=False, name=None):
            # An empty cell arrives as NaN; str(NaN).upper() is "NAN", which
            # would otherwise be counted as the residues N, A, N.
            if pd.isna(raw_sequence):
                continue

            # Keeping only recognised one-letter codes also drops spaces,
            # quotes and any other stray characters.
            sequence = ''.join(c for c in str(raw_sequence).upper() if c in AMINO_ACIDS)
            length = len(sequence)
            if length == 0:
                continue

            count = Counter(sequence)
            all_counts.update(count)
            all_length += length

            percentage = {aa: round(count[aa] / length * 100, 2) for aa in amino_acid_order}
            result_rows.append({"Entry": str(entry), "Protein Name": str(name), **percentage})

        if all_length == 0:
            # Without this guard the overall percentage below would divide
            # by zero when no row contains a usable sequence.
            st.warning("No valid amino acid sequences were found in the uploaded file.")
        else:
            overall_percentage = {
                aa: round(all_counts[aa] / all_length * 100, 2) for aa in amino_acid_order
            }
            overall_row = {"Entry": "OVERALL", "Protein Name": "ALL SEQUENCES", **overall_percentage}

            # Overall row first, then one row per protein.
            df_result = pd.DataFrame([overall_row, *result_rows])

            st.dataframe(df_result)

            # 🔵 Pie Chart for Overall Stats
            st.subheader("🧁 Overall Amino Acid Composition (Pie Chart)")

            # Leave out amino acids at 0% to avoid clutter. At least one
            # residue was counted, so at least one slice always remains.
            shown = [(aa, pct) for aa, pct in overall_percentage.items() if pct > 0]
            labels, sizes = zip(*shown)

            fig, ax = plt.subplots(figsize=(6, 6))
            ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90, counterclock=False)
            ax.axis('equal')  # Equal aspect ratio keeps the pie circular.
            st.pyplot(fig)
            # pyplot keeps every figure alive until it is closed explicitly;
            # close it so repeated reruns do not accumulate figures.
            plt.close(fig)

            # Export to Excel
            excel_file = to_excel(df_result)

            # NOTE(review): clicking this button presumably reruns the script,
            # and st.button is then no longer "pressed", so the results above
            # would disappear - confirm and consider st.session_state.
            st.download_button(
                label="Download Excel Report",
                data=excel_file,
                file_name="amino_acid_percentage.xlsx",
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
            )