import importlib
import importlib.util
import os
import subprocess
import sys

# Bootstrap the third-party packages this app needs, but only when one of them
# is actually missing. Streamlit re-executes the whole script on every user
# interaction, so an unconditional `pip install` would run on each rerun.
# NOTE(review): declaring these in a requirements file is the more conventional
# approach; this bootstrap is kept so existing deployments keep working.
_REQUIRED_PACKAGES = ("streamlit", "pandas", "xlsxwriter", "openpyxl")
_missing_packages = [
    package
    for package in _REQUIRED_PACKAGES
    if importlib.util.find_spec(package) is None
]
if _missing_packages:
    # Invoke pip through the running interpreter so packages land in the same
    # environment, and pass an argument list so no shell is involved.
    # check=False keeps the best-effort behaviour: a failed install surfaces
    # as an ImportError on the imports that follow.
    subprocess.run(
        [sys.executable, "-m", "pip", "install", *_missing_packages],
        check=False,
    )
    # Make freshly installed packages visible to the import system.
    importlib.invalidate_caches()

import streamlit as st
import pandas as pd
import xlsxwriter
from io import BytesIO
from collections import Counter

# One-letter codes of the 20 standard amino acids.
AMINO_ACIDS = set("ACDEFGHIKLMNPQRSTVWY")

# Page-level configuration, followed by the visible heading.
st.set_page_config(layout="wide", page_title="Amino Acid Percentage Tool")
st.title("🧬 Amino Acid Percentage Analyzer")

# File picker restricted to .xlsx workbooks; the value is falsy until the
# user has chosen a file.
uploaded_file = st.file_uploader(
    "Upload Excel file (with Entry, Protein Name, Sequence)",
    type=["xlsx"],
)

def _clean_sequence(raw_value):
    """Return the standard amino-acid residues found in one spreadsheet cell.

    The cell is converted to text and upper-cased, and every character that is
    not in ``AMINO_ACIDS`` (spaces, quotes, digits, ...) is dropped.

    Missing cells (``None``/``NaN``) yield an empty string. Without this check
    the text form of NaN, ``"nan"``, would be upper-cased to ``"NAN"`` and
    counted as the residues N, A, N.
    """
    if pd.isna(raw_value):
        return ""
    return "".join(c for c in str(raw_value).upper() if c in AMINO_ACIDS)


def _percentages(counts, total):
    """Map each amino acid to its share of ``total`` residues, in percent.

    Keys are emitted in alphabetical order so the report columns are stable
    between runs; iterating the ``AMINO_ACIDS`` set directly gives no
    guaranteed order. ``total`` must be non-zero.
    """
    return {aa: round(counts[aa] / total * 100, 2) for aa in sorted(AMINO_ACIDS)}


def to_excel(df):
    """Write ``df`` to an in-memory .xlsx workbook and return it.

    The workbook has a single sheet named "Amino Acid %" with a bold, shaded
    header row followed by one row per DataFrame row.

    Returns:
        BytesIO: the workbook bytes, positioned at the start of the stream.
    """
    output = BytesIO()
    workbook = xlsxwriter.Workbook(output, {'in_memory': True})
    worksheet = workbook.add_worksheet("Amino Acid %")

    header_format = workbook.add_format({'bold': True, 'bg_color': '#CDEDF6'})

    for col_num, col_name in enumerate(df.columns):
        worksheet.write(0, col_num, col_name, header_format)

    for row_num, row in enumerate(df.itertuples(index=False), start=1):
        for col_num, value in enumerate(row):
            worksheet.write(row_num, col_num, value)

    # Closing the workbook is what actually flushes the file into `output`.
    workbook.close()
    output.seek(0)
    return output


if uploaded_file and st.button("Analyze File"):
    df = pd.read_excel(uploaded_file)

    if len(df.columns) < 3:
        st.error("The file must have at least three columns: Entry, Protein Name, Sequence")
    else:
        # Residue counts and total length accumulated over every usable row.
        all_counts = Counter()
        all_length = 0

        result_rows = []

        # The first three columns are taken positionally as Entry, Protein
        # Name and Sequence, whatever their headers are.
        first_three = df.iloc[:, :3]
        for entry, name, raw_sequence in first_three.itertuples(index=False, name=None):
            sequence = _clean_sequence(raw_sequence)
            if not sequence:
                # Nothing left after cleaning (or the cell was empty): skip.
                continue

            count = Counter(sequence)
            all_counts.update(count)
            all_length += len(sequence)

            result_rows.append({
                "Entry": str(entry),
                "Protein Name": str(name),
                **_percentages(count, len(sequence)),
            })

        if not result_rows:
            # No usable sequences means all_length == 0; report it instead of
            # dividing by zero when computing the overall percentages.
            st.warning("No valid amino acid sequences were found in the third column.")
        else:
            # Overall composition is weighted by residue count, not a mean of
            # the per-protein percentages.
            overall_row = {
                "Entry": "OVERALL",
                "Protein Name": "ALL SEQUENCES",
                **_percentages(all_counts, all_length),
            }
            df_result = pd.DataFrame([*result_rows, overall_row])

            st.dataframe(df_result)

            # Export to Excel
            excel_file = to_excel(df_result)

            # NOTE(review): this button is rendered inside the `st.button`
            # branch, so the rerun triggered by clicking it presumably hides
            # the results again; confirm, and consider st.session_state if so.
            st.download_button(
                label="Download Excel Report",
                data=excel_file,
                file_name="amino_acid_percentage.xlsx",
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
            )