File size: 2,925 Bytes
5cd21b9
 
 
b57c1d3
 
26343ed
b57c1d3
 
 
128ce67
b57c1d3
 
 
 
 
 
128ce67
5223b02
 
 
128ce67
 
 
 
 
b533841
 
128ce67
a0d5a6c
5223b02
 
128ce67
5223b02
 
 
128ce67
 
 
5223b02
128ce67
5223b02
 
 
 
128ce67
 
 
 
5223b02
 
 
b533841
 
128ce67
5223b02
 
 
 
 
a0d5a6c
 
 
5223b02
a0d5a6c
128ce67
a0d5a6c
 
5223b02
128ce67
a0d5a6c
128ce67
a0d5a6c
128ce67
a0d5a6c
 
 
128ce67
26343ed
 
128ce67
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import importlib.util
import os
import subprocess
import sys

# Runtime dependency bootstrap.
# Streamlit re-executes this script on every widget interaction, so pip is
# only invoked for packages that cannot currently be imported. The install
# goes through ``sys.executable -m pip`` so packages land in the interpreter
# that is running this script, and no shell is involved.
_REQUIRED_PACKAGES = ("streamlit", "pandas", "xlsxwriter", "openpyxl")
_missing_packages = [
    package
    for package in _REQUIRED_PACKAGES
    if importlib.util.find_spec(package) is None
]
if _missing_packages:
    # check=False: as before, a failed install is not fatal here; the import
    # statements that follow will report what is actually unavailable.
    subprocess.run(
        [sys.executable, "-m", "pip", "install", *_missing_packages],
        check=False,
    )

import streamlit as st
import pandas as pd
from io import BytesIO

# Page chrome: browser-tab title with the "centered" layout, then the visible
# heading and a short description of the spreadsheet layout the app expects.
st.set_page_config(layout="centered", page_title="Protein Repeat Comparator")
st.title("🧬 Protein Repeat Comparator")
st.write("Upload two Excel files (from 2nd row are frequencies). First column: Protein ID, Second column: Name.")

# Both inputs are restricted to .xlsx workbooks. Each uploader yields None
# until the user has provided a file.
_ACCEPTED_TYPES = ["xlsx"]
uploaded_file1 = st.file_uploader(label="Upload First Excel File", type=_ACCEPTED_TYPES)
uploaded_file2 = st.file_uploader(label="Upload Second Excel File", type=_ACCEPTED_TYPES)

def _find_repeat_differences(df1: pd.DataFrame, df2: pd.DataFrame) -> tuple:
    """Compare repeat frequencies of proteins that appear in both sheets.

    Rows are paired on the first two columns of ``df1`` (protein ID and
    name). When ``df2`` contains several rows for the same pair, the first
    one is used. Rows whose ID or name is missing are never paired. Only
    repeat columns present in both sheets are compared, and a cell that is
    missing on either side is skipped.

    Args:
        df1: Sheet from the first file. Columns 0 and 1 are the ID and name
            columns; every further column is treated as a repeat frequency.
        df2: Sheet from the second file, expected to use the same headers.

    Returns:
        A tuple ``(differences, skipped_columns)``. ``differences`` is a
        DataFrame with one row per differing (protein, repeat) cell, sorted
        by descending absolute difference; it is empty when nothing differs.
        ``skipped_columns`` lists the repeat columns of ``df1`` that ``df2``
        does not have.

    Raises:
        ValueError: If ``df1`` has fewer than two columns, or ``df2`` lacks
            the ID or name column of ``df1``.
    """
    # Work on copies so the caller's frames keep their original headers.
    df1 = df1.copy()
    df2 = df2.copy()
    df1.columns = df1.columns.astype(str)
    df2.columns = df2.columns.astype(str)

    if len(df1.columns) < 2:
        raise ValueError(
            "The first file needs at least an ID column and a Name column."
        )
    id_col, name_col = df1.columns[:2]

    absent_keys = [col for col in (id_col, name_col) if col not in df2.columns]
    if absent_keys:
        raise ValueError(
            "The second file is missing the column(s): " + ", ".join(absent_keys)
        )

    candidate_cols = list(df1.columns[2:])  # all other columns are repeat names
    repeat_cols = [col for col in candidate_cols if col in df2.columns]
    skipped_cols = [col for col in candidate_cols if col not in df2.columns]

    # Index the second sheet once so every lookup is a dictionary access
    # instead of a full scan of df2 for each row of df1.
    second_rows = {}
    for _, row2 in df2.iterrows():
        key = (row2[id_col], row2[name_col])
        if pd.isna(key[0]) or pd.isna(key[1]):
            continue  # missing keys never compare equal, so they cannot match
        second_rows.setdefault(key, row2)  # keep the first occurrence only

    records = []
    for _, row1 in df1.iterrows():
        entry_id = row1[id_col]
        protein_name = row1[name_col]
        if pd.isna(entry_id) or pd.isna(protein_name):
            continue

        row2 = second_rows.get((entry_id, protein_name))
        if row2 is None:
            continue  # protein is absent from the second file

        for repeat in repeat_cols:
            freq1 = row1[repeat]
            freq2 = row2[repeat]

            if pd.isna(freq1) or pd.isna(freq2):
                continue  # skip missing values

            if freq1 != freq2:
                records.append({
                    id_col: entry_id,
                    name_col: protein_name,
                    "Repeat": repeat,
                    "Frequency File 1": freq1,
                    "Frequency File 2": freq2,
                    "Difference": abs(freq1 - freq2),
                })

    if not records:
        empty = pd.DataFrame(columns=[
            id_col,
            name_col,
            "Repeat",
            "Frequency File 1",
            "Frequency File 2",
            "Difference",
        ])
        return empty, skipped_cols

    differences = pd.DataFrame(records).sort_values(by="Difference", ascending=False)
    return differences, skipped_cols


if uploaded_file1 and uploaded_file2:
    try:
        # header=1: the column names are taken from the 2nd row of each sheet.
        first_sheet = pd.read_excel(uploaded_file1, header=1)
        second_sheet = pd.read_excel(uploaded_file2, header=1)

        result_df, skipped_cols = _find_repeat_differences(first_sheet, second_sheet)

        if skipped_cols:
            st.warning(
                "Repeat columns missing from the second file were skipped: "
                + ", ".join(skipped_cols)
            )

        if not result_df.empty:
            # Serialise the result into an in-memory workbook for download.
            output = BytesIO()
            with pd.ExcelWriter(output, engine='openpyxl') as writer:
                result_df.to_excel(writer, index=False)
            output.seek(0)

            st.success("✅ Comparison complete! Only differences are shown below.")
            # Render the table so the message above matches what the user sees.
            st.dataframe(result_df)
            st.download_button(
                label="📥 Download Result Excel",
                data=output,
                file_name="protein_repeat_diff.xlsx",
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
            )
        else:
            st.info("No differences found between the two files.")

    except Exception as e:
        # Top-level UI boundary: report any failure to the user instead of
        # letting the app show a raw traceback.
        st.error(f"⚠️ Error: {e}")