File size: 2,683 Bytes
699dac2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import joblib
import logomaker
from sklearn.feature_extraction.text import CountVectorizer

# Load model and vectorizer.
# Streamlit re-executes this script on every user interaction, so the loader
# is cached to avoid re-reading both pickles from disk on each rerun. When the
# installed Streamlit has no `cache_resource`, fall back to plain loading.
_cache_resource = getattr(st, "cache_resource", lambda func: func)


@_cache_resource
def _load_artifacts():
    """Load the trained classifier and its fitted vectorizer from disk.

    Returns:
        A ``(model, vectorizer)`` tuple.

    NOTE: ``joblib.load`` unpickles its input and can therefore execute
    arbitrary code; only point these paths at trusted files.
    """
    return (
        joblib.load("Model/naive_bayes_model.pkl"),  # Update path if needed
        joblib.load("Model/count_vectorizer.pkl"),  # Update path if needed
    )


model, vectorizer = _load_artifacts()

# Class mapping: integer label -> human-readable class name.
# The position of each name in the tuple is the label it is stored under.
class_mappings = dict(
    enumerate(
        (
            "G Protein Coupled Receptors",
            "Tyrosine Kinase",
            "Tyrosine Phosphatase",
            "Synthetase",
            "Synthase",
            "Ion Channel",
            "Transcription Factor",
        )
    )
)

# Function to extract k-mers
def get_kmers(sequence: str, size: int = 6) -> list:
    """Return every overlapping window of ``size`` characters in ``sequence``.

    Windows are taken at consecutive start positions, so adjacent k-mers
    overlap by ``size - 1`` characters.

    Args:
        sequence: The string to split into k-mers.
        size: Window length; must be at least 1.

    Returns:
        A list of substrings in order of their start position. The list is
        empty when ``sequence`` is shorter than ``size``.

    Raises:
        ValueError: If ``size`` is smaller than 1, since such a window would
            only produce empty or meaningless slices.
    """
    if size < 1:
        raise ValueError(f"size must be a positive integer, got {size!r}")
    last_start = len(sequence) - size
    return [sequence[start:start + size] for start in range(last_start + 1)]

# Page title
st.title("🧬 DNA Sequence Classifier")

# Sidebar
st.sidebar.header("Input Options")
uploaded_file = st.sidebar.file_uploader("Upload DNA Sequence File (.txt)", type=["txt"])

# Window length used for both the k-mer table and the classifier input.
KMER_SIZE = 6

# Read uploaded file
sequence = ""
if uploaded_file is None:
    st.info("Please upload a DNA sequence file to begin.")
    st.stop()

try:
    # getvalue() returns the whole buffer regardless of the current stream
    # position; "utf-8-sig" drops a leading byte-order mark if one is present.
    raw = uploaded_file.getvalue().decode("utf-8-sig")
except UnicodeDecodeError:
    st.error("Could not decode the uploaded file. Please upload a UTF-8 encoded text file.")
    st.stop()

# Remove FASTA headers if present, and drop all whitespace so that stray
# spaces or tabs inside a line never become part of a k-mer.
sequence = "".join(
    "".join(line.split())
    for line in raw.splitlines()
    if not line.lstrip().startswith(">")
).upper()

# Without at least one full k-mer the vectorizer input would be empty and the
# prediction meaningless, so stop before rendering any analysis.
if len(sequence) < KMER_SIZE:
    st.warning(
        f"The uploaded sequence contains {len(sequence)} characters; "
        f"at least {KMER_SIZE} are required for classification."
    )
    st.stop()

st.subheader("📥 Input DNA Sequence")
st.text_area("Sequence (first 1000 characters shown)", sequence[:1000], height=150)

# Base Distribution
st.subheader("🔬 Nucleotide Distribution")
base_counts = Counter(sequence)
bases = ['A', 'T', 'G', 'C']
counts = [base_counts.get(base, 0) for base in bases]
fig1, ax1 = plt.subplots()
ax1.bar(bases, counts, color=['green', 'red', 'blue', 'orange'])
ax1.set_ylabel("Count")
st.pyplot(fig1)
# pyplot keeps every figure alive until it is closed; the script reruns on
# each interaction, so close explicitly to avoid accumulating figures.
plt.close(fig1)

# Top k-mers
st.subheader("🔠 Top 10 6-mers")
kmers = get_kmers(sequence, size=KMER_SIZE)
top_kmers = Counter(kmers).most_common(10)
df_top = pd.DataFrame(top_kmers, columns=["6-mer", "Count"])
st.dataframe(df_top)

# Prediction
st.subheader("🤖 Predicted Class")
kmers_text = ' '.join(kmers)
vectorized = vectorizer.transform([kmers_text])
pred = model.predict(vectorized)[0]
proba = model.predict_proba(vectorized)[0]
# predict_proba columns follow the order of model.classes_, so look up the
# column of the predicted label instead of assuming label == column index.
pred_column = list(model.classes_).index(pred)
# Fall back to a readable placeholder rather than a KeyError if the model
# ever emits a label that has no entry in class_mappings.
class_name = class_mappings.get(pred, f"Unknown class ({pred})")

st.markdown(f"### 🧬 Class: `{class_name}`")
st.markdown(f"Confidence: `{proba[pred_column]*100:.2f}%`")

# Optional: Show sequence logo (if short enough)
if len(sequence) <= 100:
    st.subheader("📈 Sequence Logo")
    logo_df = logomaker.alignment_to_matrix([sequence])
    fig2, ax2 = plt.subplots(figsize=(10, 3))
    logomaker.Logo(logo_df, ax=ax2)
    st.pyplot(fig2)
    plt.close(fig2)