|
import streamlit as st
|
|
import pandas as pd
|
|
import matplotlib.pyplot as plt
|
|
from collections import Counter
|
|
import joblib
|
|
import logomaker
|
|
from sklearn.feature_extraction.text import CountVectorizer
|
|
|
|
|
|
# Load the persisted classifier and text vectorizer with joblib, in that
# order. Both paths are relative to the process's working directory.
model, vectorizer = [
    joblib.load(artifact_path)
    for artifact_path in (
        "Model/naive_bayes_model.pkl",
        "Model/count_vectorizer.pkl",
    )
]
|
|
|
|
|
|
# Display names for the integer class labels. A name's position in the list
# is its label, so the resulting dict maps 0..6 to the names below.
class_mappings = dict(
    enumerate(
        [
            "G Protein Coupled Receptors",
            "Tyrosine Kinase",
            "Tyrosine Phosphatase",
            "Synthetase",
            "Synthase",
            "Ion Channel",
            "Transcription Factor",
        ]
    )
)
|
|
|
|
|
|
def get_kmers(sequence, size=6):
    """Return every overlapping window of ``size`` items from ``sequence``.

    Windows start at each consecutive index, so neighbouring windows overlap
    by ``size - 1`` items. A sequence shorter than ``size`` yields an empty
    list.
    """
    window_count = len(sequence) - size + 1
    windows = []
    for start in range(window_count):
        windows.append(sequence[start:start + size])
    return windows
|
|
|
|
|
|
# Streamlit page: read an uploaded sequence file, show its nucleotide counts
# and most frequent 6-mers, classify it with the loaded model, and draw a
# sequence logo for short inputs.
#
# NOTE(review): the symbols prefixed to several headings below look like
# mis-encoded emoji; confirm the file's encoding before changing them.
st.title("𧬠DNA Sequence Classifier")

st.sidebar.header("Input Options")
uploaded_file = st.sidebar.file_uploader("Upload DNA Sequence File (.txt)", type=["txt"])

sequence = ""
decode_failed = False
if uploaded_file:
    try:
        # "utf-8-sig" drops a leading byte-order mark; otherwise the BOM
        # would hide a first-line ">" header from the filter below and the
        # header text would end up inside the sequence.
        raw = uploaded_file.read().decode("utf-8-sig")
    except UnicodeDecodeError:
        decode_failed = True
    else:
        # Strip each line before the header check so an indented ">" line
        # is still skipped.
        stripped_lines = (line.strip() for line in raw.splitlines())
        sequence = "".join(
            line for line in stripped_lines if not line.startswith(">")
        ).upper()

if not uploaded_file:
    st.info("Please upload a DNA sequence file to begin.")
elif decode_failed:
    st.error("The uploaded file is not valid UTF-8 text.")
elif not sequence:
    # Nothing but headers/blank lines: there is nothing to plot or classify.
    st.warning("The uploaded file does not contain any sequence data.")
else:
    st.subheader("π₯ Input DNA Sequence")
    st.text_area("Sequence (first 1000 characters shown)", sequence[:1000], height=150)

    st.subheader("π¬ Nucleotide Distribution")
    base_counts = Counter(sequence)
    bases = ['A', 'T', 'G', 'C']
    # Counter returns 0 for bases that never occur.
    counts = [base_counts[base] for base in bases]
    fig1, ax1 = plt.subplots()
    ax1.bar(bases, counts, color=['green', 'red', 'blue', 'orange'])
    ax1.set_ylabel("Count")
    st.pyplot(fig1)
    # Figures created via pyplot stay registered until closed; close them so
    # they do not pile up across script reruns.
    plt.close(fig1)

    st.subheader("π Top 10 6-mers")
    kmers = get_kmers(sequence, size=6)
    top_kmers = Counter(kmers).most_common(10)
    df_top = pd.DataFrame(top_kmers, columns=["6-mer", "Count"])
    st.dataframe(df_top)

    st.subheader("π€ Predicted Class")
    if kmers:
        kmers_text = ' '.join(kmers)
        vectorized = vectorizer.transform([kmers_text])
        pred = model.predict(vectorized)[0]
        proba = model.predict_proba(vectorized)[0]
        # predict_proba columns follow model.classes_, so look up the
        # column of the predicted label instead of indexing by the label.
        class_index = list(model.classes_).index(pred)

        st.markdown(f"### 𧬠Class: `{class_mappings[pred]}`")
        st.markdown(f"Confidence: `{proba[class_index]*100:.2f}%`")
    else:
        # Without any 6-mer the feature vector is all zeros, so a
        # prediction would not reflect the input at all.
        st.warning("The sequence is shorter than 6 bases, so there are no 6-mers to classify.")

    # The logo is only drawn for short sequences (at most 100 characters).
    if len(sequence) <= 100:
        st.subheader("π Sequence Logo")
        logo_df = logomaker.alignment_to_matrix([sequence])
        fig2, ax2 = plt.subplots(figsize=(10, 3))
        logomaker.Logo(logo_df, ax=ax2)
        st.pyplot(fig2)
        plt.close(fig2)
|
|
|