File size: 2,683 Bytes
699dac2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 |
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import joblib
import logomaker
from sklearn.feature_extraction.text import CountVectorizer
# Load the persisted model and vectorizer from disk (model first, then vectorizer)
model, vectorizer = (
    joblib.load(artifact_path)
    for artifact_path in (
        "Model/naive_bayes_model.pkl",  # Update path if needed
        "Model/count_vectorizer.pkl",  # Update path if needed
    )
)
# Class mapping: position in the tuple is the integer label used to look up
# the display name of the predicted class.
class_mappings = dict(enumerate((
    "G Protein Coupled Receptors",
    "Tyrosine Kinase",
    "Tyrosine Phosphatase",
    "Synthetase",
    "Synthase",
    "Ion Channel",
    "Transcription Factor",
)))
# Function to extract k-mers
def get_kmers(sequence: str, size: int = 6) -> list:
    """Return every overlapping k-mer of ``sequence``, in order of position.

    Args:
        sequence: The sequence to slice into k-mers.
        size: Length of each k-mer; must be at least 1.

    Returns:
        A list of the substrings of length ``size`` starting at each
        successive position of ``sequence``. The list is empty when
        ``sequence`` is shorter than ``size``.

    Raises:
        ValueError: If ``size`` is less than 1. A non-positive size would
            otherwise yield empty or ragged slices instead of real k-mers.
    """
    if size < 1:
        raise ValueError(f"k-mer size must be a positive integer, got {size!r}")
    last_start = len(sequence) - size
    # range() is empty when last_start is negative, i.e. the sequence is too short
    return [sequence[start:start + size] for start in range(last_start + 1)]
# Page title
# NOTE(review): the emoji prefixes in the UI strings of this section look
# mis-encoded (mojibake). They are kept verbatim here -- confirm the file's
# encoding and restore the intended characters.
st.title("𧬠DNA Sequence Classifier")

# Sidebar
st.sidebar.header("Input Options")
uploaded_file = st.sidebar.file_uploader("Upload DNA Sequence File (.txt)", type=["txt"])

# Read uploaded file
sequence = ""
decode_failed = False
if uploaded_file:
    try:
        raw = uploaded_file.read().decode("utf-8")
    except UnicodeDecodeError:
        # A binary / non-UTF-8 upload is reported to the user below instead of
        # crashing the app with a traceback.
        decode_failed = True
    else:
        # Remove FASTA headers if present. Lines are stripped *before* the
        # ">" check so a header with leading whitespace is still dropped.
        stripped_lines = (line.strip() for line in raw.splitlines())
        sequence = ''.join(
            line for line in stripped_lines if not line.startswith(">")
        ).upper()

if not uploaded_file:
    st.info("Please upload a DNA sequence file to begin.")
elif decode_failed:
    st.error("Could not read the uploaded file: it is not valid UTF-8 text.")
elif not sequence:
    st.warning("The uploaded file does not contain any sequence data.")
else:
    st.subheader("π₯ Input DNA Sequence")
    st.text_area("Sequence (first 1000 characters shown)", sequence[:1000], height=150)

    # Base Distribution
    st.subheader("π¬ Nucleotide Distribution")
    base_counts = Counter(sequence)
    bases = ['A', 'T', 'G', 'C']
    counts = [base_counts.get(base, 0) for base in bases]
    fig1, ax1 = plt.subplots()
    ax1.bar(bases, counts, color=['green', 'red', 'blue', 'orange'])
    ax1.set_ylabel("Count")
    st.pyplot(fig1)
    # Streamlit re-runs the script on every interaction; close the figure so
    # pyplot's global registry does not accumulate one figure per rerun.
    plt.close(fig1)

    kmers = get_kmers(sequence, size=6)
    if not kmers:
        # Without any 6-mers the vectorizer would produce an all-zero row and
        # the reported class would not depend on the uploaded sequence at all.
        st.warning(
            "The sequence is shorter than 6 bases, so no 6-mers could be "
            "extracted and no prediction was made."
        )
    else:
        # Top k-mers
        st.subheader("π Top 10 6-mers")
        top_kmers = Counter(kmers).most_common(10)
        df_top = pd.DataFrame(top_kmers, columns=["6-mer", "Count"])
        st.dataframe(df_top)

        # Prediction
        st.subheader("π€ Predicted Class")
        # The vectorizer receives the k-mers as one space-separated document
        kmers_text = ' '.join(kmers)
        vectorized = vectorizer.transform([kmers_text])
        pred = model.predict(vectorized)[0]
        proba = model.predict_proba(vectorized)[0]
        # predict_proba columns are ordered like model.classes_, which is not
        # guaranteed to be 0..n-1, so look the column up instead of indexing
        # the probabilities with the label itself.
        pred_col = list(model.classes_).index(pred)
        st.markdown(f"### 𧬠Class: `{class_mappings[pred]}`")
        st.markdown(f"Confidence: `{proba[pred_col]*100:.2f}%`")

    # Optional: Show sequence logo (if short enough)
    if len(sequence) <= 100:
        st.subheader("π Sequence Logo")
        logo_df = logomaker.alignment_to_matrix([sequence])
        fig2, ax2 = plt.subplots(figsize=(10, 3))
        logomaker.Logo(logo_df, ax=ax2)
        st.pyplot(fig2)
        plt.close(fig2)
|