Upload app.py
Browse files
app.py
ADDED
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
import matplotlib.pyplot as plt
|
4 |
+
from collections import Counter
|
5 |
+
import joblib
|
6 |
+
import logomaker
|
7 |
+
from sklearn.feature_extraction.text import CountVectorizer
|
8 |
+
|
9 |
+
# Load model and vectorizer.
# Streamlit re-executes this script on every widget interaction, so the
# artifacts are loaded through a cached helper instead of being unpickled on
# each rerun.
# NOTE(security): joblib.load unpickles arbitrary Python objects; only point
# these paths at artifacts you trust.
@st.cache_resource
def _load_artifact(path):
    """Return the object stored in the joblib file at *path*."""
    return joblib.load(path)


model = _load_artifact("Model/naive_bayes_model.pkl")  # Update path if needed
vectorizer = _load_artifact("Model/count_vectorizer.pkl")  # Update path if needed
|
12 |
+
|
13 |
+
# Class mapping: integer label -> human-readable name shown in the UI.
# The position of each name in the tuple is its label.
class_mappings = dict(
    enumerate(
        (
            "G Protein Coupled Receptors",
            "Tyrosine Kinase",
            "Tyrosine Phosphatase",
            "Synthetase",
            "Synthase",
            "Ion Channel",
            "Transcription Factor",
        )
    )
)
|
23 |
+
|
24 |
+
# Function to extract k-mers
|
25 |
+
def get_kmers(sequence: str, size: int = 6) -> list:
    """Return every overlapping substring of length *size* in *sequence*.

    The k-mers are returned in order of their start position. A sequence
    shorter than *size* yields an empty list.

    Raises:
        ValueError: if *size* is smaller than 1, since a non-positive window
            would otherwise produce empty or misaligned "k-mers".
    """
    if size < 1:
        raise ValueError(f"k-mer size must be at least 1, got {size}")
    last_start = len(sequence) - size
    return [sequence[start:start + size] for start in range(last_start + 1)]
|
27 |
+
|
28 |
+
# Page title (emoji restored from mis-decoded bytes)
st.title("🧬 DNA Sequence Classifier")

# Sidebar: the only input is a plain-text upload
st.sidebar.header("Input Options")
uploaded_file = st.sidebar.file_uploader("Upload DNA Sequence File (.txt)", type=["txt"])
|
34 |
+
|
35 |
+
# Read uploaded file
sequence = ""
if uploaded_file:
    # "utf-8-sig" also drops a leading byte-order mark, which would otherwise
    # hide the ">" of a FASTA header on the first line and merge the header
    # text into the sequence.
    try:
        raw = uploaded_file.read().decode("utf-8-sig")
    except UnicodeDecodeError:
        st.error("The uploaded file is not valid UTF-8 text.")
        st.stop()

    # Remove FASTA headers if present. The check runs on the stripped line so
    # headers preceded by whitespace are dropped as well.
    stripped_lines = (line.strip() for line in raw.splitlines())
    sequence = "".join(
        line for line in stripped_lines if not line.startswith(">")
    ).upper()

    # Without at least one full k-mer the vectorizer receives an empty
    # document and the "prediction" would not reflect the input at all.
    kmer_size = 6
    if len(sequence) < kmer_size:
        st.warning(
            f"The file contains {len(sequence)} sequence characters; "
            f"at least {kmer_size} are needed to classify it."
        )
        st.stop()

    st.subheader("📥 Input DNA Sequence")
    st.text_area("Sequence (first 1000 characters shown)", sequence[:1000], height=150)

    # Base Distribution (characters other than A/T/G/C are not plotted)
    st.subheader("🔬 Nucleotide Distribution")
    base_counts = Counter(sequence)
    bases = ['A', 'T', 'G', 'C']
    counts = [base_counts.get(base, 0) for base in bases]
    fig1, ax1 = plt.subplots()
    ax1.bar(bases, counts, color=['green', 'red', 'blue', 'orange'])
    ax1.set_ylabel("Count")
    st.pyplot(fig1)
    # pyplot keeps every figure alive until it is closed; the script reruns on
    # each interaction, so release the figure once it has been rendered.
    plt.close(fig1)

    # Top k-mers
    st.subheader("🔍 Top 10 6-mers")
    kmers = get_kmers(sequence, size=kmer_size)
    top_kmers = Counter(kmers).most_common(10)
    df_top = pd.DataFrame(top_kmers, columns=[f"{kmer_size}-mer", "Count"])
    st.dataframe(df_top)

    # Prediction: the k-mers are joined into one space-separated "document"
    # and passed through the loaded vectorizer and model.
    st.subheader("🤖 Predicted Class")
    kmers_text = ' '.join(kmers)
    vectorized = vectorizer.transform([kmers_text])
    pred = model.predict(vectorized)[0]
    proba = model.predict_proba(vectorized)[0]
    # Assumes a scikit-learn classifier: predict_proba columns follow the
    # order of model.classes_, so look the label's column up instead of using
    # the label value itself as an index.
    pred_column = list(model.classes_).index(pred)

    st.markdown(f"### 🧬 Class: `{class_mappings[pred]}`")
    st.markdown(f"Confidence: `{proba[pred_column]*100:.2f}%`")

    # Optional: Show sequence logo (if short enough)
    if len(sequence) <= 100:
        st.subheader("📊 Sequence Logo")
        logo_df = logomaker.alignment_to_matrix([sequence])
        fig2, ax2 = plt.subplots(figsize=(10, 3))
        logomaker.Logo(logo_df, ax=ax2)
        st.pyplot(fig2)
        plt.close(fig2)
else:
    st.info("Please upload a DNA sequence file to begin.")
|