ahmadtalha committed on
Commit
699dac2
·
verified ·
1 Parent(s): 3b68c42

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +79 -0
app.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import matplotlib.pyplot as plt
4
+ from collections import Counter
5
+ import joblib
6
+ import logomaker
7
+ from sklearn.feature_extraction.text import CountVectorizer
8
+
9
# Load the trained classifier and the vectorizer it is used with.
# Both paths are relative to the working directory the app is started from;
# update them if the artifacts live elsewhere.
# NOTE: joblib.load is pickle-based and can execute arbitrary code, so only
# load artifact files that come from a trusted source.
# NOTE(review): Streamlit re-runs this script on every interaction, so both
# files are re-read each time; consider caching the loaded objects.
model = joblib.load("Model/naive_bayes_model.pkl")
vectorizer = joblib.load("Model/count_vectorizer.pkl")
12
+
13
# Human-readable names for the integer class labels; the prediction code
# looks labels up here to display a name.
# NOTE(review): presumably these must match the label encoding used when the
# model was trained -- confirm against the training code.
class_mappings = {
    0: "G Protein Coupled Receptors",
    1: "Tyrosine Kinase",
    2: "Tyrosine Phosphatase",
    3: "Synthetase",
    4: "Synthase",
    5: "Ion Channel",
    6: "Transcription Factor",
}
23
+
24
# Function to extract k-mers
def get_kmers(sequence, size=6):
    """Return every overlapping k-mer of ``sequence``, left to right.

    Args:
        sequence: The string to slide the window over.
        size: Window length (k). Must be at least 1.

    Returns:
        A list of ``len(sequence) - size + 1`` substrings of length ``size``,
        or an empty list when ``sequence`` is shorter than ``size``.

    Raises:
        ValueError: If ``size`` is smaller than 1. Without this check a zero
            size would yield ``len(sequence) + 1`` empty strings and a
            negative size would yield meaningless slices.
    """
    if size < 1:
        raise ValueError(f"size must be a positive integer, got {size!r}")
    window_count = len(sequence) - size + 1
    # range() of a non-positive count is empty, which covers short input.
    return [sequence[start:start + size] for start in range(window_count)]
27
+
28
# Page title
st.title("🧬 DNA Sequence Classifier")

# Sidebar: the file uploader is the app's only input. `uploaded_file` is
# falsy until the user has picked a .txt file.
st.sidebar.header("Input Options")
uploaded_file = st.sidebar.file_uploader(
    "Upload DNA Sequence File (.txt)",
    type=["txt"],
)
34
+
35
def _parse_sequence(raw):
    """Join the sequence lines of ``raw`` into one upper-case string.

    Every line is stripped of surrounding whitespace first, then lines that
    start with ``>`` (FASTA headers) are dropped. Stripping before the check
    means an indented header line is still recognised as a header.
    """
    stripped_lines = (line.strip() for line in raw.splitlines())
    return "".join(
        line for line in stripped_lines if not line.startswith(">")
    ).upper()


def _render_report(sequence):
    """Render the preview, base counts, top 6-mers, prediction and logo."""
    st.subheader("📥 Input DNA Sequence")
    st.text_area("Sequence (first 1000 characters shown)", sequence[:1000], height=150)

    # Base Distribution (only A/T/G/C are plotted; other characters are ignored)
    st.subheader("🔬 Nucleotide Distribution")
    base_counts = Counter(sequence)
    bases = ['A', 'T', 'G', 'C']
    counts = [base_counts[base] for base in bases]
    fig1, ax1 = plt.subplots()
    ax1.bar(bases, counts, color=['green', 'red', 'blue', 'orange'])
    ax1.set_ylabel("Count")
    st.pyplot(fig1)
    # pyplot keeps every figure alive until it is closed; without this the
    # figures pile up across Streamlit reruns.
    plt.close(fig1)

    # Top k-mers
    st.subheader("🔠 Top 10 6-mers")
    kmers = get_kmers(sequence, size=6)
    if not kmers:
        # Nothing to count or vectorize; predicting on an empty document
        # would present a meaningless class as if it were a real result.
        st.warning("The sequence is shorter than 6 bases, so it cannot be classified.")
        return
    top_kmers = Counter(kmers).most_common(10)
    df_top = pd.DataFrame(top_kmers, columns=["6-mer", "Count"])
    st.dataframe(df_top)

    # Prediction: the k-mers are joined into one space-separated "document"
    # before being passed to the vectorizer.
    st.subheader("🤖 Predicted Class")
    kmers_text = ' '.join(kmers)
    vectorized = vectorizer.transform([kmers_text])
    pred = model.predict(vectorized)[0]
    proba = model.predict_proba(vectorized)[0]
    # predict_proba columns follow model.classes_, so translate the predicted
    # label into its column position instead of indexing by the label itself.
    pred_column = list(model.classes_).index(pred)
    class_name = class_mappings.get(pred, f"Unknown class ({pred})")

    st.markdown(f"### 🧬 Class: `{class_name}`")
    st.markdown(f"Confidence: `{proba[pred_column]*100:.2f}%`")

    # Optional: Show sequence logo (if short enough)
    if len(sequence) <= 100:
        st.subheader("📈 Sequence Logo")
        logo_df = logomaker.alignment_to_matrix([sequence])
        fig2, ax2 = plt.subplots(figsize=(10, 3))
        logomaker.Logo(logo_df, ax=ax2)
        st.pyplot(fig2)
        plt.close(fig2)


# Read uploaded file
sequence = ""
if uploaded_file:
    try:
        raw = uploaded_file.read().decode("utf-8")
    except UnicodeDecodeError:
        # Report an unreadable upload instead of crashing with a traceback.
        st.error("The uploaded file is not valid UTF-8 text.")
    else:
        # Remove FASTA headers if present
        sequence = _parse_sequence(raw)
        _render_report(sequence)
else:
    st.info("Please upload a DNA sequence file to begin.")