Spaces:

ahmadtalha
/

DNA-Sequence-Classification

Sleeping

App Files Files Community

DNA-Sequence-Classification / app.py

ahmadtalha

Upload app.py

699dac2 verified 10 days ago

raw

history blame contribute delete

2.68 kB

	import streamlit as st
	import pandas as pd
	import matplotlib.pyplot as plt
	from collections import Counter
	import joblib
	import logomaker
	from sklearn.feature_extraction.text import CountVectorizer

	# Load model and vectorizer
	@st.cache_resource
	def _load_artifacts():
	    """Load the pickled classifier and its vectorizer, cached by Streamlit.

	    Streamlit re-executes the whole script on every widget interaction;
	    caching the loaded objects avoids re-reading both files on each rerun.

	    Returns:
	        A ``(model, vectorizer)`` tuple.
	    """
	    # NOTE: joblib.load unpickles arbitrary objects -- only point these
	    # paths at files you trust.
	    return (
	        joblib.load("Model/naive_bayes_model.pkl"),  # Update path if needed
	        joblib.load("Model/count_vectorizer.pkl"),  # Update path if needed
	    )


	model, vectorizer = _load_artifacts()

	# Class mapping: display name for each integer label, keyed by position (0-6)
	class_mappings = dict(enumerate((
	    "G Protein Coupled Receptors",
	    "Tyrosine Kinase",
	    "Tyrosine Phosphatase",
	    "Synthetase",
	    "Synthase",
	    "Ion Channel",
	    "Transcription Factor",
	)))

	# Function to extract k-mers
	def get_kmers(sequence, size=6):
	    """Return every overlapping substring of length ``size`` in ``sequence``.

	    Args:
	        sequence: The string to slide the window over.
	        size: Window length; must be at least 1.

	    Returns:
	        A list of ``len(sequence) - size + 1`` substrings in left-to-right
	        order, or an empty list when ``sequence`` is shorter than ``size``.

	    Raises:
	        ValueError: If ``size`` is smaller than 1 (a non-positive window
	            would otherwise yield empty or wrong-length strings silently).
	    """
	    if size < 1:
	        raise ValueError(f"size must be a positive integer, got {size!r}")
	    window_count = len(sequence) - size + 1
	    return [sequence[start:start + size] for start in range(window_count)]

	# Page title
	st.title("🧬 DNA Sequence Classifier")

	# Sidebar
	st.sidebar.header("Input Options")
	uploaded_file = st.sidebar.file_uploader("Upload DNA Sequence File (.txt)", type=["txt"])


	def _parse_sequence(raw_text):
	    """Return the upper-cased sequence in ``raw_text`` with FASTA headers removed.

	    Lines are stripped before the ``>`` check so that a header line with
	    leading whitespace is still skipped.
	    """
	    stripped_lines = (line.strip() for line in raw_text.splitlines())
	    return "".join(line for line in stripped_lines if not line.startswith(">")).upper()


	def _show_and_close(fig):
	    """Render ``fig`` in the app, then close it so figures do not pile up across reruns."""
	    st.pyplot(fig)
	    plt.close(fig)


	# Read uploaded file
	sequence = ""
	if uploaded_file:
	    try:
	        # "utf-8-sig" also reads plain UTF-8 and drops a leading byte-order
	        # mark, which would otherwise hide a ">" header on the first line.
	        raw = uploaded_file.read().decode("utf-8-sig")
	    except UnicodeDecodeError:
	        raw = None

	    if raw is None:
	        st.error("The uploaded file could not be decoded as UTF-8 text.")
	    else:
	        sequence = _parse_sequence(raw)

	    if raw is not None and not sequence:
	        # Nothing to plot or classify (empty file or headers only).
	        st.warning("No sequence data was found in the uploaded file.")
	    elif sequence:
	        st.subheader("📥 Input DNA Sequence")
	        st.text_area("Sequence (first 1000 characters shown)", sequence[:1000], height=150)

	        # Base Distribution
	        st.subheader("🔬 Nucleotide Distribution")
	        base_counts = Counter(sequence)
	        bases = ['A', 'T', 'G', 'C']
	        counts = [base_counts.get(base, 0) for base in bases]
	        fig1, ax1 = plt.subplots()
	        ax1.bar(bases, counts, color=['green', 'red', 'blue', 'orange'])
	        ax1.set_ylabel("Count")
	        _show_and_close(fig1)

	        kmers = get_kmers(sequence, size=6)
	        if kmers:
	            # Top k-mers
	            st.subheader("🔠 Top 10 6-mers")
	            top_kmers = Counter(kmers).most_common(10)
	            df_top = pd.DataFrame(top_kmers, columns=["6-mer", "Count"])
	            st.dataframe(df_top)

	            # Prediction
	            st.subheader("🤖 Predicted Class")
	            kmers_text = ' '.join(kmers)
	            vectorized = vectorizer.transform([kmers_text])
	            pred = model.predict(vectorized)[0]
	            proba = model.predict_proba(vectorized)[0]
	            # predict_proba columns follow model.classes_, so look the label
	            # up there instead of assuming label == column index.
	            pred_column = list(model.classes_).index(pred)

	            st.markdown(f"### 🧬 Class: `{class_mappings[pred]}`")
	            st.markdown(f"Confidence: `{proba[pred_column]*100:.2f}%`")
	        else:
	            # With no 6-mers the vectorizer would see an empty document and
	            # the classifier's answer would not reflect the input at all.
	            st.warning("The sequence is shorter than 6 characters, so it cannot be classified.")

	        # Optional: Show sequence logo (if short enough)
	        if len(sequence) <= 100:
	            st.subheader("📈 Sequence Logo")
	            logo_df = logomaker.alignment_to_matrix([sequence])
	            fig2, ax2 = plt.subplots(figsize=(10, 3))
	            logomaker.Logo(logo_df, ax=ax2)
	            _show_and_close(fig2)
	else:
	    st.info("Please upload a DNA sequence file to begin.")