Spaces:

Sowmith22
/

Stackoverflow

Sleeping

App Files Files Community

Stackoverflow / app.py

Sowmith22

Update app.py

2df23b7 verified 21 days ago

raw

history blame contribute delete

4.55 kB

	import sklearn
	import streamlit as st
	import pandas as pd
	import re
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.preprocessing import MultiLabelBinarizer
	from sklearn.decomposition import TruncatedSVD
	from sklearn.model_selection import train_test_split
	from sklearn.linear_model import LogisticRegression
	from sklearn.multiclass import OneVsRestClassifier
	from sklearn.metrics import classification_report

	# Page title
	st.title("Stack Overflow Tag Predictor")

	# Top-level tabs: the problem statement first, then the EDA/modeling walkthrough.
	tab_labels = ["📌 Business Problem & Goal", "🔍 EDA & Modeling"]
	tab1, tab2 = st.tabs(tab_labels)

	# ---------------- Tab 1: Business Problem & Goal ----------------
	# Static markdown copy describing the problem, the goal and the target variable.
	# Kept in a named variable so the layout code below stays short.
	business_overview_md = """
	🧩 Business Problem
	Stack Overflow is one of the largest Q&A platforms for programmers, where users post questions related to programming and software development.
	Each question is assigned tags (e.g., `python`, `machine-learning`, `web-development`) that help in categorizing and improving content discoverability.

	However, users often:
	- Misclassify or skip adding tags
	- Make it harder to retrieve relevant questions
	- Increase the burden on moderators for cleanup

	---

	🎯 Goal
	Build a machine learning model that can automatically predict relevant tags for a Stack Overflow question based on:
	- Title
	- Description (Body)

	This will:
	- Enhance user experience
	- Improve search relevance
	- Reduce manual tagging effort

	---

	🎯 Target Variable
	- This is a multi-label classification task.
	- Each question can have multiple tags.
	- For example: `['python', 'pandas', 'dataframe']`
	"""

	with tab1:
	    st.header("📌 Business Problem & Goal")
	    st.markdown(business_overview_md)

	# ---------------- Tab 2: EDA & Modeling ----------------
	# Streamlit re-executes this script on every widget interaction, so the
	# expensive steps (reading the workbook, fitting TF-IDF/SVD, training the
	# classifier) live in cached helpers and are computed once per data path
	# instead of on every "Predict Tags" click.
	DATA_PATH = r"stack3.xlsx"
	_HTML_TAG_RE = re.compile(r'<[^>]+>')
	_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')


	def _clean_text(text):
	    """Return *text* lower-cased with HTML tags and non-letter characters removed.

	    Used for both the training corpus and the user's input so that the
	    model sees identically pre-processed text at fit and predict time.
	    """
	    lowered = str(text).lower()
	    without_html = _HTML_TAG_RE.sub('', lowered)
	    return _NON_LETTER_RE.sub('', without_html)


	def _split_tags(raw_tags):
	    """Split a comma-separated tag string into stripped, non-empty tags."""
	    # Stripping keeps "python, pandas" from producing a separate " pandas" label.
	    return [tag.strip() for tag in str(raw_tags).split(",") if tag.strip()]


	@st.cache_data(show_spinner=False)
	def _load_dataset(path):
	    """Read the raw question/tag table from the Excel workbook at *path*."""
	    return pd.read_excel(path)


	@st.cache_data(show_spinner=False)
	def _prepare_dataset(path):
	    """Return the de-duplicated dataset with `clean_question` and `tag_list` columns.

	    Rows missing either `question` or `tags` are dropped: they cannot be
	    vectorised or binarised and would otherwise make fitting fail.
	    """
	    frame = _load_dataset(path).drop_duplicates(ignore_index=True)
	    frame = frame.dropna(subset=["question", "tags"]).reset_index(drop=True)
	    frame["clean_question"] = frame["question"].map(_clean_text)
	    frame["tag_list"] = frame["tags"].map(_split_tags)
	    return frame


	@st.cache_resource(show_spinner=False)
	def _fit_features(path):
	    """Fit TF-IDF, the tag binarizer and SVD on the prepared dataset.

	    Returns:
	        (tfidf, mlb, svd, tfidf_shape, X_reduced, y) where `tfidf_shape` is
	        the shape of the sparse TF-IDF matrix before dimensionality reduction.
	    """
	    frame = _prepare_dataset(path)
	    tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
	    X = tfidf.fit_transform(frame["clean_question"])
	    mlb = MultiLabelBinarizer()
	    y = mlb.fit_transform(frame["tag_list"])
	    # Seeded like the train/test split so the reduced features are reproducible.
	    svd = TruncatedSVD(n_components=100, random_state=42)
	    X_reduced = svd.fit_transform(X)
	    return tfidf, mlb, svd, X.shape, X_reduced, y


	@st.cache_resource(show_spinner=False)
	def _fit_model(path):
	    """Train the one-vs-rest logistic regression on the 80% training split."""
	    _, _, _, _, X_reduced, y = _fit_features(path)
	    # The held-out 20% is not used anywhere in this script, so it is discarded.
	    X_train, _, y_train, _ = train_test_split(X_reduced, y, test_size=0.2, random_state=42)
	    model = OneVsRestClassifier(LogisticRegression(solver='lbfgs', max_iter=1000, class_weight='balanced'))
	    model.fit(X_train, y_train)
	    return model


	with tab2:
	    st.header("🔍 EDA & Modeling")

	    # Load dataset
	    df = _load_dataset(DATA_PATH)
	    st.success("✅ Data loaded successfully!")

	    # Dataset Overview (computed on the raw data, before any cleaning)
	    st.subheader("🔎 Dataset Overview")
	    st.write(f"Shape of the dataset: {df.shape}")
	    st.dataframe(df.head())

	    st.write("Missing values in each column:")
	    st.write(df.isna().sum())

	    st.write(f"Number of duplicate rows: {df.duplicated().sum()}")

	    # Data Cleaning
	    st.subheader("🧹 Data Cleaning")
	    df = _prepare_dataset(DATA_PATH)
	    if df.empty:
	        # Nothing to show or train on; stop instead of failing on .iloc[0] below.
	        st.error("No usable rows left after cleaning the dataset.")
	        st.stop()

	    st.write("Sample cleaned question:")
	    st.write(df["clean_question"].iloc[0])

	    # Feature Extraction
	    st.subheader("🔠 Feature Extraction with TF-IDF")
	    tfidf, mlb, svd, tfidf_shape, _, _ = _fit_features(DATA_PATH)

	    st.write(f"TF-IDF matrix shape: {tfidf_shape}")

	    # Target Processing
	    st.write(f"Number of unique tags: {len(mlb.classes_)}")

	    # Model Training
	    st.subheader("🤖 Model Training (Logistic Regression)")
	    with st.spinner("Training the model..."):
	        model = _fit_model(DATA_PATH)
	    st.success("✅ Model trained successfully!")

	    # Prediction Demo
	    st.subheader("🧪 Try it Out: Tag Prediction")
	    user_question = st.text_input("Enter a Stack Overflow question (title + description):")

	    if st.button("Predict Tags"):
	        # Same cleaning as the training corpus, including HTML-tag removal.
	        clean_input = _clean_text(user_question)
	        if not clean_input.strip():
	            # An empty document carries no signal; do not run it through the model.
	            st.warning("Please enter a question before predicting.")
	        else:
	            with st.spinner("Predicting..."):
	                input_vector = tfidf.transform([clean_input])
	                input_reduced = svd.transform(input_vector)
	                prediction = model.predict(input_reduced)
	                predicted_tags = mlb.inverse_transform(prediction)

	            st.write("### 🔍 Prediction Result")
	            st.write(f"Input Question: {user_question}")
	            if predicted_tags[0]:
	                st.success(f"Predicted Tags: {', '.join(predicted_tags[0])}")
	            else:
	                st.warning("No tags predicted.")