# Stackoverflow / app.py
# Sowmith22's picture
# Update app.py
# 2df23b7 verified
import sklearn
import streamlit as st
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report
# Title
st.title("Stack Overflow Tag Predictor")

# Tabs. The labels carry real emoji code points; the previous literals held
# mis-decoded UTF-8 bytes that rendered as garbage characters in the UI.
tab1, tab2 = st.tabs(["📌 Business Problem & Goal", "🔍 EDA & Modeling"])

# ---------------- Tab 1: Business Problem & Goal ----------------
with tab1:
    st.header("📌 Business Problem & Goal")
    # The Markdown text is kept flush-left inside the literal on purpose:
    # Markdown renders lines indented by four spaces as a code block.
    # Blank lines separate headings, paragraphs and lists so that a sentence
    # following a bullet list is not absorbed into the last bullet.
    st.markdown("""
**🧩 Business Problem**

Stack Overflow is one of the largest Q&A platforms for programmers, where users post questions related to programming and software development.
Each question is assigned tags (e.g., `python`, `machine-learning`, `web-development`) that help in categorizing and improving content discoverability.

However, users often:

- Misclassify or skip adding tags
- Make it harder to retrieve relevant questions
- Increase the burden on moderators for cleanup

---

**🎯 Goal**

Build a **machine learning model** that can automatically predict relevant tags for a Stack Overflow question based on:

- **Title**
- **Description (Body)**

This will:

- Enhance user experience
- Improve search relevance
- Reduce manual tagging effort

---

**🎯 Target Variable**

- This is a **multi-label classification** task.
- Each question can have **multiple tags**.
- For example: `['python', 'pandas', 'dataframe']`
""")
# ---------------- Tab 2: EDA & Modeling ----------------
# Streamlit re-executes this whole script on every widget interaction, so the
# expensive steps (reading the workbook, fitting TF-IDF/SVD, training the
# classifier) live in cached helpers instead of running inline on each rerun.
_DATA_PATH = r"stack3.xlsx"
_HTML_TAG_RE = re.compile(r'<[^>]+>')
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')
_SVD_COMPONENTS = 100
_RANDOM_STATE = 42


def _clean_text(text):
    """Lower-case *text*, strip HTML tags, then drop every non-letter character.

    The same routine is applied to the training corpus and to the user's
    query so both are normalised identically before vectorisation.
    """
    lowered = str(text).lower()
    return _NON_LETTER_RE.sub('', _HTML_TAG_RE.sub('', lowered))


def _split_tags(raw_tags):
    """Split a comma-separated tag string into trimmed, non-empty tags."""
    # Trimming keeps "python, pandas" from producing a distinct " pandas" label.
    return [tag.strip() for tag in str(raw_tags).split(",") if tag.strip()]


@st.cache_data(show_spinner=False)
def _load_dataset(path):
    """Read the workbook at *path* into a DataFrame (cached across reruns)."""
    return pd.read_excel(path)


@st.cache_data(show_spinner=False)
def _prepare_dataset(path):
    """Return the cleaned dataset used for modelling.

    Duplicate rows are removed, rows missing a question or tags are dropped
    (NaN would otherwise crash the vectoriser / label binariser), and the
    ``clean_question`` and ``tag_list`` columns are added.
    """
    frame = _load_dataset(path).drop_duplicates(ignore_index=True)
    frame = frame.dropna(subset=["question", "tags"]).reset_index(drop=True)
    return frame.assign(
        clean_question=frame["question"].map(_clean_text),
        tag_list=frame["tags"].map(_split_tags),
    )


@st.cache_resource(show_spinner=False)
def _fit_features(path):
    """Fit TF-IDF, the label binariser and SVD on the dataset at *path*.

    Returns ``(tfidf, svd, mlb, reduced_features, targets, tfidf_shape)``.
    """
    frame = _prepare_dataset(path)
    vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
    features = vectorizer.fit_transform(frame["clean_question"])
    binarizer = MultiLabelBinarizer()
    targets = binarizer.fit_transform(frame["tag_list"])
    # Keep the component count below the vocabulary size so a small dataset
    # does not make TruncatedSVD fail; large vocabularies still get 100.
    n_components = max(1, min(_SVD_COMPONENTS, features.shape[1] - 1))
    # A fixed seed makes the projection (and so the predictions) reproducible.
    reducer = TruncatedSVD(n_components=n_components, random_state=_RANDOM_STATE)
    reduced = reducer.fit_transform(features)
    return vectorizer, reducer, binarizer, reduced, targets, features.shape


@st.cache_resource(show_spinner=False)
def _train_model(path):
    """Train the one-vs-rest logistic regression once per dataset path."""
    _, _, _, reduced, targets, _ = _fit_features(path)
    # The 20% hold-out is kept (though not evaluated here) so the model is
    # trained on the same 80% portion of the rows as before.
    X_train, _X_test, y_train, _y_test = train_test_split(
        reduced, targets, test_size=0.2, random_state=_RANDOM_STATE
    )
    classifier = OneVsRestClassifier(
        LogisticRegression(solver='lbfgs', max_iter=1000, class_weight='balanced')
    )
    classifier.fit(X_train, y_train)
    return classifier


with tab2:
    st.header("🔍 EDA & Modeling")

    # Load dataset
    try:
        raw_df = _load_dataset(_DATA_PATH)
    except FileNotFoundError:
        st.error(f"Dataset file not found: {_DATA_PATH}")
        st.stop()
    st.success("✅ Data loaded successfully!")

    # Dataset Overview (reported on the raw data, before any cleaning)
    st.subheader("🔎 Dataset Overview")
    st.write(f"Shape of the dataset: {raw_df.shape}")
    st.dataframe(raw_df.head())
    st.write("Missing values in each column:")
    st.write(raw_df.isna().sum())
    st.write(f"Number of duplicate rows: {raw_df.duplicated().sum()}")

    # Data Cleaning
    st.subheader("🧹 Data Cleaning")
    df = _prepare_dataset(_DATA_PATH)
    if df.empty:
        st.error("The dataset contains no rows with both a question and tags.")
        st.stop()
    st.write("Sample cleaned question:")
    st.write(df["clean_question"].iloc[0])

    # Feature Extraction, Target Processing and Dimensionality Reduction
    st.subheader("🔠 Feature Extraction with TF-IDF")
    tfidf, svd, mlb, _, _, tfidf_shape = _fit_features(_DATA_PATH)
    st.write(f"TF-IDF matrix shape: {tfidf_shape}")
    st.write(f"Number of unique tags: {len(mlb.classes_)}")

    # Model Training
    st.subheader("🤖 Model Training (Logistic Regression)")
    with st.spinner("Training the model..."):
        model = _train_model(_DATA_PATH)
    st.success("✅ Model trained successfully!")

    # Prediction Demo
    st.subheader("🧪 Try it Out: Tag Prediction")
    user_question = st.text_input("Enter a Stack Overflow question (title + description):")
    if st.button("Predict Tags"):
        clean_input = _clean_text(user_question)
        if not clean_input.strip():
            # An all-zero vector would still yield intercept-driven "tags".
            st.warning("Please enter a question before predicting.")
        else:
            with st.spinner("Predicting..."):
                input_vector = tfidf.transform([clean_input])
                input_reduced = svd.transform(input_vector)
                prediction = model.predict(input_reduced)
                predicted_tags = mlb.inverse_transform(prediction)
            st.write("### 🔍 Prediction Result")
            st.write(f"**Input Question:** {user_question}")
            if predicted_tags[0]:
                st.success(f"**Predicted Tags:** {', '.join(predicted_tags[0])}")
            else:
                st.warning("No tags predicted.")