# Spaces: Sleeping  (page-status residue, not program code)
import sklearn | |
import streamlit as st | |
import pandas as pd | |
import re | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
from sklearn.preprocessing import MultiLabelBinarizer | |
from sklearn.decomposition import TruncatedSVD | |
from sklearn.model_selection import train_test_split | |
from sklearn.linear_model import LogisticRegression | |
from sklearn.multiclass import OneVsRestClassifier | |
from sklearn.metrics import classification_report | |
# Page title shown above the tab bar.
st.title("Stack Overflow Tag Predictor")

# Tab labels, in display order.
# NOTE(review): the leading 'π…' glyphs in the UI strings look like
# mis-decoded emoji — confirm the file's encoding before touching them.
_tab_labels = ["π Business Problem & Goal", "π EDA & Modeling"]
tab1, tab2 = st.tabs(_tab_labels)

# ---------------- Tab 1: Business Problem & Goal ----------------
# Static markdown describing the problem statement, the goal and the target
# variable; kept in one literal so the copy can be edited in a single place.
_business_overview_md = """
**π§© Business Problem**
Stack Overflow is one of the largest Q&A platforms for programmers, where users post questions related to programming and software development.
Each question is assigned tags (e.g., `python`, `machine-learning`, `web-development`) that help in categorizing and improving content discoverability.
However, users often:
- Misclassify or skip adding tags
- Make it harder to retrieve relevant questions
- Increase the burden on moderators for cleanup
---
**π― Goal**
Build a **machine learning model** that can automatically predict relevant tags for a Stack Overflow question based on:
- **Title**
- **Description (Body)**
This will:
- Enhance user experience
- Improve search relevance
- Reduce manual tagging effort
---
**π― Target Variable**
- This is a **multi-label classification** task.
- Each question can have **multiple tags**.
- For example: `['python', 'pandas', 'dataframe']`
"""

with tab1:
    st.header("π Business Problem & Goal")
    st.markdown(_business_overview_md)
# ---------------- Tab 2: EDA & Modeling ----------------
# Streamlit re-executes the whole script on every widget interaction (typing in
# the text box, pressing the button). The expensive steps — reading the Excel
# file, fitting the vectorizers and training the classifier — therefore live in
# cached helpers so they run once per distinct input rather than on every click.
# NOTE(review): the leading 'π…' / 'β…' glyphs in the UI strings look like
# mis-decoded emoji — confirm the file's encoding before touching them.

_HTML_TAG_RE = re.compile(r'<[^>]+>')
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')


def _clean_text(text):
    """Normalize one question string for vectorization.

    Lower-cases the text, removes HTML tags, then removes every character that
    is not an ASCII letter or whitespace. The same function is applied to the
    training corpus and to the user's query so both are vectorized identically.
    """
    lowered = str(text).lower()
    without_markup = _HTML_TAG_RE.sub('', lowered)
    return _NON_LETTER_RE.sub('', without_markup)


def _split_tags(raw_tags):
    """Split a comma-separated tag string into a list of trimmed, non-empty tags."""
    return [tag.strip() for tag in str(raw_tags).split(",") if tag.strip()]


@st.cache_data(show_spinner=False)
def _load_dataset(path):
    """Read the Excel workbook at ``path`` into a DataFrame (cached across reruns)."""
    return pd.read_excel(path)


@st.cache_resource(show_spinner=False)
def _build_features(questions, tags):
    """Fit the text/label transformers and return them with the encoded data.

    Args:
        questions: Series of already-cleaned question strings.
        tags: Series of raw comma-separated tag strings, aligned with
            ``questions``.

    Returns:
        Tuple ``(tfidf, svd, mlb, tfidf_shape, X_reduced, y)`` where the first
        three are the fitted transformers, ``tfidf_shape`` is the shape of the
        TF-IDF matrix before reduction, and ``X_reduced`` / ``y`` are the
        model inputs and the binary label matrix.
    """
    tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
    X = tfidf.fit_transform(questions)

    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(tags.map(_split_tags))

    # Cap the component count at the vocabulary size so a small dataset does
    # not make TruncatedSVD fail; seed it so reruns give the same projection.
    svd = TruncatedSVD(n_components=min(100, X.shape[1]), random_state=42)
    X_reduced = svd.fit_transform(X)
    return tfidf, svd, mlb, X.shape, X_reduced, y


@st.cache_resource(show_spinner=False)
def _train_model(features, targets):
    """Train a one-vs-rest logistic regression on an 80% split of the data.

    The remaining 20% is held out (same seed as before) but is not evaluated
    in this section.
    """
    X_train, _X_test, y_train, _y_test = train_test_split(
        features, targets, test_size=0.2, random_state=42
    )
    classifier = OneVsRestClassifier(
        LogisticRegression(solver='lbfgs', max_iter=1000, class_weight='balanced')
    )
    classifier.fit(X_train, y_train)
    return classifier


with tab2:
    st.header("π EDA & Modeling")

    # Load dataset
    df = _load_dataset(r"stack3.xlsx")
    st.success("β Data loaded successfully!")

    # Dataset Overview (computed on the raw data, before any cleaning)
    st.subheader("π Dataset Overview")
    st.write(f"Shape of the dataset: {df.shape}")
    st.dataframe(df.head())
    st.write("Missing values in each column:")
    st.write(df.isna().sum())
    st.write(f"Number of duplicate rows: {df.duplicated().sum()}")

    # Data Cleaning
    st.subheader("π§Ή Data Cleaning")
    df = df.drop_duplicates(ignore_index=True)
    # Rows without a question or without tags cannot be vectorized/binarized
    # (NaN breaks both TfidfVectorizer and MultiLabelBinarizer), so drop them.
    df = df.dropna(subset=["question", "tags"]).reset_index(drop=True)
    df["clean_question"] = df["question"].map(_clean_text)
    df["tag_list"] = df["tags"].map(_split_tags)
    st.write("Sample cleaned question:")
    st.write(df["clean_question"].iloc[0])

    # Feature Extraction, Target Processing and Dimensionality Reduction
    st.subheader("π Feature Extraction with TF-IDF")
    tfidf, svd, mlb, tfidf_shape, X_reduced, y = _build_features(
        df["clean_question"], df["tags"]
    )
    st.write(f"TF-IDF matrix shape: {tfidf_shape}")
    st.write(f"Number of unique tags: {len(mlb.classes_)}")

    # Model Training
    st.subheader("π€ Model Training (Logistic Regression)")
    with st.spinner("Training the model..."):
        model = _train_model(X_reduced, y)
    st.success("β Model trained successfully!")

    # Prediction Demo
    st.subheader("π§ͺ Try it Out: Tag Prediction")
    user_question = st.text_input("Enter a Stack Overflow question (title + description):")
    if st.button("Predict Tags"):
        clean_input = _clean_text(user_question)
        if not clean_input.strip():
            # Nothing left to vectorize; an all-zero vector would only yield
            # the classifier's bias-driven guess, so ask for real input.
            st.warning("Please enter a question containing some text.")
        else:
            with st.spinner("Predicting..."):
                input_vector = tfidf.transform([clean_input])
                input_reduced = svd.transform(input_vector)
                prediction = model.predict(input_reduced)
                predicted_tags = mlb.inverse_transform(prediction)
            st.write("### π Prediction Result")
            st.write(f"**Input Question:** {user_question}")
            if predicted_tags[0]:
                st.success(f"**Predicted Tags:** {', '.join(predicted_tags[0])}")
            else:
                st.warning("No tags predicted.")