Spaces:
Sleeping
Sleeping
File size: 4,547 Bytes
6464883 2df23b7 6464883 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 |
import sklearn
import streamlit as st
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report
# Page title, rendered above both tabs.
st.title("Stack Overflow Tag Predictor")

# Two tabs: a static problem statement and the interactive EDA/modeling view.
# NOTE(review): the bare "π" prefix on these labels (and on the header below)
# looks like an emoji damaged by a wrong text decode; the bytes that would
# identify it are gone, so those labels are left exactly as found.
tab1, tab2 = st.tabs(["π Business Problem & Goal", "π EDA & Modeling"])

# ---------------- Tab 1: Business Problem & Goal ----------------
with tab1:
    st.header("π Business Problem & Goal")
    # Blank lines separate the Markdown blocks on purpose: without them a
    # plain line that directly follows a list item ("This will:") is absorbed
    # into that item, and the bold headings run into their paragraphs.
    # The text is kept at column 0 so rendering does not depend on dedenting.
    st.markdown("""
**🧩 Business Problem**

Stack Overflow is one of the largest Q&A platforms for programmers, where users post questions related to programming and software development.
Each question is assigned tags (e.g., `python`, `machine-learning`, `web-development`) that help in categorizing and improving content discoverability.

However, users often:

- Misclassify or skip adding tags
- Make it harder to retrieve relevant questions
- Increase the burden on moderators for cleanup

---

**🎯 Goal**

Build a **machine learning model** that can automatically predict relevant tags for a Stack Overflow question based on:

- **Title**
- **Description (Body)**

This will:

- Enhance user experience
- Improve search relevance
- Reduce manual tagging effort

---

**🎯 Target Variable**

- This is a **multi-label classification** task.
- Each question can have **multiple tags**.
- For example: `['python', 'pandas', 'dataframe']`
""")
# ---------------- Tab 2: EDA & Modeling ----------------
# Loads the question/tag spreadsheet, shows basic data-quality figures, cleans
# the text, trains a one-vs-rest logistic-regression tagger on SVD-reduced
# TF-IDF features, and offers a text box for trying a prediction.
# NOTE(review): Streamlit re-executes the script on every widget interaction,
# so pressing "Predict Tags" reloads the data and retrains the model; consider
# caching the fitted pipeline.
# NOTE(review): the bare "π" prefixes on some labels look like emoji damaged
# by a wrong text decode; they are not recoverable and are left as found.
with tab2:
    st.header("π EDA & Modeling")

    # Cleaning patterns shared by the training text and the user's input, so
    # both go through exactly the same preprocessing.
    HTML_TAG_PATTERN = r'<[^>]+>'
    NON_LETTER_PATTERN = r'[^a-zA-Z\s]'

    # Load dataset
    df = pd.read_excel(r"stack3.xlsx")
    st.success("✅ Data loaded successfully!")

    # Dataset Overview
    st.subheader("π Dataset Overview")
    st.write(f"Shape of the dataset: {df.shape}")
    st.dataframe(df.head())
    st.write("Missing values in each column:")
    st.write(df.isna().sum())
    st.write(f"Number of duplicate rows: {df.duplicated().sum()}")

    # Data Cleaning: lowercase, strip HTML tags, then drop every character
    # that is not an ASCII letter or whitespace.
    # NOTE(review): rows with a missing "question" or "tags" value are only
    # reported above, not removed; they would presumably break the vectorizer
    # and the label binarizer below. Confirm the data has none.
    st.subheader("🧹 Data Cleaning")
    df.drop_duplicates(inplace=True, ignore_index=True)
    df["clean_question"] = df["question"].str.lower()
    df["clean_question"] = df["clean_question"].str.replace(HTML_TAG_PATTERN, '', regex=True)
    df["clean_question"] = df["clean_question"].str.replace(NON_LETTER_PATTERN, '', regex=True)
    df["tag_list"] = df["tags"].str.split(",")
    st.write("Sample cleaned question:")
    st.write(df["clean_question"].iloc[0])

    # Feature Extraction
    st.subheader("π Feature Extraction with TF-IDF")
    tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
    X = tfidf.fit_transform(df["clean_question"])
    st.write(f"TF-IDF matrix shape: {X.shape}")

    # Target Processing: one binary indicator column per distinct tag.
    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(df["tag_list"])
    st.write(f"Number of unique tags: {len(mlb.classes_)}")

    # Dimensionality Reduction. The seed is pinned (as for the split below)
    # so that repeated runs produce the same projection.
    svd = TruncatedSVD(n_components=100, random_state=42)
    X_reduced = svd.fit_transform(X)

    # Train-Test Split
    # NOTE(review): the held-out part (X_test, y_test) is not used for any
    # evaluation in this section.
    X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.2, random_state=42)

    # Model Training
    st.subheader("🤖 Model Training (Logistic Regression)")
    with st.spinner("Training the model..."):
        model = OneVsRestClassifier(LogisticRegression(solver='lbfgs', max_iter=1000, class_weight='balanced'))
        model.fit(X_train, y_train)
    st.success("✅ Model trained successfully!")

    # Prediction Demo
    st.subheader("🧪 Try it Out: Tag Prediction")
    user_question = st.text_input("Enter a Stack Overflow question (title + description):")
    if st.button("Predict Tags"):
        if not user_question.strip():
            # Nothing to vectorize; predicting on an all-zero vector would
            # only reflect the model intercepts, not the user's text.
            st.warning("Please enter a question before predicting.")
        else:
            with st.spinner("Predicting..."):
                # Same steps, in the same order, as the training-time cleaning.
                clean_input = re.sub(HTML_TAG_PATTERN, '', user_question.lower())
                clean_input = re.sub(NON_LETTER_PATTERN, '', clean_input)
                input_vector = tfidf.transform([clean_input])
                input_reduced = svd.transform(input_vector)
                prediction = model.predict(input_reduced)
                predicted_tags = mlb.inverse_transform(prediction)
            st.write("### π Prediction Result")
            st.write(f"**Input Question:** {user_question}")
            if predicted_tags[0]:
                st.success(f"**Predicted Tags:** {', '.join(predicted_tags[0])}")
            else:
                st.warning("No tags predicted.")
|