Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,124 +1,125 @@
|
|
| 1 |
-
import
|
| 2 |
-
import
|
| 3 |
-
import
|
| 4 |
-
|
| 5 |
-
from sklearn.
|
| 6 |
-
from sklearn.
|
| 7 |
-
from sklearn.
|
| 8 |
-
from sklearn.
|
| 9 |
-
from sklearn.
|
| 10 |
-
from sklearn.
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
-
|
| 30 |
-
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
- **
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
-
|
| 42 |
-
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
-
|
| 49 |
-
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
st.
|
| 63 |
-
st.
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
st.write(
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
df
|
| 74 |
-
df["clean_question"] = df["
|
| 75 |
-
df["clean_question"] = df["clean_question"].str.replace(r'[
|
| 76 |
-
df["
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
st.write(
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
model
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
st.write(
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
|
|
|
|
|
| 1 |
+
import sklearn
import streamlit as st
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report

# ---------------- Configuration ----------------

# Candidate dataset locations, tried in order. The first is relative to the
# working directory so the app can run wherever the workbook is shipped next
# to it; the second is the original developer-machine path, kept so existing
# local setups keep working.
DATA_PATHS = (
    "stack3.xlsx",
    r"C:\Users\91879\Downloads\stack3.xlsx",
)

# Columns the script reads from the workbook.
REQUIRED_COLUMNS = ("question", "tags")

# NOTE(review): the source this file was recovered from had its emoji
# mis-decoded. The icons in the two tab labels below, and in the
# "Dataset Overview", "Feature Extraction" and "Prediction Result" headings,
# could not be recovered exactly and are best guesses -- confirm against the
# intended design.
TAB_BUSINESS_LABEL = "📌 Business Problem & Goal"
TAB_MODELING_LABEL = "📊 EDA & Modeling"

# Kept at column 0 on purpose: Markdown renders lines indented by four or
# more spaces as a code block.
BUSINESS_PROBLEM_MD = """
**🧩 Business Problem**
Stack Overflow is one of the largest Q&A platforms for programmers, where users post questions related to programming and software development.
Each question is assigned tags (e.g., `python`, `machine-learning`, `web-development`) that help in categorizing and improving content discoverability.

However, users often:
- Misclassify or skip adding tags
- Make it harder to retrieve relevant questions
- Increase the burden on moderators for cleanup

---

**🎯 Goal**
Build a **machine learning model** that can automatically predict relevant tags for a Stack Overflow question based on:
- **Title**
- **Description (Body)**

This will:
- Enhance user experience
- Improve search relevance
- Reduce manual tagging effort

---

**🎯 Target Variable**
- This is a **multi-label classification** task.
- Each question can have **multiple tags**.
- For example: `['python', 'pandas', 'dataframe']`
"""

_HTML_TAG_RE = re.compile(r'<[^>]+>')
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')


# ---------------- Helpers ----------------

def clean_text(text):
    """Normalise one question for vectorisation.

    Lower-cases the text, removes HTML tags, then removes every character
    that is not an ASCII letter or whitespace. The same function is used for
    the training data and for the user's input so both are preprocessed
    identically.
    """
    lowered = str(text).lower()
    without_markup = _HTML_TAG_RE.sub('', lowered)
    return _NON_LETTER_RE.sub('', without_markup)


def split_tags(raw_tags):
    """Split a comma-separated tag string into a list of trimmed, non-empty tags."""
    return [tag.strip() for tag in str(raw_tags).split(",") if tag.strip()]


@st.cache_data(show_spinner=False)
def load_dataset(path):
    """Read the Excel workbook at *path*; cached so reruns do not re-read it."""
    return pd.read_excel(path)


def load_first_available(paths):
    """Return the dataset from the first path that exists, or None if none do."""
    for candidate in paths:
        try:
            return load_dataset(candidate)
        except FileNotFoundError:
            continue
    return None


@st.cache_resource(show_spinner=False)
def train_pipeline(questions, tag_lists):
    """Fit TF-IDF, label binarizer, SVD and the one-vs-rest classifier.

    Streamlit re-executes the whole script on every widget interaction;
    caching the fitted objects avoids retraining each time the user presses
    the predict button.

    Args:
        questions: tuple of cleaned question strings.
        tag_lists: tuple of tuples of tag strings, aligned with *questions*.

    Returns:
        dict with the fitted ``tfidf``, ``svd``, ``mlb`` and ``model`` objects
        plus ``tfidf_shape``, the shape of the TF-IDF matrix.
    """
    tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
    X = tfidf.fit_transform(questions)

    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(tag_lists)

    # 100 components as before, but never more than the vocabulary allows,
    # so a small dataset does not make TruncatedSVD raise.
    n_components = max(1, min(100, X.shape[1] - 1))
    svd = TruncatedSVD(n_components=n_components, random_state=42)
    X_reduced = svd.fit_transform(X)

    # The held-out 20% is not used by the UI yet; the split is kept so the
    # model is still trained on the same 80% share of the data.
    X_train, _X_test, y_train, _y_test = train_test_split(
        X_reduced, y, test_size=0.2, random_state=42
    )

    model = OneVsRestClassifier(
        LogisticRegression(solver='lbfgs', max_iter=1000, class_weight='balanced')
    )
    model.fit(X_train, y_train)

    return {
        "tfidf": tfidf,
        "svd": svd,
        "mlb": mlb,
        "model": model,
        "tfidf_shape": X.shape,
    }


# ---------------- Page ----------------

# Title
st.title("Stack Overflow Tag Predictor")

# Tabs
tab1, tab2 = st.tabs([TAB_BUSINESS_LABEL, TAB_MODELING_LABEL])

# ---------------- Tab 1: Business Problem & Goal ----------------
with tab1:
    st.header(TAB_BUSINESS_LABEL)
    st.markdown(BUSINESS_PROBLEM_MD)

# ---------------- Tab 2: EDA & Modeling ----------------
with tab2:
    st.header(TAB_MODELING_LABEL)

    # Load dataset
    df = load_first_available(DATA_PATHS)
    if df is None:
        st.error(f"Dataset not found. Looked in: {', '.join(DATA_PATHS)}")
        st.stop()

    absent_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
    if absent_columns:
        st.error(f"Dataset is missing required column(s): {', '.join(absent_columns)}")
        st.stop()

    st.success("✅ Data loaded successfully!")

    # Dataset Overview
    st.subheader("📂 Dataset Overview")
    st.write(f"Shape of the dataset: {df.shape}")
    st.dataframe(df.head())

    st.write("Missing values in each column:")
    st.write(df.isna().sum())

    st.write(f"Number of duplicate rows: {df.duplicated().sum()}")

    # Data Cleaning
    st.subheader("🧹 Data Cleaning")
    df = df.drop_duplicates(ignore_index=True)
    # Rows without a question or tags cannot be vectorised or binarised.
    df = df.dropna(subset=list(REQUIRED_COLUMNS)).reset_index(drop=True)
    if df.empty:
        st.error("No usable rows remain after cleaning.")
        st.stop()

    df["clean_question"] = df["question"].map(clean_text)
    df["tag_list"] = df["tags"].map(split_tags)

    st.write("Sample cleaned question:")
    st.write(df["clean_question"].iloc[0])

    # Feature Extraction
    st.subheader("🔍 Feature Extraction with TF-IDF")
    with st.spinner("Training the model..."):
        # Tuples give the cache a stable, hashable key.
        pipeline = train_pipeline(
            tuple(df["clean_question"]),
            tuple(tuple(tags) for tags in df["tag_list"]),
        )
    tfidf = pipeline["tfidf"]
    svd = pipeline["svd"]
    mlb = pipeline["mlb"]
    model = pipeline["model"]

    st.write(f"TF-IDF matrix shape: {pipeline['tfidf_shape']}")

    # Target Processing
    st.write(f"Number of unique tags: {len(mlb.classes_)}")

    # Model Training
    st.subheader("🤖 Model Training (Logistic Regression)")
    st.success("✅ Model trained successfully!")

    # Prediction Demo
    st.subheader("🧪 Try it Out: Tag Prediction")
    user_question = st.text_input("Enter a Stack Overflow question (title + description):")

    if st.button("Predict Tags"):
        if not user_question.strip():
            st.warning("Please enter a question before predicting.")
        else:
            with st.spinner("Predicting..."):
                clean_input = clean_text(user_question)
                input_vector = tfidf.transform([clean_input])
                input_reduced = svd.transform(input_vector)
                prediction = model.predict(input_reduced)
                predicted_tags = mlb.inverse_transform(prediction)

            st.write("### 🔖 Prediction Result")
            st.write(f"**Input Question:** {user_question}")
            if predicted_tags[0]:
                st.success(f"**Predicted Tags:** {', '.join(predicted_tags[0])}")
            else:
                st.warning("No tags predicted.")
|