Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,124 +1,125 @@
|
|
1 |
-
import
|
2 |
-
import
|
3 |
-
import
|
4 |
-
|
5 |
-
from sklearn.
|
6 |
-
from sklearn.
|
7 |
-
from sklearn.
|
8 |
-
from sklearn.
|
9 |
-
from sklearn.
|
10 |
-
from sklearn.
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
-
|
30 |
-
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
- **
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
-
|
42 |
-
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
-
|
49 |
-
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
st.
|
63 |
-
st.
|
64 |
-
|
65 |
-
|
66 |
-
st.write(
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
df
|
74 |
-
df["clean_question"] = df["
|
75 |
-
df["clean_question"] = df["clean_question"].str.replace(r'[
|
76 |
-
df["
|
77 |
-
|
78 |
-
|
79 |
-
st.write(
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
model
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
st.write(
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
|
|
|
1 |
+
import re
from pathlib import Path

import pandas as pd
import sklearn
import streamlit as st
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
|
12 |
+
|
13 |
+
# ---------------- Configuration ----------------
# The dataset is looked up next to this script first so the app can run when
# deployed; the original absolute path is kept as a fallback for the machine
# it was developed on.
DATA_FILE = Path(__file__).with_name("stack3.xlsx")
LEGACY_DATA_FILE = Path(r"C:\Users\91879\Downloads\stack3.xlsx")
REQUIRED_COLUMNS = ("question", "tags")

_HTML_TAG_RE = re.compile(r"<[^>]+>")
_NON_LETTER_RE = re.compile(r"[^a-zA-Z\s]")

BUSINESS_PROBLEM_MD = """
**🧩 Business Problem**
Stack Overflow is one of the largest Q&A platforms for programmers, where users post questions related to programming and software development.
Each question is assigned tags (e.g., `python`, `machine-learning`, `web-development`) that help in categorizing and improving content discoverability.

However, users often:
- Misclassify or skip adding tags
- Make it harder to retrieve relevant questions
- Increase the burden on moderators for cleanup

---

**🎯 Goal**
Build a **machine learning model** that can automatically predict relevant tags for a Stack Overflow question based on:
- **Title**
- **Description (Body)**

This will:
- Enhance user experience
- Improve search relevance
- Reduce manual tagging effort

---

**🎯 Target Variable**
- This is a **multi-label classification** task.
- Each question can have **multiple tags**.
- For example: `['python', 'pandas', 'dataframe']`
"""


# ---------------- Helpers ----------------
def clean_text(text):
    """Lower-case *text*, remove HTML tags, then remove every non-letter, non-whitespace character.

    Used for both the training corpus and the user's input so that the
    TF-IDF vocabulary and the prediction-time text are normalized identically.
    """
    lowered = str(text).lower()
    without_html = _HTML_TAG_RE.sub("", lowered)
    return _NON_LETTER_RE.sub("", without_html)


def split_tags(raw):
    """Split a comma-separated tag string into a list of stripped, non-empty tags.

    Non-string values (e.g. NaN from an empty cell) yield an empty list so the
    label binarizer never receives a non-iterable.
    """
    if not isinstance(raw, str):
        return []
    return [tag.strip() for tag in raw.split(",") if tag.strip()]


def resolve_data_path():
    """Return the dataset path as a string: the script-local file if present, else the legacy path."""
    return str(DATA_FILE if DATA_FILE.exists() else LEGACY_DATA_FILE)


@st.cache_data(show_spinner=False)
def load_dataset(path):
    """Read the Excel file at *path*. Cached so Streamlit reruns skip the disk read."""
    return pd.read_excel(path)


@st.cache_data(show_spinner=False)
def prepare_dataset(path):
    """Return the de-duplicated dataset with `clean_question` and `tag_list` columns added."""
    df = load_dataset(path).drop_duplicates(ignore_index=True)
    # fillna first: str(NaN) would otherwise turn missing questions into the token "nan".
    df["clean_question"] = df["question"].fillna("").astype(str).map(clean_text)
    df["tag_list"] = df["tags"].map(split_tags)
    return df


@st.cache_resource(show_spinner=False)
def build_features(path):
    """Fit TF-IDF, the label binarizer and SVD on the prepared dataset.

    Returns a dict holding the fitted transformers (`tfidf`, `mlb`, `svd`),
    the reduced feature matrix (`X_reduced`), the label matrix (`y`) and the
    shape of the un-reduced TF-IDF matrix (`tfidf_shape`).
    """
    df = prepare_dataset(path)

    tfidf = TfidfVectorizer(max_features=5000, stop_words="english")
    tfidf_matrix = tfidf.fit_transform(df["clean_question"])

    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(df["tag_list"])

    # Fixed seed so the reduced features (and therefore the model) are reproducible.
    svd = TruncatedSVD(n_components=100, random_state=42)
    X_reduced = svd.fit_transform(tfidf_matrix)

    return {
        "tfidf": tfidf,
        "mlb": mlb,
        "svd": svd,
        "X_reduced": X_reduced,
        "y": y,
        "tfidf_shape": tfidf_matrix.shape,
    }


@st.cache_resource(show_spinner=False)
def train_model(path):
    """Train the one-vs-rest logistic regression and evaluate it on the 20% hold-out split.

    Returns `(model, report)` where `report` is the text produced by
    `classification_report` for the hold-out rows.
    """
    features = build_features(path)
    X_train, X_test, y_train, y_test = train_test_split(
        features["X_reduced"], features["y"], test_size=0.2, random_state=42
    )

    model = OneVsRestClassifier(
        LogisticRegression(solver="lbfgs", max_iter=1000, class_weight="balanced")
    )
    model.fit(X_train, y_train)

    # zero_division=0 keeps tags that are absent from the hold-out split from raising warnings.
    report = classification_report(
        y_test,
        model.predict(X_test),
        target_names=list(features["mlb"].classes_),
        zero_division=0,
    )
    return model, report


# Title
st.title("Stack Overflow Tag Predictor")

# Tabs
tab1, tab2 = st.tabs(["📌 Business Problem & Goal", "📊 EDA & Modeling"])

# ---------------- Tab 1: Business Problem & Goal ----------------
with tab1:
    st.header("📌 Business Problem & Goal")
    st.markdown(BUSINESS_PROBLEM_MD)

# ---------------- Tab 2: EDA & Modeling ----------------
with tab2:
    st.header("📊 EDA & Modeling")

    # Load dataset
    data_path = resolve_data_path()
    try:
        raw_df = load_dataset(data_path)
    except FileNotFoundError:
        st.error(f"Dataset not found. Place `stack3.xlsx` next to the app (looked for: {data_path}).")
        st.stop()

    missing_columns = [col for col in REQUIRED_COLUMNS if col not in raw_df.columns]
    if missing_columns:
        st.error(f"Dataset is missing required column(s): {', '.join(missing_columns)}")
        st.stop()
    if raw_df.empty:
        st.error("Dataset is empty; nothing to train on.")
        st.stop()

    st.success("✅ Data loaded successfully!")

    # Dataset Overview
    st.subheader("📋 Dataset Overview")
    st.write(f"Shape of the dataset: {raw_df.shape}")
    st.dataframe(raw_df.head())

    st.write("Missing values in each column:")
    st.write(raw_df.isna().sum())

    st.write(f"Number of duplicate rows: {raw_df.duplicated().sum()}")

    # Data Cleaning
    st.subheader("🧹 Data Cleaning")
    df = prepare_dataset(data_path)

    st.write("Sample cleaned question:")
    st.write(df["clean_question"].iloc[0])

    # Feature Extraction
    st.subheader("🔍 Feature Extraction with TF-IDF")
    with st.spinner("Extracting features..."):
        features = build_features(data_path)
    tfidf = features["tfidf"]
    mlb = features["mlb"]
    svd = features["svd"]

    st.write(f"TF-IDF matrix shape: {features['tfidf_shape']}")

    # Target Processing
    st.write(f"Number of unique tags: {len(mlb.classes_)}")

    # Model Training
    st.subheader("🤖 Model Training (Logistic Regression)")
    with st.spinner("Training the model..."):
        model, holdout_report = train_model(data_path)
    st.success("✅ Model trained successfully!")

    with st.expander("Hold-out evaluation (classification report)"):
        st.text(holdout_report)

    # Prediction Demo
    st.subheader("🧪 Try it Out: Tag Prediction")
    user_question = st.text_input("Enter a Stack Overflow question (title + description):")

    if st.button("Predict Tags"):
        if not user_question.strip():
            st.warning("Please enter a question before predicting.")
        else:
            with st.spinner("Predicting..."):
                # Same normalization as the training corpus (see clean_text).
                clean_input = clean_text(user_question)
                input_vector = tfidf.transform([clean_input])
                input_reduced = svd.transform(input_vector)
                prediction = model.predict(input_reduced)
                predicted_tags = mlb.inverse_transform(prediction)

            st.write("### 🔎 Prediction Result")
            st.write(f"**Input Question:** {user_question}")
            if predicted_tags[0]:
                st.success(f"**Predicted Tags:** {', '.join(predicted_tags[0])}")
            else:
                st.warning("No tags predicted.")
|