Sowmith22 committed on
Commit
6464883
·
verified ·
1 Parent(s): 881236a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +125 -124
app.py CHANGED
@@ -1,124 +1,125 @@
1
- import streamlit as st
2
- import pandas as pd
3
- import re
4
- from sklearn.feature_extraction.text import TfidfVectorizer
5
- from sklearn.preprocessing import MultiLabelBinarizer
6
- from sklearn.decomposition import TruncatedSVD
7
- from sklearn.model_selection import train_test_split
8
- from sklearn.linear_model import LogisticRegression
9
- from sklearn.multiclass import OneVsRestClassifier
10
- from sklearn.metrics import classification_report
11
-
12
- # Title
13
- st.title("Stack Overflow Tag Predictor")
14
-
15
- # Tabs
16
- tab1, tab2 = st.tabs(["📌 Business Problem & Goal", "🔍 EDA & Modeling"])
17
-
18
- # ---------------- Tab 1: Business Problem & Goal ----------------
19
- with tab1:
20
- st.header("📌 Business Problem & Goal")
21
-
22
- st.markdown("""
23
- **🧩 Business Problem**
24
- Stack Overflow is one of the largest Q&A platforms for programmers, where users post questions related to programming and software development.
25
- Each question is assigned tags (e.g., `python`, `machine-learning`, `web-development`) that help in categorizing and improving content discoverability.
26
-
27
- However, users often:
28
- - Misclassify or skip adding tags
29
- - Make it harder to retrieve relevant questions
30
- - Increase the burden on moderators for cleanup
31
-
32
- ---
33
-
34
- **🎯 Goal**
35
- Build a **machine learning model** that can automatically predict relevant tags for a Stack Overflow question based on:
36
- - **Title**
37
- - **Description (Body)**
38
-
39
- This will:
40
- - Enhance user experience
41
- - Improve search relevance
42
- - Reduce manual tagging effort
43
-
44
- ---
45
-
46
- **🎯 Target Variable**
47
- - This is a **multi-label classification** task.
48
- - Each question can have **multiple tags**.
49
- - For example: `['python', 'pandas', 'dataframe']`
50
- """)
51
-
52
- # ---------------- Tab 2: EDA & Modeling ----------------
53
- with tab2:
54
- st.header("🔍 EDA & Modeling")
55
-
56
- # Load dataset
57
- df = pd.read_excel(r"C:\Users\91879\Downloads\stack3.xlsx")
58
- st.success("✅ Data loaded successfully!")
59
-
60
- # Dataset Overview
61
- st.subheader("🔎 Dataset Overview")
62
- st.write(f"Shape of the dataset: {df.shape}")
63
- st.dataframe(df.head())
64
-
65
- st.write("Missing values in each column:")
66
- st.write(df.isna().sum())
67
-
68
- st.write(f"Number of duplicate rows: {df.duplicated().sum()}")
69
-
70
- # Data Cleaning
71
- st.subheader("🧹 Data Cleaning")
72
- df.drop_duplicates(inplace=True, ignore_index=True)
73
- df["clean_question"] = df["question"].str.lower()
74
- df["clean_question"] = df["clean_question"].str.replace(r'<[^>]+>', '', regex=True)
75
- df["clean_question"] = df["clean_question"].str.replace(r'[^a-zA-Z\s]', '', regex=True)
76
- df["tag_list"] = df["tags"].str.split(",")
77
-
78
- st.write("Sample cleaned question:")
79
- st.write(df["clean_question"].iloc[0])
80
-
81
- # Feature Extraction
82
- st.subheader("🔠 Feature Extraction with TF-IDF")
83
- tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
84
- X = tfidf.fit_transform(df["clean_question"])
85
-
86
- st.write(f"TF-IDF matrix shape: {X.shape}")
87
-
88
- # Target Processing
89
- mlb = MultiLabelBinarizer()
90
- y = mlb.fit_transform(df["tag_list"])
91
- st.write(f"Number of unique tags: {len(mlb.classes_)}")
92
-
93
- # Dimensionality Reduction
94
- svd = TruncatedSVD(n_components=100)
95
- X_reduced = svd.fit_transform(X)
96
-
97
- # Train-Test Split
98
- X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.2, random_state=42)
99
-
100
- # Model Training
101
- st.subheader("🤖 Model Training (Logistic Regression)")
102
- with st.spinner("Training the model..."):
103
- model = OneVsRestClassifier(LogisticRegression(solver='lbfgs', max_iter=1000, class_weight='balanced'))
104
- model.fit(X_train, y_train)
105
- st.success("✅ Model trained successfully!")
106
-
107
- # Prediction Demo
108
- st.subheader("🧪 Try it Out: Tag Prediction")
109
- user_question = st.text_input("Enter a Stack Overflow question (title + description):")
110
-
111
- if st.button("Predict Tags"):
112
- with st.spinner("Predicting..."):
113
- clean_input = re.sub(r'[^a-zA-Z\s]', '', user_question.lower())
114
- input_vector = tfidf.transform([clean_input])
115
- input_reduced = svd.transform(input_vector)
116
- prediction = model.predict(input_reduced)
117
- predicted_tags = mlb.inverse_transform(prediction)
118
-
119
- st.write("### 🔍 Prediction Result")
120
- st.write(f"**Input Question:** {user_question}")
121
- if predicted_tags[0]:
122
- st.success(f"**Predicted Tags:** {', '.join(predicted_tags[0])}")
123
- else:
124
- st.warning("No tags predicted.")
 
 
1
+ import sklearn
2
+ import streamlit as st
3
+ import pandas as pd
4
+ import re
5
+ from sklearn.feature_extraction.text import TfidfVectorizer
6
+ from sklearn.preprocessing import MultiLabelBinarizer
7
+ from sklearn.decomposition import TruncatedSVD
8
+ from sklearn.model_selection import train_test_split
9
+ from sklearn.linear_model import LogisticRegression
10
+ from sklearn.multiclass import OneVsRestClassifier
11
+ from sklearn.metrics import classification_report
12
+
13
+ # Title
14
+ st.title("Stack Overflow Tag Predictor")
15
+
16
+ # Tabs
17
+ tab1, tab2 = st.tabs(["📌 Business Problem & Goal", "🔍 EDA & Modeling"])
18
+
19
+ # ---------------- Tab 1: Business Problem & Goal ----------------
20
+ with tab1:
21
+ st.header("📌 Business Problem & Goal")
22
+
23
+ st.markdown("""
24
+ **🧩 Business Problem**
25
+ Stack Overflow is one of the largest Q&A platforms for programmers, where users post questions related to programming and software development.
26
+ Each question is assigned tags (e.g., `python`, `machine-learning`, `web-development`) that help in categorizing and improving content discoverability.
27
+
28
+ However, users often:
29
+ - Misclassify or skip adding tags
30
+ - Make it harder to retrieve relevant questions
31
+ - Increase the burden on moderators for cleanup
32
+
33
+ ---
34
+
35
+ **🎯 Goal**
36
+ Build a **machine learning model** that can automatically predict relevant tags for a Stack Overflow question based on:
37
+ - **Title**
38
+ - **Description (Body)**
39
+
40
+ This will:
41
+ - Enhance user experience
42
+ - Improve search relevance
43
+ - Reduce manual tagging effort
44
+
45
+ ---
46
+
47
+ **🎯 Target Variable**
48
+ - This is a **multi-label classification** task.
49
+ - Each question can have **multiple tags**.
50
+ - For example: `['python', 'pandas', 'dataframe']`
51
+ """)
52
+
53
+ # ---------------- Tab 2: EDA & Modeling ----------------
54
+ with tab2:
55
+ st.header("🔍 EDA & Modeling")
56
+
57
+ # Load dataset
58
+ df = pd.read_excel(r"C:\Users\91879\Downloads\stack3.xlsx")
59
+ st.success("✅ Data loaded successfully!")
60
+
61
+ # Dataset Overview
62
+ st.subheader("🔎 Dataset Overview")
63
+ st.write(f"Shape of the dataset: {df.shape}")
64
+ st.dataframe(df.head())
65
+
66
+ st.write("Missing values in each column:")
67
+ st.write(df.isna().sum())
68
+
69
+ st.write(f"Number of duplicate rows: {df.duplicated().sum()}")
70
+
71
+ # Data Cleaning
72
+ st.subheader("🧹 Data Cleaning")
73
+ df.drop_duplicates(inplace=True, ignore_index=True)
74
+ df["clean_question"] = df["question"].str.lower()
75
+ df["clean_question"] = df["clean_question"].str.replace(r'<[^>]+>', '', regex=True)
76
+ df["clean_question"] = df["clean_question"].str.replace(r'[^a-zA-Z\s]', '', regex=True)
77
+ df["tag_list"] = df["tags"].str.split(",")
78
+
79
+ st.write("Sample cleaned question:")
80
+ st.write(df["clean_question"].iloc[0])
81
+
82
+ # Feature Extraction
83
+ st.subheader("🔠 Feature Extraction with TF-IDF")
84
+ tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
85
+ X = tfidf.fit_transform(df["clean_question"])
86
+
87
+ st.write(f"TF-IDF matrix shape: {X.shape}")
88
+
89
+ # Target Processing
90
+ mlb = MultiLabelBinarizer()
91
+ y = mlb.fit_transform(df["tag_list"])
92
+ st.write(f"Number of unique tags: {len(mlb.classes_)}")
93
+
94
+ # Dimensionality Reduction
95
+ svd = TruncatedSVD(n_components=100)
96
+ X_reduced = svd.fit_transform(X)
97
+
98
+ # Train-Test Split
99
+ X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.2, random_state=42)
100
+
101
+ # Model Training
102
+ st.subheader("🤖 Model Training (Logistic Regression)")
103
+ with st.spinner("Training the model..."):
104
+ model = OneVsRestClassifier(LogisticRegression(solver='lbfgs', max_iter=1000, class_weight='balanced'))
105
+ model.fit(X_train, y_train)
106
+ st.success("✅ Model trained successfully!")
107
+
108
+ # Prediction Demo
109
+ st.subheader("🧪 Try it Out: Tag Prediction")
110
+ user_question = st.text_input("Enter a Stack Overflow question (title + description):")
111
+
112
+ if st.button("Predict Tags"):
113
+ with st.spinner("Predicting..."):
114
+ clean_input = re.sub(r'[^a-zA-Z\s]', '', user_question.lower())
115
+ input_vector = tfidf.transform([clean_input])
116
+ input_reduced = svd.transform(input_vector)
117
+ prediction = model.predict(input_reduced)
118
+ predicted_tags = mlb.inverse_transform(prediction)
119
+
120
+ st.write("### 🔍 Prediction Result")
121
+ st.write(f"**Input Question:** {user_question}")
122
+ if predicted_tags[0]:
123
+ st.success(f"**Predicted Tags:** {', '.join(predicted_tags[0])}")
124
+ else:
125
+ st.warning("No tags predicted.")