Sowmith22 committed on
Commit
4f17804
·
verified ·
1 Parent(s): c986236

Upload 3 files

Browse files
Files changed (4) hide show
  1. .gitattributes +1 -0
  2. app.py +124 -0
  3. requirements (4).txt +6 -0
  4. stack3 (1).xlsx +3 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ stack3[[:space:]](1).xlsx filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import re
4
+ from sklearn.feature_extraction.text import TfidfVectorizer
5
+ from sklearn.preprocessing import MultiLabelBinarizer
6
+ from sklearn.decomposition import TruncatedSVD
7
+ from sklearn.model_selection import train_test_split
8
+ from sklearn.linear_model import LogisticRegression
9
+ from sklearn.multiclass import OneVsRestClassifier
10
+ from sklearn.metrics import classification_report
11
+
12
# Title
st.title("Stack Overflow Tag Predictor")

# Tabs
tab1, tab2 = st.tabs(["📌 Business Problem & Goal", "🔍 EDA & Modeling"])

# ---------------- Tab 1: Business Problem & Goal ----------------
# Static markdown shown in the first tab. It is kept at column 0 in a
# module-level constant so its rendering never depends on how the call
# site happens to be indented.
BUSINESS_OVERVIEW_MD = """
**🧩 Business Problem**
Stack Overflow is one of the largest Q&A platforms for programmers, where users post questions related to programming and software development.
Each question is assigned tags (e.g., `python`, `machine-learning`, `web-development`) that help in categorizing and improving content discoverability.

However, users often:
- Misclassify or skip adding tags
- Make it harder to retrieve relevant questions
- Increase the burden on moderators for cleanup

---

**🎯 Goal**
Build a **machine learning model** that can automatically predict relevant tags for a Stack Overflow question based on:
- **Title**
- **Description (Body)**

This will:
- Enhance user experience
- Improve search relevance
- Reduce manual tagging effort

---

**🎯 Target Variable**
- This is a **multi-label classification** task.
- Each question can have **multiple tags**.
- For example: `['python', 'pandas', 'dataframe']`
"""

with tab1:
    st.header("📌 Business Problem & Goal")
    st.markdown(BUSINESS_OVERVIEW_MD)

# ---------------- Tab 2: EDA & Modeling ----------------

# The workbook is committed to the repository as "stack3 (1).xlsx". A relative
# path (resolved against the directory the app is launched from) keeps the app
# runnable on machines other than the one it was written on.
DATA_PATH = "stack3 (1).xlsx"

_HTML_TAG_RE = re.compile(r'<[^>]+>')
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')


def clean_text(text):
    """Normalize one question for vectorization.

    Lower-cases the text, removes HTML tags, then removes every character
    that is not an ASCII letter or whitespace. The same function is applied
    to the training questions and to the user's input so both go through an
    identical transformation.
    """
    lowered = str(text).lower()
    without_markup = _HTML_TAG_RE.sub('', lowered)
    return _NON_LETTER_RE.sub('', without_markup)


def split_tags(raw_tags):
    """Split a comma-separated tag string into a list of non-empty tags.

    Surrounding whitespace is stripped so that "python" and " python" are
    not treated as two different labels.
    """
    return [tag.strip() for tag in str(raw_tags).split(",") if tag.strip()]


@st.cache_data(show_spinner=False)
def load_dataset(path):
    """Read the Excel workbook at *path* into a DataFrame.

    Cached so the file is parsed once instead of on every script rerun.
    Reading .xlsx files requires the optional `openpyxl` package.
    """
    return pd.read_excel(path)


def prepare_dataset(raw_df):
    """Return a cleaned copy of *raw_df* ready for modeling.

    Drops duplicate rows and rows missing a question or tags (the
    vectorizer and the tag splitter cannot handle missing values), then adds
    the `clean_question` and `tag_list` columns.
    """
    df = raw_df.drop_duplicates(ignore_index=True)
    df = df.dropna(subset=["question", "tags"]).reset_index(drop=True)
    df["clean_question"] = df["question"].map(clean_text)
    df["tag_list"] = df["tags"].map(split_tags)
    return df


@st.cache_resource(show_spinner=False)
def train_tag_model(path):
    """Fit the TF-IDF -> SVD -> one-vs-rest logistic regression pipeline.

    Cached as a resource: Streamlit reruns the whole script on every widget
    interaction, and without caching the model would be retrained each time
    the "Predict Tags" button is pressed.

    Returns a dict with the fitted `tfidf`, `svd`, `mlb` and `model`
    objects, the TF-IDF matrix shape (`tfidf_shape`) and the hold-out
    classification report as a dict (`report`).
    """
    df = prepare_dataset(load_dataset(path))

    tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
    X = tfidf.fit_transform(df["clean_question"])

    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(df["tag_list"])

    # Never request more components than the vocabulary can provide.
    n_components = max(1, min(100, X.shape[1] - 1))
    svd = TruncatedSVD(n_components=n_components, random_state=42)
    X_reduced = svd.fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(
        X_reduced, y, test_size=0.2, random_state=42
    )

    model = OneVsRestClassifier(
        LogisticRegression(solver='lbfgs', max_iter=1000, class_weight='balanced')
    )
    model.fit(X_train, y_train)

    # Score the held-out 20% so the split is actually used.
    report = classification_report(
        y_test, model.predict(X_test), output_dict=True, zero_division=0
    )

    return {
        "tfidf": tfidf,
        "svd": svd,
        "mlb": mlb,
        "model": model,
        "tfidf_shape": X.shape,
        "report": report,
    }


with tab2:
    st.header("🔍 EDA & Modeling")

    # Load dataset
    try:
        raw_df = load_dataset(DATA_PATH)
    except (FileNotFoundError, ImportError) as exc:
        # ImportError covers a missing Excel engine (openpyxl).
        st.error(f"Could not load dataset '{DATA_PATH}': {exc}")
        st.stop()

    missing_columns = {"question", "tags"} - set(raw_df.columns)
    if missing_columns:
        st.error(f"Dataset is missing required columns: {sorted(missing_columns)}")
        st.stop()

    st.success("✅ Data loaded successfully!")

    # Dataset Overview
    st.subheader("🔎 Dataset Overview")
    st.write(f"Shape of the dataset: {raw_df.shape}")
    st.dataframe(raw_df.head())

    st.write("Missing values in each column:")
    st.write(raw_df.isna().sum())

    st.write(f"Number of duplicate rows: {raw_df.duplicated().sum()}")

    # Data Cleaning
    st.subheader("🧹 Data Cleaning")
    df = prepare_dataset(raw_df)
    if df.empty:
        st.error("No usable rows remain after cleaning.")
        st.stop()

    st.write("Sample cleaned question:")
    st.write(df["clean_question"].iloc[0])

    # Feature Extraction
    st.subheader("🔠 Feature Extraction with TF-IDF")
    with st.spinner("Training the model..."):
        artifacts = train_tag_model(DATA_PATH)
    tfidf = artifacts["tfidf"]
    svd = artifacts["svd"]
    mlb = artifacts["mlb"]
    model = artifacts["model"]

    st.write(f"TF-IDF matrix shape: {artifacts['tfidf_shape']}")

    # Target Processing
    st.write(f"Number of unique tags: {len(mlb.classes_)}")

    # Model Training
    st.subheader("🤖 Model Training (Logistic Regression)")
    st.success("✅ Model trained successfully!")

    micro = artifacts["report"]["micro avg"]
    st.write(
        "Hold-out micro-averaged scores: "
        f"precision {micro['precision']:.3f}, "
        f"recall {micro['recall']:.3f}, "
        f"F1 {micro['f1-score']:.3f}"
    )

    # Prediction Demo
    st.subheader("🧪 Try it Out: Tag Prediction")
    user_question = st.text_input("Enter a Stack Overflow question (title + description):")

    if st.button("Predict Tags"):
        if not user_question.strip():
            # An empty query has no features to predict from.
            st.warning("Please enter a question before predicting.")
        else:
            with st.spinner("Predicting..."):
                clean_input = clean_text(user_question)
                input_vector = tfidf.transform([clean_input])
                input_reduced = svd.transform(input_vector)
                prediction = model.predict(input_reduced)
                predicted_tags = mlb.inverse_transform(prediction)

            st.write("### 🔍 Prediction Result")
            st.write(f"**Input Question:** {user_question}")
            if predicted_tags[0]:
                st.success(f"**Predicted Tags:** {', '.join(predicted_tags[0])}")
            else:
                st.warning("No tags predicted.")
requirements (4).txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ altair
2
+ pandas
3
+ streamlit
4
+ numpy
5
+ scikit-learn
6
+ lightgbm
stack3 (1).xlsx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f8693556e14fde78aebafd6280b8f3ee773c61c62f8a9361a3bda7f54cf5af50
3
+ size 2325996