File size: 4,547 Bytes
6464883
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2df23b7
6464883
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import sklearn
import streamlit as st
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report

# Page title shown above the tab strip.
st.title("Stack Overflow Tag Predictor")

# Two tabs: the problem statement, and the EDA / modeling walkthrough.
# The labels previously contained mis-encoded (mojibake) emoji bytes; they are
# restored to the intended UTF-8 emoji here.
tab1, tab2 = st.tabs(["📌 Business Problem & Goal", "🔍 EDA & Modeling"])

# ---------------- Tab 1: Business Problem & Goal ----------------
# Static description of the problem, the goal, and the prediction target.
# The emoji in the header and headings were mis-encoded (mojibake) and are
# restored to the intended UTF-8 characters.
with tab1:
    st.header("📌 Business Problem & Goal")

    # NOTE: the two trailing spaces after some lines are intentional Markdown
    # hard line breaks; do not strip them.
    st.markdown("""
    **🧩 Business Problem**  
    Stack Overflow is one of the largest Q&A platforms for programmers, where users post questions related to programming and software development.  
    Each question is assigned tags (e.g., `python`, `machine-learning`, `web-development`) that help in categorizing and improving content discoverability.

    However, users often:
    - Misclassify or skip adding tags
    - Make it harder to retrieve relevant questions
    - Increase the burden on moderators for cleanup

    ---

    **🎯 Goal**  
    Build a **machine learning model** that can automatically predict relevant tags for a Stack Overflow question based on:
    - **Title**
    - **Description (Body)**

    This will:
    - Enhance user experience
    - Improve search relevance
    - Reduce manual tagging effort

    ---

    **🎯 Target Variable**  
    - This is a **multi-label classification** task.
    - Each question can have **multiple tags**.
    - For example: `['python', 'pandas', 'dataframe']`
    """)

# ---------------- Tab 2: EDA & Modeling ----------------
#
# Streamlit re-executes this whole script on every widget interaction
# (including each click on "Predict Tags"). The expensive steps -- reading the
# Excel file, cleaning, TF-IDF/SVD fitting and model training -- therefore live
# in cached helpers so they run once per data path instead of once per click.
# NOTE: the caches are keyed on the path string only; restart the app (or
# clear the cache) after replacing the file on disk.

# Excel file expected to provide the "question" and "tags" columns.
DATA_PATH = r"stack3.xlsx"

# Patterns shared by training-time and inference-time text cleaning.
_HTML_TAG_RE = re.compile(r'<[^>]+>')
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')


def _clean_question(text: object) -> str:
    """Lowercase *text*, drop HTML tags, then keep only ASCII letters and whitespace.

    Used for both the training corpus and the user's input so that the model
    sees identically pre-processed text at fit time and at prediction time.
    """
    lowered = str(text).lower()
    without_html = _HTML_TAG_RE.sub('', lowered)
    return _NON_LETTER_RE.sub('', without_html)


def _split_tags(raw_tags: object) -> list:
    """Split a comma-separated tag string into stripped, non-empty tag names."""
    pieces = (piece.strip() for piece in str(raw_tags).split(","))
    return [piece for piece in pieces if piece]


@st.cache_data(show_spinner=False)
def _load_dataset(path: str) -> pd.DataFrame:
    """Read the raw question/tag table from the Excel file at *path*."""
    return pd.read_excel(path)


@st.cache_data(show_spinner=False)
def _prepare_dataset(path: str) -> pd.DataFrame:
    """Return the de-duplicated dataset with `clean_question` and `tag_list` columns.

    Rows missing a question or tags are dropped: they carry no usable signal
    and would make the vectorizer / label binarizer raise.
    """
    df = _load_dataset(path).drop_duplicates(ignore_index=True)
    df = df.dropna(subset=["question", "tags"]).reset_index(drop=True)
    df["clean_question"] = df["question"].map(_clean_question)
    df["tag_list"] = df["tags"].map(_split_tags)
    return df


@st.cache_resource(show_spinner=False)
def _build_features(path: str) -> dict:
    """Fit TF-IDF, the tag binarizer and SVD; return them with the reduced matrix.

    Returns a dict with keys: ``tfidf``, ``mlb``, ``svd``, ``X_reduced``,
    ``y`` and ``tfidf_shape`` (shape of the sparse TF-IDF matrix).
    """
    df = _prepare_dataset(path)

    tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
    X = tfidf.fit_transform(df["clean_question"])

    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(df["tag_list"])

    # TruncatedSVD cannot extract more components than there are features;
    # clamp so a small vocabulary does not crash the app.
    n_components = max(1, min(100, X.shape[1] - 1))
    # Fixed seed so the projection (and hence predictions) is reproducible.
    svd = TruncatedSVD(n_components=n_components, random_state=42)
    X_reduced = svd.fit_transform(X)

    return {
        "tfidf": tfidf,
        "mlb": mlb,
        "svd": svd,
        "X_reduced": X_reduced,
        "y": y,
        "tfidf_shape": X.shape,
    }


@st.cache_resource(show_spinner=False)
def _train_model(path: str) -> OneVsRestClassifier:
    """Train the one-vs-rest logistic regression on 80% of the reduced features."""
    features = _build_features(path)
    # The held-out 20% is not used by this page; the split is kept so the
    # model is trained on the same subset as before.
    X_train, _, y_train, _ = train_test_split(
        features["X_reduced"], features["y"], test_size=0.2, random_state=42
    )
    model = OneVsRestClassifier(
        LogisticRegression(solver='lbfgs', max_iter=1000, class_weight='balanced')
    )
    model.fit(X_train, y_train)
    return model


with tab2:
    st.header("🔍 EDA & Modeling")

    # Load dataset
    try:
        raw_df = _load_dataset(DATA_PATH)
    except FileNotFoundError:
        st.error(f"Dataset file not found: {DATA_PATH}")
        st.stop()
    st.success("✅ Data loaded successfully!")

    # Dataset Overview (computed on the raw table, before any cleaning)
    st.subheader("🔎 Dataset Overview")
    st.write(f"Shape of the dataset: {raw_df.shape}")
    st.dataframe(raw_df.head())

    st.write("Missing values in each column:")
    st.write(raw_df.isna().sum())

    st.write(f"Number of duplicate rows: {raw_df.duplicated().sum()}")

    # Data Cleaning
    st.subheader("🧹 Data Cleaning")
    df = _prepare_dataset(DATA_PATH)
    if df.empty:
        st.error("The dataset contains no usable rows after cleaning.")
        st.stop()

    st.write("Sample cleaned question:")
    st.write(df["clean_question"].iloc[0])

    # Feature Extraction
    st.subheader("🔠 Feature Extraction with TF-IDF")
    features = _build_features(DATA_PATH)
    tfidf = features["tfidf"]
    mlb = features["mlb"]
    svd = features["svd"]

    st.write(f"TF-IDF matrix shape: {features['tfidf_shape']}")

    # Target Processing
    st.write(f"Number of unique tags: {len(mlb.classes_)}")

    # Model Training
    st.subheader("🤖 Model Training (Logistic Regression)")
    with st.spinner("Training the model..."):
        model = _train_model(DATA_PATH)
    st.success("✅ Model trained successfully!")

    # Prediction Demo
    st.subheader("🧪 Try it Out: Tag Prediction")
    user_question = st.text_input("Enter a Stack Overflow question (title + description):")

    if st.button("Predict Tags"):
        if not user_question.strip():
            # An empty query would be scored on the model's bias terms alone.
            st.warning("Please enter a question before predicting.")
        else:
            with st.spinner("Predicting..."):
                # Same cleaning as the training corpus (including HTML removal).
                clean_input = _clean_question(user_question)
                input_vector = tfidf.transform([clean_input])
                input_reduced = svd.transform(input_vector)
                prediction = model.predict(input_reduced)
                predicted_tags = mlb.inverse_transform(prediction)

                st.write("### 🔍 Prediction Result")
                st.write(f"**Input Question:** {user_question}")
                if predicted_tags[0]:
                    st.success(f"**Predicted Tags:** {', '.join(predicted_tags[0])}")
                else:
                    st.warning("No tags predicted.")