# Hugging Face Spaces: Sowmith22 / Stackoverflow (status: Sleeping)
# File size: 4,547 Bytes

import sklearn
import streamlit as st
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report

# Page heading rendered above the tab strip.
st.title("Stack Overflow Tag Predictor")

# The app has two tabs: a static problem statement and the data/model demo.
# `tab1` and `tab2` are used as context managers by the sections below.
tab_labels = ["📌 Business Problem & Goal", "🔍 EDA & Modeling"]
tab1, tab2 = st.tabs(tab_labels)

# ---------------- Tab 1: Business Problem & Goal ----------------
# Static tab: renders a header followed by one markdown document describing
# the problem, the goal and the target variable. No data is read here.
with tab1:
    st.header("📌 Business Problem & Goal")

    # Kept as a single literal; the trailing double spaces on some lines are
    # markdown hard line breaks and are intentional.
    problem_statement_md = """
    **🧩 Business Problem**  
    Stack Overflow is one of the largest Q&A platforms for programmers, where users post questions related to programming and software development.  
    Each question is assigned tags (e.g., `python`, `machine-learning`, `web-development`) that help in categorizing and improving content discoverability.

    However, users often:
    - Misclassify or skip adding tags
    - Make it harder to retrieve relevant questions
    - Increase the burden on moderators for cleanup

    ---

    **🎯 Goal**  
    Build a **machine learning model** that can automatically predict relevant tags for a Stack Overflow question based on:
    - **Title**
    - **Description (Body)**

    This will:
    - Enhance user experience
    - Improve search relevance
    - Reduce manual tagging effort

    ---

    **🎯 Target Variable**  
    - This is a **multi-label classification** task.
    - Each question can have **multiple tags**.
    - For example: `['python', 'pandas', 'dataframe']`
    """
    st.markdown(problem_statement_md)

# ---------------- Tab 2: EDA & Modeling ----------------
# Streamlit re-executes this script from the top on every widget interaction
# (including the "Predict Tags" click), so the expensive steps -- reading the
# workbook and fitting the model -- live in cached helpers instead of running
# inline on each rerun.

DATA_PATH = r"stack3.xlsx"

# Compiled once; shared by training-time and prediction-time cleaning.
_HTML_TAG_RE = re.compile(r'<[^>]+>')
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')


def _clean_text(text):
    """Lower-case *text*, strip HTML tags, then drop every non-letter character.

    The same function is applied to the training questions and to the user's
    input so the vectorizer sees identically pre-processed text in both cases.
    """
    without_html = _HTML_TAG_RE.sub('', str(text).lower())
    return _NON_LETTER_RE.sub('', without_html)


def _split_tags(raw_tags):
    """Split a comma-separated tag string into trimmed, non-empty tags.

    Trimming keeps "python, pandas" from producing a separate " pandas" label,
    and empty pieces (e.g. from a trailing comma) are discarded.
    """
    pieces = (piece.strip() for piece in str(raw_tags).split(","))
    return [tag for tag in pieces if tag]


@st.cache_data(show_spinner=False)
def _load_dataset(path):
    """Read the question/tag workbook at *path*.

    Cached by Streamlit, so the file is parsed once rather than on every
    rerun; the cache must be cleared to pick up a changed file.
    """
    return pd.read_excel(path)


@st.cache_resource(show_spinner=False)
def _fit_pipeline(clean_questions, raw_tags):
    """Fit the TF-IDF -> SVD -> one-vs-rest logistic regression pipeline.

    Args:
        clean_questions: Series of already-cleaned question strings.
        raw_tags: Series of comma-separated tag strings, aligned with
            ``clean_questions``.

    Returns:
        A dict with the fitted ``tfidf``, ``svd``, ``mlb`` and ``model``
        objects plus ``tfidf_shape``, the shape of the TF-IDF matrix.
        The result is cached, so the fitted objects are reused across reruns
        for as long as the inputs are unchanged.
    """
    # Feature extraction
    tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
    X = tfidf.fit_transform(clean_questions)

    # Target processing: one binary column per distinct tag
    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(raw_tags.map(_split_tags))

    # Dimensionality reduction; seeded so repeated fits give the same
    # components, matching the seeded split below.
    svd = TruncatedSVD(n_components=100, random_state=42)
    X_reduced = svd.fit_transform(X)

    # The model is trained on the 80% split only, as before; the held-out
    # part is currently not used for anything.
    X_train, _X_test, y_train, _y_test = train_test_split(
        X_reduced, y, test_size=0.2, random_state=42
    )

    model = OneVsRestClassifier(
        LogisticRegression(solver='lbfgs', max_iter=1000, class_weight='balanced')
    )
    model.fit(X_train, y_train)

    return {
        "tfidf": tfidf,
        "svd": svd,
        "mlb": mlb,
        "model": model,
        "tfidf_shape": X.shape,
    }


with tab2:
    st.header("🔍 EDA & Modeling")

    # Load dataset
    df = _load_dataset(DATA_PATH)
    st.success("✅ Data loaded successfully!")

    # Dataset Overview (computed on the raw data, before any cleaning)
    st.subheader("🔎 Dataset Overview")
    st.write(f"Shape of the dataset: {df.shape}")
    st.dataframe(df.head())

    st.write("Missing values in each column:")
    st.write(df.isna().sum())

    st.write(f"Number of duplicate rows: {df.duplicated().sum()}")

    # Data Cleaning
    st.subheader("🧹 Data Cleaning")
    df = df.drop_duplicates(ignore_index=True)

    # Rows without a question or without tags cannot be vectorized/binarized
    # (NaN makes both TfidfVectorizer and MultiLabelBinarizer raise), so they
    # are removed before feature extraction.
    usable = df.dropna(subset=["question", "tags"]).reset_index(drop=True)
    dropped_rows = len(df) - len(usable)
    if dropped_rows:
        st.info(f"Dropped {dropped_rows} rows with a missing question or tags.")
    df = usable

    if df.empty:
        st.error("No usable rows remain after cleaning; cannot train a model.")
        st.stop()

    df["clean_question"] = df["question"].map(_clean_text)

    st.write("Sample cleaned question:")
    st.write(df["clean_question"].iloc[0])

    # Feature Extraction + Model Training (cached; only the first run is slow)
    st.subheader("🔠 Feature Extraction with TF-IDF")
    with st.spinner("Training the model..."):
        pipeline = _fit_pipeline(df["clean_question"], df["tags"].astype(str))

    st.write(f"TF-IDF matrix shape: {pipeline['tfidf_shape']}")
    st.write(f"Number of unique tags: {len(pipeline['mlb'].classes_)}")

    st.subheader("🤖 Model Training (Logistic Regression)")
    st.success("✅ Model trained successfully!")

    # Prediction Demo
    st.subheader("🧪 Try it Out: Tag Prediction")
    user_question = st.text_input("Enter a Stack Overflow question (title + description):")

    if st.button("Predict Tags"):
        if not user_question.strip():
            # An empty question has no features to predict from.
            st.warning("Please enter a question before predicting.")
        else:
            with st.spinner("Predicting..."):
                # Same cleaning as the training text (including HTML removal).
                clean_input = _clean_text(user_question)
                input_vector = pipeline["tfidf"].transform([clean_input])
                input_reduced = pipeline["svd"].transform(input_vector)
                prediction = pipeline["model"].predict(input_reduced)
                predicted_tags = pipeline["mlb"].inverse_transform(prediction)

                st.write("### 🔍 Prediction Result")
                st.write(f"**Input Question:** {user_question}")
                if predicted_tags[0]:
                    st.success(f"**Predicted Tags:** {', '.join(predicted_tags[0])}")
                else:
                    st.warning("No tags predicted.")