import sklearn import streamlit as st import pandas as pd import re from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.preprocessing import MultiLabelBinarizer from sklearn.decomposition import TruncatedSVD from sklearn.model_selection import train_test_split from sklearn.linear_model import LogisticRegression from sklearn.multiclass import OneVsRestClassifier from sklearn.metrics import classification_report # Title st.title("Stack Overflow Tag Predictor") # Tabs tab1, tab2 = st.tabs(["๐Ÿ“Œ Business Problem & Goal", "๐Ÿ” EDA & Modeling"]) # ---------------- Tab 1: Business Problem & Goal ---------------- with tab1: st.header("๐Ÿ“Œ Business Problem & Goal") st.markdown(""" **๐Ÿงฉ Business Problem** Stack Overflow is one of the largest Q&A platforms for programmers, where users post questions related to programming and software development. Each question is assigned tags (e.g., `python`, `machine-learning`, `web-development`) that help in categorizing and improving content discoverability. However, users often: - Misclassify or skip adding tags - Make it harder to retrieve relevant questions - Increase the burden on moderators for cleanup --- **๐ŸŽฏ Goal** Build a **machine learning model** that can automatically predict relevant tags for a Stack Overflow question based on: - **Title** - **Description (Body)** This will: - Enhance user experience - Improve search relevance - Reduce manual tagging effort --- **๐ŸŽฏ Target Variable** - This is a **multi-label classification** task. - Each question can have **multiple tags**. - For example: `['python', 'pandas', 'dataframe']` """) # ---------------- Tab 2: EDA & Modeling ---------------- with tab2: st.header("๐Ÿ” EDA & Modeling") # Load dataset df = pd.read_excel(r"stack3.xlsx") st.success("โœ… Data loaded successfully!") # Dataset Overview st.subheader("๐Ÿ”Ž Dataset Overview") st.write(f"Shape of the dataset: {df.shape}") st.dataframe(df.head()) st.write("Missing values in each column:") st.write(df.isna().sum()) st.write(f"Number of duplicate rows: {df.duplicated().sum()}") # Data Cleaning st.subheader("๐Ÿงน Data Cleaning") df.drop_duplicates(inplace=True, ignore_index=True) df["clean_question"] = df["question"].str.lower() df["clean_question"] = df["clean_question"].str.replace(r'<[^>]+>', '', regex=True) df["clean_question"] = df["clean_question"].str.replace(r'[^a-zA-Z\s]', '', regex=True) df["tag_list"] = df["tags"].str.split(",") st.write("Sample cleaned question:") st.write(df["clean_question"].iloc[0]) # Feature Extraction st.subheader("๐Ÿ”  Feature Extraction with TF-IDF") tfidf = TfidfVectorizer(max_features=5000, stop_words='english') X = tfidf.fit_transform(df["clean_question"]) st.write(f"TF-IDF matrix shape: {X.shape}") # Target Processing mlb = MultiLabelBinarizer() y = mlb.fit_transform(df["tag_list"]) st.write(f"Number of unique tags: {len(mlb.classes_)}") # Dimensionality Reduction svd = TruncatedSVD(n_components=100) X_reduced = svd.fit_transform(X) # Train-Test Split X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.2, random_state=42) # Model Training st.subheader("๐Ÿค– Model Training (Logistic Regression)") with st.spinner("Training the model..."): model = OneVsRestClassifier(LogisticRegression(solver='lbfgs', max_iter=1000, class_weight='balanced')) model.fit(X_train, y_train) st.success("โœ… Model trained successfully!") # Prediction Demo st.subheader("๐Ÿงช Try it Out: Tag Prediction") user_question = st.text_input("Enter a Stack Overflow question (title + description):") if st.button("Predict Tags"): with st.spinner("Predicting..."): clean_input = re.sub(r'[^a-zA-Z\s]', '', user_question.lower()) input_vector = tfidf.transform([clean_input]) input_reduced = svd.transform(input_vector) prediction = model.predict(input_reduced) predicted_tags = mlb.inverse_transform(prediction) st.write("### ๐Ÿ” Prediction Result") st.write(f"**Input Question:** {user_question}") if predicted_tags[0]: st.success(f"**Predicted Tags:** {', '.join(predicted_tags[0])}") else: st.warning("No tags predicted.")