Spaces:

Sowmith22
/

Stackoverflow

Sleeping

App Files Files Community

Sowmith22 committed on Jul 15

Commit

4f17804

verified ·

1 Parent(s): c986236

Upload 3 files

Browse files

Files changed (4) hide show

.gitattributes +1 -0
app.py +124 -0
requirements (4).txt +6 -0
stack3 (1).xlsx +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+stack3[[:space:]](1).xlsx filter=lfs diff=lfs merge=lfs -text

app.py ADDED Viewed

	@@ -0,0 +1,124 @@

+import streamlit as st
+import pandas as pd
+import re
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.preprocessing import MultiLabelBinarizer
+from sklearn.decomposition import TruncatedSVD
+from sklearn.model_selection import train_test_split
+from sklearn.linear_model import LogisticRegression
+from sklearn.multiclass import OneVsRestClassifier
+from sklearn.metrics import classification_report
+# Title
+st.title("Stack Overflow Tag Predictor")
+# Tabs
+tab1, tab2 = st.tabs(["📌 Business Problem & Goal", "🔍 EDA & Modeling"])
+# ---------------- Tab 1: Business Problem & Goal ----------------
+with tab1:
+    st.header("📌 Business Problem & Goal")
+    st.markdown("""
+    **🧩 Business Problem**
+    Stack Overflow is one of the largest Q&A platforms for programmers, where users post questions related to programming and software development.
+    Each question is assigned tags (e.g., `python`, `machine-learning`, `web-development`) that help in categorizing and improving content discoverability.
+    However, users often:
+    - Misclassify or skip adding tags
+    - Make it harder to retrieve relevant questions
+    - Increase the burden on moderators for cleanup
+    ---
+    **🎯 Goal**
+    Build a **machine learning model** that can automatically predict relevant tags for a Stack Overflow question based on:
+    - **Title**
+    - **Description (Body)**
+    This will:
+    - Enhance user experience
+    - Improve search relevance
+    - Reduce manual tagging effort
+    ---
+    **🎯 Target Variable**
+    - This is a **multi-label classification** task.
+    - Each question can have **multiple tags**.
+    - For example: `['python', 'pandas', 'dataframe']`
+    """)
+# ---------------- Tab 2: EDA & Modeling ----------------
+with tab2:
+    st.header("🔍 EDA & Modeling")
+    # Load dataset
+    df = pd.read_excel(r"C:\Users\91879\Downloads\stack3.xlsx")
+    st.success("✅ Data loaded successfully!")
+    # Dataset Overview
+    st.subheader("🔎 Dataset Overview")
+    st.write(f"Shape of the dataset: {df.shape}")
+    st.dataframe(df.head())
+    st.write("Missing values in each column:")
+    st.write(df.isna().sum())
+    st.write(f"Number of duplicate rows: {df.duplicated().sum()}")
+    # Data Cleaning
+    st.subheader("🧹 Data Cleaning")
+    df.drop_duplicates(inplace=True, ignore_index=True)
+    df["clean_question"] = df["question"].str.lower()
+    df["clean_question"] = df["clean_question"].str.replace(r'<[^>]+>', '', regex=True)
+    df["clean_question"] = df["clean_question"].str.replace(r'[^a-zA-Z\s]', '', regex=True)
+    df["tag_list"] = df["tags"].str.split(",")
+    st.write("Sample cleaned question:")
+    st.write(df["clean_question"].iloc[0])
+    # Feature Extraction
+    st.subheader("🔠 Feature Extraction with TF-IDF")
+    tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
+    X = tfidf.fit_transform(df["clean_question"])
+    st.write(f"TF-IDF matrix shape: {X.shape}")
+    # Target Processing
+    mlb = MultiLabelBinarizer()
+    y = mlb.fit_transform(df["tag_list"])
+    st.write(f"Number of unique tags: {len(mlb.classes_)}")
+    # Dimensionality Reduction
+    svd = TruncatedSVD(n_components=100)
+    X_reduced = svd.fit_transform(X)
+    # Train-Test Split
+    X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.2, random_state=42)
+    # Model Training
+    st.subheader("🤖 Model Training (Logistic Regression)")
+    with st.spinner("Training the model..."):
+        model = OneVsRestClassifier(LogisticRegression(solver='lbfgs', max_iter=1000, class_weight='balanced'))
+        model.fit(X_train, y_train)
+    st.success("✅ Model trained successfully!")
+    # Prediction Demo
+    st.subheader("🧪 Try it Out: Tag Prediction")
+    user_question = st.text_input("Enter a Stack Overflow question (title + description):")
+    if st.button("Predict Tags"):
+        with st.spinner("Predicting..."):
+            clean_input = re.sub(r'[^a-zA-Z\s]', '', user_question.lower())
+            input_vector = tfidf.transform([clean_input])
+            input_reduced = svd.transform(input_vector)
+            prediction = model.predict(input_reduced)
+            predicted_tags = mlb.inverse_transform(prediction)
+            st.write("### 🔍 Prediction Result")
+            st.write(f"**Input Question:** {user_question}")
+            if predicted_tags[0]:
+                st.success(f"**Predicted Tags:** {', '.join(predicted_tags[0])}")
+            else:
+                st.warning("No tags predicted.")

requirements (4).txt ADDED Viewed

	@@ -0,0 +1,6 @@

+altair
+pandas
+streamlit
+numpy
+scikit-learn
+lightgbm
+# Needed by pandas.read_excel to parse the .xlsx dataset
+openpyxl

stack3 (1).xlsx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f8693556e14fde78aebafd6280b8f3ee773c61c62f8a9361a3bda7f54cf5af50
+size 2325996