Spaces:

Sowmith22
/

Stackoverflow

Sleeping

App Files Community

Sowmith22 committed 21 days ago

Commit

6464883

verified ·

1 Parent(s): 881236a

Update app.py

Browse files

Files changed (1) hide show

app.py +125 -124

app.py CHANGED Viewed

@@ -1,124 +1,125 @@
-import streamlit as st
-import pandas as pd
-import re
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.preprocessing import MultiLabelBinarizer
-from sklearn.decomposition import TruncatedSVD
-from sklearn.model_selection import train_test_split
-from sklearn.linear_model import LogisticRegression
-from sklearn.multiclass import OneVsRestClassifier
-from sklearn.metrics import classification_report
-# Title
-st.title("Stack Overflow Tag Predictor")
-# Tabs
-tab1, tab2 = st.tabs(["📌 Business Problem & Goal", "🔍 EDA & Modeling"])
-# ---------------- Tab 1: Business Problem & Goal ----------------
-with tab1:
-    st.header("📌 Business Problem & Goal")
-    st.markdown("""
-    **🧩 Business Problem**
-    Stack Overflow is one of the largest Q&A platforms for programmers, where users post questions related to programming and software development.
-    Each question is assigned tags (e.g., `python`, `machine-learning`, `web-development`) that help in categorizing and improving content discoverability.
-    However, users often:
-    - Misclassify or skip adding tags
-    - Make it harder to retrieve relevant questions
-    - Increase the burden on moderators for cleanup
-    ---
-    **🎯 Goal**
-    Build a **machine learning model** that can automatically predict relevant tags for a Stack Overflow question based on:
-    - **Title**
-    - **Description (Body)**
-    This will:
-    - Enhance user experience
-    - Improve search relevance
-    - Reduce manual tagging effort
-    ---
-    **🎯 Target Variable**
-    - This is a **multi-label classification** task.
-    - Each question can have **multiple tags**.
-    - For example: `['python', 'pandas', 'dataframe']`
-    """)
-# ---------------- Tab 2: EDA & Modeling ----------------
-with tab2:
-    st.header("🔍 EDA & Modeling")
-    # Load dataset
-    df = pd.read_excel(r"C:\Users\91879\Downloads\stack3.xlsx")
-    st.success("✅ Data loaded successfully!")
-    # Dataset Overview
-    st.subheader("🔎 Dataset Overview")
-    st.write(f"Shape of the dataset: {df.shape}")
-    st.dataframe(df.head())
-    st.write("Missing values in each column:")
-    st.write(df.isna().sum())
-    st.write(f"Number of duplicate rows: {df.duplicated().sum()}")
-    # Data Cleaning
-    st.subheader("🧹 Data Cleaning")
-    df.drop_duplicates(inplace=True, ignore_index=True)
-    df["clean_question"] = df["question"].str.lower()
-    df["clean_question"] = df["clean_question"].str.replace(r'<[^>]+>', '', regex=True)
-    df["clean_question"] = df["clean_question"].str.replace(r'[^a-zA-Z\s]', '', regex=True)
-    df["tag_list"] = df["tags"].str.split(",")
-    st.write("Sample cleaned question:")
-    st.write(df["clean_question"].iloc[0])
-    # Feature Extraction
-    st.subheader("🔠 Feature Extraction with TF-IDF")
-    tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
-    X = tfidf.fit_transform(df["clean_question"])
-    st.write(f"TF-IDF matrix shape: {X.shape}")
-    # Target Processing
-    mlb = MultiLabelBinarizer()
-    y = mlb.fit_transform(df["tag_list"])
-    st.write(f"Number of unique tags: {len(mlb.classes_)}")
-    # Dimensionality Reduction
-    svd = TruncatedSVD(n_components=100)
-    X_reduced = svd.fit_transform(X)
-    # Train-Test Split
-    X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.2, random_state=42)
-    # Model Training
-    st.subheader("🤖 Model Training (Logistic Regression)")
-    with st.spinner("Training the model..."):
-        model = OneVsRestClassifier(LogisticRegression(solver='lbfgs', max_iter=1000, class_weight='balanced'))
-        model.fit(X_train, y_train)
-    st.success("✅ Model trained successfully!")
-    # Prediction Demo
-    st.subheader("🧪 Try it Out: Tag Prediction")
-    user_question = st.text_input("Enter a Stack Overflow question (title + description):")
-    if st.button("Predict Tags"):
-        with st.spinner("Predicting..."):
-            clean_input = re.sub(r'[^a-zA-Z\s]', '', user_question.lower())
-            input_vector = tfidf.transform([clean_input])
-            input_reduced = svd.transform(input_vector)
-            prediction = model.predict(input_reduced)
-            predicted_tags = mlb.inverse_transform(prediction)
-            st.write("### 🔍 Prediction Result")
-            st.write(f"**Input Question:** {user_question}")
-            if predicted_tags[0]:
-                st.success(f"**Predicted Tags:** {', '.join(predicted_tags[0])}")
-            else:
-                st.warning("No tags predicted.")

+import sklearn
+import streamlit as st
+import pandas as pd
+import re
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.preprocessing import MultiLabelBinarizer
+from sklearn.decomposition import TruncatedSVD
+from sklearn.model_selection import train_test_split
+from sklearn.linear_model import LogisticRegression
+from sklearn.multiclass import OneVsRestClassifier
+from sklearn.metrics import classification_report
+# Title
+st.title("Stack Overflow Tag Predictor")
+# Tabs
+tab1, tab2 = st.tabs(["📌 Business Problem & Goal", "🔍 EDA & Modeling"])
+# ---------------- Tab 1: Business Problem & Goal ----------------
+with tab1:
+    st.header("📌 Business Problem & Goal")
+    st.markdown("""
+    **🧩 Business Problem**
+    Stack Overflow is one of the largest Q&A platforms for programmers, where users post questions related to programming and software development.
+    Each question is assigned tags (e.g., `python`, `machine-learning`, `web-development`) that help in categorizing and improving content discoverability.
+    However, users often:
+    - Misclassify or skip adding tags
+    - Make it harder to retrieve relevant questions
+    - Increase the burden on moderators for cleanup
+    ---
+    **🎯 Goal**
+    Build a **machine learning model** that can automatically predict relevant tags for a Stack Overflow question based on:
+    - **Title**
+    - **Description (Body)**
+    This will:
+    - Enhance user experience
+    - Improve search relevance
+    - Reduce manual tagging effort
+    ---
+    **🎯 Target Variable**
+    - This is a **multi-label classification** task.
+    - Each question can have **multiple tags**.
+    - For example: `['python', 'pandas', 'dataframe']`
+    """)
+# ---------------- Tab 2: EDA & Modeling ----------------
+with tab2:
+    st.header("🔍 EDA & Modeling")
+    # Load dataset
+    df = pd.read_excel(r"C:\Users\91879\Downloads\stack3.xlsx")
+    st.success("✅ Data loaded successfully!")
+    # Dataset Overview
+    st.subheader("🔎 Dataset Overview")
+    st.write(f"Shape of the dataset: {df.shape}")
+    st.dataframe(df.head())
+    st.write("Missing values in each column:")
+    st.write(df.isna().sum())
+    st.write(f"Number of duplicate rows: {df.duplicated().sum()}")
+    # Data Cleaning
+    st.subheader("🧹 Data Cleaning")
+    df.drop_duplicates(inplace=True, ignore_index=True)
+    df["clean_question"] = df["question"].str.lower()
+    df["clean_question"] = df["clean_question"].str.replace(r'<[^>]+>', '', regex=True)
+    df["clean_question"] = df["clean_question"].str.replace(r'[^a-zA-Z\s]', '', regex=True)
+    df["tag_list"] = df["tags"].str.split(",")
+    st.write("Sample cleaned question:")
+    st.write(df["clean_question"].iloc[0])
+    # Feature Extraction
+    st.subheader("🔠 Feature Extraction with TF-IDF")
+    tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
+    X = tfidf.fit_transform(df["clean_question"])
+    st.write(f"TF-IDF matrix shape: {X.shape}")
+    # Target Processing
+    mlb = MultiLabelBinarizer()
+    y = mlb.fit_transform(df["tag_list"])
+    st.write(f"Number of unique tags: {len(mlb.classes_)}")
+    # Dimensionality Reduction
+    svd = TruncatedSVD(n_components=100)
+    X_reduced = svd.fit_transform(X)
+    # Train-Test Split
+    X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.2, random_state=42)
+    # Model Training
+    st.subheader("🤖 Model Training (Logistic Regression)")
+    with st.spinner("Training the model..."):
+        model = OneVsRestClassifier(LogisticRegression(solver='lbfgs', max_iter=1000, class_weight='balanced'))
+        model.fit(X_train, y_train)
+    st.success("✅ Model trained successfully!")
+    # Prediction Demo
+    st.subheader("🧪 Try it Out: Tag Prediction")
+    user_question = st.text_input("Enter a Stack Overflow question (title + description):")
+    if st.button("Predict Tags"):
+        with st.spinner("Predicting..."):
+            clean_input = re.sub(r'[^a-zA-Z\s]', '', user_question.lower())
+            input_vector = tfidf.transform([clean_input])
+            input_reduced = svd.transform(input_vector)
+            prediction = model.predict(input_reduced)
+            predicted_tags = mlb.inverse_transform(prediction)
+            st.write("### 🔍 Prediction Result")
+            st.write(f"**Input Question:** {user_question}")
+            if predicted_tags[0]:
+                st.success(f"**Predicted Tags:** {', '.join(predicted_tags[0])}")
+            else:
+                st.warning("No tags predicted.")