# cancer_prediction_app.py
"""Streamlit app that trains a logistic-regression pipeline and predicts cancer.

The script loads ``cancer_prediction_data.csv``, fits an imputation + scaling +
one-hot-encoding + logistic-regression pipeline on it, renders one sidebar
widget per feature column, and shows the predicted class for the values
entered when the "Predict" button is pressed.
"""
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression

# -----------------------------
# Backstory, Problem Statement, Applications
# -----------------------------
st.title("🧬 Cancer Prediction using Machine Learning")

st.sidebar.title("About Project")
st.sidebar.markdown("""
**Backstory:**
Cancer remains a leading cause of death worldwide. Early prediction and detection play a crucial role in improving survival rates. As data availability has increased, machine learning provides powerful tools to help medical professionals make more accurate predictions.

**Problem Statement:**
Develop a machine learning model that predicts the likelihood of cancer based on patient features. The goal is to assist healthcare providers in making better-informed diagnostic decisions.

**Applications:**
- Early detection and screening support.
- Prioritization of high-risk patients.
- Resource allocation in healthcare systems.
""")

st.markdown("""
This app allows you to manually input all patient features and receive a prediction indicating whether cancer is present or not.
The model is built with a robust pipeline combining **imputation**, **scaling**, **encoding**, and **logistic regression**.
""")


# -----------------------------
# Load Data
# -----------------------------
@st.cache_data
def load_data():
    """Read the training CSV from the working directory and return it as a DataFrame."""
    data = pd.read_csv("cancer_prediction_data.csv")
    return data


data = load_data()
st.subheader("Dataset Columns:")
st.write(list(data.columns))

# -----------------------------
# Identify Target and Features
# -----------------------------
TARGET_COLUMN = "Cancer_Present"

if TARGET_COLUMN not in data.columns:
    st.error(
        f"❌ Target column '{TARGET_COLUMN}' not found. "
        "Please check dataset columns above."
    )
    st.stop()

X = data.drop(TARGET_COLUMN, axis=1)
y = data[TARGET_COLUMN]

# -----------------------------
# Detect categorical and numeric columns
# -----------------------------
categorical_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
numeric_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()

st.write("Categorical columns detected:", categorical_cols)
st.write("Numeric columns detected:", numeric_cols)


# -----------------------------
# Preprocessing pipeline + training
# -----------------------------
@st.cache_resource
def train_pipeline(features, target, numeric_columns, categorical_columns):
    """Build and fit the preprocessing + logistic-regression pipeline.

    Streamlit re-executes this script on every widget interaction, so the
    fitted model is cached; it is only refitted when the training data or the
    column lists change.

    Args:
        features: DataFrame of feature columns.
        target: Series of labels aligned with ``features``.
        numeric_columns: Names of columns to mean-impute and standardise.
        categorical_columns: Names of columns to mode-impute and one-hot encode.

    Returns:
        The fitted ``Pipeline``.
    """
    numeric_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ])
    categorical_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        # Categories unseen during fit are encoded as all zeros, not an error.
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ])
    preprocessor = ColumnTransformer([
        ("num", numeric_pipeline, numeric_columns),
        ("cat", categorical_pipeline, categorical_columns),
    ])
    model = Pipeline([
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression(max_iter=1000)),
    ])
    return model.fit(features, target)


pipeline = train_pipeline(X, y, numeric_cols, categorical_cols)

# -----------------------------
# Sidebar Inputs for Prediction
# -----------------------------
st.sidebar.header("Enter Patient Features")


def _number_input_bounds(column):
    """Return ``(minimum, maximum, default)`` floats for a numeric column's widget.

    An all-missing column falls back to the range 0.0-1.0 with default 0.0.
    The default (the column mean) is clamped into ``[minimum, maximum]``
    because floating-point rounding can leave the mean marginally outside the
    range, which ``st.number_input`` rejects.
    """
    lowest, highest, average = column.min(), column.max(), column.mean()
    minimum = 0.0 if pd.isna(lowest) else float(lowest)
    maximum = 1.0 if pd.isna(highest) else float(highest)
    default = 0.0 if pd.isna(average) else float(average)
    return minimum, maximum, min(max(default, minimum), maximum)


def user_input_features():
    """Render one sidebar widget per feature and return the entries as a one-row DataFrame.

    Numeric columns get a number input bounded by the observed min/max and
    defaulting to the mean; categorical columns get a select box listing the
    observed non-missing values in sorted order.
    """
    input_data = {}

    # Numeric inputs
    for col in numeric_cols:
        minimum, maximum, default = _number_input_bounds(X[col])
        input_data[col] = st.sidebar.number_input(
            f"{col}",
            min_value=minimum,
            max_value=maximum,
            value=default,
        )

    # Categorical inputs
    for col in categorical_cols:
        input_data[col] = st.sidebar.selectbox(
            f"{col}",
            options=sorted(X[col].dropna().unique()),
        )

    return pd.DataFrame([input_data])


input_df = user_input_features()

# -----------------------------
# Make Prediction
# -----------------------------
if st.button("Predict"):
    prediction = pipeline.predict(input_df)[0]
    # Probability of the second class in pipeline.classes_.
    # NOTE(review): assumes a binary 0/1 target so that index 1 is the
    # "cancer present" class; the value is not displayed in this section.
    prediction_proba = pipeline.predict_proba(input_df)[0][1]

    st.subheader("Prediction Result")
    if prediction == 1:
        st.error("⚠️ **Cancer is Present**")
    else:
        st.success("✅ **No Cancer**")

# -----------------------------
# Additional Info
# -----------------------------