Spaces:
Sleeping
Sleeping
File size: 4,516 Bytes
73d6415 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 |
# cancer_prediction_app.py
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
# -----------------------------
# Backstory, Problem Statement, Applications
# -----------------------------
# Static copy for the page. Kept in named constants so the rendering calls
# below stay short and the wording can be edited in one place.
ABOUT_PROJECT_MD = """
**Backstory:**
Cancer remains a leading cause of death worldwide. Early prediction and detection play a crucial role in improving survival rates. As data availability has increased, machine learning provides powerful tools to help medical professionals make more accurate predictions.
**Problem Statement:**
Develop a machine learning model that predicts the likelihood of cancer based on patient features. The goal is to assist healthcare providers in making better-informed diagnostic decisions.
**Applications:**
- Early detection and screening support.
- Prioritization of high-risk patients.
- Resource allocation in healthcare systems.
"""
APP_INTRO_MD = """
This app allows you to manually input all patient features and receive a prediction indicating whether cancer is present or not. The model is built with a robust pipeline combining **imputation**, **scaling**, **encoding**, and **logistic regression**.
"""

# Main page title, then the sidebar "about" panel, then the intro paragraph.
st.title("🧬 Cancer Prediction using Machine Learning")
st.sidebar.title("About Project")
st.sidebar.markdown(ABOUT_PROJECT_MD)
st.markdown(APP_INTRO_MD)
# -----------------------------
# Load Data
# -----------------------------
@st.cache_data
def load_data():
    """Read the cancer dataset from ``cancer_prediction_data.csv``.

    The path is relative, so the file is resolved against the process's
    current working directory. The function is wrapped in Streamlit's
    ``st.cache_data`` decorator.

    Returns:
        The CSV contents as a pandas DataFrame, parsed with the default
        ``pd.read_csv`` options.
    """
    return pd.read_csv("cancer_prediction_data.csv")
# Load the dataset and list its column names on the page, so the user can
# see exactly which columns the CSV provides.
data = load_data()
column_names = list(data.columns)
st.subheader("Dataset Columns:")
st.write(column_names)
# -----------------------------
# Identify Target and Features
# -----------------------------
# Label column; every other column in the dataset is treated as a feature.
TARGET_COLUMN = "Cancer_Present"

# Without the label column there is nothing to train on: report the problem
# and stop the script here instead of failing later with a KeyError.
if TARGET_COLUMN not in data.columns:
    st.error(f"❌ Target column '{TARGET_COLUMN}' not found. Please check dataset columns above.")
    st.stop()

# y is the label series, X is the remaining feature frame.
y = data[TARGET_COLUMN]
X = data.drop(columns=[TARGET_COLUMN])
# -----------------------------
# Detect categorical and numeric columns
# -----------------------------
# Split the feature columns by dtype so each group can get its own
# preprocessing steps.
categorical_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()

# "number" selects every numeric dtype (int32, float32, unsigned, nullable
# integers, ...), not only int64/float64. A numeric column that matched
# neither list would be absent from both groups and therefore never reach
# the model or the input form.
numeric_cols = X.select_dtypes(include="number").columns.tolist()

# Show the detected groups so a mis-typed column is easy to spot.
st.write("Categorical columns detected:", categorical_cols)
st.write("Numeric columns detected:", numeric_cols)
# -----------------------------
# Preprocessing pipeline
# -----------------------------
# Numeric features: fill missing values with the column mean, then scale.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
]
numeric_pipeline = Pipeline(steps=numeric_steps)

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode. Categories unseen during fitting are ignored rather
# than raising at prediction time (handle_unknown="ignore").
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
]
categorical_pipeline = Pipeline(steps=categorical_steps)

# Route each column group to its matching sub-pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, numeric_cols),
        ("cat", categorical_pipeline, categorical_cols),
    ]
)

# Full model: preprocessing followed by a logistic-regression classifier.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression(max_iter=1000)),
    ]
)
# -----------------------------
# Train the pipeline
# -----------------------------
# Fit the whole pipeline (preprocessing + classifier) on all rows of X and y;
# no rows are held out for evaluation here.
# NOTE(review): this call sits at module top level, so it presumably runs
# again on every Streamlit rerun — confirm whether the fitted model should
# be cached.
pipeline.fit(X, y)
# -----------------------------
# Sidebar Inputs for Prediction
# -----------------------------
# Heading shown above the per-feature input widgets in the sidebar.
st.sidebar.header("Enter Patient Features")
def user_input_features():
    """Collect one patient's feature values from sidebar widgets.

    Each column in ``numeric_cols`` gets a number input bounded by that
    column's minimum and maximum in ``X`` and pre-filled with its mean.
    Each column in ``categorical_cols`` gets a select box offering the
    column's sorted, non-missing unique values.

    Returns:
        A single-row DataFrame whose columns are the feature names and
        whose values are the current widget selections.
    """

    def _float_or(stat, fallback):
        # A statistic may be NaN (pd.isna is true); use the fallback then.
        return fallback if pd.isna(stat) else float(stat)

    entries = {}

    # Numeric widgets first, in column order.
    for name in numeric_cols:
        column = X[name]
        entries[name] = st.sidebar.number_input(
            f"{name}",
            min_value=_float_or(column.min(), 0.0),
            max_value=_float_or(column.max(), 1.0),
            value=_float_or(column.mean(), 0.0),
        )

    # Then one select box per categorical column.
    for name in categorical_cols:
        choices = sorted(X[name].dropna().unique())
        entries[name] = st.sidebar.selectbox(f"{name}", options=choices)

    return pd.DataFrame([entries])
# Build the sidebar widgets and capture the current selections as a
# one-row DataFrame with the same feature columns the model was fitted on.
input_df = user_input_features()

# Only score the input when the user explicitly asks for a prediction.
if st.button("Predict"):
    prediction = pipeline.predict(input_df)[0]
    # Column 1 of predict_proba is the second class the classifier learned.
    # NOTE(review): assumes the target is encoded 0/1 so that this column is
    # the "cancer present" class — confirm against the dataset.
    prediction_proba = pipeline.predict_proba(input_df)[0][1]

    st.subheader("Prediction Result")
    if prediction == 1:
        st.error("⚠️ **Cancer is Present**")
    else:
        st.success("✅ **No Cancer**")

    # The probability was previously computed but never displayed; show it
    # so the user can see how confident the model is in the label above.
    st.write(f"Estimated probability of cancer: {prediction_proba:.1%}")
# -----------------------------
# Additional Info
# -----------------------------
|