Spaces:
Sleeping
Sleeping
# cancer_prediction_app.py | |
import streamlit as st | |
import pandas as pd | |
import numpy as np | |
from sklearn.pipeline import Pipeline | |
from sklearn.preprocessing import StandardScaler, OneHotEncoder | |
from sklearn.impute import SimpleImputer | |
from sklearn.compose import ColumnTransformer | |
from sklearn.linear_model import LogisticRegression | |
# ----------------------------- | |
# Backstory, Problem Statement, Applications | |
# ----------------------------- | |
# Page title. The leading emoji had been mangled by a wrong-codec round trip
# of its UTF-8 bytes; it is restored to the DNA emoji here.
st.title("🧬 Cancer Prediction using Machine Learning")

# Sidebar: static project description (backstory, problem, applications).
st.sidebar.title("About Project")
st.sidebar.markdown("""
**Backstory:**
Cancer remains a leading cause of death worldwide. Early prediction and detection play a crucial role in improving survival rates. As data availability has increased, machine learning provides powerful tools to help medical professionals make more accurate predictions.
**Problem Statement:**
Develop a machine learning model that predicts the likelihood of cancer based on patient features. The goal is to assist healthcare providers in making better-informed diagnostic decisions.
**Applications:**
- Early detection and screening support.
- Prioritization of high-risk patients.
- Resource allocation in healthcare systems.
""")

# Main pane: one-paragraph summary of what the app does.
st.markdown("""
This app allows you to manually input all patient features and receive a prediction indicating whether cancer is present or not. The model is built with a robust pipeline combining **imputation**, **scaling**, **encoding**, and **logistic regression**.
""")
# ----------------------------- | |
# Load Data | |
# ----------------------------- | |
@st.cache_data
def load_data() -> pd.DataFrame:
    """Read the training dataset from ``cancer_prediction_data.csv``.

    Streamlit re-executes the whole script on every widget interaction, so
    the result is memoised with ``st.cache_data``: the CSV is parsed once and
    later reruns receive a copy of the cached frame. The cache is not
    invalidated when the file changes on disk; restart the app (or clear the
    cache) after replacing the dataset.

    Returns:
        The contents of the CSV file as a DataFrame, exactly as parsed by
        ``pandas.read_csv`` with default options.

    Raises:
        FileNotFoundError: if the CSV is not present in the working directory.
    """
    return pd.read_csv("cancer_prediction_data.csv")
# Load the dataset for this script run and list the columns it provides, so
# the user can check them against the expected target column name.
data = load_data()
st.subheader("Dataset Columns:")
st.write(data.columns.tolist())
# ----------------------------- | |
# Identify Target and Features | |
# ----------------------------- | |
# Name of the label column the classifier is trained to predict.
TARGET_COLUMN = "Cancer_Present"

# Without the label column nothing below can work: show a visible error and
# halt this script run instead of failing later with a KeyError.
if TARGET_COLUMN not in data.columns:
    st.error(
        f"❌ Target column '{TARGET_COLUMN}' not found. "
        "Please check dataset columns above."
    )
    st.stop()

# Rows whose label is missing cannot be used for supervised training and
# would make the classifier's fit() raise on NaN targets, so they are left
# out of both X and y. When every row is labelled this is a no-op.
_labelled = data.dropna(subset=[TARGET_COLUMN])

# Feature matrix (every column except the label) and label vector.
X = _labelled.drop(columns=[TARGET_COLUMN])
y = _labelled[TARGET_COLUMN]
# ----------------------------- | |
# Detect categorical and numeric columns | |
# ----------------------------- | |
# Split the feature columns by dtype so each group gets its own preprocessing.
categorical_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
# "number" matches every numeric dtype (int32, float32, unsigned, nullable
# Int64, ...), not only int64/float64. A column that lands in neither list is
# never passed to the ColumnTransformer and gets no input widget, so a narrow
# dtype filter here would silently discard features.
numeric_cols = X.select_dtypes(include="number").columns.tolist()

# Echo the detected split so the user can verify it.
st.write("Categorical columns detected:", categorical_cols)
st.write("Numeric columns detected:", numeric_cols)
# ----------------------------- | |
# Preprocessing pipeline | |
# ----------------------------- | |
# Numeric features: fill gaps with the column mean, then standardise.
numeric_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode. Categories not seen during fitting are ignored at predict time
# rather than raising.
categorical_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column group through its own sub-pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, numeric_cols),
        ("cat", categorical_pipeline, categorical_cols),
    ]
)

# Full model: preprocessing followed by a logistic-regression classifier.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression(max_iter=1000)),
    ]
)
# ----------------------------- | |
# Train the pipeline | |
# ----------------------------- | |
@st.cache_resource
def _fit_pipeline(_estimator, features, target):
    """Fit ``_estimator`` on the training data and return the fitted object.

    Streamlit reruns the script on every widget interaction, which would
    otherwise retrain the model each time a sidebar value changes. With
    ``st.cache_resource`` the fit happens once per distinct
    ``(features, target)`` pair and later reruns reuse the fitted estimator.

    Args:
        _estimator: the unfitted scikit-learn pipeline. The leading
            underscore tells Streamlit not to hash this argument.
        features: training feature frame; part of the cache key.
        target: training labels; part of the cache key.

    Returns:
        The estimator returned by ``_estimator.fit(features, target)``.
    """
    return _estimator.fit(features, target)


# Rebind `pipeline` to the (possibly cached) fitted estimator.
pipeline = _fit_pipeline(pipeline, X, y)
# ----------------------------- | |
# Sidebar Inputs for Prediction | |
# ----------------------------- | |
st.sidebar.header("Enter Patient Features")


def user_input_features():
    """Render one sidebar widget per feature column and collect the values.

    Each column in ``numeric_cols`` gets a ``number_input`` bounded by the
    column's observed minimum and maximum in ``X`` and pre-filled with its
    mean. Each column in ``categorical_cols`` gets a ``selectbox`` offering
    the distinct non-missing values observed in ``X``.

    Returns:
        pandas.DataFrame: a single-row frame with one column per feature,
        holding the values currently selected in the sidebar.
    """
    input_data = {}

    # Numeric inputs
    for col in numeric_cols:
        # Compute each statistic once; they are NaN only when the whole
        # column is missing, in which case fall back to a 0..1 range.
        col_min, col_max, col_mean = X[col].min(), X[col].max(), X[col].mean()
        low = 0.0 if pd.isna(col_min) else float(col_min)
        high = 1.0 if pd.isna(col_max) else float(col_max)
        default = 0.0 if pd.isna(col_mean) else float(col_mean)
        # Floating-point rounding can push the mean of a (near-)constant
        # column marginally outside [low, high], which number_input rejects.
        default = min(max(default, low), high)
        input_data[col] = st.sidebar.number_input(
            f"{col}",
            min_value=low,
            max_value=high,
            value=default,
        )

    # Categorical inputs
    for col in categorical_cols:
        # key=str keeps the usual order for string categories and avoids a
        # TypeError when an object column mixes types (e.g. str and int).
        choices = sorted(X[col].dropna().unique(), key=str)
        input_data[col] = st.sidebar.selectbox(f"{col}", options=choices)

    return pd.DataFrame([input_data])


input_df = user_input_features()
# ----------------------------- | |
# Make Prediction | |
# ----------------------------- | |
# Run the model only when the user explicitly asks for a prediction.
if st.button("Predict"):
    # NOTE(review): the `== 1` comparison and the `[0][1]` probability column
    # both assume the target is encoded as 0/1 with 1 meaning "cancer
    # present" - confirm against the dataset.
    prediction = pipeline.predict(input_df)[0]
    prediction_proba = pipeline.predict_proba(input_df)[0][1]

    st.subheader("Prediction Result")
    # The emoji in these messages had been garbled by a wrong-codec round
    # trip of their UTF-8 bytes; they are restored here.
    if prediction == 1:
        st.error("⚠️ **Cancer is Present**")
    else:
        st.success("✅ **No Cancer**")
    # The probability was previously computed and then discarded; show it so
    # the user can see how confident the model is.
    st.write(f"Estimated probability of cancer: {prediction_proba:.1%}")
# ----------------------------- | |
# Additional Info | |
# ----------------------------- | |