# cancer_prediction_app.py

import streamlit as st
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression

# -----------------------------
# Backstory, Problem Statement, Applications
# -----------------------------
# Main page heading.
st.title("🧬 Cancer Prediction using Machine Learning")

# Project background lives in the sidebar; inside the `with` block every
# st.* call is rendered in the sidebar container.
with st.sidebar:
    st.title("About Project")
    st.markdown("""
**Backstory:**  
Cancer remains a leading cause of death worldwide. Early prediction and detection play a crucial role in improving survival rates. As data availability has increased, machine learning provides powerful tools to help medical professionals make more accurate predictions.

**Problem Statement:**  
Develop a machine learning model that predicts the likelihood of cancer based on patient features. The goal is to assist healthcare providers in making better-informed diagnostic decisions.

**Applications:**  
- Early detection and screening support.
- Prioritization of high-risk patients.
- Resource allocation in healthcare systems.
""")

# Short description of what the app does, shown in the main area.
st.markdown("""
This app allows you to manually input all patient features and receive a prediction indicating whether cancer is present or not. The model is built with a robust pipeline combining **imputation**, **scaling**, **encoding**, and **logistic regression**.
""")

# -----------------------------
# Load Data
# -----------------------------
@st.cache_data
def load_data():
    """Read the patient dataset from ``cancer_prediction_data.csv``.

    The path is relative, so the file is looked up in the process's current
    working directory. The function is wrapped in ``st.cache_data`` so
    Streamlit can reuse the parsed result across script reruns.

    Returns:
        The CSV contents as a ``pandas.DataFrame``.
    """
    return pd.read_csv("cancer_prediction_data.csv")

# Load the dataset. A missing or unreadable CSV is reported through the UI
# with st.error + st.stop (the same pattern this script uses for a missing
# target column) rather than surfacing as a raw traceback.
try:
    data = load_data()
except FileNotFoundError:
    st.error(
        "❌ Dataset file 'cancer_prediction_data.csv' not found. "
        "Place it in the app's working directory and reload."
    )
    st.stop()
except (pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
    st.error(f"❌ Could not read the dataset: {exc}")
    st.stop()

# Show the available columns so the user can verify the expected schema.
st.subheader("Dataset Columns:")
st.write(list(data.columns))

# -----------------------------
# Identify Target and Features
# -----------------------------
# Name of the label column the classifier is trained to predict.
TARGET_COLUMN = "Cancer_Present"

# Without the label column nothing below can run: report it and halt.
if TARGET_COLUMN not in data.columns:
    st.error(f"❌ Target column '{TARGET_COLUMN}' not found. Please check dataset columns above.")
    st.stop()

# Labels, then every remaining column as a model feature.
y = data[TARGET_COLUMN]
X = data.drop(columns=[TARGET_COLUMN])

# -----------------------------
# Detect categorical and numeric columns
# -----------------------------
# Split feature columns by dtype. Only object/category columns count as
# categorical and only int64/float64 columns as numeric; a column of any
# other dtype ends up in neither list.
categorical_cols = list(X.select_dtypes(include=["object", "category"]).columns)
numeric_cols = list(X.select_dtypes(include=["int64", "float64"]).columns)

# Echo the detected groups so the user can sanity-check the split.
st.write("Categorical columns detected:", categorical_cols)
st.write("Numeric columns detected:", numeric_cols)

# -----------------------------
# Preprocessing pipeline
# -----------------------------
# Numeric features: fill missing values with the column mean, then
# standardise.
numeric_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical features: fill missing values with the most frequent value,
# then one-hot encode. Categories not seen during fit are ignored at
# prediction time instead of raising.
categorical_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column group to its own preprocessing branch.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, numeric_cols),
        ("cat", categorical_pipeline, categorical_cols),
    ]
)

# Full model: preprocessing followed by a logistic-regression classifier.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression(max_iter=1000)),
    ]
)

# -----------------------------
# Train the pipeline
# -----------------------------
@st.cache_resource
def _fit_pipeline(_estimator, features, target):
    """Fit ``_estimator`` on the training data and return the fitted object.

    Streamlit re-executes the whole script on every widget interaction, so
    an uncached ``fit`` would retrain the model each time an input changes.
    ``st.cache_resource`` keeps the fitted estimator and reuses it while
    ``features`` and ``target`` are unchanged.

    Args:
        _estimator: Unfitted estimator or pipeline. The leading underscore
            tells Streamlit not to hash this argument, so changing only the
            estimator's configuration does not invalidate the cache; clear
            the cache manually in that case.
        features: Feature matrix passed to ``fit``.
        target: Labels passed to ``fit``.

    Returns:
        The value returned by ``_estimator.fit(features, target)``.
    """
    return _estimator.fit(features, target)


# Rebind to the (possibly cached) fitted pipeline.
pipeline = _fit_pipeline(pipeline, X, y)

# -----------------------------
# Sidebar Inputs for Prediction
# -----------------------------
st.sidebar.header("Enter Patient Features")


def user_input_features():
    """Collect one patient's feature values from sidebar widgets.

    Each numeric column gets a number input bounded by the column's observed
    minimum and maximum and pre-filled with its mean. Each categorical
    column gets a select box listing the column's sorted, non-null unique
    values.

    Returns:
        A single-row ``pandas.DataFrame`` with one column per feature.
    """
    input_data = {}

    # Numeric inputs
    for col in numeric_cols:
        column = X[col]
        # Compute each statistic once; a NaN result (all-missing column)
        # falls back to a fixed 0.0..1.0 range with a default of 0.0.
        low, high, default = column.min(), column.max(), column.mean()
        low = 0.0 if pd.isna(low) else float(low)
        high = 1.0 if pd.isna(high) else float(high)
        default = 0.0 if pd.isna(default) else float(default)
        # Floating-point rounding can leave the mean marginally outside
        # [low, high] (e.g. for a constant column), and st.number_input
        # rejects a default outside its bounds, so clamp it.
        default = min(max(default, low), high)
        input_data[col] = st.sidebar.number_input(
            f"{col}",
            min_value=low,
            max_value=high,
            value=default,
        )

    # Categorical inputs
    for col in categorical_cols:
        input_data[col] = st.sidebar.selectbox(
            f"{col}",
            options=sorted(X[col].dropna().unique()),
        )

    return pd.DataFrame([input_data])


input_df = user_input_features()

# -----------------------------
# Make Prediction
# -----------------------------
# Run the model only when the user explicitly asks for a prediction.
if st.button("Predict"):
    prediction = pipeline.predict(input_df)[0]

    st.subheader("Prediction Result")

    # Label 1 is treated as the positive ("cancer present") class.
    if prediction == 1:
        st.error("⚠️ **Cancer is Present**")
    else:
        st.success("✅ **No Cancer**")

    # Show the estimated probability of the positive class. The column is
    # located through classes_ rather than a hard-coded index, and the line
    # is skipped when the target does not use 1 as a label.
    class_labels = list(pipeline.classes_)
    if 1 in class_labels:
        prediction_proba = pipeline.predict_proba(input_df)[0][class_labels.index(1)]
        st.write(f"Estimated probability of cancer: **{prediction_proba:.1%}**")

# -----------------------------
# Additional Info
# -----------------------------