File size: 4,516 Bytes
73d6415
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
# cancer_prediction_app.py

import streamlit as st
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression

# -----------------------------
# Backstory, Problem Statement, Applications
# -----------------------------
# Main-page heading for the app.
st.title("🧬 Cancer Prediction using Machine Learning")

# Sidebar heading for the project description below.
st.sidebar.title("About Project")

# Project description rendered as Markdown in the sidebar.
# NOTE: the two trailing spaces after each bold heading are intentional;
# in Markdown they force a hard line break before the paragraph that follows.
st.sidebar.markdown("""
**Backstory:**  
Cancer remains a leading cause of death worldwide. Early prediction and detection play a crucial role in improving survival rates. As data availability has increased, machine learning provides powerful tools to help medical professionals make more accurate predictions.

**Problem Statement:**  
Develop a machine learning model that predicts the likelihood of cancer based on patient features. The goal is to assist healthcare providers in making better-informed diagnostic decisions.

**Applications:**  
- Early detection and screening support.
- Prioritization of high-risk patients.
- Resource allocation in healthcare systems.
""")

# Short usage summary shown in the main page body, under the title.
st.markdown("""
This app allows you to manually input all patient features and receive a prediction indicating whether cancer is present or not. The model is built with a robust pipeline combining **imputation**, **scaling**, **encoding**, and **logistic regression**.
""")

# -----------------------------
# Load Data
# -----------------------------
@st.cache_data
def load_data():
    """Read the cancer dataset from ``cancer_prediction_data.csv``.

    The path is relative, so the file is looked up in the current working
    directory of the running app. The ``st.cache_data`` decorator lets
    Streamlit reuse the loaded frame instead of re-reading the CSV each
    time the script is rerun.

    Returns:
        The parsed CSV as a ``pandas.DataFrame``.
    """
    return pd.read_csv("cancer_prediction_data.csv")

# Load the dataset and echo its column names so that a mismatch with
# the expected target column is easy to spot in the UI.
data = load_data()

st.subheader("Dataset Columns:")
st.write(data.columns.tolist())

# -----------------------------
# Identify Target and Features
# -----------------------------
# Name of the label column the model is trained to predict.
TARGET_COLUMN = "Cancer_Present"

# Without the label column nothing below can run: report it and halt
# this script run instead of failing later with a KeyError.
target_missing = TARGET_COLUMN not in data.columns
if target_missing:
    st.error(f"❌ Target column '{TARGET_COLUMN}' not found. Please check dataset columns above.")
    st.stop()

# Labels (y) and the remaining feature columns (X).
y = data[TARGET_COLUMN]
X = data.drop(columns=[TARGET_COLUMN])

# -----------------------------
# Detect categorical and numeric columns
# -----------------------------
# Partition the feature columns by dtype. Iterating a DataFrame yields
# its column labels, so list(...) gives the selected column names.
# Columns of any other dtype end up in neither list.
categorical_cols = list(X.select_dtypes(include=["object", "category"]))
numeric_cols = list(X.select_dtypes(include=["int64", "float64"]))

# Show the split so the user can verify how each column will be treated.
st.write("Categorical columns detected:", categorical_cols)
st.write("Numeric columns detected:", numeric_cols)

# -----------------------------
# Preprocessing pipeline
# -----------------------------
# Numeric features: fill gaps with the column mean, then standardize.
numeric_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical features: fill gaps with the most frequent value, then
# one-hot encode. handle_unknown="ignore" keeps prediction from failing
# on a category that was not present when the encoder was fitted.
categorical_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each group of columns through its own sub-pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, numeric_cols),
        ("cat", categorical_pipeline, categorical_cols),
    ]
)

# Full model: preprocessing followed by a logistic-regression classifier.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression(max_iter=1000)),
    ]
)

# -----------------------------
# Train the pipeline
# -----------------------------
@st.cache_resource
def _fit_pipeline(_unfitted, features, target):
    """Fit the pipeline once and reuse the fitted model across reruns.

    Streamlit re-executes the whole script on every widget interaction,
    so an uncached ``fit`` call would retrain the model each time the
    user changes an input or presses a button.

    Args:
        _unfitted: The freshly built, unfitted pipeline. The leading
            underscore tells Streamlit not to hash this argument, so the
            cache key depends only on the training data.
        features: Feature frame to train on.
        target: Label series aligned with ``features``.

    Returns:
        The fitted pipeline (``fit`` returns the estimator itself).
    """
    return _unfitted.fit(features, target)


# Rebind ``pipeline`` to the fitted model. On a cache hit this is the
# instance fitted during an earlier run, not the one built just above.
pipeline = _fit_pipeline(pipeline, X, y)

# -----------------------------
# Sidebar Inputs for Prediction
# -----------------------------
# Sidebar heading placed above the per-feature input widgets.
st.sidebar.header("Enter Patient Features")

def user_input_features():
    """Render one sidebar widget per feature and collect the entries.

    Reads the module-level ``X``, ``numeric_cols`` and ``categorical_cols``.
    Each numeric column gets a number input bounded by the column's
    observed minimum and maximum and pre-filled with its mean. Each
    categorical column gets a select box listing the column's sorted,
    non-null unique values.

    Returns:
        A single-row ``pandas.DataFrame`` keyed by column name, with the
        numeric columns first and the categorical columns after them.
    """

    def _float_or(stat, fallback):
        # A statistic is NaN when the column has no usable values; fall
        # back to a fixed number so the widget still gets valid bounds.
        return fallback if pd.isna(stat) else float(stat)

    entries = {}

    # Numeric inputs
    for col in numeric_cols:
        series = X[col]
        entries[col] = st.sidebar.number_input(
            f"{col}",
            min_value=_float_or(series.min(), 0.0),
            max_value=_float_or(series.max(), 1.0),
            value=_float_or(series.mean(), 0.0),
        )

    # Categorical inputs
    for col in categorical_cols:
        choices = sorted(X[col].dropna().unique())
        entries[col] = st.sidebar.selectbox(f"{col}", options=choices)

    return pd.DataFrame([entries])

# Build the sidebar widgets and capture the current entries as a
# one-row frame with the same column names the pipeline was fitted on.
input_df = user_input_features()

# -----------------------------
# Make Prediction
# -----------------------------
if st.button("Predict"):
    prediction = pipeline.predict(input_df)[0]
    # Column 1 of predict_proba is the probability of the second class in
    # the classifier's sorted class list.
    # NOTE(review): this and the `== 1` check below assume the target is
    # encoded as 0/1 with 1 meaning "cancer present" -- confirm against
    # the dataset.
    prediction_proba = pipeline.predict_proba(input_df)[0][1]

    st.subheader("Prediction Result")

    if prediction == 1:
        st.error("⚠️ **Cancer is Present**")
    else:
        st.success("✅ **No Cancer**")

    # Show the model's confidence; it was previously computed but never
    # displayed.
    st.write(f"Estimated probability of cancer: {prediction_proba:.1%}")

# -----------------------------
# Additional Info
# -----------------------------