Sowmith22 committed on
Commit
73d6415
·
verified ·
1 Parent(s): a5c0c3b

Upload 3 files

Browse files
Files changed (3) hide show
  1. app (1).py +144 -0
  2. cancer_prediction_data.csv +0 -0
  3. requirements (2).txt +8 -0
app (1).py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # cancer_prediction_app.py
2
+
3
+ import streamlit as st
4
+ import pandas as pd
5
+ import numpy as np
6
+ from sklearn.pipeline import Pipeline
7
+ from sklearn.preprocessing import StandardScaler, OneHotEncoder
8
+ from sklearn.impute import SimpleImputer
9
+ from sklearn.compose import ColumnTransformer
10
+ from sklearn.linear_model import LogisticRegression
11
+
12
+ # -----------------------------
13
+ # Backstory, Problem Statement, Applications
14
+ # -----------------------------
15
# Markdown shown in the sidebar: background, problem statement and uses.
ABOUT_PROJECT_MD = """
**Backstory:**
Cancer remains a leading cause of death worldwide. Early prediction and detection play a crucial role in improving survival rates. As data availability has increased, machine learning provides powerful tools to help medical professionals make more accurate predictions.

**Problem Statement:**
Develop a machine learning model that predicts the likelihood of cancer based on patient features. The goal is to assist healthcare providers in making better-informed diagnostic decisions.

**Applications:**
- Early detection and screening support.
- Prioritization of high-risk patients.
- Resource allocation in healthcare systems.
"""

# Markdown shown in the main panel, directly under the page title.
APP_SUMMARY_MD = """
This app allows you to manually input all patient features and receive a prediction indicating whether cancer is present or not. The model is built with a robust pipeline combining **imputation**, **scaling**, **encoding**, and **logistic regression**.
"""

# Widgets are emitted in the same order as before: main title, sidebar
# title, sidebar text, then the main-panel summary.
st.title("🧬 Cancer Prediction using Machine Learning")

st.sidebar.title("About Project")
st.sidebar.markdown(ABOUT_PROJECT_MD)

st.markdown(APP_SUMMARY_MD)
35
+
36
+ # -----------------------------
37
+ # Load Data
38
+ # -----------------------------
39
@st.cache_data
def load_data():
    """Read ``cancer_prediction_data.csv`` into a DataFrame.

    The path is relative to the process working directory. The function is
    wrapped in ``st.cache_data`` so Streamlit can reuse the parsed frame on
    later script reruns instead of reading the file again.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    return pd.read_csv("cancer_prediction_data.csv")
43
+
44
# Fetch the dataset through the cached loader and list its columns so a
# wrong or renamed target column is easy to spot in the UI.
data = load_data()

st.subheader("Dataset Columns:")
st.write(data.columns.tolist())

# -----------------------------
# Identify Target and Features
# -----------------------------
TARGET_COLUMN = "Cancer_Present"

# Without the label column there is nothing to train on: report and halt
# this script run.
target_missing = TARGET_COLUMN not in data.columns
if target_missing:
    st.error(f"❌ Target column '{TARGET_COLUMN}' not found. Please check dataset columns above.")
    st.stop()

# Features are every column except the label.
y = data[TARGET_COLUMN]
X = data.drop(columns=TARGET_COLUMN)

# -----------------------------
# Detect categorical and numeric columns
# -----------------------------
# Columns whose dtype is in neither group are not listed in either variable.
categorical_cols = list(X.select_dtypes(include=["object", "category"]).columns)
numeric_cols = list(X.select_dtypes(include=["int64", "float64"]).columns)

st.write("Categorical columns detected:", categorical_cols)
st.write("Numeric columns detected:", numeric_cols)
69
+
70
+ # -----------------------------
71
+ # Preprocessing pipeline
72
+ # -----------------------------
73
@st.cache_resource
def train_pipeline(features, target, numeric_features, categorical_features):
    """Build and fit the preprocessing + logistic-regression pipeline.

    Streamlit re-executes the whole script on every widget interaction, so
    fitting at module top level retrains the model each time a sidebar input
    changes. Wrapping construction and fitting in ``st.cache_resource`` lets
    Streamlit reuse the fitted estimator while the arguments stay the same.

    Args:
        features: DataFrame of input columns used for training.
        target: Labels aligned with ``features``.
        numeric_features: Column names that are mean-imputed and
            standardized.
        categorical_features: Column names that are imputed with the most
            frequent value and one-hot encoded (unknown categories ignored).

    Returns:
        The fitted ``Pipeline``. The cached object is shared between reruns,
        so callers should only use it for prediction, not mutate it.
    """
    numeric_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ])

    categorical_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ])

    preprocessor = ColumnTransformer([
        ("num", numeric_pipeline, numeric_features),
        ("cat", categorical_pipeline, categorical_features),
    ])

    model = Pipeline([
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression(max_iter=1000)),
    ])

    # -----------------------------
    # Train the pipeline
    # -----------------------------
    model.fit(features, target)
    return model


pipeline = train_pipeline(X, y, numeric_cols, categorical_cols)
97
+
98
+ # -----------------------------
99
+ # Sidebar Inputs for Prediction
100
+ # -----------------------------
101
st.sidebar.header("Enter Patient Features")


def user_input_features():
    """Render one sidebar widget per feature and return the chosen values.

    Numeric columns get a ``number_input`` bounded by the column's observed
    minimum and maximum and pre-filled with its mean; when a statistic is
    NaN the fallbacks are 0.0 (min), 1.0 (max) and 0.0 (default). Categorical
    columns get a ``selectbox`` over the sorted non-null unique values.

    Returns:
        A single-row DataFrame keyed by column name, numeric columns first
        and categorical columns after them.
    """
    collected = {}

    # Numeric inputs
    for col in numeric_cols:
        series = X[col]
        lowest, highest, average = series.min(), series.max(), series.mean()
        collected[col] = st.sidebar.number_input(
            f"{col}",
            min_value=0.0 if pd.isna(lowest) else float(lowest),
            max_value=1.0 if pd.isna(highest) else float(highest),
            value=0.0 if pd.isna(average) else float(average),
        )

    # Categorical inputs
    for col in categorical_cols:
        choices = sorted(X[col].dropna().unique())
        collected[col] = st.sidebar.selectbox(f"{col}", options=choices)

    return pd.DataFrame([collected])


input_df = user_input_features()
124
+
125
+ # -----------------------------
126
+ # Make Prediction
127
+ # -----------------------------
128
# Run the model only when the user presses the button.
if st.button("Predict"):
    prediction = pipeline.predict(input_df)[0]
    # Column 1 of predict_proba corresponds to the second entry of the
    # classifier's classes_. NOTE(review): this assumes a binary target whose
    # positive label is 1, consistent with the `prediction == 1` check below.
    prediction_proba = pipeline.predict_proba(input_df)[0][1]

    st.subheader("Prediction Result")

    if prediction == 1:
        st.error("⚠️ **Cancer is Present**")
    else:
        st.success("✅ **No Cancer**")

    # The probability was previously computed and then discarded; show it so
    # the user can see how confident the model is in the verdict above.
    st.write(f"Estimated probability of cancer: {prediction_proba:.1%}")
138
+
139
+ # -----------------------------
140
+ # Additional Info
141
+ # -----------------------------
142
+
143
+
144
+
cancer_prediction_data.csv ADDED
The diff for this file is too large to render. See raw diff
 
requirements (2).txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ altair
2
+ pandas
3
+ streamlit
4
+ streamlit_drawable_canvas
5
+ joblib
6
+ pillow
7
+ opencv-python
8
+ scikit-learn