Fouzanjaved committed
Commit 5716541 · verified · 1 Parent(s): fae2bcb

Create app.py

Files changed (1): app.py +88 -0
app.py ADDED
@@ -0,0 +1,88 @@
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

# Load data
df = pd.read_csv("diabetes.csv")

# Replace physiologically impossible 0s with NaN (Glucose, BloodPressure, etc.)
cols = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
df[cols] = df[cols].replace(0, float("nan"))

# Impute missing values with the column mean
imputer = SimpleImputer(strategy="mean")
df[cols] = imputer.fit_transform(df[cols])

# Remove outliers using the 1.5 * IQR rule
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1
df = df[~((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))).any(axis=1)]
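# ----------------------------------------------------------------------
# Editor's sketch (not part of the original commit): how the 1.5 * IQR
# fence above behaves on a single toy column. `s` is a hypothetical
# Series introduced only for illustration.
s = pd.Series([90, 100, 110, 120, 400])
q1, q3 = s.quantile(0.25), s.quantile(0.75)       # 100.0, 120.0
iqr = q3 - q1                                     # 20.0
print(s.between(q1 - 1.5 * iqr, q3 + 1.5 * iqr))  # 400 falls outside [70, 150]
# ----------------------------------------------------------------------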
# Feature selection (keep: Pregnancies, Glucose, Insulin, BMI, Age)
X = df[["Pregnancies", "Glucose", "Insulin", "BMI", "Age"]]
y = df["Outcome"]

# Normalize features to [0, 1]
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
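# Editor's note (not in the original commit): MinMaxScaler rescales each
# feature independently as x' = (x - min) / (max - min), where min and
# max are the column's observed extremes, so every feature lands in [0, 1].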
# Machine learning models (DT, KNN, RF, NB, AB, LR, SVM)
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# Split data (85% train, 15% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
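# Editor's note (not in the original commit): the Outcome classes are
# imbalanced, so passing stratify=y to train_test_split above would keep
# the class ratio similar in the train and test splits.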
# Initialize models
models = {
    "DT": DecisionTreeClassifier(),
    "KNN": KNeighborsClassifier(n_neighbors=7),
    "RF": RandomForestClassifier(),
    "NB": GaussianNB(),
    "AB": AdaBoostClassifier(),
    "LR": LogisticRegression(),
    "SVM": SVC(),
}

# Evaluate via k-fold cross-validation (k=10)
for name, model in models.items():
    scores = cross_val_score(model, X, y, cv=10, scoring="accuracy")
    print(f"{name} CV Accuracy: {scores.mean():.2%}")

# Evaluate via the train/test split
for name, model in models.items():
    model.fit(X_train, y_train)
    acc = model.score(X_test, y_test)
    print(f"{name} Test Accuracy: {acc:.2%}")
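# ----------------------------------------------------------------------
# Editor's sketch (not part of the original commit): because the imputer
# and scaler above were fit on the full dataset, the CV scores see a
# small amount of test-fold leakage. Wrapping preprocessing in a Pipeline
# refits it inside each fold instead. `raw`, `X_raw`, `y_raw`, and `pipe`
# are illustrative names introduced here.
from sklearn.pipeline import make_pipeline

raw = pd.read_csv("diabetes.csv")
raw[cols] = raw[cols].replace(0, float("nan"))
X_raw = raw[["Pregnancies", "Glucose", "Insulin", "BMI", "Age"]]
y_raw = raw["Outcome"]

pipe = make_pipeline(SimpleImputer(strategy="mean"), MinMaxScaler(), SVC())
scores = cross_val_score(pipe, X_raw, y_raw, cv=10, scoring="accuracy")
print(f"SVM (leakage-free pipeline) CV Accuracy: {scores.mean():.2%}")
# ----------------------------------------------------------------------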
# Neural network (Keras)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import SGD

# NN with 2 hidden layers (architecture from the paper)
model = Sequential([
    Dense(26, activation="relu", input_shape=(5,)),
    Dense(5, activation="relu"),
    Dense(1, activation="sigmoid")
])

# Compile with SGD (learning rate 0.01)
model.compile(optimizer=SGD(learning_rate=0.01),
              loss="binary_crossentropy",
              metrics=["accuracy"])

# Train for 400 epochs
history = model.fit(X_train, y_train, epochs=400, batch_size=32,
                    validation_data=(X_test, y_test), verbose=0)
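# ----------------------------------------------------------------------
# Editor's sketch (not part of the original commit): reporting the
# trained network's held-out performance after fitting.
loss, acc = model.evaluate(X_test, y_test, verbose=0)
print(f"NN Test Accuracy: {acc:.2%}")
# ----------------------------------------------------------------------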