"""Compare classical classifiers and a small neural network on ``diabetes.csv``.

Pipeline:
1. Zeros in selected measurement columns are treated as missing values and
   replaced by the column mean.
2. Rows containing an IQR outlier in any predictor column are dropped.
3. Five features are kept and scaled to [0, 1].
4. Seven scikit-learn classifiers are evaluated with 10-fold cross-validation
   and with an 85/15 train/test split; accuracies are printed.
5. A Keras network with two hidden layers is trained on the same split.
"""

import pandas as pd
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import SGD

# One seed shared by the split and the stochastic estimators so that the
# printed accuracies are repeatable from run to run.
RANDOM_STATE = 42
TARGET = "Outcome"
FEATURES = ["Pregnancies", "Glucose", "Insulin", "BMI", "Age"]

# Load data
df = pd.read_csv("diabetes.csv")

# Zeros in these columns are treated as missing values (Glucose, BP, etc.).
cols = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
df[cols] = df[cols].replace(0, float("nan"))

# Impute missing values with the column mean.
# NOTE: the means here and the IQR bounds below are still computed on the
# full dataset, because the row filter has to run before the data is split.
# Only the scaling step further down is fitted per training fold.
imputer = SimpleImputer(strategy="mean")
df[cols] = imputer.fit_transform(df[cols])

# Remove outliers using the 1.5 * IQR rule. The bounds are computed on the
# predictor columns only: including the binary target would let a skewed
# class balance (IQR == 0) flag every minority-class row as an outlier.
predictors = df.drop(columns=TARGET)
Q1 = predictors.quantile(0.25)
Q3 = predictors.quantile(0.75)
IQR = Q3 - Q1
is_outlier = (predictors < (Q1 - 1.5 * IQR)) | (predictors > (Q3 + 1.5 * IQR))
df = df[~is_outlier.any(axis=1)]

# Feature selection (keep: Pregnancies, Glucose, Insulin, BMI, Age)
X = df[FEATURES]
y = df[TARGET]

# Split data (85% train, 15% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=RANDOM_STATE
)

# Normalize to [0, 1]. The scaler is fitted on the training rows only so the
# test rows' min/max cannot influence the transformation; test values may
# therefore fall slightly outside [0, 1].
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Machine Learning Models (DT, KNN, RF, NB, AB, LR, SVM)
models = {
    "DT": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "KNN": KNeighborsClassifier(n_neighbors=7),
    "RF": RandomForestClassifier(random_state=RANDOM_STATE),
    "NB": GaussianNB(),
    "AB": AdaBoostClassifier(random_state=RANDOM_STATE),
    "LR": LogisticRegression(),
    "SVM": SVC(),
}

# Evaluate via k-fold CV (k=10). Scaling lives inside the pipeline so it is
# re-fitted on the training part of every fold. cross_val_score works on
# clones, so the estimators stored in `models` stay unfitted here.
for name, clf in models.items():
    fold_pipeline = make_pipeline(MinMaxScaler(), clf)
    scores = cross_val_score(fold_pipeline, X, y, cv=10, scoring="accuracy")
    print(f"{name} CV Accuracy: {scores.mean():.2%}")

# Evaluate via train-test split
for name, clf in models.items():
    clf.fit(X_train, y_train)
    acc = clf.score(X_test, y_test)
    print(f"{name} Test Accuracy: {acc:.2%}")

# Neural Network (Keras)
# NN with 2 hidden layers (architecture from paper). The input width is taken
# from the training matrix so it follows FEATURES automatically.
model = Sequential([
    Dense(26, activation="relu", input_shape=(X_train.shape[1],)),
    Dense(5, activation="relu"),
    Dense(1, activation="sigmoid"),
])

# Compile with SGD (lr=0.01)
model.compile(
    optimizer=SGD(learning_rate=0.01),
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# Train for 400 epochs. The hold-out split is passed as validation data, so
# `history` records its loss/accuracy after every epoch.
history = model.fit(
    X_train,
    y_train,
    epochs=400,
    batch_size=32,
    validation_data=(X_test, y_test),
    verbose=0,
)