Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import joblib | |
| import pandas as pd | |
| import numpy as np | |
| from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder | |
| from sklearn.impute import KNNImputer | |
| from sklearn.decomposition import PCA | |
| import pickle | |
| from tensorflow.keras.models import load_model | |
| import pickle | |
| import hdbscan | |
| # # Define the prediction function | |
def predict_ann(age, workclass, education, occupation, race, gender, capital_gain, capital_loss, hours_per_week, native_country):
    """Predict the income bracket for one person with the saved Keras model.

    The raw form values are packed into a one-row DataFrame, pushed through
    ``cleaning_features`` (without the HDBSCAN branch) and fed to the model
    stored in ``ann_model.h5``.

    Args:
        age, capital_gain, capital_loss, hours_per_week: numeric form values.
        workclass, education, occupation, race, gender, native_country:
            categorical form values (strings from the dropdowns).

    Returns:
        ``"Income >50K"`` when the model predicts the positive class,
        otherwise ``"Income <=50K"``.
    """
    # NOTE(review): the leading "0" column exists only in this function (the
    # RF / HDBSCAN variants do not have it). It is kept as-is because the
    # saved network presumably expects this extra input feature - confirm
    # against the training notebook.
    columns = {
        "0": [0],
        "age": [age], "workclass": [workclass], "educational-num": [education], "occupation": [occupation],
        "race": [race], "gender": [gender], "capital-gain": [capital_gain], "capital-loss": [capital_loss],
        "hours-per-week": [hours_per_week], "native-country": [native_country],
    }
    df = pd.DataFrame(data=columns)
    fixed_features = cleaning_features(df, race, False)
    print(fixed_features)

    ann_model = load_model('ann_model.h5')
    prediction = np.asarray(ann_model.predict(fixed_features))

    # Keras returns the raw outputs of the last layer, not class labels, so
    # comparing with ``== 1`` is (almost) never true. Turn the output into a
    # class index first.
    # NOTE(review): assumes a sigmoid (1 unit) or softmax (>1 units) output
    # head - confirm against the model definition.
    if prediction.ndim > 1 and prediction.shape[-1] > 1:
        predicted_class = int(np.argmax(prediction, axis=-1).ravel()[0])
    else:
        predicted_class = int(prediction.ravel()[0] >= 0.5)

    return "Income >50K" if predicted_class == 1 else "Income <=50K"
def predict_rf(age, workclass, education, occupation, race, gender, capital_gain, capital_loss, hours_per_week, native_country):
    """Predict the income bracket for one person with the pickled random forest.

    The raw form values are packed into a one-row DataFrame, pushed through
    ``cleaning_features`` (without the HDBSCAN branch) and fed to the model
    stored in ``rf_model.pkl``.

    Args:
        age, capital_gain, capital_loss, hours_per_week: numeric form values.
        workclass, education, occupation, race, gender, native_country:
            categorical form values (strings from the dropdowns).

    Returns:
        ``"Income >50K"`` when the predicted label equals 1, otherwise
        ``"Income <=50K"``.
    """
    columns = {
        "age": [age], "workclass": [workclass], "educational-num": [education], "occupation": [occupation],
        "race": [race], "gender": [gender], "capital-gain": [capital_gain], "capital-loss": [capital_loss],
        "hours-per-week": [hours_per_week], "native-country": [native_country],
    }
    df = pd.DataFrame(data=columns)
    fixed_features = cleaning_features(df, race, False)
    print(fixed_features)

    # Use a context manager so the model file handle is closed right away.
    # NOTE: pickle executes arbitrary code on load; rf_model.pkl must be a
    # trusted artifact shipped with the app.
    with open('rf_model.pkl', 'rb') as model_file:
        rf_model = pickle.load(model_file)

    prediction = rf_model.predict(fixed_features)
    # Exactly one row was submitted, so only the first label is relevant.
    return "Income >50K" if prediction[0] == 1 else "Income <=50K"
def predict_hb(age, workclass, education, occupation, race, gender, capital_gain, capital_loss, hours_per_week, native_country):
    """Assign one person to an HDBSCAN cluster.

    The form values are turned into a one-row DataFrame and passed to
    ``cleaning_features`` with the HDBSCAN flag set, which appends that row
    after the rows read from ``dataset.csv``. The combined frame is
    standardised, HDBSCAN is fitted on it, and the label of the last row
    (the submitted person) is reported.

    Side effects:
        Re-fits the clusterer on every call and overwrites
        ``hdbscan_model.pkl`` with the fitted object.

    Returns:
        A string of the form ``"Predicted Cluster (HDBSCAN): <label>"``.
    """
    columns = {
        "age": [age], "workclass": [workclass], "educational-num": [education], "occupation": [occupation],
        "race": [race], "gender": [gender], "capital-gain": [capital_gain], "capital-loss": [capital_loss],
        "hours-per-week": [hours_per_week], "native-country": [native_country],
    }
    df = pd.DataFrame(data=columns)
    fixed_features = cleaning_features(df, race, True)
    print(fixed_features)

    scaler = StandardScaler()
    X = scaler.fit_transform(fixed_features)

    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=220,
        min_samples=117,
        metric='euclidean',
        cluster_selection_method='eom',
        prediction_data=True,
        cluster_selection_epsilon=0.28479667859306007
    )
    prediction = clusterer.fit_predict(X)

    filename = 'hdbscan_model.pkl'
    # Context manager guarantees the pickle is flushed and the handle closed.
    with open(filename, 'wb') as model_file:
        pickle.dump(clusterer, model_file)

    # The submitted row is the last one in the combined frame.
    return f"Predicted Cluster (HDBSCAN): {prediction[-1]}"
def cleaning_features(data, race, hdbscan):
    """Turn the raw form DataFrame into the feature frame the models consume.

    Steps, in order: label-encode ``workclass`` / ``occupation`` with the
    pickled encoders, map ``gender``, ``native-country`` and
    ``educational-num`` through fixed dictionaries, scale the columns
    ``age`` / ``educational-num`` / ``hours-per-week`` with ``scaler.pkl``,
    add one ``race_<category>`` indicator column per known race, drop
    ``race`` and run ``pca`` on the frame.

    When ``hdbscan`` is truthy the result is additionally appended to the
    rows of ``dataset.csv`` (minus ``income``), ``capital-gain`` and
    ``capital-loss`` are passed through ``log1p`` and four numeric columns
    are scaled with ``robust_scaler.pkl``.

    Args:
        data: DataFrame with the raw form columns; modified in place before
            the ``race`` column is dropped.
        race: the selected race string, used to fill the indicator columns.
        hdbscan: flag selecting the extra HDBSCAN preparation.

    Returns:
        The transformed DataFrame.
    """
    def _unpickle(path):
        # Load one pickled preprocessing artifact from disk.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    workclass_encoder = _unpickle('label_encoder_work.pkl')
    occupation_encoder = _unpickle('label_encoder_occ.pkl')
    standard_scaler = _unpickle('scaler.pkl')

    education_levels = {
        "Preschool": 1, "1st-4th": 2, "5th-6th": 3, "7th-8th": 4,
        "9th": 5, "10th": 6, "11th": 7, "12th": 8,
        "HS-grad": 9, "Some-college": 10, "Assoc-voc": 11, "Assoc-acdm": 12,
        "Bachelors": 13, "Masters": 14, "Doctorate": 15, "Prof-school": 16,
    }
    gender_codes = {"Male": 1, "Female": 0}
    country_codes = {"United-States": 1, "Other": 0}
    scaled_columns = ['age', 'educational-num', 'hours-per-week']

    data['workclass'] = workclass_encoder.transform(data['workclass'])
    data['occupation'] = occupation_encoder.transform(data['occupation'])
    data['gender'] = data['gender'].map(gender_codes)
    data['native-country'] = data['native-country'].map(country_codes)
    data['educational-num'] = data['educational-num'].map(education_levels)
    data[scaled_columns] = standard_scaler.transform(data[scaled_columns])

    # One indicator column per race category, added in this fixed order.
    for category in ("Amer-Indian-Eskimo", "Asian-Pac-Islander", "Black", "Other", "White"):
        data[f'race_{category}'] = int(race == category)

    data = pca(data.drop(columns=['race']))

    if not hdbscan:
        return data

    # HDBSCAN path: stack the new row underneath the reference dataset.
    reference = pd.read_csv('dataset.csv').drop('income', axis=1)
    data = pd.concat([reference, data], ignore_index=True)
    for skewed in ('capital-gain', 'capital-loss'):
        data[skewed] = np.log1p(data[skewed])
    robust_scaler = joblib.load("robust_scaler.pkl")
    robust_columns = ['age', 'capital-gain', 'capital-loss', 'hours-per-week']
    data[robust_columns] = robust_scaler.transform(data[robust_columns])
    return data
| # def pca(data): | |
| # encoder = OneHotEncoder(sparse_output=False) | |
| # one_hot_encoded = encoder.fit_transform(data[['workclass', 'occupation']]) | |
| # encoded_columns_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out()) | |
| # pca_net = PCA(n_components=10) | |
| # pca_result_net = pca_net.fit_transform(encoded_columns_df) | |
| # pca_columns = [f'pca_component_{i+1}' for i in range(10)] | |
| # pca_df = pd.DataFrame(pca_result_net, columns=pca_columns) | |
| # data = data.drop(columns=['workclass', 'occupation'], axis=1) #remove the original columns | |
| # data = pd.concat([data, pca_df], axis=1) | |
| # return data | |
def pca(data):
    """Replace ``workclass`` / ``occupation`` with their PCA components.

    The two columns are one-hot encoded with the encoder pickled in
    ``onehot_encoder.pkl``, projected with the PCA model pickled in
    ``pca.pkl`` and the resulting ``pca_component_<i>`` columns are appended
    to the frame in place of the original columns.

    Args:
        data: DataFrame containing ``workclass`` and ``occupation`` columns.

    Returns:
        A new DataFrame without ``workclass`` / ``occupation`` and with one
        ``pca_component_<i>`` column per fitted component, row-aligned with
        the input.
    """
    encoder_pkl = 'onehot_encoder.pkl'
    pca_model_pkl = 'pca.pkl'
    with open(pca_model_pkl, 'rb') as file:
        pca_model = pickle.load(file)
    with open(encoder_pkl, 'rb') as file:
        encoder = pickle.load(file)

    one_hot_encoded = encoder.transform(data[['workclass', 'occupation']])
    # Reuse the caller's index on both intermediate frames: concat(axis=1)
    # aligns on index labels, so a fresh RangeIndex would produce NaN-padded
    # extra rows whenever ``data`` is not indexed 0..n-1.
    encoded_columns_df = pd.DataFrame(
        one_hot_encoded,
        columns=encoder.get_feature_names_out(),
        index=data.index,
    )
    pca_result_net = pca_model.transform(encoded_columns_df)
    pca_columns = [f'pca_component_{i+1}' for i in range(pca_model.n_components_)]
    pca_df = pd.DataFrame(pca_result_net, columns=pca_columns, index=data.index)

    data = data.drop(columns=['workclass', 'occupation'])
    return pd.concat([data, pca_df], axis=1)
def hbdscan_tranform(df_transformed):
    """Log-transform and robust-scale the numeric columns used for HDBSCAN.

    ``capital-gain`` and ``capital-loss`` are replaced by ``log1p`` of their
    values, then ``age``, ``capital-gain``, ``capital-loss`` and
    ``hours-per-week`` are scaled with a freshly fitted ``RobustScaler``.

    Args:
        df_transformed: DataFrame holding those four columns; it is modified
            in place.

    Returns:
        The same DataFrame object, after transformation.
    """
    # RobustScaler is not among the module-level sklearn imports, so it is
    # imported here; without this the function raises NameError when called.
    from sklearn.preprocessing import RobustScaler

    df_transformed['capital-gain'] = np.log1p(df_transformed['capital-gain'])
    df_transformed['capital-loss'] = np.log1p(df_transformed['capital-loss'])

    # Apply RobustScaler to all numerical features
    numerical_features = ['age', 'capital-gain', 'capital-loss', 'hours-per-week']
    scaler = RobustScaler()
    df_transformed[numerical_features] = scaler.fit_transform(df_transformed[numerical_features])
    return df_transformed
# Shared inputs
def _build_inputs():
    """Create a fresh list of input components for one model tab.

    Gradio hands component values to the prediction function positionally,
    so the components are listed in exactly the order of the ``predict_*``
    parameters: age, workclass, education, occupation, race, gender,
    capital_gain, capital_loss, hours_per_week, native_country.
    (Previously "Hours Per Week" came before the two capital sliders, which
    fed hours into ``capital_gain``, capital gain into ``capital_loss`` and
    capital loss into ``hours_per_week``.)

    A new list is built per call so every interface owns its own component
    instances.
    """
    return [
        gr.Slider(18, 90, step=1, label="Age"),
        gr.Dropdown(
            ["Private", "Self-emp-not-inc", "Self-emp-inc", "Federal-gov",
             "Local-gov", "State-gov", "Without-pay", "Never-worked"],
            label="Workclass"
        ),
        gr.Dropdown(
            ["Bachelors", "Some-college", "11th", "HS-grad", "Prof-school",
             "Assoc-acdm", "Assoc-voc", "9th", "7th-8th", "12th", "Masters",
             "1st-4th", "10th", "Doctorate", "5th-6th", "Preschool"],
            label="Education"
        ),
        gr.Dropdown(
            ["Tech-support", "Craft-repair", "Other-service", "Sales",
             "Exec-managerial", "Prof-specialty", "Handlers-cleaners",
             "Machine-op-inspct", "Adm-clerical", "Farming-fishing",
             "Transport-moving", "Priv-house-serv", "Protective-serv",
             "Armed-Forces"],
            label="Occupation"
        ),
        gr.Dropdown(
            ["White", "Black", "Asian-Pac-Islander", "Amer-Indian-Eskimo", "Other"],
            label="Race"
        ),
        gr.Dropdown(
            ["Male", "Female"],
            label="Gender"
        ),
        gr.Slider(0, 100000, step=100, label="Capital Gain"),
        gr.Slider(0, 5000, step=50, label="Capital Loss"),
        gr.Slider(1, 60, step=1, label="Hours Per Week"),
        gr.Dropdown(
            ["United-States", "Other"],
            label="Native Country"
        ),
    ]


ann_inputs = _build_inputs()
rf_inputs = _build_inputs()
hbd_inputs = _build_inputs()
# One gr.Interface per model, then a tabbed wrapper that is launched at import.
ann_interface = gr.Interface(
    title="Artificial Neural Network",
    description="Predict income using an Artificial Neural Network.",
    fn=predict_ann,
    inputs=ann_inputs,
    outputs="text",
)

rf_interface = gr.Interface(
    title="Random Forest",
    description="Predict income using a Random Forest model.",
    fn=predict_rf,
    inputs=rf_inputs,
    outputs="text",
)

hb_interface = gr.Interface(
    title="HDBScan Clustering",
    description="Predict income using a HDBScan Clustering model.",
    fn=predict_hb,
    inputs=hbd_inputs,
    outputs="text",
)

# Tabs and their titles are kept as parallel lists, in display order.
_tab_interfaces = [ann_interface, rf_interface, hb_interface]
_tab_titles = ["ANN Model", "Random Forest Model", "HDBScan Model"]

interface = gr.TabbedInterface(_tab_interfaces, _tab_titles)
interface.launch()