Spaces:

AICOE-Datamatics
/

AiNext

Build error

File size: 11,593 Bytes

247c8df

# -*- coding: utf-8 -*-
"""
Created on Mon Jul 12 10:00:30 2021

@author: Kishore
"""
################## Importing Modules ###########################################
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import math
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_curve, auc,roc_auc_score
import plotly.express as px
import plotly.graph_objects as go
import eli5
#####################################################################



############# Identifying the problem type (Classification/Regression) in Predictive Analytics ##########
def get_problem_type1(clean_data, dependent_variable, limit_number_of_class=10):
    """Decide whether the target column describes a classification or regression task.

    The target is treated as a class label when it is stored with an integer
    dtype (any width, signed/unsigned, including pandas nullable integers) and
    has at most ``limit_number_of_class`` distinct values. Everything else --
    floats, strings, booleans, high-cardinality integers -- is reported as
    regression.

    Args:
        clean_data: DataFrame that contains the target column.
        dependent_variable: Name of the target column in ``clean_data``.
        limit_number_of_class: Largest number of distinct integer values that
            is still considered a set of classes. Defaults to 10.

    Returns:
        ``"classification"`` or ``"regression"``.

    Raises:
        KeyError: If ``dependent_variable`` is not a column of ``clean_data``.
    """
    print("problem analysis")
    target = clean_data[dependent_variable]
    # Checking the dtype family instead of the literal names 'int32'/'int64'
    # keeps down-cast (int8/int16), unsigned and nullable integer targets from
    # being silently routed to the regression models.
    is_integer_target = pd.api.types.is_integer_dtype(target.dtype)
    if is_integer_target and target.nunique() <= limit_number_of_class:
        return "classification"
    return "regression"
#########################################################################################################


######################### Model Building For Predictive Analytics ############################
def _fit_and_rank(pipelines, names, X_train, X_test, y_train, y_test):
    """Fit every pipeline, score each one once on the hold-out split, pick the best.

    Args:
        pipelines: Unfitted estimators/pipelines exposing ``fit`` and ``score``.
        names: Display names, positionally aligned with ``pipelines``.
        X_train, X_test, y_train, y_test: Train/test split to fit and score on.

    Returns:
        Tuple ``(details, df_models_info, best_pipeline, best_name)`` where
        ``details`` is the list of "<name> Test Accuracy: <score>" strings,
        ``df_models_info`` is a DataFrame with columns ``Models``/``Accuracy``,
        and ``best_pipeline``/``best_name`` identify the first pipeline that
        reached the highest score.
    """
    for pipe in pipelines:
        pipe.fit(X_train, y_train)

    details = []
    models_info = {}
    best_pipeline = None
    best_name = None
    best_score = None
    for name, pipe in zip(names, pipelines):
        # Score once and reuse the value: every ``score`` call re-runs
        # prediction over the whole test split.
        score = pipe.score(X_test, y_test)
        line = "{} Test Accuracy: {}".format(name, score)
        details.append(line)
        models_info[name] = score
        print(line)
        # Seed the winner with the first model instead of comparing against a
        # fixed 0.0: regressors report R^2 from ``score``, which can be zero or
        # negative for every candidate, and that used to leave no model chosen.
        if best_pipeline is None or score > best_score:
            best_pipeline, best_name, best_score = pipe, name, score

    df_models_info = pd.DataFrame(list(models_info.items()), columns=["Models", "Accuracy"])
    return details, df_models_info, best_pipeline, best_name


def _interpret_model(best_pipeline, feature_names):
    """Return the first table of eli5's weight report for ``best_pipeline`` as a DataFrame."""
    from io import StringIO

    html_object = eli5.show_weights(best_pipeline, feature_names=feature_names)
    # Recent pandas deprecates passing literal HTML text to ``read_html``;
    # a file-like wrapper is accepted by old and new versions alike.
    return pd.read_html(StringIO(html_object.data))[0]


def _comparison_figure(df_models_info):
    """Build the per-model score chart stored under ``'model_comparison'``."""
    fig = px.histogram(df_models_info, x="Models", y="Accuracy", color="Models")
    fig.update_layout(yaxis_title="Accuracy")
    return fig


def _decode_class_name(steps_dict, dependent_variable, label):
    """Translate an encoded class ``label`` back to its original category name.

    ``steps_dict['categorical_to_numeric']`` is read as an iterable of dicts
    whose keys start with ``<column>_encoded`` and whose values map
    ``original name -> numeric code``. When the entry is missing or holds no
    match for the target column, the label itself is returned as text so that
    the ROC legend never shows an empty name.
    """
    encodings = (steps_dict or {}).get('categorical_to_numeric') or []
    class_name = ""
    matched = False
    for encoding in encodings:
        for key, value in encoding.items():
            if key.split('_encoded')[0] != dependent_variable:
                continue
            for original, code in value.items():
                if code == label:
                    class_name = original
                    matched = True
                    break
    return class_name if matched else str(label)


def _roc_figure(best_pipeline, X_test, y_test, dependent_variable, steps_dict):
    """Draw one-vs-rest ROC curves (one trace per class) for the chosen classifier."""
    y_scores = best_pipeline.predict_proba(X_test)

    fig = go.Figure()
    # Diagonal reference line of a random classifier.
    fig.add_shape(
        type='line', line=dict(dash='dash'),
        x0=0, x1=1, y0=0, y1=1
    )

    # Iterate over the classifier's own class order so that column ``idx`` of
    # ``predict_proba`` is always paired with the right label, even when a
    # class happens to be absent from the test split.
    for idx, label in enumerate(best_pipeline.classes_):
        y_true = (y_test == label).astype(int)
        if y_true.nunique() < 2:
            # ROC/AUC is undefined without both positives and negatives.
            continue
        y_score = y_scores[:, idx]

        fpr, tpr, _ = roc_curve(y_true, y_score)
        auc_score = roc_auc_score(y_true, y_score)

        class_name = _decode_class_name(steps_dict, dependent_variable, label)
        name = f"{class_name} (AUC={auc_score:.2f})"
        fig.add_trace(go.Scatter(x=fpr, y=tpr, name=name, mode='lines'))

    fig.update_layout(
        xaxis_title='False Positive Rate',
        yaxis_title='True Positive Rate',
        yaxis=dict(scaleanchor="x", scaleratio=1),
        xaxis=dict(constrain='domain'),
        width=700, height=500
    )
    return fig


def _build_classifier_report(train_data, target_data, dependent_variable, balance_data, steps_dict):
    """Train the candidate classifiers and assemble the classification result dict."""
    data_dict = {}

    if balance_data == "Auto":
        # NOTE(review): oversampling happens before the train/test split, so
        # synthetic rows can land in the test set. Kept as-is to preserve the
        # reported numbers; consider resampling the training split only.
        balance_info = {"Before Handling Imbalanced Dataset": target_data.value_counts()}
        train_data, target_data = SMOTE().fit_resample(train_data, target_data)
        balance_info["After Handling Imbalanced Dataset"] = target_data.value_counts()
        data_dict["Handling Imbalanced Dataset"] = balance_info
    elif balance_data == "False":
        data_dict["Cannot Handle Imbalanced Dataset,It is set to False"] = ""

    X_train, X_test, y_train, y_test = train_test_split(
        train_data, target_data, test_size=0.3, random_state=0)

    pipelines = [
        Pipeline([('scalar2', StandardScaler()),
                  ('dt_classifier', DecisionTreeClassifier())]),
        Pipeline([('scalar3', StandardScaler()),
                  ('rf_classifier', RandomForestClassifier())]),
        Pipeline([('scalar4', StandardScaler()),
                  ('xg_classifier', XGBClassifier())]),
    ]
    names = ['Decision_Tree', 'RandomForest', 'XGBoost_Classifier']

    details, df_models_info, best_pipeline, best_name = _fit_and_rank(
        pipelines, names, X_train, X_test, y_train, y_test)

    data_dict['Model Interpretation'] = _interpret_model(best_pipeline, X_train.columns.tolist())

    summary = 'Classifier with best accuracy:{}'.format(best_name)
    details.append(summary)
    print(summary)

    y_pred = best_pipeline.predict(X_test)

    data_dict['Model details'] = details
    data_dict['model_comparison'] = _comparison_figure(df_models_info)
    data_dict['Best model'] = best_name
    data_dict['Best pipeline'] = best_pipeline
    data_dict['Confusion Matrix'] = confusion_matrix(y_test, y_pred)

    # Cross-validation is only run up to 100000 training rows.
    if len(X_train) <= 100000:
        data_dict['Cross Validation'] = cross_val_score(
            best_pipeline, X_train, y_train, cv=5, scoring='accuracy')
    data_dict['Classification Report'] = classification_report(y_test, y_pred)

    data_dict['ROC Curve'] = _roc_figure(
        best_pipeline, X_test, y_test, dependent_variable, steps_dict)
    print("model completed")
    return data_dict


def _build_regressor_report(train_data, target_data):
    """Train the candidate regressors and assemble the regression result dict."""
    data_dict = {}
    X_train, X_test, y_train, y_test = train_test_split(
        train_data, target_data, test_size=0.3, random_state=0)

    pipelines = [
        Pipeline([('scalar1', StandardScaler()), ('linear_cdt_regressor', LinearRegression())]),
        Pipeline([('scalar2', StandardScaler()), ('dt_regressor', DecisionTreeRegressor())]),
        Pipeline([('scalar3', StandardScaler()), ('rf_regressor', RandomForestRegressor())]),
        Pipeline([('scalar4', StandardScaler()), ('svr', SVR(kernel='linear'))]),
    ]
    names = ['Linear_Regression', 'Decision_Tree', 'RandomForest', 'SVM']

    # The "Accuracy" reported for regressors is the value of ``score`` (R^2);
    # the label is kept because consumers read these strings/columns.
    details, df_models_info, best_pipeline, best_name = _fit_and_rank(
        pipelines, names, X_train, X_test, y_train, y_test)

    data_dict['Model Interpretation'] = _interpret_model(best_pipeline, X_train.columns.tolist())

    summary = 'Regressor with best accuracy:{}'.format(best_name)
    details.append(summary)
    print(summary)

    data_dict['Model details'] = details
    data_dict['model_comparison'] = _comparison_figure(df_models_info)
    data_dict['Best model'] = best_name
    data_dict['Best pipeline'] = best_pipeline

    y_pred = best_pipeline.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = math.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    data_dict['MEAN SQUARED ERROR'] = "MEAN SQUARED ERROR : " + str(mse)
    data_dict['ROOT MEAN SQUARED ERROR'] = "ROOT MEAN SQUARED ERROR : " + str(rmse)
    data_dict['R2 Score'] = "R2 Score : " + str(r2)
    data_dict['Cross Validation'] = cross_val_score(best_pipeline, X_train, y_train, cv=5)

    fig = go.Figure([
        go.Scatter(y=y_test, name='Actual', mode='markers'),
        go.Scatter(y=y_pred, name='Predicted', mode='markers')
    ])
    fig.update_layout(
        title=str(best_name),
        xaxis_title="Count",
        yaxis_title="Target values")
    data_dict['Regression graph'] = fig
    return data_dict


def model_build(clean_data, dependent_variable, problem_type, balance_data, steps_dict):
    """Train several candidate models, keep the best one and return a report dict.

    Args:
        clean_data: DataFrame holding the features and the target column.
        dependent_variable: Name of the target column in ``clean_data``.
        problem_type: ``"classification"`` or ``"regression"``; any other
            value yields an empty dict.
        balance_data: ``"Auto"`` oversamples the data with SMOTE before the
            split, ``"False"`` records that balancing was skipped; other
            values add nothing. Only used for classification.
        steps_dict: Pre-processing metadata. Only the optional
            ``'categorical_to_numeric'`` entry is read, to label ROC curves
            with the original class names. Only used for classification.

    Returns:
        dict keyed by report section.
        Classification: an optional balancing entry, ``'Model Interpretation'``,
        ``'Model details'``, ``'model_comparison'``, ``'Best model'``,
        ``'Best pipeline'``, ``'Confusion Matrix'``, ``'Cross Validation'``
        (only up to 100000 training rows), ``'Classification Report'`` and
        ``'ROC Curve'``.
        Regression: ``'Model Interpretation'``, ``'Model details'``,
        ``'model_comparison'``, ``'Best model'``, ``'Best pipeline'``,
        ``'MEAN SQUARED ERROR'``, ``'ROOT MEAN SQUARED ERROR'``, ``'R2 Score'``,
        ``'Cross Validation'`` and ``'Regression graph'``.

    Raises:
        KeyError: If ``dependent_variable`` is not a column of ``clean_data``.
    """
    # Progress markers on stdout (unchanged).
    print("Model build started")
    print("hi")

    # Separate the feature matrix from the target column.
    train_data = clean_data.drop(dependent_variable, axis=1)
    target_data = clean_data[dependent_variable]

    if problem_type == "classification":
        return _build_classifier_report(
            train_data, target_data, dependent_variable, balance_data, steps_dict)
    if problem_type == "regression":
        return _build_regressor_report(train_data, target_data)
    return {}

###############################################################################################