BP-GWAS-Prioritise

Sleeping

File size: 15,077 Bytes

8a6cf88

from shap_plots import shap_summary_plot, shap_dependence_plot
import plotly.tools as tls
import dash_core_components as dcc
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import xgboost
import shap
import matplotlib
import plotly.graph_objs as go
try:
    import matplotlib.pyplot as pl
    from matplotlib.colors import LinearSegmentedColormap
    from matplotlib.ticker import MaxNLocator
except ImportError:
    pass
from sklearn import preprocessing 

cdict1 = {
    'red': ((0.0, 0.11764705882352941, 0.11764705882352941),
            (1.0, 0.9607843137254902, 0.9607843137254902)),

    'green': ((0.0, 0.5333333333333333, 0.5333333333333333),
              (1.0, 0.15294117647058825, 0.15294117647058825)),

    'blue': ((0.0, 0.8980392156862745, 0.8980392156862745),
             (1.0, 0.3411764705882353, 0.3411764705882353)),

    'alpha': ((0.0, 1, 1),
              (0.5, 1, 1),
              (1.0, 1, 1))
}  # #1E88E5 -> #ff0052
red_blue = LinearSegmentedColormap('RedBlue', cdict1)

def matplotlib_to_plotly(cmap, pl_entries):
    h = 1.0/(pl_entries-1)
    pl_colorscale = []

    for k in range(pl_entries):
        C = list(map(np.uint8, np.array(cmap(k*h)[:3])*255))
        pl_colorscale.append([k*h, 'rgb'+str((C[0], C[1], C[2]))])

    return pl_colorscale

red_blue = matplotlib_to_plotly(red_blue, 255)

def summary_plot_plotly_fig(dataset, shap_values, target='target column', max_display = 20):
    feature_names=dataset.columns
    mpl_fig = shap_summary_plot(shap_values, dataset, feature_names=feature_names, max_display=20)

    plotly_fig = tls.mpl_to_plotly(mpl_fig)

    plotly_fig['layout'] = {'xaxis': {'title': 'SHAP value (impact on model output)'}}

    feature_order = np.argsort(np.sum(np.abs(shap_values), axis=0)[:-1])
    feature_order = feature_order[-min(max_display, len(feature_order)):]
    text = [feature_names[i] for i in feature_order]
    text = iter(text)

    for i in range(1, len(plotly_fig['data']), 2):
        t = text.__next__()
        plotly_fig['data'][i]['name'] = ''
        plotly_fig['data'][i]['text'] = t
        plotly_fig['data'][i]['hoverinfo'] = 'text'

    colorbar_trace  = go.Scatter(x=[None],
                                 y=[None],
                                 mode='markers',
                                 marker=dict(
                                     colorscale=red_blue, 
                                     showscale=True,
                                     cmin=-5,
                                     cmax=5,
                                     colorbar=dict(thickness=5, tickvals=[-5, 5], ticktext=['Low', 'High'], outlinewidth=0)
                                 ),
                                 hoverinfo='none'
                                )

    plotly_fig['layout']['showlegend'] = False
    plotly_fig['layout']['hovermode'] = 'closest'
    plotly_fig['layout']['height']=600
    plotly_fig['layout']['width']=500

    plotly_fig['layout']['xaxis'].update(zeroline=True, showline=True, ticklen=4, showgrid=False)
    plotly_fig['layout']['yaxis'].update(dict(visible=False))
    plotly_fig.add_trace(colorbar_trace)
    plotly_fig.layout.update(
                             annotations=[dict(
                                  x=1.18,
                                  align="right",
                                  valign="top",
                                  text='Feature value',
                                  showarrow=False,
                                  xref="paper",
                                  yref="paper",
                                  xanchor="right",
                                  yanchor="middle",
                                  textangle=-90,
                                  font=dict(family='Calibri', size=14)
                                )
                             ],
                             margin=dict(t=20)
                            )
    return plotly_fig

def train_model_and_return_shap_values(X, y, target):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

    X_train.fillna((-999), inplace=True) 
    X_test.fillna((-999), inplace=True)

    # Some of values are float or integer and some object. This is why we need to cast them:
    for f in X_train.columns: 
        if X_train[f].dtype=='object':
            lbl = preprocessing.LabelEncoder() 
            lbl.fit(list(X_train[f].values)) 
            X_train[f] = lbl.transform(list(X_train[f].values))

    for f in X_test.columns: 
        if X_test[f].dtype=='object': 
            lbl = preprocessing.LabelEncoder() 
            lbl.fit(list(X_test[f].values)) 
            X_test[f] = lbl.transform(list(X_test[f].values))

    X_train=np.array(X_train) 
    X_test=np.array(X_test) 
    X_train = X_train.astype(float) 
    X_test = X_test.astype(float)

    d_train = xgboost.DMatrix(X_train, label=y_train, feature_names=list(X))
    d_test = xgboost.DMatrix(X_test, label=y_test, feature_names=list(X))

    # train the model
    params = {
        "eta": 0.01,
        "subsample": 0.5,
        "base_score": np.mean(y_train),
        "silent": 1
    }

    model = xgboost.train(params, d_train, 5000, evals = [(d_test, "test")], verbose_eval=None, early_stopping_rounds=50)
    feature_names = model.feature_names
    shap_values = shap.TreeExplainer(model).shap_values(pd.DataFrame(X_train, columns=X.columns))
    return model, shap_values, feature_names

def dependence_plot_to_plotly_fig(dataset, target='target column', max_display=10):
    data = pd.read_csv(dataset, encoding="ISO-8859-1")
    X = data.drop(['target column'], axis=1)
    y = data[target]
    y = y/max(y)

    xgb_full = xgboost.DMatrix(X, label=y)

    # create a train/test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)
    xgb_train = xgboost.DMatrix(X_train, label=y_train)
    xgb_test = xgboost.DMatrix(X_test, label=y_test)

    # use validation set to choose # of trees
    params = {
        # "eta": 0.002,
        # "max_depth": 3,
        # "subsample": 0.5,
        "silent": 1
    }
    model_train = xgboost.train(params, xgb_train, 3000, evals = [(xgb_test, "test")], verbose_eval=None)

    # train final model on the full data set
    params = {
        # "eta": 0.002,
        # "max_depth": 3, 
        # "subsample": 0.5,
        "silent": 1
    }
    model = xgboost.train(params, xgb_full, 1500, evals = [(xgb_full, "test")], verbose_eval=None)
    features = model.feature_names
    shap_values = shap.TreeExplainer(model).shap_values(X)

    feature_order = np.argsort(np.sum(np.abs(shap_values), axis=0)[:-1])
    feature_order = feature_order[-min(max_display, len(feature_order)):]
    features = [features[i] for i in feature_order[::-1]]

    lis = []
    for i in features:
        mpl_fig, interaction_index = shap_dependence_plot(i, shap_values, X)
        plotly_fig = tls.mpl_to_plotly(mpl_fig)

        # The x-tick labels start by default from 0, which is not necessarily the min value of the feature.
        # So, we need to increment the x-tick labels by 1. But while doing so, the y-axis gets shifted.
        # To prevent that, we need to manually control the x-axis range from r_min to r_max
        new_x = []
        for j in plotly_fig['data'][0]['x']:
            new_x.append(j)

        r_min = min(plotly_fig['data'][0]['x'])
        r_max = max(plotly_fig['data'][0]['x'])

        plotly_fig['layout']['xaxis'].update(range=[r_min-1, r_max+1])
        plotly_fig['data'][0]['x'] = tuple(new_x)

        # Define the colorbar
        colorbar_trace  = go.Scatter(x=[None],
                                     y=[None],
                                     mode='markers',
                                     marker=dict(
                                         colorscale=red_blue, 
                                         showscale=True,
                                         colorbar=dict(thickness=5, outlinewidth=0),
                                         color=[min(X[X.columns[interaction_index]]), max(X[X.columns[interaction_index]])],
                                     ),
                                     hoverinfo='none'
                                    )

        plotly_fig['layout']['showlegend'] = False
        plotly_fig['layout']['hovermode'] = 'closest'
        plotly_fig['layout']['height']=380
        plotly_fig['layout']['width']=450
        plotly_fig['layout']['xaxis'].update(zeroline=True, 
                                             showline=True, 
                                             ticklen=4, 
                                             showgrid=False,
                                             tickmode='linear')
        title = plotly_fig['layout']['yaxis']['title']
        plotly_fig['layout']['yaxis'].update(title=title.split(' -')[0])

        plotly_fig.add_trace(colorbar_trace)
        plotly_fig.layout.update(
                                 annotations=[dict(
                                      x=1.23,
                                      align="right",
                                      valign="top",
                                      text=X.columns[interaction_index],
                                      showarrow=False,
                                      xref="paper",
                                      yref="paper",
                                      xanchor="right",
                                      yanchor="middle",
                                      textangle=-90,
                                      font=dict(family='Calibri', size=14)
                                    )
                                 ],
                                 margin=dict(t=50, b=50, l=50, r=80)
                                )
        lis.append(plotly_fig)
    return lis, features

def interaction_plot_to_plotly_fig(dataset, target_col='target column', max_display=10):
    data = pd.read_csv(dataset, encoding="ISO-8859-1")
    X = data.drop(['target column'], axis=1)
    y = data[target_col]
    y = y/max(y)

    xgb_full = xgboost.DMatrix(X, label=y)

    # create a train/test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)
    xgb_train = xgboost.DMatrix(X_train, label=y_train)
    xgb_test = xgboost.DMatrix(X_test, label=y_test)

    # use validation set to choose # of trees
    params = {
        # "eta": 0.002,
        # "max_depth": 3,
        # "subsample": 0.5,
        "silent": 1
    }
    model_train = xgboost.train(params, xgb_train, 3000, evals = [(xgb_test, "test")], verbose_eval=None)

    # train final model on the full data set
    params = {
        # "eta": 0.002,
        # "max_depth": 3, 
        # "subsample": 0.5,
        "silent": 1
    }
    model = xgboost.train(params, xgb_full, 1500, evals = [(xgb_full, "test")], verbose_eval=None)
    features = model.feature_names
    shap_values = shap.TreeExplainer(model).shap_values(X)

    feature_order = np.argsort(np.sum(np.abs(shap_values), axis=0)[:-1])
    feature_order = feature_order[-min(max_display, len(feature_order)):]
    features = [features[i] for i in feature_order[::-1]]

    shap_interaction_values = shap.TreeExplainer(model).shap_interaction_values(X)

    lis = []
    for i in features:
        for j in features:
            mpl_fig = pl.figure()
            ax = mpl_fig.add_subplot(111)
            _, interaction_index = shap_dependence_plot ( (i, j), shap_interaction_values, X.iloc[:2000,:] )
            plotly_fig = tls.mpl_to_plotly(mpl_fig)

            r_min = min(plotly_fig['data'][0]['x'])
            r_max = max(plotly_fig['data'][0]['x'])

            plotly_fig['layout']['xaxis'].update(range=[r_min-1, r_max+1])
            plotly_fig['layout']['showlegend'] = False
            plotly_fig['layout']['hovermode'] = 'closest'
            plotly_fig['layout']['height']=380
            plotly_fig['layout']['width']=450
            plotly_fig['layout']['xaxis'].update(zeroline=True, 
                                                 showline=True, 
                                                 ticklen=4, 
                                                 showgrid=False,
                                                 tickmode='linear')
            plotly_fig['layout']['yaxis'].update(showline=True)

            if i!=j:
                # plotly_fig['layout']['height']=380
                plotly_fig['layout']['width']=480
                plotly_fig['layout']['yaxis']['title'] = "SHAP interaction value for {} and {}".format(i.split('-')[0], j.split('-')[0])
                # Define the colorbar
                colorbar_trace = go.Scatter(x=[None],
                                            y=[None],
                                            mode='markers',
                                            marker=dict(
                                                colorscale=red_blue, 
                                                showscale=True,
                                                colorbar=dict(thickness=5, outlinewidth=0),
                                                color=[min(X[X.columns[interaction_index]]), max(X[X.columns[interaction_index]])],
                                            ),
                                            hoverinfo='none'
                                           )
                plotly_fig.add_trace(colorbar_trace)
                plotly_fig.layout.update(
                                         annotations=[dict(
                                              x=1.23,
                                              align="right",
                                              valign="top",
                                              text=X.columns[interaction_index],
                                              showarrow=False,
                                              xref="paper",
                                              yref="paper",
                                              xanchor="right",
                                              yanchor="middle",
                                              textangle=-90,
                                              font=dict(family='Calibri', size=14)
                                            )
                                         ],
                                         margin=dict(t=30, b=30, l=60, r=80)
                                        )
            else:
                plotly_fig['layout']['yaxis']['title'] = "SHAP main effect value for {}".format(i.split('-')[0])
            lis.append(plotly_fig)
    return lis, features