SHAP: SHapley Additive exPlanations

R xgboost Package

library(Wu)
library(data.table)
library("DALEX")
library(xgboost)

titanic_train <- titanic[,c("survived", "class", "gender", "age", "sibsp", "parch", "fare", "embarked")]
titanic_train$survived <- factor(titanic_train$survived)
titanic_train$gender <- factor(titanic_train$gender)
titanic_train$embarked <- factor(titanic_train$embarked)
titanic_train <- na.omit(titanic_train)

Predictors <- c("class", "gender", "age", "sibsp", "parch", "fare", "embarked")

dt <- as.data.table(titanic_train)
dt <- dt[, survived_n := as.numeric(survived %in% c("yes"))]
label <- dt$survived_n

set.seed(123456)
frml <- Wu::wu_formula(outcome = "", predictors = Predictors)
mmx <- model.matrix.lm(frml, data = dt[, ..Predictors], na.action = "na.pass")

ind <- sample(x = c(TRUE, FALSE), size = nrow(dt), replace = TRUE, prob = c(0.8, 0.2))
dtp <- mmx
dtp_train <- dtp[ind,]
dtp_test <- dtp[!ind,]
label_train <- label[ind]
label_test <- label[!ind]

dtp_DM <- xgb.DMatrix(dtp, label=label)
dtrain <- xgb.DMatrix(dtp_train, label = label_train)
dtest <- xgb.DMatrix(dtp_test, label = label_test)



watchlist <- list(train = dtrain, eval = dtest)
params <- list(max_depth = 4
             , objective = "binary:logistic"
             , eval_metric = "auc"
             , eta = 0.7
             , gamma = 2
               )


set.seed(123456)
bst_model <- xgb.train(params = params
                     , data = dtrain
                     , nrounds = 100
                     , watchlist = watchlist
                     , verbose = FALSE
                     , early_stopping_rounds = 10
                       )

pred <- predict(bst_model, dtp)




## bst_model$evaluation_log


## cv <- xgb.cv(
##   data = train
## , label = label
## , nfold = 3
## , max_depth = 3
## , eta = 0.1
## , nthread = 6
## , nrounds = 100
## , gamma = 1
## , eval_metric = 'auc'
## , objective = "binary:logistic"
## , prediction = TRUE
## , verbose = FALSE
## )

## it <- which.max(cv$evaluation_log$test_auc_mean)
## best.iter <- cv$evaluation_log$iter[it]



set.seed(123456)

m <- xgboost(
  data = dtp
, label = label
, max.depth = 3
, eta = 0.7
, nthread = 6
, gamma = 2
, nrounds = 2
, objective = "binary:logistic"
, verbose = FALSE
)



library(pROC)
pred <- predict(m, dtp)  # binary:logistic returns predicted probabilities
roc_m <- roc(label, pred, ci = TRUE, direction = "<")
plt_roc(roc_m) %>% ann("Model ROC on Train")
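
The ROC above is computed on the training matrix. A quick held-out check of the early-stopped model is sketched below, reusing dtp_test and label_test from the split above (plt_roc() and ann() are the same Wu helpers used for the training ROC):

## predicted probabilities on the held-out rows
pred_test <- predict(bst_model, dtp_test)
roc_test <- roc(label_test, pred_test, ci = TRUE, direction = "<")
plt_roc(roc_test) %>% ann("Model ROC on Test")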

Variable Importance

variable               mean_abs_shap
gendermale                 0.7253892
class3rd                   0.2115309
fare                       0.1093484
age                        0.0974512
classdeck crew             0.0709085
sibsp                      0.0242600
(Intercept)                0.0000000
class2nd                   0.0000000
classengineering crew      0.0000000
classrestaurant staff      0.0000000
classvictualling crew      0.0000000
parch                      0.0000000
embarkedCherbourg          0.0000000
embarkedQueenstown         0.0000000
embarkedSouthampton        0.0000000
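
The table above is the mean absolute SHAP value per column of the model matrix. A sketch of one way to reproduce it (the original code is not shown here): ask xgboost for per-row SHAP contributions with predcontrib = TRUE and average their absolute values.

## per-row SHAP contributions; the last column is the bias term ("BIAS")
shap_contrib <- predict(m, dtp, predcontrib = TRUE)
shap_contrib <- shap_contrib[, colnames(shap_contrib) != "BIAS", drop = FALSE]

## mean |SHAP| per feature, sorted as in the table above
mean_abs_shap <- sort(colMeans(abs(shap_contrib)), decreasing = TRUE)
data.table(variable = names(mean_abs_shap), mean_abs_shap = mean_abs_shap)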

R shapper Package

https://cran.r-project.org/web/packages/shapper/vignettes/shapper_regression.html

Load Data

survived class gender age sibsp parch fare embarked
no 3rd male 42 0 0 7.11 Southampton
no 3rd male 13 0 2 20.05 Southampton
no 3rd male 16 1 1 20.05 Southampton
yes 3rd female 39 1 1 20.05 Southampton
yes 3rd female 16 0 0 7.13 Southampton
yes 3rd male 25 0 0 7.13 Southampton
Variable level Overall Missing
n 2179
class (%) 1st 317 (14.5) 0.0
2nd 270 (12.4)
3rd 702 (32.2)
deck crew 66 ( 3.0)
engineering crew 324 (14.9)
restaurant staff 69 ( 3.2)
victualling crew 431 (19.8)
gender (%) female 489 (22.4) 0.0
male 1690 (77.6)
age mean (SD) 30.41 (12.17) 0.0
median [IQR] 29.00 [22.00, 38.00] 0.0
median [range] 29.00 [0.17, 74.00] 0.0
sibsp (%) 0 1761 (80.8) 0.0
1 319 (14.6)
2 42 ( 1.9)
3 20 ( 0.9)
4 22 ( 1.0)
5 6 ( 0.3)
8 9 ( 0.4)
parch (%) 0 1872 (85.9) 0.0
1 170 ( 7.8)
2 113 ( 5.2)
3 8 ( 0.4)
4 6 ( 0.3)
5 6 ( 0.3)
6 2 ( 0.1)
9 2 ( 0.1)
fare mean (SD) 19.78 (43.42) 0.0
median [IQR] 7.15 [0.00, 20.11] 0.0
median [range] 7.15 [0.00, 512.06] 0.0
embarked (%) Belfast 188 ( 8.6) 0.0
Cherbourg 268 (12.3)
Queenstown 123 ( 5.6)
Southampton 1600 (73.4)
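
The preview and the descriptive table above summarize the cleaned titanic_train data built earlier. A minimal sketch of how they might be reproduced (the table-one style summary comes from an unspecified helper, so summary() stands in for it here):

head(titanic_train)      # first rows, as shown above
summary(titanic_train)   # rough stand-in for the descriptive table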

Build Model

Call:
 randomForest(formula = survived ~ ., data = titanic_train)
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 2

        OOB estimate of  error rate: 18.59%
Confusion matrix:
      no yes class.error
no  1374  96  0.06530612
yes  309 400  0.43582511

MeanDecreaseGini
class 85.32083
gender 164.94209
age 83.09432
sibsp 20.98685
parch 18.94307
fare 93.84826
embarked 19.98577
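
The model output and Gini importance above come from a plain random forest fit on titanic_train. A minimal sketch of the calls that would produce them (the seed is illustrative):

library(randomForest)
set.seed(123456)
rf_model <- randomForest(survived ~ ., data = titanic_train)
rf_model                 # call, OOB error rate, confusion matrix
importance(rf_model)     # MeanDecreaseGini per predictor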

shapper
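
The explainer log and the attribution table below can be produced with DALEX::explain() plus shapper::individual_variable_effect(); a sketch, reusing the rf_model object from the sketch above and explaining the first two passengers:

library(shapper)   # wraps the Python shap package via reticulate

exp_rf <- DALEX::explain(rf_model,
                         data = titanic_train[, Predictors],
                         y = as.numeric(titanic_train$survived == "yes"))

ive_rf <- individual_variable_effect(exp_rf,
                                     new_observation = titanic_train[1:2, Predictors])
ive_rf
plot(ive_rf)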

Preparation of a new explainer is initiated
  -> model label       :  randomForest  (default)
  -> data              :  2179  rows  7  cols
  -> target variable   :  2179  values
  -> predict function  :  yhat.randomForest  will be used  (default)
  -> predicted values  :  No value for predict function target column.  (default)
  -> model_info        :  package randomForest , ver. 4.7.1.1 , task classification  (default)
  -> predicted values  :  numerical, min =  0 , mean =  0.2411381 , max =  1
  -> residual function :  difference between y and yhat  (default)
  -> residuals         :  numerical, min =  -0.906 , mean =  0.08424048 , max =  1
  A new explainer has been created!

class gender age sibsp parch fare embarked id ylevel yhat yhat_mean vname attribution sign label
1 3rd male 42 0 0 7.11 Southampton 1 no 0.996 0.7588619 class 0.0516297 randomForest
1.2 3rd male 42 0 0 7.11 Southampton 1 no 0.996 0.7588619 gender 0.1086933 randomForest
1.3 3rd male 42 0 0 7.11 Southampton 1 no 0.996 0.7588619 age 0.0396387 randomForest
1.4 3rd male 42 0 0 7.11 Southampton 1 no 0.996 0.7588619 sibsp -0.0039936 randomForest
1.5 3rd male 42 0 0 7.11 Southampton 1 no 0.996 0.7588619 parch 0.0046673 randomForest
1.6 3rd male 42 0 0 7.11 Southampton 1 no 0.996 0.7588619 fare 0.0281959 randomForest
1.7 3rd male 42 0 0 7.11 Southampton 1 no 0.996 0.7588619 embarked 0.0083069 randomForest
1.1 3rd male 42 0 0 7.11 Southampton 1 yes 0.004 0.2411381 class -0.0516297 randomForest
1.1.1 3rd male 42 0 0 7.11 Southampton 1 yes 0.004 0.2411381 gender -0.1086933 randomForest
1.1.2 3rd male 42 0 0 7.11 Southampton 1 yes 0.004 0.2411381 age -0.0396387 randomForest
1.1.3 3rd male 42 0 0 7.11 Southampton 1 yes 0.004 0.2411381 sibsp 0.0039936 randomForest
1.1.4 3rd male 42 0 0 7.11 Southampton 1 yes 0.004 0.2411381 parch -0.0046673 randomForest
1.1.5 3rd male 42 0 0 7.11 Southampton 1 yes 0.004 0.2411381 fare -0.0281959 randomForest
1.1.6 3rd male 42 0 0 7.11 Southampton 1 yes 0.004 0.2411381 embarked -0.0083069 randomForest
2 3rd male 13 0 2 20.05 Southampton 2 no 0.822 0.7588619 class 0.0626171 randomForest
2.2 3rd male 13 0 2 20.05 Southampton 2 no 0.822 0.7588619 gender 0.0976547 randomForest
2.3 3rd male 13 0 2 20.05 Southampton 2 no 0.822 0.7588619 age -0.1372411 randomForest
2.4 3rd male 13 0 2 20.05 Southampton 2 no 0.822 0.7588619 sibsp -0.0120726 randomForest
2.5 3rd male 13 0 2 20.05 Southampton 2 no 0.822 0.7588619 parch -0.0151101 randomForest
2.6 3rd male 13 0 2 20.05 Southampton 2 no 0.822 0.7588619 fare 0.0443084 randomForest
2.7 3rd male 13 0 2 20.05 Southampton 2 no 0.822 0.7588619 embarked 0.0229816 randomForest
2.1 3rd male 13 0 2 20.05 Southampton 2 yes 0.178 0.2411381 class -0.0626171 randomForest
2.1.1 3rd male 13 0 2 20.05 Southampton 2 yes 0.178 0.2411381 gender -0.0976547 randomForest
2.1.2 3rd male 13 0 2 20.05 Southampton 2 yes 0.178 0.2411381 age 0.1372411 randomForest
2.1.3 3rd male 13 0 2 20.05 Southampton 2 yes 0.178 0.2411381 sibsp 0.0120726 randomForest
2.1.4 3rd male 13 0 2 20.05 Southampton 2 yes 0.178 0.2411381 parch 0.0151101 randomForest
2.1.5 3rd male 13 0 2 20.05 Southampton 2 yes 0.178 0.2411381 fare -0.0443084 randomForest
2.1.6 3rd male 13 0 2 20.05 Southampton 2 yes 0.178 0.2411381 embarked -0.0229816 randomForest

SHAPforxgboost Package

Parameter Searching

suppressPackageStartupMessages({
  library("SHAPforxgboost"); library("ggplot2"); library("xgboost")
  library("data.table"); library("here")
})

y_var <- "diffcwv"
y <- dataXY_df[[y_var]]                     # numeric outcome vector
dataX <- as.matrix(dataXY_df[, -..y_var])   # predictor matrix
dataX <- xgb.DMatrix(dataX, label = y)      # label travels with the DMatrix

## cv1 <- xgb.cv(data = dataX )

library(rBayesianOptimization)
cv_folds <- KFold(y, nfolds = 5, stratified = FALSE, seed = 123456)
xgb_cv_bayes <- function(nround
                       , max.depth
                       , min_child_weight
                       , subsample
                       , eta
                       , gamma
                       , colsample_bytree
                       , max_delta_step) {
  param <- list(booster = "gbtree",
                max_depth = max.depth,
                min_child_weight = min_child_weight,
                eta=eta,gamma=gamma,
                subsample = subsample, colsample_bytree = colsample_bytree,
                max_delta_step=max_delta_step,
                lambda = 1, alpha = 0,
                ## objective = "binary:logistic", eval_metric = "auc",
                objective = "reg:squarederror",
                eval_metric = "rmse")
  cv <- xgb.cv(params = param
             , data = dataX            # label is carried by the DMatrix
             , folds = cv_folds
             , nrounds = 1000
             , early_stopping_rounds = 10
             , maximize = FALSE
             , verbose = FALSE
               )
  ## BayesianOptimization maximizes Score, so return the negative RMSE;
  ## Pred is reused to carry the best iteration (see the note below)
  list(Score = -cv$evaluation_log$test_rmse_mean[cv$best_iteration],
       Pred = cv$best_iteration)
}


OPT_Res <- BayesianOptimization(
  xgb_cv_bayes
, bounds = list(max.depth =c(3L, 10L)
               ,min_child_weight = c(1L, 40L),
                subsample = c(0.6, 0.9),
                eta=c(0.01,0.3),gamma = c(0.0, 0.2),
                colsample_bytree=c(0.5,0.8)
               ,max_delta_step=c(1L,10L))
, init_grid_dt = NULL
, init_points = 10
, n_iter = 10
, acq = "ucb"
, kappa = 2.576
, eps = 0.0
, verbose = TRUE
)

best_param <- list(
  booster = "gbtree",
  eval_metric = "rmse",
  objective = "reg:squarederror",
  max_depth = OPT_Res$Best_Par["max.depth"],
  eta = OPT_Res$Best_Par["eta"],
  gamma = OPT_Res$Best_Par["gamma"],
  subsample = OPT_Res$Best_Par["subsample"],
  colsample_bytree = OPT_Res$Best_Par["colsample_bytree"],
  min_child_weight = OPT_Res$Best_Par["min_child_weight"],
  max_delta_step = OPT_Res$Best_Par["max_delta_step"])
# The number of rounds should also be tuned with CV:
# https://www.hackerearth.com/practice/machine-learning/machine-learning-algorithms/beginners-tutorial-on-xgboost-parameter-tuning-r/tutorial/
# However, nrounds cannot be derived directly from BayesianOptimization.
# Here OPT_Res$Pred, normally meant for cross-validated predictions, was
# repurposed above to record the best iteration of each evaluation.
nrounds <- OPT_Res$Pred[[which.max(OPT_Res$History$Value)]]
xgb_model <- xgb.train(params = best_param, data = dataX, nrounds = nrounds)
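
With the tuned xgb_model in hand, the SHAP values and plots themselves come from SHAPforxgboost; a minimal sketch (dayint is one of the predictor columns in the packaged dataXY_df):

## long-format SHAP values for every row of the training matrix
shap_long <- shap.prep(xgb_model = xgb_model,
                       X_train = as.matrix(dataXY_df[, -..y_var]))

## beeswarm-style summary plot ordered by mean |SHAP|
shap.plot.summary(shap_long)

## dependence plot for a single feature
shap.plot.dependence(data_long = shap_long, x = "dayint")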

XGBoost with Python

HyperOpt

  • best parameters from 10k evaluations, with AUC 0.597: {'colsample_bytree': 0.8559451937195118, 'gamma': 2.4014476785184327, 'learning_rate': 0.06121079922414588, 'max_depth': 18.0, 'n_estimators': 328.47103449384826, 'reg_alpha': 0.5487345673703201, 'reg_lambda': 0.5602856409735258, 'subsample': 0.5309213820780311}
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

clf = xgb.XGBClassifier()


space = {
    "n_estimators": hp.uniform('n_estimators', 30, 800),
    "learning_rate": hp.uniform('learning_rate', 0.0001, 0.2),
    "gamma": hp.uniform('gamma', 0.00001, 10),
    "reg_lambda": hp.uniform('reg_lambda', 0.00001, 10),
    "reg_alpha": hp.uniform('reg_alpha', 0.00001, 10),
    "subsample": hp.uniform('subsample', 0.5, 1.0),
    'max_depth' : hp.quniform('max_depth', 3, 18, 1),
    'colsample_bytree' : hp.uniform('colsample_bytree', 0.5, 1)
}




def objective(space):
    clf=xgb.XGBClassifier(
        max_depth = int(space['max_depth']),
        gamma = space['gamma'],
        reg_lambda = space['reg_lambda'],
        reg_alpha = space['reg_alpha'],
        subsample = space['subsample'],
        colsample_bytree = space['colsample_bytree'],
        n_estimators = int(space['n_estimators']),
        learning_rate = space['learning_rate'],
        booster = "gbtree",
        objective = "binary:logistic",
        n_jobs = -1
    )
    evaluation = [(train_features, train_label), (test_features, test_label)]
    clf.fit(train_features, train_label,
            eval_set = evaluation,
            eval_metric = "auc",
            early_stopping_rounds = 20, verbose = False)
    # use predicted probabilities for the AUC; hyperopt minimizes the loss,
    # so return the negative AUC
    pred = clf.predict_proba(test_features)[:, 1]
    loss = -roc_auc_score(test_label, pred)
    return_dict = {'loss': loss, 'status': STATUS_OK}
    print(return_dict)
    return return_dict
    

trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 10000,
                        trials = trials)

best_hyperparams

trials.__dict__

Ray

from xgboost_ray import RayDMatrix, RayParams, train
from sklearn.datasets import load_breast_cancer

num_actors = 1
num_cpus_per_actor = 1

ray_params = RayParams(
    num_actors=num_actors, cpus_per_actor=num_cpus_per_actor)

def train_model(config):
    train_x, train_y = load_breast_cancer(return_X_y=True)
    train_set = RayDMatrix(train_x, train_y)
    evals_result = {}
    bst = train(
        params=config,
        dtrain=train_set,
        evals_result=evals_result,
        evals=[(train_set, "train")],
        verbose_eval=False,
        ray_params=ray_params)
    bst.save_model("model.xgb")

from ray import tune

# Specify the hyperparameter search space.
config = {
    "tree_method": "approx",
    "objective": "binary:logistic",
    "eval_metric": ["logloss", "error"],
    "eta": tune.loguniform(1e-4, 1e-1),
    "subsample": tune.uniform(0.5, 1.0),
    "max_depth": tune.randint(1, 9)
}

# Make sure to use the `get_tune_resources` method to set the `resources_per_trial`
analysis = tune.run(
    train_model,
    config=config,
    metric="train-error",
    mode="min",
    num_samples=4,
    resources_per_trial=ray_params.get_tune_resources())
print("Best hyperparameters", analysis.best_config)
    


from xgboost_ray import RayDMatrix, RayParams, train
from sklearn.datasets import load_breast_cancer

train_x, train_y = load_breast_cancer(return_X_y=True)
train_set = RayDMatrix(train_x, train_y)

evals_result = {}
bst = train(
    {
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
    },
    train_set,
    evals_result=evals_result,
    evals=[(train_set, "train")],
    verbose_eval=False,
    ray_params=RayParams(
        num_actors=2,  # Number of remote actors
        cpus_per_actor=1))

bst.save_model("model.xgb")
print("Final training error: {:.4f}".format(
    evals_result["train"]["error"][-1]))

AUC & Variable Importance

from sklearn import metrics
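# xgb_cl_best below is assumed to be an XGBClassifier refit on the training data
# with the best hyperparameters found by hyperopt above (e.g. via
# space_eval(space, best_hyperparams)); it is not defined in this document.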
pred_train = xgb_cl_best.predict_proba(train_features)[:,1]
pred_train_pd = pd.DataFrame({'prob':pred_train})
pred_train_pd.to_feather("xgb.feather")


fpr, tpr, threshold = metrics.roc_curve(train_label, pred_train)
roc_auc = metrics.auc(fpr, tpr)
roc_auc


pred_test = xgb_cl_best.predict_proba(test_features)[:,1]
pred_test_pd = pd.DataFrame({'prob':pred_test})
pred_test_pd.to_feather("pred_test_xgb.feather")


fpr, tpr, threshold = metrics.roc_curve(test_label, pred_test)
roc_auc = metrics.auc(fpr, tpr)
roc_auc



importances_weight = xgb_cl_best.get_booster().get_score(importance_type = 'weight')

# importances = xgb_cl_best.feature_importances_
print(importances_weight)

importance_weight_xgb_ = pd.DataFrame([importances_weight]).transpose()
importance_weight_xgb_.index.name = "variable"
importance_weight_xgb_.reset_index(inplace=True)
importance_weight_xgb_.rename(columns={0: "importance_weight"}, inplace=True)


print(importance_weight_xgb_)
print(importance_weight_xgb_.shape)



# importance_weight_xgg = pd.DataFrame({'value':importances_weight, "variable":train_features.columns.values})

importance_weight_xgb_.to_feather("importance_weight_xgb.feather")


print(xgb_cl_best.feature_importances_)


# permutation importance
from sklearn.inspection import permutation_importance
importance_permutation = permutation_importance(xgb_cl_best, train_features, train_label)
print(importance_permutation.importances_mean)

importance_permutation_xgb_ = pd.DataFrame({'value':importance_permutation.importances_mean, "variable":train_features.columns.values})


importance_permutation_xgb_.to_feather("importance_permutation.feather")