SHAP: SHapley Additive exPlanations
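SHAP (Lundberg and Lee, 2017) attributes a single prediction to its features using Shapley values. As a brief reminder (notation added here, not taken from the packages below), the contribution of feature $i$ and the resulting additive decomposition are

$$\phi_i = \sum_{S \subseteq F \setminus \{i\}} \frac{|S|!\,(|F|-|S|-1)!}{|F|!}\,\bigl[f_x(S \cup \{i\}) - f_x(S)\bigr], \qquad f(x) = \phi_0 + \sum_{i=1}^{M} \phi_i,$$

where $F$ is the set of all $M$ features, $f_x(S)$ is the model's expected prediction when only the features in $S$ are known, and $\phi_0 = E[f(x)]$ is the base value (the mean prediction).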
Reference
R xgboost Package
library(Wu)
library(data.table)
library("DALEX")
library(xgboost)
titanic_train <- titanic[,c("survived", "class", "gender", "age", "sibsp", "parch", "fare", "embarked")]
titanic_train$survived <- factor(titanic_train$survived)
titanic_train$gender <- factor(titanic_train$gender)
titanic_train$embarked <- factor(titanic_train$embarked)
titanic_train <- na.omit(titanic_train)
Predictors <- c("class", "gender", "age", "sibsp", "parch", "fare", "embarked")
dt <- as.data.table(titanic_train)
dt <- dt[, survived_n := as.numeric(survived %in% c("yes"))]
label <- dt$survived_n
set.seed(123456)
frml <- Wu::wu_formula(outcome = "", predictors = Predictors)
mmx <- model.matrix.lm(frml, data = dt[, ..Predictors], na.action = "na.pass")
ind <- sample(x = c(TRUE, FALSE), size = nrow(dt), replace = TRUE, prob = c(0.8, 0.2))
dtp <- mmx
dtp_train <- dtp[ind,]
dtp_test <- dtp[!ind,]
label_train <- label[ind]
label_test <- label[!ind]
dtp_DM <- xgb.DMatrix(dtp, label=label)
dtrain <- xgb.DMatrix(dtp_train, label = label_train)
dtest <- xgb.DMatrix(dtp_test, label = label_test)
watchlist <- list(train = dtrain, eval = dtest)
params <- list(max_depth = 4
, objective = "binary:logistic"
, eval_metric = "auc"
, eta = 0.7
, gamma = 2
)
set.seed(123456)
bst_model <- xgb.train(params = params
, data = dtrain
, nrounds = 100
, watchlist = watchlist
, verbose = FALSE
, early_stopping_rounds = 10
)
pred <- predict(bst_model, dtp)
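## Since a watchlist with AUC and early stopping was supplied, the held-out
## performance can be read off the fitted booster directly (a sketch; these
## fields are set by xgb.train when early_stopping_rounds is used):
bst_model$best_iteration
bst_model$best_score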
## bst_model$evaluation_log
## cv <- xgb.cv(
## data = train
## , label = label
## , nfold = 3
## , max_depth = 3
## , eta = 0.1
## , nthread = 6
## , nrounds = 100
## , gamma = 1
## , eval_metric = 'auc'
## , objective = "binary:logistic"
## , prediction = TRUE
## , verbose = FALSE
## )
## it <- which.max(cv$evaluation_log$test_auc_mean)
## best.iter <- cv$evaluation_log$iter[it]
set.seed(123456)
m <- xgboost(
data = dtp
, label = label
, max.depth = 3
, eta = 0.7
, nthread = 6
, gamma = 2
, nrounds = 2
, objective = "binary:logistic"
, verbose = FALSE
)
library(pROC)
pred <- predict(m, dtp, type = "response")
roc_m <- roc(label, pred, ci = TRUE, direction = "<")
plt_roc(roc_m) %>% ann("Model ROC on Train")
library(SHAPforxgboost)
shap_long <- shap.prep(xgb_model = m, X_train = dtp)
shap.importance(shap_long) %>% prt(caption = "Variable Importance")
variable | mean_abs_shap |
---|---|
gendermale | 0.7253892 |
class3rd | 0.2115309 |
fare | 0.1093484 |
age | 0.0974512 |
classdeck crew | 0.0709085 |
sibsp | 0.0242600 |
(Intercept) | 0.0000000 |
class2nd | 0.0000000 |
classengineering crew | 0.0000000 |
classrestaurant staff | 0.0000000 |
classvictualling crew | 0.0000000 |
parch | 0.0000000 |
embarkedCherbourg | 0.0000000 |
embarkedQueenstown | 0.0000000 |
embarkedSouthampton | 0.0000000 |
g1 <- shap.plot.dependence(
data_long = shap_long
, x = "age"
, y = "age"
, color_feature = "gender") +
ggtitle("SHAP Value of Age")
g1
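A global summary (beeswarm) plot of the same shap_long object complements the single-feature dependence plot; a minimal sketch:
## SHAP summary plot across all predictors (sketch)
shap.plot.summary(shap_long)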
Cross-Validation
- Number of iterations: xgb.cv (as in the commented block above) is used to pick the number of boosting rounds for a given setting.
Selected Model
- The best configuration from the grid search is then refit as the final model (see the sketch after the grid-search results below).
Grid Search
repeats <- 10
grd <- expand.grid(max_depth=3:15
, eta=c(0.01, 0.05, 0.1, 0.2, 0.3)
, lambda=c(0.0001, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1)
, auc=0
, nrounds=0
, rep=1:repeats
)
grd <- as.data.table(grd)
n <- nrow(grd)
## n <- 3
for(i in 1:n){
params <- list(max_depth=grd$max_depth[i]
, eta=grd$eta[i]
, lambda=grd$lambda[i]
)
cv <- xgb.cv(data=dtrain
, nrounds=300
, nfold = 5
, metrics = "auc"
, params=params
, objective="binary:logistic"
, verbose = FALSE)
print(i)
grd$auc[i] <- max(cv$evaluation_log$test_auc_mean)
grd$nrounds[i] <- which.max(cv$evaluation_log$test_auc_mean)[1]
}
saveRDS(grd, file="xgboost_grid.RDS")
grd[, .(mauc = mean(auc)), by = .(max_depth, eta, lambda)][order(-mauc)][1:12, ]
print(grd[eta == 0.3 & lambda == 1 & max_depth == 7])
max(cv$evaluation_log$test_auc_mean)
which.max(cv$evaluation_log$test_auc_mean)[1]
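As a sketch of the "Selected Model" step above, the top grid configuration can be refit on the training DMatrix and checked on the held-out rows. This only reuses objects already defined (grd, dtrain, dtest, label_test); it is not a further-tuned final model.
## Refit the best grid configuration and evaluate on the held-out set (sketch)
best <- grd[, .(mauc = mean(auc), nrounds = round(mean(nrounds))),
  by = .(max_depth, eta, lambda)][order(-mauc)][1]
params_sel <- list(max_depth = best$max_depth, eta = best$eta, lambda = best$lambda
  , objective = "binary:logistic", eval_metric = "auc")
set.seed(123456)
bst_sel <- xgb.train(params = params_sel, data = dtrain, nrounds = best$nrounds)
roc(label_test, predict(bst_sel, dtest), direction = "<")$auc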
R shapper Package
https://cran.r-project.org/web/packages/shapper/vignettes/shapper_regression.html
Load Data
library("DALEX")
titanic_train <- titanic[,c("survived", "class", "gender", "age", "sibsp", "parch", "fare", "embarked")]
titanic_train$survived <- factor(titanic_train$survived)
titanic_train$gender <- factor(titanic_train$gender)
titanic_train$embarked <- factor(titanic_train$embarked)
titanic_train <- na.omit(titanic_train)
head(titanic_train) %>% prt()
survived | class | gender | age | sibsp | parch | fare | embarked |
---|---|---|---|---|---|---|---|
no | 3rd | male | 42 | 0 | 0 | 7.11 | Southampton |
no | 3rd | male | 13 | 0 | 2 | 20.05 | Southampton |
no | 3rd | male | 16 | 1 | 1 | 20.05 | Southampton |
yes | 3rd | female | 39 | 1 | 1 | 20.05 | Southampton |
yes | 3rd | female | 16 | 0 | 0 | 7.13 | Southampton |
yes | 3rd | male | 25 | 0 | 0 | 7.13 | Southampton |
Vars <- c(
"class"
, "gender"
, "age"
, "sibsp"
, "parch"
, "fare"
, "embarked"
)
factorVars <- c(
"class"
, "gender"
, "sibsp"
, "parch"
, "embarked"
)
tbl1n(data = titanic_train, vars = Vars, factorVars = factorVars) %>% prt()
Variable | level | Overall | Missing |
---|---|---|---|
n | | 2179 | |
class (%) | 1st | 317 (14.5) | 0.0 |
 | 2nd | 270 (12.4) | |
 | 3rd | 702 (32.2) | |
 | deck crew | 66 ( 3.0) | |
 | engineering crew | 324 (14.9) | |
 | restaurant staff | 69 ( 3.2) | |
 | victualling crew | 431 (19.8) | |
gender (%) | female | 489 (22.4) | 0.0 |
 | male | 1690 (77.6) | |
age | mean (SD) | 30.41 (12.17) | 0.0 |
 | median [IQR] | 29.00 [22.00, 38.00] | 0.0 |
 | median [range] | 29.00 [0.17, 74.00] | 0.0 |
sibsp (%) | 0 | 1761 (80.8) | 0.0 |
 | 1 | 319 (14.6) | |
 | 2 | 42 ( 1.9) | |
 | 3 | 20 ( 0.9) | |
 | 4 | 22 ( 1.0) | |
 | 5 | 6 ( 0.3) | |
 | 8 | 9 ( 0.4) | |
parch (%) | 0 | 1872 (85.9) | 0.0 |
 | 1 | 170 ( 7.8) | |
 | 2 | 113 ( 5.2) | |
 | 3 | 8 ( 0.4) | |
 | 4 | 6 ( 0.3) | |
 | 5 | 6 ( 0.3) | |
 | 6 | 2 ( 0.1) | |
 | 9 | 2 ( 0.1) | |
fare | mean (SD) | 19.78 (43.42) | 0.0 |
 | median [IQR] | 7.15 [0.00, 20.11] | 0.0 |
 | median [range] | 7.15 [0.00, 512.06] | 0.0 |
embarked (%) | Belfast | 188 ( 8.6) | 0.0 |
 | Cherbourg | 268 (12.3) | |
 | Queenstown | 123 ( 5.6) | |
 | Southampton | 1600 (73.4) | |
Build Model
library("randomForest")
set.seed(123)
model_rf <- randomForest(survived ~ . , data = titanic_train)
model_rf
Call:
 randomForest(formula = survived ~ ., data = titanic_train)
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 2

        OOB estimate of error rate: 18.59%
Confusion matrix:
      no yes class.error
no  1374  96  0.06530612
yes  309 400  0.43582511
variable | MeanDecreaseGini |
---|---|
class | 85.32083 |
gender | 164.94209 |
age | 83.09432 |
sibsp | 20.98685 |
parch | 18.94307 |
fare | 93.84826 |
embarked | 19.98577 |
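The MeanDecreaseGini table above comes straight from the fitted forest; a minimal sketch using the same prt() table helper as elsewhere in these notes:
importance(model_rf) %>% prt(caption = "Random Forest Variable Importance")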
shapper
library("DALEX")
exp_rf <- explain(model_rf, data = titanic_train[,-1], y = as.numeric(titanic_train[,1])-1)
Preparation of a new explainer is initiated
  -> model label       :  randomForest  ( default )
  -> data              :  2179  rows  7  cols
  -> target variable   :  2179  values
  -> predict function  :  yhat.randomForest  will be used  ( default )
  -> predicted values  :  No value for predict function target column. ( default )
  -> model_info        :  package randomForest , ver. 4.7.1.1 , task classification ( default )
  -> predicted values  :  numerical, min = 0 , mean = 0.2411381 , max = 1
  -> residual function :  difference between y and yhat ( default )
  -> residuals         :  numerical, min = -0.906 , mean = 0.08424048 , max = 1
A new explainer has been created!
p_function <- function(model, data) predict(model, newdata = data, type = "prob")
library("shapper")
ive_rf <- individual_variable_effect(
model_rf
, data = titanic_train[, -1]
, predict_function = p_function
, new_observation = titanic_train[1:2, -1]
, nsamples = 50
)
ive_rf %>% prt()
 | class | gender | age | sibsp | parch | fare | embarked | id | ylevel | yhat | yhat_mean | vname | attribution | sign | label |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 3rd | male | 42 | 0 | 0 | 7.11 | Southampton | 1 | no | 0.996 | 0.7588619 | class | 0.0516297 | | randomForest |
1.2 | 3rd | male | 42 | 0 | 0 | 7.11 | Southampton | 1 | no | 0.996 | 0.7588619 | gender | 0.1086933 | | randomForest |
1.3 | 3rd | male | 42 | 0 | 0 | 7.11 | Southampton | 1 | no | 0.996 | 0.7588619 | age | 0.0396387 | | randomForest |
1.4 | 3rd | male | 42 | 0 | 0 | 7.11 | Southampton | 1 | no | 0.996 | 0.7588619 | sibsp | -0.0039936 | | randomForest |
1.5 | 3rd | male | 42 | 0 | 0 | 7.11 | Southampton | 1 | no | 0.996 | 0.7588619 | parch | 0.0046673 | | randomForest |
1.6 | 3rd | male | 42 | 0 | 0 | 7.11 | Southampton | 1 | no | 0.996 | 0.7588619 | fare | 0.0281959 | | randomForest |
1.7 | 3rd | male | 42 | 0 | 0 | 7.11 | Southampton | 1 | no | 0.996 | 0.7588619 | embarked | 0.0083069 | | randomForest |
1.1 | 3rd | male | 42 | 0 | 0 | 7.11 | Southampton | 1 | yes | 0.004 | 0.2411381 | class | -0.0516297 | | randomForest |
1.1.1 | 3rd | male | 42 | 0 | 0 | 7.11 | Southampton | 1 | yes | 0.004 | 0.2411381 | gender | -0.1086933 | | randomForest |
1.1.2 | 3rd | male | 42 | 0 | 0 | 7.11 | Southampton | 1 | yes | 0.004 | 0.2411381 | age | -0.0396387 | | randomForest |
1.1.3 | 3rd | male | 42 | 0 | 0 | 7.11 | Southampton | 1 | yes | 0.004 | 0.2411381 | sibsp | 0.0039936 | | randomForest |
1.1.4 | 3rd | male | 42 | 0 | 0 | 7.11 | Southampton | 1 | yes | 0.004 | 0.2411381 | parch | -0.0046673 | | randomForest |
1.1.5 | 3rd | male | 42 | 0 | 0 | 7.11 | Southampton | 1 | yes | 0.004 | 0.2411381 | fare | -0.0281959 | | randomForest |
1.1.6 | 3rd | male | 42 | 0 | 0 | 7.11 | Southampton | 1 | yes | 0.004 | 0.2411381 | embarked | -0.0083069 | | randomForest |
2 | 3rd | male | 13 | 0 | 2 | 20.05 | Southampton | 2 | no | 0.822 | 0.7588619 | class | 0.0626171 | | randomForest |
2.2 | 3rd | male | 13 | 0 | 2 | 20.05 | Southampton | 2 | no | 0.822 | 0.7588619 | gender | 0.0976547 | | randomForest |
2.3 | 3rd | male | 13 | 0 | 2 | 20.05 | Southampton | 2 | no | 0.822 | 0.7588619 | age | -0.1372411 | | randomForest |
2.4 | 3rd | male | 13 | 0 | 2 | 20.05 | Southampton | 2 | no | 0.822 | 0.7588619 | sibsp | -0.0120726 | | randomForest |
2.5 | 3rd | male | 13 | 0 | 2 | 20.05 | Southampton | 2 | no | 0.822 | 0.7588619 | parch | -0.0151101 | | randomForest |
2.6 | 3rd | male | 13 | 0 | 2 | 20.05 | Southampton | 2 | no | 0.822 | 0.7588619 | fare | 0.0443084 | | randomForest |
2.7 | 3rd | male | 13 | 0 | 2 | 20.05 | Southampton | 2 | no | 0.822 | 0.7588619 | embarked | 0.0229816 | | randomForest |
2.1 | 3rd | male | 13 | 0 | 2 | 20.05 | Southampton | 2 | yes | 0.178 | 0.2411381 | class | -0.0626171 | | randomForest |
2.1.1 | 3rd | male | 13 | 0 | 2 | 20.05 | Southampton | 2 | yes | 0.178 | 0.2411381 | gender | -0.0976547 | | randomForest |
2.1.2 | 3rd | male | 13 | 0 | 2 | 20.05 | Southampton | 2 | yes | 0.178 | 0.2411381 | age | 0.1372411 | | randomForest |
2.1.3 | 3rd | male | 13 | 0 | 2 | 20.05 | Southampton | 2 | yes | 0.178 | 0.2411381 | sibsp | 0.0120726 | | randomForest |
2.1.4 | 3rd | male | 13 | 0 | 2 | 20.05 | Southampton | 2 | yes | 0.178 | 0.2411381 | parch | 0.0151101 | | randomForest |
2.1.5 | 3rd | male | 13 | 0 | 2 | 20.05 | Southampton | 2 | yes | 0.178 | 0.2411381 | fare | -0.0443084 | | randomForest |
2.1.6 | 3rd | male | 13 | 0 | 2 | 20.05 | Southampton | 2 | yes | 0.178 | 0.2411381 | embarked | -0.0229816 | | randomForest |
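shapper also provides a plot method for these attribution objects, which draws the per-feature contributions for each explained observation; a minimal sketch:
plot(ive_rf)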
SHAPforxgboost Package
Parameter Searching
library(SHAPforxgboost)
suppressPackageStartupMessages({
library("SHAPforxgboost"); library("ggplot2"); library("xgboost")
library("data.table"); library("here")
})
data("dataXY_df", package = "SHAPforxgboost")  # example data shipped with the package
y_var <- "diffcwv"
y <- dataXY_df[[y_var]]
dataX <- as.matrix(dataXY_df[, -..y_var])
dataX <- xgb.DMatrix(dataX, label = y)
## cv1 <- xgb.cv(data = dataX )
library(rBayesianOptimization)
cv_folds <- KFold(y, nfolds = 5, stratified = FALSE, seed = 123456)
xgb_cv_bayes <- function(nround
, max.depth
, min_child_weight
, subsample
, eta
, gamma
, colsample_bytree
, max_delta_step) {
param <- list(booster = "gbtree",
max_depth = max.depth,
min_child_weight = min_child_weight,
eta=eta,gamma=gamma,
subsample = subsample, colsample_bytree = colsample_bytree,
max_delta_step=max_delta_step,
lambda = 1, alpha = 0,
## objective = "binary:logistic",
objective = "reg:squarederror",
eval_metric = "rmse")
cv <- xgb.cv(params = param
, data = dataX
, folds = cv_folds
, nrounds = 1000
, early_stopping_rounds = 10
, maximize = FALSE
, verbose = FALSE
)
## BayesianOptimization maximizes Score, so return the negative CV RMSE
list(Score = -cv$evaluation_log$test_rmse_mean[cv$best_iteration],
Pred = cv$best_iteration)
}
OPT_Res <- BayesianOptimization(
xgb_cv_bayes
, bounds = list(max.depth =c(3L, 10L)
,min_child_weight = c(1L, 40L),
subsample = c(0.6, 0.9),
eta=c(0.01,0.3),gamma = c(0.0, 0.2),
colsample_bytree=c(0.5,0.8)
,max_delta_step=c(1L,10L))
, init_grid_dt = NULL
, init_points = 10
, n_iter = 10
, acq = "ucb"
, kappa = 2.576
, eps = 0.0
, verbose = TRUE
)
best_param <- list(
booster = "gbtree",
eval_metric = "rmse",
objective = "reg:squarederror",
max_depth = OPT_Res$Best_Par["max.depth"],
eta = OPT_Res$Best_Par["eta"],
gamma = OPT_Res$Best_Par["gamma"],
subsample = OPT_Res$Best_Par["subsample"],
colsample_bytree = OPT_Res$Best_Par["colsample_bytree"],
min_child_weight = OPT_Res$Best_Par["min_child_weight"],
max_delta_step = OPT_Res$Best_Par["max_delta_step"])
# The number of boosting rounds should be tuned with cross-validation.
# https://www.hackerearth.com/practice/machine-learning/machine-learning-algorithms/beginners-tutorial-on-xgboost-parameter-tuning-r/tutorial/
# However, nrounds cannot be derived directly from the BayesianOptimization call.
# Here OPT_Res$Pred, normally meant to hold cross-validated predictions, is used to record the best number of rounds.
nrounds=OPT_Res$Pred[[which.max(OPT_Res$History$Value)]]
xgb_model <- xgb.train(params = best_param, data = dataX, nrounds = nrounds)
Model
library(SHAPforxgboost)
suppressPackageStartupMessages({
library("SHAPforxgboost"); library("ggplot2"); library("xgboost")
library("data.table"); library("here")
})
y_var <- "diffcwv"
dataX <- as.matrix(dataXY_df[,-..y_var])
param_list <- list(objective = "reg:squarederror", # For regression
eta = 0.02,
max_depth = 10,
gamma = 0.01,
subsample = 0.95
)
mod <- xgboost::xgboost(data = dataX,
label = as.matrix(dataXY_df[[y_var]]),
params = param_list, nrounds = 10,
verbose = FALSE, nthread = parallel::detectCores() - 2,
early_stopping_rounds = 8)
shap_values <- shap.values(xgb_model = mod, X_train = dataX)
# The ranked features by mean |SHAP|
shap_values$mean_shap_score %>% prt()
shap_values$shap_score[1:30,] %>% prt()
hist(shap_values$shap_score$forestProp_1km)
SHAP Summary Plot
shap_long <- shap.prep(xgb_model = mod, X_train = dataX)
# equivalently, build the long-format data from a precomputed shap_contrib
shap_long <- shap.prep(shap_contrib = shap_values$shap_score, X_train = dataX)
shap.plot.summary(shap_long)
shap.plot.summary.wrap1(model = mod, X = dataX)
shap.plot.summary.wrap2(shap_score = shap_values$shap_score, X = dataX)
Dependence Plot
g1 <- shap.plot.dependence(data_long = shap_long
, x = "dayint"
, y = "dayint"
, color_feature = "Column_WV") +
ggtitle("(A) SHAP values of Time trend vs. Time Trend")
g2 <- shap.plot.dependence(data_long = shap_long
, x = "dayint"
, y = "Column_WV"
, color_feature = "Column_WV") +
ggtitle("(B) SHAP values of CWV vs. Time Trend")
gridExtra::grid.arrange(g1, g2, ncol = 2)
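SHAPforxgboost can also stack the per-observation SHAP values into a force-plot view; a minimal sketch (top_n and n_groups are illustrative choices, not tuned values):
## Stack SHAP values of the top features and cluster observations into groups (sketch)
plot_data <- shap.prep.stack.data(shap_contrib = shap_values$shap_score, top_n = 4, n_groups = 6)
shap.plot.force_plot(plot_data, zoom_in = FALSE)
shap.plot.force_plot_bygroup(plot_data)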
XGBoost with Python
Data
import pandas as pd
import feather
import os
print(os.getcwd())
os.chdir("")
print(os.listdir())
print(os.getcwd())
path = "matrix_train.feather"
df = pd.read_feather(path)
print(df.columns.values.tolist())
train_label = df['outcome']
train_features = df.drop(columns='outcome')
print(train_features.columns.values.tolist())
print(train_features.shape)
patht = "matrix_test.feather"
dft = pd.read_feather(patht)
print(dft.columns.values.tolist())
test_label = dft['outcome']
test_features = dft.drop(columns='outcome')
Model
import xgboost as xgb
from datetime import datetime
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
import matplotlib.pyplot as plt
import pickle
xgb_cl_best = xgb.XGBClassifier(
booster = "gbtree",
objective = "binary:logistic",
# eval_metric = "auc",
n_estimators = 133,
colsample_bytree = 0.85,
subsample = 0.53,
learning_rate = 0.16,
max_depth = 4,
gamma = 2.9,
reg_lambda = 1,
reg_alpha = 0.58,
n_jobs = -1
)
eval_set = [(test_features, test_label)]
xgb_cl_best.fit(train_features,
train_label,
eval_metric = "auc",
eval_set=eval_set,
early_stopping_rounds = 50,
verbose = True
)
with open('xgb.pkl', 'wb') as outp:
pickle.dump(xgb_cl_best, outp, pickle.HIGHEST_PROTOCOL)
xgb_cl_best
HyperOpt
- best params from 10k evaluations with AUC 0.597: {'colsample_bytree': 0.8559451937195118, 'gamma': 2.4014476785184327, 'learning_rate': 0.06121079922414588, 'max_depth': 18.0, 'n_estimators': 328.47103449384826, 'reg_alpha': 0.5487345673703201, 'reg_lambda': 0.5602856409735258, 'subsample': 0.5309213820780311}
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
clf = xgb.XGBClassifier()
space = {
"n_estimators": hp.uniform('n_estimators', 30, 800),
"learning_rate": hp.uniform('learning_rate', 0.0001, 0.2),
"gamma": hp.uniform('gamma', 0.00001, 10),
"reg_lambda": hp.uniform('reg_lambda', 0.00001, 10),
"reg_alpha": hp.uniform('reg_alpha', 0.00001, 10),
"subsample": hp.uniform('subsample', 0.5, 1.0),
'max_depth' : hp.quniform('max_depth', 3, 18, 1),
'colsample_bytree' : hp.uniform('colsample_bytree', 0.5, 1)
}
def objective(space):
clf=xgb.XGBClassifier(
max_depth = int(space['max_depth']),
gamma = space['gamma'],
reg_lambda = space['reg_lambda'],
reg_alpha = space['reg_alpha'],
subsample = space['subsample'],
colsample_bytree = space['colsample_bytree'],
n_estimators = int(space['n_estimators']),
learning_rate = space['learning_rate'],
booster = "gbtree",
objective = "binary:logistic",
n_jobs = -1
)
evaluation = [(train_features, train_label), (test_features, test_label)]
clf.fit(train_features, train_label,
eval_set = evaluation,
eval_metric = "auc",
early_stopping_rounds = 20, verbose = False)
pred = clf.predict_proba(test_features)[:, 1]
loss = -roc_auc_score(test_label, pred)  # negated because hyperopt minimizes the loss
return_dict = {'loss': loss, 'status': STATUS_OK}
print(return_dict)
return return_dict
trials = Trials()
best_hyperparams = fmin(fn = objective,
space = space,
algo = tpe.suggest,
max_evals = 10000,
trials = trials)
best_hyperparams
trials.__dict__
Ray
from xgboost_ray import RayDMatrix, RayParams, train
from sklearn.datasets import load_breast_cancer
num_actors = 1
num_cpus_per_actor = 1
ray_params = RayParams(
num_actors=num_actors, cpus_per_actor=num_cpus_per_actor)
def train_model(config):
train_x, train_y = load_breast_cancer(return_X_y=True)
train_set = RayDMatrix(train_x, train_y)
evals_result = {}
bst = train(
params=config,
dtrain=train_set,
evals_result=evals_result,
evals=[(train_set, "train")],
verbose_eval=False,
ray_params=ray_params)
bst.save_model("model.xgb")
from ray import tune
# Specify the hyperparameter search space.
config = {
"tree_method": "approx",
"objective": "binary:logistic",
"eval_metric": ["logloss", "error"],
"eta": tune.loguniform(1e-4, 1e-1),
"subsample": tune.uniform(0.5, 1.0),
"max_depth": tune.randint(1, 9)
}
# Make sure to use the `get_tune_resources` method to set the `resources_per_trial`
analysis = tune.run(
train_model,
config=config,
metric="train-error",
mode="min",
num_samples=4,
resources_per_trial=ray_params.get_tune_resources())
print("Best hyperparameters", analysis.best_config)
from xgboost_ray import RayDMatrix, RayParams, train
from sklearn.datasets import load_breast_cancer
train_x, train_y = load_breast_cancer(return_X_y=True)
train_set = RayDMatrix(train_x, train_y)
evals_result = {}
bst = train(
{
"objective": "binary:logistic",
"eval_metric": ["logloss", "error"],
},
train_set,
evals_result=evals_result,
evals=[(train_set, "train")],
verbose_eval=False,
ray_params=RayParams(
num_actors=2, # Number of remote actors
cpus_per_actor=1))
bst.save_model("model.xgb")
print("Final training error: {:.4f}".format(
evals_result["train"]["error"][-1]))
AUC & Variable Importance
from sklearn import metrics
pred_train = xgb_cl_best.predict_proba(train_features)[:,1]
pred_train_pd = pd.DataFrame({'prob':pred_train})
pred_train_pd.to_feather("xgb.feather")
fpr, tpr, threshold = metrics.roc_curve(train_label, pred_train)
roc_auc = metrics.auc(fpr, tpr)
roc_auc
pred_test = xgb_cl_best.predict_proba(test_features)[:,1]
pred_test_pd = pd.DataFrame({'prob':pred_test})
pred_test_pd.to_feather("pred_test_xgb.feather")
fpr, tpr, threshold = metrics.roc_curve(test_label, pred_test)
roc_auc = metrics.auc(fpr, tpr)
roc_auc
importances_weight = xgb_cl_best.get_booster().get_score(importance_type = 'weight')
# importances = xgb_cl_best.feature_importances_
print(importances_weight)
importance_weight_xgb_ = pd.DataFrame([importances_weight]).transpose()
importance_weight_xgb_.index.name = "variable"
importance_weight_xgb_.reset_index(inplace=True)
importance_weight_xgb_.rename(columns={0: "importance_weight"}, inplace=True)
print(importance_weight_xgb_)
print(importance_weight_xgb_.shape)
# importance_weight_xgg = pd.DataFrame({'value':importances_weight, "variable":train_features.columns.values})
importance_weight_xgb_.to_feather("importance_weight_xgb.feather")
print(xgb_cl_best.feature_importances_)
# permutation importance
from sklearn.inspection import permutation_importance
importance_permutation = permutation_importance(xgb_cl_best, train_features, train_label)
print(importance_permutation.importances_mean)
importance_permutation_xgb_ = pd.DataFrame({'value':importance_permutation.importances_mean, "variable":train_features.columns.values})
importance_permutation_xgb_.to_feather("importance_permutation.feather")