SHAP: SHapley Additive exPlanations
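SHAP (Lundberg and Lee, 2017) attributes a single prediction to its features using Shapley values. As a brief reminder (notation added here, not taken from the packages below), the contribution of feature $i$ and the resulting additive decomposition are

$$\phi_i = \sum_{S \subseteq F \setminus \{i\}} \frac{|S|!\,(|F|-|S|-1)!}{|F|!}\,\bigl[f_x(S \cup \{i\}) - f_x(S)\bigr], \qquad f(x) = \phi_0 + \sum_{i=1}^{M} \phi_i,$$

where $F$ is the set of all $M$ features, $f_x(S)$ is the model's expected prediction when only the features in $S$ are known, and $\phi_0 = E[f(x)]$ is the base value (the mean prediction).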
Reference
R xgboost Package
library(Wu)
library(data.table)
library("DALEX")
library(xgboost)
titanic_train <- titanic[,c("survived", "class", "gender", "age", "sibsp", "parch", "fare", "embarked")]
titanic_train$survived <- factor(titanic_train$survived)
titanic_train$gender <- factor(titanic_train$gender)
titanic_train$embarked <- factor(titanic_train$embarked)
titanic_train <- na.omit(titanic_train)
Predictors <- c("class", "gender", "age", "sibsp", "parch", "fare", "embarked")
dt <- as.data.table(titanic_train)
dt <- dt[, survived_n := as.numeric(survived %in% c("yes"))]
label <- dt$survived_n
set.seed(123456)
frml <- Wu::wu_formula(outcome = "", predictors = Predictors)
mmx <- model.matrix.lm(frml, data = dt[, ..Predictors], na.action = "na.pass")
ind <- sample(x = c(TRUE, FALSE), size = nrow(dt), replace = TRUE, prob = c(0.8, 0.2))
dtp <- mmx
dtp_train <- dtp[ind,]
dtp_test <- dtp[!ind,]
label_train <- label[ind]
label_test <- label[!ind]
dtp_DM <- xgb.DMatrix(dtp, label=label)
dtrain <- xgb.DMatrix(dtp_train, label = label_train)
dtest <- xgb.DMatrix(dtp_test, label = label_test)
watchlist <- list(train = dtrain, eval = dtest)
params <- list(max_depth = 4
, objective = "binary:logistic"
, eval_metric = "auc"
, eta = 0.7
, gamma = 2
)
set.seed(123456)
bst_model <- xgb.train(params = params
, data = dtrain
, nrounds = 100
, watchlist = watchlist
, verbose = FALSE
, early_stopping_rounds = 10
)
pred <- predict(bst_model, dtp)
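## Since a watchlist with AUC and early stopping was supplied, the held-out
## performance can be read off the fitted booster directly (a sketch; these
## fields are set by xgb.train when early_stopping_rounds is used):
bst_model$best_iteration
bst_model$best_score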
## bst_model$evaluation_log
## cv <- xgb.cv(
## data = train
## , label = label
## , nfold = 3
## , max_depth = 3
## , eta = 0.1
## , nthread = 6
## , nrounds = 100
## , gamma = 1
## , eval_metric = 'auc'
## , objective = "binary:logistic"
## , prediction = TRUE
## , verbose = FALSE
## )
## it <- which.max(cv$evaluation_log$test_auc_mean)
## best.iter <- cv$evaluation_log$iter[it]
set.seed(123456)
m <- xgboost(
data = dtp
, label = label
, max.depth = 3
, eta = 0.7
, nthread = 6
, gamma = 2
, nrounds = 2
, objective = "binary:logistic"
, verbose = FALSE
)
library(pROC)
pred <- predict(m, dtp, type = "response")
roc_m <- roc(label, pred, ci = TRUE, direction = "<")
plt_roc(roc_m) %>% ann("Model ROC on Train")
library(SHAPforxgboost)
shap_long <- shap.prep(xgb_model = m, X_train = dtp)
shap.importance(shap_long) %>% prt(caption = "Variable Importance")
variable | mean_abs_shap |
---|---|
gendermale | 0.7253892 |
class3rd | 0.2115309 |
fare | 0.1093484 |
age | 0.0974512 |
classdeck crew | 0.0709085 |
sibsp | 0.0242600 |
(Intercept) | 0.0000000 |
class2nd | 0.0000000 |
classengineering crew | 0.0000000 |
classrestaurant staff | 0.0000000 |
classvictualling crew | 0.0000000 |
parch | 0.0000000 |
embarkedCherbourg | 0.0000000 |
embarkedQueenstown | 0.0000000 |
embarkedSouthampton | 0.0000000 |
g1 <- shap.plot.dependence(
data_long = shap_long
, x = "age"
, y = "age"
, color_feature = "gender") +
ggtitle("SHAP Value of Age")
g1
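A global summary (beeswarm) plot of the same shap_long object complements the single-feature dependence plot; a minimal sketch:
## SHAP summary plot across all predictors (sketch)
shap.plot.summary(shap_long)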
Cross-Validation
- Number of iterations: xgb.cv (as in the commented block above) is used to pick the number of boosting rounds for a given setting.
Selected Model
- The best configuration from the grid search is then refit as the final model (see the sketch after the grid-search results below).
Grid Search
repeats <- 10
grd <- expand.grid(max_depth=3:15
, eta=c(0.01, 0.05, 0.1, 0.2, 0.3)
, lambda=c(0.0001, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1)
, auc=0
, nrounds=0
, rep=1:repeats
)
grd <- as.data.table(grd)
n <- nrow(grd)
## n <- 3
for(i in 1:n){
params <- list(max_depth=grd$max_depth[i]
, eta=grd$eta[i]
, lambda=grd$lambda[i]
)
cv <- xgb.cv(data=dtrain
, nrounds=300
, nfold = 5
, metrics = "auc"
, params=params
, objective="binary:logistic"
, verbose = FALSE)
print(i)
grd$auc[i] <- max(cv$evaluation_log$test_auc_mean)
grd$nrounds[i] <- which.max(cv$evaluation_log$test_auc_mean)[1]
}
saveRDS(grd, file="xgboost_grid.RDS")
grd[, .(mauc = mean(auc)), by = .(max_depth, eta, lambda)][order(-mauc)][1:12, ]
print(grd[eta == 0.3 & lambda == 1 & max_depth == 7])
max(cv$evaluation_log$test_auc_mean)
which.max(cv$evaluation_log$test_auc_mean)[1]
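As a sketch of the "Selected Model" step above, the top grid configuration can be refit on the training DMatrix and checked on the held-out rows. This only reuses objects already defined (grd, dtrain, dtest, label_test); it is not a further-tuned final model.
## Refit the best grid configuration and evaluate on the held-out set (sketch)
best <- grd[, .(mauc = mean(auc), nrounds = round(mean(nrounds))),
  by = .(max_depth, eta, lambda)][order(-mauc)][1]
params_sel <- list(max_depth = best$max_depth, eta = best$eta, lambda = best$lambda
  , objective = "binary:logistic", eval_metric = "auc")
set.seed(123456)
bst_sel <- xgb.train(params = params_sel, data = dtrain, nrounds = best$nrounds)
roc(label_test, predict(bst_sel, dtest), direction = "<")$auc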
R shapper Package
https://cran.r-project.org/web/packages/shapper/vignettes/shapper_regression.html
Load Data
library("DALEX")
titanic_train <- titanic[,c("survived", "class", "gender", "age", "sibsp", "parch", "fare", "embarked")]
titanic_train$survived <- factor(titanic_train$survived)
titanic_train$gender <- factor(titanic_train$gender)
titanic_train$embarked <- factor(titanic_train$embarked)
titanic_train <- na.omit(titanic_train)
head(titanic_train) %>% prt()
survived | class | gender | age | sibsp | parch | fare | embarked |
---|---|---|---|---|---|---|---|
no | 3rd | male | 42 | 0 | 0 | 7.11 | Southampton |
no | 3rd | male | 13 | 0 | 2 | 20.05 | Southampton |
no | 3rd | male | 16 | 1 | 1 | 20.05 | Southampton |
yes | 3rd | female | 39 | 1 | 1 | 20.05 | Southampton |
yes | 3rd | female | 16 | 0 | 0 | 7.13 | Southampton |
yes | 3rd | male | 25 | 0 | 0 | 7.13 | Southampton |
Vars <- c(
"class"
, "gender"
, "age"
, "sibsp"
, "parch"
, "fare"
, "embarked"
)
factorVars <- c(
"class"
, "gender"
, "sibsp"
, "parch"
, "embarked"
)
tbl1n(data = titanic_train, vars = Vars, factorVars = factorVars) %>% prt()
Variable | level | Overall | Missing |
---|---|---|---|
n | | 2179 | |
class (%) | 1st | 317 (14.5) | 0.0 |
 | 2nd | 270 (12.4) | |
 | 3rd | 702 (32.2) | |
 | deck crew | 66 ( 3.0) | |
 | engineering crew | 324 (14.9) | |
 | restaurant staff | 69 ( 3.2) | |
 | victualling crew | 431 (19.8) | |
gender (%) | female | 489 (22.4) | 0.0 |
 | male | 1690 (77.6) | |
age | mean (SD) | 30.41 (12.17) | 0.0 |
 | median [IQR] | 29.00 [22.00, 38.00] | 0.0 |
 | median [range] | 29.00 [0.17, 74.00] | 0.0 |
sibsp (%) | 0 | 1761 (80.8) | 0.0 |
 | 1 | 319 (14.6) | |
 | 2 | 42 ( 1.9) | |
 | 3 | 20 ( 0.9) | |
 | 4 | 22 ( 1.0) | |
 | 5 | 6 ( 0.3) | |
 | 8 | 9 ( 0.4) | |
parch (%) | 0 | 1872 (85.9) | 0.0 |
 | 1 | 170 ( 7.8) | |
 | 2 | 113 ( 5.2) | |
 | 3 | 8 ( 0.4) | |
 | 4 | 6 ( 0.3) | |
 | 5 | 6 ( 0.3) | |
 | 6 | 2 ( 0.1) | |
 | 9 | 2 ( 0.1) | |
fare | mean (SD) | 19.78 (43.42) | 0.0 |
 | median [IQR] | 7.15 [0.00, 20.11] | 0.0 |
 | median [range] | 7.15 [0.00, 512.06] | 0.0 |
embarked (%) | Belfast | 188 ( 8.6) | 0.0 |
 | Cherbourg | 268 (12.3) | |
 | Queenstown | 123 ( 5.6) | |
 | Southampton | 1600 (73.4) | |
Build Model
library("randomForest")
set.seed(123)
model_rf <- randomForest(survived ~ . , data = titanic_train)
model_rf
Call:
 randomForest(formula = survived ~ ., data = titanic_train)
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 2

        OOB estimate of error rate: 18.59%
Confusion matrix:
      no yes class.error
no  1374  96  0.06530612
yes  309 400  0.43582511
variable | MeanDecreaseGini |
---|---|
class | 85.32083 |
gender | 164.94209 |
age | 83.09432 |
sibsp | 20.98685 |
parch | 18.94307 |
fare | 93.84826 |
embarked | 19.98577 |
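The MeanDecreaseGini table above comes straight from the fitted forest; a minimal sketch using the same prt() table helper as elsewhere in these notes:
importance(model_rf) %>% prt(caption = "Random Forest Variable Importance")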
shapper
library("DALEX")
exp_rf <- explain(model_rf, data = titanic_train[,-1], y = as.numeric(titanic_train[,1])-1)
Preparation of a new explainer is initiated
  -> model label       :  randomForest  ( default )
  -> data              :  2179  rows  7  cols
  -> target variable   :  2179  values
  -> predict function  :  yhat.randomForest  will be used  ( default )
  -> predicted values  :  No value for predict function target column. ( default )
  -> model_info        :  package randomForest , ver. 4.7.1.1 , task classification ( default )
  -> predicted values  :  numerical, min = 0 , mean = 0.2411381 , max = 1
  -> residual function :  difference between y and yhat ( default )
  -> residuals         :  numerical, min = -0.906 , mean = 0.08424048 , max = 1
A new explainer has been created!
p_function <- function(model, data) predict(model, newdata = data, type = "prob")
library("shapper")
ive_rf <- individual_variable_effect(
model_rf
, data = titanic_train[, -1]
, predict_function = p_function
, new_observation = titanic_train[1:2, -1]
, nsamples = 50
)
ive_rf %>% prt()
 | class | gender | age | sibsp | parch | fare | embarked | id | ylevel | yhat | yhat_mean | vname | attribution | sign | label |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 3rd | male | 42 | 0 | 0 | 7.11 | Southampton | 1 | no | 0.996 | 0.7588619 | class | 0.0516297 | | randomForest |
1.2 | 3rd | male | 42 | 0 | 0 | 7.11 | Southampton | 1 | no | 0.996 | 0.7588619 | gender | 0.1086933 | | randomForest |
1.3 | 3rd | male | 42 | 0 | 0 | 7.11 | Southampton | 1 | no | 0.996 | 0.7588619 | age | 0.0396387 | | randomForest |
1.4 | 3rd | male | 42 | 0 | 0 | 7.11 | Southampton | 1 | no | 0.996 | 0.7588619 | sibsp | -0.0039936 | | randomForest |
1.5 | 3rd | male | 42 | 0 | 0 | 7.11 | Southampton | 1 | no | 0.996 | 0.7588619 | parch | 0.0046673 | | randomForest |
1.6 | 3rd | male | 42 | 0 | 0 | 7.11 | Southampton | 1 | no | 0.996 | 0.7588619 | fare | 0.0281959 | | randomForest |
1.7 | 3rd | male | 42 | 0 | 0 | 7.11 | Southampton | 1 | no | 0.996 | 0.7588619 | embarked | 0.0083069 | | randomForest |
1.1 | 3rd | male | 42 | 0 | 0 | 7.11 | Southampton | 1 | yes | 0.004 | 0.2411381 | class | -0.0516297 | | randomForest |
1.1.1 | 3rd | male | 42 | 0 | 0 | 7.11 | Southampton | 1 | yes | 0.004 | 0.2411381 | gender | -0.1086933 | | randomForest |
1.1.2 | 3rd | male | 42 | 0 | 0 | 7.11 | Southampton | 1 | yes | 0.004 | 0.2411381 | age | -0.0396387 | | randomForest |
1.1.3 | 3rd | male | 42 | 0 | 0 | 7.11 | Southampton | 1 | yes | 0.004 | 0.2411381 | sibsp | 0.0039936 | | randomForest |
1.1.4 | 3rd | male | 42 | 0 | 0 | 7.11 | Southampton | 1 | yes | 0.004 | 0.2411381 | parch | -0.0046673 | | randomForest |
1.1.5 | 3rd | male | 42 | 0 | 0 | 7.11 | Southampton | 1 | yes | 0.004 | 0.2411381 | fare | -0.0281959 | | randomForest |
1.1.6 | 3rd | male | 42 | 0 | 0 | 7.11 | Southampton | 1 | yes | 0.004 | 0.2411381 | embarked | -0.0083069 | | randomForest |
2 | 3rd | male | 13 | 0 | 2 | 20.05 | Southampton | 2 | no | 0.822 | 0.7588619 | class | 0.0626171 | | randomForest |
2.2 | 3rd | male | 13 | 0 | 2 | 20.05 | Southampton | 2 | no | 0.822 | 0.7588619 | gender | 0.0976547 | | randomForest |
2.3 | 3rd | male | 13 | 0 | 2 | 20.05 | Southampton | 2 | no | 0.822 | 0.7588619 | age | -0.1372411 | | randomForest |
2.4 | 3rd | male | 13 | 0 | 2 | 20.05 | Southampton | 2 | no | 0.822 | 0.7588619 | sibsp | -0.0120726 | | randomForest |
2.5 | 3rd | male | 13 | 0 | 2 | 20.05 | Southampton | 2 | no | 0.822 | 0.7588619 | parch | -0.0151101 | | randomForest |
2.6 | 3rd | male | 13 | 0 | 2 | 20.05 | Southampton | 2 | no | 0.822 | 0.7588619 | fare | 0.0443084 | | randomForest |
2.7 | 3rd | male | 13 | 0 | 2 | 20.05 | Southampton | 2 | no | 0.822 | 0.7588619 | embarked | 0.0229816 | | randomForest |
2.1 | 3rd | male | 13 | 0 | 2 | 20.05 | Southampton | 2 | yes | 0.178 | 0.2411381 | class | -0.0626171 | | randomForest |
2.1.1 | 3rd | male | 13 | 0 | 2 | 20.05 | Southampton | 2 | yes | 0.178 | 0.2411381 | gender | -0.0976547 | | randomForest |
2.1.2 | 3rd | male | 13 | 0 | 2 | 20.05 | Southampton | 2 | yes | 0.178 | 0.2411381 | age | 0.1372411 | | randomForest |
2.1.3 | 3rd | male | 13 | 0 | 2 | 20.05 | Southampton | 2 | yes | 0.178 | 0.2411381 | sibsp | 0.0120726 | | randomForest |
2.1.4 | 3rd | male | 13 | 0 | 2 | 20.05 | Southampton | 2 | yes | 0.178 | 0.2411381 | parch | 0.0151101 | | randomForest |
2.1.5 | 3rd | male | 13 | 0 | 2 | 20.05 | Southampton | 2 | yes | 0.178 | 0.2411381 | fare | -0.0443084 | | randomForest |
2.1.6 | 3rd | male | 13 | 0 | 2 | 20.05 | Southampton | 2 | yes | 0.178 | 0.2411381 | embarked | -0.0229816 | | randomForest |
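shapper also provides a plot method for these attribution objects, which draws the per-feature contributions for each explained observation; a minimal sketch:
plot(ive_rf)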
SHAPforxgboost Package
Parameter Searching
library(SHAPforxgboost)
suppressPackageStartupMessages({
library("SHAPforxgboost"); library("ggplot2"); library("xgboost")
library("data.table"); library("here")
})
data("dataXY_df", package = "SHAPforxgboost")  # example data shipped with the package
y_var <- "diffcwv"
y <- dataXY_df[[y_var]]
dataX <- as.matrix(dataXY_df[, -..y_var])
dataX <- xgb.DMatrix(dataX, label = y)
## cv1 <- xgb.cv(data = dataX )
library(rBayesianOptimization)
cv_folds <- KFold(y, nfolds = 5, stratified = FALSE, seed = 123456)
xgb_cv_bayes <- function(nround
, max.depth
, min_child_weight
, subsample
, eta
, gamma
, colsample_bytree
, max_delta_step) {
param <- list(booster = "gbtree",
max_depth = max.depth,
min_child_weight = min_child_weight,
eta=eta,gamma=gamma,
subsample = subsample, colsample_bytree = colsample_bytree,
max_delta_step=max_delta_step,
lambda = 1, alpha = 0,
## objective = "binary:logistic",
objective = "reg:squarederror",
eval_metric = "rmse")
cv <- xgb.cv(params = param
, data = dataX
, folds = cv_folds
, nrounds = 1000
, early_stopping_rounds = 10
, maximize = FALSE
, verbose = FALSE
)
## BayesianOptimization maximizes Score, so return the negative CV RMSE
list(Score = -cv$evaluation_log$test_rmse_mean[cv$best_iteration],
Pred = cv$best_iteration)
}
OPT_Res <- BayesianOptimization(
xgb_cv_bayes
, bounds = list(max.depth =c(3L, 10L)
,min_child_weight = c(1L, 40L),
subsample = c(0.6, 0.9),
eta=c(0.01,0.3),gamma = c(0.0, 0.2),
colsample_bytree=c(0.5,0.8)
,max_delta_step=c(1L,10L))
, init_grid_dt = NULL
, init_points = 10
, n_iter = 10
, acq = "ucb"
, kappa = 2.576
, eps = 0.0
, verbose = TRUE
)
best_param <- list(
booster = "gbtree",
eval_metric = "rmse",
objective = "reg:squarederror",
max_depth = OPT_Res$Best_Par["max.depth"],
eta = OPT_Res$Best_Par["eta"],
gamma = OPT_Res$Best_Par["gamma"],
subsample = OPT_Res$Best_Par["subsample"],
colsample_bytree = OPT_Res$Best_Par["colsample_bytree"],
min_child_weight = OPT_Res$Best_Par["min_child_weight"],
max_delta_step = OPT_Res$Best_Par["max_delta_step"])
# The number of boosting rounds should be tuned with cross-validation.
# https://www.hackerearth.com/practice/machine-learning/machine-learning-algorithms/beginners-tutorial-on-xgboost-parameter-tuning-r/tutorial/
# However, nrounds cannot be derived directly from the BayesianOptimization call.
# Here OPT_Res$Pred, normally meant to hold cross-validated predictions, is used to record the best number of rounds.
nrounds=OPT_Res$Pred[[which.max(OPT_Res$History$Value)]]
xgb_model <- xgb.train(params = best_param, data = dataX, nrounds = nrounds)
Model
library(SHAPforxgboost)
suppressPackageStartupMessages({
library("SHAPforxgboost"); library("ggplot2"); library("xgboost")
library("data.table"); library("here")
})
y_var <- "diffcwv"
dataX <- as.matrix(dataXY_df[,-..y_var])
param_list <- list(objective = "reg:squarederror", # For regression
eta = 0.02,
max_depth = 10,
gamma = 0.01,
subsample = 0.95
)
mod <- xgboost::xgboost(data = dataX,
label = as.matrix(dataXY_df[[y_var]]),
params = param_list, nrounds = 10,
verbose = FALSE, nthread = parallel::detectCores() - 2,
early_stopping_rounds = 8)
shap_values <- shap.values(xgb_model = mod, X_train = dataX)
# The ranked features by mean |SHAP|
shap_values$mean_shap_score %>% prt()
shap_values$shap_score[1:30,] %>% prt()
hist(shap_values$shap_score$forestProp_1km)
SHAP Summary Plot
shap_long <- shap.prep(xgb_model = mod, X_train = dataX)
# equivalently, build the long-format data from a precomputed shap_contrib
shap_long <- shap.prep(shap_contrib = shap_values$shap_score, X_train = dataX)
shap.plot.summary(shap_long)
shap.plot.summary.wrap1(model = mod, X = dataX)
shap.plot.summary.wrap2(shap_score = shap_values$shap_score, X = dataX)
Dependence Plot
g1 <- shap.plot.dependence(data_long = shap_long
, x = "dayint"
, y = "dayint"
, color_feature = "Column_WV") +
ggtitle("(A) SHAP values of Time trend vs. Time Trend")
g2 <- shap.plot.dependence(data_long = shap_long
, x = "dayint"
, y = "Column_WV"
, color_feature = "Column_WV") +
ggtitle("(B) SHAP values of CWV vs. Time Trend")
gridExtra::grid.arrange(g1, g2, ncol = 2)
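SHAPforxgboost can also stack the per-observation SHAP values into a force-plot view; a minimal sketch (top_n and n_groups are illustrative choices, not tuned values):
## Stack SHAP values of the top features and cluster observations into groups (sketch)
plot_data <- shap.prep.stack.data(shap_contrib = shap_values$shap_score, top_n = 4, n_groups = 6)
shap.plot.force_plot(plot_data, zoom_in = FALSE)
shap.plot.force_plot_bygroup(plot_data)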
XGBoost with Python
Data
import pandas as pd
import feather
import os
print(os.getcwd())
os.chdir("")
print(os.listdir())
print(os.getcwd())
path = "matrix_train.feather"
df = pd.read_feather(path)
print(df.columns.values.tolist())
train_label = df['outcome']
train_features = df.drop(columns='outcome')
print(train_features.columns.values.tolist())
print(train_features.shape)
patht = "matrix_test.feather"
dft = pd.read_feather(patht)
print(dft.columns.values.tolist())
test_label = dft['outcome']
test_features = dft.drop(columns='outcome')
Model
import xgboost as xgb
from datetime import datetime
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
import matplotlib.pyplot as plt
import pickle
xgb_cl_best = xgb.XGBClassifier(
booster = "gbtree",
objective = "binary:logistic",
# eval_metric = "auc",
n_estimators = 133,
colsample_bytree = 0.85,
subsample = 0.53,
learning_rate = 0.16,
max_depth = 4,
gamma = 2.9,
reg_lambda = 1,
reg_alpha = 0.58,
n_jobs = -1
)
eval_set = [(test_features, test_label)]
xgb_cl_best.fit(train_features,
train_label,
eval_metric = "auc",
eval_set=eval_set,
early_stopping_rounds = 50,
verbose = True
)
with open('xgb.pkl', 'wb') as outp:
pickle.dump(xgb_cl_best, outp, pickle.HIGHEST_PROTOCOL)
xgb_cl_best
HyperOpt
- best params from 10k evaluations with AUC 0.597: {'colsample_bytree': 0.8559451937195118, 'gamma': 2.4014476785184327, 'learning_rate': 0.06121079922414588, 'max_depth': 18.0, 'n_estimators': 328.47103449384826, 'reg_alpha': 0.5487345673703201, 'reg_lambda': 0.5602856409735258, 'subsample': 0.5309213820780311}
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
clf = xgb.XGBClassifier()
space = {
"n_estimators": hp.uniform('n_estimators', 30, 800),
"learning_rate": hp.uniform('learning_rate', 0.0001, 0.2),
"gamma": hp.uniform('gamma', 0.00001, 10),
"reg_lambda": hp.uniform('reg_lambda', 0.00001, 10),
"reg_alpha": hp.uniform('reg_alpha', 0.00001, 10),
"subsample": hp.uniform('subsample', 0.5, 1.0),
'max_depth' : hp.quniform('max_depth', 3, 18, 1),
'colsample_bytree' : hp.uniform('colsample_bytree', 0.5, 1)
}
def objective(space):
clf=xgb.XGBClassifier(
max_depth = int(space['max_depth']),
gamma = space['gamma'],
reg_lambda = space['reg_lambda'],
reg_alpha = space['reg_alpha'],
subsample = space['subsample'],
colsample_bytree = space['colsample_bytree'],
n_estimators = int(space['n_estimators']),
learning_rate = space['learning_rate'],
booster = "gbtree",
objective = "binary:logistic",
n_jobs = -1
)
evaluation = [(train_features, train_label), (test_features, test_label)]
clf.fit(train_features, train_label,
eval_set = evaluation,
eval_metric = "auc",
early_stopping_rounds = 20, verbose = False)
pred = clf.predict_proba(test_features)[:, 1]
loss = -roc_auc_score(test_label, pred)  # negated because hyperopt minimizes the loss
return_dict = {'loss': loss, 'status': STATUS_OK}
print(return_dict)
return return_dict
trials = Trials()
best_hyperparams = fmin(fn = objective,
space = space,
algo = tpe.suggest,
max_evals = 10000,
trials = trials)
best_hyperparams
trials.__dict__
Ray
from xgboost_ray import RayDMatrix, RayParams, train
from sklearn.datasets import load_breast_cancer
num_actors = 1
num_cpus_per_actor = 1
ray_params = RayParams(
num_actors=num_actors, cpus_per_actor=num_cpus_per_actor)
def train_model(config):
train_x, train_y = load_breast_cancer(return_X_y=True)
train_set = RayDMatrix(train_x, train_y)
evals_result = {}
bst = train(
params=config,
dtrain=train_set,
evals_result=evals_result,
evals=[(train_set, "train")],
verbose_eval=False,
ray_params=ray_params)
bst.save_model("model.xgb")
from ray import tune
# Specify the hyperparameter search space.
config = {
"tree_method": "approx",
"objective": "binary:logistic",
"eval_metric": ["logloss", "error"],
"eta": tune.loguniform(1e-4, 1e-1),
"subsample": tune.uniform(0.5, 1.0),
"max_depth": tune.randint(1, 9)
}
# Make sure to use the `get_tune_resources` method to set the `resources_per_trial`
analysis = tune.run(
train_model,
config=config,
metric="train-error",
mode="min",
num_samples=4,
resources_per_trial=ray_params.get_tune_resources())
print("Best hyperparameters", analysis.best_config)
from xgboost_ray import RayDMatrix, RayParams, train
from sklearn.datasets import load_breast_cancer
train_x, train_y = load_breast_cancer(return_X_y=True)
train_set = RayDMatrix(train_x, train_y)
evals_result = {}
bst = train(
{
"objective": "binary:logistic",
"eval_metric": ["logloss", "error"],
},
train_set,
evals_result=evals_result,
evals=[(train_set, "train")],
verbose_eval=False,
ray_params=RayParams(
num_actors=2, # Number of remote actors
cpus_per_actor=1))
bst.save_model("model.xgb")
print("Final training error: {:.4f}".format(
evals_result["train"]["error"][-1]))
AUC & Variable Importance
from sklearn import metrics
pred_train = xgb_cl_best.predict_proba(train_features)[:,1]
pred_train_pd = pd.DataFrame({'prob':pred_train})
pred_train_pd.to_feather("xgb.feather")
fpr, tpr, threshold = metrics.roc_curve(train_label, pred_train)
roc_auc = metrics.auc(fpr, tpr)
roc_auc
pred_test = xgb_cl_best.predict_proba(test_features)[:,1]
pred_test_pd = pd.DataFrame({'prob':pred_test})
pred_test_pd.to_feather("pred_test_xgb.feather")
fpr, tpr, threshold = metrics.roc_curve(test_label, pred_test)
roc_auc = metrics.auc(fpr, tpr)
roc_auc
importances_weight = xgb_cl_best.get_booster().get_score(importance_type = 'weight')
# importances = xgb_cl_best.feature_importances_
print(importances_weight)
importance_weight_xgb_ = pd.DataFrame([importances_weight]).transpose()
importance_weight_xgb_.index.name = "variable"
importance_weight_xgb_.reset_index(inplace=True)
importance_weight_xgb_.rename(columns={0: "importance_weight"}, inplace=True)
print(importance_weight_xgb_)
print(importance_weight_xgb_.shape)
# importance_weight_xgg = pd.DataFrame({'value':importances_weight, "variable":train_features.columns.values})
importance_weight_xgb_.to_feather("importance_weight_xgb.feather")
print(xgb_cl_best.feature_importances_)
# permutation importance
from sklearn.inspection import permutation_importance
importance_permutation = permutation_importance(xgb_cl_best, train_features, train_label)
print(importance_permutation.importances_mean)
importance_permutation_xgb_ = pd.DataFrame({'value':importance_permutation.importances_mean, "variable":train_features.columns.values})
importance_permutation_xgb_.to_feather("importance_permutation.feather")