Random Forest
Simulate Data
library(data.table)
set.seed(123456)
n <- 5000
dt <- data.table(
p0 = rep(0.2, n)
, or1 = rep(1, n)
, var1 = sample(c(0, 1), size = n, replace = TRUE, prob = c(0.3, 0.7))
, var1n = rnorm(n, 0, 1)
, or2 = rep(1.1, n)
, var2 = sample(c(0, 1), size = n, replace = TRUE, prob = c(0.4, 0.6))
, var2n = rnorm(n, 0, 2)
, or3 = rep(1.2, n)
, var3 = sample(c(0, 1), size = n, replace = TRUE, prob = c(0.2, 0.8))
, var3n = rnorm(n, 0, 2)
, or4 = rep(1.5, n)
, var4 = sample(c(0, 1), size = n, replace = TRUE, prob = c(0.3, 0.7))
, var4n = rnorm(n, 0, 2)
, or5 = rep(1.7, n)
, var5 = sample(c(0, 1), size = n, replace = TRUE, prob = c(0.5, 0.5))
, var5n = rnorm(n, 0, 2)
, or6 = rep(2, n)
, var6 = sample(c(0, 1), size = n, replace = TRUE, prob = c(0.4, 0.6))
, var6n = rnorm(n, 0, 2)
, or7 = rep(5, n)
, var7 = sample(c(0, 1), size = n, replace = TRUE, prob = c(0.1, 0.9))
, var7n = rnorm(n, 0, 2)
)
dt <- dt[, odds0 := p0 / (1 - p0)
][, log_odds := log(odds0) +
var1 * log(or1) + var1n * log(or1) +
var2 * log(or2) + var2n * log(or2) +
var3 * log(or3) + var3n * log(or3) +
var4 * log(or4) + var4n * log(or4) +
var5 * log(or5) + var5n * log(or5) +
var6 * log(or6) + var6n * log(or6) +
var7 * log(or7) + var7n * log(or7)
][, p := exp(log_odds)/ (1 + exp(log_odds))]
vsample <- function(p){
sample(c(1, 0), size = 1, replace = TRUE, prob = c(p, 1 - p))
}
vsample <- Vectorize(vsample)
dt <- dt[, outcome := vsample(p)][, outcome := factor(outcome, levels = c(0, 1))]
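Each odds ratio OR_j acts on both the binary variable var*j* and the continuous variable var*j*n, so the simulation follows the logistic data-generating model

$$\log\frac{p}{1-p}=\log\frac{p_0}{1-p_0}+\sum_{j=1}^{7}\left(\mathrm{var}_j+\mathrm{var}_{jn}\right)\log(\mathrm{OR}_j),\qquad p=\frac{e^{\text{log-odds}}}{1+e^{\text{log-odds}}}$$

and the outcome is a Bernoulli(p) draw; the Vectorize-d vsample() is equivalent (up to the random stream) to rbinom(n, 1, p).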
library(magrittr)
library(Wu)  ## prt() and (later) plt_varimp() are helpers from the author's Wu package
unique(dt[, .(or1, or2, or3, or4, or5, or6, or7)]) %>% prt(caption = "Variables with Odds Ratios")
or1 | or2 | or3 | or4 | or5 | or6 | or7 |
---|---|---|---|---|---|---|
1 | 1.1 | 1.2 | 1.5 | 1.7 | 2 | 5 |
GLM
m <- glm(outcome ~ var1 + var2 + var3 + var4 + var5 + var6 + var7 +
var1n + var2n + var3n + var4n + var5n + var6n + var7n
, data = dt
, family = binomial
)
library(sjPlot)
tab_model(m)
Predictors | Odds Ratios | CI | p |
---|---|---|---|
(Intercept) | 0.14 | 0.09 – 0.21 | <0.001 |
var1 | 1.15 | 0.94 – 1.41 | 0.160 |
var2 | 1.11 | 0.92 – 1.34 | 0.271 |
var3 | 1.41 | 1.12 – 1.78 | 0.004 |
var4 | 1.76 | 1.44 – 2.16 | <0.001 |
var5 | 1.97 | 1.64 – 2.38 | <0.001 |
var6 | 2.34 | 1.93 – 2.85 | <0.001 |
var7 | 5.44 | 4.01 – 7.42 | <0.001 |
var1n | 1.05 | 0.95 – 1.15 | 0.344 |
var2n | 1.11 | 1.06 – 1.17 | <0.001 |
var3n | 1.20 | 1.14 – 1.26 | <0.001 |
var4n | 1.50 | 1.42 – 1.58 | <0.001 |
var5n | 1.69 | 1.61 – 1.79 | <0.001 |
var6n | 2.10 | 1.97 – 2.23 | <0.001 |
var7n | 5.15 | 4.67 – 5.71 | <0.001 |
Observations | 5000 | | |
R² Tjur | 0.620 | | |
Random Forest with R
- R randomForest package
Variable Importance
The simulated dataset dt from the Simulate Data section above is reused here.
predictors <- c(
"var1"
, "var2"
, "var3"
, "var4"
, "var5"
, "var6"
, "var7"
, "var1n"
, "var2n"
, "var3n"
, "var4n"
, "var5n"
, "var6n"
, "var7n"
)
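## build a numeric design matrix from the predictors, passing NA rows through;
## wu_formula() is a helper from the author's Wu package that assembles
## outcome ~ predictor formulas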
frml <- Wu::wu_formula(outcome = "", predictors = predictors)
mmx <- model.matrix.lm(frml, data = dt, na.action = "na.pass")
cn <- colnames(mmx)
frml <- Wu::wu_formula(outcome = "outcome", predictors = cn[-1])
dm <- cbind(dt[, .(outcome)], mmx[,-1])
library(randomForest)
## library(doParallel)
set.seed(123456)
## cl <- makePSOCKcluster(detectCores() - 2)
## registerDoParallel(cl)
rf <- randomForest(frml
, data = dm
, ntree = 2000
, mtry = 2
, sampsize = ceiling(0.632 * nrow(dt))
, importance = TRUE
, na.action = na.roughfix
)
## stopCluster(cl)
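## OOB error (overall and per class) versus the number of trees: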
plot(rf)
vi <- importance(rf, type = 1)
vit <- data.table(variable = attr(vi, "dimnames")[[1]], vi)
colnames(vit) <- c("variable", "MeanDecreaseAccuracy")
vit <- vit[order(MeanDecreaseAccuracy)]
vit <- vit[, variable := factor(variable, levels = vit$variable)]
plt_varimp(data = vit, var_name = variable
, var_imp = MeanDecreaseAccuracy
, xlabel = "Mean Decrease Accuracy"
, ylabel = "Variable"
)
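plt_varimp() comes from the author's Wu package; if it is unavailable, the built-in importance plot gives a comparable display. A minimal sketch:

varImpPlot(rf, type = 1, main = "Mean Decrease Accuracy")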
Grid Search
hyper_grid <- expand.grid(
mtry = seq(2, 16, by = 2),
node_size = seq(1, 15, by = 5),
sample_size = round(c(0.632, seq(0.5, 0.9, by=0.1)) * nrow(dt)),
oob_err=as.numeric(NA)
)
setDT(hyper_grid)
dim(hyper_grid)
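## 8 mtry values x 3 node sizes x 6 sample sizes = 144 candidate settings
## trial fit on the first grid row (sanity check before the full loop):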
i <- 1
m <- randomForest(
formula = frml
, data = dt
, ntree = 500
, mtry = hyper_grid$mtry[i]
, nodesize = hyper_grid$node_size[i]
, sampsize = hyper_grid$sample_size[i]
, replace = TRUE
)
for(i in 1:nrow(hyper_grid)) {
print(i)
set.seed(20220202)
m <- randomForest(
formula = frml
, data = dt
, ntree = 500
, mtry = hyper_grid$mtry[i]
, nodesize = hyper_grid$node_size[i]
, sampsize = hyper_grid$sample_size[i]
, replace = TRUE
)
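## err.rate holds the cumulative OOB error after each tree; the median summarizes the whole curve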
oob <- median(m$err.rate[, "OOB"])
print(oob)
hyper_grid$oob_err[i] <- oob
}
hyper_grid <- hyper_grid[order(oob_err)]
hyper_grid[1:10, ]
which.min(hyper_grid$oob_err)  ## always 1 once the table is sorted by oob_err
randomForest::tuneRF
- Tune mtry, the number of variables randomly sampled as candidates at each split
rm_dt()
dtt <- readRDS(file = "_matrix_test.RDS")
dt <- readRDS(file = "_matrix_train.RDS")
dt <- dt[, recurrence := factor(recurrence, levels=c(0, 1))]
dt[, .N, by = .(recurrence)]
frml <- Wu::wu_formula(outcome = "recurrence", predictors = colnames(dt)[-1])
library(randomForest)
colnames(dt)
X <- copy(dt)[, recurrence := NULL]  ## predictors only
Y <- dt$recurrence                   ## outcome
table(Y)
class(Y)
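## tuneRF() steps mtry up and down by stepFactor from mtryStart and keeps going
## while the relative reduction in OOB error exceeds `improve`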
tn <- tuneRF(
x=X
, y=Y
, mtryStart=5
, ntreeTry=50000
, stepFactor=1.5
, improve=0.0000001
, trace=TRUE
, plot=TRUE
)
tn
set.seed(2022)
rf <- randomForest(frml
, data = dt
, ntree = 10000
, mtry = 2
## , nodesize=180
## , samplesize=0.9
, sampsize = ceiling(0.5 * nrow(dt))
, replace=TRUE
## , importance = TRUE
## , do.trace = 20
## , na.action = na.roughfix
## , proximity = TRUE
)
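## with no newdata, predict.randomForest() returns out-of-bag predictions on the training data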
pred <- predict(object = rf, type = "prob")
library(pROC)
r <- roc(dt$recurrence, pred[, 2], ci = TRUE, direction = "<")
r
pred_test <- predict(object = rf, newdata = dtt, type = "prob")
library(pROC)
r_test <- roc(dtt$recurrence, pred_test[, 2], ci = TRUE, direction = "<")
r_test
set.seed(2022)
rf <- randomForest(frml
, data = dt
, ntree = 2000
, mtry = 2
, nodesize=180
, sampsize = ceiling(0.632 * nrow(dt))
, replace=TRUE
, importance = TRUE
, na.action = na.roughfix
, proximity = TRUE
)
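With proximity = TRUE the fit stores an n-by-n case-proximity matrix. One optional use, sketched here with the built-in MDSplot(), is a multidimensional-scaling view of how the two classes separate:

MDSplot(rf, fac = dt$recurrence, k = 2, palette = c("grey60", "red"))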
caret::train
library(caret)
library(dplyr)  ## for case_when()
dt <- readRDS(file = "_matrix_train.RDS")
dt <- dt[, recurrence := factor(case_when(recurrence == 1 ~ "Stone"
, TRUE ~ "No")
, levels=c("No", "Stone"))]
frml <- Wu::wu_formula(outcome = "recurrence", predictors = colnames(dt)[-1])
control <- trainControl(
method="repeatedcv"
, number=10
, repeats=3
, search="random"
, classProbs = TRUE
, summaryFunction = twoClassSummary  ## needed for metric = "ROC" below
)
library(doParallel)
cl <- makeCluster(8)
registerDoParallel(cl)
set.seed(2022)
rf_random <- train(frml
, data=dt
, method="rf"
, metric='ROC'
, tuneLength=15
, trControl=control
)
stopCluster(cl)
rf_random
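caret's plot method for train objects shows the resampled metric across the randomly sampled mtry values, e.g.:

plot(rf_random)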
Find Max Depth via nodesize
rm_dt()
dt <- readRDS(file = "_matrix_train.RDS")
dt <- dt[, recurrence := factor(recurrence, levels=c(0, 1))]
frml <- Wu::wu_formula(outcome = "recurrence", predictors = colnames(dt)[-1])
library(pROC)
library(Wu)
library(randomForest)
library(doParallel)
md <- expand.grid(mtry=3
, nodesize=seq(180, 200, 10)
, ntree=c(5000)
, auc=0)
## md <- expand.grid(mtry=(4:5)
## , nodesize=seq(5, 10, 5)
## , ntree=c(500)
## , auc=0)
for(i in 1:nrow(md)){
nodesize <- md$nodesize[i]
ntree <- md$ntree[i]
mtry <- md$mtry[i]
print(c(as.character(mtry), as.character(nodesize)))
if(exists("rf")){rm(list=c("rf"))}
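## note: randomForest() itself runs single-threaded; the cluster registered below has
## no effect unless trees are grown through foreach and merged with combine()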
cl <- makePSOCKcluster(6)
registerDoParallel(cl)
rf <- randomForest(x=dt[, -1]
, y=dt$recurrence
, ntree = ntree
, mtry = mtry
, nodesize=nodesize
, sampsize = ceiling(0.632 * nrow(dt))
, replace=TRUE
, importance = FALSE
, na.action = na.roughfix
, proximity = TRUE
)
stopCluster(cl)
pred <- predict(object = rf, type = "prob")
r <- roc(dt$recurrence, pred[, 2], ci = TRUE, direction = "<")
md$auc[i] <- r$auc
print(r$auc)
}
saveRDS(md, file="randomforestCV.RDS")
setDT(md)
print(md)
print(class(md))
print(md[, mean(auc), by = .(mtry)])
print(md[, mean(auc), by = .(nodesize)])
library(ggplot2)
ggplot(md, aes(x=mtry, y=nodesize, size=auc)) + geom_point()
Partial Dependence Plot
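A minimal single-variable sketch with the built-in partialPlot(), assuming the forest rf and design matrix dm fitted in the Variable Importance section:

## marginal effect of var7n on the scale for class "1"
partialPlot(rf, pred.data = as.data.frame(dm), x.var = "var7n", which.class = "1")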
Partial Dependence Plot with Two Variables
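partialPlot() handles one variable at a time; for a pair of variables a hedged sketch with the pdp package (an assumption here, it is not used elsewhere in these notes) could be:

library(pdp)
## joint partial dependence of a binary and a continuous predictor;
## which.class = 2 targets the second outcome level ("1")
pd2 <- partial(rf, pred.var = c("var7", "var7n"), which.class = 2
             , prob = TRUE, train = as.data.frame(dm))
plotPartial(pd2)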
Partial Dependence Plot with Two Continuous Variables
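The same pdp sketch with two continuous predictors; chull = TRUE limits the grid to the convex hull of the observed pairs so the surface is not extrapolated:

pd2n <- partial(rf, pred.var = c("var6n", "var7n"), which.class = 2
              , prob = TRUE, chull = TRUE, train = as.data.frame(dm))
plotPartial(pd2n)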
Random Forest with Python
- Settings
library(reticulate)
py_home <- py_config()
use_virtualenv(py_home$virtualenv)
knitr::knit_engines$set(python = reticulate::eng_python)
- Install Packages
library(reticulate)
virtualenv_install("py39bert", "scikit-learn")  ## the PyPI package name is scikit-learn, not sklearn
Load Data
import os
path = os.getcwd()
os.chdir("/home/ghowoo/Works/stat")
import pandas as pd
df = pd.read_feather("/home/ghowoo/Works/stat/dt.feather")
from tabulate import tabulate
print(tabulate(df.head(), tablefmt = 'html'))
(Output: the first five rows of df, with columns p0, or1, var1, var1n, ..., or7, var7, var7n, odds0, log_odds, p, outcome.)
Split
- Split dataset into training set and test set
- 70% training and 30% test
X = df[["var1", "var2", "var3", "var4", "var5", "var6", "var7", "var1n", "var2n", "var3n", "var4n", "var5n", "var6n", "var7n"]]
y = df["outcome"]  # a Series, the shape sklearn expects for the target
from sklearn.model_selection import train_test_split
import numpy as np
np.random.seed(123456)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
Training
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier
#Create a random forest classifier
clf=RandomForestClassifier(n_estimators=1000, criterion="entropy", bootstrap=True, max_features='sqrt', random_state=123456)
#print("Parameters currently in use:\n")
#print(clf.get_params())
#Train the model using the training sets
clf.fit(X_train,y_train)
RandomForestClassifier(criterion='entropy', max_features='sqrt', n_estimators=1000, random_state=123456)
y_pred=clf.predict(X_test)
#print(y_pred[0:10])
y_prob=clf.predict_proba(X_test)
#print(y_prob[0:10, 1])
from sklearn import metrics
print("Accuracy of the model: ", metrics.accuracy_score(y_test, y_pred))
#print(dir(clf))
#print(clf.feature_importances_)
Accuracy of the model: 0.8566666666666667
AUC
from sklearn.metrics import roc_auc_score
roc_value = roc_auc_score(y_test, y_prob[:,1])
print(roc_value)
0.9315550729791694
import matplotlib.pyplot as plt
from sklearn import metrics
metrics.RocCurveDisplay.from_estimator(clf, X_test, y_test)  # plot_roc_curve was deprecated in sklearn 1.0
plt.show()
Feature Importance
import pandas as pd
feature_imp = pd.Series(clf.feature_importances_)
feature_names = pd.Series(clf.feature_names_in_)
di = {'variable': feature_names, 'importance': feature_imp}
di = pd.DataFrame(di)
print(tabulate(di, tablefmt='html'))
  | variable | importance |
---|---|---|
0 | var1 | 0.0106177 |
1 | var2 | 0.0109248 |
2 | var3 | 0.0095032 |
3 | var4 | 0.0119873 |
4 | var5 | 0.0123724 |
5 | var6 | 0.0133668 |
6 | var7 | 0.0118842 |
7 | var1n | 0.0645172 |
8 | var2n | 0.0667035 |
9 | var3n | 0.0700848 |
10 | var4n | 0.0876209 |
11 | var5n | 0.106656 |
12 | var6n | 0.138893 |
13 | var7n | 0.384869 |
Hyperparameters
from sklearn.model_selection import RandomizedSearchCV
n_estimators =[int(x) for x in np.linspace(start=200,stop=2000,num=10)]
print(n_estimators)
max_features =['auto', 'sqrt']
max_depth=[int(x) for x in np.linspace(10, 110,num=11)]
print(max_depth)
max_depth.append(None)
min_samples_split=[2,5,10]
min_samples_leaf=[1,2,4]
bootstrap=[True,False]
random_grid={'n_estimators': n_estimators,
'max_features': max_features,
'max_depth': max_depth,
'min_samples_split': min_samples_split,
'min_samples_leaf': min_samples_leaf,
'bootstrap': bootstrap}
import pprint as pp
pp.pprint(random_grid)
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier()
rf_random=RandomizedSearchCV(estimator=rf
, param_distributions=random_grid
, n_iter=300
, cv =5
, verbose=2
, random_state = 123456
, n_jobs = -1)
rf_random.fit(X_train, y_train)
rf_random.best_params_
Classification
# evaluate random forest algorithm for classification
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
# define dataset
X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=3)
# define the model
model = RandomForestClassifier()
# evaluate the model
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))
Prediction
# make predictions using random forest for classification
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
# define dataset
X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=3)
# define the model
model = RandomForestClassifier()
# fit the model on the whole dataset
model.fit(X, y)
# make a single prediction
row = [[-8.52381793,5.24451077,-12.14967704,-2.92949242,0.99314133,0.67326595,-0.38657932,1.27955683,-0.60712621,3.20807316,0.60504151,-1.38706415,8.92444588,-7.43027595,-2.33653219,1.10358169,0.21547782,1.05057966,0.6975331,0.26076035]]
yhat = model.predict(row)
print('Predicted Class: %d' % yhat[0])