Random Forest
Simulate Data
library(data.table)
set.seed(123456)
n <- 5000
dt <- data.table(
    p0 = rep(0.2, n)
  , or1 = rep(1, n)
  , var1 = sample(c(0, 1), size = n, replace = TRUE, prob = c(0.3, 0.7))
  , var1n = rnorm(n, 0, 1)
  , or2 = rep(1.1, n)
  , var2 = sample(c(0, 1), size = n, replace = TRUE, prob = c(0.4, 0.6))
  , var2n = rnorm(n, 0, 2)
  , or3 = rep(1.2, n)
  , var3 = sample(c(0, 1), size = n, replace = TRUE, prob = c(0.2, 0.8))
  , var3n = rnorm(n, 0, 2)
  , or4 = rep(1.5, n)
  , var4 = sample(c(0, 1), size = n, replace = TRUE, prob = c(0.3, 0.7))
  , var4n = rnorm(n, 0, 2)
  , or5 = rep(1.7, n)
  , var5 = sample(c(0, 1), size = n, replace = TRUE, prob = c(0.5, 0.5))
  , var5n = rnorm(n, 0, 2)
  , or6 = rep(2, n)
  , var6 = sample(c(0, 1), size = n, replace = TRUE, prob = c(0.4, 0.6))
  , var6n = rnorm(n, 0, 2)
  , or7 = rep(5, n)
  , var7 = sample(c(0, 1), size = n, replace = TRUE, prob = c(0.1, 0.9))
  , var7n = rnorm(n, 0, 2)
)
dt <- dt[, odds0 := p0 / (1 - p0)
         ][, log_odds := log(odds0) +
                 var1 * log(or1) + var1n * log(or1) + 
                 var2 * log(or2) + var2n * log(or2) + 
                 var3 * log(or3) + var3n * log(or3) + 
                 var4 * log(or4) + var4n * log(or4) + 
                 var5 * log(or5) + var5n * log(or5) + 
                 var6 * log(or6) + var6n * log(or6) + 
                 var7 * log(or7) + var7n * log(or7)
           ][, p := exp(log_odds)/ (1 + exp(log_odds))]
vsample <- function(p){
    sample(c(1, 0), size = 1, replace = TRUE, prob = c(p, 1 - p))
}
vsample <- Vectorize(vsample)
dt <- dt[, outcome := vsample(p)][, outcome := factor(outcome, levels = c(0, 1))]
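The per-row vsample() sampler above can be replaced by a single vectorized Bernoulli draw; an equivalent sketch with rbinom() (note the draws differ from vsample() even under the same seed):
## Fully vectorized alternative to vsample(): one Bernoulli(p) draw per row
outcome_alt <- rbinom(nrow(dt), size = 1, prob = dt$p)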
library(magrittr)  # for %>%
unique(dt[, .(or1, or2, or3, or4, or5, or6, or7)]) %>% prt(caption = "Variables with Odds Ratios")

| or1 | or2 | or3 | or4 | or5 | or6 | or7 |
|---|---|---|---|---|---|---|
| 1 | 1.1 | 1.2 | 1.5 | 1.7 | 2 | 5 |
GLM
m <- glm(outcome ~ var1 + var2 + var3 + var4 + var5 + var6 + var7 +
             var1n + var2n + var3n + var4n + var5n + var6n + var7n
       , data = dt
       , family = binomial
         )
library(sjPlot)
tab_model(m)

| | outcome | | |
|---|---|---|---|
| Predictors | Odds Ratios | CI | p | 
| (Intercept) | 0.14 | 0.09 – 0.21 | <0.001 | 
| var1 | 1.15 | 0.94 – 1.41 | 0.160 | 
| var2 | 1.11 | 0.92 – 1.34 | 0.271 | 
| var3 | 1.41 | 1.12 – 1.78 | 0.004 | 
| var4 | 1.76 | 1.44 – 2.16 | <0.001 | 
| var5 | 1.97 | 1.64 – 2.38 | <0.001 | 
| var6 | 2.34 | 1.93 – 2.85 | <0.001 | 
| var7 | 5.44 | 4.01 – 7.42 | <0.001 | 
| var1n | 1.05 | 0.95 – 1.15 | 0.344 | 
| var2n | 1.11 | 1.06 – 1.17 | <0.001 | 
| var3n | 1.20 | 1.14 – 1.26 | <0.001 | 
| var4n | 1.50 | 1.42 – 1.58 | <0.001 | 
| var5n | 1.69 | 1.61 – 1.79 | <0.001 | 
| var6n | 2.10 | 1.97 – 2.23 | <0.001 | 
| var7n | 5.15 | 4.67 – 5.71 | <0.001 | 
| Observations | 5000 | ||
| R2 Tjur | 0.620 | ||
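For later comparison with the random forest's discrimination, the GLM's in-sample AUC can be computed as well; a minimal sketch with pROC (assuming the fitted m and dt from above):
library(pROC)
## in-sample AUC of the logistic model
roc(dt$outcome, predict(m, type = "response"), ci = TRUE, direction = "<")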
Random Forest with R
- R randomForest package
 
Variable Importance
The same simulated dataset dt from above is reused here.
predictors <- c(
    "var1"
  , "var2"
  , "var3"
  , "var4"
  , "var5"
  , "var6"
  , "var7"
  , "var1n"
  , "var2n"
  , "var3n"
  , "var4n"
  , "var5n"
  , "var6n"
  , "var7n"
)
frml <- Wu::wu_formula(outcome = "", predictors = predictors)
mmx <- model.matrix.lm(frml, data = dt, na.action = "na.pass")
cn <- colnames(mmx)
frml <- Wu::wu_formula(outcome = "outcome", predictors = cn[-1])
dm <- cbind(dt[, .(outcome)], mmx[,-1])
library(randomForest)
## library(doParallel)
set.seed(123456)
## cl <- makePSOCKcluster(detectCores() - 2)
## registerDoParallel(cl)
rf <- randomForest(frml
                 , data = dm
                 , ntree = 2000
                 , mtry = 2
                 , sampsize = ceiling(0.632 * nrow(dt))
                 , importance = TRUE
                 , na.action = na.roughfix
                   )
## stopCluster(cl)
plot(rf)

vi <- importance(rf, type = 1)
vit <- data.table(variable = attr(vi, "dimnames")[[1]], vi)
colnames(vit) <- c("variable", "MeanDecreaseAccuracy")
vit <- vit[order(MeanDecreaseAccuracy)]
vit <- vit[, variable := factor(variable, levels = vit$variable)]
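For comparison with the accuracy-based measure plotted below, importance(rf, type = 2) returns the impurity-based MeanDecreaseGini; a minimal sketch on the same rf object:
## impurity-based importance (MeanDecreaseGini), sorted descending
vi_gini <- importance(rf, type = 2)
vi_gini[order(-vi_gini[, 1]), , drop = FALSE]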
plt_varimp(data = vit, var_name = variable
         , var_imp = MeanDecreaseAccuracy
         , xlabel = "Mean Decrease Accuracy"
         , ylabel = "Variable"
           )
Grid Search
hyper_grid <- expand.grid(
  mtry       = seq(2, 16, by = 2),
  node_size  = seq(1, 15, by = 5),
  sample_size = round(c(0.632, seq(0.5, 0.9, by=0.1)) * nrow(dt)),
  oob_err=as.numeric(NA)
)
setDT(hyper_grid)
dim(hyper_grid)
## Smoke-test one configuration before looping over the full grid
i <- 1
m <- randomForest(
    formula = frml
  , data = dt
  , ntree = 500
  , mtry = hyper_grid$mtry[i]
  , nodesize = hyper_grid$node_size[i]
  , sampsize = hyper_grid$sample_size[i]
  , replace = TRUE
)
for(i in 1:nrow(hyper_grid)) {
    print(i)
    set.seed(20220202)
    m <- randomForest(
        formula = frml
      , data = dt
      , ntree = 500
      , mtry = hyper_grid$mtry[i]
      , nodesize = hyper_grid$node_size[i]
      , sampsize = hyper_grid$sample_size[i]
      , replace = TRUE
    )
    ## median OOB error across the ntree cumulative values
    ## (the final value m$err.rate[m$ntree, "OOB"] is another common summary)
    oob <- median(m$err.rate[, "OOB"])
    print(oob)
    hyper_grid$oob_err[i] <- oob
}
hyper_grid <- hyper_grid[order(oob_err)]
hyper_grid[1:10, ]
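With the grid now ranked by OOB error, one would refit using the top row before any further evaluation; a minimal sketch (assuming hyper_grid, frml, and dt from above):
## refit with the best combination (first row after ordering by OOB error)
best <- hyper_grid[1]
rf_best <- randomForest(frml
                      , data = dt
                      , ntree = 500
                      , mtry = best$mtry
                      , nodesize = best$node_size
                      , sampsize = best$sample_size
                      , replace = TRUE)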
which.min(hyper_grid$oob_err)
randomForest::tuneRF
- Tune mtry, the number of variables randomly sampled as candidates at each split; a quick sketch on the simulated data follows, then the call on the real dataset.
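A minimal sketch of tuneRF() on the simulated data (assuming dm from the variable-importance section; OOB error is re-evaluated as mtry is scaled by stepFactor until the improvement falls below improve):
## tune mtry on the simulated data: start at 2, scale by 1.5 while OOB error improves by >= 1%
set.seed(123456)
tn_sim <- tuneRF(
    x = dm[, !"outcome"]
  , y = dm$outcome
  , mtryStart = 2
  , ntreeTry = 500
  , stepFactor = 1.5
  , improve = 0.01
  , trace = TRUE
  , plot = TRUE
)
tn_sim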
 
rm_dt()
dtt <- readRDS(file = "_matrix_test.RDS")
dt <- readRDS(file = "_matrix_train.RDS")
dt <- dt[, recurrence := factor(recurrence, levels=c(0, 1))]
dt[, .N, by = .(recurrence)]
frml <- Wu::wu_formula(outcome = "recurrence", predictors = colnames(dt)[-1])
library(randomForest)
colnames(dt)
X <- copy(dt)[, recurrence := NULL]
Y <- dt$recurrence
table(Y)
class(Y)
tn <- tuneRF(
    x=X
  , y=Y
  , mtryStart=5
  , ntreeTry=50000
  , stepFactor=1.5
  , improve=0.0000001
  , trace=TRUE
  , plot=TRUE
)
tn
set.seed(2022)
rf <- randomForest(frml
                 , data = dt
                 , ntree = 10000
                 , mtry = 2
                 ## , nodesize = 180
                 , sampsize = ceiling(0.5 * nrow(dt))
                 , replace = TRUE
                 ## , importance = TRUE
                 ## , do.trace = 20
                 ## , na.action = na.roughfix
                 ## , proximity = TRUE
                   )
## predict() without newdata returns the out-of-bag class probabilities
pred <- predict(object = rf, type = "prob")
library(pROC)
r <- roc(dt$recurrence, pred[, 2], ci = TRUE, direction = "<")
r
pred_test <- predict(object = rf, newdata = dtt, type = "prob")
library(pROC)
r_test <- roc(dtt$recurrence, pred_test[, 2], ci = TRUE, direction = "<")
r_test
set.seed(2022)
rf <- randomForest(frml
                 , data = dt
                 , ntree = 2000
                 , mtry = 2
                 , nodesize = 180
                 , sampsize = ceiling(0.632 * nrow(dt))
                 , replace = TRUE
                 , importance = TRUE
                 , na.action = na.roughfix
                 , proximity = TRUE
                   )
caret::train
library(caret)
library(dplyr)  # for case_when()
dt <- readRDS(file = "_matrix_train.RDS")
dt <- dt[, recurrence := factor(case_when(recurrence == 1 ~ "Stone"
                                        , TRUE ~ "No")
                              , levels = c("No", "Stone"))]
frml <- Wu::wu_formula(outcome = "recurrence", predictors = colnames(dt)[-1])
control <- trainControl(
    method = "repeatedcv"
  , number = 10
  , repeats = 3
  , search = "random"
  , classProbs = TRUE
  , summaryFunction = twoClassSummary  # required for metric = "ROC" in train()
)
library(doParallel)
cl <- makeCluster(8)
registerDoParallel(cl)
set.seed(2022)
rf_random <- train(frml
                 , data=dt
                 , method="rf"
                 , metric='ROC'
                 , tuneLength=15
                 , trControl=control
                   )
stopCluster(cl)
rf_random
Find max depth
rm_dt()
dt <- readRDS(file = "_matrix_train.RDS")
dt <- dt[, recurrence := factor(recurrence, levels=c(0, 1))]
frml <- Wu::wu_formula(outcome = "recurrence", predictors = colnames(dt)[-1])
library(pROC)
library(Wu)
library(randomForest)
library(doParallel)
md <- expand.grid(mtry=3
               , nodesize=seq(180, 200, 10)
               , ntree=c(5000)
               , auc=0)
## md <- expand.grid(mtry=(4:5)
##                , nodesize=seq(5, 10, 5)
##                , ntree=c(500)
##                , auc=0)
for(i in 1:nrow(md)){
    nodesize <- md$nodesize[i]
    ntree <- md$ntree[i]
    mtry <- md$mtry[i]
    print(c(as.character(mtry), as.character(nodesize)))
    if(exists("rf")){rm(list=c("rf"))}
    ## note: randomForest() itself is single-threaded; this cluster is not used by the fit
    cl <- makePSOCKcluster(6)
    registerDoParallel(cl)
    rf <- randomForest(x=dt[, -1]
                     , y=dt$recurrence
                     , ntree = ntree
                     , mtry = mtry
                     , nodesize=nodesize
                     , sampsize = ceiling(0.632 * nrow(dt))
                     , replace=TRUE
                     , importance = FALSE
                     , na.action = na.roughfix
                     , proximity = TRUE
                       )
    stopCluster(cl)
    pred <- predict(object = rf, type = "prob")
    r <- roc(dt$recurrence, pred[, 2], ci = TRUE, direction = "<")
    md$auc[i] <- r$auc
    print(r$auc)
}
saveRDS(md, file="randomforestCV.RDS")
setDT(md)
print(md)
print(class(md))
print(md[, mean(auc), by = .(mtry)])
print(md[, mean(auc), by = .(nodesize)])
library(ggplot2)
ggplot(md, aes(x = mtry, y = nodesize, size = auc)) + geom_point()
Partial Dependence Plot
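A single-variable partial dependence plot is the simplest case; a minimal sketch with pdp (assuming the rf fit on the simulated data, and using prob = TRUE to plot on the probability scale):
library(pdp)
## partial dependence of the predicted probability on var7n alone
pd1 <- partial(rf, pred.var = "var7n", prob = TRUE)
plotPartial(pd1)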
Partial Dependence Plot with Two Variables
library(pdp)
library(doMC)
registerDoMC(12)
pl <- partial(rf, pred.var = c("var3n", "var5"), grid.resolution = NULL, parallel = TRUE)
plotPartial(pl)
Partial Dependence Plot with Two Continuous Variables
## library(doParallel)
library(pdp)
## cl <- makePSOCKcluster(detectCores() - 2)
## registerDoParallel(cl)
registerDoMC(12)
plc <- partial(rf, pred.var = c("var3n", "var5n"), parallel = TRUE)
## stopCluster(cl)
plotPartial(plc)
Random Forest with Python
- Settings
 
 library(reticulate)
 py_home <- py_config()
 use_virtualenv(py_home$virtualenv)
 knitr::knit_engines$set(python = reticulate::eng_python)
- Install Packages
 
 library(reticulate)
 virtualenv_install("py39bert", "scikit-learn")
Load Data
import os
path = os.getcwd()
os.chdir("/home/ghowoo/Works/stat")
import pandas as pd
df = pd.read_feather("/home/ghowoo/Works/stat/dt.feather")
from tabulate import tabulate
print(tabulate(df.head(), tablefmt = 'html'))

(First five rows of df: p0, or1, var1, var1n, ..., or7, var7, var7n, odds0, log_odds, p, outcome.)
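The dt.feather file read above is assumed to have been exported from the R session; a hypothetical sketch with the arrow package (assuming the simulated dt):
## hypothetical export of the simulated data.table for the Python section
library(arrow)
write_feather(dt, "/home/ghowoo/Works/stat/dt.feather")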
Split
- Split the dataset into a training set and a test set
 - 70% training and 30% test
 
X = df[["var1", "var2", "var3", "var4", "var5", "var6", "var7", "var1n", "var2n", "var3n", "var4n", "var5n", "var6n", "var7n"]]
y = df["outcome"]  # a Series (not a one-column DataFrame) avoids sklearn's DataConversionWarning on fit()
from sklearn.model_selection import train_test_split
import numpy as np
np.random.seed(123456)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
Training
# Import the random forest model
from sklearn.ensemble import RandomForestClassifier
# Create a random forest classifier
clf = RandomForestClassifier(n_estimators=1000, criterion="entropy", bootstrap=True, max_features='sqrt', random_state=123456)
#print("Parameters currently in use:\n")
#print(clf.get_params())
# Train the model on the training set
clf.fit(X_train, y_train)
RandomForestClassifier(criterion='entropy', max_features='sqrt', n_estimators=1000, random_state=123456)
y_pred=clf.predict(X_test)
#print(y_pred[0:10])
y_prob=clf.predict_proba(X_test)
#print(y_prob[0:10, 1])
from sklearn import metrics
print("Accuracy of the model: ", metrics.accuracy_score(y_test, y_pred))
#print(dir(clf))
#print(clf.feature_importances_)
Accuracy of the model: 0.8566666666666667
AUC
from sklearn.metrics import roc_auc_score
roc_value = roc_auc_score(y_test, y_prob[:,1])
print(roc_value)
0.9315550729791694
import matplotlib.pyplot as plt
from sklearn import metrics
# plot_roc_curve was deprecated in scikit-learn 1.0; RocCurveDisplay is the replacement
metrics.RocCurveDisplay.from_estimator(clf, X_test, y_test)
plt.show()
Feature Importance
import pandas as pd
feature_imp = pd.Series(clf.feature_importances_)
feature_names = pd.Series(clf.feature_names_in_)
di = {'variable': feature_names, 'importance': feature_imp}
di = pd.DataFrame(di)
print(tabulate(di, tablefmt='html'))

| | variable | importance |
|---|---|---|
| 0 | var1 | 0.0106177 |
| 1 | var2 | 0.0109248 | 
| 2 | var3 | 0.0095032 | 
| 3 | var4 | 0.0119873 | 
| 4 | var5 | 0.0123724 | 
| 5 | var6 | 0.0133668 | 
| 6 | var7 | 0.0118842 | 
| 7 | var1n | 0.0645172 | 
| 8 | var2n | 0.0667035 | 
| 9 | var3n | 0.0700848 | 
| 10 | var4n | 0.0876209 | 
| 11 | var5n | 0.106656 | 
| 12 | var6n | 0.138893 | 
| 13 | var7n | 0.384869 | 
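Impurity-based importances can favor high-cardinality features; scikit-learn's permutation_importance is a common complement. A minimal sketch on the held-out test set (assuming clf, X_test, and y_test from above):
# permutation importance: drop in score when each feature is shuffled
from sklearn.inspection import permutation_importance
perm = permutation_importance(clf, X_test, y_test, n_repeats=10, random_state=123456)
for name, imp in sorted(zip(X.columns, perm.importances_mean), key=lambda t: -t[1]):
    print(f"{name}: {imp:.4f}")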
Hyperparameters
from sklearn.model_selection import RandomizedSearchCV
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
print(n_estimators)
max_features = ['auto', 'sqrt']  # 'auto' is deprecated in newer scikit-learn; 'sqrt' is the usual choice
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
print(max_depth)
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
random_grid={'n_estimators': n_estimators,
             'max_features': max_features,
             'max_depth': max_depth,
             'min_samples_split': min_samples_split,
             'min_samples_leaf': min_samples_leaf,
             'bootstrap': bootstrap}
import pprint as pp
pp.pprint(random_grid)
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator=rf
                             , param_distributions=random_grid
                             , n_iter=300
                             , cv=5
                             , verbose=2
                             , random_state=123456
                             , n_jobs=-1)
rf_random.fit(X_train, y_train)
rf_random.best_params_
Classification
# evaluate random forest algorithm for classification
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
# define dataset
X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=3)
# define the model
model = RandomForestClassifier()
# evaluate the model
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))
Prediction
# make predictions using random forest for classification
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
# define dataset
X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=3)
# define the model
model = RandomForestClassifier()
# fit the model on the whole dataset
model.fit(X, y)
# make a single prediction
row = [[-8.52381793,5.24451077,-12.14967704,-2.92949242,0.99314133,0.67326595,-0.38657932,1.27955683,-0.60712621,3.20807316,0.60504151,-1.38706415,8.92444588,-7.43027595,-2.33653219,1.10358169,0.21547782,1.05057966,0.6975331,0.26076035]]
yhat = model.predict(row)
print('Predicted Class: %d' % yhat[0])
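For a probability rather than a hard class label, predict_proba returns per-class probabilities for the same row:
# per-class probabilities for the same single row (assumes the fitted model above)
yhat_prob = model.predict_proba(row)
print('Predicted probabilities: %s' % yhat_prob[0])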