Random Forest

Simulate Data

library(data.table)

set.seed(123456)
n <- 5000
dt <- data.table(
    p0 = rep(0.2, n)
  , or1 = rep(1, n)
  , var1 = sample(c(0, 1), size = n, replace = TRUE, prob = c(0.3, 0.7))
  , var1n = rnorm(n, 0, 1)
  , or2 = rep(1.1, n)
  , var2 = sample(c(0, 1), size = n, replace = TRUE, prob = c(0.4, 0.6))
  , var2n = rnorm(n, 0, 2)
  , or3 = rep(1.2, n)
  , var3 = sample(c(0, 1), size = n, replace = TRUE, prob = c(0.2, 0.8))
  , var3n = rnorm(n, 0, 2)
  , or4 = rep(1.5, n)
  , var4 = sample(c(0, 1), size = n, replace = TRUE, prob = c(0.3, 0.7))
  , var4n = rnorm(n, 0, 2)
  , or5 = rep(1.7, n)
  , var5 = sample(c(0, 1), size = n, replace = TRUE, prob = c(0.5, 0.5))
  , var5n = rnorm(n, 0, 2)
  , or6 = rep(2, n)
  , var6 = sample(c(0, 1), size = n, replace = TRUE, prob = c(0.4, 0.6))
  , var6n = rnorm(n, 0, 2)
  , or7 = rep(5, n)
  , var7 = sample(c(0, 1), size = n, replace = TRUE, prob = c(0.1, 0.9))
  , var7n = rnorm(n, 0, 2)
)


dt <- dt[, odds0 := p0 / (1 - p0)
         ][, log_odds := log(odds0) +
                 var1 * log(or1) + var1n * log(or1) + 
                 var2 * log(or2) + var2n * log(or2) + 
                 var3 * log(or3) + var3n * log(or3) + 
                 var4 * log(or4) + var4n * log(or4) + 
                 var5 * log(or5) + var5n * log(or5) + 
                 var6 * log(or6) + var6n * log(or6) + 
                 var7 * log(or7) + var7n * log(or7)
           ][, p := exp(log_odds)/ (1 + exp(log_odds))]

vsample <- function(p){
    sample(c(1, 0), size = 1, replace = TRUE, prob = c(p, 1 - p))
}

vsample <- Vectorize(vsample)

dt <- dt[, outcome := vsample(p)][, outcome := factor(outcome, levels = c(0, 1))]

unique(dt[, .(or1, or2, or3, or4, or5, or6, or7)]) %>% prt(caption = "Variables with Odds Ratios")
Variables with Odds Ratios
or1 or2 or3 or4 or5 or6 or7
1 1.1 1.2 1.5 1.7 2 5

GLM

  outcome
Predictors Odds Ratios CI p
(Intercept) 0.14 0.09 – 0.21 <0.001
var1 1.15 0.94 – 1.41 0.160
var2 1.11 0.92 – 1.34 0.271
var3 1.41 1.12 – 1.78 0.004
var4 1.76 1.44 – 2.16 <0.001
var5 1.97 1.64 – 2.38 <0.001
var6 2.34 1.93 – 2.85 <0.001
var7 5.44 4.01 – 7.42 <0.001
var1n 1.05 0.95 – 1.15 0.344
var2n 1.11 1.06 – 1.17 <0.001
var3n 1.20 1.14 – 1.26 <0.001
var4n 1.50 1.42 – 1.58 <0.001
var5n 1.69 1.61 – 1.79 <0.001
var6n 2.10 1.97 – 2.23 <0.001
var7n 5.15 4.67 – 5.71 <0.001
Observations 5000
R2 Tjur 0.620

Random Forest with R

Variable Importance

library(data.table)

set.seed(123456)
n <- 5000
dt <- data.table(
    p0 = rep(0.2, n)
  , or1 = rep(1, n)
  , var1 = sample(c(0, 1), size = n, replace = TRUE, prob = c(0.3, 0.7))
  , var1n = rnorm(n, 0, 1)
  , or2 = rep(1.1, n)
  , var2 = sample(c(0, 1), size = n, replace = TRUE, prob = c(0.4, 0.6))
  , var2n = rnorm(n, 0, 2)
  , or3 = rep(1.2, n)
  , var3 = sample(c(0, 1), size = n, replace = TRUE, prob = c(0.2, 0.8))
  , var3n = rnorm(n, 0, 2)
  , or4 = rep(1.5, n)
  , var4 = sample(c(0, 1), size = n, replace = TRUE, prob = c(0.3, 0.7))
  , var4n = rnorm(n, 0, 2)
  , or5 = rep(1.7, n)
  , var5 = sample(c(0, 1), size = n, replace = TRUE, prob = c(0.5, 0.5))
  , var5n = rnorm(n, 0, 2)
  , or6 = rep(2, n)
  , var6 = sample(c(0, 1), size = n, replace = TRUE, prob = c(0.4, 0.6))
  , var6n = rnorm(n, 0, 2)
  , or7 = rep(5, n)
  , var7 = sample(c(0, 1), size = n, replace = TRUE, prob = c(0.1, 0.9))
  , var7n = rnorm(n, 0, 2)
)


dt <- dt[, odds0 := p0 / (1 - p0)
         ][, log_odds := log(odds0) +
                 var1 * log(or1) + var1n * log(or1) + 
                 var2 * log(or2) + var2n * log(or2) + 
                 var3 * log(or3) + var3n * log(or3) + 
                 var4 * log(or4) + var4n * log(or4) + 
                 var5 * log(or5) + var5n * log(or5) + 
                 var6 * log(or6) + var6n * log(or6) + 
                 var7 * log(or7) + var7n * log(or7)
           ][, p := exp(log_odds)/ (1 + exp(log_odds))]

vsample <- function(p){
    sample(c(1, 0), size = 1, replace = TRUE, prob = c(p, 1 - p))
}

vsample <- Vectorize(vsample)

dt <- dt[, outcome := vsample(p)][, outcome := factor(outcome, levels = c(0, 1))]


predictors <- c(
    "var1"
  , "var2"
  , "var3"
  , "var4"
  , "var5"
  , "var6"
  , "var7"
  , "var1n"
  , "var2n"
  , "var3n"
  , "var4n"
  , "var5n"
  , "var6n"
  , "var7n"
)

frml <- Wu::wu_formula(outcome = "", predictors = predictors)
mmx <- model.matrix.lm(frml, data = dt, na.action = "na.pass")
cn <- colnames(mmx)

frml <- Wu::wu_formula(outcome = "outcome", predictors = cn[-1])
dm <- cbind(dt[, .(outcome)], mmx[,-1])

library(randomForest)
## library(doParallel)


set.seed(123456)
## cl <- makePSOCKcluster(detectCores() - 2)
## registerDoParallel(cl)

rf <- randomForest(frml
                 , data = dm
                 , ntree = 2000
                 , mtry = 2
                 , sampsize = ceiling(0.632 * nrow(dt))
                 , importance = TRUE
                 , na.action = na.roughfix
                   )

## stopCluster(cl)

plot(rf)

AUC on the Traning Data

Random Forest with Python

  • Settings
 library(reticulate)
 py_home <- py_config()
 use_virtualenv(py_home$virtualenv)
 knitr::knit_engines$set(python = reticulate::eng_python)
  • Install Packages
 library(reticulate)
 virtualenv_install("py39bert", "sklearn")

Load Data

0 0.2 1 0 0.299872 1.1 1 1.47911 1.2 0 -0.47626 1.5 1 -0.0203482 1.7 0 -1.49676 2 0 -1.14178 5 1 -2.11487 0.25 -4.2196 0.0144914 0
1 0.2 1 0 -0.528336 1.1 1 3.56512 1.2 0 3.12073 1.5 1 3.92747 1.7 0 -0.681116 2 1 1.09368 5 1 -3.60926 0.25 -1.49394 0.183331 0
2 0.2 1 1 -0.0378593 1.1 1 -3.06997 1.2 1 1.57552 1.5 1 -2.30824 1.7 1 -0.259105 2 0 -3.54988 5 1 -2.4156 0.25 -5.99023 0.00249683 0
3 0.2 1 1 -0.524894 1.1 1 0.00541474 1.2 0 0.188444 1.5 1 0.309538 1.7 0 1.87648 2 0 2.04869 5 0 0.682341 0.25 2.7888 0.942068 1
4 0.2 1 1 -0.263793 1.1 0 0.617045 1.2 1 -0.39382 1.5 1 -0.552127 1.7 0 3.83401 2 1 0.656636 5 1 0.97144 0.25 5.32027 0.995132 1

AUC

0.9315550729791694

<sklearn.metrics._plot.roc_curve.RocCurveDisplay object at 0x7fa5c50b6c70>

/home/ghowoo/py39bert/lib/python3.9/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function plot_roc_curve is deprecated; Function :func:plot_roc_curve is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: :meth:sklearn.metric.RocCurveDisplay.from_predictions or :meth:sklearn.metric.RocCurveDisplay.from_estimator. warnings.warn(msg, category=FutureWarning)

Feature Importance

0 var1 0.0106177
1 var2 0.0109248
2 var3 0.0095032
3 var4 0.0119873
4 var5 0.0123724
5 var6 0.0133668
6 var7 0.0118842
7 var1n 0.0645172
8 var2n 0.0667035
9 var3n 0.0700848
10 var4n 0.0876209
11 var5n 0.106656
12 var6n 0.138893
13 var7n 0.384869