LASSO: Least Absolute Shrinkage and Selection Operator
Simulate data
library(data.table)
set.seed(123456)
n <- 1000
dt <- data.table(
p0 = rep(0.2, n)
, or1 = rep(1, n)
, var1 = sample(c(0, 1), size = n, replace = TRUE, prob = c(0.3, 0.7))
, or2 = rep(1.1, n)
, var2 = sample(c(0, 1), size = n, replace = TRUE, prob = c(0.4, 0.6))
, or3 = rep(1.2, n)
, var3 = sample(c(0, 1), size = n, replace = TRUE, prob = c(0.2, 0.8))
, or4 = rep(1.5, n)
, var4 = sample(c(0, 1), size = n, replace = TRUE, prob = c(0.3, 0.7))
, or5 = rep(1.7, n)
, var5 = sample(c(0, 1), size = n, replace = TRUE, prob = c(0.5, 0.5))
, or6 = rep(2, n)
, var6 = sample(c(0, 1), size = n, replace = TRUE, prob = c(0.4, 0.6))
, or7 = rep(5, n)
, var7 = sample(c(0, 1), size = n, replace = TRUE, prob = c(0.1, 0.9))
)
dt <- dt[, odds0 := p0 / (1 - p0)
][, log_odds := log(odds0) +
var1 * log(or1) +
var2 * log(or2) +
var3 * log(or3) +
var4 * log(or4) +
var5 * log(or5) +
var6 * log(or6) +
var7 * log(or7)
][, p := exp(log_odds)/ (1 + exp(log_odds))]
vsample <- function(p){
sample(c(1, 0), size = 1, replace = TRUE, prob = c(p, 1 - p))
}
vsample <- Vectorize(vsample)
dt <- dt[, outcome := vsample(p)]
unique(dt[, .(or1, or2, or3, or4, or5, or6, or7)]) %>% prt(caption = "Variables with Odds Ratios")
or1 | or2 | or3 | or4 | or5 | or6 | or7 |
---|---|---|---|---|---|---|
1 | 1.1 | 1.2 | 1.5 | 1.7 | 2 | 5 |
GLM logistic regression on aggregated data
- The cbind method formula gives glm the numbers in each categories
dt1 <- data.table(
disease = c(55,42)
, healthy = c(67,34)
, treatment = c(1,0)
)
m1 <- glm(cbind(disease,healthy) ~ treatment
, family = binomial, data = dt1)
library(sjPlot)
tab_model(m1)
 | cbind(disease, healthy) | ||
---|---|---|---|
Predictors | Odds Ratios | CI | p |
(Intercept) | 1.24 | 0.79 – 1.95 | 0.360 |
treatment | 0.66 | 0.37 – 1.18 | 0.164 |
Observations | 2 |
dt2 <- data.table(
disease = c(1, 1, 0, 0)
, treatment = c(1, 0, 1, 0)
, weight = c(55, 42, 67, 34)
)
m2 <- glm(disease ~ treatment
, family = binomial
, data = dt2
, weights = weight)
library(sjPlot)
tab_model(m2)
 | disease | ||
---|---|---|---|
Predictors | Odds Ratios | CI | p |
(Intercept) | 1.24 | 0.79 – 1.95 | 0.360 |
treatment | 0.66 | 0.37 – 1.18 | 0.164 |
Observations | 4 | ||
R2 Tjur | 0.000 |
GLM
m <- glm(outcome ~ var1 + var2 + var3 + var4 + var5 + var6 + var7
, data = dt
, family = binomial
)
library(sjPlot)
tab_model(m)
 | outcome | ||
---|---|---|---|
Predictors | Odds Ratios | CI | p |
(Intercept) | 0.18 | 0.09 – 0.37 | <0.001 |
var1 | 1.05 | 0.74 – 1.47 | 0.796 |
var2 | 1.52 | 1.11 – 2.09 | 0.009 |
var3 | 1.51 | 1.03 – 2.19 | 0.031 |
var4 | 1.67 | 1.21 – 2.31 | 0.002 |
var5 | 1.55 | 1.13 – 2.12 | 0.007 |
var6 | 1.94 | 1.42 – 2.65 | <0.001 |
var7 | 5.04 | 3.16 – 8.10 | <0.001 |
Observations | 1000 | ||
R2 Tjur | 0.097 |
glmnet
- \(min_{\beta_0, \beta} \frac{1}{N} \sum_{i=1}^N w_il(y_i, \beta_0 + \beta^Tx_i) + \lambda[(1 - \alpha)||\beta||_2^2 + \alpha||\beta||_1]\)
- L1 penalty: \(\alpha = 1\)
mx <- dt[, .(var1, var2, var3, var4, var5, var6, var7)]
mx <- as.matrix(mx)
## ss <- scale(mmx)
## scale_scale <- attr(ss, 'scaled:scale')
set.seed(123456)
library(glmnet)
lss <- glmnet(x = mx
, y = dt$outcome
, family = "binomial"
, alpha = 1
, standardize = TRUE
)
library(plotmo)
plot_glmnet(lss, xvar = "rlambda")
Cross Validation
t1 <- Sys.time()
library(doParallel)
cl <- makePSOCKcluster(6)
registerDoParallel(cl)
set.seed(123456)
cvfit <- cv.glmnet(x = mx
, y = dt$outcome
, type.measure = "auc"
, family = "binomial"
, standardize = TRUE
, parallel = TRUE
)
stopCluster(cl)
t2 <- Sys.time()
print(t2 - t1)
Time difference of 1.967159 secs
The Selected Model
glmcoef <- coef(lss, cvfit$lambda.1se)
betas <- data.table(Variable = glmcoef@Dimnames[[1]]
, beta = as.matrix(glmcoef)
)
colnames(betas) <- c("Variable", "beta")
betas <- betas[, OR := exp(beta)]
betas %>% prt(digits = c(0, 6, 6, 6), caption = "Beta Coefficients of the LASSO Selected Model")
Variable | beta | OR |
---|---|---|
(Intercept) | 0.225767 | 1.253284 |
var1 | 0.000000 | 1.000000 |
var2 | 0.000000 | 1.000000 |
var3 | 0.000000 | 1.000000 |
var4 | 0.018369 | 1.018538 |
var5 | 0.004475 | 1.004485 |
var6 | 0.155754 | 1.168538 |
var7 | 1.001301 | 2.721821 |
AUC of The Selected Model
- For plotly output to set chunk option
out.height='200%'
Adaptive LASSO
set.seed(123456)
library(glmnet)
ridge <- glmnet(x = mx
, y = dt$outcome
, family = "binomial"
, alpha = 0
, standardize = TRUE
)
ridge_cv <- cv.glmnet(x = mx
, y = dt$outcome
, alpha = 0
, type.measure = "auc"
, family = "binomial"
, standardize = TRUE
)
best_ridge_coef <- as.numeric(coef(ridge, s = ridge_cv$lambda.1se))[-1]
t1 <- Sys.time()
library(doParallel)
cl <- makePSOCKcluster(6)
registerDoParallel(cl)
set.seed(20210817)
cvfit <- cv.glmnet(x = mx
, y = dt$outcome
, alpha = 1
, penalty.factor = 1 / abs(best_ridge_coef)
, type.measure = "auc"
, family = "binomial"
, standardize=TRUE
, parallel = TRUE
)
stopCluster(cl)
t2 <- Sys.time()
## print(t2 - t1)
lss_adp <- glmnet(x = mx
, y = dt$outcome
, alpha = 1
, penalty.factor = 1 / abs(best_ridge_coef)
, family = "binomial"
, standardize = TRUE
)
cfs <- lapply(cvfit$lambda, function(x){
l <- coef(lss_adp, x)
invisible(l@Dimnames[[1]][l@i + 1])
})
ls <- do.call(c, cfs)
ls <- unique(ls, fromLast = FALSE)[-1]
ls %>% prt(caption = "Order of Variables Selected", col.name = "Variable Name")
Variable Name |
---|
var7 |
var6 |
var4 |
var5 |
var2 |
var3 |
glmcoef <- coef(lss_adp, cvfit$lambda.1se)
betas <- data.table(Variable = glmcoef@Dimnames[[1]]
, beta = as.matrix(glmcoef)
)
colnames(betas) <- c("Variable", "beta")
betas <- betas[, OR := exp(beta)]
betas %>% prt(digits = c(0, 6, 6, 6), caption = "Beta Coefficients from Adaptive LASSO")
Variable | beta | OR |
---|---|---|
(Intercept) | -0.347459 | 0.706481 |
var1 | 0.000000 | 1.000000 |
var2 | 0.000000 | 1.000000 |
var3 | 0.000000 | 1.000000 |
var4 | 0.116261 | 1.123289 |
var5 | 0.059358 | 1.061155 |
var6 | 0.309546 | 1.362806 |
var7 | 1.451995 | 4.271626 |
R sessionInfo
R version 4.2.0 (2022-04-22) Platform: x86_64-pc-linux-gnu (64-bit) Running under: Ubuntu 20.04.3 LTS
Matrix products: default BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0 LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
locale: [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
[4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
[7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
[10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
attached base packages: [1] parallel stats graphics grDevices utils datasets methods
[8] base
other attached packages: [1] pROC_1.18.0 doParallel_1.0.17 iterators_1.0.14
[4] foreach_1.5.2 plotmo_3.6.2 TeachingDemos_2.12 [7] plotrix_3.8-2 Formula_1.2-4 glmnet_4.1-6
[10] sjPlot_2.8.11 Wu_0.0.0.9000 flexdashboard_0.6.0 [13] lme4_1.1-30 Matrix_1.4-0 mgcv_1.8-38
[16] nlme_3.1-152 png_0.1-7 scales_1.2.0
[19] nnet_7.3-16 labelled_2.9.1 kableExtra_1.3.4
[22] plotly_4.10.0 gridExtra_2.3 ggplot2_3.3.6
[25] DT_0.24 tableone_0.13.2 magrittr_2.0.3
[28] lubridate_1.8.0 dplyr_1.0.9 plyr_1.8.7
[31] data.table_1.14.2 rmdformats_1.0.4 knitr_1.39
loaded via a namespace (and not attached): [1] TH.data_1.1-1 minqa_1.2.4 colorspace_2.0-3 ellipsis_0.3.2
[5] sjlabelled_1.2.0 estimability_1.4.1 parameters_0.18.2 rstudioapi_0.13
[9] fansi_1.0.3 mvtnorm_1.1-3 xml2_1.3.3 codetools_0.2-18
[13] splines_4.2.0 cachem_1.0.6 sjmisc_2.8.9 jsonlite_1.8.0
[17] nloptr_2.0.3 ggeffects_1.1.3 broom_1.0.0 effectsize_0.7.0.5 [21] compiler_4.2.0 httr_1.4.4 sjstats_0.18.1 emmeans_1.8.4-1
[25] backports_1.4.1 assertthat_0.2.1 fastmap_1.1.0 lazyeval_0.2.2
[29] survey_4.1-1 cli_3.3.0 htmltools_0.5.3 tools_4.2.0
[33] coda_0.19-4 gtable_0.3.0 glue_1.6.2 Rcpp_1.0.9
[37] jquerylib_0.1.4 vctrs_0.4.1 svglite_2.1.0 crosstalk_1.2.0
[41] insight_0.18.2 xfun_0.32 stringr_1.4.0 rvest_1.0.2
[45] lifecycle_1.0.1 klippy_0.0.0.9500 MASS_7.3-54 zoo_1.8-10
[49] hms_1.1.1 sandwich_3.0-2 yaml_2.3.5 sass_0.4.2
[53] stringi_1.7.8 highr_0.9 bayestestR_0.12.1 boot_1.3-28
[57] shape_1.4.6 rlang_1.0.4 pkgconfig_2.0.3 systemfonts_1.0.4 [61] evaluate_0.16 lattice_0.20-45 purrr_0.3.4 htmlwidgets_1.5.4 [65] tidyselect_1.1.2 bookdown_0.28 R6_2.5.1 generics_0.1.3
[69] multcomp_1.4-20 DBI_1.1.3 pillar_1.8.1 haven_2.5.0
[73] withr_2.5.0 survival_3.2-13 datawizard_0.5.0 tibble_3.1.8
[77] performance_0.9.2 modelr_0.1.8 utf8_1.2.2 rmarkdown_2.14
[81] grid_4.2.0 forcats_0.5.1 digest_0.6.29 webshot_0.5.3
[85] xtable_1.8-4 tidyr_1.2.0 munsell_0.5.0 viridisLite_0.4.0 [89] bslib_0.4.0 mitools_2.4