Regression in R
Package mgcv
Logistic Regression
Model
- The chunk is not run, eval=FALSE
library(mgcv)
m0 <- bam(IRF ~ PTOT_group +
age_group +
gender +
ethnicity +
s(icu_los_log, bs="cr", k=10) +
## s(hosplos_log, bs="cr", k=10) +
govtpay +
wintermonth +
ecmo +
ventcharge +
nmb72h +
steroid72h +
resp_medical +
card_surg +
infxn +
ccc_composite +
s(hospnorecode, bs="re")
, data=dt2
, family = binomial
, cluster = 8
)
Report Coefficients
fit_s <- summary(m0)
prt_coef.gam <- function(obj, exp = FALSE, digits = 2){
obj_summary <- summary(obj)
rtn <- data.table(
coef_name = names(obj_summary$p.coeff)
, coef_value = obj_summary$p.coeff
, df = sum(obj$edf)
)
rtn <- merge(
x = rtn
, y = data.table(coef_name = names(obj_summary$se), se = obj_summary$se)
, on = "coef_name"
, all.x = TRUE
, all.y = FALSE
)
rtn[, coef_value_lower := coef_value + qt(0.025, df) * se
][, coef_value_upper := coef_value + qt(0.975, df) * se
]
if (exp == TRUE){
rtn[, coef_value := exp(coef_value)
][, coef_value_lower := exp(coef_value_lower)
][, coef_value_upper := exp(coef_value_upper)]
}
rtn[, ci := paste0(
format(round(coef_value, digits), nsmall = digits)
, "("
, format(round(coef_value_lower, digits), nsmall = digits)
, ","
, format(round(coef_value_upper, digits), nsmall = digits)
, ")")
]
## print(rtn)
if (obj$family$family == "multinom"){
## print(obj$family$family)
}
return(rtn)
}
fit_coefs <- prt_coef.gam(m0, exp = TRUE, digits = 2)
fit_coefs <- fit_coefs[order(coef_name)][-1,]
coef_names <- get_coef_name(dt2[, c(
"PTOT_group"
, "age_group"
, "gender"
, "ethnicity"
, "govtpay"
, "wintermonth"
, "ecmo"
, "ventcharge"
, "nmb72h"
, "steroid72h"
, "resp_medical"
, "card_surg"
, "infxn"
, "ccc_composite"
)])
prt_m1_coef2 <- merge(
x = coef_names
, y = fit_coefs
, by.x = "coef.name"
, by.y = "coef_name"
, all.x = TRUE
, all.y = FALSE
)
knitr::kable(
prt_m1_coef2[order(order.level), list(label, level, ci)]
, caption = "Logistic Regression"
, col.names = c("Variable", "Level", "OR")
, align = c("l","r","r","r","r")
) %>%
styling()
Multinomial Logistic Regression
Model
library('mgcv')
Outcome <- "outcome2"
Predictors <- c(
"topquartile"
, "gender"
, "age_group"
, "ethnicity"
, "payor_class"
, "wintermonth"
, "ccc_composite"
, "neuro"
, "icu_los_in_weeks"
, "card_surg"
, "resp_medical"
, "infxn"
, "ecmo"
, "vent"
, "nmb72h"
, "steroid72h"
, "s(hospital_id, bs=\"re\")"
)
ts1 <- Sys.time()
fit <- mgcv::gam(
list(
wu_formula(Outcome, Predictors)
, wu_formula("", Predictors)
)
, family = multinom(K = 2)
, data = df7
, method = "REML"
## , trace = FALSE
)
ts2 <- Sys.time()
Report Coefficient
saveRDS(fit, "fit_aim1.RDS")
fit <- readRDS("fit_aim1.RDS")
fit_s <- summary(fit)
prt_coef.gam <- function(obj, exp = FALSE, digits = 2){
obj_summary <- summary(obj)
rtn <- data.table(
coef_name = names(obj_summary$p.coeff)
, coef_value = obj_summary$p.coeff
, df = sum(obj$edf)
)
rtn <- merge(
x = rtn
, y = data.table(coef_name = names(obj_summary$se), se = obj_summary$se)
, on = "coef_name"
, all.x = TRUE
, all.y = FALSE
)
rtn[, coef_value_lower := coef_value + qt(0.025, df) * se
][, coef_value_upper := coef_value + qt(0.975, df) * se
]
if (exp == TRUE){
rtn[, coef_value := exp(coef_value)
][, coef_value_lower := exp(coef_value_lower)
][, coef_value_upper := exp(coef_value_upper)]
}
rtn[, ci := paste0(
format(round(coef_value, digits), nsmall = digits)
, "("
, format(round(coef_value_lower, digits), nsmall = digits)
, ","
, format(round(coef_value_upper, digits), nsmall = digits)
, ")")
]
## print(rtn)
if (obj$family$family == "multinom"){
## print(obj$family$family)
}
return(rtn)
}
fit_coefs <- prt_coef.gam(fit, exp = TRUE, digits = 2)
fit_coefs <- fit_coefs[order(coef_name)]
cf_l <- fit_coefs[seq(3, 42, by = 2),][, ci_l := ci][, list(coef_name, ci_l)]
cf_r <- fit_coefs[seq(4, 32, by = 2),][, ci_r := ci][, list(ci_r)]
prt_m1_coef <- cbind(cf_l, cf_r)
coef_names <- get_coef_name(df7[, c(
"topquartile"
, "gender"
, "age_group"
, "ethnicity"
, "payor_class"
, "wintermonth"
, "ccc_composite"
, "neuro"
, "icu_los_in_weeks"
, "card_surg"
, "resp_medical"
, "infxn"
, "ecmo"
, "vent"
, "nmb72h"
, "steroid72h"
)])
prt_m1_coef2 <- merge(
x = coef_names
, y = prt_m1_coef
, by.x = "coef.name"
, by.y = "coef_name"
, all.x = TRUE
, all.y = FALSE
)
prt_m1_coef2 <- prt_m1_coef2[order(order.level), list(label, level, ci_l, ci_r)]
knitr::kable(
prt_m1_coef2
, caption = "Multinomial Logistic Regression"
, col.names = c("Variable", "Level", "OR(IRF:Other)", "OR(Death:Other)")
, align = c("l","r","r","r","r")
) %>%
styling()
Report p-values
fv <- anova(fit)
fv1 <- as.data.table(fv$pTerms.table)
fv1$coef_name <- rownames(fv$pTerms.table)
fv1 <- fv1[
order(coef_name)
][, p_value := as.numeric(`p-value`)
][, p_str := case_when(
p_value < 0.0001 ~ "<0.0001"
, TRUE ~ format(round(p_value, 4), nsmall = 4)
)
][]
fv1_l <- fv1[seq(1, 32, 2),][,df_l := df][, pvalue_l := p_str][,list(coef_name, df_l, pvalue_l)]
fv1_r <- fv1[seq(2, 32, 2),][,df_r := df][, pvalue_r := p_str][,list(df_r, pvalue_r)]
fv1p <- cbind(fv1_l, fv1_r)
fv2 <- as.data.table(fv$s.table)
fv2$coef_name_str <- rownames(fv$s.table)
fv2 <- fv2[
, coef_name := gsub("([^\\(]+)(\\()([^\\)]+)(\\))", "\\3", coef_name_str)
][
order(coef_name, coef_name_str)
][, p_value := as.numeric(`p-value`)
][, p_str := case_when(
p_value < 0.0001 ~ "<0.0001"
, TRUE ~ format(round(p_value, 4), nsmall = 4)
)
][, df := edf
][, list(coef_name, df, p_str)]
fv2_l <- fv2[seq(1, 5, 2),][,df_l := df][, pvalue_l := p_str][,list(coef_name, df_l, pvalue_l)]
fv2_r <- fv2[seq(2, 6, 2),][,df_r := df][, pvalue_r := p_str][,list(df_r, pvalue_r)]
fv2p <- cbind(fv2_l, fv2_r)
p_prt <- rbind(fv1p, fv2p)
knitr::kable(
fv1p
, caption = "Chisq Test"
, col.names = c("Variable", "Df(Other-IPR)", "p-value(Other-IPR)", "Df(Other-Death)", "p-value(Other-Death)")
, digits = c(0, 1, 0, 1, 0)
, align = c("l","r","r","r","r")
) %>%
styling()
Package BART
Continuous Outcome
##simulate training data
sigma = .1
f = function(x) {x^3}
set.seed(17)
n = 200
x = sort(2*runif(n)-1)
y = f(x) + sigma*rnorm(n)
#xtest: values we want to estimate f(x) at
# this is also our prediction for y.
xtest = seq(-1,1,by=.2)
plot(x,y,cex=.5)
points(xtest,rep(0,length(xtest)),col="red",pch=16,cex=.8)
library(BART)
set.seed(14) #it is MCMC, set the seed!!
rb = wbart(x,y,xtest,nskip=500,ndpost=2000)
qm = apply(rb$yhat.test,2,quantile,probs=c(.025,.975)) # post quantiles
Binary Outcome
library(BART)
data(ACTG175)
## exclude those who do not have CD4 count at 96 weeks
ex <- is.na(ACTG175$cd496)
table(ex)
summary(ACTG175$cd40)
## inclusion criteria are CD4 counts between 200 and 500
ACTG175$cd40 <- pmin(500, pmax(250, ACTG175$cd40))
summary(ACTG175$cd40)
## calculate relative CD4 decline
y <- ((ACTG175$cd496-ACTG175$cd40)/ACTG175$cd40)[!ex]
summary(y)
## 0=failure, 1=success
y <- 1*(y > -0.5)
table(y)
## summarize CD4 outcomes
table(y, ACTG175$arms[!ex])
table(y, ACTG175$arms[!ex])
matrix(table(ACTG175$arms[!ex]), nrow=2, ncol=4, byrow=TRUE)
train <- as.matrix(ACTG175)[!ex, -c(1, 14:15, 17, 18, 20:22, 24:27)]
train <- cbind(1*(ACTG175$strat[!ex]==1), 1*(ACTG175$strat[!ex]==2),
1*(ACTG175$strat[!ex]==3), train)
dimnames(train)[[2]][1:3] <- paste0('strat', 1:3)
train <- cbind(1*(ACTG175$arms[!ex]==0), 1*(ACTG175$arms[!ex]==1),
1*(ACTG175$arms[!ex]==2), 1*(ACTG175$arms[!ex]==3), train)
dimnames(train)[[2]][1:4] <- paste0('arm', 0:3)
N <- nrow(train)
test0 <- train; test0[ , 1:4] <- 0; test0[ , 1] <- 1
test1 <- train; test1[ , 1:4] <- 0; test1[ , 2] <- 1
test2 <- train; test2[ , 1:4] <- 0; test2[ , 3] <- 1
test3 <- train; test3[ , 1:4] <- 0; test3[ , 4] <- 1
test <- rbind(test0, test1, test2, test3)
##test BART with token run to ensure installation works
set.seed(21)
post <- pbart(train, y, test, nskip=5, ndpost=5)
## Not run:
set.seed(21)
post <- pbart(train, y, test)
## turn z-scores into probabilities
post$prob.test <- pnorm(post$yhat.test)
BART on Binary outcome
[1] 0.3000000 0.3396226
[1] 0.32
[1] 0.4
df <- data.frame(
y=c(y0, y1)
, treatment=rep(c(0, 1), each=n)
, noise=rnorm(2 * n)
)
library(BART)
y <- df$y
train <- df[, c("treatment", "noise")]
test1 <- train
test1$treatment <- 1
test0 <- train
test0$treatment <- 0
test <- rbind(test0, test1)
dim(test)
[1] 200 2
*****Into main of pbart *****Data: data:n,p,np: 100, 2, 200 y1,yn: 1, 0 x1,x[n*p]: 0.000000, 0.138691 xp1,xp[np*p]: 0.000000, 0.138691 *****Number of Trees: 50 *****Number of Cut Points: 1 … 100 *****burn and ndpost: 1000, 2500 *****Prior:mybeta,alpha,tau: 2.000000,0.950000,0.212132 *****binaryOffset: -0.358459 *****Dirichlet:sparse,theta,omega,a,b,rho,augment: 0,0,1,0.5,1,2,0 *****nkeeptrain,nkeeptest,nkeeptreedraws: 2500,2500,2500 *****printevery: 100 *****skiptr,skipte,skiptreedraws: 1,1,1
MCMC done 0 (out of 3500) done 100 (out of 3500) done 200 (out of 3500) done 300 (out of 3500) done 400 (out of 3500) done 500 (out of 3500) done 600 (out of 3500) done 700 (out of 3500) done 800 (out of 3500) done 900 (out of 3500) done 1000 (out of 3500) done 1100 (out of 3500) done 1200 (out of 3500) done 1300 (out of 3500) done 1400 (out of 3500) done 1500 (out of 3500) done 1600 (out of 3500) done 1700 (out of 3500) done 1800 (out of 3500) done 1900 (out of 3500) done 2000 (out of 3500) done 2100 (out of 3500) done 2200 (out of 3500) done 2300 (out of 3500) done 2400 (out of 3500) done 2500 (out of 3500) done 2600 (out of 3500) done 2700 (out of 3500) done 2800 (out of 3500) done 2900 (out of 3500) done 3000 (out of 3500) done 3100 (out of 3500) done 3200 (out of 3500) done 3300 (out of 3500) done 3400 (out of 3500) time: 1s check counts trcnt,tecnt: 2500,2500
List of 14 $ yhat.train : num [1:2500, 1:100] -1.083 -1.104 -0.854 -0.314 -0.123 … $ yhat.test : num [1:2500, 1:200] -1.083 -1.104 -0.854 -0.314 -0.123 … $ varcount : int [1:2500, 1:2] 28 25 23 25 23 25 26 24 25 29 … ..- attr(, “dimnames”)=List of 2 .. ..$ : NULL .. ..$ : chr [1:2] “treatment” “noise” $ varprob : num [1:2500, 1:2] 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 … ..- attr(, “dimnames”)=List of 2 .. ..$ : NULL .. ..$ : chr [1:2] “treatment” “noise” $ treedraws :List of 2 ..$ cutpoints:List of 2 .. ..$ treatment: num 0.5 .. ..$ noise : num [1:100] -2.3 -2.25 -2.2 -2.15 -2.1 … ..$ trees : chr “2500 50 231 0 0 -0.42579033832 0 0 -0.16950011113 0 0 -0.165008988311 0 0 0.255976901511 0 0 -0”| truncated $ proc.time : ‘proc_time’ Named num [1:5] 0.631 0.039 0.672 0 0 ..- attr(, “names”)= chr [1:5] “user.self” “sys.self” “elapsed” “user.child” … $ prob.train : num [1:2500, 1:100] 0.139 0.135 0.197 0.377 0.451 … $ prob.train.mean: num [1:100] 0.292 0.383 0.431 0.316 0.282 … $ prob.test : num [1:2500, 1:200] 0.139 0.135 0.197 0.377 0.451 … $ prob.test.mean : num [1:200] 0.292 0.383 0.431 0.316 0.282 … $ varcount.mean : Named num [1:2] 27.1 25.1 ..- attr(, “names”)= chr [1:2] “treatment” “noise” $ varprob.mean : Named num [1:2] 0.5 0.5 ..- attr(, “names”)= chr [1:2] “treatment” “noise” $ rm.const : int [1:2] 1 2 $ binaryOffset : num -0.358 - attr(, “class”)= chr “pbart”
[1] 2500 200
Min. 1st Qu. Median Mean 3rd Qu. Max. -1.3418 -1.1672 -0.7957 -0.8165 -0.5100 -0.1368
Min. 1st Qu. Median Mean 3rd Qu. Max. 0.01152 0.22718 0.32082 0.33477 0.42733 0.90851
Min. 1st Qu. Median Mean 3rd Qu. Max. 0.01896 0.27770 0.37644 0.38489 0.48340 0.95675
Min. 1st Qu. Median Mean 3rd Qu. Max. 0.01152 0.21517 0.30501 0.32011 0.40898 0.90851
Min. 1st Qu. Median Mean 3rd Qu. Max. 0.01964 0.29422 0.39389 0.39990 0.49842 0.95675
Leave One Out Cross Validation
caret
library(data.table)
library(caret)
data(iris)
setDT(iris)
iris <- iris[, flag_setosa := factor(Species %in% c("setosa"))
][, Species := NULL]
train_control <- trainControl(method = "LOOCV")
model <- train(flag_setosa ~ .
, data = iris
, trControl = train_control
, method = "glm"
, family = "binomial")
print(model)
Generalized Linear Model
150 samples 4 predictor 2 classes: ‘FALSE’, ‘TRUE’
No pre-processing Resampling: Leave-One-Out Cross-Validation Summary of sample sizes: 149, 149, 149, 149, 149, 149, … Resampling results:
Accuracy Kappa 1 1
[1] “train” “train.formula”
[1] confusionMatrix densityplot fitted ggplot
[5] histogram levels plot predict
[9] predictors print residuals stripplot
[13] summary update varImp xyplot
see ‘?methods’ for accessing help and source code
Computing Environment
R version 4.2.0 (2022-04-22) Platform: x86_64-pc-linux-gnu (64-bit) Running under: Ubuntu 20.04.3 LTS
Matrix products: default BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0 LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
locale: [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
[4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
[7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
[10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
attached base packages: [1] stats graphics grDevices utils datasets methods base
other attached packages: [1] caret_6.0-93 lattice_0.20-45 BART_2.9
[4] survival_3.2-13 Wu_0.0.0.9000 flexdashboard_0.6.0 [7] lme4_1.1-30 Matrix_1.4-0 mgcv_1.8-38
[10] nlme_3.1-152 png_0.1-7 scales_1.2.0
[13] nnet_7.3-16 labelled_2.9.1 kableExtra_1.3.4
[16] plotly_4.10.0 gridExtra_2.3 ggplot2_3.3.6
[19] DT_0.24 tableone_0.13.2 magrittr_2.0.3
[22] lubridate_1.8.0 dplyr_1.0.9 plyr_1.8.7
[25] data.table_1.14.2 rmdformats_1.0.4 knitr_1.39
loaded via a namespace (and not attached): [1] webshot_0.5.3 httr_1.4.4 tools_4.2.0
[4] bslib_0.4.0 utf8_1.2.2 R6_2.5.1
[7] rpart_4.1-15 DBI_1.1.3 lazyeval_0.2.2
[10] colorspace_2.0-3 withr_2.5.0 tidyselect_1.1.2
[13] compiler_4.2.0 cli_3.3.0 rvest_1.0.2
[16] xml2_1.3.3 bookdown_0.28 sass_0.4.2
[19] proxy_0.4-27 systemfonts_1.0.4 stringr_1.4.0
[22] digest_0.6.29 minqa_1.2.4 rmarkdown_2.14
[25] svglite_2.1.0 pkgconfig_2.0.3 htmltools_0.5.3
[28] parallelly_1.32.1 fastmap_1.1.0 htmlwidgets_1.5.4
[31] rlang_1.0.4 rstudioapi_0.13 jquerylib_0.1.4
[34] generics_0.1.3 jsonlite_1.8.0 ModelMetrics_1.2.2.2 [37] Rcpp_1.0.9 munsell_0.5.0 fansi_1.0.3
[40] lifecycle_1.0.1 pROC_1.18.0 stringi_1.7.8
[43] yaml_2.3.5 MASS_7.3-54 recipes_1.0.1
[46] grid_4.2.0 listenv_0.8.0 parallel_4.2.0
[49] forcats_0.5.1 haven_2.5.0 splines_4.2.0
[52] hms_1.1.1 pillar_1.8.1 boot_1.3-28
[55] stats4_4.2.0 reshape2_1.4.4 future.apply_1.9.0
[58] codetools_0.2-18 glue_1.6.2 evaluate_0.16
[61] mitools_2.4 vctrs_0.4.1 nloptr_2.0.3
[64] foreach_1.5.2 gtable_0.3.0 purrr_0.3.4
[67] tidyr_1.2.0 future_1.27.0 cachem_1.0.6
[70] xfun_0.32 gower_1.0.0 prodlim_2019.11.13
[73] survey_4.1-1 e1071_1.7-11 class_7.3-19
[76] viridisLite_0.4.0 timeDate_4021.104 tibble_3.1.8
[79] iterators_1.0.14 hardhat_1.2.0 lava_1.6.10
[82] globals_0.16.0 ellipsis_0.3.2 ipred_0.9-13