Regression in R

Index

Package mgcv

Logistic Regression

Model

The chunk is not run, eval=FALSE

library(mgcv)

m0 <- bam(IRF ~ PTOT_group +
             age_group +
             gender +
             ethnicity +
             s(icu_los_log, bs="cr", k=10) +
             ## s(hosplos_log, bs="cr", k=10) +
             govtpay +
             wintermonth +
             ecmo +
             ventcharge +
             nmb72h +
             steroid72h +
             resp_medical +
             card_surg +
             infxn +
             ccc_composite +
             s(hospnorecode, bs="re")
       , data=dt2
       , family = binomial
       , cluster = 8
         )

Report Coefficients

fit_s <- summary(m0)


prt_coef.gam <- function(obj, exp = FALSE, digits = 2){
    obj_summary <- summary(obj)
    rtn <- data.table(
        coef_name = names(obj_summary$p.coeff)
      , coef_value = obj_summary$p.coeff
      , df = sum(obj$edf)
    )
    rtn <- merge(
        x = rtn
      , y = data.table(coef_name = names(obj_summary$se), se = obj_summary$se)
      , on = "coef_name"
      , all.x = TRUE
      , all.y = FALSE
    )
    rtn[, coef_value_lower := coef_value + qt(0.025, df) * se
        ][, coef_value_upper := coef_value + qt(0.975, df) * se
          ]
    if (exp == TRUE){
        rtn[, coef_value := exp(coef_value)
            ][, coef_value_lower := exp(coef_value_lower)
              ][, coef_value_upper := exp(coef_value_upper)]
    }
    rtn[, ci := paste0(
              format(round(coef_value, digits), nsmall = digits)
            , "("
            , format(round(coef_value_lower, digits), nsmall = digits)
            , ","
            , format(round(coef_value_upper, digits), nsmall = digits)
            , ")")
        ]
    ## print(rtn)
    if (obj$family$family == "multinom"){
        ## print(obj$family$family)
    }
    return(rtn)
}



fit_coefs <- prt_coef.gam(m0, exp = TRUE, digits = 2)
fit_coefs <- fit_coefs[order(coef_name)][-1,]


coef_names <- get_coef_name(dt2[, c(
     "PTOT_group"
  , "age_group"
  , "gender"
  , "ethnicity"
  , "govtpay"
  , "wintermonth"
  , "ecmo"
  , "ventcharge"
  , "nmb72h"
  , "steroid72h"
  , "resp_medical"
  , "card_surg"
  , "infxn"
  , "ccc_composite"
)])

prt_m1_coef2 <- merge(
    x = coef_names
  , y = fit_coefs
  , by.x = "coef.name"
  , by.y = "coef_name"
  , all.x = TRUE
  , all.y = FALSE
)

knitr::kable(
           prt_m1_coef2[order(order.level), list(label, level, ci)]
         , caption = "Logistic Regression"
         , col.names = c("Variable", "Level", "OR")
         , align = c("l","r","r","r","r")
       ) %>%
    styling()

ROC

library(pROC)
dt$predictor <- plogis(mp$linear.predictors)
dt$response <- as.numeric(dt$mg_tp0.factor == "Yes")
r <- roc(dt, response, predictor)

Multinomial Logistic Regression

Model

library('mgcv')

Outcome <- "outcome2"
Predictors <- c(
    "topquartile"
  , "gender"
  , "age_group"
  , "ethnicity"
  , "payor_class"
  , "wintermonth"
  , "ccc_composite"
  , "neuro"
  , "icu_los_in_weeks"
  , "card_surg"
  , "resp_medical"
  , "infxn"
  , "ecmo"
  , "vent"
  , "nmb72h"
  , "steroid72h"
  , "s(hospital_id, bs=\"re\")"
)


ts1 <- Sys.time()
fit <- mgcv::gam(
    list(
        wu_formula(Outcome, Predictors)
     ,  wu_formula("", Predictors)
    )
  , family = multinom(K = 2)
  , data = df7
  , method = "REML"
  ## , trace = FALSE
    )
ts2 <- Sys.time()

Report Coefficient

saveRDS(fit, "fit_aim1.RDS")
fit <- readRDS("fit_aim1.RDS")
fit_s <- summary(fit)

prt_coef.gam <- function(obj, exp = FALSE, digits = 2){
    obj_summary <- summary(obj)
    rtn <- data.table(
        coef_name = names(obj_summary$p.coeff)
      , coef_value = obj_summary$p.coeff
      , df = sum(obj$edf)
    )
    rtn <- merge(
        x = rtn
      , y = data.table(coef_name = names(obj_summary$se), se = obj_summary$se)
      , on = "coef_name"
      , all.x = TRUE
      , all.y = FALSE
    )
    rtn[, coef_value_lower := coef_value + qt(0.025, df) * se
        ][, coef_value_upper := coef_value + qt(0.975, df) * se
          ]
    if (exp == TRUE){
        rtn[, coef_value := exp(coef_value)
            ][, coef_value_lower := exp(coef_value_lower)
              ][, coef_value_upper := exp(coef_value_upper)]
    }
    rtn[, ci := paste0(
              format(round(coef_value, digits), nsmall = digits)
            , "("
            , format(round(coef_value_lower, digits), nsmall = digits)
            , ","
            , format(round(coef_value_upper, digits), nsmall = digits)
            , ")")
        ]
    ## print(rtn)

    if (obj$family$family == "multinom"){
        ## print(obj$family$family)
    }
    return(rtn)
}

fit_coefs <- prt_coef.gam(fit, exp = TRUE, digits = 2)
fit_coefs <- fit_coefs[order(coef_name)]
cf_l <- fit_coefs[seq(3, 42, by = 2),][, ci_l := ci][, list(coef_name, ci_l)]
cf_r <- fit_coefs[seq(4, 32, by = 2),][, ci_r := ci][, list(ci_r)]

prt_m1_coef <- cbind(cf_l, cf_r)

coef_names <- get_coef_name(df7[, c(
    "topquartile"
  , "gender"
  , "age_group"
  , "ethnicity"
  , "payor_class"
  , "wintermonth"
  , "ccc_composite"
  , "neuro"
  , "icu_los_in_weeks"
  , "card_surg"
  , "resp_medical"
  , "infxn"
  , "ecmo"
  , "vent"
  , "nmb72h"
  , "steroid72h"
)])

prt_m1_coef2 <- merge(
    x = coef_names
  , y = prt_m1_coef
  , by.x = "coef.name"
  , by.y = "coef_name"
  , all.x = TRUE
  , all.y = FALSE
)

prt_m1_coef2 <- prt_m1_coef2[order(order.level), list(label, level, ci_l, ci_r)]
knitr::kable(
          prt_m1_coef2
         , caption = "Multinomial Logistic Regression"
         , col.names = c("Variable", "Level", "OR(IRF:Other)", "OR(Death:Other)")
         , align = c("l","r","r","r","r")
       ) %>%
    styling()

Report p-values

fv <- anova(fit)

fv1 <- as.data.table(fv$pTerms.table)
fv1$coef_name <- rownames(fv$pTerms.table)

fv1 <- fv1[
    order(coef_name)
][, p_value := as.numeric(`p-value`)
  ][, p_str := case_when(
          p_value < 0.0001 ~  "<0.0001"
        , TRUE ~ format(round(p_value, 4), nsmall = 4)
  )
    ][]

fv1_l <- fv1[seq(1, 32, 2),][,df_l := df][, pvalue_l := p_str][,list(coef_name, df_l, pvalue_l)]
fv1_r <- fv1[seq(2, 32, 2),][,df_r := df][, pvalue_r := p_str][,list(df_r, pvalue_r)]
fv1p <- cbind(fv1_l, fv1_r)

fv2 <- as.data.table(fv$s.table)
fv2$coef_name_str <- rownames(fv$s.table)
fv2 <- fv2[
    , coef_name := gsub("([^\\(]+)(\\()([^\\)]+)(\\))", "\\3", coef_name_str)
][
    order(coef_name, coef_name_str)
][, p_value := as.numeric(`p-value`)
  ][, p_str := case_when(
          p_value < 0.0001 ~  "<0.0001"
        , TRUE ~ format(round(p_value, 4), nsmall = 4)
  )
  ][, df := edf
    ][, list(coef_name, df, p_str)]

fv2_l <- fv2[seq(1, 5, 2),][,df_l := df][, pvalue_l := p_str][,list(coef_name, df_l, pvalue_l)]
fv2_r <- fv2[seq(2, 6, 2),][,df_r := df][, pvalue_r := p_str][,list(df_r, pvalue_r)]
fv2p <- cbind(fv2_l, fv2_r)

p_prt <- rbind(fv1p, fv2p)

knitr::kable(
           fv1p
         , caption = "Chisq Test"
         , col.names = c("Variable", "Df(Other-IPR)", "p-value(Other-IPR)", "Df(Other-Death)", "p-value(Other-Death)")
           , digits = c(0, 1, 0, 1, 0)
         , align = c("l","r","r","r","r")
       ) %>%
    styling()

Package BART

Continuous Outcome

##simulate training data
sigma = .1
f = function(x) {x^3}

set.seed(17)
n = 200
x = sort(2*runif(n)-1)

y = f(x) + sigma*rnorm(n)

#xtest: values we want to estimate f(x) at
#  this is also our prediction for y.
xtest = seq(-1,1,by=.2)

plot(x,y,cex=.5)
points(xtest,rep(0,length(xtest)),col="red",pch=16,cex=.8)

library(BART)
set.seed(14) #it is MCMC, set the seed!!
rb = wbart(x,y,xtest,nskip=500,ndpost=2000)

qm = apply(rb$yhat.test,2,quantile,probs=c(.025,.975)) # post quantiles

Binary Outcome

library(BART)

data(ACTG175)
## exclude those who do not have CD4 count at 96 weeks
ex <- is.na(ACTG175$cd496)
table(ex)

summary(ACTG175$cd40)
## inclusion criteria are CD4 counts between 200 and 500
ACTG175$cd40 <- pmin(500, pmax(250, ACTG175$cd40))
summary(ACTG175$cd40)

## calculate relative CD4 decline
y <- ((ACTG175$cd496-ACTG175$cd40)/ACTG175$cd40)[!ex]
summary(y)

## 0=failure, 1=success
y <- 1*(y > -0.5)
table(y)

## summarize CD4 outcomes
table(y, ACTG175$arms[!ex])
table(y, ACTG175$arms[!ex])
matrix(table(ACTG175$arms[!ex]), nrow=2, ncol=4, byrow=TRUE)

train <- as.matrix(ACTG175)[!ex, -c(1, 14:15, 17, 18, 20:22, 24:27)]
train <- cbind(1*(ACTG175$strat[!ex]==1), 1*(ACTG175$strat[!ex]==2),
1*(ACTG175$strat[!ex]==3), train)
dimnames(train)[[2]][1:3] <- paste0('strat', 1:3)
train <- cbind(1*(ACTG175$arms[!ex]==0), 1*(ACTG175$arms[!ex]==1),
1*(ACTG175$arms[!ex]==2), 1*(ACTG175$arms[!ex]==3), train)
dimnames(train)[[2]][1:4] <- paste0('arm', 0:3)
N <- nrow(train)
test0 <- train; test0[ , 1:4] <- 0; test0[ , 1] <- 1
test1 <- train; test1[ , 1:4] <- 0; test1[ , 2] <- 1
test2 <- train; test2[ , 1:4] <- 0; test2[ , 3] <- 1
test3 <- train; test3[ , 1:4] <- 0; test3[ , 4] <- 1
test <- rbind(test0, test1, test2, test3)


##test BART with token run to ensure installation works
set.seed(21)
post <- pbart(train, y, test, nskip=5, ndpost=5)


## Not run:
set.seed(21)
post <- pbart(train, y, test)

## turn z-scores into probabilities
post$prob.test <- pnorm(post$yhat.test)

BART on Binary outcome

p0 <- .3
odds0 <- p0 / (1 - p0)
or <- 1.2
odds1 <- odds0 * or
p1 <- odds1/(1 + odds1)

c(p0, p1)

[1] 0.3000000 0.3396226

n <- 50
set.seed(12345)
y0 <- rbinom(n, 1, p0)
mean(y0)

[1] 0.32

y1 <- rbinom(n, 1, p1)
mean(y1)

[1] 0.4

df <- data.frame(
    y=c(y0, y1)
  , treatment=rep(c(0, 1), each=n)
  , noise=rnorm(2 * n)
)

library(BART)
y <- df$y
train <- df[, c("treatment", "noise")]

test1 <- train
test1$treatment <- 1
test0 <- train
test0$treatment <- 0
test <- rbind(test0, test1)
dim(test)

[1] 200 2

## 200 2

pt <- pbart(x.train=train
          , y.train=y
          , x.test=test
          , ndpost=2500
          , nskip=1000
            )

*****Into main of pbart *****Data: data:n,p,np: 100, 2, 200 y1,yn: 1, 0 x1,x[n*p]: 0.000000, 0.138691 xp1,xp[np*p]: 0.000000, 0.138691 *****Number of Trees: 50 *****Number of Cut Points: 1 … 100 *****burn and ndpost: 1000, 2500 *****Prior:mybeta,alpha,tau: 2.000000,0.950000,0.212132 *****binaryOffset: -0.358459 *****Dirichlet:sparse,theta,omega,a,b,rho,augment: 0,0,1,0.5,1,2,0 *****nkeeptrain,nkeeptest,nkeeptreedraws: 2500,2500,2500 *****printevery: 100 *****skiptr,skipte,skiptreedraws: 1,1,1

MCMC done 0 (out of 3500) done 100 (out of 3500) done 200 (out of 3500) done 300 (out of 3500) done 400 (out of 3500) done 500 (out of 3500) done 600 (out of 3500) done 700 (out of 3500) done 800 (out of 3500) done 900 (out of 3500) done 1000 (out of 3500) done 1100 (out of 3500) done 1200 (out of 3500) done 1300 (out of 3500) done 1400 (out of 3500) done 1500 (out of 3500) done 1600 (out of 3500) done 1700 (out of 3500) done 1800 (out of 3500) done 1900 (out of 3500) done 2000 (out of 3500) done 2100 (out of 3500) done 2200 (out of 3500) done 2300 (out of 3500) done 2400 (out of 3500) done 2500 (out of 3500) done 2600 (out of 3500) done 2700 (out of 3500) done 2800 (out of 3500) done 2900 (out of 3500) done 3000 (out of 3500) done 3100 (out of 3500) done 3200 (out of 3500) done 3300 (out of 3500) done 3400 (out of 3500) time: 1s check counts trcnt,tecnt: 2500,2500

str(pt)

List of 14 $ yhat.train : num [1:2500, 1:100] -1.083 -1.104 -0.854 -0.314 -0.123 … $ yhat.test : num [1:2500, 1:200] -1.083 -1.104 -0.854 -0.314 -0.123 … $ varcount : int [1:2500, 1:2] 28 25 23 25 23 25 26 24 25 29 … ..- attr(, “dimnames”)=List of 2 .. ..$ : NULL .. ..$ : chr [1:2] “treatment” “noise” $ varprob : num [1:2500, 1:2] 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 … ..- attr(, “dimnames”)=List of 2 .. ..$ : NULL .. ..$ : chr [1:2] “treatment” “noise” $ treedraws :List of 2 ..$ cutpoints:List of 2 .. ..$ treatment: num 0.5 .. ..$ noise : num [1:100] -2.3 -2.25 -2.2 -2.15 -2.1 … ..$ trees : chr “2500 50 231 0 0 -0.42579033832 0 0 -0.16950011113 0 0 -0.165008988311 0 0 0.255976901511 0 0 -0”| truncated $ proc.time : ‘proc_time’ Named num [1:5] 0.631 0.039 0.672 0 0 ..- attr(, “names”)= chr [1:5] “user.self” “sys.self” “elapsed” “user.child” … $ prob.train : num [1:2500, 1:100] 0.139 0.135 0.197 0.377 0.451 … $ prob.train.mean: num [1:100] 0.292 0.383 0.431 0.316 0.282 … $ prob.test : num [1:2500, 1:200] 0.139 0.135 0.197 0.377 0.451 … $ prob.test.mean : num [1:200] 0.292 0.383 0.431 0.316 0.282 … $ varcount.mean : Named num [1:2] 27.1 25.1 ..- attr(, “names”)= chr [1:2] “treatment” “noise” $ varprob.mean : Named num [1:2] 0.5 0.5 ..- attr(, “names”)= chr [1:2] “treatment” “noise” $ rm.const : int [1:2] 1 2 $ binaryOffset : num -0.358 - attr(, “class”)= chr “pbart”

ps <- pt$yhat.test
dim(ps)

[1] 2500 200

## 250 rows of draws, and 200 columns of post z-scores
summary(ps[1, 1:100])

Min. 1st Qu. Median Mean 3rd Qu. Max. -1.3418 -1.1672 -0.7957 -0.8165 -0.5100 -0.1368

## average treatment effect
summary(as.vector(pnorm(ps[1:2500, 1:100])))

Min. 1st Qu. Median Mean 3rd Qu. Max. 0.01152 0.22718 0.32082 0.33477 0.42733 0.90851

summary(as.vector(pnorm(ps[1:2500, 101:200])))

Min. 1st Qu. Median Mean 3rd Qu. Max. 0.01896 0.27770 0.37644 0.38489 0.48340 0.95675

## average treatment effect on the treated
summary(as.vector(pnorm(ps[1:2500, 1:50])))

Min. 1st Qu. Median Mean 3rd Qu. Max. 0.01152 0.21517 0.30501 0.32011 0.40898 0.90851

summary(as.vector(pnorm(ps[1:2500, 151:200])))

Min. 1st Qu. Median Mean 3rd Qu. Max. 0.01964 0.29422 0.39389 0.39990 0.49842 0.95675

Leave One Out Cross Validation

caret

https://machinelearningmastery.com/how-to-estimate-model-accuracy-in-r-using-the-caret-package/

library(data.table)
library(caret)
data(iris)
setDT(iris)
iris <- iris[, flag_setosa := factor(Species %in% c("setosa"))
             ][, Species := NULL]
train_control <- trainControl(method = "LOOCV")
model <- train(flag_setosa ~ .
             , data = iris
             , trControl = train_control
             , method = "glm"
             , family = "binomial")
print(model)

Generalized Linear Model

150 samples 4 predictor 2 classes: ‘FALSE’, ‘TRUE’

No pre-processing Resampling: Leave-One-Out Cross-Validation Summary of sample sizes: 149, 149, 149, 149, 149, 149, … Resampling results:

Accuracy Kappa 1 1

class(model)

[1] “train” “train.formula”

methods(class = "train")

[1] confusionMatrix densityplot fitted ggplot
[5] histogram levels plot predict
[9] predictors print residuals stripplot
[13] summary update varImp xyplot
see ‘?methods’ for accessing help and source code

Computing Environment

sessionInfo()

R version 4.2.0 (2022-04-22) Platform: x86_64-pc-linux-gnu (64-bit) Running under: Ubuntu 20.04.3 LTS

Matrix products: default BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0 LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0

locale: [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
[4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
[7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
[10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C

attached base packages: [1] stats graphics grDevices utils datasets methods base

other attached packages: [1] caret_6.0-93 lattice_0.20-45 BART_2.9
[4] survival_3.2-13 Wu_0.0.0.9000 flexdashboard_0.6.0 [7] lme4_1.1-30 Matrix_1.4-0 mgcv_1.8-38
[10] nlme_3.1-152 png_0.1-7 scales_1.2.0
[13] nnet_7.3-16 labelled_2.9.1 kableExtra_1.3.4
[16] plotly_4.10.0 gridExtra_2.3 ggplot2_3.3.6
[19] DT_0.24 tableone_0.13.2 magrittr_2.0.3
[22] lubridate_1.8.0 dplyr_1.0.9 plyr_1.8.7
[25] data.table_1.14.2 rmdformats_1.0.4 knitr_1.39

loaded via a namespace (and not attached): [1] webshot_0.5.3 httr_1.4.4 tools_4.2.0
[4] bslib_0.4.0 utf8_1.2.2 R6_2.5.1
[7] rpart_4.1-15 DBI_1.1.3 lazyeval_0.2.2
[10] colorspace_2.0-3 withr_2.5.0 tidyselect_1.1.2
[13] compiler_4.2.0 cli_3.3.0 rvest_1.0.2
[16] xml2_1.3.3 bookdown_0.28 sass_0.4.2
[19] proxy_0.4-27 systemfonts_1.0.4 stringr_1.4.0
[22] digest_0.6.29 minqa_1.2.4 rmarkdown_2.14
[25] svglite_2.1.0 pkgconfig_2.0.3 htmltools_0.5.3
[28] parallelly_1.32.1 fastmap_1.1.0 htmlwidgets_1.5.4
[31] rlang_1.0.4 rstudioapi_0.13 jquerylib_0.1.4
[34] generics_0.1.3 jsonlite_1.8.0 ModelMetrics_1.2.2.2 [37] Rcpp_1.0.9 munsell_0.5.0 fansi_1.0.3
[40] lifecycle_1.0.1 pROC_1.18.0 stringi_1.7.8
[43] yaml_2.3.5 MASS_7.3-54 recipes_1.0.1
[46] grid_4.2.0 listenv_0.8.0 parallel_4.2.0
[49] forcats_0.5.1 haven_2.5.0 splines_4.2.0
[52] hms_1.1.1 pillar_1.8.1 boot_1.3-28
[55] stats4_4.2.0 reshape2_1.4.4 future.apply_1.9.0
[58] codetools_0.2-18 glue_1.6.2 evaluate_0.16
[61] mitools_2.4 vctrs_0.4.1 nloptr_2.0.3
[64] foreach_1.5.2 gtable_0.3.0 purrr_0.3.4
[67] tidyr_1.2.0 future_1.27.0 cachem_1.0.6
[70] xfun_0.32 gower_1.0.0 prodlim_2019.11.13
[73] survey_4.1-1 e1071_1.7-11 class_7.3-19
[76] viridisLite_0.4.0 timeDate_4021.104 tibble_3.1.8
[79] iterators_1.0.14 hardhat_1.2.0 lava_1.6.10
[82] globals_0.16.0 ellipsis_0.3.2 ipred_0.9-13