R Data Manipulation with data.table
Read Data
Pull Data from REDCap
- Pull data from REDCap using API and save the csv file
 
Pull Data from MySQL
General Data Description
- Create a HTML file the describe all variables in a dataset
 
Rename Variables & Revalue A Vector
- match function matches the first vector to the second vector, gives the indexes of elements from vector one that matched in vector two
 - dplyr::rename function renames variable names
 
| x | 
|---|
| NA | 
| 2 | 
| NA | 
| 1 | 
| NA | 
| x | 
|---|
| A | 
| B | 
colnames(dt) <- plyr::revalue(colnames(dt), c("A"="Variable_A"))
colnames(dt) %>% prt(caption = "Column Names Changed")| x | 
|---|
| Variable_A | 
| B | 
## dplyr::rename(dt, var_b=B)
lst <- list("A"="Variable_A")
## names(lst)
## unlist(lst)
colnames(dt)[colnames(dt) %in% names(lst)] <- unlist(lst)[colnames(dt)[colnames(dt) %in% names(lst)]]
v <- c("A", "B", "C")
lst <- c(
    "A"="A-"
  , "B"
  , "C"="C+"
)
plyr::revalue(v, lst) %>% prt(caption = "plyr::revalue function")| x | 
|---|
| A- | 
| B | 
| C+ | 
| x | 
|---|
| A- | 
| B | 
| C+ | 
## class(v)
dt <- data.table(var1 = 1, var2 = 2, var3 = 3)
Wu::renameVariables(obj = dt, lst = c(var1 = "varA", var2 = "varB")) %>% prt()| varA | varB | var3 | 
|---|---|---|
| 1 | 2 | 3 | 
Regular Expression
Fill Missing Values
Replace elements in a vector by a specific value
[1] 1 0 3 0 5 6 7 8 0 10
Index of observations
dt <- data.table(
    group=c("A", "A", "B", "B", "B", "C", "C", "C", "C")
  , var1=1:9
)
dt <- dt[, index_i := .I
         ][, index_n := 1:.N
           ][, index_i_group := .I, by = .(group)
             ][, index_n_group := 1:.N, by = .(group)]
dt %>% prt()| group | var1 | index_i | index_n | index_i_group | index_n_group | 
|---|---|---|---|---|---|
| A | 1 | 1 | 1 | 1 | 1 | 
| A | 2 | 2 | 2 | 2 | 2 | 
| B | 3 | 3 | 3 | 3 | 1 | 
| B | 4 | 4 | 4 | 4 | 2 | 
| B | 5 | 5 | 5 | 5 | 3 | 
| C | 6 | 6 | 6 | 6 | 1 | 
| C | 7 | 7 | 7 | 7 | 2 | 
| C | 8 | 8 | 8 | 8 | 3 | 
| C | 9 | 9 | 9 | 9 | 4 | 
Last Observation Carry Forward (LOCF)
- data.table::nafill funtion only works for numeric variables
 - Workaround for character variables by filling indexes of character variables instead https://github.com/Rdatatable/data.table/issues/3992#issuecomment-546295277
 - type=“nocb”: Next Observation Carry Backward
 - type=“const”: fill with constant
 
dt <- data.table(
    n1=c(1, NA, 2, NA, NA, 3, 3, NA)
  , c1=c("A", "A", "B", NA, "C", NA, NA, NA)
)
dt <- dt[
  , n1_locf := nafill(n1, type="locf")
][, c1_locf := c1[nafill(replace(.I, is.na(c1), NA), type="locf")]
  ][, index := .I]
dt[, .(index, n1, n1_locf, c1, c1_locf)] %>% prt()| index | n1 | n1_locf | c1 | c1_locf | 
|---|---|---|---|---|
| 1 | 1 | 1 | A | A | 
| 2 | NA | 1 | A | A | 
| 3 | 2 | 2 | B | B | 
| 4 | NA | 2 | NA | B | 
| 5 | NA | 2 | C | C | 
| 6 | 3 | 3 | NA | C | 
| 7 | 3 | 3 | NA | C | 
| 8 | NA | 3 | NA | C | 
- More detailed process for character variables
 
Data Functions
Sum Multiple Columns
is.1 <- function(x){x %in% c(1)}
sum.1 <- function(x){sum(as.numeric(is.1(x)))}
dm <- dm[, race_sum := sum(.SD)
       , .SDcols = paste0("race___", as.character(1:6))
       , by = 1:nrow(dm)]
dm <- dm[, conditions_num := sum(is.1(hypertension)
                               , is.1(diabetes)
                               , is.1(heart_disease)
                               , is.1(renal_disease)
                               , is.1(malignancies)
                               , is.1(respiratory_disease)
                                 )
       , by = 1:nrow(dm)
         ]Rolling Average
library(zoo)
dt <- data.table(series = c(1:10, NA, 11:21))
dt <- dt[, mv2 := rollmean(series, k = 2, align = "right", na.pad = TRUE)
         ][, mv3 := rollmean(series, k = 3, align = "right", na.pad = TRUE)]
dt %>% prt()| series | mv2 | mv3 | 
|---|---|---|
| 1 | NA | NA | 
| 2 | 1.5 | NA | 
| 3 | 2.5 | 2 | 
| 4 | 3.5 | 3 | 
| 5 | 4.5 | 4 | 
| 6 | 5.5 | 5 | 
| 7 | 6.5 | 6 | 
| 8 | 7.5 | 7 | 
| 9 | 8.5 | 8 | 
| 10 | 9.5 | 9 | 
| NA | NA | NA | 
| 11 | NA | NA | 
| 12 | 11.5 | NA | 
| 13 | 12.5 | 12 | 
| 14 | 13.5 | 13 | 
| 15 | 14.5 | 14 | 
| 16 | 15.5 | 15 | 
| 17 | 16.5 | 16 | 
| 18 | 17.5 | 17 | 
| 19 | 18.5 | 18 | 
| 20 | 19.5 | 19 | 
| 21 | 20.5 | 20 | 
Weighted Rolling Average
dt <- data.table(series = c(1:10, NA, 11:21))
n <- 2
wts <- (1:n) / sum(1:n)
wm <- function(x){
    invisible(weighted.mean(x = x, w = wts))
}
dt <- dt[, mvw2 := frollapply(seq_len(.N)
                            , FUN = function(ind) wm(series[ind])
                            , n = 2
                            , align = "left"
                              )]
dt %>% prt()| series | mvw2 | 
|---|---|
| 1 | 1.666667 | 
| 2 | 2.666667 | 
| 3 | 3.666667 | 
| 4 | 4.666667 | 
| 5 | 5.666667 | 
| 6 | 6.666667 | 
| 7 | 7.666667 | 
| 8 | 8.666667 | 
| 9 | 9.666667 | 
| 10 | NA | 
| NA | NA | 
| 11 | 11.666667 | 
| 12 | 12.666667 | 
| 13 | 13.666667 | 
| 14 | 14.666667 | 
| 15 | 15.666667 | 
| 16 | 16.666667 | 
| 17 | 17.666667 | 
| 18 | 18.666667 | 
| 19 | 19.666667 | 
| 20 | 20.666667 | 
| 21 | NA | 
Weighted Rolling Median
dt <- data.table(var1 = c(1:10, NA, 11:21))
library(matrixStats)
n <- 4
wts <- c(0.1, 1, 1, 1)
wts <- wts / sum(wts)
wmd <- function(x){
    invisible(matrixStats::weightedMedian(x = x, w = wts))
}
dt <- dt[, mvwmd3 := frollapply(seq_len(.N)
                            , FUN = function(ind) wmd(var1[ind])
                            , n = 4
                            , align = "left"
                              )]
dt %>% prt()| var1 | mvwmd3 | 
|---|---|
| 1 | 2.95 | 
| 2 | 3.95 | 
| 3 | 4.95 | 
| 4 | 5.95 | 
| 5 | 6.95 | 
| 6 | 7.95 | 
| 7 | 8.95 | 
| 8 | NA | 
| 9 | NA | 
| 10 | NA | 
| NA | NA | 
| 11 | 12.95 | 
| 12 | 13.95 | 
| 13 | 14.95 | 
| 14 | 15.95 | 
| 15 | 16.95 | 
| 16 | 17.95 | 
| 17 | 18.95 | 
| 18 | 19.95 | 
| 19 | NA | 
| 20 | NA | 
| 21 | NA | 
Concatenate Multiple Rows by Group
a b ID 1: a A 1 2: b B 1 3: c C 1 4: d D 2 5: e E 2 6: f F 2 7: g G 2 8: h H 3 9: i I 3 10: j J 3
ID a 1: 1 a b c 2: 2 d e f g 3: 3 h i j
Shift Column
cf <- cf[
][, status_row_n_int := nafill(status_row_n, type = "locf"), by = list(mrn)
  ][, fio2_int := nafill(fio2_row_filled, type = "locf"), by = list(mrn)
    ][,  datetime_record_nx1 := shift(datetime_record, type="lead"), by = list(mrn)
      ][, time_int := as.numeric(difftime(datetime_record_nx1, datetime_record, units = "hours"))
        ]Permutate & combination
expand.grid(levels(dt$sex_2.factor)
               , levels(dt$age_group)
               , levels(dt$race_ethnicity)
               , levels(dt$com_obesity)
                 )
combn(letters[1:4], 2)
perm_without_replacement <- function(n, r){
  return(factorial(n)/factorial(n - r)/ factorial(r))
}
factorial(4)
# 9 choose 3
perm_without_replacement(9, 3) * perm_without_replacement(6, 3)
6 * 5 * 4 / (3 * 2)Split data.frame
Replicate Rows
| name | 
|---|
| a | 
| b | 
| c | 
| d | 
| name | index | 
|---|---|
| a | 1 | 
| a | 2 | 
| a | 3 | 
| b | 1 | 
| b | 2 | 
| b | 3 | 
| c | 1 | 
| c | 2 | 
| c | 3 | 
| d | 1 | 
| d | 2 | 
| d | 3 | 
dt <- data.table(
    id = 1:6
  , num = c(3, 5, 6, 3, 4, 5)
)
dt <- dt[rep(1:.N, times = num)][, sub_index := 1:.N, by = id]
dt %>% prt()| id | num | sub_index | 
|---|---|---|
| 1 | 3 | 1 | 
| 1 | 3 | 2 | 
| 1 | 3 | 3 | 
| 2 | 5 | 1 | 
| 2 | 5 | 2 | 
| 2 | 5 | 3 | 
| 2 | 5 | 4 | 
| 2 | 5 | 5 | 
| 3 | 6 | 1 | 
| 3 | 6 | 2 | 
| 3 | 6 | 3 | 
| 3 | 6 | 4 | 
| 3 | 6 | 5 | 
| 3 | 6 | 6 | 
| 4 | 3 | 1 | 
| 4 | 3 | 2 | 
| 4 | 3 | 3 | 
| 5 | 4 | 1 | 
| 5 | 4 | 2 | 
| 5 | 4 | 3 | 
| 5 | 4 | 4 | 
| 6 | 5 | 1 | 
| 6 | 5 | 2 | 
| 6 | 5 | 3 | 
| 6 | 5 | 4 | 
| 6 | 5 | 5 | 
Generate Random Variables
- Vectorize function
 - Generate random variables within data.table using values from another column
 
library(Wu)
library(data.table)
dt <- data.table(
    id = 1:6
  , m = c(1:6)
)
rnorm_vec <- function(x){
    rnorm(n = 1, mean = x)
}
rnorm_vec <- Vectorize(rnorm_vec)
dt <- dt[, value := rnorm_vec(m)]
dt %>% prt()| id | m | value | 
|---|---|---|
| 1 | 1 | 0.7553553 | 
| 2 | 2 | 1.2187691 | 
| 3 | 3 | -0.0309352 | 
| 4 | 4 | 5.4237023 | 
| 5 | 5 | 4.7998467 | 
| 6 | 6 | 3.8257029 | 
Pass Column Name to data.table within Function
| var1 | var2 | 
|---|---|
| 1 | A | 
| 2 | A | 
| 3 | B | 
| 4 | B | 
| 5 | C | 
| 6 | C | 
| 7 | D | 
| 8 | D | 
| 9 | E | 
| 10 | E | 
| var2 | 
|---|
| A | 
| A | 
| B | 
| B | 
| C | 
| C | 
| D | 
| D | 
| E | 
| E | 
| var2 | 
|---|
| A | 
| A | 
| B | 
| B | 
| C | 
| C | 
| D | 
| D | 
| E | 
| E | 
| get | N | 
|---|---|
| A | 2 | 
| B | 2 | 
| C | 2 | 
| D | 2 | 
| E | 2 | 
| var1 | var2 | 
|---|---|
| 1 | A+ | 
| 2 | A+ | 
| 3 | B+ | 
| 4 | B+ | 
| 5 | C+ | 
| 6 | C+ | 
| 7 | D+ | 
| 8 | D+ | 
| 9 | E+ | 
| 10 | E+ | 
pass_col <- function(clname){
    ## clname <- sym(clname)
    print(dt[, .N, by = clname])
}
pass_col(c("var1", "var2"))var1 var2 N 1: 1 A+ 1 2: 2 A+ 1 3: 3 B+ 1 4: 4 B+ 1 5: 5 C+ 1 6: 6 C+ 1 7: 7 D+ 1 8: 8 D+ 1 9: 9 E+ 1 10: 10 E+ 1 # Reshape
Reshape Wide to Long
| id | var_1 | var_2 | var_3 | 
|---|---|---|---|
| 1 | 10 | 21 | 31 | 
| 2 | 11 | 22 | 32 | 
| 3 | 12 | 23 | 33 | 
| 4 | 13 | 24 | 34 | 
| 5 | 14 | 25 | 35 | 
| 1 | 15 | 21 | 31 | 
| id | variable | value | 
|---|---|---|
| 1 | var_1 | 10 | 
| 2 | var_1 | 11 | 
| 3 | var_1 | 12 | 
| 4 | var_1 | 13 | 
| 5 | var_1 | 14 | 
| 1 | var_1 | 15 | 
| 1 | var_2 | 21 | 
| 2 | var_2 | 22 | 
| 3 | var_2 | 23 | 
| 4 | var_2 | 24 | 
| 5 | var_2 | 25 | 
| 1 | var_2 | 21 | 
| 1 | var_3 | 31 | 
| 2 | var_3 | 32 | 
| 3 | var_3 | 33 | 
| 4 | var_3 | 34 | 
| 5 | var_3 | 35 | 
| 1 | var_3 | 31 | 
Reshape Long to Wide
| id | day | value | 
|---|---|---|
| A | 1 | 1 | 
| A | 2 | 2 | 
| A | 3 | 3 | 
| B | 1 | 4 | 
| B | 2 | 5 | 
| B | 3 | 6 | 
[1] “data.table” “data.frame”
| id | 1 | 2 | 3 | 
|---|---|---|---|
| A | 1 | 2 | 3 | 
| B | 4 | 5 | 6 | 
Merge Multiple Tables
t1 <- data.table(id = 1:10, var1 = rnorm(10, 1))
t2 <- data.table(id = 1:10, var2 = rnorm(10, 10))
t3 <- data.table(id = 1:10, var3 = rnorm(10, 100))
t4 <- data.table(id = 1:10, var4 = rnorm(10, 1000))
setkey(t1, id)
setkey(t2, id)
setkey(t3, id)
setkey(t4, id)
t <- Reduce(function(...) merge(..., all = TRUE)
            , list(t1, t2, t3, t4))
t %>% prt(caption = "Merge Multiple Tables by Key")Date and Time
General
as.POSIXct(
    data$guidewire_datetime
  , format='%Y-%m-%d %H:%M'
  , tz='GMT')
as.Date(data$diag_ecg_date_time, format='%Y-%m-%d')
build_date <- function(year, month, date){
    as.Date(ISOdate(year, month, date))
}
get_datetime <- function(text){
    as.POSIXct(
        trimws(text)
      , format = "%Y-%m-%d %H:%M"
      , tz = "GMT"
      , origin = '1970-01-01'
    )
}
get_ed_arrival <- function(text){
    as.POSIXct(
        trimws(text)
      , format = "%m/%d/%y %H%M"
      , tz = "GMT"
      , origin = '1970-01-01'
    )
}
tf <- tf[
  , datetime_ed_arrival := get_ed_arrival(ed_arrival_time_str)
][, year_ed_arrival := format(datetime_ed_arrival, "%Y")
  ][, month_ed_arrival := format(datetime_ed_arrival, "%m")
  ][, yearmonth_ed_arrival := format(datetime_ed_arrival, "%Y%m")
    ][, weekdays_ed_arrival := weekdays(datetime_ed_arrival)
      ][, weekdays_ed_arrival := factor(
              weekdays_ed_arrival
            , levels = c(
                  "Sunday"
                , "Monday"
                , "Tuesday"
                , "Wednesday"
                , "Thursday"
                , "Friday"
                , "Saturday"
              )
          )
        ][, weekdays_ed_arrival_n := as.numeric(weekdays_ed_arrival)
          ][, hour_ed_arrival := data.table::hour(datetime_ed_arrival)
          ][, minute_ed_arrival := data.table::minute(datetime_ed_arrival)
            ][, hour_ed_arrival_c := strftime(datetime_ed_arrival, format = "%H", tz = "GMT")
              ][, minute_ed_arrival_c := strftime(datetime_ed_arrival, format = "%M", tz = "GMT")][, hour_ed_arrival_n := hour_ed_arrival + minute_ed_arrival/60]
tf <- tf[, flag_bussiness_hours := case_when(
               is.na(hour_ed_arrival) ~ as.character(NA)
             , hour_ed_arrival < 8 ~ "Non-Business Hours"
             , hour_ed_arrival == 17 & minute_ed_arrival >= 1 ~ "Non-Business Hours"
             , hour_ed_arrival > 17 ~ "Non-Business Hours"
             , TRUE ~ "Business Hours"
           )
         ][, flag_bussiness_hours := factor(
                 flag_bussiness_hours
               , levels = c("Business Hours", "Non-Business Hours")
             )]
tf <- tf[
  , flag_weekday := case_when(
        weekdays_ed_arrival %in% c("Tuesday", "Wednesday", "Thursday") ~ "Weekday"
      , weekdays_ed_arrival %in% c("Saturday", "Sunday") ~ "Weekend"
      , weekdays_ed_arrival %in% c("Friday") & hour_ed_arrival == 17 & minute_ed_arrival >= 1  ~ "Weekend"
      , weekdays_ed_arrival %in% c("Friday") & hour_ed_arrival > 17  ~ "Weekend"
      , weekdays_ed_arrival %in% c("Friday") ~ "Weekday"
      , weekdays_ed_arrival %in% c("Monday") & hour_ed_arrival == 7 & minute_ed_arrival <= 59 ~ "Weekend"
      , weekdays_ed_arrival %in% c("Monday") & hour_ed_arrival < 7 ~ "Weekend"
      , weekdays_ed_arrival %in% c("Monday") ~ "Weekday"
      , TRUE ~ as.character(NA)
    )
][, flag_weekday := factor(flag_weekday, levels = c("Weekday", "Weekend"))
  ][]
fp <- fp[
    , datetime_proc_start := get_datetime(proc_start_time_str)
][, year_proc_start := format(datetime_proc_start, "%Y")
  ][, yearmonth_proc_start := format(datetime_proc_start, "%Y%m")]
vs4 <- vs4[
  , ed_arrival_to_pas_discharge := as.numeric(difftime(
        datetime_pas_discharge
      , datetime_ed_arrival
      , units = "hours"
    ))
]
vs4 <- vs4[
    , date_ed_arrival := date(datetime_ed_arrival)
][, age_in_years := as.numeric(date_ed_arrival - dob)/365.25
  ][, age_group := case_when(
          age_in_years < 18 ~ "< 18"
        , age_in_years < 65 ~ "18-64"
        , age_in_years >= 65 ~ "65 +"
        , TRUE ~ as.character(NA)
      )
    ]Sequential Date
[1] “2019-07-01” “2019-07-02” “2019-07-03” “2019-07-04” “2019-07-05” [6] “2019-07-06” “2019-07-07” “2019-07-08” “2019-07-09” “2019-07-10”
[1] “2019-07-01 00:00:00” “2019-07-02 00:00:00” “2019-07-03 00:00:00” [4] “2019-07-04 00:00:00” “2019-07-05 00:00:00” “2019-07-06 00:00:00” [7] “2019-07-07 00:00:00” “2019-07-08 00:00:00” “2019-07-09 00:00:00” [10] “2019-07-10 00:00:00”
[1] “POSIXct” “POSIXt”
[1] “2019-07-01” “2019-07-02” “2019-07-03” “2019-07-04” “2019-07-05” [6] “2019-07-06” “2019-07-07” “2019-07-08” “2019-07-09” “2019-07-10”
[1] 0 0 0 0 0 0 0 0 0 0
Scale/One-hot encoding/Dummy Variable
Scale and Unscale
unscale <- function(x){
    x * attr(x, "scaled:scale") + attr(x, "scaled:center")
}
v <- c(1:10, NA)
vs <- scale(v)
vs2 <- (v - mean(v)) / sd(v)
v2 <- unscale(vs)
print(cbind(v, vs, vs2, v2))v            vs2   
[1,] 1 -1.4863011 NA 1 [2,] 2 -1.1560120 NA 2 [3,] 3 -0.8257228 NA 3 [4,] 4 -0.4954337 NA 4 [5,] 5 -0.1651446 NA 5 [6,] 6 0.1651446 NA 6 [7,] 7 0.4954337 NA 7 [8,] 8 0.8257228 NA 8 [9,] 9 1.1560120 NA 9 [10,] 10 1.4863011 NA 10 [11,] NA NA NA NA
num [1:11, 1] -1.486 -1.156 -0.826 -0.495 -0.165 … - attr(, “scaled:center”)= num 5.5 - attr(, “scaled:scale”)= num 3.03
[1] 5.5
[1] 3.02765
One-Hot Encoding
Wu::one_hot
- Single impute numeric variables as median and add a column indicating missing values
 - Add NA level to categorical variables
 
library(Wu)
set.seed(1234)
t <- data.table(var1=factor(sample(c("A", "B", "C", NA), 10, replace=TRUE), levels=c("A", "B", "C"))
              , var2=sample(c(1:10, rep(NA, 5)), 10)
              , var3=factor(rep(c("Red", "Green"), each=5), levels=c("Red", "Green", "Yellow"))
                )
t %>% prt()| var1 | var2 | var3 | 
|---|---|---|
| NA | 4 | Red | 
| NA | 2 | Red | 
| B | 7 | Red | 
| B | 6 | Red | 
| A | 10 | Red | 
| NA | NA | Green | 
| C | NA | Green | 
| A | NA | Green | 
| A | 8 | Green | 
| B | NA | Green | 
| var2 | var2_notA | var1-A | var1-B | var1-C | var3-Red | 
|---|---|---|---|---|---|
| 4.0 | 0 | 0 | 0 | 0 | 1 | 
| 2.0 | 0 | 0 | 0 | 0 | 1 | 
| 7.0 | 0 | 0 | 1 | 0 | 1 | 
| 6.0 | 0 | 0 | 1 | 0 | 1 | 
| 10.0 | 0 | 1 | 0 | 0 | 1 | 
| 6.5 | 1 | 0 | 0 | 1 | 0 | 
| 6.5 | 1 | 1 | 0 | 0 | 0 | 
| 6.5 | 1 | 1 | 0 | 0 | 0 | 
| 8.0 | 0 | 0 | 1 | 0 | 0 | 
| 6.5 | 1 | 0 | 0 | 0 | 1 | 
mltools::one_hot
- Only work on factors, not characters;
 - DEFAULT = “auto” encodes all unordered factor columns
 
library(data.table)
t <- data.table(var1=factor(sample(c("A", "B", "C", NA), 10, replace=TRUE), levels=c("A", "B", "C"))
              , var2=1:10
                , var3=factor(rep(c("Red", "Green"), each=5), levels=c("Red", "Green", "Yellow"))
                )
print(t)var1 var2 var3 1: 
var1_A var1_B var1_C var2 var3_Red var3_Green var3_Yellow 1: NA NA NA 1 1 0 0 2: NA NA NA 2 1 0 0 3: 1 0 0 3 1 0 0 4: NA NA NA 4 1 0 0 5: NA NA NA 5 1 0 0 6: NA NA NA 6 0 1 0 7: 0 0 1 7 0 1 0 8: NA NA NA 8 0 1 0 9: 0 0 1 9 0 1 0 10: 0 0 1 10 0 1 0
var1_NA var1_A var1_B var1_C var2 var3_Red var3_Green var3_Yellow 1: 1 0 0 0 1 1 0 0 2: 1 0 0 0 2 1 0 0 3: 0 1 0 0 3 1 0 0 4: 1 0 0 0 4 1 0 0 5: 1 0 0 0 5 1 0 0 6: 1 0 0 0 6 0 1 0 7: 0 0 0 1 7 0 1 0 8: 1 0 0 0 8 0 1 0 9: 0 0 0 1 9 0 1 0 10: 0 0 0 1 10 0 1 0
var1 var1_NA var1_A var1_B var1_C var2 var3 var3_Red var3_Green 1: 
var1 var1_NA var1_A var1_C var2 var3 var3_Red var3_Green 1: 
caret::dummyVars
- It converts all factor and character variables
 - fullRank remove referral level
 - Cannot drop unused levels
 
library(caret)
set.seed(1234)
t <- data.table(var1=factor(sample(c("A", "B", "C", NA), 10, replace=TRUE), levels=c("A", "B", "C"))
              , var2=1:10
              , var3=factor(rep(c("Red", "Green"), each=5), levels=c("Red", "Green", "Yellow"))
              , var4=rep(c("Low", "High"), 5)
                )
t %>% prt()| var1 | var2 | var3 | var4 | 
|---|---|---|---|
| NA | 1 | Red | Low | 
| NA | 2 | Red | High | 
| B | 3 | Red | Low | 
| B | 4 | Red | High | 
| A | 5 | Red | Low | 
| NA | 6 | Green | High | 
| C | 7 | Green | Low | 
| A | 8 | Green | High | 
| A | 9 | Green | Low | 
| B | 10 | Green | High | 
dummy <- dummyVars(" ~ .", data=t, fullRank = TRUE)
tdummy <- data.frame(predict(dummy, t))
tdummy %>% prt()| var1.B | var1.C | var2 | var3.Green | var3.Yellow | var4Low | 
|---|---|---|---|---|---|
| NA | NA | 1 | 0 | 0 | 1 | 
| NA | NA | 2 | 0 | 0 | 0 | 
| 1 | 0 | 3 | 0 | 0 | 1 | 
| 1 | 0 | 4 | 0 | 0 | 0 | 
| 0 | 0 | 5 | 0 | 0 | 1 | 
| NA | NA | 6 | 1 | 0 | 0 | 
| 0 | 1 | 7 | 1 | 0 | 1 | 
| 0 | 0 | 8 | 1 | 0 | 0 | 
| 0 | 0 | 9 | 1 | 0 | 1 | 
| 1 | 0 | 10 | 1 | 0 | 0 | 
model.matrix
var1A var1B var1C var2 var3Green var3Yellow var4Low 3 0 1 0 3 0 0 1 4 0 1 0 4 0 0 0 5 1 0 0 5 0 0 1 7 0 0 1 7 1 0 1 8 1 0 0 8 1 0 0 9 1 0 0 9 1 0 1 10 0 1 0 10 1 0 0 attr(,“assign”) [1] 1 1 1 2 3 3 4 attr(,“contrasts”) attr(,“contrasts”)$var1 [1] “contr.treatment”
attr(,“contrasts”)$var3 [1] “contr.treatment”
attr(,“contrasts”)$var4 [1] “contr.treatment”
Missing Values
- Hot Deck method: a missing value was imputed from a randomly selected from similar record. Cards that are “hot” is currently being processed. Last observation carried forward (LOCF) is a kind of hot-desk imputation.
 - Cold-deck: impute data from donors from another dataset.
 - pmm: predictive mean matching
 
mice package
| Ozone | Solar.R | Wind | Temp | Month | Day | 
|---|---|---|---|---|---|
| 41 | 190 | 7.4 | NA | 5 | 1 | 
| 36 | 118 | 8.0 | NA | 5 | 2 | 
| 12 | 149 | 12.6 | NA | 5 | 3 | 
| 18 | 313 | NA | NA | 5 | 4 | 
| NA | NA | NA | NA | 5 | 5 | 
| 28 | NA | NA | 66 | 5 | 6 | 
| 23 | 299 | NA | 65 | 5 | 7 | 
| 19 | 99 | NA | 59 | 5 | 8 | 
| 8 | 19 | NA | 61 | 5 | 9 | 
| NA | 194 | NA | 69 | 5 | 10 | 
| 7 | NA | 6.9 | 74 | 5 | 11 | 
| 16 | 256 | 9.7 | 69 | 5 | 12 | 
| 11 | 290 | 9.2 | 66 | 5 | 13 | 
| 14 | 274 | 10.9 | 68 | 5 | 14 | 
| 18 | 65 | 13.2 | 58 | 5 | 15 | 
| 14 | 334 | 11.5 | 64 | 5 | 16 | 
| 34 | 307 | 12.0 | 66 | 5 | 17 | 
| 6 | 78 | 18.4 | 57 | 5 | 18 | 
| 30 | 322 | 11.5 | 68 | 5 | 19 | 
| 11 | 44 | 9.7 | 62 | 5 | 20 | 
| 1 | 8 | 9.7 | 59 | 5 | 21 | 
| 11 | 320 | 16.6 | 73 | 5 | 22 | 
| 4 | 25 | 9.7 | 61 | 5 | 23 | 
| 32 | 92 | 12.0 | 61 | 5 | 24 | 
| NA | 66 | 16.6 | 57 | 5 | 25 | 
| NA | 266 | 14.9 | 58 | 5 | 26 | 
| NA | NA | 8.0 | 57 | 5 | 27 | 
| 23 | 13 | 12.0 | 67 | 5 | 28 | 
| 45 | 252 | 14.9 | 81 | 5 | 29 | 
| 115 | 223 | 5.7 | 79 | 5 | 30 | 
| 37 | 279 | 7.4 | 76 | 5 | 31 | 
| NA | 286 | 8.6 | 78 | 6 | 1 | 
| NA | 287 | 9.7 | 74 | 6 | 2 | 
| NA | 242 | 16.1 | 67 | 6 | 3 | 
| NA | 186 | 9.2 | 84 | 6 | 4 | 
| NA | 220 | 8.6 | 85 | 6 | 5 | 
| NA | 264 | 14.3 | 79 | 6 | 6 | 
| 29 | 127 | 9.7 | 82 | 6 | 7 | 
| NA | 273 | 6.9 | 87 | 6 | 8 | 
| 71 | 291 | 13.8 | 90 | 6 | 9 | 
| 39 | 323 | 11.5 | 87 | 6 | 10 | 
| NA | 259 | 10.9 | 93 | 6 | 11 | 
| NA | 250 | 9.2 | 92 | 6 | 12 | 
| 23 | 148 | 8.0 | 82 | 6 | 13 | 
| NA | 332 | 13.8 | 80 | 6 | 14 | 
| NA | 322 | 11.5 | 79 | 6 | 15 | 
| 21 | 191 | 14.9 | 77 | 6 | 16 | 
| 37 | 284 | 20.7 | 72 | 6 | 17 | 
| 20 | 37 | 9.2 | 65 | 6 | 18 | 
| 12 | 120 | 11.5 | 73 | 6 | 19 | 
| 13 | 137 | 10.3 | 76 | 6 | 20 | 
| NA | 150 | 6.3 | 77 | 6 | 21 | 
| NA | 59 | 1.7 | 76 | 6 | 22 | 
| NA | 91 | 4.6 | 76 | 6 | 23 | 
| NA | 250 | 6.3 | 76 | 6 | 24 | 
| NA | 135 | 8.0 | 75 | 6 | 25 | 
| NA | 127 | 8.0 | 78 | 6 | 26 | 
| NA | 47 | 10.3 | 73 | 6 | 27 | 
| NA | 98 | 11.5 | 80 | 6 | 28 | 
| NA | 31 | 14.9 | 77 | 6 | 29 | 
| NA | 138 | 8.0 | 83 | 6 | 30 | 
| 135 | 269 | 4.1 | 84 | 7 | 1 | 
| 49 | 248 | 9.2 | 85 | 7 | 2 | 
| 32 | 236 | 9.2 | 81 | 7 | 3 | 
| NA | 101 | 10.9 | 84 | 7 | 4 | 
| 64 | 175 | 4.6 | 83 | 7 | 5 | 
| 40 | 314 | 10.9 | 83 | 7 | 6 | 
| 77 | 276 | 5.1 | 88 | 7 | 7 | 
| 97 | 267 | 6.3 | 92 | 7 | 8 | 
| 97 | 272 | 5.7 | 92 | 7 | 9 | 
| 85 | 175 | 7.4 | 89 | 7 | 10 | 
| NA | 139 | 8.6 | 82 | 7 | 11 | 
| 10 | 264 | 14.3 | 73 | 7 | 12 | 
| 27 | 175 | 14.9 | 81 | 7 | 13 | 
| NA | 291 | 14.9 | 91 | 7 | 14 | 
| 7 | 48 | 14.3 | 80 | 7 | 15 | 
| 48 | 260 | 6.9 | 81 | 7 | 16 | 
| 35 | 274 | 10.3 | 82 | 7 | 17 | 
| 61 | 285 | 6.3 | 84 | 7 | 18 | 
| 79 | 187 | 5.1 | 87 | 7 | 19 | 
| 63 | 220 | 11.5 | 85 | 7 | 20 | 
| 16 | 7 | 6.9 | 74 | 7 | 21 | 
| NA | 258 | 9.7 | 81 | 7 | 22 | 
| NA | 295 | 11.5 | 82 | 7 | 23 | 
| 80 | 294 | 8.6 | 86 | 7 | 24 | 
| 108 | 223 | 8.0 | 85 | 7 | 25 | 
| 20 | 81 | 8.6 | 82 | 7 | 26 | 
| 52 | 82 | 12.0 | 86 | 7 | 27 | 
| 82 | 213 | 7.4 | 88 | 7 | 28 | 
| 50 | 275 | 7.4 | 86 | 7 | 29 | 
| 64 | 253 | 7.4 | 83 | 7 | 30 | 
| 59 | 254 | 9.2 | 81 | 7 | 31 | 
| 39 | 83 | 6.9 | 81 | 8 | 1 | 
| 9 | 24 | 13.8 | 81 | 8 | 2 | 
| 16 | 77 | 7.4 | 82 | 8 | 3 | 
| 78 | NA | 6.9 | 86 | 8 | 4 | 
| 35 | NA | 7.4 | 85 | 8 | 5 | 
| 66 | NA | 4.6 | 87 | 8 | 6 | 
| 122 | 255 | 4.0 | 89 | 8 | 7 | 
| 89 | 229 | 10.3 | 90 | 8 | 8 | 
| 110 | 207 | 8.0 | 90 | 8 | 9 | 
| NA | 222 | 8.6 | 92 | 8 | 10 | 
| NA | 137 | 11.5 | 86 | 8 | 11 | 
| 44 | 192 | 11.5 | 86 | 8 | 12 | 
| 28 | 273 | 11.5 | 82 | 8 | 13 | 
| 65 | 157 | 9.7 | 80 | 8 | 14 | 
| NA | 64 | 11.5 | 79 | 8 | 15 | 
| 22 | 71 | 10.3 | 77 | 8 | 16 | 
| 59 | 51 | 6.3 | 79 | 8 | 17 | 
| 23 | 115 | 7.4 | 76 | 8 | 18 | 
| 31 | 244 | 10.9 | 78 | 8 | 19 | 
| 44 | 190 | 10.3 | 78 | 8 | 20 | 
| 21 | 259 | 15.5 | 77 | 8 | 21 | 
| 9 | 36 | 14.3 | 72 | 8 | 22 | 
| NA | 255 | 12.6 | 75 | 8 | 23 | 
| 45 | 212 | 9.7 | 79 | 8 | 24 | 
| 168 | 238 | 3.4 | 81 | 8 | 25 | 
| 73 | 215 | 8.0 | 86 | 8 | 26 | 
| NA | 153 | 5.7 | 88 | 8 | 27 | 
| 76 | 203 | 9.7 | 97 | 8 | 28 | 
| 118 | 225 | 2.3 | 94 | 8 | 29 | 
| 84 | 237 | 6.3 | 96 | 8 | 30 | 
| 85 | 188 | 6.3 | 94 | 8 | 31 | 
| 96 | 167 | 6.9 | 91 | 9 | 1 | 
| 78 | 197 | 5.1 | 92 | 9 | 2 | 
| 73 | 183 | 2.8 | 93 | 9 | 3 | 
| 91 | 189 | 4.6 | 93 | 9 | 4 | 
| 47 | 95 | 7.4 | 87 | 9 | 5 | 
| 32 | 92 | 15.5 | 84 | 9 | 6 | 
| 20 | 252 | 10.9 | 80 | 9 | 7 | 
| 23 | 220 | 10.3 | 78 | 9 | 8 | 
| 21 | 230 | 10.9 | 75 | 9 | 9 | 
| 24 | 259 | 9.7 | 73 | 9 | 10 | 
| 44 | 236 | 14.9 | 81 | 9 | 11 | 
| 21 | 259 | 15.5 | 76 | 9 | 12 | 
| 28 | 238 | 6.3 | 77 | 9 | 13 | 
| 9 | 24 | 10.9 | 71 | 9 | 14 | 
| 13 | 112 | 11.5 | 71 | 9 | 15 | 
| 46 | 237 | 6.9 | 78 | 9 | 16 | 
| 18 | 224 | 13.8 | 67 | 9 | 17 | 
| 13 | 27 | 10.3 | 76 | 9 | 18 | 
| 24 | 238 | 10.3 | 68 | 9 | 19 | 
| 16 | 201 | 8.0 | 82 | 9 | 20 | 
| 13 | 238 | 12.6 | 64 | 9 | 21 | 
| 23 | 14 | 9.2 | 71 | 9 | 22 | 
| 36 | 139 | 10.3 | 81 | 9 | 23 | 
| 7 | 49 | 10.3 | 69 | 9 | 24 | 
| 14 | 20 | 16.6 | 63 | 9 | 25 | 
| 30 | 193 | 6.9 | 70 | 9 | 26 | 
| NA | 145 | 13.2 | 77 | 9 | 27 | 
| 14 | 191 | 14.3 | 75 | 9 | 28 | 
| 18 | 131 | 8.0 | 76 | 9 | 29 | 
| 20 | 223 | 11.5 | 68 | 9 | 30 | 
Month Day Temp Solar.R Wind Ozone
104 1 1 1 1 1 1 0 34 1 1 1 1 1 0 1 3 1 1 1 1 0 1 1 1 1 1 1 1 0 0 2 4 1 1 1 0 1 1 1 1 1 1 1 0 1 0 2 1 1 1 1 0 0 1 2 3 1 1 0 1 1 1 1 1 1 1 0 1 0 1 2 1 1 1 0 0 0 0 4 0 0 5 7 7 37 56
 Month Day Temp Solar.R Wind Ozone
104 1 1 1 1 1 1 0 34 1 1 1 1 1 0 1 3 1 1 1 1 0 1 1 1 1 1 1 1 0 0 2 4 1 1 1 0 1 1 1 1 1 1 1 0 1 0 2 1 1 1 1 0 0 1 2 3 1 1 0 1 1 1 1 1 1 1 0 1 0 1 2 1 1 1 0 0 0 0 4 0 0 5 7 7 37 56
Multiple-Imputation
library(mice)
set.seed(1234)
t <- data.table(id=1:10
              , var1=factor(sample(c("A", "B", "C", NA), 10, replace=TRUE), levels=c("A", "B", "C"))
              , var2=sample(c(1:10, rep(NA, 5)), 10, replace = TRUE)
              , var3=factor(rep(c("Red", "Green"), each=5), levels=c("Red", "Green", "Yellow"))
              , var4=rep(c("Low", "High"), 5)
                )
t %>% prt()| id | var1 | var2 | var3 | var4 | 
|---|---|---|---|---|
| 1 | NA | 4 | Red | Low | 
| 2 | NA | 2 | Red | High | 
| 3 | B | 7 | Red | Low | 
| 4 | B | 6 | Red | High | 
| 5 | A | 10 | Red | Low | 
| 6 | NA | 6 | Green | High | 
| 7 | C | NA | Green | Low | 
| 8 | A | NA | Green | High | 
| 9 | A | 4 | Green | Low | 
| 10 | B | NA | Green | High | 
ti <- mice(t
         , m=5
         , maxit = 50
         , method = 'pmm'
         , seed = 500
         , printFlag = FALSE
           )
tic <- complete(ti, action = "all", include = TRUE)
##tic <- complete(ti, action = 2L, include = TRUE)
tic[[2]] %>% prt()| id | var1 | var2 | var3 | var4 | 
|---|---|---|---|---|
| 1 | A | 4 | Red | Low | 
| 2 | A | 2 | Red | High | 
| 3 | B | 7 | Red | Low | 
| 4 | B | 6 | Red | High | 
| 5 | A | 10 | Red | Low | 
| 6 | B | 6 | Green | High | 
| 7 | C | 6 | Green | Low | 
| 8 | A | 6 | Green | High | 
| 9 | A | 4 | Green | Low | 
| 10 | B | 6 | Green | High | 
VIM package
library(VIM)
t <- data.table(id=1:10
                , var1=factor(sample(c("A", "B", "C", NA), 10, replace=TRUE), levels=c("A", "B", "C"))
              , var2=sample(c(1:10, NA), 10, replace = TRUE)
              , var3=factor(rep(c("Red", "Green"), each=5), levels=c("Red", "Green", "Yellow"))
              , var4=rep(c("Low", "High"), 5)
                )
matrixplot(t, sortby = c('id'))Computing Environment
R version 4.2.0 (2022-04-22) Platform: x86_64-pc-linux-gnu (64-bit) Running under: Ubuntu 20.04.3 LTS
Matrix products: default BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0 LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
locale: [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
[4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
[7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
[10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
attached base packages: [1] grid stats graphics grDevices utils datasets methods
[8] base
other attached packages: [1] VIM_6.2.2 colorspace_2.1-0 mice_3.15.0
[4] caret_6.0-93 lattice_0.20-45 matrixStats_0.63.0 [7] zoo_1.8-11 Wu_0.0.0.9000 flexdashboard_0.6.1 [10] lme4_1.1-31 Matrix_1.5-3 mgcv_1.8-38
[13] nlme_3.1-152 png_0.1-8 scales_1.2.1
[16] nnet_7.3-16 labelled_2.10.0 kableExtra_1.3.4
[19] plotly_4.10.1 gridExtra_2.3 ggplot2_3.4.1
[22] DT_0.27 tableone_0.13.2 magrittr_2.0.3
[25] lubridate_1.9.2 dplyr_1.1.0 plyr_1.8.8
[28] data.table_1.14.8 rmdformats_1.0.4 knitr_1.42
loaded via a namespace (and not attached): [1] minqa_1.2.5 ellipsis_0.3.2 class_7.3-19
[4] proxy_0.4-27 rstudioapi_0.14 listenv_0.9.0
[7] prodlim_2019.11.13 fansi_1.0.4 ranger_0.14.1
[10] xml2_1.3.3 codetools_0.2-18 splines_4.2.0
[13] robustbase_0.95-0 cachem_1.0.6 jsonlite_1.8.4
[16] nloptr_2.0.3 pROC_1.18.0 broom_1.0.3
[19] compiler_4.2.0 httr_1.4.5 backports_1.4.1
[22] fastmap_1.1.0 lazyeval_0.2.2 survey_4.1-1
[25] cli_3.6.0 htmltools_0.5.4 tools_4.2.0
[28] gtable_0.3.1 glue_1.6.2 reshape2_1.4.4
[31] mltools_0.3.5 Rcpp_1.0.10 carData_3.0-5
[34] jquerylib_0.1.4 vctrs_0.5.2 svglite_2.1.1
[37] iterators_1.0.14 lmtest_0.9-40 timeDate_4022.108
[40] laeken_0.5.2 gower_1.0.1 xfun_0.37
[43] stringr_1.5.0 globals_0.16.2 rvest_1.0.3
[46] timechange_0.2.0 lifecycle_1.0.3 future_1.31.0
[49] DEoptimR_1.0-11 MASS_7.3-54 ipred_0.9-13
[52] hms_1.1.2 parallel_4.2.0 yaml_2.3.7
[55] sass_0.4.5 rpart_4.1-15 stringi_1.7.12
[58] highr_0.9 foreach_1.5.2 e1071_1.7-13
[61] hardhat_1.2.0 boot_1.3-28 lava_1.7.2.1
[64] rlang_1.0.6 pkgconfig_2.0.3 systemfonts_1.0.4
[67] evaluate_0.20 purrr_1.0.1 recipes_1.0.5
[70] htmlwidgets_1.5.4 tidyselect_1.2.0 parallelly_1.34.0
[73] bookdown_0.32 R6_2.5.1 generics_0.1.3
[76] DBI_1.1.3 pillar_1.8.1 haven_2.5.2
[79] withr_2.5.0 abind_1.4-5 sp_1.6-0
[82] survival_3.2-13 tibble_3.1.8 future.apply_1.10.0 [85] car_3.1-1 utf8_1.2.2 rmarkdown_2.20
[88] forcats_1.0.0 ModelMetrics_1.2.2.2 vcd_1.4-11
[91] digest_0.6.29 webshot_0.5.4 tidyr_1.3.0
[94] stats4_4.2.0 munsell_0.5.0 viridisLite_0.4.1
[97] bslib_0.4.2 mitools_2.4