R Data Manipulation with data.table
Read Data
Pull Data from REDCap
- Pull data from REDCap using API and save the csv file
Pull Data from MySQL
General Data Description
- Create a HTML file the describe all variables in a dataset
Rename Variables & Revalue A Vector
- match function matches the first vector to the second vector, gives the indexes of elements from vector one that matched in vector two
- dplyr::rename function renames variable names
x |
---|
NA |
2 |
NA |
1 |
NA |
x |
---|
A |
B |
colnames(dt) <- plyr::revalue(colnames(dt), c("A"="Variable_A"))
colnames(dt) %>% prt(caption = "Column Names Changed")
x |
---|
Variable_A |
B |
## dplyr::rename(dt, var_b=B)
lst <- list("A"="Variable_A")
## names(lst)
## unlist(lst)
colnames(dt)[colnames(dt) %in% names(lst)] <- unlist(lst)[colnames(dt)[colnames(dt) %in% names(lst)]]
v <- c("A", "B", "C")
lst <- c(
"A"="A-"
, "B"
, "C"="C+"
)
plyr::revalue(v, lst) %>% prt(caption = "plyr::revalue function")
x |
---|
A- |
B |
C+ |
x |
---|
A- |
B |
C+ |
## class(v)
dt <- data.table(var1 = 1, var2 = 2, var3 = 3)
Wu::renameVariables(obj = dt, lst = c(var1 = "varA", var2 = "varB")) %>% prt()
varA | varB | var3 |
---|---|---|
1 | 2 | 3 |
Regular Expression
Fill Missing Values
Replace elements in a vector by a specific value
[1] 1 0 3 0 5 6 7 8 0 10
Index of observations
dt <- data.table(
group=c("A", "A", "B", "B", "B", "C", "C", "C", "C")
, var1=1:9
)
dt <- dt[, index_i := .I
][, index_n := 1:.N
][, index_i_group := .I, by = .(group)
][, index_n_group := 1:.N, by = .(group)]
dt %>% prt()
group | var1 | index_i | index_n | index_i_group | index_n_group |
---|---|---|---|---|---|
A | 1 | 1 | 1 | 1 | 1 |
A | 2 | 2 | 2 | 2 | 2 |
B | 3 | 3 | 3 | 3 | 1 |
B | 4 | 4 | 4 | 4 | 2 |
B | 5 | 5 | 5 | 5 | 3 |
C | 6 | 6 | 6 | 6 | 1 |
C | 7 | 7 | 7 | 7 | 2 |
C | 8 | 8 | 8 | 8 | 3 |
C | 9 | 9 | 9 | 9 | 4 |
Last Observation Carry Forward (LOCF)
- data.table::nafill funtion only works for numeric variables
- Workaround for character variables by filling indexes of character variables instead https://github.com/Rdatatable/data.table/issues/3992#issuecomment-546295277
- type=“nocb”: Next Observation Carry Backward
- type=“const”: fill with constant
dt <- data.table(
n1=c(1, NA, 2, NA, NA, 3, 3, NA)
, c1=c("A", "A", "B", NA, "C", NA, NA, NA)
)
dt <- dt[
, n1_locf := nafill(n1, type="locf")
][, c1_locf := c1[nafill(replace(.I, is.na(c1), NA), type="locf")]
][, index := .I]
dt[, .(index, n1, n1_locf, c1, c1_locf)] %>% prt()
index | n1 | n1_locf | c1 | c1_locf |
---|---|---|---|---|
1 | 1 | 1 | A | A |
2 | NA | 1 | A | A |
3 | 2 | 2 | B | B |
4 | NA | 2 | NA | B |
5 | NA | 2 | C | C |
6 | 3 | 3 | NA | C |
7 | 3 | 3 | NA | C |
8 | NA | 3 | NA | C |
- More detailed process for character variables
Data Functions
Sum Multiple Columns
is.1 <- function(x){x %in% c(1)}
sum.1 <- function(x){sum(as.numeric(is.1(x)))}
dm <- dm[, race_sum := sum(.SD)
, .SDcols = paste0("race___", as.character(1:6))
, by = 1:nrow(dm)]
dm <- dm[, conditions_num := sum(is.1(hypertension)
, is.1(diabetes)
, is.1(heart_disease)
, is.1(renal_disease)
, is.1(malignancies)
, is.1(respiratory_disease)
)
, by = 1:nrow(dm)
]
Rolling Average
library(zoo)
dt <- data.table(series = c(1:10, NA, 11:21))
dt <- dt[, mv2 := rollmean(series, k = 2, align = "right", na.pad = TRUE)
][, mv3 := rollmean(series, k = 3, align = "right", na.pad = TRUE)]
dt %>% prt()
series | mv2 | mv3 |
---|---|---|
1 | NA | NA |
2 | 1.5 | NA |
3 | 2.5 | 2 |
4 | 3.5 | 3 |
5 | 4.5 | 4 |
6 | 5.5 | 5 |
7 | 6.5 | 6 |
8 | 7.5 | 7 |
9 | 8.5 | 8 |
10 | 9.5 | 9 |
NA | NA | NA |
11 | NA | NA |
12 | 11.5 | NA |
13 | 12.5 | 12 |
14 | 13.5 | 13 |
15 | 14.5 | 14 |
16 | 15.5 | 15 |
17 | 16.5 | 16 |
18 | 17.5 | 17 |
19 | 18.5 | 18 |
20 | 19.5 | 19 |
21 | 20.5 | 20 |
Weighted Rolling Average
dt <- data.table(series = c(1:10, NA, 11:21))
n <- 2
wts <- (1:n) / sum(1:n)
wm <- function(x){
invisible(weighted.mean(x = x, w = wts))
}
dt <- dt[, mvw2 := frollapply(seq_len(.N)
, FUN = function(ind) wm(series[ind])
, n = 2
, align = "left"
)]
dt %>% prt()
series | mvw2 |
---|---|
1 | 1.666667 |
2 | 2.666667 |
3 | 3.666667 |
4 | 4.666667 |
5 | 5.666667 |
6 | 6.666667 |
7 | 7.666667 |
8 | 8.666667 |
9 | 9.666667 |
10 | NA |
NA | NA |
11 | 11.666667 |
12 | 12.666667 |
13 | 13.666667 |
14 | 14.666667 |
15 | 15.666667 |
16 | 16.666667 |
17 | 17.666667 |
18 | 18.666667 |
19 | 19.666667 |
20 | 20.666667 |
21 | NA |
Weighted Rolling Median
dt <- data.table(var1 = c(1:10, NA, 11:21))
library(matrixStats)
n <- 4
wts <- c(0.1, 1, 1, 1)
wts <- wts / sum(wts)
wmd <- function(x){
invisible(matrixStats::weightedMedian(x = x, w = wts))
}
dt <- dt[, mvwmd3 := frollapply(seq_len(.N)
, FUN = function(ind) wmd(var1[ind])
, n = 4
, align = "left"
)]
dt %>% prt()
var1 | mvwmd3 |
---|---|
1 | 2.95 |
2 | 3.95 |
3 | 4.95 |
4 | 5.95 |
5 | 6.95 |
6 | 7.95 |
7 | 8.95 |
8 | NA |
9 | NA |
10 | NA |
NA | NA |
11 | 12.95 |
12 | 13.95 |
13 | 14.95 |
14 | 15.95 |
15 | 16.95 |
16 | 17.95 |
17 | 18.95 |
18 | 19.95 |
19 | NA |
20 | NA |
21 | NA |
Concatenate Multiple Rows by Group
a b ID 1: a A 1 2: b B 1 3: c C 1 4: d D 2 5: e E 2 6: f F 2 7: g G 2 8: h H 3 9: i I 3 10: j J 3
ID a 1: 1 a b c 2: 2 d e f g 3: 3 h i j
Shift Column
cf <- cf[
][, status_row_n_int := nafill(status_row_n, type = "locf"), by = list(mrn)
][, fio2_int := nafill(fio2_row_filled, type = "locf"), by = list(mrn)
][, datetime_record_nx1 := shift(datetime_record, type="lead"), by = list(mrn)
][, time_int := as.numeric(difftime(datetime_record_nx1, datetime_record, units = "hours"))
]
Permutate & combination
expand.grid(levels(dt$sex_2.factor)
, levels(dt$age_group)
, levels(dt$race_ethnicity)
, levels(dt$com_obesity)
)
combn(letters[1:4], 2)
perm_without_replacement <- function(n, r){
return(factorial(n)/factorial(n - r)/ factorial(r))
}
factorial(4)
# 9 choose 3
perm_without_replacement(9, 3) * perm_without_replacement(6, 3)
6 * 5 * 4 / (3 * 2)
Split data.frame
Replicate Rows
name |
---|
a |
b |
c |
d |
name | index |
---|---|
a | 1 |
a | 2 |
a | 3 |
b | 1 |
b | 2 |
b | 3 |
c | 1 |
c | 2 |
c | 3 |
d | 1 |
d | 2 |
d | 3 |
dt <- data.table(
id = 1:6
, num = c(3, 5, 6, 3, 4, 5)
)
dt <- dt[rep(1:.N, times = num)][, sub_index := 1:.N, by = id]
dt %>% prt()
id | num | sub_index |
---|---|---|
1 | 3 | 1 |
1 | 3 | 2 |
1 | 3 | 3 |
2 | 5 | 1 |
2 | 5 | 2 |
2 | 5 | 3 |
2 | 5 | 4 |
2 | 5 | 5 |
3 | 6 | 1 |
3 | 6 | 2 |
3 | 6 | 3 |
3 | 6 | 4 |
3 | 6 | 5 |
3 | 6 | 6 |
4 | 3 | 1 |
4 | 3 | 2 |
4 | 3 | 3 |
5 | 4 | 1 |
5 | 4 | 2 |
5 | 4 | 3 |
5 | 4 | 4 |
6 | 5 | 1 |
6 | 5 | 2 |
6 | 5 | 3 |
6 | 5 | 4 |
6 | 5 | 5 |
Generate Random Variables
- Vectorize function
- Generate random variables within data.table using values from another column
library(Wu)
library(data.table)
dt <- data.table(
id = 1:6
, m = c(1:6)
)
rnorm_vec <- function(x){
rnorm(n = 1, mean = x)
}
rnorm_vec <- Vectorize(rnorm_vec)
dt <- dt[, value := rnorm_vec(m)]
dt %>% prt()
id | m | value |
---|---|---|
1 | 1 | 0.7553553 |
2 | 2 | 1.2187691 |
3 | 3 | -0.0309352 |
4 | 4 | 5.4237023 |
5 | 5 | 4.7998467 |
6 | 6 | 3.8257029 |
Pass Column Name to data.table within Function
var1 | var2 |
---|---|
1 | A |
2 | A |
3 | B |
4 | B |
5 | C |
6 | C |
7 | D |
8 | D |
9 | E |
10 | E |
var2 |
---|
A |
A |
B |
B |
C |
C |
D |
D |
E |
E |
var2 |
---|
A |
A |
B |
B |
C |
C |
D |
D |
E |
E |
get | N |
---|---|
A | 2 |
B | 2 |
C | 2 |
D | 2 |
E | 2 |
var1 | var2 |
---|---|
1 | A+ |
2 | A+ |
3 | B+ |
4 | B+ |
5 | C+ |
6 | C+ |
7 | D+ |
8 | D+ |
9 | E+ |
10 | E+ |
pass_col <- function(clname){
## clname <- sym(clname)
print(dt[, .N, by = clname])
}
pass_col(c("var1", "var2"))
var1 var2 N 1: 1 A+ 1 2: 2 A+ 1 3: 3 B+ 1 4: 4 B+ 1 5: 5 C+ 1 6: 6 C+ 1 7: 7 D+ 1 8: 8 D+ 1 9: 9 E+ 1 10: 10 E+ 1 # Reshape
Reshape Wide to Long
id | var_1 | var_2 | var_3 |
---|---|---|---|
1 | 10 | 21 | 31 |
2 | 11 | 22 | 32 |
3 | 12 | 23 | 33 |
4 | 13 | 24 | 34 |
5 | 14 | 25 | 35 |
1 | 15 | 21 | 31 |
id | variable | value |
---|---|---|
1 | var_1 | 10 |
2 | var_1 | 11 |
3 | var_1 | 12 |
4 | var_1 | 13 |
5 | var_1 | 14 |
1 | var_1 | 15 |
1 | var_2 | 21 |
2 | var_2 | 22 |
3 | var_2 | 23 |
4 | var_2 | 24 |
5 | var_2 | 25 |
1 | var_2 | 21 |
1 | var_3 | 31 |
2 | var_3 | 32 |
3 | var_3 | 33 |
4 | var_3 | 34 |
5 | var_3 | 35 |
1 | var_3 | 31 |
Reshape Long to Wide
id | day | value |
---|---|---|
A | 1 | 1 |
A | 2 | 2 |
A | 3 | 3 |
B | 1 | 4 |
B | 2 | 5 |
B | 3 | 6 |
[1] “data.table” “data.frame”
id | 1 | 2 | 3 |
---|---|---|---|
A | 1 | 2 | 3 |
B | 4 | 5 | 6 |
Merge Multiple Tables
t1 <- data.table(id = 1:10, var1 = rnorm(10, 1))
t2 <- data.table(id = 1:10, var2 = rnorm(10, 10))
t3 <- data.table(id = 1:10, var3 = rnorm(10, 100))
t4 <- data.table(id = 1:10, var4 = rnorm(10, 1000))
setkey(t1, id)
setkey(t2, id)
setkey(t3, id)
setkey(t4, id)
t <- Reduce(function(...) merge(..., all = TRUE)
, list(t1, t2, t3, t4))
t %>% prt(caption = "Merge Multiple Tables by Key")
Date and Time
General
as.POSIXct(
data$guidewire_datetime
, format='%Y-%m-%d %H:%M'
, tz='GMT')
as.Date(data$diag_ecg_date_time, format='%Y-%m-%d')
build_date <- function(year, month, date){
as.Date(ISOdate(year, month, date))
}
get_datetime <- function(text){
as.POSIXct(
trimws(text)
, format = "%Y-%m-%d %H:%M"
, tz = "GMT"
, origin = '1970-01-01'
)
}
get_ed_arrival <- function(text){
as.POSIXct(
trimws(text)
, format = "%m/%d/%y %H%M"
, tz = "GMT"
, origin = '1970-01-01'
)
}
tf <- tf[
, datetime_ed_arrival := get_ed_arrival(ed_arrival_time_str)
][, year_ed_arrival := format(datetime_ed_arrival, "%Y")
][, month_ed_arrival := format(datetime_ed_arrival, "%m")
][, yearmonth_ed_arrival := format(datetime_ed_arrival, "%Y%m")
][, weekdays_ed_arrival := weekdays(datetime_ed_arrival)
][, weekdays_ed_arrival := factor(
weekdays_ed_arrival
, levels = c(
"Sunday"
, "Monday"
, "Tuesday"
, "Wednesday"
, "Thursday"
, "Friday"
, "Saturday"
)
)
][, weekdays_ed_arrival_n := as.numeric(weekdays_ed_arrival)
][, hour_ed_arrival := data.table::hour(datetime_ed_arrival)
][, minute_ed_arrival := data.table::minute(datetime_ed_arrival)
][, hour_ed_arrival_c := strftime(datetime_ed_arrival, format = "%H", tz = "GMT")
][, minute_ed_arrival_c := strftime(datetime_ed_arrival, format = "%M", tz = "GMT")][, hour_ed_arrival_n := hour_ed_arrival + minute_ed_arrival/60]
tf <- tf[, flag_bussiness_hours := case_when(
is.na(hour_ed_arrival) ~ as.character(NA)
, hour_ed_arrival < 8 ~ "Non-Business Hours"
, hour_ed_arrival == 17 & minute_ed_arrival >= 1 ~ "Non-Business Hours"
, hour_ed_arrival > 17 ~ "Non-Business Hours"
, TRUE ~ "Business Hours"
)
][, flag_bussiness_hours := factor(
flag_bussiness_hours
, levels = c("Business Hours", "Non-Business Hours")
)]
tf <- tf[
, flag_weekday := case_when(
weekdays_ed_arrival %in% c("Tuesday", "Wednesday", "Thursday") ~ "Weekday"
, weekdays_ed_arrival %in% c("Saturday", "Sunday") ~ "Weekend"
, weekdays_ed_arrival %in% c("Friday") & hour_ed_arrival == 17 & minute_ed_arrival >= 1 ~ "Weekend"
, weekdays_ed_arrival %in% c("Friday") & hour_ed_arrival > 17 ~ "Weekend"
, weekdays_ed_arrival %in% c("Friday") ~ "Weekday"
, weekdays_ed_arrival %in% c("Monday") & hour_ed_arrival == 7 & minute_ed_arrival <= 59 ~ "Weekend"
, weekdays_ed_arrival %in% c("Monday") & hour_ed_arrival < 7 ~ "Weekend"
, weekdays_ed_arrival %in% c("Monday") ~ "Weekday"
, TRUE ~ as.character(NA)
)
][, flag_weekday := factor(flag_weekday, levels = c("Weekday", "Weekend"))
][]
fp <- fp[
, datetime_proc_start := get_datetime(proc_start_time_str)
][, year_proc_start := format(datetime_proc_start, "%Y")
][, yearmonth_proc_start := format(datetime_proc_start, "%Y%m")]
vs4 <- vs4[
, ed_arrival_to_pas_discharge := as.numeric(difftime(
datetime_pas_discharge
, datetime_ed_arrival
, units = "hours"
))
]
vs4 <- vs4[
, date_ed_arrival := date(datetime_ed_arrival)
][, age_in_years := as.numeric(date_ed_arrival - dob)/365.25
][, age_group := case_when(
age_in_years < 18 ~ "< 18"
, age_in_years < 65 ~ "18-64"
, age_in_years >= 65 ~ "65 +"
, TRUE ~ as.character(NA)
)
]
Sequential Date
[1] “2019-07-01” “2019-07-02” “2019-07-03” “2019-07-04” “2019-07-05” [6] “2019-07-06” “2019-07-07” “2019-07-08” “2019-07-09” “2019-07-10”
[1] “2019-07-01 00:00:00” “2019-07-02 00:00:00” “2019-07-03 00:00:00” [4] “2019-07-04 00:00:00” “2019-07-05 00:00:00” “2019-07-06 00:00:00” [7] “2019-07-07 00:00:00” “2019-07-08 00:00:00” “2019-07-09 00:00:00” [10] “2019-07-10 00:00:00”
[1] “POSIXct” “POSIXt”
[1] “2019-07-01” “2019-07-02” “2019-07-03” “2019-07-04” “2019-07-05” [6] “2019-07-06” “2019-07-07” “2019-07-08” “2019-07-09” “2019-07-10”
[1] 0 0 0 0 0 0 0 0 0 0
Scale/One-hot encoding/Dummy Variable
Scale and Unscale
unscale <- function(x){
x * attr(x, "scaled:scale") + attr(x, "scaled:center")
}
v <- c(1:10, NA)
vs <- scale(v)
vs2 <- (v - mean(v)) / sd(v)
v2 <- unscale(vs)
print(cbind(v, vs, vs2, v2))
v vs2
[1,] 1 -1.4863011 NA 1 [2,] 2 -1.1560120 NA 2 [3,] 3 -0.8257228 NA 3 [4,] 4 -0.4954337 NA 4 [5,] 5 -0.1651446 NA 5 [6,] 6 0.1651446 NA 6 [7,] 7 0.4954337 NA 7 [8,] 8 0.8257228 NA 8 [9,] 9 1.1560120 NA 9 [10,] 10 1.4863011 NA 10 [11,] NA NA NA NA
num [1:11, 1] -1.486 -1.156 -0.826 -0.495 -0.165 … - attr(, “scaled:center”)= num 5.5 - attr(, “scaled:scale”)= num 3.03
[1] 5.5
[1] 3.02765
One-Hot Encoding
Wu::one_hot
- Single impute numeric variables as median and add a column indicating missing values
- Add NA level to categorical variables
library(Wu)
set.seed(1234)
t <- data.table(var1=factor(sample(c("A", "B", "C", NA), 10, replace=TRUE), levels=c("A", "B", "C"))
, var2=sample(c(1:10, rep(NA, 5)), 10)
, var3=factor(rep(c("Red", "Green"), each=5), levels=c("Red", "Green", "Yellow"))
)
t %>% prt()
var1 | var2 | var3 |
---|---|---|
NA | 4 | Red |
NA | 2 | Red |
B | 7 | Red |
B | 6 | Red |
A | 10 | Red |
NA | NA | Green |
C | NA | Green |
A | NA | Green |
A | 8 | Green |
B | NA | Green |
var2 | var2_notA | var1-A | var1-B | var1-C | var3-Red |
---|---|---|---|---|---|
4.0 | 0 | 0 | 0 | 0 | 1 |
2.0 | 0 | 0 | 0 | 0 | 1 |
7.0 | 0 | 0 | 1 | 0 | 1 |
6.0 | 0 | 0 | 1 | 0 | 1 |
10.0 | 0 | 1 | 0 | 0 | 1 |
6.5 | 1 | 0 | 0 | 1 | 0 |
6.5 | 1 | 1 | 0 | 0 | 0 |
6.5 | 1 | 1 | 0 | 0 | 0 |
8.0 | 0 | 0 | 1 | 0 | 0 |
6.5 | 1 | 0 | 0 | 0 | 1 |
mltools::one_hot
- Only work on factors, not characters;
- DEFAULT = “auto” encodes all unordered factor columns
library(data.table)
t <- data.table(var1=factor(sample(c("A", "B", "C", NA), 10, replace=TRUE), levels=c("A", "B", "C"))
, var2=1:10
, var3=factor(rep(c("Red", "Green"), each=5), levels=c("Red", "Green", "Yellow"))
)
print(t)
var1 var2 var3 1:
var1_A var1_B var1_C var2 var3_Red var3_Green var3_Yellow 1: NA NA NA 1 1 0 0 2: NA NA NA 2 1 0 0 3: 1 0 0 3 1 0 0 4: NA NA NA 4 1 0 0 5: NA NA NA 5 1 0 0 6: NA NA NA 6 0 1 0 7: 0 0 1 7 0 1 0 8: NA NA NA 8 0 1 0 9: 0 0 1 9 0 1 0 10: 0 0 1 10 0 1 0
var1_NA var1_A var1_B var1_C var2 var3_Red var3_Green var3_Yellow 1: 1 0 0 0 1 1 0 0 2: 1 0 0 0 2 1 0 0 3: 0 1 0 0 3 1 0 0 4: 1 0 0 0 4 1 0 0 5: 1 0 0 0 5 1 0 0 6: 1 0 0 0 6 0 1 0 7: 0 0 0 1 7 0 1 0 8: 1 0 0 0 8 0 1 0 9: 0 0 0 1 9 0 1 0 10: 0 0 0 1 10 0 1 0
var1 var1_NA var1_A var1_B var1_C var2 var3 var3_Red var3_Green 1:
var1 var1_NA var1_A var1_C var2 var3 var3_Red var3_Green 1:
caret::dummyVars
- It converts all factor and character variables
- fullRank remove referral level
- Cannot drop unused levels
library(caret)
set.seed(1234)
t <- data.table(var1=factor(sample(c("A", "B", "C", NA), 10, replace=TRUE), levels=c("A", "B", "C"))
, var2=1:10
, var3=factor(rep(c("Red", "Green"), each=5), levels=c("Red", "Green", "Yellow"))
, var4=rep(c("Low", "High"), 5)
)
t %>% prt()
var1 | var2 | var3 | var4 |
---|---|---|---|
NA | 1 | Red | Low |
NA | 2 | Red | High |
B | 3 | Red | Low |
B | 4 | Red | High |
A | 5 | Red | Low |
NA | 6 | Green | High |
C | 7 | Green | Low |
A | 8 | Green | High |
A | 9 | Green | Low |
B | 10 | Green | High |
dummy <- dummyVars(" ~ .", data=t, fullRank = TRUE)
tdummy <- data.frame(predict(dummy, t))
tdummy %>% prt()
var1.B | var1.C | var2 | var3.Green | var3.Yellow | var4Low |
---|---|---|---|---|---|
NA | NA | 1 | 0 | 0 | 1 |
NA | NA | 2 | 0 | 0 | 0 |
1 | 0 | 3 | 0 | 0 | 1 |
1 | 0 | 4 | 0 | 0 | 0 |
0 | 0 | 5 | 0 | 0 | 1 |
NA | NA | 6 | 1 | 0 | 0 |
0 | 1 | 7 | 1 | 0 | 1 |
0 | 0 | 8 | 1 | 0 | 0 |
0 | 0 | 9 | 1 | 0 | 1 |
1 | 0 | 10 | 1 | 0 | 0 |
model.matrix
var1A var1B var1C var2 var3Green var3Yellow var4Low 3 0 1 0 3 0 0 1 4 0 1 0 4 0 0 0 5 1 0 0 5 0 0 1 7 0 0 1 7 1 0 1 8 1 0 0 8 1 0 0 9 1 0 0 9 1 0 1 10 0 1 0 10 1 0 0 attr(,“assign”) [1] 1 1 1 2 3 3 4 attr(,“contrasts”) attr(,“contrasts”)$var1 [1] “contr.treatment”
attr(,“contrasts”)$var3 [1] “contr.treatment”
attr(,“contrasts”)$var4 [1] “contr.treatment”
Missing Values
- Hot Deck method: a missing value was imputed from a randomly selected from similar record. Cards that are “hot” is currently being processed. Last observation carried forward (LOCF) is a kind of hot-desk imputation.
- Cold-deck: impute data from donors from another dataset.
- pmm: predictive mean matching
mice package
Ozone | Solar.R | Wind | Temp | Month | Day |
---|---|---|---|---|---|
41 | 190 | 7.4 | NA | 5 | 1 |
36 | 118 | 8.0 | NA | 5 | 2 |
12 | 149 | 12.6 | NA | 5 | 3 |
18 | 313 | NA | NA | 5 | 4 |
NA | NA | NA | NA | 5 | 5 |
28 | NA | NA | 66 | 5 | 6 |
23 | 299 | NA | 65 | 5 | 7 |
19 | 99 | NA | 59 | 5 | 8 |
8 | 19 | NA | 61 | 5 | 9 |
NA | 194 | NA | 69 | 5 | 10 |
7 | NA | 6.9 | 74 | 5 | 11 |
16 | 256 | 9.7 | 69 | 5 | 12 |
11 | 290 | 9.2 | 66 | 5 | 13 |
14 | 274 | 10.9 | 68 | 5 | 14 |
18 | 65 | 13.2 | 58 | 5 | 15 |
14 | 334 | 11.5 | 64 | 5 | 16 |
34 | 307 | 12.0 | 66 | 5 | 17 |
6 | 78 | 18.4 | 57 | 5 | 18 |
30 | 322 | 11.5 | 68 | 5 | 19 |
11 | 44 | 9.7 | 62 | 5 | 20 |
1 | 8 | 9.7 | 59 | 5 | 21 |
11 | 320 | 16.6 | 73 | 5 | 22 |
4 | 25 | 9.7 | 61 | 5 | 23 |
32 | 92 | 12.0 | 61 | 5 | 24 |
NA | 66 | 16.6 | 57 | 5 | 25 |
NA | 266 | 14.9 | 58 | 5 | 26 |
NA | NA | 8.0 | 57 | 5 | 27 |
23 | 13 | 12.0 | 67 | 5 | 28 |
45 | 252 | 14.9 | 81 | 5 | 29 |
115 | 223 | 5.7 | 79 | 5 | 30 |
37 | 279 | 7.4 | 76 | 5 | 31 |
NA | 286 | 8.6 | 78 | 6 | 1 |
NA | 287 | 9.7 | 74 | 6 | 2 |
NA | 242 | 16.1 | 67 | 6 | 3 |
NA | 186 | 9.2 | 84 | 6 | 4 |
NA | 220 | 8.6 | 85 | 6 | 5 |
NA | 264 | 14.3 | 79 | 6 | 6 |
29 | 127 | 9.7 | 82 | 6 | 7 |
NA | 273 | 6.9 | 87 | 6 | 8 |
71 | 291 | 13.8 | 90 | 6 | 9 |
39 | 323 | 11.5 | 87 | 6 | 10 |
NA | 259 | 10.9 | 93 | 6 | 11 |
NA | 250 | 9.2 | 92 | 6 | 12 |
23 | 148 | 8.0 | 82 | 6 | 13 |
NA | 332 | 13.8 | 80 | 6 | 14 |
NA | 322 | 11.5 | 79 | 6 | 15 |
21 | 191 | 14.9 | 77 | 6 | 16 |
37 | 284 | 20.7 | 72 | 6 | 17 |
20 | 37 | 9.2 | 65 | 6 | 18 |
12 | 120 | 11.5 | 73 | 6 | 19 |
13 | 137 | 10.3 | 76 | 6 | 20 |
NA | 150 | 6.3 | 77 | 6 | 21 |
NA | 59 | 1.7 | 76 | 6 | 22 |
NA | 91 | 4.6 | 76 | 6 | 23 |
NA | 250 | 6.3 | 76 | 6 | 24 |
NA | 135 | 8.0 | 75 | 6 | 25 |
NA | 127 | 8.0 | 78 | 6 | 26 |
NA | 47 | 10.3 | 73 | 6 | 27 |
NA | 98 | 11.5 | 80 | 6 | 28 |
NA | 31 | 14.9 | 77 | 6 | 29 |
NA | 138 | 8.0 | 83 | 6 | 30 |
135 | 269 | 4.1 | 84 | 7 | 1 |
49 | 248 | 9.2 | 85 | 7 | 2 |
32 | 236 | 9.2 | 81 | 7 | 3 |
NA | 101 | 10.9 | 84 | 7 | 4 |
64 | 175 | 4.6 | 83 | 7 | 5 |
40 | 314 | 10.9 | 83 | 7 | 6 |
77 | 276 | 5.1 | 88 | 7 | 7 |
97 | 267 | 6.3 | 92 | 7 | 8 |
97 | 272 | 5.7 | 92 | 7 | 9 |
85 | 175 | 7.4 | 89 | 7 | 10 |
NA | 139 | 8.6 | 82 | 7 | 11 |
10 | 264 | 14.3 | 73 | 7 | 12 |
27 | 175 | 14.9 | 81 | 7 | 13 |
NA | 291 | 14.9 | 91 | 7 | 14 |
7 | 48 | 14.3 | 80 | 7 | 15 |
48 | 260 | 6.9 | 81 | 7 | 16 |
35 | 274 | 10.3 | 82 | 7 | 17 |
61 | 285 | 6.3 | 84 | 7 | 18 |
79 | 187 | 5.1 | 87 | 7 | 19 |
63 | 220 | 11.5 | 85 | 7 | 20 |
16 | 7 | 6.9 | 74 | 7 | 21 |
NA | 258 | 9.7 | 81 | 7 | 22 |
NA | 295 | 11.5 | 82 | 7 | 23 |
80 | 294 | 8.6 | 86 | 7 | 24 |
108 | 223 | 8.0 | 85 | 7 | 25 |
20 | 81 | 8.6 | 82 | 7 | 26 |
52 | 82 | 12.0 | 86 | 7 | 27 |
82 | 213 | 7.4 | 88 | 7 | 28 |
50 | 275 | 7.4 | 86 | 7 | 29 |
64 | 253 | 7.4 | 83 | 7 | 30 |
59 | 254 | 9.2 | 81 | 7 | 31 |
39 | 83 | 6.9 | 81 | 8 | 1 |
9 | 24 | 13.8 | 81 | 8 | 2 |
16 | 77 | 7.4 | 82 | 8 | 3 |
78 | NA | 6.9 | 86 | 8 | 4 |
35 | NA | 7.4 | 85 | 8 | 5 |
66 | NA | 4.6 | 87 | 8 | 6 |
122 | 255 | 4.0 | 89 | 8 | 7 |
89 | 229 | 10.3 | 90 | 8 | 8 |
110 | 207 | 8.0 | 90 | 8 | 9 |
NA | 222 | 8.6 | 92 | 8 | 10 |
NA | 137 | 11.5 | 86 | 8 | 11 |
44 | 192 | 11.5 | 86 | 8 | 12 |
28 | 273 | 11.5 | 82 | 8 | 13 |
65 | 157 | 9.7 | 80 | 8 | 14 |
NA | 64 | 11.5 | 79 | 8 | 15 |
22 | 71 | 10.3 | 77 | 8 | 16 |
59 | 51 | 6.3 | 79 | 8 | 17 |
23 | 115 | 7.4 | 76 | 8 | 18 |
31 | 244 | 10.9 | 78 | 8 | 19 |
44 | 190 | 10.3 | 78 | 8 | 20 |
21 | 259 | 15.5 | 77 | 8 | 21 |
9 | 36 | 14.3 | 72 | 8 | 22 |
NA | 255 | 12.6 | 75 | 8 | 23 |
45 | 212 | 9.7 | 79 | 8 | 24 |
168 | 238 | 3.4 | 81 | 8 | 25 |
73 | 215 | 8.0 | 86 | 8 | 26 |
NA | 153 | 5.7 | 88 | 8 | 27 |
76 | 203 | 9.7 | 97 | 8 | 28 |
118 | 225 | 2.3 | 94 | 8 | 29 |
84 | 237 | 6.3 | 96 | 8 | 30 |
85 | 188 | 6.3 | 94 | 8 | 31 |
96 | 167 | 6.9 | 91 | 9 | 1 |
78 | 197 | 5.1 | 92 | 9 | 2 |
73 | 183 | 2.8 | 93 | 9 | 3 |
91 | 189 | 4.6 | 93 | 9 | 4 |
47 | 95 | 7.4 | 87 | 9 | 5 |
32 | 92 | 15.5 | 84 | 9 | 6 |
20 | 252 | 10.9 | 80 | 9 | 7 |
23 | 220 | 10.3 | 78 | 9 | 8 |
21 | 230 | 10.9 | 75 | 9 | 9 |
24 | 259 | 9.7 | 73 | 9 | 10 |
44 | 236 | 14.9 | 81 | 9 | 11 |
21 | 259 | 15.5 | 76 | 9 | 12 |
28 | 238 | 6.3 | 77 | 9 | 13 |
9 | 24 | 10.9 | 71 | 9 | 14 |
13 | 112 | 11.5 | 71 | 9 | 15 |
46 | 237 | 6.9 | 78 | 9 | 16 |
18 | 224 | 13.8 | 67 | 9 | 17 |
13 | 27 | 10.3 | 76 | 9 | 18 |
24 | 238 | 10.3 | 68 | 9 | 19 |
16 | 201 | 8.0 | 82 | 9 | 20 |
13 | 238 | 12.6 | 64 | 9 | 21 |
23 | 14 | 9.2 | 71 | 9 | 22 |
36 | 139 | 10.3 | 81 | 9 | 23 |
7 | 49 | 10.3 | 69 | 9 | 24 |
14 | 20 | 16.6 | 63 | 9 | 25 |
30 | 193 | 6.9 | 70 | 9 | 26 |
NA | 145 | 13.2 | 77 | 9 | 27 |
14 | 191 | 14.3 | 75 | 9 | 28 |
18 | 131 | 8.0 | 76 | 9 | 29 |
20 | 223 | 11.5 | 68 | 9 | 30 |
Month Day Temp Solar.R Wind Ozone
104 1 1 1 1 1 1 0 34 1 1 1 1 1 0 1 3 1 1 1 1 0 1 1 1 1 1 1 1 0 0 2 4 1 1 1 0 1 1 1 1 1 1 1 0 1 0 2 1 1 1 1 0 0 1 2 3 1 1 0 1 1 1 1 1 1 1 0 1 0 1 2 1 1 1 0 0 0 0 4 0 0 5 7 7 37 56
Month Day Temp Solar.R Wind Ozone
104 1 1 1 1 1 1 0 34 1 1 1 1 1 0 1 3 1 1 1 1 0 1 1 1 1 1 1 1 0 0 2 4 1 1 1 0 1 1 1 1 1 1 1 0 1 0 2 1 1 1 1 0 0 1 2 3 1 1 0 1 1 1 1 1 1 1 0 1 0 1 2 1 1 1 0 0 0 0 4 0 0 5 7 7 37 56
Multiple-Imputation
library(mice)
set.seed(1234)
t <- data.table(id=1:10
, var1=factor(sample(c("A", "B", "C", NA), 10, replace=TRUE), levels=c("A", "B", "C"))
, var2=sample(c(1:10, rep(NA, 5)), 10, replace = TRUE)
, var3=factor(rep(c("Red", "Green"), each=5), levels=c("Red", "Green", "Yellow"))
, var4=rep(c("Low", "High"), 5)
)
t %>% prt()
id | var1 | var2 | var3 | var4 |
---|---|---|---|---|
1 | NA | 4 | Red | Low |
2 | NA | 2 | Red | High |
3 | B | 7 | Red | Low |
4 | B | 6 | Red | High |
5 | A | 10 | Red | Low |
6 | NA | 6 | Green | High |
7 | C | NA | Green | Low |
8 | A | NA | Green | High |
9 | A | 4 | Green | Low |
10 | B | NA | Green | High |
ti <- mice(t
, m=5
, maxit = 50
, method = 'pmm'
, seed = 500
, printFlag = FALSE
)
tic <- complete(ti, action = "all", include = TRUE)
##tic <- complete(ti, action = 2L, include = TRUE)
tic[[2]] %>% prt()
id | var1 | var2 | var3 | var4 |
---|---|---|---|---|
1 | A | 4 | Red | Low |
2 | A | 2 | Red | High |
3 | B | 7 | Red | Low |
4 | B | 6 | Red | High |
5 | A | 10 | Red | Low |
6 | B | 6 | Green | High |
7 | C | 6 | Green | Low |
8 | A | 6 | Green | High |
9 | A | 4 | Green | Low |
10 | B | 6 | Green | High |
VIM package
library(VIM)
t <- data.table(id=1:10
, var1=factor(sample(c("A", "B", "C", NA), 10, replace=TRUE), levels=c("A", "B", "C"))
, var2=sample(c(1:10, NA), 10, replace = TRUE)
, var3=factor(rep(c("Red", "Green"), each=5), levels=c("Red", "Green", "Yellow"))
, var4=rep(c("Low", "High"), 5)
)
matrixplot(t, sortby = c('id'))
Computing Environment
R version 4.2.0 (2022-04-22) Platform: x86_64-pc-linux-gnu (64-bit) Running under: Ubuntu 20.04.3 LTS
Matrix products: default BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0 LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
locale: [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
[4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
[7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
[10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
attached base packages: [1] grid stats graphics grDevices utils datasets methods
[8] base
other attached packages: [1] VIM_6.2.2 colorspace_2.1-0 mice_3.15.0
[4] caret_6.0-93 lattice_0.20-45 matrixStats_0.63.0 [7] zoo_1.8-11 Wu_0.0.0.9000 flexdashboard_0.6.1 [10] lme4_1.1-31 Matrix_1.5-3 mgcv_1.8-38
[13] nlme_3.1-152 png_0.1-8 scales_1.2.1
[16] nnet_7.3-16 labelled_2.10.0 kableExtra_1.3.4
[19] plotly_4.10.1 gridExtra_2.3 ggplot2_3.4.1
[22] DT_0.27 tableone_0.13.2 magrittr_2.0.3
[25] lubridate_1.9.2 dplyr_1.1.0 plyr_1.8.8
[28] data.table_1.14.8 rmdformats_1.0.4 knitr_1.42
loaded via a namespace (and not attached): [1] minqa_1.2.5 ellipsis_0.3.2 class_7.3-19
[4] proxy_0.4-27 rstudioapi_0.14 listenv_0.9.0
[7] prodlim_2019.11.13 fansi_1.0.4 ranger_0.14.1
[10] xml2_1.3.3 codetools_0.2-18 splines_4.2.0
[13] robustbase_0.95-0 cachem_1.0.6 jsonlite_1.8.4
[16] nloptr_2.0.3 pROC_1.18.0 broom_1.0.3
[19] compiler_4.2.0 httr_1.4.5 backports_1.4.1
[22] fastmap_1.1.0 lazyeval_0.2.2 survey_4.1-1
[25] cli_3.6.0 htmltools_0.5.4 tools_4.2.0
[28] gtable_0.3.1 glue_1.6.2 reshape2_1.4.4
[31] mltools_0.3.5 Rcpp_1.0.10 carData_3.0-5
[34] jquerylib_0.1.4 vctrs_0.5.2 svglite_2.1.1
[37] iterators_1.0.14 lmtest_0.9-40 timeDate_4022.108
[40] laeken_0.5.2 gower_1.0.1 xfun_0.37
[43] stringr_1.5.0 globals_0.16.2 rvest_1.0.3
[46] timechange_0.2.0 lifecycle_1.0.3 future_1.31.0
[49] DEoptimR_1.0-11 MASS_7.3-54 ipred_0.9-13
[52] hms_1.1.2 parallel_4.2.0 yaml_2.3.7
[55] sass_0.4.5 rpart_4.1-15 stringi_1.7.12
[58] highr_0.9 foreach_1.5.2 e1071_1.7-13
[61] hardhat_1.2.0 boot_1.3-28 lava_1.7.2.1
[64] rlang_1.0.6 pkgconfig_2.0.3 systemfonts_1.0.4
[67] evaluate_0.20 purrr_1.0.1 recipes_1.0.5
[70] htmlwidgets_1.5.4 tidyselect_1.2.0 parallelly_1.34.0
[73] bookdown_0.32 R6_2.5.1 generics_0.1.3
[76] DBI_1.1.3 pillar_1.8.1 haven_2.5.2
[79] withr_2.5.0 abind_1.4-5 sp_1.6-0
[82] survival_3.2-13 tibble_3.1.8 future.apply_1.10.0 [85] car_3.1-1 utf8_1.2.2 rmarkdown_2.20
[88] forcats_1.0.0 ModelMetrics_1.2.2.2 vcd_1.4-11
[91] digest_0.6.29 webshot_0.5.4 tidyr_1.3.0
[94] stats4_4.2.0 munsell_0.5.0 viridisLite_0.4.1
[97] bslib_0.4.2 mitools_2.4