ACS Survey
American Community Survey (ACS)
- Reference Link: https://www.census.gov/programs-surveys/acs
# Attach Wu. The data.table functions used below (as.data.table, `[` with
# .N / by) are assumed to become available through it -- TODO confirm.
library(Wu)

# Read the PUMS person-level extract as a plain data.frame.
dt <- read.csv("../data/csv_ptn/psam_p47.csv", stringsAsFactors = FALSE)
dim(dt)
## 67950 286
object.size(dt)
## 80861072 bytes
names(dt)

# Convert to data.table and tabulate a few columns to get a feel for the data.
dt <- as.data.table(dt)
dt[, .N, by = .(RT)]
dt[, .N, by = .(SEX)]
dt[, .N, by = .(LANX)]
dt[, .N, by = .(PUMA)]

## Nashville
## 02502, Nashville North
## 02505, Nashville Southwest
## 02503, Nashville Center
## 02501, Nashville East
## 02504, Nashville Southeast
# The PUMA codes are matched as numbers here, i.e. without the leading zero
# shown in the list above.
puma_nashville <- c(2501, 2502, 2503, 2504, 2505)
dtn <- dt[PUMA %in% puma_nashville, ]
dim(dtn)
## [1] 6490 286

# Use lower-case column names for the Nashville subset.
var_names <- tolower(colnames(dtn))
colnames(dtn) <- var_names
# Read the PUMS data dictionary. The file is read without a header row, so
# column names are assigned explicitly further down.
dict <- read.csv(
  "../data/PUMS_Data_Dictionary_2018.csv",
  header = FALSE,
  stringsAsFactors = FALSE
)
dim(dict)
## [1] 5945 7

# Drop exact duplicate rows before converting to data.table.
dict <- unique(dict)
dim(dict)
## 5865 7
dict <- as.data.table(dict)

# Name the seven columns by position.
colnames(dict) <- c(
  "row_type",
  "var_name",
  "var_type",
  "var_length",
  "var_description",
  "value_code",
  "value"
)

# Number of distinct variable names present in the dictionary.
length(unique(dict[["var_name"]]))
## assign label
# Attach to every column of dtn the description found in the dictionary's
# "NAME" rows. Dictionary variable names are compared against the upper-cased
# column name; a column with no matching row gets NA as its description.
var_names_dt <- unique(colnames(dtn))
name_rows <- dict[row_type == "NAME"]
for (col_name in var_names_dt) {
  print(col_name)
  # First matching description, or NA when the variable is not listed.
  hit <- match(toupper(col_name), name_rows$var_name)
  var_desp <- name_rows$var_description[hit]
  print(var_desp)
  Wu::label(dtn[[col_name]]) <- var_desp
}

# Persist the labelled Nashville subset.
saveRDS(dtn, file = "acs2018_nashville.RDS")
Nashville Data
# Load the Nashville ACS data set by name and work on it as `dt`.
library(Wu)
data("acs2018_nashville")
dt <- acs2018_nashville

## glm
# Quick look at the available columns and their labels.
colnames(dt)[seq_len(22)]
lapply(dt[, 1:133], label)

# Frequency tables / summaries of the candidate model variables.
dt[, .N, by = .(mar)]
dt[, .N, by = .(nwav)][order(nwav)]
dt[, .N, by = .(schl)][order(schl)]
dt[, .N, by = .(rac1p)][order(rac1p)]
summary(dt[["agep"]])
table(dt[["sex"]])
dt[, .N, by = .(eng)]
## to predict the ability to work
# Each derived column is added to dt by reference and stored as a factor whose
# first level is the reference category used by glm().

# Outcome: "Yes" only when nwav is 1; every other value, including missing,
# is coded "No".
dt[, available_for_work := factor(
  ifelse(nwav %in% 1, "Yes", "No"),
  levels = c("No", "Yes")
)]

# Gender: 1 -> Male, 2 -> Female, anything else -> NA.
dt[, gender := factor(
  ifelse(sex %in% 1, "Male", ifelse(sex %in% 2, "Female", NA_character_)),
  levels = c("Female", "Male")
)]

# Marital status: 1 -> Married, everything else (including missing) -> Single.
dt[, marital_status := factor(
  ifelse(mar %in% 1, "Married", "Single"),
  levels = c("Married", "Single")
)]

# Race: 1 -> White, 2 -> Black, everything else -> Others.
dt[, race := factor(
  ifelse(rac1p %in% 1, "White", ifelse(rac1p %in% 2, "Black", "Others")),
  levels = c("White", "Black", "Others")
)]

# Cross-check each recode against its source column.
dt[, .N, by = .(race, rac1p)]
dt[, .N, by = .(sex, gender)]
dt[, .N, by = .(marital_status, mar)]
# Logistic regression for being available for work.
# m and m1 contain the same four terms; only the order of the terms differs
# (race comes last in m and first in m1).
m <- glm(
  available_for_work ~ agep + gender + marital_status + race,
  data = dt,
  family = binomial(link = logit)
)
m1 <- glm(
  available_for_work ~ race + agep + gender + marital_status,
  data = dt,
  family = binomial(link = logit)
)

# Coefficient table and analysis-of-deviance table for m.
summary(m)
anova(m)
# Variable importance for the fitted glm via caret, with and without scaling.
library(caret)
varImp(m)
varImp(m, scale = FALSE)

## ?varImp
# Inspect how caret implements varImp: list the available methods and print
# the source of the glm method and of related internals.
methods(varImp)
caret:::varImp.glm
caret:::varImpDependencies
caret:::getModelInfo

# Load the model-definition file shipped with caret; the code below expects it
# to create an object called `models` in the workspace.
load(system.file("models", "models.RData", package = "caret"))
models
str(models)
models$glm$varImp

## ?varImp
## for glm caret use its z value for variable importance;
# Reproduce that by hand: take the coefficient table, drop the first row
# (the intercept) and keep the absolute value of the column whose name ends
# in "value". The component is spelled out in full ("coefficients") rather
# than relying on `$` partial matching of "coef".
values <- summary(m)$coefficients
varImps <- abs(values[-1, grep("value$", colnames(values)), drop = FALSE])
# Compare term-level tests for the two fits (m and m1 contain the same terms
# in a different order).
library(car)

# Type II likelihood-ratio tests. The argument is `test.statistic` (singular);
# the previous spelling `test.statistics` did not match that formal, so the
# requested statistic was never actually passed.
car::Anova(m, type = "II", test.statistic = "LR")
car::Anova(m1, type = "II", test.statistic = "LR")

# Single-term deletions with likelihood-ratio tests.
drop1(m, test = "LRT")

# anova() tables for both fits, for comparison with the car::Anova() results.
anova(m)
anova(m1)

# Type II (LR) versus type III tests for m.
car::Anova(m, type = "II", test.statistic = "LR")
car::Anova(m, type = "III")
R sessionInfo
R version 4.2.0 (2022-04-22) Platform: x86_64-pc-linux-gnu (64-bit) Running under: Ubuntu 20.04.3 LTS
Matrix products: default BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0 LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
locale: [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
[4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
[7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
[10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
attached base packages: [1] stats graphics grDevices utils datasets methods base
other attached packages: [1] Wu_0.0.0.9000 flexdashboard_0.6.0 lme4_1.1-30
[4] Matrix_1.4-0 mgcv_1.8-38 nlme_3.1-152
[7] png_0.1-7 scales_1.2.0 nnet_7.3-16
[10] labelled_2.9.1 kableExtra_1.3.4 plotly_4.10.0
[13] gridExtra_2.3 ggplot2_3.3.6 DT_0.24
[16] tableone_0.13.2 magrittr_2.0.3 lubridate_1.8.0
[19] dplyr_1.0.9 plyr_1.8.7 data.table_1.14.2
[22] rmdformats_1.0.4 knitr_1.39
loaded via a namespace (and not attached):
[1] httr_1.4.4 sass_0.4.2 tidyr_1.2.0 jsonlite_1.8.0
[5] viridisLite_0.4.0 splines_4.2.0 bslib_0.4.0 assertthat_0.2.1
[9] yaml_2.3.5 pillar_1.8.1 lattice_0.20-45 glue_1.6.2
[13] digest_0.6.29 rvest_1.0.2 minqa_1.2.4 colorspace_2.0-3
[17] htmltools_0.5.3 survey_4.1-1 pkgconfig_2.0.3 haven_2.5.0
[21] bookdown_0.28 purrr_0.3.4 webshot_0.5.3 svglite_2.1.0
[25] tibble_3.1.8 generics_0.1.3 ellipsis_0.3.2 cachem_1.0.6
[29] withr_2.5.0 klippy_0.0.0.9500 lazyeval_0.2.2 cli_3.3.0
[33] survival_3.2-13 evaluate_0.16 fansi_1.0.3 MASS_7.3-54
[37] forcats_0.5.1 xml2_1.3.3 tools_4.2.0 hms_1.1.1
[41] mitools_2.4 lifecycle_1.0.1 stringr_1.4.0 munsell_0.5.0
[45] compiler_4.2.0 jquerylib_0.1.4 systemfonts_1.0.4 rlang_1.0.4
[49] grid_4.2.0 nloptr_2.0.3 rstudioapi_0.13 htmlwidgets_1.5.4
[53] rmarkdown_2.14 boot_1.3-28 gtable_0.3.0 DBI_1.1.3
[57] R6_2.5.1 fastmap_1.1.0 utf8_1.2.2 stringi_1.7.8
[61] Rcpp_1.0.9 vctrs_0.4.1 tidyselect_1.1.2 xfun_0.32