Power and Sample Size Simulation for Two Proportions
An Example of Power Simulation
Effect Size
- Rate of the control group: 30%
- Effect size: odds ratio of 3
- Expected rate of the case group: 56.25%
library(knitr)
p0 <- 0.3
or <- 3
odds0 <- p0 / (1 - p0)
odds1 <- odds0 * or
p1 <- odds1 / (1 + odds1)
kable(p1, col.names = "Expected Rate of the Treatment Group") %>% kable_styling(full_width = FALSE)
Expected Rate of the Treatment Group |
---|
0.5625 |
Generate Target Population
pop0 <- rbinom(n = 1000000, size = 1, prob = p0)
pop1 <- rbinom(n = 1000000, size = 1, prob = p1)
kable(mean(pop0), col.names = "Rate of the Simulated Control Population") %>% kable_styling(full_width = FALSE)
Rate of the Simulated Control Population |
---|
0.299048 |
Rate of the Simulated Treatment Population |
---|
0.56325 |
Fisher’s Exact Test
sample_size <- 50
sample0 <- sample(x = pop0, size = sample_size, replace = FALSE)
sample1 <- sample(x = pop1, size = sample_size, replace = FALSE)
treatment <- c(rep(0, sample_size), rep(1, sample_size))
outcome <- c(sample0, sample1)
freq_table <- table(treatment, outcome)
kable(freq_table, caption = "2x2 Contingency Table") %>% kable_styling(full_width = FALSE)
0 | 1 | |
---|---|---|
0 | 37 | 13 |
1 | 19 | 31 |
ftst <- fisher.test(x = freq_table, hybrid = FALSE, alternative = "two.side", conf.int = FALSE)
kable(ftst$p.value, col.names = "P-value for Fisher's Exact Test") %>% kable_styling(full_width = FALSE)
P-value for Fisher’s Exact Test |
---|
0.000543 |
Write A Function Simulating Power
fx_fs <- function(sample_size = 50, alpha = 0.05){
sample0 <- sample(x = pop0, size = sample_size, replace = FALSE)
sample1 <- sample(x = pop1, size = sample_size, replace = FALSE)
treatment <- c(rep(0, sample_size), rep(1, sample_size))
outcome <- c(sample0, sample1)
freq_table <- table(treatment, outcome)
ftst <- fisher.test(x = freq_table, hybrid = FALSE, alternative = "two.side", conf.int = FALSE)
invisible(ftst$p.value < alpha)
}
kable(fx_fs(), col.names = "Reject Null Hypothesis if p-value < 0.05") %>% kable_styling(full_width = FALSE)
Reject Null Hypothesis if p-value < 0.05 |
---|
FALSE |
Simulate 10000 Times
user system elapsed 16.897 8.827 25.739
power <- mean(lst)
kable(power, col.names = "Power Calculated by 10000 Times Simulation") %>% kable_styling(full_width = FALSE)
Power Calculated by 10000 Times Simulation |
---|
0.7133 |
Make It Faster
library(parallel)
system.time(lst <- unlist(mclapply(1:10000, function(x) fx_fs(sample_size = 50, alpha = 0.05), mc.cores = detectCores())))
user system elapsed 60.650 43.450 8.121
power <- mean(lst)
kable(power, col.names = "Power Calculated by Parallel Process") %>% kable_styling(full_width = FALSE)
Power Calculated by Parallel Process |
---|
0.7163 |
A Function for Power Simulation
- The function simulates tests using glm, Chi-squared test, and Fisher’s exact test.
- It uses R6 class and parallel computing.
library(data.table)
library(R6)
library(parallel)
SimProp <- R6Class(
"SimProp",
list(
p_control = NULL,
odds_control = NULL,
treatment_effect_or = NULL,
p_case = NULL,
odds_case = NULL,
n_case = NULL,
n_control_case_ratio = NULL,
n_control = NULL,
grid_table = NULL,
target_data = NULL,
n_target_data_case = NULL,
n_target_data_control = NULL,
n_simulation = 1000,
alpha = 0.05,
method = "glm",
power_table = NULL,
fx_p2odds = function(x) {
x / (1 - x)
},
fx_odds2p = function(x) {
x / (1 + x)
},
initialize = function(p_control, treatment_effect_or, n_case, n_control_case_ratio = 1, ...) {
self$p_control <- p_control
self$odds_control <- self$p_control / (1 - self$p_control)
self$treatment_effect_or <- treatment_effect_or
self$odds_case <- self$odds_control * self$treatment_effect_or
self$p_case <- self$odds_case / (self$odds_case + 1)
self$n_case <- n_case
self$n_control_case_ratio <- n_control_case_ratio
self$n_control <- self$n_case * self$n_control_case_ratio
self$n_target_data_case <- max(self$n_case) * 100
self$n_target_data_control <- max(self$n_control) * 100
self$grid_table <- as.data.table(expand.grid(
p_control = p_control,
treatment_effect_or = treatment_effect_or,
n_case = n_case,
n_control_case_ratio = n_control_case_ratio
))
self$grid_table <- self$grid_table[
, odds_control := self$fx_p2odds(p_control)
][, odds_case := odds_control * treatment_effect_or][, p_case := self$fx_odds2p(odds_case)][, n_control := n_case * n_control_case_ratio][, .(
n_case, p_case, odds_case, n_control,
p_control, odds_control, treatment_effect_or, n_control_case_ratio
)]
},
gen_target_data = function() {
self$target_data <- lapply(1:nrow(self$grid_table), function(x) {
list(
data_case = rbinom(n = self$n_target_data_case, size = 1, prob = self$grid_table$p_case[x]),
data_control = rbinom(n = self$n_target_data_control, size = 1, prob = self$grid_table$p_control[x]),
p_case = self$grid_table$p_case[x],
n_case = self$grid_table$n_case[x],
p_control = self$grid_table$p_control[x],
n_control = self$grid_table$n_control[x]
)
})
invisible(self)
},
sample_target_data = function(lst) {
require(data.table)
sample_case <- data.table(
treatment = rep(1, lst$n_case),
outcome = sample(x = lst[["data_case"]], size = lst$n_case, replace = FALSE)
)
sample_control <- data.table(
treatment = rep(0, lst$n_control),
outcome = sample(x = lst[["data_control"]], size = lst$n_control, replace = FALSE)
)
invisible(rbind(sample_case, sample_control))
},
get_pvalue_glm = function(data) {
m <- glm(outcome ~ treatment, data = data, family = binomial(link = logit))
ms <- summary(m)
invisible(ms$coefficients["treatment", "Pr(>|z|)"])
},
get_pvalue_chisq = function(data) {
invisible(chisq.test(table(data))$p.value)
},
get_pvalue_fisher = function(data) {
invisible(fisher.test(table(data), hybrid = FALSE, conf.int = FALSE)$p.value)
},
gen_tests = function(lst) {
pvalue_lst <- lapply(X = 1:self$n_simulation, FUN = function(x) {
data <- self$sample_target_data(lst)
if (self$method == "glm") {
pvalue <- self$get_pvalue_glm(data)
} else if (self$method == "chisq") {
pvalue <- self$get_pvalue_chisq(data)
} else if (self$method == "fisher") {
pvalue <- self$get_pvalue_fisher(data)
} else {
pvalue <- NA
}
})
invisible(unlist(pvalue_lst) < self$alpha)
},
gen_power = function() {
require(parallel)
self$power_table <- parallel::mclapply(X = self$target_data, FUN = function(x) {
mean(self$gen_tests(x), na.rm = TRUE)
}, mc.cores = detectCores())
self$grid_table$alpha <- self$alpha
self$grid_table$method <- self$method
self$grid_table$power <- self$power_table
invisible(self)
}
)
)
sp <- SimProp$new(p_control = c(0.3), treatment_effect_or = 3, n_case = c(30, 50, 70))
sp$n_simulation <- 5000
sp$gen_target_data()
sp$method <- "glm"
sp$gen_power()
kable(sp$grid_table, caption = "Power if test using glm") %>% kable_styling(full_width = FALSE)
n_case | p_case | odds_case | n_control | p_control | odds_control | treatment_effect_or | n_control_case_ratio | alpha | method | power |
---|---|---|---|---|---|---|---|---|---|---|
30 | 0.5625 | 1.285714 | 30 | 0.3 | 0.4285714 | 3 | 1 | 0.05 | glm | 0.5292 |
50 | 0.5625 | 1.285714 | 50 | 0.3 | 0.4285714 | 3 | 1 | 0.05 | glm | 0.7858 |
70 | 0.5625 | 1.285714 | 70 | 0.3 | 0.4285714 | 3 | 1 | 0.05 | glm | 0.9126 |
sp$method <- "chisq"
sp$gen_power()
kable(sp$grid_table, caption = "Power if chisq test") %>% kable_styling(full_width = FALSE)
n_case | p_case | odds_case | n_control | p_control | odds_control | treatment_effect_or | n_control_case_ratio | alpha | method | power |
---|---|---|---|---|---|---|---|---|---|---|
30 | 0.5625 | 1.285714 | 30 | 0.3 | 0.4285714 | 3 | 1 | 0.05 | chisq | 0.4358 |
50 | 0.5625 | 1.285714 | 50 | 0.3 | 0.4285714 | 3 | 1 | 0.05 | chisq | 0.7126 |
70 | 0.5625 | 1.285714 | 70 | 0.3 | 0.4285714 | 3 | 1 | 0.05 | chisq | 0.889 |
sp$method <- "fisher"
sp$gen_power()
kable(sp$grid_table, caption = "Power if using Fisher's exact test") %>% kable_styling(full_width = FALSE)
n_case | p_case | odds_case | n_control | p_control | odds_control | treatment_effect_or | n_control_case_ratio | alpha | method | power |
---|---|---|---|---|---|---|---|---|---|---|
30 | 0.5625 | 1.285714 | 30 | 0.3 | 0.4285714 | 3 | 1 | 0.05 | fisher | 0.4298 |
50 | 0.5625 | 1.285714 | 50 | 0.3 | 0.4285714 | 3 | 1 | 0.05 | fisher | 0.7144 |
70 | 0.5625 | 1.285714 | 70 | 0.3 | 0.4285714 | 3 | 1 | 0.05 | fisher | 0.8948 |
Power for ANOVA
## install.packages("pwr")
library(pwr)
N <- 1354
k <- 4
t <- pwr.anova.test(k = 2, n = 110, sig.level = 0.05, power = 0.8)
N <- 220
N <- 100
for (k in 2:7) {
print(k)
pwri <- pwr.anova.test(k = k, n = round(N / k), sig.level = 0.05, power = 0.8)
ti <- c(k, pwri$f)
if (k == 2) {
rtn <- ti
} else {
rtn <- rbind(rtn, ti)
}
}
rtn
rtn2 <- rtn
rtn * 2
str(ti)
N <- 220
N <- 100
for (k in 2:7) {
print(k)
print(round(N / k))
x <- pwr.t.test(n = round(N / k), d = NULL, sig.level = 0.05, power=0.8,
type = c("two.sample"),
alternative = c("two.sided")
)
ti <- c(k, x$d)
if (k == 2) {
rtn <- ti
} else {
rtn <- rbind(rtn, ti)
}
}
rtn
citation("pwr")
x <- pwr.t.test(n = 50, d = NULL, sig.level = 0.05, power=0.8,
type = c("two.sample"),
alternative = c("two.sided")
)
str(x)
x
pwr.anova.test(f = 0.1, k = 2, n = 675, sig.level = 0.05)
library(Superpower)
a1b1:0
a1b2:0.2
a2b1:0.2
a2b2:0.4
design_result <- ANOVA_design(design = "2w*2w",
n = round(1354 / 4),
mu = c(0, 0.1, 0.1, 0.4),
sd = 1,
r=0,
plot = FALSE)
power_result <- ANOVA_power(design_result, alpha_level = 0.05,
p_adjust = "none", seed = 2019, nsims = 100)
## compare t-test and anova
pwr.anova.test(k = 2,
n = NULL,
f = 0.2,
sig.level = 0.05,
power = 0.8)
pwr.t.test(n = NULL,
d = 0.2,
sig.level = 0.05,
type = "two.sample",
alternative = "two.sided",
power = 0.8)
pwr.anova.test(k = 2,
n = NULL,
f = 0.1,
sig.level = 0.05,
power = 0.8)
pwr.t.test(n = NULL,
d = 0.2,
sig.level = 0.05,
type = "two.sample",
alternative = "greater",
power = 0.8)
Statistics for Stratified Systematic Sampling
- \(N\) is the number of total houses.
- \(L\) is the number of strata (sub-districts within the survey area).
- \(N_h\) is the number of total houses within the stratum \(h\).
- \(n_h\) is the number of sampled houses within the stratum \(h\).
- \(s^2_h\) is the estimated variance of the stratum \(h\).
- \(x_{h,i}\) is the value of \(x\) for the element \(i\) within the stratum \(h\).
- \(\bar{x}_h\) is the mean of \(x\) within the stratum \(h\).
- \(\bar{x}_{str}\) is the mean estimate for entire population.
- \(p_{y,str}\) is the proportion estimate for entire population.
Estimated mean of the stratum \(h\): \[ \bar{x}_h = \frac{\sum_{i=1}^{n_h}(x_{h,i})}{n_h}\]
Estimated proportion of the stratum \(h\): \[ P_{hy} = \frac{\sum_{i=1}^{n_h}(Y_{h,i})}{n_h}\]
Variance estimate of the stratum \(h\):
\[ s^2_h = \frac{\sum_{i=1}^{n_h}(x_{h,i} - \bar{x}_h)^2}{n_h - 1}\] \[ s^2_h = \frac{n_h}{n_h - 1}P_{py}(1 - P_{py})\]
Design Effect \[\rho=1 + \delta_x(n-1)\]
Estimate Mean for Entire Population
\[\bar{x}_{str} = \frac{\sum_{h=1}^LN_h\bar{x}_h}{N}\]
Standard Error of Mean Estimate \[\hat{SE}(\bar{x}_{str}) = \sqrt{\sum_{h=1}^L(\frac{N_h}{N})^2\frac{s_{hx}^2}{n_h}(\frac{N_h-n_h}{N_h})} \]
Estimate Proportion for Entire Population
\[p_{y,str}=\frac{\sum_{h=1}^LN_hP_h}{N}\]
Standard Error for Proportion Estimate
\[\hat{SE}(p_{y,str}) = \sqrt{\sum_{h=1}^L(\frac{N_h}{N})^2\frac{p_{hy}(1-p_{hy})}{n_h-1}(\frac{N_h-n_h}{N_h})} \]