Power and Sample Size Simulation for Two Proportions

An Example of Power Simulation

Effect Size

Rate of the control group: 30%
Effect size: odds ratio of 3
Expected rate of the case group: 56.25%

library(knitr)
p0 <- 0.3
or <- 3
odds0 <- p0 / (1 - p0)
odds1 <- odds0 * or
p1 <- odds1 / (1 + odds1)
kable(p1, col.names = "Expected Rate of the Treatment Group") %>% kable_styling(full_width = FALSE)

Expected Rate of the Treatment Group
0.5625

Generate Target Population

pop0 <- rbinom(n = 1000000, size = 1, prob = p0)
pop1 <- rbinom(n = 1000000, size = 1, prob = p1)

kable(mean(pop0), col.names = "Rate of the Simulated  Control Population") %>% kable_styling(full_width = FALSE)

Rate of the Simulated Control Population
0.299048

kable(mean(pop1), col.names = "Rate of the Simulated  Treatment Population") %>% kable_styling(full_width = FALSE)

Rate of the Simulated Treatment Population
0.56325

Fisher’s Exact Test

sample_size <- 50
sample0 <- sample(x = pop0, size = sample_size, replace = FALSE)
sample1 <- sample(x = pop1, size = sample_size, replace = FALSE)


treatment <- c(rep(0, sample_size), rep(1, sample_size))
outcome <- c(sample0, sample1)

freq_table <- table(treatment, outcome)

kable(freq_table, caption = "2x2 Contingency Table") %>% kable_styling(full_width = FALSE)

2x2 Contingency Table
	0	1
0	37	13
1	19	31

ftst <- fisher.test(x = freq_table, hybrid = FALSE, alternative = "two.side", conf.int = FALSE)

kable(ftst$p.value, col.names = "P-value for Fisher's Exact Test") %>% kable_styling(full_width = FALSE)

P-value for Fisher’s Exact Test
0.000543

Write A Function Simulating Power

fx_fs <- function(sample_size = 50, alpha = 0.05){
  sample0 <- sample(x = pop0, size = sample_size, replace = FALSE)
  sample1 <- sample(x = pop1, size = sample_size, replace = FALSE)
  treatment <- c(rep(0, sample_size), rep(1, sample_size))
  outcome <- c(sample0, sample1)
  freq_table <- table(treatment, outcome)
  ftst <- fisher.test(x = freq_table, hybrid = FALSE, alternative = "two.side", conf.int = FALSE)
  invisible(ftst$p.value < alpha)
}

kable(fx_fs(), col.names = "Reject Null Hypothesis if p-value < 0.05") %>% kable_styling(full_width = FALSE)

Reject Null Hypothesis if p-value < 0.05
FALSE

Simulate 10000 Times

system.time(lst <- replicate(n = 10000, expr = fx_fs(sample_size = 50, alpha = 0.05), simplify = array))

user system elapsed 16.897 8.827 25.739

power <- mean(lst)
kable(power, col.names = "Power Calculated by 10000 Times Simulation") %>% kable_styling(full_width = FALSE)

Power Calculated by 10000 Times Simulation
0.7133

Make It Faster

library(parallel)


system.time(lst <- unlist(mclapply(1:10000, function(x) fx_fs(sample_size = 50, alpha = 0.05), mc.cores = detectCores())))

user system elapsed 60.650 43.450 8.121

power <- mean(lst)
kable(power, col.names = "Power Calculated by Parallel Process") %>% kable_styling(full_width = FALSE)

Power Calculated by Parallel Process
0.7163

A Function for Power Simulation

The function simulates tests using glm, Chi-squared test, and Fisher’s exact test.
It uses R6 class and parallel computing.

library(data.table)
library(R6)
library(parallel)

SimProp <- R6Class(
  "SimProp",
  list(
    p_control = NULL,
    odds_control = NULL,
    treatment_effect_or = NULL,
    p_case = NULL,
    odds_case = NULL,
    n_case = NULL,
    n_control_case_ratio = NULL,
    n_control = NULL,
    grid_table = NULL,
    target_data = NULL,
    n_target_data_case = NULL,
    n_target_data_control = NULL,
    n_simulation = 1000,
    alpha = 0.05,
    method = "glm",
    power_table = NULL,
    fx_p2odds = function(x) {
      x / (1 - x)
    },
    fx_odds2p = function(x) {
      x / (1 + x)
    },
    initialize = function(p_control, treatment_effect_or, n_case, n_control_case_ratio = 1, ...) {
      self$p_control <- p_control
      self$odds_control <- self$p_control / (1 - self$p_control)
      self$treatment_effect_or <- treatment_effect_or
      self$odds_case <- self$odds_control * self$treatment_effect_or
      self$p_case <- self$odds_case / (self$odds_case + 1)
      self$n_case <- n_case
      self$n_control_case_ratio <- n_control_case_ratio
      self$n_control <- self$n_case * self$n_control_case_ratio
      self$n_target_data_case <- max(self$n_case) * 100
      self$n_target_data_control <- max(self$n_control) * 100
      self$grid_table <- as.data.table(expand.grid(
        p_control = p_control,
        treatment_effect_or = treatment_effect_or,
        n_case = n_case,
        n_control_case_ratio = n_control_case_ratio
      ))
      self$grid_table <- self$grid_table[
        , odds_control := self$fx_p2odds(p_control)
      ][, odds_case := odds_control * treatment_effect_or][, p_case := self$fx_odds2p(odds_case)][, n_control := n_case * n_control_case_ratio][, .(
        n_case, p_case, odds_case, n_control,
        p_control, odds_control, treatment_effect_or, n_control_case_ratio
      )]
    },
    gen_target_data = function() {
      self$target_data <- lapply(1:nrow(self$grid_table), function(x) {
        list(
          data_case = rbinom(n = self$n_target_data_case, size = 1, prob = self$grid_table$p_case[x]),
          data_control = rbinom(n = self$n_target_data_control, size = 1, prob = self$grid_table$p_control[x]),
          p_case = self$grid_table$p_case[x],
          n_case = self$grid_table$n_case[x],
          p_control = self$grid_table$p_control[x],
          n_control = self$grid_table$n_control[x]
        )
      })
      invisible(self)
    },
    sample_target_data = function(lst) {
      require(data.table)
      sample_case <- data.table(
        treatment = rep(1, lst$n_case),
        outcome = sample(x = lst[["data_case"]], size = lst$n_case, replace = FALSE)
      )
      sample_control <- data.table(
        treatment = rep(0, lst$n_control),
        outcome = sample(x = lst[["data_control"]], size = lst$n_control, replace = FALSE)
      )
      invisible(rbind(sample_case, sample_control))
    },
    get_pvalue_glm = function(data) {
      m <- glm(outcome ~ treatment, data = data, family = binomial(link = logit))
      ms <- summary(m)
      invisible(ms$coefficients["treatment", "Pr(>|z|)"])
    },
    get_pvalue_chisq = function(data) {
      invisible(chisq.test(table(data))$p.value)
    },
    get_pvalue_fisher = function(data) {
      invisible(fisher.test(table(data), hybrid = FALSE, conf.int = FALSE)$p.value)
    },
    gen_tests = function(lst) {
      pvalue_lst <- lapply(X = 1:self$n_simulation, FUN = function(x) {
        data <- self$sample_target_data(lst)
        if (self$method == "glm") {
          pvalue <- self$get_pvalue_glm(data)
        } else if (self$method == "chisq") {
          pvalue <- self$get_pvalue_chisq(data)
        } else if (self$method == "fisher") {
          pvalue <- self$get_pvalue_fisher(data)
        } else {
          pvalue <- NA
        }
      })
      invisible(unlist(pvalue_lst) < self$alpha)
    },
    gen_power = function() {
      require(parallel)
      self$power_table <- parallel::mclapply(X = self$target_data, FUN = function(x) {
        mean(self$gen_tests(x), na.rm = TRUE)
      }, mc.cores = detectCores())
      self$grid_table$alpha <- self$alpha
      self$grid_table$method <- self$method
      self$grid_table$power <- self$power_table
      invisible(self)
    }
  )
)


sp <- SimProp$new(p_control = c(0.3), treatment_effect_or = 3, n_case = c(30, 50, 70))
sp$n_simulation <- 5000
sp$gen_target_data()


sp$method <- "glm"
sp$gen_power()
kable(sp$grid_table, caption = "Power if test using glm") %>% kable_styling(full_width = FALSE)

Power if test using glm
n_case	p_case	odds_case	n_control	p_control	odds_control	treatment_effect_or	n_control_case_ratio	alpha	method	power
30	0.5625	1.285714	30	0.3	0.4285714	3	1	0.05	glm	0.5292
50	0.5625	1.285714	50	0.3	0.4285714	3	1	0.05	glm	0.7858
70	0.5625	1.285714	70	0.3	0.4285714	3	1	0.05	glm	0.9126

sp$method <- "chisq"
sp$gen_power()
kable(sp$grid_table, caption = "Power if chisq test") %>% kable_styling(full_width = FALSE)

Power if chisq test
n_case	p_case	odds_case	n_control	p_control	odds_control	treatment_effect_or	n_control_case_ratio	alpha	method	power
30	0.5625	1.285714	30	0.3	0.4285714	3	1	0.05	chisq	0.4358
50	0.5625	1.285714	50	0.3	0.4285714	3	1	0.05	chisq	0.7126
70	0.5625	1.285714	70	0.3	0.4285714	3	1	0.05	chisq	0.889

sp$method <- "fisher"
sp$gen_power()
kable(sp$grid_table, caption = "Power if using Fisher's exact test") %>% kable_styling(full_width = FALSE)

Power if using Fisher’s exact test
n_case	p_case	odds_case	n_control	p_control	odds_control	treatment_effect_or	n_control_case_ratio	alpha	method	power
30	0.5625	1.285714	30	0.3	0.4285714	3	1	0.05	fisher	0.4298
50	0.5625	1.285714	50	0.3	0.4285714	3	1	0.05	fisher	0.7144
70	0.5625	1.285714	70	0.3	0.4285714	3	1	0.05	fisher	0.8948

Power for ANOVA

## install.packages("pwr")

library(pwr)

N <- 1354
k <- 4


t <- pwr.anova.test(k = 2, n = 110, sig.level = 0.05, power = 0.8)


N <- 220
N <- 100
for (k in 2:7) {
    print(k)
    pwri <- pwr.anova.test(k = k, n = round(N / k), sig.level = 0.05, power = 0.8)
    ti <- c(k, pwri$f)
    if (k == 2) {
        rtn <- ti
    } else {
        rtn <- rbind(rtn, ti)
    }
}

rtn

rtn2 <- rtn
rtn * 2

str(ti)


N <- 220
N <- 100
for (k in 2:7) {
    print(k)
    print(round(N / k))
    x <- pwr.t.test(n = round(N / k), d = NULL, sig.level = 0.05, power=0.8, 
    type = c("two.sample"),
    alternative = c("two.sided")
    )
    ti <- c(k, x$d)
        if (k == 2) {
        rtn <- ti
    } else {
        rtn <- rbind(rtn, ti)
    }
}

rtn


citation("pwr")

x <- pwr.t.test(n = 50, d = NULL, sig.level = 0.05, power=0.8, 
    type = c("two.sample"),
    alternative = c("two.sided")
    )
str(x)
x


pwr.anova.test(f = 0.1, k = 2, n = 675, sig.level = 0.05)


library(Superpower)

a1b1:0
a1b2:0.2
a2b1:0.2
a2b2:0.4

design_result <- ANOVA_design(design = "2w*2w",
                              n = round(1354 / 4),
                              mu = c(0, 0.1, 0.1, 0.4),
                              sd = 1,
                              r=0, 
                              plot = FALSE)


power_result <- ANOVA_power(design_result, alpha_level = 0.05,
      p_adjust = "none", seed = 2019, nsims = 100)






## compare t-test and anova

pwr.anova.test(k = 2,
               n = NULL,
               f = 0.2, 
               sig.level = 0.05, 
               power = 0.8) 


pwr.t.test(n = NULL,
           d = 0.2,
           sig.level = 0.05,
           type = "two.sample",  
           alternative = "two.sided", 
           power = 0.8)

pwr.anova.test(k = 2,
               n = NULL,
               f = 0.1, 
               sig.level = 0.05, 
               power = 0.8)

pwr.t.test(n = NULL,
           d = 0.2,
           sig.level = 0.05,
           type = "two.sample",  
           alternative = "greater", 
           power = 0.8)

Statistics for Stratified Systematic Sampling

\(N\) is the number of total houses.
\(L\) is the number of strata (sub-districts within the survey area).
\(N_h\) is the number of total houses within the stratum \(h\).
\(n_h\) is the number of sampled houses within the stratum \(h\).
\(s^2_h\) is the estimated variance of the stratum \(h\).
\(x_{h,i}\) is the value of \(x\) for the element \(i\) within the stratum \(h\).
\(\bar{x}_h\) is the mean of \(x\) within the stratum \(h\).
\(\bar{x}_{str}\) is the mean estimate for entire population.
\(p_{y,str}\) is the proportion estimate for entire population.

Estimated mean of the stratum \(h\): \[ \bar{x}_h = \frac{\sum_{i=1}^{n_h}(x_{h,i})}{n_h}\]

Estimated proportion of the stratum \(h\): \[ P_{hy} = \frac{\sum_{i=1}^{n_h}(Y_{h,i})}{n_h}\]

Variance estimate of the stratum \(h\):

\[ s^2_h = \frac{\sum_{i=1}^{n_h}(x_{h,i} - \bar{x}_h)^2}{n_h - 1}\] \[ s^2_h = \frac{n_h}{n_h - 1}P_{py}(1 - P_{py})\]

Design Effect \[\rho=1 + \delta_x(n-1)\]

Estimate Mean for Entire Population

\[\bar{x}_{str} = \frac{\sum_{h=1}^LN_h\bar{x}_h}{N}\]

Standard Error of Mean Estimate \[\hat{SE}(\bar{x}_{str}) = \sqrt{\sum_{h=1}^L(\frac{N_h}{N})^2\frac{s_{hx}^2}{n_h}(\frac{N_h-n_h}{N_h})} \]

Estimate Proportion for Entire Population

\[p_{y,str}=\frac{\sum_{h=1}^LN_hP_h}{N}\]

Standard Error for Proportion Estimate

\[\hat{SE}(p_{y,str}) = \sqrt{\sum_{h=1}^L(\frac{N_h}{N})^2\frac{p_{hy}(1-p_{hy})}{n_h-1}(\frac{N_h-n_h}{N_h})} \]