Power and Sample Size Simulation for Two Proportions

An Example of Power Simulation

Effect Size

  • Rate of the control group: 30%
  • Effect size: odds ratio of 3
  • Expected rate of the case group: 56.25%
Expected Rate of the Treatment Group
0.5625

A Function for Power Simulation

  • The function simulates tests using glm, Chi-squared test, and Fisher’s exact test.
  • It uses R6 class and parallel computing.
library(data.table)
library(R6)
library(parallel)

SimProp <- R6Class(
  "SimProp",
  list(
    p_control = NULL,
    odds_control = NULL,
    treatment_effect_or = NULL,
    p_case = NULL,
    odds_case = NULL,
    n_case = NULL,
    n_control_case_ratio = NULL,
    n_control = NULL,
    grid_table = NULL,
    target_data = NULL,
    n_target_data_case = NULL,
    n_target_data_control = NULL,
    n_simulation = 1000,
    alpha = 0.05,
    method = "glm",
    power_table = NULL,
    fx_p2odds = function(x) {
      x / (1 - x)
    },
    fx_odds2p = function(x) {
      x / (1 + x)
    },
    initialize = function(p_control, treatment_effect_or, n_case, n_control_case_ratio = 1, ...) {
      self$p_control <- p_control
      self$odds_control <- self$p_control / (1 - self$p_control)
      self$treatment_effect_or <- treatment_effect_or
      self$odds_case <- self$odds_control * self$treatment_effect_or
      self$p_case <- self$odds_case / (self$odds_case + 1)
      self$n_case <- n_case
      self$n_control_case_ratio <- n_control_case_ratio
      self$n_control <- self$n_case * self$n_control_case_ratio
      self$n_target_data_case <- max(self$n_case) * 100
      self$n_target_data_control <- max(self$n_control) * 100
      self$grid_table <- as.data.table(expand.grid(
        p_control = p_control,
        treatment_effect_or = treatment_effect_or,
        n_case = n_case,
        n_control_case_ratio = n_control_case_ratio
      ))
      self$grid_table <- self$grid_table[
        , odds_control := self$fx_p2odds(p_control)
      ][, odds_case := odds_control * treatment_effect_or][, p_case := self$fx_odds2p(odds_case)][, n_control := n_case * n_control_case_ratio][, .(
        n_case, p_case, odds_case, n_control,
        p_control, odds_control, treatment_effect_or, n_control_case_ratio
      )]
    },
    gen_target_data = function() {
      self$target_data <- lapply(1:nrow(self$grid_table), function(x) {
        list(
          data_case = rbinom(n = self$n_target_data_case, size = 1, prob = self$grid_table$p_case[x]),
          data_control = rbinom(n = self$n_target_data_control, size = 1, prob = self$grid_table$p_control[x]),
          p_case = self$grid_table$p_case[x],
          n_case = self$grid_table$n_case[x],
          p_control = self$grid_table$p_control[x],
          n_control = self$grid_table$n_control[x]
        )
      })
      invisible(self)
    },
    sample_target_data = function(lst) {
      require(data.table)
      sample_case <- data.table(
        treatment = rep(1, lst$n_case),
        outcome = sample(x = lst[["data_case"]], size = lst$n_case, replace = FALSE)
      )
      sample_control <- data.table(
        treatment = rep(0, lst$n_control),
        outcome = sample(x = lst[["data_control"]], size = lst$n_control, replace = FALSE)
      )
      invisible(rbind(sample_case, sample_control))
    },
    get_pvalue_glm = function(data) {
      m <- glm(outcome ~ treatment, data = data, family = binomial(link = logit))
      ms <- summary(m)
      invisible(ms$coefficients["treatment", "Pr(>|z|)"])
    },
    get_pvalue_chisq = function(data) {
      invisible(chisq.test(table(data))$p.value)
    },
    get_pvalue_fisher = function(data) {
      invisible(fisher.test(table(data), hybrid = FALSE, conf.int = FALSE)$p.value)
    },
    gen_tests = function(lst) {
      pvalue_lst <- lapply(X = 1:self$n_simulation, FUN = function(x) {
        data <- self$sample_target_data(lst)
        if (self$method == "glm") {
          pvalue <- self$get_pvalue_glm(data)
        } else if (self$method == "chisq") {
          pvalue <- self$get_pvalue_chisq(data)
        } else if (self$method == "fisher") {
          pvalue <- self$get_pvalue_fisher(data)
        } else {
          pvalue <- NA
        }
      })
      invisible(unlist(pvalue_lst) < self$alpha)
    },
    gen_power = function() {
      require(parallel)
      self$power_table <- parallel::mclapply(X = self$target_data, FUN = function(x) {
        mean(self$gen_tests(x), na.rm = TRUE)
      }, mc.cores = detectCores())
      self$grid_table$alpha <- self$alpha
      self$grid_table$method <- self$method
      self$grid_table$power <- self$power_table
      invisible(self)
    }
  )
)


sp <- SimProp$new(p_control = c(0.3), treatment_effect_or = 3, n_case = c(30, 50, 70))
sp$n_simulation <- 5000
sp$gen_target_data()


sp$method <- "glm"
sp$gen_power()
kable(sp$grid_table, caption = "Power if test using glm") %>% kable_styling(full_width = FALSE)
Power if test using glm
n_case p_case odds_case n_control p_control odds_control treatment_effect_or n_control_case_ratio alpha method power
30 0.5625 1.285714 30 0.3 0.4285714 3 1 0.05 glm 0.5292
50 0.5625 1.285714 50 0.3 0.4285714 3 1 0.05 glm 0.7858
70 0.5625 1.285714 70 0.3 0.4285714 3 1 0.05 glm 0.9126
Power if chisq test
n_case p_case odds_case n_control p_control odds_control treatment_effect_or n_control_case_ratio alpha method power
30 0.5625 1.285714 30 0.3 0.4285714 3 1 0.05 chisq 0.4358
50 0.5625 1.285714 50 0.3 0.4285714 3 1 0.05 chisq 0.7126
70 0.5625 1.285714 70 0.3 0.4285714 3 1 0.05 chisq 0.889
Power if using Fisher’s exact test
n_case p_case odds_case n_control p_control odds_control treatment_effect_or n_control_case_ratio alpha method power
30 0.5625 1.285714 30 0.3 0.4285714 3 1 0.05 fisher 0.4298
50 0.5625 1.285714 50 0.3 0.4285714 3 1 0.05 fisher 0.7144
70 0.5625 1.285714 70 0.3 0.4285714 3 1 0.05 fisher 0.8948

Power for ANOVA

Statistics for Stratified Systematic Sampling

  • \(N\) is the number of total houses.
  • \(L\) is the number of strata (sub-districts within the survey area).
  • \(N_h\) is the number of total houses within the stratum \(h\).
  • \(n_h\) is the number of sampled houses within the stratum \(h\).
  • \(s^2_h\) is the estimated variance of the stratum \(h\).
  • \(x_{h,i}\) is the value of \(x\) for the element \(i\) within the stratum \(h\).
  • \(\bar{x}_h\) is the mean of \(x\) within the stratum \(h\).
  • \(\bar{x}_{str}\) is the mean estimate for entire population.
  • \(p_{y,str}\) is the proportion estimate for entire population.

Estimated mean of the stratum \(h\): \[ \bar{x}_h = \frac{\sum_{i=1}^{n_h}(x_{h,i})}{n_h}\]

Estimated proportion of the stratum \(h\): \[ P_{hy} = \frac{\sum_{i=1}^{n_h}(Y_{h,i})}{n_h}\]

Variance estimate of the stratum \(h\):

\[ s^2_h = \frac{\sum_{i=1}^{n_h}(x_{h,i} - \bar{x}_h)^2}{n_h - 1}\] \[ s^2_h = \frac{n_h}{n_h - 1}P_{py}(1 - P_{py})\]

Design Effect \[\rho=1 + \delta_x(n-1)\]

Estimate Mean for Entire Population

\[\bar{x}_{str} = \frac{\sum_{h=1}^LN_h\bar{x}_h}{N}\]

Standard Error of Mean Estimate \[\hat{SE}(\bar{x}_{str}) = \sqrt{\sum_{h=1}^L(\frac{N_h}{N})^2\frac{s_{hx}^2}{n_h}(\frac{N_h-n_h}{N_h})} \]

Estimate Proportion for Entire Population

\[p_{y,str}=\frac{\sum_{h=1}^LN_hP_h}{N}\]

Standard Error for Proportion Estimate

\[\hat{SE}(p_{y,str}) = \sqrt{\sum_{h=1}^L(\frac{N_h}{N})^2\frac{p_{hy}(1-p_{hy})}{n_h-1}(\frac{N_h-n_h}{N_h})} \]