# RESULTS REPLICATION
# ====================
# This script fits generalized linear mixed models (GLMMs) to estimate
# proportions of correct case management and other clinical quality indicators,
# accounting for clustering at the clinic and standardized patient (SP) levels.

library(dplyr)
library(tidyr)
library(glmmTMB)
library(emmeans)

###############################################################
######################## MODEL FORM ###########################
###############################################################

# Define the site factor levels used for stratified analysis
site_levels <- c("A", "B", "C")

#' Fit a binomial GLMM and return summary statistics
#'
#' @param dat       Data frame containing the outcome and grouping variables
#' @param outcome   Character string specifying the outcome variable (or cbind formula for proportions)
#' @param site_name Optional site identifier for subsetting; NULL fits overall model
#' @param db_cols   Character vector of column names required for complete-case analysis
#'
#' @return A list containing group label, outcome name, sample size, number of clinics/SPs,
#'         degrees of freedom, estimated probability (p_hat), and mean linear predictor (eta_bar)
fit_and_summarise <- function(dat,
                              outcome,
                              site_name = NULL,
                              db_cols) {
  
  # Subset to specific site if requested; otherwise use full dataset
  d <- if (is.null(site_name)) dat else subset(dat, site == site_name)
  
  # Restrict to complete cases for the columns needed in the model
  
  d <- d[complete.cases(d[, db_cols]), ]
  
  # Drop unused factor levels from clustering variables after subsetting
  d$clinic_names <- droplevels(factor(d$clinic_names))
  d$sp_names     <- droplevels(factor(d$sp_names))
  
  # Count unique clusters
  
  n_clinics <- nlevels(d$clinic_names)
  n_sps     <- nlevels(d$sp_names)
  
  
  # Require at least 2 clinics to estimate between-clinic variance
  if (n_clinics < 2) stop("Need at least 2 clinics after cleaning for: ",
                          ifelse(is.null(site_name), "Overall", site_name))
  
  # Degrees of freedom based on number of clinics (conservative choice for CIs)
  df_use <- n_clinics - 1
  tcrit  <- qt(0.975, df = df_use)  # Two-sided 95% critical value
  
  # Build formula: intercept-only fixed effects with crossed random intercepts
  form <- as.formula(paste0(outcome, " ~ 1 + (1 | clinic_names) + (1 | sp_names)"))
  
  # Fit the GLMM with REML estimation
  
  mod <- glmmTMB(
    form,
    data   = d,
    REML   = TRUE,
    family = binomial(link = "logit")
  )
  
  # -----------------------------
  # Compute marginal predictions
  # -----------------------------
  
  # Predicted linear predictor (eta) including random effects
  eta_hat <- predict(mod, type = "link", re.form = NULL)
  
  # Convert to probability scale
  p_hat_i <- plogis(eta_hat)
  
  # Average predicted probability across observations
  p_hat <- mean(p_hat_i)
  
  # Extract variance-covariance matrix for fixed effects
  V <- vcov(mod)$cond
  X <- model.matrix(mod)
  
  # -----------------------------
  # Delta-method SE for mean probability
  # -----------------------------
  # Gradient of mean(p) w.r.t. fixed effects uses chain rule through logistic
  w    <- p_hat_i * (1 - p_hat_i)         # Derivative of logistic at each eta
  grad <- colMeans(X * w)                  # Average gradient
  se_p <- sqrt(drop(t(grad) %*% V %*% grad))
  
  # Wald CI on probability scale (truncated to [0,1])
  ci_prob <- p_hat + c(-1, 1) * tcrit * se_p
  ci_prob <- pmin(pmax(ci_prob, 0), 1)
  
  # -----------------------------
  # Alternative: CI via mean eta then back-transform
  # -----------------------------
  eta_bar    <- mean(eta_hat)
  se_eta_bar <- sqrt(drop(t(colMeans(X)) %*% V %*% colMeans(X)))
  ci_eta     <- eta_bar + c(-1, 1) * tcrit * se_eta_bar
  ci_eta_prob <- plogis(ci_eta)
  
  # Return summary statistics
  list(
    group     = if (is.null(site_name)) "Overall" else site_name,
    outcome   = outcome,
    n         = nrow(d),
    n_clinics = n_clinics,
    n_sps     = n_sps,
    df        = df_use,
    p_hat     = p_hat,
    eta_bar   = eta_bar
  )
}

###############################################################
################# CORRECT CASE MANAGEMENT #####################
###############################################################

# Fit overall model (all sites combined)
overall_res <- fit_and_summarise(
  wp3data_rep,
  outcome  = "correct_case_management",
  db_cols  = c("correct_case_management", "sp_names", "clinic_names")
)

# Fit site-stratified models
site_res <- setNames(
  lapply(site_levels, \(s) fit_and_summarise(
    wp3data_rep,
    outcome   = "correct_case_management",
    site_name = s,
    db_cols   = c("correct_case_management", "sp_names", "clinic_names")
  )),
  site_levels
)

# Combine results and print
results_all <- c(list(Overall = overall_res), site_res)
print(results_all)

###############################################################
############### CARE THAT AVOIDS NEGLECT ######################
###############################################################

# Overall model for "avoiding neglect" outcome
overall_res <- fit_and_summarise(
  wp3data_rep,
  outcome = "avoiding_neglect_management",
  db_cols = c("avoiding_neglect_management", "sp_names", "clinic_names")
)

# Site-stratified models
# NOTE: db_cols uses correct_case_management here—verify this matches intended filtering
site_res <- setNames(
  lapply(site_levels, \(s) fit_and_summarise(
    wp3data_rep,
    outcome   = "avoiding_neglect_management",
    site_name = s,
    db_cols   = c("correct_case_management", "sp_names", "clinic_names")
  )),
  site_levels
)

results_all <- c(list(Overall = overall_res), site_res)
print(results_all)

###############################################################
################## HISTORY PROPORTIONS ########################
###############################################################

# Model history-taking as a binomial proportion: successes / total possible items
# cbind(successes, failures) format for binomial family
overall_res <- fit_and_summarise(
  wp3data_rep,
  outcome = "cbind(history_score, (max_history_score - history_score))",
  db_cols = c("history_score", "max_history_score", "sp_names", "clinic_names")
)

site_res <- setNames(
  lapply(site_levels, \(s) fit_and_summarise(
    wp3data_rep,
    outcome   = "cbind(history_score, (max_history_score - history_score))",
    site_name = s,
    db_cols   = c("history_score", "max_history_score", "sp_names", "clinic_names")
  )),
  site_levels
)

results_all <- c(list(Overall = overall_res), site_res)
print(results_all)

###############################################################
################ EXAMINATION PROPORTIONS ######################
###############################################################

# Model physical examination as a binomial proportion
overall_res <- fit_and_summarise(
  wp3data_rep,
  outcome = "cbind(examination_score, (max_examination_score - examination_score))",
  db_cols = c("examination_score", "max_examination_score", "sp_names", "clinic_names")
)

site_res <- setNames(
  lapply(site_levels, \(s) fit_and_summarise(
    wp3data_rep,
    outcome   = "cbind(examination_score, (max_examination_score - examination_score))",
    site_name = s,
    db_cols   = c("examination_score", "max_examination_score", "sp_names", "clinic_names")
  )),
  site_levels
)

results_all <- c(list(Overall = overall_res), site_res)
print(results_all)


###############################################################
## ADJUSTED MODEL: Correct Case Management with Covariates   ##
###############################################################

# This model adjusts for potential confounders:
#   - site: geographic location
#   - case: clinical case type presented by SP
#   - private: clinic ownership (public vs. private)
#   - patient_volume: average daily patient volume at clinic
#   - clinician_cadre: professional qualification level

mod_adjusted <- glmmTMB(
  correct_case_management ~ -1 +                      # Suppress global intercept
    factor(site) +                                    # Site fixed effects
    factor(case) +                                    # Case-type fixed effects
    factor(private) +                                 # Ownership fixed effect
    patient_volume +                                  # Continuous covariate
    factor(clinician_cadre) +                         # Clinician qualification
    (1 | clinic_names) +                              # Random intercept for clinics
    (1 | sp_names),                                   # Random intercept for SPs
  data   = wp3data_rep,
  REML   = TRUE,
  family = binomial(link = "logit")
)

summary(mod_adjusted)

# -----------------------------
# Estimated Marginal Means: Correct Case Management by Case Type
# -----------------------------

# Quick look at response-scale EMMs (default z-based CIs)
emmeans(mod_adjusted, ~ case, type = "response")

# Extract EMMs on link (logit) scale for custom t-based CIs
emm_link <- emmeans(mod_adjusted, ~ case)
s <- as.data.frame(summary(emm_link))

# Use clinic-based df for conservative inference
df_use <- length(unique(wp3data_rep$clinic_names)) - 1
tcrit  <- qt(0.975, df = df_use)

# Compute t-based confidence limits on logit scale, then back-transform
s$LCL_link <- s$emmean - tcrit * s$SE
s$UCL_link <- s$emmean + tcrit * s$SE

s$prob     <- plogis(s$emmean)
s$LCL_prob <- plogis(s$LCL_link)
s$UCL_prob <- plogis(s$UCL_link)

# Display case-specific probabilities with CIs
s[, c("case", "prob", "SE", "LCL_prob", "UCL_prob")]

# -----------------------------
# Odds Ratio: Private vs. Public Clinics
# -----------------------------

# Pairwise contrast on logit scale gives log-odds ratio
or_private <- contrast(
  emmeans(mod_adjusted, ~ private),
  method = "revpairwise"
)

# Default output with z-based inference
summary(or_private, infer = c(TRUE, TRUE), type = "response")

# Recompute with t-based CIs
s <- as.data.frame(summary(or_private))
df_use <- length(unique(wp3data$clinic_names)) - 1
tcrit  <- qt(0.975, df = df_use)

# CI on log(OR) scale
s$LCL_logOR <- s$estimate - tcrit * s$SE
s$UCL_logOR <- s$estimate + tcrit * s$SE

# Back-transform to odds ratio scale
s$OR  <- exp(s$estimate)
s$LCL <- exp(s$LCL_logOR)
s$UCL <- exp(s$UCL_logOR)

s[, c("contrast", "OR", "SE", "LCL", "UCL")]

# -----------------------------
# Odds Ratio: Effect of Patient Volume (per unit increase)
# -----------------------------

# emtrends extracts the slope of a continuous predictor on the link scale
vol_trend <- emtrends(
  mod_adjusted,
  ~ 1,
  var = "patient_volume"
)

vol_out <- as.data.frame(summary(vol_trend))

# t-based CI on log(OR) scale
vol_out$LCL_logOR <- vol_out$patient_volume.trend - tcrit * vol_out$SE
vol_out$UCL_logOR <- vol_out$patient_volume.trend + tcrit * vol_out$SE

# Back-transform: exp(slope) = OR per one-unit increase in patient_volume
vol_out$OR  <- exp(vol_out$patient_volume.trend)
vol_out$LCL <- exp(vol_out$LCL_logOR)
vol_out$UCL <- exp(vol_out$UCL_logOR)

vol_out[, c("OR", "SE", "LCL", "UCL")]