KPMP Figures

Related to Figure 6.

Reads the pseudobulk log-CPM CSVs produced by 7B.KPMP_Query.md, applies UCell scoring, and generates Figures 07–10.

Gene signatures

# Gene signatures scored with UCell in the next section. The list names matter:
# downstream code reads the score column "CAAH_10gene_UCell".
ALL_SIGS <- list(
  CAAH_10gene = c(
    "CLU", "LRP2", "LAMP2", "COL4A2", "SPINK1",
    "WFDC2", "PAX8", "LYZ", "S100A9", "CDH13"
  ),
  Hypoxia = c(
    "HIF1A", "EPAS1", "EPO", "EPOR", "VEGFA",
    "ALDOA", "LDHA", "PGK1", "SLC2A1", "ENO1",
    "PGAM1", "TPI1", "ANGPTL4", "HILPDA", "PLAUR",
    "SLC16A1", "BNIP3", "CA9", "P4HA1"
  ),
  Glycolysis = c(
    "HK2", "PFKM", "PKM", "LDHA", "HK1",
    "ALDOA", "GAPDH", "PGK1", "PGAM1", "ENO1",
    "TPI1", "GPI", "PFKL", "PFKP", "LDHB"
  ),
  ECM_Fibrosis = c(
    "COL1A1", "COL1A2", "COL3A1", "COL4A1", "COL4A2",
    "COL12A1", "FN1", "TIMP1", "TIMP2", "MMP2",
    "ACTA2", "VIM", "TGFB1", "CCN2", "POSTN", "THBS1"
  )
)

# Stand-alone vectors under their original names (CAAH_10_GENES is reused by
# the per-gene dot plot).
CAAH_10_GENES    <- ALL_SIGS[["CAAH_10gene"]]
HYPOXIA_GENES    <- ALL_SIGS[["Hypoxia"]]
GLYCOLYSIS_GENES <- ALL_SIGS[["Glycolysis"]]
ECM_GENES        <- ALL_SIGS[["ECM_Fibrosis"]]

Load pseudobulk CSVs and apply UCell

# Load the pseudobulk expression table (gene columns plus donor_id / disease;
# presumably one row per donor) and its metadata. KMP_DIR is defined outside
# this section.
# NOTE(review): confirm KMP_DIR is the intended name (not a typo for KPMP_DIR).
pb_expr <- read.csv(file.path(KMP_DIR, "kpmp_pseudobulk_expr.csv"))
# pb_meta is not referenced again in the visible part of this file.
pb_meta <- read.csv(file.path(KMP_DIR, "kpmp_pseudobulk_meta.csv"))

# The ifelse() below labels every non-"normal" donor as "CKD". Fail fast on
# any other (or missing) disease value so such donors cannot be silently
# folded into the CKD group of a "CKD vs Normal" comparison.
unexpected_disease <- setdiff(unique(as.character(pb_expr$disease)),
                              c("normal", "chronic kidney disease"))
if (length(unexpected_disease) > 0) {
  stop("Unexpected disease value(s) in kpmp_pseudobulk_expr.csv: ",
       paste(unexpected_disease, collapse = ", "), call. = FALSE)
}

pb_expr$disease_label <- factor(ifelse(pb_expr$disease == "normal", "Normal", "CKD"),
                                levels = c("Normal", "CKD"))

# Everything that is not an annotation column is treated as a gene.
gene_cols <- setdiff(names(pb_expr), c("donor_id", "disease", "disease_label"))

# Transpose to genes (rows) x donors (columns) for UCell.
pb_mat    <- t(as.matrix(pb_expr[, gene_cols, drop = FALSE]))
colnames(pb_mat) <- pb_expr$donor_id
# A stray non-numeric column would turn the whole matrix into character.
if (!is.numeric(pb_mat)) {
  stop("Non-numeric column(s) found among the gene columns of ",
       "kpmp_pseudobulk_expr.csv.", call. = FALSE)
}

# Score every signature per donor; the result is bound column-wise to the
# donor annotations, so its rows must follow the donor order of pb_expr.
scores_mat <- ScoreSignatures_UCell(matrix = pb_mat, features = ALL_SIGS)
score_df   <- cbind(pb_expr[, c("donor_id", "disease", "disease_label")],
                    as.data.frame(scores_mat))

Statistics

# Score column produced by UCell for the CAAH signature.
caah_col <- "CAAH_10gene_UCell"

# Define both groups from the literal disease strings, exactly as the per-gene
# tests do, so the Wilcoxon p-value and the AUC describe the same
# "CKD vs Normal" comparison. %in% never yields NA for missing disease values.
is_normal <- score_df$disease %in% "normal"
is_ckd    <- score_df$disease %in% "chronic kidney disease"

# Two-sided Wilcoxon rank-sum test (normal approximation, exact = FALSE).
wt      <- wilcox.test(score_df[[caah_col]][is_normal],
                       score_df[[caah_col]][is_ckd], exact = FALSE)

# levels = c(control, case); donors with any other disease value are excluded
# rather than counted as controls.
# NOTE(review): no `direction` is passed, so pROC chooses it automatically —
# confirm that is intended for a validation AUC.
roc_obj <- roc(response  = score_df$disease,
               predictor = score_df[[caah_col]],
               levels    = c("normal", "chronic kidney disease"),
               quiet     = TRUE)
auc_val <- round(as.numeric(auc(roc_obj)), 3)

Key results: Wilcoxon p < 0.001; AUC = 0.850 (CKD vs Normal).

Figure — CAAH score boxplot (CKD vs Normal)

Related to Figure 6d (CAAH UCell score boxplot, KPMP snRNA-seq, CKD vs Normal, AUC = 0.850).

# Boxplot of the CAAH UCell score by disease group, annotated with the
# Wilcoxon p-value and the AUC computed in the statistics section.
pal_kpmp2 <- c("Normal" = "#4393C3", "CKD" = "#D6604D")
# Anchor for the annotation: 5% above the highest score.
y_top     <- max(score_df[[caah_col]], na.rm = TRUE) * 1.05
p_label   <- if (wt$p.value < 0.001) "p < 0.001" else paste0("p = ", signif(wt$p.value, 2))
# Fixed three decimals so trailing zeros survive ("0.850", not "0.85").
auc_label <- sprintf("%.3f", auc_val)

p7 <- ggplot(score_df, aes(x = disease_label, y = .data[[caah_col]], fill = disease_label)) +
  geom_boxplot(outlier.shape = NA, alpha = 0.7, width = 0.5) +
  # height = 0: spread points horizontally only, never perturb the scores.
  geom_jitter(aes(colour = disease_label), width = 0.15, height = 0,
              size = 1.8, alpha = 0.85) +
  annotate("segment", x = 1, xend = 2, y = y_top * 1.02, yend = y_top * 1.02,
           colour = "black", linewidth = 0.4) +
  annotate("text", x = 1.5, y = y_top * 1.10,
           label = paste0(p_label, "\nAUC = ", auc_label), size = 3.2, hjust = 0.5) +
  scale_fill_manual(values = pal_kpmp2, guide = "none") +
  # The groups are already named on the x axis, so drop the colour legend too
  # (it would otherwise appear with the raw title "disease_label").
  scale_colour_manual(values = pal_kpmp2, guide = "none") +
  labs(title = "CAAH Signature — KPMP (snRNA-seq)", x = NULL,
       y = "CAAH 10-gene score (UCell)") +
  theme_classic(base_size = 12)

ggsave(file.path(FIG_DIR, "07_KPMP_signature_scores_boxplot.svg"), p7, width = 5, height = 4.5)

Figure — ROC curve

# Data for the KPMP ROC curve. rev() flips pROC's specificity/sensitivity
# vectors (presumably so the rows run from the (0, 0) corner to (1, 1) —
# verify against the plotted curve).
roc_df <- data.frame(specificity = rev(roc_obj$specificities),
                     sensitivity = rev(roc_obj$sensitivities))

p8 <- ggplot(roc_df, aes(x = 1 - specificity, y = sensitivity)) +
  # Chance diagonal for reference.
  geom_abline(slope = 1, intercept = 0, linetype = "dashed",
              colour = "grey60", linewidth = 0.5) +
  geom_line(colour = "#D6604D", linewidth = 1.1) +
  # sprintf keeps three decimals so trailing zeros survive ("0.850").
  annotate("text", x = 0.65, y = 0.15,
           label = paste0("AUC = ", sprintf("%.3f", auc_val), "\n(CKD vs Normal)"),
           size = 3.5, colour = "#D6604D", hjust = 0) +
  scale_x_continuous(labels = percent_format(), expand = c(0.01, 0.01)) +
  scale_y_continuous(labels = percent_format(), expand = c(0.01, 0.01)) +
  labs(title = "ROC — CAAH Signature (KPMP)",
       x = "1 − Specificity (False Positive Rate)", y = "Sensitivity (True Positive Rate)") +
  theme_classic(base_size = 12)

ggsave(file.path(FIG_DIR, "08_KPMP_ROC.svg"), p8, width = 5, height = 4.5)

Figure — Individual gene dot plot

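Related to Figure 6e (per-gene mean difference in log-CPM between CKD and Normal; dot size = −log₁₀ of the unadjusted Wilcoxon p-value, color = direction of change, asterisks = significance after Benjamini–Hochberg adjustment).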

# Signature genes that are actually present as columns of the pseudobulk table.
caah_found   <- intersect(CAAH_10_GENES, gene_cols)
caah_missing <- setdiff(CAAH_10_GENES, caah_found)
if (length(caah_found) == 0) {
  stop("None of the CAAH signature genes are present in the pseudobulk table.",
       call. = FALSE)
}
if (length(caah_missing) > 0) {
  # The plot title says "10-Gene"; make a shorter gene list visible.
  warning("CAAH signature genes missing from the pseudobulk table: ",
          paste(caah_missing, collapse = ", "), call. = FALSE)
}

# Donor masks are the same for every gene, so compute them once.
is_ckd    <- pb_expr$disease %in% "chronic kidney disease"
is_normal <- pb_expr$disease %in% "normal"

# Per gene: group means, their difference (CKD - Normal) and a two-sided
# Wilcoxon rank-sum p-value.
gene_stats <- lapply(caah_found, function(g) {
  ckd    <- pb_expr[[g]][is_ckd]
  normal <- pb_expr[[g]][is_normal]
  wt_g   <- wilcox.test(ckd, normal, exact = FALSE)
  mean_ckd    <- mean(ckd, na.rm = TRUE)
  mean_normal <- mean(normal, na.rm = TRUE)
  data.frame(Gene = g, mean_CKD = mean_ckd,
             mean_Normal = mean_normal,
             Diff    = mean_ckd - mean_normal,
             p_value = wt_g$p.value)
})
gene_stats_df <- do.call(rbind, gene_stats)

# Asterisks are based on BH-adjusted p-values ...
gene_stats_df$p_adj      <- p.adjust(gene_stats_df$p_value, method = "BH")
gene_stats_df$sig_label  <- dplyr::case_when(gene_stats_df$p_adj < 0.001 ~ "***",
                                              gene_stats_df$p_adj < 0.01  ~ "**",
                                              gene_stats_df$p_adj < 0.05  ~ "*",
                                              TRUE                         ~ "ns")
# ... whereas dot size uses the unadjusted p-value; the 1e-10 offset keeps
# -log10() finite when p is 0.
gene_stats_df$neg_log10_p <- -log10(gene_stats_df$p_value + 1e-10)
# Order genes on the y axis by effect size.
gene_stats_df <- gene_stats_df %>% dplyr::arrange(Diff) %>%
    dplyr::mutate(Gene = factor(Gene, levels = Gene))

p9 <- ggplot(gene_stats_df, aes(x = Diff, y = Gene,
             colour = ifelse(Diff > 0, "UP in CKD", "DOWN in CKD"),
             size = neg_log10_p)) +
  geom_vline(xintercept = 0, linetype = "dashed", colour = "grey60", linewidth = 0.5) +
  geom_point(alpha = 0.85) +
  # Push each label away from the zero line, on the side of its dot.
  geom_text(aes(label = sig_label),
            nudge_x = ifelse(gene_stats_df$Diff >= 0, 0.08, -0.08),
            size = 4, show.legend = FALSE) +
  scale_colour_manual("Direction",
                      values = c("UP in CKD" = "#D6604D", "DOWN in CKD" = "#4393C3")) +
  scale_size_continuous(name = expression(-log[10](p)), range = c(2, 8)) +
  labs(title    = "CAAH 10-Gene Expression — KPMP (CKD vs Normal)",
       subtitle = "Pseudo-bulk log-CPM; size = −log₁₀(p)",
       x = "Δ log-CPM (CKD − Normal)", y = NULL) +
  theme_classic(base_size = 12) + theme(legend.position = "right")

ggsave(file.path(FIG_DIR, "09_KPMP_individual_genes_dotplot.svg"), p9, width = 6, height = 4.5)

Figure — Combined ROC (ERCB + KPMP)

Related to Figure 6g (combined ROC across ERCB glomerular AUC = 0.789, ERCB tubulointerstitial AUC = 0.731, and KPMP AUC = 0.850).

# ERCB signature scores written by an earlier step; TAB_DIR is defined outside
# this section.
ercb_scores <- read.csv(file.path(TAB_DIR, "CAAH_signature_scores_ERCB.csv"))

# Keep only the HT and Healthy groups, separately for each compartment.
# filter() is namespace-qualified so it cannot be masked by stats::filter or
# by another attached package.
ercb_glom <- ercb_scores %>%
  dplyr::filter(compartment == "Glomerular", group %in% c("HT", "Healthy"))
ercb_ti   <- ercb_scores %>%
  dplyr::filter(compartment == "Tubulointerstitial", group %in% c("HT", "Healthy"))

# ROC per compartment with HT as the case group.
# NOTE(review): no `direction` is passed, so pROC chooses it automatically and
# independently for each curve — confirm both curves use the same direction.
roc_glom <- roc(ercb_glom$group == "HT", ercb_glom$CAAH_10gene_UCell, quiet = TRUE)
roc_ti   <- roc(ercb_ti$group   == "HT", ercb_ti$CAAH_10gene_UCell,   quiet = TRUE)
auc_glom <- round(as.numeric(auc(roc_glom)), 3)
auc_ti   <- round(as.numeric(auc(roc_ti)),   3)

#' Turn a ROC object into a plotting data frame.
#'
#' @param roc_obj Object with `specificities` and `sensitivities` elements
#'   (e.g. the result of `roc()`).
#' @param label Dataset name used in the legend entry.
#' @param auc_val AUC to show in the legend entry. Numeric values are printed
#'   with a fixed number of decimals so trailing zeros are kept ("0.850");
#'   anything else is pasted as-is.
#' @param digits Number of decimals used for a numeric `auc_val`.
#' @return A data frame with columns `fpr` (1 - specificity), `tpr`
#'   (sensitivity) and `Dataset` ("<label> (AUC = <auc>)"), with rows in the
#'   reverse of the order stored in `roc_obj`.
buildRocDf <- function(roc_obj, label, auc_val, digits = 3) {
  auc_txt <- if (is.numeric(auc_val)) {
    formatC(auc_val, format = "f", digits = digits)
  } else {
    as.character(auc_val)
  }
  data.frame(
    fpr     = 1 - rev(roc_obj$specificities),
    tpr     = rev(roc_obj$sensitivities),
    Dataset = paste0(label, " (AUC = ", auc_txt, ")")
  )
}

# Stack the two ERCB curves and the KPMP curve into one long data frame; the
# Dataset column (label plus AUC) drives both colour and line type.
combined_roc <- bind_rows(list(
  buildRocDf(roc_glom, "ERCB Glomerular", auc_glom),
  buildRocDf(roc_ti, "ERCB Tubulointerstitial", auc_ti),
  buildRocDf(roc_obj, "KPMP snRNA-seq", auc_val)
))

# One colour per dataset label, assigned in order of first appearance.
roc_labels <- unique(combined_roc$Dataset)
pal_roc <- c("#E41A1C", "#FF7F00", "#984EA3")
names(pal_roc) <- roc_labels

p10 <- ggplot(
  combined_roc,
  aes(x = fpr, y = tpr, colour = Dataset, linetype = Dataset)
) +
  # Chance diagonal for reference.
  geom_abline(
    slope = 1, intercept = 0,
    linetype = "dashed", colour = "grey60", linewidth = 0.5
  ) +
  geom_line(linewidth = 1.1) +
  labs(
    title    = "CAAH 10-Gene Signature — Human Validation ROC",
    subtitle = "ERCB microarray (HT vs Healthy) + KPMP snRNA-seq (CKD vs Normal)",
    x        = "1 − Specificity (False Positive Rate)",
    y        = "Sensitivity (True Positive Rate)"
  ) +
  scale_x_continuous(
    labels = percent_format(),
    limits = c(0, 1),
    expand = c(0.01, 0.01)
  ) +
  scale_y_continuous(
    labels = percent_format(),
    limits = c(0, 1),
    expand = c(0.01, 0.01)
  ) +
  scale_colour_manual("Dataset", values = pal_roc) +
  scale_linetype_manual("Dataset", values = c("solid", "longdash", "dotdash")) +
  theme_classic(base_size = 12) +
  # Legend placed inside the panel with a semi-transparent background.
  theme(
    legend.key.width  = unit(1.5, "cm"),
    legend.background = element_rect(fill = alpha("white", 0.8), colour = NA),
    legend.position   = c(0.62, 0.18)
  )

ggsave(
  file.path(FIG_DIR, "10_Combined_ROC_ERCB_KPMP.svg"),
  p10,
  width = 6, height = 5.5
)