KPMP Figures
Related to Figure 6.
Reads the pseudobulk log-CPM CSVs produced by 7B.KPMP_Query.md, applies UCell scoring, and generates Figures 07–10.
Gene signatures
# Gene-symbol panels that are scored as signatures further down.

# CAAH 10-gene signature.
CAAH_10_GENES <- c(
  "CLU", "LRP2", "LAMP2", "COL4A2", "SPINK1",
  "WFDC2", "PAX8", "LYZ", "S100A9", "CDH13"
)

# Hypoxia panel (shares several genes with the glycolysis panel below).
HYPOXIA_GENES <- c(
  "HIF1A", "EPAS1", "EPO", "EPOR", "VEGFA",
  "ALDOA", "LDHA", "PGK1", "SLC2A1", "ENO1",
  "PGAM1", "TPI1", "ANGPTL4", "HILPDA", "PLAUR",
  "SLC16A1", "BNIP3", "CA9", "P4HA1"
)

# Glycolysis panel.
GLYCOLYSIS_GENES <- c(
  "HK2", "PFKM", "PKM", "LDHA", "HK1",
  "ALDOA", "GAPDH", "PGK1", "PGAM1", "ENO1",
  "TPI1", "GPI", "PFKL", "PFKP", "LDHB"
)

# Extracellular-matrix / fibrosis panel.
ECM_GENES <- c(
  "COL1A1", "COL1A2", "COL3A1", "COL4A1", "COL4A2",
  "COL12A1", "FN1", "TIMP1", "TIMP2", "MMP2",
  "ACTA2", "VIM", "TGFB1", "CCN2", "POSTN", "THBS1"
)

# Named list handed to the scoring call. The statistics code looks up the
# column "CAAH_10gene_UCell", so the element names must stay as they are.
ALL_SIGS <- list(
  CAAH_10gene  = CAAH_10_GENES,
  Hypoxia      = HYPOXIA_GENES,
  Glycolysis   = GLYCOLYSIS_GENES,
  ECM_Fibrosis = ECM_GENES
)
Load pseudobulk CSVs and apply UCell
# Load the per-donor pseudobulk expression table and score every signature in
# ALL_SIGS with UCell.
# NOTE(review): `KMP_DIR` is defined outside this section; the name looks like
# it may have been meant as `KPMP_DIR` -- confirm against the setup code.
pb_expr <- read.csv(file.path(KMP_DIR, "kpmp_pseudobulk_expr.csv"))
pb_meta <- read.csv(file.path(KMP_DIR, "kpmp_pseudobulk_meta.csv"))

# The two-group label below maps "normal" to Normal and *everything else* to
# CKD, whereas the ROC and per-gene tests select "chronic kidney disease"
# explicitly. Refuse any other label so that every comparison in this file
# is made on the same two groups instead of silently diverging.
expected_disease <- c("normal", "chronic kidney disease")
unexpected_disease <- setdiff(unique(pb_expr$disease), c(expected_disease, NA))
if (length(unexpected_disease) > 0) {
  stop(
    "Unexpected disease label(s) in kpmp_pseudobulk_expr.csv: ",
    paste(unexpected_disease, collapse = ", "),
    call. = FALSE
  )
}
pb_expr$disease_label <- factor(
  ifelse(pb_expr$disease == "normal", "Normal", "CKD"),
  levels = c("Normal", "CKD")
)

# Every column that is not an identifier/label is treated as a gene.
gene_cols <- setdiff(names(pb_expr), c("donor_id", "disease", "disease_label"))

# Transpose so that rows are genes and columns are donors.
# drop = FALSE keeps a data frame even if only one gene column is present.
pb_mat <- t(as.matrix(pb_expr[, gene_cols, drop = FALSE]))
if (!is.numeric(pb_mat)) {
  # A stray non-numeric column turns the whole matrix into character.
  stop(
    "Non-numeric column(s) found among the gene columns of ",
    "kpmp_pseudobulk_expr.csv",
    call. = FALSE
  )
}
colnames(pb_mat) <- pb_expr$donor_id

scores_mat <- ScoreSignatures_UCell(matrix = pb_mat, features = ALL_SIGS)

# The scores are attached by position, so the row counts must agree.
stopifnot(nrow(scores_mat) == nrow(pb_expr))
score_df <- cbind(
  pb_expr[, c("donor_id", "disease", "disease_label")],
  as.data.frame(scores_mat)
)
Statistics
# Group comparison (Wilcoxon rank-sum) and discrimination (ROC/AUC) for the
# CAAH signature score.
caah_col <- "CAAH_10gene_UCell"
wt <- wilcox.test(score_df[[caah_col]] ~ score_df$disease_label, exact = FALSE)

# Cases are donors labelled "chronic kidney disease"; all others are controls.
kpmp_roc_response <- factor(
  score_df$disease == "chronic kidney disease",
  levels = c(FALSE, TRUE),
  labels = c("control", "case")
)

# `levels` and `direction` are set explicitly. With pROC's default
# direction = "auto" the comparison direction is picked per data set from the
# group medians, so a reversed association is reported as AUC > 0.5 as well.
# direction = "<" encodes the assumption that controls score lower than
# cases; an AUC below 0.5 then signals an inverse association.
# NOTE(review): confirm that a higher CAAH score in CKD is the pre-specified
# hypothesis.
roc_obj <- roc(
  kpmp_roc_response, score_df[[caah_col]],
  levels = c("control", "case"), direction = "<",
  quiet = TRUE
)
auc_val <- round(as.numeric(auc(roc_obj)), 3)
Key results: Wilcoxon p < 0.001; AUC = 0.850 (CKD vs Normal).
Figure — CAAH score boxplot (CKD vs Normal)
Related to Figure 6d (CAAH UCell score boxplot, KPMP snRNA-seq, CKD vs Normal, AUC = 0.850).
# Figure 07: boxplot of the CAAH UCell score by disease group, annotated with
# the Wilcoxon p-value and the AUC, written to FIG_DIR as SVG.
pal_kpmp2 <- c("Normal" = "#4393C3", "CKD" = "#D6604D")

# Vertical anchor for the comparison bracket, just above the highest score.
y_top <- max(score_df[[caah_col]], na.rm = TRUE) * 1.05

# Guard against an undefined p-value, which would otherwise make `if` fail.
p_label <- if (is.na(wt$p.value)) {
  "p = NA"
} else if (wt$p.value < 0.001) {
  "p < 0.001"
} else {
  paste0("p = ", signif(wt$p.value, 2))
}

p7 <- ggplot(
  score_df,
  aes(x = disease_label, y = .data[[caah_col]], fill = disease_label)
) +
  geom_boxplot(outlier.shape = NA, alpha = 0.7, width = 0.5) +
  # height = 0 keeps every point at its true score (the default also jitters
  # vertically); the seed makes the saved figure reproducible.
  geom_jitter(
    aes(colour = disease_label),
    position = position_jitter(width = 0.15, height = 0, seed = 42),
    size = 1.8, alpha = 0.85
  ) +
  annotate(
    "segment",
    x = 1, xend = 2, y = y_top * 1.02, yend = y_top * 1.02,
    colour = "black", linewidth = 0.4
  ) +
  # sprintf keeps three decimals (0.85 -> "0.850"); pasting the rounded number
  # directly drops trailing zeros.
  annotate(
    "text",
    x = 1.5, y = y_top * 1.10,
    label = paste0(p_label, "\nAUC = ", sprintf("%.3f", auc_val)),
    size = 3.2, hjust = 0.5
  ) +
  scale_fill_manual(values = pal_kpmp2, guide = "none") +
  # NOTE(review): the colour legend is still drawn although the fill legend is
  # suppressed and the x axis already names the groups -- confirm intent.
  scale_colour_manual(values = pal_kpmp2) +
  labs(
    title = "CAAH Signature — KPMP (snRNA-seq)", x = NULL,
    y = "CAAH 10-gene score (UCell)"
  ) +
  theme_classic(base_size = 12)

ggsave(
  file.path(FIG_DIR, "07_KPMP_signature_scores_boxplot.svg"),
  p7,
  width = 5, height = 4.5
)
Figure — ROC curve
# Figure 08: ROC curve of the CAAH score in KPMP, written to FIG_DIR as SVG.
# Both coordinate vectors are reversed together, so each specificity stays
# paired with its sensitivity.
roc_df <- data.frame(
  specificity = rev(roc_obj$specificities),
  sensitivity = rev(roc_obj$sensitivities)
)

p8 <- ggplot(roc_df, aes(x = 1 - specificity, y = sensitivity)) +
  # Diagonal reference line (sensitivity equal to 1 - specificity).
  geom_abline(
    slope = 1, intercept = 0, linetype = "dashed",
    colour = "grey60", linewidth = 0.5
  ) +
  geom_line(colour = "#D6604D", linewidth = 1.1) +
  # sprintf keeps three decimals (0.85 -> "0.850"); pasting the rounded number
  # directly drops trailing zeros.
  annotate(
    "text",
    x = 0.65, y = 0.15,
    label = paste0("AUC = ", sprintf("%.3f", auc_val), "\n(CKD vs Normal)"),
    size = 3.5, colour = "#D6604D", hjust = 0
  ) +
  scale_x_continuous(labels = percent_format(), expand = c(0.01, 0.01)) +
  scale_y_continuous(labels = percent_format(), expand = c(0.01, 0.01)) +
  labs(
    title = "ROC — CAAH Signature (KPMP)",
    x = "1 − Specificity (False Positive Rate)",
    y = "Sensitivity (True Positive Rate)"
  ) +
  theme_classic(base_size = 12)

ggsave(file.path(FIG_DIR, "08_KPMP_ROC.svg"), p8, width = 5, height = 4.5)
Figure — Individual gene dot plot
Related to Figure 6e (per-gene mean difference in log-CPM between CKD and Normal, dot size = −log₁₀(p), color = direction).
# Figure 09: per-gene CKD-vs-Normal difference in pseudobulk log-CPM for the
# CAAH genes, with a Wilcoxon test per gene and BH-adjusted significance stars.

# Only genes that exist as columns of the expression table can be tested.
caah_found <- intersect(CAAH_10_GENES, gene_cols)
caah_missing <- setdiff(CAAH_10_GENES, caah_found)
if (length(caah_found) == 0) {
  stop(
    "None of the CAAH genes are present in the pseudobulk table",
    call. = FALSE
  )
}
if (length(caah_missing) > 0) {
  # The plot title still says "10-Gene", so make the shortfall visible.
  warning(
    "CAAH gene(s) missing from the pseudobulk table and not plotted: ",
    paste(caah_missing, collapse = ", "),
    call. = FALSE
  )
}

gene_stats <- lapply(caah_found, function(g) {
  ckd <- pb_expr[[g]][pb_expr$disease == "chronic kidney disease"]
  normal <- pb_expr[[g]][pb_expr$disease == "normal"]
  wt_g <- wilcox.test(ckd, normal, exact = FALSE)
  mean_ckd <- mean(ckd, na.rm = TRUE)
  mean_normal <- mean(normal, na.rm = TRUE)
  data.frame(
    Gene = g,
    mean_CKD = mean_ckd,
    mean_Normal = mean_normal,
    Diff = mean_ckd - mean_normal,
    p_value = wt_g$p.value
  )
})
gene_stats_df <- do.call(rbind, gene_stats)

# Stars are based on the BH-adjusted p-values ...
gene_stats_df$p_adj <- p.adjust(gene_stats_df$p_value, method = "BH")
gene_stats_df$sig_label <- dplyr::case_when(
  gene_stats_df$p_adj < 0.001 ~ "***",
  gene_stats_df$p_adj < 0.01 ~ "**",
  gene_stats_df$p_adj < 0.05 ~ "*",
  TRUE ~ "ns"
)
# ... while the dot size uses the raw p-value; the small offset keeps
# -log10 finite when p is 0.
gene_stats_df$neg_log10_p <- -log10(gene_stats_df$p_value + 1e-10)

# Order genes by effect size so the y axis runs from most negative to most
# positive difference.
gene_stats_df <- gene_stats_df %>%
  dplyr::arrange(Diff) %>%
  dplyr::mutate(Gene = factor(Gene, levels = Gene))

# Plot-only columns live in a copy so gene_stats_df keeps its original shape.
# Colour and label side use the same rule (Diff > 0), and the label position
# is a real x aesthetic: it follows the row it belongs to and is included
# when the x axis range is computed, so edge labels are not cut off.
gene_plot_df <- gene_stats_df %>%
  dplyr::mutate(
    Direction = ifelse(Diff > 0, "UP in CKD", "DOWN in CKD"),
    label_x = Diff + ifelse(Diff > 0, 0.08, -0.08)
  )

p9 <- ggplot(
  gene_plot_df,
  aes(x = Diff, y = Gene, colour = Direction, size = neg_log10_p)
) +
  geom_vline(
    xintercept = 0, linetype = "dashed",
    colour = "grey60", linewidth = 0.5
  ) +
  geom_point(alpha = 0.85) +
  geom_text(
    aes(x = label_x, label = sig_label),
    size = 4, show.legend = FALSE
  ) +
  scale_colour_manual(
    "Direction",
    values = c("UP in CKD" = "#D6604D", "DOWN in CKD" = "#4393C3")
  ) +
  scale_size_continuous(name = expression(-log[10](p)), range = c(2, 8)) +
  labs(
    title = "CAAH 10-Gene Expression — KPMP (CKD vs Normal)",
    subtitle = "Pseudo-bulk log-CPM; size = −log₁₀(p)",
    x = "Δ log-CPM (CKD − Normal)", y = NULL
  ) +
  theme_classic(base_size = 12) +
  theme(legend.position = "right")

ggsave(
  file.path(FIG_DIR, "09_KPMP_individual_genes_dotplot.svg"),
  p9,
  width = 6, height = 4.5
)
Figure — Combined ROC (ERCB + KPMP)
Related to Figure 6g (combined ROC across ERCB glomerular AUC = 0.789, ERCB tubulointerstitial AUC = 0.731, and KPMP AUC = 0.850).
# ERCB signature scores: one ROC per compartment, HT (cases) vs Healthy
# (controls).
ercb_scores <- read.csv(file.path(TAB_DIR, "CAAH_signature_scores_ERCB.csv"))

# dplyr::filter is spelled out, as elsewhere in this file, so the call cannot
# resolve to stats::filter.
ercb_glom <- ercb_scores %>%
  dplyr::filter(compartment == "Glomerular", group %in% c("HT", "Healthy"))
ercb_ti <- ercb_scores %>%
  dplyr::filter(
    compartment == "Tubulointerstitial",
    group %in% c("HT", "Healthy")
  )

# `levels` (controls first) and `direction` are fixed so that both
# compartments are evaluated against the same hypothesis, controls scoring
# lower than cases. With pROC's default direction = "auto" each curve would
# pick its own direction and always report AUC >= 0.5.
# NOTE(review): confirm that a higher score in HT is the expected direction.
roc_glom <- roc(
  ercb_glom$group, ercb_glom$CAAH_10gene_UCell,
  levels = c("Healthy", "HT"), direction = "<",
  quiet = TRUE
)
roc_ti <- roc(
  ercb_ti$group, ercb_ti$CAAH_10gene_UCell,
  levels = c("Healthy", "HT"), direction = "<",
  quiet = TRUE
)
auc_glom <- round(as.numeric(auc(roc_glom)), 3)
auc_ti <- round(as.numeric(auc(roc_ti)), 3)
# Turn a ROC object into a data frame of curve coordinates for plotting.
#
# Args:
#   roc_obj: object with `$specificities` and `$sensitivities` vectors of
#            equal length.
#   label:   dataset name shown in the legend.
#   auc_val: AUC for the legend label. Numeric values are printed with three
#            decimals so trailing zeros are kept (0.85 -> "0.850"); any other
#            value is pasted unchanged.
#
# Returns:
#   data.frame with columns `fpr` (1 - specificity), `tpr` (sensitivity) and
#   `Dataset` (the legend label, repeated on every row). Both coordinate
#   vectors are reversed relative to their order in `roc_obj`.
buildRocDf <- function(roc_obj, label, auc_val) {
  auc_text <- if (is.numeric(auc_val)) sprintf("%.3f", auc_val) else auc_val
  data.frame(
    fpr = 1 - rev(roc_obj$specificities),
    tpr = rev(roc_obj$sensitivities),
    Dataset = paste0(label, " (AUC = ", auc_text, ")")
  )
}
# Figure 10: ROC curves of the two ERCB compartments and KPMP on one panel,
# written to FIG_DIR as SVG.
combined_roc <- dplyr::bind_rows(
  buildRocDf(roc_glom, "ERCB Glomerular", auc_glom),
  buildRocDf(roc_ti, "ERCB Tubulointerstitial", auc_ti),
  buildRocDf(roc_obj, "KPMP snRNA-seq", auc_val)
)

# Colours and linetypes are both keyed by the legend label, so a dataset
# keeps its colour/linetype pairing even if the labels sort differently.
roc_labels <- unique(combined_roc$Dataset)
pal_roc <- setNames(c("#E41A1C", "#FF7F00", "#984EA3"), roc_labels)
lty_roc <- setNames(c("solid", "longdash", "dotdash"), roc_labels)

p10 <- ggplot(
  combined_roc,
  aes(x = fpr, y = tpr, colour = Dataset, linetype = Dataset)
) +
  # Diagonal reference line (tpr equal to fpr).
  geom_abline(
    slope = 1, intercept = 0, linetype = "dashed",
    colour = "grey60", linewidth = 0.5
  ) +
  geom_line(linewidth = 1.1) +
  scale_colour_manual("Dataset", values = pal_roc) +
  scale_linetype_manual("Dataset", values = lty_roc) +
  scale_x_continuous(
    labels = percent_format(), limits = c(0, 1), expand = c(0.01, 0.01)
  ) +
  scale_y_continuous(
    labels = percent_format(), limits = c(0, 1), expand = c(0.01, 0.01)
  ) +
  labs(
    title = "CAAH 10-Gene Signature — Human Validation ROC",
    subtitle = "ERCB microarray (HT vs Healthy) + KPMP snRNA-seq (CKD vs Normal)",
    x = "1 − Specificity (False Positive Rate)",
    y = "Sensitivity (True Positive Rate)"
  ) +
  theme_classic(base_size = 12) +
  # NOTE(review): a numeric legend.position is deprecated as of ggplot2 3.5.0
  # in favour of legend.position = "inside" plus legend.position.inside; left
  # unchanged so the script keeps running on older ggplot2 versions.
  theme(
    legend.position = c(0.62, 0.18),
    legend.background = element_rect(fill = alpha("white", 0.8), colour = NA),
    legend.key.width = unit(1.5, "cm")
  )

ggsave(
  file.path(FIG_DIR, "10_Combined_ROC_ERCB_KPMP.svg"),
  p10,
  width = 6, height = 5.5
)