ERCB Microarray Analysis

Related to Figure 6.

European Renal cDNA Bank (ERCB) — two Affymetrix microarray datasets from GEO. Glomerular (GSE37460) and tubulointerstitial (GSE37455) biopsy compartments are analyzed separately. Comparison: hypertensive nephropathy (HT) vs. healthy living donors.

Gene signatures

Human orthologs of the mouse CAAH signature, scored alongside pathway context signatures via UCell.

caah_10_genes <- c("CLU", "LRP2", "LAMP2", "COL4A2", "SPINK1",
                   "WFDC2", "PAX8", "LYZ", "S100A9", "CDH13")

hypoxia_genes <- c("HIF1A", "EPAS1", "EPO", "EPOR", "VEGFA",
                   "ALDOA", "LDHA", "PGK1", "SLC2A1", "ENO1",
                   "PGAM1", "TPI1", "ANGPTL4", "HILPDA", "PLAUR",
                   "SLC16A1", "BNIP3", "CA9", "P4HA1", "KRT17")

glycolysis_genes <- c("HK2", "PFKM", "PKM", "LDHA", "HK1",
                      "ALDOA", "GAPDH", "PGK1", "PGAM1", "ENO1",
                      "TPI1", "GPI", "PFKL", "PFKP", "LDHB")

ecm_genes <- c("COL1A1", "COL1A2", "COL3A1", "COL4A1", "COL4A2",
               "COL12A1", "FN1", "TIMP1", "TIMP2", "MMP2",
               "ACTA2", "VIM", "TGFB1", "CTGF", "POSTN", "THBS1")

all_signatures <- list(
    CAAH_10gene  = caah_10_genes,
    Hypoxia      = hypoxia_genes,
    Glycolysis   = glycolysis_genes,
    SASP         = sasp_genes,
    ECM_Fibrosis = ecm_genes
)

Hallmark OxPhos and Glycolysis gene sets from MSigDB are appended after loading:

msig_hallmark <- msigdbr(species = "Homo sapiens", category = "H")
all_signatures[["OxPhos_Hallmark"]]    <- msig_hallmark %>%
    filter(gs_name == "HALLMARK_OXIDATIVE_PHOSPHORYLATION") %>%
    pull(gene_symbol) %>% unique()
all_signatures[["Glycolysis_Hallmark"]] <- msig_hallmark %>%
    filter(gs_name == "HALLMARK_GLYCOLYSIS") %>%
    pull(gene_symbol) %>% unique()

Parse GEO series matrix files

The series matrix files are read without GEOquery. Sample metadata is extracted from !Sample_ header rows; row IDs are Entrez Gene IDs.

parseSeriesMatrix <- function(filepath) {
    lines      <- readLines(filepath)
    meta_rows  <- grep("^!Sample_", lines, value = TRUE)
    titles     <- gsub('"', '', strsplit(
                       meta_rows[grep("^!Sample_title",         meta_rows)], "\t")[[1]][-1])
    accessions <- gsub('"', '', strsplit(
                       meta_rows[grep("^!Sample_geo_accession", meta_rows)], "\t")[[1]][-1])
    groups <- dplyr::case_when(
        grepl("Healthy|Living donor|living donor", titles, ignore.case = TRUE) ~ "Healthy",
        grepl("Hypertensive|hypertensive",         titles, ignore.case = TRUE) ~ "HT",
        grepl("IgA|IgAN",                          titles, ignore.case = TRUE) ~ "IgAN",
        TRUE ~ "Other"
    )
    meta <- data.frame(sample = accessions, title = titles, group = groups,
                       stringsAsFactors = FALSE)
    data_start <- grep("^!series_matrix_table_begin", lines) + 1
    data_end   <- grep("^!series_matrix_table_end",   lines) - 1
    expr_dt    <- fread(text = paste(lines[data_start:data_end], collapse = "\n"),
                        sep = "\t", header = TRUE, data.table = FALSE)
    rownames(expr_dt)    <- as.character(expr_dt[, 1])
    expr_dt              <- expr_dt[, -1, drop = FALSE]
    colnames(expr_dt)    <- gsub('"', '', colnames(expr_dt))
    list(expr = as.matrix(expr_dt[, meta$sample, drop = FALSE]), meta = meta)
}

Map Entrez IDs to gene symbols and collapse probes

Probes mapping to the same gene are collapsed by keeping the one with the highest mean expression.

entrezToSymbol <- function(entrez_ids) {
    map <- AnnotationDbi::select(org.Hs.eg.db,
                                 keys    = as.character(entrez_ids),
                                 columns = c("ENTREZID", "SYMBOL"),
                                 keytype = "ENTREZID")
    map <- map[!is.na(map$SYMBOL) & !duplicated(map$ENTREZID), ]
    setNames(map$SYMBOL, map$ENTREZID)
}

collapseToGenes <- function(expr_mat, symbol_map) {
    symbols  <- symbol_map[rownames(expr_mat)]
    keep     <- !is.na(symbols)
    expr_mat <- expr_mat[keep, , drop = FALSE]
    symbols  <- symbols[keep]
    df       <- data.frame(symbol = symbols, mean_expr = rowMeans(expr_mat, na.rm = TRUE),
                           idx = seq_along(symbols))
    df       <- df[order(df$symbol, -df$mean_expr), ]
    df       <- df[!duplicated(df$symbol), ]
    expr_mat <- expr_mat[df$idx, , drop = FALSE]
    rownames(expr_mat) <- df$symbol
    expr_mat
}

Load datasets

Glomerular uses GPL14663 only (HT + IgAN + Healthy). Tubulointerstitial merges GPL14663 (HT + small Healthy set) with GPL11670 (18 Healthy donors) on shared genes.

glom  <- parseSeriesMatrix(file.path(DATA_DIR, "GSE37460/GSE37460-GPL14663_series_matrix.txt.gz"))
tub14 <- parseSeriesMatrix(file.path(DATA_DIR, "GSE37455/GSE37455-GPL14663_series_matrix.txt.gz"))
tub11 <- parseSeriesMatrix(file.path(DATA_DIR, "GSE37455/GSE37455-GPL11670_series_matrix.txt.gz"))

glom_gene  <- collapseToGenes(glom$expr,  entrezToSymbol(rownames(glom$expr)))
tub14_gene <- collapseToGenes(tub14$expr, entrezToSymbol(rownames(tub14$expr)))
tub11_gene <- collapseToGenes(tub11$expr, entrezToSymbol(rownames(tub11$expr)))

common_tub_genes <- intersect(rownames(tub14_gene), rownames(tub11_gene))
tub_expr <- cbind(tub14_gene[common_tub_genes, ], tub11_gene[common_tub_genes, ])
tub_meta <- rbind(tub14$meta, tub11$meta)

UCell scoring

scoreSignaturesBulk <- function(expr_mat, meta_df, signatures, label) {
    scores    <- UCell::ScoreSignatures_UCell(matrix = expr_mat, features = signatures)
    scores_dt <- as.data.frame(scores)
    scores_dt$sample <- rownames(scores_dt)
    result <- merge(scores_dt, meta_df, by = "sample")
    result$compartment <- label
    result
}

glom_scores <- scoreSignaturesBulk(glom_gene, glom$meta, all_signatures, "Glomerular")
tub_scores  <- scoreSignaturesBulk(tub_expr,  tub_meta,  all_signatures, "Tubulointerstitial")
all_scores  <- rbind(glom_scores, tub_scores)
fwrite(all_scores, file.path(TAB_DIR, "CAAH_signature_scores_ERCB.csv"))

Wilcoxon statistics (HT vs Healthy)

score_cols <- grep("_UCell$", colnames(all_scores), value = TRUE)

runStats <- function(scores_df, compartment_label) {
    df <- scores_df[scores_df$compartment == compartment_label &
                    scores_df$group %in% c("Healthy", "HT"), ]
    results <- lapply(score_cols, function(sig) {
        ht  <- df[df$group == "HT",      sig]
        ctl <- df[df$group == "Healthy", sig]
        if (length(ht) < 2 || length(ctl) < 2) return(NULL)
        wt <- wilcox.test(ht, ctl, exact = FALSE)
        data.frame(Signature = sub("_UCell$", "", sig), Compartment = compartment_label,
                   n_HT = length(ht), n_Healthy = length(ctl),
                   mean_HT = mean(ht, na.rm = TRUE), mean_Healthy = mean(ctl, na.rm = TRUE),
                   log2FC  = log2(mean(ht, na.rm = TRUE) / mean(ctl, na.rm = TRUE)),
                   p_value = wt$p.value)
    })
    results <- do.call(rbind, Filter(Negate(is.null), results))
    results$p_adj <- p.adjust(results$p_value, method = "BH")
    results
}

stats_all <- rbind(runStats(all_scores, "Glomerular"),
                   runStats(all_scores, "Tubulointerstitial"))
fwrite(stats_all, file.path(TAB_DIR, "CAAH_signature_stats_HT_vs_Healthy.csv"))

Key results: CAAH 10-gene score elevated in Glomerular (padj = 0.035, AUC = 0.789) and Tubulointerstitial (padj = 0.032, AUC = 0.731). Individually significant genes: CDH13, COL4A2, EPAS1.

Differential expression with limma

runLimmaDE <- function(expr_mat, meta_df, compartment_label) {
    keep   <- meta_df$group %in% c("Healthy", "HT")
    expr   <- expr_mat[, meta_df$sample[keep], drop = FALSE]
    group  <- factor(meta_df$group[keep], levels = c("Healthy", "HT"))
    fit    <- eBayes(lmFit(expr, model.matrix(~ group)))
    top    <- topTable(fit, coef = "groupHT", number = Inf, sort.by = "P")
    data.frame(Gene = rownames(top), top, Compartment = compartment_label,
               stringsAsFactors = FALSE)
}

de_glom <- runLimmaDE(glom_gene, glom$meta, "Glomerular")
de_tub  <- runLimmaDE(tub_expr,  tub_meta,  "Tubulointerstitial")
fwrite(de_glom, file.path(TAB_DIR, "DE_HT_vs_Healthy_Glomerular_limma.csv"))
fwrite(de_tub,  file.path(TAB_DIR, "DE_HT_vs_Healthy_Tubulointerstitial_limma.csv"))

Gene set enrichment (fgsea)

Ranked by limma t-statistic. MSigDB Hallmark pathways plus custom signatures are tested.

runFgseaAnalysis <- function(de_result, compartment_label) {
    ranked <- sort(setNames(de_result$t, de_result$Gene), decreasing = TRUE)
    ranked <- ranked[!is.na(ranked)]
    msig_h <- msigdbr(species = "Homo sapiens", category = "H")
    gene_sets <- c(split(msig_h$gene_symbol, msig_h$gs_name),
                   setNames(lapply(all_signatures, toupper),
                            paste0("CAAH_", names(all_signatures))))
    set.seed(99)
    fgsea_res <- fgsea(pathways = gene_sets, stats = ranked,
                       minSize = 5, maxSize = 500, nPermSimple = 10000)
    fgsea_res$Compartment <- compartment_label
    fgsea_res[order(fgsea_res$padj), ]
}

fgsea_glom <- runFgseaAnalysis(de_glom, "Glomerular")
fgsea_tub  <- runFgseaAnalysis(de_tub,  "Tubulointerstitial")
fwrite(fgsea_glom, file.path(TAB_DIR, "fgsea_HT_vs_Healthy_Glomerular.csv"))
fwrite(fgsea_tub,  file.path(TAB_DIR, "fgsea_HT_vs_Healthy_Tubulointerstitial.csv"))

Figure — UCell score boxplot

Related to Figure 6c (CAAH 10-gene UCell score boxplots in ERCB glomerular and tubulointerstitial compartments, HT vs Healthy).

group_colors <- c("Healthy" = "#4575B4", "HT" = "#D73027", "IgAN" = "#74ADD1")

score_long <- all_scores %>%
    dplyr::filter(group %in% c("Healthy", "HT")) %>%
    dplyr::select(sample, group, compartment,
                  matches("CAAH_10gene_UCell|Hypoxia_UCell|Glycolysis_UCell|OxPhos_Hallmark_UCell")) %>%
    tidyr::pivot_longer(cols = matches("_UCell$"), names_to = "Signature", values_to = "Score") %>%
    dplyr::mutate(Signature = sub("_UCell$", "", sub("^CAAH_", "", Signature)),
                  group     = factor(group, levels = c("Healthy", "HT")))

p_violin <- ggplot(score_long, aes(x = group, y = Score, fill = group)) +
    geom_boxplot(width = 0.4, outlier.size = 0.5, alpha = 0.8) +
    geom_jitter(width = 0.1, size = 0.8, alpha = 0.5) +
    facet_grid(Signature ~ compartment, scales = "free_y") +
    scale_fill_manual(values = group_colors) +
    labs(title    = "CAAH Gene Signature Scores: Hypertensive Nephropathy vs. Healthy",
         subtitle = "ERCB Glomerular (GSE37460) and Tubulointerstitial (GSE37455)",
         x = "", y = "UCell Score") +
    theme_bw(base_size = 8) + theme(legend.position = "none")

ggsave(file.path(FIG_DIR, "01_CAAH_signature_scores_violin.svg"),
       p_violin, width = 140, height = 200, units = "mm", dpi = 300)

Figure — Heatmaps

Row-scaled expression (z-score across samples), HT vs Healthy only.

plotHeatmap <- function(expr_mat, meta_df, genes, title, out_prefix) {
    genes_found <- intersect(toupper(genes), rownames(expr_mat))
    keep        <- meta_df$group %in% c("Healthy", "HT")
    mat_scaled  <- t(scale(t(expr_mat[genes_found, meta_df$sample[keep], drop = FALSE])))
    ann_col     <- data.frame(Group = meta_df$group[keep], row.names = meta_df$sample[keep])
    for (ext in c(".png", ".pdf")) {
        pheatmap(mat_scaled,
                 annotation_col    = ann_col,
                 annotation_colors = list(Group = group_colors[c("Healthy", "HT")]),
                 cluster_cols = TRUE, cluster_rows = TRUE,
                 color    = colorRampPalette(c("#313695", "white", "#A50026"))(100),
                 fontsize = 7, main = title,
                 filename = paste0(out_prefix, ext), width = 8, height = 5)
    }
}

plotHeatmap(glom_gene, glom$meta, caah_10_genes,
    "CAAH 10-Gene Signature: Glomerular (HT vs Healthy)",
    file.path(FIG_DIR, "02_CAAH_heatmap_Glomerular"))
plotHeatmap(tub_expr,  tub_meta,  caah_10_genes,
    "CAAH 10-Gene Signature: Tubulointerstitial (HT vs Healthy)",
    file.path(FIG_DIR, "02_CAAH_heatmap_Tubulointerstitial"))
plotHeatmap(glom_gene, glom$meta,
    c("HIF1A", "EPAS1", "EPO", "EPOR", "VEGFA", "HK2", "PFKM", "PKM", "LDHA", "ENO1", "PGK1"),
    "Hypoxia & Glycolysis Genes: Glomerular (HT vs Healthy)",
    file.path(FIG_DIR, "03_Hypoxia_Glycolysis_heatmap_Glomerular"))

Figure — fgsea NES bar chart

caah_relevant_pathways <- c(
    "HALLMARK_HYPOXIA", "HALLMARK_GLYCOLYSIS", "HALLMARK_OXIDATIVE_PHOSPHORYLATION",
    "HALLMARK_INFLAMMATORY_RESPONSE", "HALLMARK_TGF_BETA_SIGNALING",
    "HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION", "HALLMARK_ANGIOGENESIS",
    "CAAH_CAAH_10gene", "CAAH_Hypoxia", "CAAH_Glycolysis", "CAAH_ECM_Fibrosis"
)

plotFgseaBar <- function(fgsea_dt, title, out_prefix) {
    dt <- as.data.frame(fgsea_dt)[as.data.frame(fgsea_dt)$pathway %in% caah_relevant_pathways, ]
    dt$label <- factor(sub("^CAAH_", "[CAAH] ", sub("^HALLMARK_", "", dt$pathway)),
                       levels = sub("^CAAH_", "[CAAH] ", sub("^HALLMARK_", "", dt$pathway[order(dt$NES)])))
    dt$sig <- ifelse(dt$padj < 0.05, "p.adj<0.05", "NS")
    p <- ggplot(dt, aes(x = label, y = NES, fill = sig)) +
        geom_bar(stat = "identity") + geom_hline(yintercept = 0, linewidth = 0.3) +
        coord_flip() +
        scale_fill_manual(values = c("p.adj<0.05" = "#D73027", "NS" = "grey70"), name = "") +
        labs(title = title, subtitle = "fgsea; HT vs Healthy; ranked by limma t-statistic",
             x = "", y = "Normalized Enrichment Score (NES)") +
        theme_bw(base_size = 8)
    ggsave(paste0(out_prefix, ".svg"), p, width = 150, height = 100, units = "mm", dpi = 300)
}

plotFgseaBar(fgsea_glom, "Pathway Enrichment: Glomerular HT vs Healthy",
               file.path(FIG_DIR, "04_fgsea_NES_Glomerular"))
plotFgseaBar(fgsea_tub,  "Pathway Enrichment: Tubulointerstitial HT vs Healthy",
               file.path(FIG_DIR, "04_fgsea_NES_Tubulointerstitial"))

Figure — Volcano plot

CAAH signature genes labeled in orange.

plotVolcano <- function(de_result, title, out_prefix) {
    de_result$color <- dplyr::case_when(
        de_result$Gene %in% toupper(caah_10_genes)              ~ "CAAH signature",
        de_result$adj.P.Val < 0.05 & de_result$logFC >  0.5    ~ "Up",
        de_result$adj.P.Val < 0.05 & de_result$logFC < -0.5    ~ "Down",
        TRUE                                                      ~ "NS"
    )
    p <- ggplot(de_result, aes(x = logFC, y = -log10(P.Value), color = color)) +
        geom_point(size = 0.8, alpha = 0.6) +
        geom_point(data = de_result[de_result$color == "CAAH signature", ],
                   size = 2.5, alpha = 1) +
        ggrepel::geom_text_repel(
            data = de_result[de_result$color == "CAAH signature", ],
            aes(label = Gene), size = 2.5, max.overlaps = 20, show.legend = FALSE) +
        scale_color_manual(values = c("CAAH signature" = "#E66100",
                                      "Up" = "#D73027", "Down" = "#4575B4", "NS" = "grey70")) +
        geom_vline(xintercept = c(-0.5, 0.5), linetype = "dashed",
                   linewidth = 0.3, color = "grey50") +
        geom_hline(yintercept = -log10(0.05), linetype = "dashed",
                   linewidth = 0.3, color = "grey50") +
        labs(title = title, x = "log2 Fold Change", y = "-log10(p-value)", color = "") +
        theme_bw(base_size = 8)
    ggsave(paste0(out_prefix, ".svg"), p, width = 120, height = 100, units = "mm", dpi = 300)
}

plotVolcano(de_glom, "Glomerular: HT vs Healthy — CAAH Signature Highlighted",
             file.path(FIG_DIR, "05_volcano_Glomerular"))
plotVolcano(de_tub,  "Tubulointerstitial: HT vs Healthy — CAAH Signature Highlighted",
             file.path(FIG_DIR, "05_volcano_Tubulointerstitial"))

Figure — ROC curve

plotRoc <- function(scores_df, compartment_label, sig_col = "CAAH_10gene_UCell") {
    df      <- scores_df[scores_df$compartment == compartment_label &
                         scores_df$group %in% c("Healthy", "HT"), ]
    df$label <- as.integer(df$group == "HT")
    roc_obj  <- pROC::roc(df$label, df[[sig_col]], direction = "<")
    auc_val  <- round(pROC::auc(roc_obj), 3)
    roc_df   <- data.frame(Specificity = roc_obj$specificities,
                            Sensitivity = roc_obj$sensitivities)
    p <- ggplot(roc_df, aes(x = 1 - Specificity, y = Sensitivity)) +
        geom_line(color = "#D73027", linewidth = 1) +
        geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "grey50") +
        annotate("text", x = 0.7, y = 0.2, label = paste0("AUC = ", auc_val), size = 3) +
        labs(title = paste0("ROC: CAAH 10-Gene Score — ", compartment_label),
             x = "1 - Specificity", y = "Sensitivity") +
        theme_bw(base_size = 8)
    ggsave(file.path(FIG_DIR, paste0("06_ROC_CAAH_", compartment_label, ".png")),
           p, width = 80, height = 80, units = "mm", dpi = 300)
}

plotRoc(all_scores, "Glomerular")
plotRoc(all_scores, "Tubulointerstitial")