ERCB Microarray Analysis
Related to Figure 6.
European Renal cDNA Bank (ERCB) — two Affymetrix microarray datasets from GEO. Glomerular (GSE37460) and tubulointerstitial (GSE37455) biopsy compartments are analyzed separately. Comparison: hypertensive nephropathy (HT) vs. healthy living donors.
Gene signatures
Human orthologs of the mouse CAAH signature, scored alongside pathway context signatures via UCell.
caah_10_genes <- c("CLU", "LRP2", "LAMP2", "COL4A2", "SPINK1",
"WFDC2", "PAX8", "LYZ", "S100A9", "CDH13")
hypoxia_genes <- c("HIF1A", "EPAS1", "EPO", "EPOR", "VEGFA",
"ALDOA", "LDHA", "PGK1", "SLC2A1", "ENO1",
"PGAM1", "TPI1", "ANGPTL4", "HILPDA", "PLAUR",
"SLC16A1", "BNIP3", "CA9", "P4HA1", "KRT17")
glycolysis_genes <- c("HK2", "PFKM", "PKM", "LDHA", "HK1",
"ALDOA", "GAPDH", "PGK1", "PGAM1", "ENO1",
"TPI1", "GPI", "PFKL", "PFKP", "LDHB")
ecm_genes <- c("COL1A1", "COL1A2", "COL3A1", "COL4A1", "COL4A2",
"COL12A1", "FN1", "TIMP1", "TIMP2", "MMP2",
"ACTA2", "VIM", "TGFB1", "CTGF", "POSTN", "THBS1")
all_signatures <- list(
CAAH_10gene = caah_10_genes,
Hypoxia = hypoxia_genes,
Glycolysis = glycolysis_genes,
SASP = sasp_genes,
ECM_Fibrosis = ecm_genes
)
Hallmark OxPhos and Glycolysis gene sets from MSigDB are appended after loading:
msig_hallmark <- msigdbr(species = "Homo sapiens", category = "H")
all_signatures[["OxPhos_Hallmark"]] <- msig_hallmark %>%
filter(gs_name == "HALLMARK_OXIDATIVE_PHOSPHORYLATION") %>%
pull(gene_symbol) %>% unique()
all_signatures[["Glycolysis_Hallmark"]] <- msig_hallmark %>%
filter(gs_name == "HALLMARK_GLYCOLYSIS") %>%
pull(gene_symbol) %>% unique()
Parse GEO series matrix files
The series matrix files are read without GEOquery. Sample metadata is extracted from !Sample_ header rows; row IDs are Entrez Gene IDs.
parseSeriesMatrix <- function(filepath) {
lines <- readLines(filepath)
meta_rows <- grep("^!Sample_", lines, value = TRUE)
titles <- gsub('"', '', strsplit(
meta_rows[grep("^!Sample_title", meta_rows)], "\t")[[1]][-1])
accessions <- gsub('"', '', strsplit(
meta_rows[grep("^!Sample_geo_accession", meta_rows)], "\t")[[1]][-1])
groups <- dplyr::case_when(
grepl("Healthy|Living donor|living donor", titles, ignore.case = TRUE) ~ "Healthy",
grepl("Hypertensive|hypertensive", titles, ignore.case = TRUE) ~ "HT",
grepl("IgA|IgAN", titles, ignore.case = TRUE) ~ "IgAN",
TRUE ~ "Other"
)
meta <- data.frame(sample = accessions, title = titles, group = groups,
stringsAsFactors = FALSE)
data_start <- grep("^!series_matrix_table_begin", lines) + 1
data_end <- grep("^!series_matrix_table_end", lines) - 1
expr_dt <- fread(text = paste(lines[data_start:data_end], collapse = "\n"),
sep = "\t", header = TRUE, data.table = FALSE)
rownames(expr_dt) <- as.character(expr_dt[, 1])
expr_dt <- expr_dt[, -1, drop = FALSE]
colnames(expr_dt) <- gsub('"', '', colnames(expr_dt))
list(expr = as.matrix(expr_dt[, meta$sample, drop = FALSE]), meta = meta)
}
Map Entrez IDs to gene symbols and collapse probes
Probes mapping to the same gene are collapsed by keeping the one with the highest mean expression.
entrezToSymbol <- function(entrez_ids) {
map <- AnnotationDbi::select(org.Hs.eg.db,
keys = as.character(entrez_ids),
columns = c("ENTREZID", "SYMBOL"),
keytype = "ENTREZID")
map <- map[!is.na(map$SYMBOL) & !duplicated(map$ENTREZID), ]
setNames(map$SYMBOL, map$ENTREZID)
}
collapseToGenes <- function(expr_mat, symbol_map) {
symbols <- symbol_map[rownames(expr_mat)]
keep <- !is.na(symbols)
expr_mat <- expr_mat[keep, , drop = FALSE]
symbols <- symbols[keep]
df <- data.frame(symbol = symbols, mean_expr = rowMeans(expr_mat, na.rm = TRUE),
idx = seq_along(symbols))
df <- df[order(df$symbol, -df$mean_expr), ]
df <- df[!duplicated(df$symbol), ]
expr_mat <- expr_mat[df$idx, , drop = FALSE]
rownames(expr_mat) <- df$symbol
expr_mat
}
Load datasets
Glomerular uses GPL14663 only (HT + IgAN + Healthy). Tubulointerstitial merges GPL14663 (HT + small Healthy set) with GPL11670 (18 Healthy donors) on shared genes.
glom <- parseSeriesMatrix(file.path(DATA_DIR, "GSE37460/GSE37460-GPL14663_series_matrix.txt.gz"))
tub14 <- parseSeriesMatrix(file.path(DATA_DIR, "GSE37455/GSE37455-GPL14663_series_matrix.txt.gz"))
tub11 <- parseSeriesMatrix(file.path(DATA_DIR, "GSE37455/GSE37455-GPL11670_series_matrix.txt.gz"))
glom_gene <- collapseToGenes(glom$expr, entrezToSymbol(rownames(glom$expr)))
tub14_gene <- collapseToGenes(tub14$expr, entrezToSymbol(rownames(tub14$expr)))
tub11_gene <- collapseToGenes(tub11$expr, entrezToSymbol(rownames(tub11$expr)))
common_tub_genes <- intersect(rownames(tub14_gene), rownames(tub11_gene))
tub_expr <- cbind(tub14_gene[common_tub_genes, ], tub11_gene[common_tub_genes, ])
tub_meta <- rbind(tub14$meta, tub11$meta)
UCell scoring
scoreSignaturesBulk <- function(expr_mat, meta_df, signatures, label) {
scores <- UCell::ScoreSignatures_UCell(matrix = expr_mat, features = signatures)
scores_dt <- as.data.frame(scores)
scores_dt$sample <- rownames(scores_dt)
result <- merge(scores_dt, meta_df, by = "sample")
result$compartment <- label
result
}
glom_scores <- scoreSignaturesBulk(glom_gene, glom$meta, all_signatures, "Glomerular")
tub_scores <- scoreSignaturesBulk(tub_expr, tub_meta, all_signatures, "Tubulointerstitial")
all_scores <- rbind(glom_scores, tub_scores)
fwrite(all_scores, file.path(TAB_DIR, "CAAH_signature_scores_ERCB.csv"))
Wilcoxon statistics (HT vs Healthy)
score_cols <- grep("_UCell$", colnames(all_scores), value = TRUE)
runStats <- function(scores_df, compartment_label) {
df <- scores_df[scores_df$compartment == compartment_label &
scores_df$group %in% c("Healthy", "HT"), ]
results <- lapply(score_cols, function(sig) {
ht <- df[df$group == "HT", sig]
ctl <- df[df$group == "Healthy", sig]
if (length(ht) < 2 || length(ctl) < 2) return(NULL)
wt <- wilcox.test(ht, ctl, exact = FALSE)
data.frame(Signature = sub("_UCell$", "", sig), Compartment = compartment_label,
n_HT = length(ht), n_Healthy = length(ctl),
mean_HT = mean(ht, na.rm = TRUE), mean_Healthy = mean(ctl, na.rm = TRUE),
log2FC = log2(mean(ht, na.rm = TRUE) / mean(ctl, na.rm = TRUE)),
p_value = wt$p.value)
})
results <- do.call(rbind, Filter(Negate(is.null), results))
results$p_adj <- p.adjust(results$p_value, method = "BH")
results
}
stats_all <- rbind(runStats(all_scores, "Glomerular"),
runStats(all_scores, "Tubulointerstitial"))
fwrite(stats_all, file.path(TAB_DIR, "CAAH_signature_stats_HT_vs_Healthy.csv"))
Key results: CAAH 10-gene score elevated in Glomerular (padj = 0.035, AUC = 0.789) and Tubulointerstitial (padj = 0.032, AUC = 0.731). Individually significant genes: CDH13, COL4A2, EPAS1.
Differential expression with limma
runLimmaDE <- function(expr_mat, meta_df, compartment_label) {
keep <- meta_df$group %in% c("Healthy", "HT")
expr <- expr_mat[, meta_df$sample[keep], drop = FALSE]
group <- factor(meta_df$group[keep], levels = c("Healthy", "HT"))
fit <- eBayes(lmFit(expr, model.matrix(~ group)))
top <- topTable(fit, coef = "groupHT", number = Inf, sort.by = "P")
data.frame(Gene = rownames(top), top, Compartment = compartment_label,
stringsAsFactors = FALSE)
}
de_glom <- runLimmaDE(glom_gene, glom$meta, "Glomerular")
de_tub <- runLimmaDE(tub_expr, tub_meta, "Tubulointerstitial")
fwrite(de_glom, file.path(TAB_DIR, "DE_HT_vs_Healthy_Glomerular_limma.csv"))
fwrite(de_tub, file.path(TAB_DIR, "DE_HT_vs_Healthy_Tubulointerstitial_limma.csv"))
Gene set enrichment (fgsea)
Ranked by limma t-statistic. MSigDB Hallmark pathways plus custom signatures are tested.
runFgseaAnalysis <- function(de_result, compartment_label) {
ranked <- sort(setNames(de_result$t, de_result$Gene), decreasing = TRUE)
ranked <- ranked[!is.na(ranked)]
msig_h <- msigdbr(species = "Homo sapiens", category = "H")
gene_sets <- c(split(msig_h$gene_symbol, msig_h$gs_name),
setNames(lapply(all_signatures, toupper),
paste0("CAAH_", names(all_signatures))))
set.seed(99)
fgsea_res <- fgsea(pathways = gene_sets, stats = ranked,
minSize = 5, maxSize = 500, nPermSimple = 10000)
fgsea_res$Compartment <- compartment_label
fgsea_res[order(fgsea_res$padj), ]
}
fgsea_glom <- runFgseaAnalysis(de_glom, "Glomerular")
fgsea_tub <- runFgseaAnalysis(de_tub, "Tubulointerstitial")
fwrite(fgsea_glom, file.path(TAB_DIR, "fgsea_HT_vs_Healthy_Glomerular.csv"))
fwrite(fgsea_tub, file.path(TAB_DIR, "fgsea_HT_vs_Healthy_Tubulointerstitial.csv"))
Figure — UCell score boxplot
Related to Figure 6c (CAAH 10-gene UCell score boxplots in ERCB glomerular and tubulointerstitial compartments, HT vs Healthy).
group_colors <- c("Healthy" = "#4575B4", "HT" = "#D73027", "IgAN" = "#74ADD1")
score_long <- all_scores %>%
dplyr::filter(group %in% c("Healthy", "HT")) %>%
dplyr::select(sample, group, compartment,
matches("CAAH_10gene_UCell|Hypoxia_UCell|Glycolysis_UCell|OxPhos_Hallmark_UCell")) %>%
tidyr::pivot_longer(cols = matches("_UCell$"), names_to = "Signature", values_to = "Score") %>%
dplyr::mutate(Signature = sub("_UCell$", "", sub("^CAAH_", "", Signature)),
group = factor(group, levels = c("Healthy", "HT")))
p_violin <- ggplot(score_long, aes(x = group, y = Score, fill = group)) +
geom_boxplot(width = 0.4, outlier.size = 0.5, alpha = 0.8) +
geom_jitter(width = 0.1, size = 0.8, alpha = 0.5) +
facet_grid(Signature ~ compartment, scales = "free_y") +
scale_fill_manual(values = group_colors) +
labs(title = "CAAH Gene Signature Scores: Hypertensive Nephropathy vs. Healthy",
subtitle = "ERCB Glomerular (GSE37460) and Tubulointerstitial (GSE37455)",
x = "", y = "UCell Score") +
theme_bw(base_size = 8) + theme(legend.position = "none")
ggsave(file.path(FIG_DIR, "01_CAAH_signature_scores_violin.svg"),
p_violin, width = 140, height = 200, units = "mm", dpi = 300)
Figure — Heatmaps
Row-scaled expression (z-score across samples), HT vs Healthy only.
plotHeatmap <- function(expr_mat, meta_df, genes, title, out_prefix) {
genes_found <- intersect(toupper(genes), rownames(expr_mat))
keep <- meta_df$group %in% c("Healthy", "HT")
mat_scaled <- t(scale(t(expr_mat[genes_found, meta_df$sample[keep], drop = FALSE])))
ann_col <- data.frame(Group = meta_df$group[keep], row.names = meta_df$sample[keep])
for (ext in c(".png", ".pdf")) {
pheatmap(mat_scaled,
annotation_col = ann_col,
annotation_colors = list(Group = group_colors[c("Healthy", "HT")]),
cluster_cols = TRUE, cluster_rows = TRUE,
color = colorRampPalette(c("#313695", "white", "#A50026"))(100),
fontsize = 7, main = title,
filename = paste0(out_prefix, ext), width = 8, height = 5)
}
}
plotHeatmap(glom_gene, glom$meta, caah_10_genes,
"CAAH 10-Gene Signature: Glomerular (HT vs Healthy)",
file.path(FIG_DIR, "02_CAAH_heatmap_Glomerular"))
plotHeatmap(tub_expr, tub_meta, caah_10_genes,
"CAAH 10-Gene Signature: Tubulointerstitial (HT vs Healthy)",
file.path(FIG_DIR, "02_CAAH_heatmap_Tubulointerstitial"))
plotHeatmap(glom_gene, glom$meta,
c("HIF1A", "EPAS1", "EPO", "EPOR", "VEGFA", "HK2", "PFKM", "PKM", "LDHA", "ENO1", "PGK1"),
"Hypoxia & Glycolysis Genes: Glomerular (HT vs Healthy)",
file.path(FIG_DIR, "03_Hypoxia_Glycolysis_heatmap_Glomerular"))
Figure — fgsea NES bar chart
caah_relevant_pathways <- c(
"HALLMARK_HYPOXIA", "HALLMARK_GLYCOLYSIS", "HALLMARK_OXIDATIVE_PHOSPHORYLATION",
"HALLMARK_INFLAMMATORY_RESPONSE", "HALLMARK_TGF_BETA_SIGNALING",
"HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION", "HALLMARK_ANGIOGENESIS",
"CAAH_CAAH_10gene", "CAAH_Hypoxia", "CAAH_Glycolysis", "CAAH_ECM_Fibrosis"
)
plotFgseaBar <- function(fgsea_dt, title, out_prefix) {
dt <- as.data.frame(fgsea_dt)[as.data.frame(fgsea_dt)$pathway %in% caah_relevant_pathways, ]
dt$label <- factor(sub("^CAAH_", "[CAAH] ", sub("^HALLMARK_", "", dt$pathway)),
levels = sub("^CAAH_", "[CAAH] ", sub("^HALLMARK_", "", dt$pathway[order(dt$NES)])))
dt$sig <- ifelse(dt$padj < 0.05, "p.adj<0.05", "NS")
p <- ggplot(dt, aes(x = label, y = NES, fill = sig)) +
geom_bar(stat = "identity") + geom_hline(yintercept = 0, linewidth = 0.3) +
coord_flip() +
scale_fill_manual(values = c("p.adj<0.05" = "#D73027", "NS" = "grey70"), name = "") +
labs(title = title, subtitle = "fgsea; HT vs Healthy; ranked by limma t-statistic",
x = "", y = "Normalized Enrichment Score (NES)") +
theme_bw(base_size = 8)
ggsave(paste0(out_prefix, ".svg"), p, width = 150, height = 100, units = "mm", dpi = 300)
}
plotFgseaBar(fgsea_glom, "Pathway Enrichment: Glomerular HT vs Healthy",
file.path(FIG_DIR, "04_fgsea_NES_Glomerular"))
plotFgseaBar(fgsea_tub, "Pathway Enrichment: Tubulointerstitial HT vs Healthy",
file.path(FIG_DIR, "04_fgsea_NES_Tubulointerstitial"))
Figure — Volcano plot
CAAH signature genes labeled in orange.
plotVolcano <- function(de_result, title, out_prefix) {
de_result$color <- dplyr::case_when(
de_result$Gene %in% toupper(caah_10_genes) ~ "CAAH signature",
de_result$adj.P.Val < 0.05 & de_result$logFC > 0.5 ~ "Up",
de_result$adj.P.Val < 0.05 & de_result$logFC < -0.5 ~ "Down",
TRUE ~ "NS"
)
p <- ggplot(de_result, aes(x = logFC, y = -log10(P.Value), color = color)) +
geom_point(size = 0.8, alpha = 0.6) +
geom_point(data = de_result[de_result$color == "CAAH signature", ],
size = 2.5, alpha = 1) +
ggrepel::geom_text_repel(
data = de_result[de_result$color == "CAAH signature", ],
aes(label = Gene), size = 2.5, max.overlaps = 20, show.legend = FALSE) +
scale_color_manual(values = c("CAAH signature" = "#E66100",
"Up" = "#D73027", "Down" = "#4575B4", "NS" = "grey70")) +
geom_vline(xintercept = c(-0.5, 0.5), linetype = "dashed",
linewidth = 0.3, color = "grey50") +
geom_hline(yintercept = -log10(0.05), linetype = "dashed",
linewidth = 0.3, color = "grey50") +
labs(title = title, x = "log2 Fold Change", y = "-log10(p-value)", color = "") +
theme_bw(base_size = 8)
ggsave(paste0(out_prefix, ".svg"), p, width = 120, height = 100, units = "mm", dpi = 300)
}
plotVolcano(de_glom, "Glomerular: HT vs Healthy — CAAH Signature Highlighted",
file.path(FIG_DIR, "05_volcano_Glomerular"))
plotVolcano(de_tub, "Tubulointerstitial: HT vs Healthy — CAAH Signature Highlighted",
file.path(FIG_DIR, "05_volcano_Tubulointerstitial"))
Figure — ROC curve
plotRoc <- function(scores_df, compartment_label, sig_col = "CAAH_10gene_UCell") {
df <- scores_df[scores_df$compartment == compartment_label &
scores_df$group %in% c("Healthy", "HT"), ]
df$label <- as.integer(df$group == "HT")
roc_obj <- pROC::roc(df$label, df[[sig_col]], direction = "<")
auc_val <- round(pROC::auc(roc_obj), 3)
roc_df <- data.frame(Specificity = roc_obj$specificities,
Sensitivity = roc_obj$sensitivities)
p <- ggplot(roc_df, aes(x = 1 - Specificity, y = Sensitivity)) +
geom_line(color = "#D73027", linewidth = 1) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "grey50") +
annotate("text", x = 0.7, y = 0.2, label = paste0("AUC = ", auc_val), size = 3) +
labs(title = paste0("ROC: CAAH 10-Gene Score — ", compartment_label),
x = "1 - Specificity", y = "Sensitivity") +
theme_bw(base_size = 8)
ggsave(file.path(FIG_DIR, paste0("06_ROC_CAAH_", compartment_label, ".png")),
p, width = 80, height = 80, units = "mm", dpi = 300)
}
plotRoc(all_scores, "Glomerular")
plotRoc(all_scores, "Tubulointerstitial")