Derive the CAAH 10-Gene Signature
Related to Figure 6.
The 10-gene CAAH signature is derived through a two-phase process: building a broad candidate pool from ECM, inflammation, hypoxia, SASP, and renal fibrosis gene lists, then optimizing a 10-gene subset by iterative gene-swap across both models simultaneously to maximize the composite discrimination AUC.
Candidate gene pool
The candidate pool covers ECM biology, inflammatory signaling, hypoxia response, SASP, renal fibrosis markers, known urinary biomarkers, and a preliminary signature used as the starting point for optimization.
initial_signature <- c("Clu", "Col4a2", "Ctsc", "Ets1", "Igfbp5",
"Lrp2", "Lyz2", "Pax8", "S100a6", "S100a9")
IFRG_genes <- unique(c("Tnfrsf1a", "Tnfrsf1b", "Tnfrsf21", "Tnfrsf19",
"Il15ra", "Il17f", "Cd55", "Cd300c", "Tnfsf15", "Csf1", "Havcr2",
"Il1r1", "Il18r1", "Il17a", "Lifr", "Cxcl1", "Ros1", "Plaur", "Acvr2a",
"Klf6", "Gabbr1", "Aplnr", "Sphk1", "Pdpn", "Adora2b", "Ripk2", "Ptges",
"Cxcl9", "Wnt7a", "Hrh1", "Tnfsf9", "Kl", "Ffar3", "Agt",
"Mc1r", "Il17d"))
hypoxia_genes <- unique(c("Plaur", "Ucn", "Pabpc1l", "Slc16a12", "Nfe2l3",
"Kcnab1", "Gpc3", "Kif5a", "Ankzf1", "Ets1", "Selenbp1", "Aldoa",
"Angptl4", "Anln", "Bnc1", "Mrgbp", "Cdkn3", "Col4a6", "Dcbld1", "Eno1",
"Fam83b", "Fosl1", "Gnai1", "Hilpda", "Kctd11", "Krt17", "Ldha", "P4ha1",
"Pgam1", "Pgk1", "Sdc1", "Slc16a1", "Slc2a1", "Tpi1", "Vegfa", "Cygb",
"Hif1a", "Arnt"))
sasp_genes <- unique(c("Il6", "Il1a", "Il1b", "Il1r1", "Il1rn", "Il7",
"Il7r", "Il13", "Il15", "Cxcl1", "Cxcl2", "Cxcl3",
"Ccl8", "Ccl20", "Ccl11", "Ccl25",
"Ccl26", "Csf2", "Mif", "Fn1", "Areg", "Ereg", "Nrg1", "Fgf2",
"Hgf", "Fgf7", "Vegfa", "Ang", "Pigf",
"Mmp3", "Mmp10", "Mmp12", "Mmp13", "Mmp14", "Timp2",
"Serpine1", "Ctsb", "Icam1", "Tnfrsf11b",
"Fas", "Plaur", "Il6st", "Egfr", "Ptger2", "Nos1",
"Cdkn1a", "Cdkn2a", "Lmnb1", "Bcl2l1", "Bcl2l2",
"Serpinb2", "Hsp90aa1", "Hsp90ab1", "Igf1", "Igf1r", "Igf2",
"Igfbp1", "Igfbp2", "Igfbp3", "Igfbp4", "Igfbp5",
"Igfbp6", "Igfbp7"))
renal_fibrosis_markers <- c("Adora2a", "Ager", "Agt", "Agtrap", "Akt1",
"Alb", "Angpt2", "Anxa1", "Apln", "Aplnr", "Apoe", "Azgp1", "Bdkrb1",
"Bdkrb2", "Bgn", "Bicc1", "Bmp1", "Bmp2", "Bmp3", "Bmp7", "Bmpr1a", "C6",
"Cast", "Cat", "Cav1", "Ccl2", "Ccl5", "Ccr7", "Cd248", "Cdc42", "Cdh16",
"Cebpa", "Cftr", "Clu", "Col1a1", "Col3a1", "Col4a3", "Creb1", "Csf1",
"Ctnnb1", "Cubn", "Cul3", "Cxcl16", "Cybb", "Cys1", "Ddah1", "Dnmt1",
"Edn1", "Ednra", "Ednrb", "Efnb2", "Egfr", "Egln1", "Eng", "Epas1", "Epo",
"Erbb4", "Fan1", "Fermt2", "Fgf23", "Fn1", "Gli1", "Gli2", "Glis2",
"Havcr1", "Hgf", "Hipk2", "Hmox1", "Hnrnpf", "Ift140", "Ift20", "Igf1",
"Ilk", "Invs", "Itga1", "Itgb1", "Jak3", "Kcnj1", "Kcnn4", "Kl", "Klf5",
"Klhl3", "Lcn2", "Lepr", "Lrp2", "Mmp14", "Mmp2", "Mpz", "Mrc2", "Mtor",
"N4bp2l2", "Ncf1", "Nfkb1", "Nos3", "Notch1", "Notch3", "Nox4", "Nr0b2",
"Nr1h4", "Pax8", "Pdpn", "Pkd1", "Pkhd1", "Plat", "Plaur", "Postn",
"Ppara", "Pparg", "Ppargc1a", "Prkaa1", "Prkcb", "Prkg1", "Prss23",
"Ptgis", "Ptgs2", "Rasal1", "Rbpj", "Rictor", "Rock1", "Rptor", "Rtn1",
"S100a4", "Sdc4", "Serpine1", "Shh", "Shroom3", "Sirt1", "Smad7",
"Smurf1", "Smurf2", "Snai1", "Snai2", "Socs1", "Srebf1", "Stk11", "Tet3",
"Tgfb1", "Tgfb2", "Tgfb3", "Tgm2", "Thbs1", "Thg1l", "Tlr4", "Trpc3",
"Tsc1", "Twist1", "Ucp2", "Umod", "Wfdc2", "Wnk4", "Wnt4", "Xpnpep1",
"Xylt2")
biomarker_genes <- c("Ly6d", "Lyz2", "Grn", "Cdh13", "Ambp", "Col12a1", "Ahsg",
"Spink1", "Wfdc2", "Tnfrsf1a", "Ccl2", "Egf", "Havcr1", "Lcn2", "Tgfa")
ECM gene sets from the Matrisome database (gene aliases updated to current symbols):
ecm_glycoproteins <- c("Abi3bp", "Aebp1", "Agrn", "AW551984", "Bmper", "Ccn2", "Ccn5", "Cilp",
"Cilp2", "Colq", "Dmbt1", "Dpt", "Ecm1", "Efemp1", "Efemp2", "Eln",
"Emid1", "Emilin1", "Emilin2", "Fbln1", "Fbln2", "Fbln5", "Fbn1", "Fbn2",
"Fga", "Fgb", "Fgg", "Fn1", "Fndc1", "Fras1", "Gldn", "Hmcn1",
"Hmcn2", "Igfbp6", "Igfbp7", "Igsf10", "Ints14", "Kcp", "Lama1", "Lama2",
"Lama3", "Lama4", "Lama5", "Lamb1", "Lamb2", "Lamb3", "Lamc1", "Lamc2",
"Lamc3", "Lrg1", "Ltbp1", "Ltbp2", "Ltbp3", "Ltbp4", "Matn1", "Matn2",
"Matn4", "Mfap2", "Mfap4", "Mfap5", "Mfge8", "Mgp", "Mmrn1", "Mmrn2",
"Ndnf", "Nid1", "Nid2", "Npnt", "Ntn1", "Ntn4", "Otog", "Papln",
"Pcolce", "Pcolce2", "Postn", "Pxdn", "Sbspon", "Sparc", "Spon1", "Srpx",
"Srpx2", "Tgfbi", "Thbs1", "Thbs2", "Thbs3", "Thbs4", "Thsd4", "Tinag",
"Tinagl1", "Tnc", "Tnn", "Tnxb", "Vtn", "Vwa1", "Vwa5a", "Vwf")
collagens <- c("Col10a1", "Col11a1", "Col11a2", "Col12a1", "Col13a1", "Col14a1", "Col15a1", "Col16a1",
"Col17a1", "Col18a1", "Col19a1", "Col1a1", "Col1a2", "Col22a1", "Col23a1", "Col24a1",
"Col25a1", "Col27a1", "Col28a1", "Col2a1", "Col3a1", "Col4a1", "Col4a2", "Col4a3",
"Col4a4", "Col4a5", "Col4a6", "Col5a1", "Col5a2", "Col5a3", "Col6a1", "Col6a2",
"Col6a3", "Col6a4", "Col6a5", "Col6a6", "Col7a1", "Col8a1", "Col8a2", "Col9a1",
"Col9a3")
ecm_regulators <- c("1810010H24Rik", "Adam10", "Adam19", "Adam23", "Adam7", "Adamts17", "Adamts9", "Adamtsl1",
"Adamtsl4", "Adamtsl5", "AI182371", "Ambp", "Cpn2", "Cstb", "Ctsb", "Ctsc",
"Ctsd", "Ctsg", "Egln1", "Elane", "F10", "F13a1", "F13b", "F2",
"F9", "Fam20b", "Habp2", "Hrg", "Htra1", "Htra3", "Hyal2", "Itih1",
"Itih2", "Itih3", "Itih4", "Itih5", "Kng1", "Ky", "P3h1", "P3h2",
"Lox", "Loxl1", "Loxl2", "Loxl3", "Mep1a", "Mmp19", "Mmp2", "Mmp9",
"Mug2", "P4ha1", "P4ha2", "Plg", "Plod1", "Plod2", "Plod3", "Pzp",
"Serpina10", "Serpina1a", "Serpina1b", "Serpina1d", "Serpina3k", "Serpina3m", "Serpina3n", "Serpinb1a",
"Serpinb2", "Serpinc1", "Serpind1", "Serpine2", "Serpinf1", "Serpinf2", "Serping1", "Serpinh1",
"St14", "Tgm1", "Tgm2", "Tgm3", "Timp3", "Try10")
ecm_affiliated <- c("Anxa1", "Anxa11", "Anxa2", "Anxa3", "Anxa4", "Anxa5", "Anxa6", "Anxa7",
"Anxa9", "C1qa", "C1qc", "C1qtnf2", "C1qtnf5", "C1qtnf7", "C1qtnf9", "Clec14a",
"Colec12", "Cspg4", "Cspg5", "Frem1", "Frem2", "Gpc4", "Grem1", "Hpx",
"Itln1", "Lgals1", "Lgals3", "Lgals4", "Lgals9", "Lman1", "Muc2", "Muc4",
"Plxdc2", "Plxnb2", "Plxnc1", "Reg4", "Sema3c", "Sftpa1", "Sftpb", "Sftpc",
"Sftpd")
proteoglycans <- c("Acan", "Aspn", "Bgn", "Chad", "Dcn", "Fmod", "Hapln1", "Hspg2",
"Impg1", "Lum", "Ogn", "Prelp", "Prg2", "Prg4", "Spock3", "Vcan")
secreted_factors <- c("Angptl1", "Angptl2", "Angptl6", "Chrdl1", "Crlf3", "Cxcl12", "Egfl7", "Fgf2",
"Hcfc1", "Hcfc2", "Il16", "Insl5", "Megf6", "Nrg1", "Pf4", "Rptn",
"S100a10", "S100a13", "S100a16", "S100a4", "S100a6", "S100a8", "S100a9", "Scube2",
"Tgfb1")
candidate_genes <- unique(c(
ecm_glycoproteins, collagens, ecm_regulators, ecm_affiliated, proteoglycans, secreted_factors,
IFRG_genes, hypoxia_genes, sasp_genes, renal_fibrosis_markers,
initial_signature, biomarker_genes))
Load Seurat objects
ren1c_obj <- readRDS("ren1c_obj.rds")
captopril_obj <- readRDS("captopril_obj.rds")
Filter candidate genes
Ribosomal, mitochondrial, hemoglobin, and non-standard gene names are excluded from both models before evaluation. Only genes present in both filtered feature sets are carried forward.
filterFeatures <- function(seurat_obj) {
f <- Features(seurat_obj)
f <- f[!grepl("^Rp[sl][[:digit:]]|^Rplp[[:digit:]]|^Rpsa", f)]
f <- sort(f[!grepl("^mt-", f)])
f <- sort(f[!grepl("^H2-", f)])
f <- sort(f[!grepl("^Hb[ab]-", f)])
f <- sort(f[!grepl("^[0-9]", f)])
f <- sort(f[!grepl("^Gm", f)])
f <- sort(f[!grepl("^ENSMUSG", f)])
f <- sort(f[!grepl("(.*)Rik", f)])
f <- sort(f[!grepl("^B[BCE]", f)])
f <- sort(f[!grepl("^AI", f)])
f[-1]
}
DefaultAssay(ren1c_obj) <- "RNA"
DefaultAssay(captopril_obj) <- "RNA"
ren1c_filtered <- filterFeatures(ren1c_obj)
captopril_filtered <- filterFeatures(captopril_obj)
candidates_present <- intersect(
candidate_genes[candidate_genes %in% ren1c_filtered],
candidate_genes[candidate_genes %in% captopril_filtered])
Per-gene ROC evaluation
Each candidate gene is individually scored in both models. AUC and Wilcoxon p-value are computed to rank genes by discriminative power before the combinatorial optimization.
evaluateIndividualGenes <- function(seurat_obj, candidate_genes,
disease_label, control_label,
disease_column = "genotype") {
expr_data <- FetchData(seurat_obj, vars = c(candidate_genes, disease_column))
expr_data$disease_status <- ifelse(expr_data[[disease_column]] == disease_label, 1, 0)
gene_performance <- data.frame(gene = character(), auc = numeric(),
ci_lower = numeric(), ci_upper = numeric(),
p_value = numeric(), stringsAsFactors = FALSE)
pb <- txtProgressBar(min = 0, max = length(candidate_genes), style = 3)
for (i in seq_along(candidate_genes)) {
gene <- candidate_genes[i]
if (gene %in% colnames(expr_data)) {
tryCatch({
roc_obj <- roc(expr_data$disease_status, expr_data[[gene]],
quiet = TRUE, ci = TRUE)
ci <- ci.auc(roc_obj)
disease_expr <- expr_data[[gene]][expr_data$disease_status == 1]
control_expr <- expr_data[[gene]][expr_data$disease_status == 0]
wt <- wilcox.test(disease_expr, control_expr)
gene_performance <- rbind(gene_performance,
data.frame(gene = gene, auc = as.numeric(auc(roc_obj)),
ci_lower = ci[1], ci_upper = ci[3],
p_value = wt$p.value))
}, error = function(e) NULL)
}
setTxtProgressBar(pb, i)
}
close(pb)
gene_performance %>%
arrange(desc(auc)) %>%
mutate(rank = row_number(), significant = p_value < 0.05)
}
ren1c_evaluations <- evaluateIndividualGenes(
seurat_obj = ren1c_obj,
candidate_genes = candidates_present,
disease_label = "KO", control_label = "WT",
disease_column = "genotype")
captopril_evaluations <- evaluateIndividualGenes(
seurat_obj = captopril_obj,
candidate_genes = candidates_present,
disease_label = "trt", control_label = "ctrl",
disease_column = "condition")
Dual-model signature optimization
optimizeSignatureDual performs iterative gene-swap optimization across both models simultaneously. At each iteration it evaluates every possible single-gene swap in the current 10-gene set, accepts the swap with the largest improvement in composite weighted AUC (30% Ren1c KO + 70% captopril), and stops when no swap improves the composite AUC for a specified number of consecutive iterations.
optimizeSignatureDual <- function(seurat_obj1, seurat_obj2,
disease_label1, control_label1,
disease_label2, control_label2,
disease_column1 = "genotype",
disease_column2 = "condition",
candidate_genes,
initial_signature = NULL,
n_genes = 10,
max_consecutive_iteractions = 3,
max_swap_iterations = 50,
weight_dataset1 = 0.5) {
expr_data1 <- FetchData(seurat_obj1, vars = c(candidate_genes, disease_column1))
expr_data1$disease_status <- ifelse(expr_data1[[disease_column1]] == disease_label1, 1, 0)
expr_data2 <- FetchData(seurat_obj2, vars = c(candidate_genes, disease_column2))
expr_data2$disease_status <- ifelse(expr_data2[[disease_column2]] == disease_label2, 1, 0)
calcCompositeAUC <- function(genes, w1 = weight_dataset1) {
s1 <- rowMeans(expr_data1[, genes, drop = FALSE])
auc1 <- as.numeric(auc(roc(expr_data1$disease_status, s1, quiet = TRUE)))
s2 <- rowMeans(expr_data2[, genes, drop = FALSE])
auc2 <- as.numeric(auc(roc(expr_data2$disease_status, s2, quiet = TRUE)))
list(composite = w1 * auc1 + (1 - w1) * auc2, auc1 = auc1, auc2 = auc2)
}
if (is.null(initial_signature)) {
indiv <- data.frame(gene = character(), composite_auc = numeric())
for (g in candidate_genes) {
a <- calcCompositeAUC(g)
indiv <- rbind(indiv, data.frame(gene = g, composite_auc = a$composite))
}
selected_genes <- head(arrange(indiv, desc(composite_auc))$gene, n_genes)
} else {
selected_genes <- initial_signature
}
remaining_genes <- setdiff(candidate_genes, selected_genes)
init <- calcCompositeAUC(selected_genes)
current_composite <- init$composite
current_auc1 <- init$auc1
current_auc2 <- init$auc2
swap_history <- data.frame(iteration = 0, gene_removed = "None", gene_added = "None",
auc1 = current_auc1, auc2 = current_auc2,
composite_auc = current_composite, improvement = 0)
no_improve <- 0
for (iter in seq_len(max_swap_iterations)) {
best_composite <- current_composite
best_auc1 <- current_auc1; best_auc2 <- current_auc2
best_out <- NULL; best_in <- NULL
pb <- txtProgressBar(min = 0, max = length(selected_genes) * length(remaining_genes), style = 3)
k <- 0
for (gene_out in selected_genes) {
for (gene_in in remaining_genes) {
k <- k + 1
test <- calcCompositeAUC(c(setdiff(selected_genes, gene_out), gene_in))
if (test$composite > best_composite) {
best_composite <- test$composite; best_auc1 <- test$auc1; best_auc2 <- test$auc2
best_out <- gene_out; best_in <- gene_in
}
setTxtProgressBar(pb, k)
}
}
close(pb)
if (!is.null(best_out) && (best_composite - current_composite) > 0.0001) {
improvement <- best_composite - current_composite
selected_genes <- c(setdiff(selected_genes, best_out), best_in)
remaining_genes <- c(setdiff(remaining_genes, best_in), best_out)
current_composite <- best_composite; current_auc1 <- best_auc1; current_auc2 <- best_auc2
swap_history <- rbind(swap_history,
data.frame(iteration = iter, gene_removed = best_out, gene_added = best_in,
auc1 = current_auc1, auc2 = current_auc2,
composite_auc = current_composite, improvement = improvement))
no_improve <- 0
} else {
no_improve <- no_improve + 1
if (no_improve >= max_consecutive_iteractions) break
}
}
list(selected_genes = selected_genes,
swap_history = swap_history,
final_auc1 = current_auc1,
final_auc2 = current_auc2,
final_composite = current_composite,
total_improvement = current_composite - swap_history$composite_auc[1])
}
optimized_dual <- optimizeSignatureDual(
seurat_obj1 = ren1c_obj,
seurat_obj2 = captopril_obj,
disease_label1 = "KO", control_label1 = "WT",
disease_label2 = "trt", control_label2 = "ctrl",
disease_column1 = "genotype",
disease_column2 = "condition",
candidate_genes = candidates_present,
initial_signature = initial_signature,
n_genes = 10,
max_consecutive_iteractions = length(candidates_present),
max_swap_iterations = 500,
weight_dataset1 = 0.3)
print(optimized_dual$selected_genes)
The final optimized 10-gene signature: Clu, Lrp2, Lamp2, Col4a2, Spink1, Wfdc2, Pax8, Lyz2, S100a9, and Cdh13. This list outperforms any single-model permutation result and is used throughout sections 6B, 6C, and the human validation (section 7).