Derive the CAAH 10-Gene Signature

Related to Figure 6.

The 10-gene CAAH signature is derived through a two-phase process: building a broad candidate pool from ECM, inflammation, hypoxia, SASP, and renal fibrosis gene lists, then optimizing a 10-gene subset by iterative gene-swap across both models simultaneously to maximize the composite discrimination AUC.

Candidate gene pool

The candidate pool covers ECM biology, inflammatory signaling, hypoxia response, SASP, renal fibrosis markers, known urinary biomarkers, and a preliminary signature used as the starting point for optimization.

initial_signature <- c("Clu", "Col4a2", "Ctsc", "Ets1", "Igfbp5",
                        "Lrp2", "Lyz2", "Pax8", "S100a6", "S100a9")

IFRG_genes <- unique(c("Tnfrsf1a", "Tnfrsf1b", "Tnfrsf21", "Tnfrsf19",
    "Il15ra", "Il17f", "Cd55", "Cd300c", "Tnfsf15", "Csf1", "Havcr2",
    "Il1r1", "Il18r1", "Il17a", "Lifr", "Cxcl1", "Ros1", "Plaur", "Acvr2a",
    "Klf6", "Gabbr1", "Aplnr", "Sphk1", "Pdpn", "Adora2b", "Ripk2", "Ptges",
    "Cxcl9", "Wnt7a", "Hrh1", "Tnfsf9", "Kl", "Ffar3", "Agt",
    "Mc1r", "Il17d"))

hypoxia_genes <- unique(c("Plaur", "Ucn", "Pabpc1l", "Slc16a12", "Nfe2l3",
    "Kcnab1", "Gpc3", "Kif5a", "Ankzf1", "Ets1", "Selenbp1", "Aldoa",
    "Angptl4", "Anln", "Bnc1", "Mrgbp", "Cdkn3", "Col4a6", "Dcbld1", "Eno1",
    "Fam83b", "Fosl1", "Gnai1", "Hilpda", "Kctd11", "Krt17", "Ldha", "P4ha1",
    "Pgam1", "Pgk1", "Sdc1", "Slc16a1", "Slc2a1", "Tpi1", "Vegfa", "Cygb",
    "Hif1a", "Arnt"))

sasp_genes <- unique(c("Il6", "Il1a", "Il1b", "Il1r1", "Il1rn", "Il7",
    "Il7r", "Il13", "Il15", "Cxcl1", "Cxcl2", "Cxcl3",
    "Ccl8", "Ccl20", "Ccl11", "Ccl25",
    "Ccl26", "Csf2", "Mif", "Fn1", "Areg", "Ereg", "Nrg1", "Fgf2",
    "Hgf", "Fgf7", "Vegfa", "Ang", "Pigf",
    "Mmp3", "Mmp10", "Mmp12", "Mmp13", "Mmp14", "Timp2",
    "Serpine1", "Ctsb", "Icam1", "Tnfrsf11b",
    "Fas", "Plaur", "Il6st", "Egfr", "Ptger2", "Nos1",
    "Cdkn1a", "Cdkn2a", "Lmnb1", "Bcl2l1", "Bcl2l2",
    "Serpinb2", "Hsp90aa1", "Hsp90ab1", "Igf1", "Igf1r", "Igf2",
    "Igfbp1", "Igfbp2", "Igfbp3", "Igfbp4", "Igfbp5",
    "Igfbp6", "Igfbp7"))

renal_fibrosis_markers <- c("Adora2a", "Ager", "Agt", "Agtrap", "Akt1",
    "Alb", "Angpt2", "Anxa1", "Apln", "Aplnr", "Apoe", "Azgp1", "Bdkrb1",
    "Bdkrb2", "Bgn", "Bicc1", "Bmp1", "Bmp2", "Bmp3", "Bmp7", "Bmpr1a", "C6",
    "Cast", "Cat", "Cav1", "Ccl2", "Ccl5", "Ccr7", "Cd248", "Cdc42", "Cdh16",
    "Cebpa", "Cftr", "Clu", "Col1a1", "Col3a1", "Col4a3", "Creb1", "Csf1",
    "Ctnnb1", "Cubn", "Cul3", "Cxcl16", "Cybb", "Cys1", "Ddah1", "Dnmt1",
    "Edn1", "Ednra", "Ednrb", "Efnb2", "Egfr", "Egln1", "Eng", "Epas1", "Epo",
    "Erbb4", "Fan1", "Fermt2", "Fgf23", "Fn1", "Gli1", "Gli2", "Glis2",
    "Havcr1", "Hgf", "Hipk2", "Hmox1", "Hnrnpf", "Ift140", "Ift20", "Igf1",
    "Ilk", "Invs", "Itga1", "Itgb1", "Jak3", "Kcnj1", "Kcnn4", "Kl", "Klf5",
    "Klhl3", "Lcn2", "Lepr", "Lrp2", "Mmp14", "Mmp2", "Mpz", "Mrc2", "Mtor",
    "N4bp2l2", "Ncf1", "Nfkb1", "Nos3", "Notch1", "Notch3", "Nox4", "Nr0b2",
    "Nr1h4", "Pax8", "Pdpn", "Pkd1", "Pkhd1", "Plat", "Plaur", "Postn",
    "Ppara", "Pparg", "Ppargc1a", "Prkaa1", "Prkcb", "Prkg1", "Prss23",
    "Ptgis", "Ptgs2", "Rasal1", "Rbpj", "Rictor", "Rock1", "Rptor", "Rtn1",
    "S100a4", "Sdc4", "Serpine1", "Shh", "Shroom3", "Sirt1", "Smad7",
    "Smurf1", "Smurf2", "Snai1", "Snai2", "Socs1", "Srebf1", "Stk11", "Tet3",
    "Tgfb1", "Tgfb2", "Tgfb3", "Tgm2", "Thbs1", "Thg1l", "Tlr4", "Trpc3",
    "Tsc1", "Twist1", "Ucp2", "Umod", "Wfdc2", "Wnk4", "Wnt4", "Xpnpep1",
    "Xylt2")

biomarker_genes <- c("Ly6d", "Lyz2", "Grn", "Cdh13", "Ambp", "Col12a1", "Ahsg",
    "Spink1", "Wfdc2", "Tnfrsf1a", "Ccl2", "Egf", "Havcr1", "Lcn2", "Tgfa")

ECM gene sets from the Matrisome database (gene aliases updated to current symbols):

ecm_glycoproteins <- c("Abi3bp", "Aebp1", "Agrn", "AW551984", "Bmper", "Ccn2", "Ccn5", "Cilp",
    "Cilp2", "Colq", "Dmbt1", "Dpt", "Ecm1", "Efemp1", "Efemp2", "Eln",
    "Emid1", "Emilin1", "Emilin2", "Fbln1", "Fbln2", "Fbln5", "Fbn1", "Fbn2",
    "Fga", "Fgb", "Fgg", "Fn1", "Fndc1", "Fras1", "Gldn", "Hmcn1",
    "Hmcn2", "Igfbp6", "Igfbp7", "Igsf10", "Ints14", "Kcp", "Lama1", "Lama2",
    "Lama3", "Lama4", "Lama5", "Lamb1", "Lamb2", "Lamb3", "Lamc1", "Lamc2",
    "Lamc3", "Lrg1", "Ltbp1", "Ltbp2", "Ltbp3", "Ltbp4", "Matn1", "Matn2",
    "Matn4", "Mfap2", "Mfap4", "Mfap5", "Mfge8", "Mgp", "Mmrn1", "Mmrn2",
    "Ndnf", "Nid1", "Nid2", "Npnt", "Ntn1", "Ntn4", "Otog", "Papln",
    "Pcolce", "Pcolce2", "Postn", "Pxdn", "Sbspon", "Sparc", "Spon1", "Srpx",
    "Srpx2", "Tgfbi", "Thbs1", "Thbs2", "Thbs3", "Thbs4", "Thsd4", "Tinag",
    "Tinagl1", "Tnc", "Tnn", "Tnxb", "Vtn", "Vwa1", "Vwa5a", "Vwf")

collagens <- c("Col10a1", "Col11a1", "Col11a2", "Col12a1", "Col13a1", "Col14a1", "Col15a1", "Col16a1",
    "Col17a1", "Col18a1", "Col19a1", "Col1a1", "Col1a2", "Col22a1", "Col23a1", "Col24a1",
    "Col25a1", "Col27a1", "Col28a1", "Col2a1", "Col3a1", "Col4a1", "Col4a2", "Col4a3",
    "Col4a4", "Col4a5", "Col4a6", "Col5a1", "Col5a2", "Col5a3", "Col6a1", "Col6a2",
    "Col6a3", "Col6a4", "Col6a5", "Col6a6", "Col7a1", "Col8a1", "Col8a2", "Col9a1",
    "Col9a3")

ecm_regulators <- c("1810010H24Rik", "Adam10", "Adam19", "Adam23", "Adam7", "Adamts17", "Adamts9", "Adamtsl1",
    "Adamtsl4", "Adamtsl5", "AI182371", "Ambp", "Cpn2", "Cstb", "Ctsb", "Ctsc",
    "Ctsd", "Ctsg", "Egln1", "Elane", "F10", "F13a1", "F13b", "F2",
    "F9", "Fam20b", "Habp2", "Hrg", "Htra1", "Htra3", "Hyal2", "Itih1",
    "Itih2", "Itih3", "Itih4", "Itih5", "Kng1", "Ky", "P3h1", "P3h2",
    "Lox", "Loxl1", "Loxl2", "Loxl3", "Mep1a", "Mmp19", "Mmp2", "Mmp9",
    "Mug2", "P4ha1", "P4ha2", "Plg", "Plod1", "Plod2", "Plod3", "Pzp",
    "Serpina10", "Serpina1a", "Serpina1b", "Serpina1d", "Serpina3k", "Serpina3m", "Serpina3n", "Serpinb1a",
    "Serpinb2", "Serpinc1", "Serpind1", "Serpine2", "Serpinf1", "Serpinf2", "Serping1", "Serpinh1",
    "St14", "Tgm1", "Tgm2", "Tgm3", "Timp3", "Try10")

ecm_affiliated <- c("Anxa1", "Anxa11", "Anxa2", "Anxa3", "Anxa4", "Anxa5", "Anxa6", "Anxa7",
    "Anxa9", "C1qa", "C1qc", "C1qtnf2", "C1qtnf5", "C1qtnf7", "C1qtnf9", "Clec14a",
    "Colec12", "Cspg4", "Cspg5", "Frem1", "Frem2", "Gpc4", "Grem1", "Hpx",
    "Itln1", "Lgals1", "Lgals3", "Lgals4", "Lgals9", "Lman1", "Muc2", "Muc4",
    "Plxdc2", "Plxnb2", "Plxnc1", "Reg4", "Sema3c", "Sftpa1", "Sftpb", "Sftpc",
    "Sftpd")

proteoglycans <- c("Acan", "Aspn", "Bgn", "Chad", "Dcn", "Fmod", "Hapln1", "Hspg2",
    "Impg1", "Lum", "Ogn", "Prelp", "Prg2", "Prg4", "Spock3", "Vcan")

secreted_factors <- c("Angptl1", "Angptl2", "Angptl6", "Chrdl1", "Crlf3", "Cxcl12", "Egfl7", "Fgf2",
    "Hcfc1", "Hcfc2", "Il16", "Insl5", "Megf6", "Nrg1", "Pf4", "Rptn",
    "S100a10", "S100a13", "S100a16", "S100a4", "S100a6", "S100a8", "S100a9", "Scube2",
    "Tgfb1")

candidate_genes <- unique(c(
    ecm_glycoproteins, collagens, ecm_regulators, ecm_affiliated, proteoglycans, secreted_factors,
    IFRG_genes, hypoxia_genes, sasp_genes, renal_fibrosis_markers,
    initial_signature, biomarker_genes))

Load Seurat objects

ren1c_obj     <- readRDS("ren1c_obj.rds")
captopril_obj <- readRDS("captopril_obj.rds")

Filter candidate genes

Ribosomal, mitochondrial, hemoglobin, and non-standard gene names are excluded from both models before evaluation. Only genes present in both filtered feature sets are carried forward.

filterFeatures <- function(seurat_obj) {
    f <- Features(seurat_obj)
    f <- f[!grepl("^Rp[sl][[:digit:]]|^Rplp[[:digit:]]|^Rpsa", f)]
    f <- sort(f[!grepl("^mt-",      f)])
    f <- sort(f[!grepl("^H2-",      f)])
    f <- sort(f[!grepl("^Hb[ab]-",  f)])
    f <- sort(f[!grepl("^[0-9]",    f)])
    f <- sort(f[!grepl("^Gm",       f)])
    f <- sort(f[!grepl("^ENSMUSG",  f)])
    f <- sort(f[!grepl("(.*)Rik",   f)])
    f <- sort(f[!grepl("^B[BCE]",   f)])
    f <- sort(f[!grepl("^AI",       f)])
    f[-1]
}

DefaultAssay(ren1c_obj)     <- "RNA"
DefaultAssay(captopril_obj) <- "RNA"

ren1c_filtered     <- filterFeatures(ren1c_obj)
captopril_filtered <- filterFeatures(captopril_obj)

candidates_present <- intersect(
    candidate_genes[candidate_genes %in% ren1c_filtered],
    candidate_genes[candidate_genes %in% captopril_filtered])

Per-gene ROC evaluation

Each candidate gene is individually scored in both models. AUC and Wilcoxon p-value are computed to rank genes by discriminative power before the combinatorial optimization.

evaluateIndividualGenes <- function(seurat_obj, candidate_genes,
                                   disease_label, control_label,
                                   disease_column = "genotype") {
    expr_data <- FetchData(seurat_obj, vars = c(candidate_genes, disease_column))
    expr_data$disease_status <- ifelse(expr_data[[disease_column]] == disease_label, 1, 0)

    gene_performance <- data.frame(gene = character(), auc = numeric(),
                                   ci_lower = numeric(), ci_upper = numeric(),
                                   p_value = numeric(), stringsAsFactors = FALSE)
    pb <- txtProgressBar(min = 0, max = length(candidate_genes), style = 3)
    for (i in seq_along(candidate_genes)) {
        gene <- candidate_genes[i]
        if (gene %in% colnames(expr_data)) {
            tryCatch({
                roc_obj <- roc(expr_data$disease_status, expr_data[[gene]],
                               quiet = TRUE, ci = TRUE)
                ci <- ci.auc(roc_obj)
                disease_expr <- expr_data[[gene]][expr_data$disease_status == 1]
                control_expr <- expr_data[[gene]][expr_data$disease_status == 0]
                wt <- wilcox.test(disease_expr, control_expr)
                gene_performance <- rbind(gene_performance,
                    data.frame(gene = gene, auc = as.numeric(auc(roc_obj)),
                               ci_lower = ci[1], ci_upper = ci[3],
                               p_value = wt$p.value))
            }, error = function(e) NULL)
        }
        setTxtProgressBar(pb, i)
    }
    close(pb)
    gene_performance %>%
        arrange(desc(auc)) %>%
        mutate(rank = row_number(), significant = p_value < 0.05)
}

ren1c_evaluations <- evaluateIndividualGenes(
    seurat_obj     = ren1c_obj,
    candidate_genes = candidates_present,
    disease_label  = "KO", control_label = "WT",
    disease_column = "genotype")

captopril_evaluations <- evaluateIndividualGenes(
    seurat_obj     = captopril_obj,
    candidate_genes = candidates_present,
    disease_label  = "trt", control_label = "ctrl",
    disease_column = "condition")

Dual-model signature optimization

optimizeSignatureDual performs iterative gene-swap optimization across both models simultaneously. At each iteration it evaluates every possible single-gene swap in the current 10-gene set, accepts the swap with the largest improvement in composite weighted AUC (30% Ren1c KO + 70% captopril), and stops when no swap improves the composite AUC for a specified number of consecutive iterations.

optimizeSignatureDual <- function(seurat_obj1, seurat_obj2,
                                  disease_label1, control_label1,
                                  disease_label2, control_label2,
                                  disease_column1 = "genotype",
                                  disease_column2 = "condition",
                                  candidate_genes,
                                  initial_signature = NULL,
                                  n_genes = 10,
                                  max_consecutive_iteractions = 3,
                                  max_swap_iterations = 50,
                                  weight_dataset1 = 0.5) {

    expr_data1 <- FetchData(seurat_obj1, vars = c(candidate_genes, disease_column1))
    expr_data1$disease_status <- ifelse(expr_data1[[disease_column1]] == disease_label1, 1, 0)
    expr_data2 <- FetchData(seurat_obj2, vars = c(candidate_genes, disease_column2))
    expr_data2$disease_status <- ifelse(expr_data2[[disease_column2]] == disease_label2, 1, 0)

    calcCompositeAUC <- function(genes, w1 = weight_dataset1) {
        s1  <- rowMeans(expr_data1[, genes, drop = FALSE])
        auc1 <- as.numeric(auc(roc(expr_data1$disease_status, s1, quiet = TRUE)))
        s2  <- rowMeans(expr_data2[, genes, drop = FALSE])
        auc2 <- as.numeric(auc(roc(expr_data2$disease_status, s2, quiet = TRUE)))
        list(composite = w1 * auc1 + (1 - w1) * auc2, auc1 = auc1, auc2 = auc2)
    }

    if (is.null(initial_signature)) {
        indiv <- data.frame(gene = character(), composite_auc = numeric())
        for (g in candidate_genes) {
            a <- calcCompositeAUC(g)
            indiv <- rbind(indiv, data.frame(gene = g, composite_auc = a$composite))
        }
        selected_genes <- head(arrange(indiv, desc(composite_auc))$gene, n_genes)
    } else {
        selected_genes <- initial_signature
    }
    remaining_genes <- setdiff(candidate_genes, selected_genes)

    init <- calcCompositeAUC(selected_genes)
    current_composite <- init$composite
    current_auc1      <- init$auc1
    current_auc2      <- init$auc2

    swap_history <- data.frame(iteration = 0, gene_removed = "None", gene_added = "None",
                               auc1 = current_auc1, auc2 = current_auc2,
                               composite_auc = current_composite, improvement = 0)
    no_improve <- 0

    for (iter in seq_len(max_swap_iterations)) {
        best_composite  <- current_composite
        best_auc1 <- current_auc1; best_auc2 <- current_auc2
        best_out  <- NULL; best_in <- NULL
        pb <- txtProgressBar(min = 0, max = length(selected_genes) * length(remaining_genes), style = 3)
        k  <- 0
        for (gene_out in selected_genes) {
            for (gene_in in remaining_genes) {
                k <- k + 1
                test <- calcCompositeAUC(c(setdiff(selected_genes, gene_out), gene_in))
                if (test$composite > best_composite) {
                    best_composite <- test$composite; best_auc1 <- test$auc1; best_auc2 <- test$auc2
                    best_out <- gene_out; best_in <- gene_in
                }
                setTxtProgressBar(pb, k)
            }
        }
        close(pb)

        if (!is.null(best_out) && (best_composite - current_composite) > 0.0001) {
            improvement     <- best_composite - current_composite
            selected_genes  <- c(setdiff(selected_genes, best_out), best_in)
            remaining_genes <- c(setdiff(remaining_genes, best_in), best_out)
            current_composite <- best_composite; current_auc1 <- best_auc1; current_auc2 <- best_auc2
            swap_history <- rbind(swap_history,
                data.frame(iteration = iter, gene_removed = best_out, gene_added = best_in,
                           auc1 = current_auc1, auc2 = current_auc2,
                           composite_auc = current_composite, improvement = improvement))
            no_improve <- 0
        } else {
            no_improve <- no_improve + 1
            if (no_improve >= max_consecutive_iteractions) break
        }
    }

    list(selected_genes   = selected_genes,
         swap_history     = swap_history,
         final_auc1       = current_auc1,
         final_auc2       = current_auc2,
         final_composite  = current_composite,
         total_improvement = current_composite - swap_history$composite_auc[1])
}

optimized_dual <- optimizeSignatureDual(
    seurat_obj1    = ren1c_obj,
    seurat_obj2    = captopril_obj,
    disease_label1 = "KO",  control_label1 = "WT",
    disease_label2 = "trt", control_label2 = "ctrl",
    disease_column1 = "genotype",
    disease_column2 = "condition",
    candidate_genes = candidates_present,
    initial_signature = initial_signature,
    n_genes = 10,
    max_consecutive_iteractions = length(candidates_present),
    max_swap_iterations = 500,
    weight_dataset1 = 0.3)

print(optimized_dual$selected_genes)

The final optimized 10-gene signature: Clu, Lrp2, Lamp2, Col4a2, Spink1, Wfdc2, Pax8, Lyz2, S100a9, and Cdh13. This list outperforms any single-model permutation result and is used throughout sections 6B, 6C, and the human validation (section 7).