Seurat的打分函数AddMouduleScore

在读张泽民老师发表的新冠文献的时候看到计算免疫细胞的cytokine score或inflammatory score使用了Seurat包的AddMouduleScore函数，在计算细胞周期的CellCycleScoring函数的原代码中也使用了这个函数。

此功能可用于计算任何基因列表的监督模块评分，非常实用！！！

1. 查看一下这个函数的用法

library(Seurat)?AddModuleScore

2. 使用

输入数据：Seurat对象和一个gene list。

1 准备矩阵

library(tidyverse)library(Matrix)library(cowplot)pbmc <- readRDS("pbmc.rds")

2 准备想要计算的gene list

inflammatory_gene <- readxl::read_xlsx("inflammatory_gene.xlsx")View(inflammatory_gene)

#转换成listgene <- as.list(inflammatory_gene)

3 计算

pbmc <- AddModuleScore( object = pbmc, features = gene, ctrl = 100, #默认值是100 name = 'CD_Features')

计算结果保存在pbmc@meta.data[["CD_Features1"]]

得到的score是在每个细胞中算出来的我们感兴趣的基因的表达均值。

其在使用时需要先计算基因集中所有基因的平均值，再根据平均值把表达矩阵切割成若干个bin，然后从切割后的每一个bin随机抽取对照基因（基因集外的基因，默认100个）作为背景值。最后所有的目标基因算一个平均值，所有的背景基因算一个平均值，两者相减就是该gene set的score值。因此，在整合不同样本的情况下，即使使用相同基因集为相同细胞打分，也会产生不同的富集评分。从本质上看它和Zscore的算法很类似，Zscore又称Z值，原是一个统计学概念，表示的是个体测量值X以标准差σ为单位，偏离总体均数μ的距离，即：Z score=（X-μ）/σ。牵扯到统计学的概念不免有些难以理解，简单说它就是处理过的平均值。

colnames(pbmc@meta.data)# [1] "orig.ident" "nCount_RNA" "nFeature_RNA" # [4] "percent.mt" "RNA_snn_res.0.5" "seurat_clusters"# [7] "cell_type" "CD_Features1" colnames(pbmc@meta.data)[8] <- 'Inflammatory_Score' VlnPlot(pbmc,features = 'Inflammatory_Score')

绘制箱线图

data<- FetchData(pbmc,vars = c("cell_type","Inflammatory_Score"))p <- ggplot(data,aes(cell_type,Inflammatory_Score))p+geom_boxplot()+theme_bw()+RotatedAxis()

绘制分数的分布图

library(ggplot2)mydata<- FetchData(pbmc,vars = c("UMAP_1","UMAP_2","Inflammatory_Score"))a <- ggplot(mydata,aes(x = UMAP_1,y =UMAP_2,colour = Inflammatory_Score))+geom_point(size = 1)+scale_color_gradientn(values = seq(0,1,0.2),colours = c('blue','cyan','green','yellow','orange','red'))a+ theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"))

打分是一个连续变化的值，我们也可以直接将其转换为一定的分类变量：

#按照均值分pbmc[["stage"]] <- ifelse(pbmc@meta.data[,"Inflammatory_Score"] > mean(pbmc@meta.data[,"Inflammatory_Score"]),"High","Low")rownames(pbmc@meta.data)#按照75%分pbmc[["stage"]] <-pbmc@meta.data[,"Inflammatory_Score"] <- ifelse(pbmc@meta.data[,"Inflammatory_Score"] > quantile(pbmc@meta.data[,"Inflammatory_Score"],0.75),"High","Low")#按照具体数值分pbmc[["stage"]] <-pbmc@meta.data[,"Inflammatory_Score"] <- ifelse(pbmc@meta.data[,"Inflammatory_Score"] > 0.5,"High","Low")Idents(pbmc) <- 'Inflammatory_Score'DimPlot(pbmc,reduction = "umap",label = TRUE,pt.size = 1.5)

3. 主要参数

参数意义

objectSeurat object

featuresA list of vectors of features for expression programs; each entry should be a vector of feature names

poolList of features to check expression levels against, defaults to rownames(x = object)

nbinNumber of bins of aggregate expression levels for all analyzed features

ctrlNumber of control features selected from the same bin per analyzed feature

kUse feature clusters returned from DoKMeans

assayName of assay to use

nameName for the expression programs; will append a number to the end for each entry in features (eg. if features has three programs, the results will be stored as name1, name2, name3, respectively)

seedSet a random seed. If NULL, seed is not set.

searchSearch for symbol synonyms for features in features that don't match features in object? Searches the HGNC's gene names database; see UpdateSymbolList for more details

...Extra parameters passed to UpdateSymbolList

4. 函数代码

AddModuleScorefunction (object, features, pool = NULL, nbin = 24, ctrl = 100, k = FALSE, assay = NULL, name = "Cluster", seed = 1, search = FALSE, ...) { if (!is.null(x = seed)) { set.seed(seed = seed) } assay.old <- DefaultAssay(object = object) assay <- assay %||% assay.old DefaultAssay(object = object) <- assay assay.data <- GetAssayData(object = object) features.old <- features if (k) { .NotYetUsed(arg = "k") features <- list() for (i in as.numeric(x = names(x = table(object@kmeans.obj[[1]]$cluster)))) { features[[i]] <- names(x = which(x = object@kmeans.obj[[1]]$cluster == i)) } cluster.length <- length(x = features) } else { if (is.null(x = features)) { stop("Missing input feature list") } features <- lapply(X = features, FUN = function(x) { missing.features <- setdiff(x = x, y = rownames(x = object)) if (length(x = missing.features) > 0) { warning("The following features are not present in the object: ", paste(missing.features, collapse = ", "), ifelse(test = search, yes = ", attempting to find updated synonyms", no = ", not searching for symbol synonyms"), call. = FALSE, immediate. = TRUE) if (search) { tryCatch(expr = { updated.features <- UpdateSymbolList(symbols = missing.features, ...) names(x = updated.features) <- missing.features for (miss in names(x = updated.features)) { index <- which(x == miss) x[index] <- updated.features[miss] } }, error = function(...) { warning("Could not reach HGNC's gene names database", call. = FALSE, immediate. = TRUE) }) missing.features <- setdiff(x = x, y = rownames(x = object)) if (length(x = missing.features) > 0) { warning("The following features are still not present in the object: ", paste(missing.features, collapse = ", "), call. = FALSE, immediate. = TRUE) } } } return(intersect(x = x, y = rownames(x = object))) }) cluster.length <- length(x = features) } if (!all(LengthCheck(values = features))) { warning(paste("Could not find enough features in the object from the following feature lists:", paste(names(x = which(x = !LengthCheck(values = features)))), "Attempting to match case...")) features <- lapply(X = features.old, FUN = CaseMatch, match = rownames(x = object)) } if (!all(LengthCheck(values = features))) { stop(paste("The following feature lists do not have enough features present in the object:", paste(names(x = which(x = !LengthCheck(values = features)))), "exiting...")) } pool <- pool %||% rownames(x = object) data.avg <- Matrix::rowMeans(x = assay.data[pool, ]) data.avg <- data.avg[order(data.avg)] data.cut <- cut_number(x = data.avg + rnorm(n = length(data.avg))/1e+30, n = nbin, labels = FALSE, right = FALSE) names(x = data.cut) <- names(x = data.avg) ctrl.use <- vector(mode = "list", length = cluster.length) for (i in 1:cluster.length) { features.use <- features[[i]] for (j in 1:length(x = features.use)) { ctrl.use[[i]] <- c(ctrl.use[[i]], names(x = sample(x = data.cut[which(x = data.cut == data.cut[features.use[j]])], size = ctrl, replace = FALSE))) } } ctrl.use <- lapply(X = ctrl.use, FUN = unique) ctrl.scores <- matrix(data = numeric(length = 1L), nrow = length(x = ctrl.use), ncol = ncol(x = object)) for (i in 1:length(ctrl.use)) { features.use <- ctrl.use[[i]] ctrl.scores[i, ] <- Matrix::colMeans(x = assay.data[features.use, ]) } features.scores <- matrix(data = numeric(length = 1L), nrow = cluster.length, ncol = ncol(x = object)) for (i in 1:cluster.length) { features.use <- features[[i]] data.use <- assay.data[features.use, , drop = FALSE] features.scores[i, ] <- Matrix::colMeans(x = data.use) } features.scores.use <- features.scores - ctrl.scores rownames(x = features.scores.use) <- paste0(name, 1:cluster.length) features.scores.use <- as.data.frame(x = t(x = features.scores.use)) rownames(x = features.scores.use) <- colnames(x = object) object[[colnames(x = features.scores.use)]] <- features.scores.use CheckGC() DefaultAssay(object = object) <- assay.old return(object)}<bytecode: 0x7fc8df13f008><environment: namespace:Seurat>

5. 参考文献：

本站仅提供存储服务，所有内容均由用户发布，如发现有害或侵权内容，请点击举报。