大家好,我是阿琛。在之前的内容中,我们介绍了LASSO模型的构建方法(高分生信秘籍!手把手叫你构建LASSO Cox回归模型)。然而,在临床模型,以及机器学习中,除了常用的LASSO模型,还有许多经典的模型方法。作为其好朋友,随机森林首当其冲,更是一种大家耳熟能详的模型构建方法。
1.R包的安装与读取
rm(list = ls()) #清空环境变量
options(stringsAsFactors = F)
###1. R包的安装与读取
if(!require(randomForest))install.packages("randomForest")
if(!require(tidyverse))install.packages("tidyverse")
if(!require(ggpubr))install.packages("ggpubr")
if(!require(ROCR))install.packages("ROCR")
library(randomForest)
library(tidyverse)
library(ggpubr)
library(ROCR)
2.数据读取与预处理
#读取表达数据
rt <- read.table("exp.txt", header=T, sep=" ", check.names=F, row.names = 1)
exp <- t(log2(rt+1))
str(exp)
#读取临床数据
cli <- read.table("cliData.txt",header=T,sep=" ",check.names=F, row.names = 1)
head(cli)
str(cli)
#将数据切分为训练集和测试集
set.seed(123)
index <- sample(nrow(cli), round(nrow(cli) * 0.7))
trainee <- exp[index,]
traineeCli <- cli[index,]
testee <- exp[-index,]
testeeCli <- cli[-index,]
3.使用trainee数据集建立随机森林模型
#构建模型
rf_output=randomForest(x = trainee,
y = traineeCli$fustat,
importance = TRUE,
ntree = 500,
proximity=TRUE)
plot(rf_output, type = "l") #对训练误差进行可视化
#top20的基因
varImpPlot(rf_output, type=2, n.var=20, scale=FALSE,
main="Importance of Variables for top 20 gens",cex =.7)
#提取前20基因名
rf_importances=importance(rf_output, scale=FALSE)
top20_gene=rownames(tail(rf_importances[order(rf_importances[,2]),],20))
top20_gene
4.模型验证及其可视化
1
rf_train <- predict(rf_output, trainee)
result_train <- cbind(traineeCli , rf_train)
head(result_train)
ggboxplot(result_train,
x = "fustat", y = "rf_train",
color = "fustat", palette = "jco",
add = "jitter") +
stat_compare_means()
2
rf_test <- predict(rf_output, testee)
result_test <- cbind(testeeCli , rf_test)
ggboxplot(result_test,
x = "fustat", y = "rf_test",
color = "fustat", palette = "jco",
add = "jitter") +
stat_compare_means()
3
pred_train <- prediction(result_train[,3], result_train[,2])
ROC_train <- performance(pred_train,"tpr","fpr")
auc_train <- performance(pred_train,"auc")@y.values[[1]]
auc_train
#[1] 1
plot(ROC_train,
col="red", #曲线的颜色
xlab="False positive rate", ylab="True positive rate", #x轴和y轴的名称
lty=1,lwd=3,
main=paste("AUC=", auc_train))
abline(0, 1, lty=2, lwd=3) #绘制对角线
4
pred_test <- prediction(result_test[,3], result_test[,2])
ROC_test <- performance(pred_test,"tpr","fpr")
auc_test <- performance(pred_test,"auc")@y.values[[1]]
auc_test
#[1] 0.7222222
plot(ROC_test,
col="blue", #曲线的颜色
xlab="False positive rate", ylab="True positive rate", #x轴和y轴的名称
lty=1,lwd=3,
main=paste("AUC=", auc_test))
abline(0, 1, lty=2, lwd=3) #绘制对角线
联系客服