TCGA数据挖掘之BRCA subtype poster

这是一篇review，里面有几个图，我们重现了

这次 jimmy交给我了学徒任务，只能硬着头皮上，主要是完成一个海报的两个突，真的是指哪打哪！

海报在线链接：http://www.nature.com/nrclinonc/posters/breastcancer/

需要重现的图有：

marker基因差异表达与PAM50分型的热图:

somatic突变图谱与PAM50分型

step1.数据下载

文章中提到了数据来自于引用文献：https://www.nature.com/articles/nature11412

Agilent mRNA expression microarrays (n = 547)

Affymetrix 6.0 single nucleotide polymorphism (SNP) arrays (n = 773)

下载TCGA_xena中的BRCA数据：

根据这个推文里面说到的数据下载方法：https://mp.weixin.qq.com/s/Yz5sl-T3ITvLEFnv5H47wQ

安捷伦芯片数据：AgilentG4502A_07_3 (n=597) TCGA hub

临床数据：Phenotypes (n=1,247) TCGA hub

以及somatic mutations数据：来自于R包 TCGAmutations具体下载方法请看后文

step2.临床数据

# 清空环境
rm(list = ls())
options(stringsAsFactors = F)

# 读取临床数据
phe = read.table('BRCA_clinicalMatrix.gz',header = T,sep = '\t',quote = '')
table(phe$PAM50Call_RNAseq)
# 取出sampleID和PAM50Call_RNAseq两列
phe = phe[c('sampleID','PAM50Call_RNAseq')]
rownames(phe) = phe$sampleID
table(phe$PAM50Call_RNAseq)
# 过滤掉Normal和空白分组
phe = phe[phe$PAM50Call_RNAseq != 'Normal',]
phe = phe[phe$PAM50Call_RNAseq != '',]

phe = phe[order(phe$PAM50Call_RNAseq),]

step3.热图

# 读取表达矩阵
dat <- read.table('AgilentG4502A_07_3.gz',
                  header = T,
                  sep = '\t',
                  quote = '',
                  row.names = 1)
dat[1:4,1:4]
colnames(dat) = gsub('\\.',  '-', colnames(dat))
# 矩阵取小，取出有临床数据phe的pam50的样本
table(colnames(dat) %in% rownames(phe))
phe = phe[rownames(phe) %in% colnames(dat),]
dat = dat[,rownames(phe)]

# 取出感兴趣的基因
gene=c('ESR1','PGR','GATA3','XBP1','FOXA1',
       'ERBB2','GRB7',
       'EGFR','FOXC1','ID4','KIT','MYC',
       'MKI67','MYBL2','AURKA','BUB1','CENPF',
       'KRT8','KRT18','KRT5','KRT6A')
marker_dat = dat[gene,]

# 热图基因的注释
gene_list = data.frame(group = c(rep('Luminal',5),
                               rep('HER2',2),
                               rep('Basal',5),
                               rep('Proliferation',5),
                               rep('Keratin',4)),
                       row.names = gene)


# 热图样本注释
pam50_list = phe[colnames(dat),][,2]
sample_group = data.frame(pam50_list = pam50_list,
                          row.names = colnames(dat))

# 用pheatmap画热图，添加注释
pheatmap::pheatmap(marker_dat,cluster_cols = F,
                   cluster_rows = F,
                   cutree_rows = 5,
                   show_colnames =F,
                   show_rownames = T,
                   annotation_col = sample_group,
                   annotation_row = gene_list,
                   filename = 'TCGA_BRCA_gene_AgilentG.png')

step4.marker基因突变图

# 清空环境
rm(list = ls())
options(stringsAsFactors = F)

# 切换镜像安装R包
options(repos='https://mirrors.ustc.edu.cn/CRAN')
options(BioC_mirror='https://mirrors.ustc.edu.cn/bioc')
# 安装maftools包
BiocManager::install('maftools')
# 安装TCGAmutations包，对网络要求较高，而且下载很久，可能下载失败
devtools::install_github(repo = 'PoisonAlien/TCGAmutations')
library(TCGAmutations)
library(maftools)

# TCGAmutations自带数据，可以用`?tcga_load`查看帮助文档
# 这里我们直接加载BRCA突变数据，读进来就是一个名为tcga_brca_mc3的maf对象
tcga_load(study = 'BRCA')
# 看看数据的整体情况
laml = tcga_brca_mc3
laml@data = laml@data[!grepl('^MT-', laml@data$Hugo_Symbol), ]
laml@data$t_vaf = (laml@data$t_alt_count / laml@data$t_depth)
unique(laml@data$Tumor_Sample_Barcode)
getSampleSummary(laml)
getGeneSummary(laml)
getFields(laml)

# 因为对象自带临床数据，可以拿到IHC分型，先统计一下
table(laml@clinical.data$breast_carcinoma_estrogen_receptor_status)
table(laml@clinical.data$breast_carcinoma_progesterone_receptor_status)
table(laml@clinical.data$lab_proc_her2_neu_immunohistochemistry_receptor_status)
# 然后把er，pr，her2的IHC分型做成一个数据框
# 这里需要注意一点，临床分型样本名所在的列名需要与maf对象laml保持一致
dat = data.frame(
    Tumor_Sample_Barcode = laml@clinical.data$Tumor_Sample_Barcode,
    er = laml@clinical.data$breast_carcinoma_estrogen_receptor_status,
    pr = laml@clinical.data$breast_carcinoma_progesterone_receptor_status,
    her2 = laml@clinical.data$lab_proc_her2_neu_immunohistochemistry_receptor_status)
# 把一些阴阳性不确定的样本过滤掉
dat = dat[with(dat,er %in% c('Negative','Positive')) &
             with(dat,pr %in% c('Negative','Positive')) &
             with(dat,her2 %in% c('Negative','Positive')),]
# 由er，pr，her2的阴阳性拿到IHC分型
dat$sub[(dat$er=='Positive' | dat$pr=='Positive') & dat$her2=='Negative'] = 'HR+/HER2–'
dat$sub[(dat$er=='Positive' | dat$pr=='Positive') & dat$her2=='Positive'] = 'HR+/HER2+'
dat$sub[(dat$er=='Negative' & dat$pr=='Negative') & dat$her2=='Positive'] = 'HER2'
dat$sub[(dat$er=='Negative' & dat$pr=='Negative') & dat$her2=='Negative'] = 'TNBC'
table(dat$sub)

# 最后用maftools的oncoplot画突变全景图，
# marker基因突变全景图
gene = c('PIK3CA','CDH1','GATA3','MAP3K1','MAP2K4','FOXA1','AKT1','PTEN','TP53')
oncoplot(maf = laml, 
         genes = gene, 
         fill = T,
         fontSize = 0.8 ,
         annotationDat = dat,
         clinicalFeatures = 'sub',
         sortByAnnotation = T,
         GeneOrderSort = T,
         keepGeneOrder = T,
         showTumorSampleBarcodes = F)

写在后面

本文排版奇差无比，图片也重复的很不美观，我都不想把它放入学徒数据挖掘列表，不过至少知识点和数据处理本身的代码都ok啦，希望对大家有所帮助！

本站仅提供存储服务，所有内容均由用户发布，如发现有害或侵权内容，请点击举报。