注意:今天的教程比较长,请规划好你的时间。本文是付费内容,在本文文末有本教程的全部的代码和示例数据。
输出结果
分析代码
关于WGCNA分析,如果你的数据量较大,建议使用服务期直接分析,本地分析可能导致R崩掉。
设置文件位置
setwd("~/00_WGCNA/20230217_WGCNA/WGCNA_01")
加载分析所需的安装包
install.packages("WGCNA")#BiocManager::install('WGCNA')library(WGCNA)options(stringsAsFactors = FALSE)
注意,如果你想打开多线程分析,可以使用一下代码
enableWGCNAThreads()
一、导入基因表达量数据
## 读取txt文件格式数据WGCNA.fpkm = read.table("ExpData_WGCNA.txt",header=T,comment.char = "",check.names=F)################ 读取csv文件格式WGCNA.fpkm = read.csv("ExpData_WGCNA.csv", header = T, check.names = F)
数据处理
dim(WGCNA.fpkm)names(WGCNA.fpkm)datExpr0 = as.data.frame(t(WGCNA.fpkm[,-1]))names(datExpr0) = WGCNA.fpkm$sample;##########如果第一行不是ID命名,就写成fpkm[,1]rownames(datExpr0) = names(WGCNA.fpkm[,-1])
过滤数据
gsg = goodSamplesGenes(datExpr0, verbose = 3)gsg$allOKif (!gsg$allOK){if (sum(!gsg$goodGenes)>0)printFlush(paste("Removing genes:", paste(names(datExpr0)[!gsg$goodGenes], collapse = ", ")))if (sum(!gsg$goodSamples)>0)printFlush(paste("Removing samples:", paste(rownames(datExpr0)[!gsg$goodSamples], collapse = ", ")))# Remove the offending genes and samples from the data:datExpr0 = datExpr0[gsg$goodSamples, gsg$goodGenes]}
过滤低于设定的值的基因
##filtermeanFPKM=0.5###--过滤标准,可以修改n=nrow(datExpr0)datExpr0[n+1,]=apply(datExpr0[c(1:nrow(datExpr0)),],2,mean)datExpr0=datExpr0[1:n,datExpr0[n+1,] > meanFPKM]# for meanFpkm in row n+1 and it must be above what you set--select meanFpkm>opt$meanFpkm(by rp)filtered_fpkm=t(datExpr0)filtered_fpkm=data.frame(rownames(filtered_fpkm),filtered_fpkm)names(filtered_fpkm)[1]="sample"head(filtered_fpkm)write.table(filtered_fpkm, file="mRNA.filter.txt",row.names=F, col.names=T,quote=FALSE,sep="\t")
Sample cluster
sampleTree = hclust(dist(datExpr0), method = "average")pdf(file = "1.sampleClustering.pdf", width = 15, height = 8)par(cex = 0.6)par(mar = c(0,6,6,0))plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 2, cex.axis = 1.5, cex.main = 2)### Plot a line to show the cut#abline(h = 180, col = "red")##剪切高度不确定,故无红线dev.off()
不过滤数据
如果你的数据不进行过滤直接进行一下操作,此步与前面的操作相同,任选异种即可。
## 不过滤## Determine cluster under the lineclust = cutreeStatic(sampleTree, cutHeight = 50000, minSize = 10)table(clust)# clust 1 contains the samples we want to keep.keepSamples = (clust!=0)datExpr0 = datExpr0[keepSamples, ]write.table(datExpr0, file="mRNA.symbol.uniq.filter.sample.txt",row.names=T, col.names=T,quote=FALSE,sep="\t")################Sample cluster###########sampleTree = hclust(dist(datExpr0), method = "average")pdf(file = "1.sampleClustering.filter.pdf", width = 12, height = 9)par(cex = 0.6)par(mar = c(0,4,2,0))plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 1.5, cex.axis = 1.5, cex.main = 2)### Plot a line to show the cut#abline(h = 50000, col = "red")##剪切高度不确定,故无红线dev.off()
二、导入性状数据
traitData = read.table("TraitData.txt",row.names=1,header=T,comment.char = "",check.names=F)allTraits = traitDatadim(allTraits)names(allTraits)
## 形成一个类似于表达数据的数据框架fpkmSamples = rownames(datExpr0)traitSamples =rownames(allTraits)traitRows = match(fpkmSamples, traitSamples)datTraits = allTraits[traitRows,]rownames(datTraits)collectGarbage()
再次样本聚类
sampleTree2 = hclust(dist(datExpr0), method = "average")# Convert traits to a color representation: white means low, red means high, grey means missing entrytraitColors = numbers2colors(datTraits, signed = FALSE)
输出样本聚类图
pdf(file="2.Sample_dendrogram_and_trait_heatmap.pdf",width=20,height=12)plotDendroAndColors(sampleTree2, traitColors,groupLabels = names(datTraits),main = "Sample dendrogram and trait heatmap",cex.colorLabels = 1.5, cex.dendroLabels = 1, cex.rowText = 2)dev.off()
三、WGCNA分析(后面都是重点)
筛选软阈值
enableWGCNAThreads()# 设置soft-thresholding powers的数量powers = c(1:30)sft = pickSoftThreshold(datExpr0, powerVector = powers, verbose = 5)
此步骤是比较耗费时间的,静静等待即可。
绘制soft Threshold plot
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2], xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n", main = paste("Scale independence"));text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2], labels=powers,cex=cex1,col="red");# this line corresponds to using an R^2 cut-off of habline(h=0.8,col="red")# Mean connectivity as a function of the soft-thresholding powerplot(sft$fitIndices[,1], sft$fitIndices[,5], xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n", main = paste("Mean connectivity"))text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")dev.off()
选择softpower
选择softpower是一个玄学
的过程,可以直接使用软件自己认为是最好的softpower值,但是不一定你要获得最好结果;其次,我们自己选择自己认为比较好的softpower值,但是,需要自己不断的筛选。因此,从这里开始WGCNA的分析结果就开始受到不同的影响。
## 选择软件认为是最好的softpower值#softPower =sft$powerEstimate---# 自己设定softpower值softPower = 9
继续分析
adjacency = adjacency(datExpr0, power = softPower)
将邻接转化为拓扑重叠
这一步建议去服务器上跑,后面的步骤就在服务器上跑吧,数据量太大;如果你的数据量较小,本地也就可以
TOM = TOMsimilarity(adjacency);dissTOM = 1-TOM
geneTree = hclust(as.dist(dissTOM), method = "average");
绘制聚类树(树状图)
pdf(file="4_Gene clustering on TOM-based dissimilarity.pdf",width=24,height=18)plot(geneTree, xlab="", sub="", main = "Gene clustering on TOM-based dissimilarity", labels = FALSE, hang = 0.04)dev.off()
加入模块
minModuleSize = 30# Module identification using dynamic tree cut:dynamicMods = cutreeDynamic(dendro = geneTree, distM = dissTOM,deepSplit = 2, pamRespectsDendro = FALSE,minClusterSize = minModuleSize);table(dynamicMods)# Convert numeric lables into colorsdynamicColors = labels2colors(dynamicMods)table(dynamicColors)# Plot the dendrogram and colors underneath#sizeGrWindow(8,6)pdf(file="5_Dynamic Tree Cut.pdf",width=8,height=6)plotDendroAndColors(geneTree, dynamicColors, "Dynamic Tree Cut",dendroLabels = FALSE, hang = 0.03,addGuide = TRUE, guideHang = 0.05,main = "Gene dendrogram and module colors")dev.off()
合并模块
做出的WGCNA分析中,具有较多的模块,但是在我们后续的分析中,是使用不到这么多的模块,以及模块越多对我们的分析越困难,那么就必须合并模块信息。具体操作如下。
MEList = moduleEigengenes(datExpr0, colors = dynamicColors)MEs = MEList$eigengenes# Calculate dissimilarity of module eigengenesMEDiss = 1-cor(MEs);# Cluster module eigengenesMETree = hclust(as.dist(MEDiss), method = "average")# Plot the result#sizeGrWindow(7, 6)pdf(file="6_Clustering of module eigengenes.pdf",width=7,height=6)plot(METree, main = "Clustering of module eigengenes", xlab = "", sub = "")######剪切高度可修改MEDissThres = 0.4# Plot the cut line into the dendrogramabline(h=MEDissThres, col = "red")dev.off()
合并及绘图
= mergeCloseModules(datExpr0, dynamicColors, cutHeight = MEDissThres, verbose = 3)# The merged module colorsmergedColors = merge$colors# Eigengenes of the new merged modules:mergedMEs = merge$newMEstable(mergedColors)#sizeGrWindow(12, 9)pdf(file="7_merged dynamic.pdf", width = 9, height = 6)plotDendroAndColors(geneTree, cbind(dynamicColors, mergedColors),c("Dynamic Tree Cut", "Merged dynamic"),dendroLabels = FALSE, hang = 0.03,addGuide = TRUE, guideHang = 0.05)dev.off()
Rename to moduleColors
moduleColors = mergedColors# Construct numerical labels corresponding to the colorscolorOrder = c("grey", standardColors(50))moduleLabels = match(moduleColors, colorOrder)-1MEs = mergedMEs
性状数据与基因模块进行分析
nGenes = ncol(datExpr0)nSamples = nrow(datExpr0)moduleTraitCor = cor(MEs, datTraits, use = "p")moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
绘制模块性状相关性图
pdf(file="8_Module-trait relationships.pdf",width=10,height=10)# Will display correlations and their p-valuestextMatrix = paste(signif(moduleTraitCor, 2), "\n(", signif(moduleTraitPvalue, 1), ")", sep = "")dim(textMatrix) = dim(moduleTraitCor)par(mar = c(6, 8.5, 3, 3))# Display the correlation values within a heatmap plotlabeledHeatmap(Matrix = moduleTraitCor, xLabels = names(datTraits), yLabels = names(MEs), ySymbols = names(MEs), colorLabels = FALSE, colors = greenWhiteRed(50), textMatrix = textMatrix, setStdMargins = FALSE, cex.text = 0.5, zlim = c(-1,1), main = paste("Module-trait relationships"))dev.off()
计算MM和GS
modNames = substring(names(MEs), 3)geneModuleMembership = as.data.frame(cor(datExpr0, MEs, use = "p"))MMPvalue = as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), nSamples))names(geneModuleMembership) = paste("MM", modNames, sep="")names(MMPvalue) = paste("p.MM", modNames, sep="")#names of those traittraitNames=names(datTraits)geneTraitSignificance = as.data.frame(cor(datExpr0, datTraits, use = "p"))GSPvalue = as.data.frame(corPvalueStudent(as.matrix(geneTraitSignificance), nSamples))names(geneTraitSignificance) = paste("GS.", traitNames, sep="")names(GSPvalue) = paste("p.GS.", traitNames, sep="")
批量绘制性状与各个模块基因的相关性图
for (trait in traitNames){traitColumn=match(trait,traitNames)for (module in modNames){column = match(module, modNames)moduleGenes = moduleColors==moduleif (nrow(geneModuleMembership[moduleGenes,]) > 1){####进行这部分计算必须每个模块内基因数量大于2,由于前面设置了最小数量是30,这里可以不做这个判断,但是grey有可能会出现1个gene,它会导致代码运行的时候中断,故设置这一步#sizeGrWindow(7, 7)pdf(file=paste("9_", trait, "_", module,"_Module membership vs gene significance.pdf",sep=""),width=7,height=7)par(mfrow = c(1,1))verboseScatterplot(abs(geneModuleMembership[moduleGenes, column]), abs(geneTraitSignificance[moduleGenes, traitColumn]), xlab = paste("Module Membership in", module, "module"), ylab = paste("Gene significance for ",trait), main = paste("Module membership vs. gene significance\n"), cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module)dev.off()}}}names(datExpr0)probes = names(datExpr0)
输出GS和MM数据
geneInfo0 = data.frame(probes= probes, moduleColor = moduleColors)for (Tra in 1:ncol(geneTraitSignificance)){oldNames = names(geneInfo0)geneInfo0 = data.frame(geneInfo0, geneTraitSignificance[,Tra], GSPvalue[, Tra])names(geneInfo0) = c(oldNames,names(geneTraitSignificance)[Tra], names(GSPvalue)[Tra])}for (mod in 1:ncol(geneModuleMembership)){oldNames = names(geneInfo0)geneInfo0 = data.frame(geneInfo0, geneModuleMembership[,mod], MMPvalue[, mod])names(geneInfo0) = c(oldNames,names(geneModuleMembership)[mod], names(MMPvalue)[mod])}geneOrder =order(geneInfo0$moduleColor)geneInfo = geneInfo0[geneOrder, ]write.table(geneInfo, file = "10_GS_and_MM.xls",sep="\t",row.names=F)
可视化基因网络
nGenes = ncol(datExpr0)nSamples = nrow(datExpr0)nSelect = 400# For reproducibility, we set the random seed 不能用全部的基因,不然会爆炸的set.seed(10)select = sample(nGenes, size = nSelect)selectTOM = dissTOM[select, select]selectTree = hclust(as.dist(selectTOM), method = "average")selectColors = moduleColors[select]#sizeGrWindow(9,9)# Taking the dissimilarity to a power, say 10, makes the plot more informative by effectively changing# the color palette; setting the diagonal to NA also improves the clarity of the plotplotDiss = selectTOM^7diag(plotDiss) = NA
绘图
library("gplots")pdf(file="13_Network heatmap plot_selected genes.pdf",width=9, height=9)mycol = colorpanel(250,'red','orange','lemonchiffon')TOMplot(plotDiss, selectTree, selectColors, col=mycol ,main = "Network heatmap plot, selected genes")dev.off()
特征基因的基因网络可视化
pdf(file="14_Eigengene dendrogram and Eigengene adjacency heatmap.pdf", width=5, height=7.5)par(cex = 0.9)plotEigengeneNetworks(MEs, "", marDendro = c(0,4,1,2), marHeatmap = c(3,4,1,2), cex.lab = 0.8, xLabelsAngle= 90)dev.off()
获得本教程代码链接:WGCNA分析 | 全流程分析代码 | 代码一
小杜的生信筆記 ,主要发表或收录生物信息学的教程,以及基于R的分析和可视化(包括数据分析,图形绘制等);分享感兴趣的文献和学习资料!!