This workflow takes in a CEL file and a normalisation method then returns a series of images/graphs which represent the same output obtained using the MADAT software package (MicroArray Data Analysis Tool) [http://www.bioinf.manchester.ac.uk/MADAT/index.html]. Also retruned by this workflow are a list of the top differentialy expressed genes (size dependant on the number specified as input - geneNumber), which are then used to find the candidate pathways which may be influencing the observed changes in the microarray data. By identifying the candidate pathways, more detailed insights into the gene expression data can be obtained.
NOTE - You will also need to install R and Rserv on your machine and install the libaries required by the R script into you R library directory (see for basic info: http://www.cs.man.ac.uk/~fisherp/rlib.html)
The example inputs for this workflow are as follows:
Samples1 = one or more CEL files for cross-correlating with Samples2 CEL files (new line separated including the .CEL):
Liver_Day1_Mouse.CEL
Liver_Day2_Mouse.CEL
Samples2 = one or more CEL files for cross-correlating with Samples1 CEL files (new line separated including the .CEL):
Kideny_Day1_Mouse.CEL
Kidney_Day2_Mouse.CEL
geneNumber = the number of differentialy expressed gene to be returned above a given p-value, e.g. 20
arrayTypeAffy = the name of the Mouse AffyMetrix array used, e.g. mouse4302, hgu133a...
path = the direct path to the CEL file location, e.g. C:/Microarray_Data/CEL_FILES/ - note the forward slashes
NormalizationMethod = the type of normalisation to perfrom, e.g. rma, gcrma or mmgmos
testMethod = e.g. limma, mmtest or pplr
p-value = the p-value cut-off value for the array data, e.g. 0.05
foldChange = the fold change value for the microarray data, e.g. 1 (means greater than 1 or less than -1)
org.embl.ebi.escience.scuflworkers.java.SplitByRegex
org.embl.ebi.escience.scuflworkers.java.SplitByRegex
C:/affy data for Munich course/ShortCelFilesLiver
mouse4302
#@to do: extract array type automatically then install relevant packages
# installing packages
#source("http://www.bioconductor.org/biocLite.R")
#biocLite()
#biocLite(c("affy "," limma ","mmgmos"," ade4"," affyQCReport"," base","pplr " ," splines "," survival "," multtest "))
library(base)
library(affy)
# make sure you are in the directory
# where the CEL files are
getwd()
#setwd("C:/affy data for Munich course/ShortCelFilesLiver")
# dirPath input from workflow
setwd(dirPath)
fnames = dir(pattern="CEL")
# read in CEL files
#library(mouse4302cdf, lib="C:/Program Files/R/rw2001/library" )
#library(mouse4302cdf)
#library(mouse4302, lib="C:/Program Files/R/rw2001/library")
#library(mouse4302);
library(arrayType,,character.only = TRUE)
library(sub(" ","",paste(arrayType,'cdf')), character.only = TRUE)
affyID1 <- NULL
#************ Loadind data
data.raw = ReadAffy(filenames=fnames)
sName = sampleNames(data.raw)
# check whether it read in things correctly
#data.raw
#image(data.raw, col=rainbow(100, start=0, end=0.7)[100:1])
# boxplot(data.raw, col=rainbow(50))
histRawPlot = 'histRawPlot.png'
png(histRawPlot)
hist(data.raw, col = 1:length(sName))
legend('right',sName, lt = 1:length(sName),col = 1:length(sName), cex = 0.6)
dev.off()
for (i in 1:5) gc()
#************* Normalization Method
normalization <- function (data,method) {
switch(method,
rma = rma(data),
gcrma = gcrma(data),
mmgmos = mmgmos(data)
)
}
data.norm <- normalization(data.raw, normMethod)
# Write result into the text file
normText = 'normText.txt'
temp = exprs(data.norm)
#write.exprs(temp, normText)
write(temp, normText)
# create box plot
boxNormPlot = 'boxNormPlot.png'
png(boxNormPlot)
boxplot(data.frame(exprs(data.norm)), col=rainbow(30))
dev.off()
#class(data.rma)
#*************** Quality control PCA
library(ade4)
library(affyQCReport)
library(base)
data.pca = dudi.pca(exprs(data.norm), scannf = F, nf = length(fnames))
# produce PCA plot
pcaPlot = 'pcaPlot.png'
pc1 = data.pca$co[,2]
pc2 = data.pca$co[,3]
xl = "1. PC"
yl = "2. PC"
title = "PCA Result"
png(pcaPlot)
plot(pc1,pc2, xlab=xl,ylab=yl,main=title)
sName = sampleNames(data.norm)
points(pc1,pc2, col = 1:length(sName))
legend('topright',sName, col = 1:length(sName),pch = 1, cex = 0.6)
dev.off()
# write result column to the given file name
pcaResultFileCol = 'pcaResultCol.txt'
pcaResultFileRow = 'pcaResultRow.txt'
pcaResultFileEig = 'pcaResultEig.txt'
se = ','
# write result column to the given file name
write.table(data.pca$co, file = pcaResultFileCol, sep = se, row.names = FALSE)
#write result raw to the given file name
# create column names of comp1, comp2, ...
v = data.pca$eig
h = length (v)
comp= 'Comp'
M = 1:h
for (i in 1:h) { M[i] = as.character(i) }
colNames = paste(comp ,M)
write.table(data.pca$li, file = pcaResultFileRow, sep = se, row.names = FALSE, col.names = colNames)
#write result reigen value to the given file name
K = matrix(1:h, h,3)
#setting cel file names from row.names
K[,1] = row.names(data.pca$co)
for (i in 1:h) {K[i,2] = v[i]}
for (i in 1:h) {K[i,3] =( (v[i]*100) / sum(v))}
col = c ('','Eigen Value', ' % Percentage')
write.table(K, file = pcaResultFileEig , sep = se, row.names = FALSE,col.names = col)
#************** start ttest
library(splines)
library(limma)
library(survival)
library(splines)
library(multtest)
#*****ttest
envi = 'mouse4302GENENAME'
#group1 = c("B05.CEL","B04.CEL")
#group2 = c("A75.CEL","A74.CEL")
cl = c(rep(1,length(group1)),rep(2,length(group2)))
# limma function
limma <- function (data.norm,group1,group2,cl){
design <- model.matrix(~ -1+factor(cl))
#design <- model.matrix(~ -1+factor(c(1,1,1,2,2,2)))
colnames(design)<-c("group1","group2")
contrast.matrix = makeContrasts(group1-group2,levels = design)
#contrast.matrix <- cbind(group2vs1=c(-1,1))
fit = lmFit(exprs(data.norm)[,c(group1,group2)],design)
fit2 <-contrasts.fit(fit,contrast.matrix)
efit <- eBayes(fit2)
tops = topTable(efit,coef=1,adjust='fdr',sort.by='B',number = 50000)
affyID = tops[, 'ID']
P = tops[,'P.Value']
T = tops[,'t']
M = tops[,'M']
A= tops[,'A']
B = tops[,'B']
# result file name should be same as workflow output
result2 = 'testResult.txt'
write.table(cbind(affyID,A,M,T,P,B), file = result2,sep = ',',row.names = FALSE)
# filtering
tops = topTable(efit,coef=1,adjust="fdr",sort.by="B",number=gNum)
filter = tops[tops$P.Value <pVal & (tops$M > FCVal| tops$M < (-1*FCVal) ) ,]
#geneId = c('gene')
# annotating filterd result
num = gNum
topsFil = filter
fSize = num
lenFil = length (topsFil$ID)
if (lenFil < fSize) fSize = lenFil
if (fSize > 0) topsFil = topsFil [1: fSize,] else topsFil = NULL
if (fSize > 0) testSize = TRUE else testSize = FALSE
affyID = topsFil[, 'ID']
for (i in 1: length(affyID)) { affyID1 = c(affyID1,affyID[i]) }
P = topsFil[,'P.Value']
T = topsFil[,'t']
M = topsFil[,'M']
A= topsFil[,'A']
B = topsFil[,'B']
# result file name should be same as workflow output
result2 = 'filterResult.txt'
envi = sub(" ","",envi)
llnames = NULL
# if (exists(envi, mode='environment') ) llnames <- mget(affyID, env = mouse4302GENENAME ,ifnotfound ='NA') else llnames = NULL
GeneName <- as.character(llnames)
if (length(GeneName) >0) write.table(cbind(affyID,GeneName,A,M,T,P,B), file = result2,sep = ',',row.names = FALSE) else write.table(cbind(affyID,A,M,T,P,B), file = result2,sep = ',',row.names = FALSE)
#affyID1 = c("1418809_at","1418240_at")
return(affyID1);
}
# end of limma function
# **********mmtest function
mmtest <- function (data.norm,group1,group2,cl) {
# calculate t statistic
cl = cl -1
tValue = mt.teststat(exprs(data.norm)[,c(group1,group2)], classlabel=cl)
# calculate P value
pValue = 2*pt(-abs(tValue), df=ncol(exprs(data.norm))-2 )
# adjust P value according to Benjamini & Yekutieli
adjp = mt.rawp2adjp(pValue, proc="BY")
# order pvals
ord = order(adjp$index)
adjp.BY = adjp$adj[ord,"BY"]
# calculate fold changes
mean.group1 = apply(exprs(data.norm)[,group1], 1, mean, na.rm=T)
mean.group2 = apply(exprs(data.norm)[,group2], 1, mean, na.rm=T)
FC = 2^( mean.group2 - mean.group1 )
# writettest result into the file
size = length(FC)
AffyID = names(FC)
FoldChange = numeric(size)
for (i in 1:size) { FoldChange[i] = FC[[i]]}
Pvalue = adjp.BY
result1 = 'testResult.txt'
write.table(cbind(AffyID,Pvalue,FoldChange), file= result1,sep = ',',,row.names = FALSE)
# filtering
# fRes = which(adjp.BY<pVal & (FC >FCVal | FC< (-1*FCVal) ))
fRes = which(adjp.BY<pVal & (FC >FCVal | FC< (-1*FCVal) ))
num = gNum
size = num
lenFil = length (fRes)
if (lenFil < size) size = lenFil
if ( size > 0) fRes = fRes [1: size] else fRes = NULL
if (size > 0) testSize = TRUE else testSize = FALSE
AffyID = names(fRes)
for (i in 1: length(AffyID)) { affyID1 = c(affyID1,AffyID[i]) }
FoldChange = numeric(size)
Pvalue = numeric(size)
llnames = NULL
if ( size > 0) for (i in 1:size) { FoldChange[i] = FC[[fRes[[i]]]]; Pvalue[i] = adjp.BY[fRes[[i]]] }
result3 = 'filterResult.txt'
#if (exists(envi, mode='environment') ) llnames <- mget(AffyID, env = mouse4302GENENAME ,ifnotfound ='NA') else llnames = NULL
GeneName <- llnames
GeneName = gsub (',',' ',GeneName)
if (length(GeneName) >0) write.table(cbind(AffyID,GeneName,Pvalue,FoldChange), file= result3,sep = ',',row.names = FALSE) else write.table(cbind(AffyID,Pvalue,FoldChange), file= result3,sep = ',',row.names = FALSE)
return(affyID1);
}
# end of mmtest function
# *********pplr
pplrTest <- function (data.norm,group1,group2,cl){
library(pplr)
num = gNum
e <- NULL
se <- NULL
r <- NULL
p1e <- NULL
samples <- NULL
samples = c(group1,group2)
e <- exprs(data.norm)[,samples]
se <- se.exprs(data.norm)[,samples]
r <- bcomb(e,se,replicate = cl, method='em')
#// rCon.eval("r <- bcomb(e,se,replicate = cl, method='map')"); //quick for testing but should be em
p1e <- pplr(r,1,2)
result1 = 'testResult.txt'
write.csv (p1e,result1)
# do filter
fSize = num
topsFil = p1e[p1e$PPLR <pVal & (p1e$LRM > FCVal | p1e$LRM < (-1*FCVal)),]
lenFil = length (row.names(topsFil))
if (lenFil < fSize) fSize = lenFil
if (fSize > 0) topsFil = topsFil [1: fSize,] else topsFil = NULL
if (fSize > 0) testSize = TRUE else testSize = FALSE
affyID = row.names(topsFil)
for (i in 1: length(affyID)) { affyID1 = c(affyID1,affyID[i]) }
PPLR = topsFil[,'PPLR']
index = topsFil[,'index']
cM = topsFil[,'cM']
sM = topsFil[,'sM']
cStd = topsFil[,'cStd']
sStd = topsFil[,'sStd']
LRM= topsFil[,'LRM']
LRStd= topsFil[,'LRStd']
stat = topsFil[,'stat']
llnames = NULL
result2 = 'filterResult.txt'
##annotation file
# if (exists(envi, mode='environment') ) llnames <- mget(affyID, env = mouse4302GENENAME ,ifnotfound ='NA') else llnames = NULL
GeneName <- as.character(llnames)
if (length(GeneName) >0) write.table(cbind(affyID,GeneName,index,cM,sM,cStd,sStd,LRM,LRStd,stat,PPLR), file = result2,sep = ',',row.names = FALSE) else write.table(cbind(affyID,index,cM,sM,cStd,sStd,LRM,LRStd,stat,PPLR), file = result2,sep = ',',row.names = FALSE)
return(affyID1);
} # end of pplr function
# selecting ttest method
ttest <- function (data.norm,method) {
switch(method,
limma = limma(data.norm,group1,group2,cl), mmtest = mmtest(data.norm,group1,group2,cl), pplr = pplrTest (data.norm,group1,group2,cl))
}
#ttest(data.norm,"limma")
#if ((length(ttestMethod) > 0) affyID1 <- ttest(data.norm,ttestMethod) else affyID1 = NULL
affyID1 <- ttest(data.norm,ttestMethod)
#ttest(data.norm,"pplr")
dirPath
arrayType
normMethod
ttestMethod
gNum
pVal
FCVal
group1
group2
histRawPlot
boxNormPlot
pcaPlot
affyID1
\n
file:/E:/toolkit/human_probeset_to_pathway_0.7.xml