Name |
Type |
Description |
read_files |
rshell |
component that reads the epigenetic file and the file that was mapped on the genome Scriptsetwd(dir)
epigen_file<-read.table(epigenetic_file, header=TRUE, sep="\t")
epigen_file[,2]<-gsub("chr","",epigen_file[,2])
genes<-read.table(biomart,header=TRUE, sep="\t")
#genes_pvalues<-read.table(gene_pvals,header=TRUE)
#sep="\t"
print("filesread") R Serverlocalhost:6311 |
transform_files_to_gen_ranges |
rshell |
transform the files to genomic ranges using the R package GenomicRanges to compute overlapping regions Scriptlibrary(GenomicRanges)
#transform each file in GRanges format
#t1 <- GRanges(seqnames = genes[,gene_chr],
# ranges = IRanges(genes[,gene_start], genes[,gene_end] , names = genes$entrezgene))
t1 <- GRanges(seqnames = genes$chromosome_name, ranges = IRanges(genes$promoter_start, genes$promoter_end , names = genes$entrezgene))
t2 <- GRanges(seqnames = epigen_file[,inter_chr],ranges = IRanges(epigen_file[,inter_start], epigen_file[,inter_end]))
print("t1_t2_done")
#t2 <- GRanges(seqnames = cpg$chrom,
# ranges = IRanges(cpg$chromStart, cpg$chromEnd))
R Serverlocalhost:6311 |
calculate_overlaps |
rshell |
compute overlaps between the two genomic files Scriptov1<-findOverlaps(t1,t2,minoverlap=overlap)
print("ovdone")
genes_pvalues<-read.table(gene_expr_file,header=TRUE, sep="\t")
idx<-match(genes$entrezgene[as.matrix(ov1)[,1]],genes_pvalues$gene_id )
overlapping_genes<-genes$entrezgene[as.matrix(ov1)[,1]]
non_overlapping_genes<-genes$entrezgene[-(as.matrix(ov1)[,1])]
list_non_overlapping<-unique(non_overlapping_genes)
list_overlapping<-unique(overlapping_genes)
idx_com<-match(list_non_overlapping,list_overlapping)
idx_non_overlapping_without_overlapping_idx<-which(is.na(idx_com)==TRUE)
list_non_overlapping_new<-list_non_overlapping[idx_non_overlapping_without_overlapping_idx]
idx_overlapping<- match(list_overlapping, genes_pvalues$gene_id)
overlapping_genes_pvals<-genes_pvalues[idx_overlapping,]
overlapping_genes_pvals_out<- paste("overlapping_genes",Sys.time(),".txt",sep="")
idx_non_overlapping<- match(list_non_overlapping_new, genes_pvalues$gene_id)
non_overlapping_genes_pvals<-genes_pvalues[idx_non_overlapping,]
non_overlapping_genes_pvals_out<-paste("non_overlapping_genes",Sys.time(),".txt",sep="")
write.table(overlapping_genes_pvals,overlapping_genes_pvals_out,col.names=TRUE, row.names=FALSE, sep="\t")
write.table(non_overlapping_genes_pvals,non_overlapping_genes_pvals_out ,col.names=TRUE,row.names=FALSE,sep="\t")
R Serverlocalhost:6311 |
calculate_promoter_region |
rshell |
For each one of the genes we compute a promoter region according to the prespecified values of the variables upstream_bp and downstream_bp the transcription start site. In this component we take into account the direction that a gene is transcribed. The variable "strand" is responsible for that. Script#a<-unique(biomart$entrezgene)
setwd(workdir)
biomart<-read.table(genes,header=TRUE, sep="\t")
mat<-matrix(NA, nrow=dim(biomart)[1], ncol=10)
colnames(mat)<-c(colnames(biomart),"promoter_start","promoter_end")
for (i in 1:dim(biomart)[1] ) {
if(biomart[i,3] > 0 ) {
promoter_start<-as.numeric(biomart$transcript_start[i])-upstream
promoter_end<-as.numeric(biomart$transcript_start[i])+downstream
mat[i,]<-c(biomart[i,1],as.character(biomart[i,2]),biomart[i,3],as.character(biomart[i,4]), biomart[i,5],biomart[i,6],biomart[i,7],biomart[i,8],promoter_start,promoter_end)
}
if(biomart[i,3] < 0 ) {
promoter_start<-as.numeric(biomart$transcript_end[i])+upstream
promoter_end<-as.numeric(biomart$transcript_end[i])-downstream
mat[i,]<-c(biomart[i,1],as.character(biomart[i,2]),biomart[i,3],as.character(biomart[i,4]), biomart[i,5],biomart[i,6],biomart[i,7],biomart[i,8],promoter_end,promoter_start)
}
}
output_file<- "mapped_file_promoter_region.txt"
write.table(mat,output_file, col.names=TRUE, row.names=FALSE, sep="\t", quote=FALSE)
R Serverlocalhost:6311 |
ecdf_plot |
rshell |
this component plots the empirical cumulative distribution function of the P-values
of the two groups of genes (the ones that overlap and the ones that do not).
in black we indicate overlapping genes and in red non overlapping genes Scriptpng(filename=cn, height=400, width=400, bg="white");
plot(ecdf(as.numeric((overlapping_genes_pvals[,4]))),xlab="adjusted p-value", main="",ylab= "proportion of genes <= x", col.main="black");
lines(ecdf(as.numeric((non_overlapping_genes_pvals[,4]))), col="red")
legend("topleft", inset=.05, c("overlapping genes","nonOverlapping genes"), lwd=2, lty=c(1, 1, 1, 2), col=c("black","red"))
dev.off()
R Serverlocalhost:6311 |
ks.test |
rshell |
Using again the adjusted pvalues of the two groups of genes (overlapping genes
and non overlapping genes), we perform a kolmogorov smirnov test in order
to test for differences between the two distributions. We export the p value of
the statistical test, and the distance D between the two distributions. Script#ks.test(cpg_genes_pvals[,4],no_cpg_genes_pvals[,4])
ks_test<-ks.test(overlapping_genes_pvals[,4],non_overlapping_genes_pvals[,4])
#cn_ks_p<-ks.test(as.numeric(brain_cpgs_cn[,4]), as.numeric(brain_no_cpgs_cn[,4]))$D
ks_list<-unlist(ks_test)
ks_D<-ks_list[[1]]
ks_p<-ks_list[[2]]
R Serverlocalhost:6311 |
Comments (0)
No comments yet
Log in to make a comment