#################################################### ## ## Primary Researcher: Ben and Carlos ## Washington State University ## Skinner labratory ## Nimblegen Promotor array CHiP data Dataset ## January 25 2008 ## ## Bioinformatician: Matt Settles ## Washington State University ## ## file = ComputeDiff.R ## Modified by Md Haque ########## PREAMBLE ############ base <- "" setwd(base) rawPath <- file.path(getwd(),"Data") figurePath <- file.path(getwd(), "Figures") tablePath <- file.path(getwd(), "Tables") pairPath <- file.path(getwd(),"Raw_Data_Files") designPath <- file.path( base,"Design_Information") probeAnnoPath <- file.path( base,"Probe_Anno") library(limma) library(Biobase) designids <- c(designID) allChr <- paste("chr",c(1:20,"1_random","2_random","3_random","4_random", "5_random","6_random","7_random","8_random", "9_random","11_random","12_random","15_random", "16_random","19_random","20_random","Un","X","X_random"),sep="") maxp <- 1e-5 for (i in 1:length(designids)){ library(limma) design_id <- designids[i] load(file=file.path(probeAnnoPath,paste("ndfpos",design_id,"RData",sep="."))) load(file=file.path(rawPath,paste("GCNormSmoothed",design_id,"RData",sep="."))) rowM <- rowMedians(smooth300.gc$M) rowA <- rowMedians(smooth300.gc$A) sdA <- apply(smooth300.gc$A,1,sd,na.rm=TRUE) meanM <- mean(rowM,na.rm=TRUE);sdM <- sd(rowM,na.rm=TRUE) rawp <- 2*pnorm(abs(rowM-meanM),mean=0,sd=sdM,lower.tail=FALSE) res <- data.frame(M=rowM,A=rowA,sdA = sdA,t=NA,rawp=rawp,row.names=rownames(smooth300.gc)) write.table(res,file=file.path(tablePath,paste("tresults",design_id,"csv",sep=".")),sep=",",row.names=TRUE,col.names=TRUE) probes <- res[which(res$rawp < maxp),] probes <- data.frame(probes,POS[match(rownames(probes),POS$PROBE_ID),c("CHROMOSOME","POSITION")]) probes$POSITION <- probes$POSITION + ceiling(POS[match(rownames(probes),POS$PROBE_ID),c("LENGTH")]/2) probes$Dir <- cut(probes$M, c(-10000,0 ,10000), labels = c("Cont","Vinc")) probes$rank= floor(-log10(probes$rawp)) probes$cluster <- 1 probes <- probes[order(probes$CHROMOSOME,as.numeric(as.character(probes$POSITION))),] for (i in 2:nrow(probes)){ if ( probes[i,"CHROMOSOME"] == probes[i-1,"CHROMOSOME"]){ if( (as.numeric(as.character(probes[i,"POSITION"])) - 300) < (as.numeric(as.character(probes[i-1,"POSITION"])) + 300) & (probes[i,"Dir"] == probes[i-1,"Dir"])){ probes[i,"cluster"] <- probes[i-1,"cluster"] } else { probes[i,"cluster"] <- probes[i-1,"cluster"] + 1 } } else { probes[i,"cluster"] <- probes[(i-1),"cluster"] + 1 } } clusterID <- tapply(1:length(probes$cluster),probes$cluster,function(x) paste(probes[["CHROMOSOME"]][x][1],":",min(probes[["POSITION"]][x])-300,"-", max(probes[["POSITION"]][x]+300),sep="")) probes$clusterloc <- clusterID[probes$cluster] #################### Clusters ##################################### library(BSgenome) library("BSgenome.Rnorvegicus.UCSC.rn4") sregions <- split(probes,probes$cluster) regions <- sapply(sregions, function(x) { z <- c(ClusterID=x[1,"clusterloc"],Chromosome = x[1,"CHROMOSOME"],cSTART = min(x$POSITION)-300, cSTOP=max(x$POSITION)+300, meanM = mean(x$M),meanA = mean(x$A),minP = min(x$rawp),nprobes = nrow(x)) a <- c(Score = max(x$rank)) t(c(z,a)) }) regions <- as.data.frame(t(regions)) colnames(regions) <- c("ClusterID","Chromosome","cSTART","cSTOP","meanM","meanA","minP","nProbes","Score") Seq <- apply(regions[,c("Chromosome","cSTART","cSTOP")],1,function(x) as.character(getSeq(Rnorvegicus, x[1], start=as.integer(x[2]), end=as.integer(x[3]),as.character=FALSE))) base <- alphabetFrequency(DNAStringSet(Seq),baseOnly=TRUE) regions$Length <- nchar(Seq) regions$GC <- rowSums(base[,2:3]) regions$CpG <- dinucleotideFrequency(DNAStringSet(Seq))[,"CG"] regions$CpGdensity <- (as.numeric(as.character(regions$CpG)) / ( as.numeric(as.character(regions$cSTOP)) - as.numeric(as.character(regions$cSTART)) )) regions$Seq <- Seq regions <- regions[which(regions$CpGdensity >= 0.01),] regions <- regions[order(regions$meanA),] regions <- regions[which(as.numeric(as.character(regions$meanA)) > 9.5),] write.table(probes,file=file.path(tablePath,paste("ProbeResults",design_id,"csv",sep=".")),sep=",",row.names=TRUE,col.names=TRUE) write.table(regions,file=file.path(tablePath,paste("RegionsResults",design_id,"csv",sep=".")),sep=",",row.names=TRUE,col.names=TRUE,quote=FALSE) } for (i in 1:length(designids)){ design_id <- designids[i] tmp <- read.table(file=file.path(tablePath,paste("RegionsResults",design_id,"csv",sep=".")),sep=",") if (i == 1) cresults <- tmp else cresults <- rbind(cresults,tmp) } write.table(cresults,file=file.path(tablePath,paste("RegionsCombined","csv",sep=".")),sep=",",row.names=TRUE,col.names=TRUE,quote=FALSE)