####################################################
##
## Primary Researcher: Ben and Carlos
##		Washington State University
## 		Skinner labratory
##  Nimblegen Promotor array CHiP data Dataset
##  January 25 2008
##
## Bioinformatician: Matt Settles
##	Washington State University
##  Normalize by Adjusted GC content
##
## file = normalizeLoessGC.R
########
########## PREAMBLE ############

base <- ""
experimentName <- "ExpName"


setwd(base)
codePath <- "http://bioinfo-mite.crb.wsu.edu/Rcode"
rawPath <- file.path(getwd(),"Data")
figurePath <- file.path(getwd(), "Figures")
tablePath <- file.path(getwd(), "Tables")
pairPath <- file.path(getwd(),"Raw_Data_Files")
designPath <- file.path( base,"Design_Information")
probeAnnoPath <- file.path( base,"Probe_Anno")
library(limma)

targetsFile <- "TargetFile.txt"
designids <- c(designID)
ext <- ".pair"
################################################################################
### LOAD DATA FILES
#############################
### Load Raw Data ###
#############################

for (i in 1:length(designids)){
	design_id <- designids[i] 
	
	if (file.exists(file=file.path(rawPath,paste("UnmodifiedRawData",design_id,"RData",sep=".")))){
		load(file=file.path(rawPath,paste("UnmodifiedRawData",design_id,"RData",sep=".")))      
		#############################
		### Read ndf and Pos files
		#############################
		## load annotation information
		if(file.exists(file.path(probeAnnoPath,paste("ndfpos",design_id,"RData",sep=".")))){
			load(file=file.path(probeAnnoPath,paste("ndfpos",design_id,"RData",sep=".")))
		} else {
			designName <- RG$targets$DESIGN_NAME[1]
			NDF <- read.table( file.path(designPath,paste(designName,"ndf",sep=".")) , 
					sep="\t",header=TRUE,as.is=TRUE)
			POS <- read.table( file.path(designPath,paste(designName,"pos",sep=".")) , 
					sep="\t",header=TRUE,as.is=TRUE)
			NDF <- NDF[match(RG$genes$PROBE_ID,NDF$PROBE_ID),]
			NDF$CHROMOSOME <- "RANDOM"
			NDF[match(POS$PROBE_ID,NDF$PROBE_ID),"CHROMOSOME"] <- POS$CHROMOSOME
			NDF$LENGTH <- nchar(NDF$PROBE_SEQUENCE)
			NDF$COUNT <- 0
			NDF[match(POS$PROBE_ID,NDF$PROBE_ID),"COUNT"] <- POS$COUNT
			library(Biostrings)
			NDF$GC <-sapply(gregexpr("[CG]",NDF$PROBE_SEQUENCE) ,length)
			NDF$CG <-sapply(gregexpr("CG",NDF$PROBE_SEQUENCE) ,length)
			NDF <- NDF[order(NDF$CHROMOSOME,NDF$POSITION),]
			POS <- NDF
			rm(NDF)
			save(POS,file=file.path(probeAnnoPath,paste("ndfpos",design_id,"RData",sep=".")))
		}
	} else {
		
		hybes  <- readTargets(targetsFile)
		hybes <- hybes[order(hybes$DESIGN_ID,hybes$ORD_ID,hybes$DYE),]
		hybes <- hybes[which(hybes$DESIGN_ID == design_id),]
		
		
		source(file.path(base,"readNimblegen.R"))
		design_name <- hybes[,"DESIGN_NAME"][1]
		
		narray <- length(unique(hybes[,"CHIP_ID"]))
		cy3FileName <- grep("[Cc][Yy]3", hybes$DYE)
		cy5FileName <- grep("[Cc][Yy]5",  hybes$DYE)
		files <- paste(hybes[c(cy3FileName,cy5FileName),"CHIP_ID",drop=TRUE],"_",rep(c(532,635),each=narray),ext,sep="")
		files <- matrix(files, ncol = 2, byrow = FALSE)
		RG <- readPairFiles(files, verbose=TRUE, path=pairPath)
		RG$Rb <- RG$Gb <- NULL 
		spottypes <- readSpotTypes("spottypes.txt")
		RG$genes$Status <- controlStatus(spottypes, RG$genes)
		RG$genes <- RG$genes[, match(c("PROBE_ID","Status"),colnames(RG$genes))]
		RG$targets <- cbind(hybes[cy3FileName,c("ORD_ID","CHIP_ID","DESIGN_NAME","DESIGN_ID","SAMPLE_SPECIES")],
				matrix(hybes$SAMPLE_DESCRIPTION[c(cy3FileName,cy5FileName)],ncol=2,byrow=FALSE,dimnames=list(NULL,c("Cy3","Cy5"))))
		if (!all(!duplicated(RG$genes$PROBE_ID)))
			RG <- RG[-which(duplicated(RG$genes$PROBE_ID)==TRUE),]
		
		
		#############################
		### Read ndf and Pos files
		#############################
		## load annotation information
		if(file.exists(file.path(probeAnnoPath,paste("ndfpos",design_id,"RData",sep=".")))){
			load(file=file.path(probeAnnoPath,paste("ndfpos",design_id,"RData",sep=".")))
		} else {
			designName <- RG$targets$DESIGN_NAME[1]
			NDF <- read.table( file.path(designPath,paste(designName,"ndf",sep=".")) , 
					sep="\t",header=TRUE,as.is=TRUE)
			POS <- read.table( file.path(designPath,paste(designName,"pos",sep=".")) , 
					sep="\t",header=TRUE,as.is=TRUE)
			NDF <- NDF[match(RG$genes$PROBE_ID,NDF$PROBE_ID),]
			NDF$CHROMOSOME <- "RANDOM"
			NDF[match(POS$PROBE_ID,NDF$PROBE_ID),"CHROMOSOME"] <- POS$CHROMOSOME
			NDF$LENGTH <- nchar(NDF$PROBE_SEQUENCE)
			NDF$COUNT <- 0
			NDF[match(POS$PROBE_ID,NDF$PROBE_ID),"COUNT"] <- POS$COUNT
			library(Biostrings)
			NDF$GC <-sapply(gregexpr("[CG]",NDF$PROBE_SEQUENCE) ,length)
			NDF$CG <-sapply(gregexpr("CG",NDF$PROBE_SEQUENCE) ,length)
			NDF <- NDF[order(NDF$CHROMOSOME,NDF$POSITION),]
			POS <- NDF
			rm(NDF)
			save(POS,file=file.path(probeAnnoPath,paste("ndfpos",design_id,"RData",sep=".")))
		}
		RG <- RG[match(POS$PROBE_ID,RG$genes$PROBE_ID),]
		save(RG,file=file.path(rawPath,paste("UnmodifiedRawData",design_id,"RData",sep=".")))
		
		########### produce MA-plots and Histogram of RawData
		source(file.path(codePath,"maplot.R"))
		library(affy)
		narray <- ncol(RG$R)
		pdf(file=file.path(figurePath,paste(experimentName,design_id,"RawHist","pdf",sep=".")),width=7,height=10,pointsize=10)
		par(mfrow=c(2,1))
		plotDensity(cbind(log2(RG$R),log2(RG$G)),col=rep(c("red","green"),each=narray),
				lty=c(1:narray,1:narray),xlab="Log-intensity",ylab="Density",lwd=2,
				main="Channel bias in log-intensity",sub=paste("ChipSet",design_id))
		plotDensity(log2(RG$R)-log2(RG$G), lwd=2,
				xlab="Log-fold change",ylab="Density",
				main="Raw data in log-fold change, [log2(R) - log2(G)]",
				sub=paste("ChipSet",design_id))
		dev.off()
		
		for(i in 1:narray){
			png(filename = file.path(figurePath,paste(experimentName,design_id,"RawMA","array", RG$targets$CHIP_ID[i],"png",sep=".")),
					width = 7, height = 7, units = "in",res=300,
					pointsize = 14)
			maplot(RG,array=i,show.statistics=TRUE,ylim=c(-5,5),
					main=paste("Array",  RG$targets$CHIP_ID[i],"\nChipset",design_id)) ## can't use sub
			dev.off()
		}
		
	
		minE <-  floor(min(log2(cbind(RG$R,RG$G))))
		maxE <-  ceiling(max(log2(cbind(RG$R,RG$G))))
		#### Boxplots by column
		png(file=file.path(figurePath,paste(experimentName,design_id,"qcBP","png",sep=".")),
				height=10,width=7,pointsize=8,res=300,units="in")
		par(mfrow=c(3,1))
		boxplot(log2(RG$R) ~ col(RG$R),ylim=c(minE,maxE),col="red", 
				main=paste("all arrays Chipset",design_id,"Red"))
		boxplot(log2(RG$G) ~ col(RG$G),ylim=c(minE,maxE),col="green", 
				main=paste("all arrays Chipset",design_id,"Green"))
		boxplot(log2(RG$R) - log2(RG$G) ~ col(RG$R),col="blue", 
				main=paste("log-FC all arrays Chipset",design_id))
		dev.off()
		
		#### Boxplots by probelengths
		png(file=file.path(figurePath,paste(experimentName,design_id,"qcBPprobelengths","png",sep=".")),
				height=10,width=7,pointsize=8,res=300,units="in")
		par(mfrow=c(3,1))
		boxplot(log2(RG$R) ~ POS$LENGTH,ylim=c(minE,maxE),col="red", 
				main=paste("probe length all arrays Chipset",design_id,"Red"))
		boxplot(log2(RG$G) ~ POS$LENGTH,ylim=c(minE,maxE),col="green", 
				main=paste("probe length all arrays Chipset",design_id,"Green"))
		boxplot(log2(RG$R) - log2(RG$G) ~ POS$LENGTH,col="blue", 
				main=paste("probe length vs log-FC all arrays Chipset",design_id))
		dev.off()
		
		#### Boxplots by probe copy number
		png(file=file.path(figurePath,paste(experimentName,design_id,"qcBPprobecopy","png",sep=".")),
				height=10,width=7,pointsize=8,res=300,units="in")
		par(mfrow=c(3,1))
		boxplot(log2(RG$R) ~ POS$COUNT,ylim=c(minE,maxE),col="red", 
				main=paste("copy count all arrays Chipset",design_id,"Red"))
		boxplot(log2(RG$G) ~ POS$COUNT,ylim=c(minE,maxE),col="green", 
				main=paste("copy count all arrays Chipset",design_id,"Green"))
		boxplot(log2(RG$R) - log2(RG$G) ~ POS$COUNT,col="blue", 
				main=paste("probe count vs log-FC all arrays Chipset",design_id))
		dev.off()
		
		## Intensity by CpG
		png(file=file.path(figurePath,paste(experimentName,design_id,"qcBPCpG","png",sep=".")),
				height=10,width=7,pointsize=8,res=300,units="in")
		par(mfrow=c(3,1))
		boxplot(log2(RG$R) ~ POS$CG,ylim=c(minE,maxE),col="red",
				main=paste("Intensity vs GC count all arrays Chipset",design_id,"Red"))
		boxplot(log2(RG$G) ~ POS$CG,ylim=c(minE,maxE),col="green", 
				main=paste("Intensity vs GC arrays Chipset",design_id,"Green"))
		boxplot(log2(RG$R)-log2(RG$G) ~ POS$CG,col="blue", 
				main=paste("log-FC vs GC count arrays Chipset",design_id))
		dev.off()
		
		## Intensity by GC    
		png(file=file.path(figurePath,paste(experimentName,design_id,"qcBPGC","png",sep=".")),
				height=10,width=7,pointsize=8,res=300,units="in")
		par(mfrow=c(3,1))
		boxplot(log2(RG$R) ~ POS$GC,ylim=c(minE,maxE),col="red",
				main=paste("Intensity vs GC count all arrays Chipset",design_id,"Red"))
		boxplot(log2(RG$G) ~ POS$GC,ylim=c(minE,maxE),col="green", 
				main=paste("Intensity vs GC arrays Chipset",design_id,"Green"))
		boxplot(log2(RG$R)-log2(RG$G) ~ POS$GC,col="blue", 
				main=paste("log-FC vs GC count arrays Chipset",design_id))
		dev.off()
		
		corc1 <- sapply(names(table(POS$GC)),function(x) cor(as.numeric(log2(RG$R[which(POS$GC == x),])),as.numeric(log2(RG$G[which(POS$GC == x),]))))
		png(file=file.path(figurePath,paste(experimentName,design_id,"qcScatter","png",sep=".")),
				height=10,width=7,pointsize=8,res=300,units="in")
		par(mfrow=c(2,1))
		plot(log2(RG$R[which(POS$GC == 11),]),log2(RG$G[which(POS$GC == 11),]), 
				main=paste("Log Intensity for GC=11, Chipset",design_id),
				ylim=c(minE,maxE),xlim=c(minE,maxE),
				xlab="Cy5 log Intensity",ylab="Cy3 log Intensity")
		text(maxE-1,minE+1,paste("Correlation =",signif(corc1["11"],3)))
		plot(log2(RG$R[which(POS$GC == 39),]),log2(RG$G[which(POS$GC == 39),]), 
				main=paste("Log Intensity for GC=39, Chipset",design_id),
				ylim=c(minE,maxE),xlim=c(minE,maxE),
				xlab="Cy5 log Intensity",ylab="Cy3 log Intensity")
		text(maxE-1,minE+1,paste("Correlation =",signif(corc1["39"],3)))
		dev.off()
		## qc cor
		png(file=file.path(figurePath,paste(experimentName,design_id,"qcCorr","png",sep=".")),
				height=10,width=7,pointsize=8,res=300,units="in")
		plot(as.numeric(names(table(POS$GC))),corc1,type="l",main=paste("Correlation by GC count Chipset",design_id),xlab="GC count",ylab="Pearson correlation")
		dev.off()
		## qc hist
		png(file=file.path(figurePath,paste(experimentName,design_id,"qcHist","png",sep=".")),
				height=10,width=7,pointsize=8,res=300,units="in")
		par(mfrow=c(2,1))
		hist(log2(RG$G[,1]),breaks=100,
				main= paste("Green Channel, G-C bias in log-intensity (GC = 20), Chipset",design_id,"Array:",RG$targets$CHIP_ID[1]),
				xlim=c(minE,maxE),xlab="Log-intensity",ylab="Frequency")
		hist(log2(RG$G[which(POS$GC <=20),1]),
				col="green",breaks=100,add=TRUE)
		hist(log2(RG$R[,1]),breaks=100,
				main= paste("Red Channel, G-C bias in log-intensity (GC = 20), Chipset",design_id,"Array:",RG$targets$CHIP_ID[1]),
				xlim=c(minE,maxE),xlab="Log-intensity",ylab="Frequency")
		hist(log2(RG$R[which(POS$GC <=20),1]),col="red",breaks=100,add=TRUE)
		dev.off()
		
	}
	#######################################################
	## plot quantiles
	Rq <- as.data.frame(apply(log2(RG$R),2,quantile,probs = seq(0, 1, 0.001),na.rm=TRUE))
	Gq <- as.data.frame(apply(log2(RG$G),2,quantile,probs = seq(0, 1, 0.001),na.rm=TRUE))
	
	png(file=file.path(figurePath,paste(experimentName,design_id,"RvGquantiles","png",sep=".")),height=7,width=7,pointsize=8,res=300,units="in")
	matplot(Rq,Gq,type="p",col=1:ncol(RG$R),cex=0.5)
	abline(a=0,b=1)
	dev.off()
	
	### Remove Saturated Probes
	sat <- 2^(15.5)
	
	removeA1 <- c(apply(RG$R,2,function(x) which (x > sat)),
			apply(RG$G,2,function(x) which (x > sat)))
	matrix(lapply(removeA1,length),ncol=2,dimnames=list(RG$targets$CHIP_ID,c("Red","Green")))

	length(unique(unlist(removeA1)))
	duplicated <- which(POS$COUNT > 1)
	length(duplicated)

	RG$R[as.numeric((unique(c(unlist(removeA1),duplicated)))),] <- NA
	RG$G[as.numeric((unique(c(unlist(removeA1),duplicated)))),] <- NA
	## removed saturated and duplicated probes
	save(RG,file=file.path(rawPath,paste("PreprocRawData",design_id,"RData",sep=".")))
		
	
	################################################33
	# Log Values
	# MA2C
	logR <- log2(RG$R)
	logG <- log2(RG$G)
	## Adjust and GC content
	narray <- ncol(RG$R)

	min <- 2
	GCcount = table(as.factor(POS$GC))
	Rmean = apply(logR,2, function (x) tapply(x,POS$GC, mean,na.rm=TRUE))
	Gmean = apply(logG,2, function (x) tapply(x,POS$GC, mean,na.rm=TRUE))
	Rvar = apply(logR,2, function(x) tapply(x,POS$GC,var,na.rm=TRUE))
	Gvar = apply(logG,2, function(x) tapply(x,POS$GC,var,na.rm=TRUE))
	gccov = matrix(NA,nrow=length(GCcount),ncol=ncol(logR))
	for(i in 1:ncol(logR)) 
		gccov[,i] <- tapply(1:nrow(logR),POS$GC,
				function(x) cov(cbind(logR[x,i],logG[x,i]),use="pairwise.complete.obs")[1,2])
	
	GCsum <- list(GCcount=GCcount,Rmean=Rmean,Gmean=Gmean,Rvar=Rvar,Gvar=Gvar,gccov=gccov)
	
	png(file=file.path(figurePath,paste(experimentName,design_id,"meanIntvsGC","png",sep=".")),height=7,width=7,pointsize=8,res=300,units="in")
	matplot(names(GCcount),cbind(Rmean,Gmean),type = "l",col=rep(c("red","green"),each=narray),lty=rep(1:narray,times=2),main=paste("Mean expression within GC Chip",design_id))
	dev.off()
	
	## no background correction
	object <- MA.RG(RG, bc.method = "none", offset = 0)
	if (is.vector(object$M)) 
		object$M <- as.matrix(object$M)
	if (is.vector(object$A)) 
		object$A <- as.matrix(object$A)
	
	## GC correction 
	plotit=TRUE
	pdf(file=file.path(figurePath,paste(experimentName,design_id,"loesscurvesbyGC","pdf",
							sep=".")),height=7,width=7,pointsize=8)
	for (j in 1:narray) {
		if (plotit){
			plot(object$A[,j],object$M[,j],type="n",cex = 0.5,ylim=c(-5,5),xlab="A",ylab="M",main=paste("Loess Curves by GC, Chipset", design_id,", Array:",RG$targets$CHIP_ID[1]))
			abline(0, 0, col = "blue")
		}
		cat("array", j,"\tGC:")
		# look into each GC levels, may vary from 1 to 50
		for (GCp in levels(factor(POS$GC)) ) {
			cat(GCp,":")
			# pick up the probes with GC level N (N=1..50)
			spots <- which(POS$GC == as.numeric(GCp) )
			cat(length(spots),"..")
			# Remove probes with NA and see we have more than 1 probe
			if (length(na.exclude(object$M[spots,j])) > 1){
				y <- object$M[spots, j]
				x <- object$A[spots, j]
				#w <- weights[spots, j]
				lfit <- loessFit(y, x, span = 0.3, iterations = 4)
				# lfit has two list (fitted and residuals)
				# replace spot intensity M with residuals
				object$M[spots, j] <- lfit$residuals
				if(plotit){
					o <- order(x)
					# order A
					A <- x[o]
					# order M with same order
					M <- lfit$fitted[o]
					# get the probes with unique values (A)
					o <- which(!duplicated(x))
					lines(approx(A[o], M[o]), col = "red", lwd = 1, lty = 1)
				}
			} else {
				# if only one spot with the GC level, mark the probe as NA
				object$M[spots,j] <- NA
			}
		}
		cat("\n")
	}
	dev.off()
	
	
	object <- normalizeBetweenArrays(object) # A-quantile , quantile norm the A values
	
	
	#####################
	tmp <- RG.MA(object)
	## plot quantiles
	Rq <- as.data.frame(apply(log2(tmp$R),2,quantile,probs = seq(0, 1, 0.001),na.rm=TRUE))
	Gq <- as.data.frame(apply(log2(tmp$G),2,quantile,probs = seq(0, 1, 0.001),na.rm=TRUE))
	
	png(file=file.path(figurePath,paste(experimentName,design_id,"RvGpostGCquantiles","png",sep=".")),height=7,width=7,pointsize=8,res=300,units="in")
	matplot(Rq,Gq,type="p",col=1:ncol(tmp$R),cex=0.5)
	abline(a=0,b=1)
	dev.off()
	
	library(affy)
	narray <- ncol(tmp$R)
	pdf(file=file.path(figurePath,paste(experimentName,design_id,"postGCHist","pdf",sep=".")),width=7,height=10,pointsize=10)
	par(mfrow=c(2,1))
	plotDensity(cbind(log2(tmp$R),log2(tmp$G)),col=rep(c("red","green"),each=narray),
			lty=c(1:narray,1:narray),xlab="Log-intensity",ylab="Density",lwd=2,
			main="Channel bias in log-intensity",sub=paste("ChipSet",design_id))
	plotDensity(log2(tmp$R)-log2(tmp$G), lwd=2,
			xlab="Log-fold change",ylab="Density",
			main="Raw data in log-fold change, [log2(R) - log2(G)]",
			sub=paste("ChipSet",design_id))
	dev.off()
	#####################
	
	############## MA2C
	Rmean = apply(logR,2, function (x) tapply(x,POS$GC, mean,na.rm=TRUE))
	numer <- (logR - Rmean[as.character(POS$GC),]) - (logG - Gmean[as.character(POS$GC),])
	denom <- sqrt(Gvar + Rvar - 2*gccov)[as.character(POS$GC),]
	MA2C <- numer/denom  ## adjusted further for correlation
	MA2C <- scale(MA2C,center=FALSE)
	MA2CA <-  (logR + logG)/2

	save(object,MA2C,MA2CA,GCsum,file=file.path(rawPath,paste("GCNormalized",design_id,"RData",sep=".")))
	
	#maplotNormalized
	source(file.path(codePath,"maplot.R"))
	for(i in 1:narray){
		png(filename=file.path(figurePath,paste(experimentName,design_id,"GCloessMA","array",RG$targets$CHIP_ID[i],"png",sep=".")),
				width = 7, height = 7, units = "in",res=300,
				pointsize = 14)
		maplot(object,array=i,show.statistics=TRUE,ylim=c(-5,5),
				main=paste("Array ", RG$targets$CHIP_ID[i],"\nChipset",design_id)) ## can't use sub
		dev.off()
	}
	#maplotNormalized
	source(file.path(codePath,"maplot.R"))
	for(i in 1:narray){
		png(filename=file.path(figurePath,paste(experimentName,design_id,"MA2CMA","array",RG$targets$CHIP_ID[i],"png",sep=".")),
				width = 7, height = 7, units = "in",res=300,
				pointsize = 14)
		maplot(M=MA2C,object=MA2CA,array=i,show.statistics=TRUE,ylim=c(-5,5),
				main=paste("Array ", RG$targets$CHIP_ID[i],"\nChipset",design_id)) ## can't use sub
		dev.off()
	}
}