R/crlmmGT2.R
16fadf14
 crlmmGT2 <- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL,
                      col.names=NULL, probs=c(1/3, 1/3, 1/3), DF=6,
                      SNRMin=5, recallMin=10, recallRegMin=1000,
                      gender=NULL, desctrucitve=FALSE, verbose=TRUE,
                      returnParams=FALSE, badSNP=.7, snp.names){
 	pkgname <- getCrlmmAnnotationName(cdfName)
 	stopifnot(require(pkgname, character.only=TRUE, quietly=!verbose))
 	open(SNR)
 	open(A)
 	open(B)
 	open(mixtureParams)
 	## expect objects to be ff
 	keepIndex <- which( SNR[] > SNRMin)
 	if(length(keepIndex)==0) stop("No arrays above quality threshold!")
 	if(is.null(rownames(A))){
 		loader("preprocStuff.rda", .crlmmPkgEnv, pkgname)
 		gns <- getVarInEnv("gns", .crlmmPkgEnv)
 		stopifnot(nrow(A) == length(gns))
 		index <- seq(length=nrow(A))
 	}
 	if(!missing(snp.names)){
 		stopifnot(!is.null(rownames(A)))
 		##verify that A has only snps.  otherwise, calling function must pass rownames
 		index <- match(snp.names, rownames(A))
 	}
 	snpBatches <- splitIndicesByLength(index, ocProbesets())
 	NR <- length(unlist(snpBatches))
b59e8327
 	if(verbose) message("Calling ", NR, " SNPs for recalibration... ")
16fadf14
 	NC <- ncol(A)
 	##
 	if(verbose) message("Loading annotations.")
 	obj1 <- loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname)
 	obj2 <- loader("mixtureStuff.rda", .crlmmPkgEnv, pkgname)
 	## this is toget rid of the 'no visible binding' notes
 	## variable definitions
 	XIndex <- getVarInEnv("XIndex")
 	autosomeIndex <- getVarInEnv("autosomeIndex")
 	YIndex <- getVarInEnv("YIndex")
 	SMEDIAN <- getVarInEnv("SMEDIAN")
 	theKnots <- getVarInEnv("theKnots")
 	regionInfo <- getVarInEnv("regionInfo")
 	params <- getVarInEnv("params")
 	rm(list=c(obj1, obj2), envir=.crlmmPkgEnv)
 	rm(obj1, obj2)
 	##
 	## IF gender not provide, we predict
 	## FIXME: XIndex may be greater than ocProbesets()
 	if(is.null(gender)){
 		if(verbose) message("Determining gender.")
 		##    XMedian <- apply(log2(A[XIndex,, drop=FALSE])+log2(B[XIndex,, drop=FALSE]), 2, median)/2
 		XMedian <- ocLapply(splitIndicesByNode(1:NC), predictGender, theA=A, theB=B, XIndex=XIndex, neededPkgs="crlmm")
 		XMedian <- unlist(XMedian)
 		if(sum(SNR[] > SNRMin)==1){
 			gender <- which.min(c(abs(XMedian-8.9), abs(XMedian-9.5)))
 		}else{
 			gender <- kmeans(XMedian, c(min(XMedian[SNR[]>SNRMin]), max(XMedian[SNR[]>SNRMin])))[["cluster"]]
 		}
 	}
 	##
 	Indexes <- list(autosomeIndex, XIndex, YIndex)
 	cIndexes <- list(keepIndex,
 			 keepIndex[which(gender[keepIndex]==2)],
 			 keepIndex[which(gender[keepIndex]==1)])
 	## call C
 	fIndex <- which(gender==2)
 	mIndex <- which(gender==1)
 	## different here
 	## use gtypeCallerR in batches
 	##snpBatches <- splitIndicesByLength(1:nrow(A), ocProbesets())
 	newparamsBatch <- vector("list", length(snpBatches))
 	process1 <- function(idxBatch, snpBatches, autosomeIndex, XIndex,
 			     YIndex, A, B, mixtureParams, fIndex, mIndex,
 			     params, cIndexes, SMEDIAN, theKnots, DF, probs, batchSize){
 		open(A)
 		open(B)
 		open(mixtureParams)
 		snps <- snpBatches[[idxBatch]]
 		rSnps <- range(snps)
 		last <- (idxBatch-1)*batchSize
 		IndexesBatch <- list(autosomeIndex[autosomeIndex %in% snps]-last,
 				     XIndex[XIndex %in% snps]-last,
 				     YIndex[YIndex %in% snps]-last)
 		IndexesBatch <- lapply(IndexesBatch, as.integer)
 		tmpA <- as.matrix(A[snps,])
 		tmpB <- as.matrix(B[snps,])
 		## newparamsBatch[[idxBatch]]
 		tmp <- gtypeCallerR(tmpA, tmpB, fIndex, mIndex,
 				    params[["centers"]][snps,],
 				    params[["scales"]][snps,],
 				    params[["N"]][snps,],
 				    IndexesBatch, cIndexes,
 				    sapply(IndexesBatch, length),
 				    sapply(cIndexes, length), SMEDIAN,
 				    theKnots, mixtureParams[], DF, probs, 0.025)
 		rm(snps, rSnps, IndexesBatch, tmpA, tmpB, last)
 		gc(verbose=FALSE)
 		close(A)
 		close(B)
 		close(mixtureParams)
 		tmp
 	}
 	##
b59e8327
 	if(verbose) message("Calling process1")
16fadf14
 	newparamsBatch <- ocLapply(seq(along=snpBatches), process1,
 				   snpBatches=snpBatches,
 				   autosomeIndex=autosomeIndex, XIndex=XIndex,
 				   YIndex=YIndex, A=A, B=B,
 				   mixtureParams=mixtureParams, fIndex=fIndex,
 				   mIndex=mIndex, params=params,
 				   cIndexes=cIndexes, SMEDIAN=SMEDIAN,
 				   theKnots=theKnots, DF=DF, probs=probs,
 				   batchSize=ocProbesets())
 	newparams <- vector("list", 3)
 	names(newparams) <- c("centers", "scales", "N")
 	newparams[["centers"]] <- do.call("rbind", lapply(newparamsBatch, "[[", 1))
 	newparams[["scales"]] <- do.call("rbind", lapply(newparamsBatch, "[[", 2))
 	newparams[["N"]] <- do.call("rbind", lapply(newparamsBatch, "[[", 3))
 	rm(newparamsBatch); gc(verbose=FALSE)
 	if(verbose) message("Done.")
 	if(verbose) message("Estimating recalibration parameters.")
 	d <- newparams[["centers"]] - params$centers
 	##
 	##regression
 	Index <- intersect(which(pmin(newparams[["N"]][, 1],
 				      newparams[["N"]][, 2],
 				      newparams[["N"]][, 3]) > recallMin &
 				 !apply(regionInfo, 1, any)),
 			   autosomeIndex)
 	if(length(Index) < recallRegMin){
 		warning("Recalibration not possible. Possible cause: small sample size.")
 		newparams <- params
 		dev <- vector("numeric", nrow(newparams[["centers"]]))
 		SS <- matrix(Inf, 3, 3)
 		DD <- 0
 	}else{
 		data4reg <- as.data.frame(newparams[["centers"]][Index,])
 		names(data4reg) <- c("AA", "AB", "BB")
 		regParams <- cbind(  coef(lm(AA~AB*BB, data=data4reg)),
 				   c(coef(lm(AB~AA+BB, data=data4reg)), 0),
 				   coef(lm(BB~AA*AB, data=data4reg)))
 		rownames(regParams) <- c("intercept", "X", "Y", "XY")
 		rm(data4reg)
 		##
 		minN <- 3
 		newparams[["centers"]][newparams[["N"]] < minN] <- NA
 		Index <- setdiff(which(rowSums(is.na(newparams[["centers"]]))==1), YIndex)
 		if(verbose) message("Filling out empty centers", appendLF=FALSE)
 		for(i in Index){
 			if(verbose) if(i%%10000==0) message(".", appendLF=FALSE)
 			mu <- newparams[["centers"]][i, ]
 			j <- which(is.na(mu))
 			newparams[["centers"]][i, j] <- c(1, mu[-j], prod(mu[-j]))%*%regParams[, j]
 			rm(mu, j)
 		}
 		##
 		##remaing NAs are made like originals
 		if(length(YIndex)>0){
 			noMoveIndex <- union(setdiff(which(rowSums(is.na(newparams[["centers"]]))>0), YIndex),
 					     YIndex[rowSums(is.na(newparams[["centers"]][YIndex, ])>1)])
 		}
 		snps2ignore <- which(rowSums(is.na(newparams[["centers"]])) > 0)
 		snps2keep <- setdiff(autosomeIndex, snps2ignore)
 		rm(snps2ignore)
 		newparams[["centers"]][is.na(newparams[["centers"]])] <- params[["centers"]][is.na(newparams[["centers"]])]
 		if(verbose) cat("\n")
 		##
 		if(verbose) message("Calculating and standardizing size of shift... ", appendLF=FALSE)
 		GG <- DD <- newparams[["centers"]] - params[["centers"]]
 		DD <- sweep(DD, 2, colMeans(DD[autosomeIndex, ]))
 		SS <- cov(DD[autosomeIndex, ])
 		SSI <- solve(SS)
 		dev <- vector("numeric", nrow(DD))
 		if(length(YIndex)){
 			dev[-YIndex] <- apply(DD[-YIndex, ], 1, function(x) x%*%SSI%*%x)
 			dev[-YIndex] <- 1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev[-YIndex])
 			##Now Y (only two params)
 			SSY <- SS[c(1, 3), c(1, 3)]
 			SSI <- solve(SSY)
 			dev[YIndex] <- apply(DD[YIndex, c(1, 3)], 1, function(x) x%*%SSI%*%x)
 			dev[YIndex] <- 1/sqrt( (2*pi)^2*det(SSY))*exp(-0.5*dev[YIndex])
 		} else {
 			dev=apply(DD,1,function(x) x%*%SSI%*%x)
 			dev=1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev)
 		}
 	}
 	if (verbose) message("OK")
 	##
 	## BC: must keep SD
 	params[-2] <- newparams[-2]
 	rm(newparams)
 	gc(verbose=FALSE)
 	##
 	if(verbose) message("Calling ", NR, " SNPs... ", appendLF=FALSE)
 	##
 	## ###################
 	## ## MOVE TO C#######
 	##
 	## running in batches
 	process2 <- function(idxBatch, snpBatches, autosomeIndex, XIndex,
 			     YIndex, A, B, mixtureParams, fIndex, mIndex,
 			     params, cIndexes, SMEDIAN, theKnots, DF, probs,
 			     regionInfo, batchSize){
 		open(A)
 		open(B)
 		open(mixtureParams)
 		snps <- snpBatches[[idxBatch]]
 		tmpA <- as.matrix(A[snps,])
 		tmpB <- as.matrix(B[snps,])
 		rSnps <- range(snps)
 		last <- (idxBatch-1)*batchSize
 		IndexesBatch <- list(autosomeIndex[autosomeIndex %in% snps]-last,
 				     XIndex[XIndex %in% snps]-last,
 				     YIndex[YIndex %in% snps]-last)
 		IndexesBatch <- lapply(IndexesBatch, as.integer)
 		ImNull <- gtypeCallerR2(tmpA, tmpB, fIndex, mIndex,
 					params[["centers"]][snps,],
 					params[["scales"]][snps,],
 					params[["N"]][snps,],
 					IndexesBatch, cIndexes,
 					sapply(IndexesBatch, length),
 					sapply(cIndexes, length),
 					SMEDIAN, theKnots, mixtureParams[],
 					DF, probs, 0.025,
 					which(regionInfo[snps, 2]),
 					which(regionInfo[snps, 1]))
 		A[snps,] <- tmpA
 		B[snps,] <- tmpB
 		rm(tmpA, tmpB, snps, rSnps, IndexesBatch, ImNull, last)
 		gc(verbose=FALSE)
 		close(A)
 		close(B)
 		close(mixtureParams)
 	}
 	##
 	ocLapply(seq(along=snpBatches), process2, snpBatches=snpBatches,
 		 autosomeIndex=autosomeIndex, XIndex=XIndex, YIndex=YIndex,
 		 A=A, B=B, mixtureParams=mixtureParams, fIndex=fIndex,
 		 mIndex=mIndex, params=params, cIndexes=cIndexes,
 		 SMEDIAN=SMEDIAN, theKnots=theKnots, DF=DF, probs=probs,
 		 regionInfo=regionInfo, batchSize=ocProbesets())
 	##  END MOVE TO C#######
 	## ##################
 	##
 	dev <- dev/(dev+1/383)
 	if(!is.null(row.names)){ rownames(A) <- rownames(B) <- names(dev) <- row.names}
 	if(!is.null(col.names)){ colnames(A) <- colnames(B) <- col.names}
 	##
 	if(length(Index) >= recallRegMin){
 		tmp4batchQC <- DD[autosomeIndex,]/(params[["N"]][autosomeIndex,]+1)
 		tmpSnpQc <- dev[autosomeIndex]
 		SS <- cov(tmp4batchQC[tmpSnpQc < badSNP,])
 		batchQC <- mean(diag(SS))
 	}else{
 		SS <- matrix(0, 3, 3)
 		batchQC <- Inf
 	}
 	##
 	if(verbose) message("Done.")
 	if (returnParams){
 		return(list(calls=A, confs=B, SNPQC=dev, batchQC=batchQC, params=params, DD=DD, covDD=SS, gender=gender, pkgname=pkgname))
 	}else{
 		return(list(calls=A, confs=B, SNPQC=dev, batchQC=batchQC, DD=DD, covDD=SS, gender=gender, pkgname=pkgname))
 	}
 }