R/snprma-functions.R
6a218440
 snprma <- function(filenames, mixtureSampleSize=10^5, fitMixture=FALSE, eps=0.1, verbose=TRUE, seed=1, cdfName, sns){
2ae7850e
 	if (missing(sns)) sns <- basename(filenames)
 	##ADD CHECK TO SEE IF LOADED
 	if (missing(cdfName))
 		cdfName <- read.celfile.header(filenames[1])$cdfName
 	##  stuffDir <- changeToCrlmmAnnotationName(cdfName)
 	pkgname <- getCrlmmAnnotationName(cdfName)
 	if(!require(pkgname, character.only=TRUE, quietly=!verbose)){
 		suggCall <- paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="")
 		msg <- paste("If", pkgname, "is installed on an alternative location, please load it manually by using", suggCall)
 		message(strwrap(msg))
 		stop("Package ", pkgname, " could not be found.")
 		rm(suggCall, msg)
 	}
 	if(verbose) message("Loading annotations and mixture model parameters.")
 	loader("preprocStuff.rda", .crlmmPkgEnv, pkgname)
 	loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname)
 	loader("mixtureStuff.rda", .crlmmPkgEnv, pkgname)
 	autosomeIndex <- getVarInEnv("autosomeIndex")
 	pnsa <- getVarInEnv("pnsa")
 	pnsb <- getVarInEnv("pnsb")
 	fid <- getVarInEnv("fid")
 	reference <- getVarInEnv("reference")
 	aIndex <- getVarInEnv("aIndex")
 	bIndex <- getVarInEnv("bIndex")
 	SMEDIAN <- getVarInEnv("SMEDIAN")
 	theKnots <- getVarInEnv("theKnots")
 	gns <- getVarInEnv("gns")
6a218440
 
2ae7850e
 	##We will read each cel file, summarize, and run EM one by one
 	##We will save parameters of EM to use later
 	mixtureParams <- matrix(0, 4, length(filenames))
 	SNR <- vector("numeric", length(filenames))
 	SKW <- vector("numeric", length(filenames))
6a218440
 
2ae7850e
 	## This is the sample for the fitting of splines
 	## BC: I like better the idea of the user passing the seed,
 	##     because this might intefere with other analyses
 	##     (like what happened to GCRMA)
 	set.seed(seed)
 	idx <- sort(sample(autosomeIndex, mixtureSampleSize))
 	##S will hold (A+B)/2 and M will hold A-B
 	##NOTE: We actually dont need to save S. Only for pics etc...
 	##f is the correction. we save to avoid recomputing
 	A <- matrix(as.integer(0), length(pnsa), length(filenames))
 	B <- matrix(as.integer(0), length(pnsb), length(filenames))
6a218440
   
2ae7850e
 	if(verbose){
 		message("Processing ", length(filenames), " files.")
 		if (getRversion() > '2.7.0') pb <- txtProgressBar(min=0, max=length(filenames), style=3)
 	}
 	##We start looping throug cel files
 	idx2 <- sample(length(fid), 10^5) ##for skewness. no need to do everything
 	for(i in seq(along=filenames)){
 		y <- as.matrix(read.celfile(filenames[i], intensity.means.only=TRUE)[["INTENSITY"]][["MEAN"]][fid])
 		x <- log2(y[idx2])
 		SKW[i] <- mean((x-mean(x))^3)/(sd(x)^3)
 		rm(x)
 		y <- normalize.quantiles.use.target(y, target=reference)
 		A[, i] <- intMedianSummaries(y[aIndex, 1, drop=FALSE], pnsa)
 		B[, i] <- intMedianSummaries(y[bIndex, 1, drop=FALSE], pnsb)
 		##Now to fit the EM
 		if(fitMixture){
 			S <- (log2(A[idx, i])+log2(B[idx, i]))/2 - SMEDIAN
 			M <- log2(A[idx, i])-log2(B[idx, i])
6a218440
       
2ae7850e
 			##we need to test the choice of eps.. it is not the max diff between funcs
 			tmp <- fitAffySnpMixture56(S, M, theKnots, eps=eps)
 			
 			mixtureParams[, i] <- tmp[["coef"]]
 			SNR[i] <- tmp[["medF1"]]^2/(tmp[["sigma1"]]^2+tmp[["sigma2"]]^2)
 		}
 		if (verbose)
 			if (getRversion() > '2.7.0') setTxtProgressBar(pb, i)
 			else cat(".")
 	}
 	if (verbose)
 		if (getRversion() > '2.7.0') close(pb)
 		else cat("\n")
 	if (!fitMixture) SNR <- mixtureParams <- NA
 	## gns comes from preprocStuff.rda
 	list(A=A, B=B, sns=sns, gns=gns, SNR=SNR, SKW=SKW, mixtureParams=mixtureParams, cdfName=cdfName)
6a218440
 }
3620e6b1
 
 fitAffySnpMixture56 <- function(S, M, knots, probs=rep(1/3, 3), eps=.01, maxit=10, verbose=FALSE){
   ##56 stands for 5 and 6 arrays but will also work for Illumina
   ##Note the unfortunate choice of numbering:
   ##1 is BB, 2 AB, and 3 AA. Opposite to everything else!
   ##this is legacy code I decided not to change.
   ## this why at the end we report -coefs: F1 is the negative f
   mus <- append(quantile(M, c(1, 5)/6, names=FALSE), 0, 1)
   sigmas <- rep(mad(c(M[M<mus[1]]-mus[1], M[M>mus[3]]-mus[3])), 3)
   sigmas[2] <- sigmas[2]/2
  
   weights <- apply(cbind(mus, sigmas), 1, function(p) dnorm(M, p[1], p[2]))
   previousF1 <- -Inf
   change <- eps+1
   it <- 0
  
   if(verbose) message("Max change must be under ", eps, ".")
   matS <- stupidSplineBasis(S, knots)
   while (change > eps & it < maxit){
     it <- it+1
     ## E
     z <- sweep(weights, 2, probs, "*")
     LogLik <- rowSums(z)
     z <- sweep(z, 1, LogLik, "/")
     probs <- colMeans(z)
  
     ## M
     fit1 <- crossprod(chol2inv(chol(crossprod(sweep(matS, 1, z[, 1], FUN="*"), matS))), crossprod(matS, z[, 1]*M))
  
     fit2 <- sum(z[, 2]*M)/sum(z[, 2])
     F1 <- matS%*%fit1
     sigmas[c(1, 3)] <- sqrt(sum(z[, 1]*(M-F1)^2)/sum(z[, 1]))
     sigmas[2] <- sqrt(sum(z[, 2]*(M-fit2)^2)/sum(z[, 2]))
  
     weights[, 1] <- dnorm(M, F1, sigmas[1])
     weights[, 2] <- dnorm(M, fit2, sigmas[2])
     weights[, 3] <- dnorm(M, -F1, sigmas[3])
     
     change <- max(abs(F1-previousF1))
     previousF1 <- F1
     if(verbose) message("Iter ", it, ": ", change, ".")
   }
   medF1 <- median(-F1)
  return(list(coef= -fit1, medF1=medF1, sigma1=sigmas[1], sigma2=sigmas[2]))
 }