R/krlmm.R
3d8f4872
 krlmm <- function(cnSet, cdfName, gender=NULL, trueCalls=NULL, verbose=TRUE) {
cbfc52e5
     pkgname <- getCrlmmAnnotationName(cdfName)
3d8f4872
     
 ### pre-processing, output M    
     M = computeLogRatio(cnSet, verbose)
cbfc52e5
     message("Leaving out non-variant SNPs")
3d8f4872
     
     # For SNPs with less than 3 distinct data point, exclude them from downstream analysis
     uniqueCount = apply(M[,], 1, function(x){length(unique(x))})
     SNPtoProcessIndex = uniqueCount >= 3
     noVariantSNPIndex = uniqueCount < 3
     M = M[SNPtoProcessIndex, ]
 
     numSNP = nrow(M)
     numSample = ncol(M)
     
     calls = oligoClasses::initializeBigMatrix(name="calls", numSNP, numSample, vmode = "integer")
     scores = oligoClasses::initializeBigMatrix(name="scores", numSNP, numSample, vmode = "double")
     open(calls)
     open(scores)
 
     rownames(calls) = rownames(M)
     rownames(scores) = rownames(M)
     colnames(calls) = colnames(M)
     colnames(scores) = colnames(M)  
     
     priormeans = calculatePriorValues(M, numSNP, verbose)
b2cf0c8e
       
3d8f4872
 ### retrieve or calculate coefficients
     krlmmCoefficients = getKrlmmVGLMCoefficients(pkgname, trueCalls, VGLMparameters, verbose, numSample, colnames(M))
     
 ### do VGLM fit, to predict k for each SNP
cbfc52e5
     kPrediction <- predictKwithVGLM(VGLMparameters, krlmmCoefficients, verbose)
3d8f4872
     rm(VGLMparameters)
     
 ### assign calls
cbfc52e5
     assignCalls(calls, M, kPrediction, priormeans, numSNP, numSample, verbose)
3d8f4872
     rm(kPrediction)
 
 ### assign confidence scores
     computeCallPr(scores, M, calls, numSNP, numSample, verbose)
     
 ### add back SNPs excluded before
     AddBackNoVarianceSNPs(cnSet, calls, scores, numSNP, numSample, SNPtoProcessIndex, noVariantSNPIndex)
 
     loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname)
     YIndex <- getVarInEnv("YIndex")
     loader("annotation.rda", .crlmmPkgEnv, pkgname)
     annotation <- getVarInEnv("annot")
     
 ### impute gender if gender information not provided
     if (is.null(gender)) {
         gender = krlmmImputeGender(cnSet, annotation, YIndex, verbose)
     }
 
 ### double-check ChrY SNP cluster, impute gender if gender information not provided
     if (!(is.null(gender))) {
         verifyChrYSNPs(cnSet, M, calls, gender, annotation, YIndex, priormeans, verbose)
     }
 
     close(calls)
     close(scores)
     rm(calls)
     rm(scores)
     rm(M)
     rm(annotation)
     rm(YIndex)
     TRUE
 }
 
cbfc52e5
 krlmmnopkg <- function(cnSet, offset, gender=NULL, normalize.method=NULL, annotation=NULL, verbose=TRUE) {
     if(is.null(normalize.method)){
         message("No normalization method specified. Quantile normalization applied")
         M=quantilenormalization(cnSet,verbose,offset=offset)
     }else{
         if(normalize.method=="quantile"){
             M=quantilenormalization(cnSet,verbose,offset=offset)
         }
         if(normalize.method=="loess"){
             M=loessnormalization(cnSet,verbose,offset=offset)
 	}
     }
 
     message("Leaving out non-variant SNPs")
b2cf0c8e
     
cbfc52e5
     # For SNPs with less than 3 distinct data point, exclude them from downstream analysis
     uniqueCount = apply(M[,], 1, function(x){length(unique(x))})
     SNPtoProcessIndex = uniqueCount >= 3
     noVariantSNPIndex = uniqueCount < 3
     M = M[SNPtoProcessIndex, ]
 
     numSNP = nrow(M)
     numSample = ncol(M)
 
     calls = oligoClasses::initializeBigMatrix(name="calls", numSNP, numSample, vmode = "integer")
     scores = oligoClasses::initializeBigMatrix(name="scores", numSNP, numSample, vmode = "double")
     open(calls)
     open(scores)
 
     rownames(calls) = rownames(M)
     rownames(scores) = rownames(M)
     colnames(calls) = colnames(M)
     colnames(scores) = colnames(M)
 
     prepriormeans=calculatePriorcentersC(M,numSample)
 
     trueCalls=matrix(NA,nrow(M),ncol(M))
 
     for(i in 1:ncol(M)){
       tmp=kmeans(M[,i],centers=prepriormeans,nstart=45)
       trueCalls[,i]=tmp$cluster
     }
     colnames(trueCalls)=colnames(M)
     rownames(trueCalls)=rownames(M)
b2cf0c8e
 if(numSample<=30){
   priormeans=prepriormeans
 }else{
   priormeans = calculatePriorValues(M, numSNP, verbose)
     if((abs(priormeans[1]-prepriormeans[1])+abs(priormeans[2]-prepriormeans[2])+abs(priormeans[3]-prepriormeans[3]))>=3){
 priormeans=prepriormeans
 }else{
       priormeans=priormeans
   }
  }
  
     
cbfc52e5
     VGLMparameters = calculateParameters(M, priormeans, numSNP, verbose)
 
 ### retrieve or calculate coefficients
 
     krlmmCoefficients = getKrlmmVGLMCoefficients(trueCalls=trueCalls,params=VGLMparameters,numSample=ncol(M),samplenames=colnames(M),verbose=T)
 ### do VGLM fit, to predict k for each SNP
     kPrediction <- predictKwithVGLM(VGLMparameters, krlmmCoefficients, verbose)
     rm(VGLMparameters)
 
 ### assign calls
     assignCalls(calls, M, kPrediction, priormeans, numSNP, numSample, verbose=T)
     rm(kPrediction)
 
 ### assign confidence scores
     computeCallPr(scores, M, calls, numSNP, numSample, verbose)
 
b2cf0c8e
    YIndex = seq(1:nrow(cnSet))[chromosome(cnSet)==24 & isSnp(cnSet)]
     XIndex = seq(1:nrow(cnSet))[chromosome(cnSet)==23 & isSnp(cnSet)]
cbfc52e5
 ### impute gender if gender information not provided
     if (is.null(gender)) {
d49c9607
        if(sum(is.na(YIndex))==length(YIndex)){
 		gender=NULL
        }else{
 	gender = krlmmImputeGender(cnSet, annotation, YIndex, verbose, offset=offset)
        }
cbfc52e5
     }
 
b2cf0c8e
 ### double-check ChrY ChrX SNP cluster, impute gender if gender information not provided
cbfc52e5
     if (!(is.null(gender))) {
         verifyChrYSNPs(cnSet, M, calls, gender, annotation, YIndex, priormeans, verbose)
b2cf0c8e
         verifyChrXSNPs(cnSet, M, calls, gender, annotation, XIndex, priormeans, verbose)
cbfc52e5
     }
 
 #    if (length(YIndex)>0  && !(is.null(gender))) {
 #      verifyChrYSNPs(cnSet, M, calls, gender, annotation, YIndex, priormeans, verbose)
 #    }
 ### add back SNPs excluded before
b2cf0c8e
    AddBackNoVarianceSNPs(cnSet, calls, scores, numSNP, numSample, SNPtoProcessIndex, noVariantSNPIndex)
cbfc52e5
 
     close(calls)
     close(scores)
     rm(calls)
     rm(scores)
     rm(M)
     TRUE
 }
 
 
 #######################################################################################################
 
 loessnormalization<- function(cnSet, verbose, offset=16, blockSize = 300000){
     # compute log-ratio in blocksize of 300,000 by default
     message("Start computing log-ratios")
     A <- A(cnSet)
     B <- B(cnSet)
     open(A)
     open(B)
 
     numSNP <- nrow(A)
     numSample <- ncol(A)
b2cf0c8e
    
cbfc52e5
 
     M <- oligoClasses::initializeBigMatrix(name="M", numSNP, numSample, vmode = "double")
     S <- oligoClasses::initializeBigMatrix(name="S", numSNP, numSample, vmode = "double")
 
     rownames(M) = rownames(A)
     colnames(M) = colnames(A)
 
     numBlock = ceiling(numSNP / blockSize)
     for (i in 1:numBlock){
         if (verbose) message(" -- Processing segment ", i, " out of ", numBlock)
         subsetA = as.matrix(A[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])+offset
         subsetB = as.matrix(B[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])+offset
         subsetM = .Call("krlmmComputeM", subsetA, subsetB, PACKAGE="crlmm")
         M[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetM[1:nrow(subsetM), ]
         subsetS = .Call("krlmmComputeS", subsetA, subsetB, PACKAGE="crlmm")
         S[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetS[1:nrow(subsetS), ]
 
         rm(subsetA, subsetB, subsetM,subsetS)
 
     }
     close(A)
     close(B)
     gc()
 
     isna=is.na(M[,1])
     prepriormeansL=calculatePriorcentersC(M[,][!isna,],numSample)
     F01=abs(prepriormeansL[1])-prepriormeansL[2]   
     F02=abs(prepriormeansL[3])+prepriormeansL[2]
 
     for(i in 1:ncol(M)) {
         isna=is.na(M[,i])
         Msub = M[!isna,i]
         Mna=M[isna,i]
         Ssub = S[!isna,i]
         Sna=S[isna,i]
 
         km = kmeans(Msub, prepriormeansL)
         ind1v2 = km$cluster==1
         ind2v2 = km$cluster==2
         ind3v2 = km$cluster==3
 
         l1v2 = loessFit(Msub[ind1v2], Ssub[ind1v2])
         l2v2 = loessFit(Msub[ind2v2], Ssub[ind2v2])
         l3v2 = loessFit(Msub[ind3v2], Ssub[ind3v2])
         Msub[ind1v2] = l1v2$residuals+F01
         Msub[ind2v2] = l2v2$residuals
         Msub[ind3v2] = l3v2$residuals-F02
         Mnew=rbind(as.matrix(Msub),as.matrix(Mna))
         rownames(Mnew)=c(rownames(as.matrix(Msub)),rownames(as.matrix(Mna)))
         m=match(rownames(M),rownames(Mnew))
         Mnew=Mnew[m]
         M[,i] = Mnew
         rm(Msub, Ssub, ind1v2, ind2v2, ind3v2, l1v2, l2v2, l3v2)
     }
     return(M)
 }
 
 
 #######################################################################################################
 
 quantilenormalization <- function(cnSet, verbose, offset=16, blockSize = 300000){
     A <- A(cnSet)
     B <- B(cnSet)
     open(A)
     open(B)
 
     numSNP <- nrow(A)
     numSample <- ncol(A)
 
     M <- oligoClasses::initializeBigMatrix(name="M", numSNP, numSample, vmode = "double")
     X <- oligoClasses::initializeBigMatrix(name="X", numSNP, numSample, vmode = "integer")
     Y <- oligoClasses::initializeBigMatrix(name="Y", numSNP, numSample, vmode = "integer")
     rownames(M) = rownames(X) = rownames(Y) = rownames(A)
     colnames(M) = colnames(X) = colnames(Y) = colnames(A)
 
     # This step may chew up a lot of memory...
     X =  normalize.quantiles(as.matrix(A[,]))+offset
     Y =  normalize.quantiles(as.matrix(B[,]))+offset
 
     numBlock = ceiling(numSNP / blockSize)
     for (i in 1:numBlock){
         if (verbose) message(" -- Processing segment ", i, " out of ", numBlock)
         subsetXqws = as.matrix(X[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])
         subsetYqws = as.matrix(Y[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])
         subsetM = .Call("krlmmComputeM", subsetXqws, subsetYqws, PACKAGE="crlmm")
         M[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetM[1:nrow(subsetM), ]
         rm(subsetYqws, subsetXqws, subsetM)
     }
 
    close(A)
    close(B)
    return(M)
    delete(X)
    delete(Y)
    rm(X)
    rm(Y)
    gc()
 }
 
 
3d8f4872
 #######################################################################################################
 
 getNumberOfCores <- function(){
     defaultCores = min(detectCores(), 8)
     return(getOption("krlmm.cores", defaultCores))    
 }
 
 
 #######################################################################################################
 
cbfc52e5
 computeLogRatio <- function(cnSet, verbose=FALSE, offset=0, blockSize = 500000, col=NULL, row=NULL){
3d8f4872
     # compute log-ratio in blocksize of 500,000 by default
cbfc52e5
     if(verbose)
         message("Start computing log-ratios")
     if(!is.null(col) | !is.null(row)) {
       if(is.null(row)) {
        A <- A(cnSet)[,col]
        B <- B(cnSet)[,col]
       }
       if(is.null(col)) {
        A <- A(cnSet)[row,]
        B <- B(cnSet)[row,]
       }
       if(!is.null(col) & !is.null(row)) {
        A <- A(cnSet)[row,col]
        B <- B(cnSet)[row,col]
       }
     } else {
        A <- A(cnSet)
        B <- B(cnSet)
     }
3d8f4872
     open(A)
     open(B)
     
     numSNP <- nrow(A)
     numSample <- ncol(A)
     
     M <- oligoClasses::initializeBigMatrix(name="M", numSNP, numSample, vmode = "double")
     rownames(M) = rownames(A)
     colnames(M) = colnames(A)
   
     numBlock = ceiling(numSNP / blockSize)
     for (i in 1:numBlock){
cbfc52e5
         if(verbose) message(" -- Processing segment ", i, " out of ", numBlock)
         subsetA = as.matrix(A[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])+offset
         subsetB = as.matrix(B[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])+offset              
3d8f4872
         subsetM = .Call("krlmmComputeM", subsetA, subsetB, PACKAGE="crlmm")
         M[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetM[1:nrow(subsetM), ]
         rm(subsetA, subsetB, subsetM)
     }
     
     close(A)
     close(B)
cbfc52e5
     if(verbose)
         message("Done computing log-ratios")
3d8f4872
     return(M)    
 }
 
cbfc52e5
 computeAverageLogIntensity <- function(cnSet, verbose=FALSE, offset=0, blockSize = 500000, col=NULL, row=NULL){
3d8f4872
     # compute average log intensity in blocksize of 500,000 by default
cbfc52e5
     if(verbose)
         message("Start computing average log-intensities")
     if(!is.null(col) | !is.null(row)) {
       if(is.null(row)) {
        A <- A(cnSet)[,col]
        B <- B(cnSet)[,col]
       }
       if(is.null(col)) {
        A <- A(cnSet)[row,]
        B <- B(cnSet)[row,]
       }
       if(!is.null(col) & !is.null(row)) {
        A <- A(cnSet)[row,col]
        B <- B(cnSet)[row,col]
       }
     } else {
        A <- A(cnSet)
        B <- B(cnSet)
     }
3d8f4872
     open(A)
     open(B)
     
     numSNP <- nrow(A)
     numSample <- ncol(A)
     
     S <- oligoClasses::initializeBigMatrix(name="S", numSNP, numSample, vmode = "double")
     rownames(S) = rownames(A)
     colnames(S) = colnames(A)
       
     numBlock = ceiling(numSNP / blockSize)
     for (i in 1:numBlock){
cbfc52e5
         if(verbose) message(" -- Processing segment ", i, " out of ", numBlock)
         subsetA = as.matrix(A[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])+offset
         subsetB = as.matrix(B[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])+offset                     
3d8f4872
         subsetS = .Call("krlmmComputeS", subsetA, subsetB, PACKAGE="crlmm")
         S[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetS[1:nrow(subsetS), ]
         rm(subsetA, subsetB, subsetS)
     }
     
     close(A)
     close(B)
cbfc52e5
     if(verbose)
         message("Done computing average log-intensities")
     return(S)  
3d8f4872
 }
 
 
 #######################################################################################################
 
 calculatePriorValues <- function(M, numSNP, verbose) {
 
     calculateOneKmeans <- function(x) {
         tmp = kmeans(x, 3, nstart=45)
         return(sort(tmp$centers, decreasing = T))
     }
 
cbfc52e5
     if (verbose) message("Start calculating prior means")
3d8f4872
     cl <- makeCluster(getNumberOfCores())
     centers <- parRapply(cl, M, calculateOneKmeans)
     stopCluster(cl) 
     centers <- matrix(centers, numSNP, 3, byrow = TRUE)
     priormeans = apply(centers, 2, FUN="median", na.rm=TRUE)
cc879294
     if(abs(sum(priormeans))>1) {
       checksymmetric= apply(centers,1,function(x){abs(sum(x))})<1
1613e94f
       priormeans=apply(centers[checksymmetric,],2, FUN="median", na.rm=TRUE)
cc879294
     }
cbfc52e5
     if (verbose) message("Done calculating prior means")
3d8f4872
     return(priormeans)
 }
 
cbfc52e5
 calculatePriorcentersC <- function(M, numSample) {
 
     calculateOneKmeans <- function(x) {
         tmp = kmeans(x, 3, nstart=45)
         return(sort(tmp$centers, decreasing = T))
     }
 
     cl <- makeCluster(getNumberOfCores())
     centers <- parCapply(cl, M, calculateOneKmeans)
     stopCluster(cl) 
     centers <- matrix(centers, numSample, 3, byrow = TRUE)
     prepriormeans = apply(centers, 2, FUN="median", na.rm=TRUE)
     
     return(prepriormeans)
 }
 
 
3d8f4872
 #######################################################################################################
 
 calculateKrlmmCoefficients <- function(trueCalls, params, numSample, samplenames){
 #    if (!require(VGAM)) {
 #        message("VGAM package not found, fall back to use defined platform-specific coefficients")
 #        return(NULL)
 #    }
     amatch = match(rownames(params), rownames(trueCalls))
     amatch = amatch[!is.na(amatch)]
     trueCalls = trueCalls[amatch,]
 
     amatch = match(rownames(trueCalls), rownames(params))
     params = params[amatch, ]
     
     amatch = match(samplenames, colnames(trueCalls))
     amatch = amatch[!is.na(amatch)]    
     trueCalls = trueCalls[, amatch]
         
     if ((numSample <= 40) && (ncol(trueCalls) < round(numSample/2))){
         message("Sample size is ", numSample, ", KRLMM requires at least trueCalls of ", round(numSample/2), " samples to calculate coefficients")
         return(NULL)
     }
     if ((numSample > 40) && (ncol(trueCalls) < 20)){
         message("At least trueCalls of 20 samples are required to calculate coefficients")
         return(NULL)
     }
     if (nrow(trueCalls) < 1000){
         message("At lease trueCalls of 1000 SNPs are required to calculate coefficients")
         return(NULL)
     }
 
     getna = apply(trueCalls, 1, function(x){sum(is.na(x))>=10})
     truek = apply(trueCalls, 1, function(x){length(unique(x[!is.na(x)]))})
    
     params1 = params[!getna,]
     truek1 = truek[!getna]
 
     xx = data.frame(params1)
     t = as.factor(as.numeric(truek1)) 
     xx = data.frame(xx, t)
     fit = suppressWarnings(vglm(t~., multinomial(refLevel=1), xx))
     coe = coefficients(fit) # VGAM::
     return(coe)    
 }
 
 getKrlmmVGLMCoefficients <- function(pkgname, trueCalls, params, verbose, numSample, samplenames){
     if (!is.null(trueCalls)) {
         coe = calculateKrlmmCoefficients(trueCalls, params, numSample, samplenames)
         if (!is.null(coe)) {
             if (verbose) message ("Done calculating platform-specific coefficients to predict number of clusters")
             return(coe)
         }
     }
     if (!is.null(trueCalls)) message("Fall back to use defined platform-specific coefficients")
     if (verbose) message ("Retrieving defined platform-specific coefficients to predict number of clusters")
     loader("krlmmVGLMCoefficients.rda", .crlmmPkgEnv, pkgname)
     return(getVarInEnv("krlmmCoefficients"))      
 }
 
 
 predictKwithVGLM <- function(data, coe, verbose){
     if (verbose) message("Start predicting number of clusters")
     logit1 <- rep(0, nrow(data))
     logit2 <- coe[1]+coe[3]*data[,1]+coe[5]*data[,2]+coe[7]*data[,3]+coe[9]*data[,4]+coe[11]*data[,5]+coe[13]*data[,6]+coe[15]*data[,7]+coe[17]*data[,8]
     logit23 <- coe[2]+coe[4]*data[,1]+coe[6]*data[,2]+coe[8]*data[,3]+coe[10]*data[,4]+coe[12]*data[,5]+coe[14]*data[,6]+coe[16]*data[,7]+coe[18]*data[,8]
    
     logits <- cbind(logit1, logit2, logit23)
     rm(logit1)
     rm(logit2)
     rm(logit23)
     p.unscaled <- exp(logits)
     rm(logits)   
     p <- p.unscaled / rowSums(p.unscaled)
     clusterPrediction = apply(p, 1, function(x){which.max(x)})
     rm(p.unscaled)
     rm(p)
     if (verbose) message("Done predicting number of clusters")
     return(clusterPrediction)
 }
 
 #######################################################################################################
 
 assignCalls3Cluster <- function(intensities, priormeans){
     prior31 = c(priormeans[1]/2,priormeans[2],priormeans[3])
     prior32 = c(priormeans[1],priormeans[2],priormeans[3]/2)	
 
     emp <- rep(NA, length(intensities))
 	ansCalls <- emp
     tmp <- try(kmeans(intensities, priormeans, nstart=1),TRUE)
     if(class(tmp) == "try-error") {
         tmp1 <- try(kmeans(intensities, prior31, nstart=1),TRUE)
         if(class(tmp1) == "try-error"){
             tmp2 <- try(kmeans(intensities, prior32, nstart=1),TRUE)
             if(class(tmp2) == "try-error"){
                 ansCalls = emp
             }else{
                 ansCalls = tmp2$cluster
             }
         }else{
             ansCalls = tmp1$cluster
         }
     }else{
         ansCalls = tmp$cluster
     }
     rm(prior31, prior32)
     return(ansCalls)
  
 }
 
 #######################################################################################################
 
 assignCalls2Cluster <- function(intensities, priormeans){
     closest <- rep(NA, 3)	
     for(j in 1:3){
         distance <- as.vector(abs(priormeans[j]-intensities))
         closest[j] <- intensities[which.min(distance)]
     }
     prior2 <- priormeans[priormeans!=priormeans[which.max(abs(closest-priormeans))]]
 	
     emp <- rep(NA, length(intensities))
     ansCalls <- emp    
     tmp <- try(kmeans(intensities, prior2, nstart=1), TRUE)
     if(class(tmp) == "try-error") {
         ansCalls <- emp
     }else{
         if(prior2[1]==priormeans[2] && prior2[2]==priormeans[3]){
             mp <- abs(tmp$centers[1]-tmp$centers[2])
             pp <- abs(priormeans[2]-priormeans[3])
             if((mp/pp)<=0.25){
                 ansCalls <- emp
             }else{
                 ansCalls <- tmp$cluster+1
             }
         }
         if(prior2[1]==priormeans[1] && prior2[2]==priormeans[2]){
             mp=abs(tmp$centers[1]-tmp$centers[2])
             pp=abs(priormeans[1]-priormeans[2])
             if((mp/pp)<=0.25){
                 ansCalls = emp
             }else{
                 ansCalls <- tmp$cluster
             }
         }
         if(prior2[1]==priormeans[1] && prior2[2]==priormeans[3]){
             mp <- abs(tmp$centers[1]-tmp$centers[2])
             pp <- abs(priormeans[1]-priormeans[3])
             if ((mp/pp) <=0.25){
                 ansCalls <- emp
             }else{
                 ansCalls[tmp$cluster==1] <- 1
                 ansCalls[tmp$cluster==2] <- 3
             }
         }
     }
     rm(tmp)
     return(ansCalls)
 }
 
 #######################################################################################################
 
 assignCalls1Cluster <- function(intensities, priormeans){
     closest <- rep(NA, 3)
     for(j in 1:3){
         distance <- as.vector(abs(priormeans[j]-intensities))
         closest[j]=intensities[which.min(distance)]
     }
     clusterindex <- which.min(abs(closest-priormeans))
     return(rep(clusterindex, length(intensities)))
 }
   
 #######################################################################################################
 
 assignCallsOneSNP <- function(x, priormeans, numSample){
     tolerance = 1e-3
        
     k = x[numSample + 1]
     values = x[1:numSample]  
     
     if (abs(k - 2) <= tolerance) {
         tmp <- try(kmeans(values, priormeans, nstart=1),TRUE)
         if (!(class(tmp)=="try-error")) {
             k <- 3;
         }
     }
 
     successful <- FALSE;
     if (abs(k - 3) <= tolerance){
         SNPCalls <- assignCalls3Cluster(values, priormeans)
         successful <- !is.na(SNPCalls[1])
         if (!successful) { 
             k <- 2
         }
     }
 		
     if ( (abs(k - 2) <= tolerance) && (!successful)){
         SNPCalls <- assignCalls2Cluster(values, priormeans)
         successful <- !is.na(SNPCalls[1])
         if (!successful) { 
             k <- 1
         }			
     }
 
     if ( (abs(k - 1) <= tolerance) && (!successful)){
         SNPCalls <- assignCalls1Cluster(values, priormeans)
     }
     return(SNPCalls)
 }
 
 
 assignCalls <- function(callsGt, M, a, priormeans, numSNP, numSample, verbose, blockSize=500000){
     # process by block size of 500,000 by default
cbfc52e5
     message("Start assigning calls")
3d8f4872
        
     numBlock = ceiling(numSNP / blockSize)
 
     for (i in 1:numBlock){
         if (verbose) message(" -- Processing segment ", i, " out of ", numBlock)
         subsetM = as.matrix(M[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])
         thisnumSNP = nrow(subsetM)
         subseta = a[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP)]
         subseta = matrix(subseta, thisnumSNP, 1)
 
         testValues = cbind(subsetM, subseta)
 
         cl <- makeCluster(getNumberOfCores())
         callAns <- parRapply(cl, testValues, assignCallsOneSNP, priormeans, numSample)
         stopCluster(cl)
 
         callAnsAllValues = unlist(callAns)
 
         subsetcalls <- matrix(callAnsAllValues, thisnumSNP, numSample, byrow = TRUE)
         
         callsGt[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetcalls[1:thisnumSNP, ]
         rm(subsetM, subseta, subsetcalls)
     }
cbfc52e5
     message("Done assigning calls")
3d8f4872
 }
 
 #######################################################################################################
 
 
 calculateParameters <- function(M, priormeans, numSNP, verbose) {
cbfc52e5
     if (verbose) message("Start calculating 3-cluster parameters")
3d8f4872
     params3cluster <- calculateParameters3Cluster(M, priormeans, numSNP, verbose);
     if (verbose) message("Done calculating 3-cluster parameters")
 
     if (verbose) message("Start calculating 2-cluster parameters")
     params2cluster <- calculateParameters2Cluster(M, priormeans, numSNP, verbose);
     if (verbose) message("Done calculating 2-cluster parameters")
 
     if (verbose) message("Start calculating 1-cluster parameters")
     params1cluster <- calculateParameters1Cluster(M, priormeans, numSNP, verbose);    
     if (verbose) message("Done calculating 1-cluster parameters")
 
     parameters <- cbind(as.matrix(params1cluster$elemh), as.matrix(params1cluster$eless), as.matrix(params2cluster$elemh), as.matrix(params2cluster$eless),
                         as.matrix(params3cluster$elemh), as.matrix(params3cluster$eless), as.matrix(params2cluster$elehw), as.matrix(params3cluster$elehw));
     
     rownames(parameters) = rownames(M)
     rm(params3cluster)
     rm(params2cluster)
     rm(params1cluster)
     return(parameters)
 }
 
 hardyweinberg <- function(x, minN = 8){
    if (length(x) < minN){
        return(NA) 
     } else {
        result = .Call("krlmmHardyweinberg", x)
        return(result)
     }
 }
 
 calculateOneParams3Cluster <- function(x, priors, priorDown, priorUp){
 
     tmp = try(kmeans(x, priors, nstart=1), TRUE)
     if(class(tmp) == "kmeans") {
         flag = 1
      } else {
         tmp = try(kmeans(x, priorDown, nstart=1), TRUE)
         if(class(tmp) == "kmeans") {
             flag = 2
         } else {
             tmp = try(kmeans(x, priorUp, nstart=1), TRUE)
             if (class(tmp) == "kmeans") {
                 flag = 3
             } else {
                 tmp = kmeans(x, 3, nstart=35)
                 flag = 4
             }
         }  
     }
     ss = sum(tmp$withinss)
     hw = hardyweinberg(tmp$cluster) 
     centers = sort(tmp$centers, decreasing = TRUE)
 
     return(c(centers, ss, hw, flag))
 }
 
 calculateParameters3Cluster <- function(M, priormeans, numSNP, verbose) {
     Apriors = priormeans
     ApriorDown = c(Apriors[1]/2, Apriors[2], Apriors[3]) # shift-down
     ApriorUp = c(Apriors[1], Apriors[2], Apriors[3]/2) # shift-up
 
     cl <- makeCluster(getNumberOfCores())
     clusterEvalQ(cl, library(crlmm))
     parAns <- parRapply(cl, M, calculateOneParams3Cluster, Apriors, ApriorDown, ApriorUp)
 
     stopCluster(cl)
     parAnsAllValues = unlist(parAns)
     parameters <- matrix(parAnsAllValues, numSNP, 6, byrow = TRUE)
    
     centers <- parameters[, 1:3]
     ss <- parameters[, 4]
     hw <- parameters[, 5]
     flag <- parameters[, 6]
 
     rm(parAns)
     rm(parameters)
    
     sigma=solve(cov(centers, use="complete.obs"))
     mh = calculateMahalDist3Cluster(centers, sigma, flag, Apriors, ApriorDown, ApriorUp, numSNP)
 
     rm(sigma)
     rm(centers)
 
     gc()
     return(list(eless = ss, elemh = mh, elehw = hw))
 }
 
 
 calculateMahalDist3Cluster <- function(centers, sigma, flag, priors, priorDown, priorUp, numSNP){
     mahaldist = rep(NA, numSNP)
 
     tolerance = 1e-3
     for (i in 1:numSNP) {
         if ((abs(flag[i] - 1) <= tolerance) || (abs(flag[i]- 4) <= tolerance)) difference = centers[i, ] - priors
         if (abs(flag[i] - 2) <= tolerance) difference = centers[i, ] - priorDown
         if (abs(flag[i] - 3) <= tolerance) difference = centers[i, ] - priorUp         
         mahaldist[i] = as.vector(difference)%*%sigma%*%as.vector(difference)
     }
     return(mahaldist)
 }
 
 
 calculateOneParams2Cluster <- function(x, priors){
     aa = rep(NA, 3)   
     for(j in 1:3){
         dist = as.vector(abs(priors[j]-x))
         aa[j]=x[which.min(dist)]
     } 
     prior2 = priors[priors!=priors[which.max(abs(aa-priors))]]
      
 
     tmp = try(kmeans(x, prior2, nstart=1), TRUE)
     if (class(tmp)=="kmeans") {
         centers = tmp$centers
         hw = hardyweinberg(tmp$cluster)
     }
     rm(tmp)
     tmp = kmeans(x, 2, nstart = 35)
     if (tmp$centers[1] < tmp$centers[2]){
         centers = c(tmp$centers[2], tmp$centers[1])
         hw = hardyweinberg(3 - tmp$cluster)   
     } else {
         centers = tmp$centers
         hw = hardyweinberg(tmp$cluster)
     }
     ss = sum(tmp$withinss)     
     return(c(centers, prior2, ss, hw))
 }
 
 
 calculateParameters2Cluster <- function(M, priormeans, numSNP, verbose) {
     Apriors = priormeans
    
     cl <- makeCluster(getNumberOfCores())
     clusterEvalQ(cl, library(crlmm))
     parAns <- parRapply(cl, M, calculateOneParams2Cluster, Apriors)
     stopCluster(cl)
 
     parAnsAllValues = unlist(parAns)
     parameters <- matrix(parAnsAllValues, numSNP, 6, byrow = TRUE)
 
     centers <- parameters[, 1:2]
     priors2cluster <- parameters[, 3:4]
     ss <- parameters[, 5]
     hw <- parameters[, 6]
     
     rm(parAns)
     rm(parameters)
     sigma=solve(cov(centers, use="complete.obs"))
 
     mh = calculateMahalDist2Cluster(centers, sigma, priors2cluster, numSNP)
 
     rm(sigma)
     rm(centers)
 
     gc()
     return(list(eless = ss, elemh = mh, elehw = hw))
 }
 
 
 calculateMahalDist2Cluster <- function(centers, sigma, priors, numSNP){
     mahaldist = rep(NA, numSNP)
     
     for (i in 1:numSNP) {
         difference <- centers[i,] - priors[i,]
         mahaldist[i] = as.vector(difference)%*%sigma%*%as.vector(difference)        
     }
     return(mahaldist)
 }
 
 calculateOneParams1Cluster <- function(x, priors){
     center = mean(x)
     diff <- x - center
     diffsquare <- diff^2
     ss = sum(diffsquare)
 
     closestPrior = priors[which.min(abs(priors - center))]
 
     return(c(center, closestPrior, ss))
 }
 
 
 calculateParameters1Cluster <- function(M, priormeans, numSNP, verbose) {
     Apriors = priormeans
    
     cl <- makeCluster(getNumberOfCores())
     clusterEvalQ(cl, library(crlmm))
     parAns <- parRapply(cl, M, calculateOneParams1Cluster, Apriors)
     stopCluster(cl)
 
     parAnsAllValues = unlist(parAns)
     parameters <- matrix(parAnsAllValues, numSNP, 3, byrow = TRUE)
 
     centers <- matrix(parameters[, 1], numSNP, 1)
     prior1cluster <- matrix(parameters[, 2], numSNP, 1)
     ss <- parameters[, 3]
     
     rm(parAns)
     rm(parameters)
     sigma=solve(cov(centers, use="complete.obs"))
 
     mh = calculateMahalDist1Cluster(centers, sigma, prior1cluster, numSNP)
 
     rm(sigma)
     rm(centers)
 
     gc()
     return(list(eless = ss, elemh = mh))
 }
 
 
 calculateMahalDist1Cluster <- function(centers, sigma, priors, numSNP){
     mahaldist = rep(NA, numSNP)
     
     for(i in 1:numSNP) {
         difference <- as.vector(centers[i, 1] - priors[i, 1])
 	mahaldist[i]=difference%*%sigma%*%difference
     }
     return(mahaldist)
 }
    
 #############################################
 
 
 computeCallPr <- function(callsPr, M, calls, numSNP, numSample, verbose, blockSize = 500000){
     # compute log-ratio in blocksize of 500,000 by default
cbfc52e5
     if (verbose) message("Start computing confidence scores")
3d8f4872
     
     numBlock = ceiling(numSNP / blockSize)
     for (i in 1:numBlock){
         if (verbose) message(" -- Processing segment ", i, " out of ", numBlock)
         subsetM = as.matrix(M[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])
         subsetCalls = as.matrix(calls[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])                     
         subsetCallProb = .Call("krlmmConfidenceScore", subsetM, subsetCalls, PACKAGE="crlmm");
         callsPr[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetCallProb[1:nrow(subsetM), ]
         rm(subsetM, subsetCalls, subsetCallProb)
     }
 
cbfc52e5
     if (verbose) message("Done computing confidence scores")
3d8f4872
 }
 
 #############################################
 
 AddBackNoVarianceSNPs <- function(cnSet, calls, scores, numSNP, numSample, variantSNPIndex, noVariantSNPIndex){
     callsGt = calls(cnSet)
     callsPr = snpCallProbability(cnSet)
     open(callsGt)
     open(callsPr)
 
     callsGt[variantSNPIndex, ] = calls[,]
     callsPr[variantSNPIndex, ] = scores[,]  
     
     callsGt[noVariantSNPIndex, ] = NA
     callsPr[noVariantSNPIndex, ] = 0     
 
     close(callsGt)
     close(callsPr)    
 }
 
 #############################################
 
cbfc52e5
 krlmmImputeGender <- function(cnSet, annotation, YIndex, verbose, offset=0){
3d8f4872
     if (verbose) message("Start imputing gender")    
cbfc52e5
     S = computeAverageLogIntensity(cnSet, verbose, offset=offset) 
3d8f4872
 
     # S is calculated and saved in original SNP order. 
b2cf0c8e
     matchy = match(rownames(annotation)[YIndex], rownames(S))
3d8f4872
     matchy = matchy[!is.na(matchy)]
     if (length(matchy) <= 10){
cbfc52e5
         predictedGender = rep(NA, ncol(S))
3d8f4872
     }
     Sy = S[matchy,]
 
     uniqueDataPoint = apply(Sy, 1, function(x){length(unique(x))})
     validYSNPs = uniqueDataPoint >= 2
     SyValid = Sy[validYSNPs, ]
     
     if (nrow(SyValid) < 20){
         message("Not enough ChrY SNPs, skipping gender prediction step");
         predictedGender = rep(NA, ncol(Sy))
     }
    
     rm(S)
     numYChrSNP = nrow(SyValid)
 
     allS = matrix(NA, numYChrSNP, 2)
 
     for (i in 1:numYChrSNP) {
         tmp = kmeans(SyValid[i,] ,2, nstart=45)
         allS[i,] = sort(tmp$centers, decreasing=F)
     }
     priorS = apply(allS, 2, FUN="median", na.rm=TRUE)
 
     if (abs(priorS[1] - priorS[2]) <= 1.6) {
b2cf0c8e
         message("Separation between clusters too small (samples probabaly all the same gender): ");
       
3d8f4872
     }
     
     meanmatrix = apply(Sy, 2, median)
 
     Sy1 = abs(meanmatrix - priorS[1])
     Sy2 = abs(meanmatrix - priorS[2])
 
     # output male - 1, female - 2, female S-values are smaller than male S-values. 
     test = cbind(Sy2, Sy1)
     predictedGender = apply(test, 1, which.min)
 
     open(cnSet$gender)
     cnSet$gender[,] = predictedGender
     close(cnSet$gender)
     
     if (verbose) message("Done imputing gender")
     return(predictedGender)   
 }
 
 
 #############################################
 
 verifyChrYSNPs <- function(cnSet, M, calls, gender, annotation, YIndex, priormeans, verbose){
     if (verbose) message("Start verifying SNPs on Chromosome Y")
     callsGt = calls(cnSet)
     callsPr = snpCallProbability(cnSet)
     open(callsGt)
     open(callsPr)
        
b2cf0c8e
     matchy = match(rownames(annotation)[YIndex], rownames(M))
3d8f4872
     matchy = matchy[!is.na(matchy)]
    
     MChrY = M[matchy,]
     callsChrY = calls[matchy,]
   
     male = gender == 1
     female = gender == 2    
     
     checkK = apply(callsChrY[, male], 1, function(x){ length(unique(x[!is.na(x)])) } )
     
     for(i in 1:nrow(MChrY)){
         # Chromosome Y SNPs, no calls for female, k = 1 or 2 permitted for male samples
         thisChrYSNPorigPosition = match(rownames(callsChrY)[i], rownames(callsGt))
         callsGt[thisChrYSNPorigPosition, female] = NA
         callsPr[thisChrYSNPorigPosition, female] = 0
 
         # early exit for k = 1 or 2 cases. Only re-assign calls to male samples if we previouly assigned
         # male samples to 3 clusters by mistake. 
         if (checkK[i] < 3) next;
           
         if (class(try(kmeans(MChrY[i, male], c(priormeans[1], priormeans[3]), nstart=1), TRUE)) != "try-error"){
            
             maleSampleCalls = kmeans(MChrY[i, male],c(priormeans[1], priormeans[3]), nstart=45)$cluster
             callsGt[thisChrYSNPorigPosition, male][maleSampleCalls == 1] = 1
             callsGt[thisChrYSNPorigPosition, male][maleSampleCalls == 2] = 3
         } else {
                     
             distanceToPrior1 = mean(abs(MChrY[i, male] - priormeans[1]))
             distanceToPrior3 = mean(abs(MChrY[i, male] - priormeans[3]))                
             callsGt[thisChrYSNPorigPosition, male] = ifelse(distanceToPrior1 < distanceToPrior3, 1, 3)
         }
 
         MMaleSamples = MChrY[i, male]
         callsMaleSample = callsGt[thisChrYSNPorigPosition, male]
         scoresMaleSample = .Call("krlmmConfidenceScore", t(as.matrix(MMaleSamples)), t(as.matrix(callsMaleSample)), PACKAGE="crlmm");
 
         callsPr[thisChrYSNPorigPosition, male] = scoresMaleSample       
     }
 
     close(callsGt)
     close(callsPr)
     
     if (verbose) message("Done verifying SNPs on Chromosome Y")
 }
b2cf0c8e
 
 
 
 
 #######################################
 verifyChrXSNPs <- function(cnSet, M, calls, gender, annotation, XIndex, priormeans, verbose){
     if (verbose) message("Start verifying SNPs on Chromosome X")
     callsGt = calls(cnSet)
     callsPr = snpCallProbability(cnSet)
     open(callsGt)
     open(callsPr)
       
     matchx = match(rownames(annotation)[XIndex], rownames(M))
     matchx = matchx[!is.na(matchx)]
    
     MChrX= M[matchx,]
     callsChrX = callsGt[matchx,]
   
     male = gender == 1
     female = gender == 2    
     
     checkK = apply(callsChrX[, male], 1, function(x){ length(unique(x[!is.na(x)])) } )
     
     for(i in 1:nrow(MChrX)){
       
         # Chromosome X SNPs for male, k = 1 or 2 permitted for male samples
 thisChrXSNPorigPosition = match(rownames(callsChrX)[i], rownames(callsGt))
         # early exit for k = 1 or 2 cases. Only re-assign calls to male samples if we previouly assigned
         # male samples to 3 clusters by mistake. 
         if (checkK[i] < 3) next;
           
         if (class(try(kmeans(MChrX[i, male], c(priormeans[1], priormeans[3]), nstart=1), TRUE)) != "try-error"){
            
             maleSampleCalls = kmeans(MChrX[i, male],c(priormeans[1], priormeans[3]), nstart=45)$cluster
             callsGt[thisChrXSNPorigPosition, male][maleSampleCalls == 1] = 1
             callsGt[thisChrXSNPorigPosition, male][maleSampleCalls == 2] = 3
         } else {
                     
             distanceToPrior1 = mean(abs(MChrX[i, male] - priormeans[1]))
             distanceToPrior3 = mean(abs(MChrX[i, male] - priormeans[3]))                
             callsGt[thisChrXSNPorigPosition, male] = ifelse(distanceToPrior1 < distanceToPrior3, 1, 3)
         }
 
         MMaleSamples = MChrX[i, male]
         callsMaleSample = callsGt[thisChrXSNPorigPosition, male]
         scoresMaleSample = .Call("krlmmConfidenceScore", t(as.matrix(MMaleSamples)), t(as.matrix(callsMaleSample)), PACKAGE="crlmm");
 
        callsPr[thisChrXSNPorigPosition, male] = scoresMaleSample       
     }
 
     close(callsGt)
     close(callsPr)
     
     if (verbose) message("Done verifying SNPs on Chromosome X")
 }