R/krlmm.R
3d8f4872
 krlmm <- function(cnSet, cdfName, gender=NULL, trueCalls=NULL, verbose=TRUE) {
   
     pkgname <- getCrlmmAnnotationName(cdfName) # crlmm:::
     
 ### pre-processing, output M    
     M = computeLogRatio(cnSet, verbose)
     message("leaving out novariant SNPs")
     
     # For SNPs with less than 3 distinct data point, exclude them from downstream analysis
     uniqueCount = apply(M[,], 1, function(x){length(unique(x))})
     SNPtoProcessIndex = uniqueCount >= 3
     noVariantSNPIndex = uniqueCount < 3
     M = M[SNPtoProcessIndex, ]
 
     numSNP = nrow(M)
     numSample = ncol(M)
     
     calls = oligoClasses::initializeBigMatrix(name="calls", numSNP, numSample, vmode = "integer")
     scores = oligoClasses::initializeBigMatrix(name="scores", numSNP, numSample, vmode = "double")
     open(calls)
     open(scores)
 
     rownames(calls) = rownames(M)
     rownames(scores) = rownames(M)
     colnames(calls) = colnames(M)
     colnames(scores) = colnames(M)  
     
     priormeans = calculatePriorValues(M, numSNP, verbose)
     VGLMparameters = calculateParameters(M, priormeans, numSNP, verbose)
   
 ### retrieve or calculate coefficients
     krlmmCoefficients = getKrlmmVGLMCoefficients(pkgname, trueCalls, VGLMparameters, verbose, numSample, colnames(M))
     
 ### do VGLM fit, to predict k for each SNP
     kPrediction <- predictKwithVGLM(VGLMparameters, krlmmCoefficients, verbose);
     rm(VGLMparameters)
     
 ### assign calls
     assignCalls(calls, M, kPrediction, priormeans, numSNP, numSample, verbose);
     rm(kPrediction)
 
 ### assign confidence scores
     computeCallPr(scores, M, calls, numSNP, numSample, verbose)
     
 ### add back SNPs excluded before
     AddBackNoVarianceSNPs(cnSet, calls, scores, numSNP, numSample, SNPtoProcessIndex, noVariantSNPIndex)
 
     loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname)
     YIndex <- getVarInEnv("YIndex")
     loader("annotation.rda", .crlmmPkgEnv, pkgname)
     annotation <- getVarInEnv("annot")
     
 ### impute gender if gender information not provided
     if (is.null(gender)) {
         gender = krlmmImputeGender(cnSet, annotation, YIndex, verbose)
     }
 
 ### double-check ChrY SNP cluster, impute gender if gender information not provided
     if (!(is.null(gender))) {
         verifyChrYSNPs(cnSet, M, calls, gender, annotation, YIndex, priormeans, verbose)
     }
 
     close(calls)
     close(scores)
     rm(calls)
     rm(scores)
     rm(M)
     rm(annotation)
     rm(YIndex)
     TRUE
 }
 
 #######################################################################################################
 
 getNumberOfCores <- function(){
     defaultCores = min(detectCores(), 8)
     return(getOption("krlmm.cores", defaultCores))    
 }
 
 
 #######################################################################################################
 
 computeLogRatio <- function(cnSet, verbose, blockSize = 500000){
     # compute log-ratio in blocksize of 500,000 by default
     message("Start computing log ratio")
     A <- A(cnSet)
     B <- B(cnSet)
     open(A)
     open(B)
     
     numSNP <- nrow(A)
     numSample <- ncol(A)
     
     M <- oligoClasses::initializeBigMatrix(name="M", numSNP, numSample, vmode = "double")
     rownames(M) = rownames(A)
     colnames(M) = colnames(A)
   
     numBlock = ceiling(numSNP / blockSize)
     for (i in 1:numBlock){
         if (verbose) message(" -- Processing segment ", i, " out of ", numBlock)
         subsetA = as.matrix(A[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])
         subsetB = as.matrix(B[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])                     
         subsetM = .Call("krlmmComputeM", subsetA, subsetB, PACKAGE="crlmm")
         M[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetM[1:nrow(subsetM), ]
         rm(subsetA, subsetB, subsetM)
     }
     
     close(A)
     close(B)
     if (verbose) message("Done computing log ratio")
     return(M)    
 }
 
 computeAverageLogIntensity <- function(cnSet, verbose, blockSize = 500000){
     # compute average log intensity in blocksize of 500,000 by default
     message("Start computing average log intensity")
     A <- A(cnSet)
     B <- B(cnSet)
     open(A)
     open(B)
     
     numSNP <- nrow(A)
     numSample <- ncol(A)
     
     S <- oligoClasses::initializeBigMatrix(name="S", numSNP, numSample, vmode = "double")
     rownames(S) = rownames(A)
     colnames(S) = colnames(A)
       
     numBlock = ceiling(numSNP / blockSize)
     for (i in 1:numBlock){
         if (verbose) message(" -- Processing segment ", i, " out of ", numBlock)
         subsetA = as.matrix(A[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])
         subsetB = as.matrix(B[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])                     
         subsetS = .Call("krlmmComputeS", subsetA, subsetB, PACKAGE="crlmm")
         S[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetS[1:nrow(subsetS), ]
         rm(subsetA, subsetB, subsetS)
     }
     
     close(A)
     close(B)
     if (verbose) message("Done computing average log intensity")
     return(S)
   
 }
 
 
 #######################################################################################################
 
 calculatePriorValues <- function(M, numSNP, verbose) {
 
     calculateOneKmeans <- function(x) {
         tmp = kmeans(x, 3, nstart=45)
         return(sort(tmp$centers, decreasing = T))
     }
 
     if (verbose) message("Start calculating Prior Means")
     cl <- makeCluster(getNumberOfCores())
     centers <- parRapply(cl, M, calculateOneKmeans)
     stopCluster(cl) 
     centers <- matrix(centers, numSNP, 3, byrow = TRUE)
     priormeans = apply(centers, 2, FUN="median", na.rm=TRUE)
cc879294
     if(abs(sum(priormeans))>1) {
       checksymmetric= apply(centers,1,function(x){abs(sum(x))})<1
1613e94f
       priormeans=apply(centers[checksymmetric,],2, FUN="median", na.rm=TRUE)
cc879294
     }
3d8f4872
     if (verbose) message("Done calculating Prior Means")
     return(priormeans)
 }
 
 #######################################################################################################
 
 calculateKrlmmCoefficients <- function(trueCalls, params, numSample, samplenames){
 #    if (!require(VGAM)) {
 #        message("VGAM package not found, fall back to use defined platform-specific coefficients")
 #        return(NULL)
 #    }
     amatch = match(rownames(params), rownames(trueCalls))
     amatch = amatch[!is.na(amatch)]
     trueCalls = trueCalls[amatch,]
 
     amatch = match(rownames(trueCalls), rownames(params))
     params = params[amatch, ]
     
     amatch = match(samplenames, colnames(trueCalls))
     amatch = amatch[!is.na(amatch)]    
     trueCalls = trueCalls[, amatch]
         
     if ((numSample <= 40) && (ncol(trueCalls) < round(numSample/2))){
         message("Sample size is ", numSample, ", KRLMM requires at least trueCalls of ", round(numSample/2), " samples to calculate coefficients")
         return(NULL)
     }
     if ((numSample > 40) && (ncol(trueCalls) < 20)){
         message("At least trueCalls of 20 samples are required to calculate coefficients")
         return(NULL)
     }
     if (nrow(trueCalls) < 1000){
         message("At lease trueCalls of 1000 SNPs are required to calculate coefficients")
         return(NULL)
     }
 
     getna = apply(trueCalls, 1, function(x){sum(is.na(x))>=10})
     truek = apply(trueCalls, 1, function(x){length(unique(x[!is.na(x)]))})
    
     params1 = params[!getna,]
     truek1 = truek[!getna]
 
     xx = data.frame(params1)
     t = as.factor(as.numeric(truek1)) 
     xx = data.frame(xx, t)
     fit = suppressWarnings(vglm(t~., multinomial(refLevel=1), xx))
     coe = coefficients(fit) # VGAM::
     return(coe)    
 }
 
 getKrlmmVGLMCoefficients <- function(pkgname, trueCalls, params, verbose, numSample, samplenames){
     if (!is.null(trueCalls)) {
         coe = calculateKrlmmCoefficients(trueCalls, params, numSample, samplenames)
         if (!is.null(coe)) {
             if (verbose) message ("Done calculating platform-specific coefficients to predict number of clusters")
             return(coe)
         }
     }
     if (!is.null(trueCalls)) message("Fall back to use defined platform-specific coefficients")
     if (verbose) message ("Retrieving defined platform-specific coefficients to predict number of clusters")
     loader("krlmmVGLMCoefficients.rda", .crlmmPkgEnv, pkgname)
     return(getVarInEnv("krlmmCoefficients"))      
 }
 
 
 predictKwithVGLM <- function(data, coe, verbose){
     if (verbose) message("Start predicting number of clusters")
     logit1 <- rep(0, nrow(data))
     logit2 <- coe[1]+coe[3]*data[,1]+coe[5]*data[,2]+coe[7]*data[,3]+coe[9]*data[,4]+coe[11]*data[,5]+coe[13]*data[,6]+coe[15]*data[,7]+coe[17]*data[,8]
     logit23 <- coe[2]+coe[4]*data[,1]+coe[6]*data[,2]+coe[8]*data[,3]+coe[10]*data[,4]+coe[12]*data[,5]+coe[14]*data[,6]+coe[16]*data[,7]+coe[18]*data[,8]
    
     logits <- cbind(logit1, logit2, logit23)
     rm(logit1)
     rm(logit2)
     rm(logit23)
     p.unscaled <- exp(logits)
     rm(logits)   
     p <- p.unscaled / rowSums(p.unscaled)
     clusterPrediction = apply(p, 1, function(x){which.max(x)})
     rm(p.unscaled)
     rm(p)
     if (verbose) message("Done predicting number of clusters")
     return(clusterPrediction)
 }
 
 #######################################################################################################
 
 assignCalls3Cluster <- function(intensities, priormeans){
     prior31 = c(priormeans[1]/2,priormeans[2],priormeans[3])
     prior32 = c(priormeans[1],priormeans[2],priormeans[3]/2)	
 
     emp <- rep(NA, length(intensities))
 	ansCalls <- emp
     tmp <- try(kmeans(intensities, priormeans, nstart=1),TRUE)
     if(class(tmp) == "try-error") {
         tmp1 <- try(kmeans(intensities, prior31, nstart=1),TRUE)
         if(class(tmp1) == "try-error"){
             tmp2 <- try(kmeans(intensities, prior32, nstart=1),TRUE)
             if(class(tmp2) == "try-error"){
                 ansCalls = emp
             }else{
                 ansCalls = tmp2$cluster
             }
         }else{
             ansCalls = tmp1$cluster
         }
     }else{
         ansCalls = tmp$cluster
     }
     rm(prior31, prior32)
     return(ansCalls)
  
 }
 
 #######################################################################################################
 
 assignCalls2Cluster <- function(intensities, priormeans){
     closest <- rep(NA, 3)	
     for(j in 1:3){
         distance <- as.vector(abs(priormeans[j]-intensities))
         closest[j] <- intensities[which.min(distance)]
     }
     prior2 <- priormeans[priormeans!=priormeans[which.max(abs(closest-priormeans))]]
 	
     emp <- rep(NA, length(intensities))
     ansCalls <- emp    
     tmp <- try(kmeans(intensities, prior2, nstart=1), TRUE)
     if(class(tmp) == "try-error") {
         ansCalls <- emp
     }else{
         if(prior2[1]==priormeans[2] && prior2[2]==priormeans[3]){
             mp <- abs(tmp$centers[1]-tmp$centers[2])
             pp <- abs(priormeans[2]-priormeans[3])
             if((mp/pp)<=0.25){
                 ansCalls <- emp
             }else{
                 ansCalls <- tmp$cluster+1
             }
         }
         if(prior2[1]==priormeans[1] && prior2[2]==priormeans[2]){
             mp=abs(tmp$centers[1]-tmp$centers[2])
             pp=abs(priormeans[1]-priormeans[2])
             if((mp/pp)<=0.25){
                 ansCalls = emp
             }else{
                 ansCalls <- tmp$cluster
             }
         }
         if(prior2[1]==priormeans[1] && prior2[2]==priormeans[3]){
             mp <- abs(tmp$centers[1]-tmp$centers[2])
             pp <- abs(priormeans[1]-priormeans[3])
             if ((mp/pp) <=0.25){
                 ansCalls <- emp
             }else{
                 ansCalls[tmp$cluster==1] <- 1
                 ansCalls[tmp$cluster==2] <- 3
             }
         }
     }
     rm(tmp)
     return(ansCalls)
 }
 
 #######################################################################################################
 
 assignCalls1Cluster <- function(intensities, priormeans){
     closest <- rep(NA, 3)
     for(j in 1:3){
         distance <- as.vector(abs(priormeans[j]-intensities))
         closest[j]=intensities[which.min(distance)]
     }
     clusterindex <- which.min(abs(closest-priormeans))
     return(rep(clusterindex, length(intensities)))
 }
   
 #######################################################################################################
 
 assignCallsOneSNP <- function(x, priormeans, numSample){
     tolerance = 1e-3
        
     k = x[numSample + 1]
     values = x[1:numSample]  
     
     if (abs(k - 2) <= tolerance) {
         tmp <- try(kmeans(values, priormeans, nstart=1),TRUE)
         if (!(class(tmp)=="try-error")) {
             k <- 3;
         }
     }
 
     successful <- FALSE;
     if (abs(k - 3) <= tolerance){
         SNPCalls <- assignCalls3Cluster(values, priormeans)
         successful <- !is.na(SNPCalls[1])
         if (!successful) { 
             k <- 2
         }
     }
 		
     if ( (abs(k - 2) <= tolerance) && (!successful)){
         SNPCalls <- assignCalls2Cluster(values, priormeans)
         successful <- !is.na(SNPCalls[1])
         if (!successful) { 
             k <- 1
         }			
     }
 
     if ( (abs(k - 1) <= tolerance) && (!successful)){
         SNPCalls <- assignCalls1Cluster(values, priormeans)
     }
     return(SNPCalls)
 }
 
 
 assignCalls <- function(callsGt, M, a, priormeans, numSNP, numSample, verbose, blockSize=500000){
     # process by block size of 500,000 by default
     message("Start assign calls")
        
     numBlock = ceiling(numSNP / blockSize)
 
     for (i in 1:numBlock){
         if (verbose) message(" -- Processing segment ", i, " out of ", numBlock)
         subsetM = as.matrix(M[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])
         thisnumSNP = nrow(subsetM)
         subseta = a[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP)]
         subseta = matrix(subseta, thisnumSNP, 1)
 
         testValues = cbind(subsetM, subseta)
 
         cl <- makeCluster(getNumberOfCores())
         callAns <- parRapply(cl, testValues, assignCallsOneSNP, priormeans, numSample)
         stopCluster(cl)
 
         callAnsAllValues = unlist(callAns)
 
         subsetcalls <- matrix(callAnsAllValues, thisnumSNP, numSample, byrow = TRUE)
         
         callsGt[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetcalls[1:thisnumSNP, ]
         rm(subsetM, subseta, subsetcalls)
     }
     message("Done assign calls")
 }
 
 #######################################################################################################
 
 
 calculateParameters <- function(M, priormeans, numSNP, verbose) {
     if (verbose) message("Start calculating 3-clusters parameters")
     params3cluster <- calculateParameters3Cluster(M, priormeans, numSNP, verbose);
     if (verbose) message("Done calculating 3-cluster parameters")
 
     if (verbose) message("Start calculating 2-cluster parameters")
     params2cluster <- calculateParameters2Cluster(M, priormeans, numSNP, verbose);
     if (verbose) message("Done calculating 2-cluster parameters")
 
     if (verbose) message("Start calculating 1-cluster parameters")
     params1cluster <- calculateParameters1Cluster(M, priormeans, numSNP, verbose);    
     if (verbose) message("Done calculating 1-cluster parameters")
 
     parameters <- cbind(as.matrix(params1cluster$elemh), as.matrix(params1cluster$eless), as.matrix(params2cluster$elemh), as.matrix(params2cluster$eless),
                         as.matrix(params3cluster$elemh), as.matrix(params3cluster$eless), as.matrix(params2cluster$elehw), as.matrix(params3cluster$elehw));
     
     rownames(parameters) = rownames(M)
     rm(params3cluster)
     rm(params2cluster)
     rm(params1cluster)
     return(parameters)
 }
 
 hardyweinberg <- function(x, minN = 8){
    if (length(x) < minN){
        return(NA) 
     } else {
        result = .Call("krlmmHardyweinberg", x)
        return(result)
     }
 }
 
 calculateOneParams3Cluster <- function(x, priors, priorDown, priorUp){
 
     tmp = try(kmeans(x, priors, nstart=1), TRUE)
     if(class(tmp) == "kmeans") {
         flag = 1
      } else {
         tmp = try(kmeans(x, priorDown, nstart=1), TRUE)
         if(class(tmp) == "kmeans") {
             flag = 2
         } else {
             tmp = try(kmeans(x, priorUp, nstart=1), TRUE)
             if (class(tmp) == "kmeans") {
                 flag = 3
             } else {
                 tmp = kmeans(x, 3, nstart=35)
                 flag = 4
             }
         }  
     }
     ss = sum(tmp$withinss)
     hw = hardyweinberg(tmp$cluster) 
     centers = sort(tmp$centers, decreasing = TRUE)
 
     return(c(centers, ss, hw, flag))
 }
 
 calculateParameters3Cluster <- function(M, priormeans, numSNP, verbose) {
     Apriors = priormeans
     ApriorDown = c(Apriors[1]/2, Apriors[2], Apriors[3]) # shift-down
     ApriorUp = c(Apriors[1], Apriors[2], Apriors[3]/2) # shift-up
 
     cl <- makeCluster(getNumberOfCores())
     clusterEvalQ(cl, library(crlmm))
     parAns <- parRapply(cl, M, calculateOneParams3Cluster, Apriors, ApriorDown, ApriorUp)
 
     stopCluster(cl)
     parAnsAllValues = unlist(parAns)
     parameters <- matrix(parAnsAllValues, numSNP, 6, byrow = TRUE)
    
     centers <- parameters[, 1:3]
     ss <- parameters[, 4]
     hw <- parameters[, 5]
     flag <- parameters[, 6]
 
     rm(parAns)
     rm(parameters)
    
     sigma=solve(cov(centers, use="complete.obs"))
     mh = calculateMahalDist3Cluster(centers, sigma, flag, Apriors, ApriorDown, ApriorUp, numSNP)
 
     rm(sigma)
     rm(centers)
 
     gc()
     return(list(eless = ss, elemh = mh, elehw = hw))
 }
 
 
 calculateMahalDist3Cluster <- function(centers, sigma, flag, priors, priorDown, priorUp, numSNP){
     mahaldist = rep(NA, numSNP)
 
     tolerance = 1e-3
     for (i in 1:numSNP) {
         if ((abs(flag[i] - 1) <= tolerance) || (abs(flag[i]- 4) <= tolerance)) difference = centers[i, ] - priors
         if (abs(flag[i] - 2) <= tolerance) difference = centers[i, ] - priorDown
         if (abs(flag[i] - 3) <= tolerance) difference = centers[i, ] - priorUp         
         mahaldist[i] = as.vector(difference)%*%sigma%*%as.vector(difference)
     }
     return(mahaldist)
 }
 
 
 calculateOneParams2Cluster <- function(x, priors){
     aa = rep(NA, 3)   
     for(j in 1:3){
         dist = as.vector(abs(priors[j]-x))
         aa[j]=x[which.min(dist)]
     } 
     prior2 = priors[priors!=priors[which.max(abs(aa-priors))]]
      
 
     tmp = try(kmeans(x, prior2, nstart=1), TRUE)
     if (class(tmp)=="kmeans") {
         centers = tmp$centers
         hw = hardyweinberg(tmp$cluster)
     }
     rm(tmp)
     tmp = kmeans(x, 2, nstart = 35)
     if (tmp$centers[1] < tmp$centers[2]){
         centers = c(tmp$centers[2], tmp$centers[1])
         hw = hardyweinberg(3 - tmp$cluster)   
     } else {
         centers = tmp$centers
         hw = hardyweinberg(tmp$cluster)
     }
     ss = sum(tmp$withinss)     
     return(c(centers, prior2, ss, hw))
 }
 
 
 calculateParameters2Cluster <- function(M, priormeans, numSNP, verbose) {
     Apriors = priormeans
    
     cl <- makeCluster(getNumberOfCores())
     clusterEvalQ(cl, library(crlmm))
     parAns <- parRapply(cl, M, calculateOneParams2Cluster, Apriors)
     stopCluster(cl)
 
     parAnsAllValues = unlist(parAns)
     parameters <- matrix(parAnsAllValues, numSNP, 6, byrow = TRUE)
 
     centers <- parameters[, 1:2]
     priors2cluster <- parameters[, 3:4]
     ss <- parameters[, 5]
     hw <- parameters[, 6]
     
     rm(parAns)
     rm(parameters)
     sigma=solve(cov(centers, use="complete.obs"))
 
     mh = calculateMahalDist2Cluster(centers, sigma, priors2cluster, numSNP)
 
     rm(sigma)
     rm(centers)
 
     gc()
     return(list(eless = ss, elemh = mh, elehw = hw))
 }
 
 
 calculateMahalDist2Cluster <- function(centers, sigma, priors, numSNP){
     mahaldist = rep(NA, numSNP)
     
     for (i in 1:numSNP) {
         difference <- centers[i,] - priors[i,]
         mahaldist[i] = as.vector(difference)%*%sigma%*%as.vector(difference)        
     }
     return(mahaldist)
 }
 
 calculateOneParams1Cluster <- function(x, priors){
     center = mean(x)
     diff <- x - center
     diffsquare <- diff^2
     ss = sum(diffsquare)
 
     closestPrior = priors[which.min(abs(priors - center))]
 
     return(c(center, closestPrior, ss))
 }
 
 
 calculateParameters1Cluster <- function(M, priormeans, numSNP, verbose) {
     Apriors = priormeans
    
     cl <- makeCluster(getNumberOfCores())
     clusterEvalQ(cl, library(crlmm))
     parAns <- parRapply(cl, M, calculateOneParams1Cluster, Apriors)
     stopCluster(cl)
 
     parAnsAllValues = unlist(parAns)
     parameters <- matrix(parAnsAllValues, numSNP, 3, byrow = TRUE)
 
     centers <- matrix(parameters[, 1], numSNP, 1)
     prior1cluster <- matrix(parameters[, 2], numSNP, 1)
     ss <- parameters[, 3]
     
     rm(parAns)
     rm(parameters)
     sigma=solve(cov(centers, use="complete.obs"))
 
     mh = calculateMahalDist1Cluster(centers, sigma, prior1cluster, numSNP)
 
     rm(sigma)
     rm(centers)
 
     gc()
     return(list(eless = ss, elemh = mh))
 }
 
 
 calculateMahalDist1Cluster <- function(centers, sigma, priors, numSNP){
     mahaldist = rep(NA, numSNP)
     
     for(i in 1:numSNP) {
         difference <- as.vector(centers[i, 1] - priors[i, 1])
 	mahaldist[i]=difference%*%sigma%*%difference
     }
     return(mahaldist)
 }
    
 #############################################
 
 
 computeCallPr <- function(callsPr, M, calls, numSNP, numSample, verbose, blockSize = 500000){
     # compute log-ratio in blocksize of 500,000 by default
     if (verbose) message("Start computing confidence score")
     
     numBlock = ceiling(numSNP / blockSize)
     for (i in 1:numBlock){
         if (verbose) message(" -- Processing segment ", i, " out of ", numBlock)
         subsetM = as.matrix(M[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])
         subsetCalls = as.matrix(calls[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])                     
         subsetCallProb = .Call("krlmmConfidenceScore", subsetM, subsetCalls, PACKAGE="crlmm");
         callsPr[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetCallProb[1:nrow(subsetM), ]
         rm(subsetM, subsetCalls, subsetCallProb)
     }
 
     if (verbose) message("Done computing confidence score")
 }
 
 #############################################
 
 AddBackNoVarianceSNPs <- function(cnSet, calls, scores, numSNP, numSample, variantSNPIndex, noVariantSNPIndex){
     callsGt = calls(cnSet)
     callsPr = snpCallProbability(cnSet)
     open(callsGt)
     open(callsPr)
 
     callsGt[variantSNPIndex, ] = calls[,]
     callsPr[variantSNPIndex, ] = scores[,]  
     
     callsGt[noVariantSNPIndex, ] = NA
     callsPr[noVariantSNPIndex, ] = 0     
 
     close(callsGt)
     close(callsPr)    
 }
 
 #############################################
 
 krlmmImputeGender <- function(cnSet, annotation, YIndex, verbose){
     if (verbose) message("Start imputing gender")    
     S = computeAverageLogIntensity(cnSet, verbose) 
 
     # S is calculated and saved in original SNP order. 
     matchy = match(annotation[YIndex, 2], rownames(S))
     matchy = matchy[!is.na(matchy)]
     if (length(matchy) <= 10){
         predictedGender = rep(NA, ncol(A))
     }
     Sy = S[matchy,]
 
     uniqueDataPoint = apply(Sy, 1, function(x){length(unique(x))})
     validYSNPs = uniqueDataPoint >= 2
     SyValid = Sy[validYSNPs, ]
     
     if (nrow(SyValid) < 20){
         message("Not enough ChrY SNPs, skipping gender prediction step");
         predictedGender = rep(NA, ncol(Sy))
     }
    
     rm(S)
     numYChrSNP = nrow(SyValid)
 
     allS = matrix(NA, numYChrSNP, 2)
 
     for (i in 1:numYChrSNP) {
         tmp = kmeans(SyValid[i,] ,2, nstart=45)
         allS[i,] = sort(tmp$centers, decreasing=F)
     }
     priorS = apply(allS, 2, FUN="median", na.rm=TRUE)
 
     if (abs(priorS[1] - priorS[2]) <= 1.6) {
         message("Skipping gender prediction step");
         predictedGender = rep(NA, ncol(Sy))        
     }
     
     meanmatrix = apply(Sy, 2, median)
 
     Sy1 = abs(meanmatrix - priorS[1])
     Sy2 = abs(meanmatrix - priorS[2])
 
     # output male - 1, female - 2, female S-values are smaller than male S-values. 
     test = cbind(Sy2, Sy1)
     predictedGender = apply(test, 1, which.min)
 
     open(cnSet$gender)
     cnSet$gender[,] = predictedGender
     close(cnSet$gender)
     
     if (verbose) message("Done imputing gender")
     return(predictedGender)   
 }
 
 
 #############################################
 
 verifyChrYSNPs <- function(cnSet, M, calls, gender, annotation, YIndex, priormeans, verbose){
     if (verbose) message("Start verifying SNPs on Chromosome Y")
     callsGt = calls(cnSet)
     callsPr = snpCallProbability(cnSet)
     open(callsGt)
     open(callsPr)
        
     matchy = match(annotation[YIndex, 2], rownames(M))
     matchy = matchy[!is.na(matchy)]
    
     MChrY = M[matchy,]
     callsChrY = calls[matchy,]
   
     male = gender == 1
     female = gender == 2    
     
     checkK = apply(callsChrY[, male], 1, function(x){ length(unique(x[!is.na(x)])) } )
     
     for(i in 1:nrow(MChrY)){
         # Chromosome Y SNPs, no calls for female, k = 1 or 2 permitted for male samples
         thisChrYSNPorigPosition = match(rownames(callsChrY)[i], rownames(callsGt))
         callsGt[thisChrYSNPorigPosition, female] = NA
         callsPr[thisChrYSNPorigPosition, female] = 0
 
         # early exit for k = 1 or 2 cases. Only re-assign calls to male samples if we previouly assigned
         # male samples to 3 clusters by mistake. 
         if (checkK[i] < 3) next;
           
         if (class(try(kmeans(MChrY[i, male], c(priormeans[1], priormeans[3]), nstart=1), TRUE)) != "try-error"){
            
             maleSampleCalls = kmeans(MChrY[i, male],c(priormeans[1], priormeans[3]), nstart=45)$cluster
             callsGt[thisChrYSNPorigPosition, male][maleSampleCalls == 1] = 1
             callsGt[thisChrYSNPorigPosition, male][maleSampleCalls == 2] = 3
         } else {
                     
             distanceToPrior1 = mean(abs(MChrY[i, male] - priormeans[1]))
             distanceToPrior3 = mean(abs(MChrY[i, male] - priormeans[3]))                
             callsGt[thisChrYSNPorigPosition, male] = ifelse(distanceToPrior1 < distanceToPrior3, 1, 3)
         }
 
         MMaleSamples = MChrY[i, male]
         callsMaleSample = callsGt[thisChrYSNPorigPosition, male]
         scoresMaleSample = .Call("krlmmConfidenceScore", t(as.matrix(MMaleSamples)), t(as.matrix(callsMaleSample)), PACKAGE="crlmm");
 
         callsPr[thisChrYSNPorigPosition, male] = scoresMaleSample       
     }
 
     close(callsGt)
     close(callsPr)
     
     if (verbose) message("Done verifying SNPs on Chromosome Y")
 }