krlmm <- function(cnSet, cdfName, gender=NULL, trueCalls=NULL, verbose=TRUE) { pkgname <- getCrlmmAnnotationName(cdfName) ### pre-processing, output M M = computeLogRatio(cnSet, verbose) message("Leaving out non-variant SNPs") # For SNPs with less than 3 distinct data point, exclude them from downstream analysis uniqueCount = apply(M[,], 1, function(x){length(unique(x))}) SNPtoProcessIndex = uniqueCount >= 3 noVariantSNPIndex = uniqueCount < 3 M = M[SNPtoProcessIndex, ] numSNP = nrow(M) numSample = ncol(M) calls = oligoClasses::initializeBigMatrix(name="calls", numSNP, numSample, vmode = "integer") scores = oligoClasses::initializeBigMatrix(name="scores", numSNP, numSample, vmode = "double") open(calls) open(scores) rownames(calls) = rownames(M) rownames(scores) = rownames(M) colnames(calls) = colnames(M) colnames(scores) = colnames(M) priormeans = calculatePriorValues(M, numSNP, verbose) VGLMparameters = calculateParameters(M, priormeans, numSNP, verbose) ### retrieve or calculate coefficients krlmmCoefficients = getKrlmmVGLMCoefficients(pkgname, trueCalls, VGLMparameters, verbose, numSample, colnames(M)) ### do VGLM fit, to predict k for each SNP kPrediction <- predictKwithVGLM(VGLMparameters, krlmmCoefficients, verbose) rm(VGLMparameters) ### assign calls assignCalls(calls, M, kPrediction, priormeans, numSNP, numSample, verbose) rm(kPrediction) ### assign confidence scores computeCallPr(scores, M, calls, numSNP, numSample, verbose) ### add back SNPs excluded before AddBackNoVarianceSNPs(cnSet, calls, scores, numSNP, numSample, SNPtoProcessIndex, noVariantSNPIndex) loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname) YIndex <- getVarInEnv("YIndex") loader("annotation.rda", .crlmmPkgEnv, pkgname) annotation <- getVarInEnv("annot") ### impute gender if gender information not provided if (is.null(gender)) { gender = krlmmImputeGender(cnSet, annotation, YIndex, verbose) } ### double-check ChrY SNP cluster, impute gender if gender information not provided if (!(is.null(gender))) { verifyChrYSNPs(cnSet, M, calls, gender, annotation, YIndex, priormeans, verbose) } close(calls) close(scores) rm(calls) rm(scores) rm(M) rm(annotation) rm(YIndex) TRUE } krlmmnopkg <- function(cnSet, offset, gender=NULL, normalize.method=NULL, annotation=NULL, verbose=TRUE) { if(is.null(normalize.method)){ message("No normalization method specified. Quantile normalization applied") M=quantilenormalization(cnSet,verbose,offset=offset) }else{ if(normalize.method=="quantile"){ M=quantilenormalization(cnSet,verbose,offset=offset) } if(normalize.method=="loess"){ M=loessnormalization(cnSet,verbose,offset=offset) } } message("Leaving out non-variant SNPs") # For SNPs with less than 3 distinct data point, exclude them from downstream analysis uniqueCount = apply(M[,], 1, function(x){length(unique(x))}) SNPtoProcessIndex = uniqueCount >= 3 noVariantSNPIndex = uniqueCount < 3 M = M[SNPtoProcessIndex, ] numSNP = nrow(M) numSample = ncol(M) calls = oligoClasses::initializeBigMatrix(name="calls", numSNP, numSample, vmode = "integer") scores = oligoClasses::initializeBigMatrix(name="scores", numSNP, numSample, vmode = "double") open(calls) open(scores) rownames(calls) = rownames(M) rownames(scores) = rownames(M) colnames(calls) = colnames(M) colnames(scores) = colnames(M) prepriormeans=calculatePriorcentersC(M,numSample) trueCalls=matrix(NA,nrow(M),ncol(M)) for(i in 1:ncol(M)){ tmp=kmeans(M[,i],centers=prepriormeans,nstart=45) trueCalls[,i]=tmp$cluster } colnames(trueCalls)=colnames(M) rownames(trueCalls)=rownames(M) priormeans = calculatePriorValues(M, numSNP, verbose) VGLMparameters = calculateParameters(M, priormeans, numSNP, verbose) ### retrieve or calculate coefficients krlmmCoefficients = getKrlmmVGLMCoefficients(trueCalls=trueCalls,params=VGLMparameters,numSample=ncol(M),samplenames=colnames(M),verbose=T) ### do VGLM fit, to predict k for each SNP kPrediction <- predictKwithVGLM(VGLMparameters, krlmmCoefficients, verbose) rm(VGLMparameters) ### assign calls assignCalls(calls, M, kPrediction, priormeans, numSNP, numSample, verbose=T) rm(kPrediction) ### assign confidence scores computeCallPr(scores, M, calls, numSNP, numSample, verbose) YIndex = seq(1:nrow(cnSet))[chromosome(cnSet)==24 & isSnp(cnSet)] ### impute gender if gender information not provided if (is.null(gender)) { gender = krlmmImputeGender(cnSet, annotation, YIndex, verbose, offset=offset) } ### double-check ChrY SNP cluster, impute gender if gender information not provided if (!(is.null(gender))) { verifyChrYSNPs(cnSet, M, calls, gender, annotation, YIndex, priormeans, verbose) } # if (length(YIndex)>0 && !(is.null(gender))) { # verifyChrYSNPs(cnSet, M, calls, gender, annotation, YIndex, priormeans, verbose) # } ### add back SNPs excluded before AddBackNoVarianceSNPs(cnSet, calls, scores, numSNP, numSample, SNPtoProcessIndex, noVariantSNPIndex) close(calls) close(scores) rm(calls) rm(scores) rm(M) TRUE } ####################################################################################################### loessnormalization<- function(cnSet, verbose, offset=16, blockSize = 300000){ # compute log-ratio in blocksize of 300,000 by default message("Start computing log-ratios") A <- A(cnSet) B <- B(cnSet) open(A) open(B) numSNP <- nrow(A) numSample <- ncol(A) library(limma) M <- oligoClasses::initializeBigMatrix(name="M", numSNP, numSample, vmode = "double") S <- oligoClasses::initializeBigMatrix(name="S", numSNP, numSample, vmode = "double") rownames(M) = rownames(A) colnames(M) = colnames(A) numBlock = ceiling(numSNP / blockSize) for (i in 1:numBlock){ if (verbose) message(" -- Processing segment ", i, " out of ", numBlock) subsetA = as.matrix(A[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])+offset subsetB = as.matrix(B[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])+offset subsetM = .Call("krlmmComputeM", subsetA, subsetB, PACKAGE="crlmm") M[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetM[1:nrow(subsetM), ] subsetS = .Call("krlmmComputeS", subsetA, subsetB, PACKAGE="crlmm") S[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetS[1:nrow(subsetS), ] rm(subsetA, subsetB, subsetM,subsetS) } close(A) close(B) gc() isna=is.na(M[,1]) prepriormeansL=calculatePriorcentersC(M[,][!isna,],numSample) F01=abs(prepriormeansL[1])-prepriormeansL[2] F02=abs(prepriormeansL[3])+prepriormeansL[2] for(i in 1:ncol(M)) { isna=is.na(M[,i]) Msub = M[!isna,i] Mna=M[isna,i] Ssub = S[!isna,i] Sna=S[isna,i] km = kmeans(Msub, prepriormeansL) ind1v2 = km$cluster==1 ind2v2 = km$cluster==2 ind3v2 = km$cluster==3 l1v2 = loessFit(Msub[ind1v2], Ssub[ind1v2]) l2v2 = loessFit(Msub[ind2v2], Ssub[ind2v2]) l3v2 = loessFit(Msub[ind3v2], Ssub[ind3v2]) Msub[ind1v2] = l1v2$residuals+F01 Msub[ind2v2] = l2v2$residuals Msub[ind3v2] = l3v2$residuals-F02 Mnew=rbind(as.matrix(Msub),as.matrix(Mna)) rownames(Mnew)=c(rownames(as.matrix(Msub)),rownames(as.matrix(Mna))) m=match(rownames(M),rownames(Mnew)) Mnew=Mnew[m] M[,i] = Mnew rm(Msub, Ssub, ind1v2, ind2v2, ind3v2, l1v2, l2v2, l3v2) } return(M) } ####################################################################################################### quantilenormalization <- function(cnSet, verbose, offset=16, blockSize = 300000){ A <- A(cnSet) B <- B(cnSet) open(A) open(B) numSNP <- nrow(A) numSample <- ncol(A) M <- oligoClasses::initializeBigMatrix(name="M", numSNP, numSample, vmode = "double") X <- oligoClasses::initializeBigMatrix(name="X", numSNP, numSample, vmode = "integer") Y <- oligoClasses::initializeBigMatrix(name="Y", numSNP, numSample, vmode = "integer") rownames(M) = rownames(X) = rownames(Y) = rownames(A) colnames(M) = colnames(X) = colnames(Y) = colnames(A) # This step may chew up a lot of memory... X = normalize.quantiles(as.matrix(A[,]))+offset Y = normalize.quantiles(as.matrix(B[,]))+offset numBlock = ceiling(numSNP / blockSize) for (i in 1:numBlock){ if (verbose) message(" -- Processing segment ", i, " out of ", numBlock) subsetXqws = as.matrix(X[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ]) subsetYqws = as.matrix(Y[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ]) subsetM = .Call("krlmmComputeM", subsetXqws, subsetYqws, PACKAGE="crlmm") M[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetM[1:nrow(subsetM), ] rm(subsetYqws, subsetXqws, subsetM) } close(A) close(B) return(M) delete(X) delete(Y) rm(X) rm(Y) gc() } ####################################################################################################### getNumberOfCores <- function(){ defaultCores = min(detectCores(), 8) return(getOption("krlmm.cores", defaultCores)) } ####################################################################################################### computeLogRatio <- function(cnSet, verbose=FALSE, offset=0, blockSize = 500000, col=NULL, row=NULL){ # compute log-ratio in blocksize of 500,000 by default if(verbose) message("Start computing log-ratios") if(!is.null(col) | !is.null(row)) { if(is.null(row)) { A <- A(cnSet)[,col] B <- B(cnSet)[,col] } if(is.null(col)) { A <- A(cnSet)[row,] B <- B(cnSet)[row,] } if(!is.null(col) & !is.null(row)) { A <- A(cnSet)[row,col] B <- B(cnSet)[row,col] } } else { A <- A(cnSet) B <- B(cnSet) } open(A) open(B) numSNP <- nrow(A) numSample <- ncol(A) M <- oligoClasses::initializeBigMatrix(name="M", numSNP, numSample, vmode = "double") rownames(M) = rownames(A) colnames(M) = colnames(A) numBlock = ceiling(numSNP / blockSize) for (i in 1:numBlock){ if(verbose) message(" -- Processing segment ", i, " out of ", numBlock) subsetA = as.matrix(A[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])+offset subsetB = as.matrix(B[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])+offset subsetM = .Call("krlmmComputeM", subsetA, subsetB, PACKAGE="crlmm") M[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetM[1:nrow(subsetM), ] rm(subsetA, subsetB, subsetM) } close(A) close(B) if(verbose) message("Done computing log-ratios") return(M) } computeAverageLogIntensity <- function(cnSet, verbose=FALSE, offset=0, blockSize = 500000, col=NULL, row=NULL){ # compute average log intensity in blocksize of 500,000 by default if(verbose) message("Start computing average log-intensities") if(!is.null(col) | !is.null(row)) { if(is.null(row)) { A <- A(cnSet)[,col] B <- B(cnSet)[,col] } if(is.null(col)) { A <- A(cnSet)[row,] B <- B(cnSet)[row,] } if(!is.null(col) & !is.null(row)) { A <- A(cnSet)[row,col] B <- B(cnSet)[row,col] } } else { A <- A(cnSet) B <- B(cnSet) } open(A) open(B) numSNP <- nrow(A) numSample <- ncol(A) S <- oligoClasses::initializeBigMatrix(name="S", numSNP, numSample, vmode = "double") rownames(S) = rownames(A) colnames(S) = colnames(A) numBlock = ceiling(numSNP / blockSize) for (i in 1:numBlock){ if(verbose) message(" -- Processing segment ", i, " out of ", numBlock) subsetA = as.matrix(A[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])+offset subsetB = as.matrix(B[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])+offset subsetS = .Call("krlmmComputeS", subsetA, subsetB, PACKAGE="crlmm") S[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetS[1:nrow(subsetS), ] rm(subsetA, subsetB, subsetS) } close(A) close(B) if(verbose) message("Done computing average log-intensities") return(S) } ####################################################################################################### calculatePriorValues <- function(M, numSNP, verbose) { calculateOneKmeans <- function(x) { tmp = kmeans(x, 3, nstart=45) return(sort(tmp$centers, decreasing = T)) } if (verbose) message("Start calculating prior means") cl <- makeCluster(getNumberOfCores()) centers <- parRapply(cl, M, calculateOneKmeans) stopCluster(cl) centers <- matrix(centers, numSNP, 3, byrow = TRUE) priormeans = apply(centers, 2, FUN="median", na.rm=TRUE) if(abs(sum(priormeans))>1) { checksymmetric= apply(centers,1,function(x){abs(sum(x))})<1 priormeans=apply(centers[checksymmetric,],2, FUN="median", na.rm=TRUE) } if (verbose) message("Done calculating prior means") return(priormeans) } calculatePriorcentersC <- function(M, numSample) { calculateOneKmeans <- function(x) { tmp = kmeans(x, 3, nstart=45) return(sort(tmp$centers, decreasing = T)) } cl <- makeCluster(getNumberOfCores()) centers <- parCapply(cl, M, calculateOneKmeans) stopCluster(cl) centers <- matrix(centers, numSample, 3, byrow = TRUE) prepriormeans = apply(centers, 2, FUN="median", na.rm=TRUE) return(prepriormeans) } ####################################################################################################### calculateKrlmmCoefficients <- function(trueCalls, params, numSample, samplenames){ # if (!require(VGAM)) { # message("VGAM package not found, fall back to use defined platform-specific coefficients") # return(NULL) # } amatch = match(rownames(params), rownames(trueCalls)) amatch = amatch[!is.na(amatch)] trueCalls = trueCalls[amatch,] amatch = match(rownames(trueCalls), rownames(params)) params = params[amatch, ] amatch = match(samplenames, colnames(trueCalls)) amatch = amatch[!is.na(amatch)] trueCalls = trueCalls[, amatch] if ((numSample <= 40) && (ncol(trueCalls) < round(numSample/2))){ message("Sample size is ", numSample, ", KRLMM requires at least trueCalls of ", round(numSample/2), " samples to calculate coefficients") return(NULL) } if ((numSample > 40) && (ncol(trueCalls) < 20)){ message("At least trueCalls of 20 samples are required to calculate coefficients") return(NULL) } if (nrow(trueCalls) < 1000){ message("At lease trueCalls of 1000 SNPs are required to calculate coefficients") return(NULL) } getna = apply(trueCalls, 1, function(x){sum(is.na(x))>=10}) truek = apply(trueCalls, 1, function(x){length(unique(x[!is.na(x)]))}) params1 = params[!getna,] truek1 = truek[!getna] xx = data.frame(params1) t = as.factor(as.numeric(truek1)) xx = data.frame(xx, t) fit = suppressWarnings(vglm(t~., multinomial(refLevel=1), xx)) coe = coefficients(fit) # VGAM:: return(coe) } getKrlmmVGLMCoefficients <- function(pkgname, trueCalls, params, verbose, numSample, samplenames){ if (!is.null(trueCalls)) { coe = calculateKrlmmCoefficients(trueCalls, params, numSample, samplenames) if (!is.null(coe)) { if (verbose) message ("Done calculating platform-specific coefficients to predict number of clusters") return(coe) } } if (!is.null(trueCalls)) message("Fall back to use defined platform-specific coefficients") if (verbose) message ("Retrieving defined platform-specific coefficients to predict number of clusters") loader("krlmmVGLMCoefficients.rda", .crlmmPkgEnv, pkgname) return(getVarInEnv("krlmmCoefficients")) } predictKwithVGLM <- function(data, coe, verbose){ if (verbose) message("Start predicting number of clusters") logit1 <- rep(0, nrow(data)) logit2 <- coe[1]+coe[3]*data[,1]+coe[5]*data[,2]+coe[7]*data[,3]+coe[9]*data[,4]+coe[11]*data[,5]+coe[13]*data[,6]+coe[15]*data[,7]+coe[17]*data[,8] logit23 <- coe[2]+coe[4]*data[,1]+coe[6]*data[,2]+coe[8]*data[,3]+coe[10]*data[,4]+coe[12]*data[,5]+coe[14]*data[,6]+coe[16]*data[,7]+coe[18]*data[,8] logits <- cbind(logit1, logit2, logit23) rm(logit1) rm(logit2) rm(logit23) p.unscaled <- exp(logits) rm(logits) p <- p.unscaled / rowSums(p.unscaled) clusterPrediction = apply(p, 1, function(x){which.max(x)}) rm(p.unscaled) rm(p) if (verbose) message("Done predicting number of clusters") return(clusterPrediction) } ####################################################################################################### assignCalls3Cluster <- function(intensities, priormeans){ prior31 = c(priormeans[1]/2,priormeans[2],priormeans[3]) prior32 = c(priormeans[1],priormeans[2],priormeans[3]/2) emp <- rep(NA, length(intensities)) ansCalls <- emp tmp <- try(kmeans(intensities, priormeans, nstart=1),TRUE) if(class(tmp) == "try-error") { tmp1 <- try(kmeans(intensities, prior31, nstart=1),TRUE) if(class(tmp1) == "try-error"){ tmp2 <- try(kmeans(intensities, prior32, nstart=1),TRUE) if(class(tmp2) == "try-error"){ ansCalls = emp }else{ ansCalls = tmp2$cluster } }else{ ansCalls = tmp1$cluster } }else{ ansCalls = tmp$cluster } rm(prior31, prior32) return(ansCalls) } ####################################################################################################### assignCalls2Cluster <- function(intensities, priormeans){ closest <- rep(NA, 3) for(j in 1:3){ distance <- as.vector(abs(priormeans[j]-intensities)) closest[j] <- intensities[which.min(distance)] } prior2 <- priormeans[priormeans!=priormeans[which.max(abs(closest-priormeans))]] emp <- rep(NA, length(intensities)) ansCalls <- emp tmp <- try(kmeans(intensities, prior2, nstart=1), TRUE) if(class(tmp) == "try-error") { ansCalls <- emp }else{ if(prior2[1]==priormeans[2] && prior2[2]==priormeans[3]){ mp <- abs(tmp$centers[1]-tmp$centers[2]) pp <- abs(priormeans[2]-priormeans[3]) if((mp/pp)<=0.25){ ansCalls <- emp }else{ ansCalls <- tmp$cluster+1 } } if(prior2[1]==priormeans[1] && prior2[2]==priormeans[2]){ mp=abs(tmp$centers[1]-tmp$centers[2]) pp=abs(priormeans[1]-priormeans[2]) if((mp/pp)<=0.25){ ansCalls = emp }else{ ansCalls <- tmp$cluster } } if(prior2[1]==priormeans[1] && prior2[2]==priormeans[3]){ mp <- abs(tmp$centers[1]-tmp$centers[2]) pp <- abs(priormeans[1]-priormeans[3]) if ((mp/pp) <=0.25){ ansCalls <- emp }else{ ansCalls[tmp$cluster==1] <- 1 ansCalls[tmp$cluster==2] <- 3 } } } rm(tmp) return(ansCalls) } ####################################################################################################### assignCalls1Cluster <- function(intensities, priormeans){ closest <- rep(NA, 3) for(j in 1:3){ distance <- as.vector(abs(priormeans[j]-intensities)) closest[j]=intensities[which.min(distance)] } clusterindex <- which.min(abs(closest-priormeans)) return(rep(clusterindex, length(intensities))) } ####################################################################################################### assignCallsOneSNP <- function(x, priormeans, numSample){ tolerance = 1e-3 k = x[numSample + 1] values = x[1:numSample] if (abs(k - 2) <= tolerance) { tmp <- try(kmeans(values, priormeans, nstart=1),TRUE) if (!(class(tmp)=="try-error")) { k <- 3; } } successful <- FALSE; if (abs(k - 3) <= tolerance){ SNPCalls <- assignCalls3Cluster(values, priormeans) successful <- !is.na(SNPCalls[1]) if (!successful) { k <- 2 } } if ( (abs(k - 2) <= tolerance) && (!successful)){ SNPCalls <- assignCalls2Cluster(values, priormeans) successful <- !is.na(SNPCalls[1]) if (!successful) { k <- 1 } } if ( (abs(k - 1) <= tolerance) && (!successful)){ SNPCalls <- assignCalls1Cluster(values, priormeans) } return(SNPCalls) } assignCalls <- function(callsGt, M, a, priormeans, numSNP, numSample, verbose, blockSize=500000){ # process by block size of 500,000 by default message("Start assigning calls") numBlock = ceiling(numSNP / blockSize) for (i in 1:numBlock){ if (verbose) message(" -- Processing segment ", i, " out of ", numBlock) subsetM = as.matrix(M[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ]) thisnumSNP = nrow(subsetM) subseta = a[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP)] subseta = matrix(subseta, thisnumSNP, 1) testValues = cbind(subsetM, subseta) cl <- makeCluster(getNumberOfCores()) callAns <- parRapply(cl, testValues, assignCallsOneSNP, priormeans, numSample) stopCluster(cl) callAnsAllValues = unlist(callAns) subsetcalls <- matrix(callAnsAllValues, thisnumSNP, numSample, byrow = TRUE) callsGt[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetcalls[1:thisnumSNP, ] rm(subsetM, subseta, subsetcalls) } message("Done assigning calls") } ####################################################################################################### calculateParameters <- function(M, priormeans, numSNP, verbose) { if (verbose) message("Start calculating 3-cluster parameters") params3cluster <- calculateParameters3Cluster(M, priormeans, numSNP, verbose); if (verbose) message("Done calculating 3-cluster parameters") if (verbose) message("Start calculating 2-cluster parameters") params2cluster <- calculateParameters2Cluster(M, priormeans, numSNP, verbose); if (verbose) message("Done calculating 2-cluster parameters") if (verbose) message("Start calculating 1-cluster parameters") params1cluster <- calculateParameters1Cluster(M, priormeans, numSNP, verbose); if (verbose) message("Done calculating 1-cluster parameters") parameters <- cbind(as.matrix(params1cluster$elemh), as.matrix(params1cluster$eless), as.matrix(params2cluster$elemh), as.matrix(params2cluster$eless), as.matrix(params3cluster$elemh), as.matrix(params3cluster$eless), as.matrix(params2cluster$elehw), as.matrix(params3cluster$elehw)); rownames(parameters) = rownames(M) rm(params3cluster) rm(params2cluster) rm(params1cluster) return(parameters) } hardyweinberg <- function(x, minN = 8){ if (length(x) < minN){ return(NA) } else { result = .Call("krlmmHardyweinberg", x) return(result) } } calculateOneParams3Cluster <- function(x, priors, priorDown, priorUp){ tmp = try(kmeans(x, priors, nstart=1), TRUE) if(class(tmp) == "kmeans") { flag = 1 } else { tmp = try(kmeans(x, priorDown, nstart=1), TRUE) if(class(tmp) == "kmeans") { flag = 2 } else { tmp = try(kmeans(x, priorUp, nstart=1), TRUE) if (class(tmp) == "kmeans") { flag = 3 } else { tmp = kmeans(x, 3, nstart=35) flag = 4 } } } ss = sum(tmp$withinss) hw = hardyweinberg(tmp$cluster) centers = sort(tmp$centers, decreasing = TRUE) return(c(centers, ss, hw, flag)) } calculateParameters3Cluster <- function(M, priormeans, numSNP, verbose) { Apriors = priormeans ApriorDown = c(Apriors[1]/2, Apriors[2], Apriors[3]) # shift-down ApriorUp = c(Apriors[1], Apriors[2], Apriors[3]/2) # shift-up cl <- makeCluster(getNumberOfCores()) clusterEvalQ(cl, library(crlmm)) parAns <- parRapply(cl, M, calculateOneParams3Cluster, Apriors, ApriorDown, ApriorUp) stopCluster(cl) parAnsAllValues = unlist(parAns) parameters <- matrix(parAnsAllValues, numSNP, 6, byrow = TRUE) centers <- parameters[, 1:3] ss <- parameters[, 4] hw <- parameters[, 5] flag <- parameters[, 6] rm(parAns) rm(parameters) sigma=solve(cov(centers, use="complete.obs")) mh = calculateMahalDist3Cluster(centers, sigma, flag, Apriors, ApriorDown, ApriorUp, numSNP) rm(sigma) rm(centers) gc() return(list(eless = ss, elemh = mh, elehw = hw)) } calculateMahalDist3Cluster <- function(centers, sigma, flag, priors, priorDown, priorUp, numSNP){ mahaldist = rep(NA, numSNP) tolerance = 1e-3 for (i in 1:numSNP) { if ((abs(flag[i] - 1) <= tolerance) || (abs(flag[i]- 4) <= tolerance)) difference = centers[i, ] - priors if (abs(flag[i] - 2) <= tolerance) difference = centers[i, ] - priorDown if (abs(flag[i] - 3) <= tolerance) difference = centers[i, ] - priorUp mahaldist[i] = as.vector(difference)%*%sigma%*%as.vector(difference) } return(mahaldist) } calculateOneParams2Cluster <- function(x, priors){ aa = rep(NA, 3) for(j in 1:3){ dist = as.vector(abs(priors[j]-x)) aa[j]=x[which.min(dist)] } prior2 = priors[priors!=priors[which.max(abs(aa-priors))]] tmp = try(kmeans(x, prior2, nstart=1), TRUE) if (class(tmp)=="kmeans") { centers = tmp$centers hw = hardyweinberg(tmp$cluster) } rm(tmp) tmp = kmeans(x, 2, nstart = 35) if (tmp$centers[1] < tmp$centers[2]){ centers = c(tmp$centers[2], tmp$centers[1]) hw = hardyweinberg(3 - tmp$cluster) } else { centers = tmp$centers hw = hardyweinberg(tmp$cluster) } ss = sum(tmp$withinss) return(c(centers, prior2, ss, hw)) } calculateParameters2Cluster <- function(M, priormeans, numSNP, verbose) { Apriors = priormeans cl <- makeCluster(getNumberOfCores()) clusterEvalQ(cl, library(crlmm)) parAns <- parRapply(cl, M, calculateOneParams2Cluster, Apriors) stopCluster(cl) parAnsAllValues = unlist(parAns) parameters <- matrix(parAnsAllValues, numSNP, 6, byrow = TRUE) centers <- parameters[, 1:2] priors2cluster <- parameters[, 3:4] ss <- parameters[, 5] hw <- parameters[, 6] rm(parAns) rm(parameters) sigma=solve(cov(centers, use="complete.obs")) mh = calculateMahalDist2Cluster(centers, sigma, priors2cluster, numSNP) rm(sigma) rm(centers) gc() return(list(eless = ss, elemh = mh, elehw = hw)) } calculateMahalDist2Cluster <- function(centers, sigma, priors, numSNP){ mahaldist = rep(NA, numSNP) for (i in 1:numSNP) { difference <- centers[i,] - priors[i,] mahaldist[i] = as.vector(difference)%*%sigma%*%as.vector(difference) } return(mahaldist) } calculateOneParams1Cluster <- function(x, priors){ center = mean(x) diff <- x - center diffsquare <- diff^2 ss = sum(diffsquare) closestPrior = priors[which.min(abs(priors - center))] return(c(center, closestPrior, ss)) } calculateParameters1Cluster <- function(M, priormeans, numSNP, verbose) { Apriors = priormeans cl <- makeCluster(getNumberOfCores()) clusterEvalQ(cl, library(crlmm)) parAns <- parRapply(cl, M, calculateOneParams1Cluster, Apriors) stopCluster(cl) parAnsAllValues = unlist(parAns) parameters <- matrix(parAnsAllValues, numSNP, 3, byrow = TRUE) centers <- matrix(parameters[, 1], numSNP, 1) prior1cluster <- matrix(parameters[, 2], numSNP, 1) ss <- parameters[, 3] rm(parAns) rm(parameters) sigma=solve(cov(centers, use="complete.obs")) mh = calculateMahalDist1Cluster(centers, sigma, prior1cluster, numSNP) rm(sigma) rm(centers) gc() return(list(eless = ss, elemh = mh)) } calculateMahalDist1Cluster <- function(centers, sigma, priors, numSNP){ mahaldist = rep(NA, numSNP) for(i in 1:numSNP) { difference <- as.vector(centers[i, 1] - priors[i, 1]) mahaldist[i]=difference%*%sigma%*%difference } return(mahaldist) } ############################################# computeCallPr <- function(callsPr, M, calls, numSNP, numSample, verbose, blockSize = 500000){ # compute log-ratio in blocksize of 500,000 by default if (verbose) message("Start computing confidence scores") numBlock = ceiling(numSNP / blockSize) for (i in 1:numBlock){ if (verbose) message(" -- Processing segment ", i, " out of ", numBlock) subsetM = as.matrix(M[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ]) subsetCalls = as.matrix(calls[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ]) subsetCallProb = .Call("krlmmConfidenceScore", subsetM, subsetCalls, PACKAGE="crlmm"); callsPr[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetCallProb[1:nrow(subsetM), ] rm(subsetM, subsetCalls, subsetCallProb) } if (verbose) message("Done computing confidence scores") } ############################################# AddBackNoVarianceSNPs <- function(cnSet, calls, scores, numSNP, numSample, variantSNPIndex, noVariantSNPIndex){ callsGt = calls(cnSet) callsPr = snpCallProbability(cnSet) open(callsGt) open(callsPr) callsGt[variantSNPIndex, ] = calls[,] callsPr[variantSNPIndex, ] = scores[,] callsGt[noVariantSNPIndex, ] = NA callsPr[noVariantSNPIndex, ] = 0 close(callsGt) close(callsPr) } ############################################# krlmmImputeGender <- function(cnSet, annotation, YIndex, verbose, offset=0){ if (verbose) message("Start imputing gender") S = computeAverageLogIntensity(cnSet, verbose, offset=offset) # S is calculated and saved in original SNP order. matchy = match(annotation[YIndex, "Name"], rownames(S)) matchy = matchy[!is.na(matchy)] if (length(matchy) <= 10){ predictedGender = rep(NA, ncol(S)) } Sy = S[matchy,] uniqueDataPoint = apply(Sy, 1, function(x){length(unique(x))}) validYSNPs = uniqueDataPoint >= 2 SyValid = Sy[validYSNPs, ] if (nrow(SyValid) < 20){ message("Not enough ChrY SNPs, skipping gender prediction step"); predictedGender = rep(NA, ncol(Sy)) } rm(S) numYChrSNP = nrow(SyValid) allS = matrix(NA, numYChrSNP, 2) for (i in 1:numYChrSNP) { tmp = kmeans(SyValid[i,] ,2, nstart=45) allS[i,] = sort(tmp$centers, decreasing=F) } priorS = apply(allS, 2, FUN="median", na.rm=TRUE) if (abs(priorS[1] - priorS[2]) <= 1.6) { message("Separation between clusters too small (samples probabaly all the same gender): skipping gender prediction step"); predictedGender = rep(NA, ncol(Sy)) } meanmatrix = apply(Sy, 2, median) Sy1 = abs(meanmatrix - priorS[1]) Sy2 = abs(meanmatrix - priorS[2]) # output male - 1, female - 2, female S-values are smaller than male S-values. test = cbind(Sy2, Sy1) predictedGender = apply(test, 1, which.min) open(cnSet$gender) cnSet$gender[,] = predictedGender close(cnSet$gender) if (verbose) message("Done imputing gender") return(predictedGender) } ############################################# verifyChrYSNPs <- function(cnSet, M, calls, gender, annotation, YIndex, priormeans, verbose){ if (verbose) message("Start verifying SNPs on Chromosome Y") callsGt = calls(cnSet) callsPr = snpCallProbability(cnSet) open(callsGt) open(callsPr) matchy = match(annotation[YIndex, "Name"], rownames(M)) matchy = matchy[!is.na(matchy)] MChrY = M[matchy,] callsChrY = calls[matchy,] male = gender == 1 female = gender == 2 checkK = apply(callsChrY[, male], 1, function(x){ length(unique(x[!is.na(x)])) } ) for(i in 1:nrow(MChrY)){ # Chromosome Y SNPs, no calls for female, k = 1 or 2 permitted for male samples thisChrYSNPorigPosition = match(rownames(callsChrY)[i], rownames(callsGt)) callsGt[thisChrYSNPorigPosition, female] = NA callsPr[thisChrYSNPorigPosition, female] = 0 # early exit for k = 1 or 2 cases. Only re-assign calls to male samples if we previouly assigned # male samples to 3 clusters by mistake. if (checkK[i] < 3) next; if (class(try(kmeans(MChrY[i, male], c(priormeans[1], priormeans[3]), nstart=1), TRUE)) != "try-error"){ maleSampleCalls = kmeans(MChrY[i, male],c(priormeans[1], priormeans[3]), nstart=45)$cluster callsGt[thisChrYSNPorigPosition, male][maleSampleCalls == 1] = 1 callsGt[thisChrYSNPorigPosition, male][maleSampleCalls == 2] = 3 } else { distanceToPrior1 = mean(abs(MChrY[i, male] - priormeans[1])) distanceToPrior3 = mean(abs(MChrY[i, male] - priormeans[3])) callsGt[thisChrYSNPorigPosition, male] = ifelse(distanceToPrior1 < distanceToPrior3, 1, 3) } MMaleSamples = MChrY[i, male] callsMaleSample = callsGt[thisChrYSNPorigPosition, male] scoresMaleSample = .Call("krlmmConfidenceScore", t(as.matrix(MMaleSamples)), t(as.matrix(callsMaleSample)), PACKAGE="crlmm"); callsPr[thisChrYSNPorigPosition, male] = scoresMaleSample } close(callsGt) close(callsPr) if (verbose) message("Done verifying SNPs on Chromosome Y") }