R/krlmm.R
ba1a7d73
 krlmm <- function(cnSet, cdfName, gender=NULL, trueCalls=NULL, verbose=TRUE) {
   
     pkgname <- getCrlmmAnnotationName(cdfName)
     
 ### pre-processing, output M    
     M = computeLogRatio(cnSet, verbose) 
     numSNP = nrow(M)
     numSample = ncol(M)
     
     callsGt = calls(cnSet)
     callsPr = snpCallProbability(cnSet)
     open(callsGt)
     open(callsPr)
 
     priormeans = calculatePriorValues(M, numSNP, verbose)
     VGLMparameters = calculateParameters(M, priormeans, numSNP, verbose)
 
 ### retrieve or calculate coefficients
     krlmmCoefficients = getKrlmmVGLMCoefficients(pkgname, trueCalls, VGLMparameters, verbose, numSample, colnames(M))
     
 ### do VGLM fit, to predict k for each SNP
     kPrediction <- predictKwithVGLM(VGLMparameters, krlmmCoefficients, verbose);
     rm(VGLMparameters)
     
 ### assign calls
     assignCalls(callsGt, M, kPrediction, priormeans, numSNP, numSample, verbose);
     rm(kPrediction)
 
 ### assign confidence scores
     computeCallPr(callsPr, M, callsGt, numSNP, numSample, verbose)
     close(callsGt)
     close(callsPr)
 
 ### impute gender if gender information not provided
     if (is.null(gender)) {
       gender = krlmmImputeGender(cnSet, pkgname, verbose)
     }
     open(cnSet$gender)
     cnSet$gender[,] = gender
     close(cnSet$gender)
 
     TRUE
 }
 
 #######################################################################################################
 
 getNumberOfCores <- function(){
     defaultCores = min(detectCores(), 8)
     return(getOption("krlmm.cores", defaultCores))    
 }
 
 
 #######################################################################################################
 
 computeLogRatio <- function(cnSet, verbose, blockSize = 500000){
     # compute log-ratio in blocksize of 500,000 by default
     message("Start computing log ratio")
     A <- A(cnSet)
     B <- B(cnSet)
     open(A)
     open(B)
     
     numSNP <- nrow(A)
     numSample <- ncol(A)
     
     M <- oligoClasses::initializeBigMatrix(name="M", numSNP, numSample, vmode = "double")
     rownames(M) = rownames(A)
     colnames(M) = colnames(A)
   
     numBlock = ceiling(numSNP / blockSize)
     for (i in 1:numBlock){
         if (verbose) message(" -- Processing segment ", i, " out of ", numBlock)
         subsetA = as.matrix(A[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])
         subsetB = as.matrix(B[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])                     
         subsetM = .Call("krlmmComputeM", subsetA, subsetB, PACKAGE="crlmm")
         M[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetM[1:nrow(subsetM), ]
         rm(subsetA, subsetB, subsetM)
     }
     
     close(A)
     close(B)
     if (verbose) message("Done computing log ratio")
     return(M)
 }
 
 computeAverageLogIntensity <- function(cnSet, verbose, blockSize = 500000){
     # compute average log intensity in blocksize of 500,000 by default
     message("Start computing average log intensity")
     A <- A(cnSet)
     B <- B(cnSet)
     open(A)
     open(B)
     
     numSNP <- nrow(A)
     numSample <- ncol(A)
     
     S <- oligoClasses::initializeBigMatrix(name="S", numSNP, numSample, vmode = "double")
     
     numBlock = ceiling(numSNP / blockSize)
     for (i in 1:numBlock){
         if (verbose) message(" -- Processing segment ", i, " out of ", numBlock)
         subsetA = as.matrix(A[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])
         subsetB = as.matrix(B[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])                     
         subsetS = .Call("krlmmComputeS", subsetA, subsetB, PACKAGE="crlmm")
         S[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetS[1:nrow(subsetS), ]
         rm(subsetA, subsetB, subsetS)
     }
     
     close(A)
     close(B)
     if (verbose) message("Done computing average log intensity")
     return(S)
 }
 
 
 #######################################################################################################
 
 calculatePriorValues <- function(M, numSNP, verbose) {
 
     calculateOneKmeans <- function(x) {
         tmp = kmeans(x, 3, nstart=45)
         return(sort(tmp$centers, decreasing = T))
     }
 
     if (verbose) message("Start calculating Prior Means")
     cl <- makeCluster(getNumberOfCores())
     centers <- parRapply(cl, M, calculateOneKmeans)
     stopCluster(cl) 
     centers <- matrix(centers, numSNP, 3, byrow = TRUE)
     priormeans = apply(centers, 2, FUN="median", na.rm=TRUE)
     if (verbose) message("Done calculating Prior Means")
     return(priormeans)
 }
 
 #######################################################################################################
 
 calculateKrlmmCoefficients <- function(trueCalls, params, numSample, samplenames){
     if (!require(VGAM)) {
         message("VGAM package not found, fall back to use defined platform-specific coefficients")
         return(NULL)
     }
     amatch = match(rownames(params), rownames(trueCalls))
     amatch = amatch[!is.na(amatch)]
     trueCalls = trueCalls[amatch,]
 
     amatch = match(rownames(trueCalls), rownames(params))
     params = params[amatch, ]
     
     amatch = match(samplenames, colnames(trueCalls))
     amatch = amatch[!is.na(amatch)]    
     trueCalls = trueCalls[, amatch]
         
     if ((numSample <= 40) && (ncol(trueCalls) < round(numSample/2))){
         message("Sample size is ", numSample, ", KRLMM requires at least trueCalls of ", round(numSample/2), " samples to calculate coefficients")
         return(NULL)
     }
     if ((numSample > 40) && (ncol(trueCalls) < 20)){
         message("At least trueCalls of 20 samples are required to calculate coefficients")
         return(NULL)
     }
     if (nrow(trueCalls) < 1000){
         message("At lease trueCalls of 1000 SNPs are required to calculate coefficients")
         return(NULL)
     }
 
     getna = apply(trueCalls, 1, function(x){sum(is.na(x))>=10})
     truek = apply(trueCalls, 1, function(x){length(unique(x[!is.na(x)]))})
    
     params1 = params[!getna,]
     truek1 = truek[!getna]
 
     xx = data.frame(params1)
     t = as.factor(as.numeric(truek1)) 
     xx = data.frame(xx, t)
1d875d32
     fit = suppressWarnings(VGAM::vglm(t~., multinomial(refLevel=1), xx))
ba1a7d73
     coe = VGAM::coefficients(fit)
     return(coe)    
 }
 
 getKrlmmVGLMCoefficients <- function(pkgname, trueCalls, params, verbose, numSample, samplenames){
     if (!is.null(trueCalls)) {
         coe = calculateKrlmmCoefficients(trueCalls, params, numSample, samplenames)
1d875d32
         if (!is.null(coe)) {
             if (verbose) message ("Done calculating platform-specific coefficients to predict number of clusters")
             return(coe)
         }
ba1a7d73
     }
1d875d32
     if (!is.null(trueCalls)) message("Fall back to use defined platform-specific coefficients")
     if (verbose) message ("Retrieving defined platform-specific coefficients to predict number of clusters")
     loader("krlmmVGLMCoefficients.rda", .crlmmPkgEnv, pkgname)
     return(getVarInEnv("krlmmCoefficients"))      
ba1a7d73
 }
 
 
 predictKwithVGLM <- function(data, coe, verbose){
     if (verbose) message("Start predicting number of clusters")
     logit1 <- rep(0, nrow(data))
     logit2 <- coe[1]+coe[3]*data[,1]+coe[5]*data[,2]+coe[7]*data[,3]+coe[9]*data[,4]+coe[11]*data[,5]+coe[13]*data[,6]+coe[15]*data[,7]+coe[17]*data[,8]
     logit23 <- coe[2]+coe[4]*data[,1]+coe[6]*data[,2]+coe[8]*data[,3]+coe[10]*data[,4]+coe[12]*data[,5]+coe[14]*data[,6]+coe[16]*data[,7]+coe[18]*data[,8]
    
     logits <- cbind(logit1, logit2, logit23)
     rm(logit1)
     rm(logit2)
     rm(logit23)
     p.unscaled <- exp(logits)
     rm(logits)   
     p <- p.unscaled / rowSums(p.unscaled)
     clusterPrediction = apply(p, 1, function(x){which.max(x)})
     rm(p.unscaled)
     rm(p)
     if (verbose) message("Done predicting number of clusters")
     return(clusterPrediction)
 }
 
 #######################################################################################################
 
 assignCalls3Cluster <- function(intensities, priormeans){
     prior31 = c(priormeans[1]/2,priormeans[2],priormeans[3])
     prior32 = c(priormeans[1],priormeans[2],priormeans[3]/2)	
 
     emp <- rep(NA, length(intensities))
 	ansCalls <- emp
     tmp <- try(kmeans(intensities, priormeans, nstart=1),TRUE)
     if(class(tmp) == "try-error") {
         tmp1 <- try(kmeans(intensities, prior31, nstart=1),TRUE)
         if(class(tmp1) == "try-error"){
             tmp2 <- try(kmeans(intensities, prior32, nstart=1),TRUE)
             if(class(tmp2) == "try-error"){
                 ansCalls = emp
             }else{
                 ansCalls = tmp2$cluster
             }
         }else{
             ansCalls = tmp1$cluster
         }
     }else{
         ansCalls = tmp$cluster
     }
     rm(prior31, prior32)
     return(ansCalls)
  
 }
 
 #######################################################################################################
 
 assignCalls2Cluster <- function(intensities, priormeans){
     closest <- rep(NA, 3)	
     for(j in 1:3){
         distance <- as.vector(abs(priormeans[j]-intensities))
         closest[j] <- intensities[which.min(distance)]
     }
     prior2 <- priormeans[priormeans!=priormeans[which.max(abs(closest-priormeans))]]
 	
     emp <- rep(NA, length(intensities))
     ansCalls <- emp    
     tmp <- try(kmeans(intensities, prior2, nstart=1), TRUE)
     if(class(tmp) == "try-error") {
         ansCalls <- emp
     }else{
         if(prior2[1]==priormeans[2] && prior2[2]==priormeans[3]){
             mp <- abs(tmp$centers[1]-tmp$centers[2])
             pp <- abs(priormeans[2]-priormeans[3])
             if((mp/pp)<=0.25){
                 ansCalls <- emp
             }else{
                 ansCalls <- tmp$cluster+1
             }
         }
         if(prior2[1]==priormeans[1] && prior2[2]==priormeans[2]){
             mp=abs(tmp$centers[1]-tmp$centers[2])
             pp=abs(priormeans[1]-priormeans[2])
             if((mp/pp)<=0.25){
                 ansCalls = emp
             }else{
                 ansCalls <- tmp$cluster
             }
         }
         if(prior2[1]==priormeans[1] && prior2[2]==priormeans[3]){
             mp <- abs(tmp$centers[1]-tmp$centers[2])
             pp <- abs(priormeans[1]-priormeans[3])
             if ((mp/pp) <=0.25){
                 ansCalls <- emp
             }else{
                 ansCalls[tmp$cluster==1] <- 1
                 ansCalls[tmp$cluster==2] <- 3
             }
         }
     }
     rm(tmp)
     return(ansCalls)
 }
 
 #######################################################################################################
 
 assignCalls1Cluster <- function(intensities, priormeans){
     closest <- rep(NA, 3)
     for(j in 1:3){
         distance <- as.vector(abs(priormeans[j]-intensities))
         closest[j]=intensities[which.min(distance)]
     }
     clusterindex <- which.min(abs(closest-priormeans))
     return(rep(clusterindex, length(intensities)))
 }
   
 #######################################################################################################
 
 assignCallsOneSNP <- function(x, priormeans, numSample){
     tolerance = 1e-3
        
     k = x[numSample + 1]
     values = x[1:numSample]  
     
     if (abs(k - 2) <= tolerance) {
         tmp <- try(kmeans(values, priormeans, nstart=1),TRUE)
         if (!(class(tmp)=="try-error")) {
             k <- 3;
         }
     }
 
     successful <- FALSE;
     if (abs(k - 3) <= tolerance){
         SNPCalls <- assignCalls3Cluster(values, priormeans)
         successful <- !is.na(SNPCalls[1])
         if (!successful) { 
             k <- 2
         }
     }
 		
     if ( (abs(k - 2) <= tolerance) && (!successful)){
         SNPCalls <- assignCalls2Cluster(values, priormeans)
         successful <- !is.na(SNPCalls[1])
         if (!successful) { 
             k <- 1
         }			
     }
 
     if ( (abs(k - 1) <= tolerance) && (!successful)){
         SNPCalls <- assignCalls1Cluster(values, priormeans)
     }
     return(SNPCalls)
 }
 
 
 assignCalls <- function(callsGt, M, a, priormeans, numSNP, numSample, verbose, blockSize=500000){
     # process by block size of 500,000 by default
     message("Start assign calls")
        
     numBlock = ceiling(numSNP / blockSize)
 
     for (i in 1:numBlock){
         if (verbose) message(" -- Processing segment ", i, " out of ", numBlock)
         subsetM = as.matrix(M[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])
         thisnumSNP = nrow(subsetM)
         subseta = a[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP)]
         subseta = matrix(subseta, thisnumSNP, 1)
 
         testValues = cbind(subsetM, subseta)
 
         cl <- makeCluster(getNumberOfCores())
         callAns <- parRapply(cl, testValues, assignCallsOneSNP, priormeans, numSample)
         stopCluster(cl)
 
         callAnsAllValues = unlist(callAns)
 
         subsetcalls <- matrix(callAnsAllValues, thisnumSNP, numSample, byrow = TRUE)
         
         callsGt[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetcalls[1:thisnumSNP, ]
         rm(subsetM, subseta, subsetcalls)
     }
     message("Done assign calls")
 }
 
 #######################################################################################################
 
 hardyweinberg <- function(x, minN=10){
   if (length(x) < minN){
     return(NA) 
   } else {
     result = .Call("krlmmHardyweinberg", x)
     return(result)
808cfecc
   }
ba1a7d73
 }
 
 calculateParameters <- function(M, priormeans, numSNP, verbose) {
     if (verbose) message("Start calculating 3-clusters parameters")
     params3cluster <- calculateParameters3Cluster(M, priormeans, numSNP, verbose);
     if (verbose) message("Done calculating 3-cluster parameters")
 
     if (verbose) message("Start calculating 2-cluster parameters")
     params2cluster <- calculateParameters2Cluster(M, priormeans, numSNP, verbose);
     if (verbose) message("Done calculating 2-cluster parameters")
 
     if (verbose) message("Start calculating 1-cluster parameters")
     params1cluster <- calculateParameters1Cluster(M, priormeans, numSNP, verbose);    
     if (verbose) message("Done calculating 1-cluster parameters")
 
     parameters <- cbind(as.matrix(params1cluster$elemh), as.matrix(params1cluster$eless), as.matrix(params2cluster$elemh), as.matrix(params2cluster$eless),
                         as.matrix(params3cluster$elemh), as.matrix(params3cluster$eless), as.matrix(params2cluster$elehw), as.matrix(params3cluster$elehw));
     
     rownames(parameters) = rownames(M)
     rm(params3cluster)
     rm(params2cluster)
     rm(params1cluster)
     return(parameters)
 }
 
 
 calculateOneParams3Cluster <- function(x, priors, priorDown, priorUp){
     tmp = try(kmeans(x, priors, nstart=1), TRUE)
     if(class(tmp) == "kmeans") {
         flag = 1
      } else {
         tmp = try(kmeans(x, priorDown, nstart=1), TRUE)
         if(class(tmp) == "kmeans") {
             flag = 2
         } else {
             tmp = try(kmeans(x, priorUp, nstart=1), TRUE)
             if (class(tmp) == "kmeans") {
                 flag = 3
             } else {
                 tmp = kmeans(x, 3, nstart=35)
                 flag = 4
             }
         }  
     }
     ss = sum(tmp$withinss)
     hw = hardyweinberg(tmp$cluster) 
     centers = sort(tmp$centers, decreasing = TRUE)
 
     return(c(centers, ss, hw, flag))
 }
 
 calculateParameters3Cluster <- function(M, priormeans, numSNP, verbose) {
     Apriors = priormeans
     ApriorDown = c(Apriors[1]/2, Apriors[2], Apriors[3]) # shift-down
     ApriorUp = c(Apriors[1], Apriors[2], Apriors[3]/2) # shift-up
 
     cl <- makeCluster(getNumberOfCores())
     parAns <- parRapply(cl, M, calculateOneParams3Cluster, Apriors, ApriorDown, ApriorUp)
 
     stopCluster(cl)
     parAnsAllValues = unlist(parAns)
     parameters <- matrix(parAnsAllValues, numSNP, 6, byrow = TRUE)
    
     centers <- parameters[, 1:3]
     ss <- parameters[, 4]
     hw <- parameters[, 5]
     flag <- parameters[, 6]
 
     rm(parAns)
     rm(parameters)
    
     sigma=solve(cov(centers, use="complete.obs"))
     mh = calculateMahalDist3Cluster(centers, sigma, flag, Apriors, ApriorDown, ApriorUp, numSNP)
 
     rm(sigma)
     rm(centers)
 
     gc()
     return(list(eless = ss, elemh = mh, elehw = hw))
 }
 
 
 calculateMahalDist3Cluster <- function(centers, sigma, flag, priors, priorDown, priorUp, numSNP){
     mahaldist = rep(NA, numSNP)
 
     tolerance = 1e-3
     for (i in 1:numSNP) {
         if ((abs(flag[i] - 1) <= tolerance) || (abs(flag[i]- 4) <= tolerance)) difference = centers[i, ] - priors
         if (abs(flag[i] - 2) <= tolerance) difference = centers[i, ] - priorDown
         if (abs(flag[i] - 3) <= tolerance) difference = centers[i, ] - priorUp         
         mahaldist[i] = as.vector(difference)%*%sigma%*%as.vector(difference)
     }
     return(mahaldist)
 }
 
 
 calculateOneParams2Cluster <- function(x, priors){
   
     aa = rep(NA, 3)   
     for(j in 1:3){
         dist = as.vector(abs(priors[j]-x))
         aa[j]=x[which.min(dist)]
     } 
     prior2 = priors[priors!=priors[which.max(abs(aa-priors))]]
      
 
     tmp = try(kmeans(x, prior2, nstart=1), TRUE)
     if (class(tmp)=="kmeans") {
         centers = tmp$centers
         hw = hardyweinberg(tmp$cluster)
     }
     rm(tmp)
     tmp = kmeans(x, 2, nstart = 35)
     if (tmp$centers[1] < tmp$centers[2]){
         centers = c(tmp$centers[2], tmp$centers[1])
         hw = hardyweinberg(3 - tmp$cluster)   
     } else {
         centers = tmp$centers
         hw = hardyweinberg(tmp$cluster)
     }
     ss = sum(tmp$withinss)     
     return(c(centers, prior2, ss, hw))
 }
 
 
 calculateParameters2Cluster <- function(M, priormeans, numSNP, verbose) {
     Apriors = priormeans
    
     cl <- makeCluster(getNumberOfCores())
     parAns <- parRapply(cl, M, calculateOneParams2Cluster, Apriors)
     stopCluster(cl)
 
     parAnsAllValues = unlist(parAns)
     parameters <- matrix(parAnsAllValues, numSNP, 6, byrow = TRUE)
 
     centers <- parameters[, 1:2]
     priors2cluster <- parameters[, 3:4]
     ss <- parameters[, 5]
     hw <- parameters[, 6]
     
     rm(parAns)
     rm(parameters)
     sigma=solve(cov(centers, use="complete.obs"))
 
     mh = calculateMahalDist2Cluster(centers, sigma, priors2cluster, numSNP)
 
     rm(sigma)
     rm(centers)
 
     gc()
     return(list(eless = ss, elemh = mh, elehw = hw))
 }
 
 
 calculateMahalDist2Cluster <- function(centers, sigma, priors, numSNP){
     mahaldist = rep(NA, numSNP)
     
     for (i in 1:numSNP) {
         difference <- centers[i,] - priors[i,]
         mahaldist[i] = as.vector(difference)%*%sigma%*%as.vector(difference)        
     }
     return(mahaldist)
 }
 
 calculateOneParams1Cluster <- function(x, priors){
     center = mean(x)
     diff <- x - center
     diffsquare <- diff^2
     ss = sum(diffsquare)
 
     closestPrior = priors[which.min(abs(priors - center))]
 
     return(c(center, closestPrior, ss))
 }
 
 
 calculateParameters1Cluster <- function(M, priormeans, numSNP, verbose) {
     Apriors = priormeans
    
     cl <- makeCluster(getNumberOfCores())
     parAns <- parRapply(cl, M, calculateOneParams1Cluster, Apriors)
     stopCluster(cl)
 
     parAnsAllValues = unlist(parAns)
     parameters <- matrix(parAnsAllValues, numSNP, 3, byrow = TRUE)
 
     centers <- matrix(parameters[, 1], numSNP, 1)
     prior1cluster <- matrix(parameters[, 2], numSNP, 1)
     ss <- parameters[, 3]
     
     rm(parAns)
     rm(parameters)
     sigma=solve(cov(centers, use="complete.obs"))
 
     mh = calculateMahalDist1Cluster(centers, sigma, prior1cluster, numSNP)
 
     rm(sigma)
     rm(centers)
 
     gc()
     return(list(eless = ss, elemh = mh))
 }
 
 
 calculateMahalDist1Cluster <- function(centers, sigma, priors, numSNP){
     mahaldist = rep(NA, numSNP)
     
     for(i in 1:numSNP) {
         difference <- as.vector(centers[i, 1] - priors[i, 1])
 	mahaldist[i]=difference%*%sigma%*%difference
     }
     return(mahaldist)
 }
    
 #############################################
 
 
 computeCallPr <- function(callsPr, M, calls, numSNP, numSample, verbose, blockSize = 500000){
     # compute log-ratio in blocksize of 500,000 by default
     if (verbose) message("Start computing confidence score")
     
     numBlock = ceiling(numSNP / blockSize)
     for (i in 1:numBlock){
         if (verbose) message(" -- Processing segment ", i, " out of ", numBlock)
         subsetM = as.matrix(M[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])
         subsetCalls = as.matrix(calls[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])                     
         subsetCallProb = .Call("krlmmConfidenceScore", subsetM, subsetCalls, PACKAGE="crlmm");
         callsPr[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetCallProb[1:nrow(subsetM), ]
         rm(subsetM, subsetCalls, subsetCallProb)
     }
 
     if (verbose) message("Done computing confidence score")
 }
 
 #############################################
 
 krlmmImputeGender <- function(cnSet, pkgname, verbose){
     if (verbose) message("Start imputing gender")    
     S = computeAverageLogIntensity(cnSet, verbose) 
     loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname)
     YIndex <- getVarInEnv("YIndex")
     loader("annotation.rda", .crlmmPkgEnv, pkgname)
     annotation <- getVarInEnv("annot")
 
     A = A(cnSet)
     open(A)
     SNPNames = rownames(A)
     close(A)
 
     matchy = match(annotation[YIndex, 2], SNPNames)
     Sy = S[matchy,]
     rm(S)
     numYChrSNP = nrow(Sy)
 
     allS = matrix(NA, numYChrSNP, 2)
 
     for (i in 1:numYChrSNP) {
         tmp = kmeans(Sy[i,] ,2, nstart=45)
         allS[i,] = sort(tmp$centers, decreasing=F)
     }
     priorS = apply(allS, 2, FUN="median", na.rm=TRUE)
 
     group = rep(NA, ncol(Sy))
 
     meanmatrix = apply(Sy, 2, median)
 
     Sy1 = abs(meanmatrix-priorS[1])
     Sy2 = abs(meanmatrix-priorS[2])
 
     # output male - 1, female - 2, female S-values are smaller than male S-values. 
     test = cbind(Sy2, Sy1)
     group = apply(test, 1, which.min)
808cfecc
 
ba1a7d73
     if (verbose) message("Done imputing gender") 
     return(group)    
808cfecc
 }