Browse code

Changes to krlmm to avoid error that occurs in k-means clustering when all intensities are identical

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@88225 bc3139a8-67e5-0310-9ffc-ced21a209358

unknown authored on 01/04/2014 02:51:49
Showing4 changed files

... ...
@@ -2,8 +2,8 @@ Package: crlmm
2 2
 Type: Package
3 3
 Title: Genotype Calling (CRLMM) and Copy Number Analysis tool for
4 4
         Affymetrix SNP 5.0 and 6.0 and Illumina arrays.
5
-Version: 1.21.8
6
-Date: Fri Mar 28 16:26:10 EST 2014
5
+Version: 1.21.9
6
+Date: Mon 31 Mar 2014 18:08:54 EST
7 7
 Author: Benilton S Carvalho, Robert Scharpf, Matt Ritchie, Ingo
8 8
         Ruczinski, Rafael A Irizarry
9 9
 Maintainer: Benilton S Carvalho <Benilton.Carvalho@cancer.org.uk>,
... ...
@@ -324,7 +324,8 @@ RGtoXY = function(RG, chipType, verbose=TRUE) {
324 324
 	  "human1mduov3b",          # 1M Duo
325 325
 	  "humanomni1quadv1b",      # Omni1 quad
326 326
 	  "humanomni25quadv1b",     # Omni2.5 quad
327
-	  "humanomni258v1a",        # Omni2.5 8
327
+	  "humanomni258v1a",        # Omni2.5 8 v1 A
328
+          "humanomni258v1p1b",      # Omni2.5 8 v1.1 B
328 329
           "humanomni5quadv1b",      # Omni5 quad  
329 330
 	  "humanomniexpress12v1b",  # Omni express 12
330 331
 	  "humanimmuno12v1b",       # Immuno chip 12
... ...
@@ -1255,7 +1256,8 @@ genotype.Illumina <- function(sampleSheet=NULL,
1255 1256
 			      badSNP=0.7) {
1256 1257
         krlmm.supported = c("humanomni1quadv1b",      # Omni1 quad
1257 1258
 	                    "humanomni25quadv1b",     # Omni2.5 quad
1258
-	                    "humanomni258v1a",        # Omni2.5 8
1259
+	                    "humanomni258v1a",        # Omni2.5 8 v1 A
1260
+                            "humanomni258v1p1b",      # Omni2.5 8 v1.1 B
1259 1261
                             "humanomni5quadv1b")      # Omni5 quad
1260 1262
         crlmm.supported = c("human1mv1c",             # 1M
1261 1263
                             "human370v1c",            # 370CNV
... ...
@@ -1266,6 +1268,8 @@ genotype.Illumina <- function(sampleSheet=NULL,
1266 1268
 	                    "human550v3b",            # 550K
1267 1269
 	                    "human1mduov3b",          # 1M Duo
1268 1270
                             "humanomni1quadv1b",      # Omni1 quad
1271
+			    "humanomni258v1a",        # Omni2.5 8 v1 A
1272
+                            "humanomni258v1p1b",      # Omni2.5 8 v1.1 B
1269 1273
 	                    "humanomniexpress12v1b",  # Omni express 12
1270 1274
 	                    "humanimmuno12v1b",       # Immuno chip 12
1271 1275
                             "humancytosnp12v2p1h",    # CytoSNP 12
... ...
@@ -1,657 +1,778 @@
1
-krlmm <- function(cnSet, cdfName, gender=NULL, trueCalls=NULL, verbose=TRUE) {
2
-  
3
-    pkgname <- getCrlmmAnnotationName(cdfName)
4
-    
5
-### pre-processing, output M    
6
-    M = computeLogRatio(cnSet, verbose) 
7
-    numSNP = nrow(M)
8
-    numSample = ncol(M)
9
-    
10
-    callsGt = calls(cnSet)
11
-    callsPr = snpCallProbability(cnSet)
12
-    open(callsGt)
13
-    open(callsPr)
14
-
15
-    priormeans = calculatePriorValues(M, numSNP, verbose)
16
-    VGLMparameters = calculateParameters(M, priormeans, numSNP, verbose)
17
-
18
-### retrieve or calculate coefficients
19
-    krlmmCoefficients = getKrlmmVGLMCoefficients(pkgname, trueCalls, VGLMparameters, verbose, numSample, colnames(M))
20
-    
21
-### do VGLM fit, to predict k for each SNP
22
-    kPrediction <- predictKwithVGLM(VGLMparameters, krlmmCoefficients, verbose);
23
-    rm(VGLMparameters)
24
-    
25
-### assign calls
26
-    assignCalls(callsGt, M, kPrediction, priormeans, numSNP, numSample, verbose);
27
-    rm(kPrediction)
28
-
29
-### assign confidence scores
30
-    computeCallPr(callsPr, M, callsGt, numSNP, numSample, verbose)
31
-    close(callsGt)
32
-    close(callsPr)
33
-
34
-### impute gender if gender information not provided
35
-    if (is.null(gender)) {
36
-      gender = krlmmImputeGender(cnSet, pkgname, verbose)
37
-    }
38
-    open(cnSet$gender)
39
-    cnSet$gender[,] = gender
40
-    close(cnSet$gender)
41
-
42
-    TRUE
43
-}
44
-
45
-#######################################################################################################
46
-
47
-getNumberOfCores <- function(){
48
-    defaultCores = min(detectCores(), 8)
49
-    return(getOption("krlmm.cores", defaultCores))    
50
-}
51
-
52
-
53
-#######################################################################################################
54
-
55
-computeLogRatio <- function(cnSet, verbose, blockSize = 500000){
56
-    # compute log-ratio in blocksize of 500,000 by default
57
-    message("Start computing log ratio")
58
-    A <- A(cnSet)
59
-    B <- B(cnSet)
60
-    open(A)
61
-    open(B)
62
-    
63
-    numSNP <- nrow(A)
64
-    numSample <- ncol(A)
65
-    
66
-    M <- oligoClasses::initializeBigMatrix(name="M", numSNP, numSample, vmode = "double")
67
-    rownames(M) = rownames(A)
68
-    colnames(M) = colnames(A)
69
-  
70
-    numBlock = ceiling(numSNP / blockSize)
71
-    for (i in 1:numBlock){
72
-        if (verbose) message(" -- Processing segment ", i, " out of ", numBlock)
73
-        subsetA = as.matrix(A[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])
74
-        subsetB = as.matrix(B[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])                     
75
-        subsetM = .Call("krlmmComputeM", subsetA, subsetB, PACKAGE="crlmm")
76
-        M[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetM[1:nrow(subsetM), ]
77
-        rm(subsetA, subsetB, subsetM)
78
-    }
79
-    
80
-    close(A)
81
-    close(B)
82
-    if (verbose) message("Done computing log ratio")
83
-    return(M)
84
-}
85
-
86
-computeAverageLogIntensity <- function(cnSet, verbose, blockSize = 500000){
87
-    # compute average log intensity in blocksize of 500,000 by default
88
-    message("Start computing average log intensity")
89
-    A <- A(cnSet)
90
-    B <- B(cnSet)
91
-    open(A)
92
-    open(B)
93
-    
94
-    numSNP <- nrow(A)
95
-    numSample <- ncol(A)
96
-    
97
-    S <- oligoClasses::initializeBigMatrix(name="S", numSNP, numSample, vmode = "double")
98
-    
99
-    numBlock = ceiling(numSNP / blockSize)
100
-    for (i in 1:numBlock){
101
-        if (verbose) message(" -- Processing segment ", i, " out of ", numBlock)
102
-        subsetA = as.matrix(A[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])
103
-        subsetB = as.matrix(B[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])                     
104
-        subsetS = .Call("krlmmComputeS", subsetA, subsetB, PACKAGE="crlmm")
105
-        S[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetS[1:nrow(subsetS), ]
106
-        rm(subsetA, subsetB, subsetS)
107
-    }
108
-    
109
-    close(A)
110
-    close(B)
111
-    if (verbose) message("Done computing average log intensity")
112
-    return(S)
113
-}
114
-
115
-
116
-#######################################################################################################
117
-
118
-calculatePriorValues <- function(M, numSNP, verbose) {
119
-
120
-    calculateOneKmeans <- function(x) {
121
-        tmp = kmeans(x, 3, nstart=45)
122
-        return(sort(tmp$centers, decreasing = T))
123
-    }
124
-
125
-    if (verbose) message("Start calculating Prior Means")
126
-    cl <- makeCluster(getNumberOfCores())
127
-    centers <- parRapply(cl, M, calculateOneKmeans)
128
-    stopCluster(cl) 
129
-    centers <- matrix(centers, numSNP, 3, byrow = TRUE)
130
-    priormeans = apply(centers, 2, FUN="median", na.rm=TRUE)
131
-    if (verbose) message("Done calculating Prior Means")
132
-    return(priormeans)
133
-}
134
-
135
-#######################################################################################################
136
-
137
-calculateKrlmmCoefficients <- function(trueCalls, params, numSample, samplenames){
138
-    if (!require(VGAM)) {
139
-        message("VGAM package not found, fall back to use defined platform-specific coefficients")
140
-        return(NULL)
141
-    }
142
-    amatch = match(rownames(params), rownames(trueCalls))
143
-    amatch = amatch[!is.na(amatch)]
144
-    trueCalls = trueCalls[amatch,]
145
-
146
-    amatch = match(rownames(trueCalls), rownames(params))
147
-    params = params[amatch, ]
148
-    
149
-    amatch = match(samplenames, colnames(trueCalls))
150
-    amatch = amatch[!is.na(amatch)]    
151
-    trueCalls = trueCalls[, amatch]
152
-        
153
-    if ((numSample <= 40) && (ncol(trueCalls) < round(numSample/2))){
154
-        message("Sample size is ", numSample, ", KRLMM requires at least trueCalls of ", round(numSample/2), " samples to calculate coefficients")
155
-        return(NULL)
156
-    }
157
-    if ((numSample > 40) && (ncol(trueCalls) < 20)){
158
-        message("At least trueCalls of 20 samples are required to calculate coefficients")
159
-        return(NULL)
160
-    }
161
-    if (nrow(trueCalls) < 1000){
162
-        message("At lease trueCalls of 1000 SNPs are required to calculate coefficients")
163
-        return(NULL)
164
-    }
165
-
166
-    getna = apply(trueCalls, 1, function(x){sum(is.na(x))>=10})
167
-    truek = apply(trueCalls, 1, function(x){length(unique(x[!is.na(x)]))})
168
-   
169
-    params1 = params[!getna,]
170
-    truek1 = truek[!getna]
171
-
172
-    xx = data.frame(params1)
173
-    t = as.factor(as.numeric(truek1)) 
174
-    xx = data.frame(xx, t)
175
-    fit = suppressWarnings(VGAM::vglm(t~., multinomial(refLevel=1), xx))
176
-    coe = VGAM::coefficients(fit)
177
-    return(coe)    
178
-}
179
-
180
-getKrlmmVGLMCoefficients <- function(pkgname, trueCalls, params, verbose, numSample, samplenames){
181
-    if (!is.null(trueCalls)) {
182
-        coe = calculateKrlmmCoefficients(trueCalls, params, numSample, samplenames)
183
-        if (!is.null(coe)) {
184
-            if (verbose) message ("Done calculating platform-specific coefficients to predict number of clusters")
185
-            return(coe)
186
-        }
187
-    }
188
-    if (!is.null(trueCalls)) message("Fall back to use defined platform-specific coefficients")
189
-    if (verbose) message ("Retrieving defined platform-specific coefficients to predict number of clusters")
190
-    loader("krlmmVGLMCoefficients.rda", .crlmmPkgEnv, pkgname)
191
-    return(getVarInEnv("krlmmCoefficients"))      
192
-}
193
-
194
-
195
-predictKwithVGLM <- function(data, coe, verbose){
196
-    if (verbose) message("Start predicting number of clusters")
197
-    logit1 <- rep(0, nrow(data))
198
-    logit2 <- coe[1]+coe[3]*data[,1]+coe[5]*data[,2]+coe[7]*data[,3]+coe[9]*data[,4]+coe[11]*data[,5]+coe[13]*data[,6]+coe[15]*data[,7]+coe[17]*data[,8]
199
-    logit23 <- coe[2]+coe[4]*data[,1]+coe[6]*data[,2]+coe[8]*data[,3]+coe[10]*data[,4]+coe[12]*data[,5]+coe[14]*data[,6]+coe[16]*data[,7]+coe[18]*data[,8]
200
-   
201
-    logits <- cbind(logit1, logit2, logit23)
202
-    rm(logit1)
203
-    rm(logit2)
204
-    rm(logit23)
205
-    p.unscaled <- exp(logits)
206
-    rm(logits)   
207
-    p <- p.unscaled / rowSums(p.unscaled)
208
-    clusterPrediction = apply(p, 1, function(x){which.max(x)})
209
-    rm(p.unscaled)
210
-    rm(p)
211
-    if (verbose) message("Done predicting number of clusters")
212
-    return(clusterPrediction)
213
-}
214
-
215
-#######################################################################################################
216
-
217
-assignCalls3Cluster <- function(intensities, priormeans){
218
-    prior31 = c(priormeans[1]/2,priormeans[2],priormeans[3])
219
-    prior32 = c(priormeans[1],priormeans[2],priormeans[3]/2)	
220
-
221
-    emp <- rep(NA, length(intensities))
222
-	ansCalls <- emp
223
-    tmp <- try(kmeans(intensities, priormeans, nstart=1),TRUE)
224
-    if(class(tmp) == "try-error") {
225
-        tmp1 <- try(kmeans(intensities, prior31, nstart=1),TRUE)
226
-        if(class(tmp1) == "try-error"){
227
-            tmp2 <- try(kmeans(intensities, prior32, nstart=1),TRUE)
228
-            if(class(tmp2) == "try-error"){
229
-                ansCalls = emp
230
-            }else{
231
-                ansCalls = tmp2$cluster
232
-            }
233
-        }else{
234
-            ansCalls = tmp1$cluster
235
-        }
236
-    }else{
237
-        ansCalls = tmp$cluster
238
-    }
239
-    rm(prior31, prior32)
240
-    return(ansCalls)
241
- 
242
-}
243
-
244
-#######################################################################################################
245
-
246
-assignCalls2Cluster <- function(intensities, priormeans){
247
-    closest <- rep(NA, 3)	
248
-    for(j in 1:3){
249
-        distance <- as.vector(abs(priormeans[j]-intensities))
250
-        closest[j] <- intensities[which.min(distance)]
251
-    }
252
-    prior2 <- priormeans[priormeans!=priormeans[which.max(abs(closest-priormeans))]]
253
-	
254
-    emp <- rep(NA, length(intensities))
255
-    ansCalls <- emp    
256
-    tmp <- try(kmeans(intensities, prior2, nstart=1), TRUE)
257
-    if(class(tmp) == "try-error") {
258
-        ansCalls <- emp
259
-    }else{
260
-        if(prior2[1]==priormeans[2] && prior2[2]==priormeans[3]){
261
-            mp <- abs(tmp$centers[1]-tmp$centers[2])
262
-            pp <- abs(priormeans[2]-priormeans[3])
263
-            if((mp/pp)<=0.25){
264
-                ansCalls <- emp
265
-            }else{
266
-                ansCalls <- tmp$cluster+1
267
-            }
268
-        }
269
-        if(prior2[1]==priormeans[1] && prior2[2]==priormeans[2]){
270
-            mp=abs(tmp$centers[1]-tmp$centers[2])
271
-            pp=abs(priormeans[1]-priormeans[2])
272
-            if((mp/pp)<=0.25){
273
-                ansCalls = emp
274
-            }else{
275
-                ansCalls <- tmp$cluster
276
-            }
277
-        }
278
-        if(prior2[1]==priormeans[1] && prior2[2]==priormeans[3]){
279
-            mp <- abs(tmp$centers[1]-tmp$centers[2])
280
-            pp <- abs(priormeans[1]-priormeans[3])
281
-            if ((mp/pp) <=0.25){
282
-                ansCalls <- emp
283
-            }else{
284
-                ansCalls[tmp$cluster==1] <- 1
285
-                ansCalls[tmp$cluster==2] <- 3
286
-            }
287
-        }
288
-    }
289
-    rm(tmp)
290
-    return(ansCalls)
291
-}
292
-
293
-#######################################################################################################
294
-
295
-assignCalls1Cluster <- function(intensities, priormeans){
296
-    closest <- rep(NA, 3)
297
-    for(j in 1:3){
298
-        distance <- as.vector(abs(priormeans[j]-intensities))
299
-        closest[j]=intensities[which.min(distance)]
300
-    }
301
-    clusterindex <- which.min(abs(closest-priormeans))
302
-    return(rep(clusterindex, length(intensities)))
303
-}
304
-  
305
-#######################################################################################################
306
-
307
-assignCallsOneSNP <- function(x, priormeans, numSample){
308
-    tolerance = 1e-3
309
-       
310
-    k = x[numSample + 1]
311
-    values = x[1:numSample]  
312
-    
313
-    if (abs(k - 2) <= tolerance) {
314
-        tmp <- try(kmeans(values, priormeans, nstart=1),TRUE)
315
-        if (!(class(tmp)=="try-error")) {
316
-            k <- 3;
317
-        }
318
-    }
319
-
320
-    successful <- FALSE;
321
-    if (abs(k - 3) <= tolerance){
322
-        SNPCalls <- assignCalls3Cluster(values, priormeans)
323
-        successful <- !is.na(SNPCalls[1])
324
-        if (!successful) { 
325
-            k <- 2
326
-        }
327
-    }
328
-		
329
-    if ( (abs(k - 2) <= tolerance) && (!successful)){
330
-        SNPCalls <- assignCalls2Cluster(values, priormeans)
331
-        successful <- !is.na(SNPCalls[1])
332
-        if (!successful) { 
333
-            k <- 1
334
-        }			
335
-    }
336
-
337
-    if ( (abs(k - 1) <= tolerance) && (!successful)){
338
-        SNPCalls <- assignCalls1Cluster(values, priormeans)
339
-    }
340
-    return(SNPCalls)
341
-}
342
-
343
-
344
-assignCalls <- function(callsGt, M, a, priormeans, numSNP, numSample, verbose, blockSize=500000){
345
-    # process by block size of 500,000 by default
346
-    message("Start assign calls")
347
-       
348
-    numBlock = ceiling(numSNP / blockSize)
349
-
350
-    for (i in 1:numBlock){
351
-        if (verbose) message(" -- Processing segment ", i, " out of ", numBlock)
352
-        subsetM = as.matrix(M[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])
353
-        thisnumSNP = nrow(subsetM)
354
-        subseta = a[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP)]
355
-        subseta = matrix(subseta, thisnumSNP, 1)
356
-
357
-        testValues = cbind(subsetM, subseta)
358
-
359
-        cl <- makeCluster(getNumberOfCores())
360
-        callAns <- parRapply(cl, testValues, assignCallsOneSNP, priormeans, numSample)
361
-        stopCluster(cl)
362
-
363
-        callAnsAllValues = unlist(callAns)
364
-
365
-        subsetcalls <- matrix(callAnsAllValues, thisnumSNP, numSample, byrow = TRUE)
366
-        
367
-        callsGt[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetcalls[1:thisnumSNP, ]
368
-        rm(subsetM, subseta, subsetcalls)
369
-    }
370
-    message("Done assign calls")
371
-}
372
-
373
-#######################################################################################################
374
-
375
-hardyweinberg <- function(x, minN=10){
376
-  if (length(x) < minN){
377
-    return(NA) 
378
-  } else {
379
-    result = .Call("krlmmHardyweinberg", x)
380
-    return(result)
381
-  }
382
-}
383
-
384
-calculateParameters <- function(M, priormeans, numSNP, verbose) {
385
-    if (verbose) message("Start calculating 3-clusters parameters")
386
-    params3cluster <- calculateParameters3Cluster(M, priormeans, numSNP, verbose);
387
-    if (verbose) message("Done calculating 3-cluster parameters")
388
-
389
-    if (verbose) message("Start calculating 2-cluster parameters")
390
-    params2cluster <- calculateParameters2Cluster(M, priormeans, numSNP, verbose);
391
-    if (verbose) message("Done calculating 2-cluster parameters")
392
-
393
-    if (verbose) message("Start calculating 1-cluster parameters")
394
-    params1cluster <- calculateParameters1Cluster(M, priormeans, numSNP, verbose);    
395
-    if (verbose) message("Done calculating 1-cluster parameters")
396
-
397
-    parameters <- cbind(as.matrix(params1cluster$elemh), as.matrix(params1cluster$eless), as.matrix(params2cluster$elemh), as.matrix(params2cluster$eless),
398
-                        as.matrix(params3cluster$elemh), as.matrix(params3cluster$eless), as.matrix(params2cluster$elehw), as.matrix(params3cluster$elehw));
399
-    
400
-    rownames(parameters) = rownames(M)
401
-    rm(params3cluster)
402
-    rm(params2cluster)
403
-    rm(params1cluster)
404
-    return(parameters)
405
-}
406
-
407
-
408
-calculateOneParams3Cluster <- function(x, priors, priorDown, priorUp){
409
-    tmp = try(kmeans(x, priors, nstart=1), TRUE)
410
-    if(class(tmp) == "kmeans") {
411
-        flag = 1
412
-     } else {
413
-        tmp = try(kmeans(x, priorDown, nstart=1), TRUE)
414
-        if(class(tmp) == "kmeans") {
415
-            flag = 2
416
-        } else {
417
-            tmp = try(kmeans(x, priorUp, nstart=1), TRUE)
418
-            if (class(tmp) == "kmeans") {
419
-                flag = 3
420
-            } else {
421
-                tmp = kmeans(x, 3, nstart=35)
422
-                flag = 4
423
-            }
424
-        }  
425
-    }
426
-    ss = sum(tmp$withinss)
427
-    hw = hardyweinberg(tmp$cluster) 
428
-    centers = sort(tmp$centers, decreasing = TRUE)
429
-
430
-    return(c(centers, ss, hw, flag))
431
-}
432
-
433
-calculateParameters3Cluster <- function(M, priormeans, numSNP, verbose) {
434
-    Apriors = priormeans
435
-    ApriorDown = c(Apriors[1]/2, Apriors[2], Apriors[3]) # shift-down
436
-    ApriorUp = c(Apriors[1], Apriors[2], Apriors[3]/2) # shift-up
437
-
438
-    cl <- makeCluster(getNumberOfCores())
439
-    parAns <- parRapply(cl, M, calculateOneParams3Cluster, Apriors, ApriorDown, ApriorUp)
440
-
441
-    stopCluster(cl)
442
-    parAnsAllValues = unlist(parAns)
443
-    parameters <- matrix(parAnsAllValues, numSNP, 6, byrow = TRUE)
444
-   
445
-    centers <- parameters[, 1:3]
446
-    ss <- parameters[, 4]
447
-    hw <- parameters[, 5]
448
-    flag <- parameters[, 6]
449
-
450
-    rm(parAns)
451
-    rm(parameters)
452
-   
453
-    sigma=solve(cov(centers, use="complete.obs"))
454
-    mh = calculateMahalDist3Cluster(centers, sigma, flag, Apriors, ApriorDown, ApriorUp, numSNP)
455
-
456
-    rm(sigma)
457
-    rm(centers)
458
-
459
-    gc()
460
-    return(list(eless = ss, elemh = mh, elehw = hw))
461
-}
462
-
463
-
464
-calculateMahalDist3Cluster <- function(centers, sigma, flag, priors, priorDown, priorUp, numSNP){
465
-    mahaldist = rep(NA, numSNP)
466
-
467
-    tolerance = 1e-3
468
-    for (i in 1:numSNP) {
469
-        if ((abs(flag[i] - 1) <= tolerance) || (abs(flag[i]- 4) <= tolerance)) difference = centers[i, ] - priors
470
-        if (abs(flag[i] - 2) <= tolerance) difference = centers[i, ] - priorDown
471
-        if (abs(flag[i] - 3) <= tolerance) difference = centers[i, ] - priorUp         
472
-        mahaldist[i] = as.vector(difference)%*%sigma%*%as.vector(difference)
473
-    }
474
-    return(mahaldist)
475
-}
476
-
477
-
478
-calculateOneParams2Cluster <- function(x, priors){
479
-  
480
-    aa = rep(NA, 3)   
481
-    for(j in 1:3){
482
-        dist = as.vector(abs(priors[j]-x))
483
-        aa[j]=x[which.min(dist)]
484
-    } 
485
-    prior2 = priors[priors!=priors[which.max(abs(aa-priors))]]
486
-     
487
-
488
-    tmp = try(kmeans(x, prior2, nstart=1), TRUE)
489
-    if (class(tmp)=="kmeans") {
490
-        centers = tmp$centers
491
-        hw = hardyweinberg(tmp$cluster)
492
-    }
493
-    rm(tmp)
494
-    tmp = kmeans(x, 2, nstart = 35)
495
-    if (tmp$centers[1] < tmp$centers[2]){
496
-        centers = c(tmp$centers[2], tmp$centers[1])
497
-        hw = hardyweinberg(3 - tmp$cluster)   
498
-    } else {
499
-        centers = tmp$centers
500
-        hw = hardyweinberg(tmp$cluster)
501
-    }
502
-    ss = sum(tmp$withinss)     
503
-    return(c(centers, prior2, ss, hw))
504
-}
505
-
506
-
507
-calculateParameters2Cluster <- function(M, priormeans, numSNP, verbose) {
508
-    Apriors = priormeans
509
-   
510
-    cl <- makeCluster(getNumberOfCores())
511
-    parAns <- parRapply(cl, M, calculateOneParams2Cluster, Apriors)
512
-    stopCluster(cl)
513
-
514
-    parAnsAllValues = unlist(parAns)
515
-    parameters <- matrix(parAnsAllValues, numSNP, 6, byrow = TRUE)
516
-
517
-    centers <- parameters[, 1:2]
518
-    priors2cluster <- parameters[, 3:4]
519
-    ss <- parameters[, 5]
520
-    hw <- parameters[, 6]
521
-    
522
-    rm(parAns)
523
-    rm(parameters)
524
-    sigma=solve(cov(centers, use="complete.obs"))
525
-
526
-    mh = calculateMahalDist2Cluster(centers, sigma, priors2cluster, numSNP)
527
-
528
-    rm(sigma)
529
-    rm(centers)
530
-
531
-    gc()
532
-    return(list(eless = ss, elemh = mh, elehw = hw))
533
-}
534
-
535
-
536
-calculateMahalDist2Cluster <- function(centers, sigma, priors, numSNP){
537
-    mahaldist = rep(NA, numSNP)
538
-    
539
-    for (i in 1:numSNP) {
540
-        difference <- centers[i,] - priors[i,]
541
-        mahaldist[i] = as.vector(difference)%*%sigma%*%as.vector(difference)        
542
-    }
543
-    return(mahaldist)
544
-}
545
-
546
-calculateOneParams1Cluster <- function(x, priors){
547
-    center = mean(x)
548
-    diff <- x - center
549
-    diffsquare <- diff^2
550
-    ss = sum(diffsquare)
551
-
552
-    closestPrior = priors[which.min(abs(priors - center))]
553
-
554
-    return(c(center, closestPrior, ss))
555
-}
556
-
557
-
558
-calculateParameters1Cluster <- function(M, priormeans, numSNP, verbose) {
559
-    Apriors = priormeans
560
-   
561
-    cl <- makeCluster(getNumberOfCores())
562
-    parAns <- parRapply(cl, M, calculateOneParams1Cluster, Apriors)
563
-    stopCluster(cl)
564
-
565
-    parAnsAllValues = unlist(parAns)
566
-    parameters <- matrix(parAnsAllValues, numSNP, 3, byrow = TRUE)
567
-
568
-    centers <- matrix(parameters[, 1], numSNP, 1)
569
-    prior1cluster <- matrix(parameters[, 2], numSNP, 1)
570
-    ss <- parameters[, 3]
571
-    
572
-    rm(parAns)
573
-    rm(parameters)
574
-    sigma=solve(cov(centers, use="complete.obs"))
575
-
576
-    mh = calculateMahalDist1Cluster(centers, sigma, prior1cluster, numSNP)
577
-
578
-    rm(sigma)
579
-    rm(centers)
580
-
581
-    gc()
582
-    return(list(eless = ss, elemh = mh))
583
-}
584
-
585
-
586
-calculateMahalDist1Cluster <- function(centers, sigma, priors, numSNP){
587
-    mahaldist = rep(NA, numSNP)
588
-    
589
-    for(i in 1:numSNP) {
590
-        difference <- as.vector(centers[i, 1] - priors[i, 1])
591
-	mahaldist[i]=difference%*%sigma%*%difference
592
-    }
593
-    return(mahaldist)
594
-}
595
-   
596
-#############################################
597
-
598
-
599
-computeCallPr <- function(callsPr, M, calls, numSNP, numSample, verbose, blockSize = 500000){
600
-    # compute log-ratio in blocksize of 500,000 by default
601
-    if (verbose) message("Start computing confidence score")
602
-    
603
-    numBlock = ceiling(numSNP / blockSize)
604
-    for (i in 1:numBlock){
605
-        if (verbose) message(" -- Processing segment ", i, " out of ", numBlock)
606
-        subsetM = as.matrix(M[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])
607
-        subsetCalls = as.matrix(calls[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])                     
608
-        subsetCallProb = .Call("krlmmConfidenceScore", subsetM, subsetCalls, PACKAGE="crlmm");
609
-        callsPr[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetCallProb[1:nrow(subsetM), ]
610
-        rm(subsetM, subsetCalls, subsetCallProb)
611
-    }
612
-
613
-    if (verbose) message("Done computing confidence score")
614
-}
615
-
616
-#############################################
617
-
618
-krlmmImputeGender <- function(cnSet, pkgname, verbose){
619
-    if (verbose) message("Start imputing gender")    
620
-    S = computeAverageLogIntensity(cnSet, verbose) 
621
-    loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname)
622
-    YIndex <- getVarInEnv("YIndex")
623
-    loader("annotation.rda", .crlmmPkgEnv, pkgname)
624
-    annotation <- getVarInEnv("annot")
625
-
626
-    A = A(cnSet)
627
-    open(A)
628
-    SNPNames = rownames(A)
629
-    close(A)
630
-
631
-    matchy = match(annotation[YIndex, 2], SNPNames)
632
-    Sy = S[matchy,]
633
-    rm(S)
634
-    numYChrSNP = nrow(Sy)
635
-
636
-    allS = matrix(NA, numYChrSNP, 2)
637
-
638
-    for (i in 1:numYChrSNP) {
639
-        tmp = kmeans(Sy[i,] ,2, nstart=45)
640
-        allS[i,] = sort(tmp$centers, decreasing=F)
641
-    }
642
-    priorS = apply(allS, 2, FUN="median", na.rm=TRUE)
643
-
644
-    group = rep(NA, ncol(Sy))
645
-
646
-    meanmatrix = apply(Sy, 2, median)
647
-
648
-    Sy1 = abs(meanmatrix-priorS[1])
649
-    Sy2 = abs(meanmatrix-priorS[2])
650
-
651
-    # output male - 1, female - 2, female S-values are smaller than male S-values. 
652
-    test = cbind(Sy2, Sy1)
653
-    group = apply(test, 1, which.min)
654
-
655
-    if (verbose) message("Done imputing gender") 
656
-    return(group)    
657
-}
1
+krlmm <- function(cnSet, cdfName, gender=NULL, trueCalls=NULL, verbose=TRUE) {
2
+  
3
+    pkgname <- getCrlmmAnnotationName(cdfName) # crlmm:::
4
+    
5
+### pre-processing, output M    
6
+    M = computeLogRatio(cnSet, verbose)
7
+    message("leaving out novariant SNPs")
8
+    
9
+    # For SNPs with less than 3 distinct data point, exclude them from downstream analysis
10
+    uniqueCount = apply(M[,], 1, function(x){length(unique(x))})
11
+    SNPtoProcessIndex = uniqueCount >= 3
12
+    noVariantSNPIndex = uniqueCount < 3
13
+    M = M[SNPtoProcessIndex, ]
14
+
15
+    numSNP = nrow(M)
16
+    numSample = ncol(M)
17
+    
18
+    calls = oligoClasses::initializeBigMatrix(name="calls", numSNP, numSample, vmode = "integer")
19
+    scores = oligoClasses::initializeBigMatrix(name="scores", numSNP, numSample, vmode = "double")
20
+    open(calls)
21
+    open(scores)
22
+
23
+    rownames(calls) = rownames(M)
24
+    rownames(scores) = rownames(M)
25
+    colnames(calls) = colnames(M)
26
+    colnames(scores) = colnames(M)  
27
+    
28
+    priormeans = calculatePriorValues(M, numSNP, verbose)
29
+    VGLMparameters = calculateParameters(M, priormeans, numSNP, verbose)
30
+  
31
+### retrieve or calculate coefficients
32
+    krlmmCoefficients = getKrlmmVGLMCoefficients(pkgname, trueCalls, VGLMparameters, verbose, numSample, colnames(M))
33
+    
34
+### do VGLM fit, to predict k for each SNP
35
+    kPrediction <- predictKwithVGLM(VGLMparameters, krlmmCoefficients, verbose);
36
+    rm(VGLMparameters)
37
+    
38
+### assign calls
39
+    assignCalls(calls, M, kPrediction, priormeans, numSNP, numSample, verbose);
40
+    rm(kPrediction)
41
+
42
+### assign confidence scores
43
+    computeCallPr(scores, M, calls, numSNP, numSample, verbose)
44
+    
45
+### add back SNPs excluded before
46
+    AddBackNoVarianceSNPs(cnSet, calls, scores, numSNP, numSample, SNPtoProcessIndex, noVariantSNPIndex)
47
+
48
+    loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname)
49
+    YIndex <- getVarInEnv("YIndex")
50
+    loader("annotation.rda", .crlmmPkgEnv, pkgname)
51
+    annotation <- getVarInEnv("annot")
52
+    
53
+### impute gender if gender information not provided
54
+    if (is.null(gender)) {
55
+        gender = krlmmImputeGender(cnSet, annotation, YIndex, verbose)
56
+    }
57
+
58
+### double-check ChrY SNP cluster, impute gender if gender information not provided
59
+    if (!(is.null(gender))) {
60
+        verifyChrYSNPs(cnSet, M, calls, gender, annotation, YIndex, priormeans, verbose)
61
+    }
62
+
63
+    close(calls)
64
+    close(scores)
65
+    rm(calls)
66
+    rm(scores)
67
+    rm(M)
68
+    rm(annotation)
69
+    rm(YIndex)
70
+    TRUE
71
+}
72
+
73
+#######################################################################################################
74
+
75
+getNumberOfCores <- function(){
76
+    defaultCores = min(detectCores(), 8)
77
+    return(getOption("krlmm.cores", defaultCores))    
78
+}
79
+
80
+
81
+#######################################################################################################
82
+
83
+computeLogRatio <- function(cnSet, verbose, blockSize = 500000){
84
+    # compute log-ratio in blocksize of 500,000 by default
85
+    message("Start computing log ratio")
86
+    A <- A(cnSet)
87
+    B <- B(cnSet)
88
+    open(A)
89
+    open(B)
90
+    
91
+    numSNP <- nrow(A)
92
+    numSample <- ncol(A)
93
+    
94
+    M <- oligoClasses::initializeBigMatrix(name="M", numSNP, numSample, vmode = "double")
95
+    rownames(M) = rownames(A)
96
+    colnames(M) = colnames(A)
97
+  
98
+    numBlock = ceiling(numSNP / blockSize)
99
+    for (i in 1:numBlock){
100
+        if (verbose) message(" -- Processing segment ", i, " out of ", numBlock)
101
+        subsetA = as.matrix(A[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])
102
+        subsetB = as.matrix(B[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])                     
103
+        subsetM = .Call("krlmmComputeM", subsetA, subsetB, PACKAGE="crlmm")
104
+        M[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetM[1:nrow(subsetM), ]
105
+        rm(subsetA, subsetB, subsetM)
106
+    }
107
+    
108
+    close(A)
109
+    close(B)
110
+    if (verbose) message("Done computing log ratio")
111
+    return(M)    
112
+}
113
+
114
+computeAverageLogIntensity <- function(cnSet, verbose, blockSize = 500000){
115
+    # compute average log intensity in blocksize of 500,000 by default
116
+    message("Start computing average log intensity")
117
+    A <- A(cnSet)
118
+    B <- B(cnSet)
119
+    open(A)
120
+    open(B)
121
+    
122
+    numSNP <- nrow(A)
123
+    numSample <- ncol(A)
124
+    
125
+    S <- oligoClasses::initializeBigMatrix(name="S", numSNP, numSample, vmode = "double")
126
+    rownames(S) = rownames(A)
127
+    colnames(S) = colnames(A)
128
+      
129
+    numBlock = ceiling(numSNP / blockSize)
130
+    for (i in 1:numBlock){
131
+        if (verbose) message(" -- Processing segment ", i, " out of ", numBlock)
132
+        subsetA = as.matrix(A[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])
133
+        subsetB = as.matrix(B[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])                     
134
+        subsetS = .Call("krlmmComputeS", subsetA, subsetB, PACKAGE="crlmm")
135
+        S[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetS[1:nrow(subsetS), ]
136
+        rm(subsetA, subsetB, subsetS)
137
+    }
138
+    
139
+    close(A)
140
+    close(B)
141
+    if (verbose) message("Done computing average log intensity")
142
+    return(S)
143
+  
144
+}
145
+
146
+
147
+#######################################################################################################
148
+
149
+calculatePriorValues <- function(M, numSNP, verbose) {
150
+
151
+    calculateOneKmeans <- function(x) {
152
+        tmp = kmeans(x, 3, nstart=45)
153
+        return(sort(tmp$centers, decreasing = T))
154
+    }
155
+
156
+    if (verbose) message("Start calculating Prior Means")
157
+    cl <- makeCluster(getNumberOfCores())
158
+    centers <- parRapply(cl, M, calculateOneKmeans)
159
+    stopCluster(cl) 
160
+    centers <- matrix(centers, numSNP, 3, byrow = TRUE)
161
+    priormeans = apply(centers, 2, FUN="median", na.rm=TRUE)
162
+    if (verbose) message("Done calculating Prior Means")
163
+    return(priormeans)
164
+}
165
+
166
+#######################################################################################################
167
+
168
+calculateKrlmmCoefficients <- function(trueCalls, params, numSample, samplenames){
169
+#    if (!require(VGAM)) {
170
+#        message("VGAM package not found, fall back to use defined platform-specific coefficients")
171
+#        return(NULL)
172
+#    }
173
+    amatch = match(rownames(params), rownames(trueCalls))
174
+    amatch = amatch[!is.na(amatch)]
175
+    trueCalls = trueCalls[amatch,]
176
+
177
+    amatch = match(rownames(trueCalls), rownames(params))
178
+    params = params[amatch, ]
179
+    
180
+    amatch = match(samplenames, colnames(trueCalls))
181
+    amatch = amatch[!is.na(amatch)]    
182
+    trueCalls = trueCalls[, amatch]
183
+        
184
+    if ((numSample <= 40) && (ncol(trueCalls) < round(numSample/2))){
185
+        message("Sample size is ", numSample, ", KRLMM requires at least trueCalls of ", round(numSample/2), " samples to calculate coefficients")
186
+        return(NULL)
187
+    }
188
+    if ((numSample > 40) && (ncol(trueCalls) < 20)){
189
+        message("At least trueCalls of 20 samples are required to calculate coefficients")
190
+        return(NULL)
191
+    }
192
+    if (nrow(trueCalls) < 1000){
193
+        message("At lease trueCalls of 1000 SNPs are required to calculate coefficients")
194
+        return(NULL)
195
+    }
196
+
197
+    getna = apply(trueCalls, 1, function(x){sum(is.na(x))>=10})
198
+    truek = apply(trueCalls, 1, function(x){length(unique(x[!is.na(x)]))})
199
+   
200
+    params1 = params[!getna,]
201
+    truek1 = truek[!getna]
202
+
203
+    xx = data.frame(params1)
204
+    t = as.factor(as.numeric(truek1)) 
205
+    xx = data.frame(xx, t)
206
+    fit = suppressWarnings(vglm(t~., multinomial(refLevel=1), xx))
207
+    coe = coefficients(fit) # VGAM::
208
+    return(coe)    
209
+}
210
+
211
+getKrlmmVGLMCoefficients <- function(pkgname, trueCalls, params, verbose, numSample, samplenames){
212
+    if (!is.null(trueCalls)) {
213
+        coe = calculateKrlmmCoefficients(trueCalls, params, numSample, samplenames)
214
+        if (!is.null(coe)) {
215
+            if (verbose) message ("Done calculating platform-specific coefficients to predict number of clusters")
216
+            return(coe)
217
+        }
218
+    }
219
+    if (!is.null(trueCalls)) message("Fall back to use defined platform-specific coefficients")
220
+    if (verbose) message ("Retrieving defined platform-specific coefficients to predict number of clusters")
221
+    loader("krlmmVGLMCoefficients.rda", .crlmmPkgEnv, pkgname)
222
+    return(getVarInEnv("krlmmCoefficients"))      
223
+}
224
+
225
+
226
+predictKwithVGLM <- function(data, coe, verbose){
227
+    if (verbose) message("Start predicting number of clusters")
228
+    logit1 <- rep(0, nrow(data))
229
+    logit2 <- coe[1]+coe[3]*data[,1]+coe[5]*data[,2]+coe[7]*data[,3]+coe[9]*data[,4]+coe[11]*data[,5]+coe[13]*data[,6]+coe[15]*data[,7]+coe[17]*data[,8]
230
+    logit23 <- coe[2]+coe[4]*data[,1]+coe[6]*data[,2]+coe[8]*data[,3]+coe[10]*data[,4]+coe[12]*data[,5]+coe[14]*data[,6]+coe[16]*data[,7]+coe[18]*data[,8]
231
+   
232
+    logits <- cbind(logit1, logit2, logit23)
233
+    rm(logit1)
234
+    rm(logit2)
235
+    rm(logit23)
236
+    p.unscaled <- exp(logits)
237
+    rm(logits)   
238
+    p <- p.unscaled / rowSums(p.unscaled)
239
+    clusterPrediction = apply(p, 1, function(x){which.max(x)})
240
+    rm(p.unscaled)
241
+    rm(p)
242
+    if (verbose) message("Done predicting number of clusters")
243
+    return(clusterPrediction)
244
+}
245
+
246
+#######################################################################################################
247
+
248
+assignCalls3Cluster <- function(intensities, priormeans){
249
+    prior31 = c(priormeans[1]/2,priormeans[2],priormeans[3])
250
+    prior32 = c(priormeans[1],priormeans[2],priormeans[3]/2)	
251
+
252
+    emp <- rep(NA, length(intensities))
253
+	ansCalls <- emp
254
+    tmp <- try(kmeans(intensities, priormeans, nstart=1),TRUE)
255
+    if(class(tmp) == "try-error") {
256
+        tmp1 <- try(kmeans(intensities, prior31, nstart=1),TRUE)
257
+        if(class(tmp1) == "try-error"){
258
+            tmp2 <- try(kmeans(intensities, prior32, nstart=1),TRUE)
259
+            if(class(tmp2) == "try-error"){
260
+                ansCalls = emp
261
+            }else{
262
+                ansCalls = tmp2$cluster
263
+            }
264
+        }else{
265
+            ansCalls = tmp1$cluster
266
+        }
267
+    }else{
268
+        ansCalls = tmp$cluster
269
+    }
270
+    rm(prior31, prior32)
271
+    return(ansCalls)
272
+ 
273
+}
274
+
275
+#######################################################################################################
276
+
277
+assignCalls2Cluster <- function(intensities, priormeans){
278
+    closest <- rep(NA, 3)	
279
+    for(j in 1:3){
280
+        distance <- as.vector(abs(priormeans[j]-intensities))
281
+        closest[j] <- intensities[which.min(distance)]
282
+    }
283
+    prior2 <- priormeans[priormeans!=priormeans[which.max(abs(closest-priormeans))]]
284
+	
285
+    emp <- rep(NA, length(intensities))
286
+    ansCalls <- emp    
287
+    tmp <- try(kmeans(intensities, prior2, nstart=1), TRUE)
288
+    if(class(tmp) == "try-error") {
289
+        ansCalls <- emp
290
+    }else{
291
+        if(prior2[1]==priormeans[2] && prior2[2]==priormeans[3]){
292
+            mp <- abs(tmp$centers[1]-tmp$centers[2])
293
+            pp <- abs(priormeans[2]-priormeans[3])
294
+            if((mp/pp)<=0.25){
295
+                ansCalls <- emp
296
+            }else{
297
+                ansCalls <- tmp$cluster+1
298
+            }
299
+        }
300
+        if(prior2[1]==priormeans[1] && prior2[2]==priormeans[2]){
301
+            mp=abs(tmp$centers[1]-tmp$centers[2])
302
+            pp=abs(priormeans[1]-priormeans[2])
303
+            if((mp/pp)<=0.25){
304
+                ansCalls = emp
305
+            }else{
306
+                ansCalls <- tmp$cluster
307
+            }
308
+        }
309
+        if(prior2[1]==priormeans[1] && prior2[2]==priormeans[3]){
310
+            mp <- abs(tmp$centers[1]-tmp$centers[2])
311
+            pp <- abs(priormeans[1]-priormeans[3])
312
+            if ((mp/pp) <=0.25){
313
+                ansCalls <- emp
314
+            }else{
315
+                ansCalls[tmp$cluster==1] <- 1
316
+                ansCalls[tmp$cluster==2] <- 3
317
+            }
318
+        }
319
+    }
320
+    rm(tmp)
321
+    return(ansCalls)
322
+}
323
+
324
+#######################################################################################################
325
+
326
+assignCalls1Cluster <- function(intensities, priormeans){
327
+    closest <- rep(NA, 3)
328
+    for(j in 1:3){
329
+        distance <- as.vector(abs(priormeans[j]-intensities))
330
+        closest[j]=intensities[which.min(distance)]
331
+    }
332
+    clusterindex <- which.min(abs(closest-priormeans))
333
+    return(rep(clusterindex, length(intensities)))
334
+}
335
+  
336
+#######################################################################################################
337
+
338
+assignCallsOneSNP <- function(x, priormeans, numSample){
339
+    tolerance = 1e-3
340
+       
341
+    k = x[numSample + 1]
342
+    values = x[1:numSample]  
343
+    
344
+    if (abs(k - 2) <= tolerance) {
345
+        tmp <- try(kmeans(values, priormeans, nstart=1),TRUE)
346
+        if (!(class(tmp)=="try-error")) {
347
+            k <- 3;
348
+        }
349
+    }
350
+
351
+    successful <- FALSE;
352
+    if (abs(k - 3) <= tolerance){
353
+        SNPCalls <- assignCalls3Cluster(values, priormeans)
354
+        successful <- !is.na(SNPCalls[1])
355
+        if (!successful) { 
356
+            k <- 2
357
+        }
358
+    }
359
+		
360
+    if ( (abs(k - 2) <= tolerance) && (!successful)){
361
+        SNPCalls <- assignCalls2Cluster(values, priormeans)
362
+        successful <- !is.na(SNPCalls[1])
363
+        if (!successful) { 
364
+            k <- 1
365
+        }			
366
+    }
367
+
368
+    if ( (abs(k - 1) <= tolerance) && (!successful)){
369
+        SNPCalls <- assignCalls1Cluster(values, priormeans)
370
+    }
371
+    return(SNPCalls)
372
+}
373
+
374
+
375
+assignCalls <- function(callsGt, M, a, priormeans, numSNP, numSample, verbose, blockSize=500000){
376
+    # process by block size of 500,000 by default
377
+    message("Start assign calls")
378
+       
379
+    numBlock = ceiling(numSNP / blockSize)
380
+
381
+    for (i in 1:numBlock){
382
+        if (verbose) message(" -- Processing segment ", i, " out of ", numBlock)
383
+        subsetM = as.matrix(M[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])
384
+        thisnumSNP = nrow(subsetM)
385
+        subseta = a[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP)]
386
+        subseta = matrix(subseta, thisnumSNP, 1)
387
+
388
+        testValues = cbind(subsetM, subseta)
389
+
390
+        cl <- makeCluster(getNumberOfCores())
391
+        callAns <- parRapply(cl, testValues, assignCallsOneSNP, priormeans, numSample)
392
+        stopCluster(cl)
393
+
394
+        callAnsAllValues = unlist(callAns)
395
+
396
+        subsetcalls <- matrix(callAnsAllValues, thisnumSNP, numSample, byrow = TRUE)
397
+        
398
+        callsGt[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetcalls[1:thisnumSNP, ]
399
+        rm(subsetM, subseta, subsetcalls)
400
+    }
401
+    message("Done assign calls")
402
+}
403
+
404
+#######################################################################################################
405
+
406
+
407
+calculateParameters <- function(M, priormeans, numSNP, verbose) {
408
+    if (verbose) message("Start calculating 3-clusters parameters")
409
+    params3cluster <- calculateParameters3Cluster(M, priormeans, numSNP, verbose);
410
+    if (verbose) message("Done calculating 3-cluster parameters")
411
+
412
+    if (verbose) message("Start calculating 2-cluster parameters")
413
+    params2cluster <- calculateParameters2Cluster(M, priormeans, numSNP, verbose);
414
+    if (verbose) message("Done calculating 2-cluster parameters")
415
+
416
+    if (verbose) message("Start calculating 1-cluster parameters")
417
+    params1cluster <- calculateParameters1Cluster(M, priormeans, numSNP, verbose);    
418
+    if (verbose) message("Done calculating 1-cluster parameters")
419
+
420
+    parameters <- cbind(as.matrix(params1cluster$elemh), as.matrix(params1cluster$eless), as.matrix(params2cluster$elemh), as.matrix(params2cluster$eless),
421
+                        as.matrix(params3cluster$elemh), as.matrix(params3cluster$eless), as.matrix(params2cluster$elehw), as.matrix(params3cluster$elehw));
422
+    
423
+    rownames(parameters) = rownames(M)
424
+    rm(params3cluster)
425
+    rm(params2cluster)
426
+    rm(params1cluster)
427
+    return(parameters)
428
+}
429
+
430
+hardyweinberg <- function(x, minN = 8){
431
+   if (length(x) < minN){
432
+       return(NA) 
433
+    } else {
434
+       result = .Call("krlmmHardyweinberg", x)
435
+       return(result)
436
+    }
437
+}
438
+
439
+calculateOneParams3Cluster <- function(x, priors, priorDown, priorUp){
440
+
441
+    tmp = try(kmeans(x, priors, nstart=1), TRUE)
442
+    if(class(tmp) == "kmeans") {
443
+        flag = 1
444
+     } else {
445
+        tmp = try(kmeans(x, priorDown, nstart=1), TRUE)
446
+        if(class(tmp) == "kmeans") {
447
+            flag = 2
448
+        } else {
449
+            tmp = try(kmeans(x, priorUp, nstart=1), TRUE)
450
+            if (class(tmp) == "kmeans") {
451
+                flag = 3
452
+            } else {
453
+                tmp = kmeans(x, 3, nstart=35)
454
+                flag = 4
455
+            }
456
+        }  
457
+    }
458
+    ss = sum(tmp$withinss)
459
+    hw = hardyweinberg(tmp$cluster) 
460
+    centers = sort(tmp$centers, decreasing = TRUE)
461
+
462
+    return(c(centers, ss, hw, flag))
463
+}
464
+
465
+calculateParameters3Cluster <- function(M, priormeans, numSNP, verbose) {
466
+    Apriors = priormeans
467
+    ApriorDown = c(Apriors[1]/2, Apriors[2], Apriors[3]) # shift-down
468
+    ApriorUp = c(Apriors[1], Apriors[2], Apriors[3]/2) # shift-up
469
+
470
+    cl <- makeCluster(getNumberOfCores())
471
+    clusterEvalQ(cl, library(crlmm))
472
+    parAns <- parRapply(cl, M, calculateOneParams3Cluster, Apriors, ApriorDown, ApriorUp)
473
+
474
+    stopCluster(cl)
475
+    parAnsAllValues = unlist(parAns)
476
+    parameters <- matrix(parAnsAllValues, numSNP, 6, byrow = TRUE)
477
+   
478
+    centers <- parameters[, 1:3]
479
+    ss <- parameters[, 4]
480
+    hw <- parameters[, 5]
481
+    flag <- parameters[, 6]
482
+
483
+    rm(parAns)
484
+    rm(parameters)
485
+   
486
+    sigma=solve(cov(centers, use="complete.obs"))
487
+    mh = calculateMahalDist3Cluster(centers, sigma, flag, Apriors, ApriorDown, ApriorUp, numSNP)
488
+
489
+    rm(sigma)
490
+    rm(centers)
491
+
492
+    gc()
493
+    return(list(eless = ss, elemh = mh, elehw = hw))
494
+}
495
+
496
+
497
+calculateMahalDist3Cluster <- function(centers, sigma, flag, priors, priorDown, priorUp, numSNP){
498
+    mahaldist = rep(NA, numSNP)
499
+
500
+    tolerance = 1e-3
501
+    for (i in 1:numSNP) {
502
+        if ((abs(flag[i] - 1) <= tolerance) || (abs(flag[i]- 4) <= tolerance)) difference = centers[i, ] - priors
503
+        if (abs(flag[i] - 2) <= tolerance) difference = centers[i, ] - priorDown
504
+        if (abs(flag[i] - 3) <= tolerance) difference = centers[i, ] - priorUp         
505
+        mahaldist[i] = as.vector(difference)%*%sigma%*%as.vector(difference)
506
+    }
507
+    return(mahaldist)
508
+}
509
+
510
+
511
+calculateOneParams2Cluster <- function(x, priors){
512
+    aa = rep(NA, 3)   
513
+    for(j in 1:3){
514
+        dist = as.vector(abs(priors[j]-x))
515
+        aa[j]=x[which.min(dist)]
516
+    } 
517
+    prior2 = priors[priors!=priors[which.max(abs(aa-priors))]]
518
+     
519
+
520
+    tmp = try(kmeans(x, prior2, nstart=1), TRUE)
521
+    if (class(tmp)=="kmeans") {
522
+        centers = tmp$centers
523
+        hw = hardyweinberg(tmp$cluster)
524
+    }
525
+    rm(tmp)
526
+    tmp = kmeans(x, 2, nstart = 35)
527
+    if (tmp$centers[1] < tmp$centers[2]){
528
+        centers = c(tmp$centers[2], tmp$centers[1])
529
+        hw = hardyweinberg(3 - tmp$cluster)   
530
+    } else {
531
+        centers = tmp$centers
532
+        hw = hardyweinberg(tmp$cluster)
533
+    }
534
+    ss = sum(tmp$withinss)     
535
+    return(c(centers, prior2, ss, hw))
536
+}
537
+
538
+
539
+calculateParameters2Cluster <- function(M, priormeans, numSNP, verbose) {
540
+    Apriors = priormeans
541
+   
542
+    cl <- makeCluster(getNumberOfCores())
543
+    clusterEvalQ(cl, library(crlmm))
544
+    parAns <- parRapply(cl, M, calculateOneParams2Cluster, Apriors)
545
+    stopCluster(cl)
546
+
547
+    parAnsAllValues = unlist(parAns)
548
+    parameters <- matrix(parAnsAllValues, numSNP, 6, byrow = TRUE)
549
+
550
+    centers <- parameters[, 1:2]
551
+    priors2cluster <- parameters[, 3:4]
552
+    ss <- parameters[, 5]
553
+    hw <- parameters[, 6]
554
+    
555
+    rm(parAns)
556
+    rm(parameters)
557
+    sigma=solve(cov(centers, use="complete.obs"))
558
+
559
+    mh = calculateMahalDist2Cluster(centers, sigma, priors2cluster, numSNP)
560
+
561
+    rm(sigma)
562
+    rm(centers)
563
+
564
+    gc()
565
+    return(list(eless = ss, elemh = mh, elehw = hw))
566
+}
567
+
568
+
569
+calculateMahalDist2Cluster <- function(centers, sigma, priors, numSNP){
570
+    mahaldist = rep(NA, numSNP)
571
+    
572
+    for (i in 1:numSNP) {
573
+        difference <- centers[i,] - priors[i,]
574
+        mahaldist[i] = as.vector(difference)%*%sigma%*%as.vector(difference)        
575
+    }
576
+    return(mahaldist)
577
+}
578
+
579
+calculateOneParams1Cluster <- function(x, priors){
580
+    center = mean(x)
581
+    diff <- x - center
582
+    diffsquare <- diff^2
583
+    ss = sum(diffsquare)
584
+
585
+    closestPrior = priors[which.min(abs(priors - center))]
586
+
587
+    return(c(center, closestPrior, ss))
588
+}
589
+
590
+
591
+calculateParameters1Cluster <- function(M, priormeans, numSNP, verbose) {
592
+    Apriors = priormeans
593
+   
594
+    cl <- makeCluster(getNumberOfCores())
595
+    clusterEvalQ(cl, library(crlmm))
596
+    parAns <- parRapply(cl, M, calculateOneParams1Cluster, Apriors)
597
+    stopCluster(cl)
598
+
599
+    parAnsAllValues = unlist(parAns)
600
+    parameters <- matrix(parAnsAllValues, numSNP, 3, byrow = TRUE)
601
+
602
+    centers <- matrix(parameters[, 1], numSNP, 1)
603
+    prior1cluster <- matrix(parameters[, 2], numSNP, 1)
604
+    ss <- parameters[, 3]
605
+    
606
+    rm(parAns)
607
+    rm(parameters)
608
+    sigma=solve(cov(centers, use="complete.obs"))
609
+
610
+    mh = calculateMahalDist1Cluster(centers, sigma, prior1cluster, numSNP)
611
+
612
+    rm(sigma)
613
+    rm(centers)
614
+
615
+    gc()
616
+    return(list(eless = ss, elemh = mh))
617
+}
618
+
619
+
620
+calculateMahalDist1Cluster <- function(centers, sigma, priors, numSNP){
621
+    mahaldist = rep(NA, numSNP)
622
+    
623
+    for(i in 1:numSNP) {
624
+        difference <- as.vector(centers[i, 1] - priors[i, 1])
625
+	mahaldist[i]=difference%*%sigma%*%difference
626
+    }
627
+    return(mahaldist)
628
+}
629
+   
630
+#############################################
631
+
632
+
633
+computeCallPr <- function(callsPr, M, calls, numSNP, numSample, verbose, blockSize = 500000){
634
+    # compute log-ratio in blocksize of 500,000 by default
635
+    if (verbose) message("Start computing confidence score")
636
+    
637
+    numBlock = ceiling(numSNP / blockSize)
638
+    for (i in 1:numBlock){
639
+        if (verbose) message(" -- Processing segment ", i, " out of ", numBlock)
640
+        subsetM = as.matrix(M[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])
641
+        subsetCalls = as.matrix(calls[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])                     
642
+        subsetCallProb = .Call("krlmmConfidenceScore", subsetM, subsetCalls, PACKAGE="crlmm");
643
+        callsPr[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetCallProb[1:nrow(subsetM), ]
644
+        rm(subsetM, subsetCalls, subsetCallProb)
645
+    }
646
+
647
+    if (verbose) message("Done computing confidence score")
648
+}
649
+
650
+#############################################
651
+
652
+AddBackNoVarianceSNPs <- function(cnSet, calls, scores, numSNP, numSample, variantSNPIndex, noVariantSNPIndex){
653
+    callsGt = calls(cnSet)
654
+    callsPr = snpCallProbability(cnSet)
655
+    open(callsGt)
656
+    open(callsPr)
657
+
658
+    callsGt[variantSNPIndex, ] = calls[,]
659
+    callsPr[variantSNPIndex, ] = scores[,]  
660
+    
661
+    callsGt[noVariantSNPIndex, ] = NA
662
+    callsPr[noVariantSNPIndex, ] = 0     
663
+
664
+    close(callsGt)
665
+    close(callsPr)    
666
+}
667
+
668
+#############################################
669
+
670
+krlmmImputeGender <- function(cnSet, annotation, YIndex, verbose){
671
+    if (verbose) message("Start imputing gender")    
672
+    S = computeAverageLogIntensity(cnSet, verbose) 
673
+
674
+    # S is calculated and saved in original SNP order. 
675
+    matchy = match(annotation[YIndex, 2], rownames(S))
676
+    matchy = matchy[!is.na(matchy)]
677
+    if (length(matchy) <= 10){
678
+        predictedGender = rep(NA, ncol(A))
679
+    }
680
+    Sy = S[matchy,]
681
+
682
+    uniqueDataPoint = apply(Sy, 1, function(x){length(unique(x))})
683
+    validYSNPs = uniqueDataPoint >= 2
684
+    SyValid = Sy[validYSNPs, ]
685
+    
686
+    if (nrow(SyValid) < 20){
687
+        message("Not enough ChrY SNPs, skipping gender prediction step");
688
+        predictedGender = rep(NA, ncol(Sy))
689
+    }
690
+   
691
+    rm(S)
692
+    numYChrSNP = nrow(SyValid)
693
+
694
+    allS = matrix(NA, numYChrSNP, 2)
695
+
696
+    for (i in 1:numYChrSNP) {
697
+        tmp = kmeans(SyValid[i,] ,2, nstart=45)
698
+        allS[i,] = sort(tmp$centers, decreasing=F)
699
+    }
700
+    priorS = apply(allS, 2, FUN="median", na.rm=TRUE)
701
+
702
+    if (abs(priorS[1] - priorS[2]) <= 1.6) {
703
+        message("Skipping gender prediction step");
704
+        predictedGender = rep(NA, ncol(Sy))        
705
+    }
706
+    
707
+    meanmatrix = apply(Sy, 2, median)
708
+
709
+    Sy1 = abs(meanmatrix - priorS[1])
710
+    Sy2 = abs(meanmatrix - priorS[2])
711
+
712
+    # output male - 1, female - 2, female S-values are smaller than male S-values. 
713
+    test = cbind(Sy2, Sy1)
714
+    predictedGender = apply(test, 1, which.min)
715
+
716
+    open(cnSet$gender)
717
+    cnSet$gender[,] = predictedGender
718
+    close(cnSet$gender)
719
+    
720
+    if (verbose) message("Done imputing gender")
721
+    return(predictedGender)   
722
+}
723
+
724
+
725
+#############################################
726
+
727
+verifyChrYSNPs <- function(cnSet, M, calls, gender, annotation, YIndex, priormeans, verbose){
728
+    if (verbose) message("Start verifying SNPs on Chromosome Y")
729
+    callsGt = calls(cnSet)
730
+    callsPr = snpCallProbability(cnSet)
731
+    open(callsGt)
732
+    open(callsPr)
733
+       
734
+    matchy = match(annotation[YIndex, 2], rownames(M))
735
+    matchy = matchy[!is.na(matchy)]
736
+   
737
+    MChrY = M[matchy,]
738
+    callsChrY = calls[matchy,]
739
+  
740
+    male = gender == 1
741
+    female = gender == 2    
742
+    
743
+    checkK = apply(callsChrY[, male], 1, function(x){ length(unique(x[!is.na(x)])) } )
744
+    
745
+    for(i in 1:nrow(MChrY)){
746
+        # Chromosome Y SNPs, no calls for female, k = 1 or 2 permitted for male samples
747
+        thisChrYSNPorigPosition = match(rownames(callsChrY)[i], rownames(callsGt))
748
+        callsGt[thisChrYSNPorigPosition, female] = NA
749
+        callsPr[thisChrYSNPorigPosition, female] = 0
750
+
751
+        # early exit for k = 1 or 2 cases. Only re-assign calls to male samples if we previouly assigned
752
+        # male samples to 3 clusters by mistake. 
753
+        if (checkK[i] < 3) next;
754
+          
755
+        if (class(try(kmeans(MChrY[i, male], c(priormeans[1], priormeans[3]), nstart=1), TRUE)) != "try-error"){
756
+           
757
+            maleSampleCalls = kmeans(MChrY[i, male],c(priormeans[1], priormeans[3]), nstart=45)$cluster
758
+            callsGt[thisChrYSNPorigPosition, male][maleSampleCalls == 1] = 1
759
+            callsGt[thisChrYSNPorigPosition, male][maleSampleCalls == 2] = 3
760
+        } else {
761
+                    
762
+            distanceToPrior1 = mean(abs(MChrY[i, male] - priormeans[1]))
763
+            distanceToPrior3 = mean(abs(MChrY[i, male] - priormeans[3]))                
764
+            callsGt[thisChrYSNPorigPosition, male] = ifelse(distanceToPrior1 < distanceToPrior3, 1, 3)
765
+        }
766
+
767
+        MMaleSamples = MChrY[i, male]
768
+        callsMaleSample = callsGt[thisChrYSNPorigPosition, male]
769
+        scoresMaleSample = .Call("krlmmConfidenceScore", t(as.matrix(MMaleSamples)), t(as.matrix(callsMaleSample)), PACKAGE="crlmm");
770
+
771
+        callsPr[thisChrYSNPorigPosition, male] = scoresMaleSample       
772
+    }
773
+
774
+    close(callsGt)
775
+    close(callsPr)
776
+    
777
+    if (verbose) message("Done verifying SNPs on Chromosome Y")
778
+}
... ...
@@ -166,7 +166,8 @@ illuminaCdfNames <- function(){
166 166
 	  "human1mduov3b",          # 1M Duo
167 167
 	  "humanomni1quadv1b",      # Omni1 quad
168 168
 	  "humanomni25quadv1b",     # Omni2.5 quad
169
-	  "humanomni258v1a",        # Omni2.5 8
169
+	  "humanomni258v1a",        # Omni2.5 8 v1 A
170
+          "humanomni258v1p1b",      # Omni2.5 8 v1.1 B
170 171
           "humanomni5quadv1b",      # Omni5 quad          
171 172
 	  "humanomniexpress12v1b",  # Omni express 12
172 173
 	  "humanimmuno12v1b",       # Immuno chip 12