...
|
...
|
@@ -75,25 +75,11 @@ construct <- function(filenames, cdfName, copynumber=FALSE,
|
75
|
75
|
sampleNames(callSet) <- sns
|
76
|
76
|
return(callSet)
|
77
|
77
|
}
|
78
|
|
-setMethod("close", "AlleleSet", function(con, ...){
|
79
|
|
- object <- con
|
80
|
|
- names <- ls(assayData(object))
|
81
|
|
- L <- length(names)
|
82
|
|
- for(i in 1:L) close(eval(substitute(assayData(object)[[NAME]], list(NAME=names[i]))))
|
83
|
|
- return()
|
84
|
|
-})
|
85
|
|
-setMethod("open", "AlleleSet", function(con, ...){
|
86
|
|
- object <- con
|
87
|
|
- names <- ls(assayData(object))
|
88
|
|
- L <- length(names)
|
89
|
|
- for(i in 1:L) open(eval(substitute(assayData(object)[[NAME]], list(NAME=names[i]))))
|
90
|
|
- return()
|
91
|
|
-})
|
92
|
|
-##setReplaceMethod("calls", "SnpSuperSet", function(object, value) assayDataElementReplace(object, "call", value))
|
93
|
|
-##setReplaceMethod("confs", "SnpSuperSet", function(object, value) assayDataElementReplace(object, "callProbability", value))
|
94
|
|
-##setMethod("confs", "SnpSuperSet", function(object) assayDataElement(object, "callProbability"))
|
|
78
|
+
|
|
79
|
+
|
95
|
80
|
genotype <- function(filenames,
|
96
|
81
|
cdfName,
|
|
82
|
+ batch,
|
97
|
83
|
mixtureSampleSize=10^5,
|
98
|
84
|
eps=0.1,
|
99
|
85
|
verbose=TRUE,
|
...
|
...
|
@@ -111,6 +97,12 @@ genotype <- function(filenames,
|
111
|
97
|
if(missing(cdfName)) stop("must specify cdfName")
|
112
|
98
|
if(!isValidCdfName(cdfName)) stop("cdfName not valid. see validCdfNames")
|
113
|
99
|
if(missing(sns)) sns <- basename(filenames)
|
|
100
|
+ if(missing(batch)){
|
|
101
|
+ warning("The batch variable is not specified. The scan date of the array will be used as a surrogate for batch. The batch variable does not affect the preprocessing or genotyping, but is important for copy number estimation.")
|
|
102
|
+ } else {
|
|
103
|
+ if(length(batch) != length(filenames))
|
|
104
|
+ stop("batch variable must be the same length as the filenames")
|
|
105
|
+ }
|
114
|
106
|
## callSet contains potentially very big matrices
|
115
|
107
|
## More big matrices are created within snprma, that will then be removed.
|
116
|
108
|
callSet <- construct(filenames=filenames,
|
...
|
...
|
@@ -118,6 +110,9 @@ genotype <- function(filenames,
|
118
|
110
|
copynumber=copynumber,
|
119
|
111
|
sns=sns,
|
120
|
112
|
verbose=verbose)
|
|
113
|
+ if(missing(batch)){
|
|
114
|
+ protocolData(callSet)$batch <- as.numeric(as.factor(protocolData(callSet)$ScanDate))
|
|
115
|
+ }
|
121
|
116
|
mixtureParams <- matrix(NA, 4, length(filenames))
|
122
|
117
|
snp.index <- which(isSnp(callSet)==1)
|
123
|
118
|
batches <- splitIndicesByLength(1:ncol(callSet), ocSamples())
|
...
|
...
|
@@ -193,13 +188,13 @@ genotype2 <- function(filenames,
|
193
|
188
|
gender=NULL,
|
194
|
189
|
returnParams=TRUE,
|
195
|
190
|
badSNP=0.7){
|
|
191
|
+ if(!isPackageLoaded("ff")) stop("Must load package 'ff'")
|
196
|
192
|
if(!copynumber){
|
197
|
193
|
callSet <- crlmm2(filenames=filenames,
|
198
|
194
|
cdfName=cdfName,
|
199
|
195
|
mixtureSampleSize=mixtureSampleSize,
|
200
|
196
|
eps=eps,
|
201
|
197
|
verbose=verbose,
|
202
|
|
- seed=seed,
|
203
|
198
|
sns=sns,
|
204
|
199
|
probs=probs,
|
205
|
200
|
DF=DF,
|
...
|
...
|
@@ -232,9 +227,7 @@ genotype2 <- function(filenames,
|
232
|
227
|
if(!missing(batch)) protocolData(callSet)$batch <- batch
|
233
|
228
|
mixtureParams <- matrix(NA, 4, length(filenames))
|
234
|
229
|
snp.index <- which(isSnp(callSet)==1)
|
235
|
|
- snprmaRes <- snprma2(##A=A(callSet),
|
236
|
|
- ##B=B(callSet),
|
237
|
|
- filenames=filenames,
|
|
230
|
+ snprmaRes <- snprma2(filenames=filenames,
|
238
|
231
|
mixtureSampleSize=mixtureSampleSize,
|
239
|
232
|
fitMixture=TRUE,
|
240
|
233
|
eps=eps,
|
...
|
...
|
@@ -249,13 +242,9 @@ genotype2 <- function(filenames,
|
249
|
242
|
open(snprmaRes[["SKW"]])
|
250
|
243
|
open(snprmaRes[["mixtureParams"]])
|
251
|
244
|
if(verbose) message("Updating elements of callSet")
|
252
|
|
-## A(callSet) <- snprmaRes[["A"]]
|
253
|
|
-## B(callSet) <- snprmaRes[["B"]]
|
254
|
245
|
bb = ocProbesets()*ncol(A)*8
|
255
|
246
|
ffrowapply(A(callSet)[i1:i2, ] <- snprmaRes[["A"]][i1:i2, ], X=snprmaRes[["A"]], BATCHBYTES=bb)
|
256
|
247
|
ffrowapply(B(callSet)[i1:i2, ] <- snprmaRes[["B"]][i1:i2, ], X=snprmaRes[["B"]], BATCHBYTES=bb)
|
257
|
|
- ##ffrowapply(A(callSet)[i1:i2, ] <- snprmaRes[["A"]][i1:i2, ], X=snprmaRes[["A"]])
|
258
|
|
- ##ffrowapply(B(callSet)[i1:i2, ] <- snprmaRes[["B"]][i1:i2, ], X=snprmaRes[["B"]])
|
259
|
248
|
if(verbose) message("Finished updating elements of callSet")
|
260
|
249
|
stopifnot(identical(featureNames(callSet)[snp.index], snprmaRes$gns))
|
261
|
250
|
pData(callSet)$SKW <- snprmaRes$SKW
|
...
|
...
|
@@ -289,435 +278,14 @@ genotype2 <- function(filenames,
|
289
|
278
|
badSNP=badSNP)
|
290
|
279
|
open(tmp[["calls"]])
|
291
|
280
|
open(tmp[["confs"]])
|
292
|
|
- ##snpCall(callSet) <- tmp[["calls"]]
|
293
|
|
- ##snpCallProbability(callSet) <- tmp[["confs"]]
|
294
|
281
|
bb = ocProbesets()*ncol(A)*8
|
295
|
282
|
ffrowapply(snpCall(callSet)[i1:i2, ] <- tmp[["calls"]][i1:i2, ], X=tmp[["calls"]], BATCHBYTES=bb)
|
296
|
283
|
ffrowapply(snpCallProbability(callSet)[i1:i2, ] <- tmp[["confs"]][i1:i2, ], X=tmp[["confs"]], BATCHBYTES=bb)
|
297
|
|
-## ffrowapply(snpCall(callSet)[i1:i2, ] <- tmp[["calls"]][i1:i2, ], X=tmp[["calls"]])
|
298
|
|
-## ffrowapply(snpCallProbability(callSet)[i1:i2, ] <- tmp[["confs"]][i1:i2, ], X=tmp[["confs"]])
|
299
|
284
|
callSet$gender <- tmp$gender
|
300
|
285
|
cnSet <- as(callSet, "CNSetLM")
|
301
|
286
|
return(cnSet)
|
302
|
287
|
}
|
303
|
288
|
|
304
|
|
-snprma2RS <- function(A, B,
|
305
|
|
- filenames, mixtureSampleSize=10^5, fitMixture=FALSE,
|
306
|
|
- eps=0.1, verbose=TRUE, seed=1, cdfName, sns){
|
307
|
|
- if (missing(sns)) sns <- basename(filenames)
|
308
|
|
- if (missing(cdfName))
|
309
|
|
- cdfName <- read.celfile.header(filenames[1])[["cdfName"]]
|
310
|
|
- pkgname <- getCrlmmAnnotationName(cdfName)
|
311
|
|
- stopifnot(require(pkgname, character.only=TRUE, quietly=!verbose))
|
312
|
|
-
|
313
|
|
- if(verbose) message("Loading annotations and mixture model parameters.")
|
314
|
|
- loader("preprocStuff.rda", .crlmmPkgEnv, pkgname)
|
315
|
|
- pnsa <- getVarInEnv("pnsa")
|
316
|
|
- pnsb <- getVarInEnv("pnsb")
|
317
|
|
- gns <- getVarInEnv("gns")
|
318
|
|
-
|
319
|
|
- ##We will read each cel file, summarize, and run EM one by one
|
320
|
|
- ##We will save parameters of EM to use later
|
321
|
|
- if(verbose) message("Initializing objects.")
|
322
|
|
- mixtureParams <- initializeBigMatrix("crlmmMixt-", 4, length(filenames), "double")
|
323
|
|
- SNR <- initializeBigVector("crlmmSNR-", length(filenames), "double")
|
324
|
|
- SKW <- initializeBigVector("crlmmSKW-", length(filenames), "double")
|
325
|
|
-
|
326
|
|
- ## This is the sample for the fitting of splines
|
327
|
|
- ## BC: I like better the idea of the user passing the seed,
|
328
|
|
- ## because this might intefere with other analyses
|
329
|
|
- ## (like what happened to GCRMA)
|
330
|
|
- ##S will hold (A+B)/2 and M will hold A-B
|
331
|
|
- ##NOTE: We actually dont need to save S. Only for pics etc...
|
332
|
|
- ##f is the correction. we save to avoid recomputing
|
333
|
|
-## A <- initializeBigMatrix("crlmmA-", length(pnsa), length(filenames), "integer")
|
334
|
|
-## B <- initializeBigMatrix("crlmmB-", length(pnsb), length(filenames), "integer")
|
335
|
|
-
|
336
|
|
- sampleBatches <- splitIndicesByNode(seq(along=filenames))
|
337
|
|
-
|
338
|
|
- if(verbose) message("Processing ", length(filenames), " files.")
|
339
|
|
-
|
340
|
|
- ocLapply(sampleBatches, processCEL_RS, filenames=filenames,
|
341
|
|
- fitMixture=fitMixture, A=A, B=B, SKW=SKW, SNR=SNR,
|
342
|
|
- mixtureParams=mixtureParams, eps=eps, seed=seed,
|
343
|
|
- mixtureSampleSize=mixtureSampleSize, pkgname=pkgname,
|
344
|
|
- neededPkgs=c("crlmm", pkgname))
|
345
|
|
-
|
346
|
|
- list(A=A, B=B, sns=sns, gns=gns, SNR=SNR, SKW=SKW, mixtureParams=mixtureParams, cdfName=cdfName)
|
347
|
|
-}
|
348
|
|
-
|
349
|
|
-processCEL_RS <- function(i, filenames, fitMixture, A, B, SKW, SNR,
|
350
|
|
- mixtureParams, eps, seed, mixtureSampleSize,
|
351
|
|
- pkgname){
|
352
|
|
-
|
353
|
|
- loader("preprocStuff.rda", .crlmmPkgEnv, pkgname)
|
354
|
|
- loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname)
|
355
|
|
- loader("mixtureStuff.rda", .crlmmPkgEnv, pkgname)
|
356
|
|
- autosomeIndex <- getVarInEnv("autosomeIndex")
|
357
|
|
- pnsa <- getVarInEnv("pnsa")
|
358
|
|
- pnsb <- getVarInEnv("pnsb")
|
359
|
|
- fid <- getVarInEnv("fid")
|
360
|
|
- reference <- getVarInEnv("reference")
|
361
|
|
- aIndex <- getVarInEnv("aIndex")
|
362
|
|
- bIndex <- getVarInEnv("bIndex")
|
363
|
|
- SMEDIAN <- getVarInEnv("SMEDIAN")
|
364
|
|
- theKnots <- getVarInEnv("theKnots")
|
365
|
|
- gns <- getVarInEnv("gns")
|
366
|
|
-
|
367
|
|
- ## for mixture
|
368
|
|
- set.seed(seed)
|
369
|
|
- idx <- sort(sample(autosomeIndex, mixtureSampleSize))
|
370
|
|
- ##for skewness. no need to do everything
|
371
|
|
- idx2 <- sample(length(fid), 10^5)
|
372
|
|
-
|
373
|
|
- open(A)
|
374
|
|
- open(B)
|
375
|
|
- open(SKW)
|
376
|
|
- open(mixtureParams)
|
377
|
|
- open(SNR)
|
378
|
|
- ##RS
|
379
|
|
- iii <- seq(along=pnsa)
|
380
|
|
-
|
381
|
|
- for (k in i){
|
382
|
|
- y <- as.matrix(read.celfile(filenames[k], intensity.means.only=TRUE)[["INTENSITY"]][["MEAN"]][fid])
|
383
|
|
- x <- log2(y[idx2])
|
384
|
|
- SKW[k] <- mean((x-mean(x))^3)/(sd(x)^3)
|
385
|
|
- rm(x)
|
386
|
|
- y <- normalize.quantiles.use.target(y, target=reference)
|
387
|
|
- A[iii, k] <- intMedianSummaries(y[aIndex, 1, drop=FALSE], pnsa)
|
388
|
|
- B[iii, k] <- intMedianSummaries(y[bIndex, 1, drop=FALSE], pnsb)
|
389
|
|
- rm(y)
|
390
|
|
-
|
391
|
|
- if(fitMixture){
|
392
|
|
- S <- (log2(A[idx,k])+log2(B[idx, k]))/2 - SMEDIAN
|
393
|
|
- M <- log2(A[idx, k])-log2(B[idx, k])
|
394
|
|
- tmp <- fitAffySnpMixture56(S, M, theKnots, eps=eps)
|
395
|
|
- mixtureParams[, k] <- tmp[["coef"]]
|
396
|
|
- SNR[k] <- tmp[["medF1"]]^2/(tmp[["sigma1"]]^2+tmp[["sigma2"]]^2)
|
397
|
|
- } else {
|
398
|
|
- mixtureParams[, k] <- NA
|
399
|
|
- SNR[k] <- NA
|
400
|
|
- }
|
401
|
|
- }
|
402
|
|
- close(A)
|
403
|
|
- close(B)
|
404
|
|
- close(SKW)
|
405
|
|
- close(mixtureParams)
|
406
|
|
- close(SNR)
|
407
|
|
- TRUE
|
408
|
|
-}
|
409
|
|
-
|
410
|
|
-crlmmGT2_RS<- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL,
|
411
|
|
- col.names=NULL, probs=c(1/3, 1/3, 1/3), DF=6,
|
412
|
|
- SNRMin=5, recallMin=10, recallRegMin=1000,
|
413
|
|
- gender=NULL, desctrucitve=FALSE, verbose=TRUE,
|
414
|
|
- returnParams=FALSE, badSNP=.7){
|
415
|
|
- open(SNR)
|
416
|
|
- open(A)
|
417
|
|
- open(B)
|
418
|
|
- open(mixtureParams)
|
419
|
|
- ## expect objects to be ff
|
420
|
|
-
|
421
|
|
- keepIndex <- which( SNR[] > SNRMin)
|
422
|
|
- if(length(keepIndex)==0) stop("No arrays above quality threshold!")
|
423
|
|
-
|
424
|
|
- NC <- ncol(A)
|
425
|
|
- NR <- nrow(A)
|
426
|
|
-
|
427
|
|
- pkgname <- getCrlmmAnnotationName(cdfName)
|
428
|
|
- stopifnot(require(pkgname, character.only=TRUE, quietly=!verbose))
|
429
|
|
-
|
430
|
|
- if(verbose) message("Loading annotations.")
|
431
|
|
- loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname)
|
432
|
|
- loader("mixtureStuff.rda", .crlmmPkgEnv, pkgname)
|
433
|
|
- ## this is toget rid of the 'no visible binding' notes
|
434
|
|
- ## variable definitions
|
435
|
|
- XIndex <- getVarInEnv("XIndex")
|
436
|
|
- autosomeIndex <- getVarInEnv("autosomeIndex")
|
437
|
|
- YIndex <- getVarInEnv("YIndex")
|
438
|
|
- SMEDIAN <- getVarInEnv("SMEDIAN")
|
439
|
|
- theKnots <- getVarInEnv("theKnots")
|
440
|
|
- regionInfo <- getVarInEnv("regionInfo")
|
441
|
|
- params <- getVarInEnv("params")
|
442
|
|
- ##RS
|
443
|
|
- pnsa <- getVarInEnv("pnsa")
|
444
|
|
- NR <- length(pnsa)
|
445
|
|
-
|
446
|
|
- ##IF gender not provide, we predict
|
447
|
|
- if(is.null(gender)){
|
448
|
|
- if(verbose) message("Determining gender.")
|
449
|
|
- XMedian <- apply(log2(A[XIndex,, drop=FALSE])+log2(B[XIndex,, drop=FALSE]), 2, median)/2
|
450
|
|
- if(sum(SNR[] > SNRMin)==1){
|
451
|
|
- gender <- which.min(c(abs(XMedian-8.9), abs(XMedian-9.5)))
|
452
|
|
- }else{
|
453
|
|
- gender <- kmeans(XMedian, c(min(XMedian[SNR[]>SNRMin]), max(XMedian[SNR[]>SNRMin])))[["cluster"]]
|
454
|
|
- }
|
455
|
|
- }
|
456
|
|
-
|
457
|
|
- Indexes <- list(autosomeIndex, XIndex, YIndex)
|
458
|
|
- cIndexes <- list(keepIndex,
|
459
|
|
- keepIndex[which(gender[keepIndex]==2)],
|
460
|
|
- keepIndex[which(gender[keepIndex]==1)])
|
461
|
|
-
|
462
|
|
- if(verbose) cat("Calling", NR, "SNPs for recalibration... ")
|
463
|
|
-
|
464
|
|
- ## call C
|
465
|
|
- fIndex <- which(gender==2)
|
466
|
|
- mIndex <- which(gender==1)
|
467
|
|
-
|
468
|
|
- ## different here
|
469
|
|
- ## use gtypeCallerR in batches
|
470
|
|
- ##RS
|
471
|
|
- snpBatches <- splitIndicesByLength(1:NR, ocProbesets())
|
472
|
|
- newparamsBatch <- vector("list", length(snpBatches))
|
473
|
|
-
|
474
|
|
- process1 <- function(idxBatch, snpBatches, autosomeIndex, XIndex,
|
475
|
|
- YIndex, A, B, mixtureParams, fIndex, mIndex,
|
476
|
|
- params, cIndexes, SMEDIAN, theKnots, DF, probs, batchSize){
|
477
|
|
- open(A)
|
478
|
|
- open(B)
|
479
|
|
- open(mixtureParams)
|
480
|
|
- snps <- snpBatches[[idxBatch]]
|
481
|
|
- rSnps <- range(snps)
|
482
|
|
- last <- (idxBatch-1)*batchSize
|
483
|
|
- IndexesBatch <- list(autosomeIndex[autosomeIndex %in% snps]-last,
|
484
|
|
- XIndex[XIndex %in% snps]-last,
|
485
|
|
- YIndex[YIndex %in% snps]-last)
|
486
|
|
- IndexesBatch <- lapply(IndexesBatch, as.integer)
|
487
|
|
- tmpA <- as.matrix(A[snps,])
|
488
|
|
- tmpB <- as.matrix(B[snps,])
|
489
|
|
- ## newparamsBatch[[idxBatch]]
|
490
|
|
-
|
491
|
|
- tmp <- gtypeCallerR(tmpA, tmpB, fIndex, mIndex,
|
492
|
|
- params[["centers"]][snps,],
|
493
|
|
- params[["scales"]][snps,],
|
494
|
|
- params[["N"]][snps,],
|
495
|
|
- IndexesBatch, cIndexes,
|
496
|
|
- sapply(IndexesBatch, length),
|
497
|
|
- sapply(cIndexes, length), SMEDIAN,
|
498
|
|
- theKnots, mixtureParams[], DF, probs, 0.025)
|
499
|
|
-
|
500
|
|
- last <- rSnps[2]
|
501
|
|
- rm(snps, rSnps, IndexesBatch, tmpA, tmpB)
|
502
|
|
- close(A)
|
503
|
|
- close(B)
|
504
|
|
- close(mixtureParams)
|
505
|
|
- tmp
|
506
|
|
- }
|
507
|
|
-
|
508
|
|
- newparamsBatch <- ocLapply(seq(along=snpBatches), process1,
|
509
|
|
- snpBatches=snpBatches,
|
510
|
|
- autosomeIndex=autosomeIndex, XIndex=XIndex,
|
511
|
|
- YIndex=YIndex, A=A, B=B,
|
512
|
|
- mixtureParams=mixtureParams, fIndex=fIndex,
|
513
|
|
- mIndex=mIndex, params=params,
|
514
|
|
- cIndexes=cIndexes, SMEDIAN=SMEDIAN,
|
515
|
|
- theKnots=theKnots, DF=DF, probs=probs,
|
516
|
|
- batchSize=ocProbesets())
|
517
|
|
-## last <- 0
|
518
|
|
-## for (idxBatch in seq(along=snpBatches)){
|
519
|
|
-## snps <- snpBatches[[idxBatch]]
|
520
|
|
-## rSnps <- range(snps)
|
521
|
|
-## IndexesBatch <- list(autosomeIndex[autosomeIndex %in% snps]-last,
|
522
|
|
-## XIndex[XIndex %in% snps]-last,
|
523
|
|
-## YIndex[YIndex %in% snps]-last)
|
524
|
|
-## IndexesBatch <- lapply(IndexesBatch, as.integer)
|
525
|
|
-## tmpA <- A[snps,]
|
526
|
|
-## tmpB <- B[snps,]
|
527
|
|
-## newparamsBatch[[idxBatch]] <- gtypeCallerR(tmpA, tmpB, fIndex, mIndex,
|
528
|
|
-## params[["centers"]][snps,],
|
529
|
|
-## params[["scales"]][snps,],
|
530
|
|
-## params[["N"]][snps,],
|
531
|
|
-## IndexesBatch, cIndexes,
|
532
|
|
-## sapply(IndexesBatch, length),
|
533
|
|
-## sapply(cIndexes, length),
|
534
|
|
-## SMEDIAN, theKnots,
|
535
|
|
-## mixtureParams[], DF, probs, 0.025)
|
536
|
|
-## last <- rSnps[2]
|
537
|
|
-## rm(snps, rSnps, IndexesBatch, tmpA, tmpB)
|
538
|
|
-## }
|
539
|
|
-## rm(last)
|
540
|
|
-
|
541
|
|
- newparams <- vector("list", 3)
|
542
|
|
- names(newparams) <- c("centers", "scales", "N")
|
543
|
|
- newparams[["centers"]] <- do.call("rbind", lapply(newparamsBatch, "[[", 1))
|
544
|
|
- newparams[["scales"]] <- do.call("rbind", lapply(newparamsBatch, "[[", 2))
|
545
|
|
- newparams[["N"]] <- do.call("rbind", lapply(newparamsBatch, "[[", 3))
|
546
|
|
- rm(newparamsBatch)
|
547
|
|
- if(verbose) message("Done.")
|
548
|
|
- if(verbose) message("Estimating recalibration parameters.")
|
549
|
|
- d <- newparams[["centers"]] - params$centers
|
550
|
|
-
|
551
|
|
- ##regression
|
552
|
|
- Index <- intersect(which(pmin(newparams[["N"]][, 1],
|
553
|
|
- newparams[["N"]][, 2],
|
554
|
|
- newparams[["N"]][, 3]) > recallMin &
|
555
|
|
- !apply(regionInfo, 1, any)),
|
556
|
|
- autosomeIndex)
|
557
|
|
- if(length(Index) < recallRegMin){
|
558
|
|
- warning("Recalibration not possible. Possible cause: small sample size.")
|
559
|
|
- newparams <- params
|
560
|
|
- dev <- vector("numeric", nrow(newparams[["centers"]]))
|
561
|
|
- SS <- matrix(Inf, 3, 3)
|
562
|
|
- DD <- 0
|
563
|
|
- }else{
|
564
|
|
- data4reg <- as.data.frame(newparams[["centers"]][Index,])
|
565
|
|
- names(data4reg) <- c("AA", "AB", "BB")
|
566
|
|
- regParams <- cbind( coef(lm(AA~AB*BB, data=data4reg)),
|
567
|
|
- c(coef(lm(AB~AA+BB, data=data4reg)), 0),
|
568
|
|
- coef(lm(BB~AA*AB, data=data4reg)))
|
569
|
|
- rownames(regParams) <- c("intercept", "X", "Y", "XY")
|
570
|
|
- rm(data4reg)
|
571
|
|
-
|
572
|
|
- minN <- 3
|
573
|
|
- newparams[["centers"]][newparams[["N"]] < minN] <- NA
|
574
|
|
- Index <- setdiff(which(rowSums(is.na(newparams[["centers"]]))==1), YIndex)
|
575
|
|
- if(verbose) cat("Filling out empty centers")
|
576
|
|
- for(i in Index){
|
577
|
|
- if(verbose) if(i%%10000==0)cat(".")
|
578
|
|
- mu <- newparams[["centers"]][i, ]
|
579
|
|
- j <- which(is.na(mu))
|
580
|
|
- newparams[["centers"]][i, j] <- c(1, mu[-j], prod(mu[-j]))%*%regParams[, j]
|
581
|
|
- }
|
582
|
|
-
|
583
|
|
- ##remaing NAs are made like originals
|
584
|
|
- if(length(YIndex)>0){
|
585
|
|
- noMoveIndex <- union(setdiff(which(rowSums(is.na(newparams[["centers"]]))>0), YIndex),
|
586
|
|
- YIndex[rowSums(is.na(newparams[["centers"]][YIndex, ])>1)])
|
587
|
|
- }
|
588
|
|
- snps2ignore <- which(rowSums(is.na(newparams[["centers"]])) > 0)
|
589
|
|
- snps2keep <- setdiff(autosomeIndex, snps2ignore)
|
590
|
|
- rm(snps2ignore)
|
591
|
|
- newparams[["centers"]][is.na(newparams[["centers"]])] <- params[["centers"]][is.na(newparams[["centers"]])]
|
592
|
|
- if(verbose) cat("\n")
|
593
|
|
-
|
594
|
|
- if(verbose) message("Calculating and standardizing size of shift.")
|
595
|
|
- GG <- DD <- newparams[["centers"]] - params[["centers"]]
|
596
|
|
- DD <- sweep(DD, 2, colMeans(DD[autosomeIndex, ]))
|
597
|
|
- SS <- cov(DD[autosomeIndex, ])
|
598
|
|
- SSI <- solve(SS)
|
599
|
|
- dev <- vector("numeric", nrow(DD))
|
600
|
|
- if(length(YIndex)){
|
601
|
|
- dev[-YIndex] <- apply(DD[-YIndex, ], 1, function(x) x%*%SSI%*%x)
|
602
|
|
- dev[-YIndex] <- 1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev[-YIndex])
|
603
|
|
- ##Now Y (only two params)
|
604
|
|
- SSY <- SS[c(1, 3), c(1, 3)]
|
605
|
|
- SSI <- solve(SSY)
|
606
|
|
- dev[YIndex] <- apply(DD[YIndex, c(1, 3)], 1, function(x) x%*%SSI%*%x)
|
607
|
|
- dev[YIndex] <- 1/sqrt( (2*pi)^2*det(SSY))*exp(-0.5*dev[YIndex])
|
608
|
|
- } else {
|
609
|
|
- dev=apply(DD,1,function(x) x%*%SSI%*%x)
|
610
|
|
- dev=1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev)
|
611
|
|
- }
|
612
|
|
- }
|
613
|
|
-
|
614
|
|
- ## BC: must keep SD
|
615
|
|
- params[-2] <- newparams[-2]
|
616
|
|
-
|
617
|
|
- rm(newparams);gc(verbose=FALSE)
|
618
|
|
- if(verbose) cat("Calling", NR, "SNPs... ")
|
619
|
|
- ## ###################
|
620
|
|
- ## ## MOVE TO C#######
|
621
|
|
-
|
622
|
|
- ## running in batches
|
623
|
|
- ## snpBatches <- splitIndicesByLength(1:nrow(A), ocProbesets())
|
624
|
|
-
|
625
|
|
- process2 <- function(idxBatch, snpBatches, autosomeIndex, XIndex,
|
626
|
|
- YIndex, A, B, mixtureParams, fIndex, mIndex,
|
627
|
|
- params, cIndexes, SMEDIAN, theKnots, DF, probs,
|
628
|
|
- regionInfo, batchSize){
|
629
|
|
- open(A)
|
630
|
|
- open(B)
|
631
|
|
- open(mixtureParams)
|
632
|
|
- snps <- snpBatches[[idxBatch]]
|
633
|
|
- tmpA <- as.matrix(A[snps,])
|
634
|
|
- tmpB <- as.matrix(B[snps,])
|
635
|
|
- rSnps <- range(snps)
|
636
|
|
- last <- (idxBatch-1)*batchSize
|
637
|
|
- IndexesBatch <- list(autosomeIndex[autosomeIndex %in% snps]-last,
|
638
|
|
- XIndex[XIndex %in% snps]-last,
|
639
|
|
- YIndex[YIndex %in% snps]-last)
|
640
|
|
- IndexesBatch <- lapply(IndexesBatch, as.integer)
|
641
|
|
- ImNull <- gtypeCallerR2(tmpA, tmpB, fIndex, mIndex,
|
642
|
|
- params[["centers"]][snps,],
|
643
|
|
- params[["scales"]][snps,],
|
644
|
|
- params[["N"]][snps,],
|
645
|
|
- IndexesBatch, cIndexes,
|
646
|
|
- sapply(IndexesBatch, length),
|
647
|
|
- sapply(cIndexes, length),
|
648
|
|
- SMEDIAN, theKnots, mixtureParams[],
|
649
|
|
- DF, probs, 0.025,
|
650
|
|
- which(regionInfo[snps, 2]),
|
651
|
|
- which(regionInfo[snps, 1]))
|
652
|
|
- A[snps,] <- tmpA
|
653
|
|
- B[snps,] <- tmpB
|
654
|
|
- last <- rSnps[2]
|
655
|
|
- rm(tmpA, tmpB, snps, rSnps, IndexesBatch, ImNull)
|
656
|
|
- close(A)
|
657
|
|
- close(B)
|
658
|
|
- close(mixtureParams)
|
659
|
|
- }
|
660
|
|
-
|
661
|
|
- ocLapply(seq(along=snpBatches), process2, snpBatches=snpBatches,
|
662
|
|
- autosomeIndex=autosomeIndex, XIndex=XIndex, YIndex=YIndex,
|
663
|
|
- A=A, B=B, mixtureParams=mixtureParams, fIndex=fIndex,
|
664
|
|
- mIndex=mIndex, params=params, cIndexes=cIndexes,
|
665
|
|
- SMEDIAN=SMEDIAN, theKnots=theKnots, DF=DF, probs=probs,
|
666
|
|
- regionInfo=regionInfo, batchSize=ocProbesets())
|
667
|
|
-
|
668
|
|
-## last <- 0
|
669
|
|
-## for (idxBatch in seq(along=snpBatches)){
|
670
|
|
-## snps <- snpBatches[[idxBatch]]
|
671
|
|
-## tmpA <- A[snps,]
|
672
|
|
-## tmpB <- B[snps,]
|
673
|
|
-## rSnps <- range(snps)
|
674
|
|
-## IndexesBatch <- list(autosomeIndex[autosomeIndex %in% snps]-last,
|
675
|
|
-## XIndex[XIndex %in% snps]-last,
|
676
|
|
-## YIndex[YIndex %in% snps]-last)
|
677
|
|
-## IndexesBatch <- lapply(IndexesBatch, as.integer)
|
678
|
|
-## ImNull <- gtypeCallerR2(tmpA, tmpB, fIndex, mIndex,
|
679
|
|
-## params[["centers"]][snps,],
|
680
|
|
-## params[["scales"]][snps,],
|
681
|
|
-## params[["N"]][snps,],
|
682
|
|
-## IndexesBatch, cIndexes,
|
683
|
|
-## sapply(IndexesBatch, length),
|
684
|
|
-## sapply(cIndexes, length),
|
685
|
|
-## SMEDIAN, theKnots, mixtureParams[],
|
686
|
|
-## DF, probs, 0.025,
|
687
|
|
-## which(regionInfo[snps, 2]),
|
688
|
|
-## which(regionInfo[snps, 1]))
|
689
|
|
-## A[snps,] <- tmpA
|
690
|
|
-## B[snps,] <- tmpB
|
691
|
|
-## last <- rSnps[2]
|
692
|
|
-## rm(tmpA, tmpB, snps, rSnps, IndexesBatch, ImNull)
|
693
|
|
-## }
|
694
|
|
-##
|
695
|
|
-## gc(verbose=FALSE)
|
696
|
|
- ## END MOVE TO C#######
|
697
|
|
- ## ##################
|
698
|
|
-
|
699
|
|
- dev <- dev/(dev+1/383)
|
700
|
|
- if(!is.null(row.names)){ rownames(A) <- rownames(B) <- names(dev) <- row.names}
|
701
|
|
- if(!is.null(col.names)){ colnames(A) <- colnames(B) <- col.names}
|
702
|
|
-
|
703
|
|
- if(length(Index) >= recallRegMin){
|
704
|
|
- tmp4batchQC <- DD[autosomeIndex,]/(params[["N"]][autosomeIndex,]+1)
|
705
|
|
- tmpSnpQc <- dev[autosomeIndex]
|
706
|
|
- SS <- cov(tmp4batchQC[tmpSnpQc < badSNP,])
|
707
|
|
- batchQC <- mean(diag(SS))
|
708
|
|
- }else{
|
709
|
|
- SS <- matrix(0, 3, 3)
|
710
|
|
- batchQC <- Inf
|
711
|
|
- }
|
712
|
|
-
|
713
|
|
- if(verbose) message("Done.")
|
714
|
|
- if (returnParams){
|
715
|
|
- return(list(calls=A, confs=B, SNPQC=dev, batchQC=batchQC, params=params, DD=DD, covDD=SS, gender=gender, pkgname=pkgname))
|
716
|
|
- }else{
|
717
|
|
- return(list(calls=A, confs=B, SNPQC=dev, batchQC=batchQC, DD=DD, covDD=SS, gender=gender, pkgname=pkgname))
|
718
|
|
- }
|
719
|
|
-}
|
720
|
|
-
|
721
|
289
|
|
722
|
290
|
##---------------------------------------------------------------------------
|
723
|
291
|
##---------------------------------------------------------------------------
|
...
|
...
|
@@ -853,17 +421,13 @@ crlmmIlluminaRS <- function(sampleSheet=NULL,
|
853
|
421
|
## Here, I'm just using the # of rows returned from the above function
|
854
|
422
|
if(k == 1){
|
855
|
423
|
if(verbose) message("Initializing container for alleleA, alleleB, call, callProbability")
|
856
|
|
- load.obj <- loadObject("callSet", load.it)
|
857
|
|
- if(!load.obj){
|
858
|
|
- callSet <- new("SnpSuperSet",
|
859
|
|
- alleleA=initializeBigMatrix(name="A", nr=nrow(res[[1]]), nc=length(sns)),
|
860
|
|
- alleleB=initializeBigMatrix(name="B", nr=nrow(res[[1]]), nc=length(sns)),
|
861
|
|
- call=initializeBigMatrix(name="call", nr=nrow(res[[1]]), nc=length(sns)),
|
862
|
|
- callProbability=initializeBigMatrix(name="callPr", nr=nrow(res[[1]]), nc=length(sns)),
|
863
|
|
- annotation=cdfName)
|
864
|
|
- sampleNames(callSet) <- sns
|
865
|
|
- save(callSet, file=file.path(ldPath(), "callSet.rda"))
|
866
|
|
- } else load(file.path(ldPath(), "callSet.rda"))
|
|
424
|
+ callSet <- new("SnpSuperSet",
|
|
425
|
+ alleleA=initializeBigMatrix(name="A", nr=nrow(res[[1]]), nc=length(sns)),
|
|
426
|
+ alleleB=initializeBigMatrix(name="B", nr=nrow(res[[1]]), nc=length(sns)),
|
|
427
|
+ call=initializeBigMatrix(name="call", nr=nrow(res[[1]]), nc=length(sns)),
|
|
428
|
+ callProbability=initializeBigMatrix(name="callPr", nr=nrow(res[[1]]), nc=length(sns)),
|
|
429
|
+ annotation=cdfName)
|
|
430
|
+ sampleNames(callSet) <- sns
|
867
|
431
|
phenoData(callSet) <- getPhenoData(sampleSheet=sampleSheet,
|
868
|
432
|
arrayNames=sns,
|
869
|
433
|
arrayInfoColNames=arrayInfoColNames)
|
...
|
...
|
@@ -878,6 +442,7 @@ crlmmIlluminaRS <- function(sampleSheet=NULL,
|
878
|
442
|
##pData(callSet)$SNR <- rep(NA, length(sns))
|
879
|
443
|
pData(callSet)$gender <- rep(NA, length(sns))
|
880
|
444
|
mixtureParams <- initializeBigMatrix("crlmmMixt-", nr=4, nc=ncol(callSet), vmode="double")
|
|
445
|
+ save(mixtureParams, file=file.path(ldPath(), "mixtureParams.rda"))
|
881
|
446
|
if(missing(batch)){
|
882
|
447
|
protocolData(callSet)$batch <- rep(NA, length(sns))
|
883
|
448
|
} else{
|
...
|
...
|
@@ -907,6 +472,16 @@ crlmmIlluminaRS <- function(sampleSheet=NULL,
|
907
|
472
|
rm(res); gc()
|
908
|
473
|
k <- k+1
|
909
|
474
|
}
|
|
475
|
+ save(callSet, file=file.path(ldPath(), "callSet.rda"))
|
|
476
|
+ ##otherwise, A and B get overwritten
|
|
477
|
+ ##AA <- initializeBigMatrix("crlmmA", nrow(callSet), ncol(callSet), "integer")
|
|
478
|
+ ##BB <- initializeBigMatrix("crlmmB", nrow(callSet), ncol(callSet), "integer")
|
|
479
|
+ ##bb = ocProbesets()*ncol(A)*8
|
|
480
|
+ AA <- clone(A(callSet))
|
|
481
|
+ BB <- clone(B(callSet))
|
|
482
|
+ ##ffrowapply(AA[i1:i2, ] <- A(callSet)[i1:i2, ], X=A(callSet), BATCHBYTES=bb)
|
|
483
|
+ ##ffrowapply(BB[i1:i2, ] <- B(callSet)[i1:i2, ], X=B(callSet), BATCHBYTES=bb)
|
|
484
|
+ ##crlmmGT2 overwrites A and B.
|
910
|
485
|
tmp <- crlmmGT2(A=A(callSet),
|
911
|
486
|
B=B(callSet),
|
912
|
487
|
SNR=callSet$SNR,
|
...
|
...
|
@@ -925,27 +500,13 @@ crlmmIlluminaRS <- function(sampleSheet=NULL,
|
925
|
500
|
badSNP=badSNP)
|
926
|
501
|
open(tmp[["calls"]])
|
927
|
502
|
open(tmp[["confs"]])
|
|
503
|
+ A(callSet) <- AA
|
|
504
|
+ B(callSet) <- BB
|
928
|
505
|
snpCall(callSet) <- tmp[["calls"]]
|
929
|
506
|
## MR: many zeros in the conf. scores (?)
|
930
|
507
|
snpCallProbability(callSet) <- tmp[["confs"]]
|
931
|
508
|
callSet$gender <- tmp$gender
|
932
|
|
- if(copynumber){
|
933
|
|
- load.obj <- loadObject("cnSet", load.it)
|
934
|
|
- if(!load.obj){
|
935
|
|
- cnSet <- as(callSet, "CNSetLM")
|
936
|
|
- } else {
|
937
|
|
- load(file.path(ldPath(), "cnSet.rda"))
|
938
|
|
- A(cnSet) <- A(callSet)
|
939
|
|
- B(cnSet) <- B(callSet)
|
940
|
|
- snpCall(cnSet) <- snpCall(callSet)
|
941
|
|
- snpCallProbability(cnSet) <- snpCallProbability(callSet)
|
942
|
|
- annotation(cnSet) <- annotation(callSet)
|
943
|
|
- featureData(cnSet) <- featureData(callSet)
|
944
|
|
- protocolData(cnSet) <- protocolData(callSet)
|
945
|
|
- phenoData(cnSet) <- phenoData(callSet)
|
946
|
|
- experimentData(cnSet) <- experimentData(callSet)
|
947
|
|
- }
|
948
|
|
- }
|
|
509
|
+ if(copynumber) cnSet <- as(callSet, "CNSetLM")
|
949
|
510
|
close(mixtureParams)
|
950
|
511
|
rm(tmp); gc()
|
951
|
512
|
return(cnSet)
|
...
|
...
|
@@ -1298,7 +859,7 @@ crlmmCopynumber <- function(object,
|
1298
|
859
|
}
|
1299
|
860
|
|
1300
|
861
|
crlmmCopynumber2 <- function(object,
|
1301
|
|
- which.batches,
|
|
862
|
+ which.batches,
|
1302
|
863
|
MIN.SAMPLES=10,
|
1303
|
864
|
SNRMin=5,
|
1304
|
865
|
MIN.OBS=1,
|
...
|
...
|
@@ -1313,8 +874,7 @@ crlmmCopynumber2 <- function(object,
|
1313
|
874
|
MIN.NU=2^3,
|
1314
|
875
|
MIN.PHI=2^3,
|
1315
|
876
|
THR.NU.PHI=TRUE,
|
1316
|
|
- thresholdCopynumber=TRUE,
|
1317
|
|
- load.it=TRUE){
|
|
877
|
+ thresholdCopynumber=TRUE){
|
1318
|
878
|
stopifnot("batch" %in% varLabels(protocolData(object)))
|
1319
|
879
|
stopifnot("chromosome" %in% fvarLabels(object))
|
1320
|
880
|
stopifnot("position" %in% fvarLabels(object))
|
...
|
...
|
@@ -1325,9 +885,7 @@ crlmmCopynumber2 <- function(object,
|
1325
|
885
|
XIndex.nps <- (1:nrow(object))[chromosome(object) == 23 & !isSnp(object) & !is.na(chromosome(object))]
|
1326
|
886
|
autosomeIndex.snps <- (1:nrow(object))[chromosome(object) < 23 & isSnp(object) & !is.na(chromosome(object))]
|
1327
|
887
|
autosomeIndex.nps <- (1:nrow(object))[chromosome(object) < 23 & !isSnp(object) & !is.na(chromosome(object))]
|
1328
|
|
- ##Indexes <- list(autosomeIndex, XIndex, YIndex)
|
1329
|
|
- ##snpBatches <- splitIndicesByLength(1:nrow(object), ocProbesets())
|
1330
|
|
- snpBatches <- splitIndicesByLength(autosomeIndex.snps, ocProbesets())
|
|
888
|
+
|
1331
|
889
|
## Do chromosome X in batches
|
1332
|
890
|
Ns <- initializeBigMatrix("Ns", nrow(object), 5)
|
1333
|
891
|
colnames(Ns) <- c("A", "B", "AA", "AB", "BB")
|
...
|
...
|
@@ -1345,25 +903,26 @@ crlmmCopynumber2 <- function(object,
|
1345
|
903
|
|
1346
|
904
|
}
|
1347
|
905
|
if(verbose) message("Estimating allele-specific copy number at autosomal SNPs")
|
|
906
|
+ snpBatches <- splitIndicesByLength(autosomeIndex.snps, ocProbesets())
|
1348
|
907
|
ocLapply(seq(along=snpBatches),
|
1349
|
|
- fit.lm1,
|
1350
|
|
- autosomeIndex=autosomeIndex.snps,
|
1351
|
|
- object=object,
|
1352
|
|
- Ns=Ns,
|
1353
|
|
- normal=normal,
|
1354
|
|
- snpflags=snpflags,
|
1355
|
|
- snpBatches=snpBatches,
|
1356
|
|
- batchSize=ocProbesets(),
|
1357
|
|
- SNRMin=SNRMin,
|
1358
|
|
- MIN.SAMPLES=MIN.SAMPLES,
|
1359
|
|
- MIN.OBS=MIN.OBS,
|
1360
|
|
- DF=DF.PRIOR,
|
1361
|
|
- GT.CONF.THR=GT.CONF.THR,
|
1362
|
|
- THR.NU.PHI=THR.NU.PHI,
|
1363
|
|
- MIN.NU=MIN.NU,
|
1364
|
|
- MIN.PHI=MIN.PHI,
|
1365
|
|
- verbose=verbose,
|
1366
|
|
- neededPkgs="crlmm")
|
|
908
|
+ fit.lm1,
|
|
909
|
+ autosomeIndex=autosomeIndex.snps,
|
|
910
|
+ object=object,
|
|
911
|
+ Ns=Ns,
|
|
912
|
+ normal=normal,
|
|
913
|
+ snpflags=snpflags,
|
|
914
|
+ snpBatches=snpBatches,
|
|
915
|
+ batchSize=ocProbesets(),
|
|
916
|
+ SNRMin=SNRMin,
|
|
917
|
+ MIN.SAMPLES=MIN.SAMPLES,
|
|
918
|
+ MIN.OBS=MIN.OBS,
|
|
919
|
+ DF=DF.PRIOR,
|
|
920
|
+ GT.CONF.THR=GT.CONF.THR,
|
|
921
|
+ THR.NU.PHI=THR.NU.PHI,
|
|
922
|
+ MIN.NU=MIN.NU,
|
|
923
|
+ MIN.PHI=MIN.PHI,
|
|
924
|
+ verbose=verbose,
|
|
925
|
+ neededPkgs="crlmm")
|
1367
|
926
|
## autosomal NPs
|
1368
|
927
|
snpBatches <- splitIndicesByLength(autosomeIndex.nps, ocProbesets())
|
1369
|
928
|
if(verbose) message("Estimating total copy number at nonpolymorphic loci")
|
...
|
...
|
@@ -1387,7 +946,7 @@ crlmmCopynumber2 <- function(object,
|
1387
|
946
|
verbose=verbose,
|
1388
|
947
|
neededPkgs="crlmm")
|
1389
|
948
|
snpBatches <- splitIndicesByLength(XIndex.snps, ocProbesets())
|
1390
|
|
- if(verbose) message("Estimating total copy number at polymorphic loci on chromosome X")
|
|
949
|
+ if(verbose) message("Estimating allele-specific copy number at polymorphic loci on chromosome X")
|
1391
|
950
|
ocLapply(seq(along=snpBatches),
|
1392
|
951
|
fit.lm3,
|
1393
|
952
|
autosomeIndex=XIndex.snps,
|
...
|
...
|
@@ -1409,7 +968,6 @@ crlmmCopynumber2 <- function(object,
|
1409
|
968
|
neededPkgs="crlmm")
|
1410
|
969
|
if(verbose) message("Estimating total copy number for nonpolymorphic loci on chromosome X")
|
1411
|
970
|
snpBatches <- splitIndicesByLength(XIndex.nps, ocProbesets())
|
1412
|
|
- if(verbose) message("Estimating total copy number at nonpolymorphic loci on chromosome X")
|
1413
|
971
|
tmp <- ocLapply(seq(along=snpBatches),
|
1414
|
972
|
fit.lm4,
|
1415
|
973
|
XIndex=XIndex.nps,
|
...
|
...
|
@@ -1432,6 +990,7 @@ crlmmCopynumber2 <- function(object,
|
1432
|
990
|
return(object)
|
1433
|
991
|
}
|
1434
|
992
|
|
|
993
|
+
|
1435
|
994
|
fit.lm1 <- function(idxBatch,
|
1436
|
995
|
snpBatches,
|
1437
|
996
|
autosomeIndex,
|
...
|
...
|
@@ -1449,25 +1008,32 @@ fit.lm1 <- function(idxBatch,
|
1449
|
1008
|
MIN.NU,
|
1450
|
1009
|
MIN.PHI,
|
1451
|
1010
|
verbose, ...){
|
1452
|
|
- ## which.batches, ...){
|
1453
|
1011
|
if(verbose) message("Probe batch ", idxBatch, " of ", length(snpBatches))
|
|
1012
|
+ snps <- snpBatches[[idxBatch]]
|
|
1013
|
+ batches <- split(seq(along=batch(object)), batch(object))
|
|
1014
|
+ batches <- batches[sapply(batches, length) >= MIN.SAMPLES]
|
|
1015
|
+
|
1454
|
1016
|
open(object)
|
1455
|
1017
|
open(snpflags)
|
1456
|
1018
|
open(normal)
|
1457
|
|
- snps <- snpBatches[[idxBatch]]
|
1458
|
|
- batches <- split(seq(along=batch(object)), batch(object))
|
1459
|
|
- batches <- batches[sapply(batches, length) >= MIN.SAMPLES]
|
1460
|
|
- corr <- corrA.BB <- corrB.AA <- sig2B <- sig2A <- tau2B <- tau2A <- matrix(NA, length(snps), length(unique(batches)))
|
|
1019
|
+
|
|
1020
|
+ corr <- corrA.BB <- corrB.AA <- sig2B <- sig2A <- tau2B <- tau2A <- matrix(NA, length(snps), length(unique(batch(object))))
|
1461
|
1021
|
flags <- nuA <- nuB <- phiA <- phiB <- corr
|
|
1022
|
+
|
|
1023
|
+ normal.snps <- normal[snps, ]
|
1462
|
1024
|
cB <- cA <- matrix(NA, length(snps), ncol(object))
|
|
1025
|
+ GG <- as.matrix(calls(object)[snps, ])
|
|
1026
|
+ CP <- as.matrix(snpCallProbability(object)[snps, ])
|
|
1027
|
+ AA <- as.matrix(A(object)[snps, ])
|
|
1028
|
+ BB <- as.matrix(B(object)[snps, ])
|
1463
|
1029
|
for(k in batches){
|
1464
|
|
- G <- calls(object)[snps, k]
|
1465
|
|
- NORM <- normal[snps, k]
|
1466
|
|
- xx <- snpCallProbability(object)[snps, k]
|
|
1030
|
+ G <- GG[, k]
|
|
1031
|
+ NORM <- normal.snps[, k]
|
|
1032
|
+ xx <- CP[, k]
|
1467
|
1033
|
highConf <- (1-exp(-xx/1000)) > GT.CONF.THR
|
1468
|
1034
|
G <- G*highConf*NORM
|
1469
|
|
- A <- A(object)[snps, k]
|
1470
|
|
- B <- B(object)[snps, k]
|
|
1035
|
+ A <- AA[, k]
|
|
1036
|
+ B <- BB[, k]
|
1471
|
1037
|
##index <- GT.B <- GT.A <- vector("list", 3)
|
1472
|
1038
|
##names(index) <- names(GT.B) <- names(GT.A) <- c("AA", "AB", "BB")
|
1473
|
1039
|
Ns <- applyByGenotype(matrix(1, nrow(G), ncol(G)), rowSums, G)
|
...
|
...
|
@@ -1490,7 +1056,7 @@ fit.lm1 <- function(idxBatch,
|
1490
|
1056
|
corr[, J] <- corByGenotype(A=A, B=B, G=G, Ns=Ns, which.cluster=2, DF.PRIOR)
|
1491
|
1057
|
corrB.AA[, J] <- corByGenotype(A=A, B=B, G=G, Ns=Ns, which.cluster=1, DF.PRIOR)
|
1492
|
1058
|
corrA.BB[, J] <- corByGenotype(A=A, B=B, G=G, Ns=Ns, which.cluster=3, DF.PRIOR)
|
1493
|
|
- ##formerly oneBatch()...
|
|
1059
|
+
|
1494
|
1060
|
##---------------------------------------------------------------------------
|
1495
|
1061
|
## Impute sufficient statistics for unobserved genotypes (plate-specific)
|
1496
|
1062
|
##---------------------------------------------------------------------------
|
...
|
...
|
@@ -1532,6 +1098,8 @@ fit.lm1 <- function(idxBatch,
|
1532
|
1098
|
muA[index[[j]], -kk] <- mus[, 1:2]
|
1533
|
1099
|
muB[index[[j]], -kk] <- mus[, 3:4]
|
1534
|
1100
|
}
|
|
1101
|
+ rm(betahat, X, Y, mus, index, noAA, noAB, noBB, res)
|
|
1102
|
+ gc()
|
1535
|
1103
|
negA <- rowSums(muA < 0) > 0
|
1536
|
1104
|
negB <- rowSums(muB < 0) > 0
|
1537
|
1105
|
flags[, J] <- rowSums(Ns == 0) > 0
|
...
|
...
|
@@ -1561,21 +1129,53 @@ fit.lm1 <- function(idxBatch,
|
1561
|
1129
|
## formerly polymorphic(): calculate copy number
|
1562
|
1130
|
cA[, k] <- matrix((1/phiA[, J]*(A-nuA[, J])), nrow(A), ncol(A))
|
1563
|
1131
|
cB[, k] <- matrix((1/phiB[, J]*(B-nuB[, J])), nrow(B), ncol(B))
|
|
1132
|
+ rm(G, A, B, NORM, wA, wB, YA,YB, res, negA, negB, Np, Ns)
|
|
1133
|
+ gc()
|
1564
|
1134
|
}
|
|
1135
|
+
|
|
1136
|
+ cA[cA < 0.05] <- 0.05
|
|
1137
|
+ cB[cB < 0.05] <- 0.05
|
|
1138
|
+ cA[cA > 5] <- 5
|
|
1139
|
+ cB[cB > 5] <- 5
|
1565
|
1140
|
cA <- matrix(as.integer(cA*100), nrow(cA), ncol(cA))
|
1566
|
1141
|
cB <- matrix(as.integer(cB*100), nrow(cB), ncol(cB))
|
|
1142
|
+
|
|
1143
|
+
|
|
1144
|
+
|
1567
|
1145
|
CA(object)[snps, ] <- cA
|
1568
|
1146
|
CB(object)[snps, ] <- cB
|
|
1147
|
+
|
|
1148
|
+
|
1569
|
1149
|
snpflags[snps, ] <- flags
|
1570
|
1150
|
lapply(lM(object), open)
|
1571
|
|
- lM(object)$tau2A[snps, ] <- tau2A
|
1572
|
|
- lM(object)$tau2B[snps, ] <- tau2B
|
1573
|
|
- lM(object)$sig2A[snps, ] <- sig2A
|
1574
|
|
- lM(object)$sig2B[snps, ] <- sig2B
|
1575
|
|
- lM(object)$nuA[snps, ] <- nuA
|
1576
|
|
- lM(object)$nuB[snps, ] <- nuB
|
1577
|
|
- lM(object)$phiA[snps, ] <- phiA
|
1578
|
|
- lM(object)$phiB[snps, ] <- phiB
|
|
1151
|
+
|
|
1152
|
+ tmp <- physical(lM(object))$tau2A
|
|
1153
|
+ tmp[snps, ] <- tau2A
|
|
1154
|
+ lM(object)$tau2A <- tmp
|
|
1155
|
+ tmp <- physical(lM(object))$tau2B
|
|
1156
|
+ tmp[snps, ] <- tau2B
|
|
1157
|
+ lM(object)$tau2B <- tmp
|
|
1158
|
+ tmp <- physical(lM(object))$tau2B
|
|
1159
|
+ tmp[snps, ] <- tau2B
|
|
1160
|
+ lM(object)$tau2B <- tmp
|
|
1161
|
+ tmp <- physical(lM(object))$sig2A
|
|
1162
|
+ tmp[snps, ] <- sig2A
|
|
1163
|
+ lM(object)$sig2A <- tmp
|
|
1164
|
+ tmp <- physical(lM(object))$sig2B
|
|
1165
|
+ tmp[snps, ] <- sig2B
|
|
1166
|
+ lM(object)$sig2B <- tmp
|
|
1167
|
+ tmp <- physical(lM(object))$nuA
|
|
1168
|
+ tmp[snps, ] <- nuA
|
|
1169
|
+ lM(object)$nuA <- tmp
|
|
1170
|
+ tmp <- physical(lM(object))$nuB
|
|
1171
|
+ tmp[snps, ] <- nuB
|
|
1172
|
+ lM(object)$nuB <- tmp
|
|
1173
|
+ tmp <- physical(lM(object))$phiA
|
|
1174
|
+ tmp[snps, ] <- phiA
|
|
1175
|
+ lM(object)$phiA <- tmp
|
|
1176
|
+ tmp <- physical(lM(object))$phiB
|
|
1177
|
+ tmp[snps, ] <- phiB
|
|
1178
|
+ lM(object)$phiB <- tmp
|
1579
|
1179
|
lapply(assayData(object), close)
|
1580
|
1180
|
lapply(lM(object), close)
|
1581
|
1181
|
TRUE
|
...
|
...
|
@@ -1600,45 +1200,56 @@ fit.lm2 <- function(idxBatch,
|
1600
|
1200
|
verbose, ...){
|
1601
|
1201
|
## which.batches, ...){
|
1602
|
1202
|
if(verbose) message("Probe batch ", idxBatch, " of ", length(snpBatches))
|
1603
|
|
- open(object)
|
1604
|
|
- open(snpflags)
|
1605
|
|
- open(normal)
|
1606
|
1203
|
snps <- snpBatches[[idxBatch]]
|
1607
|
1204
|
batches <- split(seq(along=batch(object)), batch(object))
|
1608
|
1205
|
batches <- batches[sapply(batches, length) >= MIN.SAMPLES]
|
|
1206
|
+
|
|
1207
|
+ open(object)
|
|
1208
|
+ open(snpflags)
|
|
1209
|
+ open(normal)
|
|
1210
|
+
|
|
1211
|
+
|
1609
|
1212
|
cA <- matrix(NA, length(snps), ncol(object))
|
1610
|
1213
|
ii <- isSnp(object) & chromosome(object) < 23 & !is.na(chromosome(object))
|
1611
|
|
- flags <- snpflags[, ]
|
1612
|
|
- noflags <- rowSums(flags) == 0
|
|
1214
|
+ flags <- as.matrix(snpflags[,])
|
|
1215
|
+ noflags <- rowSums(flags, na.rm=TRUE) == 0 ##NA's for unevaluated batches
|
1613
|
1216
|
## We do not want to write to discuss for each batch. More efficient to
|
1614
|
1217
|
## write to disk after estimating these parameters for all batches.
|
1615
|
|
- nuA.np <- phiA.np <- sig2A.np <- matrix(NA, length(snps), length(unique(batches)))
|
|
1218
|
+ nuA.np <- phiA.np <- sig2A.np <- matrix(NA, length(snps), length(unique(batch(object))))
|
1616
|
1219
|
## for imputation, we need the corresponding parameters of the snps
|
1617
|
|
- nuA <- lM(object)$nuA
|
1618
|
|
- phiA <- lM(object)$phiA
|
1619
|
|
- nuB <- lM(object)$nuB
|
1620
|
|
- phiB <- lM(object)$phiB
|
1621
|
|
- snp.ind <- ii & noflags
|
1622
|
|
- NORM.np <- normal[snps, ]
|
|
1220
|
+ NN <- min(10e3, length(which(ii & noflags)))
|
|
1221
|
+ snp.ind <- sample(which(ii & noflags), NN)
|
|
1222
|
+ nnuA.snp <- as.matrix(physical(lM(object))$nuA[snp.ind,])
|
|
1223
|
+ pphiA.snp <- as.matrix(physical(lM(object))$phiA[snp.ind,])
|
|
1224
|
+ nnuB.snp <- as.matrix(physical(lM(object))$nuB[snp.ind,])
|
|
1225
|
+ pphiB.snp <- as.matrix(physical(lM(object))$phiB[snp.ind,])
|
|
1226
|
+
|
|
1227
|
+ AA.snp <- as.matrix(A(object)[snp.ind, ])
|
|
1228
|
+ BB.snp <- as.matrix(B(object)[snp.ind, ])
|
|
1229
|
+ NNORM.snp <- as.matrix(normal[snp.ind, ])
|
|
1230
|
+ NORM.np <- as.matrix(normal[snps, ])
|
|
1231
|
+ AA.np <- as.matrix(A(object)[snps, ])
|
|
1232
|
+ GG <- as.matrix(calls(object)[snp.ind, ])
|
|
1233
|
+ CP <- as.matrix(snpCallProbability(object)[snp.ind, ])
|
1623
|
1234
|
for(k in batches){
|
1624
|
1235
|
##if(verbose) message("SNP batch ", ii, " of ", length(batches))
|
1625
|
1236
|
J <- match(unique(batch(object)[k]), unique(batch(object)))
|
1626
|
|
- snp.index <- snp.ind & nuA[, J] > 20 & nuB[, J] > 20 & phiA[, J] > 20 & phiB[, J] > 20
|
1627
|
|
- if(sum(snp.index) >= 5000){
|
1628
|
|
- snp.index <- sample(which(snp.index), 5000)
|
1629
|
|
- }
|
1630
|
|
- phiA.snp <- phiA[snp.index, J]
|
1631
|
|
- phiB.snp <- phiB[snp.index, J]
|
1632
|
|
- A.snp <- A(object)[snp.index, k]
|
1633
|
|
- B.snp <- B(object)[snp.index, k]
|
1634
|
|
- NORM.snp <- normal[snp.index, k]
|
1635
|
|
- G <- calls(object)[snp.index, k]
|
1636
|
|
- xx <- snpCallProbability(object)[snp.index, k]
|
|
1237
|
+## snp.index <- snp.ind & nuA[, J] > 20 & nuB[, J] > 20 & phiA[, J] > 20 & phiB[, J] > 20
|
|
1238
|
+## if(sum(snp.index) >= 5000){
|
|
1239
|
+## snp.index <- sample(which(snp.index), 5000)
|
|
1240
|
+## } else snp.index <- which(snp.index)
|
|
1241
|
+ phiA.snp <- pphiA.snp[, J]
|
|
1242
|
+ phiB.snp <- pphiB.snp[, J]
|
|
1243
|
+ A.snp <- AA.snp[, k]
|
|
1244
|
+ B.snp <- BB.snp[, k]
|
|
1245
|
+ NORM.snp <- NNORM.snp[, k]
|
|
1246
|
+ G <- GG[, k]
|
|
1247
|
+ xx <- CP[, k]
|
1637
|
1248
|
highConf <- (1-exp(-xx/1000)) > GT.CONF.THR
|
1638
|
|
- G <- G*highConf*NORM.snp[, k]
|
|
1249
|
+ G <- G*highConf*NORM.snp
|
1639
|
1250
|
G[G==0] <- NA
|
1640
|
1251
|
##nonpolymorphic
|
1641
|
|
- A.np <- A(object)[snps, k]
|
|
1252
|
+ A.np <- AA.np[, k]
|
1642
|
1253
|
Ns <- applyByGenotype(matrix(1, nrow(G), ncol(G)), rowSums, G)
|
1643
|
1254
|
muA <- applyByGenotype(A.snp, rowMedians, G)
|
1644
|
1255
|
muB <- applyByGenotype(B.snp, rowMedians, G)
|
...
|
...
|
@@ -1660,13 +1271,27 @@ fit.lm2 <- function(idxBatch,
|
1660
|
1271
|
}
|
1661
|
1272
|
cA[, k] <- 1/phiA.np[, J] * (A.np - nuA.np[, J])
|
1662
|
1273
|
sig2A.np[, J] <- rowMAD(log2(A.np*NORM.np[, k]), na.rm=TRUE)
|
|
1274
|
+ rm(NORM.snp, highConf, xx, G, Ns, A.np, X, Y, betahat, mus, logPhiT)
|
|
1275
|
+ gc()
|
1663
|
1276
|
}
|
|
1277
|
+
|
|
1278
|
+ cA[cA < 0.05] <- 0.05
|
|
1279
|
+ cA[cA > 5] <- 5
|
1664
|
1280
|
cA <- matrix(as.integer(cA*100), nrow(cA), ncol(cA))
|
1665
|
1281
|
CA(object)[snps, ] <- cA
|
1666
|
1282
|
##open(lM(object))
|
1667
|
|
- lM(object)$sig2A[snps, ] <- sig2A.np
|
1668
|
|
- lM(object)$nuA[snps, ] <- nuA.np
|
1669
|
|
- lM(object)$phiA[snps, ] <- phiA.np
|
|
1283
|
+ tmp <- physical(lM(object))$nuA
|
|
1284
|
+ tmp[snps, ] <- nuA.np
|
|
1285
|
+ lM(object)$nuA <- tmp
|
|
1286
|
+ tmp <- physical(lM(object))$sig2A
|
|
1287
|
+ tmp[snps, ] <- sig2A.np
|
|
1288
|
+ lM(object)$sig2A <- tmp
|
|
1289
|
+ tmp <- physical(lM(object))$phiA
|
|
1290
|
+ tmp[snps, ] <- phiA.np
|
|
1291
|
+ lM(object)$sig2A <- tmp
|
|
1292
|
+## lM(object)$sig2A[snps, ] <- sig2A.np
|
|
1293
|
+## lM(object)$nuA[snps, ] <- nuA.np
|
|
1294
|
+## lM(object)$phiA[snps, ] <- phiA.np
|
1670
|
1295
|
lapply(assayData(object), close)
|
1671
|
1296
|
lapply(lM(object), close)
|
1672
|
1297
|
TRUE
|
...
|
...
|
@@ -1692,13 +1317,14 @@ fit.lm3 <- function(idxBatch,
|
1692
|
1317
|
verbose, ...){
|
1693
|
1318
|
## which.batches, ...){
|
1694
|
1319
|
if(verbose) message("Probe batch ", idxBatch, " of ", length(snpBatches))
|
|
1320
|
+ snps <- snpBatches[[idxBatch]]
|
|
1321
|
+ batches <- split(seq(along=batch(object)), batch(object))
|
|
1322
|
+ batches <- batches[sapply(batches, length) >= MIN.SAMPLES]
|
|
1323
|
+
|
1695
|
1324
|
open(snpflags)
|
1696
|
1325
|
open(normal)
|
1697
|
1326
|
open(object)
|
1698
|
|
- snps <- snpBatches[[idxBatch]]
|
1699
|
|
- batches <- split(seq(along=batch(object)), batch(object))
|
1700
|
|
- batches <- batches[sapply(batches, length) >= MIN.SAMPLES]
|
1701
|
|
- corrAB <- corrBB <- corrAA <- sig2B <- sig2A <- tau2B <- tau2A <- matrix(NA, length(snps), length(unique(batches)))
|
|
1327
|
+ corrAB <- corrBB <- corrAA <- sig2B <- sig2A <- tau2B <- tau2A <- matrix(NA, length(snps), length(unique(batch(object))))
|
1702
|
1328
|
phiA2 <- phiB2 <- tau2A
|
1703
|
1329
|
flags <- nuA <- nuB <- phiA <- phiB <- corrAB
|
1704
|
1330
|
cB <- cA <- matrix(NA, length(snps), ncol(object))
|
...
|
...
|
@@ -1706,15 +1332,21 @@ fit.lm3 <- function(idxBatch,
|
1706
|
1332
|
IX <- matrix(gender, length(snps), ncol(object))
|
1707
|
1333
|
NORM <- normal[snps,]
|
1708
|
1334
|
IX <- IX==2
|
|
1335
|
+
|
|
1336
|
+ GG <- as.matrix(calls(object)[snps, ])
|
|
1337
|
+ CP <- as.matrix(snpCallProbability(object)[snps,])
|
|
1338
|
+ AA <- as.matrix(A(object)[snps, ])
|
|
1339
|
+ BB <- as.matrix(B(object)[snps, ])
|
1709
|
1340
|
for(k in batches){
|
1710
|
1341
|
##if(verbose) message("SNP batch ", ii, " of ", length(batches))
|
1711
|
1342
|
## within-genotype moments
|
1712
|
|
- G <- calls(object)[snps, k]
|
1713
|
|
- xx <- snpCallProbability(object)[snps, k]
|
|
1343
|
+ gender <- object$gender[k]
|
|
1344
|
+ G <- GG[, k]
|
|
1345
|
+ xx <- CP[, k]
|
1714
|
1346
|
highConf <- (1-exp(-xx/1000)) > GT.CONF.THR
|
1715
|
1347
|
G <- G*highConf*NORM[, k]
|
1716
|
|
- A <- A(object)[snps, k]
|
1717
|
|
- B <- B(object)[snps, k]
|
|
1348
|
+ A <- AA[, k]
|
|
1349
|
+ B <- BB[, k]
|
1718
|
1350
|
##index <- GT.B <- GT.A <- vector("list", 3)
|
1719
|
1351
|
##names(index) <- names(GT.B) <- names(GT.A) <- c("AA", "AB", "BB")
|
1720
|
1352
|
Ns.F <- applyByGenotype(matrix(1, nrow(G), sum(gender==2)), rowSums, G[, gender==2])
|
...
|
...
|
@@ -1753,10 +1385,10 @@ fit.lm3 <- function(idxBatch,
|
1753
|
1385
|
correct.orderB <- muB.F[, 3] > muB.F[, 1]
|
1754
|
1386
|
index.complete <- intersect(which(correct.orderA & correct.orderB), intersect(index[[1]], intersect(index[[2]], index[[3]])))
|
1755
|
1387
|
size <- min(5000, length(index.complete))
|
1756
|
|
- if(size == 5000) index.complete <- sample(index.complete, 5000, replace=TRUE)
|
1757
|
1388
|
if(length(index.complete) < 200){
|
1758
|
1389
|
stop("fewer than 200 snps pass criteria for predicting the sufficient statistics")
|
1759
|
1390
|
}
|
|
1391
|
+ if(size==5000) index.complete <- sample(index.complete, size)
|
1760
|
1392
|
index <- vector("list", 3)
|
1761
|
1393
|
index[[1]] <- which(Ns.F[, 1] == 0 & (Ns.F[, 2] >= MIN.OBS & Ns.F[, 3] >= MIN.OBS))
|
1762
|
1394
|
index[[2]] <- which(Ns.F[, 2] == 0 & (Ns.F[, 1] >= MIN.OBS & Ns.F[, 3] >= MIN.OBS))
|
...
|
...
|
@@ -1772,10 +1404,11 @@ fit.lm3 <- function(idxBatch,
|
1772
|
1404
|
complete[[2]] <- which(correct.orderA & correct.orderB & nobsB & notMissing) ##be selective here
|
1773
|
1405
|
size <- min(5000, length(complete[[1]]))
|
1774
|
1406
|
if(size > 5000) complete <- lapply(complete, function(x) sample(x, size))
|
|
1407
|
+ ##
|
1775
|
1408
|
res <- imputeCenterX(muA.M, muB.M, Ns.M, complete, MIN.OBS)
|
1776
|
1409
|
muA.M <- res[[1]]
|
1777
|
1410
|
muB.M <- res[[2]]
|
1778
|
|
-
|
|
1411
|
+ ##
|
1779
|
1412
|
## Monomorphic SNPs. Mixture model may be better
|
1780
|
1413
|
## Improve estimation by borrowing strength across batch
|
1781
|
1414
|
noAA <- Ns.F[, 1] < MIN.OBS
|
...
|
...
|
@@ -1833,9 +1466,17 @@ fit.lm3 <- function(idxBatch,
|
1833
|
1466
|
phistar <- phiB2[, J]/phiA[, J]
|
1834
|
1467
|
tmp <- (B-nuB[, J] - phistar*A + phistar*nuA[, J])/phiB[, J]
|
1835
|
1468
|
cB[, k] <- tmp/(1-phistar*phiA2[, J]/phiB[, J])
|
1836
|
|
- cA[, k] <- (A-nuA[, J]-phiA2[, J]*cB)/phiA[, J]
|
|
1469
|
+ cA[, k] <- (A-nuA[, J]-phiA2[, J]*cB[, k])/phiA[, J]
|
1837
|
1470
|
##some of the snps are called for the men, but not the women
|
|
1471
|
+ rm(YA, YB, wA, wB, res, tmp, phistar, A, B, G, index)
|
|
1472
|
+ gc()
|
1838
|
1473
|
}
|
|
1474
|
+
|
|
1475
|
+ cA[cA < 0.05] <- 0.05
|
|
1476
|
+ cB[cB < 0.05] <- 0.05
|
|
1477
|
+ cA[cA > 5] <- 5
|
|
1478
|
+ cB[cB > 5] <- 5
|
|
1479
|
+
|
1839
|
1480
|
##--------------------------------------------------
|
1840
|
1481
|
##RS: need to fix. why are there NA's by coercion
|
1841
|
1482
|
cA <- matrix(as.integer(cA*100), nrow(cA), ncol(cA))
|
...
|
...
|
@@ -1846,16 +1487,49 @@ fit.lm3 <- function(idxBatch,
|
1846
|
1487
|
CA(object)[snps, ] <- cA
|
1847
|
1488
|
CB(object)[snps, ] <- cB
|
1848
|
1489
|
snpflags[snps, ] <- flags
|
1849
|
|
- lM(object)$tau2A[snps, ] <- tau2A
|
1850
|
|
- lM(object)$tau2B[snps, ] <- tau2B
|
1851
|
|
- lM(object)$sig2A[snps, ] <- sig2A
|
1852
|
|
- lM(object)$sig2B[snps, ] <- sig2B
|
1853
|
|
- lM(object)$nuA[snps, ] <- nuA
|
1854
|
|
- lM(object)$nuB[snps, ] <- nuB
|
1855
|
|
- lM(object)$phiA[snps, ] <- phiA
|
1856
|
|
- lM(object)$phiB[snps, ] <- phiB
|
1857
|
|
- lM(object)$phiPrimeA[snps, ] <- phiA2
|
1858
|
|
- lM(object)$phiPrimeB[snps, ] <- phiB2
|
|
1490
|
+ tmp <- physical(lM(object))$tau2A
|
|
1491
|
+ tmp[snps, ] <- tau2A
|
|
1492
|
+ lM(object)$tau2A <- tmp
|
|
1493
|
+ tmp <- physical(lM(object))$tau2B
|
|
1494
|
+ tmp[snps, ] <- tau2B
|
|
1495
|
+ lM(object)$tau2B <- tmp
|
|
1496
|
+ tmp <- physical(lM(object))$tau2B
|
|
1497
|
+ tmp[snps, ] <- tau2B
|
|
1498
|
+ lM(object)$tau2B <- tmp
|
|
1499
|
+ tmp <- physical(lM(object))$sig2A
|
|
1500
|
+ tmp[snps, ] <- sig2A
|
|
1501
|
+ lM(object)$sig2A <- tmp
|
|
1502
|
+ tmp <- physical(lM(object))$sig2B
|
|
1503
|
+ tmp[snps, ] <- sig2B
|
|
1504
|
+ lM(object)$sig2B <- tmp
|
|
1505
|
+ tmp <- physical(lM(object))$nuA
|
|
1506
|
+ tmp[snps, ] <- nuA
|
|
1507
|
+ lM(object)$nuA <- tmp
|
|
1508
|
+ tmp <- physical(lM(object))$nuB
|
|
1509
|
+ tmp[snps, ] <- nuB
|
|
1510
|
+ lM(object)$nuB <- tmp
|
|
1511
|
+ tmp <- physical(lM(object))$phiA
|
|
1512
|
+ tmp[snps, ] <- phiA
|
|
1513
|
+ lM(object)$phiA <- tmp
|
|
1514
|
+ tmp <- physical(lM(object))$phiB
|
|
1515
|
+ tmp[snps, ] <- phiB
|
|
1516
|
+ lM(object)$phiB <- tmp
|
|
1517
|
+ tmp <- physical(lM(object))$phiPrimeA
|
|
1518
|
+ tmp[snps, ] <- phiA2
|
|
1519
|
+ lM(object)$phiPrimeA <- tmp
|
|
1520
|
+ tmp <- physical(lM(object))$phiPrimeB
|
|
1521
|
+ tmp[snps, ] <- phiB2
|
|
1522
|
+ lM(object)$phiPrimeB <- tmp
|
|
1523
|
+## lM(object)$tau2A[snps, ] <- tau2A
|
|
1524
|
+## lM(object)$tau2B[snps, ] <- tau2B
|
|
1525
|
+## lM(object)$sig2A[snps, ] <- sig2A
|
|
1526
|
+## lM(object)$sig2B[snps, ] <- sig2B
|
|
1527
|
+## lM(object)$nuA[snps, ] <- nuA
|
|
1528
|
+## lM(object)$nuB[snps, ] <- nuB
|
|
1529
|
+## lM(object)$phiA[snps, ] <- phiA
|
|
1530
|
+## lM(object)$phiB[snps, ] <- phiB
|
|
1531
|
+## lM(object)$phiPrimeA[snps, ] <- phiA2
|
|
1532
|
+## lM(object)$phiPrimeB[snps, ] <- phiB2
|
1859
|
1533
|
lapply(assayData(object), close)
|
1860
|
1534
|
lapply(lM(object), close)
|
1861
|
1535
|
TRUE
|
...
|
...
|
@@ -1885,39 +1559,51 @@ fit.lm4 <- function(idxBatch,
|
1885
|
1559
|
snps <- snpBatches[[idxBatch]]
|
1886
|
1560
|
batches <- split(seq(along=batch(object)), batch(object))
|
1887
|
1561
|
batches <- batches[sapply(batches, length) >= MIN.SAMPLES]
|
1888
|
|
- nuA <- phiA <- sig2A <- tau2A <- matrix(NA, length(snps), length(unique(batches)))
|
|
1562
|
+ nuA <- phiA <- sig2A <- tau2A <- matrix(NA, length(snps), length(unique(batch(object))))
|
1889
|
1563
|
cA <- matrix(NA, length(snps), ncol(object))
|
1890
|
1564
|
ii <- isSnp(object) & chromosome(object) < 23 & !is.na(chromosome(object))
|
1891
|
1565
|
flags <- snpflags[ii, , drop=FALSE]
|
1892
|
|
- noflags <- rowSums(flags) == 0
|
|
1566
|
+ noflags <- rowSums(flags, na.rm=TRUE) == 0
|
1893
|
1567
|
lapply(lM(object), open)
|
1894
|
|
- nuIA <- lM(object)$nuA[ii, ]
|
1895
|
|
- nuIB <- lM(object)$nuB[ii, ]
|
1896
|
|
- phiIA <- lM(object)$phiA[ii,]
|
1897
|
|
- phiIB <- lM(object)$phiB[ii,]
|
1898
|
|
-
|
1899
|
|
- snp.index <- which(nuIA > 20 & nuIB & 20 & phiIA > 20 & phiIB > 20 & noflags)
|
|
1568
|
+ nuIA <- physical(lM(object))$nuA[ii, ]
|
|
1569
|
+ nuIB <- physical(lM(object))$nuB[ii, ]
|
|
1570
|
+ phiIA <- physical(lM(object))$phiA[ii,]
|
|
1571
|
+ phiIB <- physical(lM(object))$phiB[ii,]
|
|
1572
|
+
|
|
1573
|
+ i1 <- rowSums(nuIA < 20, na.rm=TRUE) == 0
|
|
1574
|
+ i2 <- rowSums(nuIB < 20, na.rm=TRUE) == 0
|
|
1575
|
+ i3 <- rowSums(phiIA < 20, na.rm=TRUE) == 0
|
|
1576
|
+ i4 <- rowSums(phiIB < 20, na.rm=TRUE) == 0
|
|
1577
|
+
|
|
1578
|
+ snp.index <- which(i1 & i2 & i3 & i4 & noflags)
|
|
1579
|
+ if(length(snp.index) == 0){
|
|
1580
|
+ warning("No snps meet the following criteria: (1) nu and phi > 20 and (2) at least MIN.OBS in each genotype cluster. CN not estimated for nonpolymorphic loci on X")
|
|
1581
|
+ return(TRUE)
|
|
1582
|
+ }
|
1900
|
1583
|
if(length(snp.index) >= 5000){
|
1901
|
1584
|
snp.index <- sample(snp.index, 5000)
|
1902
|
1585
|
}
|
1903
|
|
- phiA.snp <- lM(object)$phiA[snp.index, , drop=FALSE]
|
1904
|
|
- phiB.snp <- lM(object)$phiB[snp.index, , drop=FALSE]
|
1905
|
|
- A.snp <- A(object)[snp.index, ]
|
1906
|
|
- B.snp <- B(object)[snp.index, ]
|
1907
|
|
- NORM.snp <- normal[snp.index, ]
|
1908
|
|
- NORM.np <- normal[snps, ]
|
|
1586
|
+ phiA.snp <- physical(lM(object))$phiA[snp.index, , drop=FALSE]
|
|
1587
|
+ phiB.snp <- physical(lM(object))$phiB[snp.index, , drop=FALSE]
|
|
1588
|
+ A.snp <- as.matrix(A(object)[snp.index, ])
|
|
1589
|
+ B.snp <- as.matrix(B(object)[snp.index, ])
|
|
1590
|
+ NORM.snp <- as.matrix(normal[snp.index, ])
|
|
1591
|
+ NORM.np <- as.matrix(normal[snps, ])
|
1909
|
1592
|
gender <- object$gender
|
1910
|
1593
|
|
1911
|
1594
|
|
1912
|
1595
|
pseudoAR <- position(object)[snps] < 2709520 | (position(object)[snps] > 154584237 & position(object)[snps] < 154913754)
|
1913
|
1596
|
pseudoAR[is.na(pseudoAR)] <- FALSE
|
1914
|
|
-
|
|
1597
|
+
|
|
1598
|
+ GG <- as.matrix(calls(object)[snp.index, ])
|
|
1599
|
+ CP <- as.matrix(snpCallProbability(object)[snp.index, ])
|
|
1600
|
+ AA.np <- as.matrix(A(object)[snps, ])
|
1915
|
1601
|
##if(missing(which.batches)) which.batches <- seq(along=batches)
|
1916
|
1602
|
##batches <- batches[which.batches]
|
1917
|
1603
|
for(k in batches){
|
1918
|
1604
|
##if(verbose) message("SNP batch ", ii, " of ", length(batches))
|
1919
|
|
- G <- calls(object)[snp.index, k]
|
1920
|
|
- xx <- snpCallProbability(object)[snp.index, k]
|
|
1605
|
+ G <- GG[, k]
|
|
1606
|
+ xx <- CP[, k]
|
1921
|
1607
|
highConf <- (1-exp(-xx/1000)) > GT.CONF.THR
|
1922
|
1608
|
G <- G*highConf*NORM.snp[, k]
|
1923
|
1609
|
##snps
|
...
|
...
|
@@ -1947,7 +1633,7 @@ fit.lm4 <- function(idxBatch,
|
1947
|
1633
|
|
1948
|
1634
|
|
1949
|
1635
|
##nonpolymorphic
|
1950
|
|
- A <- A(object)[snps, k]
|
|
1636
|
+ A <- AA.np[, k]
|
1951
|
1637
|
gend <- gender[k]
|
1952
|
1638
|
A.M <- A[, gend==1]
|
1953
|
1639
|
mu1 <- rowMedians(A.M, na.rm=TRUE)
|
...
|
...
|
@@ -1985,185 +1671,32 @@ fit.lm4 <- function(idxBatch,
|
1985
|
1671
|
tmp[, gend==1] <- CT1
|
1986
|
1672
|
tmp[, gend==2] <- CT2
|
1987
|
1673
|
cA[, k] <- tmp
|
|
1674
|
+ rm(tmp, CT1, CT2, A.F, normal.f, G, AA, BB, Y, X, Ns)
|
|
1675
|
+ gc()
|
1988
|
1676
|
}
|
|
1677
|
+
|
|
1678
|
+ cA[cA < 0.05] <- 0.05
|
|
1679
|
+ cA[cA > 5] <- 5
|
1989
|
1680
|
cA <- matrix(as.integer(cA*100), nrow(cA), ncol(cA))
|
1990
|
1681
|
CA(object)[snps, ] <- cA
|
1991
|
1682
|
open(lM(object))
|
1992
|
|
- lM(object)$sig2A[snps, ] <- sig2A
|
1993
|
|
- lM(object)$nuA[snps, ] <- nuA
|
1994
|
|
- lM(object)$phiA[snps, ] <- phiA
|
|
1683
|
+ tmp <- physical(lM(object))$nuA
|
|
1684
|
+ tmp[snps, ] <- nuA
|
|
1685
|
+ lM(object)$nuA <- tmp
|
|
1686
|
+ tmp <- physical(lM(object))$sig2A
|
|
1687
|
+ tmp[snps, ] <- sig2A
|
|
1688
|
+ lM(object)$sig2A <- tmp
|
|
1689
|
+ tmp <- physical(lM(object))$phiA
|
|
1690
|
+ tmp[snps, ] <- phiA
|
|
1691
|
+ lM(object)$sig2A <- tmp
|
|
1692
|
+## lM(object)$sig2A[snps, ] <- sig2A
|
|
1693
|
+## lM(object)$nuA[snps, ] <- nuA
|
|
1694
|
+## lM(object)$phiA[snps, ] <- phiA
|
1995
|
1695
|
lapply(assayData(object), close)
|
1996
|
1696
|
lapply(lM(object), close)
|
1997
|
1697
|
TRUE
|
1998
|
1698
|
}
|
1999
|
1699
|
|
2000
|
|
-
|
2001
|
|
-crlmmWrapper <- function(filenames, cnOptions, ...){
|
2002
|
|
- cdfName <- cnOptions[["cdfName"]]
|
2003
|
|
- load.it <- cnOptions[["load.it"]]
|
2004
|
|
- save.it <- cnOptions[["save.it"]]
|
2005
|
|
- splitByChr <- cnOptions[["splitByChr"]]
|
2006
|
|
- crlmmFile <- cnOptions[["crlmmFile"]]
|
2007
|
|
- intensityFile=cnOptions[["intensityFile"]]
|
2008
|
|
- rgFile=cnOptions[["rgFile"]]
|
2009
|
|
- ##use.ff=cnOptions[["use.ff"]]
|
2010
|
|
- outdir <- cnOptions[["outdir"]]
|
2011
|
|
- if(missing(cdfName)) stop("cdfName is missing -- a valid cdfName is required. See crlmm:::validCdfNames()")
|
2012
|
|
- platform <- whichPlatform(cdfName)
|
2013
|
|
- if(!(platform %in% c("affymetrix", "illumina"))){
|
2014
|
|
- stop("Only 'affymetrix' and 'illumina' platforms are supported at this time.")
|
2015
|
|
- } else {
|
2016
|
|
- message("Checking whether annotation package for the ", platform, " platform is available")
|
2017
|
|
- if(!isValidCdfName(cdfName)){
|
2018
|
|
- cat("FALSE\n")
|
2019
|
|
- stop(cdfName, " is not a valid entry. See crlmm:::validCdfNames(platform)")
|
2020
|
|
- } else cat("TRUE\n")
|
2021
|
|
- }
|
2022
|
|
- if(missing(intensityFile)) stop("must specify 'intensityFile'.")
|
2023
|
|
- if(missing(crlmmFile)) stop("must specify 'crlmmFile'.")
|
2024
|
|
- if(platform == "illumina"){
|
2025
|
|
- if(missing(rgFile)){
|
2026
|
|
- ##stop("must specify 'rgFile'.")
|
2027
|
|
- rgFile <- file.path(dirname(crlmmFile), "rgFile.rda")
|
2028
|
|
- message("rgFile not specified. Using ", rgFile)
|
2029
|
|
- }
|
2030
|
|
- if(!load.it){
|
2031
|
|
- RG <- readIdatFiles(...)
|
2032
|
|
- if(save.it) save(RG, file=rgFile)
|
2033
|
|
- }
|
2034
|
|
- if(load.it & !file.exists(rgFile)){
|
2035
|
|
- message("load.it is TRUE, but rgFile not present. Attempting to read the idatFiles.")
|
2036
|
|
- RG <- readIdatFiles(...)
|
2037
|
|
- if(save.it) save(RG, file=rgFile)
|
2038
|
|
- }
|
2039
|
|
- if(load.it & file.exists(rgFile)){
|
2040
|
|
- message("Loading RG file")
|
2041
|
|
- load(rgFile)
|
2042
|
|
- RG <- get("RG")
|
2043
|
|
- }
|
2044
|
|
- }
|
2045
|
|
- if(!(file.exists(dirname(crlmmFile)))) stop(dirname(crlmmFile), " does not exist.")
|
2046
|
|
- if(!(file.exists(dirname(intensityFile)))) stop(dirname(intensityFile), " does not exist.")
|
2047
|
|
- ##---------------------------------------------------------------------------
|
2048
|
|
- ## FIX
|
2049
|
|
- outfiles <- file.path(dirname(crlmmFile), paste("crlmmSetList_", 1:24, ".rda", sep=""))
|
2050
|
|
- if(load.it & all(file.exists(outfiles))){
|
2051
|
|
- load(outfiles[1])
|
2052
|
|
- crlmmSetList <- get("crlmmSetList")
|
2053
|
|
- if(!all(sampleNames(crlmmSetList) == basename(filenames))){
|
2054
|
|
- stop("load.it is TRUE, but sampleNames(crlmmSetList != basename(filenames))")
|
2055
|
|
- } else{
|
2056
|
|
- return("load.it is TRUE and 'crlmmSetList_<CHR>.rda' objects found. Nothing to do...")
|
2057
|
|
- }
|
2058
|
|
- }
|
2059
|
|
- if(load.it){
|
2060
|
|
- if(!file.exists(crlmmFile)){
|
2061
|
|
- message("load.it is TRUE, but ", crlmmFile, " does not exist. Rerunning the genotype calling algorithm")
|
2062
|
|
- load.it <- FALSE
|
2063
|
|
- }
|
2064
|
|
- }
|
2065
|
|
- if(platform == "affymetrix"){
|
2066
|
|
- if(!file.exists(crlmmFile) | !load.it){
|
2067
|
|
- callSet <- crlmm(filenames=filenames,
|
2068
|
|
- cdfName=cdfName,
|
2069
|
|
- save.it=TRUE,
|
2070
|
|
- load.it=load.it,
|
2071
|
|
- intensityFile=intensityFile)
|
2072
|
|
- message("Quantile normalizing the copy number probes...")
|
2073
|
|
- cnrmaResult <- cnrma(filenames=filenames, cdfName=cdfName, outdir=outdir)
|
2074
|
|
- if(save.it){
|
2075
|
|
- message("Saving callSet and cnrmaResult to", crlmmFile)
|
2076
|
|
- save(callSet, cnrmaResult, file=crlmmFile)
|
2077
|
|
- }
|
2078
|
|
- } else {
|
2079
|
|
- message("Loading ", crlmmFile, "...")
|
2080
|
|
- load(intensityFile)
|
2081
|
|
- load(crlmmFile)
|
2082
|
|
- callSet <- get("callSet")
|
2083
|
|
- cnrmaResult <- get("cnrmaResult")
|
2084
|
|
- }
|
2085
|
|
- scanDates <- data.frame(ScanDate=sapply(filenames, celfileDate))
|
2086
|
|
- protocolData(callSet) <- new("AnnotatedDataFrame",
|
2087
|
|
- data=scanDates,
|
2088
|
|
- varMetadata=data.frame(labelDescription=colnames(scanDates),
|
2089
|
|
- row.names=colnames(scanDates)))
|
2090
|
|
- }
|
2091
|
|
- if(platform == "illumina"){
|
2092
|
|
- if(!file.exists(crlmmFile) | !load.it){
|
2093
|
|
- callSet <- crlmmIllumina(RG=RG,
|
2094
|
|
- cdfName=cdfName,
|
2095
|
|
- sns=sampleNames(RG),
|
2096
|
|
- returnParams=TRUE,
|
2097
|
|
- save.it=TRUE,
|
2098
|
|
- intensityFile=intensityFile)
|
2099
|
|
- if(save.it) save(callSet, file=crlmmFile)
|
2100
|
|
- } else {
|
2101
|
|
- message("Loading ", crlmmFile, "...")
|
2102
|
|
- load(crlmmFile)
|
2103
|
|
- callSet <- get("callSet")
|
2104
|
|
- }
|
2105
|
|
- protocolData(callSet) <- protocolData(RG)
|
2106
|
|
- }
|
2107
|
|
- if(platform=="affymetrix") {
|
2108
|
|
- protocolData(callSet)[["ScanDate"]] <- as.character(celDates(filenames))
|
2109
|
|
- sampleNames(protocolData(callSet)) <- sampleNames(callSet)
|
2110
|
|
- }
|
2111
|
|
- load(intensityFile)
|
2112
|
|
- snprmaResult <- get("res")
|
2113
|
|
- if(platform=="illumina"){
|
2114
|
|
- if(exists("cnAB")){
|
2115
|
|
- np.A <- as.integer(cnAB$A)
|
2116
|
|
- np.B <- as.integer(cnAB$B)
|
2117
|
|
- np <- ifelse(np.A > np.B, np.A, np.B)
|
2118
|
|
- np <- matrix(np, nrow(cnAB$A), ncol(cnAB$A))
|
2119
|
|
- rownames(np) <- cnAB$gns
|
2120
|
|
- colnames(np) <- cnAB$sns
|
2121
|
|
- cnAB$NP <- np
|
2122
|
|
- ##sampleNames(callSet) <- res$sns
|
2123
|
|
- sampleNames(callSet) <- cnAB$sns
|
2124
|
|
- cnrmaResult <- get("cnAB")
|
|