Browse code

added genotype2 and crlmmCopynumber2

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@45672 bc3139a8-67e5-0310-9ffc-ced21a209358

Rob Scharp authored on 02/04/2010 03:03:19
Showing 9 changed files

... ...
@@ -489,3 +489,10 @@ then readIDAT() should work. Thanks to Pierre Cherel who reported this error.
489 489
 
490 490
 ** added parallel/large dataset support to snprma/crlmm
491 491
 ** merged changes on .41 with my local changes
492
+
493
+2010-04-01 R.Scharpf committed version 1.5.44
494
+
495
+** added functions genotype2, cnrma2 for preprocessing and genotyping
496
+   with crlmm2
497
+** added crlmmCopynumber2 for parallel support with copy number
498
+   est. (needs more checking)
... ...
@@ -1,7 +1,7 @@
1 1
 Package: crlmm
2 2
 Type: Package
3 3
 Title: Genotype Calling (CRLMM) and Copy Number Analysis tool for Affymetrix SNP 5.0 and 6.0 and Illumina arrays.
4
-Version: 1.5.43
4
+Version: 1.5.44
5 5
 Date: 2010-02-05
6 6
 Author: Rafael A Irizarry, Benilton S Carvalho <carvalho@bclab.org>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.edu.au>
7 7
 Maintainer: Benilton S Carvalho <carvalho@bclab.org>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.EDU.AU>
... ...
@@ -12,7 +12,7 @@ importMethodsFrom(Biobase, annotation, "annotation<-",
12 12
                   annotatedDataFrameFrom, assayData, "assayData<-",
13 13
                   combine, dims, experimentData, "experimentData<-",
14 14
                   fData, featureData, "featureData<-", featureNames,
15
-                  fvarMetadata, fvarLabels, pData, phenoData,
15
+                  fvarMetadata, fvarLabels, pData, "pData<-", phenoData,
16 16
                   "phenoData<-", protocolData, "protocolData<-",
17 17
                   pubMedIds, rowMedians, sampleNames, snpCall,
18 18
                   snpCallProbability,
... ...
@@ -67,7 +67,12 @@ export(crlmm,
67 67
        readIdatFiles, 
68 68
        snprma,
69 69
        snprma2,
70
-       crlmm2) 
70
+       crlmm2,
71
+       genotype2,
72
+       cnrma2,
73
+       processCEL2,
74
+       batch,
75
+       crlmmCopynumber2)
71 76
 
72 77
 export(initializeParamObject)
73 78
 
... ...
@@ -1,3 +1,4 @@
1
+setGeneric("batch", function(object) standardGeneric("batch"))
1 2
 setGeneric("getParam", function(object, name, batch) standardGeneric("getParam"))
2 3
 setGeneric("cnIndex", function(object) standardGeneric("cnIndex"))
3 4
 setGeneric("cnNames", function(object) standardGeneric("cnNames"))
... ...
@@ -54,19 +54,20 @@ getFeatureData.Affy <- function(cdfName, copynumber=FALSE){
54 54
 	##crlmmOpts$snpRange <- range(snpIndex)
55 55
 	##crlmmOpts$npRange <- range(npIndex)
56 56
 }
57
-construct <- function(filenames, cdfName, copynumber=FALSE, sns, verbose=TRUE){
57
+construct <- function(filenames, cdfName, copynumber=FALSE,
58
+		      sns, verbose=TRUE){
58 59
 	if(verbose) message("Initializing container for assay data elements alleleA, alleleB, call, callProbability")
59 60
 	if(missing(sns)) sns <- basename(filenames)
60 61
 	protocolData <- getProtocolData.Affy(filenames)
61 62
 	featureData <- getFeatureData.Affy(cdfName, copynumber=copynumber)
62 63
 	nr <- nrow(featureData); nc <- length(filenames)
63 64
 	callSet <- new("SnpSuperSet", 
64
-			 alleleA=initializeBigMatrix(name="A", nr, nc),
65
-			 alleleB=initializeBigMatrix(name="B", nr, nc),
66
-			 call=initializeBigMatrix(name="call", nr, nc),
67
-			 callProbability=initializeBigMatrix(name="callPr", nr,nc),
68
-			 featureData=featureData,
69
-			 annotation=cdfName)
65
+		       alleleA=initializeBigMatrix(name="A", nr, nc),
66
+		       alleleB=initializeBigMatrix(name="B", nr, nc),
67
+		       call=initializeBigMatrix(name="call", nr, nc),
68
+		       callProbability=initializeBigMatrix(name="callPr", nr,nc),
69
+		       featureData=featureData,
70
+		       annotation=cdfName)
70 71
 	pd <- data.frame(matrix(NA, nc, 3), row.names=sns)
71 72
 	colnames(pd)=c("SKW", "SNR", "gender")
72 73
 	phenoData(callSet) <- new("AnnotatedDataFrame", data=pd)
... ...
@@ -167,14 +168,557 @@ genotype <- function(filenames,
167 168
 			       verbose=verbose,
168 169
 			       returnParams=returnParams,
169 170
 			       badSNP=badSNP)
170
-		suppressWarnings(snpCall(callSet)[snp.index, j] <- tmp[["calls"]])
171
-		suppressWarnings(snpCallProbability(callSet)[snp.index, j] <- tmp[["confs"]])
171
+		snpCall(callSet)[snp.index, j] <- tmp[["calls"]]
172
+		snpCallProbability(callSet)[snp.index, j] <- tmp[["confs"]]
172 173
 		callSet$gender[j] <- tmp$gender
173 174
 		iter <- iter+1
174 175
 	}
175 176
 	return(callSet)
176 177
 }
177 178
 
179
+genotype2 <- function(filenames,
180
+		      cdfName,
181
+		      batch,
182
+		      mixtureSampleSize=10^5,
183
+		      eps=0.1,
184
+		      verbose=TRUE,
185
+		      seed=1,
186
+		      sns,
187
+		      copynumber=FALSE,
188
+		      probs=rep(1/3, 3),
189
+		      DF=6,
190
+		      SNRMin=5,
191
+		      recallMin=10,
192
+		      recallRegMin=1000,
193
+		      gender=NULL,
194
+		      returnParams=TRUE,
195
+		      badSNP=0.7){
196
+	if(!copynumber){
197
+		callSet <- crlmm2(filenames=filenames,
198
+				  cdfName=cdfName,
199
+				  mixtureSampleSize=mixtureSampleSize,
200
+				  eps=eps,
201
+				  verbose=verbose,
202
+				  seed=seed,
203
+				  sns=sns,
204
+				  probs=probs,
205
+				  DF=DF,
206
+				  SNRMin=SNRMin,
207
+				  recallMin=recallMin,
208
+				  recallRegMin=recallRegMin,
209
+				  gender=gender,
210
+				  returnParams=returnParams,
211
+				  badSNP=badSNP)
212
+		return(callSet)
213
+	}
214
+	if(missing(cdfName)) stop("must specify cdfName")
215
+	if(!isValidCdfName(cdfName)) stop("cdfName not valid.  see validCdfNames")
216
+	if(missing(batch)){
217
+		warning("The batch variable is not specified. The scan date of the array will be used as a surrogate for batch.  The batch variable does not affect the preprocessing or genotyping, but is important for copy number estimation.")
218
+	} else {
219
+		if(length(batch) != length(filenames))
220
+			stop("batch variable must be the same length as the filenames")
221
+	}
222
+	if(missing(sns)) sns <- basename(filenames)
223
+	## callSet contains potentially very big matrices
224
+	callSet <- construct(filenames=filenames,
225
+			     cdfName=cdfName,
226
+			     copynumber=TRUE,
227
+			     sns=sns,
228
+			     verbose=verbose)
229
+	if(missing(batch)){
230
+		protocolData(callSet)$batch <- as.numeric(as.factor(protocolData(callSet)$ScanDate))
231
+	}
232
+	if(!missing(batch)) protocolData(callSet)$batch <- batch
233
+	mixtureParams <- matrix(NA, 4, length(filenames))
234
+	snp.index <- which(isSnp(callSet)==1)
235
+	snprmaRes <- snprma2(##A=A(callSet),
236
+			     ##B=B(callSet),
237
+			       filenames=filenames,
238
+			       mixtureSampleSize=mixtureSampleSize,
239
+			       fitMixture=TRUE,
240
+			       eps=eps,
241
+			       verbose=verbose,
242
+			       seed=seed,
243
+			       cdfName=cdfName,
244
+			       sns=sns)
245
+	if(verbose) message("Finished preprocessing.")
246
+	open(snprmaRes[["A"]])
247
+	open(snprmaRes[["B"]])
248
+	open(snprmaRes[["SNR"]])
249
+	open(snprmaRes[["SKW"]])
250
+	open(snprmaRes[["mixtureParams"]])
251
+	if(verbose) message("Updating elements of callSet")
252
+##	A(callSet) <- snprmaRes[["A"]]
253
+##	B(callSet) <- snprmaRes[["B"]]
254
+	bb = ocProbesets()*ncol(A)*8
255
+	ffrowapply(A(callSet)[i1:i2, ] <- snprmaRes[["A"]][i1:i2, ], X=snprmaRes[["A"]], BATCHBYTES=bb)
256
+	ffrowapply(B(callSet)[i1:i2, ] <- snprmaRes[["B"]][i1:i2, ], X=snprmaRes[["B"]], BATCHBYTES=bb)
257
+	##ffrowapply(A(callSet)[i1:i2, ] <- snprmaRes[["A"]][i1:i2, ], X=snprmaRes[["A"]])
258
+	##ffrowapply(B(callSet)[i1:i2, ] <- snprmaRes[["B"]][i1:i2, ], X=snprmaRes[["B"]])
259
+	if(verbose) message("Finished updating elements of callSet")
260
+	stopifnot(identical(featureNames(callSet)[snp.index], snprmaRes$gns))
261
+	pData(callSet)$SKW <- snprmaRes$SKW
262
+	pData(callSet)$SNR <- snprmaRes$SNR
263
+	mixtureParams <- snprmaRes$mixtureParams
264
+	np.index <- which(isSnp(callSet) == 0)
265
+	cnrmaRes <- cnrma2(A=A(callSet),
266
+			   filenames=filenames,
267
+			   row.names=featureNames(callSet)[np.index],
268
+			   cdfName=cdfName,
269
+			   sns=sns,
270
+			   seed=seed,
271
+			   verbose=verbose)
272
+	rm(cnrmaRes); gc()
273
+	## as.matrix needed when ffdf is used
274
+	tmp <- crlmmGT2(A=snprmaRes[["A"]],
275
+			   B=snprmaRes[["B"]],
276
+			   SNR=snprmaRes[["SNR"]],
277
+			   mixtureParams=snprmaRes[["mixtureParams"]],
278
+			   cdfName=cdfName,
279
+			   row.names=NULL, ##featureNames(callSet),##[snp.index],
280
+			   col.names=sampleNames(callSet),
281
+			   probs=probs,
282
+			   DF=DF,
283
+			   SNRMin=SNRMin,
284
+			   recallMin=recallMin,
285
+			   recallRegMin=recallRegMin,
286
+			   gender=gender,
287
+			   verbose=verbose,
288
+			   returnParams=returnParams,
289
+			   badSNP=badSNP)
290
+	open(tmp[["calls"]])
291
+	open(tmp[["confs"]])
292
+	##snpCall(callSet) <- tmp[["calls"]]
293
+	##snpCallProbability(callSet) <- tmp[["confs"]]
294
+	bb = ocProbesets()*ncol(A)*8
295
+	ffrowapply(snpCall(callSet)[i1:i2, ] <- tmp[["calls"]][i1:i2, ], X=tmp[["calls"]], BATCHBYTES=bb)
296
+	ffrowapply(snpCallProbability(callSet)[i1:i2, ] <- tmp[["confs"]][i1:i2, ], X=tmp[["confs"]], BATCHBYTES=bb)
297
+##	ffrowapply(snpCall(callSet)[i1:i2, ] <- tmp[["calls"]][i1:i2, ], X=tmp[["calls"]])
298
+##	ffrowapply(snpCallProbability(callSet)[i1:i2, ] <- tmp[["confs"]][i1:i2, ], X=tmp[["confs"]])
299
+	callSet$gender <- tmp$gender
300
+	cnSet <- as(callSet, "CNSetLM")
301
+	return(cnSet)
302
+}
303
+
304
+snprma2RS <- function(A, B,
305
+		      filenames, mixtureSampleSize=10^5, fitMixture=FALSE,
306
+		      eps=0.1, verbose=TRUE, seed=1, cdfName, sns){
307
+  if (missing(sns)) sns <- basename(filenames)
308
+  if (missing(cdfName))
309
+    cdfName <- read.celfile.header(filenames[1])[["cdfName"]]
310
+  pkgname <- getCrlmmAnnotationName(cdfName)
311
+  stopifnot(require(pkgname, character.only=TRUE, quietly=!verbose))
312
+  
313
+  if(verbose) message("Loading annotations and mixture model parameters.")
314
+  loader("preprocStuff.rda", .crlmmPkgEnv, pkgname)
315
+  pnsa <- getVarInEnv("pnsa")
316
+  pnsb <- getVarInEnv("pnsb")
317
+  gns <- getVarInEnv("gns")
318
+  
319
+  ##We will read each cel file, summarize, and run EM one by one
320
+  ##We will save parameters of EM to use later
321
+  if(verbose) message("Initializing objects.")
322
+  mixtureParams <- initializeBigMatrix("crlmmMixt-", 4, length(filenames), "double")
323
+  SNR <- initializeBigVector("crlmmSNR-", length(filenames), "double")
324
+  SKW <- initializeBigVector("crlmmSKW-", length(filenames), "double")
325
+  
326
+  ## This is the sample for the fitting of splines
327
+  ## BC: I like better the idea of the user passing the seed,
328
+  ##     because this might intefere with other analyses
329
+  ##     (like what happened to GCRMA)
330
+  ##S will hold (A+B)/2 and M will hold A-B
331
+  ##NOTE: We actually dont need to save S. Only for pics etc...
332
+  ##f is the correction. we save to avoid recomputing
333
+##  A <- initializeBigMatrix("crlmmA-", length(pnsa), length(filenames), "integer")
334
+##  B <- initializeBigMatrix("crlmmB-", length(pnsb), length(filenames), "integer")
335
+
336
+  sampleBatches <- splitIndicesByNode(seq(along=filenames))
337
+
338
+  if(verbose) message("Processing ", length(filenames), " files.")
339
+
340
+  ocLapply(sampleBatches, processCEL_RS, filenames=filenames,
341
+           fitMixture=fitMixture, A=A, B=B, SKW=SKW, SNR=SNR,
342
+           mixtureParams=mixtureParams, eps=eps, seed=seed,
343
+           mixtureSampleSize=mixtureSampleSize, pkgname=pkgname,
344
+           neededPkgs=c("crlmm", pkgname))
345
+  
346
+  list(A=A, B=B, sns=sns, gns=gns, SNR=SNR, SKW=SKW, mixtureParams=mixtureParams, cdfName=cdfName)
347
+}
348
+
349
+processCEL_RS <- function(i, filenames, fitMixture, A, B, SKW, SNR,
350
+			  mixtureParams, eps, seed, mixtureSampleSize,
351
+			  pkgname){
352
+  
353
+  loader("preprocStuff.rda", .crlmmPkgEnv, pkgname)
354
+  loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname)
355
+  loader("mixtureStuff.rda", .crlmmPkgEnv, pkgname)
356
+  autosomeIndex <- getVarInEnv("autosomeIndex")
357
+  pnsa <- getVarInEnv("pnsa")
358
+  pnsb <- getVarInEnv("pnsb")
359
+  fid <- getVarInEnv("fid")
360
+  reference <- getVarInEnv("reference")
361
+  aIndex <- getVarInEnv("aIndex")
362
+  bIndex <- getVarInEnv("bIndex")
363
+  SMEDIAN <- getVarInEnv("SMEDIAN")
364
+  theKnots <- getVarInEnv("theKnots")
365
+  gns <- getVarInEnv("gns")
366
+
367
+  ## for mixture
368
+  set.seed(seed)
369
+  idx <- sort(sample(autosomeIndex, mixtureSampleSize))
370
+  ##for skewness. no need to do everything
371
+  idx2 <- sample(length(fid), 10^5)
372
+
373
+  open(A)
374
+  open(B)
375
+  open(SKW)
376
+  open(mixtureParams)
377
+  open(SNR)
378
+  ##RS
379
+  iii <- seq(along=pnsa)
380
+
381
+  for (k in i){
382
+    y <- as.matrix(read.celfile(filenames[k], intensity.means.only=TRUE)[["INTENSITY"]][["MEAN"]][fid])
383
+    x <- log2(y[idx2])
384
+    SKW[k] <- mean((x-mean(x))^3)/(sd(x)^3)
385
+    rm(x)
386
+    y <- normalize.quantiles.use.target(y, target=reference)
387
+    A[iii, k] <- intMedianSummaries(y[aIndex, 1, drop=FALSE], pnsa)
388
+    B[iii, k] <- intMedianSummaries(y[bIndex, 1, drop=FALSE], pnsb)
389
+    rm(y)
390
+    
391
+    if(fitMixture){
392
+      S <- (log2(A[idx,k])+log2(B[idx, k]))/2 - SMEDIAN
393
+      M <- log2(A[idx, k])-log2(B[idx, k])
394
+      tmp <- fitAffySnpMixture56(S, M, theKnots, eps=eps)
395
+      mixtureParams[, k] <- tmp[["coef"]]
396
+      SNR[k] <- tmp[["medF1"]]^2/(tmp[["sigma1"]]^2+tmp[["sigma2"]]^2)
397
+    } else {
398
+      mixtureParams[, k] <- NA
399
+      SNR[k] <- NA
400
+    }
401
+  }
402
+  close(A)
403
+  close(B)
404
+  close(SKW)
405
+  close(mixtureParams)
406
+  close(SNR)
407
+  TRUE
408
+}
409
+
410
+crlmmGT2_RS<- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL,
411
+		       col.names=NULL, probs=c(1/3, 1/3, 1/3), DF=6,
412
+		       SNRMin=5, recallMin=10, recallRegMin=1000,
413
+		       gender=NULL, desctrucitve=FALSE, verbose=TRUE,
414
+		       returnParams=FALSE, badSNP=.7){
415
+  open(SNR)
416
+  open(A)
417
+  open(B)
418
+  open(mixtureParams)
419
+  ## expect objects to be ff
420
+  
421
+  keepIndex <- which( SNR[] > SNRMin)
422
+  if(length(keepIndex)==0) stop("No arrays above quality threshold!")
423
+  
424
+  NC <- ncol(A)
425
+  NR <- nrow(A)
426
+  
427
+  pkgname <- getCrlmmAnnotationName(cdfName)
428
+  stopifnot(require(pkgname, character.only=TRUE, quietly=!verbose))
429
+
430
+  if(verbose) message("Loading annotations.")
431
+  loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname)
432
+  loader("mixtureStuff.rda", .crlmmPkgEnv, pkgname)
433
+  ## this is toget rid of the 'no visible binding' notes
434
+  ## variable definitions
435
+  XIndex <- getVarInEnv("XIndex")
436
+  autosomeIndex <- getVarInEnv("autosomeIndex")
437
+  YIndex <- getVarInEnv("YIndex")
438
+  SMEDIAN <- getVarInEnv("SMEDIAN")
439
+  theKnots <- getVarInEnv("theKnots")
440
+  regionInfo <- getVarInEnv("regionInfo")
441
+  params <- getVarInEnv("params")
442
+  ##RS
443
+  pnsa <- getVarInEnv("pnsa")
444
+  NR <- length(pnsa)
445
+  
446
+  ##IF gender not provide, we predict
447
+  if(is.null(gender)){
448
+    if(verbose) message("Determining gender.")
449
+    XMedian <- apply(log2(A[XIndex,, drop=FALSE])+log2(B[XIndex,, drop=FALSE]), 2, median)/2
450
+    if(sum(SNR[] > SNRMin)==1){
451
+      gender <- which.min(c(abs(XMedian-8.9), abs(XMedian-9.5)))
452
+    }else{
453
+      gender <- kmeans(XMedian, c(min(XMedian[SNR[]>SNRMin]), max(XMedian[SNR[]>SNRMin])))[["cluster"]]
454
+    }
455
+  }
456
+  
457
+  Indexes <- list(autosomeIndex, XIndex, YIndex)
458
+  cIndexes <- list(keepIndex, 
459
+                   keepIndex[which(gender[keepIndex]==2)], 
460
+                   keepIndex[which(gender[keepIndex]==1)])
461
+  
462
+  if(verbose) cat("Calling", NR, "SNPs for recalibration... ")
463
+
464
+  ## call C
465
+  fIndex <- which(gender==2)
466
+  mIndex <- which(gender==1)
467
+
468
+  ## different here
469
+  ## use gtypeCallerR in batches
470
+  ##RS
471
+  snpBatches <- splitIndicesByLength(1:NR, ocProbesets())
472
+  newparamsBatch <- vector("list", length(snpBatches))
473
+
474
+  process1 <- function(idxBatch, snpBatches, autosomeIndex, XIndex,
475
+                       YIndex, A, B, mixtureParams, fIndex, mIndex,
476
+                       params, cIndexes, SMEDIAN, theKnots, DF, probs, batchSize){
477
+    open(A)
478
+    open(B)
479
+    open(mixtureParams)
480
+    snps <- snpBatches[[idxBatch]]
481
+    rSnps <- range(snps)
482
+    last <- (idxBatch-1)*batchSize
483
+    IndexesBatch <- list(autosomeIndex[autosomeIndex %in% snps]-last,
484
+                         XIndex[XIndex %in% snps]-last,
485
+                         YIndex[YIndex %in% snps]-last)
486
+    IndexesBatch <- lapply(IndexesBatch, as.integer)
487
+    tmpA <- as.matrix(A[snps,])
488
+    tmpB <- as.matrix(B[snps,])
489
+    ## newparamsBatch[[idxBatch]]
490
+
491
+    tmp <- gtypeCallerR(tmpA, tmpB, fIndex, mIndex,
492
+                        params[["centers"]][snps,],
493
+                        params[["scales"]][snps,],
494
+                        params[["N"]][snps,],
495
+                        IndexesBatch, cIndexes,
496
+                        sapply(IndexesBatch, length),
497
+                        sapply(cIndexes, length), SMEDIAN,
498
+                        theKnots, mixtureParams[], DF, probs, 0.025)
499
+    
500
+    last <- rSnps[2]
501
+    rm(snps, rSnps, IndexesBatch, tmpA, tmpB)
502
+    close(A)
503
+    close(B)
504
+    close(mixtureParams)
505
+    tmp
506
+  }
507
+
508
+  newparamsBatch <- ocLapply(seq(along=snpBatches), process1,
509
+                             snpBatches=snpBatches,
510
+                             autosomeIndex=autosomeIndex, XIndex=XIndex,
511
+                             YIndex=YIndex, A=A, B=B,
512
+                             mixtureParams=mixtureParams, fIndex=fIndex,
513
+                             mIndex=mIndex, params=params,
514
+                             cIndexes=cIndexes, SMEDIAN=SMEDIAN,
515
+                             theKnots=theKnots, DF=DF, probs=probs,
516
+                             batchSize=ocProbesets())
517
+##   last <- 0
518
+##   for (idxBatch in seq(along=snpBatches)){
519
+##     snps <- snpBatches[[idxBatch]]
520
+##     rSnps <- range(snps)
521
+##     IndexesBatch <- list(autosomeIndex[autosomeIndex %in% snps]-last,
522
+##                          XIndex[XIndex %in% snps]-last,
523
+##                          YIndex[YIndex %in% snps]-last)
524
+##     IndexesBatch <- lapply(IndexesBatch, as.integer)
525
+##     tmpA <- A[snps,]
526
+##     tmpB <- B[snps,]
527
+##     newparamsBatch[[idxBatch]] <- gtypeCallerR(tmpA, tmpB, fIndex, mIndex,
528
+##                                                params[["centers"]][snps,],
529
+##                                                params[["scales"]][snps,],
530
+##                                                params[["N"]][snps,],
531
+##                                                IndexesBatch, cIndexes,
532
+##                                                sapply(IndexesBatch, length),
533
+##                                                sapply(cIndexes, length),
534
+##                                                SMEDIAN, theKnots,
535
+##                                                mixtureParams[], DF, probs, 0.025)
536
+##     last <- rSnps[2]
537
+##     rm(snps, rSnps, IndexesBatch, tmpA, tmpB)
538
+##   }
539
+##   rm(last)
540
+  
541
+  newparams <- vector("list", 3)
542
+  names(newparams) <- c("centers", "scales", "N")
543
+  newparams[["centers"]] <- do.call("rbind", lapply(newparamsBatch, "[[", 1))
544
+  newparams[["scales"]] <- do.call("rbind", lapply(newparamsBatch, "[[", 2))
545
+  newparams[["N"]] <- do.call("rbind", lapply(newparamsBatch, "[[", 3))
546
+  rm(newparamsBatch)
547
+  if(verbose) message("Done.")
548
+  if(verbose) message("Estimating recalibration parameters.")
549
+  d <- newparams[["centers"]] - params$centers
550
+
551
+  ##regression 
552
+  Index <- intersect(which(pmin(newparams[["N"]][, 1],
553
+                                newparams[["N"]][, 2],
554
+                                newparams[["N"]][, 3]) > recallMin &
555
+                                !apply(regionInfo, 1, any)),
556
+                                autosomeIndex)
557
+  if(length(Index) < recallRegMin){
558
+    warning("Recalibration not possible. Possible cause: small sample size.")
559
+    newparams <- params
560
+    dev <- vector("numeric", nrow(newparams[["centers"]]))
561
+    SS <- matrix(Inf, 3, 3)
562
+    DD <- 0
563
+  }else{
564
+    data4reg <- as.data.frame(newparams[["centers"]][Index,])
565
+    names(data4reg) <- c("AA", "AB", "BB")
566
+    regParams <- cbind(  coef(lm(AA~AB*BB, data=data4reg)),
567
+                       c(coef(lm(AB~AA+BB, data=data4reg)), 0), 
568
+                       coef(lm(BB~AA*AB, data=data4reg)))
569
+    rownames(regParams) <- c("intercept", "X", "Y", "XY")
570
+    rm(data4reg)
571
+  
572
+    minN <- 3
573
+    newparams[["centers"]][newparams[["N"]] < minN] <- NA
574
+    Index <- setdiff(which(rowSums(is.na(newparams[["centers"]]))==1), YIndex)
575
+    if(verbose) cat("Filling out empty centers")
576
+    for(i in Index){
577
+      if(verbose) if(i%%10000==0)cat(".")
578
+      mu <- newparams[["centers"]][i, ]
579
+      j <- which(is.na(mu))
580
+      newparams[["centers"]][i, j] <- c(1, mu[-j], prod(mu[-j]))%*%regParams[, j]
581
+    }
582
+    
583
+    ##remaing NAs are made like originals
584
+    if(length(YIndex)>0){
585
+      noMoveIndex <- union(setdiff(which(rowSums(is.na(newparams[["centers"]]))>0), YIndex), 
586
+                           YIndex[rowSums(is.na(newparams[["centers"]][YIndex, ])>1)])
587
+    }
588
+    snps2ignore <- which(rowSums(is.na(newparams[["centers"]])) > 0)
589
+    snps2keep <- setdiff(autosomeIndex, snps2ignore)
590
+    rm(snps2ignore)
591
+    newparams[["centers"]][is.na(newparams[["centers"]])] <- params[["centers"]][is.na(newparams[["centers"]])]
592
+    if(verbose) cat("\n")
593
+  
594
+    if(verbose) message("Calculating and standardizing size of shift.")
595
+    GG <- DD <- newparams[["centers"]] - params[["centers"]]
596
+    DD <- sweep(DD, 2, colMeans(DD[autosomeIndex, ]))
597
+    SS <- cov(DD[autosomeIndex, ])
598
+    SSI <- solve(SS)
599
+    dev <- vector("numeric", nrow(DD))
600
+    if(length(YIndex)){
601
+      dev[-YIndex] <- apply(DD[-YIndex, ], 1, function(x) x%*%SSI%*%x)
602
+      dev[-YIndex] <- 1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev[-YIndex])
603
+      ##Now Y (only two params)
604
+      SSY <- SS[c(1, 3), c(1, 3)]
605
+      SSI <- solve(SSY) 
606
+      dev[YIndex] <- apply(DD[YIndex, c(1, 3)], 1, function(x) x%*%SSI%*%x)
607
+      dev[YIndex] <- 1/sqrt( (2*pi)^2*det(SSY))*exp(-0.5*dev[YIndex])
608
+    } else {
609
+      dev=apply(DD,1,function(x) x%*%SSI%*%x)
610
+      dev=1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev)
611
+    }
612
+  }
613
+    
614
+  ## BC: must keep SD
615
+  params[-2] <- newparams[-2]
616
+  
617
+  rm(newparams);gc(verbose=FALSE)  
618
+  if(verbose) cat("Calling", NR, "SNPs... ")
619
+  ## ###################
620
+  ## ## MOVE TO C#######
621
+
622
+  ## running in batches
623
+  ## snpBatches <- splitIndicesByLength(1:nrow(A), ocProbesets())
624
+
625
+  process2 <- function(idxBatch, snpBatches, autosomeIndex, XIndex,
626
+                       YIndex, A, B, mixtureParams, fIndex, mIndex,
627
+                       params, cIndexes, SMEDIAN, theKnots, DF, probs,
628
+                       regionInfo, batchSize){
629
+    open(A)
630
+    open(B)
631
+    open(mixtureParams)
632
+    snps <- snpBatches[[idxBatch]]
633
+    tmpA <- as.matrix(A[snps,])
634
+    tmpB <- as.matrix(B[snps,])
635
+    rSnps <- range(snps)
636
+    last <- (idxBatch-1)*batchSize
637
+    IndexesBatch <- list(autosomeIndex[autosomeIndex %in% snps]-last,
638
+                         XIndex[XIndex %in% snps]-last,
639
+                         YIndex[YIndex %in% snps]-last)
640
+    IndexesBatch <- lapply(IndexesBatch, as.integer)
641
+    ImNull <- gtypeCallerR2(tmpA, tmpB, fIndex, mIndex,
642
+                            params[["centers"]][snps,],
643
+                            params[["scales"]][snps,],
644
+                            params[["N"]][snps,],
645
+                            IndexesBatch, cIndexes,
646
+                            sapply(IndexesBatch, length),
647
+                            sapply(cIndexes, length),
648
+                            SMEDIAN, theKnots, mixtureParams[],
649
+                            DF, probs, 0.025,
650
+                            which(regionInfo[snps, 2]),
651
+                            which(regionInfo[snps, 1]))
652
+    A[snps,] <- tmpA
653
+    B[snps,] <- tmpB
654
+    last <- rSnps[2]
655
+    rm(tmpA, tmpB, snps, rSnps, IndexesBatch, ImNull)
656
+    close(A)
657
+    close(B)
658
+    close(mixtureParams)
659
+  }
660
+
661
+  ocLapply(seq(along=snpBatches), process2, snpBatches=snpBatches,
662
+           autosomeIndex=autosomeIndex, XIndex=XIndex, YIndex=YIndex,
663
+           A=A, B=B, mixtureParams=mixtureParams, fIndex=fIndex,
664
+           mIndex=mIndex, params=params, cIndexes=cIndexes,
665
+           SMEDIAN=SMEDIAN, theKnots=theKnots, DF=DF, probs=probs,
666
+           regionInfo=regionInfo, batchSize=ocProbesets())
667
+  
668
+##   last <- 0
669
+##   for (idxBatch in seq(along=snpBatches)){
670
+##     snps <- snpBatches[[idxBatch]]
671
+##     tmpA <- A[snps,]
672
+##     tmpB <- B[snps,]
673
+##     rSnps <- range(snps)
674
+##     IndexesBatch <- list(autosomeIndex[autosomeIndex %in% snps]-last,
675
+##                          XIndex[XIndex %in% snps]-last,
676
+##                          YIndex[YIndex %in% snps]-last)
677
+##     IndexesBatch <- lapply(IndexesBatch, as.integer)
678
+##     ImNull <- gtypeCallerR2(tmpA, tmpB, fIndex, mIndex,
679
+##                             params[["centers"]][snps,],
680
+##                             params[["scales"]][snps,],
681
+##                             params[["N"]][snps,],
682
+##                             IndexesBatch, cIndexes,
683
+##                             sapply(IndexesBatch, length),
684
+##                             sapply(cIndexes, length),
685
+##                             SMEDIAN, theKnots, mixtureParams[],
686
+##                             DF, probs, 0.025,
687
+##                             which(regionInfo[snps, 2]),
688
+##                             which(regionInfo[snps, 1]))
689
+##     A[snps,] <- tmpA
690
+##     B[snps,] <- tmpB
691
+##     last <- rSnps[2]
692
+##     rm(tmpA, tmpB, snps, rSnps, IndexesBatch, ImNull)
693
+##   }
694
+##   
695
+##   gc(verbose=FALSE)
696
+  ##  END MOVE TO C#######
697
+  ## ##################
698
+  
699
+  dev <- dev/(dev+1/383)
700
+  if(!is.null(row.names)){ rownames(A) <- rownames(B) <- names(dev) <- row.names}
701
+  if(!is.null(col.names)){ colnames(A) <- colnames(B) <- col.names}
702
+
703
+  if(length(Index) >= recallRegMin){
704
+   tmp4batchQC <- DD[autosomeIndex,]/(params[["N"]][autosomeIndex,]+1)
705
+   tmpSnpQc <- dev[autosomeIndex]
706
+   SS <- cov(tmp4batchQC[tmpSnpQc < badSNP,])
707
+   batchQC <- mean(diag(SS))
708
+  }else{
709
+    SS <- matrix(0, 3, 3)
710
+    batchQC <- Inf
711
+  }
712
+  
713
+  if(verbose) message("Done.")
714
+  if (returnParams){
715
+    return(list(calls=A, confs=B, SNPQC=dev, batchQC=batchQC, params=params, DD=DD, covDD=SS, gender=gender, pkgname=pkgname))
716
+  }else{
717
+    return(list(calls=A, confs=B, SNPQC=dev, batchQC=batchQC, DD=DD, covDD=SS, gender=gender, pkgname=pkgname))
718
+  }
719
+}
720
+
721
+
178 722
 ##---------------------------------------------------------------------------
179 723
 ##---------------------------------------------------------------------------
180 724
 ## For Illumina
... ...
@@ -252,6 +796,7 @@ constructRG <- function(filenames, cdfName, sns, verbose, fileExt, sep, sampleSh
252 796
 }
253 797
 crlmmIlluminaRS <- function(sampleSheet=NULL,
254 798
 			    arrayNames=NULL,
799
+			    batch,
255 800
 			    ids=NULL,
256 801
 			    path=".",
257 802
 			    arrayInfoColNames=list(barcode="SentrixBarcode_A", position="SentrixPosition_A"),
... ...
@@ -266,10 +811,18 @@ crlmmIlluminaRS <- function(sampleSheet=NULL,
266 811
 			    seed=1, save.ab=FALSE, snpFile, cnFile,
267 812
 			    mixtureSampleSize=10^5, eps=0.1, verbose=TRUE,
268 813
 			    cdfName, sns, recallMin=10, recallRegMin=1000,
269
-			    returnParams=FALSE, badSNP=.7) {
814
+			    returnParams=FALSE, badSNP=.7,
815
+			    copynumber=FALSE,
816
+			    load.it=TRUE) {
270 817
 	if(missing(cdfName)) stop("must specify cdfName")
271 818
 	if(!isValidCdfName(cdfName)) stop("cdfName not valid.  see validCdfNames")
272 819
 	if(missing(sns)) sns <- basename(arrayNames)
820
+	if(missing(batch)){
821
+		warning("The batch variable is not specified. The scan date of the array will be used as a surrogate for batch.  The batch variable does not affect the preprocessing or genotyping, but is important for copy number estimation.")
822
+	} else {
823
+		if(length(batch) != length(sns))
824
+			stop("batch variable must be the same length as the filenames")
825
+	}	
273 826
 	batches <- splitIndicesByLength(seq(along=arrayNames), ocSamples())
274 827
 	k <- 1
275 828
 	for(j in batches){
... ...
@@ -300,13 +853,17 @@ crlmmIlluminaRS <- function(sampleSheet=NULL,
300 853
 		##  Here, I'm just using the # of rows returned from the above function
301 854
 		if(k == 1){
302 855
 			if(verbose) message("Initializing container for alleleA, alleleB, call, callProbability")
303
-			callSet <- new("SnpSuperSet",
304
-				       alleleA=initializeBigMatrix(name="A", nr=nrow(res[[1]]), nc=length(sns)),
305
-				       alleleB=initializeBigMatrix(name="B", nr=nrow(res[[1]]), nc=length(sns)),
306
-				       call=initializeBigMatrix(name="call", nr=nrow(res[[1]]), nc=length(sns)),
307
-				       callProbability=initializeBigMatrix(name="callPr", nr=nrow(res[[1]]), nc=length(sns)),
308
-				       annotation=cdfName)
309
-			sampleNames(callSet) <- sns
856
+			load.obj <- loadObject("callSet", load.it)
857
+			if(!load.obj){
858
+				callSet <- new("SnpSuperSet",
859
+					       alleleA=initializeBigMatrix(name="A", nr=nrow(res[[1]]), nc=length(sns)),
860
+					       alleleB=initializeBigMatrix(name="B", nr=nrow(res[[1]]), nc=length(sns)),
861
+					       call=initializeBigMatrix(name="call", nr=nrow(res[[1]]), nc=length(sns)),
862
+					       callProbability=initializeBigMatrix(name="callPr", nr=nrow(res[[1]]), nc=length(sns)),
863
+					       annotation=cdfName)
864
+				sampleNames(callSet) <- sns
865
+				save(callSet, file=file.path(ldPath(), "callSet.rda"))
866
+			} else load(file.path(ldPath(), "callSet.rda"))
310 867
 			phenoData(callSet) <- getPhenoData(sampleSheet=sampleSheet,
311 868
 							   arrayNames=sns,
312 869
 							   arrayInfoColNames=arrayInfoColNames)
... ...
@@ -315,9 +872,21 @@ crlmmIlluminaRS <- function(sampleSheet=NULL,
315 872
 			protocolData(callSet) <- new("AnnotatedDataFrame", data=pD)
316 873
 			pData(protocolData(callSet))[j, ] <- pData(protocolData)
317 874
 			featureNames(callSet) <- res[["gns"]]
318
-			pData(callSet)$SKW <- rep(NA, length(sns))
319
-			pData(callSet)$SNR <- rep(NA, length(sns))
875
+			pData(callSet)$SNR <- initializeBigVector("crlmmSNR-", length(sns), "double")
876
+			pData(callSet)$SKW <- initializeBigVector("crlmmSKW-", length(sns), "double")
877
+			##pData(callSet)$SKW <- rep(NA, length(sns))
878
+			##pData(callSet)$SNR <- rep(NA, length(sns))
320 879
 			pData(callSet)$gender <- rep(NA, length(sns))
880
+			mixtureParams <- initializeBigMatrix("crlmmMixt-", nr=4, nc=ncol(callSet), vmode="double")
881
+			if(missing(batch)){
882
+				protocolData(callSet)$batch <- rep(NA, length(sns))
883
+			} else{
884
+				protocolData(callSet)$batch <- batch
885
+			}
886
+			featureData(callSet) <- addFeatureAnnotation(callSet)
887
+			open(mixtureParams)
888
+			open(callSet$SNR)
889
+			open(callSet$SKW)
321 890
 		}
322 891
 		if(k > 1 & nrow(res[[1]]) != nrow(callSet)){
323 892
 			##RS: I don't understand why the IDATS for the
... ...
@@ -325,40 +894,61 @@ crlmmIlluminaRS <- function(sampleSheet=NULL,
325 894
 			res[["A"]] <- res[["A"]][res$gns %in% featureNames(callSet), ]
326 895
 			res[["B"]] <- res[["B"]][res$gns %in% featureNames(callSet), ]
327 896
 		}
897
+		if(missing(batch)){
898
+			protocolData(callSet)$batch[j] <- as.numeric(as.factor(protocolData$ScanDate))
899
+		}
328 900
 		## MR: we need to define a snp.index vs np.index
329 901
 		snp.index <- match(res$gns, featureNames(callSet))		
330
-		suppressWarnings(A(callSet)[snp.index, j] <- res[["A"]])
331
-		suppressWarnings(B(callSet)[snp.index, j] <- res[["B"]])
902
+		A(callSet)[snp.index, j] <- res[["A"]]
903
+		B(callSet)[snp.index, j] <- res[["B"]]
332 904
 		pData(callSet)$SKW[j] <- res$SKW
333 905
 		pData(callSet)$SNR[j] <- res$SNR
334
-		mixtureParams <- res$mixtureParams
906
+		mixtureParams[, j] <- res$mixtureParams
335 907
 		rm(res); gc()
336
-		##MR:  edit snp.index
337
-		##snp.index <- 1:nrow(callSet)
338
-		tmp <- crlmmGT(A=as.matrix(A(callSet)[, j]),
339
-			       B=as.matrix(B(callSet)[, j]),
340
-			       SNR=callSet$SNR[j],
341
-			       mixtureParams=mixtureParams,
342
-			       cdfName=annotation(callSet),
343
-			       row.names=featureNames(callSet),
344
-			       col.names=sampleNames(callSet)[j],
345
-			       probs=probs,
346
-			       DF=DF,
347
-			       SNRMin=SNRMin,
348
-			       recallMin=recallMin,
349
-			       recallRegMin=recallRegMin,
350
-			       gender=gender,
351
-			       verbose=verbose,
352
-			       returnParams=returnParams,
353
-			       badSNP=badSNP)
354
-		suppressWarnings(snpCall(callSet)[, j] <- tmp[["calls"]])
355
-		## MR: many zeros in the conf. scores (?)
356
-		suppressWarnings(snpCallProbability(callSet)[, j] <- tmp[["confs"]])
357
-		callSet$gender[j] <- tmp$gender
358
-		rm(tmp); gc()
359 908
 		k <- k+1
360 909
 	}
361
-	return(callSet)
910
+	tmp <- crlmmGT2(A=A(callSet),
911
+			B=B(callSet),
912
+			SNR=callSet$SNR,
913
+			mixtureParams=mixtureParams,
914
+			cdfName=annotation(callSet),
915
+			row.names=featureNames(callSet),
916
+			col.names=sampleNames(callSet),
917
+			probs=probs,
918
+			DF=DF,
919
+			SNRMin=SNRMin,
920
+			recallMin=recallMin,
921
+			recallRegMin=recallRegMin,
922
+			gender=gender,
923
+			verbose=verbose,
924
+			returnParams=returnParams,
925
+			badSNP=badSNP)
926
+	open(tmp[["calls"]])
927
+	open(tmp[["confs"]])
928
+	snpCall(callSet) <- tmp[["calls"]]
929
+	## MR: many zeros in the conf. scores (?)
930
+	snpCallProbability(callSet) <- tmp[["confs"]]
931
+	callSet$gender <- tmp$gender
932
+	if(copynumber){
933
+		load.obj <- loadObject("cnSet", load.it)
934
+		if(!load.obj){
935
+			cnSet <- as(callSet, "CNSetLM")
936
+		} else {
937
+			load(file.path(ldPath(), "cnSet.rda"))
938
+			A(cnSet) <- A(callSet)
939
+			B(cnSet) <- B(callSet)
940
+			snpCall(cnSet) <- snpCall(callSet)
941
+			snpCallProbability(cnSet) <- snpCallProbability(callSet)
942
+			annotation(cnSet) <- annotation(callSet)
943
+			featureData(cnSet) <- featureData(callSet)
944
+			protocolData(cnSet) <- protocolData(callSet)
945
+			phenoData(cnSet) <- phenoData(callSet)
946
+			experimentData(cnSet) <- experimentData(callSet)
947
+		}
948
+	}
949
+	close(mixtureParams)
950
+	rm(tmp); gc()
951
+	return(cnSet)
362 952
 }
363 953
 ##---------------------------------------------------------------------------
364 954
 ##---------------------------------------------------------------------------
... ...
@@ -377,6 +967,29 @@ rowMAD <- function(x, y, ...){
377 967
 	return(mad)
378 968
 }
379 969
 
970
+shrink <- function(x, Ns, DF.PRIOR){
971
+	DF <- Ns-1
972
+	DF[DF < 1] <- 1
973
+	x.0 <- apply(x, 2, median, na.rm=TRUE)
974
+	x <- (x*DF + x.0*DF.PRIOR)/(DF.PRIOR + DF)
975
+	for(j in 1:ncol(x)) x[is.na(x[, j]), j] <- x.0[j]
976
+	return(x)
977
+}
978
+
979
+applyByGenotype <- function(x, FUN, G){
980
+	FUN <- match.fun(FUN)
981
+	tmp <- matrix(NA, nrow(x), 3)
982
+	for(j in 1:3){
983
+		GT <- G==j
984
+		GT[GT == FALSE] <- NA
985
+		gt.x <- GT*x
986
+		tmp[, j] <- FUN(gt.x, na.rm=TRUE)
987
+	}
988
+	tmp
989
+}
990
+
991
+
992
+
380 993
 rowCors <- function(x, y, ...){
381 994
 	N <- rowSums(!is.na(x))
382 995
 	x <- suppressWarnings(log2(x))
... ...
@@ -387,6 +1000,15 @@ rowCors <- function(x, y, ...){
387 1000
 	return(covar/(sd.x*sd.y))
388 1001
 }
389 1002
 
1003
+corByGenotype <- function(A, B, G, Ns, which.cluster=c(1,2,3)[1], DF.PRIOR){
1004
+	x <- A * (G == which.cluster)
1005
+	x[x==0] <- NA
1006
+	y <- B * (G == which.cluster)
1007
+	res <- as.matrix(rowCors(x, y, na.rm=TRUE))
1008
+	cors <- shrink(res, Ns[, which.cluster], DF.PRIOR)
1009
+	cors
1010
+}
1011
+
390 1012
 generateX <- function(w, X) as.numeric(diag(w) %*% X)
391 1013
 generateIXTX <- function(x, nrow=3) {
392 1014
 	X <- matrix(x, nrow=nrow)
... ...
@@ -394,6 +1016,61 @@ generateIXTX <- function(x, nrow=3) {
394 1016
 	solve(XTX)
395 1017
 }
396 1018
 
1019
+nuphiAllele2 <- function(allele, Ystar, W, Ns){
1020
+	complete <- rowSums(is.na(W)) == 0 
1021
+	if(any(!is.finite(W))){## | any(!is.finite(V))){
1022
+		i <- which(rowSums(!is.finite(W)) > 0)
1023
+		stop("Possible zeros in the within-genotype estimates of the spread (vA, vB). ")
1024
+	}
1025
+	NOHET <- mean(Ns[, 2], na.rm=TRUE) < 0.05
1026
+	if(missing(allele)) stop("must specify allele")
1027
+	if(allele == "A") X <- cbind(1, 2:0) else X <- cbind(1, 0:2)
1028
+	if(NOHET) X <- X[-2, ] ##more than 1 X chromosome, but all homozygous		
1029
+	##How to quickly generate Xstar, Xstar = diag(W) %*% X
1030
+	Xstar <- apply(W, 1, generateX, X)
1031
+	IXTX <- apply(Xstar, 2, generateIXTX, nrow=nrow(X))
1032
+	betahat <- matrix(NA, 2, nrow(Ystar))
1033
+	ses <- matrix(NA, 2, nrow(Ystar))
1034
+	for(i in 1:nrow(Ystar)){
1035
+		betahat[, i] <- crossprod(matrix(IXTX[, i], ncol(X), ncol(X)), crossprod(matrix(Xstar[, i], nrow=nrow(X)), Ystar[i, ]))
1036
+		ssr <- sum((Ystar[i, ] - matrix(Xstar[, i], nrow(X), ncol(X)) %*% matrix(betahat[, i], ncol(X), 1))^2)
1037
+		ses[, i] <- sqrt(diag(matrix(IXTX[, i], ncol(X), ncol(X)) * ssr))
1038
+	}
1039
+	nu <- betahat[1, ]
1040
+	phi <- betahat[2, ]
1041
+	return(list(nu, phi))
1042
+}
1043
+
1044
+nuphiAlleleX <- function(allele, Ystar, W, Ns, chrX=FALSE){
1045
+	complete <- rowSums(is.na(W)) == 0 
1046
+	if(any(!is.finite(W))){## | any(!is.finite(V))){
1047
+		i <- which(rowSums(!is.finite(W)) > 0)
1048
+		stop("Possible zeros in the within-genotype estimates of the spread (vA, vB). ")
1049
+	}
1050
+	##NOHET <- mean(Ns[, 2], na.rm=TRUE) < 0.05
1051
+	if(missing(allele)) stop("must specify allele")
1052
+	if(allele == "A") X <- cbind(1, c(1, 0, 2, 1, 0), c(0, 1, 0, 1, 2))
1053
+	if(allele == "B") X <- cbind(1, c(0, 1, 0, 1, 2), c(1, 0, 2, 1, 0))	
1054
+	##if(NOHET) X <- X[-2, ] ##more than 1 X chromosome, but all homozygous		
1055
+	##How to quickly generate Xstar, Xstar = diag(W) %*% X
1056
+	Xstar <- apply(W, 1, generateX, X)
1057
+	IXTX <- apply(Xstar, 2, generateIXTX, nrow=nrow(X))
1058
+
1059
+	betahat <- matrix(NA, 3, nrow(Ystar))
1060
+	ses <- matrix(NA, 3, nrow(Ystar))			
1061
+	##betahat <- matrix(NA, 2, nrow(Ystar))
1062
+	##ses <- matrix(NA, 2, nrow(Ystar))
1063
+	for(i in 1:nrow(Ystar)){
1064
+		betahat[, i] <- crossprod(matrix(IXTX[, i], ncol(X), ncol(X)), crossprod(matrix(Xstar[, i], nrow=nrow(X)), Ystar[i, ]))
1065
+		ssr <- sum((Ystar[i, ] - matrix(Xstar[, i], nrow(X), ncol(X)) %*% matrix(betahat[, i], ncol(X), 1))^2)
1066
+		ses[, i] <- sqrt(diag(matrix(IXTX[, i], ncol(X), ncol(X)) * ssr))
1067
+	}
1068
+	nu <- betahat[1, ]
1069
+	phi <- betahat[2, ]
1070
+	phi2 <- betahat[3, ]
1071
+	return(list(nu, phi, phi2))
1072
+}
1073
+
397 1074
 
398 1075
 nuphiAllele <- function(object, allele, Ystar, W, tmp.objects, cnOptions){
399 1076
 	I <- isSnp(object)
... ...
@@ -410,7 +1087,8 @@ nuphiAllele <- function(object, allele, Ystar, W, tmp.objects, cnOptions){
410 1087
 	Ns <- Ns[I, ]
411 1088
 	
412 1089
 	CHR <- unique(chromosome(object))
413
-	batch <- unique(object$batch)
1090
+	##batch <- unique(object$batch)
1091
+	batch <- unique(batch(object))
414 1092
 	nuA <- getParam(object, "nuA", batch)
415 1093
 	nuB <- getParam(object, "nuB", batch)
416 1094
 	nuA.se <- getParam(object, "nuA.se", batch)
... ...
@@ -518,57 +1196,6 @@ predictGender <- function(res, cdfName="genomewidesnp6", SNRMin=5){
518 1196
 	return(gender)
519 1197
 }
520 1198
 
521
-##combineIntensities <- function(res, NP=NULL, callSet){
522
-##	rownames(res$B) <- rownames(res$A) <- res$gns
523
-##	colnames(res$B) <- colnames(res$A) <- res$sns
524
-##	if(!is.null(NP)){
525
-##		blank <- matrix(NA, nrow(NP), ncol(NP))
526
-##		dimnames(blank) <- dimnames(NP)
527
-##		A <- rbind(res$A, NP)
528
-##		B <- rbind(res$B, blank)
529
-##	} else {
530
-##		A <- res$A
531
-##		B <- res$B
532
-##	}
533
-##	dimnames(B) <- dimnames(A)
534
-##	index.snps <- match(res$gns, rownames(A))
535
-##	callsConfs <- calls <- matrix(NA, nrow(A), ncol(A), dimnames=dimnames(A))
536
-##	
537
-##	calls[index.snps, ] <- calls(callSet)
538
-##	callsConfs[index.snps, ] <- assayData(callSet)[["callProbability"]]
539
-##	fd <- data.frame(matrix(NA, nrow(calls), length(fvarLabels(callSet))))
540
-##	fd[index.snps, ] <- fData(callSet)
541
-##	rownames(fd) <- rownames(A)
542
-##	colnames(fd) <- fvarLabels(callSet)
543
-##	fD <- new("AnnotatedDataFrame",
544
-##		  data=data.frame(fd),
545
-##		  varMetadata=data.frame(labelDescription=colnames(fd), row.names=colnames(fd)))
546
-##	superSet <- new("CNSet",
547
-##			CA=matrix(NA, nrow(A), ncol(A), dimnames=dimnames(A)),
548
-##			CB=matrix(NA, nrow(A), ncol(A), dimnames=dimnames(A)),
549
-##			alleleA=A,
550
-##			alleleB=B,
551
-##			call=calls,
552
-##			callProbability=callsConfs,
553
-##			featureData=fD,
554
-##			phenoData=phenoData(callSet),
555
-##			experimentData=experimentData(callSet),
556
-##			protocolData=protocolData(callSet),
557
-##			annotation=annotation(callSet))
558
-##	return(superSet)
559
-##}
560
-##
561
-harmonizeDimnamesTo <- function(object1, object2){
562
-	#object2 should be a subset of object 1
563
-	object2 <- object2[featureNames(object2) %in% featureNames(object1), ]
564
-	object1 <- object1[match(featureNames(object2), featureNames(object1)), ]
565
-	object1 <- object1[, match(sampleNames(object2), sampleNames(object1))]
566
-	stopifnot(all.equal(featureNames(object1), featureNames(object2)))
567
-	stopifnot(all.equal(sampleNames(object1), sampleNames(object2)))
568
-	return(object1)
569
-}
570
-
571
-
572 1199
 
573 1200
 crlmmCopynumber <- function(object,
574 1201
 			    chromosome=1:23,
... ...
@@ -588,12 +1215,22 @@ crlmmCopynumber <- function(object,
588 1215
 			    MIN.PHI=2^3,
589 1216
 			    THR.NU.PHI=TRUE,
590 1217
 			    thresholdCopynumber=TRUE){
591
-	stopifnot("batch" %in% varLabels(object))
1218
+	if(isPackageLoaded("ff")){
1219
+		open(object)
1220
+		open(object$SKW)
1221
+		open(object$SNR)
1222
+	}
1223
+	stopifnot("batch" %in% varLabels(protocolData(object)))
592 1224
 	stopifnot("chromosome" %in% fvarLabels(object))
593 1225
 	stopifnot("position" %in% fvarLabels(object))
594 1226
 	stopifnot("isSnp" %in% fvarLabels(object))
595
-	batch <- object$batch
596
-	batches <- split((1:ncol(object))[object$SNR > SNRMin], batch[object$SNR > SNRMin])
1227
+	##batch <- object$batch
1228
+	batch <- batch(object)
1229
+	if(isPackageLoaded("ff")) {
1230
+		open(object$SNR)
1231
+		SNR <- object$SNR[, ]
1232
+	} else SNR <-  object$SNR
1233
+	batches <- split((1:ncol(object))[SNR > SNRMin], batch[SNR > SNRMin])
597 1234
 	if(any(sapply(batches, length) < MIN.SAMPLES)) message("Excluding batches with fewer than ", MIN.SAMPLES, " samples")
598 1235
 	batches <- batches[sapply(batches, length) >= MIN.SAMPLES]
599 1236
 	if(missing(which.batches)) which.batches <- seq(along=batches)
... ...
@@ -618,7 +1255,7 @@ crlmmCopynumber <- function(object,
618 1255
 				   annotation=annotation(object))
619 1256
 			featureData(tmp) <- addFeatureAnnotation(tmp)
620 1257
 			featureData(tmp) <- lm.parameters(tmp, batch=unique(batch[j]))
621
-			tmp$batch <- batch[j]
1258
+			tmp$batch <- batch(object)[j]
622 1259
 			tmp <- computeCopynumber(tmp,
623 1260
 						 MIN.OBS=MIN.OBS,
624 1261
 						 DF.PRIOR=DF.PRIOR,
... ...
@@ -640,10 +1277,19 @@ crlmmCopynumber <- function(object,
640 1277
 					  dimnames=list(featureNames(tmp), sampleNames(tmp)))
641 1278
 			CA(object)[row.index, j] <- CA(tmp)
642 1279
 			CB(object)[row.index, j] <- CB(tmp)
643
-			labels.asis <- fvarLabels(tmp)
644
-			labels.asis <- gsub(paste("_", unique(tmp$batch), sep=""), paste(".", ii, sep=""), labels.asis)
645
-			k <- match(labels.asis, colnames(lM(object)))
646
-			lM(object)[row.index, k] <- fData(tmp)
1280
+			##ad-hocery
1281
+			batchName <- unique(batch(object)[j])
1282
+			fvarLabels(tmp)[15:17] <- paste(c("corrAB", "corrBB", "corrAA"), batchName, sep=".")
1283
+			fvarLabels(tmp)[13:14] <- paste(c("phiPrimeA", "phiPrimeB"), batchName, sep=".")
1284
+			fvarLabels(tmp) <- gsub("_", ".", fvarLabels(tmp))
1285
+			##fvarLabels(tmp) <- gsub("\\.[1-9]", "", fvarLabels(tmp))
1286
+			fData(tmp) <- fData(tmp)[, fvarLabels(tmp) %in% names(physical(lM(object)))]
1287
+			jj <- match(fvarLabels(tmp), names(lM(object)))
1288
+			lM(object)[row.index, jj] <- fData(tmp)
1289
+			##labels.asis <- fvarLabels(tmp)
1290
+			##labels.asis <- gsub(paste("_", unique(tmp$batch), sep=""), paste(".", ii, sep=""), labels.asis)
1291
+			##k <- match(labels.asis, colnames(lM(object)))
1292
+			##lM(object)[row.index, k] <- fData(tmp)
647 1293
 			rm(tmp); gc()
648 1294
 			ii <- ii+1
649 1295
 		}
... ...
@@ -651,6 +1297,705 @@ crlmmCopynumber <- function(object,
651 1297
 	return(object)
652 1298
 }
653 1299
 
1300
+crlmmCopynumber2 <- function(object,
1301
+			    which.batches,
1302
+			    MIN.SAMPLES=10,
1303
+			    SNRMin=5,
1304
+			    MIN.OBS=1,
1305
+			    DF.PRIOR=50,
1306
+			    bias.adj=FALSE,
1307
+			    prior.prob=rep(1/4,4),
1308
+			    seed=1,
1309
+			    verbose=TRUE,
1310
+			    GT.CONF.THR=0.99,
1311
+			    PHI.THR=2^6,
1312
+			    nHOM.THR=5,
1313
+			    MIN.NU=2^3,
1314
+			    MIN.PHI=2^3,
1315
+			    THR.NU.PHI=TRUE,
1316
+			    thresholdCopynumber=TRUE,
1317
+			     load.it=TRUE){
1318
+	stopifnot("batch" %in% varLabels(protocolData(object)))
1319
+	stopifnot("chromosome" %in% fvarLabels(object))
1320
+	stopifnot("position" %in% fvarLabels(object))
1321
+	stopifnot("isSnp" %in% fvarLabels(object))
1322
+	batch <- batch(object)
1323
+	XIndex.snps <- (1:nrow(object))[chromosome(object) == 23 & isSnp(object) & !is.na(chromosome(object))]
1324
+	##YIndex.snps <- (1:nrow(object))[chromosome(object) == 24 & isSnp(object)]
1325
+	XIndex.nps <- (1:nrow(object))[chromosome(object) == 23 & !isSnp(object) & !is.na(chromosome(object))]
1326
+	autosomeIndex.snps <- (1:nrow(object))[chromosome(object) < 23 & isSnp(object) & !is.na(chromosome(object))]
1327
+	autosomeIndex.nps <- (1:nrow(object))[chromosome(object) < 23 & !isSnp(object) & !is.na(chromosome(object))]	
1328
+	##Indexes <- list(autosomeIndex, XIndex, YIndex)
1329
+	##snpBatches <- splitIndicesByLength(1:nrow(object), ocProbesets())
1330
+	snpBatches <- splitIndicesByLength(autosomeIndex.snps, ocProbesets())
1331
+	## Do chromosome X in batches
1332
+	Ns <- initializeBigMatrix("Ns", nrow(object), 5)
1333
+	colnames(Ns) <- c("A", "B", "AA", "AB", "BB")
1334
+	if(!file.exists(file.path(ldPath(), "normal.rda"))){
1335
+		normal <- initializeBigMatrix("normal", nrow(object), ncol(object), vmode="integer")
1336
+		normal[,] <- 1L
1337
+		save(normal, file=file.path(ldPath(), "normal.rda"))
1338
+	} else load(file.path(ldPath(), "normal.rda"))
1339
+	if(!file.exists(file.path(ldPath(), "snpflags.rda"))){
1340
+		snpflags <- initializeBigMatrix("snpflags", nrow(object), length(unique(batch(object))), vmode="integer")
1341
+		snpflags[,] <- 0L
1342
+		save(snpflags, file=file.path(ldPath(), "snpflags.rda"))
1343
+	} else{
1344
+		load(file.path(ldPath(), "snpflags.rda"))
1345
+				
1346
+	} 
1347
+	if(verbose) message("Estimating allele-specific copy number at autosomal SNPs")
1348
+	ocLapply(seq(along=snpBatches),
1349
+			fit.lm1,
1350
+			autosomeIndex=autosomeIndex.snps,
1351
+			object=object,
1352
+			Ns=Ns,
1353
+			normal=normal,
1354
+			snpflags=snpflags,
1355
+			snpBatches=snpBatches,
1356
+			batchSize=ocProbesets(),
1357
+			SNRMin=SNRMin,
1358
+			MIN.SAMPLES=MIN.SAMPLES,
1359
+			MIN.OBS=MIN.OBS,
1360
+			DF=DF.PRIOR,
1361
+			GT.CONF.THR=GT.CONF.THR,
1362
+			THR.NU.PHI=THR.NU.PHI,
1363
+			MIN.NU=MIN.NU,
1364
+			MIN.PHI=MIN.PHI,
1365
+			verbose=verbose,
1366
+			neededPkgs="crlmm")
1367
+	## autosomal NPs
1368
+	snpBatches <- splitIndicesByLength(autosomeIndex.nps, ocProbesets())
1369
+	if(verbose) message("Estimating total copy number at nonpolymorphic loci")
1370
+	ocLapply(seq(along=snpBatches),
1371
+			fit.lm2,
1372
+			autosomeIndex=autosomeIndex.nps,
1373
+			object=object,
1374
+			Ns=Ns,
1375
+			normal=normal,
1376
+			snpflags=snpflags,
1377
+			snpBatches=snpBatches,
1378
+			batchSize=ocProbesets(),
1379
+			SNRMin=SNRMin,
1380
+			MIN.SAMPLES=MIN.SAMPLES,
1381
+			MIN.OBS=MIN.OBS,
1382
+			DF=DF.PRIOR,
1383
+			GT.CONF.THR=GT.CONF.THR,
1384
+			THR.NU.PHI=THR.NU.PHI,
1385
+			MIN.NU=MIN.NU,
1386
+			MIN.PHI=MIN.PHI,
1387
+			verbose=verbose,
1388
+		 neededPkgs="crlmm")
1389
+	snpBatches <- splitIndicesByLength(XIndex.snps, ocProbesets())
1390
+	if(verbose) message("Estimating total copy number at polymorphic loci on chromosome X")
1391
+	ocLapply(seq(along=snpBatches),
1392
+			fit.lm3,
1393
+			autosomeIndex=XIndex.snps,
1394
+			object=object,
1395
+			Ns=Ns,
1396
+			normal=normal,
1397
+			snpflags=snpflags,
1398
+			snpBatches=snpBatches,
1399
+			batchSize=ocProbesets(),
1400
+			SNRMin=SNRMin,
1401
+			MIN.SAMPLES=MIN.SAMPLES,
1402
+			MIN.OBS=MIN.OBS,
1403
+			DF=DF.PRIOR,
1404
+			GT.CONF.THR=GT.CONF.THR,
1405
+			THR.NU.PHI=THR.NU.PHI,
1406
+			MIN.NU=MIN.NU,
1407
+			MIN.PHI=MIN.PHI,
1408
+			verbose=verbose,
1409
+		 neededPkgs="crlmm")
1410
+	if(verbose) message("Estimating total copy number for nonpolymorphic loci on chromosome X")
1411
+	snpBatches <- splitIndicesByLength(XIndex.nps, ocProbesets())
1412
+	if(verbose) message("Estimating total copy number at nonpolymorphic loci on chromosome X")
1413
+	tmp <- ocLapply(seq(along=snpBatches),
1414
+			fit.lm4,
1415
+			XIndex=XIndex.nps,
1416
+			object=object,
1417
+			Ns=Ns,
1418
+			normal=normal,
1419
+			snpflags=snpflags,
1420
+			snpBatches=snpBatches,
1421
+			batchSize=ocProbesets(),
1422
+			SNRMin=SNRMin,
1423
+			MIN.SAMPLES=MIN.SAMPLES,
1424
+			MIN.OBS=MIN.OBS,
1425
+			DF=DF.PRIOR,
1426
+			GT.CONF.THR=GT.CONF.THR,
1427
+			THR.NU.PHI=THR.NU.PHI,
1428
+			MIN.NU=MIN.NU,
1429
+			MIN.PHI=MIN.PHI,
1430
+			verbose=verbose,
1431
+			neededPkgs="crlmm")
1432
+	return(object)
1433
+}
1434
+
1435
+fit.lm1 <- function(idxBatch,
1436
+		    snpBatches,
1437
+		    autosomeIndex,
1438
+		    object,
1439
+		    Ns,
1440
+		    normal,
1441
+		    snpflags,
1442
+		    batchSize,
1443
+		    SNRMin,
1444
+		    MIN.SAMPLES,
1445
+		    MIN.OBS,
1446
+		    DF.PRIOR,
1447
+		    GT.CONF.THR,
1448
+		    THR.NU.PHI,
1449
+		    MIN.NU,
1450
+		    MIN.PHI,
1451
+		    verbose, ...){
1452
+			##   which.batches, ...){
1453
+	if(verbose) message("Probe batch ", idxBatch, " of ", length(snpBatches))
1454
+	open(object)
1455
+	open(snpflags)
1456
+	open(normal)
1457
+	snps <- snpBatches[[idxBatch]]
1458
+		batches <- split(seq(along=batch(object)), batch(object))
1459
+	batches <- batches[sapply(batches, length) >= MIN.SAMPLES]
1460
+	corr <- corrA.BB <- corrB.AA <- sig2B <- sig2A <- tau2B <- tau2A <- matrix(NA, length(snps), length(unique(batches)))
1461
+	flags <- nuA <- nuB <- phiA <- phiB <- corr
1462
+	cB <- cA <- matrix(NA, length(snps), ncol(object))
1463
+	for(k in batches){
1464
+		G <- calls(object)[snps, k]
1465
+		NORM <- normal[snps, k]
1466
+		xx <- snpCallProbability(object)[snps, k]
1467
+		highConf <- (1-exp(-xx/1000)) > GT.CONF.THR
1468
+		G <- G*highConf*NORM
1469
+		A <- A(object)[snps, k]
1470
+		B <- B(object)[snps, k]
1471
+		##index <- GT.B <- GT.A <- vector("list", 3)
1472
+		##names(index) <- names(GT.B) <- names(GT.A) <- c("AA", "AB", "BB")
1473
+		Ns <- applyByGenotype(matrix(1, nrow(G), ncol(G)), rowSums, G)
1474
+		muA <- applyByGenotype(A, rowMedians, G)
1475
+		muB <- applyByGenotype(B, rowMedians, G)
1476
+		vA <- applyByGenotype(A, rowMAD, G)
1477
+		vB <- applyByGenotype(B, rowMAD, G)
1478
+		vA <- shrink(vA, Ns, DF.PRIOR)
1479
+		vB <- shrink(vB, Ns, DF.PRIOR)
1480
+		##location and scale
1481
+		J <- match(unique(batch(object)[k]), unique(batch(object)))
1482
+		##background variance for alleleA
1483
+		taus <- applyByGenotype(log2(A), rowMAD, G)^2
1484
+		tau2A[, J] <- shrink(taus[, 3, drop=FALSE], Ns[, 3], DF.PRIOR)
1485
+		sig2A[, J] <- shrink(taus[, 1, drop=FALSE], Ns[, 1], DF.PRIOR)
1486
+		taus <- applyByGenotype(log2(B), rowMAD, G)^2
1487
+		tau2B[, J] <- shrink(taus[, 3, drop=FALSE], Ns[, 1], DF.PRIOR)
1488
+		sig2B[, J] <- shrink(taus[, 1, drop=FALSE], Ns[, 3], DF.PRIOR)
1489
+
1490
+		corr[, J] <- corByGenotype(A=A, B=B, G=G, Ns=Ns, which.cluster=2, DF.PRIOR)
1491
+		corrB.AA[, J] <- corByGenotype(A=A, B=B, G=G, Ns=Ns, which.cluster=1, DF.PRIOR)
1492
+		corrA.BB[, J] <- corByGenotype(A=A, B=B, G=G, Ns=Ns, which.cluster=3, DF.PRIOR)
1493
+		##formerly oneBatch()...
1494
+		##---------------------------------------------------------------------------
1495
+		## Impute sufficient statistics for unobserved genotypes (plate-specific)
1496
+		##---------------------------------------------------------------------------
1497
+		index <- apply(Ns, 2, function(x, MIN.OBS) which(x > MIN.OBS), MIN.OBS)
1498
+		correct.orderA <- muA[, 1] > muA[, 3]
1499
+		correct.orderB <- muB[, 3] > muB[, 1]
1500
+		index.complete <- intersect(which(correct.orderA & correct.orderB), intersect(index[[1]], intersect(index[[2]], index[[3]])))
1501
+		size <- min(5000, length(index.complete))
1502
+		if(size == 5000) index.complete <- sample(index.complete, 5000, replace=TRUE)
1503
+		if(length(index.complete) < 200){
1504
+			stop("fewer than 200 snps pass criteria for predicting the sufficient statistics")
1505
+		}
1506
+		index <- vector("list", 3)
1507
+		index[[1]] <- which(Ns[, 1] == 0 & (Ns[, 2] >= MIN.OBS & Ns[, 3] >= MIN.OBS))
1508
+		index[[2]] <- which(Ns[, 2] == 0 & (Ns[, 1] >= MIN.OBS & Ns[, 3] >= MIN.OBS))
1509
+		index[[3]] <- which(Ns[, 3] == 0 & (Ns[, 2] >= MIN.OBS & Ns[, 1] >= MIN.OBS))
1510
+		res <- imputeCenter(muA, muB, index.complete, index)
1511
+		muA <- res[[1]]
1512
+		muB <- res[[2]]
1513
+
1514
+		## Monomorphic SNPs.  Mixture model may be better
1515
+		## Improve estimation by borrowing strength across batch
1516
+		noAA <- Ns[, 1] < MIN.OBS
1517
+		noAB <- Ns[, 2] < MIN.OBS
1518
+		noBB <- Ns[, 3] < MIN.OBS
1519
+		index[[1]] <- noAA & noAB
1520
+		index[[2]] <- noBB & noAB
1521
+		index[[3]] <- noAA & noBB
1522
+		cols <- c(3, 1, 2)
1523
+		for(j in 1:3){
1524
+			if(sum(index[[j]]) == 0) next()
1525
+			kk <- cols[j]
1526
+			X <- cbind(1, muA[index.complete, kk], muB[index.complete, kk])
1527
+			Y <- cbind(muA[index.complete,  -kk],
1528
+				   muB[index.complete,  -kk])
1529
+			betahat <- solve(crossprod(X), crossprod(X,Y))
1530
+			X <- cbind(1, muA[index[[j]],  kk], muB[index[[j]],  kk])
1531
+			mus <- X %*% betahat
1532
+			muA[index[[j]], -kk] <- mus[, 1:2]
1533
+			muB[index[[j]], -kk] <- mus[, 3:4]
1534
+		}
1535
+		negA <- rowSums(muA < 0) > 0
1536
+		negB <- rowSums(muB < 0) > 0
1537
+		flags[, J] <- rowSums(Ns == 0) > 0
1538
+		##flags[, J] <- index[[1]] | index[[2]] | index[[3]] | rowSums(
1539
+
1540
+		##formerly coefs()
1541
+		Np <- Ns
1542
+		Np[Np < 1] <- 1
1543
+		vA2 <- vA^2/Np
1544
+		vB2 <- vB^2/Np
1545
+		wA <- sqrt(1/vA2)
1546
+		wB <- sqrt(1/vB2)
1547
+		YA <- muA*wA
1548
+		YB <- muB*wB
1549
+		res <- nuphiAllele2(allele="A", Ystar=YA, W=wA, Ns=Ns)
1550
+		nuA[, J] <- res[[1]]
1551
+		phiA[, J] <- res[[2]]
1552
+		res <- nuphiAllele2(allele="B", Ystar=YB, W=wB, Ns=Ns)
1553
+		nuB[, J] <- res[[1]]
1554
+		phiB[, J] <- res[[2]]
1555
+		if(THR.NU.PHI){
1556
+			nuA[nuA[, J] < MIN.NU, J] <- MIN.NU
1557
+			nuB[nuB[, J] < MIN.NU, J] <- MIN.NU
1558
+			phiA[phiA[, J] < MIN.PHI, J] <- MIN.PHI
1559
+			phiB[phiB[, J] < MIN.PHI, J] <- MIN.PHI
1560
+		}
1561
+		## formerly polymorphic():  calculate copy number
1562
+		cA[, k] <- matrix((1/phiA[, J]*(A-nuA[, J])), nrow(A), ncol(A))
1563
+		cB[, k] <- matrix((1/phiB[, J]*(B-nuB[, J])), nrow(B), ncol(B))
1564
+	}
1565
+	cA <- matrix(as.integer(cA*100), nrow(cA), ncol(cA))
1566
+	cB <- matrix(as.integer(cB*100), nrow(cB), ncol(cB))
1567
+	CA(object)[snps, ] <- cA
1568
+	CB(object)[snps, ] <- cB
1569
+	snpflags[snps, ] <- flags
1570
+	lapply(lM(object), open)
1571
+	lM(object)$tau2A[snps, ] <- tau2A
1572
+	lM(object)$tau2B[snps, ] <- tau2B
1573
+	lM(object)$sig2A[snps, ] <- sig2A
1574
+	lM(object)$sig2B[snps, ] <- sig2B
1575
+	lM(object)$nuA[snps, ] <- nuA
1576
+	lM(object)$nuB[snps, ] <- nuB
1577
+	lM(object)$phiA[snps, ] <- phiA
1578
+	lM(object)$phiB[snps, ] <- phiB
1579
+	lapply(assayData(object), close)
1580
+	lapply(lM(object), close)
1581
+	TRUE
1582
+}
1583
+
1584
+fit.lm2 <- function(idxBatch,
1585
+		    snpBatches,
1586
+		    autosomeIndex,
1587
+		    object,
1588
+		    Ns,
1589
+		    normal,
1590
+		    snpflags,
1591
+		    batchSize,
1592
+		    SNRMin,
1593
+		    MIN.SAMPLES,
1594
+		    MIN.OBS,
1595
+		    DF.PRIOR,
1596
+		    GT.CONF.THR,
1597
+		    THR.NU.PHI,
1598
+		    MIN.NU,
1599
+		    MIN.PHI,
1600
+		    verbose, ...){
1601
+			##   which.batches, ...){
1602
+	if(verbose) message("Probe batch ", idxBatch, " of ", length(snpBatches))
1603
+	open(object)
1604
+	open(snpflags)
1605
+	open(normal)
1606
+	snps <- snpBatches[[idxBatch]]
1607
+	batches <- split(seq(along=batch(object)), batch(object))
1608
+	batches <- batches[sapply(batches, length) >= MIN.SAMPLES]
1609
+	cA <- matrix(NA, length(snps), ncol(object))
1610
+	ii <- isSnp(object) & chromosome(object) < 23 & !is.na(chromosome(object))
1611
+	flags <- snpflags[, ]
1612
+	noflags <- rowSums(flags) == 0
1613
+	## We do not want to write to discuss for each batch.  More efficient to
1614
+	## write to disk after estimating these parameters for all batches.
1615
+	nuA.np <- phiA.np <- sig2A.np <- matrix(NA, length(snps), length(unique(batches)))
1616
+	## for imputation, we need the corresponding parameters of the snps
1617
+	nuA <- lM(object)$nuA
1618
+	phiA <- lM(object)$phiA
1619
+	nuB <- lM(object)$nuB
1620
+	phiB <- lM(object)$phiB
1621
+	snp.ind <- ii & noflags
1622
+	NORM.np <- normal[snps, ]
1623
+	for(k in batches){
1624
+		##if(verbose) message("SNP batch ", ii, " of ", length(batches))
1625
+		J <- match(unique(batch(object)[k]), unique(batch(object)))
1626
+		snp.index <- snp.ind & nuA[, J] > 20 & nuB[, J] > 20 & phiA[, J] > 20 & phiB[, J] > 20
1627
+		if(sum(snp.index) >= 5000){
1628
+			snp.index <- sample(which(snp.index), 5000)
1629
+		}
1630
+		phiA.snp <- phiA[snp.index, J]
1631
+		phiB.snp <- phiB[snp.index, J]
1632
+		A.snp <- A(object)[snp.index, k]
1633
+		B.snp <- B(object)[snp.index, k]
1634
+		NORM.snp <- normal[snp.index, k]
1635
+		G <- calls(object)[snp.index, k]
1636
+		xx <- snpCallProbability(object)[snp.index, k]
1637
+		highConf <- (1-exp(-xx/1000)) > GT.CONF.THR
1638
+		G <- G*highConf*NORM.snp[, k]
1639
+		G[G==0] <- NA
1640
+		##nonpolymorphic
1641
+		A.np <- A(object)[snps, k]
1642
+		Ns <- applyByGenotype(matrix(1, nrow(G), ncol(G)), rowSums, G)
1643
+		muA <- applyByGenotype(A.snp, rowMedians, G)
1644
+		muB <- applyByGenotype(B.snp, rowMedians, G)
1645
+		muA <- muA[, 1]
1646
+		muB <- muB[, 3]
1647
+		X <- cbind(1, log2(c(muA, muB)))