Browse code

added genotype2 and crlmmCopynumber2

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@45672 bc3139a8-67e5-0310-9ffc-ced21a209358

Rob Scharp authored on 02/04/2010 03:03:19
Showing9 changed files

... ...
@@ -489,3 +489,10 @@ then readIDAT() should work. Thanks to Pierre Cherel who reported this error.
489 489
 
490 490
 ** added parallel/large dataset support to snprma/crlmm
491 491
 ** merged changes on .41 with my local changes
492
+
493
+2010-04-01 R.Scharpf committed version 1.5.44
494
+
495
+** added functions genotype2, cnrma2 for preprocessing and genotyping
496
+   with crlmm2
497
+** added crlmmCopynumber2 for parallel support with copy number
498
+   est. (needs more checking)
... ...
@@ -1,7 +1,7 @@
1 1
 Package: crlmm
2 2
 Type: Package
3 3
 Title: Genotype Calling (CRLMM) and Copy Number Analysis tool for Affymetrix SNP 5.0 and 6.0 and Illumina arrays.
4
-Version: 1.5.43
4
+Version: 1.5.44
5 5
 Date: 2010-02-05
6 6
 Author: Rafael A Irizarry, Benilton S Carvalho <carvalho@bclab.org>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.edu.au>
7 7
 Maintainer: Benilton S Carvalho <carvalho@bclab.org>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.EDU.AU>
... ...
@@ -12,7 +12,7 @@ importMethodsFrom(Biobase, annotation, "annotation<-",
12 12
                   annotatedDataFrameFrom, assayData, "assayData<-",
13 13
                   combine, dims, experimentData, "experimentData<-",
14 14
                   fData, featureData, "featureData<-", featureNames,
15
-                  fvarMetadata, fvarLabels, pData, phenoData,
15
+                  fvarMetadata, fvarLabels, pData, "pData<-", phenoData,
16 16
                   "phenoData<-", protocolData, "protocolData<-",
17 17
                   pubMedIds, rowMedians, sampleNames, snpCall,
18 18
                   snpCallProbability,
... ...
@@ -67,7 +67,12 @@ export(crlmm,
67 67
        readIdatFiles, 
68 68
        snprma,
69 69
        snprma2,
70
-       crlmm2) 
70
+       crlmm2,
71
+       genotype2,
72
+       cnrma2,
73
+       processCEL2,
74
+       batch,
75
+       crlmmCopynumber2)
71 76
 
72 77
 export(initializeParamObject)
73 78
 
... ...
@@ -1,3 +1,4 @@
1
+setGeneric("batch", function(object) standardGeneric("batch"))
1 2
 setGeneric("getParam", function(object, name, batch) standardGeneric("getParam"))
2 3
 setGeneric("cnIndex", function(object) standardGeneric("cnIndex"))
3 4
 setGeneric("cnNames", function(object) standardGeneric("cnNames"))
... ...
@@ -54,19 +54,20 @@ getFeatureData.Affy <- function(cdfName, copynumber=FALSE){
54 54
 	##crlmmOpts$snpRange <- range(snpIndex)
55 55
 	##crlmmOpts$npRange <- range(npIndex)
56 56
 }
57
-construct <- function(filenames, cdfName, copynumber=FALSE, sns, verbose=TRUE){
57
+construct <- function(filenames, cdfName, copynumber=FALSE,
58
+		      sns, verbose=TRUE){
58 59
 	if(verbose) message("Initializing container for assay data elements alleleA, alleleB, call, callProbability")
59 60
 	if(missing(sns)) sns <- basename(filenames)
60 61
 	protocolData <- getProtocolData.Affy(filenames)
61 62
 	featureData <- getFeatureData.Affy(cdfName, copynumber=copynumber)
62 63
 	nr <- nrow(featureData); nc <- length(filenames)
63 64
 	callSet <- new("SnpSuperSet", 
64
-			 alleleA=initializeBigMatrix(name="A", nr, nc),
65
-			 alleleB=initializeBigMatrix(name="B", nr, nc),
66
-			 call=initializeBigMatrix(name="call", nr, nc),
67
-			 callProbability=initializeBigMatrix(name="callPr", nr,nc),
68
-			 featureData=featureData,
69
-			 annotation=cdfName)
65
+		       alleleA=initializeBigMatrix(name="A", nr, nc),
66
+		       alleleB=initializeBigMatrix(name="B", nr, nc),
67
+		       call=initializeBigMatrix(name="call", nr, nc),
68
+		       callProbability=initializeBigMatrix(name="callPr", nr,nc),
69
+		       featureData=featureData,
70
+		       annotation=cdfName)
70 71
 	pd <- data.frame(matrix(NA, nc, 3), row.names=sns)
71 72
 	colnames(pd)=c("SKW", "SNR", "gender")
72 73
 	phenoData(callSet) <- new("AnnotatedDataFrame", data=pd)
... ...
@@ -167,14 +168,557 @@ genotype <- function(filenames,
167 168
 			       verbose=verbose,
168 169
 			       returnParams=returnParams,
169 170
 			       badSNP=badSNP)
170
-		suppressWarnings(snpCall(callSet)[snp.index, j] <- tmp[["calls"]])
171
-		suppressWarnings(snpCallProbability(callSet)[snp.index, j] <- tmp[["confs"]])
171
+		snpCall(callSet)[snp.index, j] <- tmp[["calls"]]
172
+		snpCallProbability(callSet)[snp.index, j] <- tmp[["confs"]]
172 173
 		callSet$gender[j] <- tmp$gender
173 174
 		iter <- iter+1
174 175
 	}
175 176
 	return(callSet)
176 177
 }
177 178
 
179
+genotype2 <- function(filenames,
180
+		      cdfName,
181
+		      batch,
182
+		      mixtureSampleSize=10^5,
183
+		      eps=0.1,
184
+		      verbose=TRUE,
185
+		      seed=1,
186
+		      sns,
187
+		      copynumber=FALSE,
188
+		      probs=rep(1/3, 3),
189
+		      DF=6,
190
+		      SNRMin=5,
191
+		      recallMin=10,
192
+		      recallRegMin=1000,
193
+		      gender=NULL,
194
+		      returnParams=TRUE,
195
+		      badSNP=0.7){
196
+	if(!copynumber){
197
+		callSet <- crlmm2(filenames=filenames,
198
+				  cdfName=cdfName,
199
+				  mixtureSampleSize=mixtureSampleSize,
200
+				  eps=eps,
201
+				  verbose=verbose,
202
+				  seed=seed,
203
+				  sns=sns,
204
+				  probs=probs,
205
+				  DF=DF,
206
+				  SNRMin=SNRMin,
207
+				  recallMin=recallMin,
208
+				  recallRegMin=recallRegMin,
209
+				  gender=gender,
210
+				  returnParams=returnParams,
211
+				  badSNP=badSNP)
212
+		return(callSet)
213
+	}
214
+	if(missing(cdfName)) stop("must specify cdfName")
215
+	if(!isValidCdfName(cdfName)) stop("cdfName not valid.  see validCdfNames")
216
+	if(missing(batch)){
217
+		warning("The batch variable is not specified. The scan date of the array will be used as a surrogate for batch.  The batch variable does not affect the preprocessing or genotyping, but is important for copy number estimation.")
218
+	} else {
219
+		if(length(batch) != length(filenames))
220
+			stop("batch variable must be the same length as the filenames")
221
+	}
222
+	if(missing(sns)) sns <- basename(filenames)
223
+	## callSet contains potentially very big matrices
224
+	callSet <- construct(filenames=filenames,
225
+			     cdfName=cdfName,
226
+			     copynumber=TRUE,
227
+			     sns=sns,
228
+			     verbose=verbose)
229
+	if(missing(batch)){
230
+		protocolData(callSet)$batch <- as.numeric(as.factor(protocolData(callSet)$ScanDate))
231
+	}
232
+	if(!missing(batch)) protocolData(callSet)$batch <- batch
233
+	mixtureParams <- matrix(NA, 4, length(filenames))
234
+	snp.index <- which(isSnp(callSet)==1)
235
+	snprmaRes <- snprma2(##A=A(callSet),
236
+			     ##B=B(callSet),
237
+			       filenames=filenames,
238
+			       mixtureSampleSize=mixtureSampleSize,
239
+			       fitMixture=TRUE,
240
+			       eps=eps,
241
+			       verbose=verbose,
242
+			       seed=seed,
243
+			       cdfName=cdfName,
244
+			       sns=sns)
245
+	if(verbose) message("Finished preprocessing.")
246
+	open(snprmaRes[["A"]])
247
+	open(snprmaRes[["B"]])
248
+	open(snprmaRes[["SNR"]])
249
+	open(snprmaRes[["SKW"]])
250
+	open(snprmaRes[["mixtureParams"]])
251
+	if(verbose) message("Updating elements of callSet")
252
+##	A(callSet) <- snprmaRes[["A"]]
253
+##	B(callSet) <- snprmaRes[["B"]]
254
+	bb = ocProbesets()*ncol(A)*8
255
+	ffrowapply(A(callSet)[i1:i2, ] <- snprmaRes[["A"]][i1:i2, ], X=snprmaRes[["A"]], BATCHBYTES=bb)
256
+	ffrowapply(B(callSet)[i1:i2, ] <- snprmaRes[["B"]][i1:i2, ], X=snprmaRes[["B"]], BATCHBYTES=bb)
257
+	##ffrowapply(A(callSet)[i1:i2, ] <- snprmaRes[["A"]][i1:i2, ], X=snprmaRes[["A"]])
258
+	##ffrowapply(B(callSet)[i1:i2, ] <- snprmaRes[["B"]][i1:i2, ], X=snprmaRes[["B"]])
259
+	if(verbose) message("Finished updating elements of callSet")
260
+	stopifnot(identical(featureNames(callSet)[snp.index], snprmaRes$gns))
261
+	pData(callSet)$SKW <- snprmaRes$SKW
262
+	pData(callSet)$SNR <- snprmaRes$SNR
263
+	mixtureParams <- snprmaRes$mixtureParams
264
+	np.index <- which(isSnp(callSet) == 0)
265
+	cnrmaRes <- cnrma2(A=A(callSet),
266
+			   filenames=filenames,
267
+			   row.names=featureNames(callSet)[np.index],
268
+			   cdfName=cdfName,
269
+			   sns=sns,
270
+			   seed=seed,
271
+			   verbose=verbose)
272
+	rm(cnrmaRes); gc()
273
+	## as.matrix needed when ffdf is used
274
+	tmp <- crlmmGT2(A=snprmaRes[["A"]],
275
+			   B=snprmaRes[["B"]],
276
+			   SNR=snprmaRes[["SNR"]],
277
+			   mixtureParams=snprmaRes[["mixtureParams"]],
278
+			   cdfName=cdfName,
279
+			   row.names=NULL, ##featureNames(callSet),##[snp.index],
280
+			   col.names=sampleNames(callSet),
281
+			   probs=probs,
282
+			   DF=DF,
283
+			   SNRMin=SNRMin,
284
+			   recallMin=recallMin,
285
+			   recallRegMin=recallRegMin,
286
+			   gender=gender,
287
+			   verbose=verbose,
288
+			   returnParams=returnParams,
289
+			   badSNP=badSNP)
290
+	open(tmp[["calls"]])
291
+	open(tmp[["confs"]])
292
+	##snpCall(callSet) <- tmp[["calls"]]
293
+	##snpCallProbability(callSet) <- tmp[["confs"]]
294
+	bb = ocProbesets()*ncol(A)*8
295
+	ffrowapply(snpCall(callSet)[i1:i2, ] <- tmp[["calls"]][i1:i2, ], X=tmp[["calls"]], BATCHBYTES=bb)
296
+	ffrowapply(snpCallProbability(callSet)[i1:i2, ] <- tmp[["confs"]][i1:i2, ], X=tmp[["confs"]], BATCHBYTES=bb)
297
+##	ffrowapply(snpCall(callSet)[i1:i2, ] <- tmp[["calls"]][i1:i2, ], X=tmp[["calls"]])
298
+##	ffrowapply(snpCallProbability(callSet)[i1:i2, ] <- tmp[["confs"]][i1:i2, ], X=tmp[["confs"]])
299
+	callSet$gender <- tmp$gender
300
+	cnSet <- as(callSet, "CNSetLM")
301
+	return(cnSet)
302
+}
303
+
304
+snprma2RS <- function(A, B,
305
+		      filenames, mixtureSampleSize=10^5, fitMixture=FALSE,
306
+		      eps=0.1, verbose=TRUE, seed=1, cdfName, sns){
307
+  if (missing(sns)) sns <- basename(filenames)
308
+  if (missing(cdfName))
309
+    cdfName <- read.celfile.header(filenames[1])[["cdfName"]]
310
+  pkgname <- getCrlmmAnnotationName(cdfName)
311
+  stopifnot(require(pkgname, character.only=TRUE, quietly=!verbose))
312
+  
313
+  if(verbose) message("Loading annotations and mixture model parameters.")
314
+  loader("preprocStuff.rda", .crlmmPkgEnv, pkgname)
315
+  pnsa <- getVarInEnv("pnsa")
316
+  pnsb <- getVarInEnv("pnsb")
317
+  gns <- getVarInEnv("gns")
318
+  
319
+  ##We will read each cel file, summarize, and run EM one by one
320
+  ##We will save parameters of EM to use later
321
+  if(verbose) message("Initializing objects.")
322
+  mixtureParams <- initializeBigMatrix("crlmmMixt-", 4, length(filenames), "double")
323
+  SNR <- initializeBigVector("crlmmSNR-", length(filenames), "double")
324
+  SKW <- initializeBigVector("crlmmSKW-", length(filenames), "double")
325
+  
326
+  ## This is the sample for the fitting of splines
327
+  ## BC: I like better the idea of the user passing the seed,
328
+  ##     because this might intefere with other analyses
329
+  ##     (like what happened to GCRMA)
330
+  ##S will hold (A+B)/2 and M will hold A-B
331
+  ##NOTE: We actually dont need to save S. Only for pics etc...
332
+  ##f is the correction. we save to avoid recomputing
333
+##  A <- initializeBigMatrix("crlmmA-", length(pnsa), length(filenames), "integer")
334
+##  B <- initializeBigMatrix("crlmmB-", length(pnsb), length(filenames), "integer")
335
+
336
+  sampleBatches <- splitIndicesByNode(seq(along=filenames))
337
+
338
+  if(verbose) message("Processing ", length(filenames), " files.")
339
+
340
+  ocLapply(sampleBatches, processCEL_RS, filenames=filenames,
341
+           fitMixture=fitMixture, A=A, B=B, SKW=SKW, SNR=SNR,
342
+           mixtureParams=mixtureParams, eps=eps, seed=seed,
343
+           mixtureSampleSize=mixtureSampleSize, pkgname=pkgname,
344
+           neededPkgs=c("crlmm", pkgname))
345
+  
346
+  list(A=A, B=B, sns=sns, gns=gns, SNR=SNR, SKW=SKW, mixtureParams=mixtureParams, cdfName=cdfName)
347
+}
348
+
349
+processCEL_RS <- function(i, filenames, fitMixture, A, B, SKW, SNR,
350
+			  mixtureParams, eps, seed, mixtureSampleSize,
351
+			  pkgname){
352
+  
353
+  loader("preprocStuff.rda", .crlmmPkgEnv, pkgname)
354
+  loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname)
355
+  loader("mixtureStuff.rda", .crlmmPkgEnv, pkgname)
356
+  autosomeIndex <- getVarInEnv("autosomeIndex")
357
+  pnsa <- getVarInEnv("pnsa")
358
+  pnsb <- getVarInEnv("pnsb")
359
+  fid <- getVarInEnv("fid")
360
+  reference <- getVarInEnv("reference")
361
+  aIndex <- getVarInEnv("aIndex")
362
+  bIndex <- getVarInEnv("bIndex")
363
+  SMEDIAN <- getVarInEnv("SMEDIAN")
364
+  theKnots <- getVarInEnv("theKnots")
365
+  gns <- getVarInEnv("gns")
366
+
367
+  ## for mixture
368
+  set.seed(seed)
369
+  idx <- sort(sample(autosomeIndex, mixtureSampleSize))
370
+  ##for skewness. no need to do everything
371
+  idx2 <- sample(length(fid), 10^5)
372
+
373
+  open(A)
374
+  open(B)
375
+  open(SKW)
376
+  open(mixtureParams)
377
+  open(SNR)
378
+  ##RS
379
+  iii <- seq(along=pnsa)
380
+
381
+  for (k in i){
382
+    y <- as.matrix(read.celfile(filenames[k], intensity.means.only=TRUE)[["INTENSITY"]][["MEAN"]][fid])
383
+    x <- log2(y[idx2])
384
+    SKW[k] <- mean((x-mean(x))^3)/(sd(x)^3)
385
+    rm(x)
386
+    y <- normalize.quantiles.use.target(y, target=reference)
387
+    A[iii, k] <- intMedianSummaries(y[aIndex, 1, drop=FALSE], pnsa)
388
+    B[iii, k] <- intMedianSummaries(y[bIndex, 1, drop=FALSE], pnsb)
389
+    rm(y)
390
+    
391
+    if(fitMixture){
392
+      S <- (log2(A[idx,k])+log2(B[idx, k]))/2 - SMEDIAN
393
+      M <- log2(A[idx, k])-log2(B[idx, k])
394
+      tmp <- fitAffySnpMixture56(S, M, theKnots, eps=eps)
395
+      mixtureParams[, k] <- tmp[["coef"]]
396
+      SNR[k] <- tmp[["medF1"]]^2/(tmp[["sigma1"]]^2+tmp[["sigma2"]]^2)
397
+    } else {
398
+      mixtureParams[, k] <- NA
399
+      SNR[k] <- NA
400
+    }
401
+  }
402
+  close(A)
403
+  close(B)
404
+  close(SKW)
405
+  close(mixtureParams)
406
+  close(SNR)
407
+  TRUE
408
+}
409
+
410
+crlmmGT2_RS<- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL,
411
+		       col.names=NULL, probs=c(1/3, 1/3, 1/3), DF=6,
412
+		       SNRMin=5, recallMin=10, recallRegMin=1000,
413
+		       gender=NULL, desctrucitve=FALSE, verbose=TRUE,
414
+		       returnParams=FALSE, badSNP=.7){
415
+  open(SNR)
416
+  open(A)
417
+  open(B)
418
+  open(mixtureParams)
419
+  ## expect objects to be ff
420
+  
421
+  keepIndex <- which( SNR[] > SNRMin)
422
+  if(length(keepIndex)==0) stop("No arrays above quality threshold!")
423
+  
424
+  NC <- ncol(A)
425
+  NR <- nrow(A)
426
+  
427
+  pkgname <- getCrlmmAnnotationName(cdfName)
428
+  stopifnot(require(pkgname, character.only=TRUE, quietly=!verbose))
429
+
430
+  if(verbose) message("Loading annotations.")
431
+  loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname)
432
+  loader("mixtureStuff.rda", .crlmmPkgEnv, pkgname)
433
+  ## this is toget rid of the 'no visible binding' notes
434
+  ## variable definitions
435
+  XIndex <- getVarInEnv("XIndex")
436
+  autosomeIndex <- getVarInEnv("autosomeIndex")
437
+  YIndex <- getVarInEnv("YIndex")
438
+  SMEDIAN <- getVarInEnv("SMEDIAN")
439
+  theKnots <- getVarInEnv("theKnots")
440
+  regionInfo <- getVarInEnv("regionInfo")
441
+  params <- getVarInEnv("params")
442
+  ##RS
443
+  pnsa <- getVarInEnv("pnsa")
444
+  NR <- length(pnsa)
445
+  
446
+  ##IF gender not provide, we predict
447
+  if(is.null(gender)){
448
+    if(verbose) message("Determining gender.")
449
+    XMedian <- apply(log2(A[XIndex,, drop=FALSE])+log2(B[XIndex,, drop=FALSE]), 2, median)/2
450
+    if(sum(SNR[] > SNRMin)==1){
451
+      gender <- which.min(c(abs(XMedian-8.9), abs(XMedian-9.5)))
452
+    }else{
453
+      gender <- kmeans(XMedian, c(min(XMedian[SNR[]>SNRMin]), max(XMedian[SNR[]>SNRMin])))[["cluster"]]
454
+    }
455
+  }
456
+  
457
+  Indexes <- list(autosomeIndex, XIndex, YIndex)
458
+  cIndexes <- list(keepIndex, 
459
+                   keepIndex[which(gender[keepIndex]==2)], 
460
+                   keepIndex[which(gender[keepIndex]==1)])
461
+  
462
+  if(verbose) cat("Calling", NR, "SNPs for recalibration... ")
463
+
464
+  ## call C
465
+  fIndex <- which(gender==2)
466
+  mIndex <- which(gender==1)
467
+
468
+  ## different here
469
+  ## use gtypeCallerR in batches
470
+  ##RS
471
+  snpBatches <- splitIndicesByLength(1:NR, ocProbesets())
472
+  newparamsBatch <- vector("list", length(snpBatches))
473
+
474
+  process1 <- function(idxBatch, snpBatches, autosomeIndex, XIndex,
475
+                       YIndex, A, B, mixtureParams, fIndex, mIndex,
476
+                       params, cIndexes, SMEDIAN, theKnots, DF, probs, batchSize){
477
+    open(A)
478
+    open(B)
479
+    open(mixtureParams)
480
+    snps <- snpBatches[[idxBatch]]
481
+    rSnps <- range(snps)
482
+    last <- (idxBatch-1)*batchSize
483
+    IndexesBatch <- list(autosomeIndex[autosomeIndex %in% snps]-last,
484
+                         XIndex[XIndex %in% snps]-last,
485
+                         YIndex[YIndex %in% snps]-last)
486
+    IndexesBatch <- lapply(IndexesBatch, as.integer)
487
+    tmpA <- as.matrix(A[snps,])
488
+    tmpB <- as.matrix(B[snps,])
489
+    ## newparamsBatch[[idxBatch]]
490
+
491
+    tmp <- gtypeCallerR(tmpA, tmpB, fIndex, mIndex,
492
+                        params[["centers"]][snps,],
493
+                        params[["scales"]][snps,],
494
+                        params[["N"]][snps,],
495
+                        IndexesBatch, cIndexes,
496
+                        sapply(IndexesBatch, length),
497
+                        sapply(cIndexes, length), SMEDIAN,
498
+                        theKnots, mixtureParams[], DF, probs, 0.025)
499
+    
500
+    last <- rSnps[2]
501
+    rm(snps, rSnps, IndexesBatch, tmpA, tmpB)
502
+    close(A)
503
+    close(B)
504
+    close(mixtureParams)
505
+    tmp
506
+  }
507
+
508
+  newparamsBatch <- ocLapply(seq(along=snpBatches), process1,
509
+                             snpBatches=snpBatches,
510
+                             autosomeIndex=autosomeIndex, XIndex=XIndex,
511
+                             YIndex=YIndex, A=A, B=B,
512
+                             mixtureParams=mixtureParams, fIndex=fIndex,
513
+                             mIndex=mIndex, params=params,
514
+                             cIndexes=cIndexes, SMEDIAN=SMEDIAN,
515
+                             theKnots=theKnots, DF=DF, probs=probs,
516
+                             batchSize=ocProbesets())
517
+##   last <- 0
518
+##   for (idxBatch in seq(along=snpBatches)){
519
+##     snps <- snpBatches[[idxBatch]]
520
+##     rSnps <- range(snps)
521
+##     IndexesBatch <- list(autosomeIndex[autosomeIndex %in% snps]-last,
522
+##                          XIndex[XIndex %in% snps]-last,
523
+##                          YIndex[YIndex %in% snps]-last)
524
+##     IndexesBatch <- lapply(IndexesBatch, as.integer)
525
+##     tmpA <- A[snps,]
526
+##     tmpB <- B[snps,]
527
+##     newparamsBatch[[idxBatch]] <- gtypeCallerR(tmpA, tmpB, fIndex, mIndex,
528
+##                                                params[["centers"]][snps,],
529
+##                                                params[["scales"]][snps,],
530
+##                                                params[["N"]][snps,],
531
+##                                                IndexesBatch, cIndexes,
532
+##                                                sapply(IndexesBatch, length),
533
+##                                                sapply(cIndexes, length),
534
+##                                                SMEDIAN, theKnots,
535
+##                                                mixtureParams[], DF, probs, 0.025)
536
+##     last <- rSnps[2]
537
+##     rm(snps, rSnps, IndexesBatch, tmpA, tmpB)
538
+##   }
539
+##   rm(last)
540
+  
541
+  newparams <- vector("list", 3)
542
+  names(newparams) <- c("centers", "scales", "N")
543
+  newparams[["centers"]] <- do.call("rbind", lapply(newparamsBatch, "[[", 1))
544
+  newparams[["scales"]] <- do.call("rbind", lapply(newparamsBatch, "[[", 2))
545
+  newparams[["N"]] <- do.call("rbind", lapply(newparamsBatch, "[[", 3))
546
+  rm(newparamsBatch)
547
+  if(verbose) message("Done.")
548
+  if(verbose) message("Estimating recalibration parameters.")
549
+  d <- newparams[["centers"]] - params$centers
550
+
551
+  ##regression 
552
+  Index <- intersect(which(pmin(newparams[["N"]][, 1],
553
+                                newparams[["N"]][, 2],
554
+                                newparams[["N"]][, 3]) > recallMin &
555
+                                !apply(regionInfo, 1, any)),
556
+                                autosomeIndex)
557
+  if(length(Index) < recallRegMin){
558
+    warning("Recalibration not possible. Possible cause: small sample size.")
559
+    newparams <- params
560
+    dev <- vector("numeric", nrow(newparams[["centers"]]))
561
+    SS <- matrix(Inf, 3, 3)
562
+    DD <- 0
563
+  }else{
564
+    data4reg <- as.data.frame(newparams[["centers"]][Index,])
565
+    names(data4reg) <- c("AA", "AB", "BB")
566
+    regParams <- cbind(  coef(lm(AA~AB*BB, data=data4reg)),
567
+                       c(coef(lm(AB~AA+BB, data=data4reg)), 0), 
568
+                       coef(lm(BB~AA*AB, data=data4reg)))
569
+    rownames(regParams) <- c("intercept", "X", "Y", "XY")
570
+    rm(data4reg)
571
+  
572
+    minN <- 3
573
+    newparams[["centers"]][newparams[["N"]] < minN] <- NA
574
+    Index <- setdiff(which(rowSums(is.na(newparams[["centers"]]))==1), YIndex)
575
+    if(verbose) cat("Filling out empty centers")
576
+    for(i in Index){
577
+      if(verbose) if(i%%10000==0)cat(".")
578
+      mu <- newparams[["centers"]][i, ]
579
+      j <- which(is.na(mu))
580
+      newparams[["centers"]][i, j] <- c(1, mu[-j], prod(mu[-j]))%*%regParams[, j]
581
+    }
582
+    
583
+    ##remaing NAs are made like originals
584
+    if(length(YIndex)>0){
585
+      noMoveIndex <- union(setdiff(which(rowSums(is.na(newparams[["centers"]]))>0), YIndex), 
586
+                           YIndex[rowSums(is.na(newparams[["centers"]][YIndex, ])>1)])
587
+    }
588
+    snps2ignore <- which(rowSums(is.na(newparams[["centers"]])) > 0)
589
+    snps2keep <- setdiff(autosomeIndex, snps2ignore)
590
+    rm(snps2ignore)
591
+    newparams[["centers"]][is.na(newparams[["centers"]])] <- params[["centers"]][is.na(newparams[["centers"]])]
592
+    if(verbose) cat("\n")
593
+  
594
+    if(verbose) message("Calculating and standardizing size of shift.")
595
+    GG <- DD <- newparams[["centers"]] - params[["centers"]]
596
+    DD <- sweep(DD, 2, colMeans(DD[autosomeIndex, ]))
597
+    SS <- cov(DD[autosomeIndex, ])
598
+    SSI <- solve(SS)
599
+    dev <- vector("numeric", nrow(DD))
600
+    if(length(YIndex)){
601
+      dev[-YIndex] <- apply(DD[-YIndex, ], 1, function(x) x%*%SSI%*%x)
602
+      dev[-YIndex] <- 1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev[-YIndex])
603
+      ##Now Y (only two params)
604
+      SSY <- SS[c(1, 3), c(1, 3)]
605
+      SSI <- solve(SSY) 
606
+      dev[YIndex] <- apply(DD[YIndex, c(1, 3)], 1, function(x) x%*%SSI%*%x)
607
+      dev[YIndex] <- 1/sqrt( (2*pi)^2*det(SSY))*exp(-0.5*dev[YIndex])
608
+    } else {
609
+      dev=apply(DD,1,function(x) x%*%SSI%*%x)
610
+      dev=1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev)
611
+    }
612
+  }
613
+    
614
+  ## BC: must keep SD
615
+  params[-2] <- newparams[-2]
616
+  
617
+  rm(newparams);gc(verbose=FALSE)  
618
+  if(verbose) cat("Calling", NR, "SNPs... ")
619
+  ## ###################
620
+  ## ## MOVE TO C#######
621
+
622
+  ## running in batches
623
+  ## snpBatches <- splitIndicesByLength(1:nrow(A), ocProbesets())
624
+
625
+  process2 <- function(idxBatch, snpBatches, autosomeIndex, XIndex,
626
+                       YIndex, A, B, mixtureParams, fIndex, mIndex,
627
+                       params, cIndexes, SMEDIAN, theKnots, DF, probs,
628
+                       regionInfo, batchSize){
629
+    open(A)
630
+    open(B)
631
+    open(mixtureParams)
632
+    snps <- snpBatches[[idxBatch]]
633
+    tmpA <- as.matrix(A[snps,])
634
+    tmpB <- as.matrix(B[snps,])
635
+    rSnps <- range(snps)
636
+    last <- (idxBatch-1)*batchSize
637
+    IndexesBatch <- list(autosomeIndex[autosomeIndex %in% snps]-last,
638
+                         XIndex[XIndex %in% snps]-last,
639
+                         YIndex[YIndex %in% snps]-last)
640
+    IndexesBatch <- lapply(IndexesBatch, as.integer)
641
+    ImNull <- gtypeCallerR2(tmpA, tmpB, fIndex, mIndex,
642
+                            params[["centers"]][snps,],
643
+                            params[["scales"]][snps,],
644
+                            params[["N"]][snps,],
645
+                            IndexesBatch, cIndexes,
646
+                            sapply(IndexesBatch, length),
647
+                            sapply(cIndexes, length),
648
+                            SMEDIAN, theKnots, mixtureParams[],
649
+                            DF, probs, 0.025,
650
+                            which(regionInfo[snps, 2]),
651
+                            which(regionInfo[snps, 1]))
652
+    A[snps,] <- tmpA
653
+    B[snps,] <- tmpB
654
+    last <- rSnps[2]
655
+    rm(tmpA, tmpB, snps, rSnps, IndexesBatch, ImNull)
656
+    close(A)
657
+    close(B)
658
+    close(mixtureParams)
659
+  }
660
+
661
+  ocLapply(seq(along=snpBatches), process2, snpBatches=snpBatches,
662
+           autosomeIndex=autosomeIndex, XIndex=XIndex, YIndex=YIndex,
663
+           A=A, B=B, mixtureParams=mixtureParams, fIndex=fIndex,
664
+           mIndex=mIndex, params=params, cIndexes=cIndexes,
665
+           SMEDIAN=SMEDIAN, theKnots=theKnots, DF=DF, probs=probs,
666
+           regionInfo=regionInfo, batchSize=ocProbesets())
667
+  
668
+##   last <- 0
669
+##   for (idxBatch in seq(along=snpBatches)){
670
+##     snps <- snpBatches[[idxBatch]]
671
+##     tmpA <- A[snps,]
672
+##     tmpB <- B[snps,]
673
+##     rSnps <- range(snps)
674
+##     IndexesBatch <- list(autosomeIndex[autosomeIndex %in% snps]-last,
675
+##                          XIndex[XIndex %in% snps]-last,
676
+##                          YIndex[YIndex %in% snps]-last)
677
+##     IndexesBatch <- lapply(IndexesBatch, as.integer)
678
+##     ImNull <- gtypeCallerR2(tmpA, tmpB, fIndex, mIndex,
679
+##                             params[["centers"]][snps,],
680
+##                             params[["scales"]][snps,],
681
+##                             params[["N"]][snps,],
682
+##                             IndexesBatch, cIndexes,
683
+##                             sapply(IndexesBatch, length),
684
+##                             sapply(cIndexes, length),
685
+##                             SMEDIAN, theKnots, mixtureParams[],
686
+##                             DF, probs, 0.025,
687
+##                             which(regionInfo[snps, 2]),
688
+##                             which(regionInfo[snps, 1]))
689
+##     A[snps,] <- tmpA
690
+##     B[snps,] <- tmpB
691
+##     last <- rSnps[2]
692
+##     rm(tmpA, tmpB, snps, rSnps, IndexesBatch, ImNull)
693
+##   }
694
+##   
695
+##   gc(verbose=FALSE)
696
+  ##  END MOVE TO C#######
697
+  ## ##################
698
+  
699
+  dev <- dev/(dev+1/383)
700
+  if(!is.null(row.names)){ rownames(A) <- rownames(B) <- names(dev) <- row.names}
701
+  if(!is.null(col.names)){ colnames(A) <- colnames(B) <- col.names}
702
+
703
+  if(length(Index) >= recallRegMin){
704
+   tmp4batchQC <- DD[autosomeIndex,]/(params[["N"]][autosomeIndex,]+1)
705
+   tmpSnpQc <- dev[autosomeIndex]
706
+   SS <- cov(tmp4batchQC[tmpSnpQc < badSNP,])
707
+   batchQC <- mean(diag(SS))
708
+  }else{
709
+    SS <- matrix(0, 3, 3)
710
+    batchQC <- Inf
711
+  }
712
+  
713
+  if(verbose) message("Done.")
714
+  if (returnParams){
715
+    return(list(calls=A, confs=B, SNPQC=dev, batchQC=batchQC, params=params, DD=DD, covDD=SS, gender=gender, pkgname=pkgname))
716
+  }else{
717
+    return(list(calls=A, confs=B, SNPQC=dev, batchQC=batchQC, DD=DD, covDD=SS, gender=gender, pkgname=pkgname))
718
+  }
719
+}
720
+
721
+
178 722
 ##---------------------------------------------------------------------------
179 723
 ##---------------------------------------------------------------------------
180 724
 ## For Illumina
... ...
@@ -252,6 +796,7 @@ constructRG <- function(filenames, cdfName, sns, verbose, fileExt, sep, sampleSh
252 796
 }
253 797
 crlmmIlluminaRS <- function(sampleSheet=NULL,
254 798
 			    arrayNames=NULL,
799
+			    batch,
255 800
 			    ids=NULL,
256 801
 			    path=".",
257 802
 			    arrayInfoColNames=list(barcode="SentrixBarcode_A", position="SentrixPosition_A"),
... ...
@@ -266,10 +811,18 @@ crlmmIlluminaRS <- function(sampleSheet=NULL,
266 811
 			    seed=1, save.ab=FALSE, snpFile, cnFile,
267 812
 			    mixtureSampleSize=10^5, eps=0.1, verbose=TRUE,
268 813
 			    cdfName, sns, recallMin=10, recallRegMin=1000,
269
-			    returnParams=FALSE, badSNP=.7) {
814
+			    returnParams=FALSE, badSNP=.7,
815
+			    copynumber=FALSE,
816
+			    load.it=TRUE) {
270 817
 	if(missing(cdfName)) stop("must specify cdfName")
271 818
 	if(!isValidCdfName(cdfName)) stop("cdfName not valid.  see validCdfNames")
272 819
 	if(missing(sns)) sns <- basename(arrayNames)
820
+	if(missing(batch)){
821
+		warning("The batch variable is not specified. The scan date of the array will be used as a surrogate for batch.  The batch variable does not affect the preprocessing or genotyping, but is important for copy number estimation.")
822
+	} else {
823
+		if(length(batch) != length(sns))
824
+			stop("batch variable must be the same length as the filenames")
825
+	}	
273 826
 	batches <- splitIndicesByLength(seq(along=arrayNames), ocSamples())
274 827
 	k <- 1
275 828
 	for(j in batches){
... ...
@@ -300,13 +853,17 @@ crlmmIlluminaRS <- function(sampleSheet=NULL,
300 853
 		##  Here, I'm just using the # of rows returned from the above function
301 854
 		if(k == 1){
302 855
 			if(verbose) message("Initializing container for alleleA, alleleB, call, callProbability")
303
-			callSet <- new("SnpSuperSet",
304
-				       alleleA=initializeBigMatrix(name="A", nr=nrow(res[[1]]), nc=length(sns)),
305
-				       alleleB=initializeBigMatrix(name="B", nr=nrow(res[[1]]), nc=length(sns)),
306
-				       call=initializeBigMatrix(name="call", nr=nrow(res[[1]]), nc=length(sns)),
307
-				       callProbability=initializeBigMatrix(name="callPr", nr=nrow(res[[1]]), nc=length(sns)),
308
-				       annotation=cdfName)
309
-			sampleNames(callSet) <- sns
856
+			load.obj <- loadObject("callSet", load.it)
857
+			if(!load.obj){
858
+				callSet <- new("SnpSuperSet",
859
+					       alleleA=initializeBigMatrix(name="A", nr=nrow(res[[1]]), nc=length(sns)),
860
+					       alleleB=initializeBigMatrix(name="B", nr=nrow(res[[1]]), nc=length(sns)),
861
+					       call=initializeBigMatrix(name="call", nr=nrow(res[[1]]), nc=length(sns)),
862
+					       callProbability=initializeBigMatrix(name="callPr", nr=nrow(res[[1]]), nc=length(sns)),
863
+					       annotation=cdfName)
864
+				sampleNames(callSet) <- sns
865
+				save(callSet, file=file.path(ldPath(), "callSet.rda"))
866
+			} else load(file.path(ldPath(), "callSet.rda"))
310 867
 			phenoData(callSet) <- getPhenoData(sampleSheet=sampleSheet,
311 868
 							   arrayNames=sns,
312 869
 							   arrayInfoColNames=arrayInfoColNames)
... ...
@@ -315,9 +872,21 @@ crlmmIlluminaRS <- function(sampleSheet=NULL,
315 872
 			protocolData(callSet) <- new("AnnotatedDataFrame", data=pD)
316 873
 			pData(protocolData(callSet))[j, ] <- pData(protocolData)
317 874
 			featureNames(callSet) <- res[["gns"]]
318
-			pData(callSet)$SKW <- rep(NA, length(sns))
319
-			pData(callSet)$SNR <- rep(NA, length(sns))
875
+			pData(callSet)$SNR <- initializeBigVector("crlmmSNR-", length(sns), "double")
876
+			pData(callSet)$SKW <- initializeBigVector("crlmmSKW-", length(sns), "double")
877
+			##pData(callSet)$SKW <- rep(NA, length(sns))
878
+			##pData(callSet)$SNR <- rep(NA, length(sns))
320 879
 			pData(callSet)$gender <- rep(NA, length(sns))
880
+			mixtureParams <- initializeBigMatrix("crlmmMixt-", nr=4, nc=ncol(callSet), vmode="double")
881
+			if(missing(batch)){
882
+				protocolData(callSet)$batch <- rep(NA, length(sns))
883
+			} else{
884
+				protocolData(callSet)$batch <- batch
885
+			}
886
+			featureData(callSet) <- addFeatureAnnotation(callSet)
887
+			open(mixtureParams)
888
+			open(callSet$SNR)
889
+			open(callSet$SKW)
321 890
 		}
322 891
 		if(k > 1 & nrow(res[[1]]) != nrow(callSet)){
323 892
 			##RS: I don't understand why the IDATS for the
... ...
@@ -325,40 +894,61 @@ crlmmIlluminaRS <- function(sampleSheet=NULL,
325 894
 			res[["A"]] <- res[["A"]][res$gns %in% featureNames(callSet), ]
326 895
 			res[["B"]] <- res[["B"]][res$gns %in% featureNames(callSet), ]
327 896
 		}
897
+		if(missing(batch)){
898
+			protocolData(callSet)$batch[j] <- as.numeric(as.factor(protocolData$ScanDate))
899
+		}
328 900
 		## MR: we need to define a snp.index vs np.index
329 901
 		snp.index <- match(res$gns, featureNames(callSet))		
330
-		suppressWarnings(A(callSet)[snp.index, j] <- res[["A"]])
331
-		suppressWarnings(B(callSet)[snp.index, j] <- res[["B"]])
902
+		A(callSet)[snp.index, j] <- res[["A"]]
903
+		B(callSet)[snp.index, j] <- res[["B"]]
332 904
 		pData(callSet)$SKW[j] <- res$SKW
333 905
 		pData(callSet)$SNR[j] <- res$SNR
334
-		mixtureParams <- res$mixtureParams
906
+		mixtureParams[, j] <- res$mixtureParams
335 907
 		rm(res); gc()
336
-		##MR:  edit snp.index
337
-		##snp.index <- 1:nrow(callSet)
338
-		tmp <- crlmmGT(A=as.matrix(A(callSet)[, j]),
339
-			       B=as.matrix(B(callSet)[, j]),
340
-			       SNR=callSet$SNR[j],
341
-			       mixtureParams=mixtureParams,
342
-			       cdfName=annotation(callSet),
343
-			       row.names=featureNames(callSet),
344
-			       col.names=sampleNames(callSet)[j],
345
-			       probs=probs,
346
-			       DF=DF,
347
-			       SNRMin=SNRMin,
348
-			       recallMin=recallMin,
349
-			       recallRegMin=recallRegMin,
350
-			       gender=gender,
351
-			       verbose=verbose,
352
-			       returnParams=returnParams,
353
-			       badSNP=badSNP)
354
-		suppressWarnings(snpCall(callSet)[, j] <- tmp[["calls"]])
355
-		## MR: many zeros in the conf. scores (?)
356
-		suppressWarnings(snpCallProbability(callSet)[, j] <- tmp[["confs"]])
357
-		callSet$gender[j] <- tmp$gender
358
-		rm(tmp); gc()
359 908
 		k <- k+1
360 909
 	}
361
-	return(callSet)
910
+	tmp <- crlmmGT2(A=A(callSet),
911
+			B=B(callSet),
912
+			SNR=callSet$SNR,
913
+			mixtureParams=mixtureParams,
914
+			cdfName=annotation(callSet),
915
+			row.names=featureNames(callSet),
916
+			col.names=sampleNames(callSet),
917
+			probs=probs,
918
+			DF=DF,
919
+			SNRMin=SNRMin,
920
+			recallMin=recallMin,
921
+			recallRegMin=recallRegMin,
922
+			gender=gender,
923
+			verbose=verbose,
924
+			returnParams=returnParams,
925
+			badSNP=badSNP)
926
+	open(tmp[["calls"]])
927
+	open(tmp[["confs"]])
928
+	snpCall(callSet) <- tmp[["calls"]]
929
+	## MR: many zeros in the conf. scores (?)
930
+	snpCallProbability(callSet) <- tmp[["confs"]]
931
+	callSet$gender <- tmp$gender
932
+	if(copynumber){
933
+		load.obj <- loadObject("cnSet", load.it)
934
+		if(!load.obj){
935
+			cnSet <- as(callSet, "CNSetLM")
936
+		} else {
937
+			load(file.path(ldPath(), "cnSet.rda"))
938
+			A(cnSet) <- A(callSet)
939
+			B(cnSet) <- B(callSet)
940
+			snpCall(cnSet) <- snpCall(callSet)
941
+			snpCallProbability(cnSet) <- snpCallProbability(callSet)
942
+			annotation(cnSet) <- annotation(callSet)
943
+			featureData(cnSet) <- featureData(callSet)
944
+			protocolData(cnSet) <- protocolData(callSet)
945
+			phenoData(cnSet) <- phenoData(callSet)
946
+			experimentData(cnSet) <- experimentData(callSet)
947
+		}
948
+	}
949
+	close(mixtureParams)
950
+	rm(tmp); gc()
951
+	return(cnSet)
362 952
 }
363 953
 ##---------------------------------------------------------------------------
364 954
 ##---------------------------------------------------------------------------
... ...
@@ -377,6 +967,29 @@ rowMAD <- function(x, y, ...){
377 967
 	return(mad)
378 968
 }
379 969
 
970
+shrink <- function(x, Ns, DF.PRIOR){
971
+	DF <- Ns-1
972
+	DF[DF < 1] <- 1
973
+	x.0 <- apply(x, 2, median, na.rm=TRUE)
974
+	x <- (x*DF + x.0*DF.PRIOR)/(DF.PRIOR + DF)
975
+	for(j in 1:ncol(x)) x[is.na(x[, j]), j] <- x.0[j]
976
+	return(x)
977
+}
978
+
979
+applyByGenotype <- function(x, FUN, G){
980
+	FUN <- match.fun(FUN)
981
+	tmp <- matrix(NA, nrow(x), 3)
982
+	for(j in 1:3){
983
+		GT <- G==j
984
+		GT[GT == FALSE] <- NA
985
+		gt.x <- GT*x
986
+		tmp[, j] <- FUN(gt.x, na.rm=TRUE)
987
+	}
988
+	tmp
989
+}
990
+
991
+
992
+
380 993
 rowCors <- function(x, y, ...){
381 994
 	N <- rowSums(!is.na(x))
382 995
 	x <- suppressWarnings(log2(x))
... ...
@@ -387,6 +1000,15 @@ rowCors <- function(x, y, ...){
387 1000
 	return(covar/(sd.x*sd.y))
388 1001
 }
389 1002
 
1003
+corByGenotype <- function(A, B, G, Ns, which.cluster=c(1,2,3)[1], DF.PRIOR){
1004
+	x <- A * (G == which.cluster)
1005
+	x[x==0] <- NA
1006
+	y <- B * (G == which.cluster)
1007
+	res <- as.matrix(rowCors(x, y, na.rm=TRUE))
1008
+	cors <- shrink(res, Ns[, which.cluster], DF.PRIOR)
1009
+	cors
1010
+}
1011
+
390 1012
 generateX <- function(w, X) as.numeric(diag(w) %*% X)
391 1013
 generateIXTX <- function(x, nrow=3) {
392 1014
 	X <- matrix(x, nrow=nrow)
... ...
@@ -394,6 +1016,61 @@ generateIXTX <- function(x, nrow=3) {
394 1016
 	solve(XTX)
395 1017
 }
396 1018
 
1019
+nuphiAllele2 <- function(allele, Ystar, W, Ns){
1020
+	complete <- rowSums(is.na(W)) == 0 
1021
+	if(any(!is.finite(W))){## | any(!is.finite(V))){
1022
+		i <- which(rowSums(!is.finite(W)) > 0)
1023
+		stop("Possible zeros in the within-genotype estimates of the spread (vA, vB). ")
1024
+	}
1025
+	NOHET <- mean(Ns[, 2], na.rm=TRUE) < 0.05
1026
+	if(missing(allele)) stop("must specify allele")
1027
+	if(allele == "A") X <- cbind(1, 2:0) else X <- cbind(1, 0:2)
1028
+	if(NOHET) X <- X[-2, ] ##more than 1 X chromosome, but all homozygous		
1029
+	##How to quickly generate Xstar, Xstar = diag(W) %*% X
1030
+	Xstar <- apply(W, 1, generateX, X)
1031
+	IXTX <- apply(Xstar, 2, generateIXTX, nrow=nrow(X))
1032
+	betahat <- matrix(NA, 2, nrow(Ystar))
1033
+	ses <- matrix(NA, 2, nrow(Ystar))
1034
+	for(i in 1:nrow(Ystar)){
1035
+		betahat[, i] <- crossprod(matrix(IXTX[, i], ncol(X), ncol(X)), crossprod(matrix(Xstar[, i], nrow=nrow(X)), Ystar[i, ]))
1036
+		ssr <- sum((Ystar[i, ] - matrix(Xstar[, i], nrow(X), ncol(X)) %*% matrix(betahat[, i], ncol(X), 1))^2)
1037
+		ses[, i] <- sqrt(diag(matrix(IXTX[, i], ncol(X), ncol(X)) * ssr))
1038
+	}
1039
+	nu <- betahat[1, ]
1040
+	phi <- betahat[2, ]
1041
+	return(list(nu, phi))
1042
+}
1043
+
1044
+nuphiAlleleX <- function(allele, Ystar, W, Ns, chrX=FALSE){
1045
+	complete <- rowSums(is.na(W)) == 0 
1046
+	if(any(!is.finite(W))){## | any(!is.finite(V))){
1047
+		i <- which(rowSums(!is.finite(W)) > 0)
1048
+		stop("Possible zeros in the within-genotype estimates of the spread (vA, vB). ")
1049
+	}
1050
+	##NOHET <- mean(Ns[, 2], na.rm=TRUE) < 0.05
1051
+	if(missing(allele)) stop("must specify allele")
1052
+	if(allele == "A") X <- cbind(1, c(1, 0, 2, 1, 0), c(0, 1, 0, 1, 2))
1053
+	if(allele == "B") X <- cbind(1, c(0, 1, 0, 1, 2), c(1, 0, 2, 1, 0))	
1054
+	##if(NOHET) X <- X[-2, ] ##more than 1 X chromosome, but all homozygous		
1055
+	##How to quickly generate Xstar, Xstar = diag(W) %*% X
1056
+	Xstar <- apply(W, 1, generateX, X)
1057
+	IXTX <- apply(Xstar, 2, generateIXTX, nrow=nrow(X))
1058
+
1059
+	betahat <- matrix(NA, 3, nrow(Ystar))
1060
+	ses <- matrix(NA, 3, nrow(Ystar))			
1061
+	##betahat <- matrix(NA, 2, nrow(Ystar))
1062
+	##ses <- matrix(NA, 2, nrow(Ystar))
1063
+	for(i in 1:nrow(Ystar)){
1064
+		betahat[, i] <- crossprod(matrix(IXTX[, i], ncol(X), ncol(X)), crossprod(matrix(Xstar[, i], nrow=nrow(X)), Ystar[i, ]))
1065
+		ssr <- sum((Ystar[i, ] - matrix(Xstar[, i], nrow(X), ncol(X)) %*% matrix(betahat[, i], ncol(X), 1))^2)
1066
+		ses[, i] <- sqrt(diag(matrix(IXTX[, i], ncol(X), ncol(X)) * ssr))
1067
+	}
1068
+	nu <- betahat[1, ]
1069
+	phi <- betahat[2, ]
1070
+	phi2 <- betahat[3, ]
1071
+	return(list(nu, phi, phi2))
1072
+}
1073
+
397 1074
 
398 1075
 nuphiAllele <- function(object, allele, Ystar, W, tmp.objects, cnOptions){
399 1076
 	I <- isSnp(object)
... ...
@@ -410,7 +1087,8 @@ nuphiAllele <- function(object, allele, Ystar, W, tmp.objects, cnOptions){
410 1087
 	Ns <- Ns[I, ]
411 1088
 	
412 1089
 	CHR <- unique(chromosome(object))
413
-	batch <- unique(object$batch)
1090
+	##batch <- unique(object$batch)
1091
+	batch <- unique(batch(object))
414 1092
 	nuA <- getParam(object, "nuA", batch)
415 1093
 	nuB <- getParam(object, "nuB", batch)
416 1094
 	nuA.se <- getParam(object, "nuA.se", batch)
... ...
@@ -518,57 +1196,6 @@ predictGender <- function(res, cdfName="genomewidesnp6", SNRMin=5){
518 1196
 	return(gender)
519 1197
 }
520 1198
 
521
-##combineIntensities <- function(res, NP=NULL, callSet){
522
-##	rownames(res$B) <- rownames(res$A) <- res$gns
523
-##	colnames(res$B) <- colnames(res$A) <- res$sns
524
-##	if(!is.null(NP)){
525
-##		blank <- matrix(NA, nrow(NP), ncol(NP))
526
-##		dimnames(blank) <- dimnames(NP)
527
-##		A <- rbind(res$A, NP)
528
-##		B <- rbind(res$B, blank)
529
-##	} else {
530
-##		A <- res$A
531
-##		B <- res$B
532
-##	}
533
-##	dimnames(B) <- dimnames(A)
534
-##	index.snps <- match(res$gns, rownames(A))
535
-##	callsConfs <- calls <- matrix(NA, nrow(A), ncol(A), dimnames=dimnames(A))
536
-##	
537
-##	calls[index.snps, ] <- calls(callSet)
538
-##	callsConfs[index.snps, ] <- assayData(callSet)[["callProbability"]]
539
-##	fd <- data.frame(matrix(NA, nrow(calls), length(fvarLabels(callSet))))
540
-##	fd[index.snps, ] <- fData(callSet)
541
-##	rownames(fd) <- rownames(A)
542
-##	colnames(fd) <- fvarLabels(callSet)
543
-##	fD <- new("AnnotatedDataFrame",
544
-##		  data=data.frame(fd),
545
-##		  varMetadata=data.frame(labelDescription=colnames(fd), row.names=colnames(fd)))
546
-##	superSet <- new("CNSet",
547
-##			CA=matrix(NA, nrow(A), ncol(A), dimnames=dimnames(A)),
548
-##			CB=matrix(NA, nrow(A), ncol(A), dimnames=dimnames(A)),
549
-##			alleleA=A,
550
-##			alleleB=B,
551
-##			call=calls,
552
-##			callProbability=callsConfs,
553
-##			featureData=fD,
554
-##			phenoData=phenoData(callSet),
555
-##			experimentData=experimentData(callSet),
556
-##			protocolData=protocolData(callSet),
557
-##			annotation=annotation(callSet))
558
-##	return(superSet)
559
-##}
560
-##
561
-harmonizeDimnamesTo <- function(object1, object2){
562
-	#object2 should be a subset of object 1
563
-	object2 <- object2[featureNames(object2) %in% featureNames(object1), ]
564
-	object1 <- object1[match(featureNames(object2), featureNames(object1)), ]
565
-	object1 <- object1[, match(sampleNames(object2), sampleNames(object1))]
566
-	stopifnot(all.equal(featureNames(object1), featureNames(object2)))
567
-	stopifnot(all.equal(sampleNames(object1), sampleNames(object2)))
568
-	return(object1)
569
-}
570
-
571
-
572 1199
 
573 1200
 crlmmCopynumber <- function(object,
574 1201
 			    chromosome=1:23,
... ...
@@ -588,12 +1215,22 @@ crlmmCopynumber <- function(object,
588 1215
 			    MIN.PHI=2^3,
589 1216
 			    THR.NU.PHI=TRUE,
590 1217
 			    thresholdCopynumber=TRUE){
591
-	stopifnot("batch" %in% varLabels(object))
1218
+	if(isPackageLoaded("ff")){
1219
+		open(object)
1220
+		open(object$SKW)
1221
+		open(object$SNR)
1222
+	}
1223
+	stopifnot("batch" %in% varLabels(protocolData(object)))
592 1224
 	stopifnot("chromosome" %in% fvarLabels(object))
593 1225
 	stopifnot("position" %in% fvarLabels(object))
594 1226
 	stopifnot("isSnp" %in% fvarLabels(object))
595
-	batch <- object$batch
596
-	batches <- split((1:ncol(object))[object$SNR > SNRMin], batch[object$SNR > SNRMin])
1227
+	##batch <- object$batch
1228
+	batch <- batch(object)
1229
+	if(isPackageLoaded("ff")) {
1230
+		open(object$SNR)
1231
+		SNR <- object$SNR[, ]
1232
+	} else SNR <-  object$SNR
1233
+	batches <- split((1:ncol(object))[SNR > SNRMin], batch[SNR > SNRMin])
597 1234
 	if(any(sapply(batches, length) < MIN.SAMPLES)) message("Excluding batches with fewer than ", MIN.SAMPLES, " samples")
598 1235
 	batches <- batches[sapply(batches, length) >= MIN.SAMPLES]
599 1236
 	if(missing(which.batches)) which.batches <- seq(along=batches)
... ...
@@ -618,7 +1255,7 @@ crlmmCopynumber <- function(object,
618 1255
 				   annotation=annotation(object))
619 1256
 			featureData(tmp) <- addFeatureAnnotation(tmp)
620 1257
 			featureData(tmp) <- lm.parameters(tmp, batch=unique(batch[j]))
621
-			tmp$batch <- batch[j]
1258
+			tmp$batch <- batch(object)[j]
622 1259
 			tmp <- computeCopynumber(tmp,
623 1260
 						 MIN.OBS=MIN.OBS,
624 1261
 						 DF.PRIOR=DF.PRIOR,
... ...
@@ -640,10 +1277,19 @@ crlmmCopynumber <- function(object,
640 1277
 					  dimnames=list(featureNames(tmp), sampleNames(tmp)))
641 1278
 			CA(object)[row.index, j] <- CA(tmp)
642 1279
 			CB(object)[row.index, j] <- CB(tmp)
643
-			labels.asis <- fvarLabels(tmp)
644
-			labels.asis <- gsub(paste("_", unique(tmp$batch), sep=""), paste(".", ii, sep=""), labels.asis)
645
-			k <- match(labels.asis, colnames(lM(object)))
646
-			lM(object)[row.index, k] <- fData(tmp)
1280
+			##ad-hocery
1281
+			batchName <- unique(batch(object)[j])
1282
+			fvarLabels(tmp)[15:17] <- paste(c("corrAB", "corrBB", "corrAA"), batchName, sep=".")
1283
+			fvarLabels(tmp)[13:14] <- paste(c("phiPrimeA", "phiPrimeB"), batchName, sep=".")
1284
+			fvarLabels(tmp) <- gsub("_", ".", fvarLabels(tmp))
1285
+			##fvarLabels(tmp) <- gsub("\\.[1-9]", "", fvarLabels(tmp))
1286
+			fData(tmp) <- fData(tmp)[, fvarLabels(tmp) %in% names(physical(lM(object)))]
1287
+			jj <- match(fvarLabels(tmp), names(lM(object)))
1288
+			lM(object)[row.index, jj] <- fData(tmp)
1289
+			##labels.asis <- fvarLabels(tmp)
1290
+			##labels.asis <- gsub(paste("_", unique(tmp$batch), sep=""), paste(".", ii, sep=""), labels.asis)
1291
+			##k <- match(labels.asis, colnames(lM(object)))
1292
+			##lM(object)[row.index, k] <- fData(tmp)
647 1293
 			rm(tmp); gc()
648 1294
 			ii <- ii+1
649 1295
 		}
... ...
@@ -651,6 +1297,705 @@ crlmmCopynumber <- function(object,
651 1297
 	return(object)
652 1298
 }
653 1299
 
1300
+crlmmCopynumber2 <- function(object,
1301
+			    which.batches,
1302
+			    MIN.SAMPLES=10,
1303
+			    SNRMin=5,
1304
+			    MIN.OBS=1,
1305
+			    DF.PRIOR=50,
1306
+			    bias.adj=FALSE,
1307
+			    prior.prob=rep(1/4,4),
1308
+			    seed=1,
1309
+			    verbose=TRUE,
1310
+			    GT.CONF.THR=0.99,
1311
+			    PHI.THR=2^6,
1312
+			    nHOM.THR=5,
1313
+			    MIN.NU=2^3,
1314
+			    MIN.PHI=2^3,
1315
+			    THR.NU.PHI=TRUE,
1316
+			    thresholdCopynumber=TRUE,
1317
+			     load.it=TRUE){
1318
+	stopifnot("batch" %in% varLabels(protocolData(object)))
1319
+	stopifnot("chromosome" %in% fvarLabels(object))
1320
+	stopifnot("position" %in% fvarLabels(object))
1321
+	stopifnot("isSnp" %in% fvarLabels(object))
1322
+	batch <- batch(object)
1323
+	XIndex.snps <- (1:nrow(object))[chromosome(object) == 23 & isSnp(object) & !is.na(chromosome(object))]
1324
+	##YIndex.snps <- (1:nrow(object))[chromosome(object) == 24 & isSnp(object)]
1325
+	XIndex.nps <- (1:nrow(object))[chromosome(object) == 23 & !isSnp(object) & !is.na(chromosome(object))]
1326
+	autosomeIndex.snps <- (1:nrow(object))[chromosome(object) < 23 & isSnp(object) & !is.na(chromosome(object))]
1327
+	autosomeIndex.nps <- (1:nrow(object))[chromosome(object) < 23 & !isSnp(object) & !is.na(chromosome(object))]	
1328
+	##Indexes <- list(autosomeIndex, XIndex, YIndex)
1329
+	##snpBatches <- splitIndicesByLength(1:nrow(object), ocProbesets())
1330
+	snpBatches <- splitIndicesByLength(autosomeIndex.snps, ocProbesets())
1331
+	## Do chromosome X in batches
1332
+	Ns <- initializeBigMatrix("Ns", nrow(object), 5)
1333
+	colnames(Ns) <- c("A", "B", "AA", "AB", "BB")
1334
+	if(!file.exists(file.path(ldPath(), "normal.rda"))){
1335
+		normal <- initializeBigMatrix("normal", nrow(object), ncol(object), vmode="integer")
1336
+		normal[,] <- 1L
1337
+		save(normal, file=file.path(ldPath(), "normal.rda"))
1338
+	} else load(file.path(ldPath(), "normal.rda"))
1339
+	if(!file.exists(file.path(ldPath(), "snpflags.rda"))){
1340
+		snpflags <- initializeBigMatrix("snpflags", nrow(object), length(unique(batch(object))), vmode="integer")
1341
+		snpflags[,] <- 0L
1342
+		save(snpflags, file=file.path(ldPath(), "snpflags.rda"))
1343
+	} else{
1344
+		load(file.path(ldPath(), "snpflags.rda"))
1345
+				
1346
+	} 
1347
+	if(verbose) message("Estimating allele-specific copy number at autosomal SNPs")
1348
+	ocLapply(seq(along=snpBatches),
1349
+			fit.lm1,
1350
+			autosomeIndex=autosomeIndex.snps,
1351
+			object=object,
1352
+			Ns=Ns,
1353
+			normal=normal,
1354
+			snpflags=snpflags,
1355
+			snpBatches=snpBatches,
1356
+			batchSize=ocProbesets(),
1357
+			SNRMin=SNRMin,
1358
+			MIN.SAMPLES=MIN.SAMPLES,
1359
+			MIN.OBS=MIN.OBS,
1360
+			DF=DF.PRIOR,
1361
+			GT.CONF.THR=GT.CONF.THR,
1362
+			THR.NU.PHI=THR.NU.PHI,
1363
+			MIN.NU=MIN.NU,
1364
+			MIN.PHI=MIN.PHI,
1365
+			verbose=verbose,
1366
+			neededPkgs="crlmm")
1367
+	## autosomal NPs
1368
+	snpBatches <- splitIndicesByLength(autosomeIndex.nps, ocProbesets())
1369
+	if(verbose) message("Estimating total copy number at nonpolymorphic loci")
1370
+	ocLapply(seq(along=snpBatches),
1371
+			fit.lm2,
1372
+			autosomeIndex=autosomeIndex.nps,
1373
+			object=object,
1374
+			Ns=Ns,
1375
+			normal=normal,
1376
+			snpflags=snpflags,
1377
+			snpBatches=snpBatches,
1378
+			batchSize=ocProbesets(),
1379
+			SNRMin=SNRMin,
1380
+			MIN.SAMPLES=MIN.SAMPLES,
1381
+			MIN.OBS=MIN.OBS,
1382
+			DF=DF.PRIOR,
1383
+			GT.CONF.THR=GT.CONF.THR,
1384
+			THR.NU.PHI=THR.NU.PHI,
1385
+			MIN.NU=MIN.NU,
1386
+			MIN.PHI=MIN.PHI,
1387
+			verbose=verbose,
1388
+		 neededPkgs="crlmm")
1389
+	snpBatches <- splitIndicesByLength(XIndex.snps, ocProbesets())
1390
+	if(verbose) message("Estimating total copy number at polymorphic loci on chromosome X")
1391
+	ocLapply(seq(along=snpBatches),
1392
+			fit.lm3,
1393
+			autosomeIndex=XIndex.snps,
1394
+			object=object,
1395
+			Ns=Ns,
1396
+			normal=normal,
1397
+			snpflags=snpflags,
1398
+			snpBatches=snpBatches,
1399
+			batchSize=ocProbesets(),
1400
+			SNRMin=SNRMin,
1401
+			MIN.SAMPLES=MIN.SAMPLES,
1402
+			MIN.OBS=MIN.OBS,
1403
+			DF=DF.PRIOR,
1404
+			GT.CONF.THR=GT.CONF.THR,
1405
+			THR.NU.PHI=THR.NU.PHI,
1406
+			MIN.NU=MIN.NU,
1407
+			MIN.PHI=MIN.PHI,
1408
+			verbose=verbose,
1409
+		 neededPkgs="crlmm")
1410
+	if(verbose) message("Estimating total copy number for nonpolymorphic loci on chromosome X")
1411
+	snpBatches <- splitIndicesByLength(XIndex.nps, ocProbesets())
1412
+	if(verbose) message("Estimating total copy number at nonpolymorphic loci on chromosome X")
1413
+	tmp <- ocLapply(seq(along=snpBatches),
1414
+			fit.lm4,
1415
+			XIndex=XIndex.nps,
1416
+			object=object,
1417
+			Ns=Ns,
1418
+			normal=normal,
1419
+			snpflags=snpflags,
1420
+			snpBatches=snpBatches,
1421
+			batchSize=ocProbesets(),
1422
+			SNRMin=SNRMin,
1423
+			MIN.SAMPLES=MIN.SAMPLES,
1424
+			MIN.OBS=MIN.OBS,
1425
+			DF=DF.PRIOR,
1426
+			GT.CONF.THR=GT.CONF.THR,
1427
+			THR.NU.PHI=THR.NU.PHI,
1428
+			MIN.NU=MIN.NU,
1429
+			MIN.PHI=MIN.PHI,
1430
+			verbose=verbose,
1431
+			neededPkgs="crlmm")
1432
+	return(object)
1433
+}
1434
+
1435
+fit.lm1 <- function(idxBatch,
1436
+		    snpBatches,
1437
+		    autosomeIndex,
1438
+		    object,
1439
+		    Ns,
1440
+		    normal,
1441
+		    snpflags,
1442
+		    batchSize,
1443
+		    SNRMin,
1444
+		    MIN.SAMPLES,
1445
+		    MIN.OBS,
1446
+		    DF.PRIOR,
1447
+		    GT.CONF.THR,
1448
+		    THR.NU.PHI,
1449
+		    MIN.NU,
1450
+		    MIN.PHI,
1451
+		    verbose, ...){
1452
+			##   which.batches, ...){
1453
+	if(verbose) message("Probe batch ", idxBatch, " of ", length(snpBatches))
1454
+	open(object)
1455
+	open(snpflags)
1456
+	open(normal)
1457
+	snps <- snpBatches[[idxBatch]]
1458
+		batches <- split(seq(along=batch(object)), batch(object))
1459
+	batches <- batches[sapply(batches, length) >= MIN.SAMPLES]
1460
+	corr <- corrA.BB <- corrB.AA <- sig2B <- sig2A <- tau2B <- tau2A <- matrix(NA, length(snps), length(unique(batches)))
1461
+	flags <- nuA <- nuB <- phiA <- phiB <- corr
1462
+	cB <- cA <- matrix(NA, length(snps), ncol(object))
1463
+	for(k in batches){
1464
+		G <- calls(object)[snps, k]
1465
+		NORM <- normal[snps, k]
1466
+		xx <- snpCallProbability(object)[snps, k]
1467
+		highConf <- (1-exp(-xx/1000)) > GT.CONF.THR
1468
+		G <- G*highConf*NORM
1469
+		A <- A(object)[snps, k]
1470
+		B <- B(object)[snps, k]
1471
+		##index <- GT.B <- GT.A <- vector("list", 3)
1472
+		##names(index) <- names(GT.B) <- names(GT.A) <- c("AA", "AB", "BB")
1473
+		Ns <- applyByGenotype(matrix(1, nrow(G), ncol(G)), rowSums, G)
1474
+		muA <- applyByGenotype(A, rowMedians, G)
1475
+		muB <- applyByGenotype(B, rowMedians, G)
1476
+		vA <- applyByGenotype(A, rowMAD, G)
1477
+		vB <- applyByGenotype(B, rowMAD, G)
1478
+		vA <- shrink(vA, Ns, DF.PRIOR)
1479
+		vB <- shrink(vB, Ns, DF.PRIOR)
1480
+		##location and scale
1481
+		J <- match(unique(batch(object)[k]), unique(batch(object)))
1482
+		##background variance for alleleA
1483
+		taus <- applyByGenotype(log2(A), rowMAD, G)^2
1484
+		tau2A[, J] <- shrink(taus[, 3, drop=FALSE], Ns[, 3], DF.PRIOR)
1485
+		sig2A[, J] <- shrink(taus[, 1, drop=FALSE], Ns[, 1], DF.PRIOR)
1486
+		taus <- applyByGenotype(log2(B), rowMAD, G)^2
1487
+		tau2B[, J] <- shrink(taus[, 3, drop=FALSE], Ns[, 1], DF.PRIOR)
1488
+		sig2B[, J] <- shrink(taus[, 1, drop=FALSE], Ns[, 3], DF.PRIOR)
1489
+
1490
+		corr[, J] <- corByGenotype(A=A, B=B, G=G, Ns=Ns, which.cluster=2, DF.PRIOR)
1491
+		corrB.AA[, J] <- corByGenotype(A=A, B=B, G=G, Ns=Ns, which.cluster=1, DF.PRIOR)
1492
+		corrA.BB[, J] <- corByGenotype(A=A, B=B, G=G, Ns=Ns, which.cluster=3, DF.PRIOR)
1493
+		##formerly oneBatch()...
1494
+		##---------------------------------------------------------------------------
1495
+		## Impute sufficient statistics for unobserved genotypes (plate-specific)
1496
+		##---------------------------------------------------------------------------
1497
+		index <- apply(Ns, 2, function(x, MIN.OBS) which(x > MIN.OBS), MIN.OBS)
1498
+		correct.orderA <- muA[, 1] > muA[, 3]
1499
+		correct.orderB <- muB[, 3] > muB[, 1]
1500
+		index.complete <- intersect(which(correct.orderA & correct.orderB), intersect(index[[1]], intersect(index[[2]], index[[3]])))
1501
+		size <- min(5000, length(index.complete))
1502
+		if(size == 5000) index.complete <- sample(index.complete, 5000, replace=TRUE)
1503
+		if(length(index.complete) < 200){
1504
+			stop("fewer than 200 snps pass criteria for predicting the sufficient statistics")
1505
+		}
1506
+		index <- vector("list", 3)
1507
+		index[[1]] <- which(Ns[, 1] == 0 & (Ns[, 2] >= MIN.OBS & Ns[, 3] >= MIN.OBS))
1508
+		index[[2]] <- which(Ns[, 2] == 0 & (Ns[, 1] >= MIN.OBS & Ns[, 3] >= MIN.OBS))
1509
+		index[[3]] <- which(Ns[, 3] == 0 & (Ns[, 2] >= MIN.OBS & Ns[, 1] >= MIN.OBS))
1510
+		res <- imputeCenter(muA, muB, index.complete, index)
1511
+		muA <- res[[1]]
1512
+		muB <- res[[2]]
1513
+
1514
+		## Monomorphic SNPs.  Mixture model may be better
1515
+		## Improve estimation by borrowing strength across batch
1516
+		noAA <- Ns[, 1] < MIN.OBS
1517
+		noAB <- Ns[, 2] < MIN.OBS
1518
+		noBB <- Ns[, 3] < MIN.OBS
1519
+		index[[1]] <- noAA & noAB
1520
+		index[[2]] <- noBB & noAB
1521
+		index[[3]] <- noAA & noBB
1522
+		cols <- c(3, 1, 2)
1523
+		for(j in 1:3){
1524
+			if(sum(index[[j]]) == 0) next()
1525
+			kk <- cols[j]
1526
+			X <- cbind(1, muA[index.complete, kk], muB[index.complete, kk])
1527
+			Y <- cbind(muA[index.complete,  -kk],
1528
+				   muB[index.complete,  -kk])
1529
+			betahat <- solve(crossprod(X), crossprod(X,Y))
1530
+			X <- cbind(1, muA[index[[j]],  kk], muB[index[[j]],  kk])
1531
+			mus <- X %*% betahat
1532
+			muA[index[[j]], -kk] <- mus[, 1:2]
1533
+			muB[index[[j]], -kk] <- mus[, 3:4]
1534
+		}
1535
+		negA <- rowSums(muA < 0) > 0
1536
+		negB <- rowSums(muB < 0) > 0
1537
+		flags[, J] <- rowSums(Ns == 0) > 0
1538
+		##flags[, J] <- index[[1]] | index[[2]] | index[[3]] | rowSums(
1539
+
1540
+		##formerly coefs()
1541
+		Np <- Ns
1542
+		Np[Np < 1] <- 1
1543
+		vA2 <- vA^2/Np
1544
+		vB2 <- vB^2/Np
1545
+		wA <- sqrt(1/vA2)
1546
+		wB <- sqrt(1/vB2)
1547
+		YA <- muA*wA
1548
+		YB <- muB*wB
1549
+		res <- nuphiAllele2(allele="A", Ystar=YA, W=wA, Ns=Ns)
1550
+		nuA[, J] <- res[[1]]
1551
+		phiA[, J] <- res[[2]]
1552
+		res <- nuphiAllele2(allele="B", Ystar=YB, W=wB, Ns=Ns)
1553
+		nuB[, J] <- res[[1]]
1554
+		phiB[, J] <- res[[2]]
1555
+		if(THR.NU.PHI){
1556
+			nuA[nuA[, J] < MIN.NU, J] <- MIN.NU
1557
+			nuB[nuB[, J] < MIN.NU, J] <- MIN.NU
1558
+			phiA[phiA[, J] < MIN.PHI, J] <- MIN.PHI
1559
+			phiB[phiB[, J] < MIN.PHI, J] <- MIN.PHI
1560
+		}
1561
+		## formerly polymorphic():  calculate copy number
1562
+		cA[, k] <- matrix((1/phiA[, J]*(A-nuA[, J])), nrow(A), ncol(A))
1563
+		cB[, k] <- matrix((1/phiB[, J]*(B-nuB[, J])), nrow(B), ncol(B))
1564
+	}
1565
+	cA <- matrix(as.integer(cA*100), nrow(cA), ncol(cA))
1566
+	cB <- matrix(as.integer(cB*100), nrow(cB), ncol(cB))
1567
+	CA(object)[snps, ] <- cA
1568
+	CB(object)[snps, ] <- cB
1569
+	snpflags[snps, ] <- flags
1570
+	lapply(lM(object), open)
1571
+	lM(object)$tau2A[snps, ] <- tau2A
1572
+	lM(object)$tau2B[snps, ] <- tau2B
1573
+	lM(object)$sig2A[snps, ] <- sig2A
1574
+	lM(object)$sig2B[snps, ] <- sig2B
1575
+	lM(object)$nuA[snps, ] <- nuA
1576
+	lM(object)$nuB[snps, ] <- nuB
1577
+	lM(object)$phiA[snps, ] <- phiA
1578
+	lM(object)$phiB[snps, ] <- phiB
1579
+	lapply(assayData(object), close)
1580
+	lapply(lM(object), close)
1581
+	TRUE
1582
+}
1583
+
1584
+fit.lm2 <- function(idxBatch,
1585
+		    snpBatches,
1586
+		    autosomeIndex,
1587
+		    object,
1588
+		    Ns,
1589
+		    normal,
1590
+		    snpflags,
1591
+		    batchSize,
1592
+		    SNRMin,
1593
+		    MIN.SAMPLES,
1594
+		    MIN.OBS,
1595
+		    DF.PRIOR,
1596
+		    GT.CONF.THR,
1597
+		    THR.NU.PHI,
1598
+		    MIN.NU,
1599
+		    MIN.PHI,
1600
+		    verbose, ...){
1601
+			##   which.batches, ...){
1602
+	if(verbose) message("Probe batch ", idxBatch, " of ", length(snpBatches))
1603
+	open(object)
1604
+	open(snpflags)
1605
+	open(normal)
1606
+	snps <- snpBatches[[idxBatch]]
1607
+	batches <- split(seq(along=batch(object)), batch(object))
1608
+	batches <- batches[sapply(batches, length) >= MIN.SAMPLES]
1609
+	cA <- matrix(NA, length(snps), ncol(object))
1610
+	ii <- isSnp(object) & chromosome(object) < 23 & !is.na(chromosome(object))
1611
+	flags <- snpflags[, ]
1612
+	noflags <- rowSums(flags) == 0
1613
+	## We do not want to write to discuss for each batch.  More efficient to
1614
+	## write to disk after estimating these parameters for all batches.
1615
+	nuA.np <- phiA.np <- sig2A.np <- matrix(NA, length(snps), length(unique(batches)))
1616
+	## for imputation, we need the corresponding parameters of the snps
1617
+	nuA <- lM(object)$nuA
1618
+	phiA <- lM(object)$phiA
1619
+	nuB <- lM(object)$nuB
1620
+	phiB <- lM(object)$phiB
1621
+	snp.ind <- ii & noflags
1622
+	NORM.np <- normal[snps, ]
1623
+	for(k in batches){
1624
+		##if(verbose) message("SNP batch ", ii, " of ", length(batches))
1625
+		J <- match(unique(batch(object)[k]), unique(batch(object)))
1626
+		snp.index <- snp.ind & nuA[, J] > 20 & nuB[, J] > 20 & phiA[, J] > 20 & phiB[, J] > 20
1627
+		if(sum(snp.index) >= 5000){
1628
+			snp.index <- sample(which(snp.index), 5000)
1629
+		}
1630
+		phiA.snp <- phiA[snp.index, J]
1631
+		phiB.snp <- phiB[snp.index, J]
1632
+		A.snp <- A(object)[snp.index, k]
1633
+		B.snp <- B(object)[snp.index, k]
1634
+		NORM.snp <- normal[snp.index, k]
1635
+		G <- calls(object)[snp.index, k]
1636
+		xx <- snpCallProbability(object)[snp.index, k]
1637
+		highConf <- (1-exp(-xx/1000)) > GT.CONF.THR
1638
+		G <- G*highConf*NORM.snp[, k]
1639
+		G[G==0] <- NA
1640
+		##nonpolymorphic
1641
+		A.np <- A(object)[snps, k]
1642
+		Ns <- applyByGenotype(matrix(1, nrow(G), ncol(G)), rowSums, G)
1643
+		muA <- applyByGenotype(A.snp, rowMedians, G)
1644
+		muB <- applyByGenotype(B.snp, rowMedians, G)
1645
+		muA <- muA[, 1]
1646
+		muB <- muB[, 3]
1647
+		X <- cbind(1, log2(c(muA, muB)))
1648
+		Y <- log2(c(phiA.snp, phiB.snp))
1649
+		betahat <- solve(crossprod(X), crossprod(X, Y))
1650
+		##
1651
+		mus <- rowMedians(A.np * NORM.np[, k], na.rm=TRUE)
1652
+		crosshyb <- max(median(muA) - median(mus), 0)
1653
+		X <- cbind(1, log2(mus+crosshyb))
1654
+		logPhiT <- X %*% betahat
1655
+		phiA.np[, J] <- 2^(logPhiT)
1656
+		nuA.np[, J] <- mus-2*phiA.np[, J]
1657
+		if(THR.NU.PHI){
1658
+			nuA.np[nuA.np[, J] < MIN.NU, J] <- MIN.NU
1659
+			phiA.np[phiA.np[, J] < MIN.PHI, J] <- MIN.PHI
1660
+		}
1661
+		cA[, k] <- 1/phiA.np[, J] * (A.np - nuA.np[, J])
1662
+		sig2A.np[, J] <- rowMAD(log2(A.np*NORM.np[, k]), na.rm=TRUE)
1663
+	}
1664
+	cA <- matrix(as.integer(cA*100), nrow(cA), ncol(cA))
1665
+	CA(object)[snps, ] <- cA
1666
+	##open(lM(object))
1667
+	lM(object)$sig2A[snps, ] <- sig2A.np
1668
+	lM(object)$nuA[snps, ] <- nuA.np
1669
+	lM(object)$phiA[snps, ] <- phiA.np
1670
+	lapply(assayData(object), close)
1671
+	lapply(lM(object), close)
1672
+	TRUE
1673
+}
1674
+
1675
+
1676
+fit.lm3 <- function(idxBatch,
1677
+		    snpBatches,
1678
+		    XIndex,
1679
+		    object,
1680
+		    Ns,
1681
+		    normal,
1682
+		    snpflags,
1683
+		    batchSize,
1684
+		    SNRMin,
1685
+		    MIN.SAMPLES,
1686
+		    MIN.OBS,
1687
+		    DF.PRIOR,
1688
+		    GT.CONF.THR,
1689
+		    THR.NU.PHI,
1690
+		    MIN.NU,
1691
+		    MIN.PHI,
1692
+		    verbose, ...){
1693
+			##   which.batches, ...){
1694
+	if(verbose) message("Probe batch ", idxBatch, " of ", length(snpBatches))
1695
+	open(snpflags)
1696
+	open(normal)
1697
+	open(object)
1698
+	snps <- snpBatches[[idxBatch]]
1699
+	batches <- split(seq(along=batch(object)), batch(object))
1700
+	batches <- batches[sapply(batches, length) >= MIN.SAMPLES]
1701
+	corrAB <- corrBB <- corrAA <- sig2B <- sig2A <- tau2B <- tau2A <- matrix(NA, length(snps), length(unique(batches)))
1702
+	phiA2 <- phiB2 <- tau2A
1703
+	flags <- nuA <- nuB <- phiA <- phiB <- corrAB
1704
+	cB <- cA <- matrix(NA, length(snps), ncol(object))
1705
+	gender <- object$gender
1706
+	IX <- matrix(gender, length(snps), ncol(object))
1707
+	NORM <- normal[snps,]
1708
+	IX <- IX==2
1709
+	for(k in batches){
1710
+		##if(verbose) message("SNP batch ", ii, " of ", length(batches)) 
1711
+		## within-genotype moments
1712
+		G <- calls(object)[snps, k]
1713
+		xx <- snpCallProbability(object)[snps, k]
1714
+		highConf <- (1-exp(-xx/1000)) > GT.CONF.THR
1715
+		G <- G*highConf*NORM[, k]
1716
+		A <- A(object)[snps, k]
1717
+		B <- B(object)[snps, k]
1718
+		##index <- GT.B <- GT.A <- vector("list", 3)
1719
+		##names(index) <- names(GT.B) <- names(GT.A) <- c("AA", "AB", "BB")
1720
+		Ns.F <- applyByGenotype(matrix(1, nrow(G), sum(gender==2)), rowSums, G[, gender==2])
1721
+		Ns.M <- applyByGenotype(matrix(1, nrow(G), sum(gender==1)), rowSums, G[, gender==1])
1722
+		Ns <- cbind(Ns.M[, 1], Ns.M[, 3], Ns.F)
1723
+		muA.F <- applyByGenotype(A[, gender==2], rowMedians, G[, gender==2])
1724
+		muA.M <- applyByGenotype(A[, gender==1], rowMedians, G[, gender==1])
1725
+		muB.F <- applyByGenotype(B[, gender==2], rowMedians, G[, gender==2])
1726
+		muB.M <- applyByGenotype(B[, gender==1], rowMedians, G[, gender==1])
1727
+		vA.F <- applyByGenotype(A[, gender==2], rowMAD, G[, gender==2])
1728
+		vB.F <- applyByGenotype(B[, gender==2], rowMAD, G[, gender==2])
1729
+		vA.M <- applyByGenotype(A[, gender==1], rowMAD, G[, gender==1])
1730
+		vB.M <- applyByGenotype(B[, gender==1], rowMAD, G[, gender==1])		
1731
+		vA.F <- shrink(vA.F, Ns.F, DF.PRIOR)
1732
+		vA.M <- shrink(vA.M, Ns.M, DF.PRIOR)
1733
+		vB.F <- shrink(vB.F, Ns.F, DF.PRIOR)
1734
+		vB.M <- shrink(vB.M, Ns.M, DF.PRIOR)
1735
+		##location and scale
1736
+		J <- match(unique(batch(object)[k]), unique(batch(object)))
1737
+		##background variance for alleleA
1738
+		taus <- applyByGenotype(log2(A[, gender==2]), rowMAD, G[, gender==2])^2
1739
+		tau2A[, J] <- shrink(taus[, 3, drop=FALSE], Ns.F[, 3], DF.PRIOR)
1740
+		sig2A[, J] <- shrink(taus[, 1, drop=FALSE], Ns.F[, 1], DF.PRIOR)
1741
+		taus <- applyByGenotype(log2(B[, gender==2]), rowMAD, G[, gender==2])^2
1742
+		tau2B[, J] <- shrink(taus[, 3, drop=FALSE], Ns.F[, 1], DF.PRIOR)
1743
+		sig2B[, J] <- shrink(taus[, 1, drop=FALSE], Ns.F[, 3], DF.PRIOR)
1744
+		corrAB[, J] <- corByGenotype(A=A[, gender==2], B=B[, gender==2], G=G[, gender==2], Ns=Ns.F, which.cluster=2, DF.PRIOR)
1745
+		corrAA[, J] <- corByGenotype(A=A[, gender==2], B=B[, gender==2], G=G[, gender==2], Ns=Ns.F, which.cluster=1, DF.PRIOR)
1746
+		corrBB[, J] <- corByGenotype(A=A[, gender==2], B=B[, gender==2], G=G[, gender==2], Ns=Ns.F, which.cluster=3, DF.PRIOR)
1747
+		##formerly oneBatch()...
1748
+		##---------------------------------------------------------------------------
1749
+		## Impute sufficient statistics for unobserved genotypes (plate-specific)
1750
+		##---------------------------------------------------------------------------
1751
+		index <- apply(Ns.F, 2, function(x, MIN.OBS) which(x > MIN.OBS), MIN.OBS)
1752
+		correct.orderA <- muA.F[, 1] > muA.F[, 3]
1753
+		correct.orderB <- muB.F[, 3] > muB.F[, 1]
1754
+		index.complete <- intersect(which(correct.orderA & correct.orderB), intersect(index[[1]], intersect(index[[2]], index[[3]])))
1755
+		size <- min(5000, length(index.complete))
1756
+		if(size == 5000) index.complete <- sample(index.complete, 5000, replace=TRUE)
1757
+		if(length(index.complete) < 200){
1758
+			stop("fewer than 200 snps pass criteria for predicting the sufficient statistics")
1759
+		}
1760
+		index <- vector("list", 3)
1761
+		index[[1]] <- which(Ns.F[, 1] == 0 & (Ns.F[, 2] >= MIN.OBS & Ns.F[, 3] >= MIN.OBS))
1762
+		index[[2]] <- which(Ns.F[, 2] == 0 & (Ns.F[, 1] >= MIN.OBS & Ns.F[, 3] >= MIN.OBS))
1763
+		index[[3]] <- which(Ns.F[, 3] == 0 & (Ns.F[, 2] >= MIN.OBS & Ns.F[, 1] >= MIN.OBS))
1764
+		res <- imputeCenter(muA.F, muB.F, index.complete, index)
1765
+		muA.F <- res[[1]]
1766
+		muB.F <- res[[2]]
1767
+		nobsA <- Ns.M[, 1] > MIN.OBS
1768
+		nobsB <- Ns.M[, 3] > MIN.OBS
1769
+		notMissing <- !(is.na(muA.M[, 1]) | is.na(muA.M[, 3]) | is.na(muB.M[, 1]) | is.na(muB.M[, 3]))
1770
+		complete <- list()
1771
+		complete[[1]] <- which(correct.orderA & correct.orderB & nobsA & notMissing) ##be selective here
1772
+		complete[[2]] <- which(correct.orderA & correct.orderB & nobsB & notMissing) ##be selective here	
1773
+		size <- min(5000, length(complete[[1]]))
1774
+		if(size > 5000) complete <- lapply(complete, function(x) sample(x, size))
1775
+		res <- imputeCenterX(muA.M, muB.M, Ns.M, complete, MIN.OBS)
1776
+		muA.M <- res[[1]]
1777
+		muB.M <- res[[2]]
1778
+
1779
+		## Monomorphic SNPs.  Mixture model may be better
1780
+		## Improve estimation by borrowing strength across batch
1781
+		noAA <- Ns.F[, 1] < MIN.OBS
1782
+		noAB <- Ns.F[, 2] < MIN.OBS
1783
+		noBB <- Ns.F[, 3] < MIN.OBS
1784
+		index[[1]] <- noAA & noAB
1785
+		index[[2]] <- noBB & noAB
1786
+		index[[3]] <- noAA & noBB
1787
+		cols <- c(3, 1, 2)
1788
+		for(j in 1:3){
1789
+			if(sum(index[[j]]) == 0) next()
1790
+			kk <- cols[j]
1791
+			X <- cbind(1, muA.F[index.complete, kk], muB.F[index.complete, kk])
1792
+			Y <- cbind(muA.F[index.complete,  -kk],
1793
+				   muB.F[index.complete,  -kk])
1794
+			betahat <- solve(crossprod(X), crossprod(X,Y))
1795
+			X <- cbind(1, muA.F[index[[j]],  kk], muB.F[index[[j]],  kk])
1796
+			mus <- X %*% betahat
1797
+			muA.F[index[[j]], -kk] <- mus[, 1:2]
1798
+			muB.F[index[[j]], -kk] <- mus[, 3:4]
1799
+		}
1800
+		negA <- rowSums(muA.F < 0) > 0
1801
+		negB <- rowSums(muB.F < 0) > 0
1802
+		flags[, J] <- rowSums(Ns.F == 0) > 0 | negA | negB
1803
+		##flags[, J] <- index[[1]] | index[[2]] | index[[3]] | rowSums(
1804
+		##formerly coefs()
1805
+		Np <- cbind(Ns.M[, c(1,3)], Ns.F)
1806
+		Np[Np < 1] <- 1
1807
+		vA <- cbind(vA.M[, c(1, 3)], vA.F)
1808
+		vB <- cbind(vB.M[, c(1, 3)], vB.F)
1809
+		muA <- cbind(muA.M[, c(1,3)], muA.F)
1810
+		muB <- cbind(muB.M[, c(1,3)], muB.F)
1811
+		vA2 <- vA^2/Np
1812
+		vB2 <- vB^2/Np
1813
+		wA <- sqrt(1/vA2)
1814
+		wB <- sqrt(1/vB2)
1815
+		YA <- muA*wA
1816
+		YB <- muB*wB
1817
+		res <- nuphiAlleleX(allele="A", Ystar=YA, W=wA)
1818
+		nuA[, J] <- res[[1]]
1819
+		phiA[, J] <- res[[2]]
1820
+		phiA2[, J] <- res[[3]]
1821
+		res <- nuphiAlleleX(allele="B", Ystar=YB, W=wB)
1822
+		nuB[, J] <- res[[1]]
1823
+		phiB[, J] <- res[[2]]
1824
+		phiB2[, J] <- res[[3]]
1825
+		if(THR.NU.PHI){
1826
+			nuA[nuA[, J] < MIN.NU, J] <- MIN.NU
1827
+			nuB[nuB[, J] < MIN.NU, J] <- MIN.NU
1828
+			phiA[phiA[, J] < MIN.PHI, J] <- MIN.PHI
1829
+			phiA2[phiA2[, J] < MIN.PHI, J] <- MIN.PHI
1830
+			phiB[phiB[, J] < MIN.PHI, J] <- MIN.PHI
1831
+			phiB2[phiB2[, J] < MIN.PHI, J] <- MIN.PHI
1832
+		}
1833
+		phistar <- phiB2[, J]/phiA[, J]
1834
+		tmp <- (B-nuB[, J] - phistar*A + phistar*nuA[, J])/phiB[, J]
1835
+		cB[, k] <- tmp/(1-phistar*phiA2[, J]/phiB[, J])
1836
+		cA[, k] <- (A-nuA[, J]-phiA2[, J]*cB)/phiA[, J]
1837
+		##some of the snps are called for the men, but not the women
1838
+	}
1839
+	##--------------------------------------------------
1840
+	##RS: need to fix.  why are there NA's by coercion
1841
+	cA <- matrix(as.integer(cA*100), nrow(cA), ncol(cA))
1842
+	##--------------------------------------------------
1843
+	##ii <- rowSums(is.na(cA)) > 0
1844
+	##these often arise at SNPs with low confidence scores
1845
+	cB <- matrix(as.integer(cB*100), nrow(cB), ncol(cB))
1846
+	CA(object)[snps, ] <- cA
1847
+	CB(object)[snps, ] <- cB
1848
+	snpflags[snps, ] <- flags
1849
+	lM(object)$tau2A[snps, ] <- tau2A
1850
+	lM(object)$tau2B[snps, ] <- tau2B
1851
+	lM(object)$sig2A[snps, ] <- sig2A
1852
+	lM(object)$sig2B[snps, ] <- sig2B
1853
+	lM(object)$nuA[snps, ] <- nuA
1854
+	lM(object)$nuB[snps, ] <- nuB
1855
+	lM(object)$phiA[snps, ] <- phiA
1856
+	lM(object)$phiB[snps, ] <- phiB
1857
+	lM(object)$phiPrimeA[snps, ] <- phiA2
1858
+	lM(object)$phiPrimeB[snps, ] <- phiB2
1859
+	lapply(assayData(object), close)
1860
+	lapply(lM(object), close)
1861
+	TRUE
1862
+}
1863
+
1864
+fit.lm4 <- function(idxBatch,
1865
+		    snpBatches,
1866
+		    XIndex,
1867
+		    object,
1868
+		    Ns,
1869
+		    normal,
1870
+		    snpflags,
1871
+		    batchSize,
1872
+		    SNRMin,
1873
+		    MIN.SAMPLES,
1874
+		    MIN.OBS,
1875
+		    DF.PRIOR,
1876
+		    GT.CONF.THR,
1877
+		    THR.NU.PHI,
1878
+		    MIN.NU,
1879
+		    MIN.PHI,
1880
+		    verbose, ...){
1881
+	if(verbose) message("Probe batch ", idxBatch, " of ", length(snpBatches))	
1882
+	open(object)
1883
+	open(normal)
1884
+	open(snpflags)
1885
+	snps <- snpBatches[[idxBatch]]
1886
+	batches <- split(seq(along=batch(object)), batch(object))
1887
+	batches <- batches[sapply(batches, length) >= MIN.SAMPLES]
1888
+	nuA <- phiA <- sig2A <- tau2A <- matrix(NA, length(snps), length(unique(batches)))
1889
+	cA <- matrix(NA, length(snps), ncol(object))
1890
+	ii <- isSnp(object) & chromosome(object) < 23 & !is.na(chromosome(object))
1891
+	flags <- snpflags[ii, , drop=FALSE]
1892
+	noflags <- rowSums(flags) == 0
1893
+	lapply(lM(object), open)
1894
+	nuIA <- lM(object)$nuA[ii, ]
1895
+	nuIB <- lM(object)$nuB[ii, ]
1896
+	phiIA <- lM(object)$phiA[ii,]
1897
+	phiIB <- lM(object)$phiB[ii,]
1898
+
1899
+	snp.index <- which(nuIA > 20 & nuIB & 20 & phiIA > 20 & phiIB > 20 & noflags)
1900
+	if(length(snp.index) >= 5000){
1901
+		snp.index <- sample(snp.index, 5000)
1902
+	}
1903
+	phiA.snp <- lM(object)$phiA[snp.index, , drop=FALSE]
1904
+	phiB.snp <- lM(object)$phiB[snp.index, , drop=FALSE]
1905
+	A.snp <- A(object)[snp.index, ]
1906
+	B.snp <- B(object)[snp.index, ]
1907
+	NORM.snp <- normal[snp.index, ]
1908
+	NORM.np <- normal[snps, ]
1909
+	gender <- object$gender
1910
+
1911
+
1912
+	pseudoAR <- position(object)[snps] < 2709520 | (position(object)[snps] > 154584237 & position(object)[snps] < 154913754)
1913
+	pseudoAR[is.na(pseudoAR)] <- FALSE
1914
+	
1915
+	##if(missing(which.batches)) which.batches <- seq(along=batches)
1916
+	##batches <- batches[which.batches]
1917
+	for(k in batches){
1918
+		##if(verbose) message("SNP batch ", ii, " of ", length(batches)) 
1919
+		G <- calls(object)[snp.index, k]
1920
+		xx <- snpCallProbability(object)[snp.index, k]
1921
+		highConf <- (1-exp(-xx/1000)) > GT.CONF.THR
1922
+		G <- G*highConf*NORM.snp[, k]
1923
+		##snps
1924
+		AA <- A.snp[, k]
1925
+		BB <- B.snp[, k]
1926
+
1927
+				  
1928
+		##index <- GT.B <- GT.A <- vector("list", 3)
1929
+		##names(index) <- names(GT.B) <- names(GT.A) <- c("AA", "AB", "BB")
1930
+		Ns <- applyByGenotype(matrix(1, nrow(G), ncol(G)), rowSums, G)
1931
+		muA <- applyByGenotype(AA, rowMedians, G)
1932
+		muB <- applyByGenotype(BB, rowMedians, G)
1933
+		muA <- muA[, 1]
1934
+		muB <- muB[, 3]
1935
+		X <- cbind(1, log2(c(muA, muB)))
1936
+		J <- match(unique(batch(object)[k]), unique(batch(object)))
1937
+		
1938
+		Y <- log2(c(phiA.snp[, J], phiB.snp[, J]))
1939
+
1940
+		##--------------------------------------------------
1941
+		##RS: need to fix
1942
+		remove <- is.na(X[, 2]) | !is.finite(Y)
1943
+		Y <- Y[!remove]
1944
+		X <- X[!remove, ]
1945
+		##--------------------------------------------------
1946
+		betahat <- solve(crossprod(X), crossprod(X, Y))
1947
+		
1948
+
1949
+		##nonpolymorphic
1950
+		A <- A(object)[snps, k]
1951
+		gend <- gender[k]
1952
+		A.M <- A[, gend==1]
1953
+		mu1 <- rowMedians(A.M, na.rm=TRUE)
1954
+
1955
+		A.F <- A[, gend==2]
1956
+		mu2 <- rowMedians(A.F, na.rm=TRUE)
1957
+		mus <- log2(cbind(mu1, mu2))
1958
+		X.men <- cbind(1, mus[, 1])
1959
+		X.fem <- cbind(1, mus[, 2])
1960
+
1961
+		Yhat1 <- as.numeric(X.men %*% betahat)
1962
+		Yhat2 <- as.numeric(X.fem %*% betahat)
1963
+		phi1 <- 2^(Yhat1)
1964
+		phi2 <- 2^(Yhat2)
1965
+		nu1 <- 2^(mus[, 1]) - phi1
1966
+		nu2 <- 2^(mus[, 2]) - 2*phi2
1967
+
1968
+		if(any(pseudoAR)){
1969
+			nu1[pseudoAR] <- 2^(mus[pseudoAR, 1]) - 2*phi1[pseudoAR]
1970
+		}
1971
+
1972
+		normal.f <- NORM.np[, k]
1973
+		A.F <- A.F*normal.f[, gend==2]
1974
+		A.F[A.F==0] <- NA
1975
+		nuA[, J] <- nu2
1976
+		phiA[, J] <- phi2
1977
+		sig2A[, J] <- rowMAD(log2(A.F), na.rm=TRUE)^2
1978
+		if(THR.NU.PHI){
1979
+			nuA[nuA[, J] < MIN.NU, J] <- MIN.NU
1980
+			phiA[phiA[, J] < MIN.PHI, J] <- MIN.PHI
1981
+		}
1982
+		CT1 <- 1/phi1*(A.M-nu1)
1983
+		CT2 <- 1/phi2*(A.F-nu2)
1984
+		tmp <- cA[, k]
1985
+		tmp[, gend==1] <- CT1
1986
+		tmp[, gend==2] <- CT2
1987
+		cA[, k] <- tmp
1988
+	}
1989
+	cA <- matrix(as.integer(cA*100), nrow(cA), ncol(cA))
1990
+	CA(object)[snps, ] <- cA
1991
+	open(lM(object))
1992
+	lM(object)$sig2A[snps, ] <- sig2A
1993
+	lM(object)$nuA[snps, ] <- nuA
1994
+	lM(object)$phiA[snps, ] <- phiA
1995
+	lapply(assayData(object), close)
1996
+	lapply(lM(object), close)
1997
+	TRUE
1998
+}
654 1999
 
655 2000
 
656 2001
 crlmmWrapper <- function(filenames, cnOptions, ...){
... ...
@@ -830,6 +2175,61 @@ whichPlatform <- function(cdfName){
830 2175
 	return(platform)
831 2176
 }
832 2177
 
2178
+cnrma2 <- function(A, filenames, row.names, verbose=TRUE, seed=1, cdfName, sns){
2179
+	if(missing(cdfName)) stop("must specify cdfName")
2180
+	pkgname <- getCrlmmAnnotationName(cdfName)
2181
+	require(pkgname, character.only=TRUE) || stop("Package ", pkgname, " not available")
2182
+	if (missing(sns)) sns <- basename(filenames)
2183
+	if(verbose) message("Loading annotations for nonpolymorphic probes")
2184
+        loader("npProbesFid.rda", .crlmmPkgEnv, pkgname)
2185
+	fid <- getVarInEnv("npProbesFid")
2186
+	SKW <- initializeBigVector("crlmmSKW.NP-", length(filenames), "double")
2187
+  	##A <- initializeBigMatrix("crlmmNP-", length(fid), length(filenames), "integer")
2188
+	sampleBatches <- splitIndicesByNode(seq(along=filenames))
2189
+	if(verbose) message("Processing nonpolymorphic probes for ", length(filenames), " files.")
2190
+	ocLapply(sampleBatches,
2191
+		 processCEL2,
2192
+		 row.names=row.names,
2193
+		 filenames=filenames,
2194
+		 A=A,
2195
+		 SKW=SKW, 
2196
+		 seed=seed,
2197
+		 pkgname=pkgname,
2198
+		 cdfName=cdfName,
2199
+		 neededPkgs=c("crlmm", pkgname))
2200
+	list(sns=sns, gns=row.names, SKW=SKW, cdfName=cdfName)
2201
+}
2202
+
2203
+processCEL2 <- function(i, filenames, row.names, A, SKW, seed, cdfName, pkgname){
2204
+	if(cdfName=="genomewidesnp6"){
2205
+		loader("1m_reference_cn.rda", .crlmmPkgEnv, pkgname)
2206
+	}
2207
+	if(cdfName=="genomewidesnp5"){
2208
+		loader("5.0_reference_cn.rda", .crlmmPkgEnv, pkgname)
2209
+	}
2210
+	reference <- getVarInEnv("reference")
2211
+        loader("npProbesFid.rda", .crlmmPkgEnv, pkgname)
2212
+	fid <- getVarInEnv("npProbesFid")
2213
+	fid <- fid[match(row.names, names(fid))]
2214
+	np.index <- match(row.names, rownames(A))
2215
+	gns <- names(fid)
2216
+	set.seed(seed)
2217
+	idx2 <- sample(length(fid), 10^5)
2218
+	open(A)
2219
+	open(SKW)
2220
+	for (k in i){
2221
+		y <- as.matrix(read.celfile(filenames[k], intensity.means.only=TRUE)[["INTENSITY"]][["MEAN"]][fid])
2222
+		x <- log2(y[idx2])
2223
+		SKW[k] <- mean((x-mean(x))^3)/(sd(x)^3)
2224
+		rm(x)
2225
+		A[np.index, k] <- as.integer(normalize.quantiles.use.target(y, target=reference))
2226
+		rm(y)
2227
+	}
2228
+	close(A)
2229
+	close(SKW)
2230
+	TRUE
2231
+}
2232
+
833 2233
 
834 2234
 # steps: quantile normalize hapmap: create 1m_reference_cn.rda object
835 2235
 cnrma <- function(filenames, cdfName, row.names, sns, seed=1, verbose=FALSE){
... ...
@@ -1242,6 +2642,44 @@ withinGenotypeMoments <- function(object, cnOptions, tmp.objects){
1242 2642
 	return(tmp.objects)
1243 2643
 }
1244 2644
 
2645
+imputeCenter <- function(muA, muB, index.complete, index.missing){
2646
+	index <- index.missing
2647
+	mnA <- muA
2648
+	mnB <- muB
2649
+	for(j in 1:3){
2650
+		if(length(index[[j]]) == 0) next()
2651
+		X <- cbind(1, mnA[index.complete,  -j, drop=FALSE], mnB[index.complete,  -j, drop=FALSE])
2652
+		Y <- cbind(mnA[index.complete, j], mnB[index.complete,  j])
2653
+		betahat <- solve(crossprod(X), crossprod(X,Y))
2654
+		X <- cbind(1, mnA[index[[j]],  -j, drop=FALSE],  mnB[index[[j]],  -j, drop=FALSE])
2655
+		mus <- X %*% betahat
2656
+		muA[index[[j]], j] <- mus[, 1]
2657
+		muB[index[[j]], j] <- mus[, 2]
2658
+	}
2659
+	list(muA, muB)
2660
+}
2661
+
2662
+imputeCenterX <- function(muA, muB, Ns, index.complete, MIN.OBS){
2663
+	index1 <- which(Ns[, 1] == 0 & Ns[, 3] > MIN.OBS)
2664
+	X <- cbind(1, muA[index.complete[[1]], 3], muB[index.complete[[1]], 3])
2665
+	Y <- cbind(1, muA[index.complete[[1]], 1], muB[index.complete[[1]], 1])
2666
+	betahat <- solve(crossprod(X), crossprod(X,Y))
2667
+	##now with the incomplete SNPs
2668
+	X <- cbind(1, muA[index1, 3], muB[index1, 3])
2669
+	mus <- X %*% betahat
2670
+	muA[index1, 1] <- mus[, 2]
2671
+	muB[index1, 1] <- mus[, 3]
2672
+	index1 <- which(Ns[, 3] == 0)
2673
+	X <- cbind(1, muA[index.complete[[2]], 1], muB[index.complete[[2]], 1])
2674
+	Y <- cbind(1, muA[index.complete[[2]], 3], muB[index.complete[[2]], 3])
2675
+	betahat <- solve(crossprod(X), crossprod(X,Y))
2676
+	##now with the incomplete SNPs
2677
+	X <- cbind(1, muA[index1, 1], muB[index1, 1])
2678
+	mus <- X %*% betahat
2679
+	muA[index1, 3] <- mus[, 2]
2680
+	muB[index1, 3] <- mus[, 3]	
2681
+	list(muA, muB)
2682
+}
1245 2683
 
1246 2684
 oneBatch <- function(object, cnOptions, tmp.objects){
1247 2685
 	muA <- tmp.objects[["muA"]]
... ...
@@ -1365,7 +2803,8 @@ locationAndScale <- function(object, cnOptions, tmp.objects){
1365 2803
 	AB.B <- GT.B[[2]]
1366 2804
 	BB.B <- GT.B[[3]]
1367 2805
 	x <- BB.A[index.BB, ]
1368
-	batch <- unique(object$batch)
2806
+	##batch <- unique(object$batch)
2807
+	batch <- unique(batch(object))
1369 2808
 	tau2A <- getParam(object, "tau2A", batch)
1370 2809
 	tau2A[index.BB] <- rowMAD(log2(x), log2(x), na.rm=TRUE)^2	
1371 2810
 	DF <- Ns[, "BB"]-1
... ...
@@ -1439,7 +2878,8 @@ locationAndScale <- function(object, cnOptions, tmp.objects){
1439 2878
 }
1440 2879
 
1441 2880
 coefs <- function(object, cnOptions, tmp.objects){
1442
-	batch <- unique(object$batch)
2881
+	##batch <- unique(object$batch)
2882
+	batch <- unique(batch(object))
1443 2883
 	CHR <- unique(chromosome(object))
1444 2884
 	muA <- tmp.objects[["muA"]]
1445 2885
 	muB <- tmp.objects[["muB"]]
... ...
@@ -1499,7 +2939,8 @@ coefs <- function(object, cnOptions, tmp.objects){
1499 2939
 }
1500 2940
 
1501 2941
 polymorphic <- function(object, cnOptions, tmp.objects){
1502
-	batch <- unique(object$batch)
2942
+	##batch <- unique(object$batch)
2943
+	batch <- unique(batch(object))
1503 2944
 	CHR <- unique(chromosome(object))
1504 2945
 	vA <- tmp.objects[["vA"]]
1505 2946
 	vB <- tmp.objects[["vB"]]
... ...
@@ -1542,7 +2983,8 @@ posteriorProbability.snps <- function(object, cnOptions, tmp.objects=list()){
1542 2983
 	I <- isSnp(object)
1543 2984
 	gender <- object$gender
1544 2985
 	CHR <- unique(chromosome(object))
1545
-	batch <- unique(object$batch)
2986
+	##batch <- unique(object$batch)
2987
+	batch <- unique(batch(object))
1546 2988
 	if(CHR == 23){
1547 2989
 		phiAX <- getParam(object, "phiAX", batch)[I]
1548 2990
 		phiBX <- getParam(object, "phiBX", batch)[I]
... ...
@@ -1655,7 +3097,8 @@ biasAdj <- function(object, cnOptions, tmp.objects){
1655 3097
 
1656 3098
 ##biasAdjNP <- function(plateIndex, envir, priorProb){
1657 3099
 biasAdjNP <- function(object, cnOptions, tmp.objects){
1658
-	batch <- unique(object$batch)
3100
+	##batch <- unique(object$batch)
3101
+	batch <- unique(batch(object))
1659 3102
 	normalNP <- tmp.objects[["normal"]][!isSnp(object), ]
1660 3103
 	CHR <- unique(chromosome(object))
1661 3104
 	A <- A(object)[!isSnp(object), ]
... ...
@@ -1715,7 +3158,8 @@ biasAdjNP <- function(object, cnOptions, tmp.objects){
1715 3158
 
1716 3159
 
1717 3160
 getParams <- function(object, batch){
1718
-	batch <- unique(object$batch)
3161
+	##batch <- unique(object$batch)
3162
+	batch <- unique(batch(object))
1719 3163
 	if(length(batch) > 1) stop("batch variable not unique")		
1720 3164
 	nuA <- as.numeric(fData(object)[, match(paste("nuA", batch, sep="_"), fvarLabels(object))])
1721 3165
 	nuB <- as.numeric(fData(object)[, match(paste("nuB", batch, sep="_"), fvarLabels(object))])	
... ...
@@ -1798,7 +3242,8 @@ thresholdModelParams <- function(object, cnOptions){
1798 3242
 
1799 3243
 
1800 3244
 computeCopynumber.CNSet <- function(object, cnOptions){
1801
-	PLATE <- unique(object$batch)
3245
+	##PLATE <- unique(object$batch)
3246
+	PLATE <- unique(batch(object))
1802 3247
 	verbose <- cnOptions$verbose
1803 3248
 	tmp.objects <- instantiateObjects(object, cnOptions)
1804 3249
 	bias.adj <- cnOptions$bias.adj
... ...
@@ -339,7 +339,6 @@ crlmmGT2 <- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL,
339 339
   if(verbose) message("Loading annotations.")
340 340
   loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname)
341 341
   loader("mixtureStuff.rda", .crlmmPkgEnv, pkgname)
342
-
343 342
   ## this is toget rid of the 'no visible binding' notes
344 343
   ## variable definitions
345 344
   XIndex <- getVarInEnv("XIndex")
... ...
@@ -3,6 +3,7 @@ setMethod("show", "CNSetLM", function(object){
3 3
 	cat("lM: ", length(lM(object)), " elements \n")
4 4
 	print(names(lM(object)))
5 5
 })
6
+
6 7
 setMethod("[", "CNSetLM", function(x, i, j, ..., drop=FALSE){
7 8
 	x <- callNextMethod(x, i, j, ..., drop=drop)
8 9
 	if(!missing(i)){
... ...
@@ -14,14 +15,22 @@ setMethod("[", "CNSetLM", function(x, i, j, ..., drop=FALSE){
14 15
 	}
15 16
 	x
16 17
 })
18
+
17 19
 setMethod("lM", "CNSetLM", function(object) object@lM)
18 20
 setReplaceMethod("lM", c("CNSetLM", "list_or_ffdf"), function(object, value){
19 21
 	object@lM <- value
20 22
 	object
21 23
 })
22 24
 
25
+
26
+
27
+setMethod("open", "CNSetLM", function(con,...){