Browse code

commit after bioc rebase

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@52859 bc3139a8-67e5-0310-9ffc-ced21a209358

Rob Scharp authored on 16/02/2011 16:19:11
Showing16 changed files

... ...
@@ -1,3 +1,5 @@
1
+test*
2
+myCrlmmGT2.Rnw
1 3
 ffobjs*
2 4
 auto*
3 5
 ModelForPolymorphicX.pdf
... ...
@@ -1,7 +1,7 @@
1 1
 Package: crlmm
2 2
 Type: Package
3 3
 Title: Genotype Calling (CRLMM) and Copy Number Analysis tool for Affymetrix SNP 5.0 and 6.0 and Illumina arrays.
4
-Version: 1.9.2
4
+Version: 1.9.13
5 5
 Date: 2010-12-10
6 6
 Author: Benilton S Carvalho <Benilton.Carvalho@cancer.org.uk>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.edu.au>, Ingo Ruczinski <iruczins@jhsph.edu>, Rafael A Irizarry
7 7
 Maintainer: Benilton S Carvalho <Benilton.Carvalho@cancer.org.uk>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.EDU.AU>
... ...
@@ -24,7 +24,8 @@ Imports: affyio (>= 1.19.2),
24 24
 Suggests: hapmapsnp6,
25 25
           genomewidesnp6Crlmm (>= 1.0.4),
26 26
           snpMatrix,
27
-	  ellipse
27
+	  ellipse,
28
+	  RUnit
28 29
 Collate: AllGenerics.R
29 30
 	 AllClasses.R
30 31
 	 methods-AssayData.R
... ...
@@ -39,5 +40,6 @@ Collate: AllGenerics.R
39 40
 	 plot-methods.R
40 41
          utils.R
41 42
          zzz.R
43
+	 test_crlmm_package.R
42 44
 LazyLoad: yes
43 45
 biocViews: Microarray, Preprocessing, SNP, Bioinformatics, CopyNumberVariants
... ...
@@ -1,7 +1,4 @@
1 1
 useDynLib("crlmm", .registration=TRUE)
2
-## this is temporary
3
-exportPattern("^[^\\.]")
4
-
5 2
 ##---------------------------------------------------------------------------
6 3
 ## Biobase
7 4
 ##---------------------------------------------------------------------------
... ...
@@ -47,7 +47,7 @@ getFeatureData <- function(cdfName, copynumber=FALSE){
47 47
 		M[index, "isSnp"] <- 0L
48 48
 	}
49 49
 	##A few of the snpProbes do not match -- I think it is chromosome Y.
50
-	M[is.na(M[, "isSnp"]), "isSnp"] <- 1L
50
+	M[is.na(M[, "chromosome"]), "isSnp"] <- 1L
51 51
 	M <- data.frame(M)
52 52
 	return(new("AnnotatedDataFrame", data=M))
53 53
 }
... ...
@@ -60,6 +60,9 @@ construct <- function(filenames,
60 60
 	if(!missing(batch)){
61 61
 		stopifnot(length(batch) == length(sns))
62 62
 	}
63
+	if(any(is.na(batch))){
64
+		stop("NA's in batch variable")
65
+	}
63 66
 	if(missing(sns) & missing(filenames)) stop("one of filenames or samplenames (sns) must be provided")
64 67
 	if(verbose) message("Initializing container for copy number estimation")
65 68
 	featureData <- getFeatureData(cdfName, copynumber=copynumber)
... ...
@@ -109,7 +112,7 @@ genotype <- function(filenames,
109 112
 		       SNRMin=5,
110 113
 		       recallMin=10,
111 114
 		       recallRegMin=1000,
112
-		       gender=NULL,
115
+		     gender=NULL,
113 116
 		       returnParams=TRUE,
114 117
 		       badSNP=0.7){
115 118
 	is.lds <- ifelse(isPackageLoaded("ff"), TRUE, FALSE)
... ...
@@ -151,7 +154,7 @@ genotype <- function(filenames,
151 154
 		 mixtureSampleSize=mixtureSampleSize, pkgname=pkgname,
152 155
 		 neededPkgs=c("crlmm", pkgname))
153 156
 	## Now we initialize a CNSet object, cloning A and B
154
-	message("Cloning A and B objects")
157
+	if(verbose) message("Cloning A and B matrices to store genotype calls and confidence scores.")
155 158
 	## The clones will be used to store calls and confidence scores
156 159
 	open(A)
157 160
 	open(B)
... ...
@@ -171,15 +174,13 @@ genotype <- function(filenames,
171 174
 		     batch=batch)
172 175
 	if(!missing(sns)){
173 176
 		sampleNames(cnSet) <- sns
174
-		open(A)
175
-		protocolData <- annotatedDataFrameFrom(A(cnSet), byrow=FALSE)
176
-		close(A)
177 177
 	} else {
178 178
 		sampleNames(cnSet) <- basename(filenames)
179
-		protocolData <- getProtocolData.Affy(filenames)
180 179
 	}
180
+	protocolData <- getProtocolData.Affy(filenames)
181 181
 	rownames(pData(protocolData)) <- sns
182 182
 	protocolData(cnSet) <- protocolData
183
+	##protocolData(cnSet) <- protocolData
183 184
 	pd <- data.frame(matrix(NA, nc, 3), row.names=sns)
184 185
 	colnames(pd)=c("SKW", "SNR", "gender")
185 186
 	phenoData(cnSet) <- new("AnnotatedDataFrame", data=pd)
... ...
@@ -213,7 +214,7 @@ genotype <- function(filenames,
213 214
 			  returnParams=returnParams,
214 215
 			  badSNP=badSNP)
215 216
 	if(verbose) message("Genotyping finished.  Updating container with genotype calls and confidence scores.")
216
-	cnSet$gender <- tmp$gender
217
+	cnSet$gender <- tmp[["gender"]]
217 218
 	return(cnSet)
218 219
 }
219 220
 
... ...
@@ -359,37 +360,39 @@ fit.wls <- function(NN, sigma, allele, Y, autosome, X){
359 360
 }
360 361
 
361 362
 shrinkGenotypeSummaries <- function(strata, index.list, object, MIN.OBS, MIN.SAMPLES, DF.PRIOR,
362
-				    verbose, is.lds){
363
-	if(is.lds) {physical <- get("physical"); open(object)}
363
+				    verbose, is.lds=TRUE){
364
+	##if(is.lds) {physical <- get("physical"); open(object)}
365
+	open(object)
364 366
 	if(verbose) message("      Probe stratum ", strata, " of ", length(index.list))
365 367
 	marker.index <- index.list[[strata]]
366 368
 	batches <- split(seq_along(batch(object)), as.character(batch(object)))
367 369
 	batches <- batches[sapply(batches, length) >= MIN.SAMPLES]
368
-	batchnames <- batchNames(object)
369
-	N.AA <- as.matrix(N.AA(object)[marker.index, ])
370
-	N.AB <- as.matrix(N.AB(object)[marker.index, ])
371
-	N.BB <- as.matrix(N.BB(object)[marker.index, ])
372
-	medianA.AA <- as.matrix(medianA.AA(object)[marker.index,])
373
-	medianA.AB <- as.matrix(medianA.AB(object)[marker.index,])
374
-	medianA.BB <- as.matrix(medianA.BB(object)[marker.index,])
375
-	medianB.AA <- as.matrix(medianB.AA(object)[marker.index,])
376
-	medianB.AB <- as.matrix(medianB.AB(object)[marker.index,])
377
-	medianB.BB <- as.matrix(medianB.BB(object)[marker.index,])
378
-	madA.AA <- as.matrix(madA.AA(object)[marker.index,])
379
-	madA.AB <- as.matrix(madA.AB(object)[marker.index,])
380
-	madA.BB <- as.matrix(madA.BB(object)[marker.index,])
381
-	madB.AA <- as.matrix(madB.AA(object)[marker.index,])
382
-	madB.AB <- as.matrix(madB.AB(object)[marker.index,])
383
-	madB.BB <- as.matrix(madB.BB(object)[marker.index,])
384
-	medianA <- medianB <- shrink.madB <- shrink.madA <- vector("list", length(batchnames))
385
-	shrink.tau2A.AA <- tau2A.AA <- as.matrix(tau2A.AA(object)[marker.index,])
386
-	shrink.tau2B.BB <- tau2B.BB <- as.matrix(tau2B.BB(object)[marker.index,])
387
-	shrink.tau2A.BB <- tau2A.BB <- as.matrix(tau2A.BB(object)[marker.index,])
388
-	shrink.tau2B.AA <- tau2B.AA <- as.matrix(tau2B.AA(object)[marker.index,])
389
-	shrink.corrAA <- corrAA <- as.matrix(corrAA(object)[marker.index, ])
390
-	shrink.corrAB <- corrAB <- as.matrix(corrAB(object)[marker.index, ])
391
-	shrink.corrBB <- corrBB <- as.matrix(corrBB(object)[marker.index, ])
392
-	flags <- as.matrix(flags(object)[marker.index, ])
370
+	batch.names <- batchNames(object)
371
+	batch.index <- which(batchNames(object) %in% batch.names)
372
+	N.AA <- as.matrix(N.AA(object)[marker.index, batch.index])
373
+	N.AB <- as.matrix(N.AB(object)[marker.index, batch.index])
374
+	N.BB <- as.matrix(N.BB(object)[marker.index, batch.index])
375
+	medianA.AA <- as.matrix(medianA.AA(object)[marker.index, batch.index])
376
+	medianA.AB <- as.matrix(medianA.AB(object)[marker.index, batch.index])
377
+	medianA.BB <- as.matrix(medianA.BB(object)[marker.index, batch.index])
378
+	medianB.AA <- as.matrix(medianB.AA(object)[marker.index, batch.index])
379
+	medianB.AB <- as.matrix(medianB.AB(object)[marker.index, batch.index])
380
+	medianB.BB <- as.matrix(medianB.BB(object)[marker.index, batch.index])
381
+	madA.AA <- as.matrix(madA.AA(object)[marker.index, batch.index])
382
+	madA.AB <- as.matrix(madA.AB(object)[marker.index, batch.index])
383
+	madA.BB <- as.matrix(madA.BB(object)[marker.index, batch.index])
384
+	madB.AA <- as.matrix(madB.AA(object)[marker.index, batch.index])
385
+	madB.AB <- as.matrix(madB.AB(object)[marker.index, batch.index])
386
+	madB.BB <- as.matrix(madB.BB(object)[marker.index, batch.index])
387
+	medianA <- medianB <- shrink.madB <- shrink.madA <- vector("list", length(batch.names))
388
+	shrink.tau2A.AA <- tau2A.AA <- as.matrix(tau2A.AA(object)[marker.index, batch.index])
389
+	shrink.tau2B.BB <- tau2B.BB <- as.matrix(tau2B.BB(object)[marker.index, batch.index])
390
+	shrink.tau2A.BB <- tau2A.BB <- as.matrix(tau2A.BB(object)[marker.index, batch.index])
391
+	shrink.tau2B.AA <- tau2B.AA <- as.matrix(tau2B.AA(object)[marker.index, batch.index])
392
+	shrink.corrAA <- corrAA <- as.matrix(corrAA(object)[marker.index, batch.index])
393
+	shrink.corrAB <- corrAB <- as.matrix(corrAB(object)[marker.index, batch.index])
394
+	shrink.corrBB <- corrBB <- as.matrix(corrBB(object)[marker.index, batch.index])
395
+	flags <- as.matrix(flags(object)[marker.index, batch.index])
393 396
 	for(k in seq(along=batches)){
394 397
 		sample.index <- batches[[k]]
395 398
 		this.batch <- unique(as.character(batch(object)[sample.index]))
... ...
@@ -422,9 +425,6 @@ shrinkGenotypeSummaries <- function(strata, index.list, object, MIN.OBS, MIN.SAM
422 425
 		## SNPs that we'll use for imputing location/scale of unobserved genotypes
423 426
 		##---------------------------------------------------------------------------
424 427
 		index.complete <- indexComplete(NN, medianA[[k]], medianB[[k]], MIN.OBS)
425
-		if(length(index.complete) == 1){
426
-			if(index.complete == FALSE) return()
427
-		}
428 428
 		##
429 429
 		##---------------------------------------------------------------------------
430 430
 		## Impute sufficient statistics for unobserved genotypes (plate-specific)
... ...
@@ -459,46 +459,32 @@ shrinkGenotypeSummaries <- function(strata, index.list, object, MIN.OBS, MIN.SAM
459 459
 		negB <- rowSums(medianB[[k]] < 0) > 0
460 460
 		flags[, k] <- as.integer(rowSums(NN == 0) > 0 | negA | negB)
461 461
 	}
462
-	if(length(batches) >= 3){
463
-		if(verbose) message("Imputing centers for unobserved genotypes from other batches")
464
-		res <- imputeAcrossBatch(N.AA, N.AB, N.BB,
465
-					 medianA.AA, medianA.AB, medianA.BB,
466
-					 medianB.AA, medianB.AB, medianB.BB)
467
-		medianA.AA <- res[["medianA.AA"]]
468
-		medianA.AB <- res[["medianA.AB"]]
469
-		medianA.BB <- res[["medianA.BB"]]
470
-		medianB.AA <- res[["medianB.AA"]]
471
-		medianB.AB <- res[["medianB.AB"]]
472
-		medianB.BB <- res[["medianB.BB"]]
473
-		updated <- res[["updated"]]
474
-	}
475
-	flags(object)[marker.index, ] <- flags
476
-	medianA.AA(object)[marker.index, ] <- do.call("cbind", lapply(medianA, function(x) x[, 1]))
477
-	medianA.AB(object)[marker.index, ] <- do.call("cbind", lapply(medianA, function(x) x[, 2]))
478
-	medianA.BB(object)[marker.index, ] <- do.call("cbind", lapply(medianA, function(x) x[, 3]))
479
-	medianB.AA(object)[marker.index, ] <- do.call("cbind", lapply(medianB, function(x) x[, 1]))
480
-	medianB.AB(object)[marker.index, ] <- do.call("cbind", lapply(medianB, function(x) x[, 2]))
481
-	medianB.BB(object)[marker.index, ] <- do.call("cbind", lapply(medianB, function(x) x[, 3]))
462
+	flags(object)[marker.index, batch.index] <- flags
463
+	medianA.AA(object)[marker.index, batch.index] <- do.call("cbind", lapply(medianA, function(x) x[, 1]))
464
+	medianA.AB(object)[marker.index, batch.index] <- do.call("cbind", lapply(medianA, function(x) x[, 2]))
465
+	medianA.BB(object)[marker.index, batch.index] <- do.call("cbind", lapply(medianA, function(x) x[, 3]))
466
+	medianB.AA(object)[marker.index, batch.index] <- do.call("cbind", lapply(medianB, function(x) x[, 1]))
467
+	medianB.AB(object)[marker.index, batch.index] <- do.call("cbind", lapply(medianB, function(x) x[, 2]))
468
+	medianB.BB(object)[marker.index, batch.index] <- do.call("cbind", lapply(medianB, function(x) x[, 3]))
482 469
 	##
483
-	madA.AA(object)[marker.index, ] <- do.call("cbind", lapply(shrink.madA, function(x) x[, 1]))
484
-	madA.AB(object)[marker.index, ] <- do.call("cbind", lapply(shrink.madA, function(x) x[, 2]))
485
-	madA.BB(object)[marker.index, ] <- do.call("cbind", lapply(shrink.madA, function(x) x[, 3]))
486
-	madB.AA(object)[marker.index, ] <- do.call("cbind", lapply(shrink.madB, function(x) x[, 1]))
487
-	madB.AB(object)[marker.index, ] <- do.call("cbind", lapply(shrink.madB, function(x) x[, 2]))
488
-	madB.BB(object)[marker.index, ] <- do.call("cbind", lapply(shrink.madB, function(x) x[, 3]))
470
+	madA.AA(object)[marker.index, batch.index] <- do.call("cbind", lapply(shrink.madA, function(x) x[, 1]))
471
+	madA.AB(object)[marker.index, batch.index] <- do.call("cbind", lapply(shrink.madA, function(x) x[, 2]))
472
+	madA.BB(object)[marker.index, batch.index] <- do.call("cbind", lapply(shrink.madA, function(x) x[, 3]))
473
+	madB.AA(object)[marker.index, batch.index] <- do.call("cbind", lapply(shrink.madB, function(x) x[, 1]))
474
+	madB.AB(object)[marker.index, batch.index] <- do.call("cbind", lapply(shrink.madB, function(x) x[, 2]))
475
+	madB.BB(object)[marker.index, batch.index] <- do.call("cbind", lapply(shrink.madB, function(x) x[, 3]))
489 476
 	##
490
-	corrAA(object)[marker.index, ] <- shrink.corrAA
491
-	corrAB(object)[marker.index, ] <- shrink.corrAB
492
-	corrBB(object)[marker.index, ] <- shrink.corrBB
493
-	tau2A.AA(object)[marker.index,] <- shrink.tau2A.AA
494
-	tau2A.BB(object)[marker.index,] <- shrink.tau2A.BB
495
-	tau2B.AA(object)[marker.index,] <- shrink.tau2B.AA
496
-	tau2B.BB(object)[marker.index,] <- shrink.tau2B.BB
477
+	corrAA(object)[marker.index, batch.index] <- shrink.corrAA
478
+	corrAB(object)[marker.index, batch.index] <- shrink.corrAB
479
+	corrBB(object)[marker.index, batch.index] <- shrink.corrBB
480
+	tau2A.AA(object)[marker.index, batch.index] <- shrink.tau2A.AA
481
+	tau2A.BB(object)[marker.index, batch.index] <- shrink.tau2A.BB
482
+	tau2B.AA(object)[marker.index, batch.index] <- shrink.tau2B.AA
483
+	tau2B.BB(object)[marker.index, batch.index] <- shrink.tau2B.BB
497 484
 	if(is.lds) return(TRUE) else return(object)
498 485
 }
499 486
 
500 487
 
501
-
502 488
 fit.lm1 <- function(strata,
503 489
 		    index.list,
504 490
 		    object,
... ...
@@ -513,34 +499,38 @@ fit.lm1 <- function(strata,
513 499
 	snps <- index.list[[strata]]
514 500
 	batches <- split(seq_along(batch(object)), as.character(batch(object)))
515 501
 	batches <- batches[sapply(batches, length) >= MIN.SAMPLES]
516
-	batchnames <- batchNames(object)
517
-	N.AA <- as.matrix(N.AA(object)[snps, ])
518
-	N.AB <- as.matrix(N.AB(object)[snps, ])
519
-	N.BB <- as.matrix(N.BB(object)[snps, ])
520
-	medianA.AA <- as.matrix(medianA.AA(object)[snps,])
521
-	medianA.AB <- as.matrix(medianA.AB(object)[snps,])
522
-	medianA.BB <- as.matrix(medianA.BB(object)[snps,])
523
-	medianB.AA <- as.matrix(medianB.AA(object)[snps,])
524
-	medianB.AB <- as.matrix(medianB.AB(object)[snps,])
525
-	medianB.BB <- as.matrix(medianB.BB(object)[snps,])
526
-	madA.AA <- as.matrix(madA.AA(object)[snps,])
527
-	madA.AB <- as.matrix(madA.AB(object)[snps,])
528
-	madA.BB <- as.matrix(madA.BB(object)[snps,])
529
-	madB.AA <- as.matrix(madB.AA(object)[snps,])
530
-	madB.AB <- as.matrix(madB.AB(object)[snps,])
531
-	madB.BB <- as.matrix(madB.BB(object)[snps,])
532
-	tau2A.AA <- as.matrix(tau2A.AA(object)[snps,])
533
-	tau2B.BB <- as.matrix(tau2B.BB(object)[snps,])
534
-	tau2A.BB <- as.matrix(tau2A.BB(object)[snps,])
535
-	tau2B.AA <- as.matrix(tau2B.AA(object)[snps,])
536
-	corrAA <- as.matrix(corrAA(object)[snps, ])
537
-	corrAB <- as.matrix(corrAB(object)[snps, ])
538
-	corrBB <- as.matrix(corrBB(object)[snps, ])
539
-	nuA <- as.matrix(nuA(object)[snps, ])
540
-	phiA <- as.matrix(phiA(object)[snps, ])
541
-	nuB <- as.matrix(nuB(object)[snps, ])
542
-	phiB <- as.matrix(phiB(object)[snps, ])
543
-	flags <- as.matrix(flags(object)[snps, ])
502
+	##batchnames <- batchNames(object)
503
+	batch.names <- names(batches)
504
+	batch.index <- which(batchNames(object) %in% batch.names)
505
+	if(length(batches) > 1 && "grandMean" %in% batchNames(object))
506
+		batch.index <- c(batch.index, match("grandMean", batchNames(object)))
507
+	N.AA <- as.matrix(N.AA(object)[snps, batch.index])
508
+	N.AB <- as.matrix(N.AB(object)[snps, batch.index])
509
+	N.BB <- as.matrix(N.BB(object)[snps, batch.index])
510
+	medianA.AA <- as.matrix(medianA.AA(object)[snps, batch.index])
511
+	medianA.AB <- as.matrix(medianA.AB(object)[snps, batch.index])
512
+	medianA.BB <- as.matrix(medianA.BB(object)[snps, batch.index])
513
+	medianB.AA <- as.matrix(medianB.AA(object)[snps, batch.index])
514
+	medianB.AB <- as.matrix(medianB.AB(object)[snps, batch.index])
515
+	medianB.BB <- as.matrix(medianB.BB(object)[snps, batch.index])
516
+	madA.AA <- as.matrix(madA.AA(object)[snps, batch.index])
517
+	madA.AB <- as.matrix(madA.AB(object)[snps, batch.index])
518
+	madA.BB <- as.matrix(madA.BB(object)[snps, batch.index])
519
+	madB.AA <- as.matrix(madB.AA(object)[snps, batch.index])
520
+	madB.AB <- as.matrix(madB.AB(object)[snps, batch.index])
521
+	madB.BB <- as.matrix(madB.BB(object)[snps, batch.index])
522
+	tau2A.AA <- as.matrix(tau2A.AA(object)[snps, batch.index])
523
+	tau2B.BB <- as.matrix(tau2B.BB(object)[snps, batch.index])
524
+	tau2A.BB <- as.matrix(tau2A.BB(object)[snps, batch.index])
525
+	tau2B.AA <- as.matrix(tau2B.AA(object)[snps, batch.index])
526
+	corrAA <- as.matrix(corrAA(object)[snps, batch.index])
527
+	corrAB <- as.matrix(corrAB(object)[snps, batch.index])
528
+	corrBB <- as.matrix(corrBB(object)[snps, batch.index])
529
+	nuA <- as.matrix(nuA(object)[snps,  batch.index])
530
+	phiA <- as.matrix(phiA(object)[snps, batch.index])
531
+	nuB <- as.matrix(nuB(object)[snps, batch.index])
532
+	phiB <- as.matrix(phiB(object)[snps, batch.index])
533
+	flags <- as.matrix(flags(object)[snps, batch.index])
544 534
 	for(k in seq(along=batches)){
545 535
 		B <- batches[[k]]
546 536
 		if(length(B) < MIN.SAMPLES) next()
... ...
@@ -550,7 +540,29 @@ fit.lm1 <- function(strata,
550 540
 		madA <- cbind(madA.AA[, k], madA.AB[, k], madA.BB[, k])
551 541
 		madB <- cbind(madB.AA[, k], madB.AB[, k], madB.BB[, k])
552 542
 		NN <- cbind(N.AA[, k], N.AB[, k], N.BB[, k])
553
-		## we're regressing on the medians using the standard errors (hence the division by N) as weights
543
+		## regress on the medians using the standard errors (hence the division by N) as weights
544
+		res <- fit.wls(NN=NN, sigma=madA, allele="A", Y=medianA, autosome=!CHR.X)
545
+		nuA[, k] <- res[1, ]
546
+		phiA[, k] <- res[2, ]
547
+		rm(res)
548
+		res <- fit.wls(NN=NN, sigma=madB, allele="B", Y=medianB, autosome=!CHR.X)##allele="B", Ystar=YB, W=wB, Ns=Ns)
549
+		nuB[, k] <- res[1, ]
550
+		phiB[, k] <- res[2, ]
551
+	}
552
+	##---------------------------------------------------------------------------
553
+	##
554
+	## Grand mean
555
+	##
556
+	##---------------------------------------------------------------------------
557
+	if(length(batches) > 1 && "grandMean" %in% batchNames(object)){
558
+		## then the last column is for the grandMean
559
+		k <- ncol(medianA.AA)
560
+		medianA <- cbind(medianA.AA[, k], medianA.AB[, k], medianA.BB[, k])
561
+		medianB <- cbind(medianB.AA[, k], medianB.AB[, k], medianB.BB[, k])
562
+		madA <- cbind(madA.AA[, k], madA.AB[, k], madA.BB[, k])
563
+		madB <- cbind(madB.AA[, k], madB.AB[, k], madB.BB[, k])
564
+		NN <- cbind(N.AA[, k], N.AB[, k], N.BB[, k])
565
+		##index <- which(rowSums(is.na(medianA)) > 0)
554 566
 		res <- fit.wls(NN=NN, sigma=madA, allele="A", Y=medianA, autosome=!CHR.X)
555 567
 		nuA[, k] <- res[1, ]
556 568
 		phiA[, k] <- res[2, ]
... ...
@@ -558,8 +570,6 @@ fit.lm1 <- function(strata,
558 570
 		res <- fit.wls(NN=NN, sigma=madB, allele="B", Y=medianB, autosome=!CHR.X)##allele="B", Ystar=YB, W=wB, Ns=Ns)
559 571
 		nuB[, k] <- res[1, ]
560 572
 		phiB[, k] <- res[2, ]
561
-##		cA[, k] <- matrix((1/phiA[, J]*(A-nuA[, J])), nrow(A), ncol(A))
562
-##		cB[, k] <- matrix((1/phiB[, J]*(B-nuB[, J])), nrow(B), ncol(B))
563 573
 	}
564 574
 	if(THR.NU.PHI){
565 575
 		nuA[nuA < MIN.NU] <- MIN.NU
... ...
@@ -567,10 +577,10 @@ fit.lm1 <- function(strata,
567 577
 		phiA[phiA < MIN.PHI] <- MIN.PHI
568 578
 		phiB[phiB < MIN.PHI] <- MIN.PHI
569 579
 	}
570
-	nuA(object)[snps, ] <- nuA
571
-	nuB(object)[snps, ] <- nuB
572
-	phiA(object)[snps, ] <- phiA
573
-	phiB(object)[snps, ] <- phiB
580
+	nuA(object)[snps, batch.index] <- nuA
581
+	nuB(object)[snps, batch.index] <- nuB
582
+	phiA(object)[snps, batch.index] <- phiA
583
+	phiB(object)[snps, batch.index] <- phiB
574 584
 	if(is.lds){
575 585
 		close(object)
576 586
 		return(TRUE)
... ...
@@ -593,30 +603,34 @@ fit.lm2 <- function(strata,
593 603
 	marker.index <- index.list[[strata]]
594 604
 	batches <- split(seq_along(batch(object)), as.character(batch(object)))
595 605
 	batches <- batches[sapply(batches, length) >= MIN.SAMPLES]
596
-
606
+	batch.names <- names(batches)
607
+	batch.index <- which(batchNames(object) %in% batch.names)
608
+	##
597 609
 	ii <- isSnp(object) & chromosome(object) < 23 & !is.na(chromosome(object))
598
-	flags <- as.matrix(flags(object)[ii, ])
610
+	flags <- as.matrix(flags(object)[ii, batch.index])
599 611
 	fns <- featureNames(object)[ii]
600 612
 	fns.noflags <- fns[rowSums(flags, na.rm=T) == 0]
601 613
 	snp.index <- sample(match(fns.noflags, featureNames(object)), 5000)
602
-
603
-	nuA.np <- as.matrix(nuA(object)[marker.index, ])
604
-	phiA.np <- as.matrix(phiA(object)[marker.index, ])
605
-	tau2A.AA <- as.matrix(tau2A.AA(object)[marker.index, ])
606
-
607
-	nuA.snp <- as.matrix(nuA(object)[snp.index, ])
608
-	nuB.snp <- as.matrix(nuB(object)[snp.index, ])
609
-	phiA.snp <- as.matrix(phiA(object)[snp.index, ])
610
-	phiB.snp <- as.matrix(phiB(object)[snp.index, ])
611
-	medianA.AA <- as.matrix(medianA.AA(object)[snp.index,])
612
-	medianB.BB <- as.matrix(medianB.BB(object)[snp.index,])
613
-
614
-	medianA.AA.np <- as.matrix(medianA.AA(object)[marker.index,])
614
+	##
615
+	nuA.np <- as.matrix(nuA(object)[marker.index, batch.index])
616
+	phiA.np <- as.matrix(phiA(object)[marker.index, batch.index])
617
+	tau2A.AA <- as.matrix(tau2A.AA(object)[marker.index, batch.index])
618
+	##
619
+	nuA.snp <- as.matrix(nuA(object)[snp.index, batch.index])
620
+	nuB.snp <- as.matrix(nuB(object)[snp.index, batch.index])
621
+	phiA.snp <- as.matrix(phiA(object)[snp.index, batch.index])
622
+	phiB.snp <- as.matrix(phiB(object)[snp.index, batch.index])
623
+	medianA.AA <- as.matrix(medianA.AA(object)[snp.index, batch.index])
624
+	medianB.BB <- as.matrix(medianB.BB(object)[snp.index, batch.index])
625
+	medianA.AA.np <- as.matrix(medianA.AA(object)[marker.index, batch.index])
615 626
 	for(k in seq_along(batches)){
616 627
 		B <- batches[[k]]
617 628
 		this.batch <- unique(as.character(batch(object)[B]))
618 629
 		X <- cbind(1, log2(c(medianA.AA[, k], medianB.BB[, k])))
619 630
 		Y <- log2(c(phiA.snp[, k], phiB.snp[, k]))
631
+		not.missing <- rowSums(is.na(X)) == 0 & !is.na(Y)
632
+		X <- X[not.missing, ]
633
+		Y <- Y[not.missing]
620 634
 		betahat <- solve(crossprod(X), crossprod(X, Y))
621 635
 		##crosshyb <- max(median(medianA.AA[, k]) - median(medianA.AA.np[, k]), 0)
622 636
 		crosshyb <- 0
... ...
@@ -630,8 +644,8 @@ fit.lm2 <- function(strata,
630 644
 		nuA.np[nuA.np < MIN.NU] <- MIN.NU
631 645
 		phiA.np[phiA.np < MIN.PHI] <- MIN.PHI
632 646
 	}
633
-	nuA(object)[marker.index, ] <- nuA.np
634
-	phiA(object)[marker.index, ] <- phiA.np
647
+	nuA(object)[marker.index, batch.index] <- nuA.np
648
+	phiA(object)[marker.index, batch.index] <- phiA.np
635 649
 	if(is.lds) { close(object); return(TRUE)}
636 650
 	return(object)
637 651
 }
... ...
@@ -645,6 +659,7 @@ summarizeMaleXNps <- function(marker.index,
645 659
 	gender <- object$gender
646 660
 	AA <- as.matrix(A(object)[marker.index, gender==1])
647 661
 	madA.AA <- medianA.AA <- matrix(NA, nr, nc)
662
+	colnames(madA.AA) <- colnames(medianA.AA) <- batchNames(object)
648 663
 	numberMenPerBatch <- rep(NA, nc)
649 664
 	for(k in seq_along(batches)){
650 665
 		B <- batches[[k]]
... ...
@@ -656,8 +671,8 @@ summarizeMaleXNps <- function(marker.index,
656 671
 		sns <- colnames(AA)
657 672
 		J <- sns%in%sns.batch
658 673
 		numberMenPerBatch[k] <- length(J)
659
-		medianA.AA[, k] <- rowMedians(AA[, J], na.rm=TRUE)
660
-		madA.AA[, k] <- rowMAD(AA[, J], na.rm=TRUE)
674
+		medianA.AA[, k] <- rowMedians(AA[, J, drop=FALSE], na.rm=TRUE)
675
+		madA.AA[, k] <- rowMAD(AA[, J, drop=FALSE], na.rm=TRUE)
661 676
 	}
662 677
 	return(list(medianA.AA=medianA.AA,
663 678
 		    madA.AA=madA.AA))
... ...
@@ -665,29 +680,35 @@ summarizeMaleXNps <- function(marker.index,
665 680
 
666 681
 
667 682
 summarizeXGenotypes <- function(marker.index,
668
-				    batches,
669
-				    object,
670
-				    GT.CONF.THR,
671
-				    MIN.OBS,
672
-				    MIN.SAMPLES,
673
-				    verbose,
674
-				    is.lds,
675
-				    DF.PRIOR,
676
-				    gender="male", ...){
683
+				batches,
684
+				object,
685
+				GT.CONF.THR,
686
+				MIN.OBS,
687
+				MIN.SAMPLES,
688
+				verbose,
689
+				is.lds,
690
+				DF.PRIOR,
691
+				gender="male", ...){
692
+	I <- unlist(batches)
677 693
 	if(gender == "male"){
678
-		sample.index <- which(object$gender==1)
679
-	} else sample.index <- which(object$gender==2)
694
+		gender.index <- which(object$gender == 1)
695
+	} else {
696
+		gender.index <- which(object$gender == 2)
697
+	}
698
+	batches <- lapply(batches, function(x, gender.index) intersect(x, gender.index), gender.index)
699
+	batch.names <- names(batches)
700
+	batch.index <- which(batchNames(object) %in% batch.names)
680 701
 	nr <- length(marker.index)
681
-	nc <- length(batchNames(object))
682
-##	NN.Mlist <- imputed.medianA <- imputed.medianB <- shrink.madA <- shrink.madB <- vector("list", nc)
702
+	nc <- length(batch.index)
683 703
 	NN.Mlist <- medianA <- medianB <- shrink.madA <- shrink.madB <- vector("list", nc)
684
-	##gender <- object$gender
685
-	GG <- as.matrix(calls(object)[marker.index, sample.index])
686
-	CP <- as.matrix(snpCallProbability(object)[marker.index, sample.index])
687
-	AA <- as.matrix(A(object)[marker.index, sample.index])
688
-	BB <- as.matrix(B(object)[marker.index, sample.index])
704
+	names(NN.Mlist) <- names(medianA) <- names(medianB) <- names(shrink.madA) <- names(shrink.madB) <- batch.names
705
+	GG <- as.matrix(calls(object)[marker.index, ])
706
+	CP <- as.matrix(snpCallProbability(object)[marker.index, ])
707
+	AA <- as.matrix(A(object)[marker.index, ])
708
+	BB <- as.matrix(B(object)[marker.index, ])
689 709
 	for(k in seq_along(batches)){
690 710
 		sample.index <- batches[[k]]
711
+		if(length(sample.index) == 0) next()
691 712
 		this.batch <- unique(as.character(batch(object)[sample.index]))
692 713
 		##gender <- object$gender[sample.index]
693 714
 		if(length(sample.index) < MIN.SAMPLES) next()
... ...
@@ -695,12 +716,13 @@ summarizeXGenotypes <- function(marker.index,
695 716
 		##subset GG apppriately
696 717
 		sns <- colnames(GG)
697 718
 		J <- sns%in%sns.batch
698
-		G <- GG[, J]
699
-		xx <- CP[, J]
719
+		G <- GG[, J, drop=FALSE]
720
+		xx <- CP[, J, drop=FALSE]
721
+		p <- i2p(xx)
700 722
 		highConf <- (1-exp(-xx/1000)) > GT.CONF.THR
701 723
 		G <- G*highConf
702
-		A <- AA[, J]
703
-		B <- BB[, J]
724
+		A <- AA[, J, drop=FALSE]
725
+		B <- BB[, J, drop=FALSE]
704 726
 		G.AA <- G==1
705 727
 		G.AA[G.AA==FALSE] <- NA
706 728
 		G.AB <- G==2
... ...
@@ -774,6 +796,18 @@ summarizeXGenotypes <- function(marker.index,
774 796
 			## AB and AA observed
775 797
 			unobserved.index[[3]] <- which(unobservedBB & (NN[, 2] >= MIN.OBS & NN[, 1] >= MIN.OBS))
776 798
 			res <- imputeCenter(medianA[[k]], medianB[[k]], index.complete, unobserved.index)
799
+			medianA[[k]] <- res[[1]]
800
+			medianB[[k]] <- res[[2]]
801
+
802
+			## RS: For Monomorphic SNPs a mixture model may be better
803
+			## RS: Further, we can improve estimation by borrowing strength across batch
804
+			unobserved.index[[1]] <- which(unobservedAA & unobservedAB)
805
+			unobserved.index[[2]] <- which(unobservedBB & unobservedAB)
806
+			unobserved.index[[3]] <- which(unobservedAA & unobservedBB) ## strange
807
+			res <- imputeCentersForMonomorphicSnps(medianA[[k]], medianB[[k]],
808
+							       index.complete,
809
+							       unobserved.index)
810
+			medianA[[k]] <- res[[1]]; medianB[[k]] <- res[[2]]
777 811
 		}
778 812
 		medianA[[k]] <- res[[1]]
779 813
 		medianB[[k]] <- res[[2]]
... ...
@@ -798,7 +832,7 @@ fit.lm3 <- function(strata,
798 832
 		    MIN.NU,
799 833
 		    MIN.PHI,
800 834
 		    verbose, is.lds, CHR.X, ...){
801
-	if(is.lds) {physical <- get("physical"); open(object)}
835
+	##if(is.lds) {physical <- get("physical"); open(object)}
802 836
 	if(verbose) message("      Probe stratum ", strata, " of ", length(index.list))
803 837
 	gender <- object$gender
804 838
 	enough.males <- sum(gender==1) >= MIN.SAMPLES
... ...
@@ -810,22 +844,25 @@ fit.lm3 <- function(strata,
810 844
 	marker.index <- index.list[[strata]]
811 845
 	batches <- split(seq_along(batch(object)), as.character(batch(object)))
812 846
 	batches <- batches[sapply(batches, length) >= MIN.SAMPLES]
813
-	nuA <- as.matrix(nuA(object)[marker.index, ])
814
-	nuB <- as.matrix(nuB(object)[marker.index, ])
815
-	phiA <- as.matrix(phiA(object)[marker.index, ])
816
-	phiB <- as.matrix(phiB(object)[marker.index, ])
817
-	phiA2 <- as.matrix(phiPrimeA(object)[marker.index, ])
818
-	phiB2 <- as.matrix(phiPrimeB(object)[marker.index, ])
847
+	batch.names <- names(batches)
848
+	batch.index <- which(batchNames(object) %in% batch.names)
849
+
850
+	nuA <- as.matrix(nuA(object)[marker.index, batch.index])
851
+	nuB <- as.matrix(nuB(object)[marker.index, batch.index])
852
+	phiA <- as.matrix(phiA(object)[marker.index, batch.index])
853
+	phiB <- as.matrix(phiB(object)[marker.index, batch.index])
854
+	phiA2 <- as.matrix(phiPrimeA(object)[marker.index, batch.index])
855
+	phiB2 <- as.matrix(phiPrimeB(object)[marker.index, batch.index])
819 856
 	if(enough.males){
820 857
 		res <- summarizeXGenotypes(marker.index=marker.index,
821
-					       batches=batches,
822
-					       object=object,
823
-					       GT.CONF.THR=GT.CONF.THR,
824
-					       MIN.SAMPLES=MIN.SAMPLES,
825
-					       MIN.OBS=MIN.OBS,
826
-					       verbose=verbose,
827
-					       is.lds=is.lds,
828
-					       DF.PRIOR=DF.PRIOR/2,
858
+					   batches=batches,
859
+					   object=object,
860
+					   GT.CONF.THR=GT.CONF.THR,
861
+					   MIN.SAMPLES=MIN.SAMPLES,
862
+					   MIN.OBS=MIN.OBS,
863
+					   verbose=verbose,
864
+					   is.lds=is.lds,
865
+					   DF.PRIOR=DF.PRIOR/2,
829 866
 					   gender="male")
830 867
 		madA.Mlist <- res[["madA"]]
831 868
 		madB.Mlist <- res[["madB"]]
... ...
@@ -855,6 +892,7 @@ fit.lm3 <- function(strata,
855 892
 	}
856 893
 	for(k in seq_along(batches)){
857 894
 		sample.index <- batches[[k]]
895
+		if(is.null(sample.index)) next()
858 896
 		this.batch <- unique(as.character(batch(object)[sample.index]))
859 897
 		gender <- object$gender[sample.index]
860 898
 		enough.men <- sum(gender==1) >= MIN.SAMPLES
... ...
@@ -878,6 +916,10 @@ fit.lm3 <- function(strata,
878 916
 			NN.M <- NN.Mlist[[k]]
879 917
 		}
880 918
 		if(enough.men & enough.women){
919
+			if(any(madA.F == 0)){
920
+				message("Zeros in median absolute deviation.  Not estimating CN for chrX for batch ", names(batches)[k])
921
+				next()
922
+			}
881 923
 			betas <- fit.wls(cbind(NN.M, NN.F),
882 924
 					 sigma=cbind(madA.M, madA.F),
883 925
 					 allele="A",
... ...
@@ -914,6 +956,10 @@ fit.lm3 <- function(strata,
914 956
 			phiB[, k] <- betas[2, ]
915 957
 		}
916 958
 		if(!enough.men & enough.women){
959
+			if(any(madA.F == 0)){
960
+				message("Zeros in median absolute deviation.  Not estimating CN for chrX for batch ", names(batches)[k])
961
+				next()
962
+			}
917 963
 			betas <- fit.wls(NN.F,
918 964
 					 sigma=madA.F,
919 965
 					 allele="A",
... ...
@@ -938,15 +984,16 @@ fit.lm3 <- function(strata,
938 984
 		phiB[phiB < MIN.PHI] <- MIN.PHI
939 985
 		phiB2[phiB2 < 1] <- 1
940 986
 	}
941
-	nuA(object)[marker.index, ] <- nuA
942
-	nuB(object)[marker.index, ] <- nuB
943
-	phiA(object)[marker.index, ] <- phiA
944
-	phiB(object)[marker.index, ] <- phiB
945
-	phiPrimeA(object)[marker.index, ] <- phiA2
946
-	phiPrimeB(object)[marker.index, ] <- phiB2
987
+	nuA(object)[marker.index, batch.index] <- nuA
988
+	nuB(object)[marker.index, batch.index] <- nuB
989
+	phiA(object)[marker.index, batch.index] <- phiA
990
+	phiB(object)[marker.index, batch.index] <- phiB
991
+	phiPrimeA(object)[marker.index, batch.index] <- phiA2
992
+	phiPrimeB(object)[marker.index, batch.index] <- phiB2
947 993
 	if(is.lds) {close(object); return(TRUE)} else return(object)
948 994
 }
949 995
 
996
+##nonpolymorphic markers on X
950 997
 fit.lm4 <- function(strata,
951 998
 		    index.list,
952 999
 		    object,
... ...
@@ -954,8 +1001,9 @@ fit.lm4 <- function(strata,
954 1001
 		    THR.NU.PHI,
955 1002
 		    MIN.NU,
956 1003
 		    MIN.PHI,
957
-		    verbose, is.lds, ...){
958
-	if(is.lds) {physical <- get("physical"); open(object)}
1004
+		    verbose, is.lds=TRUE, ...){
1005
+	##if(is.lds) {physical <- get("physical"); open(object)}
1006
+	## exclude batches that have fewer than MIN.SAMPLES
959 1007
 	gender <- object$gender
960 1008
 	enough.males <- sum(gender==1) >= MIN.SAMPLES
961 1009
 	enough.females <- sum(gender==2) >= MIN.SAMPLES
... ...
@@ -963,68 +1011,71 @@ fit.lm4 <- function(strata,
963 1011
 		message(paste("fewer than", MIN.SAMPLES, "men and women.  Copy number not estimated for CHR X"))
964 1012
 		return(object)
965 1013
 	}
1014
+	excludeBatch <- names(table(batch(object)))[table(batch(object)) < MIN.SAMPLES]
1015
+	if(length(excludeBatch) > 0){
1016
+		bns <- unique(batch(object))
1017
+		batch.index <- which(!bns %in% excludeBatch)
1018
+		sample.index <- which(!batch(object)%in% excludeBatch)
1019
+	} else {
1020
+		sample.index <- 1:ncol(object)
1021
+		batch.index <- as.integer(as.factor(unique(batch(object))))
1022
+	}
966 1023
 	if(verbose) message("      Probe stratum ", strata, " of ", length(index.list))
967 1024
 	marker.index <- index.list[[strata]]
968 1025
 	batches <- split(seq_along(batch(object)), as.character(batch(object)))
969
-	batches <- batches[sapply(batches, length) >= MIN.SAMPLES]
970
-	nc <- length(batchNames(object))
1026
+	batches <- batches[batch.index]
1027
+	nc <- length(batch.index)
971 1028
 	if(enough.males){
972 1029
 		res <- summarizeMaleXNps(marker.index=marker.index,
973 1030
 					 batches=batches,
974 1031
 					 object=object, MIN.SAMPLES=MIN.SAMPLES)
975
-		medianA.AA.M <- res[["medianA.AA"]]
976
-		madA.AA.M <- res[["madA.AA"]]
977
-
1032
+		medianA.AA.M <- res[["medianA.AA"]][, batch.index, drop=FALSE]
1033
+		madA.AA.M <- res[["madA.AA"]][, batch.index, drop=FALSE]
978 1034
 	}
979
-	medianA.AA.F <- as.matrix(medianA.AA(object)[marker.index, ]) ## median for women
980
-	madA.AA.F <- as.matrix(madA.AA(object)[marker.index, ]) ## median for women
981
-	split.gender <- split(gender, as.character(batch(object)))
1035
+	medianA.AA.F <- as.matrix(medianA.AA(object)[marker.index, batch.index, drop=FALSE]) ## median for women
1036
+	madA.AA.F <- as.matrix(madA.AA(object)[marker.index, batch.index, drop=FALSE]) ## median for women
1037
+	split.gender <- split(gender[sample.index], as.character(batch(object)[sample.index]))
982 1038
 	N.M <- sapply(split.gender, function(x) sum(x==1))
983 1039
 	N.F <- sapply(split.gender, function(x) sum(x==2))
984
-	nuA <- as.matrix(nuA(object)[marker.index, ])
985
-	nuB <- as.matrix(nuB(object)[marker.index, ])
986
-	phiA <- as.matrix(phiA(object)[marker.index, ])
987
-	phiB <- as.matrix(phiB(object)[marker.index, ])
1040
+	nuA <- as.matrix(nuA(object)[marker.index, batch.index, drop=FALSE])
1041
+	nuB <- as.matrix(nuB(object)[marker.index, batch.index, drop=FALSE])
1042
+	phiA <- as.matrix(phiA(object)[marker.index, batch.index, drop=FALSE])
1043
+	phiB <- as.matrix(phiB(object)[marker.index, batch.index, drop=FALSE])
988 1044
 	ii <- isSnp(object) & chromosome(object) < 23 & !is.na(chromosome(object))
989 1045
 	fns <- featureNames(object)[ii]
990
-	flags <- as.matrix(flags(object)[ii, ])
1046
+	flags <- as.matrix(flags(object)[ii, batch.index])
991 1047
 	fns.noflags <- fns[rowSums(flags, na.rm=T) == 0]
992 1048
 	snp.index <- sample(match(fns.noflags, featureNames(object)), 10000)
993
-
994
-	## exclude batches that have fewer than MIN.SAMPLES
995
-	excludeBatch <- names(table(batch(object)))[table(batch(object)) < MIN.SAMPLES]
996
-	batch.index <- which(!batchNames(object) %in% excludeBatch)
997
-
1049
+	##
998 1050
 	N.AA <- as.matrix(N.AA(object)[snp.index, batch.index, drop=FALSE])
999
-	N.AB <- as.matrix(N.AA(object)[snp.index, batch.index, drop=FALSE])
1000
-	N.BB <- as.matrix(N.AA(object)[snp.index, batch.index, drop=FALSE])
1051
+	N.AB <- as.matrix(N.AB(object)[snp.index, batch.index, drop=FALSE])
1052
+	N.BB <- as.matrix(N.BB(object)[snp.index, batch.index, drop=FALSE])
1001 1053
 	enoughAA <- rowSums(N.AA < 5) == 0
1002 1054
 	enoughAB <- rowSums(N.AB < 5) == 0
1003 1055
 	enoughBB <- rowSums(N.BB < 5) == 0
1004 1056
 	snp.index <- snp.index[enoughAA & enoughAB & enoughBB]
1005
-	if(length(snp.index) < 100) {
1006
-		message("Too few passing SNPs for estimating model parameters at nonpolymorphic loci on chrom X")
1007
-		return()
1057
+	if(length(snp.index) < 100){
1058
+		message("too few snps pass criteria for estimating parameters for NP markers on chr X")
1059
+		return(object)
1008 1060
 	}
1009
-	##stopifnot(length(snp.index) > 100)
1010
-	nuA.snp.notmissing <- rowSums(is.na(as.matrix(nuA(object)[snp.index, ]))) == 0
1011
-	nuA.snp.notnegative <- rowSums(as.matrix(nuA(object)[snp.index, ]) < 20) == 0
1061
+	nuA.snp <- as.matrix(nuA(object)[snp.index, batch.index, drop=FALSE])
1062
+	nuA.snp.notmissing <- rowSums(is.na(nuA.snp)) == 0
1063
+	nuA.snp.notnegative <- rowSums(as.matrix(nuA(object)[snp.index, batch.index, drop=FALSE]) < 20) == 0
1012 1064
 	snp.index <- snp.index[nuA.snp.notmissing & nuA.snp.notnegative]
1013
-	medianA.AA.snp <- as.matrix(medianA.AA(object)[snp.index,])
1014
-	medianB.BB.snp <- as.matrix(medianB.BB(object)[snp.index,])
1015
-
1016
-	nuA.snp <- as.matrix(nuA(object)[snp.index, ])
1017
-	nuB.snp <- as.matrix(nuB(object)[snp.index, ])
1018
-	phiA.snp <- as.matrix(phiA(object)[snp.index, ])
1019
-	phiB.snp <- as.matrix(phiB(object)[snp.index, ])
1065
+	medianA.AA.snp <- as.matrix(medianA.AA(object)[snp.index, batch.index])
1066
+	medianB.BB.snp <- as.matrix(medianB.BB(object)[snp.index, batch.index])
1067
+	nuA.snp <- as.matrix(nuA(object)[snp.index, batch.index])
1068
+	nuB.snp <- as.matrix(nuB(object)[snp.index, batch.index])
1069
+	phiA.snp <- as.matrix(phiA(object)[snp.index, batch.index])
1070
+	phiB.snp <- as.matrix(phiB(object)[snp.index, batch.index])
1020 1071
 ##	pseudoAR <- position(object)[snp.index] < 2709520 | (position(object)[snp.index] > 154584237 & position(object)[snp.index] < 154913754)
1021 1072
 ##	pseudoAR[is.na(pseudoAR)] <- FALSE
1022 1073
 	for(k in seq_along(batches)){
1023 1074
 		B <- batches[[k]]
1024 1075
 		this.batch <- unique(as.character(batch(object)[B]))
1025 1076
 		gender <- object$gender[B]
1026
-		enough.men <- N.M[k] >= MIN.SAMPLES
1027
-		enough.women <- N.F[k] >= MIN.SAMPLES
1077
+		enough.men <- N.M[[this.batch]] >= MIN.SAMPLES
1078
+		enough.women <- N.F[[this.batch]] >= MIN.SAMPLES
1028 1079
 		if(!enough.men & !enough.women) {
1029 1080
 			if(verbose) message(paste("fewer than", MIN.SAMPLES, "men and women in batch", this.batch, ". CHR X copy number not available. "))
1030 1081
 			next()
... ...
@@ -1036,6 +1087,9 @@ fit.lm4 <- function(strata,
1036 1087
 		}
1037 1088
 		X <- cbind(1, log2(c(tmp[, 1], tmp[, 2])))
1038 1089
 		Y <- log2(c(tmp[, 3], tmp[, 4]))
1090
+		notmissing.index <- which(rowSums(is.na(X)) == 0 & !is.na(Y))
1091
+		X <- X[notmissing.index, ]
1092
+		Y <- Y[notmissing.index]
1039 1093
 		betahat <- solve(crossprod(X), crossprod(X, Y))
1040 1094
 		if(enough.men){
1041 1095
 			X.men <- cbind(1, log2(medianA.AA.M[, k]))
... ...
@@ -1055,11 +1109,12 @@ fit.lm4 <- function(strata,
1055 1109
 		nuB[nuB < MIN.NU] <- MIN.NU
1056 1110
 		phiB[phiB < MIN.PHI] <- MIN.PHI
1057 1111
 	}
1058
-	nuA(object)[marker.index, ] <- nuA
1059
-	phiA(object)[marker.index, ] <- phiA
1060
-	nuB(object)[marker.index, ] <- nuB
1061
-	phiB(object)[marker.index, ] <- phiB
1062
-	if(is.lds) {close(object); return(TRUE)} else return(object)
1112
+	nuA(object)[marker.index, batch.index] <- nuA
1113
+	phiA(object)[marker.index, batch.index] <- phiA
1114
+	nuB(object)[marker.index, batch.index] <- nuB
1115
+	phiB(object)[marker.index, batch.index] <- phiB
1116
+	##if(is.lds) {close(object); return(TRUE)} else return(object)
1117
+	TRUE
1063 1118
 }
1064 1119
 
1065 1120
 whichPlatform <- function(cdfName){
... ...
@@ -1411,18 +1466,12 @@ shrinkSummary <- function(object,
1411 1466
 }
1412 1467
 
1413 1468
 genotypeSummary <- function(object,
1414
-			    GT.CONF.THR=0.95,
1469
+			    GT.CONF.THR=0.80,
1415 1470
 			    type=c("SNP", "NP", "X.SNP", "X.NP"), ##"X.snps", "X.nps"),
1416 1471
 			    verbose=TRUE,
1417 1472
 			    marker.index,
1418 1473
 			    is.lds){
1419 1474
 	if(type == "X.SNP" | type=="X.NP"){
1420
-##		gender <- object$gender
1421
-##		## the number of women in each batch could be less than 3...
1422
-##		if(sum(gender == 2) < 3) {
1423
-##			message("too few females to estimate within genotype summary statistics on CHR X")
1424
-##			return(object)
1425
-##		}
1426 1475
 		CHR.X <- TRUE
1427 1476
 	} else CHR.X <- FALSE
1428 1477
 	if(missing(marker.index)){
... ...
@@ -1514,7 +1563,7 @@ summarizeNps <- function(strata, index.list, object, batchSize,
1514 1563
 		}
1515 1564
 		this.batch <- unique(as.character(batch(object)[sample.index]))
1516 1565
 		j <- match(this.batch, batchnames)
1517
-		I.A <- AA[, sample.index]
1566
+		I.A <- AA[, sample.index, drop=FALSE]
1518 1567
 		medianA.AA[, k] <- rowMedians(I.A, na.rm=TRUE)
1519 1568
 		madA.AA[, k] <- rowMAD(I.A, na.rm=TRUE)
1520 1569
 		## log2 Transform Intensities
... ...
@@ -1532,23 +1581,14 @@ summarizeSnps <- function(strata,
1532 1581
 			  index.list,
1533 1582
 			  object,
1534 1583
 			  GT.CONF.THR,
1535
-			  verbose, is.lds, CHR.X, ...){
1536
-	if(is.lds) {
1537
-		physical <- get("physical")
1538
-		open(object)
1539
-	}
1584
+			  verbose, is.lds=TRUE, CHR.X, ...){
1585
+##	if(is.lds) {
1586
+##		physical <- get("physical")
1587
+##		open(object)
1588
+##	}
1589
+	open(object)
1540 1590
 	if(verbose) message("      Probe stratum ", strata, " of ", length(index.list))
1541 1591
 	index <- index.list[[strata]]
1542
-##	if(CHR.X) {
1543
-##		sample.index <- which(object$gender==2)
1544
-##		batches <- split(sample.index, as.character(batch(object))[sample.index])
1545
-##	} else {
1546
-##		batches <- split(seq_along(batch(object)), as.character(batch(object)))
1547
-##	}
1548
-	## this message can be confusing if no women are in the dataset
1549
-##	if(CHR.X){
1550
-##		if(verbose) message("        biallelic cluster medians are estimated using only the women for SNPs on chr. X")
1551
-##	}
1552 1592
 	batches <- split(seq_along(batch(object)), as.character(batch(object)))
1553 1593
 	batchnames <- batchNames(object)
1554 1594
 	nr <- length(index)
... ...
@@ -1562,6 +1602,14 @@ summarizeSnps <- function(strata,
1562 1602
 	AA <- as.matrix(A(object)[index, ])
1563 1603
 	BB <- as.matrix(B(object)[index, ])
1564 1604
 	FL <- as.matrix(flags(object)[index, ])
1605
+	summaryStats <- function(X, INT, FUNS){
1606
+		tmp <- matrix(NA, nrow(X), length(FUNS))
1607
+		for(j in seq_along(FUNS)){
1608
+			FUN <- match.fun(FUNS[j])
1609
+			tmp[, j] <- FUN(X*INT, na.rm=TRUE)
1610
+		}
1611
+		tmp
1612
+	}
1565 1613
 	if(verbose) message("        Computing summaries...")
1566 1614
 	for(k in seq_along(batches)){
1567 1615
 		##note that the genotype frequency for AA would include 'A' on chromosome X for men
... ...
@@ -1569,10 +1617,17 @@ summarizeSnps <- function(strata,
1569 1617
 		this.batch <- unique(as.character(batch(object)[sample.index]))
1570 1618
 		j <- match(this.batch, batchnames)
1571 1619
 		G <- GG[, sample.index, drop=FALSE]
1572
-		##NORM <- normal.index[, k]
1573 1620
 		xx <- CP[, sample.index, drop=FALSE]
1574 1621
 		highConf <- (1-exp(-xx/1000)) > GT.CONF.THR
1622
+		## Some markers may have genotype confidences scores that are ALL below the threshold
1623
+		## For these markers, provide statistical summaries based on all the samples and flag
1624
+		## Provide summaries for these markers and flag to indicate the problem
1625
+		## RS: check whether we need the next to lines and, if so, effect downstream
1626
+		ii <- which(rowSums(highConf) == 0)
1627
+		if(length(ii) > 0) highConf[ii, ] <- TRUE
1575 1628
 		G <- G*highConf
1629
+		## table(rowSums(G==0))
1630
+		##G <- G*highConf*NORM
1576 1631
 		A <- AA[, sample.index, drop=FALSE]
1577 1632
 		B <- BB[, sample.index, drop=FALSE]
1578 1633
 		## this can be time consuming...do only once
... ...
@@ -1590,48 +1645,101 @@ summarizeSnps <- function(strata,
1590 1645
 			sample.index <- sample.index[gender == 2]
1591 1646
 			if(length(sample.index) == 0) next()
1592 1647
 		}
1593
-		summaryStats <- function(X, INT, FUNS){
1594
-			tmp <- matrix(NA, nrow(X), length(FUNS))
1595
-			for(j in seq_along(FUNS)){
1596
-				FUN <- match.fun(FUNS[j])
1597
-				tmp[, j] <- FUN(X*INT, na.rm=TRUE)
1598
-			}
1599
-			tmp
1600
-		}
1601 1648
 		stats <- summaryStats(G.AA, A, FUNS=c("rowMedians", "rowMAD"))
1602 1649
 		medianA.AA(object)[index, k] <- stats[, 1]
1650
+		##missing.index <- which(rowSums(is.na(medianA.AA(object)[index, ,drop=FALSE])) > 0)
1603 1651
 		madA.AA(object)[index, k] <- stats[, 2]
1604
-
1605 1652
 		stats <- summaryStats(G.AB, A, FUNS=c("rowMedians", "rowMAD"))
1606 1653
 		medianA.AB(object)[index, k] <- stats[, 1]
1607 1654
 		madA.AB(object)[index, k] <- stats[, 2]
1608
-
1609 1655
 		stats <- summaryStats(G.BB, A, FUNS=c("rowMedians", "rowMAD"))
1610 1656
 		medianA.BB(object)[index, k] <- stats[, 1]
1611 1657
 		madA.BB(object)[index, k] <- stats[, 2]
1612
-
1613 1658
 		stats <- summaryStats(G.AA, B, FUNS=c("rowMedians", "rowMAD"))
1614 1659
 		medianB.AA(object)[index, k] <- stats[, 1]
1615 1660
 		madB.AA(object)[index, k] <- stats[, 2]
1616
-
1617 1661
 		stats <- summaryStats(G.AB, B, FUNS=c("rowMedians", "rowMAD"))
1618 1662
 		medianB.AB(object)[index, k] <- stats[, 1]
1619 1663
 		madB.AB(object)[index, k] <- stats[, 2]
1620
-
1621 1664
 		stats <- summaryStats(G.BB, B, FUNS=c("rowMedians", "rowMAD"))
1622 1665
 		medianB.BB(object)[index, k] <- stats[, 1]
1623 1666
 		madB.BB(object)[index, k] <- stats[, 2]
1624
-
1625 1667
 		## log2 Transform Intensities
1626 1668
 		A <- log2(A); B <- log2(B)
1627 1669
 		tau2A.AA[, k] <- summaryStats(G.AA, A, FUNS="rowMAD")^2
1628 1670
 		tau2A.BB[, k] <- summaryStats(G.BB, A, FUNS="rowMAD")^2
1629 1671
 		tau2B.AA[, k] <- summaryStats(G.AA, B, FUNS="rowMAD")^2
1630 1672
 		tau2B.BB[, k] <- summaryStats(G.BB, B, FUNS="rowMAD")^2
1631
-
1632 1673
 		corrAA[, k] <- rowCors(A*G.AA, B*G.AA, na.rm=TRUE)
1633 1674
 		corrAB[, k] <- rowCors(A*G.AB, B*G.AB, na.rm=TRUE)
1634 1675
 		corrBB[, k] <- rowCors(A*G.BB, B*G.BB, na.rm=TRUE)
1676
+		rm(A, B, G.AA, G.AB, G.BB, xx, highConf, G)
1677
+		gc()
1678
+	}
1679
+	##---------------------------------------------------------------------------
1680
+	## grand mean
1681
+	##---------------------------------------------------------------------------
1682
+	if(length(batches) > 1 && "grandMean" %in% batchNames(object)){
1683
+		##k <- k+1
1684
+		k <- match("grandMean", batchNames(object))
1685
+		if(verbose) message("        computing grand means...")
1686
+		##G <- GG[, B]
1687
+		##NORM <- normal.index[, k]
1688
+		##xx <- CP[, B]
1689
+		##highConf <- (1-exp(-xx/1000)) > GT.CONF.THR
1690
+		highConf <- (1-exp(-CP/1000)) > GT.CONF.THR
1691
+		##G <- G*highConf
1692
+		## Some markers may have genotype confidences scores that are ALL below the threshold
1693
+		## For these markers, provide statistical summaries based on all the samples and flag
1694
+		## Provide summaries for these markers and flag to indicate the problem
1695
+		ii <- which(rowSums(highConf) == 0)
1696
+		if(length(ii) > 0) highConf[ii, ] <- TRUE
1697
+		GG <- GG*highConf
1698
+		## table(rowSums(G==0))
1699
+		##G <- G*highConf*NORM
1700
+##		A <- AA[, B]
1701
+##		B <- BB[, B]
1702
+		## this can be time consuming...do only once
1703
+##		G.AA <- G==1
1704
+		G.AA <- GG==1
1705
+		G.AA[G.AA==FALSE] <- NA
1706
+		G.AB <- GG==2
1707
+		G.AB[G.AB==FALSE] <- NA
1708
+		G.BB <- GG==3
1709
+		G.BB[G.BB==FALSE] <- NA
1710
+		Ns.AA[, k] <- rowSums(G.AA, na.rm=TRUE)
1711
+		Ns.AB[, k] <- rowSums(G.AB, na.rm=TRUE)
1712
+		Ns.BB[, k] <- rowSums(G.BB, na.rm=TRUE)
1713
+		## Calculate row medians and MADs
1714
+		stats <- summaryStats(G.AA, AA, FUNS=c("rowMedians", "rowMAD"))
1715
+		medianA.AA(object)[index, k] <- stats[, 1]
1716
+		madA.AA(object)[index, k] <- stats[, 2]
1717
+		stats <- summaryStats(G.AB, AA, FUNS=c("rowMedians", "rowMAD"))
1718
+		medianA.AB(object)[index, k] <- stats[, 1]
1719
+		madA.AB(object)[index, k] <- stats[, 2]
1720
+		stats <- summaryStats(G.BB, AA, FUNS=c("rowMedians", "rowMAD"))
1721
+		medianA.BB(object)[index, k] <- stats[, 1]
1722
+		madA.BB(object)[index, k] <- stats[, 2]
1723
+		stats <- summaryStats(G.AA, BB, FUNS=c("rowMedians", "rowMAD"))
1724
+		medianB.AA(object)[index, k] <- stats[, 1]
1725
+		madB.AA(object)[index, k] <- stats[, 2]
1726
+		stats <- summaryStats(G.AB, BB, FUNS=c("rowMedians", "rowMAD"))
1727
+		medianB.AB(object)[index, k] <- stats[, 1]
1728
+		madB.AB(object)[index, k] <- stats[, 2]
1729
+		stats <- summaryStats(G.BB, BB, FUNS=c("rowMedians", "rowMAD"))
1730
+		medianB.BB(object)[index, k] <- stats[, 1]
1731
+		madB.BB(object)[index, k] <- stats[, 2]
1732
+		## log2 Transform Intensities
1733
+		AA <- log2(AA); BB <- log2(BB)
1734
+		tau2A.AA[, k] <- summaryStats(G.AA, AA, FUNS="rowMAD")^2
1735
+		tau2A.BB[, k] <- summaryStats(G.BB, AA, FUNS="rowMAD")^2
1736
+		tau2B.AA[, k] <- summaryStats(G.AA, BB, FUNS="rowMAD")^2
1737
+		tau2B.BB[, k] <- summaryStats(G.BB, BB, FUNS="rowMAD")^2
1738
+		corrAA[, k] <- rowCors(AA*G.AA, BB*G.AA, na.rm=TRUE)
1739
+		corrAB[, k] <- rowCors(AA*G.AB, BB*G.AB, na.rm=TRUE)
1740
+		corrBB[, k] <- rowCors(AA*G.BB, BB*G.BB, na.rm=TRUE)
1741
+		rm(GG, CP, AA, BB, FL, stats)
1742
+		gc()
1635 1743
 	}
1636 1744
 	if(verbose) message("        Begin writing...")
1637 1745
 	N.AA(object)[index,] <- Ns.AA
... ...
@@ -1640,24 +1748,9 @@ summarizeSnps <- function(strata,
1640 1748
 	corrAA(object)[index,] <- corrAA
1641 1749
 	corrAB(object)[index,] <- corrAB
1642 1750
 	corrBB(object)[index,] <- corrBB
1643
-##	medianA.AA(object)[index,] <- do.call(cbind, lapply(statsA.AA, function(x) x[, 1]))
1644
-##	medianA.AB(object)[index,] <- do.call(cbind, lapply(statsA.AB, function(x) x[, 1]))
1645
-##	medianA.BB(object)[index,] <- do.call(cbind, lapply(statsA.BB, function(x) x[, 1]))
1646
-##	medianB.AA(object)[index,] <- do.call(cbind, lapply(statsB.AA, function(x) x[, 1]))
1647
-##	medianB.AB(object)[index,] <- do.call(cbind, lapply(statsB.AB, function(x) x[, 1]))
1648
-##	medianB.BB(object)[index,] <- do.call(cbind, lapply(statsB.BB, function(x) x[, 1]))
1649
-##
1650
-##	madA.AA(object)[index,] <- do.call(cbind, lapply(statsA.AA, function(x) x[, 2]))
1651
-##	madA.AB(object)[index,] <- do.call(cbind, lapply(statsA.AB, function(x) x[, 2]))
1652
-##	madA.BB(object)[index,] <- do.call(cbind, lapply(statsA.BB, function(x) x[, 2]))
1653
-##	madB.AA(object)[index,] <- do.call(cbind, lapply(statsB.AA, function(x) x[, 2]))
1654
-##	madB.AB(object)[index,] <- do.call(cbind, lapply(statsB.AB, function(x) x[, 2]))
1655
-##	madB.BB(object)[index,] <- do.call(cbind, lapply(statsB.BB, function(x) x[, 2]))
1656 1751
 	tau2A.AA(object)[index, ] <- tau2A.AA
1657
-##	tau2A.AB(object)[index, ] <- tau2A.AB
1658 1752
 	tau2A.BB(object)[index, ] <- tau2A.BB
1659 1753
 	tau2B.AA(object)[index, ] <- tau2B.AA
1660
-##	tau2B.AB(object)[index, ] <- tau2B.AB
1661 1754
 	tau2B.BB(object)[index, ] <- tau2B.BB
1662 1755
 	if(is.lds) return(TRUE) else return(object)
1663 1756
 }
... ...
@@ -1677,6 +1770,7 @@ crlmmCopynumber <- function(object,
1677 1770
 			    THR.NU.PHI=TRUE,
1678 1771
 			    type=c("SNP", "NP", "X.SNP", "X.NP")){
1679 1772
 	stopifnot(type %in% c("SNP", "NP", "X.SNP", "X.NP"))
1773
+	if(GT.CONF.THR >= 1 | GT.CONF.THR < 0) stop("GT.CONF.THR must be in [0,1)")
1680 1774
 	batch <- batch(object)
1681 1775
 	is.snp <- isSnp(object)
1682 1776
 	is.autosome <- chromosome(object) < 23
... ...
@@ -1686,7 +1780,9 @@ crlmmCopynumber <- function(object,
1686 1780
 	if(is.lds) stopifnot(isPackageLoaded("ff"))
1687 1781
 	samplesPerBatch <- table(as.character(batch(object)))
1688 1782
 	if(any(samplesPerBatch < MIN.SAMPLES)){
1689
-		warning("The following batches have fewer than ", MIN.SAMPLES, " samples:", names(samplesPerBatch)[samplesPerBatch < MIN.SAMPLES], ". Not estimating copy number for these batches.")
1783
+		tmp <- paste(names(samplesPerBatch)[samplesPerBatch < MIN.SAMPLES], collapse=", ")
1784
+		message("The following batches have fewer than ", MIN.SAMPLES, " samples: ",  tmp)
1785
+		message("Not estimating copy number parameters for batch ", tmp)
1690 1786
 	}
1691 1787
 	mylabel <- function(marker.type){
1692 1788
 		switch(marker.type,
... ...
@@ -1696,15 +1792,14 @@ crlmmCopynumber <- function(object,
1696 1792
 		       X.NP="chromosome X nonpolymorphic markers")
1697 1793
 	}
1698 1794
 	if(verbose) message("Computing summary statistics of the genotype clusters for each batch")
1699
-	for(i in c(1, 2, 4)){ ## do not do X.SNP.  Do this during fit.lm3
1795
+	I <- which(type %in% c("SNP", "NP", "X.NP"))
1796
+	for(j in seq_along(I)){ ## do not do X.SNP.  Do this during fit.lm3
1797
+		i <- I[j]
1700 1798
 		marker.type <- type[i]
1701 1799
 		if(verbose) message(paste("...", mylabel(marker.type)))
1702 1800
 		##if(verbose) message(paste("Computing summary statistics for ", mylabel(marker.type), " genotype clusters for each batch")
1703
-		marker.index <- whichMarkers(marker.type,
1704
-					     is.snp,
1705
-					     is.autosome,
1706
-					     is.annotated,
1707
-					     is.X)
1801
+		marker.index <- whichMarkers(marker.type, is.snp,
1802
+					     is.autosome, is.annotated, is.X)
1708 1803
 		object <- genotypeSummary(object=object,
1709 1804
 					  GT.CONF.THR=GT.CONF.THR,
1710 1805
 					  type=marker.type,
... ...
@@ -1713,16 +1808,18 @@ crlmmCopynumber <- function(object,
1713 1808
 					  is.lds=is.lds)
1714 1809
 	}
1715 1810
 	if(verbose) message("Imputing unobserved genotype medians and shrinking the variances (within-batch, across loci) ")##SNPs only
1716
-	marker.index <- whichMarkers("SNP", is.snp,
1717
-				     is.autosome, is.annotated, is.X)
1718
-	object <- shrinkSummary(object=object,
1719
-				MIN.OBS=MIN.OBS,
1720
-				MIN.SAMPLES=MIN.SAMPLES,
1721
-				DF.PRIOR=DF.PRIOR,
1722
-				type="SNP",
1723
-				verbose=verbose,
1724
-				marker.index=marker.index,
1725
-				is.lds=is.lds)
1811
+	if("SNP" %in% type){
1812
+		marker.index <- whichMarkers("SNP", is.snp,
1813
+					     is.autosome, is.annotated, is.X)
1814
+		object <- shrinkSummary(object=object,
1815
+					MIN.OBS=MIN.OBS,
1816
+					MIN.SAMPLES=MIN.SAMPLES,
1817
+					DF.PRIOR=DF.PRIOR,
1818
+					type="SNP",
1819
+					verbose=verbose,
1820
+					marker.index=marker.index,
1821
+					is.lds=is.lds)
1822
+	}
1726 1823
 	if(verbose) message("Estimating parameters for copy number")##SNPs only
1727 1824
 	for(i in seq_along(type)){
1728 1825
 		marker.type <- type[i]
... ...
@@ -1745,7 +1842,7 @@ crlmmCopynumber <- function(object,
1745 1842
 					       is.lds=is.lds,
1746 1843
 					       CHR.X=CHR.X)
1747 1844
 	}
1748
-	return(object)
1845
+	TRUE
1749 1846
 }
1750 1847
 crlmmCopynumber2 <- function(){
1751 1848
 	.Defunct(msg="The crlmmCopynumber2 function has been deprecated. The function crlmmCopynumber should be used instead.  crlmmCopynumber will support large data using ff provided that the ff package is loaded.")
... ...
@@ -1753,6 +1850,7 @@ crlmmCopynumber2 <- function(){
1753 1850
 crlmmCopynumberLD <- crlmmCopynumber2
1754 1851
 
1755 1852
 
1853
+
1756 1854
 estimateCnParameters <- function(object,
1757 1855
 				 type=c("SNP", "NP", "X.SNP", "X.NP"),
1758 1856
 				 verbose=TRUE,
... ...
@@ -1848,10 +1946,13 @@ imputeAA.AB <- function(index, N.AA, N.AB, N.BB,
1848 1946
 		X <- cbind(1, medianA[K, -gt.to.impute, drop=FALSE], medianB[K, -gt.to.impute, drop=FALSE])
1849 1947
 		Y <- cbind(medianA[K, gt.to.impute, drop=FALSE], medianB[K, gt.to.impute, drop=FALSE])
1850 1948
 		tmp <- tryCatch(betahat <- solve(crossprod(X), crossprod(X,Y)), error=function(e) NULL)
1851
-		if(is.null(tmp)) {cat("."); next()}
1852
-
1949
+		if(is.null(tmp)) {
1950
+			##cat(".");
1951
+			next()
1952
+		}
1853 1953
 		## Get data from observed genotypes in batch with insufficient data for genotypes 'gt.to.impute'
1854 1954
 		L <- which(rowSums(Ns < 3) == 2)
1955
+		if(length(L) == 0) next()
1855 1956
 		Z <- cbind(1, medianA[L, -gt.to.impute, drop=FALSE], medianB[L, -gt.to.impute, drop=FALSE])
1856 1957
 		imputedVals <- Z %*% betahat
1857 1958
 		medianA.AA[i, L] <- imputedVals[, 1]
... ...
@@ -1883,10 +1984,14 @@ imputeAB.BB <- function(index, N.AA, N.AB, N.BB,
1883 1984
 		X <- cbind(1, medianA[K, -gt.to.impute, drop=FALSE], medianB[K, -gt.to.impute, drop=FALSE])
1884 1985
 		Y <- cbind(medianA[K, gt.to.impute, drop=FALSE], medianB[K, gt.to.impute, drop=FALSE])
1885 1986
 		tmp <- tryCatch(betahat <- solve(crossprod(X), crossprod(X,Y)), error=function(e) NULL)
1886
-		if(is.null(tmp)) {cat("."); next()}
1987
+		if(is.null(tmp)) {
1988
+##			cat(".");
1989
+			next()
1990
+		}
1887 1991
 
1888 1992
 		## Get data from observed genotypes in batch with insufficient data for genotypes 'gt.to.impute'
1889 1993
 		L <- which(rowSums(Ns < 3) == 2)
1994
+		if(length(L) == 0) next()
1890 1995
 		Z <- cbind(1, medianA[L, -gt.to.impute, drop=FALSE], medianB[L, -gt.to.impute, drop=FALSE])
1891 1996
 		imputedVals <- Z %*% betahat
1892 1997
 		medianA.AB[i, L] <- imputedVals[, 1]
... ...
@@ -1918,10 +2023,13 @@ imputeAA <- function(index, N.AA, N.AB, N.BB,
1918 2023
 		X <- cbind(1, medianA[K, -gt.to.impute, drop=FALSE], medianB[K, -gt.to.impute, drop=FALSE])
1919 2024
 		Y <- cbind(medianA[K, gt.to.impute, drop=FALSE], medianB[K, gt.to.impute, drop=FALSE])
1920 2025
 		tmp <- tryCatch(betahat <- solve(crossprod(X), crossprod(X,Y)), error=function(e) NULL)
1921
-		if(is.null(tmp)) {cat("."); next()}
1922
-
2026
+		if(is.null(tmp)) {
2027
+			##cat(".");
2028
+			next()
2029
+		}
1923 2030
 		## Get data from observed genotypes in batch with insufficient data for genotypes 'gt.to.impute'
1924 2031
 		L <- which(rowSums(Ns < 3) == 1)
2032
+		if(length(L) == 0) next()
1925 2033
 		Z <- cbind(1, medianA[L, -gt.to.impute, drop=FALSE], medianB[L, -gt.to.impute, drop=FALSE])
1926 2034
 		imputedVals <- Z %*% betahat
1927 2035
 		medianA.AA[i, L] <- imputedVals[, 1]
... ...
@@ -1950,11 +2058,19 @@ imputeBB <- function(index, N.AA, N.AB, N.BB,
1950 2058
 		if(length(K) < 1) next() ## nothing we can do.  use information from other snps
1951 2059
 		X <- cbind(1, medianA[K, -gt.to.impute, drop=FALSE], medianB[K, -gt.to.impute, drop=FALSE])
1952 2060
 		Y <- cbind(medianA[K, gt.to.impute, drop=FALSE], medianB[K, gt.to.impute, drop=FALSE])
2061
+		colnames(Y) <- c("A.BB", "B.BB")
1953 2062
 		tmp <- tryCatch(betahat <- solve(crossprod(X), crossprod(X,Y)), error=function(e) NULL)
1954
-		if(is.null(tmp)) {cat("."); next()}
1955
-
2063
+		if(is.null(tmp)) {
2064
+			##cat(".");
2065
+			next()
2066
+		}
2067
+##		else{
2068
+##			R <- Y-crossprod(X, betahat)
2069
+##			RSS <- t(R)%*%R
2070
+##		}
1956 2071
 		## Get data from observed genotypes in batch with insufficient data for genotypes 'gt.to.impute'
1957 2072
 		L <- which(rowSums(Ns < 3) == 1)
2073
+		if(length(L) == 0) next()
1958 2074
 		Z <- cbind(1, medianA[L, -gt.to.impute, drop=FALSE], medianB[L, -gt.to.impute, drop=FALSE])
1959 2075
 		imputedVals <- Z %*% betahat
1960 2076
 		medianA.BB[i, L] <- imputedVals[, 1]
... ...
@@ -1987,13 +2103,13 @@ imputeAcrossBatch <- function(N.AA, N.AB, N.BB,
1987 2103
 	names(updated) <- c("AA.AB", "AB.BB", "AA", "BB")
1988 2104
 	if(length(index[["AA.AB"]] > 0)){
1989 2105
 		res <- imputeAA.AB(index[["AA.AB"]],
1990
-			   N.AA,
1991
-			   N.AB,
1992
-			   N.BB,
1993
-			   medianA.AA,
1994
-			   medianA.AB,
1995
-			   medianA.BB,
1996
-			   medianB.AA, medianB.AB, medianB.BB)
2106
+				   N.AA,
2107
+				   N.AB,
2108
+				   N.BB,
2109
+				   medianA.AA,
2110
+				   medianA.AB,
2111
+				   medianA.BB,
2112
+				   medianB.AA, medianB.AB, medianB.BB)
1997 2113
 		updated$AA.AB <- res$imputed
1998 2114
 	}
1999 2115
 	if(length(index[["AB.BB"]] > 0)){
... ...
@@ -2004,7 +2120,9 @@ imputeAcrossBatch <- function(N.AA, N.AB, N.BB,
2004 2120
 				   res[["medianA.AA"]],
2005 2121
 				   res[["medianA.AB"]],
2006 2122
 				   res[["medianA.BB"]],
2007
-				   res[["medianB.AA"]], res[["medianB.AB"]], res[["medianB.BB"]])
2123
+				   res[["medianB.AA"]],
2124
+				   res[["medianB.AB"]],
2125
+				   res[["medianB.BB"]])
2008 2126
 		updated$AB.BB <- res$imputed
2009 2127
 	}
2010 2128
 	if(length(index[["AA"]] > 0)){
... ...
@@ -2026,9 +2144,426 @@ imputeAcrossBatch <- function(N.AA, N.AB, N.BB,
2026 2144
 				res[["medianA.AA"]],
2027 2145
 				res[["medianA.AB"]],
2028 2146
 				res[["medianA.BB"]],
2029
-				res[["medianB.AA"]], res[["medianB.AB"]], res[["medianB.BB"]])
2147
+				res[["medianB.AA"]],
2148
+				res[["medianB.AB"]],
2149
+				res[["medianB.BB"]])
2030 2150
 		updated$BB <- res$imputed
2031 2151
 	}
2032 2152
 	updated.indices <- unlist(updated)
2033
-	return(res, updated)
2153
+	return(list(res, updated))
2154
+}
2155
+
2156
+posteriorMean <- function(object, type=c("SNP", "NP", "X.SNP", "X.NP"), verbose=TRUE,
2157
+			  prior.prob=c(1/6, 1/6, 3/6, 1/6)){
2158
+	stopifnot(type %in% c("SNP", "NP", "X.SNP", "X.NP"))
2159
+	stopifnot(sum(prior.prob)==1)
2160
+	batch <- batch(object)
2161
+	is.snp <- isSnp(object)
2162
+	is.autosome <- chromosome(object) < 23
2163
+	is.annotated <- !is.na(chromosome(object))
2164
+	is.X <- chromosome(object) == 23
2165
+	is.lds <- is(calls(object), "ffdf") | is(calls(object), "ff_matrix")
2166
+	if(is.lds) stopifnot(isPackageLoaded("ff"))
2167
+	samplesPerBatch <- table(as.character(batch(object)))
2168
+	mylabel <- function(marker.type){
2169
+		switch(marker.type,
2170
+		       SNP="autosomal SNPs",
2171
+		       NP="autosomal nonpolymorphic markers",
2172
+		       X.SNP="chromosome X SNPs",
2173
+		       X.NP="chromosome X nonpolymorphic markers")
2174
+	}
2175
+	if(type=="SNP"){
2176
+		if(verbose) message(paste("...", mylabel(type)))
2177
+		marker.index <- whichMarkers("SNP",
2178
+					     is.snp,
2179
+					     is.autosome,
2180
+					     is.annotated,
2181
+					     is.X)
2182
+		marker.list <- splitIndicesByLength(marker.index, ocProbesets())
2183
+		emit <- ocLapply(seq_along(marker.list),
2184
+				 posteriorMean.snp,
2185
+				 object=object,
2186
+				 index.list=marker.list,
2187
+				 verbose=verbose,
2188
+				 prior.prob=prior.prob)
2189
+	}
2190
+	return(emit)
2191
+}
2192
+
2193
+posteriorMean.snp <- function(stratum, object, index.list,
2194
+			      prior.prob=c(1/6, 1/6, 3/6, 1/6), is.lds=TRUE, verbose=TRUE){
2195
+	if(verbose) message("Probe stratum ", stratum, " of ", length(index.list))
2196
+	index <- index.list[[stratum]]
2197
+	if(is.lds){
2198
+		open(A(object))
2199
+		open(B(object))
2200
+		open(tau2A.AA(object))
2201
+		open(tau2B.BB(object))
2202
+		open(tau2A.BB(object))
2203
+		open(tau2B.AA(object))
2204
+		open(corrAA(object))
2205
+		open(corrAB(object))
2206
+		open(corrBB(object))
2207
+		open(nuA(object))
2208
+		open(nuB(object))
2209
+		open(phiA(object))
2210
+		open(phiB(object))
2211
+	}
2212
+	a <- log2(as.matrix(A(object)[index, ]))
2213
+	b <- log2(as.matrix(B(object)[index, ]))
2214
+	sig2A <- as.matrix(tau2A.AA(object)[index,])
2215
+	sig2B <- as.matrix(tau2B.BB(object)[index,])
2216
+	tau2A <- as.matrix(tau2A.BB(object)[index,])
2217
+	tau2B <- as.matrix(tau2B.AA(object)[index,])
2218
+	corrAA <- as.matrix(corrAA(object)[index, ])
2219
+	corrAB <- as.matrix(corrAB(object)[index, ])
2220
+	corrBB <- as.matrix(corrBB(object)[index, ])
2221
+	nuA <- as.matrix(nuA(object)[index, ])
2222
+	phiA <- as.matrix(phiA(object)[index, ])
2223
+	nuB <- as.matrix(nuB(object)[index, ])
2224
+	phiB <- as.matrix(phiB(object)[index, ])
2225
+	if(is.lds){
2226
+		close(A(object))
2227
+		close(B(object))
2228
+		close(tau2A.AA(object))
2229
+		close(tau2B.BB(object))
2230
+		close(tau2A.BB(object))
2231
+		close(tau2B.AA(object))
2232
+		close(corrAA(object))
2233
+		close(corrAB(object))
2234
+		close(corrBB(object))
2235
+		close(nuA(object))
2236
+		close(nuB(object))
2237
+		close(phiA(object))
2238
+		close(phiB(object))
2239
+	}
2240
+	emit <- array(NA, dim=c(nrow(a), ncol(a), 4))##SNPs x sample x 'truth'
2241
+	sample.index <- split(1:ncol(object), batch(object))
2242
+	sum.mymatrix <- function(...){
2243
+		x <- list(...)
2244
+		return(x[[1]] + do.call(sum, x[-1]))
2245
+	}
2246
+	##emit <- vector("list", length(sample.index))
2247
+	for(j in seq_along(sample.index)){
2248
+		cat("batch ", j, "\n")
2249
+		J <- sample.index[[j]]
2250
+		probs <- array(NA, dim=c(nrow(a), length(J), 4))
2251
+		for(k in seq_along(0:3)){
2252
+			CT <- (0:3)[k]
2253
+			f.x.y <- vector("list", choose(CT+1, CT))
2254
+			for(i in seq_along(0:CT)){
2255
+				CA <- (0:CT)[i]
2256
+				CB <- CT-CA
2257
+				A.scale <- sqrt(tau2A[, j]*(CA==0) + sig2A[, j]*(CA > 0))
2258
+				B.scale <- sqrt(tau2B[, j]*(CA==0) + sig2B[, j]*(CA > 0))
2259
+				if(CA == 0 & CB == 0) rho <- 0
2260
+				if(CA == 0 & CB > 0) rho <- corrBB[, j]
2261
+				if(CA > 0 & CB == 0) rho <- corrAA[, j]
2262
+				if(CA > 0 & CB > 0) rho <- corrAB[, j]
2263
+				meanA <- log2(nuA[, j]+CA*phiA[, j])
2264
+				meanB <- log2(nuB[, j]+CB*phiB[, j])
2265
+				covs <- rho*A.scale*B.scale
2266
+				A.scale2 <- A.scale^2
2267
+				B.scale2 <- B.scale^2
2268
+				Q.x.y <- 1/(1-rho^2)*(((a[, J] - meanA)/A.scale)^2 + ((b[, J] - meanB)/B.scale)^2 - 2*rho*((a[, J] - meanA)*(b[, J] - meanB))/(A.scale*B.scale))
2269
+				f.x.y[[i]] <- 1/(2*pi*A.scale*B.scale*sqrt(1-rho^2))*exp(-0.5*Q.x.y)
2270
+				class(f.x.y[[i]]) <- "mymatrix"
2271
+			}
2272
+			probs[, , k] <- do.call("sum", f.x.y)
2273
+			##if none of the states are likely (outlier), assign NA
2274
+			##		emit[, , counter] <- f.x.y
2275
+			##		counter <- counter+1
2276
+		}
2277
+		emit[, J, ] <- probs
2278
+	}
2279
+	for(i in 1:4) emit[, , i] <- emit[, , i]*prior.prob[i]
2280
+	total <- matrix(0, nrow(emit), ncol(emit))
2281
+	for(i in 1:4) total <- total+emit[, , i]
2282
+	message(" need to check for outliers before rescaling")
2283
+	is.outlier <- total < 1e-5
2284
+##	plot(a[outlier.index[1], ], b[outlier.index[1], ], col="grey50",
2285
+##	     xaxt="n", yaxt="n", pch=21, cex=0.8,
2286
+##	     xlim=c(6.5, 12.5), ylim=c(6.5, 12.5), xlab="", ylab="")
2287
+##	for(j in seq_along(sample.index)){
2288
+##		J <- sample.index[[j]]
2289
+##		b <- unique(batch(object)[J])
2290
+##		for(CN in 2) 	lines(object, outlier.index[1], b, CN, col="black", lwd=2, x.axis="A")
2291
+##	}
2292
+	## how to handle outliers...
2293
+	##  - use priors (posterior mean will likely be near 2).  smoothing needs to take into account the uncertainty
2294
+	##  - need uncertainty estimates for posterior means
2295
+	for(i in 1:4) emit[, , i] <- emit[, , i]/total
2296
+	return(emit)
2297
+}
2298
+
2299
+
2300
+
2301
+
2302
+rscrlmmGT2 <- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL,
2303
+                     col.names=NULL, probs=c(1/3, 1/3, 1/3), DF=6,
2304
+                     SNRMin=5, recallMin=10, recallRegMin=1000,
2305
+                     gender=NULL, desctrucitve=FALSE, verbose=TRUE,
2306
+                     returnParams=FALSE, badSNP=.7){
2307
+  pkgname <- getCrlmmAnnotationName(cdfName)
2308
+  stopifnot(require(pkgname, character.only=TRUE, quietly=!verbose))
2309
+  open(SNR)
2310
+  open(A)
2311
+  open(B)
2312
+  open(mixtureParams)
2313
+  ## expect objects to be ff
2314
+
2315
+  keepIndex <- which( SNR[] > SNRMin)
2316
+  if(length(keepIndex)==0) stop("No arrays above quality threshold!")
2317
+  if(is.null(row.names) & is.null(rownames(A))){
2318
+	  ##verify that A has only snps.  otherwise, calling function must pass rownames
2319
+	  message("rownames not available.  Assuming only SNPs in A")
2320
+	  snpBatches <- splitIndicesByLength(1:nrow(A), ocProbesets())
2321
+  } else{
2322
+	  loader("preprocStuff.rda", .crlmmPkgEnv, pkgname)
2323
+	  gns <- getVarInEnv("gns")
2324
+	  index <- match(gns, rownames(A))
2325
+	  snpBatches <- splitIndicesByLength(index, ocProbesets())
2326
+  }
2327
+  NR <- length(unlist(snpBatches))
2328
+  if(verbose) cat("Calling", NR, "SNPs for recalibration... ")
2329
+  NC <- ncol(A)
2330
+
2331
+  if(verbose) message("Loading annotations.")
2332
+  obj1 <- loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname)
2333
+  obj2 <- loader("mixtureStuff.rda", .crlmmPkgEnv, pkgname)
2334
+  ## this is toget rid of the 'no visible binding' notes
2335
+  ## variable definitions
2336
+  XIndex <- getVarInEnv("XIndex")
2337
+  autosomeIndex <- getVarInEnv("autosomeIndex")
2338
+  YIndex <- getVarInEnv("YIndex")
2339
+  SMEDIAN <- getVarInEnv("SMEDIAN")
2340
+  theKnots <- getVarInEnv("theKnots")
2341
+  regionInfo <- getVarInEnv("regionInfo")
2342
+  params <- getVarInEnv("params")
2343
+  rm(list=c(obj1, obj2), envir=.crlmmPkgEnv)
2344
+  rm(obj1, obj2)
2345
+
2346
+  ## IF gender not provide, we predict
2347
+  ## FIXME: XIndex may be greater than ocProbesets()
2348
+  if(is.null(gender)){
2349
+    if(verbose) message("Determining gender.")
2350
+##    XMedian <- apply(log2(A[XIndex,, drop=FALSE])+log2(B[XIndex,, drop=FALSE]), 2, median)/2
2351
+    XMedian <- ocLapply(splitIndicesByNode(1:NC), predictGender, theA=A, theB=B, XIndex=XIndex, neededPkgs="crlmm")
2352
+    XMedian <- unlist(XMedian)
2353
+    if(sum(SNR[] > SNRMin)==1){
2354
+      gender <- which.min(c(abs(XMedian-8.9), abs(XMedian-9.5)))
2355
+    }else{
2356
+      gender <- kmeans(XMedian, c(min(XMedian[SNR[]>SNRMin]), max(XMedian[SNR[]>SNRMin])))[["cluster"]]
2357
+    }
2358
+  }
2359
+
2360
+  Indexes <- list(autosomeIndex, XIndex, YIndex)
2361
+  cIndexes <- list(keepIndex,
2362
+                   keepIndex[which(gender[keepIndex]==2)],
2363
+                   keepIndex[which(gender[keepIndex]==1)])
2364
+
2365
+  if(verbose) cat("Calling", NR, "SNPs for recalibration... ")
2366
+
2367
+  ## call C
2368
+  fIndex <- which(gender==2)
2369
+  mIndex <- which(gender==1)
2370
+
2371
+  ## different here
2372
+  ## use gtypeCallerR in batches
2373
+  ##snpBatches <- splitIndicesByLength(1:nrow(A), ocProbesets())
2374
+  newparamsBatch <- vector("list", length(snpBatches))
2375
+
2376
+  process1 <- function(idxBatch, snpBatches, autosomeIndex, XIndex,
2377
+                       YIndex, A, B, mixtureParams, fIndex, mIndex,
2378
+                       params, cIndexes, SMEDIAN, theKnots, DF, probs, batchSize){
2379
+    open(A)
2380
+    open(B)
2381
+    open(mixtureParams)
2382
+    snps <- snpBatches[[idxBatch]]
2383
+    rSnps <- range(snps)
2384
+    last <- (idxBatch-1)*batchSize
2385
+    IndexesBatch <- list(autosomeIndex[autosomeIndex %in% snps]-last,
2386
+                         XIndex[XIndex %in% snps]-last,
2387
+                         YIndex[YIndex %in% snps]-last)
2388
+    IndexesBatch <- lapply(IndexesBatch, as.integer)
2389
+    tmpA <- as.matrix(A[snps,])
2390
+    tmpB <- as.matrix(B[snps,])
2391
+    ## newparamsBatch[[idxBatch]]
2392
+
2393
+    tmp <- gtypeCallerR(tmpA, tmpB, fIndex, mIndex,
2394
+                        params[["centers"]][snps,],
2395
+                        params[["scales"]][snps,],
2396
+                        params[["N"]][snps,],
2397
+                        IndexesBatch, cIndexes,
2398
+                        sapply(IndexesBatch, length),
2399
+                        sapply(cIndexes, length), SMEDIAN,
2400
+                        theKnots, mixtureParams[], DF, probs, 0.025)
2401
+    rm(snps, rSnps, IndexesBatch, tmpA, tmpB, last)
2402
+    gc(verbose=FALSE)
2403
+    close(A)
2404
+    close(B)
2405
+    close(mixtureParams)
2406
+    tmp
2407
+  }
2408
+
2409
+  newparamsBatch <- ocLapply(seq(along=snpBatches), process1,
2410
+                             snpBatches=snpBatches,
2411
+                             autosomeIndex=autosomeIndex, XIndex=XIndex,
2412
+                             YIndex=YIndex, A=A, B=B,
2413
+                             mixtureParams=mixtureParams, fIndex=fIndex,
2414
+                             mIndex=mIndex, params=params,
2415
+                             cIndexes=cIndexes, SMEDIAN=SMEDIAN,
2416
+                             theKnots=theKnots, DF=DF, probs=probs,
2417
+                             batchSize=ocProbesets())
2418
+  newparams <- vector("list", 3)
2419
+  names(newparams) <- c("centers", "scales", "N")
2420
+  newparams[["centers"]] <- do.call("rbind", lapply(newparamsBatch, "[[", 1))
2421
+  newparams[["scales"]] <- do.call("rbind", lapply(newparamsBatch, "[[", 2))
2422
+  newparams[["N"]] <- do.call("rbind", lapply(newparamsBatch, "[[", 3))
2423
+  rm(newparamsBatch); gc(verbose=FALSE)
2424
+  if(verbose) message("Done.")
2425
+  if(verbose) message("Estimating recalibration parameters.")
2426
+  d <- newparams[["centers"]] - params$centers
2427
+
2428
+  ##regression
2429
+  Index <- intersect(which(pmin(newparams[["N"]][, 1],
2430
+                                newparams[["N"]][, 2],
2431
+                                newparams[["N"]][, 3]) > recallMin &
2432
+                                !apply(regionInfo, 1, any)),
2433
+                                autosomeIndex)
2434
+  if(length(Index) < recallRegMin){
2435
+    warning("Recalibration not possible. Possible cause: small sample size.")
2436
+    newparams <- params
2437
+    dev <- vector("numeric", nrow(newparams[["centers"]]))
2438
+    SS <- matrix(Inf, 3, 3)
2439
+    DD <- 0
2440
+  }else{
2441
+    data4reg <- as.data.frame(newparams[["centers"]][Index,])
2442
+    names(data4reg) <- c("AA", "AB", "BB")
2443
+    regParams <- cbind(  coef(lm(AA~AB*BB, data=data4reg)),
2444
+                       c(coef(lm(AB~AA+BB, data=data4reg)), 0),
2445
+                         coef(lm(BB~AA*AB, data=data4reg)))
2446
+    rownames(regParams) <- c("intercept", "X", "Y", "XY")
2447
+    rm(data4reg)
2448
+
2449
+    minN <- 3
2450
+    newparams[["centers"]][newparams[["N"]] < minN] <- NA
2451
+    Index <- setdiff(which(rowSums(is.na(newparams[["centers"]]))==1), YIndex)
2452
+    if(verbose) message("Filling out empty centers", appendLF=FALSE)
2453
+    for(i in Index){
2454
+      if(verbose) if(i%%10000==0) message(".", appendLF=FALSE)
2455
+      mu <- newparams[["centers"]][i, ]
2456
+      j <- which(is.na(mu))
2457
+      newparams[["centers"]][i, j] <- c(1, mu[-j], prod(mu[-j]))%*%regParams[, j]
2458
+      rm(mu, j)
2459
+    }
2460
+
2461
+    ##remaing NAs are made like originals
2462
+    if(length(YIndex)>0){
2463
+      noMoveIndex <- union(setdiff(which(rowSums(is.na(newparams[["centers"]]))>0), YIndex),
2464
+                           YIndex[rowSums(is.na(newparams[["centers"]][YIndex, ])>1)])
2465
+    }
2466
+    snps2ignore <- which(rowSums(is.na(newparams[["centers"]])) > 0)
2467
+    snps2keep <- setdiff(autosomeIndex, snps2ignore)
2468
+    rm(snps2ignore)
2469
+    newparams[["centers"]][is.na(newparams[["centers"]])] <- params[["centers"]][is.na(newparams[["centers"]])]
2470
+    if(verbose) cat("\n")
2471
+
2472
+    if(verbose) message("Calculating and standardizing size of shift... ", appendLF=FALSE)
2473
+    GG <- DD <- newparams[["centers"]] - params[["centers"]]
2474
+    DD <- sweep(DD, 2, colMeans(DD[autosomeIndex, ]))
2475
+    SS <- cov(DD[autosomeIndex, ])
2476
+    SSI <- solve(SS)
2477
+    dev <- vector("numeric", nrow(DD))
2478
+    if(length(YIndex)){
2479
+      dev[-YIndex] <- apply(DD[-YIndex, ], 1, function(x) x%*%SSI%*%x)
2480
+      dev[-YIndex] <- 1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev[-YIndex])
2481
+      ##Now Y (only two params)
2482
+      SSY <- SS[c(1, 3), c(1, 3)]
2483
+      SSI <- solve(SSY)
2484
+      dev[YIndex] <- apply(DD[YIndex, c(1, 3)], 1, function(x) x%*%SSI%*%x)
2485
+      dev[YIndex] <- 1/sqrt( (2*pi)^2*det(SSY))*exp(-0.5*dev[YIndex])
2486
+    } else {
2487
+      dev=apply(DD,1,function(x) x%*%SSI%*%x)
2488
+      dev=1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev)
2489
+    }
2490
+  }
2491
+  if (verbose) message("OK")
2492
+
2493
+  ## BC: must keep SD
2494
+  params[-2] <- newparams[-2]
2495
+  rm(newparams)
2496
+  gc(verbose=FALSE)
2497
+
2498
+  if(verbose) message("Calling ", NR, " SNPs... ", appendLF=FALSE)
2499
+
2500
+  ## ###################
2501
+  ## ## MOVE TO C#######
2502
+
2503
+  ## running in batches
2504
+  process2 <- function(idxBatch, snpBatches, autosomeIndex, XIndex,
2505
+                       YIndex, A, B, mixtureParams, fIndex, mIndex,
2506
+                       params, cIndexes, SMEDIAN, theKnots, DF, probs,
2507
+                       regionInfo, batchSize){
2508
+    open(A)
2509
+    open(B)
2510
+    open(mixtureParams)
2511
+    snps <- snpBatches[[idxBatch]]
2512
+    tmpA <- as.matrix(A[snps,])
2513
+    tmpB <- as.matrix(B[snps,])
2514
+    rSnps <- range(snps)
2515
+    last <- (idxBatch-1)*batchSize
2516
+    IndexesBatch <- list(autosomeIndex[autosomeIndex %in% snps]-last,
2517
+                         XIndex[XIndex %in% snps]-last,
2518
+                         YIndex[YIndex %in% snps]-last)
2519
+    IndexesBatch <- lapply(IndexesBatch, as.integer)
2520
+    ImNull <- gtypeCallerR2(tmpA, tmpB, fIndex, mIndex,
2521
+                            params[["centers"]][snps,],
2522
+                            params[["scales"]][snps,],
2523
+                            params[["N"]][snps,],
2524
+                            IndexesBatch, cIndexes,
2525
+                            sapply(IndexesBatch, length),
2526
+                            sapply(cIndexes, length),
2527
+                            SMEDIAN, theKnots, mixtureParams[],
2528
+                            DF, probs, 0.025,
2529
+                            which(regionInfo[snps, 2]),
2530
+                            which(regionInfo[snps, 1]))
2531
+    A[snps,] <- tmpA
2532
+    B[snps,] <- tmpB
2533
+    rm(tmpA, tmpB, snps, rSnps, IndexesBatch, ImNull, last)
2534
+    gc(verbose=FALSE)
2535
+    close(A)
2536
+    close(B)
2537
+    close(mixtureParams)
2538
+  }
2539
+
2540
+  ocLapply(seq(along=snpBatches), process2, snpBatches=snpBatches,
2541
+           autosomeIndex=autosomeIndex, XIndex=XIndex, YIndex=YIndex,
2542
+           A=A, B=B, mixtureParams=mixtureParams, fIndex=fIndex,
2543
+           mIndex=mIndex, params=params, cIndexes=cIndexes,
2544
+           SMEDIAN=SMEDIAN, theKnots=theKnots, DF=DF, probs=probs,
2545
+           regionInfo=regionInfo, batchSize=ocProbesets())
2546
+  ##  END MOVE TO C#######
2547
+  ## ##################
2548
+
2549
+  dev <- dev/(dev+1/383)
2550
+  if(!is.null(row.names)){ rownames(A) <- rownames(B) <- names(dev) <- row.names}
2551
+  if(!is.null(col.names)){ colnames(A) <- colnames(B) <- col.names}
2552
+
2553
+  if(length(Index) >= recallRegMin){
2554
+   tmp4batchQC <- DD[autosomeIndex,]/(params[["N"]][autosomeIndex,]+1)
2555
+   tmpSnpQc <- dev[autosomeIndex]
2556
+   SS <- cov(tmp4batchQC[tmpSnpQc < badSNP,])
2557
+   batchQC <- mean(diag(SS))
2558
+  }else{
2559
+    SS <- matrix(0, 3, 3)
2560
+    batchQC <- Inf
2561
+  }
2562
+
2563
+  if(verbose) message("Done.")
2564
+  if (returnParams){
2565
+    return(list(calls=A, confs=B, SNPQC=dev, batchQC=batchQC, params=params, DD=DD, covDD=SS, gender=gender, pkgname=pkgname))
2566
+  }else{
2567
+    return(list(calls=A, confs=B, SNPQC=dev, batchQC=batchQC, DD=DD, covDD=SS, gender=gender, pkgname=pkgname))
2568
+  }
2034 2569
 }
... ...
@@ -323,8 +323,10 @@ predictGender <- function(cols, theA, theB, XIndex){
323 323
     open(theA)
324 324
     open(theB)
325 325
     for (i in 1:n){
326
-      vA <- log2(theA[XIndex, cols[i]])
327
-      vB <- log2(theB[XIndex, cols[i]])
326
+##      vA <- log2(theA[XIndex, cols[i]])
327
+##      vB <- log2(theB[XIndex, cols[i]])
328
+      vA <- log2(as.integer(theA[XIndex, cols[i]]))
329
+      vB <- log2(as.integer(theB[XIndex, cols[i]]))
328 330
       med[i] <- median(vA+vB)/2
329 331
       rm(vA, vB)
330 332
     }
... ...
@@ -427,281 +429,6 @@ crlmmGT2 <- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL,
427 429
                         sapply(IndexesBatch, length),
428 430
                         sapply(cIndexes, length), SMEDIAN,
429 431
                         theKnots, mixtureParams[], DF, probs, 0.025)
430
-
431
-    rm(snps, rSnps, IndexesBatch, tmpA, tmpB, last)
432
-    gc(verbose=FALSE)
433
-    close(A)
434
-    close(B)
435
-    close(mixtureParams)
436
-    tmp
437
-  }
438
-
439
-  newparamsBatch <- ocLapply(seq(along=snpBatches), process1,
440
-                             snpBatches=snpBatches,
441
-                             autosomeIndex=autosomeIndex, XIndex=XIndex,
442
-                             YIndex=YIndex, A=A, B=B,
443
-                             mixtureParams=mixtureParams, fIndex=fIndex,
444
-                             mIndex=mIndex, params=params,
445
-                             cIndexes=cIndexes, SMEDIAN=SMEDIAN,
446
-                             theKnots=theKnots, DF=DF, probs=probs,
447
-                             batchSize=ocProbesets())
448
-  newparams <- vector("list", 3)
449
-  names(newparams) <- c("centers", "scales", "N")
450
-  newparams[["centers"]] <- do.call("rbind", lapply(newparamsBatch, "[[", 1))
451
-  newparams[["scales"]] <- do.call("rbind", lapply(newparamsBatch, "[[", 2))
452
-  newparams[["N"]] <- do.call("rbind", lapply(newparamsBatch, "[[", 3))
453
-  rm(newparamsBatch); gc(verbose=FALSE)
454
-  if(verbose) message("Done.")
455
-  if(verbose) message("Estimating recalibration parameters.")
456
-  d <- newparams[["centers"]] - params$centers
457
-
458
-  ##regression
459
-  Index <- intersect(which(pmin(newparams[["N"]][, 1],
460
-                                newparams[["N"]][, 2],
461
-                                newparams[["N"]][, 3]) > recallMin &
462
-                                !apply(regionInfo, 1, any)),
463
-                                autosomeIndex)
464
-  if(length(Index) < recallRegMin){
465
-    warning("Recalibration not possible. Possible cause: small sample size.")
466
-    newparams <- params
467
-    dev <- vector("numeric", nrow(newparams[["centers"]]))
468
-    SS <- matrix(Inf, 3, 3)
469
-    DD <- 0
470
-  }else{
471
-    data4reg <- as.data.frame(newparams[["centers"]][Index,])
472
-    names(data4reg) <- c("AA", "AB", "BB")
473
-    regParams <- cbind(  coef(lm(AA~AB*BB, data=data4reg)),
474
-                       c(coef(lm(AB~AA+BB, data=data4reg)), 0),
475
-                         coef(lm(BB~AA*AB, data=data4reg)))
476
-    rownames(regParams) <- c("intercept", "X", "Y", "XY")
477
-    rm(data4reg)
478
-
479
-    minN <- 3
480
-    newparams[["centers"]][newparams[["N"]] < minN] <- NA
481
-    Index <- setdiff(which(rowSums(is.na(newparams[["centers"]]))==1), YIndex)
482
-    if(verbose) message("Filling out empty centers", appendLF=FALSE)
483
-    for(i in Index){
484
-      if(verbose) if(i%%10000==0) message(".", appendLF=FALSE)
485
-      mu <- newparams[["centers"]][i, ]
486
-      j <- which(is.na(mu))
487
-      newparams[["centers"]][i, j] <- c(1, mu[-j], prod(mu[-j]))%*%regParams[, j]
488
-      rm(mu, j)
489
-    }
490
-
491
-    ##remaing NAs are made like originals
492
-    if(length(YIndex)>0){
493
-      noMoveIndex <- union(setdiff(which(rowSums(is.na(newparams[["centers"]]))>0), YIndex),
494
-                           YIndex[rowSums(is.na(newparams[["centers"]][YIndex, ])>1)])
495
-    }
496
-    snps2ignore <- which(rowSums(is.na(newparams[["centers"]])) > 0)
497
-    snps2keep <- setdiff(autosomeIndex, snps2ignore)
498
-    rm(snps2ignore)
499
-    newparams[["centers"]][is.na(newparams[["centers"]])] <- params[["centers"]][is.na(newparams[["centers"]])]
500
-    if(verbose) cat("\n")
501
-
502
-    if(verbose) message("Calculating and standardizing size of shift... ", appendLF=FALSE)
503
-    GG <- DD <- newparams[["centers"]] - params[["centers"]]
504
-    DD <- sweep(DD, 2, colMeans(DD[autosomeIndex, ]))
505
-    SS <- cov(DD[autosomeIndex, ])
506
-    SSI <- solve(SS)
507
-    dev <- vector("numeric", nrow(DD))
508
-    if(length(YIndex)){
509
-      dev[-YIndex] <- apply(DD[-YIndex, ], 1, function(x) x%*%SSI%*%x)
510
-      dev[-YIndex] <- 1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev[-YIndex])
511
-      ##Now Y (only two params)
512
-      SSY <- SS[c(1, 3), c(1, 3)]
513
-      SSI <- solve(SSY)
514
-      dev[YIndex] <- apply(DD[YIndex, c(1, 3)], 1, function(x) x%*%SSI%*%x)
515
-      dev[YIndex] <- 1/sqrt( (2*pi)^2*det(SSY))*exp(-0.5*dev[YIndex])
516
-    } else {
517
-      dev=apply(DD,1,function(x) x%*%SSI%*%x)
518
-      dev=1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev)
519
-    }
520
-  }
521
-  if (verbose) message("OK")
522
-
523
-  ## BC: must keep SD
524
-  params[-2] <- newparams[-2]
525
-  rm(newparams)
526
-  gc(verbose=FALSE)
527
-
528
-  if(verbose) message("Calling ", NR, " SNPs... ", appendLF=FALSE)
529
-
530
-  ## ###################
531
-  ## ## MOVE TO C#######
532
-
533
-  ## running in batches
534
-  process2 <- function(idxBatch, snpBatches, autosomeIndex, XIndex,
535
-                       YIndex, A, B, mixtureParams, fIndex, mIndex,
536
-                       params, cIndexes, SMEDIAN, theKnots, DF, probs,
537
-                       regionInfo, batchSize){
538
-    open(A)
539
-    open(B)
540
-    open(mixtureParams)
541
-    snps <- snpBatches[[idxBatch]]
542
-    tmpA <- as.matrix(A[snps,])
543
-    tmpB <- as.matrix(B[snps,])
544
-    rSnps <- range(snps)
545
-    last <- (idxBatch-1)*batchSize
546
-    IndexesBatch <- list(autosomeIndex[autosomeIndex %in% snps]-last,
547
-                         XIndex[XIndex %in% snps]-last,
548
-                         YIndex[YIndex %in% snps]-last)
549
-    IndexesBatch <- lapply(IndexesBatch, as.integer)
550
-    ImNull <- gtypeCallerR2(tmpA, tmpB, fIndex, mIndex,
551
-                            params[["centers"]][snps,],
552
-                            params[["scales"]][snps,],
553
-                            params[["N"]][snps,],
554
-                            IndexesBatch, cIndexes,
555
-                            sapply(IndexesBatch, length),
556
-                            sapply(cIndexes, length),
557
-                            SMEDIAN, theKnots, mixtureParams[],
558
-                            DF, probs, 0.025,
559
-                            which(regionInfo[snps, 2]),
560
-                            which(regionInfo[snps, 1]))
561
-    A[snps,] <- tmpA
562
-    B[snps,] <- tmpB
563
-    rm(tmpA, tmpB, snps, rSnps, IndexesBatch, ImNull, last)
564
-    gc(verbose=FALSE)
565
-    close(A)
566
-    close(B)
567
-    close(mixtureParams)
568
-  }
569
-
570
-  ocLapply(seq(along=snpBatches), process2, snpBatches=snpBatches,
571
-           autosomeIndex=autosomeIndex, XIndex=XIndex, YIndex=YIndex,
572
-           A=A, B=B, mixtureParams=mixtureParams, fIndex=fIndex,
573
-           mIndex=mIndex, params=params, cIndexes=cIndexes,
574
-           SMEDIAN=SMEDIAN, theKnots=theKnots, DF=DF, probs=probs,
575
-           regionInfo=regionInfo, batchSize=ocProbesets())
576
-  ##  END MOVE TO C#######
577
-  ## ##################
578
-
579
-  dev <- dev/(dev+1/383)
580
-  if(!is.null(row.names)){ rownames(A) <- rownames(B) <- names(dev) <- row.names}
581
-  if(!is.null(col.names)){ colnames(A) <- colnames(B) <- col.names}
582
-
583
-  if(length(Index) >= recallRegMin){
584
-   tmp4batchQC <- DD[autosomeIndex,]/(params[["N"]][autosomeIndex,]+1)
585
-   tmpSnpQc <- dev[autosomeIndex]
586
-   SS <- cov(tmp4batchQC[tmpSnpQc < badSNP,])
587
-   batchQC <- mean(diag(SS))
588
-  }else{
589
-    SS <- matrix(0, 3, 3)
590
-    batchQC <- Inf
591
-  }
592
-
593
-  if(verbose) message("Done.")
594
-  if (returnParams){
595
-    return(list(calls=A, confs=B, SNPQC=dev, batchQC=batchQC, params=params, DD=DD, covDD=SS, gender=gender, pkgname=pkgname))
596
-  }else{
597
-    return(list(calls=A, confs=B, SNPQC=dev, batchQC=batchQC, DD=DD, covDD=SS, gender=gender, pkgname=pkgname))
598
-  }
599
-}
600
-
601
-
602
-## a copy of crlmmGT2, except changes noted below by *RS* comments
603
-rscrlmmGT2 <- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL,
604
-		       col.names=NULL, probs=c(1/3, 1/3, 1/3), DF=6,
605
-		       SNRMin=5, recallMin=10, recallRegMin=1000,
606
-		       gender=NULL, desctrucitve=FALSE, verbose=TRUE,
607
-		       returnParams=FALSE, badSNP=.7){
608
-  open(SNR)
609
-  open(A)
610
-  open(B)
611
-  open(mixtureParams)
612
-  ## expect objects to be ff
613
-
614
-  keepIndex <- which( SNR[] > SNRMin)
615
-  if(length(keepIndex)==0) stop("No arrays above quality threshold!")
616
-
617
-  NC <- ncol(A)
618
-  NR <- nrow(A)
619
-
620
-  pkgname <- getCrlmmAnnotationName(cdfName)
621
-  stopifnot(require(pkgname, character.only=TRUE, quietly=!verbose))
622
-
623
-  if(verbose) message("Loading annotations.")
624
-  obj1 <- loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname)
625
-  obj2 <- loader("mixtureStuff.rda", .crlmmPkgEnv, pkgname)
626
-  ## this is toget rid of the 'no visible binding' notes
627
-  ## variable definitions
628
-  XIndex <- getVarInEnv("XIndex")
629
-  autosomeIndex <- getVarInEnv("autosomeIndex")
630
-  YIndex <- getVarInEnv("YIndex")
631
-  SMEDIAN <- getVarInEnv("SMEDIAN")
632
-  theKnots <- getVarInEnv("theKnots")
633
-  regionInfo <- getVarInEnv("regionInfo")
634
-  params <- getVarInEnv("params")
635
-  rm(list=c(obj1, obj2), envir=.crlmmPkgEnv)
636
-  rm(obj1, obj2)
637
-
638
-  ## IF gender not provide, we predict
639
-  ## FIXME: XIndex may be greater than ocProbesets()
640
-  if(is.null(gender)){
641
-    if(verbose) message("Determining gender.")
642
-##    XMedian <- apply(log2(A[XIndex,, drop=FALSE])+log2(B[XIndex,, drop=FALSE]), 2, median)/2
643
-    XMedian <- ocLapply(splitIndicesByNode(1:NC), predictGender, theA=A, theB=B, XIndex=XIndex, neededPkgs="crlmm")
644
-    XMedian <- unlist(XMedian)
645
-    if(sum(SNR[] > SNRMin)==1){
646
-      gender <- which.min(c(abs(XMedian-8.9), abs(XMedian-9.5)))
647
-    }else{
648
-      gender <- kmeans(XMedian, c(min(XMedian[SNR[]>SNRMin]), max(XMedian[SNR[]>SNRMin])))[["cluster"]]
649
-    }
650
-  }
651
-
652
-  Indexes <- list(autosomeIndex, XIndex, YIndex)
653
-  cIndexes <- list(keepIndex,
654
-                   keepIndex[which(gender[keepIndex]==2)],
655
-                   keepIndex[which(gender[keepIndex]==1)])
656
-
657
-  ## move this statement further down
658
-  ##if(verbose) cat("Calling", NR, "SNPs for recalibration... ")
659
-
660
-  ## call C
661
-  fIndex <- which(gender==2)
662
-  mIndex <- which(gender==1)
663
-
664
-  ## different here
665
-  ## use gtypeCallerR in batches
666
-  if(is.null(row.names) & is.null(rownames(A))){
667
-	  ##verify that A has only snps.  otherwise, calling function must pass rownames
668
-	  message("rownames not available.  Assuming only SNPs in A")
669
-	  snpBatches <- splitIndicesByLength(1:nrow(A), ocProbesets())
670
-  } else{
671
-	  loader("preprocStuff.rda", .crlmmPkgEnv, pkgname)
672
-	  gns <- getVarInEnv("gns")
673
-	  index <- match(gns, rownames(A))
674
-	  snpBatches <- splitIndicesByLength(index, ocProbesets())
675
-  }
676
-  NR <- length(unlist(snpBatches))
677
-  if(verbose) cat("Calling", NR, "SNPs for recalibration... ")
678
-  newparamsBatch <- vector("list", length(snpBatches))
679
-  process1 <- function(idxBatch, snpBatches, autosomeIndex, XIndex,
680
-                       YIndex, A, B, mixtureParams, fIndex, mIndex,
681
-                       params, cIndexes, SMEDIAN, theKnots, DF, probs, batchSize){
682
-    open(A)
683
-    open(B)
684
-    open(mixtureParams)
685
-    snps <- snpBatches[[idxBatch]]
686
-    rSnps <- range(snps)
687
-    last <- (idxBatch-1)*batchSize
688
-    IndexesBatch <- list(autosomeIndex[autosomeIndex %in% snps]-last,
689
-                         XIndex[XIndex %in% snps]-last,
690
-                         YIndex[YIndex %in% snps]-last)
691
-    IndexesBatch <- lapply(IndexesBatch, as.integer)
692
-    tmpA <- as.matrix(A[snps,])
693
-    tmpB <- as.matrix(B[snps,])
694
-    ## newparamsBatch[[idxBatch]]
695
-
696
-    tmp <- gtypeCallerR(tmpA, tmpB, fIndex, mIndex,
697
-                        params[["centers"]][snps,],
698
-                        params[["scales"]][snps,],
699
-                        params[["N"]][snps,],
700
-                        IndexesBatch, cIndexes,
701
-                        sapply(IndexesBatch, length),
702
-                        sapply(cIndexes, length), SMEDIAN,
703
-                        theKnots, mixtureParams[], DF, probs, 0.025)
704
-
705 432
     rm(snps, rSnps, IndexesBatch, tmpA, tmpB, last)
706 433
     gc(verbose=FALSE)
707 434
     close(A)
... ...
@@ -12,6 +12,7 @@ readIdatFiles = function(sampleSheet=NULL,
12 12
 			  sep="_",
13 13
 			  fileExt=list(green="Grn.idat", red="Red.idat"),
14 14
 			  saveDate=FALSE, verbose=FALSE) {
15
+	verbose <- FALSE
15 16
        if(!is.null(arrayNames)) {
16 17
                pd = new("AnnotatedDataFrame", data = data.frame(Sample_ID=arrayNames))
17 18
        }
...