Browse code

added functions for copy number analysis (not yet documented). Added copynumber vignette

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@37630 bc3139a8-67e5-0310-9ffc-ced21a209358

Rob Scharp authored on 27/02/2009 13:06:31
Showing 9 changed files

... ...
@@ -1,14 +1,14 @@
1 1
 Package: crlmm
2 2
 Type: Package
3 3
 Title: Genotype Calling via CRLMM Algorithm
4
-Version: 1.0.34
4
+Version: 1.0.35
5 5
 Date: 2008-12-28
6 6
 Author: Rafael A Irizarry
7 7
 Maintainer: Benilton S Carvalho <bcarvalh@jhsph.edu>
8 8
 Description: Faster implementation of CRLMM specific to SNP 5.0 and 6.0 arrays.
9
-Imports: affyio, preprocessCore, utils, stats
9
+Imports: affyio, preprocessCore, utils, stats, genefilter, splines, Biobase
10 10
 License: Artistic-2.0
11
-Suggests: hapmapsnp5, genomewidesnp5Crlmm, methods
11
+Suggests: hapmapsnp5, genomewidesnp5Crlmm, genomewidesnp6Crlmm, methods
12 12
 Collate: crlmmGT.R
13 13
          crlmm.R
14 14
          fitAffySnpMixture56.R
... ...
@@ -17,4 +17,5 @@ Collate: crlmmGT.R
17 17
          zzz.R
18 18
          crlmmGTnm.R
19 19
          crlmmnm.R
20
+	 functions.R
20 21
 LazyLoad: yes
... ...
@@ -1,6 +1,9 @@
1 1
 useDynLib("crlmm", .registration=TRUE)
2
-export("crlmm", "list.celfiles")
2
+export("crlmm", "list.celfiles", "computeCnBatch")
3 3
 importFrom(affyio, read.celfile.header, read.celfile)
4 4
 importFrom(preprocessCore, normalize.quantiles.use.target)
5 5
 importFrom(utils, data, packageDescription, setTxtProgressBar, txtProgressBar)
6 6
 importFrom(stats, coef, cov, dnorm, kmeans, lm, mad, median, quantile, sd)
7
+##RS
8
+importFrom(Biobase, rowMedians)
9
+importFrom(genefilter, rowSds)
... ...
@@ -5,7 +5,7 @@ crlmm <- function(filenames, row.names=TRUE, col.names=TRUE,
5 5
                   cdfName, sns, recallMin=10, recallRegMin=1000,
6 6
                   returnParams=FALSE, badSNP=.7){
7 7
   if ((load.it | save.it) & missing(intensityFile))
8
-    stop("'intensityFile' is missing, and you chose either load.it or save.it")
8
+	  stop("'intensityFile' is missing, and you chose either load.it or save.it")
9 9
   
10 10
   if (missing(sns)) sns <- basename(filenames)
11 11
   if (!missing(intensityFile))
... ...
@@ -33,7 +33,6 @@ crlmm <- function(filenames, row.names=TRUE, col.names=TRUE,
33 33
   }
34 34
   if(row.names) row.names=res$gns else row.names=NULL
35 35
   if(col.names) col.names=res$sns else col.names=NULL
36
-
37 36
   res2 <- crlmmGT(res[["A"]], res[["B"]], res[["SNR"]],
38 37
                   res[["mixtureParams"]], res[["cdfName"]],
39 38
                   gender=gender, row.names=row.names,
... ...
@@ -36,13 +36,13 @@ crlmmGT <- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL,
36 36
   
37 37
   ##IF gender not provide, we predict
38 38
   if(is.null(gender)){
39
-    if(verbose) message("Determining gender.")
40
-    XMedian <- apply(log2(A[XIndex,, drop=FALSE])+log2(B[XIndex,, drop=FALSE]), 2, median)/2
41
-    if(sum(SNR>SNRMin)==1){
42
-      gender <- which.min(c(abs(XMedian-8.9), abs(XMedian-9.5)))
43
-    }else{
44
-      gender <- kmeans(XMedian, c(min(XMedian[SNR>SNRMin]), max(XMedian[SNR>SNRMin])))[["cluster"]]
45
-    }
39
+	  if(verbose) message("Determining gender.")
40
+	  XMedian <- apply(log2(A[XIndex,, drop=FALSE])+log2(B[XIndex,, drop=FALSE]), 2, median)/2
41
+	  if(sum(SNR>SNRMin)==1){
42
+		  gender <- which.min(c(abs(XMedian-8.9), abs(XMedian-9.5)))
43
+	  }else{
44
+		  gender <- kmeans(XMedian, c(min(XMedian[SNR>SNRMin]), max(XMedian[SNR>SNRMin])))[["cluster"]]
45
+	  }
46 46
   }
47 47
   
48 48
   Indexes <- list(autosomeIndex, XIndex, YIndex)
49 49
new file mode 100644
... ...
@@ -0,0 +1,984 @@
1
+rowCovs <- function(x, y, ...){
2
+	notna <- !is.na(x)
3
+	N <- rowSums(notna)
4
+	x <- suppressWarnings(log2(x))
5
+	if(!missing(y)) y <- suppressWarnings(log2(y))
6
+	return(rowSums((x - rowMeans(x, ...)) * (y - rowMeans(y, ...)), ...)/(N-1))
7
+}
8
+
9
+rowMAD <- function(x, y, ...){
10
+	notna <- !is.na(x)
11
+	mad <- 1.4826*rowMedians(abs(x-rowMedians(x, ...)), ...)
12
+	return(mad)
13
+}
14
+
15
+rowCors <- function(x, y, ...){
16
+	N <- rowSums(!is.na(x))
17
+	x <- suppressWarnings(log2(x))
18
+	y <- suppressWarnings(log2(y))
19
+	sd.x <- rowSds(x, ...)
20
+	sd.y <- rowSds(y, ...)
21
+	covar <- rowSums((x - rowMeans(x, ...)) * (y - rowMeans(y, ...)), ...)/(N-1)
22
+	return(covar/(sd.x*sd.y))
23
+}
24
+
25
+nuphiAllele <- function(allele, Ystar, W, envir, p){
26
+	Ns <- get("Ns", envir)
27
+	NOHET <- mean(Ns[, p, 2], na.rm=TRUE) < 0.05
28
+	chrom <- get("chrom", envir)
29
+	if(missing(allele)) stop("must specify allele")
30
+	if(chrom == 23){
31
+		gender <- get("gender", envir)
32
+		if(allele == "A" & gender == "female")
33
+			X <- cbind(1, 2:0)
34
+		if(allele == "B" & gender == "female")
35
+			X <- cbind(1, 0:2)
36
+		if(allele == "A" & gender == "male")
37
+			X <- cbind(1, 1:0)
38
+		if(allele == "B" & gender == "male")
39
+			X <- cbind(1, 0:1)
40
+	} else {##autosome
41
+		if(allele == "A") X <- cbind(1, 2:0) else X <- cbind(1, 0:2)
42
+		if(NOHET) X <- X[-2, ] ##more than 1 X chromosome, but all homozygous		
43
+	}
44
+	if(any(!is.finite(W))){## | any(!is.finite(V))){
45
+		i <- which(rowSums(!is.finite(W)) > 0)
46
+		browser()
47
+		stop("Inf values in W or V")
48
+	}
49
+	##How to quickly generate Xstar, Xstar = diag(W) %*% X
50
+	generateX <- function(w, X) as.numeric(diag(w) %*% X)
51
+	##Not instant
52
+	Xstar <- apply(W, 1, generateX, X)
53
+	generateIXTX <- function(x, nrow=3) {
54
+		X <- matrix(x, nrow=nrow)
55
+		XTX <- crossprod(X)
56
+		solve(XTX)
57
+	}
58
+	##a little slow
59
+	IXTX <- apply(Xstar, 2, generateIXTX, nrow=nrow(X))
60
+	betahat <- matrix(NA, 2, nrow(Ystar))
61
+	ses <- matrix(NA, 2, nrow(Ystar))
62
+	for(i in 1:nrow(Ystar)){
63
+		betahat[, i] <- crossprod(matrix(IXTX[, i], 2, 2), crossprod(matrix(Xstar[, i], nrow=nrow(X)), Ystar[i, ]))
64
+		ssr <- sum((Ystar[i, ] - matrix(Xstar[, i], nrow(X), 2) %*% matrix(betahat[, i], 2, 1))^2)
65
+		ses[, i] <- sqrt(diag(matrix(IXTX[, i], 2, 2) * ssr))
66
+	}
67
+	nu <- betahat[1, ]
68
+	phi <- betahat[2, ]
69
+	nu.se <- ses[1,]
70
+	phi.se <- ses[2,]
71
+	##dimnames(nu) <- dimnames(phi) <- dimnames(nu.ses) <- dimnames(phi.ses)
72
+	assign("nu", nu, envir=parent.frame(1))
73
+	assign("phi", phi, envir=parent.frame(1))
74
+	assign("nu.se", nu.se, envir=parent.frame(1))
75
+	assign("phi.se", phi.se, envir=parent.frame(1))	
76
+}
77
+
78
+
79
+
80
+celDatesFrom <- function(celfiles, path=""){
81
+	require(affyio)
82
+	celdates <- vector("character", length(celfiles))
83
+	celtimes <- vector("character", length(celfiles))
84
+	for(i in seq(along=celfiles)){
85
+		if(i %% 100 == 0) cat(".")
86
+		tmp <- read.celfile.header(file.path(path, celfiles[i]), info="full")$DatHeader
87
+		tmp <- strsplit(tmp, "\ +")
88
+		celdates[i] <- tmp[[1]][6]
89
+		celtimes[i] <- tmp[[1]][7]
90
+	}
91
+	tmp <- paste(celdates, celtimes)
92
+	celdts <- strptime(tmp, "%m/%d/%y %H:%M:%S")
93
+	return(celdts)
94
+}
95
+
96
+
97
+cnrma <- function(filenames, sns, cdfName, seed=1){
98
+##	require(preprocessCore) || stop("Package preprocessCore not available")
99
+	if (missing(sns)) sns <- basename(filenames)
100
+	if (missing(cdfName))
101
+		cdfName <- read.celfile.header(filenames[1])$cdfName
102
+	##  stuffDir <- changeToCrlmmAnnotationName(cdfName)
103
+	pkgname <- crlmm:::getCrlmmAnnotationName(cdfName)
104
+	if(!require(pkgname, character.only=TRUE, quietly=!verbose)){
105
+		suggCall <- paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="")
106
+		msg <- paste("If", pkgname, "is installed on an alternative location, please load it manually by using", suggCall)
107
+		message(strwrap(msg))
108
+		stop("Package ", pkgname, " could not be found.")
109
+		rm(suggCall, msg)
110
+	}
111
+	
112
+	##I'm using the pd.mapping package
113
+	map <- list(); j <- 1
114
+	for(chrom in c(1:22, "X")){
115
+		map[[j]] <- getCnvFid(chrom)
116
+		j <- j+1
117
+	}
118
+	map <- lapply(map, function(x) x[order(x[, "chrom_start"]), ])
119
+	map <- do.call("rbind", map)
120
+
121
+	
122
+	fid <- map[, "fid"]
123
+	set.seed(seed)
124
+	idx2 <- sample(length(fid), 10^5) ##for skewness. no need to do everything
125
+	SKW <- vector("numeric", length(filenames))
126
+	NP <- matrix(NA, nrow(map), length(filenames))
127
+	verbose <- TRUE
128
+	if(verbose){
129
+		message("Processing ", length(filenames), " files.")
130
+		if (getRversion() > '2.7.0') pb <- txtProgressBar(min=0, max=length(filenames), style=3)
131
+	}
132
+	##load reference distribution obtained from hapmap
133
+	load("/thumper/ctsa/snpmicroarray/hapmap/processed/affy/1m_reference_cn.rda")
134
+	for(i in seq(along=filenames)){
135
+		y <- as.matrix(read.celfile(filenames[i], intensity.means.only=TRUE)[["INTENSITY"]][["MEAN"]][fid])
136
+		x <- log2(y[idx2])
137
+		SKW[i] <- mean((x-mean(x))^3)/(sd(x)^3)
138
+		rm(x)
139
+		NP[, i] <- as.integer(normalize.quantiles.use.target(y, target=reference))
140
+		##A[, i] <- intMedianSummaries(y[aIndex, 1, drop=FALSE], pnsa)
141
+		##B[, i] <- intMedianSummaries(y[bIndex, 1, drop=FALSE], pnsb)
142
+		##Now to fit the EM
143
+		if (verbose)
144
+			if (getRversion() > '2.7.0') setTxtProgressBar(pb, i)
145
+			else cat(".")
146
+	}
147
+	dimnames(NP) <- list(map[, "man_fsetid"], sns)
148
+	res3 <- list(NP=NP, SKW=SKW)
149
+	return(res3)
150
+}
151
+
152
+getFlags <- function(phi.thr, envir){
153
+	nuA <- get("nuA", envir)
154
+	nuB <- get("nuB", envir)
155
+	phiA <- get("phiA", envir)
156
+	phiB <- get("phiB", envir)
157
+	
158
+	negativeNus <- nuA < 1 | nuB < 1
159
+	negativeNus <- negativeNus > 0
160
+
161
+	negativePhis <- phiA < phi.thr | phiB < phi.thr
162
+	negativePhis <- negativePhis > 0
163
+	negativeCoef <- negativeNus | negativePhis
164
+
165
+	notfinitePhi <- !is.finite(phiA) | !is.finite(phiB)
166
+	notfinitePhi <- notfinitePhi > 0
167
+	flags <- negativeCoef | notfinitePhi  
168
+}
169
+
170
+goodSnps <- function(phi.thr, envir, fewAA=20, fewBB=20){
171
+	Ns <- get("Ns", envir)
172
+	flags <- getFlags(phi.thr=phi.thr, envir)
173
+	fewAA <- Ns[, , 1] < fewAA
174
+	fewBB <- Ns[, , 3] < fewBB
175
+	flagsA <- flags | fewAA
176
+	flagsB <- flags | fewBB
177
+	flags <- list(A=flagsA, B=flagsB)
178
+	return(flags)
179
+}
180
+
181
+instantiateObjects <- function(calls, NP, plate, envir, chrom){
182
+	if(missing(chrom)) stop("must specify chrom")
183
+	assign("chrom", chrom, envir)
184
+	snps <- rownames(calls)
185
+	cnvs <- rownames(NP)
186
+	sns <- basename(colnames(calls))
187
+	stopifnot(identical(colnames(calls), colnames(NP)))
188
+	assign("sns", sns, envir)	
189
+	assign("snps", snps, envir)
190
+	assign("cnvs", cnvs, envir)	
191
+
192
+	CA <- CB <- matrix(NA, nrow(calls), ncol(calls))
193
+	assign("plate", plate, envir)
194
+	uplate <- unique(plate)	
195
+	Ns <- array(NA, dim=c(nrow(calls), length(uplate), 3))
196
+	dimnames(Ns)[[3]] <- c("AA", "AB", "BB")
197
+	muA.AA <- matrix(NA, nrow=nrow(calls), length(uplate))
198
+	##Sigma.AA <- array(NA, dim=c(nrow(calls), 3, length(uplate)))
199
+	##dimnames(Sigma.AA)[2:3] <- list(c("varA", "varB", "cov"), uplate)
200
+
201
+	NP.CT <- matrix(NA, nrow(NP), ncol(NP))
202
+	NP.sds <- matrix(NA, nrow(NP), ncol(NP))
203
+	assign("NP.CT", NP.CT, envir)
204
+	assign("NP.sds", NP.sds, envir)
205
+	nus <- matrix(NA, nrow(NP), length(uplate))
206
+	assign("nus", nus, envir=envir)  
207
+	assign("phis", nus, envir=envir)
208
+
209
+	plates.completed <- rep(FALSE, length(uplate))
210
+	assign("plates.completed", plates.completed, envir)
211
+	
212
+	fit.variance <- NULL
213
+	snpflags <- NULL
214
+	assign("fit.variance", fit.variance, envir=envir)
215
+	assign("snpflags", snpflags, envir=envir)
216
+	
217
+	assign("Ns", Ns, envir=envir)		
218
+	assign("uplate", uplate, envir=envir)	
219
+	assign("muA.AA", muA.AA, envir=envir)
220
+	assign("muA.AB", muA.AA, envir=envir)
221
+	assign("muA.BB", muA.AA, envir=envir)
222
+	assign("muB.AA", muA.AA, envir=envir)
223
+	assign("muB.AB", muA.AA, envir=envir)
224
+	assign("muB.BB", muA.AA, envir=envir)
225
+	
226
+	assign("tau2A", muA.AA, envir=envir)
227
+	assign("tau2B", muA.AA, envir=envir)
228
+	assign("sig2A", muA.AA, envir=envir)
229
+	assign("sig2B", muA.AA, envir=envir)
230
+	assign("corr", muA.AA, envir=envir)
231
+	assign("corrA.BB", muA.AA, envir=envir)
232
+	assign("corrB.AA", muA.AA, envir=envir)			
233
+	
234
+	assign("nuA", muA.AA, envir=envir)
235
+	assign("nuB", muA.AA, envir=envir)
236
+	assign("phiA", muA.AA, envir=envir)
237
+	assign("phiB", muA.AA, envir=envir)
238
+	assign("nuA.se", muA.AA, envir=envir)
239
+	assign("nuB.se", muA.AA, envir=envir)
240
+	assign("phiA.se", muA.AA, envir=envir)
241
+	assign("phiB.se", muA.AA, envir=envir)
242
+	assign("sd.CT", CA, envir=envir)
243
+	assign("CA", CA, envir=envir)
244
+	assign("CB", CB, envir=envir)
245
+}
246
+
247
+computeCnBatch <- function(A,
248
+			   B,
249
+			   calls,
250
+			   conf,
251
+			   NP,
252
+			   plate,
253
+			   fit.variance=NULL,
254
+			   seed=123,
255
+			   MIN.OBS=3,
256
+			   envir,
257
+			   chrom, P, DF.PRIOR=50, CONF.THR=0.99, trim, upperTail, ...){
258
+	if(length(ls(envir)) == 0) instantiateObjects(calls, NP, plate, envir, chrom)
259
+	##require(genefilter)
260
+	require(splines)
261
+	require(Biobase)##needs rowMedians
262
+	##will be updating these objects
263
+	uplate <- get("uplate", envir)
264
+	##AAA <- A
265
+	##BBB <- B
266
+	message("Sufficient statistics")
267
+	if(missing(P)) P <- seq(along=uplate)
268
+	for(p in P){
269
+		cat(".")
270
+		if(sum(plate == uplate[p]) < 10) next()
271
+		oneBatch(plateIndex=p,
272
+			 G=calls[, plate==uplate[p]],
273
+			 A=A[, plate==uplate[p]],
274
+			 B=B[, plate==uplate[p]],
275
+			 envir=envir,
276
+			 MIN.OBS=MIN.OBS,
277
+			 DF.PRIOR=DF.PRIOR,
278
+			 trim=trim, upperTail=upperTail)
279
+	}
280
+	message("\nEstimating coefficients")	
281
+	for(p in P){
282
+		cat(".")
283
+		coefs(plateIndex=p, conf=conf[, plate==uplate[p]],
284
+		      envir=envir, CONF.THR=CONF.THR)
285
+	}
286
+	message("\nAllele specific copy number")	
287
+	for(p in P){
288
+		cat(".")
289
+		copynumber(plateIndex=p,
290
+			   A=A[, plate==uplate[p]],
291
+			   B=B[, plate==uplate[p]],
292
+			   envir=envir)
293
+	}
294
+	message("\nCopy number for nonpolymorphic probes...")	
295
+	for(p in P){
296
+		cat(".")
297
+		nonpolymorphic(plateIndex=p,
298
+			       NP=NP[, plate==uplate[p]],
299
+			       envir=envir)
300
+	}
301
+}
302
+
303
+nonpolymorphic <- function(plateIndex, NP, envir){
304
+	require(genefilter) 
305
+	p <- plateIndex
306
+	plate <- get("plate", envir)
307
+	uplate <- get("uplate", envir)	
308
+	plates.completed <- get("plates.completed", envir)
309
+	if(!plates.completed[p]) return()
310
+	##snpflags <- get("snpflags", envir)
311
+	##if(is.null(snpflags)){
312
+	snpflags <- goodSnps(phi.thr=10, envir=envir, fewAA=10, fewBB=10)
313
+	##assign("snpflags", snpflags, envir=envir)
314
+	flagsA <- snpflags$A[, p]
315
+	flagsB <- snpflags$B[, p]
316
+	if(all(flagsA) | all(flagsB)) stop("all snps are flagged")
317
+	nuA <- get("nuA", envir)
318
+	nuB <- get("nuB", envir)
319
+	phiA <- get("phiA", envir)
320
+	phiB <- get("phiB", envir)
321
+	uplate <- get("uplate", envir)
322
+	sns <- get("sns", envir)
323
+	muA.AA <- get("muA.AA", envir)
324
+	muA.AB <- get("muA.AB", envir)
325
+	muA.BB <- get("muA.BB", envir)
326
+	muB.AA <- get("muB.AA", envir)
327
+	muB.AB <- get("muB.AB", envir)
328
+	muB.BB <- get("muB.BB", envir)
329
+	NP.CT <- get("NP.CT", envir)
330
+	nus <- get("nus", envir)
331
+	phis <- get("phis", envir)
332
+	NP.sds <- get("NP.sds", envir)
333
+	##fit.variance <- get("fit.variance", envir)
334
+	NP.CT <- get("NP.CT", envir)
335
+	##---------------------------------------------------------------------------
336
+	## Train on unflagged SNPs
337
+	##---------------------------------------------------------------------------		
338
+	plateInd <- plate == uplate[p]
339
+	muA.AAp <- muA.AA[!flagsA, p]
340
+	muB.BBp <- muB.BB[!flagsB, p]
341
+		
342
+	##From SNP data
343
+	X <- cbind(1, log(c(muA.AAp, muB.BBp)))
344
+	Y <- log(c(phiA[!flagsA, p], phiB[!flagsB, p]))
345
+	##Y.nu <- log(c(nuA[!snpflags$A, p], nuB[!snpflags$B, p]))
346
+	##Y <- cbind(Y.nu, Y.phi)
347
+	betahat <- solve(crossprod(X), crossprod(X, Y))
348
+	##Prediction
349
+	mus <- rowMedians(NP, na.rm=TRUE)
350
+	X <- cbind(1, log(mus))
351
+	Yhat <- as.numeric(X %*% betahat)
352
+	##NP.nus[, p] <- exp(Yhat[, 1])
353
+	##NP.phis[, p] <- exp(Yhat[, 2])
354
+	phi <- exp(Yhat)
355
+	nu <- mus - 2*phi
356
+	
357
+	phis[, p] <- as.integer(phi)
358
+	nus[, p] <- as.integer(nu)
359
+	##plot(NP.phis[, p], NP.nus[, p], pch=".")
360
+	CT <- 1/phi*(NP-nu)
361
+	tmp <- matrix(as.integer(CT*100), nrow(CT), ncol(CT))
362
+	NP.CT[, plateInd] <- tmp
363
+	if(FALSE){
364
+		##should be centered at 2
365
+		tmp <- rowMeans(NP.CT[, plateInd])
366
+		hist(tmp/100, breaks=200)
367
+	}
368
+
369
+	##---------------------------------------------------------------------------
370
+	## For NA SNPs, treat as nonpolymorphic
371
+	##---------------------------------------------------------------------------
372
+	CA <- get("CA", envir)
373
+	CB <- get("CB", envir)	
374
+	tmpA <- CA[, plate==uplate[p]]
375
+	tmpB <- CB[, plate==uplate[p]]	
376
+	indexA <- which(rowSums(is.na(tmpA)) > 0)
377
+	indexB <- which(rowSums(is.na(tmpB)) > 0)
378
+	index <- union(indexA, indexB)
379
+	if(length(index) > 0) browser()
380
+	
381
+	##---------------------------------------------------------------------------
382
+	## this part could stand improvement
383
+	##  - estimate the uncertainty
384
+	##---------------------------------------------------------------------------
385
+	
386
+	##---------------------------------------------------------------------------
387
+	## Estimate variance for copy number probes
388
+	## VAR(CT) = var(1/phi * (NP - nu))
389
+	##         = 1/phi^2 * VAR(NP)
390
+	##         = 1/phi^2 * f(mu) * sigma
391
+	## ** Phi is predicted from a regression using the row medians as predictors
392
+	##   -- this may underestimate the uncertainty
393
+	##---------------------------------------------------------------------------		
394
+##	log.mus <- log2(mus)
395
+##	log.var <- log2(rowVars(NP))
396
+##	f <- predict(fit.variance, newdata=data.frame(log.mus=log.mus))
397
+##	resid <- log.var - f
398
+##	sds <- 1/phi*sqrt(2^(predict(fit.variance, newdata=data.frame(log.mus=log.mus+resid))))
399
+##
400
+##	robustSD <- function(X) diff(quantile(X, probs=c(0.16, (1-0.16)), na.rm=TRUE))/2
401
+##	sample.sds <- apply(CT, 2, robustSD)
402
+##	sample.sds <- sample.sds/median(sample.sds, na.rm=TRUE)
403
+##	NP.sds[, plate==uplate[p]] <- sds %*% t(sample.sds)
404
+	assign("phis", phis, envir)
405
+	assign("nus", nus, envir)
406
+	assign("NP.CT", NP.CT, envir)
407
+##	assign("NP.sds", NP.sds, envir)
408
+##	firstPass.NP <- list(phis=phis, nus=nus, CT=NP.CT, sds=sds,
409
+##			     gns=gns, sns=sns, bns=bns)
410
+##	firstPass.NP
411
+}
412
+
413
+oneBatch <- function(plateIndex, G, A, B, MIN.OBS=3, DF.PRIOR, envir, trim, upperTail, ...){
414
+	p <- plateIndex
415
+	plate <- get("plate", envir)
416
+	AA <- G == 1
417
+	AB <- G == 2
418
+	BB <- G == 3
419
+	Ns <- get("Ns", envir)
420
+	Ns[, p, "AA"] <- rowSums(AA)
421
+	Ns[, p, "AB"] <- rowSums(AB)
422
+	Ns[, p, "BB"] <- rowSums(BB)
423
+	AA[AA == FALSE] <- NA
424
+	AB[AB == FALSE] <- NA
425
+	BB[BB == FALSE] <- NA
426
+	Ip <- array(NA, dim=c(nrow(G), ncol(G), 3, 2))
427
+	dimnames(Ip) <- list(rownames(G), colnames(G), c("AA", "AB", "BB"), c("A", "B"))
428
+	Ip[, , "AA", "A"] <- AA*A
429
+	Ip[, , "AB", "A"] <- AB*A
430
+	Ip[, , "BB", "A"] <- BB*A
431
+	Ip[, , "AA", "B"] <- AA*B
432
+	Ip[, , "AB", "B"] <- AB*B
433
+	Ip[, , "BB", "B"] <- BB*B
434
+	##---------------------------------------------------------------------------
435
+	## Sufficient statistics (plate-specific)
436
+	##---------------------------------------------------------------------------
437
+	##These matrices instantiated in the calling function are updated
438
+	muA.AA <- get("muA.AA", envir)
439
+	muA.AB <- get("muA.AB", envir)
440
+	muA.BB <- get("muA.BB", envir)
441
+	muB.AA <- get("muB.AA", envir)
442
+	muB.AB <- get("muB.AB", envir)
443
+	muB.BB <- get("muB.BB", envir)
444
+	tau2A <- get("tau2A", envir)
445
+	tau2B <- get("tau2B", envir)
446
+	sig2A <- get("sig2A", envir)
447
+	sig2B <- get("sig2B", envir)
448
+	corr <- get("corr", envir)
449
+	corrA.BB <- get("corrA.BB", envir)  ## B allele
450
+	corrB.AA <- get("corrB.AA", envir)
451
+
452
+	AA.A <- AA*A
453
+	AB.A <- AB*A
454
+	BB.A <- BB*A
455
+	AA.B <- AA*B
456
+	AB.B <- AB*B
457
+	BB.B <- BB*B
458
+	if(trim > 0){
459
+		##rowMedians is not robust enough when a variant is common
460
+		##Try a trimmed rowMedian, trimming only one tail (must specify which)
461
+		##  - for genotypes with 3 or more observations, exclude the upper X% 
462
+		##        - one way to do this is to replace these observations with NAs
463
+		##         e.g, exclude round(Ns * X%, 0) observations
464
+		##  - for genotypes with fewer than 10 observations,
465
+		##  - recalculate rowMedians
466
+		replaceWithNAs <- function(x, trim, upperTail){
467
+			##put NA's last if trimming the upperTail
468
+			if(upperTail) decreasing <- TRUE else decreasing <- FALSE
469
+			NN <- round(sum(!is.na(x)) * trim, 0)
470
+			ix <- order(x, decreasing=decreasing, na.last=TRUE)[1:NN]
471
+			x[ix] <- NA
472
+			return(x)
473
+		}
474
+		##which rows should be trimmed
475
+		rowsToTrim <- which(round(Ns[, p, "AA"] * trim, 0) > 0)
476
+		##replace values in the tail of A with NAs
477
+		AA.A[rowsToTrim, ] <- t(apply(AA.A[rowsToTrim, ], 1, replaceWithNAs, trim=trim, upperTail=upperTail))
478
+		AA.B[rowsToTrim, ] <- t(apply(AA.B[rowsToTrim, ], 1, replaceWithNAs, trim=trim, upperTail=upperTail))
479
+		rowsToTrim <- which(round(Ns[, p, "AB"] * trim, 0) > 0)
480
+		AB.A[rowsToTrim, ] <- t(apply(AB.A[rowsToTrim, ], 1, replaceWithNAs, trim=trim, upperTail=upperTail))
481
+		AB.B[rowsToTrim, ] <- t(apply(AB.B[rowsToTrim, ], 1, replaceWithNAs, trim=trim, upperTail=upperTail))
482
+		rowsToTrim <- which(round(Ns[, p, "BB"] * trim, 0) > 0)
483
+		BB.A[rowsToTrim, ] <- t(apply(BB.A[rowsToTrim, ], 1, replaceWithNAs, trim=trim, upperTail=upperTail))
484
+		BB.B[rowsToTrim, ] <- t(apply(BB.B[rowsToTrim, ], 1, replaceWithNAs, trim=trim, upperTail=upperTail))
485
+
486
+		##Should probably recompute the Ns -- change the
487
+		##degrees of freedom
488
+		Ns[, p, "AA"] <- rowSums(!is.na(AA.A))
489
+		Ns[, p, "AB"] <- rowSums(!is.na(AB.A))
490
+		Ns[, p, "BB"] <- rowSums(!is.na(BB.A))		
491
+	}
492
+	muA.AA[, p] <- rowMedians(AA.A, na.rm=TRUE)
493
+	muA.AB[, p] <- rowMedians(AB.A, na.rm=TRUE)
494
+	muA.BB[, p] <- rowMedians(BB.A, na.rm=TRUE)
495
+	muB.AA[, p] <- rowMedians(AA.B, na.rm=TRUE)
496
+	muB.AB[, p] <- rowMedians(AB.B, na.rm=TRUE)
497
+	muB.BB[, p] <- rowMedians(BB.B, na.rm=TRUE)
498
+	sigmaA <- matrix(NA, nrow(AA), 3)
499
+	sigmaB <- matrix(NA, nrow(AA), 3)	
500
+	colnames(sigmaB) <- colnames(sigmaA) <- c("AA", "AB", "BB")
501
+
502
+
503
+	sigmaA[, "AA"] <- rowMAD(AA.A, na.rm=TRUE)
504
+	sigmaA[, "AB"] <- rowMAD(AB.A, na.rm=TRUE)
505
+	sigmaA[, "BB"] <- rowMAD(BB.A, na.rm=TRUE)
506
+	sigmaB[, "AA"] <- rowMAD(AA.B, na.rm=TRUE)
507
+	sigmaB[, "AB"] <- rowMAD(AB.B, na.rm=TRUE)
508
+	sigmaB[, "BB"] <- rowMAD(BB.B, na.rm=TRUE)
509
+
510
+##	sigmaA[, "AA"] <- rowSds(AA.A, na.rm=TRUE)
511
+##	sigmaA[, "AB"] <- rowSds(AB.A, na.rm=TRUE)
512
+##	sigmaA[, "BB"] <- rowSds(BB*A, na.rm=TRUE)
513
+##	sigmaB[, "AA"] <- rowSds(AA*B, na.rm=TRUE)
514
+##	sigmaB[, "AB"] <- rowSds(AB*B, na.rm=TRUE)
515
+##	sigmaB[, "BB"] <- rowSds(BB*B, na.rm=TRUE)
516
+	##shrink
517
+	DF <- Ns[, p, ]-1
518
+	DF[DF < 1] <- 1
519
+	medsA <- apply(sigmaA, 2, "median", na.rm=TRUE)
520
+	medsB <- apply(sigmaB, 2, "median", na.rm=TRUE)			
521
+	for(m in 1:3){
522
+		sigmaA[, m] <- (sigmaA[, m]*DF[, m] + medsA[m]*DF.PRIOR)/(DF.PRIOR+DF[, m])
523
+		sigmaA[is.na(sigmaA[, m]), m] <- medsA[m]
524
+		sigmaB[, m] <- (sigmaB[, m]*DF[, m] + medsB[m]*DF.PRIOR)/(DF.PRIOR+DF[, m])
525
+		sigmaB[is.na(sigmaB[, m]), m] <- medsB[m]		
526
+	}
527
+	index.AA <- which(Ns[, p, "AA"] >= 2)
528
+	index.AB <- which(Ns[, p, "AB"] >= 2)
529
+	index.BB <- which(Ns[, p, "BB"] >= 2)
530
+	
531
+	x <- Ip[index.BB, , "BB", "A"]
532
+	##tau2A[index.BB, p] <- rowVars(x, na.rm=TRUE)##var(log(IA)| BB)
533
+	tau2A[index.BB, p] <- rowMAD(log2(x), log2(x), na.rm=TRUE)^2
534
+	DF <- Ns[, p, "BB"]-1
535
+	DF[DF < 1] <- 1
536
+	med <- median(tau2A[, p], na.rm=TRUE)
537
+	tau2A[, p] <- (tau2A[, p] * DF  +  med * DF.PRIOR)/(DF.PRIOR + DF)
538
+	tau2A[is.na(tau2A[, p]), p] <- med
539
+	
540
+	x <- Ip[index.BB, , "BB", "B"]
541
+##	sig2B[index.BB, p] <- rowVars(log2(x), na.rm=TRUE) ##var(log(IB)|BB)
542
+	sig2B[index.BB, p] <- rowMAD(log2(x), log2(x), na.rm=TRUE)^2	
543
+	##system.time(tmp <- rowCovs(x, x, na.rm=TRUE)) ##var(log(IB)|BB)
544
+	##system.time(tmp2 <- rowVars(log2(x), na.rm=TRUE))
545
+	med <- median(sig2B[, p], na.rm=TRUE)
546
+	sig2B[, p] <- (sig2B[, p] * DF  +  med * DF.PRIOR)/(DF.PRIOR + DF)
547
+	sig2B[is.na(sig2B[, p]), p] <- med
548
+	
549
+	x <- Ip[index.AA, , "AA", "B"]
550
+	##tau2B[index.AA, p] <- rowCovs(x, x, na.rm=TRUE)##var(log(IB)| AA)
551
+	tau2B[index.AA, p] <- rowMAD(log2(x), log2(x), na.rm=TRUE)^2		
552
+	DF <- Ns[, p, "AA"]-1
553
+	DF[DF < 1] <- 1
554
+	med <- median(tau2B[, p], na.rm=TRUE)
555
+	tau2B[, p] <- (tau2B[, p] * DF  +  med * DF.PRIOR)/(DF.PRIOR + DF)
556
+	tau2B[is.na(tau2B[, p]), p] <- med
557
+	
558
+	x <- Ip[index.AA, , "AA", "A"]
559
+	##sig2A[index.AA, p] <- rowVars(log2(x), na.rm=TRUE)##var(log(IA)|AA)
560
+	sig2A[index.AA, p] <- rowMAD(log2(x), log2(x), na.rm=TRUE)^2##var(log(IA)|AA)	
561
+	##sig2A[index.AA, p] <- rowCovs(x, x, na.rm=TRUE) ##var(log(IA)|AA)
562
+	med <- median(sig2A[, p], na.rm=TRUE)
563
+	sig2A[, p] <- (sig2A[, p]*DF  +  med * DF.PRIOR)/(DF.PRIOR + DF)
564
+	sig2A[is.na(sig2A[, p]), p] <- med	
565
+
566
+	##estimate the correlation where there is at least 1 or more copies (AB genotypes)
567
+	if(length(index.AB) > 0){ ##all homozygous is possible	
568
+		x <- Ip[index.AB, , "AB", "A"]
569
+		y <- Ip[index.AB, , "AB", "B"]
570
+		corr[index.AB, p] <- rowCors(x, y, na.rm=TRUE)
571
+		corr[corr < 0] <- 0
572
+		DF <- Ns[, p, "AB"]-1
573
+		DF[DF<1] <- 1
574
+		med <- median(corr[, p], na.rm=TRUE)
575
+		corr[, p] <- (corr[, p]*DF  +  med * DF.PRIOR)/(DF.PRIOR + DF)
576
+		corr[is.na(corr[, p]), p] <- med
577
+	}
578
+	##estimate the correlation of the background errors and AA, BB genotypes
579
+	backgroundB <- Ip[index.AA, , "AA", "B"]
580
+	signalA <- Ip[index.AA, , "AA", "A"]
581
+	corrB.AA[index.AA, p] <- rowCors(backgroundB, signalA, na.rm=TRUE)
582
+	DF <- Ns[, p, "AA"]-1
583
+	DF[DF < 1] <- 1
584
+	med <- median(corrB.AA[, p], na.rm=TRUE)
585
+	corrB.AA[, p] <- (corrB.AA[, p]*DF + med*DF.PRIOR)/(DF.PRIOR + DF)
586
+	corrB.AA[is.na(corrB.AA[, p]), p] <- med
587
+
588
+	backgroundA <- Ip[index.BB, , "BB", "A"]
589
+	signalB <- Ip[index.BB, , "BB", "B"]
590
+	corrA.BB[index.BB, p] <- rowCors(backgroundA, signalB, na.rm=TRUE)
591
+	DF <- Ns[, p, "BB"]-1
592
+	DF[DF < 1] <- 1
593
+	med <- median(corrA.BB[, p], na.rm=TRUE)
594
+	corrA.BB[, p] <- (corrA.BB[, p]*DF + med*DF.PRIOR)/(DF.PRIOR + DF)
595
+	corrA.BB[is.na(corrA.BB[, p]), p] <- med	
596
+	
597
+	##---------------------------------------------------------------------------
598
+	## Predict sufficient statistics for unobserved genotypes (plate-specific)
599
+	##---------------------------------------------------------------------------
600
+	correct.orderA <- muA.AA[, p] > muA.BB[, p]
601
+	correct.orderB <- muB.BB[, p] > muB.AA[, p]
602
+	if(length(index.AB) > 0){
603
+		nobs <- rowSums(Ns[, p, ] >= MIN.OBS) == 3
604
+	} else nobs <- rowSums(Ns[, p, c(1,3)] >= MIN.OBS) == 2
605
+	index.complete <- which(correct.orderA & correct.orderB & nobs) ##be selective here
606
+	size <- min(5000, length(index.complete))
607
+	if(size == 5000) index.complete <- sample(index.complete, 5000)
608
+	if(length(index.complete) < 200){
609
+		warning("too few snps pass my criteria for predicting the sufficient statistics")
610
+		##message("Plate has too few samples to call many genotypes...can we use prior information to predict")
611
+		browser()
612
+	}
613
+	index.AA <- which(Ns[, p, "AA"] == 0 & Ns[, p, "AB"] >= MIN.OBS)
614
+	index.AB <- which(Ns[, p, "AB"] == 0 & (Ns[, p, "AA"] >= MIN.OBS & Ns[, p, "BB"] >= MIN.OBS))
615
+	index.BB <- which(Ns[, p, "BB"] == 0 & Ns[, p, "AB"] >= MIN.OBS)
616
+	if(FALSE){
617
+		png("figures/predictedMusBB%02d.png", width=800, height=500)
618
+		par(mfrow=c(1, 2), las=1, mar=c(3, 1, 1, 1), oma=c(1, 2, 1, 1), pty="s")				
619
+		plot(log(muA.BB[index.complete, 1]), log(muB.BB[index.complete, 1]), pch=".", ylim=c(5,12), xlim=c(5,12),
620
+		     xlab="A", ylab="B", main="good")
621
+		points(log(muA.AB[index.complete, 1]), log(muB.AB[index.complete, 1]), pch=".")
622
+		points(log(muA.AA[index.complete, 1]), log(muB.AA[index.complete, 1]), pch=".")
623
+		plot(log(muA.BB[index.BB, 1]), log(muB.BB[index.BB, 1]), pch=".", ylim=c(5,12), xlim=c(5,12),
624
+		     xlab="A", ylab="B", main="missing BB")
625
+		points(log(muA.AB[index.BB, 1]), log(muB.AB[index.BB, 1]), pch=".")
626
+		points(log(muA.AA[index.BB, 1]), log(muB.AA[index.BB, 1]), pch=".")				
627
+	}
628
+	if(length(index.AA) > 0){
629
+		X.AA <- cbind(1, muA.AB[index.complete, p], muA.BB[index.complete, p],
630
+			      muB.AB[index.complete, p], muB.BB[index.complete, p])
631
+		Y <- cbind(muA.AA[index.complete, p], muB.AA[index.complete, p])
632
+		XtY <- crossprod(X.AA, Y)
633
+		betahat <- solve(crossprod(X.AA), XtY)
634
+		X.AA <- cbind(1, muA.AB[index.AA, p], muA.BB[index.AA, p],
635
+			      muB.AB[index.AA, p], muB.BB[index.AA, p])
636
+		musAA <- X.AA %*% betahat
637
+		musAA[musAA <= 100] <- 100
638
+		muA.AA[index.AA, p] <- musAA[, 1]
639
+		muB.AA[index.AA, p] <- musAA[, 2]
640
+	}
641
+	if(length(index.AB) > 0){
642
+		X.AB <- cbind(1, muA.AA[index.complete, p], muA.BB[index.complete, p],
643
+			      muB.AA[index.complete, p], muB.BB[index.complete, p])
644
+		Y <- cbind(muA.AB[index.complete, p], muB.AB[index.complete, p])
645
+		XtY <- crossprod(X.AB, Y)
646
+		betahat <- solve(crossprod(X.AB), XtY)
647
+		X.AB <- cbind(1, muA.AA[index.AB, p], muA.BB[index.AB, p],
648
+			      muB.AA[index.AB, p], muB.BB[index.AB, p])
649
+		musAB <- X.AB %*% betahat
650
+		musAB[musAB <= 100] <- 100					
651
+		muA.AB[index.AB, p] <- musAB[, 1]
652
+		muB.AB[index.AB, p] <- musAB[, 2]				
653
+	}
654
+	if(length(index.BB) > 0){
655
+		X.BB <- cbind(1, muA.AA[index.complete, p], muA.AB[index.complete, p],
656
+			      muB.AA[index.complete, p], muB.AB[index.complete, p])
657
+		Y <- cbind(muA.BB[index.complete, p], muB.BB[index.complete, p])
658
+		XtY <- crossprod(X.BB, Y)
659
+		betahat <- solve(crossprod(X.BB), XtY)
660
+		X.BB <- cbind(1, muA.AA[index.BB, p], muA.AB[index.BB, p],
661
+			      muB.AA[index.BB, p], muB.AB[index.BB, p])
662
+		musBB <- X.BB %*% betahat
663
+		musBB[musBB <= 100] <- 100				
664
+		muA.BB[index.BB, p] <- musBB[, 1]
665
+		muB.BB[index.BB, p] <- musBB[, 2]								
666
+	}
667
+	if(FALSE){
668
+		points(log(muA.BB[index.BB, 1]), log(muB.BB[index.BB, 1]), pch=".", col="blue")
669
+		dev.off()
670
+	}
671
+	##missing two genotypes
672
+	noAA <- Ns[, p, "AA"] <= 2
673
+	noAB <- Ns[, p, "AB"] <= 2
674
+	noBB <- Ns[, p, "BB"] <= 2
675
+	##---------------------------------------------------------------------------
676
+	## Two genotype clusters not observed -- would sequence help? (didn't seem that helpful)
677
+	## 1 extract index of complete data
678
+	## 2 Regress  mu1,mu3 ~ sequence + mu2
679
+	## 3 Predict mu1*, mu3* for missing genotypes
680
+	##---------------------------------------------------------------------------			
681
+	if(sum(noAA & noAB) > 0){
682
+		##predict other cluster centers using only the observed genotypes
683
+		##predict AA:
684
+		X <- cbind(1, muA.BB[index.complete, p], muB.BB[index.complete, p])
685
+		Y <- cbind(muA.AA[index.complete, p],
686
+			   muB.AA[index.complete, p],
687
+			   muA.AB[index.complete, p],
688
+			   muB.AB[index.complete, p])
689
+		XtY <- crossprod(X, Y)
690
+		betahat <- solve(crossprod(X), XtY)
691
+		X <- cbind(1, muA.BB[noAA & noAB, p], muB.BB[noAA & noAB, p])
692
+		mus <- X %*% betahat
693
+		mus[mus <= 100] <- 100				
694
+		muA.AA[noAA & noAB, p] <- mus[, 1]
695
+		muB.AA[noAA & noAB, p] <- mus[, 2]
696
+		muA.AB[noAA & noAB, p] <- mus[, 3]
697
+		muB.AB[noAA & noAB, p] <- mus[, 4]				
698
+	}
699
+	if(FALSE){
700
+		i <- which(noAA & noAB)
701
+		par(mfrow=c(1, 1), las=1, mar=c(3, 3, 1, 1), pty="s")
702
+		plot(log(muA.BB[i, 1]), log(muB.BB[i, 1]), pch=".", ylim=c(5,12), xlim=c(5,12),
703
+		     xlab="A", ylab="B", main="missing BB")
704
+		points(log(muA.AB[i, 1]), log(muB.AB[i, 1]), pch=".", col="blue")
705
+		points(log(muA.AA[i, 1]), log(muB.AA[i, 1]), pch=".", col="red")
706
+	}
707
+	if(sum(noBB & noAB) > 0){
708
+		##predict other cluster centers using only the observed genotypes
709
+		X <- cbind(1, muA.AA[index.complete, p], muB.AA[index.complete, p])
710
+		Y <- cbind(muA.AB[index.complete, p],
711
+			   muB.AB[index.complete, p],
712
+			   muA.BB[index.complete, p], #muA.AB[index.complete, p],
713
+			   muB.BB[index.complete, p])#, muB.AB[index.complete, p])
714
+		XtY <- crossprod(X, Y)
715
+		betahat <- solve(crossprod(X), XtY)
716
+		X <- cbind(1, muA.AA[noBB & noAB, p], muB.AA[noBB & noAB, p])
717
+		mus <- X %*% betahat
718
+		mus[mus <= 100] <- 100
719
+		muA.AB[noBB & noAB, p] <- mus[, 1]
720
+		muB.AB[noBB & noAB, p] <- mus[, 2]								
721
+		muA.BB[noBB & noAB, p] <- mus[, 3]
722
+		muB.BB[noBB & noAB, p] <- mus[, 4]				
723
+		if(FALSE){
724
+			i <- which(noBB & noAB)
725
+			par(mfrow=c(1, 1), las=1, mar=c(3, 3, 1, 1), pty="s")
726
+			plot(log(muA.BB[i, 1]), log(muB.BB[i, 1]), pch=".", ylim=c(5,12), xlim=c(5,12),
727
+			     xlab="A", ylab="B", main="missing BB")
728
+			points(log(muA.AB[i, 1]), log(muB.AB[i, 1]), pch=".", col="blue")
729
+			points(log(muA.AA[i, 1]), log(muB.AA[i, 1]), pch=".", col="red")
730
+		}				
731
+	}
732
+	dn.Ns <- dimnames(Ns)
733
+	Ns <- array(as.integer(Ns), dim=dim(Ns))
734
+	dimnames(Ns)[[3]] <- dn.Ns[[3]]
735
+
736
+	assign("Ns", Ns, envir)	
737
+	assign("muA.AA", muA.AA, envir)
738
+	assign("muA.AB", muA.AB, envir)
739
+	assign("muA.BB", muA.BB, envir)
740
+	assign("muB.AA", muB.AA, envir)
741
+	assign("muB.AB", muB.AB, envir)
742
+	assign("muB.BB", muB.BB, envir)
743
+	assign("sigmaA", sigmaA, envir)
744
+	assign("sigmaB", sigmaB, envir)	
745
+	assign("tau2A", tau2A, envir)
746
+	assign("tau2B", tau2B, envir)
747
+	assign("sig2A", sig2A, envir)
748
+	assign("sig2B", sig2B, envir)
749
+	assign("corr", corr, envir)
750
+	assign("corrB.AA", corrB.AA, envir)
751
+	assign("corrA.BB", corrA.BB, envir)		
752
+	plates.completed <- get("plates.completed", envir)
753
+	plates.completed[p] <- TRUE
754
+	assign("plates.completed", plates.completed, envir)
755
+}
756
+
757
+coefs <- function(plateIndex, conf, MIN.OBS=3, envir, CONF.THR=0.99){
758
+	p <- plateIndex
759
+	##if(p == 5) browser()
760
+	plates.completed <- get("plates.completed", envir)
761
+	if(!plates.completed[p]) return()
762
+	plate <- get("plate", envir)
763
+	nuA <- get("nuA", envir)
764
+	nuB <- get("nuB", envir)
765
+	nuA.se <- get("nuA.se", envir)
766
+	nuB.se <- get("nuB.se", envir)
767
+	phiA <- get("phiA", envir)
768
+	phiB <- get("phiB", envir)
769
+	phiA.se <- get("phiA.se", envir)
770
+	phiB.se <- get("phiB.se", envir)
771
+	muA.AA <- get("muA.AA", envir)
772
+	muA.AB <- get("muA.AB", envir)
773
+	muA.BB <- get("muA.BB", envir)
774
+	muB.AA <- get("muB.AA", envir)
775
+	muB.AB <- get("muB.AB", envir)
776
+	muB.BB <- get("muB.BB", envir)
777
+	Ns <- get("Ns", envir)
778
+	uplate <- get("uplate", envir)
779
+	sigmaA <- get("sigmaA", envir)
780
+	sigmaB <- get("sigmaB", envir)	
781
+	##fit.variance <- get("fit.variance", envir)
782
+	IA <- cbind(muA.AA[, p], muA.AB[, p], muA.BB[, p])
783
+	IB <- cbind(muB.AA[, p], muB.AB[, p], muB.BB[, p])
784
+	NOHET <- mean(Ns[, p, 2], na.rm=TRUE) < 0.05
785
+	##predict missing variance (do only once)
786
+	is.complete <- rowSums(Ns[, p, ] >= 1) == 3
787
+	if(NOHET) is.complete <- rowSums(Ns[, p, c(1,3)] >= MIN.OBS) == 2		
788
+	correct.orderA <- muA.AA[, p] > muA.BB[, p]
789
+	correct.orderB <- muB.BB[, p] > muB.AA[, p]
790
+	highConf <- 1-exp(-conf/1000)
791
+	confInd <- rowMeans(highConf) > CONF.THR
792
+	keep <- confInd & is.complete & correct.orderA & correct.orderB
793
+##	if(is.null(fit.variance)){
794
+##		log.sigma2 <- as.numeric(log2(sigma2A[keep, ]))
795
+##		log.mus <- as.numeric(log2(IA[keep, ]))
796
+##		##log.mus <- as.numeric(log(mus[, p, , "A"]))
797
+##		nodups <- which(!duplicated(log.sigma2))
798
+##		i <- sample(nodups, 5000)
799
+##		include <- intersect(which(!is.na(log.sigma2) & !is.na(log.mus)), nodups)
800
+##		fit.variance <- lm(log.sigma2 ~ ns(log.mus, 3), subset=sample(include, 5000))
801
+##		assign("fit.variance", fit.variance, envir)
802
+##	} 
803
+##	i <- which(rowSums(is.na(sigma2A)) > 0)
804
+##	log.mus <- as.numeric(log2(IA[i, ]))
805
+##	predictS2 <- predict(fit.variance, newdata=data.frame(log.mus=log.mus))
806
+##	predictS2 <- matrix(2^(predictS2), ncol=3)
807
+##	sigma2A[i, ] <- predictS2
808
+	##---------------------------------------------------------------------------
809
+	## Estimate nu and phi
810
+	##---------------------------------------------------------------------------
811
+	Np <- Ns[, p, ]
812
+	sigma2A <- sigmaA^2
813
+	sigma2B <- sigmaB^2	
814
+	if(NOHET){
815
+		##only homozygous
816
+		Np <- Np[, -2]
817
+		Np[Np < 1] <- 1
818
+		IA <- IA[, c(1, 3)]
819
+		sigma2A <- sigma2A[, c(1,3)]
820
+	}else 	Np[Np < 1] <- 1
821
+	##Using MAD instead of 
822
+	V <- sigma2A/Np
823
+	W <- sqrt(1/V)
824
+	Ystar <- IA*W
825
+	is.complete <- rowSums(is.na(W)) == 0
826
+	##is.complete <- is.complete & correct.orderA & correct.orderB & confInd & notmissing
827
+	nuphiAllele(allele="A", Ystar=Ystar[is.complete, ], W=W[is.complete, ], envir=envir, p=p)
828
+	nuA[is.complete, p] <- nu
829
+	phiA[is.complete, p] <- phi
830
+	nuA.se[is.complete, p] <- nu.se
831
+	phiA.se[is.complete, p] <- phi.se
832
+	
833
+	if(NOHET){
834
+		IB <- IB[, c(1, 3)]
835
+		sigma2B <- sigma2B[, c(1,3)]
836
+	}
837
+##	i <- which(rowSums(is.na(sigma2B)) > 0)
838
+##	log.mus <- as.numeric(log2(IB[i, ]))
839
+##	predictS2 <- predict(fit.variance, newdata=data.frame(log.mus=log.mus))
840
+##	predictS2 <- matrix(2^(predictS2), ncol=ncol(sigma2B))
841
+##	sigma2B[i, ] <- predictS2		
842
+	V <- sigma2B/Np
843
+	W <- sqrt(1/V)
844
+	Ystar <- IB*W
845
+	is.complete <- rowSums(is.na(W)) == 0
846
+	nuphiAllele(allele="B", Ystar=Ystar[is.complete, ], W=W[is.complete, ], envir=envir, p=p)
847
+	nuB[is.complete, p] <- nu
848
+	phiB[is.complete, p] <- phi
849
+	nuB.se[is.complete, p] <- nu.se
850
+	phiB.se[is.complete, p] <- phi.se
851
+	phiA <- matrix(as.integer(phiA), nrow(phiA), ncol(phiA))
852
+	phiB <- matrix(as.integer(phiB), nrow(phiA), ncol(phiA))
853
+
854
+	assign("nuA", nuA, envir)
855
+	assign("nuB", nuB, envir)
856
+	assign("phiA", phiA, envir)
857
+	assign("phiB", phiB, envir)
858
+	assign("nuA.se", nuA.se, envir)
859
+	assign("nuB.se", nuB.se, envir)
860
+	assign("phiA.se", phiA.se, envir)
861
+	assign("phiB.se", phiB.se, envir)
862
+}
863
+
864
+copynumber <- function(plateIndex, A, B, envir){
865
+	p <- plateIndex
866
+	plates.completed <- get("plates.completed", envir)
867
+	if(!plates.completed[p]) return()
868
+	plate <- get("plate", envir)	
869
+	nuA <- get("nuA", envir)
870
+	nuB <- get("nuB", envir)
871
+	phiA <- get("phiA", envir)
872
+	phiB <- get("phiB", envir)
873
+	uplate <- get("uplate", envir)
874
+	muA.AA <- get("muA.AA", envir)
875
+	muA.AB <- get("muA.AB", envir)
876
+	muA.BB <- get("muA.BB", envir)
877
+	muB.AA <- get("muB.AA", envir)
878
+	muB.AB <- get("muB.AB", envir)
879
+	muB.BB <- get("muB.BB", envir)
880
+	sd.CT <- get("sd.CT", envir)
881
+	sigmaA <- get("sigmaA", envir)
882
+	sigmaB <- get("sigmaB", envir)	
883
+	Ns <- get("Ns", envir)
884
+	CA <- get("CA", envir)
885
+	CB <- get("CB", envir)
886
+	
887
+	NOHET <- mean(Ns[, p, 2], na.rm=TRUE) < 0.05
888
+	if(NOHET){
889
+		IA <- cbind(muA.AA[, p], muA.BB[, p])
890
+		IB <- cbind(muB.AA[, p], muB.BB[, p])		
891
+		sigma2A <- cbind(sigmaA[, c(1, 3)]^2)
892
+		sigma2B <- cbind(sigmaB[, c(1, 3)]^2)		
893
+##		sigma2A <- cbind(Sigma.AA[, "varA", p], Sigma.BB[, "varA", p])
894
+##		colnames(sigma2A) <- c("AA", "BB")
895
+##		sigma2B <- cbind(Sigma.AA[, "varB", p], Sigma.BB[, "varB", p])
896
+##		colnames(sigma2B) <- c("AA", "BB")
897
+	} else{
898
+		IA <- cbind(muA.AA[, p], muA.AB[, p], muA.BB[, p])
899
+		IB <- cbind(muB.AA[, p], muB.AB[, p], muB.BB[, p])		
900
+		sigma2A <- sigmaA^2
901
+		sigma2B <- sigmaB^2		
902
+##		sigma2A <- cbind(Sigma.AA[, "varA", p], Sigma.AB[, "varA", p], Sigma.BB[, "varA", p])
903
+##		colnames(sigma2A) <- c("AA", "AB", "BB")
904
+##		sigma2B <- cbind(Sigma.AA[, "varB", p], Sigma.AB[, "varB", p], Sigma.BB[, "varB", p])
905
+##		colnames(sigma2B) <- c("AA", "AB", "BB")
906
+	}
907
+	##sigma2A <- get("sigma2A", envir)
908
+	##sigma2B <- get("sigma2B", envir)
909
+##	fit.variance <- get("fit.variance", envir)
910
+
911
+	##---------------------------------------------------------------------------
912
+	## Estimate CA, CB
913
+	##---------------------------------------------------------------------------
914
+	phiA[phiA < 1] <- 1	
915
+	tmp <- (1/phiA[, p])*(A - nuA[, p])
916
+	tmp <- matrix(as.integer(tmp*100), nrow(tmp), ncol(tmp))
917
+	if(FALSE) hist(rowMeans(tmp/100), breaks=1000, xlim=c(-2, 8))
918
+	CA[, plate==uplate[p]] <- tmp
919
+	phiB[phiB < 1] <- 1
920
+	tmp <- (1/phiB[, p])*(B - nuB[, p])
921
+	tmp <- matrix(as.integer(tmp*100), nrow(tmp), ncol(tmp))
922
+	if(FALSE) hist(rowMeans(tmp/100), breaks=1000, xlim=c(-2, 8))		
923
+	CB[, plate==uplate[p]] <- tmp
924
+
925
+	nuA[nuA < 1] <- 1
926
+	nuB[nuB < 1] <- 1
927
+
928
+	tmpA <- CA[, plate==uplate[p]]
929
+	tmpB <- CB[, plate==uplate[p]]	
930
+	indexA <- which(rowSums(is.na(tmpA)) > 0)
931
+	indexB <- which(rowSums(is.na(tmpB)) > 0)
932
+	index <- union(indexA, indexB)
933
+	if(length(index) > 0) browser()	
934
+	##---------------------------------------------------------------------------
935
+	## Estimate var(CA), var(CB), var(CA+CB)
936
+	##---------------------------------------------------------------------------
937
+	##var(CA) = 1/phiA^2*var(IA - nuA)
938
+	##        = 1/phiA^2*var(IA)
939
+	##        = 1/phiA^2*f(mu)
940
+##	log.musA <- as.numeric(log2(IA))		
941
+##	log.sigma2A <- as.numeric(log2(sigma2A))
942
+##	log.musB <- as.numeric(log2(IB))
943
+##	log.sigma2B <- as.numeric(log2(sigma2B))
944
+##	f <- predict(fit.variance, newdata=data.frame(log.mus=log.musA))
945
+##	resid <- matrix(log.sigma2A - f, ncol=ncol(sigma2A))
946
+##	ls2 <- rowMedians(resid, na.rm=TRUE)		
947
+##	log.musA <- matrix(log.musA, ncol=ncol(sigma2A))
948
+##	tmp <- apply(log.musA + ls2, 2, function(log.mus, fit) sqrt(2^(predict(fit, newdata=data.frame(log.mus=log.mus)))), fit=fit.variance)
949
+##	sdA <- 1/phiA[, p]*rowMedians(tmp)
950
+##	##If all three points are below the spline, the 3 residuals
951
+##	##will be negative.  adding the median residual to the log mus
952
+##	##protects against over/under estimating the variance
953
+##	##sdA[tmpIndex, p, ] <- apply(log.musA[tmpIndex, ] + ls2, 2,
954
+##	##function(log.mus, fit) sqrt(exp(predict(fit.logVariance,
955
+##	##newdata=data.frame(log.mus=log.mus)))))
956
+##	robustSD <- function(X) diff(quantile(X, probs=c(0.16, (1-0.16)), na.rm=TRUE))/2	
957
+##	sample.sd <- apply(CA[, plate==uplate[p]]/100, 2, robustSD)
958
+##	sd.ca <- sdA %*% t(sample.sd)
959
+##	##where missing, use the residual from the genotype with the most observations
960
+##	f <- predict(fit.variance, newdata=data.frame(log.mus=log.musB))
961
+##	resid <- matrix(log.sigma2B - f, ncol=ncol(sigma2B))
962
+##	ls2 <- rowMedians(resid, na.rm=TRUE)		
963
+##	log.musB <- matrix(log.musB, ncol=ncol(sigma2B))
964
+##	tmp <- apply(log.musB + ls2, 2, function(log.mus, fit) sqrt(2^(predict(fit, newdata=data.frame(log.mus=log.mus)))), fit=fit.variance)
965
+##	sdB <- 1/phiB[, p]*rowMedians(tmp)
966
+##	sample.sd <- apply(CB[, plate==uplate[p]]/100, 2, robustSD)
967
+##	sd.cb <- sdB %*% t(sample.sd)
968
+##	covar <- rowCovs(1/phiA[, p]*log.musA, 1/phiB[, p]*log.musB)
969
+##	##tmp1 <- sqrt(sdA^2 + sdB^2 - 2*covar) ##probe-specific variance
970
+##	tmp <- sqrt(sd.ca^2 + sd.cb^2 - 2*covar) ##probe-specific variance
971
+##	sd.CT[, plate==uplate[p]] <- matrix(as.integer(100*tmp), nrow(tmp), ncol(tmp))
972
+	##another approach:
973
+	##tmp <- apply(log(A), 2, function(log.mus, fit) sqrt(exp(predict(fit, newdata=data.frame(log.mus=log.mus)))), fit=fit.variance)	
974
+	##tmp2 <- 1/phiA[, p]*tmp
975
+##	CA <- matrix(as.integer(CA*100), nrow(CA), ncol(CA))
976
+##	CB <- matrix(as.integer(CB*100), nrow(CB), ncol(CB))
977
+##	sd.CT <- matrix(as.integer(sd.CT*100), nrow(sd.CT), ncol(sd.CT))	
978
+	assign("CA", CA, envir)
979
+	assign("CB", CB, envir)
980
+##	assign("sd.CT", sd.CT, envir)	
981
+	##---------------------------------------------------------------------------
982
+	## nonpolymorphic probes
983
+	##---------------------------------------------------------------------------
984
+}
... ...
@@ -11,22 +11,22 @@ changeToCrlmmAnnotationName <- function(x){
11 11
 }
12 12
 
13 13
 getCrlmmAnnotationName <- function(x){
14
-  paste(tolower(gsub("_", "", x)), "Crlmm", sep="")
14
+	paste(tolower(gsub("_", "", x)), "Crlmm", sep="")
15 15
 }
16 16
 
17 17
 medianSummaries <- function(mat, grps)
18 18
   .Call("R_subColSummarize_median", mat, grps, PACKAGE = "preprocessCore")
19 19
 
20 20
 intMedianSummaries <- function(mat, grps)
21
-  as.integer(medianSummaries(mat, grps))
21
+	as.integer(medianSummaries(mat, grps))
22 22
 
23 23
 testProb <- function(p)
24 24
   .Call("test", p)
25 25
 
26 26
 
27 27
 list.celfiles <-   function(...){
28
-  files <- list.files(...)
29
-  return(files[grep("\\.[cC][eE][lL]$", files)])
28
+	files <- list.files(...)
29
+	return(files[grep("\\.[cC][eE][lL]$", files)])
30 30
 }
31 31
 
32 32
 ## .crlmmPkgEnv is an enviroment that will
... ...
@@ -36,10 +36,9 @@ list.celfiles <-   function(...){
36 36
 ## R CMD check
37 37
 
38 38
 isLoaded <- function(dataset, environ=.crlmmPkgEnv)
39
-  exists(dataset, envir=environ)
40
-
39
+	exists(dataset, envir=environ)
41 40
 getVarInEnv <- function(dataset, environ=.crlmmPkgEnv){
42
-  if (!isLoaded(dataset))
43
-    stop("Variable ", dataset, " not found in .crlmmPkgEnv")
44
-  environ[[dataset]]
41
+	if (!isLoaded(dataset))
42
+		stop("Variable ", dataset, " not found in .crlmmPkgEnv")
43
+	environ[[dataset]]
45 44
 }
... ...
@@ -10,7 +10,6 @@ THISPKG <- "crlmm"
10 10
 }
11 11
 
12 12
 .onUnload <- function( libpath ){
13
-  library.dynam.unload(THISPKG, libpath)
13
+	library.dynam.unload(THISPKG, libpath)
14 14
 }
15
-
16
-  .crlmmPkgEnv <- new.env(parent=emptyenv())
15
+.crlmmPkgEnv <- new.env(parent=emptyenv())
17 16
new file mode 100644
... ...
@@ -0,0 +1,378 @@
1
+%\VignetteIndexEntry{crlmm copy number Vignette}
2
+%\VignetteDepends{crlmm, genomewidesnp6Crlmm}
3
+%\VignetteKeywords{crlmm, SNP 6}
4
+%\VignettePackage{crlmm}
5
+\documentclass{article}
6
+\usepackage{graphicx}
7
+\newcommand{\Rfunction}[1]{{\texttt{#1}}}
8
+\newcommand{\Rmethod}[1]{{\texttt{#1}}}
9
+\newcommand{\Rcode}[1]{{\texttt{#1}}}
10
+\newcommand{\Robject}[1]{{\texttt{#1}}}
11
+\newcommand{\Rpackage}[1]{{\textsf{#1}}}
12
+\newcommand{\Rclass}[1]{{\textit{#1}}}
13
+\newcommand{\oligo}{\Rpackage{oligo }}
14
+\newcommand{\R}{\textsf{R}}
15
+
16
+\begin{document}
17
+\title{Estimating copy number for Affymetrix 6.0  with the crlmm Package}
18
+\date{February, 2009}
19
+\author{Rob Scharpf}
20
+\maketitle
21
+
22
+<<setup, echo=FALSE, results=hide>>=
23
+options(width=60)
24
+options(continue=" ")
25
+options(prompt="R> ")
26
+@ 
27
+
28
+\section{Estimating copy number}
29
+
30
+General requirements: in addition to the \R{} packages indicated
31
+below, processing a large number of samples requires a high-end
32
+computer with a large amount of RAM. Currently, the maximum number of
33
+samples that can be genotyped at once is approximately 2000 and this
34
+requires roughly 32G of RAM.
35
+
36
+\paragraph{Suggested work flow.} I typically submit code for
37
+preprocessing and genotyping as a batch job to a computing cluster.
38
+When the preprocessing is complete, I pick a chromosome and work
39
+interactively from the cluster to calculate copy number and make some
40
+of the suggested diagnostic plots.
41
+
42
+\subsection{Preprocess and genotype the samples}
43
+
44
+First, load the required libraries.
45
+
46
+<<requiredPackages>>=
47
+library(crlmm)
48
+library(genomewidesnp6Crlmm)
49
+library(ellipse)
50
+@ 
51
+
52
+Specify an output directory and provide the complete path for the CEL
53
+file names.
54
+
55
+<<>>=
56
+outdir <- "/thumper/ctsa/snpmicroarray/rs/data/gain/bipolar/GRU-EA"
57
+datadir <- "/thumper/ctsa/snpmicroarray/GAIN/Bipolar/GRU-EA"
58
+fns <- list.files(datadir, pattern=".CEL", full.names=TRUE)
59
+@ 
60
+
61
+Genotype the samples with CRLMM. Submit this job to the cluster and save the
62
+output to \Robject{outdir}.
63
+
64
+<<genotype, eval=FALSE>>=
65
+res2 <- crlmm(filenames=fns,  save.it=TRUE, intensityFile=file.path(outdir, "intensities.rda"))
66
+save(res2, file=file.path(outdir, "res2.rda"))
67
+@ 
68
+
69
+Quantile normalize the nonpolymorphic probes and save the output:
70
+
71
+<<quantileNormalizeCnProbes, eval=FALSE>>=
72
+res3 <- cnrma(fns)
73
+save(res3, file=file.path(outdir, "res3.rda"))
74
+@ 
75
+
76
+\subsection{Copy number}
77
+
78
+Copy number can be assessed one chromosome at a time.  Here we specify
79
+chromosome 15 and and load a list of indices used to subset the files
80
+saved in the previous section. The first element in the list
81
+correspond to indices of polymorphic probes on chromosome 15; the
82
+second element corresponds to indices of nonpolymorphic probes on
83
+chromosome 15.
84
+
85
+<<chromosomeIndex>>=
86
+CHR <- 15
87
+CHR_INDEX <- paste(CHR, "index", sep="")
88
+data(list=CHR_INDEX, package="genomewidesnp6Crlmm")
89
+str(index)
90
+@ 
91
+
92
+Next we load 3 files that were saved from the preprocessing step and
93
+then subset these lists using the above indices to extract the
94
+preprocessed intensities and genotypes needed for estimating copy
95
+number.  Specifically, we require 6 items:
96
+
97
+\begin{itemize}
98
+  \item quantile-normalized A intensities (I1 x J)
99
+  \item quantile-normalized B intensities (I1 x J)
100
+  \item quantile-normalized intensities from nonpolymorphic (NP) probes (I2 x J)
101
+  \item genotype calls (I1 x J)
102
+  \item confidence scores of the genotype calls  (I1 x J)
103
+  \item signal to noise ratio (SNR) of the samples (J)
104
+  \end{itemize}
105
+  
106
+  These items are extracted as follows:
107
+
108
+<<eval=FALSE>>=
109
+load(file.path(outdir, "intensities.rda"))
110
+load(file.path(outdir, "res2.rda"))
111
+load(file.path(outdir, "res3.rda"))
112
+@ 
113
+
114
+Depending on the number of samples, the above objects can be large and
115
+take a while to load.  For the figures in this vignette, I am loading
116
+only 5000 SNP-level summaries from chromosome 22 (28MB) and the
117
+quantile-normalized intensities for 5000 nonpolymorphic probes on
118
+chromosome 22 (8 MB total).
119
+
120
+<<loadTmpFiles, eval=FALSE, echo=FALSE>>=
121
+load("/thumper/ctsa/snpmicroarray/rs/data/gain/bipolar/tmp.rda")
122
+assign("res", tmp, envir=.GlobalEnv)
123
+rm(tmp); gc()
124
+load("/thumper/ctsa/snpmicroarray/rs/data/gain/bipolar/tmpcn.rda")
125
+A <- res$A
126
+B <- res$B
127
+calls <- res$calls
128
+conf <- res$conf
129
+SNR <- res$SNR
130
+NP <- res3$NP
131
+@ 
132
+
133
+<<snpAndCnSummaries, eval=FALSE>>=
134
+A <- res$A
135
+B <- res$B
136
+calls <- res2$calls
137
+conf <- res2$conf
138
+SNR <- res2$SNR
139
+NP <- res3$NP
140
+rm(res, res2, res3)
141
+gc()
142
+@ 
143
+
144
+Make a histogram of the signal to noise ratio for these samples:
145
+
146
+<<plotSnr, fig=TRUE>>=
147
+hist(SNR, xlab="SNR", main="")
148
+@ 
149
+
150
+We suggest excluding samples with a signal to noise ratio less than 5.
151
+We then extract the probes (rows) that are on chromosome 15 and the
152
+samples (columns) that have a suitable signal to noise ratio.
153
+
154
+<<subsetCrlmmOutput, eval=FALSE>>=
155
+A <- A[index[[1]], SNR > 5]
156
+B <- B[index[[1]], SNR > 5]
157
+calls <- calls[index[[1]], SNR > 5]
158
+conf <- conf[index[[1]], SNR > 5]
159
+NP <- NP[index[[2]], SNR > 5]
160
+rm(res, res2, res3); gc()
161
+@ 
162
+
163
+We want to adjust for batch effects.  In our experience, the chemistry
164
+plate is a good surrogate for batch, although this should be assessed
165
+on a study by study basis.  For samples processed by Broad, the plate
166
+is indicated by a capitalized five-letter word and is part of the
167
+sample name.  It is important that the plate (batch) variable is
168
+ordered the same as filenames in the preprocessed data.  For instance,
169
+to indicate plate in this example one can use the command
170
+
171
+<<specifyBatch>>=
172
+sns <- colnames(calls)
173
+sns[1]
174
+plates <- sapply(sns, function(x) strsplit(x, "_")[[1]][2])
175
+table(plates)
176
+@ 
177
+
178
+We are now ready to estimate the copy number for each batch.  In the
179
+current version of this package, one specifies an environment to which
180
+intermediate \R{} objects for copy number estimation are
181
+stored. Allele-specific estimates of copy number are also stored in
182
+this environment.
183
+
184
+<<copyNumberByBatch, eval=FALSE>>=
185
+e1 <- new.env()
186
+computeCnBatch(A=A,
187
+	       B=B,
188
+	       calls=calls,
189
+	       conf=conf,
190
+	       NP=NP,
191
+	       plate=plates,
192
+	       envir=e1, chrom=CHR, DF.PRIOR=75)
193
+@ 
194
+
195
+The DF.PRIOR indicates how much we will shrink SNP-specific estimates
196
+of the variance and correlation.
197
+
198
+<<accessingEstimates>>=
199
+copyA <- get("CA", e1)
200
+copyB <- get("CB", e1)
201
+copyT <- (copyA+copyB)/100
202
+copyT[copyT < 0] <- 0
203
+copyT[copyT > 6] <- 6
204
+@ 
205
+
206
+\section{Suggested plots}
207
+
208
+\paragraph{One sample at a time}
209
+
210
+<<annotation>>=
211
+data(snpProbes, package="genomewidesnp6Crlmm")
212
+data(cnProbes, package="genomewidesnp6Crlmm")
213
+position <- snpProbes[match(rownames(calls), rownames(snpProbes)), "position"]
214
+@ 
215
+
216
+To plot physical position versus copy number for the first sample:
217
+
218
+<<oneSample, fig=TRUE, width=8, height=4, include=FALSE>>=
219
+require(SNPchip)
220
+par(las=1)
221
+plotCytoband(as.character(CHR), ylim=c(0,7), cytoband.ycoords=c(6.5,7), label.cytoband=FALSE, main="Chr 15")
222
+points(position, copyT[, 1], pch=".", cex=2, , xaxt="n", col="grey70")
223
+axis(1, at=pretty(range(position)), labels=pretty(range(position))/1e6)
224
+axis(2, at=0:5, labels=0:5)
225
+mtext("position (Mb)", 1, line=2)
226
+mtext(expression(C[A] + C[B]), 2, line=2, las=3)
227
+@ 
228
+
229
+\begin{figure}
230
+  \includegraphics[width=0.9\textwidth]{copynumber-oneSample}
231
+  \caption{Total copy number (y-axis) for chromosome 22 plotted
232
+    against physical position (x-axis) for one sample.  }
233
+\end{figure}
234
+
235
+\paragraph{One SNP at a time}
236
+
237
+<<intermediateFiles>>=
238
+tau2A <- get("tau2A", e1)
239
+tau2B <- get("tau2B", e1)
240
+sig2A <- get("sig2A", e1)
241
+sig2B <- get("sig2B", e1)
242
+nuA <- get("nuA", e1)
243
+phiA <- get("phiA", e1)
244
+nuB <- get("nuB", e1)
245
+phiB <- get("phiB", e1)
246
+corr <- get("corr", e1)
247
+corrA.BB <- get("corrA.BB", e1)
248
+corrB.AA <- get("corrA.BB", e1)
249
+A <- get("A", e1)
250
+B <- get("B", e1)
251
+@ 
252
+
253
+Here, we plot the prediction regions for total copy number 2 and 3 for
254
+the first plate. Black plotting symbols are estimates from the first
255
+plate; light grey are points from other plates. (You could also draw
256
+prediction regions for 0-4 copies, but it gets crowded).  Notice that
257
+there is little evidence of a plate effect for this SNP.
258
+
259
+<<predictionRegion, fig=TRUE, width=8, height=8, include=FALSE>>=
260
+par(las=1)
261
+p <- 1 ##indicates plate
262
+J <- grep(unique(plates)[p], sns) ##sample indices for this plate
263
+ylim <- c(6.5,13)
264
+I <- which(phiA > 10 & phiB > 10)
265
+i <- I[1]
266
+plot(log2(A[i, ]), log2(B[i, ]), pch=as.character(calls[i, ]), col="grey60", cex=0.9, ylim=ylim, xlim=ylim, xlab="A", ylab="B")
267
+points(log2(A[i, J]), log2(B[i, J]), col="black", pch=as.character(calls[i, J]))
268
+for(CT in 2:3){
269
+	if(CT == 2) ellipse.col <- "brown" else ellipse.col <- "purple"
270
+	for(CA in 0:CT){
271
+		CB <- CT-CA
272
+		A.scale <- sqrt(tau2A[i, p]*(CA==0) + sig2A[i, p]*(CA > 0))
273
+		B.scale <- sqrt(tau2B[i, p]*(CB==0) + sig2B[i, p]*(CB > 0))
274
+		scale <- c(A.scale, B.scale)
275
+		if(CA == 0 & CB > 0) rho <- corrA.BB[i, p]