Browse code

modified copynumber vignette. added several todos for the vignette

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@37691 bc3139a8-67e5-0310-9ffc-ced21a209358

Rob Scharp authored on 03/03/2009 14:15:38
Showing 4 changed files

... ...
@@ -1,7 +1,7 @@
1 1
 Package: crlmm
2 2
 Type: Package
3 3
 Title: Genotype Calling via CRLMM Algorithm
4
-Version: 1.0.43
4
+Version: 1.0.44
5 5
 Date: 2008-12-28
6 6
 Author: Rafael A Irizarry
7 7
 Maintainer: Benilton S Carvalho <bcarvalh@jhsph.edu>
8 8
Binary files a/inst/doc/copynumber.pdf and b/inst/doc/copynumber.pdf differ
... ...
@@ -42,13 +42,15 @@ library(genomewidesnp6Crlmm)
42 42
 library(ellipse)
43 43
 @ 
44 44
 
45
+You must specify the names of the CEL files and the full path, as well
46
+as a directory in which to store the output from crlmm and cnrma.
47
+
45 48
 <<celfiles>>=
46 49
 celFiles <- list.celfiles("/thumper/ctsa/snpmicroarray/hapmap/raw/affy/1m", full.names=TRUE, pattern=".CEL")
47 50
 outdir <- "/thumper/ctsa/snpmicroarray/rs/data/hapmap/1m/affy"
48 51
 @ 
49 52
 
50
-Specify an output directory for storing the quantile-normalized
51
-intensities from crlmm.
53
+Preprocess and genotype (for more info see the crlmm vignette).
52 54
 
53 55
 <<genotype>>=
54 56
 if (!exists("crlmmResult")) {
... ...
@@ -63,7 +65,7 @@ if (!exists("crlmmResult")) {
63 65
 }
64 66
 @ 
65 67
 
66
-Quantile normalize the nonpolymorphic probes and save the output.
68
+Quantile normalize the nonpolymorphic probes and save the results.
67 69
 
68 70
 <<cnrma>>=
69 71
 if(!exists("cnrmaResult")){
... ...
@@ -113,7 +115,6 @@ calls <- crlmmResult$calls
113 115
 conf <- crlmmResult$conf
114 116
 SNR <- crlmmResult$SNR
115 117
 NP <- cnrmaResult$NP
116
-rm(res, crlmmResult, cnrmaResult)
117 118
 gc()
118 119
 @ 
119 120
 
... ...
@@ -155,14 +156,16 @@ stored. Allele-specific estimates of copy number are also stored in
155 156
 this environment.
156 157
 
157 158
 <<copyNumberByBatch>>=
158
-e1 <- new.env()
159
-computeCopynumber(A=A,
160
-		  B=B,
161
-		  calls=calls,
162
-		  conf=conf,
163
-		  NP=NP,
164
-		  plate=plates,
165
-		  envir=e1, chrom=CHR, DF.PRIOR=75)
159
+if(!exists("e1")){
160
+	e1 <- new.env()
161
+	computeCopynumber(A=A,
162
+			  B=B,
163
+			  calls=calls,
164
+			  conf=conf,
165
+			  NP=NP,
166
+			  plate=plate,
167
+			  envir=e1, chrom=CHR, DF.PRIOR=75)
168
+}
166 169
 @ 
167 170
 
168 171
 The DF.PRIOR indicates how much we will shrink SNP-specific estimates
... ...
@@ -196,7 +199,7 @@ points(position, copyT[, 1], pch=".", cex=2, , xaxt="n", col="grey70")
196 199
 axis(1, at=pretty(range(position)), labels=pretty(range(position))/1e6)
197 200
 axis(2, at=0:5, labels=0:5)
198 201
 mtext("position (Mb)", 1, line=2)
199
-mtext(expression(C[A] + C[B]), 2, line=2, las=3)
202
+mtext(expression(hat(C)[A] + hat(C)[B]), 2, line=2, las=3)
200 203
 @ 
201 204
 
202 205
 \begin{figure}
... ...
@@ -207,6 +210,8 @@ mtext(expression(C[A] + C[B]), 2, line=2, las=3)
207 210
 
208 211
 \paragraph{One SNP at a time}
209 212
 
213
+This section needs to be cleaned up (TODO).
214
+
210 215
 <<intermediateFiles>>=
211 216
 tau2A <- get("tau2A", e1)
212 217
 tau2B <- get("tau2B", e1)
... ...
@@ -224,22 +229,25 @@ B <- get("B", e1)
224 229
 @ 
225 230
 
226 231
 Here, we plot the prediction regions for total copy number 2 and 3 for
227
-the first plate. Black plotting symbols are estimates from the first
228
-plate; light grey are points from other plates. (You could also draw
229
-prediction regions for 0-4 copies, but it gets crowded).  Notice that
230
-there is little evidence of a plate effect for this SNP.
232
+the first plate. Plotting symbols are the genotype calls (1=AA, 2=AB,
233
+3=BB); light grey points are from other plates. (You could also add
234
+the prediction regions for 0-4 copies, but it gets crowded).  Notice
235
+that there is little evidence of a plate effect for this SNP.
231 236
 
232 237
 <<predictionRegion, fig=TRUE, width=8, height=8, include=FALSE>>=
233
-par(las=1)
238
+par(las=1, pty="s")
234 239
 p <- 1 ##indicates plate
235
-J <- grep(unique(plates)[p], sns) ##sample indices for this plate
240
+J <- grep(unique(plate)[p], plate) ##sample indices for this plate
236 241
 ylim <- c(6.5,13)
237 242
 I <- which(phiA > 10 & phiB > 10)
238 243
 i <- I[1]
244
+##plate effects are small
245
+log2(phiA[i, ])
246
+log2(phiB[i, ])
239 247
 plot(log2(A[i, ]), log2(B[i, ]), pch=as.character(calls[i, ]), col="grey60", cex=0.9, ylim=ylim, xlim=ylim, xlab="A", ylab="B")
240 248
 points(log2(A[i, J]), log2(B[i, J]), col="black", pch=as.character(calls[i, J]))
241
-for(CT in 2:3){
242
-	if(CT == 2) ellipse.col <- "brown" else ellipse.col <- "purple"
249
+for(CT in 2){
250
+	if(CT == 2) ellipse.col <- "black" ##else ellipse.col <- "purple"
243 251
 	for(CA in 0:CT){
244 252
 		CB <- CT-CA
245 253
 		A.scale <- sqrt(tau2A[i, p]*(CA==0) + sig2A[i, p]*(CA > 0))
... ...
@@ -252,105 +260,126 @@ for(CT in 2:3){
252 260
 			      scale=scale), col=ellipse.col, lwd=2)
253 261
 	}
254 262
 }
255
-legend("topright", lwd=3, col=c("black", "purple"), legend=c("2 copies", "3 copies"), bty="n")
263
+legend("topright", lwd=3, col="black", legend="2 copies", bty="n")
256 264
 @ 
257 265
 
258 266
 \begin{figure}
259 267
   \includegraphics[width=0.9\textwidth]{copynumber-predictionRegion}
260
-  \caption{Prediction regions for copy number 2 and 3 for one SNP.
261
-    The plate effects are negligible for this SNP and we only plot the
268
+  \caption{Prediction regions for copy number 2 for one SNP.  The
269
+    plate effects are negligible for this SNP and we only plot the
262 270
     prediction region for the first plate.}
263 271
 \end{figure}
264 272
 
265
-Here is one way to identify a SNP with a large plate effect -- look
266
-for shifts in the prediction regions (here I'm looking for large
267
-shifts in the A direction).
268
-
269
-<<shifts>>=
270
-shiftA <- nuA+2*phiA 
271
-maxA <- apply(shiftA, 1, max, na.rm=T)
272
-minA <- apply(shiftA, 1, min, na.rm=T)
273
-d <- maxA - minA
274
-hist(d)
275
-index <- which(d > 3000)[2]
276
-plate1 <- which(shiftA[index, ] == maxA[index])
277
-plate2 <- which(shiftA[index, ] == minA[index])
278
-@ 
279 273
 
280
-Now plot the predictions for the two plates.  All the ellipses are
281
-prediction regions for copy number 2.  Note that while I looked for
282
-shifts in the A direction, the shift often occurs in both the B and A
283
-dimensions.
284
-
285
-<<plateEffect, fig=TRUE, width=8, height=8, include=FALSE>>=
286
-par(las=1)
287
-i <- index
288
-J1 <- grep(unique(plates)[plate1], sns) ##sample indices for this plate
289
-J2 <- grep(unique(plates)[plate2], sns) ##sample indices for this plate
274
+<<predictionRegion, eval=FALSE>>=
275
+par(las=1, pty="s", ask=TRUE)
276
+p <- 1 ##indicates plate
277
+J <- grep(unique(plate)[p], plate) ##sample indices for this plate
290 278
 ylim <- c(6.5,13)
291
-plot(log2(A[i, ]), log2(B[i, ]), pch=as.character(calls[i, ]), col="grey60", cex=0.9, ylim=ylim, xlim=ylim, xlab="A", ylab="B")
292
-points(log2(A[i, J1]), log2(B[i, J1]), col="brown", pch=as.character(calls[i, J1]))
293
-points(log2(A[i, J2]), log2(B[i, J2]), col="blue", pch=as.character(calls[i, J2]))
294
-p <- plate1
295
-CT <- 2
296
-ellipse.col <- "brown" 
297
-for(CA in 0:CT){
298
-	CB <- CT-CA
299
-	A.scale <- sqrt(tau2A[i, p]*(CA==0) + sig2A[i, p]*(CA > 0))
300
-	B.scale <- sqrt(tau2B[i, p]*(CB==0) + sig2B[i, p]*(CB > 0))
301
-	scale <- c(A.scale, B.scale)
302
-	if(CA == 0 & CB > 0) rho <- corrA.BB[i, p]
303
-	if(CA > 0 & CB == 0) rho <- corrB.AA[i, p]
304
-	if(CA > 0 & CB > 0) rho <- corr[i, p]			
305
-	lines(ellipse(x=rho, centre=c(log2(nuA[i, p]+CA*phiA[i, p]), log2(nuB[i, p]+CB*phiB[i, p])),
306
-		      scale=scale), col=ellipse.col, lwd=2)
307
-}
308
-ellipse.col <- "blue" 
309
-p <- plate2
310
-for(CA in 0:CT){
311
-	CB <- CT-CA
312
-	A.scale <- sqrt(tau2A[i, p]*(CA==0) + sig2A[i, p]*(CA > 0))
313
-	B.scale <- sqrt(tau2B[i, p]*(CB==0) + sig2B[i, p]*(CB > 0))
314
-	scale <- c(A.scale, B.scale)
315
-	if(CA == 0 & CB > 0) rho <- corrA.BB[i, p]
316
-	if(CA > 0 & CB == 0) rho <- corrB.AA[i, p]
317
-	if(CA > 0 & CB > 0) rho <- corr[i, p]			
318
-	lines(ellipse(x=rho, centre=c(log2(nuA[i, p]+CA*phiA[i, p]), log2(nuB[i, p]+CB*phiB[i, p])),
319
-		      scale=scale), col=ellipse.col, lwd=2)
279
+I <- which(phiA > 10 & phiB > 10)
280
+col <- c("grey0", "grey30", "grey70")
281
+cex <- 3
282
+for(j in seq(along=I)){
283
+	i <- I[j]
284
+	J1 <- grep("C", plate) ##sample indices for this plate
285
+	J2 <- grep("Y", plate) ##sample indices for this plate	
286
+	J3 <- grep("A", plate)
287
+	plot(log2(A[i, ]), log2(B[i, ]), pch=21, col="grey60", type="n", cex=0.9, ylim=ylim, xlim=ylim, xlab="A", ylab="B")
288
+	points(log2(A[i, J1]), log2(B[i, J1]), col=col[1], pch=".", cex=cex)
289
+	points(log2(A[i, J2]), log2(B[i, J2]), col=col[2], pch=".", cex=cex)
290
+	points(log2(A[i, J3]), log2(B[i, J3]), col=col[3], pch=".", cex=cex)
291
+	P <- 1:3
292
+	ellipse.col <- col
293
+	for(p in seq(along=P)){
294
+		for(CT in 2){
295
+			for(CA in 0:CT){
296
+				CB <- CT-CA
297
+				A.scale <- sqrt(tau2A[i, p]*(CA==0) + sig2A[i, p]*(CA > 0))
298
+				B.scale <- sqrt(tau2B[i, p]*(CB==0) + sig2B[i, p]*(CB > 0))
299
+				scale <- c(A.scale, B.scale)
300
+				if(CA == 0 & CB > 0) rho <- corrA.BB[i, p]
301
+				if(CA > 0 & CB == 0) rho <- corrB.AA[i, p]
302
+				if(CA > 0 & CB > 0) rho <- corr[i, p]		
303
+				lines(ellipse(x=rho, centre=c(log2(nuA[i, p]+CA*phiA[i, p]), log2(nuB[i, p]+CB*phiB[i, p])),
304
+					      scale=scale), col=ellipse.col[p], lwd=2)
305
+			}
306
+		}
307
+
308
+	}
309
+	legend("topright", lwd=3, col=ellipse.col, legend=unique(plate), bty="n")	
320 310
 }
321
-legend("topright", lwd=3, col=c("brown", "blue"), legend=c("2 copies, plate 1", "2 copies, plate 2 "), bty="n")
322 311
 @ 
323 312
 
324
-\begin{figure}
325
-  \includegraphics[width=0.9\textwidth]{copynumber-plateEffect}
326
-  \caption{The prediction regions for copy number 2 shift when there
327
-    is a large plate effect.}
328
-\end{figure}
313
+Look at the distribution of shifts in the predicted centers across the
314
+plates.  The biggest shifts are for SNPs that have no observations in
315
+a subset of the plates.  Here, we should borrow strength across plates
316
+to improve the prediction regions (TO DO).
329 317
 
330
-Alternatively, loop through a large number of SNPs to see how these
331
-regions move
332
-<<codeForLoop, eval=FALSE>>=
333
-par(las=1, pty="s", ask=TRUE)
334
-for(i in I[1:50]){
335
-	plot(log2(A[i, ]), log2(B[i, ]), pch=as.character(calls[i, ]), col=col[DS], cex=0.9, ylim=ylim, xlim=ylim)
336
-	for(CT in 2:3){
337
-		if(CT == 2) ellipse.col <- "brown" else ellipse.col <- "purple"
318
+<<shifts, eval=FALSE>>=
319
+shiftA <- log2(nuA+phiA)
320
+maxA <- apply(shiftA, 1, max, na.rm=TRUE)
321
+minA <- apply(shiftA, 1, min, na.rm=TRUE)
322
+d <- maxA - minA
323
+hist(d)
324
+index <- which(d > 1)
325
+plate1 <- plate2 <- rep(NA, length(index))
326
+for(i in seq(along=index)){
327
+	plate1[i] <- which(shiftA[index[i], ] == maxA[index[i]])
328
+	plate2[i] <- which(shiftA[index[i], ] == minA[index[i]])
329
+}
330
+
331
+par(las=1)
332
+for(j in seq(along=index)){
333
+	i <- index[j]
334
+	J1 <- grep(plate[plate1[j]], plate) ##sample indices for this plate
335
+	J2 <- grep(plate[plate2[j]], plate) ##sample indices for this plate
336
+	ylim <- c(6.5,13)
337
+	plot(log2(A[i, ]), log2(B[i, ]), pch=as.character(calls[i, ]), col="grey60", cex=0.9, ylim=ylim, xlim=ylim, xlab="A", ylab="B")
338
+	points(log2(A[i, J1]), log2(B[i, J1]), col="brown", pch=as.character(calls[i, J1]))
339
+	points(log2(A[i, J2]), log2(B[i, J2]), col="blue", pch=as.character(calls[i, J2]))
340
+	CT <- 2
341
+	P <- c(plate1[j], plate2[j])
342
+	for(p in seq(along=P)){
343
+		if(p == 1) ellipse.col="brown" else ellipse.col="blue"
338 344
 		for(CA in 0:CT){
339 345
 			CB <- CT-CA
340 346
 			A.scale <- sqrt(tau2A[i, p]*(CA==0) + sig2A[i, p]*(CA > 0))
341 347
 			B.scale <- sqrt(tau2B[i, p]*(CB==0) + sig2B[i, p]*(CB > 0))
348
+			scale <- c(A.scale, B.scale)
342 349
 			if(CA == 0 & CB > 0) rho <- corrA.BB[i, p]
343 350
 			if(CA > 0 & CB == 0) rho <- corrB.AA[i, p]
344 351
 			if(CA > 0 & CB > 0) rho <- corr[i, p]			
345
-			scale <- c(A.scale, B.scale)
346 352
 			lines(ellipse(x=rho, centre=c(log2(nuA[i, p]+CA*phiA[i, p]), log2(nuB[i, p]+CB*phiB[i, p])),
347 353
 				      scale=scale), col=ellipse.col, lwd=2)
348 354
 		}
349 355
 	}
350
-	legend("topright", lwd=3, col=col, legend=c("3 copies", "2 copies"), bty="n")
351 356
 }
357
+legend("topright", lwd=3, col=c("brown", "blue"), legend=c("2 copies, plate 1", "2 copies, plate 2 "), bty="n")
352 358
 @ 
353 359
 
360
+Now look at shifts for which we have at least 3 observations in each
361
+genotype cluster (TO DO).
362
+
363
+<<shifts, eval=FALSE, echo=FALSE>>=
364
+shiftA <- log2(nuA+phiA)
365
+maxA <- apply(shiftA, 1, max, na.rm=TRUE)
366
+minA <- apply(shiftA, 1, min, na.rm=TRUE)
367
+d <- maxA - minA
368
+hist(d)
369
+index <- which(d > 0.2)
370
+plate1 <- plate2 <- rep(NA, length(index))
371
+for(i in seq(along=index)){
372
+	plate1[i] <- which(shiftA[index[i], ] == maxA[index[i]])
373
+	plate2[i] <- which(shiftA[index[i], ] == minA[index[i]])
374
+}
375
+@ 
376
+
377
+%\begin{figure}
378
+%  \includegraphics[width=0.9\textwidth]{copynumber-plateEffect}
379
+%  \caption{The prediction regions for copy number 2 shift when there
380
+%    is a large plate effect.}
381
+%\end{figure}
382
+
354 383
 \section{Session information}
355 384
 <<>>=
356 385
 sessionInfo()
357 386
Binary files a/inst/scripts/copynumber.pdf and b/inst/scripts/copynumber.pdf differ