Browse code

Add vignettes for copy number analysis. Add crlmmGT2.R file

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@54171 bc3139a8-67e5-0310-9ffc-ced21a209358

Rob Scharp authored on 30/03/2011 02:41:00
Showing17 changed files

... ...
@@ -1,4 +1,7 @@
1 1
 .auto*
2
+*.aux
3
+*.dvi
4
+cnvignette.org
2 5
 test/*
3 6
 *.o
4 7
 *.so
... ...
@@ -1453,18 +1453,17 @@ shrinkSummary <- function(object,
1453 1453
 			  MIN.SAMPLES=10,
1454 1454
 			  DF.PRIOR=50,
1455 1455
 			  verbose=TRUE,
1456
-			  marker.index,
1457
-			  is.lds=TRUE){
1456
+			  marker.index){
1458 1457
 	stopifnot(type[[1]] == "SNP")
1459 1458
 	CHR.X <- FALSE ## this is no longer needed
1459
+	is.lds <- is(calls(object), "ffdf") | is(calls(object), "ff_matrix")
1460
+	if(is.lds) stopifnot(isPackageLoaded("ff"))
1460 1461
 	if(missing(marker.index)){
1461 1462
 		batch <- batch(object)
1462 1463
 		is.snp <- isSnp(object)
1463 1464
 		is.autosome <- chromosome(object) < 23
1464 1465
 		is.annotated <- !is.na(chromosome(object))
1465 1466
 		is.X <- chromosome(object) == 23
1466
-		is.lds <- is(calls(object), "ffdf") | is(calls(object), "ff_matrix")
1467
-		if(is.lds) stopifnot(isPackageLoaded("ff"))
1468 1467
 		marker.index <- whichMarkers(type[[1]], is.snp, is.autosome, is.annotated, is.X)
1469 1468
 	}
1470 1469
 	if(is.lds){
... ...
@@ -1489,33 +1488,31 @@ shrinkSummary <- function(object,
1489 1488
 			      DF.PRIOR=DF.PRIOR,
1490 1489
 			      is.lds=is.lds)
1491 1490
 	}
1492
-	TRUE
1491
+	if(is.lds) return(TRUE) else return(object)
1493 1492
 }
1494 1493
 
1495 1494
 genotypeSummary <- function(object,
1496 1495
 			    GT.CONF.THR=0.80,
1497 1496
 			    type=c("SNP", "NP", "X.SNP", "X.NP"), ##"X.snps", "X.nps"),
1498 1497
 			    verbose=TRUE,
1499
-			    marker.index,
1500
-			    is.lds){
1498
+			    marker.index){
1501 1499
 	if(type == "X.SNP" | type=="X.NP"){
1502 1500
 		CHR.X <- TRUE
1503 1501
 	} else CHR.X <- FALSE
1502
+	is.lds <- is(calls(object), "ffdf") | is(calls(object), "ff_matrix")
1503
+	if(is.lds) stopifnot(isPackageLoaded("ff"))
1504 1504
 	if(missing(marker.index)){
1505 1505
 		batch <- batch(object)
1506 1506
 		is.snp <- isSnp(object)
1507 1507
 		is.autosome <- chromosome(object) < 23
1508 1508
 		is.annotated <- !is.na(chromosome(object))
1509 1509
 		is.X <- chromosome(object) == 23
1510
-		is.lds <- is(calls(object), "ffdf") | is(calls(object), "ff_matrix")
1511
-		if(is.lds) stopifnot(isPackageLoaded("ff"))
1512 1510
 		marker.index <- whichMarkers(type[[1]], is.snp, is.autosome, is.annotated, is.X)
1513 1511
 	}
1514 1512
 	summaryFxn <- function(type){
1515 1513
 		switch(type,
1516 1514
 		       SNP="summarizeSnps",
1517 1515
 		       NP="summarizeNps",
1518
-##		       X.SNP="summarizeSnps",
1519 1516
 		       X.NP="summarizeNps")
1520 1517
 	}
1521 1518
 	myf <- summaryFxn(type[[1]])
... ...
@@ -1529,7 +1526,6 @@ genotypeSummary <- function(object,
1529 1526
 			 batchSize=ocProbesets(),
1530 1527
 			 GT.CONF.THR=GT.CONF.THR,
1531 1528
 			 verbose=verbose,
1532
-			 is.lds=is.lds,
1533 1529
 			 CHR.X=CHR.X,
1534 1530
 			 neededPkgs="crlmm")
1535 1531
 	} else{
... ...
@@ -1539,10 +1535,9 @@ genotypeSummary <- function(object,
1539 1535
 			      batchSize=ocProbesets(),
1540 1536
 			      GT.CONF.THR=GT.CONF.THR,
1541 1537
 			      verbose=verbose,
1542
-			      is.lds=is.lds,
1543 1538
 			      CHR.X=CHR.X)
1544 1539
 	}
1545
-	TRUE
1540
+	if(is.lds) return(TRUE) else return(object)
1546 1541
 }
1547 1542
 
1548 1543
 whichMarkers <- function(type, is.snp, is.autosome, is.annotated, is.X){
... ...
@@ -1556,7 +1551,8 @@ whichMarkers <- function(type, is.snp, is.autosome, is.annotated, is.X){
1556 1551
 }
1557 1552
 
1558 1553
 summarizeNps <- function(strata, index.list, object, batchSize,
1559
-			 GT.CONF.THR, verbose, is.lds, CHR.X, ...){
1554
+			 GT.CONF.THR, verbose, CHR.X, ...){
1555
+	is.lds <- is(calls(object), "ffdf") | is(calls(object), "ff_matrix")
1560 1556
 	if(is.lds) {physical <- get("physical"); open(object)}
1561 1557
 	if(verbose) message("      Probe stratum ", strata, " of ", length(index.list))
1562 1558
 	index <- index.list[[strata]]
... ...
@@ -1610,11 +1606,13 @@ summarizeSnps <- function(strata,
1610 1606
 			  index.list,
1611 1607
 			  object,
1612 1608
 			  GT.CONF.THR,
1613
-			  verbose, is.lds=TRUE, CHR.X, ...){
1609
+			  verbose, CHR.X, ...){
1614 1610
 ##	if(is.lds) {
1615 1611
 ##		physical <- get("physical")
1616 1612
 ##		open(object)
1617 1613
 ##	}
1614
+	is.lds <- is(calls(object), "ffdf") | is(calls(object), "ff_matrix")
1615
+	if(is.lds) stopifnot(isPackageLoaded("ff"))
1618 1616
 	if(verbose) message("      Probe stratum ", strata, " of ", length(index.list))
1619 1617
 	index <- index.list[[strata]]
1620 1618
 	batches <- split(seq_along(batch(object)), as.character(batch(object)))
... ...
@@ -1832,25 +1830,28 @@ crlmmCopynumber <- function(object,
1832 1830
 		##if(verbose) message(paste("Computing summary statistics for ", mylabel(marker.type), " genotype clusters for each batch")
1833 1831
 		marker.index <- whichMarkers(marker.type, is.snp,
1834 1832
 					     is.autosome, is.annotated, is.X)
1835
-		genotypeSummary(object=object,
1836
-				GT.CONF.THR=GT.CONF.THR,
1837
-				type=marker.type,
1838
-				verbose=verbose,
1839
-				marker.index=marker.index,
1840
-				is.lds=is.lds)
1833
+		if(length(marker.index) == 0) next()
1834
+		res <- genotypeSummary(object=object,
1835
+				       GT.CONF.THR=GT.CONF.THR,
1836
+				       type=marker.type,
1837
+				       verbose=verbose,
1838
+				       marker.index=marker.index)
1839
+		if(!is.lds) {object <- res; rm(res); gc()}
1841 1840
 	}
1842 1841
 	if(verbose) message("Imputing unobserved genotype medians and shrinking the variances (within-batch, across loci) ")##SNPs only
1843 1842
 	if("SNP" %in% type){
1844 1843
 		marker.index <- whichMarkers("SNP", is.snp,
1845 1844
 					     is.autosome, is.annotated, is.X)
1846
-		shrinkSummary(object=object,
1847
-			      MIN.OBS=MIN.OBS,
1848
-			      MIN.SAMPLES=MIN.SAMPLES,
1849
-			      DF.PRIOR=DF.PRIOR,
1850
-			      type="SNP",
1851
-			      verbose=verbose,
1852
-			      marker.index=marker.index,
1853
-			      is.lds=is.lds)
1845
+		if(length(marker.index) > 0){
1846
+			res <- shrinkSummary(object=object,
1847
+					     MIN.OBS=MIN.OBS,
1848
+					     MIN.SAMPLES=MIN.SAMPLES,
1849
+					     DF.PRIOR=DF.PRIOR,
1850
+					     type="SNP",
1851
+					     verbose=verbose,
1852
+					     marker.index=marker.index)
1853
+			if(!is.lds) {object <- res; rm(res); gc()}
1854
+		}
1854 1855
 	}
1855 1856
 	if(verbose) message("Estimating parameters for copy number")##SNPs only
1856 1857
 	for(i in seq_along(type)){
... ...
@@ -1859,6 +1860,7 @@ crlmmCopynumber <- function(object,
1859 1860
 		CHR.X <- ifelse(marker.type %in% c("X.SNP", "X.NP"), TRUE, FALSE)
1860 1861
 		marker.index <- whichMarkers(marker.type, is.snp,
1861 1862
 					     is.autosome, is.annotated, is.X)
1863
+		if(length(marker.index) == 0) next()
1862 1864
 		object <- estimateCnParameters(object=object,
1863 1865
 					       type=marker.type,
1864 1866
 					       SNRMin=SNRMin,
... ...
@@ -1875,7 +1877,7 @@ crlmmCopynumber <- function(object,
1875 1877
 					       CHR.X=CHR.X)
1876 1878
 	}
1877 1879
 	close(object)
1878
-	TRUE
1880
+	if(is.lds) return(TRUE) else return(object)
1879 1881
 }
1880 1882
 crlmmCopynumber2 <- function(){
1881 1883
 	.Defunct(msg="The crlmmCopynumber2 function has been deprecated. The function crlmmCopynumber should be used instead.  crlmmCopynumber will support large data using ff provided that the ff package is loaded.")
1882 1884
new file mode 100644
... ...
@@ -0,0 +1,264 @@
1
+crlmmGT2 <- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL,
2
+                     col.names=NULL, probs=c(1/3, 1/3, 1/3), DF=6,
3
+                     SNRMin=5, recallMin=10, recallRegMin=1000,
4
+                     gender=NULL, desctrucitve=FALSE, verbose=TRUE,
5
+                     returnParams=FALSE, badSNP=.7, snp.names){
6
+	pkgname <- getCrlmmAnnotationName(cdfName)
7
+	stopifnot(require(pkgname, character.only=TRUE, quietly=!verbose))
8
+	open(SNR)
9
+	open(A)
10
+	open(B)
11
+	open(mixtureParams)
12
+	## expect objects to be ff
13
+	keepIndex <- which( SNR[] > SNRMin)
14
+	if(length(keepIndex)==0) stop("No arrays above quality threshold!")
15
+	if(is.null(rownames(A))){
16
+		loader("preprocStuff.rda", .crlmmPkgEnv, pkgname)
17
+		gns <- getVarInEnv("gns", .crlmmPkgEnv)
18
+		stopifnot(nrow(A) == length(gns))
19
+		index <- seq(length=nrow(A))
20
+	}
21
+	if(!missing(snp.names)){
22
+		stopifnot(!is.null(rownames(A)))
23
+		##verify that A has only snps.  otherwise, calling function must pass rownames
24
+		index <- match(snp.names, rownames(A))
25
+	}
26
+	snpBatches <- splitIndicesByLength(index, ocProbesets())
27
+	NR <- length(unlist(snpBatches))
28
+	if(verbose) cat("Calling", NR, "SNPs for recalibration... ")
29
+	NC <- ncol(A)
30
+	##
31
+	if(verbose) message("Loading annotations.")
32
+	obj1 <- loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname)
33
+	obj2 <- loader("mixtureStuff.rda", .crlmmPkgEnv, pkgname)
34
+	## this is toget rid of the 'no visible binding' notes
35
+	## variable definitions
36
+	XIndex <- getVarInEnv("XIndex")
37
+	autosomeIndex <- getVarInEnv("autosomeIndex")
38
+	YIndex <- getVarInEnv("YIndex")
39
+	SMEDIAN <- getVarInEnv("SMEDIAN")
40
+	theKnots <- getVarInEnv("theKnots")
41
+	regionInfo <- getVarInEnv("regionInfo")
42
+	params <- getVarInEnv("params")
43
+	rm(list=c(obj1, obj2), envir=.crlmmPkgEnv)
44
+	rm(obj1, obj2)
45
+	##
46
+	## IF gender not provide, we predict
47
+	## FIXME: XIndex may be greater than ocProbesets()
48
+	if(is.null(gender)){
49
+		if(verbose) message("Determining gender.")
50
+		##    XMedian <- apply(log2(A[XIndex,, drop=FALSE])+log2(B[XIndex,, drop=FALSE]), 2, median)/2
51
+		XMedian <- ocLapply(splitIndicesByNode(1:NC), predictGender, theA=A, theB=B, XIndex=XIndex, neededPkgs="crlmm")
52
+		XMedian <- unlist(XMedian)
53
+		if(sum(SNR[] > SNRMin)==1){
54
+			gender <- which.min(c(abs(XMedian-8.9), abs(XMedian-9.5)))
55
+		}else{
56
+			gender <- kmeans(XMedian, c(min(XMedian[SNR[]>SNRMin]), max(XMedian[SNR[]>SNRMin])))[["cluster"]]
57
+		}
58
+	}
59
+	##
60
+	Indexes <- list(autosomeIndex, XIndex, YIndex)
61
+	cIndexes <- list(keepIndex,
62
+			 keepIndex[which(gender[keepIndex]==2)],
63
+			 keepIndex[which(gender[keepIndex]==1)])
64
+	if(verbose) cat("Calling", NR, "SNPs for recalibration... ")
65
+	## call C
66
+	fIndex <- which(gender==2)
67
+	mIndex <- which(gender==1)
68
+	## different here
69
+	## use gtypeCallerR in batches
70
+	##snpBatches <- splitIndicesByLength(1:nrow(A), ocProbesets())
71
+	newparamsBatch <- vector("list", length(snpBatches))
72
+	process1 <- function(idxBatch, snpBatches, autosomeIndex, XIndex,
73
+			     YIndex, A, B, mixtureParams, fIndex, mIndex,
74
+			     params, cIndexes, SMEDIAN, theKnots, DF, probs, batchSize){
75
+		open(A)
76
+		open(B)
77
+		open(mixtureParams)
78
+		snps <- snpBatches[[idxBatch]]
79
+		rSnps <- range(snps)
80
+		last <- (idxBatch-1)*batchSize
81
+		IndexesBatch <- list(autosomeIndex[autosomeIndex %in% snps]-last,
82
+				     XIndex[XIndex %in% snps]-last,
83
+				     YIndex[YIndex %in% snps]-last)
84
+		IndexesBatch <- lapply(IndexesBatch, as.integer)
85
+		tmpA <- as.matrix(A[snps,])
86
+		tmpB <- as.matrix(B[snps,])
87
+		## newparamsBatch[[idxBatch]]
88
+		tmp <- gtypeCallerR(tmpA, tmpB, fIndex, mIndex,
89
+				    params[["centers"]][snps,],
90
+				    params[["scales"]][snps,],
91
+				    params[["N"]][snps,],
92
+				    IndexesBatch, cIndexes,
93
+				    sapply(IndexesBatch, length),
94
+				    sapply(cIndexes, length), SMEDIAN,
95
+				    theKnots, mixtureParams[], DF, probs, 0.025)
96
+		rm(snps, rSnps, IndexesBatch, tmpA, tmpB, last)
97
+		gc(verbose=FALSE)
98
+		close(A)
99
+		close(B)
100
+		close(mixtureParams)
101
+		tmp
102
+	}
103
+	##
104
+	newparamsBatch <- ocLapply(seq(along=snpBatches), process1,
105
+				   snpBatches=snpBatches,
106
+				   autosomeIndex=autosomeIndex, XIndex=XIndex,
107
+				   YIndex=YIndex, A=A, B=B,
108
+				   mixtureParams=mixtureParams, fIndex=fIndex,
109
+				   mIndex=mIndex, params=params,
110
+				   cIndexes=cIndexes, SMEDIAN=SMEDIAN,
111
+				   theKnots=theKnots, DF=DF, probs=probs,
112
+				   batchSize=ocProbesets())
113
+	newparams <- vector("list", 3)
114
+	names(newparams) <- c("centers", "scales", "N")
115
+	newparams[["centers"]] <- do.call("rbind", lapply(newparamsBatch, "[[", 1))
116
+	newparams[["scales"]] <- do.call("rbind", lapply(newparamsBatch, "[[", 2))
117
+	newparams[["N"]] <- do.call("rbind", lapply(newparamsBatch, "[[", 3))
118
+	rm(newparamsBatch); gc(verbose=FALSE)
119
+	if(verbose) message("Done.")
120
+	if(verbose) message("Estimating recalibration parameters.")
121
+	d <- newparams[["centers"]] - params$centers
122
+	##
123
+	##regression
124
+	Index <- intersect(which(pmin(newparams[["N"]][, 1],
125
+				      newparams[["N"]][, 2],
126
+				      newparams[["N"]][, 3]) > recallMin &
127
+				 !apply(regionInfo, 1, any)),
128
+			   autosomeIndex)
129
+	if(length(Index) < recallRegMin){
130
+		warning("Recalibration not possible. Possible cause: small sample size.")
131
+		newparams <- params
132
+		dev <- vector("numeric", nrow(newparams[["centers"]]))
133
+		SS <- matrix(Inf, 3, 3)
134
+		DD <- 0
135
+	}else{
136
+		data4reg <- as.data.frame(newparams[["centers"]][Index,])
137
+		names(data4reg) <- c("AA", "AB", "BB")
138
+		regParams <- cbind(  coef(lm(AA~AB*BB, data=data4reg)),
139
+				   c(coef(lm(AB~AA+BB, data=data4reg)), 0),
140
+				   coef(lm(BB~AA*AB, data=data4reg)))
141
+		rownames(regParams) <- c("intercept", "X", "Y", "XY")
142
+		rm(data4reg)
143
+		##
144
+		minN <- 3
145
+		newparams[["centers"]][newparams[["N"]] < minN] <- NA
146
+		Index <- setdiff(which(rowSums(is.na(newparams[["centers"]]))==1), YIndex)
147
+		if(verbose) message("Filling out empty centers", appendLF=FALSE)
148
+		for(i in Index){
149
+			if(verbose) if(i%%10000==0) message(".", appendLF=FALSE)
150
+			mu <- newparams[["centers"]][i, ]
151
+			j <- which(is.na(mu))
152
+			newparams[["centers"]][i, j] <- c(1, mu[-j], prod(mu[-j]))%*%regParams[, j]
153
+			rm(mu, j)
154
+		}
155
+		##
156
+		##remaing NAs are made like originals
157
+		if(length(YIndex)>0){
158
+			noMoveIndex <- union(setdiff(which(rowSums(is.na(newparams[["centers"]]))>0), YIndex),
159
+					     YIndex[rowSums(is.na(newparams[["centers"]][YIndex, ])>1)])
160
+		}
161
+		snps2ignore <- which(rowSums(is.na(newparams[["centers"]])) > 0)
162
+		snps2keep <- setdiff(autosomeIndex, snps2ignore)
163
+		rm(snps2ignore)
164
+		newparams[["centers"]][is.na(newparams[["centers"]])] <- params[["centers"]][is.na(newparams[["centers"]])]
165
+		if(verbose) cat("\n")
166
+		##
167
+		if(verbose) message("Calculating and standardizing size of shift... ", appendLF=FALSE)
168
+		GG <- DD <- newparams[["centers"]] - params[["centers"]]
169
+		DD <- sweep(DD, 2, colMeans(DD[autosomeIndex, ]))
170
+		SS <- cov(DD[autosomeIndex, ])
171
+		SSI <- solve(SS)
172
+		dev <- vector("numeric", nrow(DD))
173
+		if(length(YIndex)){
174
+			dev[-YIndex] <- apply(DD[-YIndex, ], 1, function(x) x%*%SSI%*%x)
175
+			dev[-YIndex] <- 1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev[-YIndex])
176
+			##Now Y (only two params)
177
+			SSY <- SS[c(1, 3), c(1, 3)]
178
+			SSI <- solve(SSY)
179
+			dev[YIndex] <- apply(DD[YIndex, c(1, 3)], 1, function(x) x%*%SSI%*%x)
180
+			dev[YIndex] <- 1/sqrt( (2*pi)^2*det(SSY))*exp(-0.5*dev[YIndex])
181
+		} else {
182
+			dev=apply(DD,1,function(x) x%*%SSI%*%x)
183
+			dev=1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev)
184
+		}
185
+	}
186
+	if (verbose) message("OK")
187
+	##
188
+	## BC: must keep SD
189
+	params[-2] <- newparams[-2]
190
+	rm(newparams)
191
+	gc(verbose=FALSE)
192
+	##
193
+	if(verbose) message("Calling ", NR, " SNPs... ", appendLF=FALSE)
194
+	##
195
+	## ###################
196
+	## ## MOVE TO C#######
197
+	##
198
+	## running in batches
199
+	process2 <- function(idxBatch, snpBatches, autosomeIndex, XIndex,
200
+			     YIndex, A, B, mixtureParams, fIndex, mIndex,
201
+			     params, cIndexes, SMEDIAN, theKnots, DF, probs,
202
+			     regionInfo, batchSize){
203
+		open(A)
204
+		open(B)
205
+		open(mixtureParams)
206
+		snps <- snpBatches[[idxBatch]]
207
+		tmpA <- as.matrix(A[snps,])
208
+		tmpB <- as.matrix(B[snps,])
209
+		rSnps <- range(snps)
210
+		last <- (idxBatch-1)*batchSize
211
+		IndexesBatch <- list(autosomeIndex[autosomeIndex %in% snps]-last,
212
+				     XIndex[XIndex %in% snps]-last,
213
+				     YIndex[YIndex %in% snps]-last)
214
+		IndexesBatch <- lapply(IndexesBatch, as.integer)
215
+		ImNull <- gtypeCallerR2(tmpA, tmpB, fIndex, mIndex,
216
+					params[["centers"]][snps,],
217
+					params[["scales"]][snps,],
218
+					params[["N"]][snps,],
219
+					IndexesBatch, cIndexes,
220
+					sapply(IndexesBatch, length),
221
+					sapply(cIndexes, length),
222
+					SMEDIAN, theKnots, mixtureParams[],
223
+					DF, probs, 0.025,
224
+					which(regionInfo[snps, 2]),
225
+					which(regionInfo[snps, 1]))
226
+		A[snps,] <- tmpA
227
+		B[snps,] <- tmpB
228
+		rm(tmpA, tmpB, snps, rSnps, IndexesBatch, ImNull, last)
229
+		gc(verbose=FALSE)
230
+		close(A)
231
+		close(B)
232
+		close(mixtureParams)
233
+	}
234
+	##
235
+	ocLapply(seq(along=snpBatches), process2, snpBatches=snpBatches,
236
+		 autosomeIndex=autosomeIndex, XIndex=XIndex, YIndex=YIndex,
237
+		 A=A, B=B, mixtureParams=mixtureParams, fIndex=fIndex,
238
+		 mIndex=mIndex, params=params, cIndexes=cIndexes,
239
+		 SMEDIAN=SMEDIAN, theKnots=theKnots, DF=DF, probs=probs,
240
+		 regionInfo=regionInfo, batchSize=ocProbesets())
241
+	##  END MOVE TO C#######
242
+	## ##################
243
+	##
244
+	dev <- dev/(dev+1/383)
245
+	if(!is.null(row.names)){ rownames(A) <- rownames(B) <- names(dev) <- row.names}
246
+	if(!is.null(col.names)){ colnames(A) <- colnames(B) <- col.names}
247
+	##
248
+	if(length(Index) >= recallRegMin){
249
+		tmp4batchQC <- DD[autosomeIndex,]/(params[["N"]][autosomeIndex,]+1)
250
+		tmpSnpQc <- dev[autosomeIndex]
251
+		SS <- cov(tmp4batchQC[tmpSnpQc < badSNP,])
252
+		batchQC <- mean(diag(SS))
253
+	}else{
254
+		SS <- matrix(0, 3, 3)
255
+		batchQC <- Inf
256
+	}
257
+	##
258
+	if(verbose) message("Done.")
259
+	if (returnParams){
260
+		return(list(calls=A, confs=B, SNPQC=dev, batchQC=batchQC, params=params, DD=DD, covDD=SS, gender=gender, pkgname=pkgname))
261
+	}else{
262
+		return(list(calls=A, confs=B, SNPQC=dev, batchQC=batchQC, DD=DD, covDD=SS, gender=gender, pkgname=pkgname))
263
+	}
264
+}
0 265
new file mode 100644
... ...
@@ -0,0 +1,98 @@
1
+%\VignetteIndexEntry{crlmm copy number Vignette}
2
+%\VignetteDepends{crlmm, genomewidesnp6Crlmm}
3
+%\VignetteKeywords{crlmm, SNP 6}
4
+%\VignettePackage{crlmm}
5
+\documentclass{article}
6
+\usepackage{graphicx}
7
+\usepackage{natbib}
8
+\usepackage{url}
9
+\newcommand{\Rfunction}[1]{{\texttt{#1}}}
10
+\newcommand{\Rmethod}[1]{{\texttt{#1}}}
11
+\newcommand{\Rcode}[1]{{\texttt{#1}}}
12
+\newcommand{\Robject}[1]{{\texttt{#1}}}
13
+\newcommand{\Rpackage}[1]{{\textsf{#1}}}
14
+\newcommand{\Rclass}[1]{{\textit{#1}}}
15
+\newcommand{\oligo}{\Rpackage{oligo }}
16
+\newcommand{\R}{\textsf{R}}
17
+\newcommand{\crlmm}{\Rpackage{crlmm}}
18
+\newcommand{\ff}{\Rpackage{ff}}
19
+\usepackage[margin=1in]{geometry}
20
+
21
+\begin{document}
22
+\title{Overview of vignettes for copy number estimation}
23
+\date{\today}
24
+\author{Rob Scharpf}
25
+\maketitle
26
+
27
+The workflow for copy number analyses in the \crlmm{} package requires
28
+preprocessing and genotyping, followed by estimation of parameters for
29
+copy number estimation.  Supported platforms are those for which a
30
+corresponding annotation package is available (see Tables
31
+\ref{overview} and \ref{illumina}).  Table \ref{overview} provides an
32
+overview of the available vignettes pertaining to copy number
33
+estimation.  These vignettes are located in the \verb+inst/scripts+
34
+subdirectory of the \crlmm{} package. HapMap datasets are used to
35
+illustrate the workflow and are not provided as part of the \crlmm{}
36
+package.  Users wishing to reproduce the analysis should download the
37
+HapMap CEL files (Affymetrix) or the \verb+idat+ files (Illumina) and
38
+modify the paths to the raw data files as appropriate.
39
+
40
+\begin{table}[h!]
41
+\begin{center}
42
+\begin{tabular}{|lp{1.2in}p{1.5in}p{2in}|}
43
+\hline
44
+ Vignette                &  Platform            &  Annotation package                        &  Scope                                               \\
45
+\hline
46
+ Infrastructure          &                      &                                            &  The CNSet container / large data support            \\
47
+ AffymetrixPreprocessCN  &  Affy 5.0, 6.0       &  genomewidesnp5Crlmm, genomewidesnp6Crlmm  &  Preprocessing and genotyping                        \\
48
+ IlluminaPreprocessCN    &  Illumina platforms  &  several$^\dagger$                                   &  Preprocessing and genotyping                        \\
49
+ copynumber              &  Affy/Illumina       &  N/A                                       &  raw copy number estimates                           \\
50
+% SmoothingRawCN          &  Affy/Illumina       &  N/A                                       &  smoothing via segmentation or hidden Markov models  \\
51
+\hline
52
+\end{tabular}
53
+\end{center}
54
+\caption{\label{overview} Vignettes for copy number
55
+  estimation. $^\dagger$ See table \ref{illumina} for the annotation
56
+  packages available for the Illumina platform.}
57
+\end{table}
58
+
59
+\begin{table}[h!]
60
+\begin{center}
61
+\begin{tabular}{|l|}
62
+\hline
63
+ human370v1cCrlmm        \\
64
+ human370quadv3cCrlmm    \\
65
+ human550v3bCrlmm        \\
66
+ human650v3aCrlmm        \\
67
+ human610quadv1bCrlmm    \\
68
+ human660quadv1aCrlmm    \\
69
+ human1mduov3bCrlmm      \\
70
+ humanomni1quadv1bCrlmm  \\
71
+\hline
72
+\end{tabular}
73
+\end{center}
74
+\caption{\label{illumina} Annotation packages for the Illumina platform.}
75
+\end{table}
76
+
77
+%We make use of the \R{} package \Rpackage{cacheSweave} for cacheing
78
+%code chunks that are computationally intensive.  In addition, we
79
+%indicate that the cached files should be stored in the directory
80
+%\verb+outdir+.
81
+
82
+In general, the workflow is
83
+\begin{enumerate}
84
+\item preprocessing and genotyping (\verb+AffymetrixPreprocessCN+ or \verb+IlluminaPreprocessCN+ vignettes)
85
+\item copy number estimation (\verb+copynumber+ vignette)
86
+%\item inferring regions of copy number gain and loss
87
+%  (\verb+SmoothingRawCN+ vignette)
88
+\end{enumerate}
89
+%The \verb+SmoothingRawCN+ vignette illustrates one approach for
90
+%interfacing with packages such as \Rpackage{DNAcopy} and
91
+%\Rpackage{VanillaICE} for identifying regions of copy number gain or
92
+%loss.
93
+The \verb+Infrastructure+ vignette provides additional details on the
94
+\Rclass{CNSet} container used to organize the processed data as well
95
+as a brief discussion regarding large data support through the \ff{}
96
+package.
97
+
98
+\end{document}
0 99
\ No newline at end of file
1 100
new file mode 100644
2 101
Binary files /dev/null and b/inst/doc/CopyNumberOverview.pdf differ
... ...
@@ -1,2 +1,3 @@
1 1
 illumina_copynumber.tex
2
+run*.R
2 3
 
3 4
new file mode 100644
... ...
@@ -0,0 +1,194 @@
1
+%\VignetteIndexEntry{AffymetrixPreprocessCN Vignette for Affymetrix}
2
+%\VignetteDepends{crlmm, genomewidesnp6Crlmm, cacheSweave, ff}
3
+%\VignetteKeywords{crlmm, SNP 6, copy number, SNP}
4
+%\VignettePackage{crlmm}
5
+\documentclass{article}
6
+\usepackage{graphicx}
7
+\usepackage{natbib}
8
+\usepackage{url}
9
+\newcommand{\Rfunction}[1]{{\texttt{#1}}}
10
+\newcommand{\Rmethod}[1]{{\texttt{#1}}}
11
+\newcommand{\Rcode}[1]{{\texttt{#1}}}
12
+\newcommand{\Robject}[1]{{\texttt{#1}}}
13
+\newcommand{\Rpackage}[1]{{\textsf{#1}}}
14
+\newcommand{\Rclass}[1]{{\textit{#1}}}
15
+\newcommand{\oligo}{\Rpackage{oligo }}
16
+\newcommand{\R}{\textsf{R}}
17
+\newcommand{\crlmm}{\Rpackage{crlmm}}
18
+\usepackage[margin=1in]{geometry}
19
+
20
+\begin{document}
21
+\title{Preprocessing \& Genotyping Affymetrix Arrays for Copy Number Analysis}
22
+\date{\today}
23
+\author{Rob Scharpf}
24
+\maketitle
25
+
26
+<<setup, echo=FALSE, results=hide>>=
27
+options(continue=" ", width=70)
28
+@
29
+
30
+%\section{Estimating copy number}
31
+
32
+%At present, software for copy number estimation is provided only for the
33
+%Affymetrix 6.0 platform.
34
+
35
+\begin{abstract}
36
+
37
+  This vignette describes the setup needed to analyze Affymetrix 6.0
38
+  (or 5.0) CEL files and the steps for preprocessing and
39
+  genotyping. These steps must be completed prior to copy number
40
+  analyses in \crlmm{}.  After completing these steps, users can refer
41
+  to the \verb+copynumber+ vignette.
42
+
43
+\end{abstract}
44
+
45
+\section{Set up}
46
+
47
+<<cdfname, results=hide>>=
48
+library(crlmm)
49
+library(ff)
50
+library(cacheSweave)
51
+@
52
+
53
+This vignette analyzes HapMap samples assayed on the Affymetrix 6.0
54
+platform.  The annotation package for this platform is
55
+\Rpackage{genomewidesnp6Crlmm}.  We assign the name of the annotation
56
+package without the \verb+Crlmm+ postfix to the name \verb+cdfName+.
57
+
58
+<<cdfname>>=
59
+cdfName <- "genomewidesnp6"
60
+@
61
+
62
+
63
+The HapMap CEL files are stored in a local directory assigned to
64
+\verb+pathToCels+ in the following code. The genotyping step will
65
+create several files with \verb+ff+ extensions. We will store these
66
+files to the path indicated by \verb+outdir+.
67
+
68
+<<setup>>=
69
+pathToCels <- "/thumper/ctsa/snpmicroarray/hapmap/raw/affy/1m"
70
+if(getRversion() < "2.13.0"){
71
+	rpath <- getRversion()
72
+} else rpath <- "trunk"
73
+outdir <- paste("/thumper/ctsa/snpmicroarray/rs/ProcessedData/crlmm/", rpath, "/copynumber_vignette", sep="")
74
+dir.create(outdir, recursive=TRUE, showWarnings=FALSE)
75
+@
76
+
77
+By providing the path in \verb+outdir+ as an argument to the \R{}
78
+function \Rfunction{ldPath}, all of the \verb+ff+ files created during
79
+the genotyping step will be stored in \verb+outdir+.
80
+
81
+<<ldpath>>=
82
+ldPath(outdir)
83
+@
84
+
85
+This vignette uses the \R{} package \Rpackage{cacheSweave} to cache
86
+long computations.  The following step is only necessarily if one
87
+wishes to cache some of the computations.  In particular, we specify
88
+that the cached computations will be saved in the \verb+outdir+
89
+through the function \Rfunction{setCacheDir}.  Users should refer to
90
+the \Rpackage{cacheSweave} package for additional details regarding
91
+cacheing.
92
+
93
+<<cachedir, echo=FALSE>>=
94
+setCacheDir(outdir)
95
+@
96
+
97
+The \R{} functions \Rfunction{ocProbesets} and \Rfunction{ocSamples}
98
+manage the RAM required for our analysis. See the documentation for
99
+these functions and the \verb+CopyNumberOverview+ vignette for
100
+additional details.
101
+
102
+<<ram>>=
103
+ocProbesets(100000)
104
+ocSamples(200)
105
+@
106
+
107
+
108
+Next we indicate the local directory that contains the CEL files. For
109
+the purposes of this vignette, we only analyze the CEPH ('C') and
110
+Yoruban ('Y') samples.
111
+
112
+<<celfiles>>=
113
+celFiles <- list.celfiles(pathToCels, full.names=TRUE, pattern=".CEL")
114
+celFiles <- celFiles[substr(basename(celFiles), 13, 13) %in% c("C", "Y")]
115
+@
116
+
117
+Finally, copy number analyses using \crlmm{} require specification of
118
+a batch variable that is used to indicate which samples were processed
119
+together.  For example, if some of the samples were processed in April
120
+and another set of samples were processed in June, we could name the
121
+batches 'April' and 'June', respectively.  A useful surrogate for
122
+batch is often the chemistry plate or the scan date of the array. For
123
+the HapMap CEL files analyzed in this vignette, the CEPH (C) and
124
+Yoruban (Y) samples were prepared on separate chemistry plates.  In
125
+the following code chunk, we extract the population identifier from
126
+the CEL file names and assign these identifiers to the variable
127
+\Robject{plate}.
128
+
129
+<<plates>>=
130
+plates <- substr(basename(celFiles), 13, 13)
131
+@
132
+
133
+\section{Preprocessing and genotyping.}
134
+
135
+The preprocessing steps for copy number estimation includes quantile
136
+normalization of the raw intensities for each probe and a step that
137
+summarizes the intensities of multiple probes at a single locus.  For
138
+example, the Affymetrix 6.0 platform has 3 or 4 identical probes at
139
+each polymorphic locus and the normalized intensities are summarized
140
+by a median.  For the nonpolymorphic markers on Affymetrix 6.0, only
141
+one probe per locus is available and the summarization step is not
142
+needed.  After preprocessing the arrays, the \crlmm{} package
143
+estimates the genotype using the CRLMM algorithm and provides a
144
+confidence score for the genotype calls.  The function
145
+\Rfunction{genotype} performs both the preprocessing and genotyping.
146
+
147
+<<LDS_genotype, cache=TRUE>>=
148
+cnSet <- genotype(celFiles, batch=plates, cdfName=cdfName)
149
+@
150
+
151
+The value returned by genotype is an instance of the class
152
+\Robject{CNSet}.  The normalized intensities, genotype calls, and
153
+confidence scores are stored as \verb+ff+ objects in the
154
+\verb+assayData+ slot.  A concise summary of this object can be
155
+obtained throught the \Rfunction{print} or \Rfunction{show} methods.
156
+
157
+<<show>>=
158
+print(cnSet)
159
+@
160
+
161
+Note that the object is fairly small as the intensities and genotype
162
+calls are stored on disk rather than in active memory.
163
+
164
+<<objectsize>>=
165
+object.size(cnSet)
166
+@
167
+
168
+We save the \Robject{cnSet} object in a local directory for subsequent
169
+copy number analysis in the \verb+copynumber+ vignette.
170
+
171
+<<save,cache=TRUE>>=
172
+saveObject <- function(outdir, cnSet) {
173
+	save(cnSet, file=file.path(outdir, "cnSet.rda"))
174
+	TRUE
175
+}
176
+(cnset.saved <- saveObject(outdir, cnSet))
177
+@
178
+
179
+Users can proceed to the \verb+copynumber+ vignette for copy number
180
+analyses.
181
+
182
+
183
+
184
+\section{Session information}
185
+<<sessionInfo, results=tex>>=
186
+toLatex(sessionInfo())
187
+@
188
+
189
+<<echo=FALSE, results=hide>>=
190
+q("no")
191
+@
192
+
193
+
194
+\end{document}
0 195
new file mode 100644
1 196
Binary files /dev/null and b/inst/scripts/AffymetrixPreprocessCN.pdf differ
2 197
similarity index 68%
3 198
rename from inst/scripts/illumina_copynumber.Rnw
4 199
rename to inst/scripts/IlluminaPreprocessCN.Rnw
... ...
@@ -1,6 +1,6 @@
1
-%\VignetteIndexEntry{crlmm copy number Vignette for Illumina}
2
-%\VignetteDepends{crlmm}
3
-%\VignetteKeywords{crlmm, illumina}
1
+%\VignetteIndexEntry{IlluminaPreprocessCN Vignette for Illumina}
2
+%\VignetteDepends{crlmm, ff, cacheSweave}
3
+%\VignetteKeywords{crlmm, illumina, copy number, SNP}
4 4
 %\VignettePackage{crlmm}
5 5
 \documentclass{article}
6 6
 \usepackage{graphicx}
... ...
@@ -18,8 +18,7 @@
18 18
 \newcommand{\ff}{\Rpackage{ff}}
19 19
 
20 20
 \begin{document}
21
-\title{Using \crlmm{} for copy number estimation and genotype calling
22
-  with Illumina platforms}
21
+\title{Preprocessing and Genotyping Illumina Arrays for Copy Number Analysis}
23 22
 
24 23
 \date{\today}
25 24
 
... ...
@@ -28,15 +27,18 @@
28 27
 
29 28
 
30 29
 \begin{abstract}
30
+
31 31
   This vignette illustrates the steps required prior to copy number
32 32
   analysis for Infinium platforms.  Specifically, we require
33 33
   construction of a container to store processed forms of the raw
34 34
   data, preprocessing to normalize the arrays, and genotyping using
35 35
   the CRLMM algorithm.  After completing these steps, users can refer
36
-  to the copy number vignette.
36
+  to the \verb+copynumber+ vignette.
37
+
37 38
 \end{abstract}
38 39
 
39
-\section{Setup}
40
+
41
+\section{Set up}
40 42
 
41 43
 <<crlmm, results=hide, echo=FALSE>>=
42 44
 library(crlmm)
... ...
@@ -73,23 +75,15 @@ setCacheDir(outdir)
73 75
 @
74 76
 
75 77
 We declare that \crlmm{} should process 150,000 markers at a time
76
-(when possible) and/or 500 samples at a time.  As our example dataset
77
-in this vignette contains fewer than 500 samples, all samples will be
78
-processed simultaneously.
78
+and/or 500 samples at a time (when possible) to reduce the memory
79
+footprint.  As our example dataset in this vignette contains fewer
80
+than 500 samples, all samples will be processed simultaneously.
79 81
 
80 82
 <<ram>>=
81 83
 ocProbesets(150e3)
82 84
 ocSamples(500)
83 85
 @
84 86
 
85
-\textbf{Limitations:} There is no minimum number of samples required
86
-for preprocessing and genotyping.  However, for copy number estimation
87
-the \crlmm{} package currently requires at least 10 samples per batch.
88
-The parameter estimates for copy number and the corresponding
89
-estimates of raw copy number will tend to be more noisy for batches
90
-with small sample sizes (e.g., $<$ 50).  Chemistry plate or scan date
91
-are often useful surrogates for batch.
92
-
93 87
 \section{Initializing a container for storing processed data}
94 88
 
95 89
 This section will initialize a container for storing processed forms
... ...
@@ -111,13 +105,19 @@ For Infinium platforms, an Illumina sample sheet containing
111 105
 information for reading the raw IDAT files is required. Please refer
112 106
 to the BeadStudio Genotyping guide, Appendix A, for additional
113 107
 information.  The following code reads in the samplesheet for the IDAT
114
-files on our local server. For our dataset, the file extensions are
115
-`Grn.dat' and `Red.idat'.  We store the complete path to the filename
116
-without the file extension in the \Robject{arrayNames} and check that
117
-all of the green and red IDAT files exists.
108
+files on our local server.
118 109
 
119 110
 <<samplesheet>>=
120 111
 samplesheet = read.csv(file.path(datadir, "HumanHap370Duo_Sample_Map.csv"), header=TRUE, as.is=TRUE)
112
+@
113
+
114
+For the purposes of this vignette, we indicate that we only wish to
115
+process a subset of the arrays.  For our dataset, the file extensions
116
+are `Grn.dat' and `Red.idat'.  We store the complete path to the
117
+filename without the file extension in the \Robject{arrayNames} and
118
+check that all of the green and red IDAT files exists.
119
+
120
+<<subsetArrays>>=
121 121
 samplesheet <- samplesheet[-c(28:46,61:75,78:79), ]
122 122
 arrayNames <- file.path(datadir, unique(samplesheet[, "SentrixPosition"]))
123 123
 all(file.exists(paste(arrayNames, "_Grn.idat", sep="")))
... ...
@@ -125,9 +125,9 @@ all(file.exists(paste(arrayNames, "_Red.idat", sep="")))
125 125
 arrayInfo <- list(barcode=NULL, position="SentrixPosition")
126 126
 @
127 127
 
128
-As discussed previously, all supported platforms have a corresponding
129
-annotation package.  The appropriate annotation package is specified
130
-by the platform identifier without the \verb+Crlmm+ postfix.
128
+All supported platforms have a corresponding annotation package.  The
129
+appropriate annotation package is specified by the platform identifier
130
+without the \verb+Crlmm+ postfix.
131 131
 
132 132
 <<cdfname>>=
133 133
 cdfName <- "human370v1c"
... ...
@@ -164,20 +164,16 @@ print(cnSet)
164 164
 @
165 165
 
166 166
 Note that the above object does not yet contain any processed data
167
-(only \verb+NA+'s) and several \verb+.ff+ files now appear in the
168
-\verb+outdir+.  Again, these files should not be removed.
167
+(only \verb+NA+'s).  As the elements of the \verb+assayData+ slot are
168
+\Robject{ff} objects (not matrices), several \verb+.ff+ files now
169
+appear in the \verb+outdir+. The \verb+.ff+ files should not be
170
+removed and can be listed using the \R{} function
171
+\Rfunction{list.files}.  %For the most part, the \emph{appearance} that
172
+%the data is stored in memory is preserved.
169 173
 
170 174
 <<listff>>=
171
-list.files(outdir, pattern=".ff")[1:5]
172
-@
173
-
174
-Finally, note that the elements of the \verb+assayData+ slot are
175
-\Robject{ff} objects and not matrices. For the most part, the
176
-\emph{appearance} that the data is stored in memory is preserved.
177
-
178
-
179
-<<assaydataelements>>=
180 175
 sapply(assayData(cnSet), function(x) class(x)[1])
176
+list.files(outdir, pattern=".ff")[1:5]
181 177
 @
182 178
 
183 179
 \section{Preprocessing}
... ...
@@ -196,8 +192,8 @@ invisible(close(mixtureParams))
196 192
 @
197 193
 
198 194
 Note that the normalized intensities for the A and B alleles are no
199
-longer \verb+NA+s and can now be accessed and inspected using the
200
-methods \Rfunction{A} and \Rfunction{B}, respectively.
195
+longer \verb+NA+s and can be inspected using the methods \Rfunction{A}
196
+and \Rfunction{B}, respectively.
201 197
 
202 198
 <<intensities>>=
203 199
 invisible(open(A(cnSet)))
... ...
@@ -208,10 +204,8 @@ invisible(close(A(cnSet)))
208 204
 invisible(close(B(cnSet)))
209 205
 @
210 206
 
211
-
212 207
 \section{Genotyping}
213 208
 
214
-
215 209
 CRLMM genotype calls and confidence scores are estimated using the
216 210
 function \Rfunction{genotypeInf}.
217 211
 
... ...
@@ -220,12 +214,11 @@ updated <- genotypeInf(cnSet, mixtureParams=mixtureParams)
220 214
 print(updated)
221 215
 @
222 216
 
223
-The posterior probabilities for the
224
-genotype calls in the \verb+callProbability+ element of the
225
-\verb+assayData+ are stored as integers to reduce the file size on
226
-disk.  However, the scores can be easily transformed back to the
227
-probability scale using the \Rfunction{i2p} function as illustrated in
228
-the following code chunk.
217
+The posterior probabilities for the genotype calls in the
218
+\verb+callProbability+ element of the \verb+assayData+ are stored as
219
+integers to reduce the file size on disk. The scores can be
220
+transformed to the probability scale using the \Rfunction{i2p}
221
+function as illustrated in the following code chunk.
229 222
 
230 223
 <<callprobs>>=
231 224
 invisible(open(snpCallProbability(cnSet)))
... ...
@@ -234,23 +227,11 @@ i2p(callProbs)
234 227
 invisible(close(snpCallProbability(cnSet)))
235 228
 @
236 229
 
237
-%As described in the \texttt{copynumber} vignette, copy number
238
-%estimation in \crlmm{} works best when there are a sufficient number
239
-%of samples such that AA, AB, and BB genotypes are observed at most
240
-%loci. For small studies (e.g., fewer than 50 samples), there will be a
241
-%large number of SNPs that are monomorphic. For monomorphic SNPs, the
242
-%estimation problem becomes more difficult and alternative strategies
243
-%that estimate the relative total copy number may be preferable.  In
244
-%addition to installing \crlmm{}, one must also install the appropriate
245
-%annotation package for the Illumina platform.  In the following code,
246
-%we list the platforms for which annotation packages are currently
247
-%available. Next we create a directory where output files will be
248
-%stored and indicate the directory that contains the IDAT files that
249
-%will be used in our analysis.
250
-
251
-\textbf{Wrapper:} The construction of a \Rclass{CNSet} object,
252
-preprocessing, and genotype calling are wrapped into a convenience
253
-function called \Rfunction{genotype.Illumina}.
230
+\textbf{Wrapper:} As an alternative to calling the functions
231
+\Rfunction{constructInf}, \Rfunction{preprocessInf} and
232
+\Rfunction{genotypeInf} in sequence, a convenience function called
233
+\Rfunction{genotype.Illumina} is a wrapper for the above functions and
234
+produces identical results.
254 235
 
255 236
 <<genotype.Illumina,cache=TRUE,results=hide>>=
256 237
 cnSet2 <- genotype.Illumina(sampleSheet=samplesheet,
... ...
@@ -284,4 +265,27 @@ delete(cnSet2$SKW)
284 265
 rm(cnSet2)
285 266
 @
286 267
 
268
+Users may now proceed to the CopyNumber vignette for copy number
269
+analyses.
270
+
271
+
272
+
273
+\section{Session information}
274
+<<sessionInfo, results=tex>>=
275
+toLatex(sessionInfo())
276
+@
277
+
278
+
279
+<<>>=
280
+cnset.updated <- crlmmCopynumber(cnSet)
281
+@
282
+
283
+<<>>=
284
+snp.index <- which(chromosome(cnSet)==1)
285
+cnset2 <- cnSet[snp.index, ]
286
+trace(crlmmCopynumber, browser)
287
+object <- crlmmCopynumber(cnset2)
288
+## elements no longer ff
289
+@
290
+
287 291
 \end{document}
288 292
new file mode 100644
289 293
Binary files /dev/null and b/inst/scripts/IlluminaPreprocessCN.pdf differ
290 294
new file mode 100644
... ...
@@ -0,0 +1,540 @@
1
+%\VignetteIndexEntry{Infrastructure for crlmm copy number Vignette}
2
+%\VignetteDepends{crlmm, genomewidesnp6Crlmm}
3
+%\VignetteKeywords{crlmm, SNP 6}
4
+%\VignettePackage{crlmm}
5
+\documentclass{article}
6
+\usepackage{graphicx}
7
+\usepackage{natbib}
8
+\usepackage{url}
9
+\newcommand{\Rfunction}[1]{{\texttt{#1}}}
10
+\newcommand{\Rmethod}[1]{{\texttt{#1}}}
11
+\newcommand{\Rcode}[1]{{\texttt{#1}}}
12
+\newcommand{\Robject}[1]{{\texttt{#1}}}
13
+\newcommand{\Rpackage}[1]{{\textsf{#1}}}
14
+\newcommand{\Rclass}[1]{{\textit{#1}}}
15
+\newcommand{\oligo}{\Rpackage{oligo }}
16
+\newcommand{\R}{\textsf{R}}
17
+\newcommand{\crlmm}{\Rpackage{crlmm}}
18
+\newcommand{\ff}{\Rpackage{ff}}
19
+\usepackage[margin=1in]{geometry}
20
+
21
+\begin{document}
22
+\title{Infrastructure for copy number analysis in \crlmm{}}
23
+\date{\today}
24
+\author{Rob Scharpf}
25
+\maketitle
26
+
27
+
28
+\section{Set up}
29
+
30
+As in the previous vignettes, we load the required libraries and
31
+specify a path for storing output files.
32
+
33
+<<libraries,results=hide>>=
34
+library(ff)
35
+library(crlmm)
36
+library(cacheSweave)
37
+require(DNAcopy)
38
+require(VanillaICE)
39
+if(getRversion() < "2.13.0"){
40
+	rpath <- getRversion()
41
+} else rpath <- "trunk"
42
+outdir <- paste("/thumper/ctsa/snpmicroarray/rs/ProcessedData/crlmm/", rpath, "/copynumber_vignette", sep="")
43
+@
44
+
45
+<<ram>>=
46
+ldPath(outdir)
47
+setCacheDir(outdir)
48
+@
49
+
50
+We begin by loading the \Robject{cnSet} object created by the
51
+\verb+AffymetrixPreprocessCN+ vignette.
52
+
53
+<<loadcnset>>=
54
+if(!exists("cnSet")) load(file.path(outdir, "cnSet.rda"))
55
+@
56
+
57
+\section{Supported platforms}
58
+
59
+The supported Affymetrix and Infinium platforms are those for which a
60
+corresponding annotation package is available.  The annotation
61
+packages contain information on the markers, such as physical position
62
+and chromosome, as well as pre-computed parameters estimated from
63
+HapMap used during the preprocessing and genotyping steps. For
64
+Affymetrix, the 5.0 and 6.0 platforms are supported and the
65
+corresponding annotation packages are \Rpackage{genomewidesnp5Crlmm}
66
+and \Rpackage{genomewidesnp6Crlmm}.  Supported Infinium platforms are
67
+listed in the following code chunk.
68
+
69
+<<supportedPlatforms>>=
70
+pkgs <- annotationPackages()
71
+crlmm.pkgs <- pkgs[grep("Crlmm", pkgs)]
72
+crlmm.pkgs[grep("human", crlmm.pkgs)]
73
+@
74
+
75
+\textbf{Large data:} In order to reduce \crlmm{}'s memory footprint
76
+for copy number estimation, we require the \Rpackage{ff}.  The
77
+\Rpackage{ff} package provides infrastructure for accessing and
78
+writing data to disk instead of keeping data in memory.  As the
79
+functions for preprocessing, genotyping, and copy number estimation do
80
+not simultaneously require all samples and all probes in memory,
81
+memory-usage by \crlmm{} can be fine-tuned by reading in and
82
+processing subsets of the markers and/or samples. The functions
83
+\Rfunction{ocSamples} and \Rfunction{ocProbesets} in the
84
+\Rpackage{oligoClasses} package can be used to declare how many
85
+markers and samples to read at once.  In general, specifying smaller
86
+values should reduce the RAM required for a particular job.  In
87
+general, smaller values will increase the run-time. In the following
88
+code-chunk, we declare that \crlmm{} should process 150,000 markers at
89
+a time (when possible) and 500 samples at a time.  If our dataset
90
+contained fewer than 500 samples, the \Rfunction{ocSamples} option
91
+would not have any effect.  One can view the current settings for
92
+these commands, by typing the functions without an argument.
93
+
94
+<<ram>>=
95
+ocProbesets(50e3)
96
+ocSamples(200)
97
+@
98
+
99
+\section{The \Rclass{CNSet} container}
100
+
101
+The \Rfunction{show} method provides a concise summary of the
102
+\Robject{cnSet} object.
103
+
104
+<<show>>=
105
+cnSet
106
+@
107
+
108
+We briefly outline some of the unique aspects of the
109
+\Rclass{CNSet}-class using in \crlmm{} that may differ from the more
110
+standard extensions of the virtual class \Rclass{eSet} defined in the
111
+\Rpackage{Biobase} package.
112
+
113
+\paragraph{Instantiating a \Rclass{CNSet}:} An object of class
114
+\Rclass{CNSet} can be instantiated by one of two methods:
115
+\begin{enumerate}
116
+\item during the preprocessing of the raw intensities for Illumina and
117
+  Affymetrix arrays by the the functions \Rfunction{constructInf} and
118
+  \Rfunction{genotype}, respectively.
119
+
120
+\item by subsetting an existing \Robject{CNSet} object.  As per usual,
121
+  the `[' method can be used to extract a subset of markers $i$ as in
122
+  `[i, ]', a subset of samples $j$ as in `[, j]', or a subset of
123
+  markers $i$ and samples $j$ as in `[i, j]'.
124
+
125
+\end{enumerate}
126
+
127
+There are important differences in the underlying data representation
128
+depending on how the object was instantiated.  In particular, objects
129
+generated by the functions \Rfunction{constructInf} and
130
+\Rfunction{genotype} store high-dimensional data on disk rather than
131
+in memory through protocols defined in the \R{} package \ff{}.  For
132
+instance, the normalized intensities and genotype calls in a
133
+\Rclass{CNSet}-instance from approach (1) are \ff{}-derived objects.
134
+However, when such an object is subset by the `[' method, the data is
135
+pulled from disk to memory and stored as an ordinary matrix in \R{}.
136
+To illustrate, the \Robject{cnSet} loaded at the start of this session
137
+was created by the function \Rfunction{genotype}.  As the data is
138
+stored on disk rather than in memory, we inspect attributes of the
139
+object representing the normalized intensities for the A allele by
140
+first opening the file connection and then extracting the
141
+\ff{}-derived class of the object as well as where the data is stored
142
+on disk.
143
+
144
+<<approach1>>=
145
+invisible(open(cnSet))
146
+class(A(cnSet))
147
+filename(A(cnSet))
148
+@
149
+
150
+Using approach (2), we construct a new \Robject{CNSet} object.
151
+
152
+<<approach2>>=
153
+cnset.subset <- cnSet[1:5, 1:10]
154
+invisible(close(cnSet))
155
+class(A(cnset.subset))
156
+@
157
+
158
+The files with \verb+.ff+ extension should not be removed or
159
+relocated.  The safest way to move these files if necessary is to
160
+clone all of the \ff{} objects using the \Rfunction{clone}, followed
161
+by the \Rfunction{delete} function to remove the original files on
162
+disk. See the documentation in the \ff{} package for additional
163
+details.
164
+
165
+\paragraph{Order of operations:} For \Rclass{CNSet}-instances derived
166
+by approach (1), users should be mindful of the substantial I/O when
167
+using accessors to extract data from the class.  For example, the
168
+following 2 methods would extract identical results, with the latter
169
+being much more efficient (extra parentheses are added to the second
170
+operation to emphasize the order of operations):
171
+
172
+<<orderOfOperations>>=
173
+##inefficient
174
+A(cnSet[1:5, 1:5])
175
+## preferred
176
+(A(cnSet))[1:5, 1:5]
177
+@
178
+
179
+
180
+\textbf{\texttt{featureData}}
181
+
182
+Information on physical position, chromosome, and whether the marker
183
+is a SNP can be accessed through accessors defined for the
184
+\verb+featureData+.
185
+
186
+<<featuredataAccessors>>=
187
+fvarLabels(cnSet)
188
+position(cnSet)[1:10]
189
+chromosome(cnSet)[1:10]
190
+is.snp <- isSnp(cnSet)
191
+table(is.snp)
192
+snp.index <- which(is.snp)
193
+np.index <- which(!is.snp)
194
+chr1.index <- which(chromosome(cnSet) == 1)
195
+@
196
+
197
+\textbf{\texttt{assayData}}
198
+
199
+The \verb+assayData+ elements are of the class
200
+\Rclass{ff\_matrix}/\Rclass{ffdf} or \Rclass{matrix}, depending on how
201
+the \Rclass{CNSet} object was instantiated.  Elements in the
202
+\verb+assayData+ environment can be listed using the \Rfunction{ls}
203
+function.
204
+
205
+<<assayData>>=
206
+ls(assayData(cnSet))
207
+@
208
+
209
+The normalized intensities for the A and B alleles have names
210
+\verb+alleleA+ and \verb+alleleB+ and can be accessed by the methods
211
+\Rfunction{A} and \Rfunction{B}, respectively.  Genotype calls and
212
+confidence scores can be accessed by \Rfunction{snpCall} and
213
+\Rfunction{snpCallProbability}, respectively.  Note that the
214
+confidence scores are represented as integers to reduce the filesize,
215
+but can be tranlated to the probability scale using the \R{} function
216
+\Rfunction{i2p}. For example,
217
+
218
+<<i2p>>=
219
+scores <- as.matrix(snpCallProbability(cnSet)[1:5, 1:2])
220
+i2p(scores)
221
+@
222
+
223
+Note that for the Affymetrix 6.0 platform the assay data elements each
224
+have a row dimension corresponding to the total number of polymorphic
225
+and nonpolymorphic markers interrogated by the Affymetrix 6.0
226
+platform.  A consequence of keeping the rows of the assay data
227
+elements the same for all of the statistical summaries is that the
228
+matrix used to store genotype calls is larger than necessary.
229
+
230
+Note that NA's are stored in the slot for normalized 'B' allele
231
+intensities:
232
+
233
+<<B.NAs>>=
234
+np.index <- which(!is.snp)
235
+stopifnot(all(is.na(B(cnSet)[np.index, ])))
236
+@
237
+
238
+\textbf{\texttt{batch} and \texttt{batchStatistics}}
239
+
240
+As defined in Leek \textit{et al.} 2010, \textit{Batch effects are
241
+  sub-groups of measurements that have qualitatively different
242
+  behaviour across conditions and are unrelated to the biological or
243
+  scientific variables in a study}.  The \verb+batchStatistics+ slot
244
+is an environment used to store SNP- and batch-specific summaries,
245
+such as the sufficient statistics for the genotype clusters and the
246
+linear model parameters used for copy number estimation.  The
247
+\verb+batch+ slot is used to store the 'batch name' for each
248
+array. For small studies in which the samples were processed at
249
+similar times (e.g., within a month), all the samples can be
250
+considered to be in the same batch.  For large studies in which the
251
+samples were processed over several months, users should the scan date
252
+of the array or the chemistry plate are useful surrogates. The only
253
+constraint on the \verb+batch+ variable is that it must be a character
254
+vector that is the same length as the number of samples to be
255
+processed.  The \verb+batch+ is specified as an argument to the \R{}
256
+functions \Rfunction{constructInf} and \Rfunction{genotype} that
257
+instantiate \Rclass{CNSet} objects for the Illumina and Affymetrix
258
+platforms, respectively.  The \Rfunction{batch} function can be used
259
+to access the \verb+batch+ information on the samples as in the
260
+following example.
261
+
262
+<<batchAccessor>>=
263
+table(batch(cnSet))
264
+@
265
+
266
+For the \verb+batchStatistics+ slot, the elements in the environment
267
+have the class \Rclass{ff\_matrix}/\Rclass{ffdf} or \Rclass{matrix},
268
+depending on how the \Rclass{CNSet} object was instantiated.  The
269
+dimension of each element is the number of markers (SNPs +
270
+nonpolymorphic markers) $\times$ the number of batches. The names of
271
+the elements in the environment can be list using the \R{} function
272
+\Rfunction{ls}.
273
+
274
+<<batchStatistics>>=
275
+ls(batchStatistics(cnSet))
276
+@
277
+
278
+Currently, the batch-specific summaries are stored to allow some
279
+flexibility in the choice of downstream analyses of copy number and
280
+visual assessments of model fit.  Documentation for such applications
281
+will be expanded in future versions of \crlmm{}, and are currently not
282
+intended to be accessed directly by the user.
283
+
284
+%The \Robject{CNSet} class also contains the slot
285
+%\Robject{batchStatistics} that contains batch-specific summaries
286
+%needed for copy number estimation. In particular, each element is a
287
+%matrix (or an ff object) with R rows and C columns, correspoinding to
288
+%R markers and C batches.  The summaries includes the within genotype
289
+%cluster medians and median absolute deviations (mads), but also
290
+%parameters estimated from the linear model.  (For unobserved
291
+%genotypes, the medians are imputed and the variance is obtained the
292
+%median variance (across markers) within a batch. ) The elements of the
293
+%slot can be listed as follows.
294
+
295
+\textbf{\texttt{phenoData}}
296
+
297
+Sample-level summaries obtained during the preprocessing/genotyping
298
+steps include skew (SKW), the signal to noise ratio (SNR), and gender
299
+(1=male, 2=female) are stored in the \verb+phenoData+ slot.  As for
300
+other \Rclass{eSet} extensions, the \verb+$+ method can be used to
301
+extract these summaries.  For \Rclass{CNSet} objects generated by
302
+approach (1), these elements are of the class
303
+\Rclass{ff\_vector}.
304
+
305
+<<phenodataAccessors>>=
306
+varLabels(cnSet)
307
+class(cnSet$gender)
308
+invisible(open(cnSet$gender))
309
+cnSet$gender
310
+@
311
+
312
+The `[' methods without arguments can be used to coerce to a vector.
313
+
314
+<<vector>>=
315
+cnSet$gender[]
316
+invisible(close(cnSet$gender))
317
+@
318
+
319
+\textbf{\texttt{protocolData}}
320
+
321
+The scan date of the arrays are stored in the \verb+protocolData+.
322
+
323
+<<protocolData>>=
324
+varLabels(protocolData(cnSet))
325
+protocolData(cnSet)$ScanDate[1:10]
326
+@
327
+
328
+
329
+%\section{Suggested visualizations}
330
+%
331
+%\paragraph{SNR.}
332
+%
333
+%A histogram of the signal to noise ratio for the HapMap samples:
334
+%
335
+%<<plotSnr, fig=TRUE, include=FALSE>>=
336
+%open(cnSet$SNR)
337
+%hist(cnSet$SNR[, ], xlab="SNR", main="", breaks=25, col="lightblue", xlim=c(3, max(cnSet$SNR[])))
338
+%abline(v=5, lty=2, col="grey")
339
+%text(3,5, label="SNR range for low \n quality arrays", adj=0, col="grey40")
340
+%@
341
+%\begin{figure}
342
+%  \centering
343
+%  \includegraphics[width=0.6\textwidth]{copynumber-plotSnr}
344
+%  \caption{Signal to noise ratios for the HapMap samples. SNRs below 5
345
+%   for the Affymetrix platform are often samples of lower quality.
346
+%   Such samples will tend to have much more variable estimates of copy
347
+%   number.}
348
+%\end{figure}
349
+
350
+
351
+%\paragraph{One sample at a time: locus-level estimates}
352
+%
353
+%Figure \ref{fig:oneSample} plots physical position (horizontal axis)
354
+%versus copy number (vertical axis) for the first sample.  There is
355
+%less information to estimate copy number at nonpolymorphic loci;
356
+%improvements to the univariate prediction regions at nonpolymorphic
357
+%loci are a future area of research. If the \Rpackage{SNPchip} is
358
+%available, an idiogram can be added to the existing plotting
359
+%coordinates as indicated in the following example.
360
+%
361
+%<<oneSample, fig=TRUE, width=8, height=4, include=FALSE>>=
362
+%GT.CONF.THR <- 0.8
363
+%marker.index <- which(chromosome(cnSet) == 1)
364
+%cn <- totalCopynumber(cnSet, i=marker.index, j=1)
365
+%x <- position(cnSet)[marker.index]
366
+%par(las=1, mar=c(4, 5, 4, 2))
367
+%plot(x, cn, pch=".",
368
+%     cex=2, xaxt="n", col="grey60", ylim=c(0,6),
369
+%     ylab="copy number", xlab="physical position (Mb)",
370
+%     main=paste(sampleNames(cnSet)[1], ", CHR: 1"))
371
+%axis(1, at=pretty(x), labels=pretty(x)/1e6)
372
+%require(SNPchip)
373
+%invisible(plotCytoband(1, new=FALSE, cytoband.ycoords=c(5.5, 6), label.cytoband=FALSE))
374
+%@
375
+
376
+%\begin{figure}
377
+%  \includegraphics[width=0.9\textwidth]{copynumber-oneSample}
378
+%  \caption{\label{fig:oneSample} Total copy number (y-axis) for
379
+%    chromosome 1 plotted against physical position (x-axis) for one
380
+%    sample.  Estimates at nonpolymorphic loci are plotted in light
381
+%    blue.}
382
+%\end{figure}
383
+%
384
+%\clearpage
385
+%\paragraph{One SNP at a time}
386
+%
387
+%Scatterplots of the A and B allele intensities (log-scale) can be
388
+%useful for assessing the biallelic genotype calls.  This section of
389
+%the vignette is currently under development.
390
+
391
+\section{Trouble shooting}
392
+
393
+\subsection{Missing values}
394
+
395
+% There are several reasons for estimates of the allele-specific copy
396
+% number to have missing values (\texttt{NA}'s). This section briefly
397
+% elaborates on the source of missing values in the HapMap analysis
398
+% and discusses possible alternatives to reduce the number of missing
399
+% values.  Note that allele-specific copy number, 'CA' and 'CB', is
400
+% not saved in the \Robject{cnSet} object.  Rather, the respective
401
+% accessors calculate 'CA' and 'CB' on the fly from the normalized
402
+% intensities and from the marker-specific parameter estimates in the
403
+% linear model.  In general, a missing value arises when the
404
+% background or slope parameter was not estimated in the linear
405
+% model.
406
+Most often, missing values occur when the genotype confidence scores
407
+for a SNP were below the threshold used by the
408
+\Robject{crlmmCopynumber} function. For the HapMap analysis, we used a
409
+confidence threshold of 0.80 (the default).
410
+
411
+<<NA>>=
412
+GT.CONF.THR <- 0.80
413
+autosome.index <- which(isSnp(cnSet) & chromosome(cnSet) < 23)
414
+sample.index <- which(batch(cnSet) == "C")
415
+ca <- CA(cnSet, i=autosome.index, j=sample.index)
416
+cb <- CB(cnSet, i=autosome.index, j=sample.index)
417
+missing.ca <- is.na(ca)
418
+missing.cb <- is.na(cb)
419
+(nmissing.ca <- sum(missing.ca))
420
+(nmissing.cb <- sum(missing.cb))
421
+identical(nmissing.ca, nmissing.cb)
422
+@
423
+
424
+If \Robject{nmissing.ca} is nonzero, check the genotype confidence
425
+scores provided by the crlmm genotyping algorithm against the
426
+threshold specified in \Robject{crlmmCopynumber}.
427
+
428
+<<NAconfidenceScores>>=
429
+if(nmissing.ca > 0){
430
+	invisible(open(snpCallProbability(cnSet)))
431
+	gt.conf <- i2p(snpCallProbability(cnSet)[autosome.index, sample.index])
432
+	invisible(close(snpCallProbability(cnSet)))
433
+	below.thr <- gt.conf < GT.CONF.THR
434
+	index.allbelow <- as.integer(which(rowSums(below.thr) == length(sample.index)))
435
+	nmissingBecauseOfGtThr <- length(index.allbelow) * length(sample.index)
436
+	stopifnot(identical(nmissingBecauseOfGtThr, nmissing.ca))
437
+	## or calculate the proportion of missing effected by low crlmm confidence
438
+	length(index.allbelow) * length(sample.index)/nmissing.ca
439
+}
440
+@
441
+
442
+One could inspect the cluster plots for the 'low confidence' calls.
443
+
444
+<<clusterPlotsLowConfidence>>=
445
+## TODO
446
+@
447
+
448
+We repeat the above check for missing values at polymorphic loci on
449
+chromosome X.  In this case, we compare the \Robject{rowSums} of the
450
+missing values to the number of samples to check whether all of the
451
+estimates are missing for a given SNP.
452
+
453
+<<NAchromosomeX>>=
454
+X.index <- which(isSnp(cnSet) & chromosome(cnSet) == 23)
455
+ca.X <- CA(cnSet, i=X.index, j=sample.index)
456
+missing.caX <- is.na(ca.X)
457
+(nmissing.caX <- sum(missing.caX))
458
+missing.snp.index <- which(rowSums(missing.caX) == length(sample.index))
459
+index <- which(rowSums(missing.caX) == length(sample.index))
460
+length(index)*length(sample.index)/nmissing.caX
461
+@
462
+
463
+From the above codechunk, we see that \Sexpr{length(index)} SNPs have
464
+NAs for all the samples. Next, we tally the number of NAs for
465
+polymorphic markers on chromosome X that are below the confidence
466
+threshold.  For the HapMap analysis, all of the missing values arose
467
+from SNPs in which either the men or the women had confidence scores
468
+that were all below the threshold.
469
+
470
+<<NAcrlmmConfidenceScore>>=
471
+if(nmissing.caX > 0){
472
+	invisible(open(snpCallProbability(cnSet)))
473
+	gt.conf <- i2p(snpCallProbability(cnSet)[X.index, sample.index])
474
+	invisible(close(snpCallProbability(cnSet)))
475
+	below.thr <- gt.conf < GT.CONF.THR
476
+	F <- which(cnSet$gender[sample.index] == 2)
477
+	M <- which(cnSet$gender[sample.index] == 1)
478
+	##nus <- nuA(cnSet)[X.index, "C"]
479
+	index.allbelowF <- as.integer(which(rowSums(below.thr[, F]) == length(F)))
480
+	index.allbelowM <- as.integer(which(rowSums(below.thr[, M]) == length(M)))
481
+	index.allbelow <- as.integer(unique(c(index.allbelowF, index.allbelowM)))
482
+	##all.equal(index, index.allbelow)
483
+	## or calculate the proportion of missing effected by low crlmm confidence
484
+	sum(index.allbelow %in% index)/length(index)
485
+}
486
+@
487
+
488
+For nonpolymorphic loci, the genotype confidence scores are irrelevant
489
+and estimates are available at most markers.
490
+
491
+<<NAnonpolymorphic.autosome>>=
492
+np.index <- which(!isSnp(cnSet) & chromosome(cnSet)==23)
493
+ca.F <- CA(cnSet, i=np.index, j=F)
494
+ca.M <- CA(cnSet, i=np.index, j=M)
495
+## NAs for one marker
496
+ca.F <- ca.F[-match("CN_974939", rownames(ca.F)), ]
497
+ca.M <- ca.M[-match("CN_974939", rownames(ca.M)), ]
498
+sum(is.na(ca.F))
499
+sum(is.na(ca.M))
500
+@
501
+
502
+%TODO: marker CN\_974939 has NAs for the normalized intensities.  This
503
+%is because CN\_974939 is not in the \Robject{npProbesFid} file in
504
+%\Rpackage{genomewidesnp6Crlmm}.  The \Robject{npProbesFid} file should
505
+%be updated in the next \Rpackage{genomewidesnp6Crlmm} release.
506
+
507
+In total, there were \Sexpr{length(missing.snp.index)} polymorphic
508
+markers on chromosome X for which copy number estimates are not
509
+available.  Lowering the confidence threshold would permit estimation
510
+of copy number at most of these loci.  A confidence threshold is
511
+included as a parameter for the copy number estimation as an approach
512
+to reduce the sensitivity of genotype-specific summary statistics,
513
+such as the within-genotype median, to intensities from samples that
514
+do not clearly fall into one of the biallelic genotype clusters. There
515
+are drawbacks to this approach, including variance estimates that can
516
+be a bit optimistic at some loci.  More direct approaches for outlier
517
+detection and removal may be explored in the future.
518
+
519
+Copy number estimates for other chromosomes, such as mitochondrial and
520
+chromosome Y, are not currently available in \crlmm{}.
521
+
522
+<<echo=FALSE>>=
523
+invisible(close(cnSet))
524
+@
525
+
526
+
527
+\section{Session information}
528
+<<sessionInfo, results=tex>>=
529
+toLatex(sessionInfo())
530
+@
531
+
532
+%\section*{References}
533
+%
534
+%%\begin{bibliography}
535
+%  \bibliographystyle{plain}
536
+%  \bibliography{refs}
537
+%\end{bibliography}
538
+
539
+
540
+\end{document}
0 541
new file mode 100644
... ...
@@ -0,0 +1,136 @@
1
+%\VignetteIndexEntry{SmoothingRawCN copy number Vignette}
2
+%\VignetteDepends{crlmm, genomewidesnp6Crlmm}
3
+%\VignetteKeywords{crlmm, SNP 6}
4
+%\VignettePackage{crlmm}
5
+\documentclass{article}
6
+\usepackage{graphicx}
7
+\usepackage{natbib}
8
+\usepackage{url}
9
+\newcommand{\Rfunction}[1]{{\texttt{#1}}}
10
+\newcommand{\Rmethod}[1]{{\texttt{#1}}}
11
+\newcommand{\Rcode}[1]{{\texttt{#1}}}
12
+\newcommand{\Robject}[1]{{\texttt{#1}}}
13
+\newcommand{\Rpackage}[1]{{\textsf{#1}}}
14
+\newcommand{\Rclass}[1]{{\textit{#1}}}
15
+\newcommand{\oligo}{\Rpackage{oligo}}
16
+\newcommand{\DNAcopy}{\Rpackage{DNAcopy}}
17
+\newcommand{\VanillaICE}{\Rpackage{VanillaICE}}
18
+\newcommand{\R}{\textsf{R}}
19
+\newcommand{\crlmm}{\Rpackage{crlmm}}
20
+\usepackage[margin=1in]{geometry}
21
+
22
+\begin{document}
23
+\title{Smoothing raw copy number estimates}
24
+
25
+\date{\today}
26
+
27
+\author{Rob Scharpf}
28
+\maketitle
29
+
30
+<<setup, echo=FALSE, results=hide>>=
31
+options(prompt="R> ", continue=" ", width=70)
32
+@
33
+
34
+\begin{abstract}
35
+
36
+  This vignette describes how segmentation algorithms and hidden
37
+  Markov models implemented in the \R{} packages \DNAcopy{} and
38
+  \VanillaICE{} packages, respectively, can be interfaced with the
39
+  \crlmm{} raw copy number estimates.  This vignette assumes
40
+  successfull completion of the \verb+copynumber+ vignette.
41
+
42
+\end{abstract}
43
+
44
+\section{Set up}
45
+
46
+As in the previous vignettes, we load the required libraries and
47
+specify a path for storing output files.
48
+
49
+<<libraries,results=hide>>=
50
+library(ff)
51
+library(crlmm)
52
+library(cacheSweave)
53
+require(DNAcopy)
54
+require(VanillaICE)
55
+if(getRversion() < "2.13.0"){
56
+	rpath <- getRversion()
57
+} else rpath <- "trunk"
58
+outdir <- paste("/thumper/ctsa/snpmicroarray/rs/ProcessedData/crlmm/", rpath, "/copynumber_vignette", sep="")
59
+@
60
+
61
+<<ram>>=
62
+ldPath(outdir)
63
+setCacheDir(outdir)
64
+ocProbesets(50e3)
65
+ocSamples(200)
66
+@
67
+
68
+We begin by loading the \Robject{cnSet} object created by the
69
+\verb+AffymetrixPreprocessCN+ vignette.
70
+
71
+<<loadcnset>>=
72
+if(!exists("cnSet")) load(file.path(outdir, "cnSet.rda"))
73
+@
74
+
75
+\section{Interfacing with the \DNAcopy{} and \VanillaICE{} packages}
76
+
77
+*This section is incomplete.*
78
+
79
+As discussed in the \verb+copynumber+ vignette, we create an instance
80
+of \Rclass{oligoSnpSet} class by using the method \Rfunction{as} for
81
+subsets of the markers to keep the RAM at manageable levels (one can
82
+specify smaller values of ocProbesets() to further reduce the RAM).
83
+
84
+<<oligoset, eval=FALSE>>=
85
+marker.index <- which(!is.na(chromosome(cnSet)))
86
+marker.index <- marker.index[order(chromosome(cnSet)[marker.index], position(cnSet)[marker.index])]
87
+marker.indices <- splitIndicesByLength(marker.index, ocProbesets())
88
+cbs.results <- hmm.results <- vector("list", length(marker.indices))
89
+open(cnSet)
90
+##for(i in seq_along(marker.indices)){
91
+for(i in 1:5){
92
+	## pull data from disk for specified markers and all samples
93
+	cnset.subset <- cnSet[marker.indices[[i]], seq(length=ncol(cnSet))]
94
+	## coerce to an oligoSnpSet object with total copy number
95
+	system.time(oligoset <- as(cnset.subset, "oligoSnpSet"))
96
+	rm(cnset.subset); gc()
97
+	stopifnot(class(copyNumber(oligoset)) == "matrix")
98
+
99
+	CNA.object <- CNA(genomdat=copyNumber(oligoset),
100
+			  chrom=chromosome(oligoset),
101
+			  maploc=position(oligoset),
102
+			  data.type="logratio",
103
+			  sampleid=sampleNames(oligoset))
104
+	## - smooth single point outliers (slow)
105
+	smu.object <- smooth.CNA(CNA.object)
106
+	rm(CNA.object); gc()
107
+
108
+	## - run segmentation
109
+	cbs.results[[i]] <- segment(smu.object)
110
+
111
+	## update the copy number matrix for the HMM with outliers smoothed
112
+	sample.index <- (1:(ncol(smu.object)-2)) + 2
113
+	copyNumber(oligoset) <- as.matrix(smu.object[, sample.index])
114
+
115
+	##calculate robustSds and store in cnConfidence
116
+	sds <- robustSds(copyNumber(oligoset))
117
+	cnConfidence(oligoset) <- 1/sds
118
+	hmmOpts <- hmm.setup(oligoset,
119
+			     c("hom-del", "hem-del", "normal", "amp1copy", "amp2copy"),
120
+			     copynumberStates=c(0:4),
121
+			     normalIndex=3,
122
+			     log.initialP=rep(log(1/5), 5),
123
+			     prGenotypeHomozygous=c(0.8, 0.99, 0.7, 0.75, 0.75))
124
+	## fit the hmm
125
+	hmm.results[[i]] <- hmm(oligoset, hmmOpts, verbose=FALSE, TAUP=1e10)
126
+	rm(oligoset, sds, smu.object, CNA.object, hmmOpts); gc()
127
+}
128
+close(cnSet)
129
+save(hmm.results, file=file.path(outdir, "hmm.results.rda"))
130
+save(cbs.results, file=file.path(outdir, "cbs.results.rda"))
131
+@
132
+
133
+%\section{HMM in the \VanillaICE{} package}
134
+
135
+
136
+\end{document}
0 137
new file mode 100644
1 138
Binary files /dev/null and b/inst/scripts/SmoothingRawCN.pdf differ
... ...
@@ -6,6 +6,7 @@
6 6
 \usepackage{graphicx}
7 7
 \usepackage{natbib}
8 8
 \usepackage{url}
9
+\usepackage{amsmath,amssymb}
9 10
 \newcommand{\Rfunction}[1]{{\texttt{#1}}}
10 11
 \newcommand{\Rmethod}[1]{{\texttt{#1}}}
11 12
 \newcommand{\Rcode}[1]{{\texttt{#1}}}
... ...
@@ -18,7 +19,7 @@
18 19
 \usepackage[margin=1in]{geometry}
19 20
 
20 21
 \begin{document}
21
-\title{Copy number estimation and genotype calling with \Rpackage{crlmm}}
22
+\title{Copy number estimation with \Rpackage{crlmm}}
22 23
 \date{\today}
23 24
 \author{Rob Scharpf}
24 25
 \maketitle
... ...
@@ -27,178 +28,202 @@
27 28
 options(continue=" ", width=70)
28 29
 @
29 30
 
30
-%\section{Estimating copy number}
31
-
32
-%At present, software for copy number estimation is provided only for the
33
-%Affymetrix 6.0 platform.
34
-
35 31
 \begin{abstract}
36
-  This vignette estimates copy number for HapMap samples on the
37
-  Affymetrix 6.0 platform.  See \citep{Scharpf2010} for details
38
-  regarding the methodology implemented in \crlmm{}.  In addition, a
39
-  compendium describing copy number analysis using the \crlmm{}
40
-  package is available from the author's website:
32
+
33
+  Copy number routines in the \crlmm{} package are available for
34
+  Affymetrix 5.0 and 6.0 platforms, as well as several Illumina
35
+  platforms.  This vignette assumes that the arrays have already been
36
+  successfully preprocessed and genotyped as per the instructions in
37
+  the \verb+AffymetrixPreprocessCN+ and \verb+IlluminaPreprocessCN+
38
+  vignettes for the Affymetrix and Illumina platforms,
39
+  respectively. While this vignette uses Affymetrix 6.0 arrays for
40
+  illustration, the steps at this point are identical for both
41
+  platforms.  See \citep{Scharpf2010} for details regarding the
42
+  methodology implemented in \crlmm{} for copy number analysis.  In
43
+  addition, a compendium describing copy number analysis using the
44
+  \crlmm{} package is available from the author's website:
41 45
   \url{http://www.biostat.jhsph.edu/~rscharpf/crlmmCompendium/index.html}.
42 46
 
43 47
 \end{abstract}
44 48
 
45
-\section{Copy number estimation}
46
-
47
-\subsection{Set up}
49
+\section{Set up}
48 50
 
49
-<<cdfname>>=
51
+<<libraries,results=hide>>=
52
+library(ff)
50 53
 library(crlmm)
54
+library(lattice)
51 55
 library(cacheSweave)
52
-@
53
-
54
-Several genotyping platforms are currently supported.  Supported
55
-platforms must have a corresponding annotation package (installed
56
-separately) that are listed below.
57
-
58
-<<annotationPackages>>=
59
-pkgs <- annotationPackages()
60
-pkgs <- pkgs[grep("Crlmm", pkgs)]
61
-pkgs
62
-@
63
-
64
-Note that 'pd.genomewidesnp6' and 'genomewidesnp6Crlmm' are both
65
-annotation packages for the Affymetrix 6.0 platform available on
66
-Bioconductor, but the 'genomewidesnp6Crlmm' annotation package must be
67
-used for copy number estimation.  The annotation package is specified
68
-through the 'cdfName' -- the identifier without the 'Crlmm' postfix.
69
-In the following code, we specify the cdf name for Affymetrix 6.0,
70
-provide the complete path to the CEL files, and indicate where
71
-intermediate files from the copy number estimation are to be saved.
72
-
73
-<<setup>>=
74
-cdfName <- "genomewidesnp6"
75
-pathToCels <- "/thumper/ctsa/snpmicroarray/hapmap/raw/affy/1m"
76 56
 if(getRversion() < "2.13.0"){
77 57
 	rpath <- getRversion()
78 58
 } else rpath <- "trunk"
79 59
 outdir <- paste("/thumper/ctsa/snpmicroarray/rs/ProcessedData/crlmm/", rpath, "/copynumber_vignette", sep="")
80
-dir.create(outdir, recursive=TRUE, showWarnings=FALSE)
60
+@
61
+
62
+<<ram>>=
63
+ldPath(outdir)
81 64
 setCacheDir(outdir)
65
+ocProbesets(150e3)
66
+ocSamples(200)
82 67
 @
83 68
 
84
-All long computations are saved in the output directory
85
-\Robject{outdir}.  Users should change these variables as appropriate.
86
-The following code chunk should fail unless the above arguments have
87
-been set appropriately.
69
+We begin by loading the \Robject{cnSet} object created by the
70
+\verb+AffymetrixPreprocessCN+ vignette.
88 71
 
89
-<<checkSetup>>=
90
-if(!file.exists(outdir)) stop("Please specify a valid directory for storing output")
91
-if(!file.exists(pathToCels)) stop("Please specify the correct path to the CEL files")
72
+<<loadcnset>>=
73
+if(!exists("cnSet")) load(file.path(outdir, "cnSet.rda"))
92 74
 @
93 75
 
94
-Processed data from codechunks requiring long computations are saved
95
-to disk by wrapping function calls in the \Robject{checkExists}
96
-function.  After running this vignette as a batch job, subsequent
97
-calls to \verb@Sweave@ will load the saved computations from disk. See
98
-the \Rfunction{checkExists} help file for additional details.
99
-
100
-\subsection{Preprocessing and genotyping.}
101
-
102
-In the following code chunk, we provide the complete path to the
103
-Affymetrix CEL files and define a 'batch' variable. The
104
-\Robject{batch} variable will be used to initialize a container for
105
-storing the normalized intensities, the genotype calls, and the
106
-parameter estimates for copy number.  Often the chemistry plate or the
107
-scan date of the array is a useful surrogate for batch. For the HapMap
108
-CEL files in our analysis, the CEPH (C) and Yoruban (Y) samples were
109
-prepared on separate chemistry plates.  In the following code chunk,
110
-we extract the population identifier from the CEL file names and
111
-assign these identifiers to the variable \Robject{batch}.
112
-
113
-<<celfiles>>=
114
-celFiles <- list.celfiles(pathToCels, full.names=TRUE, pattern=".CEL")
115
-celFiles <- celFiles[substr(basename(celFiles), 13, 13) %in% c("C", "Y")]
116
-plates <- substr(basename(celFiles), 13, 13)
117
-@
118 76
 
119
-The preprocessing steps for copy number estimation includes quantile
120
-normalization of the raw intensities for each probe and a step that
121
-summarizes the intensities of multiple probes at a single locus.  For
122
-example, the Affymetrix 6.0 platform has 3 or 4 identical probes at
123
-each polymorphic locus and the normalized intensities are summarized
124
-by a median.  For the nonpolymorphic markers on Affymetrix 6.0, only
125
-one probe per locus is available and the summarization step is not
126
-needed.
127
-
128
-After preprocessing the arrays, the \crlmm{} package estimates the
129
-genotype and provides a confidence score at each polymorphic locus.
130
-Unless the dataset is small (e.g., fewer than 50 samples), we suggest
131
-installing and loading the \R{} package \Rpackage{ff} to reduce the
132
-RAM required for preprocessing and genotyping.  Loading the
133
-\Rpackage{ff} package at this point will automatically enable large
134
-data support (LDS).
135
-
136
-The function \Rfunction{genotype} checks to see whether the
137
-\Rpackage{ff} is loaded.  If loaded, the normalized intensities and
138
-genotype are stored as \Robject{ff} objects on disk.  Otherwise, the
139
-genotypes and normalized intensities are stored in matrices.  A word
140
-of caution: the \Rfunction{genotype} function without \Rpackage{ff}
141
-requires a potentially large amount of RAM.  A more RAM-friendly
142
-approach to preprocessing and genotyping requires the \Rpackage{ff}
143
-package.  In particular, the functions \Rfunction{ocProbesets} and
144
-\Rfunction{ocSamples} can be used to manage how many probesets and
145
-samples are to processed at a time and can therefore be used to fine
146
-tune the needed RAM for a particular job.  The function
147
-\Rfunction{ldPath} indicates that \Robject{ff} objects will be stored
148
-in the directory \Robject{outdir}.
149
-
150
-<<ff>>=
151
-library(ff)
152
-ldPath(outdir)
153
-ocProbesets(100000)
154
-ocSamples(200)
155
-@
77
+\textbf{Limitations:} While a minimum number of samples is not
78
+required for preprocessing and genotyping, copy number estimation in
79
+the \crlmm{} package currently requires at least 10 samples per batch.
80
+The parameter estimates for copy number and the corresponding
81
+estimates of raw copy number will tend to be more noisy for batches
82
+with small sample sizes (e.g., $<$ 50).  Chemistry plate or scan date
83
+are often useful surrogates for batch.
156 84
 
157
-With LDS enabled, we preprocess and genotype 180 samples from the CEPH
158
-and Yoruban populations.  Users interested only in the genotypes
159
-should instead use the \R{} function \Rfunction{crlmm} or
160
-\Rfunction{crlmm2}. We wrap the call to \Rfunction{genotype} in
161
-\Rfunction{checkExists} so that subsequent calls to \verb@Sweave@ can
162
-be run interactively.
85
+\section{Quality control}
163 86
 
87
+The signal to noise ratio (SNR) estimated by the CRLMM genotyping
88
+algorithm is an overall measure of the separation of the diallelic
89
+genotype clusters at polymorphic loci and can be a useful measure of
90
+array quality.  Small SNR values can indicate possible problems with
91
+the DNA.  Depending on the size of the dataset and the number of
92
+samples with low SNR, users may wish to rerun the preprocessing and
93
+genotyping steps after excluding samples with low SNR.  The SNR is
94
+stored in the \verb+phenoData+ slot of the \Robject{CNSet} object and
95
+is available after preprocessing and genotyping. SNR values below 5
96
+for Affymetrix or below 25 for Illumina may indicate poor sample
97
+quality.  The following code chunk makes a histogram of the SNR values
98
+for the HapMap samples.
164 99
 
165
-<<LDS_genotype, cache=TRUE>>=
166
-cnSet <- genotype(celFiles, batch=plates, cdfName=cdfName)
100
+<<snr,fig=TRUE,include=FALSE,width=6, height=4>>=
101
+open(cnSet$SNR)
102
+snr <- cnSet$SNR[]
103
+close(cnSet$SNR)
104
+print(histogram(~snr, panel=function(...){
105
+	panel.histogram(...)
106
+	panel.abline(v=5, col="grey",lty=2)
107
+},
108
+		breaks=25,xlim=c(4.5,10), xlab="SNR"))
167 109
 @
110
+\begin{figure}[t!]
111
+  \begin{center}
112
+  \includegraphics[width=0.6\textwidth]{copynumber-snr.pdf}
113
+  \caption{The signal to noise ratio (SNR) for 180 HapMap samples. For
114
+    Affymetrix platforms, SNR values below 5 can indicate possible
115
+    problems with sample quality.  In some circumstances, it may be
116
+    more helpful to exclude samples with poor DNA quality.}
117
+\end{center}
118
+\end{figure}
168 119
 
169
-The value returned by genotype is an instance of the class
170
-\Robject{CNSet}.  In addition to the normalization and genotyping, the
171
-\Robject{genotype} function initializes a container that will store
172
-summary statistics for the batches and parameters needed for copy
173
-number estimation.  At this point, the batch summaries and parameters
174
-for copy number are all NA's.
175 120
 
176
-\subsection{Copy number estimation.}
121
+\section{Copy number estimation}
122
+
123
+As described in \cite{Scharpf2010}, the CRLMM-CopyNumber algorithm
124
+fits a linear model to the normalized intensities stratified by the
125
+diallic genotype call.  The intercept and slope from the linear model
126
+are both SNP- and batch-specific.  The implementation in the \crlmm{}
127
+package is encapsulated by the function \Rfunction{crlmmCopynumber}
128
+that, using the default settings, can be called by passing a single
129
+object of class \code{CNSet}.  See the appropriate
130
+preprocessing/genotyping vignette for the construction of an object of
131
+class \Rclass{CNSet}.
132
+<<LDS_copynumber,cache=TRUE>>=
133
+(cnSet.updated <- crlmmCopynumber(cnSet))
134
+@
177 135
 
178
-The \Rfunction{crlmmCopynumber} performs the following steps:
136
+The following steps were performed by the \Rfunction{crlmmCopynumber}
137
+function:
179 138
 \begin{itemize}
180
-\item computes summary statistics for each batch
181
-\item imputes unobserved genotype centers (for each batch)
182
-\item shrinks the within-genotype variances
183
-\item estimates parameters for allele-specific copy number
139
+\item sufficient statistics for the genotype clusters for
140
+  each batch
141
+\item unobserved genotype centers imputed
142
+\item posterior summaries of sufficient statistics
143
+\item intercept and slope for linear model
184 144
 \end{itemize}
185
-
186
-  With \texttt{verbose=TRUE}, the above steps for CN estimation are
187
-  displayed during the processing.
145
+Depending on the value of \verb+ocProbesets()+, these summaries are
146
+computed for subsets of the markers to reduce the required RAM. Note
147
+that the value returned by the \Rfunction{crlmmCopynumber} function in
148
+the above example is \verb+TRUE+.  The reason the function returns
149
+\verb+TRUE+ in the above example is that the elements of the
150
+\verb+batchStatistics+ slot have the class \Rclass{ff_matrix}.  Rather
151
+than keep the statistical summaries in memory, the summaries are
152
+written to files on disk using protocols described in the
153
+\Rpackage{ff} package. Hence, while the \Robject{cnSet} object itself
154
+is unchanged as a result of the \Rfunction{crlmmCopynumber} function,
155
+the data on disk is updated accordingly.  Users that are interested in
156
+accessing these low-level summaries can refer to the
157
+\verb+Infrastructure+ vignette.  Computation of the raw copy number
158
+estimates for each allele is described in the following section.
159
+
160
+For users that are interested in the analysis of a specific chromosome
161
+(subset of markers) or a s
162
+
163
+pointers to files on disk, are stored in the \verb+batchStatistics+
164
+slot of the class \Rclass{CNSet}.  Using the default settings for the
165
+\Rfunction{crlmmCopynumber} function, only an object of class
166
+\Rclass{CNSet} is required.  %(See \verb+AffyPreprocessCN+ or
167
+%\verb+IlluminaPreprocessCN+ vignettes for details.)
168
+
169
+Note that depends on whether the elements of the \verb+batchStatistics+
170
+slot are \Robject{ff} objects or ordinary matrices.  In this example,
171
+the elements of \verb+batchStatistics+ have the class
172
+\Rclass{ff_matrix}.
188 173
 
189 174
 <<LDS_copynumber,cache=TRUE>>=
175
+nms <- ls(batchStatistics(cnSet))
176
+cls <- rep(NA, length(nms))
177
+for(i in seq_along(nms)) cls[i] <- class(batchStatistics(cnSet)[[nms[i]]])[1]
178
+all(cls == "ff_matrix")
179
+@
180
+
181
+The batch-specific statistical summaries computed by