Browse code

Rewrote illumina_copynumber vignette. Add functions and docmentation for constructInf, preprocessInf, and genotypeInf.

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@54164 bc3139a8-67e5-0310-9ffc-ced21a209358

Rob Scharp authored on 30/03/2011 02:40:07
Showing 14 changed files

... ...
@@ -1,5 +1,7 @@
1 1
 .auto*
2 2
 test/*
3
+*.o
4
+*.so
3 5
 myCrlmmGT2.Rnw
4 6
 ffobjs*
5 7
 auto*
... ...
@@ -1,7 +1,7 @@
1 1
 Package: crlmm
2 2
 Type: Package
3 3
 Title: Genotype Calling (CRLMM) and Copy Number Analysis tool for Affymetrix SNP 5.0 and 6.0 and Illumina arrays.
4
-Version: 1.9.19
4
+Version: 1.9.20
5 5
 Date: 2010-12-10
6 6
 Author: Benilton S Carvalho <Benilton.Carvalho@cancer.org.uk>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.edu.au>, Ingo Ruczinski <iruczins@jhsph.edu>, Rafael A Irizarry
7 7
 Maintainer: Benilton S Carvalho <Benilton.Carvalho@cancer.org.uk>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.EDU.AU>
... ...
@@ -70,4 +70,4 @@ export(crlmm,
70 70
 export(constructIlluminaCNSet)
71 71
 export(totalCopynumber, rawCopynumber)
72 72
 exportMethods(A, B, nuA, nuB, phiA, phiB, corr, tau2, Ns, medians, mads)
73
-
73
+export(constructInf, preprocessInf, genotypeInf)
... ...
@@ -100,21 +100,21 @@ construct <- function(filenames,
100 100
 }
101 101
 
102 102
 genotype <- function(filenames,
103
-		       cdfName,
104
-		       batch,
105
-		       mixtureSampleSize=10^5,
106
-		       eps=0.1,
107
-		       verbose=TRUE,
108
-		       seed=1,
109
-		       sns,
110
-		       probs=rep(1/3, 3),
111
-		       DF=6,
112
-		       SNRMin=5,
113
-		       recallMin=10,
114
-		       recallRegMin=1000,
103
+		     cdfName,
104
+		     batch,
105
+		     mixtureSampleSize=10^5,
106
+		     eps=0.1,
107
+		     verbose=TRUE,
108
+		     seed=1,
109
+		     sns,
110
+		     probs=rep(1/3, 3),
111
+		     DF=6,
112
+		     SNRMin=5,
113
+		     recallMin=10,
114
+		     recallRegMin=1000,
115 115
 		     gender=NULL,
116
-		       returnParams=TRUE,
117
-		       badSNP=0.7){
116
+		     returnParams=TRUE,
117
+		     badSNP=0.7){
118 118
 	is.lds <- ifelse(isPackageLoaded("ff"), TRUE, FALSE)
119 119
 	if(!is.lds) stop("this function now requires that the ff package be loaded")
120 120
 	if(missing(cdfName)) stop("must specify cdfName")
... ...
@@ -498,6 +498,7 @@ fit.lm1 <- function(strata,
498 498
 		    MIN.PHI,
499 499
 		    verbose, is.lds,
500 500
 		    CHR.X, ...){
501
+	open(object$gender)
501 502
 	if(is.lds) {physical <- get("physical"); open(object)}
502 503
 	if(verbose) message("      Probe stratum ", strata, " of ", length(index.list))
503 504
 	snps <- index.list[[strata]]
... ...
@@ -606,6 +607,7 @@ fit.lm2 <- function(strata,
606 607
 		    MIN.PHI,
607 608
 		    verbose, is.lds, CHR.X, ...){
608 609
 	if(is.lds) {physical <- get("physical"); open(object)}
610
+	open(object$gender)
609 611
 	if(verbose) message("      Probe stratum ", strata, " of ", length(index.list))
610 612
 	marker.index <- index.list[[strata]]
611 613
 	batches <- split(seq_along(batch(object)), as.character(batch(object)))
... ...
@@ -663,7 +665,9 @@ summarizeMaleXNps <- function(marker.index,
663 665
 	nr <- length(marker.index)
664 666
 	nc <- length(batchNames(object))
665 667
 	NN.Mlist <- imputed.medianA <- imputed.medianB <- shrink.madA <- shrink.madB <- vector("list", nc)
666
-	gender <- object$gender
668
+	open(object$gender)
669
+	gender <- object$gender[]
670
+	open(A(object))
667 671
 	AA <- as.matrix(A(object)[marker.index, gender==1])
668 672
 	madA.AA <- medianA.AA <- matrix(NA, nr, nc)
669 673
 	colnames(madA.AA) <- colnames(medianA.AA) <- batchNames(object)
... ...
@@ -697,10 +701,11 @@ summarizeXGenotypes <- function(marker.index,
697 701
 				DF.PRIOR,
698 702
 				gender="male", ...){
699 703
 	I <- unlist(batches)
704
+	open(object$gender)
700 705
 	if(gender == "male"){
701
-		gender.index <- which(object$gender == 1)
706
+		gender.index <- which(object$gender[] == 1)
702 707
 	} else {
703
-		gender.index <- which(object$gender == 2)
708
+		gender.index <- which(object$gender[] == 2)
704 709
 	}
705 710
 	batches <- lapply(batches, function(x, gender.index) intersect(x, gender.index), gender.index)
706 711
 	batch.names <- names(batches)
... ...
@@ -709,6 +714,10 @@ summarizeXGenotypes <- function(marker.index,
709 714
 	nc <- length(batch.index)
710 715
 	NN.Mlist <- medianA <- medianB <- shrink.madA <- shrink.madB <- vector("list", nc)
711 716
 	names(NN.Mlist) <- names(medianA) <- names(medianB) <- names(shrink.madA) <- names(shrink.madB) <- batch.names
717
+	open(calls(object))
718
+	open(snpCallProbability(object))
719
+	open(A(object))
720
+	open(B(object))
712 721
 	GG <- as.matrix(calls(object)[marker.index, ])
713 722
 	CP <- as.matrix(snpCallProbability(object)[marker.index, ])
714 723
 	AA <- as.matrix(A(object)[marker.index, ])
... ...
@@ -819,6 +828,10 @@ summarizeXGenotypes <- function(marker.index,
819 828
 		medianA[[k]] <- res[[1]]
820 829
 		medianB[[k]] <- res[[2]]
821 830
 	}
831
+	close(calls(object))
832
+	close(snpCallProbability(object))
833
+	close(A(object))
834
+	close(B(object))
822 835
 	return(list(madA=shrink.madA,
823 836
 		    madB=shrink.madB,
824 837
 		    NN=NN.Mlist,
... ...
@@ -841,7 +854,8 @@ fit.lm3 <- function(strata,
841 854
 		    verbose, is.lds, CHR.X, ...){
842 855
 	##if(is.lds) {physical <- get("physical"); open(object)}
843 856
 	if(verbose) message("      Probe stratum ", strata, " of ", length(index.list))
844
-	gender <- object$gender
857
+	open(object$gender)
858
+	gender <- object$gender[]
845 859
 	enough.males <- sum(gender==1) >= MIN.SAMPLES
846 860
 	enough.females <- sum(gender==2) >= MIN.SAMPLES
847 861
 	if(!enough.males & !enough.females){
... ...
@@ -1011,7 +1025,8 @@ fit.lm4 <- function(strata,
1011 1025
 		    verbose, is.lds=TRUE, ...){
1012 1026
 	##if(is.lds) {physical <- get("physical"); open(object)}
1013 1027
 	## exclude batches that have fewer than MIN.SAMPLES
1014
-	gender <- object$gender
1028
+	open(object$gender)
1029
+	gender <- object$gender[]
1015 1030
 	enough.males <- sum(gender==1) >= MIN.SAMPLES
1016 1031
 	enough.females <- sum(gender==2) >= MIN.SAMPLES
1017 1032
 	if(!enough.males & !enough.females){
... ...
@@ -1434,7 +1449,7 @@ shrinkSummary <- function(object,
1434 1449
 			  DF.PRIOR=50,
1435 1450
 			  verbose=TRUE,
1436 1451
 			  marker.index,
1437
-			  is.lds){
1452
+			  is.lds=TRUE){
1438 1453
 	stopifnot(type[[1]] == "SNP")
1439 1454
 	CHR.X <- FALSE ## this is no longer needed
1440 1455
 	if(missing(marker.index)){
... ...
@@ -1469,7 +1484,7 @@ shrinkSummary <- function(object,
1469 1484
 			      DF.PRIOR=DF.PRIOR,
1470 1485
 			      is.lds=is.lds)
1471 1486
 	}
1472
-	return(object)
1487
+	TRUE
1473 1488
 }
1474 1489
 
1475 1490
 genotypeSummary <- function(object,
... ...
@@ -1522,7 +1537,7 @@ genotypeSummary <- function(object,
1522 1537
 			      is.lds=is.lds,
1523 1538
 			      CHR.X=CHR.X)
1524 1539
 	}
1525
-	return(object)
1540
+	TRUE
1526 1541
 }
1527 1542
 
1528 1543
 whichMarkers <- function(type, is.snp, is.autosome, is.annotated, is.X){
... ...
@@ -1552,6 +1567,8 @@ summarizeNps <- function(strata, index.list, object, batchSize,
1552 1567
 	nc <- length(batchnames)
1553 1568
 	N.AA <- medianA.AA <- madA.AA <- tau2A.AA <- matrix(NA, nr, nc)
1554 1569
 	is.illumina <- whichPlatform(annotation(object)) == "illumina"
1570
+	open(A(object))
1571
+	open(object$gender)
1555 1572
 	AA <- as.matrix(A(object)[index, ])
1556 1573
 	if(is.illumina){
1557 1574
 		BB <- as.matrix(B(object)[index, ])
... ...
@@ -1593,7 +1610,6 @@ summarizeSnps <- function(strata,
1593 1610
 ##		physical <- get("physical")
1594 1611
 ##		open(object)
1595 1612
 ##	}
1596
-	open(object)
1597 1613
 	if(verbose) message("      Probe stratum ", strata, " of ", length(index.list))
1598 1614
 	index <- index.list[[strata]]
1599 1615
 	batches <- split(seq_along(batch(object)), as.character(batch(object)))
... ...
@@ -1789,6 +1805,7 @@ crlmmCopynumber <- function(object,
1789 1805
 	is.lds <- is(calls(object), "ffdf") | is(calls(object), "ff_matrix")
1790 1806
 	if(is.lds) stopifnot(isPackageLoaded("ff"))
1791 1807
 	samplesPerBatch <- table(as.character(batch(object)))
1808
+	open(object)
1792 1809
 	if(any(samplesPerBatch < MIN.SAMPLES)){
1793 1810
 		tmp <- paste(names(samplesPerBatch)[samplesPerBatch < MIN.SAMPLES], collapse=", ")
1794 1811
 		message("The following batches have fewer than ", MIN.SAMPLES, " samples: ",  tmp)
... ...
@@ -1810,25 +1827,25 @@ crlmmCopynumber <- function(object,
1810 1827
 		##if(verbose) message(paste("Computing summary statistics for ", mylabel(marker.type), " genotype clusters for each batch")
1811 1828
 		marker.index <- whichMarkers(marker.type, is.snp,
1812 1829
 					     is.autosome, is.annotated, is.X)
1813
-		object <- genotypeSummary(object=object,
1814
-					  GT.CONF.THR=GT.CONF.THR,
1815
-					  type=marker.type,
1816
-					  verbose=verbose,
1817
-					  marker.index=marker.index,
1818
-					  is.lds=is.lds)
1830
+		genotypeSummary(object=object,
1831
+				GT.CONF.THR=GT.CONF.THR,
1832
+				type=marker.type,
1833
+				verbose=verbose,
1834
+				marker.index=marker.index,
1835
+				is.lds=is.lds)
1819 1836
 	}
1820 1837
 	if(verbose) message("Imputing unobserved genotype medians and shrinking the variances (within-batch, across loci) ")##SNPs only
1821 1838
 	if("SNP" %in% type){
1822 1839
 		marker.index <- whichMarkers("SNP", is.snp,
1823 1840
 					     is.autosome, is.annotated, is.X)
1824
-		object <- shrinkSummary(object=object,
1825
-					MIN.OBS=MIN.OBS,
1826
-					MIN.SAMPLES=MIN.SAMPLES,
1827
-					DF.PRIOR=DF.PRIOR,
1828
-					type="SNP",
1829
-					verbose=verbose,
1830
-					marker.index=marker.index,
1831
-					is.lds=is.lds)
1841
+		shrinkSummary(object=object,
1842
+			      MIN.OBS=MIN.OBS,
1843
+			      MIN.SAMPLES=MIN.SAMPLES,
1844
+			      DF.PRIOR=DF.PRIOR,
1845
+			      type="SNP",
1846
+			      verbose=verbose,
1847
+			      marker.index=marker.index,
1848
+			      is.lds=is.lds)
1832 1849
 	}
1833 1850
 	if(verbose) message("Estimating parameters for copy number")##SNPs only
1834 1851
 	for(i in seq_along(type)){
... ...
@@ -1852,6 +1869,7 @@ crlmmCopynumber <- function(object,
1852 1869
 					       is.lds=is.lds,
1853 1870
 					       CHR.X=CHR.X)
1854 1871
 	}
1872
+	close(object)
1855 1873
 	TRUE
1856 1874
 }
1857 1875
 crlmmCopynumber2 <- function(){
... ...
@@ -1008,16 +1008,14 @@ getProtocolData.Illumina = function(filenames, sep="_", fileExt="Grn.idat", verb
1008 1008
 }
1009 1009
 
1010 1010
 
1011
-construct.Illumina <- function(sampleSheet=NULL,
1011
+constructInf <- function(sampleSheet=NULL,
1012 1012
 			       arrayNames=NULL,
1013
-			       ids=NULL,
1014 1013
 			       path=".",
1015 1014
 			       arrayInfoColNames=list(barcode="SentrixBarcode_A", position="SentrixPosition_A"),
1016 1015
 			       highDensity=FALSE,
1017 1016
 			       sep="_",
1018 1017
 			       fileExt=list(green="Grn.idat", red="Red.idat"),
1019 1018
 			       cdfName,
1020
-			       copynumber=TRUE,
1021 1019
 			       verbose=FALSE,
1022 1020
 			       batch, #fns,
1023 1021
 			       saveDate=TRUE) { #, outdir="."){
... ...
@@ -1082,14 +1080,8 @@ construct.Illumina <- function(sampleSheet=NULL,
1082 1080
 	if(!all(file.exists(redidats))) stop("Missing some of the *Red.idat files")
1083 1081
 
1084 1082
 	if(verbose) message("Initializing container for genotyping and copy number estimation")
1085
-	featureData = getFeatureData(cdfName, copynumber=copynumber)
1086
-#	if(!missing(fns)){
1087
-#		index = match(fns, featureNames(featureData))
1088
-#		if(all(is.na(index))) stop("fns not in featureNames")
1089
-#		featureData = featureData[index, ]
1090
-#	}
1083
+	featureData = getFeatureData(cdfName, copynumber=TRUE)
1091 1084
 	nr = nrow(featureData); nc = narrays
1092
-#        ldPath(outdir)
1093 1085
 	cnSet <- new("CNSet",
1094 1086
 		     alleleA=initializeBigMatrix(name="A", nr, nc),
1095 1087
 		     alleleB=initializeBigMatrix(name="B", nr, nc),
... ...
@@ -1114,13 +1106,12 @@ construct.Illumina <- function(sampleSheet=NULL,
1114 1106
 	cnSet$SNR = initializeBigVector("crlmmSNR-", ncol(cnSet), "double")
1115 1107
 	cnSet$SKW = initializeBigVector("crlmmSKW-", ncol(cnSet), "double")
1116 1108
 	sampleNames(cnSet) <- basename(sampleNames(cnSet))
1117
-	##pd = data.frame(matrix(NA, nc, 3), row.names=sampleNames(cnSet))
1118
-	##colnames(pd)=c("SKW", "SNR", "gender")
1119
-	##phenoData(cnSet) = new("AnnotatedDataFrame", data=pd)
1120 1109
 	return(cnSet)
1121 1110
 }
1111
+construct.Illumina <- constructInf
1122 1112
 
1123
-preprocess <- function(cnSet,
1113
+preprocessInf <- function(cnSet,
1114
+			  sampleSheet=NULL,
1124 1115
 		       arrayNames=NULL,
1125 1116
 		       ids=NULL,
1126 1117
 		       path=".",
... ...
@@ -1176,16 +1167,17 @@ preprocess <- function(cnSet,
1176 1167
 		 neededPkgs=c("crlmm", pkgname)) # outdir=outdir,
1177 1168
 	return(mixtureParams)
1178 1169
 }
1170
+preprocess <- preprocessInf
1179 1171
 
1180
-genotypeRS <- function(cnSet, mixtureParams, probs=rep(1/3,3),
1181
-		       SNRMin=5,
1182
-		       recallMin=10,
1183
-		       recallRegMin=1000,
1184
-		       verbose=TRUE,
1185
-		       returnParams=TRUE,
1186
-		       badSNP=0.7,
1187
-		       gender=NULL,
1188
-		       DF=6){
1172
+genotypeInf <- function(cnSet, mixtureParams, probs=rep(1/3,3),
1173
+			SNRMin=5,
1174
+			recallMin=10,
1175
+			recallRegMin=1000,
1176
+			verbose=TRUE,
1177
+			returnParams=TRUE,
1178
+			badSNP=0.7,
1179
+			gender=NULL,
1180
+			DF=6){
1189 1181
 	is.snp = isSnp(cnSet)
1190 1182
 	snp.index = which(is.snp)
1191 1183
 	narrays = ncol(cnSet)
... ...
@@ -1256,91 +1248,49 @@ genotype.Illumina <- function(sampleSheet=NULL,
1256 1248
 	stopifnot(is.lds)
1257 1249
 	if(missing(cdfName)) stop("must specify cdfName")
1258 1250
 	if(!isValidCdfName(cdfName)) stop("cdfName not valid.  see validCdfNames")
1259
-        pkgname = getCrlmmAnnotationName(cdfName)
1260
-	callSet <- construct.Illumina(sampleSheet=sampleSheet, arrayNames=arrayNames,
1261
-				      ids=ids, path=path, arrayInfoColNames=arrayInfoColNames,
1262
-				      highDensity=highDensity, sep=sep, fileExt=fileExt,
1263
-				      cdfName=cdfName, copynumber=copynumber, verbose=verbose, batch=batch, # fns=fns,
1264
-				      saveDate=saveDate) #, outdir=outdir)
1265
-	## Basic checks
1266
-	##check that mixtureParams provided
1267
-	if(missing(mixtureParams))
1268
-		stop("preprocess is FALSE but mixtureParams were not provided")
1269
-	if(ncol(callSet) != length(arrayNames))
1270
-		stop("callSet provided but number of samples is not the same as the length of arrayNames")
1271
-	snr <- callSet$SNR[]
1272
-	if(any(is.na(snr))) stop("missing values in callSet$SNR")
1273
-	if(missing(sns)) sns = basename(sampleNames(callSet))
1274
-	open(A(callSet))
1275
-	open(B(callSet))
1276
- 	is.snp = isSnp(callSet)
1277
-	snp.index = which(is.snp)
1278
-	narrays = ncol(callSet)
1279
-	sampleBatches <- splitIndicesByLength(seq(length=ncol(callSet)), ocSamples())
1280
-	mixtureParams = initializeBigMatrix("crlmmMixt-", 4, narrays, "double")
1281
-	SNR = initializeBigVector("crlmmSNR-", narrays, "double")
1282
-	SKW = initializeBigVector("crlmmSKW-", narrays, "double")
1283
-	ocLapply(seq_along(sampleBatches), processIDAT, sampleBatches=sampleBatches,
1284
-		 sampleSheet=sampleSheet, arrayNames=arrayNames,
1285
-		 ids=ids, path=path, arrayInfoColNames=arrayInfoColNames, highDensity=highDensity,
1286
-		 sep=sep, fileExt=fileExt, saveDate=saveDate, verbose=verbose, mixtureSampleSize=mixtureSampleSize,
1287
-		 fitMixture=fitMixture, eps=eps, seed=seed, cdfName=cdfName, sns=sns, stripNorm=stripNorm,
1288
-		 useTarget=useTarget, A=A(callSet), B=B(callSet), SKW=SKW, SNR=SNR,
1289
-		 mixtureParams=mixtureParams, is.snp=is.snp, neededPkgs=c("crlmm", pkgname)) # outdir=outdir,
1290
-	open(SKW)
1291
-	open(SNR)
1292
-	pData(callSet)$SKW = SKW
1293
-	pData(callSet)$SNR = SNR
1294
-	save(callSet, file=file.path(ldPath(), "callSet.rda"))
1295
-	close(SNR)
1296
-	close(SKW)
1297
-	open(A(callSet))
1298
-	open(B(callSet))
1299
-	tmpA = initializeBigMatrix(name="tmpA", length(snp.index), narrays)
1300
-	tmpB = initializeBigMatrix(name="tmpB", length(snp.index), narrays)
1301
-	for(j in seq(length=ncol(callSet))){
1302
-		tmpA[, j] <- as.integer(A(callSet)[snp.index, j])
1303
-		tmpB[, j] <- as.integer(B(callSet)[snp.index, j])
1304
-	}
1305
-	close(A(callSet))
1306
-	close(B(callSet))
1307
-	close(tmpA)
1308
-	close(tmpB)
1309
-	save(tmpA, file=file.path(ldPath(), "tmpA.rda"))
1310
-	save(tmpB, file=file.path(ldPath(), "tmpB.rda"))
1311
-	save(SNR, file=file.path(ldPath(), "SNR.rda"))
1312
-	save(SKW, file=file.path(ldPath(), "SKW.rda"))
1313
-	save(mixtureParams, file=file.path(ldPath(), "mixtureParams.rda"))
1314
-	message("Begin genotyping")
1315
-	tmp <- crlmmGT2(A=tmpA,
1316
-			B=tmpB,
1317
-			SNR=SNR,
1318
-			mixtureParams=mixtureParams,
1319
-			cdfName=cdfName,
1320
-			row.names=NULL,
1321
-			col.names=sampleNames(callSet),
1322
-			probs=probs,
1323
-			DF=DF,
1324
-			SNRMin=SNRMin,
1325
-			recallMin=recallMin,
1326
-			recallRegMin=recallRegMin,
1327
-			gender=gender,
1328
-			verbose=verbose,
1329
-			returnParams=returnParams,
1330
-			badSNP=badSNP)
1331
-	save(tmp, file=file.path(ldPath(), "tmp.rda"))
1332
-	if(verbose) message("Genotyping finished.  Updating container with genotype calls and confidence scores.")
1333
-	open(tmpA); open(tmpB)
1334
-	for(j in 1:ncol(callSet)){
1335
-		snpCall(callSet)[snp.index, j] <- as.integer(tmpA[, j])
1336
-		snpCallProbability(callSet)[snp.index, j] <- as.integer(tmpB[, j])
1337
-	}
1338
-	close(tmpA); close(tmpB)
1339
-	delete(tmpA); delete(tmpB);
1340
-	callSet$gender = tmp$gender
1341
-	rm(tmp)
1342
-	close(callSet)
1343
-	return(callSet)
1251
+	stopifnot(!missing(batch))
1252
+        pkgname <- getCrlmmAnnotationName(cdfName)
1253
+	message("Instantiate CNSet container.")
1254
+	cnSet <- constructInf(sampleSheet=sampleSheet,
1255
+				    arrayNames=arrayNames,
1256
+				    path=path,
1257
+				    arrayInfoColNames=arrayInfoColNames,
1258
+				    highDensity=highDensity,
1259
+				    sep=sep,
1260
+				    fileExt=fileExt,
1261
+				    cdfName=cdfName,
1262
+				    verbose=verbose,
1263
+				    batch=batch,
1264
+				    saveDate=saveDate)
1265
+	mixtureParams <- preprocessInf(cnSet=cnSet,
1266
+				    sampleSheet=sampleSheet,
1267
+				    arrayNames=arrayNames,
1268
+				    ids=ids,
1269
+				    path=path,
1270
+				    arrayInfoColNames=arrayInfoColNames,
1271
+				    highDensity=highDensity,
1272
+				    sep=sep,
1273
+				    fileExt=fileExt,
1274
+				    saveDate=saveDate,
1275
+				    stripNorm=stripNorm,
1276
+				    useTarget=useTarget,
1277
+				    mixtureSampleSize=mixtureSampleSize,
1278
+				    fitMixture=fitMixture,
1279
+				    eps=eps,
1280
+				    verbose=verbose,
1281
+				    seed=seed)
1282
+	message("Preprocessing complete.  Begin genotyping...")
1283
+	genotypeInf(cnSet=cnSet, mixtureParams=mixtureParams,
1284
+		   probs=probs,
1285
+		   SNRMin=SNRMin,
1286
+		   recallMin=recallMin,
1287
+		   recallRegMin=recallRegMin,
1288
+		   verbose=verbose,
1289
+		   returnParams=returnParams,
1290
+		   badSNP=badSNP,
1291
+		   gender=gender,
1292
+		   DF=DF)
1293
+	return(cnSet)
1344 1294
 }
1345 1295
 
1346 1296
 
... ...
@@ -55,3 +55,44 @@ linesCNSet <- function(x, y, batch, copynumber, x.axis="A", grid=FALSE, ...){
55 55
 		}
56 56
 	}
57 57
 }
58
+
59
+
60
+cnPanel <- function(x, y, ..., pch.cols, gt, cbs.segs, hmm.segs=NULL, shades, subscripts, add.ideogram=TRUE){
61
+	##if(panel.number() == 2) browser()
62
+	add.ideogram <- add.ideogram[[panel.number()]]
63
+	##cbs.segs <- cbs.segs[[panel.number()]]
64
+	draw.hmm.states <- ifelse(panel.number() <= length(hmm.segs), TRUE, FALSE)
65
+	panel.grid(h=6, v=10)
66
+	which.hom <- which(gt[subscripts] == 1 | gt[subscripts]==3)
67
+	which.het <- which(gt[subscripts] == 2)
68
+	panel.xyplot(x, y, col="grey60", ...)
69
+	lpoints(x[which.hom], y[which.hom], col=pch.cols[1], ...)
70
+	lpoints(x[which.het], y[which.het], col=pch.cols[2], ...)
71
+	lsegments(x0=start(cbs.segs)/1e6, x1=end(cbs.segs)/1e6,
72
+		  y0=cbs.segs$seg.mean,
73
+		  y1=cbs.segs$seg.mean, ...)
74
+	if(draw.hmm.states){
75
+		hmm.segs <- hmm.segs[order(width(hmm.segs), decreasing=TRUE), ]
76
+		lrect(xleft=start(hmm.segs)/1e6,
77
+		      xright=end(hmm.segs)/1e6,
78
+		      ybottom=-0.4,
79
+		      ytop=0,
80
+		      border=shades[hmm.segs$state],
81
+		      col=shades[hmm.segs$state])
82
+	}
83
+	ltext(130, 5, paste("MAD:", round(mad(y, na.rm=TRUE), 2)))
84
+	if(add.ideogram){
85
+		pathto <- system.file("hg18", package="SNPchip")
86
+		cytoband <- read.table(file.path(pathto, "cytoBand.txt"), as.is=TRUE)
87
+		cytoband$V2 <- cytoband$V2/1e6
88
+		cytoband$V3 <- cytoband$V3/1e6
89
+		colnames(cytoband) <- c("chrom", "start", "end", "name", "gieStain")
90
+		plotCytoband(unique(hmm.segs$chrom),
91
+			     cytoband=cytoband,
92
+			     cytoband.ycoords=c(5.6, 5.9),
93
+			     new=FALSE,
94
+			     label.cytoband=FALSE,
95
+			     build="hg18",
96
+			     use.lattice=TRUE)
97
+	}
98
+}
... ...
@@ -15,6 +15,7 @@
15 15
 \newcommand{\Rclass}[1]{{\textit{#1}}}
16 16
 \newcommand{\oligo}{\Rpackage{oligo }}
17 17
 \newcommand{\R}{\textsf{R}}
18
+\newcommand{\ff}{\Rpackage{ff}}
18 19
 
19 20
 \begin{document}
20 21
 \title{Using \crlmm{} for copy number estimation and genotype calling
... ...
@@ -27,619 +28,291 @@
27 28
 
28 29
 
29 30
 \begin{abstract}
30
-  This vignette illustrates the steps necessary for obtaining
31
-  marker-level estimates of allele-specific copy number from the raw
32
-  Illumina IDAT files.  Hidden markov models or segmentation
33
-  algorithms can be applied to the marker-level estimates.  Examples
34
-  of both are illustrated.
31
+  This vignette illustrates the steps required prior to copy number
32
+  analysis for Infinium platforms.  Specifically, we require
33
+  construction of a container to store processed forms of the raw
34
+  data, preprocessing to normalize the arrays, and genotyping using
35
+  the CRLMM algorithm.  After completing these steps, users can refer
36
+  to the copy number vignette.
35 37
 \end{abstract}
36 38
 
37
-\section{About this vignette}
39
+\section{Infrastructure}
40
+
38 41
 <<crlmm, results=hide, echo=FALSE>>=
39 42
 library(crlmm)
43
+options(width=70)
44
+options(continue=" ")
40 45
 @
41 46
 
42
-%The vignette for copy number estimation for illumina platforms is
43
-%under development.  Please use the latest stable release of crlmm.
44
-
45
-%\end{document}
47
+\textbf{Supported platforms:} The supported Infinium platforms are
48
+those for which a corresponding annotation package is available.  The
49
+annotation packages contain information on the markers, such as
50
+physical position and chromosome, as well as pre-computed parameters
51
+estimated from HapMap used during the preprocessing and genotyping
52
+steps. Currently supported Infinium platforms are listed in the
53
+following code chunk.
46 54
 
47
-Allele-specific copy number estimation in the crlmm package is
48
-available for several Illumina platforms.  As described in the
49
-\texttt{copynumber} vignette, copy number estimation in \crlmm{} works
50
-best when there are a sufficient number of samples such that AA, AB,
51
-and BB genotypes are observed at most loci. For small studies (e.g.,
52
-fewer than 50 samples), there will be a large number of SNPs that are
53
-monomorphic. For monomorphic SNPs, the estimation problem becomes more
54
-difficult and alternative strategies that estimate the relative total
55
-copy number may be preferable.  In addition to installing \crlmm{},
56
-one must also install the appropriate annotation package for the
57
-Illumina platform.  In the following code, we list the platforms for
58
-which annotation packages are currently available. Next we create a
59
-directory where output files will be stored and indicate the directory
60
-that contains the IDAT files that will be used in our analysis.
61
-
62
-
63
-<<setup, echo=FALSE, results=hide>>=
64
-options(width=70)
65
-options(continue=" ")
66
-library(cacheSweave)
55
+<<supportedPlatforms>>=
56
+pkgs <- annotationPackages()
57
+crlmm.pkgs <- pkgs[grep("Crlmm", pkgs)]
58
+crlmm.pkgs[grep("human", crlmm.pkgs)]
67 59
 @
68 60
 
69
-<<libraries>>=
61
+\textbf{Large data:} In order to reduce \crlmm{}'s memory footprint,
62
+we require the \Rpackage{ff} for copy number estimation.  The
63
+\Rpackage{ff} package provides infrastructure for accessing and
64
+writing data to disk instead of keeping data in memory.  Each element
65
+of the \verb+assayData+ and \verb+batchStatistics+ slot of the
66
+\Rclass{CNSet} class are \Robject{ff} objects.  \Robject{ff} objects
67
+in the \R{} workspace contain pointers to several files with the
68
+\verb+.ff+ extension.  The location of where the data is stored on
69
+disk is specified by use of the \Rfunction{ldPath} function.
70
+
71
+<<ldpath,results=hide>>=
70 72
 library(ff)
71
-pkgs <- annotationPackages()
72
-pkgs[grep("Crlmm", pkgs)]
73 73
 if(getRversion() < "2.13.0"){
74 74
 	rpath <- getRversion()
75 75
 } else rpath <- "trunk"
76 76
 outdir <- paste("/thumper/ctsa/snpmicroarray/rs/ProcessedData/crlmm/", rpath, "/illumina_vignette", sep="")
77
+ldPath(outdir)
77 78
 dir.create(outdir, recursive=TRUE, showWarnings=FALSE)
78
-datadir <- "/thumper/ctsa/snpmicroarray/illumina/IDATS/370k"
79
-setCacheDir(outdir)
80 79
 @
81 80
 
82
-%Options for controlling RAM with the \Rpackage{ff} package.
83
-
84
-<<ldOptions>>=
85
-ldPath(outdir)
81
+Users should not move or rename this directory.  If only output files
82
+are stored in \verb+outdir+, one can either remove the entire
83
+directory prior to rerunning the analysis or all of the '.ff' files.
84
+Otherwise, one would accumulate a large number of '.ff' files on disk
85
+that are no longer in use.
86
+
87
+As none of the functions for preprocessing, genotyping, or copy number
88
+estimation simultaneously require all samples and all probes in
89
+memory, memory-usage by \crlmm{} can be fine tuned by reading in and
90
+processing subsets of the markers and/or samples. The functions
91
+\Rfunction{ocSamples} and \Rfunction{ocProbesets} in the
92
+\Rpackage{oligoClasses} package can be used to declare how many
93
+markers and samples to read at once.  In general, specifying smaller
94
+values should reduce the RAM required for a particular job, but would
95
+be expected to have an increased run-time. In the following
96
+code-chunk, we declare that \crlmm{} should process 150,000 markers at
97
+a time (when possible) and 500 samples at a time.  (As our dataset in
98
+this vignette only contains 43 samples, the \Rfunction{ocSamples}
99
+option would not have any effect.)  One can view the current settings
100
+for these commands, by typing the functions without an argument.
101
+
102
+<<ram>>=
103
+ocSamples()
104
+ocProbesets()
86 105
 ocProbesets(150e3)
87
-ocSamples(200)
106
+ocSamples(500)
107
+ocSamples()
108
+ocProbesets()
88 109
 @
89 110
 
90
-This vignette was created using Illumina IDAT files that are located
91
-in a specific directory on my computer (\Robject{pathToCels}).  Long
92
-computations are saved in the output directory \Robject{outdir}.
93
-Towards this end, we make repeated use of the \Rfunction{checkExists}
94
-simply checks that a variable is in the workspace.  If not, the
95
-variable is loaded from the indicated path.  If the file (with '.rda'
96
-extension) does not exist in this path, the function indicated by the
97
-.FUN argument is executed.  Alternatively, if \Robject{.load.it} is
98
-\Robject{FALSE} the function will be executed regardless of whether
99
-the file exists.  Users should see the \Rpackage{weaver} package for a
100
-more 'correct' approach for cacheing long computations. The following
101
-code chunk checks that that the variables specific to the directory
102
-structure on my computer exist.  Users should modify the
103
-\Robject{outdir} and \Robject{datadir} variables as appropriate.
104
-
105
-<<checkSetup>>=
106
-if(!file.exists(outdir)) stop("Please specify valid directory for storing output")
107
-if(!file.exists(datadir)) stop("Please specify the correct path to the CEL files")
111
+<<cacheSweave, echo=FALSE, results=hide>>=
112
+library(cacheSweave)
113
+setCacheDir(outdir)
108 114
 @
109 115
 
110
-\section{Preprocessing Illumina IDAT files, genotyping, and copy number
111
-  estimation}
112
-To perform copy number analysis on the Illumina platform, several
113
-steps are required.  The first step is to read in the IDAT files and
114
-create a container for storing the red and green intensities. These
115
-intensities are quantile normalized in the function
116
-\Rfunction{crlmmIllumina}, and then genotyped using the crlmm
117
-algorithm.  Details on the crlmm genotyping algorithm are described
118
-elsewhere.  We will make use of the normalized intensities when we
119
-estimate copy number.  The object returned by the
120
-\Rfunction{genotype.Illumina} function is an instance of the
121
-\Robject{CNSet} class, a container for storing the genotype calls,
122
-genotype confidence scores, the normalized intensities for the A and B
123
-channels, and summary statistics on the batches.  The batch variable
124
-can be the date on which the array was scanned or the 96 well
125
-microarray chemistry plate on which the samples were processed. Both
126
-date and chemistry plate are useful surrogates for experimental
127
-factors that effect subgroups of the probe intensities.  (Quantile
128
-normalization does not remove batch effects.)  The genotype confidence
129
-scores are saved as an integer, and can be converted back to a $[0,
130
-1]$ probability scale by the transformation $round(-1000*log2(1-p))$.
116
+\textbf{Limitations:} There is no minimum number of samples required
117
+for preprocessing and genotyping.  However, for copy number estimation
118
+the \crlmm{} package currently requires at least 10 samples per batch.
119
+The parameter estimates for copy number and the corresponding
120
+estimates of raw copy number will tend to be more noisy for batches
121
+with small sample sizes (e.g., $<$ 50).  Chemistry plate or scan date
122
+are often useful surrogates for batch.
123
+
124
+\section{Initializing a container for storing processed data}
125
+
126
+This section will initialize a container for storing processed forms
127
+of the data, including the normalized intensities for the A and B
128
+alleles and the CRLMM genotype calls and confidence scores.  In
129
+addition, the container will store information on the markers
130
+(physical position, chromosome, and a SNP indicator), the batch, and
131
+the samples (e.g., gender).  To construct this container for Infinium
132
+platforms, several steps are required.
133
+
134
+We begin by specifying the path containing the raw IDAT files for a
135
+set of samples from the Infinium 370k platform.
136
+
137
+<libraries>>=
138
+datadir <- "/thumper/ctsa/snpmicroarray/illumina/IDATS/370k"
139
+@
140
+
141
+For Infinium platforms, an Illumina sample sheet containing
142
+information for reading the raw IDAT files is required. Please refer
143
+to the BeadStudio Genotyping guide, Appendix A, for additional
144
+information.  The following code reads in the samplesheet for the IDAT
145
+files on our local server. For our dataset, the file extensions are
146
+`Grn.dat' and `Red.idat'.  We store the complete path to the filename
147
+without the file extension in the \Robject{arrayNames} and check that
148
+all of the green and red IDAT files exists.
131 149
 
132 150
 <<samplesheet>>=
133 151
 samplesheet = read.csv(file.path(datadir, "HumanHap370Duo_Sample_Map.csv"), header=TRUE, as.is=TRUE)
134 152
 samplesheet <- samplesheet[-c(28:46,61:75,78:79), ]
135 153
 arrayNames <- file.path(datadir, unique(samplesheet[, "SentrixPosition"]))
136
-grnfiles = all(file.exists(paste(arrayNames, "_Grn.idat", sep="")))
137
-redfiles = all(file.exists(paste(arrayNames, "_Red.idat", sep="")))
138
-load.it <- TRUE
139
-if(!load.it){
140
-	rmFiles <- list.files(outdir, pattern=".ff", full.names=TRUE)
141
-	unlink(rmFiles)
142
-}
143
-plate <- samplesheet$Plate
144
-table(plate)
145
-## too few samples to run the plates as separate batches
146
-batch <- rep("1", nrow(samplesheet))
154
+all(file.exists(paste(arrayNames, "_Grn.idat", sep="")))
155
+all(file.exists(paste(arrayNames, "_Red.idat", sep="")))
156
+arrayInfo <- list(barcode=NULL, position="SentrixPosition")
147 157
 @
148 158
 
149
-<<genotype,cache=TRUE>>=
150
-container <- genotype.Illumina(sampleSheet=samplesheet,
151
-			       path=dirname(arrayNames[1]),
152
-			       arrayInfoColNames=list(barcode=NULL, position="SentrixPosition"),
153
-			       cdfName="human370v1c",
154
-			       batch=batch)
155
-@
159
+As discussed previously, all supported platforms have a corresponding
160
+annotation package.  The appropriate annotation package is specified
161
+by the platform identifier without the \verb+Crlmm+ postfix.
156 162
 
157
-<<copynumber,cache=TRUE>>=
158
-GT.CONF.THR <- 0.8
159
-cnSet <- crlmmCopynumber(container, GT.CONF.THR=GT.CONF.THR)
163
+<<cdfname>>=
164
+cdfName <- "human370v1c"
160 165
 @
161 166
 
162
-The \Robject{cnSet} returned by the \Rfunction{crlmmCopynumber}
163
-function contains all of the parameters used to compute
164
-allele-specific copy number.  In order to reduce I/O and speed
165
-computation, the allele-specific copy number estimates are not written
166
-to file nor saved in the \Robject{CNSet} object.  Rather,
167
-allele-specific estimates are computed on the fly by the functions
168
-\Rfunction{CA}, \Rfunction{CB}, or \Rfunction{totalCopynumber}.  The
169
-following codechunks provide a few examples.
170
-
171
-Copy number for polymorphic markers on chromosomes 1 - 22.
172
-
173
-<<ca>>=
174
-snp.index <- which(isSnp(cnSet) & !is.na(chromosome(cnSet)) & chromosome(cnSet) < 22)
175
-ca <- CA(cnSet, i=snp.index, j=1:5)
176
-cb <- CB(cnSet, i=snp.index, j=1:5)
177
-ct <- ca+cb
178
-@
167
+Next, we construct a character vector that specifies the batch for
168
+each of the \Sexpr{length(arrayNames)} arrays.  Here, we have a small
169
+dataset and process the samples in a single batch. Processing the
170
+samples as a single batch is generally reasonable if the samples were
171
+processed at similar times (e.g., within a few weeks).
179 172
 
180
-Alternatively, total copy number can be obtained by
181
-<<totalCopynumber.snps>>=
182
-ct2 <- totalCopynumber(cnSet, i=snp.index, j=1:5)
183
-stopifnot(all.equal(ct, ct2))
173
+<<batch>>=
174
+batch <- rep("1", nrow(samplesheet))
184 175
 @
185 176
 
186
-Missing values can arise at polymorphic when the confidence score of
187
-the genotype calls are below the threshold indicated by the threshold
188
-\texttt{GT.CONF.THR} in \Robject{crlmmCopynumber}. See
189
-\Robject{?crlmmCopynumber} for additional details.  In the following
190
-codechunk, we compute the number of samples that had confidence scores
191
-below \Sexpr{GT.CONF.THR} at loci for which ${\hat CA}$ and ${\hat CB}$
192
-are missing.
193
-
194
-<<NAs.snps>>=
195
-missing.copynumber <- which(rowSums(is.na(ct)) > 0)
196
-invisible(open(snpCallProbability(cnSet)))
197
-gt.confidence <- i2p(snpCallProbability(cnSet)[snp.index, ])
198
-n.below.threshold <- rowSums(gt.confidence < 0.95)
199
-unique(n.below.threshold[missing.copynumber])
200
-@
201
-\noindent For loci with missing copy number, the confidence scores
202
-were all below the threshold.
203
-
204
-At nonpolymorphic loci, either the \Rfunction{CA} or
205
-\Rfunction{totalCopynumber} functions can be used to obtain estimates
206
-of total copy number.
207
-
208
-<<nonpolymorphicAutosomes>>=
209
-marker.index <- which(!isSnp(cnSet) & chromosome(cnSet) < 23)
210
-ct <- CA(cnSet, i=marker.index, j=1:5)
211
-## all zeros
212
-stopifnot(all(CB(cnSet, i=marker.index, j=1:5) == 0))
213
-ct2 <- totalCopynumber(cnSet, i=marker.index, j=1:5)
214
-stopifnot(all.equal(ct, ct2))
177
+Finally, we initialize an object of class \Robject{CNSet} using the
178
+function \Rfunction{constructInf}.
179
+
180
+<<container,cache=TRUE>>=
181
+cnSet <- constructInf(sampleSheet=samplesheet,
182
+		      arrayNames=arrayNames,
183
+		      batch=batch,
184
+		      arrayInfoColNames=arrayInfo,
185
+		      cdfName=cdfName,
186
+		      verbose=TRUE,
187
+		      saveDate=TRUE)
215 188
 @
216 189
 
190
+A concise summary of the object's contents can be viewed with the
191
+\Rfunction{print} function.
217 192
 
218
-\begin{figure}[t!]
219
-  Nonpolymorphic markers on chromosome X:
220
-  \centering
221
-<<nonpolymorphicX, fig=TRUE, width=8, height=4>>=
222
-npx.index <- which(chromosome(cnSet)==23 & !isSnp(cnSet))
223
-M <- sample(which(cnSet$gender==1), 5)
224
-F <- sample(which(cnSet$gender==2), 5)
225
-cn.M <- CA(cnSet, i=npx.index, j=M)
226
-cn.F <- CA(cnSet, i=npx.index, j=F)
227
-boxplot(data.frame(cbind(cn.M, cn.F)), pch=".", col="grey60", outline=FALSE)
228
-@
229
-\caption{Copy number estimates for nonpolymorphic loci on chromosome
230
-  X (5 men, 5 women).  crlmm assumes that the median copy number
231
-  across samples at a given marker on X is 1 for men and 2 for women.}
232
-\end{figure}
233
-
234
-
235
-
236
-
237
-\begin{figure}[t!]
238
- \centering
239
-<<polymorphicX, fig=TRUE, width=8, height=4>>=
240
-## copy number estimates on X for SNPs are biased towards small values.
241
-X.markers <- which(isSnp(cnSet) & chromosome(cnSet) == 23)
242
-ca.M <- CA(cnSet, i=X.markers, j=M)
243
-cb.M <- CB(cnSet, i=X.markers, j=M)
244
-ca.F <- CA(cnSet, i=X.markers, j=F)
245
-cb.F <- CB(cnSet, i=X.markers, j=F)
246
-cn.M <- ca.M+cb.M
247
-cn.F <- ca.F+cb.F
248
-boxplot(data.frame(cbind(cn.M, cn.F)), pch=".", outline=FALSE, col="grey60")
249
-cn2 <- totalCopynumber(cnSet, i=X.markers, j=c(M, F))
250
-stopifnot(all.equal(cbind(cn.M, cn.F), cn2))
251
-@
252
-\caption{Copy number estimates for polymorphic markers on chromosome
253
-  X. }
254
-\end{figure}
255
-
256
-Often it is of interest to smooth the total copy number estimates (the
257
-sum of the allele-specific copy number) as a function of the physical
258
-position.  The following code chunk can be used to create an instance
259
-of \Robject{CopyNumberSet} that contains the CA + CB at polymorphic
260
-markers and 'CA' at nonpolymorphic markers.
261
-
262
-The \Robject{cnSet} returned by the \Rfunction{crlmmCopynumber}
263
-function contains all of the parameters used to compute
264
-allele-specific copy number.  In order to reduce I/O and speed
265
-computation, the allele-specific copy number estimates are not written
266
-to file nor saved in the \Robject{CNSet} object.  Rather, the
267
-estimates are computed on the fly when the user calls one of the
268
-helper functions that uses this information. For instance, often it is
269
-of interest to smooth the total copy number estimates (the sum of the
270
-allele-specific copy number) as a function of the physical position.
271
-The following code chunk can be used to create an instance of
272
-\Robject{CopyNumberSet} that contains the CA + CB at polymorphic
273
-markers and 'CA' at nonpolymorphic markers.
274
-
275
-Alternatively, total copy number can be obtained by
276
-<<totalCopynumber.snps>>=
277
-ct2 <- totalCopynumber(cnSet, i=snp.index, j=1:5)
278
-stopifnot(all.equal(ct, ct2))
193
+<<cnset>>=
194
+print(cnSet)
279 195
 @
280 196
 
281
-Missing values can arise at polymorphic when the confidence score of
282
-the genotype calls are below the threshold indicated by the threshold
283
-\texttt{GT.CONF.THR} in \Robject{crlmmCopynumber}. See
284
-\Robject{?crlmmCopynumber} for additional details.  In the following
285
-codechunk, we compute the number of samples that had confidence scores
286
-below \Sexpr{GT.CONF.THR} at loci for which ${\hat CA}$ and ${\hat CB}$
287
-are missing.
197
+Note that the above object does not yet contain any processed data
198
+(only \verb+NA+'s) and several \verb+.ff+ files now appear in the
199
+\verb+outdir+.  Again, these files should not be removed.
288 200
 
289
-<<NAs.snps>>=
290
-missing.copynumber <- which(rowSums(is.na(ct)) > 0)
291
-invisible(open(snpCallProbability(cnSet)))
292
-gt.confidence <- i2p(snpCallProbability(cnSet)[snp.index, ])
293
-n.below.threshold <- rowSums(gt.confidence < 0.95)
294
-unique(n.below.threshold[missing.copynumber])
201
+<<listff>>=
202
+list.files(outdir, pattern=".ff")[1:5]
295 203
 @
296
-\noindent For loci with missing copy number, the confidence scores
297
-were all below the threshold.
298
-
299
-At nonpolymorphic loci, either the \Rfunction{CA} or
300
-\Rfunction{totalCopynumber} functions can be used to obtain estimates
301
-of total copy number.
302
-
303
-<<nonpolymorphicAutosomes>>=
304
-marker.index <- which(!isSnp(cnSet) & chromosome(cnSet) < 23)
305
-ct <- CA(cnSet, i=marker.index, j=1:5)
306
-## all zeros
307
-stopifnot(all(CB(cnSet, i=marker.index, j=1:5) == 0))
308
-ct2 <- totalCopynumber(cnSet, i=marker.index, j=1:5)
309
-stopifnot(all.equal(ct, ct2))
310
-@
311
-
312 204
 
313
-\begin{figure}[t!]
314
-  Nonpolymorphic markers on chromosome X:
315
-  \centering
316
-<<nonpolymorphicX, fig=TRUE, width=8, height=4>>=
317
-npx.index <- which(chromosome(cnSet)==23 & !isSnp(cnSet))
318
-M <- sample(which(cnSet$gender==1), 5)
319
-F <- sample(which(cnSet$gender==2), 5)
320
-cn.M <- CA(cnSet, i=npx.index, j=M)
321
-cn.F <- CA(cnSet, i=npx.index, j=F)
322
-boxplot(data.frame(cbind(cn.M, cn.F)), pch=".", col="grey60", outline=FALSE)
323
-@
324
-\caption{Copy number estimates for nonpolymorphic loci on chromosome
325
-  X (5 men, 5 women).  crlmm assumes that the median copy number
326
-  across samples at a given marker on X is 1 for men and 2 for women.}
327
-\end{figure}
328
-
329
-
330
-
331
-
332
-\begin{figure}[t!]
333
- \centering
334
-<<polymorphicX, fig=TRUE, width=8, height=4>>=
335
-## copy number estimates on X for SNPs are biased towards small values.
336
-X.markers <- which(isSnp(cnSet) & chromosome(cnSet) == 23)
337
-ca.M <- CA(cnSet, i=X.markers, j=M)
338
-cb.M <- CB(cnSet, i=X.markers, j=M)
339
-ca.F <- CA(cnSet, i=X.markers, j=F)
340
-cb.F <- CB(cnSet, i=X.markers, j=F)
341
-cn.M <- ca.M+cb.M
342
-cn.F <- ca.F+cb.F
343
-boxplot(data.frame(cbind(cn.M, cn.F)), pch=".", outline=FALSE, col="grey60")
344
-cn2 <- totalCopynumber(cnSet, i=X.markers, j=c(M, F))
345
-stopifnot(all.equal(cbind(cn.M, cn.F), cn2))
346
-@
347
-\caption{Copy number estimates for polymorphic markers on chromosome
348
-  X. }
349
-\end{figure}
350
-
351
-Often it is of interest to smooth the total copy number estimates (the
352
-sum of the allele-specific copy number) as a function of the physical
353
-position.  The following code chunk can be used to create an instance
354
-of \Robject{CopyNumberSet} that contains the CA + CB at polymorphic
355
-markers and 'CA' at nonpolymorphic markers.
356
-
357
-<<copynumberObject>>=
358
-marker.index <- which(chromosome(cnSet) <= 22)## & isSnp(cnSet))
359
-sample.index <- 1:5 ## first five samples
360
-invisible(open(cnSet))
361
-copynumberSet <- as(cnSet[marker.index, 1:5], "CopyNumberSet")
362
-invisible(close(cnSet))
363
-copynumberSet <- copynumberSet[order(chromosome(copynumberSet), position(copynumberSet)), ]
364
-indices <- split(1:nrow(copynumberSet), chromosome(copynumberSet))
365
-dup.index <- unlist(sapply(indices, function(i, position)  i[duplicated(position[i])], position=position(copynumberSet)))
366
-if(length(dup.index) > 0) copynumberSet <- copynumberSet[-dup.index, ]
367
-## exclude any rows that are all missing
368
-missing.index <- which(rowSums(is.na(copyNumber(copynumberSet))) == ncol(copynumberSet))
369
-table(chromosome(copynumberSet)[missing.index])
370
-copynumberSet <- copynumberSet[-missing.index, ]
371
-@
205
+Finally, note that the elements of the \verb+assayData+ slot are
206
+\Robject{ff} objects and not matrices. For the most part, the
207
+\emph{appearance} that the data is stored in memory is preserved.
372 208
 
373 209
 
374
-We suggest plotting the total copy number as a function of the
375
-physical position to evaluate whether a wave-correction is needed.
376
-
377
-
378
-<<plotTotalCn, fig=TRUE, width=8, height=8>>=
379
-require(SNPchip)
380
-data(chromosomeAnnotation)
381
-chrAnn <- chromosomeAnnotation[1,]
382
-layout(matrix(1:5, 5,1), heights=c(1, 1, 1, 1, 1))
383
-par(mar=c(0.5, 0.5, 0.5, 0.5), las=1, oma=c(4, 4, 4, 4))
384
-i <- which(chromosome(copynumberSet) == 1)
385
-for(j in 1:5){
386
-	plot(position(copynumberSet)[i], copyNumber(copynumberSet)[i, j], pch=".", cex=0.6,col="grey60", xaxt="n",
387
-	     ylim=c(-1,6), ylab="total copy number",
388
-	     yaxt="n",
389
-	     xlim=c(0, max(position(copynumberSet)[i])))
390
-	legend("topleft", legend=paste("SNR:", round(copynumberSet$SNR[j], 1)))
391
-	axis(2, at=0:6, labels=0:6)
392
-	if(j == 5){
393
-		mtext("physical position (Mb)", 1, outer=TRUE, line=2)
394
-		mtext("Chromosome 8", 3, line=0, outer=TRUE)
395
-		at <- pretty(position(copynumberSet)[i], n=8)
396
-	}
397
-	## draw centromere
398
-	xx <- c(chrAnn[1:2], chrAnn[2:1])
399
-	yy <- c(-0.5, -0.5, 5, 5)
400
-	polygon(xx, yy, col="bisque")
401
-}
210
+<<assaydataelements>>=
211
+sapply(assayData(cnSet), function(x) class(x)[1])
402 212
 @
403 213
 
404
-\section{Smoothing marker-level estimates of total copy number}
214
+\section{Preprocessing}
405 215
 
406
-In this section, we show how the total copy number can be smoothed
407
-using either the hidden Markov model implemented in the \R{}  package
408
-\Rpackage{VanillaICE} or circular binary segmentation implemented in
409
-the \R{} package{DNAcopy}.
216
+The raw intensities from the Infinium IDAT files are read and
217
+normalized using the function \Rfunction{preprocessInf}.  The function
218
+\Rfunction{preprocessInf} returns a \Robject{ff} object containing the
219
+parameters for the mixture model used by the CRLMM genotyping
220
+algorithm.
410 221
 
411
-\paragraph{Smoothing via a hidden Markov model.}
412
-
413
-<<hmm>>=
414
-if(require(VanillaICE)){
415
-	hmmOpts <- hmm.setup(copynumberSet, c("hom-del", "hem-del", "normal", "amp"),
416
-			     copynumberStates=0:3, normalIndex=3,
417
-			     log.initialP=rep(log(1/4), 4))
418
-	timing <- system.time(fit.cn <- hmm(copynumberSet, hmmOpts, TAUP=1e10, verbose=FALSE))
419
-	hmm.df <- as.data.frame(fit.cn)
420
-	print(hmm.df[1:5, c(2:4,7:9)])
421
-}
422
-@
423
-
424
-\paragraph{Smoothing via segmentation.}
425
-
426
-<<cbs>>=
427
-library("DNAcopy")
428
-CNA.object <- CNA(genomdat=copyNumber(copynumberSet),
429
-		  chrom=chromosome(copynumberSet),
430
-		  maploc=position(copynumberSet),
431
-		  data.type="logratio",
432
-		  sampleid=sampleNames(copynumberSet))
433
-smu.object <- smooth.CNA(CNA.object)
434
-if(!load.it) unlink(file.path(outdir, "cbs.segments.rda"))
435
-cbs.segments <- checkExists("cbs.segments", .path=outdir, .FUN=segment, x=smu.object, .load.it=load.it)
436
-cbs.segments <- cbind(cbs.segments$output, cbs.segments$segRows)
437
-cbs.segments$call <- rep(3, nrow(cbs.segments))
438
-cbs.segments$call[cbs.segments$seg.mean > 2.5] <- 4
439
-cbs.segments$call[cbs.segments$seg.mean < 1.25 & cbs.segments$seg.mean > 0.75] <- 2
440
-cbs.segments$call[cbs.segments$seg.mean < 0.75] <- 1
441
-cbs.ir <- RangedData(IRanges(cbs.segments$loc.start, cbs.segments$loc.end),
442
-		     chrom=cbs.segments$chrom,
443
-		     numMarkers=cbs.segments$num.mark,
444
-		     seg.mean=cbs.segments$seg.mean,
445
-		     cnCall=cbs.segments$call,
446
-		     sampleId=substr(cbs.segments$ID, 2, 13))
222
+<<preprocess,cache=TRUE>>=
223
+mixtureParams <- preprocessInf(cnSet=cnSet, sampleSheet=samplesheet, arrayNames=arrayNames, arrayInfoColNames=arrayInfo)
224
+invisible(open(mixtureParams))
225
+str(mixtureParams[])
226
+invisible(close(mixtureParams))
447 227
 @
448 228
 
449
-\paragraph{Visualizing inferred CNV.}
450
-
451
-In the following code chunk, we plot the total copy number versus
452
-physical position for chromosome 1 and overlay the copy number
453
-predictions from the HMM and a CNV call from the segmentation obtained
454
-by thresholding the segment means.  We restrict our focus to regions
455
-that have 3 or more markers.
456
-
457
-<<smoothFig, fig=TRUE, width=8, height=8>>=
458
-require(SNPchip)
459
-CHR <- 1
460
-cnSet2 <- copynumberSet
461
-## require at least 3 markers
462
-fit.cn2 <- fit.cn[fit.cn$numMarkers >= 3, ]
463
-cbs.ir2 <- cbs.ir[cbs.ir$numMarkers >= 3, ]
464
-
465
-copynumberSet2 <- copynumberSet[which(chromosome(copynumberSet) == CHR), ]
466
-layout(matrix(1:5, 5,1), heights=c(1, 1, 1, 1, 1))
467
-par(mar=c(0.5, 0.5, 0.5, 0.5), las=1, oma=c(4, 4, 4, 4))
468
-for(j in 1:5){
469
-	fit.cn3 <- fit.cn2[fit.cn2$chrom==CHR & fit.cn2$sampleId == sampleNames(copynumberSet2)[j], ]
470
-	cbs.ir3 <- cbs.ir2[cbs.ir2$chrom==CHR & cbs.ir2$sampleId == sampleNames(copynumberSet2)[j], ]
471
-	plot(position(copynumberSet2), copyNumber(copynumberSet2)[, j], pch=".", cex=0.6,col="grey60", xaxt="n",
472
-	     ylim=c(-1,6), ylab="total copy number",
473
-	     yaxt="n",
474
-	     xlim=c(0, max(position(copynumberSet2))))
475
-
476
-	legend("topleft", legend=paste("SNR:", round(copynumberSet2$SNR[j], 1)))
477
-	axis(2, at=0:6, labels=0:6)
478
-	if(j == 5){
479
-		mtext("physical position (Mb)", 1, outer=TRUE, line=2)
480
-		mtext("Chromosome 8", 3, line=0, outer=TRUE)
481
-		at <- pretty(position(copynumberSet2))
482
-	}
483
-	##Use polygon at bottom to show predictions
484
-	w <- width(fit.cn3)
485
-	fit.cn3 <- fit.cn3[order(w, decreasing=TRUE), ]
486
-	##fit.cn3 <- fit.cn3[fit.cn3$numMarkers > 5, ]
487
-	col <- c("blue", "black", "grey80", "red")
488
-	x <- c(min(start(fit.cn3)), max(end(fit.cn3)))
489
-	xx <- c(x, rev(x))
490
-	y <- rep(c(-0.3, 0), each=2)
491
-	polygon(xx, y, col="white")
492
-	for(i in seq_along(w)){
493
-		x <- c(start(fit.cn3)[i], end(fit.cn3)[i])
494
-		xx <- c(x, rev(x))
495
-		statecolor <- col[fit.cn3$state[i]]
496
-		polygon(xx, y, col=statecolor, border=statecolor)
497
-	}
498
-	w <- width(cbs.ir3)
499
-	cbs.ir3 <- cbs.ir3[order(w, decreasing=TRUE), ]
500
-	##cbs.ir3 <- cbs.ir3[cbs.ir3$numMarkers > 5, ]
501
-	y <- rep(c(-1, -0.7), each=2)
502
-	for(i in seq_along(w)){
503
-		x <- c(start(cbs.ir3)[i], end(cbs.ir3)[i])
504
-		xx <- c(x, rev(x))
505
-		statecolor <- col[cbs.ir3$cnCall[i]]
506
-		polygon(xx, y, col=statecolor, border=statecolor)
507
-	}
508
-	axis(4, at=c(-0.9, -0.1), c("CBS", "HMM"), cex=0.8)
509
-	if(j==5){
510
-		legend("topright", fill=col, legend=c(0, 1, 2, expression(3+.)))
511
-##		invisible(plotCytoband(8, label.cytoband=FALSE, xlim=c(0,max(position(copynumberSet2)))))
512
-		labels <- at/1e6
513
-		axis(1, at=at, labels=labels, outer=T, line=0)
514
-	}
515
-	## draw centromere
516
-	xx <- c(chrAnn[1:2], chrAnn[2:1])
517
-	yy <- c(-0.5, -0.5, 5, 5)
518
-	polygon(xx, yy, col="bisque")
519
-}
229
+Note that the normalized intensities for the A and B alleles are no
230
+longer \verb+NA+s and can now be accessed and inspected using the
231
+methods \Rfunction{A} and \Rfunction{B}, respectively.
232
+
233
+<<intensities>>=
234
+invisible(open(A(cnSet)))
235
+invisible(open(B(cnSet)))
236
+as.matrix(A(cnSet)[1:5, 1:5])
237
+as.matrix(B(cnSet)[1:5, 1:5])
238
+invisible(close(A(cnSet)))
239
+invisible(close(B(cnSet)))
520 240
 @
521 241
 
522
-TODO: Samples 4 and 5 have a sparse band of CN-estimates near 1 that
523
-is uniformly distributed across chromosome 1.  This suggests markers
524
-that are either not well fit by the linear model, or a problem with
525
-the genotyping.  Need to check.
526
-
527
-
528
-%Make a 4 x 4 table of the called states for each sample.
529
-% NA estimates are not counted in CBS -- need to recompute the number
530
-% of markers in each segment in order to make the 4 x 4 table
531
-
532
-<<smooth, eval=FALSE, echo=FALSE>>=
533
-nm.hmm <- sapply(split(fit.cn$numMarkers, fit.cn$sampleId), sum)
534
-stopifnot(length(unique(nm.hmm)) == 1)
535
-nm.cbs <- sapply(split(cbs.ir$numMarkers, cbs.ir$sampleId), sum)
536
-stopifnot(length(unique(nm.cbs)) == 1)
537
-stopifnot(all.equal(nm.cbs, nm.hmm))
538
-hmm.states <- rep(fit.cn$state, fit.cn$numMarkers)
539
-sample.id <- rep(fit.cn$sampleId, fit.cn$numMarkers)
540
-hmm.states <- split(hmm.states, sample.id)
541
-cbs.states <- rep(cbs.ir$cnCall, cbs.ir$numMarkers)
542
-sample.id <- rep(cbs.ir$sampleId, cbs.ir$numMarkers)
543
-cbs.states <- split(cbs.states, sample.id)
544
-tabs <- vector("list", length(cbs.states))
545
-for(i in seq_along(cbs.states)) tabs[[i]] <- table(cbs.states[[i]], hmm.states[[i]])
546
-names(tabs) <- names(cbs.states)
547
-@
548 242
 
243
+\section{Genotyping}
549 244
 
550
-\section{Accessors for extracting the locus-level copy number
551
-  estimates.}
552 245
 
553
-As an example of how to use accessors to obtain the allele-specific CN
554
-estimates, the following code chunk extracts the allele-specific copy
555
-number for polymorphic markers on chromosome 21.
246
+CRLMM genotype calls and confidence scores are estimated using the
247
+function \Rfunction{genotypeInf}.
556 248
 
557
-<<acn-accessor>>=
558
-marker.index <- which(chromosome(cnSet) == 21 & isSnp(cnSet))
559
-ca <- CA(cnSet, i=marker.index)
560
-cb <- CB(cnSet, i=marker.index)
561
-missing.index <- which(rowSums(is.na(ca))==ncol(cnSet))
562
-ca <- ca[-missing.index, ]
563
-cb <- cb[-missing.index, ]
564
-@
565
-\noindent Negating the \Robject{isSnp} function could be used to
566
-extract the estimates at nonpolymorphic markers. For instance,
567
-<<monomorphic-accessor>>=
568
-cn.monomorphic <- CA(cnSet, i=which(chromosome(cnSet) == 21 & !isSnp(cnSet)))
249
+<<genotype,cache=TRUE>>=
250
+updated <- genotypeInf(cnSet, mixtureParams=mixtureParams)
251
+print(updated)
569 252
 @
570 253
 
571
-At polymorphic loci, the total copy number is the sum of the number of
572
-copies of the A allele and the number of copies for the B allele.  At
573
-nonpolymorphic loci, the total copy number is the number of copies for
574
-the A allele.  The helper function totalCopyNumber can be used to
575
-extract the total copy number for all polymorphic and nonpolymorphic
576
-markers.  Documentation of the \Rfunction{totalCopyNumber} will be
577
-available in the devel version of the \Rpackage{oligoClasses}.
578
-
579
-<<copyNumberHelper>>=
580
-cn.total <- ca+cb
581
-cn.total2 <- totalCopynumber(cnSet, i=marker.index)
582
-cn.total2 <- cn.total2[-missing.index, ]
583
-all.equal(cn.total2, cn.total)
584
-@
254
+The posterior probabilities for the
255
+genotype calls in the \verb+callProbability+ element of the
256
+\verb+assayData+ are stored as integers to reduce the file size on
257
+disk.  However, the scores can be easily transformed back to the
258
+probability scale using the \Rfunction{i2p} function as illustrated in
259
+the following code chunk.
585 260
 
586
-A few simple visualizations may be helpful at this point. The first
587
-plot is a histogram of the signal to noise ratio of the sample -- an
588
-overall measure of how well the genotype clusters separate.  (This
589
-statistic tends to be much higher for Illumina than for the Affymetrix
590
-platforms.) The second is a visualization of the total copy number
591
-estimates plotted versus physical position on chromosome 1 for the two
592
-samples with the lowest (top) and highest (bottom) signal to noise
593
-ratios.
594
-
595
-<<snr, fig=TRUE>>=
596
-open(cnSet$SNR)
597
-snr <- cnSet$SNR[,]
598
-hist(snr, breaks=15)
261
+<<callprobs>>=
262
+invisible(open(snpCallProbability(cnSet)))
263
+callProbs <- as.matrix(snpCallProbability(cnSet)[1:5, 1:5])
264
+i2p(callProbs)
265
+invisible(close(snpCallProbability(cnSet)))
599 266
 @
600 267
 
601
-
602
-We plot the total copy number estimates versus physical position the
603
-two samples with the lowest (top) and highest (bottom) signal to noise
604
-ratios.
605
-
606
-\begin{figure}
607
-  \centering
608
-<<firstSample, fig=TRUE>>=
609
-low.snr <- which(snr == min(snr))
610
-high.snr <- which(snr == max(snr))
611
-marker.index <- which(chromosome(cnSet) == 1)
612
-x <- position(cnSet)[marker.index]
613
-cn.total <- totalCopynumber(cnSet, i=marker.index, j=c(low.snr, high.snr))
614
-##x <- x[-missing.index]
615
-snrs <- round(snr[c(low.snr, high.snr)], 2)
616
-par(mfrow=c(2,1), las=1, mar=c(0.5, 4, 0.5, 0.5), oma=c(4, 1, 1,1))
617
-for(j in 1:2){
618
-	cn <- cn.total[, j]
619
-	plot(x, cn, pch=".", ylab="copy number", xaxt="n", ylim=c(0, 6),
620
-	     col="grey50")
621
-	legend("topleft", legend=paste("SNR", snrs[j]))
622
-}
623
-axis(1, at=pretty(x), labels=pretty(x/1e6))
624
-mtext("Mb", 1, outer=TRUE, line=2)
268
+%As described in the \texttt{copynumber} vignette, copy number
269
+%estimation in \crlmm{} works best when there are a sufficient number
270
+%of samples such that AA, AB, and BB genotypes are observed at most
271
+%loci. For small studies (e.g., fewer than 50 samples), there will be a
272
+%large number of SNPs that are monomorphic. For monomorphic SNPs, the
273
+%estimation problem becomes more difficult and alternative strategies
274
+%that estimate the relative total copy number may be preferable.  In
275
+%addition to installing \crlmm{}, one must also install the appropriate
276
+%annotation package for the Illumina platform.  In the following code,
277
+%we list the platforms for which annotation packages are currently
278
+%available. Next we create a directory where output files will be
279
+%stored and indicate the directory that contains the IDAT files that
280
+%will be used in our analysis.
281
+
282
+\textbf{Wrapper:} The construction of a \Rclass{CNSet} object,
283
+preprocessing, and genotype calling are wrapped into a convenience
284
+function called \Rfunction{genotype.Illumina}.
285
+
286
+<<genotype.Illumina,cache=TRUE,results=hide>>=
287
+cnSet2 <- genotype.Illumina(sampleSheet=samplesheet,
288
+			    arrayNames=arrayNames,
289
+			    arrayInfoColNames=arrayInfo,
290
+			    cdfName="human370v1c",
291
+			    batch=batch)
625 292
 @
626 293
 
627
-\caption{Copy number estimates for samples with low signal to noise
628
-  ratio (top) and high signal to noise ratio (bottom).  Note the
629
-  slight wave in the bottom figure.  Such waves are typically
630
-  correlated with GC-content and can be removed by scatterplot
631
-  smoothers or any of the published wave-correction tools for
632
-  genotyping arrays.}
633
-\end{figure}
634
-
635
-\section{Session information}
636
-<<sessionInfo, results=tex>>=
637
-toLatex(sessionInfo())
294
+<<checkIdenticalCalls>>=
295
+invisible(open(calls(cnSet)))
296
+invisible(open(calls(cnSet2)))
297
+snp.index <- which(isSnp(cnSet))
298
+identical(calls(cnSet)[snp.index, 1:20], calls(cnSet2)[snp.index, 1:20])
299
+invisible(close(calls(cnSet)))
300
+invisible(close(calls(cnSet2)))
638 301
 @
639 302
 
303
+To fully remove the data associated with the \Robject{cnSet2} object,
304
+one should use the \Rfunction{delete} function in the \Rpackage{ff}
305
+package followed by the \Rfunction{rm} function.  The following code
306
+is not evaluated is it would change the results of the cached
307
+computations in the previous code chunk.
308
+
309
+<<delete, eval=FALSE>>=
310
+lapply(assayData(cnSet2), delete)
311
+lapply(batchStatistics(cnSet2), delete)
312
+delete(cnSet2$gender)
313
+delete(cnSet2$SNR)
314
+delete(cnSet2$SKW)
315
+rm(cnSet2)
316
+@
640 317
 
641 318
 \end{document}
642
-
643
-
644
-
645 319
Binary files a/inst/scripts/illumina_copynumber.pdf and b/inst/scripts/illumina_copynumber.pdf differ
646 320
deleted file mode 100644
... ...
@@ -1,44 +0,0 @@
1
-\name{constructIlluminaCNSet}
2
-\alias{constructIlluminaCNSet}
3
-\title{
4
-	Construct an instance of CNSetLM after preprocessing Illumina files
5
-}
6
-\description{
7
-
8
-	Assemble the preprocessed data and genotype calls from
9
-	\code{crlmmIllumina} to initialize a \code{CNSetLM} object.
10
-
11
-}
12
-\usage{
13
-constructIlluminaCNSet(crlmmResult, path, snpFile, cnFile)
14
-}
15
-\arguments{
16
-  \item{crlmmResult}{
17
-
18
-  A \code{SnpSet} object returned by function \code{crlmmIllumina} or
19
-  \code{crlmmIllumina2}.
20
-
21
-}
22
-
23
-  \item{path}{path to files created by \code{crlmmIllumina}}
24
-	
25
-  \item{snpFile}{
26
-  The \code{snpFile} filename specified in \code{crlmmIllumina}.
27
-}
28
-  \item{cnFile}{
29
-  The \code{cnFile} filename specified in \code{crlmmIllumina}.
30
-}
31
-}
32
-
33
-\value{
34
-	An object of class \code{CNSetLM}.
35
-}
36
-\author{
37
-R. Scharpf
38
-}
39
-\seealso{
40
-	\code{\link{CNSet-class}}, \code{\link{crlmmIllumina}}
41
-}
42
-
43
-\keyword{manip}
44
-