Browse code

Merge branch 'mymac'

* mymac:
add AffyGW.pdf
update vignettes in inst/scripts
Change argument of validCEL to celfiles
Update constructInf to accommodate GenomeDataFrame class for featureData
bump version to 1.13.7
Add doRUnit.R
Add celfile-utils.Rd
Streamlne some of the Rd files
add validCEL function that checks whether all celfiles can be read
getFeatureData returns GenomeAnnotatedDataFrame
Remove imports from methods. Remove pdf of illumina_copynumber.pdf (large file) and copynumber.pdf
getFeatureDAta returns GenomeAnnotatedDataFrame
Remove separate vignette for copy number in inst/scripts. Include copynumber section in both affy and illumina pipelines.
update documentation files for genotype.Illumina, preprocessInf, and genotypeInf (cdfName added as argument. Indicate that 'batch' should be a character string)
pass cdfName to genotypeInf and preprocessInf
add unitTests and cn-functions for 'simple usage'
Combine AffyPreprocess and copynumber. Combine IlluminaPreprocess and copynumber
remove depency on ff to allow installation on my mac

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@62108 bc3139a8-67e5-0310-9ffc-ced21a209358

Rob Scharp authored on 17/01/2012 19:13:44
Showing 39 changed files

1 1
new file mode 100644
... ...
@@ -0,0 +1 @@
1
+Makefile
... ...
@@ -1,7 +1,7 @@
1 1
 Package: crlmm
2 2
 Type: Package
3 3
 Title: Genotype Calling (CRLMM) and Copy Number Analysis tool for Affymetrix SNP 5.0 and 6.0 and Illumina arrays.
4
-Version: 1.13.5
4
+Version: 1.13.7
5 5
 Date: 2010-12-10
6 6
 Author: Benilton S Carvalho <Benilton.Carvalho@cancer.org.uk>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.edu.au>, Ingo Ruczinski <iruczins@jhsph.edu>, Rafael A Irizarry
7 7
 Maintainer: Benilton S Carvalho <Benilton.Carvalho@cancer.org.uk>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.EDU.AU>
... ...
@@ -14,20 +14,19 @@ Depends: R (>= 2.13.0),
14 14
 Imports: affyio (>= 1.19.2),
15 15
          ellipse,
16 16
          genefilter (>= 1.33.0),
17
-	 ff (>= 2.2-1),
18 17
          mvtnorm,
19 18
          preprocessCore (>= 1.13.4),
20 19
          splines,
21 20
          stats,
22 21
          SNPchip,
23 22
          utils,
24
-	 lattice
23
+	 lattice,
24
+	 ff
25 25
 Suggests: hapmapsnp6,
26 26
           genomewidesnp6Crlmm (>= 1.0.4),
27 27
           snpStats,
28 28
 	  ellipse,
29
-	  RUnit,
30
-	  ff (>= 2.2-1)
29
+	  RUnit
31 30
 Collate: AllGenerics.R
32 31
 	 AllClasses.R
33 32
 	 methods-AssayData.R
... ...
@@ -42,9 +41,10 @@ Collate: AllGenerics.R
42 41
 	 crlmmGT2.R
43 42
          crlmm-illumina.R
44 43
 	 snprma-functions.R
44
+	 cn-functions.R
45 45
          utils.R
46 46
          zzz.R
47 47
 	 test_crlmm_package.R
48 48
 LazyLoad: yes
49 49
 biocViews: Microarray, Preprocessing, SNP, Bioinformatics, CopyNumberVariants
50
-Packaged: 2011-04-30 17:10:59 UTC; biocbuild
50
+Packaged: 2011-04-30 17:10:59 UTC; biocbuild
51 51
\ No newline at end of file
... ...
@@ -1,4 +1,5 @@
1 1
 useDynLib("crlmm", .registration=TRUE)
2
+
2 3
 ##---------------------------------------------------------------------------
3 4
 ## Biobase
4 5
 ##---------------------------------------------------------------------------
... ...
@@ -22,7 +23,7 @@ importFrom(Biobase, assayDataElement, assayDataElementNames,
22 23
 ##---------------------------------------------------------------------------
23 24
 ## oligoClasses
24 25
 ##---------------------------------------------------------------------------
25
-importClassesFrom(oligoClasses, SnpSuperSet, AlleleSet, CNSet)
26
+importClassesFrom(oligoClasses, SnpSuperSet, AlleleSet, CNSet)##, ff_or_matrix)
26 27
 importMethodsFrom(oligoClasses, allele, calls, "calls<-", confs,
27 28
 		  "confs<-", cnConfidence, "cnConfidence<-", isSnp,
28 29
 		  chromosome, position, A, B,
... ...
@@ -49,9 +50,9 @@ importFrom(stats, coef, cov, dnorm, kmeans, lm, mad, median, quantile, sd, updat
49 50
 importFrom(genefilter, rowSds)
50 51
 importFrom(mvtnorm, dmvnorm)
51 52
 importFrom(ellipse, ellipse)
52
-importFrom(ff, ffdf, physical.ff, physical.ffdf, ffrowapply)
53
+##importFrom(ff, ffdf, physical.ff, physical.ffdf, ffrowapply)
53 54
 
54
-importClassesFrom(oligoClasses, ff_matrix, ffdf)
55
+##importClassesFrom(oligoClasses, ff_matrix, ffdf)
55 56
 ##exportMethods(lines)
56 57
 exportMethods(CA, CB)
57 58
 export(crlmm,
... ...
@@ -70,5 +71,5 @@ export(crlmm,
70 71
 export(genotypes, totalCopynumber, rawCopynumber, xyplot)
71 72
 exportMethods(A, B, corr, nuA, nuB, phiA, phiB, predictionRegion, posteriorProbability, tau2, Ns, medians, mads,
72 73
 	      xyplot, calculateRBaf)
73
-export(ABpanel, constructInf, preprocessInf, genotypeInf)
74
+export(ABpanel, constructInf, preprocessInf, genotypeInf, validCEL, celDates)
74 75
 exportClasses(PredictionRegion)
... ...
@@ -1,4 +1,4 @@
1 1
 setOldClass("ellipse")
2 2
 setOldClass("ffdf")
3
-setClassUnion("ff_or_matrix", c("ffdf", "ff_matrix", "matrix"))
3
+##setClassUnion("ff_or_matrix", c("ffdf", "ff_matrix", "matrix"))
4 4
 setClass("PredictionRegion", contains="list")
5 5
new file mode 100644
... ...
@@ -0,0 +1,117 @@
1
+copynumber <- function(filenames,
2
+		       batch,
3
+		       summaries=c("lrr", "baf", "ca", "cb", "gt", "gtconf"),
4
+		       write=FALSE,
5
+		       outdir=".",
6
+		       onefile=FALSE,
7
+		       rda=TRUE){
8
+	fnamelist <- split(filenames, batch)
9
+	batchlist <- split(batch, batchlist)
10
+	## for each element in list.  Avoid
11
+	foreach(i=fnamelist) %do% simpleusage(filenames=fnameslist[[i]], batch=batchlist[[i]])
12
+}
13
+
14
+simpleusage <- function(filenames, batch, ...){
15
+	object <- genotype(fnamelist, batch, ...)
16
+	## run genotype on each element in fnamelist
17
+	object <- genotypeSummary(object)
18
+	## different than currently implemented
19
+	##    - shrink to saved batch medians, etc.
20
+	##    - shinkage across markers (?)
21
+	object <- shrinkSummary(object)
22
+	results <- array(NA, nrow(object), ncol(object), length(summaries))
23
+	if(c("ca", "cb") %in% summaries){
24
+		## estimateCnParameters
25
+		##results <- estimatecnParameters(...)
26
+	} else results <- NULL
27
+	is.baf <- "baf" %in% summaries || "lrr" %in% summaries
28
+	if(is.baf){
29
+		results2 <- calculateRtheta(object) ## returns list
30
+		## keep as list, or coerce to array
31
+	}
32
+	if(write){
33
+		##write2(, onefile=onefile)
34
+	}
35
+	return(results)
36
+}
37
+
38
+
39
+imputeTheta <- function(ia, ib, theta, S=100){
40
+##	y <- rbind(y1, y2)
41
+	y <- theta
42
+	N <- nrow(y); p <- ncol(y)
43
+
44
+	mu0 <- apply(y, 2, mean, na.rm=TRUE)
45
+	sd0 <- mu0/5
46
+	L0 <- matrix(.1, p, p); diag(L0) <- 1; L0 <- L0*outer(sd0, sd0)
47
+	nu0 <- p+2; S0 <- L0
48
+
49
+	## starting values
50
+	Sigma <- S0
51
+	Y.full <- y
52
+	O <- 1*(!is.na(y))
53
+	if(all(O[1, ] == 0)){
54
+		return(rep(NA, length(ia)))
55
+	}
56
+	for(j in seq_len(p)) Y.full[is.na(Y.full[, j]), j] <- mu0[j]
57
+	## Gibbs sampler
58
+	##THETA <- SIGMA <- Y.MISS <- NULL
59
+	## problems:  approx. 90 observations for the means in the other batches
60
+	##   -- only 3 observations for the mean in the current batch.
61
+##	THETA <- matrix(NA, iter, p)
62
+##	SIGMA <- matrix(NA, iter, p^2)
63
+	Y.MISS <- matrix(NA, S, sum(O[1,]==0))
64
+	bafs <- matrix(NA, S, length(a))
65
+	for(s in seq_len(S)){
66
+		## update lambda
67
+		ybar <- apply(Y.full, 2, mean)
68
+		Ln <- solve(solve(L0) + N*solve(Sigma))
69
+		mun <- Ln%*%(solve(L0)%*%mu0 + N*solve(Sigma)%*%ybar)
70
+		lambda <- rmvnorm(1, mun, Ln)
71
+
72
+		##update sigma
73
+		Sn <- S0+(t(Y.full)-c(lambda))%*%t(t(Y.full)-c(lambda))
74
+		Sigma <- solve(rwish(nu0+N, solve(Sn)))
75
+
76
+		## update missing data (only care about the first row.)
77
+		a <- O[1, ] == 1
78
+		b <- O[1, ] == 0
79
+		iSa <- solve(Sigma[a,a])
80
+		beta.j <- Sigma[b,a]%*%iSa
81
+		Sigma.j <- Sigma[b,b]-Sigma[b,a]%*%iSa%*%Sigma[a,b]
82
+		lambda.j <- lambda[b]+beta.j%*%(t(Y.full[1,a, drop=FALSE]-lambda[a]))
83
+		Y.full[1,b] <- rmvnorm(1, lambda.j, Sigma.j)
84
+		##SIGMA[s, ] <- c(Sigma)
85
+		Y.MISS[s, ] <- Y.full[1, O[1, ]==0]
86
+	}
87
+	na.cols <- which(is.na(y[1, ]))
88
+	THETA <- matrix(NA, S, 3)
89
+	THETA[, na.cols] <- Y.MISS
90
+	if(length(na.cols) < length(ia))
91
+		THETA[, -na.cols] <- y[1, -na.cols]
92
+
93
+	obs.theta <- atan2(ib, ia)*2/pi
94
+	theta.aa <- matrix(THETA[, 1], S, 3)
95
+	theta.ab <- matrix(THETA[, 2], S, 3)
96
+	theta.bb <- matrix(THETA[, 3], S, 3)
97
+	lessAA <- obs.theta < theta.aa
98
+	lessAB <- obs.theta < theta.ab
99
+	lessBB <- obs.theta < theta.bb
100
+	grAA <- !lessAA ## >= theta.aa
101
+	grAB <- !lessAB ## >= theta.ab
102
+	grBB <- !lessBB ## >= theta.bb
103
+	##not.na <- !is.na(theta.aa)
104
+	I1 <- grAA & lessAB
105
+	I2 <- grAB & lessBB
106
+	##mu <- apply(Y.MISS, 2, mean)
107
+	##bf <- matrix(NA, S, length(ia))
108
+	bf <- rep(NA, S*length(ia))
109
+	I1 <- as.logical(I1); I2 <- as.logical(I2)
110
+	bf[I1] <- 0.5 * as.numeric(((obs.theta-theta.aa)/(theta.ab-theta.aa)))[I1]
111
+	bf[I2] <- as.numeric((.5 * (obs.theta - theta.ab) / (theta.bb - theta.ab)))[I2] + 0.5
112
+	bf[as.logical(lessAA)] <- 0
113
+	bf[as.logical(grBB)] <- 1
114
+	bf <- matrix(bf, S, 3, byrow=TRUE)
115
+	pm.bf <- apply(bf, 2, mean)
116
+	return(pm.bf)
117
+}
... ...
@@ -53,9 +53,9 @@ getFeatureData <- function(cdfName, copynumber=FALSE){
53 53
 	index <- match(rownames(snpProbes), rownames(M)) #only snp probes in M get assigned position
54 54
 	M[index, "position"] <- snpProbes[, grep("pos", colnames(snpProbes))]
55 55
 	M[index, "chromosome"] <- chromosome2integer(snpProbes[, grep("chr", colnames(snpProbes))])
56
-	M[index, "isSnp"] <- 1L
57
-	##index <- which(is.na(M[, "isSnp"]))
58 56
 	##M[index, "isSnp"] <- 1L
57
+	index <- which(is.na(M[, "isSnp"]))
58
+	M[index, "isSnp"] <- 1L
59 59
 	if(copynumber){
60 60
 		index <- match(rownames(cnProbes), rownames(M)) #only snp probes in M get assigned position
61 61
 		M[index, "position"] <- cnProbes[, grep("pos", colnames(cnProbes))]
... ...
@@ -65,9 +65,14 @@ getFeatureData <- function(cdfName, copynumber=FALSE){
65 65
 	##A few of the snpProbes do not match -- I think it is chromosome Y.
66 66
 	if(any(is.na(M[, "chromosome"])))
67 67
 		M[is.na(M[, "chromosome"]), "isSnp"] <- 1L
68
-	M <- data.frame(M)
69
-	return(new("AnnotatedDataFrame", data=M))
68
+	##return(new("AnnotatedDataFrame", data=M))
69
+	new("GenomeAnnotatedDataFrame",
70
+	    position=as.integer(M[, "position"]),
71
+	    chromosome=as.integer(M[, "chromosome"]),
72
+	    isSnp=as.logical(M[, "isSnp"]),
73
+	    row.names=featurenames)
70 74
 }
75
+
71 76
 getFeatureData.Affy <- getFeatureData
72 77
 
73 78
 construct <- function(filenames,
... ...
@@ -188,12 +193,18 @@ snprmaAffy <- function(cnSet, filenames,
188 193
 	stopifnot(require(pkgname, character.only=TRUE, quietly=!verbose))
189 194
 	mixtureParams <- initializeBigMatrix("crlmmMixt-", 4, length(filenames), "double")
190 195
 	sampleBatches <- splitIndicesByNode(seq(along=filenames))
191
-	ocLapply(sampleBatches, rsprocessCEL, filenames=filenames,
192
-		 fitMixture=TRUE, A=A(cnSet), B=B(cnSet), C=calls(cnSet),
196
+	ocLapply(sampleBatches,
197
+		 rsprocessCEL,
198
+		 filenames=filenames,
199
+		 fitMixture=TRUE,
200
+		 A=A(cnSet), B=B(cnSet), C=calls(cnSet),
193 201
 		 D=snpCallProbability(cnSet),
194 202
 		 SKW=cnSet$SKW, SNR=cnSet$SNR,
195
-		 mixtureParams=mixtureParams, eps=eps, seed=seed,
196
-		 mixtureSampleSize=mixtureSampleSize, pkgname=pkgname,
203
+		 mixtureParams=mixtureParams,
204
+		 eps=eps,
205
+		 seed=seed,
206
+		 mixtureSampleSize=mixtureSampleSize,
207
+		 pkgname=pkgname,
197 208
 		 neededPkgs=c("crlmm", pkgname))
198 209
 	if(verbose) message("Cloning A and B matrices to store genotype calls and confidence scores.")
199 210
 	return(mixtureParams)
... ...
@@ -215,15 +226,14 @@ genotype <- function(filenames,
215 226
 		     gender=NULL,
216 227
 		     returnParams=TRUE,
217 228
 		     badSNP=0.7){
218
-	stopifnot(require("ff"))
229
+	stopifnot(isPackageLoaded("ff"))
219 230
 	cnSet <- constructAffy(filenames=filenames,
220 231
 			       cdfName=cdfName,
221 232
 			       batch=batch, verbose=verbose)
222 233
 	if(!(is(A(cnSet), "ff") || is(A(cnSet), "ffdf"))){
223 234
 		stop("The ff package is required for this function.")
224 235
 	}
225
-
226
-	mixtureParams <- snprmaAffy(cnSet, filenames=filenames,
236
+	cnSet@mixtureParams <- snprmaAffy(cnSet, filenames=filenames,
227 237
 				    mixtureSampleSize=mixtureSampleSize,
228 238
 				    eps=eps,
229 239
 				    seed=seed,
... ...
@@ -233,7 +243,7 @@ genotype <- function(filenames,
233 243
 			verbose=verbose)
234 244
 	stopifnot(ok)
235 245
 	ok <- genotypeAffy(cnSet=cnSet,
236
-			   mixtureParams=mixtureParams,
246
+			   mixtureParams=cnSet@mixtureParams,
237 247
 			   SNRMin=SNRMin,
238 248
 			   recallMin=recallMin,
239 249
 			   recallRegMin=recallRegMin,
... ...
@@ -444,7 +444,7 @@ readBPM <- function(bpmFile){
444 444
 
445 445
 
446 446
 readGenCallOutput = function(file, path=".", cdfName,
447
-    colnames=list("SampleID"="Sample ID", "SNPID"="SNP Name", "XRaw"="X Raw", "YRaw"="Y Raw"), 
447
+    colnames=list("SampleID"="Sample ID", "SNPID"="SNP Name", "XRaw"="X Raw", "YRaw"="Y Raw"),
448 448
     type=list("SampleID"="character", "SNPID"="character", "XRaw"="integer", "YRaw"="integer"), verbose=FALSE) {
449 449
 
450 450
     if(!identical(names(type), names(colnames)))
... ...
@@ -462,9 +462,9 @@ readGenCallOutput = function(file, path=".", cdfName,
462 462
 	sepp=s[2]
463 463
 	a1=unlist(strsplit(tmp[10][1],s[2]))
464 464
     }
465
-    
465
+
466 466
     if(sum(is.na(match(colnames,a1)))!=0)
467
-	stop("Cannot find required columns: ", colnames[is.na(match(colnames,a1))], " in ", file, "\nPlease check whether this data was exported.") 
467
+	stop("Cannot find required columns: ", colnames[is.na(match(colnames,a1))], " in ", file, "\nPlease check whether this data was exported.")
468 468
 
469 469
     m1=m=match(a1,colnames)
470 470
     m[is.na(m1)==FALSE] =type[m1[!is.na(m1)]]
... ...
@@ -472,19 +472,19 @@ readGenCallOutput = function(file, path=".", cdfName,
472 472
     names(m) = names(colnames)[m1]
473 473
 
474 474
     fc = file(file.path(path, file), open="r")
475
-	
475
+
476 476
     dat = scan(fc, what=m, skip=10,sep=sepp)
477 477
     close(fc)
478
-	
478
+
479 479
     samples = unique(dat$"SampleID")
480 480
     nsamples = length(samples)
481 481
     snps = unique(dat$"SNPID")
482 482
     nsnps = length(snps)
483 483
     if(verbose)
484 484
         cat("Check ordering for samples","\n")
485
-    
485
+
486 486
     X = Y = zeroes = matrix(0, nsnps, nsamples)
487
-	
487
+
488 488
     for(i in 1:length(samples)) {
489 489
         ind = dat$"SampleID"==samples[i]
490 490
 	    if(sum(dat$"SNPID"[ind]==snps)==nsnps) {
... ...
@@ -502,23 +502,23 @@ readGenCallOutput = function(file, path=".", cdfName,
502 502
         }
503 503
         gc()
504 504
     }
505
-	
505
+
506 506
     zeroes=(X=="0"|Y=="0")
507 507
     colnames(X) = colnames(Y) =  colnames(zeroes) = samples
508 508
     rownames(X) = rownames(Y) = snps
509
-    
510
-    if(verbose)      
509
+
510
+    if(verbose)
511 511
         cat("Creating NChannelSet object\n")
512
-   
512
+
513 513
     XY = new("NChannelSet", X = initializeBigMatrix(name = "X", nr = nrow(X), nc = ncol(X), vmode = "integer", initdata=X),
514 514
 			 Y = initializeBigMatrix(name = "Y", nr = nrow(X), nc = ncol(X), vmode = "integer", initdata=Y),
515 515
 			 zero = initializeBigMatrix(name = "zero", nr = nrow(X), nc = ncol(X), vmode = "integer", initdata=zeroes),
516 516
 			 annotation = cdfName, storage.mode = "environment")
517 517
     sampleNames(XY)=colnames(X)
518
-	
518
+
519 519
     if(verbose)
520 520
       cat("Done\n")
521
-	  
521
+
522 522
     XY
523 523
 }
524 524
 
... ...
@@ -855,7 +855,7 @@ crlmmIllumina = function(RG, XY, stripNorm=TRUE, useTarget=TRUE,
855 855
                   mixtureSampleSize=10^5, eps=0.1, verbose=TRUE,
856 856
                   cdfName, sns, recallMin=10, recallRegMin=1000,
857 857
                   returnParams=FALSE, badSNP=.7) {
858
-				  
858
+
859 859
 if(missing(cdfName)) {
860 860
    if(!missing(RG))
861 861
       cdfName = annotation(RG)
... ...
@@ -864,7 +864,7 @@ if(missing(cdfName)) {
864 864
 }
865 865
 if(!isValidCdfName(cdfName))
866 866
    stop("cdfName not valid.  see validCdfNames")
867
-     
867
+
868 868
 #  if (save.it & (missing(snpFile) | missing(cnFile)))
869 869
 #    stop("'snpFile' and/or 'cnFile' is missing and you chose to save.it")
870 870
 #  if (load.it & missing(snpFile))
... ...
@@ -1213,11 +1213,11 @@ constructInf <- function(sampleSheet=NULL,
1213 1213
 		     call=initializeBigMatrix(name="call", nr, nc),
1214 1214
 		     callProbability=initializeBigMatrix(name="callPr", nr,nc),
1215 1215
 		     annotation=cdfName,
1216
+		     featureData=featureData,
1216 1217
 		     batch=batch)
1217 1218
         if (!is.null(sampleSheet) && !is.null(sampleSheet$Sample_ID)) {
1218 1219
 	   	sampleNames(cnSet) = sampleSheet$Sample_ID
1219
-        }
1220
-        else sampleNames(cnSet) = arrayNames
1220
+        } else  sampleNames(cnSet) = arrayNames
1221 1221
 	if(saveDate){
1222 1222
 		protocolData = getProtocolData.Illumina(grnidats, sep=sep, fileExt=fileExt$green, verbose=verbose)
1223 1223
 	} else{
... ...
@@ -1225,7 +1225,7 @@ constructInf <- function(sampleSheet=NULL,
1225 1225
 	}
1226 1226
 	rownames(pData(protocolData)) = sampleNames(cnSet)
1227 1227
 	protocolData(cnSet) = protocolData
1228
-	featureData(cnSet) = featureData
1228
+	##featureData(cnSet) = featureData
1229 1229
 	featureNames(cnSet) = featureNames(featureData)
1230 1230
 	cnSet$gender <- initializeBigVector("gender", ncol(cnSet), vmode="integer")
1231 1231
 	cnSet$SNR = initializeBigVector("crlmmSNR-", ncol(cnSet), "double")
... ...
@@ -1248,10 +1248,10 @@ preprocessInf <- function(cnSet,
1248 1248
 		       stripNorm=TRUE,
1249 1249
 		       useTarget=TRUE,
1250 1250
 		       mixtureSampleSize=10^5,
1251
-		       fitMixture=TRUE,
1251
+			  fitMixture=TRUE,
1252 1252
 		       eps=0.1,
1253 1253
 		       verbose=TRUE,
1254
-		       seed=1){
1254
+		       seed=1, cdfName){
1255 1255
 	stopifnot(all(c("gender", "SNR", "SKW") %in% varLabels(cnSet)))
1256 1256
 	open(A(cnSet))
1257 1257
 	open(B(cnSet))
... ...
@@ -1304,7 +1304,8 @@ genotypeInf <- function(cnSet, mixtureParams, probs=rep(1/3,3),
1304 1304
 			returnParams=TRUE,
1305 1305
 			badSNP=0.7,
1306 1306
 			gender=NULL,
1307
-			DF=6){
1307
+			DF=6,
1308
+			cdfName){
1308 1309
 	is.snp = isSnp(cnSet)
1309 1310
 	snp.index = which(is.snp)
1310 1311
 ##	narrays = ncol(cnSet)
... ...
@@ -1379,6 +1380,7 @@ genotype.Illumina <- function(sampleSheet=NULL,
1379 1380
 	if(missing(cdfName)) stop("must specify cdfName")
1380 1381
 	if(!isValidCdfName(cdfName)) stop("cdfName not valid.  see validCdfNames")
1381 1382
 	stopifnot(!missing(batch))
1383
+	if(is(batch, "factor")) batch <- as.character(batch)
1382 1384
         pkgname <- getCrlmmAnnotationName(cdfName)
1383 1385
 	message("Instantiate CNSet container.")
1384 1386
 	cnSet <- constructInf(sampleSheet=sampleSheet,
... ...
@@ -1408,7 +1410,8 @@ genotype.Illumina <- function(sampleSheet=NULL,
1408 1410
 				    fitMixture=fitMixture,
1409 1411
 				    eps=eps,
1410 1412
 				    verbose=verbose,
1411
-				    seed=seed)
1413
+				    seed=seed,
1414
+				       cdfName=cdfName)
1412 1415
 	message("Preprocessing complete.  Begin genotyping...")
1413 1416
 	genotypeInf(cnSet=cnSet, mixtureParams=mixtureParams,
1414 1417
 		   probs=probs,
... ...
@@ -1419,7 +1422,8 @@ genotype.Illumina <- function(sampleSheet=NULL,
1419 1422
 		   returnParams=returnParams,
1420 1423
 		   badSNP=badSNP,
1421 1424
 		   gender=gender,
1422
-		   DF=DF)
1425
+		   DF=DF,
1426
+		    cdfName=cdfName)
1423 1427
 	return(cnSet)
1424 1428
 }
1425 1429
 
... ...
@@ -216,3 +216,14 @@ setMethod("annotatedDataFrameFrom", "ffdf", Biobase:::annotatedDataFrameFromMatr
216 216
 getBAF <- function(theta, canonicalTheta)
217 217
     .Call('normalizeBAF', theta, ct)
218 218
 
219
+
220
+validCEL <- function(celfiles){
221
+	for(i in seq_along(celfiles)){
222
+		res <- tryCatch(read.celfile(celfiles[i], intensity.means.only=TRUE), error=function(e) NULL)
223
+		if(is.null(res)) {
224
+			msg <- message("Problem reading ", celfiles[i])
225
+			stop(msg)
226
+		}
227
+	}
228
+	return("Successfully read all cel files")
229
+}
219 230
Binary files a/data/cnSetExample.rda and b/data/cnSetExample.rda differ
220 231
Binary files a/data/cnSetExample2.rda and b/data/cnSetExample2.rda differ
221 232
similarity index 100%
222 233
rename from inst/doc/copynumber.Rnw
223 234
rename to inst/doc/AffyGW.Rnw
224 235
deleted file mode 100644
... ...
@@ -1,11 +0,0 @@
1
-%\VignetteIndexEntry{Preprocessing and genotyping Affymetrix array for copy number analysis}
2
-%\VignetteDepends{}
3
-%\VignetteKeywords{crlmm, SNP 6}
4
-%\VignettePackage{crlmm}
5
-
6
-%%% The real .Rnw is under inst/scripts.
7
-%%% It cannot be added here because it depends on external
8
-%%% data not yet available as a BioC package
9
-\documentclass{article}
10
-\begin{document}
11
-\end{document}
12 0
\ No newline at end of file
13 1
deleted file mode 100644
14 2
Binary files a/inst/doc/AffymetrixPreprocessCN.pdf and /dev/null differ
... ...
@@ -1,9 +1,8 @@
1
-All: copynumber clean
1
+All: vignettes clean
2 2
 
3 3
 ## all pdfs must be generated from the first target of the Makefile
4
-copynumber: copynumber.tex
5
-	cp ../scripts/copynumber.pdf .
6
-	cp ../scripts/AffymetrixPreprocessCN.pdf .
4
+vignettes: AffyGW.tex
5
+	cp ../scripts/AffyGW.pdf .
7 6
 	cp ../scripts/IlluminaPreprocessCN.pdf .
8 7
 	cp ../scripts/Infrastructure.pdf .
9 8
 	cp ../scripts/crlmmDownstream.pdf .
10 9
deleted file mode 100644
11 10
Binary files a/inst/doc/copynumber.pdf and /dev/null differ
12 11
deleted file mode 100644
13 12
Binary files a/inst/doc/illumina_copynumber.pdf and /dev/null differ
14 13
similarity index 88%
15 14
rename from inst/scripts/AffymetrixPreprocessCN.Rnw
16 15
rename to inst/scripts/AffyGW.Rnw
... ...
@@ -6,6 +6,7 @@
6 6
 \documentclass{article}
7 7
 \usepackage{graphicx}
8 8
 \usepackage{natbib}
9
+\usepackage{amsmath}
9 10
 \usepackage{url}
10 11
 \newcommand{\Rfunction}[1]{{\texttt{#1}}}
11 12
 \newcommand{\Rmethod}[1]{{\texttt{#1}}}
... ...
@@ -74,8 +75,7 @@ pathToCels <- "/thumper/ctsa/snpmicroarray/hapmap/raw/affy/1m"
74 75
 if(getRversion() < "2.13.0"){
75 76
 	rpath <- getRversion()
76 77
 } else rpath <- "trunk"
77
-##outdir <- paste("/amber1/archive/oncbb/rscharpf/crlmm_vignette/", rpath, sep="")
78
-outdir <- paste("/amber1/archive/oncbb/rscharpf/crlmm_vignette/", rpath, "/gw6v1.5/", sep="")
78
+outdir <- paste("/amber1/archive/oncbb/rscharpf/crlmm_vignette/", rpath, "/gw6/", sep="")
79 79
 dir.create(outdir, recursive=TRUE, showWarnings=FALSE)
80 80
 @
81 81
 
... ...
@@ -87,7 +87,6 @@ the genotyping step will be stored in \verb+outdir+.
87 87
 ldPath(outdir)
88 88
 @
89 89
 
90
-
91 90
 % only needed if cacheing
92 91
 <<cachedir, echo=FALSE>>=
93 92
 setCacheDir(outdir)
... ...
@@ -152,11 +151,7 @@ corrupt cel file. To check if any of the files are corrupt, try
152 151
 reading the files in one at a time:
153 152
 
154 153
 <<checkcorrupt,eval=FALSE>>=
155
-require(affyio)
156
-for(i in seq_along(celFiles)) {
157
-	cat("Reading ", i, "\n")
158
-	invisible(read.celfile(celFiles[i], intensity.means.only=TRUE))
159
-}
154
+validCEL(celFiles)
160 155
 @
161 156
 
162 157
 
... ...
@@ -188,10 +183,12 @@ saveObject <- function(outdir, cnSet) {
188 183
 (cnset.saved <- saveObject(outdir, cnSet))
189 184
 @
190 185
 
191
-Users can proceed to the \verb+copynumber+ vignette for copy number
192
-analyses.  See the \verb+Infrastructure+ vignette for additional
193
-details on the \Robject{CNSet} class, including an overview of the
194
-available accessors.
186
+%Users can proceed to the \verb+copynumber+ vignette for copy number
187
+%analyses.  See the \verb+Infrastructure+ vignette for additional
188
+%details on the \Robject{CNSet} class, including an overview of the
189
+%available accessors.
190
+
191
+\SweaveInput{copynumber}
195 192
 
196 193
 
197 194
 \section{Session information}
... ...
@@ -200,4 +197,20 @@ toLatex(sessionInfo())
200 197
 @
201 198
 
202 199
 
200
+\begin{figure}[f]
201
+  \begin{center}
202
+  \includegraphics[width=0.6\textwidth]{AffyGW-snr.pdf}
203
+  \caption{The signal to noise ratio (SNR) for 180 HapMap samples. For
204
+    Affymetrix platforms, SNR values below 5 can indicate possible
205
+    problems with sample quality.  In some circumstances, it may be
206
+    more helpful to exclude samples with poor DNA quality.}
207
+\end{center}
208
+\end{figure}
209
+
210
+
211
+\begin{bibliography}
212
+  \bibliographystyle{plain}
213
+  \bibliography{refs}
214
+\end{bibliography}
215
+
203 216
 \end{document}
204 217
new file mode 100644
205 218
Binary files /dev/null and b/inst/scripts/AffyGW.pdf differ
206 219
deleted file mode 100644
207 220
Binary files a/inst/scripts/AffymetrixPreprocessCN.pdf and /dev/null differ
... ...
@@ -5,6 +5,7 @@
5 5
 \documentclass{article}
6 6
 \usepackage{graphicx}
7 7
 \usepackage{natbib}
8
+\usepackage{amsmath}
8 9
 \usepackage[margin=1in]{geometry}
9 10
 \newcommand{\crlmm}{\Rpackage{crlmm}}
10 11
 \newcommand{\Rfunction}[1]{{\texttt{#1}}}
... ...
@@ -84,15 +85,15 @@ ocProbesets(150e3)
84 85
 ocSamples(500)
85 86
 @
86 87
 
87
-\section{Initializing a container for storing processed data}
88
-
89
-This section will initialize a container for storing processed forms
90
-of the data, including the normalized intensities for the A and B
91
-alleles and the CRLMM genotype calls and confidence scores.  In
92
-addition, the container will store information on the markers
93
-(physical position, chromosome, and a SNP indicator), the batch, and
94
-the samples (e.g., gender).  To construct this container for Infinium
95
-platforms, several steps are required.
88
+%\section{Initializing a container for storing processed data}
89
+%
90
+%This section will initialize a container for storing processed forms
91
+%of the data, including the normalized intensities for the A and B
92
+%alleles and the CRLMM genotype calls and confidence scores.  In
93
+%addition, the container will store information on the markers
94
+%(physical position, chromosome, and a SNP indicator), the batch, and
95
+%the samples (e.g., gender).  To construct this container for Infinium
96
+%platforms, several steps are required.
96 97
 
97 98
 We begin by specifying the path containing the raw IDAT files for a
98 99
 set of samples from the Infinium 370k platform.
... ...
@@ -143,129 +144,140 @@ processed at similar times (e.g., within a few weeks).
143 144
 batch <- rep("1", nrow(samplesheet))
144 145
 @
145 146
 
146
-Finally, we initialize an object of class \Robject{CNSet} using the
147
-function \Rfunction{constructInf}.
148
-
149
-<<container,cache=TRUE>>=
150
-cnSet <- constructInf(sampleSheet=samplesheet,
151
-		      arrayNames=arrayNames,
152
-		      batch=batch,
153
-		      arrayInfoColNames=arrayInfo,
154
-		      cdfName=cdfName,
155
-		      verbose=TRUE,
156
-		      saveDate=TRUE)
157
-@
158
-
159
-A concise summary of the object's contents can be viewed with the
160
-\Rfunction{print} function.
161
-
162
-<<cnset>>=
163
-print(cnSet)
164
-@
165
-
166
-Note that the above object does not yet contain any processed data
167
-(only \verb+NA+'s).  As the elements of the \verb+assayData+ slot are
168
-\Robject{ff} objects (not matrices), several \verb+.ff+ files now
169
-appear in the \verb+outdir+. The \verb+.ff+ files should not be
170
-removed and can be listed using the \R{} function
171
-\Rfunction{list.files}.  %For the most part, the \emph{appearance} that
147
+%Finally, we initialize an object of class \Robject{CNSet} using the
148
+%function \Rfunction{constructInf}.
149
+%
150
+%<<container,cache=TRUE>>=
151
+%cnSet <- constructInf(sampleSheet=samplesheet,
152
+%		      arrayNames=arrayNames,
153
+%		      batch=batch,
154
+%		      arrayInfoColNames=arrayInfo,
155
+%		      cdfName=cdfName,
156
+%		      verbose=TRUE,
157
+%		      saveDate=TRUE)
158
+%@
159
+%
160
+%A concise summary of the object's contents can be viewed with the
161
+%\Rfunction{print} function.
162
+%
163
+%<<cnset>>=
164
+%print(cnSet)
165
+%@
166
+%
167
+%Note that the above object does not yet contain any processed data
168
+%(only \verb+NA+'s).  As the elements of the \verb+assayData+ slot are
169
+%\Robject{ff} objects (not matrices), several \verb+.ff+ files now
170
+%appear in the \verb+outdir+. The \verb+.ff+ files should not be
171
+%removed and can be listed using the \R{} function
172
+%\Rfunction{list.files}.  %For the most part, the \emph{appearance} that
172 173
 %the data is stored in memory is preserved.
173
-
174
-<<listff>>=
175
-sapply(assayData(cnSet), function(x) class(x)[1])
176
-list.files(outdir, pattern=".ff")[1:5]
177
-@
178
-
179
-\section{Preprocessing}
174
+%
175
+%<<listff>>=
176
+%sapply(assayData(cnSet), function(x) class(x)[1])
177
+%list.files(outdir, pattern=".ff")[1:5]
178
+%@
179
+%
180
+\section{Preprocessing and genotyping}
180 181
 
181 182
 The raw intensities from the Infinium IDAT files are read and
182 183
 normalized using the function \Rfunction{preprocessInf}.  The function
183 184
 \Rfunction{preprocessInf} returns a \Robject{ff} object containing the
184 185
 parameters for the mixture model used by the CRLMM genotyping
185
-algorithm.
186
-
187
-<<preprocess,cache=TRUE, results=hide>>=
188
-mixtureParams <- preprocessInf(cnSet=cnSet, sampleSheet=samplesheet, arrayNames=arrayNames, arrayInfoColNames=arrayInfo)
189
-@
190
-
191
-<<showMixtureParams>>=
192
-invisible(open(mixtureParams))
193
-str(mixtureParams[])
194
-invisible(close(mixtureParams))
195
-@
196
-
197
-Note that the normalized intensities for the A and B alleles are no
198
-longer \verb+NA+s and can be inspected using the methods \Rfunction{A}
199
-and \Rfunction{B}, respectively.
200
-
201
-<<intensities>>=
202
-invisible(open(A(cnSet)))
203
-invisible(open(B(cnSet)))
204
-as.matrix(A(cnSet)[1:5, 1:5])
205
-as.matrix(B(cnSet)[1:5, 1:5])
206
-invisible(close(A(cnSet)))
207
-invisible(close(B(cnSet)))
208
-@
209
-
210
-\section{Genotyping}
211
-
212
-CRLMM genotype calls and confidence scores are estimated using the
213
-function \Rfunction{genotypeInf}.
214
-
215
-<<genotype,cache=TRUE>>=
216
-updated <- genotypeInf(cnSet, mixtureParams=mixtureParams)
217
-@
218
-\vspace{-0.5em}
219
-<<>>=
220
-updated
221
-@
222
-
223
-\textbf{Wrapper:} As an alternative to calling the functions
224
-\Rfunction{constructInf}, \Rfunction{preprocessInf} and
225
-\Rfunction{genotypeInf} in sequence, a convenience function called
226
-\Rfunction{genotype.Illumina} is a wrapper for the above functions and
227
-produces identical results.
186
+algorithm.  Following preprocessing, the \Rfunction{genotypeInf}
187
+genotypes the samples. The function \Rfunction{genotype.Illumina} is a
188
+wrapper to the above functions and returns an object of class
189
+\Rclass{CNSet}.
190
+
191
+%<<preprocess,cache=TRUE, results=hide>>=
192
+%mixtureParams <- preprocessInf(cnSet=cnSet, sampleSheet=samplesheet,
193
+%			       arrayNames=arrayNames,
194
+%			       arrayInfoColNames=arrayInfo,
195
+%			       cdfName=cdfName)
196
+%@
197
+%
198
+%<<showMixtureParams>>=
199
+%invisible(open(mixtureParams))
200
+%str(mixtureParams[])
201
+%invisible(close(mixtureParams))
202
+%@
203
+%
204
+%Note that the normalized intensities for the A and B alleles are no
205
+%longer \verb+NA+s and can be inspected using the methods \Rfunction{A}
206
+%and \Rfunction{B}, respectively.
207
+%
208
+%<<intensities>>=
209
+%invisible(open(A(cnSet)))
210
+%invisible(open(B(cnSet)))
211
+%as.matrix(A(cnSet)[1:5, 1:5])
212
+%as.matrix(B(cnSet)[1:5, 1:5])
213
+%invisible(close(A(cnSet)))
214
+%invisible(close(B(cnSet)))
215
+%@
216
+%
217
+%\section{Genotyping}
218
+%
219
+%CRLMM genotype calls and confidence scores are estimated using the
220
+%function \Rfunction{genotypeInf}.
221
+%
222
+%<<genotype,cache=TRUE>>=
223
+%updated <- genotypeInf(cnSet, mixtureParams=mixtureParams, cdfName=cdfName)
224
+%@
225
+%\vspace{-0.5em}
226
+%<<>>=
227
+%updated
228
+%@
229
+%
230
+%\textbf{Wrapper:} As an alternative to calling the functions
231
+%\Rfunction{constructInf}, \Rfunction{preprocessInf} and
232
+%\Rfunction{genotypeInf} in sequence, a convenience function called
233
+%\Rfunction{genotype.Illumina} is a wrapper for the above functions and
234
+%produces identical results.
228 235
 
229 236
 <<genotype.Illumina,cache=TRUE,results=hide>>=
230
-cnSet2 <- genotype.Illumina(sampleSheet=samplesheet,
237
+cnSet <- genotype.Illumina(sampleSheet=samplesheet,
231 238
 			    arrayNames=arrayNames,
232 239
 			    arrayInfoColNames=arrayInfo,
233 240
 			    cdfName="human370v1c",
234 241
 			    batch=batch)
235 242
 @
236 243
 
237
-<<checkIdenticalCalls>>=
238
-invisible(open(calls(cnSet)))
239
-invisible(open(calls(cnSet2)))
240
-snp.index <- which(isSnp(cnSet))
241
-identical(calls(cnSet)[snp.index, 1:20], calls(cnSet2)[snp.index, 1:20])
242
-invisible(close(calls(cnSet)))
243
-invisible(close(calls(cnSet2)))
244
-@
245 244
 
246
-To fully remove the data associated with the \Robject{cnSet2} object,
247
-one should use the \Rfunction{delete} function in the \Rpackage{ff}
248
-package followed by the \Rfunction{rm} function.  The following code
249
-is not evaluated is it would change the results of the cached
250
-computations in the previous code chunk.
245
+Note, to fully remove the data associated with the \Robject{cnSet2}
246
+object, one should use the \Rfunction{delete} function in the
247
+\Rpackage{ff} package followed by the \Rfunction{rm} function.  The
248
+following code is not evaluated is it would change the results of the
249
+cached computations in the previous code chunk.
251 250
 
252 251
 <<delete, eval=FALSE>>=
253
-lapply(assayData(cnSet2), delete)
254
-lapply(batchStatistics(cnSet2), delete)
255
-delete(cnSet2$gender)
256
-delete(cnSet2$SNR)
257
-delete(cnSet2$SKW)
258
-rm(cnSet2)
252
+lapply(assayData(cnSet), delete)
253
+lapply(batchStatistics(cnSet), delete)
254
+delete(cnSet$gender)
255
+delete(cnSet$SNR)
256
+delete(cnSet$SKW)
257
+rm(cnSet)
259 258
 @
260 259
 
261
-Users can proceed to the \verb+copynumber+ vignette for copy number
262
-analyses.  See the \verb+Infrastructure+ vignette for additional
263
-details on the \Robject{CNSet} class, including an overview of the
264
-available accessors.
260
+%\section{Copy number estimation}
261
+
262
+\SweaveInput{copynumber}
265 263
 
266 264
 \section{Session information}
267 265
 <<sessionInfo, results=tex>>=
268 266
 toLatex(sessionInfo())
269 267
 @
270 268
 
269
+\begin{figure}[f]
270
+  \begin{center}
271
+  \includegraphics[width=0.6\textwidth]{IlluminaPreprocessCN-snr.pdf}
272
+  \caption{The signal to noise ratio (SNR) for 180 HapMap samples. For
273
+    Affymetrix platforms, SNR values below 5 can indicate possible
274
+    problems with sample quality.  In some circumstances, it may be
275
+    more helpful to exclude samples with poor DNA quality.}
276
+\end{center}
277
+\end{figure}
278
+
279
+
280
+\bibliography{refs}
281
+\bibliographystyle{plain}
282
+
271 283
 \end{document}
272 284
Binary files a/inst/scripts/IlluminaPreprocessCN.pdf and b/inst/scripts/IlluminaPreprocessCN.pdf differ
... ...
@@ -1,35 +1,3 @@
1
-%\VignetteIndexEntry{crlmm copy number Vignette}
2
-%\VignetteDepends{crlmm, genomewidesnp6Crlmm}
3
-%\VignetteKeywords{crlmm, SNP 6}
4
-%\VignettePackage{crlmm}
5
-\documentclass{article}
6
-\usepackage{graphicx}
7
-\usepackage{natbib}
8
-\usepackage{url}
9
-\usepackage{amsmath,amssymb}
10
-\newcommand{\Rfunction}[1]{{\texttt{#1}}}
11
-\newcommand{\Rmethod}[1]{{\texttt{#1}}}
12
-\newcommand{\Rcode}[1]{{\texttt{#1}}}
13
-\newcommand{\Robject}[1]{{\texttt{#1}}}
14
-\newcommand{\Rpackage}[1]{{\textsf{#1}}}
15
-\newcommand{\Rclass}[1]{{\textit{#1}}}
16
-\newcommand{\oligo}{\Rpackage{oligo }}
17
-\newcommand{\R}{\textsf{R}}
18
-\newcommand{\crlmm}{\Rpackage{crlmm}}
19
-\usepackage[margin=1in]{geometry}
20
-
21
-\begin{document}
22
-\title{Copy number estimation with \Rpackage{crlmm}}
23
-\date{\today}
24
-\author{Rob Scharpf}
25
-\maketitle
26
-
27
-<<setup, echo=FALSE, results=hide>>=
28
-options(continue=" ", width=70)
29
-@
30
-
31
-\begin{abstract}
32
-
33 1
   Copy number routines in the \crlmm{} package are available for
34 2
   Affymetrix 5.0 and 6.0 platforms, as well as several Illumina
35 3
   platforms.  This vignette assumes that the arrays have already been
... ...
@@ -44,35 +12,6 @@ options(continue=" ", width=70)
44 12
   \crlmm{} package is available from the author's website:
45 13
   \url{http://www.biostat.jhsph.edu/~rscharpf/crlmmCompendium/index.html}.
46 14
 
47
-\end{abstract}
48
-
49
-\section{Set up}
50
-
51
-<<libraries,results=hide>>=
52
-library(ff)
53
-library(crlmm)
54
-library(lattice)
55
-library(cacheSweave)
56
-if(getRversion() < "2.13.0"){
57
-	rpath <- getRversion()
58
-} else rpath <- "trunk"
59
-outdir <- paste("/thumper/ctsa/snpmicroarray/rs/ProcessedData/crlmm/", rpath, "/copynumber_vignette", sep="")
60
-@
61
-
62
-<<ram>>=
63
-ldPath(outdir)
64
-setCacheDir(outdir)
65
-ocProbesets(150e3)
66
-ocSamples(200)
67
-@
68
-
69
-We begin by loading the \Robject{cnSet} object created by the
70
-\verb+AffymetrixPreprocessCN+ vignette.
71
-
72
-<<loadcnset>>=
73
-if(!exists("cnSet")) load(file.path(outdir, "cnSet.rda"))
74
-@
75
-
76 15
 
77 16
 \textbf{Limitations:} While a minimum number of samples is not
78 17
 required for preprocessing and genotyping, copy number estimation in
... ...
@@ -100,24 +39,16 @@ quality.  The following code chunk makes a histogram of the SNR values
100 39
 for the HapMap samples.
101 40
 
102 41
 <<snr,fig=TRUE,include=FALSE,width=6, height=4>>=
42
+library(lattice)
103 43
 invisible(open(cnSet$SNR))
104 44
 snr <- cnSet$SNR[]
105 45
 close(cnSet$SNR)
106
-print(histogram(~snr, panel=function(...){
107
-	panel.histogram(...)
108
-	panel.abline(v=5, col="grey",lty=2)
109
-},
110
-		breaks=25,xlim=c(4.5,10), xlab="SNR"))
46
+print(histogram(~snr,
47
+		panel=function(...){
48
+			panel.histogram(...)},
49
+		breaks=25, xlim=range(snr), xlab="SNR"))
111 50
 @
112
-\begin{figure}[t!]
113
-  \begin{center}
114
-  \includegraphics[width=0.6\textwidth]{copynumber-snr.pdf}
115
-  \caption{The signal to noise ratio (SNR) for 180 HapMap samples. For
116
-    Affymetrix platforms, SNR values below 5 can indicate possible
117
-    problems with sample quality.  In some circumstances, it may be
118
-    more helpful to exclude samples with poor DNA quality.}
119
-\end{center}
120
-\end{figure}
51
+
121 52
 
122 53
 
123 54
 \section{Copy number estimation}
... ...
@@ -149,29 +80,17 @@ computed for subsets of the markers to reduce the required RAM. Note
149 80
 that the value returned by the \Rfunction{crlmmCopynumber} function in
150 81
 the above example is \verb+TRUE+.  The reason the function returns
151 82
 \verb+TRUE+ in the above example is that the elements of the
152
-\verb+batchStatistics+ slot have the class \Rclass{ff\_matrix}.  Rather
153
-than keep the statistical summaries in memory, the summaries are
154
-written to files on disk using protocols described in the
83
+\verb+batchStatistics+ slot have the class \Rclass{ff\_matrix}.
84
+Rather than keep the statistical summaries in memory, the summaries
85
+are written to files on disk using protocols described in the
155 86
 \Rpackage{ff} package. Hence, while the \Robject{cnSet} object itself
156 87
 is unchanged as a result of the \Rfunction{crlmmCopynumber} function,
157 88
 the data on disk is updated accordingly.  Users that are interested in
158 89
 accessing these low-level summaries can refer to the
159
-\verb+Infrastructure+ vignette.  Computation of the raw copy number
160
-estimates for each allele is described in the following section.
161
-
162
-For users that are interested in the analysis of a specific chromosome
163
-(subset of markers) or a s
164
-
165
-pointers to files on disk, are stored in the \verb+batchStatistics+
166
-slot of the class \Rclass{CNSet}.  Using the default settings for the
167
-\Rfunction{crlmmCopynumber} function, only an object of class
168
-\Rclass{CNSet} is required.  %(See \verb+AffyPreprocessCN+ or
169
-%\verb+IlluminaPreprocessCN+ vignettes for details.)
170
-
171
-Note that depends on whether the elements of the \verb+batchStatistics+
172
-slot are \Robject{ff} objects or ordinary matrices.  In this example,
173
-the elements of \verb+batchStatistics+ have the class
174
-\Rclass{ff\_matrix}.
90
+\verb+Infrastructure+ vignette.  Note that the data structure depends
91
+on whether the elements of the \verb+batchStatistics+ slot are
92
+\Robject{ff} objects or ordinary matrices.  In this example, the
93
+elements of \verb+batchStatistics+ have the class \Rclass{ff\_matrix}.
175 94
 
176 95
 <<classes>>=
177 96
 nms <- ls(batchStatistics(cnSet))
... ...
@@ -186,10 +105,8 @@ described in the \R{} package \Rpackage{ff}. The value returned by
186 105
 \Rfunction{crlmmCopynumber} is \verb+TRUE+, indicating that the files
187 106
 on disk have been successfully updated.  Note that while the
188 107
 \Robject{cnSet} object is unchanged, the values on disk are different.
189
-
190
-
191
-\noindent On the other hand, subsetting the \Robject{cnSet} with the
192
-`[' method coerces all of the elements to class \Rclass{matrix}. The
108
+On the other hand, subsetting the \Robject{cnSet} with the `[' method
109
+coerces all of the elements to class \Rclass{matrix}. The
193 110
 batch-specific summaries are now ordinary matrices stored in RAM. The
194 111
 object returned by \Robject{crlmmCopynumber} is an object of class
195 112
 \Rclass{CNSet} with the matrices in the \verb+batchStatistics+ slot
... ...
@@ -219,14 +136,18 @@ class(cnSet3)
219 136
 rm(cnSet2); gc()
220 137
 @
221 138
 
222
-\subsection{Raw copy number}
139
+\subsection{Marker-specific estimates}
140
+%\subsection{Raw copy number}
141
+
142
+%\paragraph{Log R ratios and B allele frequencies.}
223 143
 
144
+\paragraph{Raw total copy number.}
224 145
 Several functions are available that will compute relatively quickly
225 146
 the allele-specific, \emph{raw} copy number estimates. At allele $k$,
226 147
 marker $i$, sample $j$, and batch $p$, the estimate of allele-specific
227 148
 copy number is computed by subtracting the estimated background from
228
-the normalized intensity and scaling by the slope coefficient. More formally,
229
-\newcommand{\A}{A} \newcommand{\B}{B}
149
+the normalized intensity and scaling by the slope coefficient. More
150
+formally, \newcommand{\A}{A} \newcommand{\B}{B}
230 151
 \begin{eqnarray}
231 152
   \label{eq:cnK}
232 153
 {\hat c}_{k,ijp} &=& \mbox{max}\left\{\frac{1}{{\hat
... ...
@@ -246,9 +167,9 @@ at all markers for the first 2 samples, and the total copy number for
246 167
 chromosome 20 for the first 50 samples.
247 168
 
248 169
 <<totalCopynumber>>=
249
-tmp <- totalCopynumber(cnSet, i=1:nrow(cnSet), j=1:2)
170
+tmp <- totalCopynumber(cnSet, i=seq_len(nrow(cnSet)), j=1:2)
250 171
 dim(tmp)
251
-tmp2 <- totalCopynumber(cnSet, i=which(chromosome(cnSet) == 20), j=1:50)
172
+tmp2 <- totalCopynumber(cnSet, i=which(chromosome(cnSet) == 20), j=seq_len(ncol(cnSet)))
252 173
 dim(tmp2)
253 174
 @
254 175
 
... ...
@@ -258,98 +179,12 @@ chunk computes the allele-specific summaries at all polymorphic loci.
258 179
 
259 180
 <<ca>>=
260 181
 snp.index <- which(isSnp(cnSet) & !is.na(chromosome(cnSet)))
261
-ca <- CA(cnSet, i=snp.index, j=1:5)
262
-cb <- CB(cnSet, i=snp.index, j=1:5)
263
-@
264
-
265
-\noindent Note the equivalence of the following calculations.
266
-
267
-<<totalCopynumber>>=
268
-ct <- ca+cb
269
-ct2 <- totalCopynumber(cnSet, i=snp.index, j=1:5)
270
-stopifnot(all.equal(ct, ct2))
271
-@
272
-
273
-At nonpolymorphic loci, \Rfunction{CA} function returns the total copy
274
-number and, by construction, the \Rfunction{CB} function returns 0.
275
-
276
-<<nonpolymorphicAutosomes>>=
277
-marker.index <- which(!isSnp(cnSet))
278
-ct <- CA(cnSet, i=marker.index, j=1:5)
279
-## all zeros
280
-stopifnot(all(CB(cnSet, i=marker.index, j=1:5) == 0))
281
-ct2 <- totalCopynumber(cnSet, i=marker.index, j=1:5)
282
-stopifnot(all.equal(ct, ct2))
283
-@
284
-
285
-In the following code chunk, we extract estimates of the total copy
286
-number at nonpolymorphic markers on chromosome X.
287
-
288
-<<npx>>=
289
-set.seed(123)
290
-npx.index <- which(chromosome(cnSet)==23 & !isSnp(cnSet))
291
-M <- sample(which(cnSet$gender[]==1), 5)
292
-F <- sample(which(cnSet$gender[]==2), 5)
293
-cn.M <- CA(cnSet, i=npx.index, j=M)
294
-cn.F <- CA(cnSet, i=npx.index, j=F)
295
-@
296
-
297
-\noindent Again, the function \Rfunction{totalCopynumber} is
298
-equivalent.
299
-
300
-<<npx2>>=
301
-cnX <- cbind(cn.M, cn.F)
302
-cnX2 <- totalCopynumber(cnSet, i=npx.index, j=c(M, F))
303
-stopifnot(all.equal(cnX, cnX2))
182
+ca <- CA(cnSet, i=snp.index, j=seq_len(ncol(cnSet)))
183
+cb <- CB(cnSet, i=snp.index, j=seq_len(ncol(cnSet)))
304 184
 @
305 185
 
306
-<<nonpolymorphicX,fig=TRUE,include=FALSE, echo=FALSE, eval=FALSE>>=
307
-df <- data.frame(cn=as.numeric(cnX), id=factor(rep(sampleNames(cnSet)[c(M,F)], each=nrow(cnX)), levels=sampleNames(cnSet)[c(M,F)], ordered=T))
308
-library(RColorBrewer)
309
-cols <- brewer.pal(8, "Accent")[c(1, 5)]
310
-print(bwplot(cn~id,df, panel=function(x,y,...){
311
-	panel.grid(v=-10,h=0)
312
-	panel.bwplot(x,y, ...)
313
-	panel.abline(h=1:2, col="grey70",lwd=2)
314
-},
315
-	scales=list(x=list(rot=90)), cex=0.5, ylab="total copy number", main="Chr X SNPs",
316
-       fill=cols[cnSet$gender[c(M,F)]], key=mykey))
317
-@
318
-
319
-%\begin{figure}[t!]
320
-% \centering
321
-% \includegraphics[width=0.5\textwidth]{copynumber-nonpolymorphicX}
322
-% \caption{Copy number estimates for nonpolymorphic loci on chromosome
323
-%   X (5 men, 5 women).  We assume that the median copy number across
324
-%   samples at a given marker on X is 1 for men and 2 for women.}
325
-%\end{figure}
326
-
327
-Polymorphic markers on chromosome X:
328 186
 
329
-<<polymorphicX,fig=TRUE,include=FALSE>>=
330
-library(RColorBrewer)
331
-cols <- brewer.pal(8, "Accent")[c(1, 5)]
332
-X.markers <- which(isSnp(cnSet) & chromosome(cnSet) == 23)
333
-cnX <- totalCopynumber(cnSet, i=X.markers, j=c(M,F))
334
-df <- data.frame(cn=as.numeric(cnX), id=factor(rep(sampleNames(cnSet)[c(M,F)], each=length(X.markers)), levels=sampleNames(cnSet)[c(M,F)], ordered=T))
335
-mykey <- simpleKey(c("male", "female"), points=FALSE,col=cols)
336
-print(bwplot(cn~id,df, panel=function(x,y,...){
337
-	panel.grid(v=-10,h=0)
338
-	panel.bwplot(x,y, ...)
339
-	panel.abline(h=1:2, col="grey70",lwd=2)
340
-},
341
-	scales=list(x=list(rot=90)), cex=0.5, ylab="total copy number", main="Chr X SNPs",
342
-       fill=cols[cnSet$gender[c(M,F)]], key=mykey))
343
-@
344
-\begin{figure}[t!]
345
- \centering
346
- \includegraphics[width=0.5\textwidth]{copynumber-polymorphicX}
347
- \caption{Copy number estimates for polymorphic markers on chromosome
348
-   X. crlmm assumes that the median copy number across samples at a
349
-   given marker on X is 1 for men and 2 for women.}
350
-\end{figure}
351
-
352
-\subsection{A container for raw copy number}
187
+\subsection{Container for raw copy number}
353 188
 
354 189
 A useful container for storing the \crlmm{} genotypes, genotype
355 190
 confidence scores, and the total copy number at each marker is the
... ...
@@ -362,28 +197,6 @@ instantiated \Rclass{oligoSnpSet} will also be \Rpackage{ff}-dervied
362 197
 objects (a new \verb+total_cn*.ff+ file will be created in the
363 198
 \verb+ldPath()+ directory).
364 199
 
365
-%The advantage of this approach is that the raw copy number estimates
366
-%can be computed for all markers and all samples simultaneously without
367
-%excessive RAM.  The disadvantage is that such a step creates
368
-%additional I/O for reading and writing the raw copy number estimates
369
-%from disk.  For very large data sets (1000+ samples), the time
370
-%required for I/O may outweigh the benefits.  The alternative is to
371
-%perform downstream processing (such as smoothing) on the total copy
372
-%number for subsets of markers and/or samples.  As indicated
373
-%previously, the subset `[' method applied to an object of class
374
-%\Robject{CNSet} will return an object of the same class with simple
375
-%matrices instead of \Rpackage{ff} objects.  Therefore, one can
376
-%instantiate a \Rpackage{oligoSnpSet} object from a \Robject{CNSet}
377
-%object without the I/O overhead for storing the total copy number
378
-%estimates by first subsetting the \Rclass{CNSet} object.  One could
379
-%then smooth the raw copy number estimates and store a summary copy
380
-%number for the interval, thereby substantially reducing the
381
-%dimensionality of the copy number estimates.  Examples of smoothing
382
-%applications such as circular binary segmentation implemented in the
383
-%\Rpackage{DNAcopy} package or a hidden Markov model implemented in the
384
-%\Rpackage{VanillaICE} package are provided in the \verb+SmoothingCN+
385
-%vignette.
386
-
387 200
 <<oligoSnpSet>>=
388 201
 open(cnSet3)
389 202
 oligoSet <- as(cnSet3, "oligoSnpSet")
... ...
@@ -398,19 +211,6 @@ the \Rfunction{totalCopynumber} function defined over the same row and
398 211
 column indices.
399 212
 
400 213
 <<testEqual>>=
401
-total.cn3 <- totalCopynumber(cnSet3, i=1:nrow(cnSet3), j=1:ncol(cnSet3))
214
+total.cn3 <- totalCopynumber(cnSet3, i=1:nrow(cnSet3), j=seq_len(ncol(cnSet3)))
402 215
 all.equal(copyNumber(oligoSet), total.cn3)
403
-@
404
-
405
-
406
-\section{Session information}
407
-<<sessionInfo, results=tex>>=
408
-toLatex(sessionInfo())
409
-@
410
-
411
-%\begin{bibliography}
412
-  \bibliographystyle{plain}
413
-  \bibliography{refs}
414
-%\end{bibliography}
415
-
416
-\end{document}
216
+@
417 217
\ No newline at end of file
418 218
deleted file mode 100644
419 219
Binary files a/inst/scripts/copynumber.pdf and /dev/null differ
420 220
new file mode 100644
... ...
@@ -0,0 +1,17 @@
1
+TOP=../..
2
+PKG=${shell cd ${TOP};pwd}
3
+SUITE=doRUnit.R
4
+#R=${R_HOME}/bin/R
5
+
6
+all: inst test
7
+
8
+inst: # Install package
9
+
10
+	cd ${TOP}/..;\
11
+	R CMD INSTALL ${PKG}
12
+
13
+test: # Run unit tests
14
+	export RCMDCHECK=FALSE;\
15
+	export RUNITFILEPATTERN="$(file)";\
16
+	cd ${TOP}/tests;\
17
+	R --vanilla --slave < ${SUITE}
0 18
\ No newline at end of file
1 19
new file mode 100644
... ...
@@ -0,0 +1,6 @@
1
+test_dataExamples <- function(){
2
+	data(cnSetExample)
3
+	checkTrue(validObject(cnSetExample))
4
+	data(cnSetExample2)
5
+	checkTrue(validObject(cnSetExample2))
6
+}
0 7
new file mode 100644
... ...
@@ -0,0 +1,45 @@
1
+genotypingTest <- function(){
2
+	require(genomewidesnp6Crlmm) & require(hapmapsnp6)
3
+	path <- system.file("celFiles", package="hapmapsnp6")
4
+	cels <- list.celfiles(path, full.names=TRUE)
5
+	ocProbesets(50e3)
6
+	batch <- as.factor(rep("A", length(cels)))
7
+	(cnSet <- genotype(cels, cdfName="genomewidesnp6", batch=batch))
8
+	library(ff)
9
+	ldPath(tempdir())
10
+	(cnSet2 <- genotype(cels, cdfName="genomewidesnp6", batch=batch))
11
+	checkTrue(all.equal(calls(cnSet), calls(cnSet2)[,]))
12
+}
13
+
14
+genotypingTestIllumina <- function(){
15
+	setwd("/thumper/ctsa/snpmicroarray/illumina/IDATS/370k/")
16
+	library(crlmm)
17
+	library(ff)
18
+	ldPath(tempdir())
19
+
20
+	samples <- read.csv("samples370k.csv", as.is=TRUE)
21
+	RG <- readIdatFiles(sampleSheet=samples,
22
+			    arrayInfoColNames=list(barcode=NULL, position="SentrixPosition"),
23
+			    saveDate=TRUE)
24
+
25
+	crlmmResult <-  crlmmIllumina(RG=RG,
26
+				      cdfName="human370v1c",
27
+				      returnParams=TRUE)
28
+	checkTrue(is(calls(crlmmResult)[1:5,1], "integer"))
29
+
30
+	crlmmResult2 <- crlmmIlluminaV2(sampleSheet=samples,
31
+					arrayInfoColNames=list(barcode=NULL, position="SentrixPosition"),
32
+					saveDate=TRUE, cdfName="human370v1c", returnParams=TRUE)
33
+	checkTrue(identical(calls(crlmmResult)[1:5, ]),
34
+		  identical(calls(crlmmResult2)[1:5, ]))
35
+
36
+	crlmmResult3 <- genotype.Illumina(sampleSheet=samples,
37
+					  arrayInfoColNames=list(barcode=NULL,
38
+					  position="SentrixPosition"),
39
+					  saveDate=TRUE, cdfName="human370v1c",
40
+					  batch = as.factor(rep(1, nrow(samples))))
41
+
42
+	checkTrue(identical(calls(crlmmResult)[1:5, ]),
43
+		  identical(calls(crlmmResult3)[1:5, ]))
44
+
45
+}
... ...
@@ -35,9 +35,7 @@ ABpanel(x, y, predictRegion, copyNumber = 0:4, fill, ..., subscripts)
35 35
     See \code{xyplot} in the \pkg{lattice} package.
36 36
 }
37 37
 }
38
-\details{
39
-%%  ~~ If necessary, more details than the description above ~~
40
-}
38
+
41 39
 \value{
42 40
   Not applicable
43 41
 }
... ...
@@ -38,13 +38,10 @@ calculateRBaf(object, batch.name)
38 38
 \author{Lynn Mireless}
39 39
 
40 40
 \examples{
41
-
42 41
 data(cnSetExample)
43 42
 baf.lrr <- calculateRBaf(cnSetExample, "SHELF")
44 43
 hist(baf.lrr[["baf"]], breaks=100)
45 44
 hist(baf.lrr[["lrr"]], breaks=100)
46
-
47
-
48 45
 }
49 46
 % Add one or more standard keywords, see file 'KEYWORDS' in the
50 47
 % R documentation directory.
51 48
deleted file mode 100644
... ...
@@ -1,19 +0,0 @@
1
-\name{celDates}
2
-\alias{celDates}
3
-\title{Extract dates from the cel file header}
4
-\description{
5
-  Extract dates from the cel file header.
6
-}
7
-\usage{
8
-celDates(celfiles)
9
-}
10
-\arguments{
11
-  \item{celfiles}{CEL file names.  Must specify the complete path.}
12
-}
13
-\value{
14
-  date-time class POSIXt
15
-}
16
-\author{R. Scharpf}
17
-\seealso{ \cite{\link[affyio]{read.celfile.header}}, \cite{\link{POSIXt}}}
18
-\keyword{manip}
19
-
20 0
new file mode 100644
... ...
@@ -0,0 +1,41 @@
1
+\name{validCEL}
2
+\alias{celDates}
3
+\alias{validCEL}
4
+\title{
5
+  Reads cel files and return an error if a file is not read
6
+}
7
+\description{
8
+  Reads cel files and return an error if a file is not read
9
+}
10
+\usage{
11
+validCEL(celfiles)
12
+celDates(celfiles)
13
+}
14
+\arguments{
15
+  \item{celfiles}{
16
+    vector of cel file names to read
17
+}
18
+}
19
+\value{
20
+  Returns a message that cel files were successfully read, or an error
21
+  if there were problems reading the cel files.
22
+}
23
+
24
+\author{
25
+R. Scharpf
26
+}
27
+
28
+\seealso{ \cite{\link[affyio]{read.celfile.header}},
29
+  \cite{\link{POSIXt}},
30
+  \code{\link{read.celfile}}
31
+}
32
+
33
+\examples{
34
+if(require(hapmapsnp6)){
35
+  path <- system.file("celFiles", package="hapmapsnp6")
36
+  cels <- list.celfiles(path, full.names=TRUE)
37
+  validCEL(cels)
38
+  celDates(cels)
39
+}
40
+}
41
+\keyword{IO}
0 42
\ No newline at end of file
... ...
@@ -23,85 +23,13 @@ data(cnSetExample2)
23 23
   The data illustrates the \code{CNSet-class}, with
24 24
 	\code{assayData} containing the quantile-normalized
25 25
 	intensities for the A and B alleles, genotype calls and
26
-	confidence scores.  New slots that specific to copy number
27
-	estimation are \code{batch} and \code{batchStatistics}.
26
+	confidence scores.
28 27
 
29 28
 }
30 29
 \examples{
31
-\dontshow{
32 30
 \dontrun{
33
-
34
-      ## hapmap phase 3 data
35
-      data(hapmapSet, package="CnvScripts")
36
-      marker.index <- which(chromosome(hapmapSet) == 8)
37
-      marker.index <- marker.index[1:60e3]
38
-      cnSetExample <- hapmapSet[marker.index, c(1168:1169)]
39
-      save(cnSetExample, file="~/Software/crlmm/data/cnSetExample.rda")