Browse code

added classes/methods for intermediate files and copy number analysis; added wrapper for preprocessing and genotypes.

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@40309 bc3139a8-67e5-0310-9ffc-ced21a209358

Rob Scharp authored on 25/06/2009 12:54:08
Showing13 changed files

... ...
@@ -163,3 +163,17 @@ is decoded and scanned
163 163
 2009-06-08 R Scharpf - committed version 1.3.4
164 164
 
165 165
 * fixed bug in the biasAdjNP function.  updated biasAdj function
166
+
167
+2009-06-20 R Scharpf - committed version 1.3.5
168
+
169
+* created CrlmmSetList, ABset, and CopyNumberSet classes.  
170
+  - added a few methods to support these classes
171
+
172
+* crlmmWrapper does normalization / genotyping and optionally splits
173
+  the results by chromosome
174
+
175
+* several changes to the copynumber vignette
176
+
177
+
178
+
179
+
... ...
@@ -1,15 +1,21 @@
1 1
 Package: crlmm
2 2
 Type: Package
3 3
 Title: Genotype Calling (CRLMM) and Copy Number Analysis tool for Affymetrix SNP 5.0 and 6.0 and Illumina arrays.
4
-Version: 1.3.4
4
+Version: 1.3.5
5 5
 Date: 2009-06-17
6 6
 Author: Rafael A Irizarry, Benilton S Carvalho <bcarvalh@jhsph.edu>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.EDU.AU>
7 7
 Maintainer: Benilton S Carvalho <bcarvalh@jhsph.edu>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.EDU.AU>
8 8
 Description: Faster implementation of CRLMM specific to SNP 5.0 and 6.0 arrays, as well as a copy number tool specific to 6.0.
9 9
 License: Artistic-2.0
10
-Imports: affyio, preprocessCore, utils, stats, genefilter, splines, Biobase (>= 2.5.3), mvtnorm
11
-Suggests: hapmapsnp5, hapmapsnp6, genomewidesnp5Crlmm (>= 1.0.2), genomewidesnp6Crlmm (>= 1.0.2), methods, GGdata, snpMatrix
10
+Depends: methods
11
+Imports: affyio, preprocessCore, utils, stats, genefilter, splines, Biobase (>= 2.5.3), mvtnorm, oligoClasses, ellipse, methods
12
+Suggests: hapmapsnp5, hapmapsnp6, genomewidesnp5Crlmm (>= 1.0.2), genomewidesnp6Crlmm (>= 1.0.2), GGdata, snpMatrix
12 13
 Collate: AllClasses.R
14
+	 AllGenerics.R
15
+	 methods-ABset.R
16
+	 methods-CopyNumberSet.R
17
+	 methods-CrlmmSetList.R
18
+	 methods-eSet.R
13 19
          methods-SnpSet.R
14 20
          cnrma-functions.R
15 21
          crlmm-functions.R
... ...
@@ -1,10 +1,83 @@
1 1
 useDynLib("crlmm", .registration=TRUE)
2
-import(Biobase)
2
+
3
+##importClassesFrom(methods, ANY, character, data.frame, "function", integer, matrix, numeric)
4
+
5
+importClassesFrom(Biobase, AnnotatedDataFrame, AssayData, eSet,
6
+		  SnpSet, MIAME, Versioned, VersionedBiobase, Versions)
7
+
8
+importClassesFrom(oligoClasses, SnpLevelSet)
9
+
10
+importMethodsFrom(Biobase, annotation, "annotation<-", annotatedDataFrameFrom,
11
+		  assayData, "assayData<-", combine,
12
+		  experimentData, "experimentData<-",
13
+                  fData, featureData, "featureData<-", featureNames,
14
+		  fvarMetadata, fvarLabels,
15
+                  pData, phenoData, "phenoData<-", pubMedIds, rowMedians, sampleNames, scanDates,
16
+                  "scanDates<-", storageMode, updateObject)
17
+
18
+
19
+##importMethodsFrom(oligoClasses, chromosome, copyNumber, position)
20
+importFrom(oligoClasses, chromosome, copyNumber, position, calls)
21
+
22
+##importMethodsFrom(methods, initialize, show)
23
+
24
+##importFrom(methods, "@<-", callNextMethod, new, validObject)
25
+
26
+importFrom(Biobase, assayDataElement, assayDataElementNames,
27
+           assayDataElementReplace, assayDataNew, classVersion)
28
+
29
+importFrom(graphics, abline, axis, layout, legend, mtext, par, plot,
30
+           polygon, rect, segments, text, points)
31
+
32
+importFrom(grDevices, grey)
33
+
3 34
 importFrom(affyio, read.celfile.header, read.celfile)
35
+
4 36
 importFrom(preprocessCore, normalize.quantiles.use.target, normalize.quantiles)
37
+
5 38
 importFrom(utils, data, packageDescription, setTxtProgressBar, txtProgressBar)
39
+
6 40
 importFrom(stats, coef, cov, dnorm, kmeans, lm, mad, median, quantile, sd)
41
+
7 42
 importFrom(genefilter, rowSds)
43
+
8 44
 importFrom(mvtnorm, dmvnorm)
9
-export("crlmm", "list.celfiles", "computeCopynumber", "cnrma", "celDates", "crlmmIllumina", "readIdatFiles", "snprma")
10
-exportMethods("calls", "confs")
45
+
46
+importFrom(ellipse, ellipse)
47
+
48
+exportClasses(ABset, CopyNumberSet, CrlmmSetList)
49
+##S3method(ellipse, CopyNumberSet)
50
+exportMethods(calls,
51
+	      CA,
52
+	      "CA<-",
53
+	      CB,
54
+	      "CB<-",
55
+	      cnIndex,
56
+	      confs,
57
+	      plot,
58
+	      points,
59
+	      show,
60
+	      snpIndex)
61
+
62
+export(celDates,
63
+       calls,
64
+       chromosome,
65
+       cnrma,
66
+       combine,
67
+       computeCopynumber,
68
+       copyNumber,
69
+       crlmm,
70
+       crlmmIllumina,
71
+       crlmmWrapper,
72
+       ellipse,
73
+       computeEmission,
74
+       list.celfiles,
75
+       position,
76
+       readIdatFiles,
77
+       sampleNames,
78
+       scanDates,
79
+       "scanDates<-",
80
+       snprma)
81
+
82
+
83
+
... ...
@@ -1,2 +1,11 @@
1 1
 ## Class definition
2
+setClass("ABset", contains="eSet",
3
+	 prototype = prototype(
4
+	 new("VersionedBiobase",
5
+	     versions=c(classVersion("eSet"), SnpSet="1.0.0"))))
2 6
 setClass("crlmmSet", contains="eSet")
7
+setClass("CrlmmSetList", contains="list")
8
+setClass("CopyNumberSet", contains="eSet",
9
+	 prototype = prototype(
10
+	 new("VersionedBiobase",
11
+	     versions=c(classVersion("eSet"), SnpSet="1.0.0"))))
... ...
@@ -127,6 +127,127 @@ celDates <- function(celfiles){
127 127
 	return(celdts)
128 128
 }
129 129
 
130
+predictGender <- function(res, cdfName="genomewidesnp6", SNRMin=5){
131
+	pkgname <- paste(cdfName, "Crlmm", sep="")
132
+	path <- system.file("extdata", package=pkgname)
133
+        loader("23index.rda", .crlmmPkgEnv, pkgname)
134
+	index <- getVarInEnv("index")
135
+	##load(file.path(path, "23index.rda"))
136
+	XIndex <- index[[1]]	
137
+	if(class(res) != "ABset"){
138
+		A <- res$A
139
+		B <- res$B
140
+	} else{
141
+		A <- res@assayData[["A"]]
142
+		B <- res@assayData[["B"]]
143
+	}
144
+	XMedian <- apply(log2(A[XIndex,, drop=FALSE])+log2(B[XIndex,, drop=FALSE]), 2, median)/2
145
+	SNR <- res$SNR
146
+	if(sum(SNR>SNRMin)==1){
147
+		gender <- which.min(c(abs(XMedian-8.9), abs(XMedian-9.5)))
148
+	}else{
149
+		gender <- kmeans(XMedian, c(min(XMedian[SNR>SNRMin]), max(XMedian[SNR>SNRMin])))[["cluster"]]
150
+	}
151
+	return(gender)
152
+}
153
+
154
+combineIntensities <- function(res, cnrmaResult, cdfName){
155
+	rownames(res$B) <- rownames(res$A) <- res$gns
156
+	colnames(res$B) <- colnames(res$A) <- res$sns
157
+	NP <- cnrmaResult$NP
158
+	blank <- matrix(NA, nrow(NP), ncol(NP))
159
+	dimnames(blank) <- dimnames(NP)
160
+	A <- rbind(res$A, NP)
161
+	B <- rbind(res$B, blank)
162
+	aD <- assayDataNew("lockedEnvironment",
163
+			   A=A,
164
+			   B=B)
165
+	ABset <- new("ABset",
166
+		     assayData=aD,
167
+		     featureData=annotatedDataFrameFrom(A, byrow=TRUE),
168
+		     phenoData=annotatedDataFrameFrom(A, byrow=FALSE),
169
+		     annotation="genomewidesnp6")
170
+	ABset$SNR <- res$SNR
171
+	ABset$gender <- predictGender(res=res, cdfName=cdfName)
172
+	return(ABset)
173
+}
174
+
175
+harmonizeSnpSet <- function(crlmmResult, ABset){
176
+	blank <- matrix(NA, length(cnNames(ABset)), ncol(ABset))
177
+	rownames(blank) <- cnNames(ABset)
178
+	colnames(blank) <- sampleNames(ABset)
179
+	crlmmCalls <- rbind(calls(crlmmResult), blank)
180
+	crlmmConf <- rbind(confs(crlmmResult), blank)
181
+	fD <- as.matrix(fData(crlmmResult))
182
+	fD2 <- matrix(NA, nrow(blank), ncol(fD))
183
+	rownames(fD2) <- rownames(blank)
184
+	fD <- rbind(fD, fD2)
185
+	aD <- assayDataNew("lockedEnvironment",
186
+			   calls=crlmmCalls,
187
+			   callProbability=crlmmConf)
188
+	##Make crlmmResult the same dimension as ABset
189
+	fD <- new("AnnotatedDataFrame",
190
+		  data=data.frame(fD),
191
+		  varMetadata=fvarMetadata(crlmmResult))
192
+	crlmmResult <- new("SnpSet",
193
+			   call=crlmmCalls,
194
+			   callProbability=crlmmConf,
195
+			   featureData=fD,
196
+			   phenoData=phenoData(crlmmResult),
197
+			   scanDates=scanDates(ABset),
198
+			   annotation=annotation(ABset))
199
+	stopifnot(all.equal(dimnames(crlmmResult), dimnames(ABset)))
200
+	crlmmResult
201
+}
202
+
203
+harmonizeDimnamesTo <- function(object1, object2){
204
+	object1 <- object1[match(featureNames(object2), featureNames(object1)), ]
205
+	object1 <- object1[, match(sampleNames(object2), sampleNames(object1))]
206
+	stopifnot(all.equal(featureNames(object1), featureNames(object2)))
207
+	stopifnot(all.equal(sampleNames(object1), sampleNames(object2)))
208
+	return(object1)
209
+}
210
+	
211
+crlmmWrapper <- function(filenames, outdir="./", cdfName="genomewidesnp6", save.it, splitByChr=TRUE, ...){
212
+	if(!file.exists(file.path(outdir, "crlmmResult.rda"))){
213
+		crlmmResult <- crlmm(filenames=filenames, cdfName=cdfName, save.it=TRUE, ...)
214
+		if(save.it) save(crlmmResult, file=file.path(outdir, "crlmmResult.rda"))
215
+	} else {
216
+		message("Loading crlmmResult...")
217
+		load(file.path(outdir, "crlmmResult.rda"))
218
+	}
219
+
220
+	##- Make crlmmResult the same dimension as ABset
221
+	## -Create a list object that where rows and columns can be subset using '[' methods
222
+	if(!file.exists(file.path(outdir, "cnrmaResult.rda"))){
223
+		message("Quantile normalizing the copy number probes")		
224
+		cnrmaResult <- cnrma(filenames=filenames, cdfName=cdfName)
225
+		if(save.it) save(cnrmaResult, file=file.path(outdir, "cnrmaResult.rda"))
226
+	} else{
227
+		message("Loading cnrmaResult...")		
228
+		load(file.path(outdir, "cnrmaResult.rda"))
229
+	}
230
+##	loader("intensities.rda", pkgname=pkgname, envir=.crlmmPkgEnv)
231
+##	res <- get("res", envir=.crlmmPkgEnv)
232
+	load(file.path(outdir, "intensities.rda"))
233
+	ABset <- combineIntensities(res, cnrmaResult, cdfName)
234
+	scanDates(ABset) <- as.character(celDates(filenames))	
235
+	crlmmResult <- harmonizeSnpSet(crlmmResult, ABset)
236
+	stopifnot(all.equal(dimnames(crlmmResult), dimnames(ABset)))
237
+	crlmmList <- list(ABset,
238
+			  crlmmResult)
239
+	crlmmList <- as(crlmmList, "CrlmmSetList")
240
+	
241
+	if(splitByChr){
242
+		message("Saving by chromosome")
243
+		splitByChromosome(crlmmList, cdfName=cdfName, outdir=outdir)
244
+	} else{
245
+		message("Saving crlmmList object to ", outdir)
246
+		save(crlmmList, file=file.path(outdir, "crlmmList.rda"))
247
+	}
248
+	return()
249
+}
250
+
130 251
 cnrma <- function(filenames, cdfName="genomewidesnp6", sns, seed=1, verbose=FALSE){
131 252
 	pkgname <- getCrlmmAnnotationName(cdfName)
132 253
 	require(pkgname, character.only=TRUE) || stop("Package ", pkgname, " not available")
... ...
@@ -186,22 +307,29 @@ goodSnps <- function(phi.thr, envir, fewAA=20, fewBB=20){
186 307
 	return(flags)
187 308
 }
188 309
 
189
-instantiateObjects <- function(calls, conf, NP, plate, envir, chrom, A, B,
310
+instantiateObjects <- function(calls, conf, NP, plate, envir,
311
+			       chrom,
312
+			       A, B,
190 313
 			       gender, SNRmin=5, SNR,
191 314
                                pkgname,
192 315
 			       locusSet){
193
-	pkgname <- paste(pkgname, "Crlmm", sep="")
316
+##	pkgname <- paste(pkgname, "Crlmm", sep="")
194 317
 	envir[["chrom"]] <- chrom
195
-	CHR_INDEX <- paste(chrom, "index", sep="")
196
-        fname <- paste(CHR_INDEX, ".rda", sep="")
197
-        loader(fname, .crlmmPkgEnv, pkgname)
198
-        index <- get("index", envir=.crlmmPkgEnv)
199
-##	data(list=CHR_INDEX, package="genomewidesnp6Crlmm")
200
-	A <- A[index[[1]], SNR > SNRmin]
201
-	B <- B[index[[1]], SNR > SNRmin]
202
-	calls <- calls[index[[1]], SNR > SNRmin]
203
-	conf <- conf[index[[1]], SNR > SNRmin]
204
-	NP <- NP[index[[2]], SNR > SNRmin]
318
+##	CHR_INDEX <- paste(chrom, "index", sep="")
319
+##        fname <- paste(CHR_INDEX, ".rda", sep="")
320
+##        loader(fname, .crlmmPkgEnv, pkgname)
321
+##        index <- get("index", envir=.crlmmPkgEnv)
322
+####	data(list=CHR_INDEX, package="genomewidesnp6Crlmm")
323
+##	A <- A[index[[1]], SNR > SNRmin]
324
+##	B <- B[index[[1]], SNR > SNRmin]
325
+##	calls <- calls[index[[1]], SNR > SNRmin]
326
+##	conf <- conf[index[[1]], SNR > SNRmin]
327
+##	NP <- NP[index[[2]], SNR > SNRmin]
328
+	A <- A[, SNR > SNRmin]
329
+	B <- B[, SNR > SNRmin]
330
+	calls <- calls[, SNR > SNRmin]
331
+	conf <- conf[, SNR > SNRmin]
332
+	NP <- NP[, SNR > SNRmin]
205 333
 	plate <- plate[SNR > SNRmin]
206 334
 	uplate <- unique(plate)
207 335
 	SNR <- SNR[SNR > SNRmin]
... ...
@@ -279,7 +407,7 @@ instantiateObjects <- function(calls, conf, NP, plate, envir, chrom, A, B,
279 407
 	envir[["tau2B"]] <- tau2A
280 408
 	envir[["sig2A"]] <- tau2A
281 409
 	envir[["sig2B"]] <- tau2A
282
-	sig2T <- matrix(NA, nrow(NP), ncol(NP))
410
+	sig2T <- matrix(NA, nrow(NP), length(uplate))
283 411
 	envir[["sig2T"]] <- sig2T
284 412
 	envir[["corr"]] <- tau2A
285 413
 	envir[["corrA.BB"]] <- tau2A
... ...
@@ -294,16 +422,275 @@ instantiateObjects <- function(calls, conf, NP, plate, envir, chrom, A, B,
294 422
 	envir[["normalNP"]] <- normalNP
295 423
 }
296 424
 
425
+computeCopynumber <- function(object,
426
+			      CHR,
427
+			      bias.adj=FALSE,
428
+			      batch,
429
+			      SNRmin=5,
430
+			      cdfName="genomewidesnp6", ...){
431
+	##require(oligoClasses)
432
+	if(missing(CHR)) stop("Must specify CHR")
433
+	if(missing(batch)) stop("Must specify batch")
434
+	if(length(batch) != ncol(object[[1]])) stop("Batch must be the same length as the number of samples")
435
+	##the AB intensities
436
+	Nset <- object[[1]]
437
+	##indices of polymorphic loci
438
+	index <- snpIndex(Nset)
439
+	ABset <- Nset[index, ]
440
+	##indices of nonpolymorphic loci
441
+	NPset <- Nset[-index, ]
442
+	##genotypes/confidences
443
+	snpset <- object[[2]][index,]
444
+	##previous version of compute copy number
445
+	envir <- new.env()
446
+	message("Fitting model for copy number estimation...")
447
+	.computeCopynumber(chrom=CHR,
448
+			   A=A(ABset),
449
+			   B=B(ABset),
450
+			   calls=calls(snpset),
451
+			   conf=confs(snpset),
452
+			   NP=A(NPset),
453
+			   plate=batch,
454
+			   envir=envir,
455
+			   SNR=ABset$SNR,
456
+			   bias.adj=FALSE,
457
+			   SNRmin=SNRmin,
458
+			   ...)
459
+	message("Organizing results...")	
460
+	locusSet <- list2locusSet(envir, ABset=ABset, NPset=NPset, CHR=CHR, cdfName=cdfName)
461
+	if(bias.adj){
462
+		message("Running bias adjustment...")
463
+		.computeCopynumber(chrom=CHR,
464
+				   A=A(ABset),
465
+				   B=B(ABset),
466
+				   calls=calls(snpset),
467
+				   conf=confs(snpset),
468
+				   NP=A(NPset),
469
+				   plate=batch,
470
+				   envir=envir,
471
+				   SNR=ABset$SNR,
472
+				   bias.adj=TRUE,
473
+				   SNRmin=SNRmin,
474
+				   ...)
475
+		message("Organizing results...")			
476
+		locusSet <- list2locusSet(envir, ABset=ABset, NPset=NPset, CHR=CHR, cdfName=cdfName)
477
+		##.GlobalEnv[["locusSetAdj"]] <- locusSetAdj
478
+	}
479
+	return(locusSet)
480
+}
481
+
482
+##getCopyNumberEnvironment <- function(crlmmSetList, cnSet){
483
+##	envir <- new.env()
484
+##	envir[["A"]] <- A(crlmmSetList)[snpIndex(crlmmSetList), ]
485
+##	envir[["B"]] <- A(crlmmSetList)[snpIndex(crlmmSetList), ]
486
+##	envir[["CA"]] <- CA(cnSet)[snpIndex(cnSet), ]/100
487
+##	envir[["CB"]] <- CB(cnSet)[snpIndex(cnSet), ]/100		
488
+##	envir[["NP"]] <- A(crlmmSetList)[cnIndex(crlmmSetList), ]
489
+##	envir[["calls"]] <- calls(crlmmSetList)[snpIndex(crlmmSetList), ]
490
+##	envir[["chrom"]] <- unique(chromosome(cnSet))
491
+##	envir[["cnvs"]] <- cnNames(crlmmSetList)
492
+##	envir[["conf"]] <- conf(crlmmSetList)
493
+##	envir[["corr"]] <- fData(cnSet)$corr
494
+##	envir[["corrA.BB"]] <- fData(cnSet)$corrA.BB
495
+##	envir[["corrB.AA"]] <- fData(cnSet)$corrB.AA
496
+##	envir[["CT"]] <- CA(cnSet)[cnIndex(cnSet), ]/100
497
+##	##envir[["CT.sds"]] <- 
498
+##	##envir[["GT.A"]]
499
+##	##envir[["GT.B"]]
500
+##	##envir[["index"]]
501
+##	##envir[["muA"]]
502
+##	##envir[["muB"]]
503
+##	##envir[["normal"]]
504
+##	##envir[["normalNP"]]
505
+##	##envir[["npflags"]]
506
+##	##envir[["Ns"]]
507
+##	nuA <- fData(cnSet)[, grep("nuA", fvarLabels(cnSet))]
508
+##	nuB <- fData(cnSet)[, grep("nuB", fvarLabels(cnSet))]
509
+##	phiA <- fData(cnSet)[, grep("phiA", fvarLabels(cnSet))]
510
+##	phiB <- fData(cnSet)[, grep("phiB", fvarLabels(cnSet))]
511
+##	envir[["nuA"]] <- nuA[snpIndex(cnSet), ]
512
+##	envir[["nuB"]] <- nuB[snpIndex(cnSet), ]
513
+##	envir[["phiA"]] <- phiA[snpIndex(cnSet), ]
514
+##	envir[["phiB"]] <- phiB[snpIndex(cnSet), ]
515
+##	envir[["nuT"]] <- nuA[cnIndex(cnSet), ]
516
+##	envir[["phiT"]] <- phiA[cnIndex(cnSet), ]
517
+##	envir[["plate"]] <- cnSet$batch
518
+##	
519
+##}
520
+
521
+updateNuPhi <- function(crlmmSetList, cnSet){
522
+	##TODO: remove the use of environments.
523
+	##repopulate the environment
524
+	crlmmSetList <- crlmmSetList[, match(sampleNames(cnSet), sampleNames(crlmmSetList))]
525
+	crlmmSetList <- crlmmSetList[match(featureNames(cnSet), featureNames(crlmmSetList)), ]
526
+	##envir <- getCopyNumberEnvironment(crlmmSetList, cnSet)
527
+	Nset <- crlmmSetList[[1]]
528
+	##indices of polymorphic loci
529
+	index <- snpIndex(Nset)
530
+	ABset <- Nset[index, ]
531
+	##indices of nonpolymorphic loci
532
+	NPset <- Nset[-index, ]
533
+	##genotypes/confidences
534
+	snpset <- crlmmSetList[[2]][index,]
535
+	##previous version of compute copy number
536
+	envir <- new.env()	
537
+	message("Running bias adjustment...")
538
+##	.computeCopynumber(chrom=CHR,
539
+##			   A=A(ABset),
540
+##			   B=B(ABset),
541
+##			   calls=calls(snpset),
542
+##			   conf=confs(snpset),
543
+##			   NP=A(NPset),
544
+##			   plate=batch,
545
+##			   envir=envir,
546
+##			   SNR=ABset$SNR,
547
+##			   bias.adj=TRUE,
548
+##			   SNRmin=SNRmin,
549
+##			   ...)	
550
+}
551
+
552
+list2locusSet <- function(envir, ABset, NPset, CHR, cdfName="genomewidesnp6"){
553
+	if(missing(CHR)) stop("Must specify chromosome")
554
+	pkgname <- paste(cdfName, "Crlmm", sep="")	
555
+	path <- system.file("extdata", package=pkgname)
556
+	loader("cnProbes.rda", pkgname=pkgname, envir=.crlmmPkgEnv)
557
+	cnProbes <- get("cnProbes", envir=.crlmmPkgEnv)
558
+	loader("snpProbes.rda", pkgname=pkgname, envir=.crlmmPkgEnv)
559
+	snpProbes <- get("snpProbes", envir=.crlmmPkgEnv)	
560
+	##require(oligoClasses) || stop("oligoClasses package not available")
561
+	if(length(ls(envir)) == 0) stop("environment empty")
562
+	batch <- envir[["plate"]]
563
+
564
+	##SNP copy number	
565
+	CA <- envir[["CA"]]
566
+	dimnames(CA) <- list(envir[["snps"]], envir[["sns"]])	
567
+	CB <- envir[["CB"]]
568
+	dimnames(CB) <- dimnames(CA)
569
+
570
+	##NP copy number
571
+	CT <- envir[["CT"]]
572
+	rownames(CT) <- envir[["cnvs"]]
573
+	colnames(CT) <- envir[["sns"]]
574
+	sig2T <- envir[["sig2T"]]
575
+	rownames(sig2T) <- rownames(CT)
576
+	nuT <- envir[["nuT"]]
577
+	colnames(nuT) <- paste("nuT", unique(batch), sep="_")
578
+	##nuA <- rbind(nuA, nuT)
579
+	##nuB <- rbind(nuA, blank)
580
+	phiT <- envir[["phiT"]]
581
+	colnames(phiT) <- paste("phiT", unique(batch), sep="_")
582
+	naMatrix <- matrix(NA, nrow(CT), ncol(CT))
583
+	dimnames(naMatrix) <- dimnames(CT)
584
+	
585
+	CA <- rbind(CA, CT)
586
+	CB <- rbind(CB, naMatrix)	
587
+
588
+	##SNP parameters
589
+	tau2A <- envir[["tau2A"]]
590
+	colnames(tau2A) <- paste("tau2A", unique(batch), sep="_")
591
+	tau2B <- envir[["tau2B"]]
592
+	colnames(tau2B) <- paste("tau2B", unique(batch), sep="_")
593
+	sig2A <- envir[["sig2A"]]
594
+	colnames(sig2A) <- paste("sig2A", unique(batch), sep="_")
595
+	sig2B <- envir[["sig2B"]]
596
+	colnames(sig2B) <- paste("sig2B", unique(batch), sep="_")
597
+	nuA <- envir[["nuA"]]
598
+	colnames(nuA) <- paste("nuA", unique(batch), sep="_")
599
+	nuB <- envir[["nuB"]]
600
+	colnames(nuB) <- paste("nuB", unique(batch), sep="_")
601
+	phiA <- envir[["phiA"]]
602
+	colnames(phiA) <- paste("phiA", unique(batch), sep="_")
603
+	phiB <- envir[["phiB"]]
604
+	colnames(phiB) <- paste("phiB", unique(batch), sep="_")
605
+	corr <- envir[["corr"]]
606
+	colnames(corr) <- paste("corr", unique(batch), sep="_")
607
+	corrA.BB <- envir[["corrA.BB"]]
608
+	colnames(corrA.BB) <- paste("corrA.BB", unique(batch), sep="_")
609
+	corrB.AA <- envir[["corrB.AA"]]
610
+	colnames(corrB.AA) <- paste("corrB.AA", unique(batch), sep="_")
611
+
612
+
613
+	##Combine SNP and NP parameters
614
+	naMatrixParams <- matrix(NA, nrow(CT), length(unique(batch)))
615
+	dimnames(naMatrixParams) <- list(rownames(CT), unique(batch))
616
+	tau2A <- rbind(tau2A, naMatrixParams)
617
+	tau2B <- rbind(tau2B, naMatrixParams)
618
+	sig2A <- rbind(sig2A, sig2T)
619
+	sig2B <- rbind(sig2B, naMatrixParams)
620
+	corr <- rbind(corr, naMatrixParams)
621
+	corrA.BB <- rbind(corrA.BB, naMatrixParams)
622
+	corrB.AA <- rbind(corrB.AA, naMatrixParams)
623
+	nuA <- rbind(nuA, nuT)
624
+	phiA <- rbind(phiA, phiT)
625
+	nuB <- rbind(nuB, naMatrixParams)
626
+	phiB <- rbind(phiB, naMatrixParams)
627
+	rownames(tau2A) <- rownames(tau2B) <- rownames(sig2A) <- rownames(sig2B) <- rownames(CA)
628
+	rownames(corr) <- rownames(corrA.BB) <- rownames(corrB.AA) <- rownames(CA)
629
+	rownames(nuA) <- rownames(phiA) <- rownames(nuB) <- rownames(phiB) <- rownames(CA)	
630
+
631
+	##phenodata
632
+	phenodata <- phenoData(ABset)
633
+	phenodata <- phenodata[match(envir[["sns"]], sampleNames(phenodata)), ]
634
+	if(!("batch" %in% varLabels(phenodata))) phenodata$batch <- envir[["plate"]]
635
+
636
+	##Feature Data
637
+	position.snp <- snpProbes[match(envir[["snps"]], rownames(snpProbes)), "position"]
638
+	position.np <- cnProbes[match(envir[["cnvs"]], rownames(cnProbes)), "position"]
639
+	position <- c(position.snp, position.np)
640
+	if(!(identical(names(position), rownames(CA)))){
641
+		position <- position[match(rownames(CA), names(position))]
642
+	}
643
+	fd <- data.frame(cbind(CHR,
644
+			       position,
645
+			       tau2A,
646
+			       tau2B,
647
+			       sig2A,
648
+			       sig2B,
649
+			       nuA,
650
+			       nuB,
651
+			       phiA,
652
+			       phiB,
653
+			       corr,
654
+			       corrA.BB,
655
+			       corrB.AA))
656
+	colnames(fd)[1:2] <- c("chromosome", "position")
657
+	rownames(fd) <- rownames(CA)
658
+	fD <- new("AnnotatedDataFrame",
659
+		  data=fd,
660
+		  varMetadata=data.frame(labelDescription=colnames(fd)))	
661
+	assayData <- assayDataNew("lockedEnvironment",
662
+				  CA=CA,
663
+				  CB=CB)
664
+	cnset <- new("CopyNumberSet",
665
+		      assayData=assayData,
666
+		      featureData=fD,
667
+		      phenoData=phenodata,
668
+		      annotation="genomewidesnp6")
669
+	cnset <- thresholdCopyNumberSet(cnset)
670
+	return(cnset)
671
+}
672
+
673
+thresholdCopyNumberSet <- function(object){
674
+	ca <- CA(object)
675
+	cb <- CB(object)
676
+	ca[ca < 5] <- 5
677
+	ca[ca > 500] <- 500
678
+	cb[cb < 5] <- 5
679
+	cb[cb > 500] <- 500
680
+	CA(object) <- ca
681
+	CB(object) <- cb
682
+	return(object)
683
+}
297 684
 
298 685
 
299
-computeCopynumber <- function(chrom,
686
+.computeCopynumber <- function(chrom,
300 687
 			      A,
301 688
 			      B,
302 689
 			      calls,
303 690
 			      conf,
304 691
 			      NP,
305 692
 			      plate,
306
-			      MIN.OBS=1,
693
+			      MIN.OBS=5,
307 694
 			      envir,
308 695
 			      P,
309 696
 			      DF.PRIOR=50,
... ...
@@ -321,11 +708,18 @@ computeCopynumber <- function(chrom,
321 708
 			stop("plate must the same length as the number of columns of A")
322 709
 	}
323 710
 	set.seed(seed)
324
-	if(missing(chrom)) stop("must specify chromosome")
711
+	##if(missing(chrom)) stop("must specify chromosome")
325 712
 	if(length(ls(envir)) == 0) {
326
-		instantiateObjects(calls=calls, conf=conf, NP=NP, plate=plate,
327
-				   envir=envir, chrom=chrom, A=A, B=B,
328
-				   gender=gender, SNR=SNR, SNRmin=SNRmin,
713
+		instantiateObjects(calls=calls,
714
+				   conf=conf,
715
+				   NP=NP,
716
+				   plate=plate,
717
+				   envir=envir,
718
+				   chrom=chrom,
719
+				   A=A, B=B,
720
+				   gender=gender,
721
+				   SNR=SNR,
722
+				   SNRmin=SNRmin,
329 723
 				   pkgname=cdfName)
330 724
 	}
331 725
 	plate <- envir[["plate"]]
... ...
@@ -510,7 +904,8 @@ nonpolymorphic <- function(plateIndex, NP, envir, CONF.THR=0.99, DF.PRIOR=50, pk
510 904
 		CT[, plate==uplate[p]] <- matrix(as.integer(100*T), nrow(T), ncol(T))
511 905
 
512 906
 		##Variance for prediction region
513
-		sig2T[, plate==uplate[[p]]] <- rowMAD(log2(NP*normalNP), na.rm=TRUE)^2	
907
+		##sig2T[, plate==uplate[[p]]] <- rowMAD(log2(NP*normalNP), na.rm=TRUE)^2
908
+		sig2T[, p] <- rowMAD(log2(NP*normalNP), na.rm=TRUE)^2	
514 909
 		envir[["sig2T"]] <- sig2T
515 910
 		envir[["CT"]] <- CT
516 911
 		envir[["phiT"]] <- phiT
... ...
@@ -653,7 +1048,7 @@ oneBatch <- function(plateIndex,
653 1048
 	Ns <- envir[["Ns"]]
654 1049
 
655 1050
 	##---------------------------------------------------------------------------
656
-	## Predict sufficient statistics for unobserved genotypes (plate-specific)
1051
+	## Impute sufficient statistics for unobserved genotypes (plate-specific)
657 1052
 	##---------------------------------------------------------------------------
658 1053
 	index.AA <- which(Ns[, p, "AA"] >= 3)
659 1054
 	index.AB <- which(Ns[, p, "AB"] >= 3)
... ...
@@ -1042,11 +1437,11 @@ biasAdj <- function(plateIndex, envir, priorProb, PROP=0.75){
1042 1437
 		proportionSamplesAltered <- rowMeans(mostLikelyState != 3)
1043 1438
 	}else{
1044 1439
 		##should also consider pseud
1045
-		browser()
1440
+##		browser()
1046 1441
 		gender <- envir[["gender"]]
1047
-		tmp3 <- tmp2
1048
-		tmp3[, gender=="male"] <- tmp2[, gender=="male"] != 2
1049
-		tmp3[, gender=="female"] <- tmp2[, gender=="female"] != 3
1442
+##		tmp3 <- tmp2
1443
+##		tmp3[, gender=="male"] <- tmp2[, gender=="male"] != 2
1444
+##		tmp3[, gender=="female"] <- tmp2[, gender=="female"] != 3
1050 1445
 	}
1051 1446
 	##Those near 1 have NaNs for nu and phi.  this occurs by NaNs in the muA[,, "A"] or muA[, , "B"] for X chromosome
1052 1447
 	##ii <- proportionSamplesAltered < PROP
... ...
@@ -1301,7 +1696,6 @@ biasAdjNP <- function(plateIndex, envir, priorProb){
1301 1696
 	sig2T <- envir[["sig2T"]]
1302 1697
 	normalNP <- normalNP[, plate==uplate[p]]	
1303 1698
 	NP <- NP[, plate==uplate[p]]
1304
-	##sig2T <- sig2T[, plate==uplate[p]]
1305 1699
 	sig2T <- sig2T[, p]
1306 1700
 
1307 1701
 
... ...
@@ -1358,3 +1752,159 @@ biasAdjNP <- function(plateIndex, envir, priorProb){
1358 1752
 	envir[["normalNP"]] <- tmp
1359 1753
 }
1360 1754
 
1755
+
1756
+getParams <- function(object, batch){
1757
+	batch <- unique(object$batch)
1758
+	if(length(batch) > 1) stop("batch variable not unique")		
1759
+	nuA <- as.numeric(fData(object)[, match(paste("nuA", batch, sep="_"), fvarLabels(object))])
1760
+	nuB <- as.numeric(fData(object)[, match(paste("nuB", batch, sep="_"), fvarLabels(object))])	
1761
+	phiA <- as.numeric(fData(object)[, match(paste("phiA", batch, sep="_"), fvarLabels(object))])
1762
+	phiB <- as.numeric(fData(object)[, match(paste("phiB", batch, sep="_"), fvarLabels(object))])	
1763
+	tau2A <- as.numeric(fData(object)[, match(paste("tau2A", batch, sep="_"), fvarLabels(object))])
1764
+	tau2B <- as.numeric(fData(object)[, match(paste("tau2B", batch, sep="_"), fvarLabels(object))])
1765
+	sig2A <- as.numeric(fData(object)[, match(paste("sig2A", batch, sep="_"), fvarLabels(object))])
1766
+	sig2B <- as.numeric(fData(object)[, match(paste("sig2B", batch, sep="_"), fvarLabels(object))])
1767
+	corrA.BB <- as.numeric(fData(object)[, match(paste("corrA.BB", batch, sep="_"), fvarLabels(object))])
1768
+	corrB.AA <- as.numeric(fData(object)[, match(paste("corrB.AA", batch, sep="_"), fvarLabels(object))])
1769
+	corr <- as.numeric(fData(object)[, match(paste("corr", batch, sep="_"), fvarLabels(object))])
1770
+	params <- list(nuA=nuA,
1771
+		       nuB=nuB,
1772
+		       phiA=phiA,
1773
+		       phiB=phiB,
1774
+		       tau2A=tau2A,
1775
+		       tau2B=tau2B,
1776
+		       sig2A=sig2A,
1777
+		       sig2B=sig2B,
1778
+		       corrA.BB=corrA.BB,
1779
+		       corrB.AA=corrB.AA,
1780
+		       corr=corr)
1781
+	return(params)
1782
+}
1783
+
1784
+computeEmission <- function(crlmmResults, cnset, copyNumberStates=0:5, MIN=2^3){
1785
+	##threshold small nu's and phis
1786
+	cnset <- thresholdModelParams(cnset, MIN=MIN)
1787
+	index <- order(chromosome(cnset), position(cnset))
1788
+	if(any(diff(index) > 1)) stop("must be ordered by chromosome and physical position")
1789
+	emissionProbs <- array(NA, dim=c(nrow(cnset), ncol(cnset), length(copyNumberStates)))
1790
+	dimnames(emissionProbs) <- list(featureNames(crlmmResults),
1791
+					sampleNames(crlmmResults),
1792
+					paste("copy.number_", copyNumberStates, sep=""))	
1793
+	batch <- cnset$batch
1794
+	for(i in seq(along=unique(batch))){
1795
+		if(i == 1) cat("Computing emission probabilities \n")
1796
+		message("batch ", unique(batch)[i], "...")
1797
+		emissionProbs[, batch == unique(batch)[i], ] <- .getEmission(crlmmResults, cnset, batch=unique(batch)[i],
1798
+				copyNumberStates=copyNumberStates)
1799
+	}
1800
+	emissionProbs
1801
+}
1802
+
1803
+thresholdModelParams <- function(object, MIN=2^3){
1804
+	nuA <- fData(object)[, grep("nuA", fvarLabels(object))]
1805
+	nuB <- fData(object)[, grep("nuB", fvarLabels(object))]
1806
+	phiA <- fData(object)[, grep("phiA", fvarLabels(object))]
1807
+	phiB <- fData(object)[, grep("phiB", fvarLabels(object))]
1808
+
1809
+	nuA[nuA < MIN] <- MIN
1810
+	nuB[nuB < MIN] <- MIN
1811
+	phiA[phiA < MIN] <- MIN
1812
+	phiB[phiB < MIN] <- MIN
1813
+	fData(object)[, grep("nuA", fvarLabels(object))] <- nuA
1814
+	fData(object)[, grep("nuB", fvarLabels(object))] <- nuB
1815
+	fData(object)[, grep("phiA", fvarLabels(object))] <- phiA
1816
+	fData(object)[, grep("phiB", fvarLabels(object))] <- phiB
1817
+	object
1818
+}
1819
+
1820
+.getEmission <- function(crlmmResults, cnset, batch, copyNumberStates){
1821
+	if(length(batch) > 1) stop("batch variable not unique")
1822
+	crlmmResults <- crlmmResults[, cnset$batch==batch]
1823
+	cnset <- cnset[, cnset$batch == batch]	
1824
+	emissionProbs <- array(NA, dim=c(nrow(crlmmResults[[1]]), ncol(crlmmResults[[1]]), length(copyNumberStates)))
1825
+
1826
+	snpset <- cnset[snpIndex(cnset), ]
1827
+	params <- getParams(snpset, batch=batch)
1828
+	##attach(params)
1829
+	corr <- params[["corr"]]
1830
+	corrA.BB <- params[["corrA.BB"]]
1831
+	corrB.AA <- params[["corrB.AA"]]	
1832
+	nuA <- params[["nuA"]]
1833
+	nuB <- params[["nuB"]]
1834
+	phiA <- params[["phiA"]]
1835
+	phiB <- params[["phiB"]]
1836
+	sig2A <- params[["sig2A"]]
1837
+	sig2B <- params[["sig2B"]]
1838
+	tau2A <- params[["tau2A"]]
1839
+	tau2B <- params[["tau2B"]]
1840
+	a <- as.numeric(log2(A(crlmmResults[snpIndex(crlmmResults), ])))
1841
+	b <- as.numeric(log2(B(crlmmResults[snpIndex(crlmmResults), ])))
1842
+	for(k in seq(along=copyNumberStates)){
1843
+		##cat(k, " ")
1844
+		CN <- copyNumberStates[k]
1845
+		f.x.y <- matrix(0, nrow(snpset), ncol(snpset))
1846
+		for(CA in 0:CN){
1847
+			CB <- CN-CA
1848
+			sigmaA <- sqrt(tau2A*(CA==0) + sig2A*(CA > 0))
1849
+			sigmaB <- sqrt(tau2B*(CB==0) + sig2B*(CB > 0))
1850
+			if(CA == 0 & CB > 0) r <- corrA.BB
1851
+			if(CA > 0 & CB == 0) r <- corrB.AA
1852
+			if(CA > 0 & CB > 0) r <- corr
1853
+			if(CA == 0 & CB == 0) r <- 0
1854
+			muA <- log2(nuA+CA*phiA)
1855
+			muB <- log2(nuB+CB*phiB)		
1856
+
1857
+			##might want to allow the variance to be sample-specific
1858
+			##TODO:
1859
+			## rho, sd.A, and sd.B are locus-specific
1860
+			## Some samples are more noisy than others.
1861
+			##
1862
+			##  - scale the variances by a sample-specific estimate of the variances
1863
+			## var(I_A, ijp) = sigma_A_ip * sigma_A_jp
1864
+
1865
+			meanA <- as.numeric(matrix(muA, nrow(snpset), ncol(snpset)))
1866
+			meanB <- as.numeric(matrix(muB, nrow(snpset), ncol(snpset)))			
1867
+			rho <- as.numeric(matrix(r, nrow(snpset), ncol(snpset)))
1868
+			sd.A <- as.numeric(matrix(sigmaA, nrow(snpset), ncol(snpset)))
1869
+			sd.B <- as.numeric(matrix(sigmaB, nrow(snpset), ncol(snpset)))
1870
+			Q.x.y <- 1/(1-rho^2)*(((a - meanA)/sd.A)^2 + ((b - meanB)/sd.B)^2 - 2*rho*((a - meanA)*(b - meanB))/(sd.A*sd.B))
1871
+			##for CN states > 1, assume that any of the possible genotypes are equally likely a priori...just take the sum
1872
+			##for instance, for state copy number 2 there are three combinations: AA, AB, BB
1873
+			##   -- two of the three combinations should be near zero.
1874
+			## TODO: copy-neutral LOH would put near-zero mass on CA > 0, CB > 0
1875
+			f.x.y <- f.x.y + matrix(1/(2*pi*sd.A*sd.B*sqrt(1-rho^2))*exp(-0.5*Q.x.y), nrow(snpset), ncol(snpset))
1876
+		}
1877
+		emissionProbs[snpIndex(crlmmResults), , k] <- log(f.x.y)
1878
+	}
1879
+
1880
+	
1881
+	cnset <- cnset[cnIndex(cnset), ]
1882
+	params <- getParams(cnset, batch=batch)
1883
+	nuA <- params[["nuA"]]
1884
+	phiA <- params[["phiA"]]
1885
+	sig2A <- params[["sig2A"]]
1886
+	a <- as.numeric(log2(A(crlmmResults[cnIndex(crlmmResults), ])))
1887
+
1888
+##	ids=featureNames(crlmmResults)[index]
1889
+##	aa=A(crlmmResults[cnIndex(crlmmResults), ])
1890
+##	ii <- match(ids, rownames(aa))
1891
+##	ii <- ii[!is.na(ii)]
1892
+##	##putative deletion
1893
+##	log2(aa[ii, 1])
1894
+##	M <- matrix(NA, length(ii), length(copyNumberStates))
1895
+##	copynumber=1/phiA[ii]*(aa[ii, 1] - nuA[ii])
1896
+	for(k in seq(along=copyNumberStates)){
1897
+		CT <- copyNumberStates[k]
1898
+		mus.matrix=matrix(log2(nuA + CT*phiA), nrow(cnset), ncol(cnset))
1899
+		mus <- as.numeric(matrix(log2(nuA + CT*phiA), nrow(cnset), ncol(cnset)))
1900
+		##Again, should make sds sample-specific
1901
+		sds.matrix <- matrix(sqrt(sig2A), nrow(cnset), ncol(cnset))
1902
+		sds <- as.numeric(matrix(sqrt(sig2A), nrow(cnset), ncol(cnset)))
1903
+		
1904
+		tmp <- matrix(dnorm(a, mean=mus, sd=sds), nrow(cnset), ncol(cnset))
1905
+		emissionProbs[cnIndex(crlmmResults), , k] <- log(tmp)
1906
+		##first samples emission probs
1907
+##		M[, k] <- log(tmp)[ii, 1]
1908
+	}
1909
+	emissionProbs
1910
+}
... ...
@@ -1,7 +1,5 @@
1 1
 ## Methods for crlmm
2 2
 
3
-setGeneric("calls", function(x) standardGeneric("calls"))
4
-setMethod("calls", "SnpSet", function(x) assayData(x)$call)
5 3
 
6
-setGeneric("confs", function(x) standardGeneric("confs"))
4
+setMethod("calls", "SnpSet", function(object) assayData(object)$call)
7 5
 setMethod("confs", "SnpSet", function(x) assayData(x)$callProbability)
... ...
@@ -146,3 +146,4 @@ loader <- function(theFile, envir, pkgname){
146 146
 	load(theFile, envir=envir)
147 147
 }
148 148
 
149
+
... ...
@@ -23,9 +23,12 @@ B Carvalho
23 23
 ### FOR CNRMA
24 24
 #####################################
25 25
 - Bias adjustment for X chromosome
26
+
26 27
 - adjust nu for altered copy number (crosshyb)
27
-- a class for the assay data elements and metadata -- oligoSnpSet?
28
-      - should we consider a name change -- e.g., LocusSet
29
-- a class for the intermediate files and methods for SNP-specific plots
30
-- uncertainty estimates for copy number.  Put in cnConfidence slot
28
+
29
+- should store parameters in something besides the featureData slot
30
+
31
+        - Perhaps add a slot for parameters and a slot for batch.
32
+
33
+
31 34
 
... ...
@@ -36,6 +36,15 @@ HapMap samples.
36 36
 We preprocess and genotype the samples as described in the CRLMM
37 37
 vignette.
38 38
 
39
+<<test, eval=FALSE, echo=FALSE>>=
40
+library(crlmm)
41
+load("~/madman/Rpacks/crlmm/inst/scripts/example.cnset.rda")
42
+
43
+chromosome(example.cnset)[1:5]
44
+position(example.cnset)[1:5]
45
+scanDates(example.cnset)[1:5]
46
+@ 
47
+
39 48
 <<requiredPackages>>=
40 49
 library(crlmm)
41 50
 library(genomewidesnp6Crlmm)
... ...
@@ -49,31 +58,24 @@ celFiles <- list.celfiles("/thumper/ctsa/snpmicroarray/hapmap/raw/affy/1m", full
49 58
 outdir <- "/thumper/ctsa/snpmicroarray/rs/data/hapmap/1m/affy"
50 59
 @ 
51 60
 
52
-Preprocess and genotype (for more info see the crlmm vignette).
61
+Preprocess and genotype (for more info see the crlmm vignette):
53 62
 
54
-<<genotype>>=
55
-if (!exists("crlmmResult")) {
56
-	if (file.exists(file.path(outdir, "crlmmResult.rda"))){
57
-		load(file.path(outdir, "intensities.rda"))
58
-		load(file.path(outdir, "crlmmResult.rda"))
59
-	}
60
-	else {
61
-		crlmmResult <- crlmm(celFiles, save.it=TRUE, intensityFile=file.path(outdir, "intensities.rda"))
62
-		save(crlmmResult, file=file.path(outdir, "crlmmResult.rda"))
63
-	}
63
+<<preprocessAndGenotype>>=
64
+crlmmFilenames <- file.path(outdir, paste("crlmmResults_", 1:24, ".rda", sep=""))
65
+if(!all(file.exists(crlmmFilenames))){
66
+	message("Preprocessing and crlmm genotyping.  Results are written to", outdir, "...")
67
+	intensityFile <- file.path(outdir, "intensities.rda")		
68
+	crlmmWrapper(celFiles, outdir, save.it=TRUE, intensityFile=intensityFile)
64 69
 }
65 70
 @ 
66 71
 
67
-Quantile normalize the nonpolymorphic probes and save the results.
72
+Load results for chromosome 22.
68 73
 
69
-<<cnrma>>=
70
-if(!exists("cnrmaResult")){
71
-	if(file.exists(file.path(outdir, "cnrmaResult.rda"))) load(file.path(outdir, "cnrmaResult.rda"))
72
-	else {
73
-		cnrmaResult <- cnrma(celFiles, cdfName="genomewidesnp6")
74
-		save(cnrmaResult, file=file.path(outdir, "cnrmaResult.rda"))
75
-	}
76
-}
74
+<<crlmmList-show>>=
75
+CHR <- 22
76
+if(!exists("crlmmResults")) load(file.path(outdir, paste("crlmmResults_", CHR, ".rda", sep="")))
77
+class(crlmmResults)
78
+show(crlmmResults)
77 79
 @ 
78 80
 
79 81
 \subsection{Copy number}
... ...
@@ -89,29 +91,14 @@ We require 6 items for copy number estimation:
89 91
   \item signal to noise ratio (SNR) of the samples (J)
90 92
   \end{itemize}
91 93
   
92
-  These items are extracted as follows:
93
-
94
-<<snpAndCnSummaries>>=
95
-A <- res$A
96
-B <- res$B
97
-genotypes <- crlmm:::calls(crlmmResult)##$calls
98
-conf <- confs(crlmmResult)##$conf
99
-gender <- crlmmResult$gender
100
-SNR <- crlmmResult$SNR
101
-NP <- cnrmaResult$NP
102
-gc()
103
-@ 
104
-
105
-A histogram of the signal to noise ratio for the HapMap samples:
94
+  The above items are contained in the \Robject{crlmmResults} object. A
95
+  histogram of the signal to noise ratio for the HapMap samples:
106 96
 
107 97
 <<plotSnr, fig=TRUE>>=
108
-hist(SNR, xlab="SNR", main="")
98
+hist(crlmmResults[[2]]$SNR, xlab="SNR", main="")
109 99
 @ 
110 100
 
111
-
112
-<<dates>>=
113
-dts <- celDates(celFiles)
114
-table(format(dts, "%d %b %Y"))
101
+<<setSNRmin, echo=FALSE>>=
115 102
 SNRmin <- 5
116 103
 @ 
117 104
 
... ...
@@ -122,149 +109,61 @@ chemistry plate.  Ideally, one would have 70+ files in a given
122 109
 batch. Here we make a table of date versus ancestry:
123 110
 
124 111
 <<specifyBatch>>=
125
-require(Biobase)
126
-sns <- sampleNames(crlmmResult)
112
+sns <- sampleNames(crlmmResults)
127 113
 sns[1]
128
-plate <- substr(basename(sns), 13, 13)
129
-table(plate)
130
-table(format(dts, "%d %b %Y"), plate)
114
+batch <- substr(basename(sns), 13, 13)
115
+table(batch)
116
+table(format(as.POSIXlt(scanDates(crlmmResults)), "%d %b %Y"), batch)
131 117
 @ 
132 118
 
133 119
 As all of these samples were run on the first week of March, we would
134 120
 expect that any systematic artifacts to the intensities that develop
135 121
 over time to be minimal (a best case scenario).  As this is typically
136 122
 not the case, we illustrate how one may adjust for batch using the
137
-chemistry plate as an argument to the computeCopynumber function.
138
-
139
-<<chromosome22>>=
140
-if(!exists("env")){
141
-	CHR <- 22
142
-	if(!file.exists(file.path(outdir, paste("env_chr", CHR, ".rda", sep="")))){
143
-		env <- new.env()
144
-		computeCopynumber(chrom=CHR,
145
-				  A=A,
146
-				  B=B,
147
-				  calls=genotypes,
148
-				  conf=conf,
149
-				  NP=NP,
150
-				  plate=plate,
151
-				  envir=env, 
152
-				  DF.PRIOR=50, 
153
-				  MIN.OBS=1,
154
-				  SNR=SNR,
155
-				  SNRmin=SNRmin)	
156
-		save(env, file=file.path(outdir, paste("env_chr", CHR, ".rda", sep="")))
157
-	} else {
158
-		load(file.path(outdir, paste("env_chr", CHR, ".rda", sep="")))
123
+chemistry plate as an argument for \Robject{batch} in the
124
+\Robject{computeCopynumber} function.
125
+
126
+<<computeCopynumber>>=
127
+if(!exists("cnset")){
128
+	if(file.exists(file.path(outdir, paste("cnset_", CHR, ".rda", sep="")))) load(file.path(outdir, paste("cnset_", CHR, ".rda", sep="")))	
129
+	else {
130
+		cnset <- computeCopynumber(crlmmResults, SNRmin=5, batch=batch, CHR=CHR, cdfName="genomewidesnp6")
131
+		save(cnset, file=file.path(outdir, paste("cnset_", CHR, ".rda", sep="")))
159 132
 	}
160 133
 }
134
+scanDates(cnset) <- scanDates(crlmmResults)
135
+cnset <- cnset[order(chromosome(cnset), position(cnset)), ]
136
+crlmmResults <- crlmm:::harmonizeDimnamesTo(crlmmResults, cnset)
161 137
 @ 
162 138
 
163
-A class representation would be useful for this sort of data (TODO).
164
-The \Rpackage{oligoSnpSet} class used below is a bit inefficient as the
165
-assay data elements are forced to be the same size.  The advantage is
166
-that all of the assay data is bound to the meta-data.
167
-
168
-<<oligoSnpSet>>=
169
-require(oligoClasses)
170
-path <- system.file("extdata", package="genomewidesnp6Crlmm")
171
-load(file.path(path, "snpProbes.rda"))
172
-load(file.path(path, "cnProbes.rda"))
173
-position <- snpProbes[match(env[["snps"]], rownames(snpProbes)), "position"]
174
-position.np <- cnProbes[match(rownames(env[["NP"]]), rownames(cnProbes)), "position"]
175
-CA <- env[["CA"]]
176
-CB <- env[["CB"]]
177
-snpCT <- CA+CB
178
-npCT <- env[["CT"]]
179
-CT <- rbind(snpCT, npCT)
180
-dimnames(CT) <- list(c(env[["snps"]], rownames(env[["NP"]])), env[["sns"]])
181
-genotypes <- matrix(NA, nrow(CT), ncol(CT))
182
-dimnames(genotypes) <- dimnames(CT)
183
-genotypes[1:length(env[["snps"]]), ] <- env[["calls"]]
184
-polymorphic <- c(rep(1, length(env[["snps"]])), rep(0, nrow(npCT)))
185
-fD <- new("AnnotatedDataFrame", 
186
-	  data=data.frame(chromosome=rep(CHR, nrow(CT)), 
187
-	  polymorphic=polymorphic,
188
-	  position=c(position, position.np)),
189
-	  varMetadata=data.frame(labelDescription=c("chromosome",
190
-				 "polymorphic",
191
-				 "position")))
192
-locusSet <- new("oligoSnpSet",
193
-		copyNumber=CT,
194
-		calls=genotypes,
195
-		featureData=fD,
196
-		phenoData=annotatedDataFrameFrom(CT, byrow=FALSE),
197
-		annotation="genomewidesnp6")
139
+<<example.cnset, eval=FALSE, echo=FALSE>>=
140
+example.crlmmResults <- crlmmResults[1:1000, 1:100]
141
+example.cnset <- cnset[1:1000, 1:100]
142
+save(example.cnset, file="~/madman/Rpacks/crlmm/inst/scripts/example.cnset.rda")
143
+save(example.crlmmResults, file="~/madman/Rpacks/crlmm/inst/scripts/example.crlmmResults.rda")
198 144
 @ 
199 145
 
200
-Note that an indicator for the polymorphic probes was created in the
201
-code chunk above.  I use this indicator to smooth the estimates of copy
202
-number from the nonpolymorphic and polymorphic probes separately.  The
203
-uncertainty estimates (work in progress) should reflect that the
204
-nonpolymorphic probes have more variance and the smoothing will take
205
-this into account.  The above algorithm for estimating copy number is
206
-predicated on the assumption that most samples within a batch have copy
207
-number 2 at any given SNP.  For common copy number variants, this
208
-assumption may not hold.  An additional iteration using a bias
209
-correction can improve the estimates.  Set the \Robject{bias.adj}
210
-argument to \Robject{TRUE}.
211
-
212
-%The total copy number for all chromosomes can be extracted from a
213
-%\emph{for} loop.  
214
-
215
-<<remainingChromosomes, eval=FALSE, echo=FALSE>>=
216
-require(oligoClasses)
217
-if(!exists("env")){
218
-##	for(CHR in c(1:21, 23)){  
219
-	for(CHR in 22){
220
-		cat(CHR, ", ")
221
-		if(!file.exists(file.path(outdir, paste("CT_chr", CHR, ".rda", sep="")))){
222
-			env <- new.env()
223
-			computeCopynumber(chrom=CHR,
224
-					  A=A,
225
-					  B=B,
226
-					  calls=genotypes,
227
-					  conf=conf,
228
-					  NP=NP,
229
-					  plate=plate,
230
-					  envir=env, 
231
-					  DF.PRIOR=50, 
232
-					  MIN.OBS=1,
233
-					  SNR=SNR,
234
-					  SNRmin=5)	
235
-			CA <- env[["CA"]]
236
-			CB <- env[["CB"]]
237
-			snpCT <- CA+CB
238
-			npCT <- env[["CT"]]
239
-			position.snp <- snpProbes[match(env[["snps"]], rownames(snpProbes)), "position"]
240
-			position.np <- cnProbes[match(rownames(env[["NP"]]), rownames(cnProbes)), "position"]
241
-			CT <- rbind(snpCT, npCT)
242
-			dimnames(CT) <- list(c(env[["snps"]], rownames(env[["NP"]])), env[["sns"]])
243
-			genotypes <- matrix(NA, nrow(CT), ncol(CT))
244
-			dimnames(genotypes) <- dimnames(CT)
245
-			genotypes[1:length(env[["snps"]]), ] <- env[["calls"]]
246
-			polymorphic <- c(rep(1, length(env[["snps"]])), rep(0, nrow(npCT)))
247
-			fD <- new("AnnotatedDataFrame", 
248
-				  data=data.frame(chromosome=rep(CHR, nrow(CT)), 
249
-				  polymorphic=polymorphic,
250
-				  position=c(position.snp, position.np)),
251
-				  varMetadata=data.frame(labelDescription=c("chromosome",
252
-							 "polymorphic",
253
-							 "position")))
254
-			pD <- annotatedDataFrameFrom(CT, byrow=FALSE)
255
-			locusSet <- new("oligoSnpSet",
256
-					copyNumber=CT,
257
-					calls=genotypes,
258
-					featureData=fD,
259
-					phenoData=pD,
260
-					annotation="genomewidesnp6")
261
-			save(locusSet, file=file.path(outdir, paste("locusSet_", CHR, ".rda", sep="")))			
262
-		} else {
263
-			load(file.path(outdir, paste("CT_chr", CHR, ".rda", sep="")))
264
-		}
265
-		rm(env); gc()
266
-	}
267
-}
146
+The above algorithm for estimating copy number is predicated on the
147
+assumption that most samples within a batch have copy number 2 at any
148
+given locus.  For common copy number variants, this assumption may not
149
+hold.  An additional iteration using a bias correction can improve the
150
+estimates.  Set the \Robject{bias.adj} argument to \Robject{TRUE}.
151
+
152
+<<biasAdjustment, eval=FALSE, echo=FALSE>>=
153
+library(oligoClasses);library(genefilter)
154
+source("~/madman/Rpacks/crlmm/R/AllClasses.R")
155
+source("~/madman/Rpacks/crlmm/R/AllGenerics.R")
156
+source("~/madman/Rpacks/crlmm/R/cnrma-functions.R")
157
+source("~/madman/Rpacks/crlmm/R/methods-eSet.R")
158
+source("~/madman/Rpacks/crlmm/R/methods-ABset.R")
159
+source("~/madman/Rpacks/crlmm/R/methods-CrlmmSetList.R")
160
+source("~/madman/Rpacks/crlmm/R/methods-CopyNumberSet.R")
161
+source("~/madman/Rpacks/crlmm/R/utils.R")
162
+.crlmmPkgEnv <- new.env()
163
+envir2 <- envir
164
+envir <- getCopyNumberEnvironment(crlmmSetList, cnSet)
165
+updateNuPhi(crlmmSetList, cnSet)
166
+##TODO
268 167
 @ 
269 168
 
270 169
 \section{Suggested plots}
... ...
@@ -275,24 +174,21 @@ Plot physical position versus copy number for the first sample.  Recall
275 174
 that the copy number estimates were multiplied by 100 and stored as an
276 175
 integer.
277 176
 
278
-<<oneSample, fig=TRUE, width=8, height=4, include=FALSE>>=
279
-require(ellipse)
177
+<<oneSample, fig=TRUE, width=8, height=4, include=FALSE, eval=FALSE>>=
280 178
 par(las=1)
281
-plot(position(locusSet), copyNumber(locusSet)[, 1]/100, pch=".", cex=2, xaxt="n", col="grey70", ylim=c(0,6), 
179
+plot(position(cnset), copyNumber(cnset)[, 1], pch=".", cex=2, xaxt="n", col="grey20", ylim=c(0,6), 
282 180
      ylab="copy number", xlab="physical position (Mb)",
283
-     main=paste(sampleNames(locusSet)[1], ", CHR:", unique(chromosome(locusSet))))
284
-axis(1, at=pretty(range(position(locusSet))), labels=pretty(range(position(locusSet)))/1e6)
285
-I <- fData(locusSet)$polymorphic==0
286
-points(position(locusSet)[I], copyNumber(locusSet)[I, 1]/100, pch=".", col="blue", cex=1)
181
+     main=paste(sampleNames(cnset)[1], ", CHR:", unique(chromosome(cnset))))
182
+points(position(cnset)[cnIndex(cnset)], copyNumber(cnset)[cnIndex(cnset), 1],
183
+       pch=".", cex=2, col="lightblue")
184
+axis(1, at=pretty(range(position(cnset))), labels=pretty(range(position(cnset)))/1e6)
287 185
 @ 
288
-<<eval=FALSE>>=
186
+
187
+<<idiogram, eval=FALSE, echo=FALSE>>=
289 188
 require(SNPchip)
290 189
 plotCytoband(22, new=FALSE, cytoband.ycoords=c(5.8, 6.0), label.cytoband=FALSE)
291 190
 @ 
292 191
 
293
-In this example, the estimates of copy number for the nonpolymorphic
294
-probes appear correlated with the polymorphic probes -- a good sign.
295
-
296 192
 \begin{figure}
297 193
   \includegraphics[width=0.9\textwidth]{copynumber-oneSample}
298 194
   \caption{Total copy number (y-axis) for chromosome 22 plotted
... ...
@@ -301,57 +197,87 @@ probes appear correlated with the polymorphic probes -- a good sign.
301 197
 
302 198
 \paragraph{One SNP at a time}
303 199
 
304
-This section needs to be cleaned up (TODO).  The parameters needed for
305
-drawing prediction regions are plate-specific.
306
-
307
-<<intermediateFiles>>=
308
-tau2A <- env[["tau2A"]]
309
-tau2B <- env[["tau2B"]]
310
-sig2A <- env[["sig2A"]]
311
-sig2B <- env[["sig2B"]]
312
-nuA <- env[["nuA"]]
313
-nuB <- env[["nuB"]]
314
-phiA <- env[["phiA"]]
315
-phiB <- env[["phiB"]]
316
-corr <- env[["corr"]]
317
-corrA.BB <- env[["corrA.BB"]]
318
-corrB.AA <- env[["corrB.AA"]]
319
-A <- env[["A"]]
320
-B <- env[["B"]]
321
-@ 
322
-
323 200
 Plot the prediction regions for total copy number 2 and 3 for the first
324 201
 plate. Plotting symbols are the genotype calls (1=AA, 2=AB, 3=BB); light
325 202
 grey points are from other plates. One could also add the prediction
326 203
 regions for 0-4 copies, but it gets crowded.
327 204
 
328
-<<predictionRegion, fig=TRUE, width=8, height=8, include=FALSE>>=
329
-par(las=1, pty="s")
330
-p <- 1 ##indicates plate
331
-J <- grep(unique(plate)[p], plate) ##sample indices for this plate
332
-ylim <- c(6.5,13)
333
-I <- which(phiA > 10 & phiB > 10)  
334
-i <- I[1]
335
-##plate effects are small
336
-log2(phiA[i, ])
337
-log2(phiB[i, ])
338
-plot(log2(A[i, ]), log2(B[i, ]), pch=as.character(genotypes[i, ]), col="grey60", cex=0.9, ylim=ylim, xlim=ylim, xlab="A", ylab="B")
339
-points(log2(A[i, J]), log2(B[i, J]), col="black", pch=as.character(genotypes[i, J]))
340
-for(CT in 2){
341
-	if(CT == 2) ellipse.col <- "black" ##else ellipse.col <- "purple"
342
-	for(CA in 0:CT){
343
-		CB <- CT-CA
344
-		A.scale <- sqrt(tau2A[i, p]*(CA==0) + sig2A[i, p]*(CA > 0))
345
-		B.scale <- sqrt(tau2B[i, p]*(CB==0) + sig2B[i, p]*(CB > 0))
346
-		scale <- c(A.scale, B.scale)
347
-		if(CA == 0 & CB > 0) rho <- corrA.BB[i, p]
348
-		if(CA > 0 & CB == 0) rho <- corrB.AA[i, p]
349
-		if(CA > 0 & CB > 0) rho <- corr[i, p]		
350
-		lines(ellipse(x=rho, centre=c(log2(nuA[i, p]+CA*phiA[i, p]), log2(nuB[i, p]+CB*phiB[i, p])),
351
-			      scale=scale), col=ellipse.col, lwd=2)
205
+<<genotypeCalls, fig=TRUE, width=8, height=8, include=FALSE, eval=FALSE>>=
206
+xlim <- ylim <- c(6.5,13)
207
+pch <- 21
208
+colors <- c("red", "blue", "green3")
209
+cex <- 0.6
210
+par(mfrow=c(3,3), las=1, pty="s", ask=FALSE, mar=c(2, 2, 2, 2), oma=c(2, 2, 1, 1))
211
+##plot 9 at a time
212
+indices <- split(snpIndex(crlmmResults), rep(1:length(snpIndex(crlmmResults)), each=9, length.out=length(snpIndex(crlmmResults))))
213
+par(ask=TRUE)
214
+j <- 1
215
+for(i in indices[[j]]){
216
+	gt <- calls(crlmmResults)[i, ]
217
+	plot(crlmmResults[i, ], 
218
+	     pch=pch, 
219
+	     col=colors[gt], 
220
+	     bg=colors[gt], cex=cex,
221
+	     xlim=xlim, ylim=ylim)
222
+	mtext("A", 1, outer=TRUE, line=1)
223
+	mtext("B", 2, outer=TRUE, line=1)	
224
+}
225
+@ 
226
+
227
+<<predictionRegion, fig=TRUE, width=8, height=8, include=FALSE, eval=FALSE>>=
228
+require(RColorBrewer)
229
+greens <- brewer.pal(9, "Greens")
230
+J <- split(1:ncol(cnset), cnset$batch)
231
+colors <- c("red", "blue", "green3")
232
+cex <- 0.6
233
+colors <- c("blue", greens[8], "red")
234
+pch.col <- c("grey40", "black", "grey40")
235
+xlim <- ylim <- c(6.5,13)
236
+plotpoints <- FALSE
237
+lwd <- 2
238
+pdf("figures/snp22plots%02d.pdf", width=600, height=600)
239
+ask <- FALSE
240
+par(mfrow=c(3,3), las=1, pty="s", ask=ask, mar=c(2, 2, 2, 2), oma=c(2, 2, 1, 1))
241
+indices <- split(snpIndex(crlmmResults), rep(1:length(snpIndex(crlmmResults)), each=9, length.out=length(snpIndex(crlmmResults))))
242
+for(j in seq(along=indices)[1:10]){
243
+	cat(j, "\n")
244
+	k <- 1
245
+	for(i in indices[[j]]){
246
+		gt <- calls(crlmmResults)[i, ]
247
+		pch <- as.character(gt)
248
+		cex <- 0.9
249
+		plot(crlmmResults[i, ], 
250
+		     pch=pch, 
251
+		     col=pch.col[gt],
252
+		     cex=cex,
253
+		     xlim=xlim, ylim=ylim,
254
+		     type="n")
255
+		if(plotpoints){
256
+			for(b in seq(along=unique(cnset$batch))){
257
+				points(crlmmResults[i, J[[b]]], 
258
+				       pch=pch, 
259
+				       col=colors[b], bg=colors[b], cex=cex,
260
+				       xlim=xlim, ylim=ylim)
261
+			}
262
+		}
263
+		for(b in seq(along=unique(cnset$batch))){
264
+			ellipse(cnset[i, J[[b]]], copynumber=2, col=colors[b], lwd=lwd)
265
+		}
266
+		##legend("bottomright", bty="n", legend=featureNames(crlmmResults)[i])
267
+		if(k == 1) {
268
+			legend("bottomleft", bty="n", fill=colors, legend=c("CEPH", "Yoruba", "Asian"))	
269
+			mtext("A", 1, outer=TRUE, line=1)
270
+			mtext("B", 2, outer=TRUE, line=0)			
271
+		}
272
+		k <- k+1
352 273
 	}
353 274
 }
354
-legend("topright", lwd=3, col="black", legend="2 copies", bty="n")
275
+dev.off()
276
+@
277
+
278
+<<problems, echo=FALSE, eval=FALSE>>=
279
+##8300117, 8469401, 8706404, 2296864, 8659838, 1936806, 4198761, 8584395
280
+
355 281
 @ 
356 282
 
357 283
 \begin{figure}
... ...
@@ -362,106 +288,68 @@ legend("topright", lwd=3, col="black", legend="2 copies", bty="n")
362 288
 \end{figure}
363 289
 
364 290
 
365
-<<predictionRegion, eval=FALSE, echo=FALSE>>=
366
-par(las=1, pty="s", ask=TRUE)
367
-p <- 1 ##indicates plate
368
-J <- grep(unique(plate)[p], plate) ##sample indices for this plate
369
-ylim <- c(6.5,13)
370
-I <- which(phiA > 10 & phiB > 10)
371
-col <- c("grey0", "grey30", "grey70")
372
-cex <- 3
373
-for(j in seq(along=I)){
374
-	i <- I[j]
375
-	J1 <- grep("C", plate) ##sample indices for this plate
376
-	J2 <- grep("Y", plate) ##sample indices for this plate	
377
-	J3 <- grep("A", plate)
378
-	plot(log2(A[i, ]), log2(B[i, ]), pch=21, col="grey60", type="n", cex=0.9, ylim=ylim, xlim=ylim, xlab="A", ylab="B")
379
-	points(log2(A[i, J1]), log2(B[i, J1]), col=col[1], pch=".", cex=cex)
380
-	points(log2(A[i, J2]), log2(B[i, J2]), col=col[2], pch=".", cex=cex)
381
-	points(log2(A[i, J3]), log2(B[i, J3]), col=col[3], pch=".", cex=cex)
382
-	P <- 1:3
383
-	ellipse.col <- col
384
-	for(p in seq(along=P)){
385
-		for(CT in 2){
386
-			for(CA in 0:CT){
387
-				CB <- CT-CA
388
-				A.scale <- sqrt(tau2A[i, p]*(CA==0) + sig2A[i, p]*(CA > 0))
389
-				B.scale <- sqrt(tau2B[i, p]*(CB==0) + sig2B[i, p]*(CB > 0))
390
-				scale <- c(A.scale, B.scale)
391
-				if(CA == 0 & CB > 0) rho <- corrA.BB[i, p]
392
-				if(CA > 0 & CB == 0) rho <- corrB.AA[i, p]
393
-				if(CA > 0 & CB > 0) rho <- corr[i, p]		
394
-				lines(ellipse(x=rho, centre=c(log2(nuA[i, p]+CA*phiA[i, p]), log2(nuB[i, p]+CB*phiB[i, p])),
395
-					      scale=scale), col=ellipse.col[p], lwd=2)
396
-			}
397
-		}
291
+\section{Smoothing via a hidden Markov model}
398 292
 
399
-	}
400
-	legend("topright", lwd=3, col=ellipse.col, legend=unique(plate), bty="n")	
401
-}
402
-@ 
293
+Here we smooth via a hidden Markov model.  To facilitate comparisons
294
+with the Birdseye HMM, the same transition probabilities are specified.
403 295
 
404
-Look at the distribution of shifts in the predicted centers across the
405
-plates.  The biggest shifts are for SNPs that have no observations in a
406
-subset of the plates -- need more shrinkage here.
407
-
408
-<<shifts, eval=FALSE, echo=FALSE>>=
409
-shiftA <- log2(nuA+phiA)
410
-d <- suppressWarnings(apply(shiftA, 1, function(x) diff(range(x, na.rm=TRUE))))
411
-hist(d)
412
-index <- which(d > 1.5)
413
-plate1 <- plate2 <- rep(NA, length(index))
414
-for(i in seq(along=index)){
415
-	plate1[i] <- which(shiftA[index[i], ] == maxA[index[i]])
416
-	plate2[i] <- which(shiftA[index[i], ] == minA[index[i]])
417
-}
418
-par(ask=TRUE)
419
-par(las=1)
420
-for(j in seq(along=index)){
421
-	i <- index[j]
422
-	J1 <- grep(plate[plate1[j]], plate) ##sample indices for this plate
423
-	J2 <- grep(plate[plate2[j]], plate) ##sample indices for this plate
424
-	ylim <- c(6.5,13)
425
-	plot(log2(A[i, ]), log2(B[i, ]), pch=as.character(genotypes[i, ]), col="grey60", cex=0.9, ylim=ylim, xlim=ylim, xlab="A", ylab="B")
426
-	points(log2(A[i, J1]), log2(B[i, J1]), col="brown", pch=as.character(genotypes[i, J1]))
427
-	points(log2(A[i, J2]), log2(B[i, J2]), col="blue", pch=as.character(genotypes[i, J2]))
428
-	CT <- 2
429
-	P <- c(plate1[j], plate2[j])
430
-	for(p in seq(along=P)){
431
-		if(p == 1) ellipse.col="brown" else ellipse.col="blue"
432
-		for(CA in 0:CT){
433
-			CB <- CT-CA
434
-			A.scale <- sqrt(tau2A[i, p]*(CA==0) + sig2A[i, p]*(CA > 0))
435
-			B.scale <- sqrt(tau2B[i, p]*(CB==0) + sig2B[i, p]*(CB > 0))
436
-			scale <- c(A.scale, B.scale)
437
-			if(CA == 0 & CB > 0) rho <- corrA.BB[i, p]
438
-			if(CA > 0 & CB == 0) rho <- corrB.AA[i, p]
439
-			if(CA > 0 & CB > 0) rho <- corr[i, p]			
440
-			lines(ellipse(x=rho, centre=c(log2(nuA[i, p]+CA*phiA[i, p]), log2(nuB[i, p]+CB*phiB[i, p])),
441
-				      scale=scale), col=ellipse.col, lwd=2)
442
-		}
296
+<<segment>>=
297
+library(VanillaICE)
298
+copyNumberStates <- 0:5
299
+require(Biobase)
300
+cnset <- crlmm:::thresholdModelParams(cnset)
301
+if(!exists("fit")){
302
+	if(file.exists(file.path(outdir, paste("fit_", CHR, ".rda", sep="")))) load(file.path(outdir, paste("fit_", CHR, ".rda", sep="")))
303
+	else {
304
+		emission.cn <- computeEmission(crlmmResults, cnset, copyNumberStates)
305
+		tau <- transitionProbability(chromosome=chromosome(cnset),
306
+					     position=position(cnset),
307
+					     TAUP=1e8)
308
+		initialP <- rep(1/length(copyNumberStates), length(copyNumberStates))
309
+		fit <- viterbi(initialStateProbs=log(initialP),
310
+			       emission=emission.cn,
311
+			       tau=tau[, "transitionPr"],
312
+			       arm=tau[, "arm"], 
313
+			       normalIndex=2,
314
+			       normal2altered=0.005,
315
+			       altered2normal=0.5,
316
+			       altered2altered=0.0025)
317
+		brks <- breaks(x=fit, states=copyNumberStates, position=tau[, "position"],
318
+			       chromosome=tau[, "chromosome"],
319
+			       sampleNames=sampleNames(cnset))
443 320
 	}
444 321
 }
445
-legend("topright", lwd=3, col=c("brown", "blue"), legend=c("2 copies, plate 1", "2 copies, plate 2 "), bty="n")
446 322
 @ 
447 323
 
448
-Now look at shifts for which we have at least 3 observations in each
449
-genotype cluster (TO DO).
450
-
451
-<<shifts, eval=FALSE, echo=FALSE>>=
452
-shiftA <- log2(nuA+phiA)
453
-maxA <- apply(shiftA, 1, max, na.rm=TRUE)
454
-minA <- apply(shiftA, 1, min, na.rm=TRUE)
455
-d <- maxA - minA
456
-hist(d)
457
-index <- which(d > 0.2)
458
-plate1 <- plate2 <- rep(NA, length(index))
459
-for(i in seq(along=index)){
460
-	plate1[i] <- which(shiftA[index[i], ] == maxA[index[i]])
461
-	plate2[i] <- which(shiftA[index[i], ] == minA[index[i]])
324
+<<visualizations, eval=FALSE>>=
325
+dir.create("figures")
326
+par(las=1, mfrow=c(1,1), ask=FALSE)
327
+bmp("figures/chr22plots%02d.png", width=800, height=500)
328
+for(j in 1:ncol(cnset)){
329
+	cat(j, " ")
330
+	##A region that we would miss if we only looked at the polymorphic probes...is it real?
331
+	##index <- which(position(cnset) > 21.2*1e6 & position(cnset) < 21.6*1e6 & mns < 1)
332
+	##fit <- fit[index,]
333
+	index <- 1:nrow(fit)
334
+	plot(0:1, type="n", xlim=range(position(cnset)[index]),
335
+	     xaxt="n", ylim=c(-0.5, 6), 
336
+	     ylab="copy number", xlab="physical position (Mb)",
337
+	     main=paste(sampleNames(cnset)[j], ", CHR:", unique(chromosome(cnset))), 
338
+	     lwd=2,
339
+	     col="blue")
340
+	points(position(cnset)[index], copyNumber(cnset)[index, j], pch=".", cex=2, col="grey60")
341
+	lines(position(cnset)[index], fit[index, j]-1, type="s", lwd=2, col="blue")
342
+	abline(h=0:4, col="grey60")
343
+	axis(1, at=pretty(range(position(cnset)[index])), labels=pretty(range(position(cnset)[index]))/1e6)
462 344
 }
345
+dev.off()
463 346
 @ 
464 347
 
348
+<<fixProblems, eval=FALSE>>=
349
+## NA18522, NA18978 (missed deletion), NA18944 (noisy), NA19098 and NA19130 (missed amplification), 
350
+## NA19138 (looks like deletion with contamination), NA19143 (missed amplification)
351
+## NA19204 (looks like homozygous deletion)
352
+@ 
465 353
 
466 354
 \section{Session information}
467 355
 <<>>=
... ...
@@ -2,53 +2,26 @@
2 2
 \alias{computeCopynumber}
3 3
 \title{Computes copy number}
4 4
 \description{
5
-  
6 5
   A function that transforms the quantile-normalized fluorescence
7 6
   intensities of the polymorphic and nonpolymorphic probes to a copy
8 7
   number scale.  
9 8
   
10 9
 }
11 10
 \usage{
12
-computeCopynumber(chrom, A, B, calls, conf, NP, plate, MIN.OBS=1, 
13
-envir, P, DF.PRIOR = 50, CONF.THR = 0.99, bias.adj=FALSE,
14
-priorProb, gender=NULL, SNR, SNRmin, seed=123, cdfName="genomewidesnp6",
15
-verbose=TRUE, ...)
11
+computeCopynumber(object, CHR, bias.adj=FALSE, batch, SNRmin=5, cdfName="genomewidesnp6", ...)
16 12
 }
17 13
 \arguments{
18
-  \item{chrom}{Chromosome (an integer).  Use 23 for X and 24 for Y.}
19
-  \item{A}{The A allele intensities from \code{snprma}}
20
-  \item{B}{The B allele intensities from \code{snprma}}
21
-  \item{calls}{The genotype calls from \code{crlmm}}
22
-  \item{conf}{The genotype confidence scores from \code{crlmm}}
23
-  \item{NP}{The quantile normalized intensities of the nonpolymorphic probes}
24
-  \item{plate}{The bach variable.  Should be the same length as the
25
-    number of columns in A}
26
-  \item{MIN.OBS}{Integer: The minimum number of observations in a genotype
27
-    cluster for which a SNP is deemed complete.}
28
-  \item{envir}{An environment to save intermediate objects}
29
-  \item{P}{Mainly for debugging a particular plate/batch.}
30
-  \item{DF.PRIOR}{The degrees of freedom for the prior.  Higher numbers
31
-    will shrink the variance and correlation more.}
32
-  \item{CONF.THR}{A threshold for the genotype confidence scores.
33
-    Genotypes with scores below the threshold are ignored when computing
34
-  SNP-specific  within-genotype estimates of location and scale.}
35
-  \item{bias.adj}{Logical: whether to adjust the location and scale
36
-    parameters to account for biases due to common copy number
37
-    variants.  This is a SNP-specific adjustment.  Parameters for
38
-    background and slope must have already been estimated and available
39
-    from the environment variable.}
40
-  \item{priorProb}{Numerical vector of length 4.  The prior probability
41
-    of each copy number state (0, 1, 2, 3, and 4). The default is a
42
-    uniform prior.  Ignored if bias.adj=FALSE}
43
-  \item{gender}{Gender of subjects. If not specified, we predict the
44
-    gender from the X chromosome.}
45
-  \item{SNR}{Signal to noise ratio from crlmm.}
14
+  \item{object}{object of class \code{CrlmmSetList}.}
15
+  \item{CHR}{Chromosome (an integer).  Use 23 for X and 24 for Y.}
16
+  \item{bias.adj}{Logical.  When TRUE, an extra iteration is performed
17
+    (see details) .}
18
+  \item{batch}{Factor.  Must be the same length as the number of samples
19
+    in the object.}
46 20
   \item{SNRmin}{The minimum value for the SNR -- we suggest 5. Samples
47
-    with SNR below SNRmin are excluded.}
48
-  \item{seed}{Seed used for random samples}
21
+    with SNR below SNRmin are dropped.}
49 22
   \item{cdfName}{Annotation package }
50
-  \item{verbose}{Logical: verbose output}
51
-  \item{\dots}{Currently ignored}
23
+  \item{\dots}{arguments to \code{.computeCopynumber} -- not called by
24
+    the user.}
52 25
 }
53 26
 
54 27
 \details{
... ...
@@ -57,56 +30,28 @@ verbose=TRUE, ...)
57 30
   assume that the median within-SNP intensity across samples is 2.  We
58 31
   make no assumption about the chromosomal copy number. This function is
59 32
   useful for detecting rare variants (e.g., variants that would not
60
-  affect the SNP-specific quantile-based estimators of location and
61
-  scale).  A correction for more common variants is coming, as well as
62
-  improved estimates of the uncertainty.
63
-  
33
+  effect the SNP-specific quantile-based estimators of location and
34
+  scale).  When \code{bias.adj=TRUE}, an extra iteration is performed
35
+  whereby samples with a high posterior probability of having a
36
+  non-normal copy number are excluded.  The resulting within-genotype
37
+  estimators of location and scale are more robust to a large number of
38
+  samples having a copy number variant.
39
+
64 40
 }
65 41
 
66 42
 \value{
43
+  
44
+  An object of class \code{CopyNumberSet}.
67 45
 
68
-All objects created by this function are stored in the environment
69
-  passed to this function.  In addition, each of the elements are
70
-  specific to the chromosome(s) specified by the argument \code{chrom}.
71
-  For instance the element \code{A} is the matrix of quantile-normalized
72
-  intensities for the A-allele on chromosome(s) \code{chrom}.  The
73
-  element of this environment are as follows
46
+}
74 47
 
75
-  \item{A}{Matrix of quantile-normalized intensities for the A-allele}
76
-  \item{B}{Matrix of quantile-normalized intensities for the A-allele}
77
-  \item{CA}{Copy number estimate for the A-allele (x 100)} 
78
-  \item{CB}{Copy number estimate for the B-allele (x 100)}
79
-  \item{calls}{CRLMM genotype calls (AA=1, AB=2, BB=3)}
80
-  \item{chrom}{Integer(s) indicating the chromosome(s)} 
81
-  \item{cnvs}{Names of the nonpolymorphic probes.  These are the
82
-  rownames of \code{NP} and \code{CT}.}
83
-  \item{conf}{CRLMM confidence scores for the genotypes: 'round(-1000*log2(1-p))'}
84
-  \item{corr}{Correlation of the A and B alleles for genotypes AB}
85
-  \item{corrA.BB}{Correlation of A and B alleles for genotypes BB}
86
-  \item{corrB.AA}{Correlation of A and B alleles for genotypes AA}
87
-  \item{CT}{Copy number estimates for nonpolymorphic probe.  See
88
-  \code{cnvs} for the rownames.}
89
-  \item{CT.sds}{Standard deviation estimates for \code{CT}}
90
-  \item{npflags}{Flags for the nonpolymorphic probes.}
91
-  \item{Ns}{The number of observations for each genotype/plate}
92
-  \item{nuA}{Background/cross-hyb for the A allele (plate- and locus-specific)}
93
-  \item{nuB}{Background/cross-hyb for the B allele (plate- and locus-specific)}
94
-  \item{nuT}{Background for the nonpolymorphic probes (plate- and locus-specific)}
95
-  \item{phiA}{Slope for the A allele (plate- and locus-specific)}
96
-  \item{phiB}{Slope for the B allele (plate- and locus-specific)}
97
-  \item{phiT}{Slope for the nonpolymorphic probes (plate- and locus-specific)}
98
-  \item{plate}{Factor indicating batch (same length as number of cel files)}
99
-  \item{sig2A}{Variance estimate for the A-allele signal (plate- and locus-specific)}
100
-  \item{sig2B}{Variance estimate for the B-allele signal (plate- and locus-specific)}
101
-  \item{sig2T}{Variance estimate for the nonpolymorphic signal (plate- and locus-specific)}
102
-  \item{snpflags}{Flags for polymorphic probes}
103
-  \item{snps}{Rownames for \code{A}, \code{B}, \code{CA}, \code{CB}, ...}
104
-  \item{sns}{ Sample names -- the column names for \code{A}, \code{B}, ...}
105
-  \item{steps}{Steps completed. For internal use.}
106
-  \item{tau2A}{Variance estimate for the B-allele background/cross-hyb (plate- and locus-specific)}
107
-  \item{tau2B}{Variance estimate for the B-allele background/cross-hyb (plate- and locus-specific)}
48
+\seealso{
49
+  \code{\linkS4class{CopyNumberSet-class}}
108 50
 }
109 51
 \references{Nothing yet.}
110 52
 \author{Rob Scharpf}
111 53
 \keyword{manip}
54
+\keyword{models}
55
+\keyword{regression}
56
+\keyword{robust}
112 57
 
... ...
@@ -17,11 +17,11 @@
17 17
   where 'p' is the posterior probability of the call.
18 18
 }
19 19
 \usage{
20
-calls(x)
21
-confs(x)
20
+calls(object)
21
+confs(object)
22 22
 }
23 23
 \arguments{
24
-  \item{x}{SnpSet object}
24
+  \item{object}{SnpSet object}
25 25
 }
26 26
 \value{
27 27
   Matrix of genotype calls or confidences.
... ...
@@ -35,7 +35,7 @@ readIdatFiles(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".",
35 35
     named 'SentrixPosition') and this should be specified as
36 36
     \code{list(barcode=NULL, position="SentrixPosition")}}
37 37
   \item{highDensity}{logical (used when \code{sampleSheet} is
38
-    specified). If \code{TRUE}, array extensions '_A', '_B' in
38
+    specified). If \code{TRUE}, array extensions '\_A', '\_B' in
39 39
     sampleSheet are replaced with 'R01C01', 'R01C02' etc.}
40 40
   \item{sep}{character string specifying separator used in .idat file
41 41
     names.}
... ...
@@ -51,8 +51,8 @@ were used in each channel and standard errors (all on the orginal scale)
51 51
 are read in from the .idat files.
52 52
 
53 53
 Where available, a \code{sampleSheet} data.frame, in the same format
54
-as used by BeadStudio (columns 'Sample_ID', 'SentrixBarcode_A' and
55
-'SentrixPosition_A' are required) which keeps track of sample
54
+as used by BeadStudio (columns 'Sample\_ID', 'SentrixBarcode\_A' and
55
+'SentrixPosition\_A' are required) which keeps track of sample
56 56
 information can be specified.
57 57
 
58 58
 Thanks to Keith Baggerly who provided the code to read in the binary .idat files.
... ...
@@ -33,7 +33,7 @@ snprma(filenames, mixtureSampleSize = 10^5, fitMixture = FALSE, eps = 0.1, verbo
33 33
     Seed to be used when sampling.
34 34
 }
35 35
   \item{cdfName}{
36
-    cdfName: 'GenomeWideSnp_5', 'GenomeWideSnp_6'
36
+    cdfName: 'GenomeWideSnp\_5', 'GenomeWideSnp\_6'
37 37
 }
38 38
   \item{sns}{
39 39
     Sample names.