Browse code

Added accessors for linear model parameters.

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@48922 bc3139a8-67e5-0310-9ffc-ced21a209358

Rob Scharp authored on 21/08/2010 02:47:15
Showing9 changed files

... ...
@@ -56,7 +56,7 @@ importFrom(ellipse, ellipse)
56 56
 importFrom(ff, ffdf)
57 57
 
58 58
 exportClasses(CNSetLM, ffdf, list)
59
-exportMethods(open, "[", show, lM, lines)
59
+exportMethods(open, "[", show, lM, lines, nu, phi, corr, sigma2, tau2)
60 60
 export(crlmm, 
61 61
        crlmmCopynumber, 
62 62
        crlmmIllumina, 
... ...
@@ -76,3 +76,4 @@ export(constructIlluminaCNSet)
76 76
 
77 77
 
78 78
 
79
+
... ...
@@ -10,4 +10,10 @@ setGeneric("lM", function(object) standardGeneric("lM"))
10 10
 setGeneric("lM<-", function(object, value) standardGeneric("lM<-"))
11 11
 setGeneric("totalCopyNumber", function(object,...) standardGeneric("totalCopyNumber"))
12 12
 
13
+setGeneric("corr", function(object, allele) standardGeneric("corr"))
14
+setGeneric("nu", function(object, allele) standardGeneric("nu"))
15
+setGeneric("phi", function(object, allele) standardGeneric("phi"))
16
+setGeneric("sigma2", function(object, allele) standardGeneric("sigma2"))
17
+setGeneric("tau2", function(object, allele) standardGeneric("tau2"))
18
+
13 19
 
... ...
@@ -54,7 +54,7 @@ getFeatureData.Affy <- function(cdfName, copynumber=FALSE){
54 54
 }
55 55
 
56 56
 construct <- function(filenames, cdfName, copynumber=FALSE,
57
-		      sns, verbose=TRUE, batch){
57
+		      sns, verbose=TRUE, batch, fns){
58 58
 	if(verbose) message("Initializing container for assay data elements alleleA, alleleB, call, callProbability, CA, CB")
59 59
 	if(missing(sns)) sns <- basename(filenames)
60 60
 	protocolData <- getProtocolData.Affy(filenames)
... ...
@@ -67,6 +67,11 @@ construct <- function(filenames, cdfName, copynumber=FALSE,
67 67
 	}
68 68
 	rownames(pData(protocolData)) <- sns
69 69
 	featureData <- getFeatureData.Affy(cdfName, copynumber=copynumber)
70
+	if(!missing(fns)){
71
+		index <- match(fns, featureNames(featureData))
72
+		if(all(is.na(index))) stop("fns not in featureNames")
73
+		featureData <- featureData[index, ]
74
+	}
70 75
 	nr <- nrow(featureData); nc <- length(filenames)
71 76
 	ffObjects <- list(alleleA=initializeBigMatrix(name="A", nr, nc),
72 77
 			  alleleB=initializeBigMatrix(name="B", nr, nc),
... ...
@@ -1062,7 +1067,6 @@ crlmmCopynumberLD <- function(object,
1062 1067
 		save(snpflags, file=file.path(ldPath(), "snpflags.rda"))
1063 1068
 	} else{
1064 1069
 		load(file.path(ldPath(), "snpflags.rda"))
1065
-				
1066 1070
 	} 
1067 1071
 	if(verbose) message("Estimating allele-specific copy number at autosomal SNPs")
1068 1072
 	snpBatches <- splitIndicesByLength(autosomeIndex.snps, ocProbesets())
... ...
@@ -200,3 +200,89 @@ ellipse.CNSet <- function(x, copynumber, batch, ...){
200 200
 }
201 201
 
202 202
 
203
+setMethod("nu", c("CNSetLM", "character"), function(object, allele){
204
+	getValue <- function(allele){
205
+		switch(allele,
206
+		       A="nuA",
207
+		       B="nuB",
208
+		       stop("allele must be 'A' or 'B'"))
209
+	}	
210
+	val <- getValue(allele)
211
+	class.lm <- class(lM(object)) 
212
+	if(class.lm == "ffdf"){
213
+		return(physical(lM(object))[[val]])
214
+	} else {
215
+		if(class.lm != "matrix") stop("lM() must be matrix or ffdf")
216
+		return(lM(object)[[val]])
217
+	}
218
+})
219
+
220
+setMethod("phi", c("CNSetLM", "character"), function(object, allele){
221
+	getValue <- function(allele){
222
+		switch(allele,
223
+		       A="phiA",
224
+		       B="phiB",
225
+		       stop("allele must be 'A' or 'B'"))
226
+	}
227
+	val <- getValue(allele)	
228
+	class.lm <- class(lM(object)) 
229
+	if(class.lm == "ffdf"){
230
+		return(physical(lM(object))[[val]])
231
+	} else {
232
+		if(class.lm != "matrix") stop("lM() must be matrix or ffdf")
233
+		return(lM(object)[[val]])
234
+	}
235
+})
236
+
237
+setMethod("sigma2", c("CNSetLM", "character"), function(object, allele){
238
+	getValue <- function(allele){
239
+		switch(allele,
240
+		       A="sig2A",
241
+		       B="sig2B",
242
+		       stop("allele must be 'A' or 'B'"))
243
+	}
244
+	val <- getValue(allele)	
245
+	class.lm <- class(lM(object)) 
246
+	if(class.lm == "ffdf"){
247
+		return(physical(lM(object))[[val]])
248
+	} else {
249
+		if(class.lm != "matrix") stop("lM() must be matrix or ffdf")
250
+		return(lM(object)[[val]])
251
+	}
252
+})
253
+
254
+setMethod("tau2", c("CNSetLM", "character"), function(object, allele){
255
+	getValue <- function(allele){
256
+		switch(allele,
257
+		       A="tau2A",
258
+		       B="tau2B",
259
+		       stop("allele must be 'A' or 'B'"))
260
+	}
261
+	val <- getValue(allele)
262
+	class.lm <- class(lM(object)) 
263
+	if(class.lm == "ffdf"){
264
+		return(physical(lM(object))[[val]])
265
+	} else {
266
+		if(class.lm != "matrix") stop("lM() must be matrix or ffdf")
267
+		return(lM(object)[[val]])
268
+	}
269
+})
270
+
271
+setMethod("corr", c("CNSetLM", "character"), function(object, allele){
272
+	getValue <- function(allele){
273
+		switch(allele,
274
+		       AA="corrAA",
275
+		       AB="corrAB",
276
+		       BB="corrBB",
277
+		       stop("must be AA, AB, or BB"))
278
+	}
279
+	val <- getValue(allele)
280
+	class.lm <- class(lM(object)) 
281
+	if(class.lm == "ffdf"){
282
+		return(physical(lM(object))[[val]])
283
+	} else {
284
+		if(class.lm != "matrix") stop("lM() must be matrix or ffdf")
285
+		return(lM(object)[[val]])
286
+	}
287
+})
288
+
... ...
@@ -1,10 +1,15 @@
1 1
 ## Need to submit from a fat node!
2
-all:	illumina_copynumber, copynumber
2
+all:	illumina_copynumber copynumber 
3 3
 
4 4
 copynumber:	copynumber.Rnw
5
+##	echo "Sweave(\"$1.Rnw\"); library(tools); texi2dvi(\"$1.tex\", pdf=TRUE)" | R --no-save --no-restore;
5 6
 	echo "Stangle(\"copynumber.Rnw\")" | R --no-save --no-restore;	
6 7
 	cat ~/bin/cluster.template | perl -pe "s/Rprog/copynumber.R/" > copynumber.R.sh
7
-	qsub -m e -r y -cwd -l mem_free=10G copynumber.R.sh
8
+	qsub -m e -r y -cwd -l mem_free=12G,h_vmem=16G copynumber.R.sh
9
+## the above does not create copynumber.tex
10
+
11
+sweavecopynumber: copynumber.Rnw
12
+	echo "Sweave(\"copynumber.Rnw\")" | R --no-save --no-restore;	
8 13
 
9 14
 texcopynumber:	copynumber.tex
10 15
 	texi2dvi --pdf --clean --quiet copynumber.tex
... ...
@@ -12,7 +17,7 @@ texcopynumber:	copynumber.tex
12 17
 illumina_copynumber:	illumina_copynumber.Rnw
13 18
 	echo "Stangle(\"illumina_copynumber.Rnw\")" | R --no-save --no-restore;	
14 19
 	cat ~/bin/cluster.template | perl -pe "s/Rprog/illumina_copynumber.R/" > illumina_copynumber.R.sh
15
-	qsub -m e -r y -cwd -l mem_free=10G illumina_copynumber.R.sh
20
+	qsub -m e -r y -cwd -l mem_free=12G,h_vmem=16G illumina_copynumber.R.sh
16 21
 
17 22
 texillumina_copynumber:	illumina_copynumber.tex
18 23
 	texi2dvi --pdf --clean --quiet illumina_copynumber.tex		
... ...
@@ -47,22 +47,48 @@ library(oligoClasses)
47 47
 library(crlmm)
48 48
 crlmm:::validCdfNames()
49 49
 cdfName <- "genomewidesnp6"
50
+pathToCels <- "/thumper/ctsa/snpmicroarray/hapmap/raw/affy/1m"
51
+if(getRversion() < "2.12.0"){
52
+	rpath <- getRversion()
53
+} else rpath <- "trunk"
54
+outdir <- paste("/thumper/ctsa/snpmicroarray/rs/ProcessedData/crlmm/", rpath, "/copynumber_vignette", sep="")
50 55
 @ 
51 56
 
52
-\paragraph{Preprocess and genotype.}
57
+\paragraph{About this vignette.} This vignette was created using CEL
58
+files that are located in a specific directory on my computer
59
+(\Robject{pathToCels}).  Long computations are saved in the output
60
+directory \Robject{outdir}.  Users should change these variables as
61
+appropriate.  The following code chunk should fail unless the above
62
+arguments have been set appropriately.
63
+
64
+<<checkSetup>>=
65
+if(!file.exists(outdir)) stop("Please specify valid directory for storing output")
66
+if(!file.exists(pathToCels)) stop("Please specify the correct path to the CEL files")
67
+@ 
68
+
69
+Processed data from codechunks requiring long computations are saved
70
+to disk automatically by wrapping function calls in the
71
+\Robject{checkExists} function.  After the initial batch job,
72
+subsequent calls to \verb@Sweave@ will load saved computations from
73
+disk and can therefore be run interactively. This may be useful as you
74
+begin to explore some of the tools for accessing processed data and
75
+visualizations.
76
+
77
+\paragraph{Preprocessing and genotyping.}
53 78
 
54 79
 In the following code chunk, we provide the complete path to the
55
-Affymetrix CEL files, assign a path to store the normalized
56
-intensities and genotype data, and define a 'batch' variable. The
57
-batch variable will later be useful when we estimate copy number.  We
58
-typically use the 96 well chemistry plate or the scan date of the
59
-array as a surrogate for batch. Adjusting by date or chemistry plate
60
-can be helpful for limiting the influence of batch effects.  
80
+Affymetrix CEL files and define a 'batch' variable. The
81
+\Robject{batch} variable will later be useful when we estimate copy
82
+number.  We typically use the 96 well chemistry plate or the scan date
83
+of the array as a surrogate for batch. Adjusting by date or chemistry
84
+plate can be critical for limiting the influence of batch effects. For
85
+the HapMap CEL files in our analysis, the CEPH (C) and Yoruban (Y)
86
+samples were run on separate plates and can be extracted from the CEL
87
+filename as below.  
61 88
 
62 89
 <<celfiles>>=
63
-celFiles <- list.celfiles("/thumper/ctsa/snpmicroarray/hapmap/raw/affy/1m", full.names=TRUE, pattern=".CEL")
64
-outdir <- "/thumper/ctsa/snpmicroarray/rs/data/hapmap/1m/crlmm"
65
-if(!file.exists(outdir)) dir.create(outdir)
90
+celFiles <- list.celfiles(pathToCels, full.names=TRUE, pattern=".CEL")
91
+if(length(celFiles) < 1) stop("Please specify the appropriate path to CEL files")
66 92
 ## CEPH and Yoruban
67 93
 batch <- substr(basename(celFiles), 13, 13)
68 94
 celFiles <- celFiles[batch %in% c("C", "Y")]
... ...
@@ -77,115 +103,164 @@ summarized for each locus.  As the markers at polymorphic loci on the
77 103
 Affymetrix 6.0 platform are identical, we summarize the intensities by
78 104
 the median. For the nonpolymorphic markers, no summarization step is
79 105
 performs.  Next, the \crlmm{} package provides a genotype call and a
80
-confidence score at each polymorphic locus.  Unless the dataset is small
81
-(e.g., fewer than 50 samples), we suggest installing and loading the
82
-\R{} package \Rpackage{ff}.  The function \Rfunction{genotype} checks to
83
-see whether the \Rpackage{ff} is loaded.  If loaded, the normalized
84
-intensities and genotype are stored as \Robject{ff} objects on disk.
85
-Otherwise, the genotypes and normalized intensities are stored in
86
-matrices.  A word of caution: the \Rfunction{genotype} function requires
87
-a potentially large amount of RAM even when the \R{} package
88
-\Rpackage{ff} is loaded.  Users with large datasets may want to skip
89
-this part.  For the 180 HapMap samples processed in this vignette, the
90
-\Rfunction{genotype} function required 25MB RAM. The next two code
91
-chunks should be submitted as batch jobs.
106
+confidence score at each polymorphic locus.  Unless the dataset is
107
+small (e.g., fewer than 50 samples), we suggest installing and loading
108
+the \R{} package \Rpackage{ff}.  The function \Rfunction{genotype}
109
+checks to see whether the \Rpackage{ff} is loaded.  If loaded, the
110
+normalized intensities and genotype are stored as \Robject{ff} objects
111
+on disk.  Otherwise, the genotypes and normalized intensities are
112
+stored in matrices.  A word of caution: the \Rfunction{genotype}
113
+function without \Rpackage{ff} requires a potentially large amount of
114
+RAM.  Therefore, we illustrate the \Robject{genotype} function using
115
+only the first 20 CEL files. We wrap the function \Rfunction{genotype}
116
+in the \Rfunction{checkExists} so that a user can load and inspect the
117
+saved object.  We first check that the file \Robject{cnSet.rda} does
118
+not exist. Existence of this file implies that we have already run the
119
+copy number estimation and, therefore, we do not need to preprocess
120
+and genotype the files a second time.
92 121
 
93 122
 <<genotype>>=
94
-if(!exists("cnSet")){
95
-	if(!file.exists(file.path(outdir, "cnSet.rda"))){
96
-		##ops <- options(error=recover)
97
-		cnSet <- genotype(celFiles, cdfName=cdfName, copynumber=TRUE, batch=batch)
98
-		save(cnSet, file=file.path(outdir, "cnSet.rda"))
99
-	} else load(file.path(outdir, "cnSet.rda"))
123
+if(!file.exists(file.path(outdir, "cnSet.rda"))){
124
+	gtSet.assayData_matrix <- checkExists("gtSet.assayData_matrix",
125
+					   .path=outdir,
126
+					   .FUN=genotype,
127
+					   filenames=celFiles[1:20],
128
+					   cdfName=cdfName,
129
+					   batch=batch[1:20])
130
+	class(calls(gtSet.assayData_matrix))
100 131
 }
101 132
 @ 
102 133
 
134
+Next, we estimate copy number for the 20 CEL files.  The copy number
135
+estimation step requirees a minimum number of CEL files in each batch
136
+(we suggest 10).  While estimation can be performed using only the 20
137
+samples as we do here, it would be far preferable to process all of
138
+the files that were on the same chemistry plate.  The code chunk below
139
+will load \Robject{cnSet.assayData_matrix} from disk if this
140
+computation had already been performed as part of the batch job.
141
+
142
+<<copynumber>>=
143
+cnSet.assayData_matrix <- checkExists("cnSet.assayData_matrix",
144
+				      .path=outdir,
145
+				      .FUN=crlmmCopynumber,
146
+				      object=gtSet.assayData_matrix)
147
+if(file.exists(file.path(outdir, "gtSet.assayData_matrix.rda")))
148
+	unlink(file.path(outdir, "gtSet.assayData_matrix.rda"))
149
+@ 
150
+
151
+
103 152
 A more memory efficient approach to preprocessing and genotyping is
104
-implemented in the \R{} function \Rfunction{genotype2}.  The arguments
105
-to this function are similar to \Rfunction{genotype} but requires
106
-explicit loading of the \R{} package \Rpackage{ff} prior to calling
107
-the function. The functions \Rfunction{ocProbesets} and
153
+implemented in the \R{} function \Rfunction{genotypeLD} and requires
154
+the \Rpackage{ff} package.  The functions \Rfunction{ocProbesets} and
108 155
 \Rfunction{ocSamples} can be used to manage how many probesets and
109
-samples are to be loaded into memory and processed.  Using the default
110
-settings, the following code required 1.9Mb RAM to process 180 CEL
111
-files.
156
+samples are to processed at a time and can therefore be used to fine
157
+tune the required RAM.  The function \Rfunction{ldPath} indicates that
158
+\Robject{ff} objects will be stored in the directory \Robject{outdir}.
112 159
 
113
-<<genotype2>>=
160
+<<ff>>=
114 161
 library(ff)
115
-ld.path <- file.path(outdir, "ffobjs")
116
-if(!file.exists(ld.path)) dir.create(ld.path)
117
-ldPath(ld.path)
118
-if(!exists("cnSet2")){
119
-	if(!file.exists(file.path(outdir, "cnSet2.rda"))){
120
-		cnSet2 <- genotype2(celFiles, cdfName=cdfName, copynumber=TRUE, batch=batch)
121
-		save(cnSet2, file=file.path(outdir, "cnSet2.rda"))
122
-	} else load(file.path(outdir, "cnSet2.rda"))
162
+ldPath(outdir)
163
+ocProbesets(50000)
164
+ocSamples(200)
165
+@ 
166
+
167
+With LDS now enabled, we preprocess and genotype 180 samples from the
168
+CEPH and Yoruban populations using the \R{} function
169
+\Robject{genotypeLD}.  Users that only want the genotypes at
170
+polymorphic loci (and not the copy number estimates) should instead
171
+use the \R{} function \Rfunction{crlmm} or \Rfunction{crlmm2}. Again,
172
+we wrap the call to \Rfunction{genotypeLD} in \Rfunction{checkExists}
173
+so that subsequent calls to \verb@Sweave@ can be run interactively. 
174
+
175
+
176
+<<LDS_genotype>>=
177
+if(!file.exists(file.path(outdir, "cnSet.assayData_ff.rda"))){
178
+	gtSet.assayData_ff <- checkExists("gtSet.assayData_ff",
179
+					  .path=outdir,
180
+					  .FUN=genotypeLD,
181
+					  filenames=celFiles,
182
+					  cdfName=cdfName,
183
+					  batch=batch)
184
+	class(calls(gtSet.assayData_ff))
123 185
 }
124 186
 @ 
125 187
 
126
-The objects returned by the \Rfunction{genotype} and
127
-\Rfunction{genotype2} differ in size as the former returns an object
128
-with matrices in the assay data slot, whereas the latter return an
129
-object with pointers to files on disk.  The functions \Rfunction{open}
130
-and \Rfunction{close} can be used to open and close the connections
131
-stored in the assay data of the \Robject{cnSet2} object, respectively.
132
-Subsetting the \Robject{ff} object pulls the data from disk into
133
-memory.  As subsetting operations pull the data from disk to memory,
134
-these operations must be performed with care.  In particular,
135
-subsetting the \Robject{cnSet2} will subset each assay data element
136
-and this can be slow.  The preferred approach would be to extract the
137
-assay data element that is needed, and then subset this function
138
-object as illustrated below for the genotype calls.
188
+The analogous function for copy number estimation follows. In
189
+practice, once the object \Robject{cnSet.assayData_ff} is created the
190
+\Robject{gtSet.assayData_ff} is no longer needed and can be removed
191
+from the \Robject{outdir}.
192
+
193
+<<LDS_copynumber>>=
194
+cnSet.assayData_ff <- checkExists("cnSet.assayData_ff",
195
+				  .path=outdir,
196
+				  .FUN=crlmmCopynumberLD,
197
+				  filenames=celFiles,
198
+				  cdfName=cdfName,
199
+				  batch=batch)
200
+if(file.exists(file.path(outdir, "gtSet.assayData_ff.rda")))
201
+	unlink(file.path(outdir, "gtSet.assayData_ff.rda"))
202
+gc()
203
+@ 
204
+
205
+The objects returned by the \Rfunction{genotypeLD} and
206
+\Rfunction{crlmmCopynumberLD} have assay data elements that are
207
+pointers to \Robject{ff} objects stored in the directory
208
+\Robject{outdir}.  The functions \Rfunction{open} and
209
+\Rfunction{close} open and close the connections to the
210
+\Robject{assayData} elements. Subsetting an \Robject{ff} object pulls
211
+the data from disk into memory and should therefore be used with
212
+caution. In particular, subsetting the \Robject{gt.assayData_ff} would
213
+subset each elements in the \Robject{assayData} slot, returning an
214
+object of the same class but with \Robject{assayData} elements that
215
+are matrices.  Such an operation can be exceedingly slow and require
216
+subsantial RAM.  The preferred approach is to extract only the assay
217
+data element that is needed. In the example below, we extract the
218
+genotype calls for the the first 50 samples.
139 219
 
140 220
 <<check>>=
141
-class(calls(cnSet))
142
-dims(cnSet)
143
-class(calls(cnSet2))
144
-dims(cnSet2)
145
-print(object.size(cnSet), units="Mb")
146
-print(object.size(cnSet2), units="Mb")
221
+dims(cnSet.assayData_ff)
222
+dims(cnSet.assayData_matrix)
223
+print(object.size(cnSet.assayData_matrix), units="Mb")
224
+print(object.size(cnSet.assayData_ff), units="Mb")
147 225
 if(FALSE){
148
-	replicate(5, system.time(gt1 <- calls(cnSet)[, 1:50])[[1]])
226
+	replicate(5, system.time(gt1 <- calls(cnSet.assayData_matrix)[, 1:50])[[1]])
149 227
 	open(calls(cnSet2))
150
-	replicate(5, system.time(gt2 <- calls(cnSet2)[, 1:50])[[1]])
228
+	replicate(5, system.time(gt2 <- calls(cnSet.assayData_ff)[, 1:50])[[1]])
151 229
 }
152
-gt1 <- calls(cnSet)[, 1:50]
153
-gt2 <- calls(cnSet2)[, 1:50]
230
+gt1 <- calls(cnSet.assayData_matrix)[, 1:50]
231
+gt2 <- calls(cnSet.assayData_ff)[, 1:50]
154 232
 all.equal(gt1, gt2)
155 233
 @ 
156 234
 
157
-\noindent With \Robject{ff} objects the additional I/O time required
158
-for extracting data is substantial and will increase for larger
159
-datasets.  Patience is required.
160
-
161 235
 Note that for the Affymetrix 6.0 platform the assay data elements each
162 236
 have a row dimension corresponding to the total number of polymorphic
163
-and nonpolymorphic markers interrogated by the Affymetrix 6.0 platform.
164
-A consequence of keeping the rows of the assay data elements the same
165
-for all of the statistical summaries is that the matrix used to store
166
-genotype calls is larger than necessary.  Also, note the additional
167
-overhead of some operations when using \Robject{ff} objects.  For
168
-instance, the posterior probabilities for the CRLMM genotype calls are
169
-represented as integers. The accessor \Robject{snpCallProbability} can
170
-be used to access these confidence scores.  When stored as matrices,
171
-converting the integer representation back to the probability scale is
172
-straightforward as shown below.  However, for the \Robject{ff} objects
173
-we must first bring the data into memory. To bring all the scores into
174
-memory, one could use the function \Rfunction{[,]} but this could be
175
-slow and require a lot of RAM depending on the size of the dataset.
176
-Instead, we suggest pulling only the needed rows and columns from
177
-memory. In the following example, we convert the integer scores to
178
-probabilities for the CEPH samples.  As genotype confidence scores are
179
-not applicable to the nonpolymorphic markers, we extract only the
237
+and nonpolymorphic markers interrogated by the Affymetrix 6.0
238
+platform.  A consequence of keeping the rows of the assay data
239
+elements the same for all of the statistical summaries is that the
240
+matrix used to store genotype calls is larger than necessary.  Also,
241
+note the additional overhead of some operations when using
242
+\Robject{ff} objects.  For instance, the posterior probabilities for
243
+the CRLMM genotype calls are represented as integers. The accessor
244
+\Robject{snpCallProbability} can be used to access these confidence
245
+scores.  When stored as matrices, converting the integer
246
+representation back to the probability scale is straightforward as
247
+shown below.  However, for the \Robject{ff} objects we must first
248
+bring the data into memory. To bring all the scores into memory, one
249
+could use the function \Rfunction{[,]} but this could be slow and
250
+require a lot of RAM depending on the size of the dataset.  Instead,
251
+we suggest pulling only the needed rows and columns from memory. In
252
+the following example, we convert the integer scores to probabilities
253
+for the CEPH samples.  As genotype confidence scores are not
254
+applicable to the nonpolymorphic markers, we extract only the
180 255
 polymorphic markers using the \Rfunction{isSnp} function.
181 256
 
182 257
 <<assayData>>=
183 258
 rows <- which(isSnp(cnSet))
184 259
 cols <- which(batch == "C")
185
-posterior.prob <- tryCatch(integerScoreToProbability(snpCallProbability),
260
+posterior.prob <- tryCatch(i2p(snpCallProbability(cnSet.assayData_ff)),
186 261
 			    error=function(e) print("This will not work for an ff object."))
187
-posterior.prob1 <- integerScoreToProbability(snpCallProbability(cnSet)[rows, cols])
188
-posterior.prob2 <- integerScoreToProbability(snpCallProbability(cnSet2)[rows, cols])
262
+posterior.prob1 <- i2p(snpCallProbability(cnSet.assayData_matrix)[rows, cols])
263
+posterior.prob2 <- i2p(snpCallProbability(cnSet.assayData_ff)[rows, cols])
189 264
 all.equal(posterior.prob1, posterior.prob2)
190 265
 @ 
191 266
 
... ...
@@ -204,15 +279,6 @@ explore the latter. As with the preprocessing and genotyping, the
204 279
 following code should be submitted as part of the batch job as it is too
205 280
 slow for interactive analysis.
206 281
 
207
-<<cn, echo=FALSE, eval=FALSE>>=
208
-##Matrix format
209
-cnSet <- crlmmCopynumber(cnSet)
210
-@ 
211
-
212
-<<save.cnset, echo=FALSE, eval=FALSE>>=
213
-save(cnSet, file=file.path(outdir, "cnSet.rda"))
214
-@ 
215
-
216 282
 <<cn.ff, echo=FALSE, eval=FALSE>>=
217 283
 rm(cnSet); gc()
218 284
 ocProbesets(75e3)
... ...
@@ -489,5 +555,6 @@ toLatex(sessionInfo())
489 555
   \bibliographystyle{plain}
490 556
   \bibliography{refs}
491 557
 %\end{bibliography}
558
+  
492 559
 
493 560
 \end{document}
... ...
@@ -49,13 +49,26 @@ options(continue=" ")
49 49
 library(ff)
50 50
 library(crlmm)
51 51
 crlmm:::validCdfNames()
52
-if(length(grep("development", sessionInfo()[[1]]$status)) == 1){
53
-	outdir <- "/thumper/ctsa/snpmicroarray/rs/data/hapmap/crlmmVignette/devel/illumina"	
54
-} else outdir <- "/thumper/ctsa/snpmicroarray/rs/data/hapmap/crlmmVignette/release/illumina"	
55
-dir.create(outdir, showWarnings=FALSE, recursive=TRUE)
52
+if(getRversion() < "2.12.0"){
53
+	rpath <- getRversion()
54
+} else rpath <- "trunk"
55
+outdir <- paste("/thumper/ctsa/snpmicroarray/rs/ProcessedData/crlmm/", rpath, "/illumina_vignette", sep="")	
56 56
 datadir <- "/thumper/ctsa/snpmicroarray/illumina/IDATS/370k"
57 57
 @ 
58 58
 
59
+\paragraph{About this vignette.} This vignette was created using
60
+Illumina IDAT files that are located in a specific directory on my
61
+computer (\Robject{pathToCels}).  Long computations are saved in the
62
+output directory \Robject{outdir}.  Users should modify these
63
+variables as appropriate.  The following code chunk checks that that
64
+these directories exist.
65
+
66
+<<checkSetup>>=
67
+if(!file.exists(outdir)) stop("Please specify valid directory for storing output")
68
+if(!file.exists(pathToCels)) stop("Please specify the correct path to the CEL files")
69
+@ 
70
+
71
+\paragraph{Preprocessing Illumina IDAT files.}
59 72
 To perform copy number analysis on the Illumina platform, several
60 73
 steps are required.  The first step is to read in the IDAT files and
61 74
 create a container for storing the red and green intensities. These
... ...
@@ -4,11 +4,16 @@
4 4
 \alias{CNSetLM}
5 5
 \alias{CNSetLM-class}
6 6
 \alias{[,CNSetLM-method}
7
+\alias{corr,CNSetLM,character-method}
7 8
 \alias{lines,CNSetLM-method}
8 9
 \alias{lM,CNSetLM-method}
9 10
 \alias{lM<-,CNSetLM,list_or_ffdf-method}
10 11
 \alias{open,CNSetLM-method}
12
+\alias{nu,CNSetLM,character-method}
13
+\alias{phi,CNSetLM,character-method}
11 14
 \alias{show,CNSetLM-method}
15
+\alias{sigma2,CNSetLM,character-method}
16
+\alias{tau2,CNSetLM,character-method}
12 17
 
13 18
 \title{CNSetLM class}
14 19
 \description{Container for allele-specific copy number and linear model
... ...
@@ -45,10 +50,16 @@ Class \code{"\linkS4class{Versioned}"}, by class "CNSet", distance 6.
45 50
     \item{lines}{\code{signature(x="CNSetLM")}: for drawing prediction regions on A vs B scatterplots}
46 51
     \item{lM}{\code{signature(object = "CNSetLM")}: Extract list or
47 52
     ffdf object containing linear model parameters}
48
-   \item{open}{\code{signature(con = "CNSetLM")}: opens file connects
53
+    \item{nu}{\code{signature(object = "CNSetLM"), \code{signature(allele="character")}}: 
54
+            intercept for linear model. See \code{\link{nu}}.}
55
+  \item{open}{\code{signature(con = "CNSetLM")}: opens file connects
49 56
     to ff objects for assayData elements and linear model parameters}
57
+    \item{phi}{\code{signature(object = "CNSetLM"), \code{signature(allele="character")}}: 
58
+            slope for linear model.  See \code{\link{phi}}.}
50 59
     \item{show}{\code{signature(object = "CNSetLM")}: print method
51 60
     for the class }
61
+    \item{sigma2}{\code{signature(object = "CNSetLM"), \code{signature(allele="character")}}: 
62
+    The within genotype         }   
52 63
 	 }
53 64
 }
54 65
 \author{ R. Scharpf}
55 66
new file mode 100644
... ...
@@ -0,0 +1,106 @@
1
+\name{linearModelParams}
2
+\alias{corr}
3
+\alias{nu}
4
+\alias{phi}
5
+\alias{sigma2}
6
+\alias{tau2}
7
+\title{
8
+	Accessors for linear model parameters.
9
+}
10
+\description{
11
+	
12
+	Accessors for linear model parameters.
13
+
14
+}
15
+
16
+\usage{
17
+lM(object)
18
+corr(object, allele)
19
+nu(object, allele)
20
+phi(object, allele)
21
+sigma2(object, allele)
22
+tau2(object, allele)
23
+}
24
+
25
+\arguments{
26
+  \item{object}{  An object of class \code{CNSetLM}}
27
+  \item{allele}{
28
+   Character string. Valid entries for most accessors are 'A' or 'B'.
29
+   For the \code{corr} accessor, valid entries are 'AA', 'AB', or 'BB'.
30
+   }
31
+}
32
+
33
+\details{
34
+
35
+\item{\code{lM}:  Extracts entire list of linear model parameters.}
36
+
37
+\item{\code{corr}: The within-genotype correlation of log2(A) and log2(B) intensities.}
38
+
39
+\item{\code{nu}: The intercept for the linear model.  The linear model is
40
+fit to the A and B alleles independently.}
41
+
42
+\item{\code{phi}: The slope for the linear model.  The linear model is fit
43
+independently to the A and B alleles.}
44
+
45
+\item{\code{sigma2}: For allele A, sigma2 is calculated as the squared MAD
46
+of the log2(intensity) for allele 'A' among subjects with genotype AA.
47
+For allele B, sigma2 is calculated as the squared MAD of the
48
+log2(intensity) for allele 'B' among subjects with genotype BB.
49
+sigma2 can be interpreted as a robust estimate of the signal variance.}
50
+
51
+\item{\code{tau2}: For allele A, tau2 is calculated as the squared MAD of
52
+the log2(intensity) for allele 'A' among subjects with genotype BB.
53
+For allele B, tau2 is calculated as the squared MAD of the
54
+log2(intensity) for allele 'B' among subjects with genotype AA.  tau2
55
+can be interpeted as a robust estimate of the background variance.}
56
+
57
+}
58
+
59
+\value{
60
+
61
+	A matrix or \code{ff} object.
62
+
63
+}
64
+
65
+\author{
66
+R. Scharpf
67
+}
68
+
69
+\seealso{
70
+	\code{\link{CNSetLM-class}}
71
+}
72
+\examples{
73
+## object with ff class
74
+data(sample.CNSetLMff)
75
+invisible(open(sample.CNSetLMff))
76
+class(lM(sample.CNSetLMff))
77
+params <- lM(sample.CNSetLMff)
78
+nuA <- nu(sample.CNSetLMff, "A")
79
+nuB <- nu(sample.CNSetLMff, "B")
80
+phA <- phi(sample.CNSetLMff, "A")
81
+phB <- phi(sample.CNSetLMff, "B")
82
+sig2A <- sigma2(sample.CNSetLMff, "A") 
83
+sig2B <- sigma2(sample.CNSetLMff, "B")
84
+tau2A <- tau2(sample.CNSetLMff, "A")
85
+tau2B <- tau2(sample.CNSetLMff, "B")
86
+corrAA <- corr(sample.CNSetLMff, "AA")
87
+corrBB <- corr(sample.CNSetLMff, "BB")
88
+corrAB <- corr(sample.CNSetLMff, "AB")
89
+invisible(close(sample.CNSetLMff))
90
+## object with matrix class
91
+data(sample.CNSetLM)
92
+class(lM(sample.CNSetlM))
93
+nuA <- nu(sample.CNSetLM, "A")
94
+nuB <- nu(sample.CNSetLM, "B")
95
+phA <- phi(sample.CNSetLM, "A")
96
+phB <- phi(sample.CNSetLM, "B")
97
+sig2A <- sigma2(sample.CNSetLM, "A") 
98
+sig2B <- sigma2(sample.CNSetLM, "B")
99
+tau2A <- tau2(sample.CNSetLM, "A")
100
+tau2B <- tau2(sample.CNSetLM, "B")
101
+corrAA <- corr(sample.CNSetLM, "AA")
102
+corrBB <- corr(sample.CNSetLM, "BB")
103
+corrAB <- corr(sample.CNSetLM, "AB")
104
+}
105
+\keyword{manip}
106
+