git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@48922 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -56,7 +56,7 @@ importFrom(ellipse, ellipse) |
56 | 56 |
importFrom(ff, ffdf) |
57 | 57 |
|
58 | 58 |
exportClasses(CNSetLM, ffdf, list) |
59 |
-exportMethods(open, "[", show, lM, lines) |
|
59 |
+exportMethods(open, "[", show, lM, lines, nu, phi, corr, sigma2, tau2) |
|
60 | 60 |
export(crlmm, |
61 | 61 |
crlmmCopynumber, |
62 | 62 |
crlmmIllumina, |
... | ... |
@@ -76,3 +76,4 @@ export(constructIlluminaCNSet) |
76 | 76 |
|
77 | 77 |
|
78 | 78 |
|
79 |
+ |
... | ... |
@@ -10,4 +10,10 @@ setGeneric("lM", function(object) standardGeneric("lM")) |
10 | 10 |
setGeneric("lM<-", function(object, value) standardGeneric("lM<-")) |
11 | 11 |
setGeneric("totalCopyNumber", function(object,...) standardGeneric("totalCopyNumber")) |
12 | 12 |
|
13 |
+setGeneric("corr", function(object, allele) standardGeneric("corr")) |
|
14 |
+setGeneric("nu", function(object, allele) standardGeneric("nu")) |
|
15 |
+setGeneric("phi", function(object, allele) standardGeneric("phi")) |
|
16 |
+setGeneric("sigma2", function(object, allele) standardGeneric("sigma2")) |
|
17 |
+setGeneric("tau2", function(object, allele) standardGeneric("tau2")) |
|
18 |
+ |
|
13 | 19 |
|
... | ... |
@@ -54,7 +54,7 @@ getFeatureData.Affy <- function(cdfName, copynumber=FALSE){ |
54 | 54 |
} |
55 | 55 |
|
56 | 56 |
construct <- function(filenames, cdfName, copynumber=FALSE, |
57 |
- sns, verbose=TRUE, batch){ |
|
57 |
+ sns, verbose=TRUE, batch, fns){ |
|
58 | 58 |
if(verbose) message("Initializing container for assay data elements alleleA, alleleB, call, callProbability, CA, CB") |
59 | 59 |
if(missing(sns)) sns <- basename(filenames) |
60 | 60 |
protocolData <- getProtocolData.Affy(filenames) |
... | ... |
@@ -67,6 +67,11 @@ construct <- function(filenames, cdfName, copynumber=FALSE, |
67 | 67 |
} |
68 | 68 |
rownames(pData(protocolData)) <- sns |
69 | 69 |
featureData <- getFeatureData.Affy(cdfName, copynumber=copynumber) |
70 |
+ if(!missing(fns)){ |
|
71 |
+ index <- match(fns, featureNames(featureData)) |
|
72 |
+ if(all(is.na(index))) stop("fns not in featureNames") |
|
73 |
+ featureData <- featureData[index, ] |
|
74 |
+ } |
|
70 | 75 |
nr <- nrow(featureData); nc <- length(filenames) |
71 | 76 |
ffObjects <- list(alleleA=initializeBigMatrix(name="A", nr, nc), |
72 | 77 |
alleleB=initializeBigMatrix(name="B", nr, nc), |
... | ... |
@@ -1062,7 +1067,6 @@ crlmmCopynumberLD <- function(object, |
1062 | 1067 |
save(snpflags, file=file.path(ldPath(), "snpflags.rda")) |
1063 | 1068 |
} else{ |
1064 | 1069 |
load(file.path(ldPath(), "snpflags.rda")) |
1065 |
- |
|
1066 | 1070 |
} |
1067 | 1071 |
if(verbose) message("Estimating allele-specific copy number at autosomal SNPs") |
1068 | 1072 |
snpBatches <- splitIndicesByLength(autosomeIndex.snps, ocProbesets()) |
... | ... |
@@ -200,3 +200,89 @@ ellipse.CNSet <- function(x, copynumber, batch, ...){ |
200 | 200 |
} |
201 | 201 |
|
202 | 202 |
|
203 |
+setMethod("nu", c("CNSetLM", "character"), function(object, allele){ |
|
204 |
+ getValue <- function(allele){ |
|
205 |
+ switch(allele, |
|
206 |
+ A="nuA", |
|
207 |
+ B="nuB", |
|
208 |
+ stop("allele must be 'A' or 'B'")) |
|
209 |
+ } |
|
210 |
+ val <- getValue(allele) |
|
211 |
+ class.lm <- class(lM(object)) |
|
212 |
+ if(class.lm == "ffdf"){ |
|
213 |
+ return(physical(lM(object))[[val]]) |
|
214 |
+ } else { |
|
215 |
+ if(class.lm != "matrix") stop("lM() must be matrix or ffdf") |
|
216 |
+ return(lM(object)[[val]]) |
|
217 |
+ } |
|
218 |
+}) |
|
219 |
+ |
|
220 |
+setMethod("phi", c("CNSetLM", "character"), function(object, allele){ |
|
221 |
+ getValue <- function(allele){ |
|
222 |
+ switch(allele, |
|
223 |
+ A="phiA", |
|
224 |
+ B="phiB", |
|
225 |
+ stop("allele must be 'A' or 'B'")) |
|
226 |
+ } |
|
227 |
+ val <- getValue(allele) |
|
228 |
+ class.lm <- class(lM(object)) |
|
229 |
+ if(class.lm == "ffdf"){ |
|
230 |
+ return(physical(lM(object))[[val]]) |
|
231 |
+ } else { |
|
232 |
+ if(class.lm != "matrix") stop("lM() must be matrix or ffdf") |
|
233 |
+ return(lM(object)[[val]]) |
|
234 |
+ } |
|
235 |
+}) |
|
236 |
+ |
|
237 |
+setMethod("sigma2", c("CNSetLM", "character"), function(object, allele){ |
|
238 |
+ getValue <- function(allele){ |
|
239 |
+ switch(allele, |
|
240 |
+ A="sig2A", |
|
241 |
+ B="sig2B", |
|
242 |
+ stop("allele must be 'A' or 'B'")) |
|
243 |
+ } |
|
244 |
+ val <- getValue(allele) |
|
245 |
+ class.lm <- class(lM(object)) |
|
246 |
+ if(class.lm == "ffdf"){ |
|
247 |
+ return(physical(lM(object))[[val]]) |
|
248 |
+ } else { |
|
249 |
+ if(class.lm != "matrix") stop("lM() must be matrix or ffdf") |
|
250 |
+ return(lM(object)[[val]]) |
|
251 |
+ } |
|
252 |
+}) |
|
253 |
+ |
|
254 |
+setMethod("tau2", c("CNSetLM", "character"), function(object, allele){ |
|
255 |
+ getValue <- function(allele){ |
|
256 |
+ switch(allele, |
|
257 |
+ A="tau2A", |
|
258 |
+ B="tau2B", |
|
259 |
+ stop("allele must be 'A' or 'B'")) |
|
260 |
+ } |
|
261 |
+ val <- getValue(allele) |
|
262 |
+ class.lm <- class(lM(object)) |
|
263 |
+ if(class.lm == "ffdf"){ |
|
264 |
+ return(physical(lM(object))[[val]]) |
|
265 |
+ } else { |
|
266 |
+ if(class.lm != "matrix") stop("lM() must be matrix or ffdf") |
|
267 |
+ return(lM(object)[[val]]) |
|
268 |
+ } |
|
269 |
+}) |
|
270 |
+ |
|
271 |
+setMethod("corr", c("CNSetLM", "character"), function(object, allele){ |
|
272 |
+ getValue <- function(allele){ |
|
273 |
+ switch(allele, |
|
274 |
+ AA="corrAA", |
|
275 |
+ AB="corrAB", |
|
276 |
+ BB="corrBB", |
|
277 |
+ stop("must be AA, AB, or BB")) |
|
278 |
+ } |
|
279 |
+ val <- getValue(allele) |
|
280 |
+ class.lm <- class(lM(object)) |
|
281 |
+ if(class.lm == "ffdf"){ |
|
282 |
+ return(physical(lM(object))[[val]]) |
|
283 |
+ } else { |
|
284 |
+ if(class.lm != "matrix") stop("lM() must be matrix or ffdf") |
|
285 |
+ return(lM(object)[[val]]) |
|
286 |
+ } |
|
287 |
+}) |
|
288 |
+ |
... | ... |
@@ -1,10 +1,15 @@ |
1 | 1 |
## Need to submit from a fat node! |
2 |
-all: illumina_copynumber, copynumber |
|
2 |
+all: illumina_copynumber copynumber |
|
3 | 3 |
|
4 | 4 |
copynumber: copynumber.Rnw |
5 |
+## echo "Sweave(\"$1.Rnw\"); library(tools); texi2dvi(\"$1.tex\", pdf=TRUE)" | R --no-save --no-restore; |
|
5 | 6 |
echo "Stangle(\"copynumber.Rnw\")" | R --no-save --no-restore; |
6 | 7 |
cat ~/bin/cluster.template | perl -pe "s/Rprog/copynumber.R/" > copynumber.R.sh |
7 |
- qsub -m e -r y -cwd -l mem_free=10G copynumber.R.sh |
|
8 |
+ qsub -m e -r y -cwd -l mem_free=12G,h_vmem=16G copynumber.R.sh |
|
9 |
+## the above does not create copynumber.tex |
|
10 |
+ |
|
11 |
+sweavecopynumber: copynumber.Rnw |
|
12 |
+ echo "Sweave(\"copynumber.Rnw\")" | R --no-save --no-restore; |
|
8 | 13 |
|
9 | 14 |
texcopynumber: copynumber.tex |
10 | 15 |
texi2dvi --pdf --clean --quiet copynumber.tex |
... | ... |
@@ -12,7 +17,7 @@ texcopynumber: copynumber.tex |
12 | 17 |
illumina_copynumber: illumina_copynumber.Rnw |
13 | 18 |
echo "Stangle(\"illumina_copynumber.Rnw\")" | R --no-save --no-restore; |
14 | 19 |
cat ~/bin/cluster.template | perl -pe "s/Rprog/illumina_copynumber.R/" > illumina_copynumber.R.sh |
15 |
- qsub -m e -r y -cwd -l mem_free=10G illumina_copynumber.R.sh |
|
20 |
+ qsub -m e -r y -cwd -l mem_free=12G,h_vmem=16G illumina_copynumber.R.sh |
|
16 | 21 |
|
17 | 22 |
texillumina_copynumber: illumina_copynumber.tex |
18 | 23 |
texi2dvi --pdf --clean --quiet illumina_copynumber.tex |
... | ... |
@@ -47,22 +47,48 @@ library(oligoClasses) |
47 | 47 |
library(crlmm) |
48 | 48 |
crlmm:::validCdfNames() |
49 | 49 |
cdfName <- "genomewidesnp6" |
50 |
+pathToCels <- "/thumper/ctsa/snpmicroarray/hapmap/raw/affy/1m" |
|
51 |
+if(getRversion() < "2.12.0"){ |
|
52 |
+ rpath <- getRversion() |
|
53 |
+} else rpath <- "trunk" |
|
54 |
+outdir <- paste("/thumper/ctsa/snpmicroarray/rs/ProcessedData/crlmm/", rpath, "/copynumber_vignette", sep="") |
|
50 | 55 |
@ |
51 | 56 |
|
52 |
-\paragraph{Preprocess and genotype.} |
|
57 |
+\paragraph{About this vignette.} This vignette was created using CEL |
|
58 |
+files that are located in a specific directory on my computer |
|
59 |
+(\Robject{pathToCels}). Long computations are saved in the output |
|
60 |
+directory \Robject{outdir}. Users should change these variables as |
|
61 |
+appropriate. The following code chunk should fail unless the above |
|
62 |
+arguments have been set appropriately. |
|
63 |
+ |
|
64 |
+<<checkSetup>>= |
|
65 |
+if(!file.exists(outdir)) stop("Please specify valid directory for storing output") |
|
66 |
+if(!file.exists(pathToCels)) stop("Please specify the correct path to the CEL files") |
|
67 |
+@ |
|
68 |
+ |
|
69 |
+Processed data from codechunks requiring long computations are saved |
|
70 |
+to disk automatically by wrapping function calls in the |
|
71 |
+\Robject{checkExists} function. After the initial batch job, |
|
72 |
+subsequent calls to \verb@Sweave@ will load saved computations from |
|
73 |
+disk and can therefore be run interactively. This may be useful as you |
|
74 |
+begin to explore some of the tools for accessing processed data and |
|
75 |
+visualizations. |
|
76 |
+ |
|
77 |
+\paragraph{Preprocessing and genotyping.} |
|
53 | 78 |
|
54 | 79 |
In the following code chunk, we provide the complete path to the |
55 |
-Affymetrix CEL files, assign a path to store the normalized |
|
56 |
-intensities and genotype data, and define a 'batch' variable. The |
|
57 |
-batch variable will later be useful when we estimate copy number. We |
|
58 |
-typically use the 96 well chemistry plate or the scan date of the |
|
59 |
-array as a surrogate for batch. Adjusting by date or chemistry plate |
|
60 |
-can be helpful for limiting the influence of batch effects. |
|
80 |
+Affymetrix CEL files and define a 'batch' variable. The |
|
81 |
+\Robject{batch} variable will later be useful when we estimate copy |
|
82 |
+number. We typically use the 96 well chemistry plate or the scan date |
|
83 |
+of the array as a surrogate for batch. Adjusting by date or chemistry |
|
84 |
+plate can be critical for limiting the influence of batch effects. For |
|
85 |
+the HapMap CEL files in our analysis, the CEPH (C) and Yoruban (Y) |
|
86 |
+samples were run on separate plates and can be extracted from the CEL |
|
87 |
+filename as below. |
|
61 | 88 |
|
62 | 89 |
<<celfiles>>= |
63 |
-celFiles <- list.celfiles("/thumper/ctsa/snpmicroarray/hapmap/raw/affy/1m", full.names=TRUE, pattern=".CEL") |
|
64 |
-outdir <- "/thumper/ctsa/snpmicroarray/rs/data/hapmap/1m/crlmm" |
|
65 |
-if(!file.exists(outdir)) dir.create(outdir) |
|
90 |
+celFiles <- list.celfiles(pathToCels, full.names=TRUE, pattern=".CEL") |
|
91 |
+if(length(celFiles) < 1) stop("Please specify the appropriate path to CEL files") |
|
66 | 92 |
## CEPH and Yoruban |
67 | 93 |
batch <- substr(basename(celFiles), 13, 13) |
68 | 94 |
celFiles <- celFiles[batch %in% c("C", "Y")] |
... | ... |
@@ -77,115 +103,164 @@ summarized for each locus. As the markers at polymorphic loci on the |
77 | 103 |
Affymetrix 6.0 platform are identical, we summarize the intensities by |
78 | 104 |
the median. For the nonpolymorphic markers, no summarization step is |
79 | 105 |
performs. Next, the \crlmm{} package provides a genotype call and a |
80 |
-confidence score at each polymorphic locus. Unless the dataset is small |
|
81 |
-(e.g., fewer than 50 samples), we suggest installing and loading the |
|
82 |
-\R{} package \Rpackage{ff}. The function \Rfunction{genotype} checks to |
|
83 |
-see whether the \Rpackage{ff} is loaded. If loaded, the normalized |
|
84 |
-intensities and genotype are stored as \Robject{ff} objects on disk. |
|
85 |
-Otherwise, the genotypes and normalized intensities are stored in |
|
86 |
-matrices. A word of caution: the \Rfunction{genotype} function requires |
|
87 |
-a potentially large amount of RAM even when the \R{} package |
|
88 |
-\Rpackage{ff} is loaded. Users with large datasets may want to skip |
|
89 |
-this part. For the 180 HapMap samples processed in this vignette, the |
|
90 |
-\Rfunction{genotype} function required 25MB RAM. The next two code |
|
91 |
-chunks should be submitted as batch jobs. |
|
106 |
+confidence score at each polymorphic locus. Unless the dataset is |
|
107 |
+small (e.g., fewer than 50 samples), we suggest installing and loading |
|
108 |
+the \R{} package \Rpackage{ff}. The function \Rfunction{genotype} |
|
109 |
+checks to see whether the \Rpackage{ff} is loaded. If loaded, the |
|
110 |
+normalized intensities and genotype are stored as \Robject{ff} objects |
|
111 |
+on disk. Otherwise, the genotypes and normalized intensities are |
|
112 |
+stored in matrices. A word of caution: the \Rfunction{genotype} |
|
113 |
+function without \Rpackage{ff} requires a potentially large amount of |
|
114 |
+RAM. Therefore, we illustrate the \Robject{genotype} function using |
|
115 |
+only the first 20 CEL files. We wrap the function \Rfunction{genotype} |
|
116 |
+in the \Rfunction{checkExists} so that a user can load and inspect the |
|
117 |
+saved object. We first check that the file \Robject{cnSet.rda} does |
|
118 |
+not exist. Existence of this file implies that we have already run the |
|
119 |
+copy number estimation and, therefore, we do not need to preprocess |
|
120 |
+and genotype the files a second time. |
|
92 | 121 |
|
93 | 122 |
<<genotype>>= |
94 |
-if(!exists("cnSet")){ |
|
95 |
- if(!file.exists(file.path(outdir, "cnSet.rda"))){ |
|
96 |
- ##ops <- options(error=recover) |
|
97 |
- cnSet <- genotype(celFiles, cdfName=cdfName, copynumber=TRUE, batch=batch) |
|
98 |
- save(cnSet, file=file.path(outdir, "cnSet.rda")) |
|
99 |
- } else load(file.path(outdir, "cnSet.rda")) |
|
123 |
+if(!file.exists(file.path(outdir, "cnSet.rda"))){ |
|
124 |
+ gtSet.assayData_matrix <- checkExists("gtSet.assayData_matrix", |
|
125 |
+ .path=outdir, |
|
126 |
+ .FUN=genotype, |
|
127 |
+ filenames=celFiles[1:20], |
|
128 |
+ cdfName=cdfName, |
|
129 |
+ batch=batch[1:20]) |
|
130 |
+ class(calls(gtSet.assayData_matrix)) |
|
100 | 131 |
} |
101 | 132 |
@ |
102 | 133 |
|
134 |
+Next, we estimate copy number for the 20 CEL files. The copy number |
|
135 |
+estimation step requirees a minimum number of CEL files in each batch |
|
136 |
+(we suggest 10). While estimation can be performed using only the 20 |
|
137 |
+samples as we do here, it would be far preferable to process all of |
|
138 |
+the files that were on the same chemistry plate. The code chunk below |
|
139 |
+will load \Robject{cnSet.assayData_matrix} from disk if this |
|
140 |
+computation had already been performed as part of the batch job. |
|
141 |
+ |
|
142 |
+<<copynumber>>= |
|
143 |
+cnSet.assayData_matrix <- checkExists("cnSet.assayData_matrix", |
|
144 |
+ .path=outdir, |
|
145 |
+ .FUN=crlmmCopynumber, |
|
146 |
+ object=gtSet.assayData_matrix) |
|
147 |
+if(file.exists(file.path(outdir, "gtSet.assayData_matrix.rda"))) |
|
148 |
+ unlink(file.path(outdir, "gtSet.assayData_matrix.rda")) |
|
149 |
+@ |
|
150 |
+ |
|
151 |
+ |
|
103 | 152 |
A more memory efficient approach to preprocessing and genotyping is |
104 |
-implemented in the \R{} function \Rfunction{genotype2}. The arguments |
|
105 |
-to this function are similar to \Rfunction{genotype} but requires |
|
106 |
-explicit loading of the \R{} package \Rpackage{ff} prior to calling |
|
107 |
-the function. The functions \Rfunction{ocProbesets} and |
|
153 |
+implemented in the \R{} function \Rfunction{genotypeLD} and requires |
|
154 |
+the \Rpackage{ff} package. The functions \Rfunction{ocProbesets} and |
|
108 | 155 |
\Rfunction{ocSamples} can be used to manage how many probesets and |
109 |
-samples are to be loaded into memory and processed. Using the default |
|
110 |
-settings, the following code required 1.9Mb RAM to process 180 CEL |
|
111 |
-files. |
|
156 |
+samples are to processed at a time and can therefore be used to fine |
|
157 |
+tune the required RAM. The function \Rfunction{ldPath} indicates that |
|
158 |
+\Robject{ff} objects will be stored in the directory \Robject{outdir}. |
|
112 | 159 |
|
113 |
-<<genotype2>>= |
|
160 |
+<<ff>>= |
|
114 | 161 |
library(ff) |
115 |
-ld.path <- file.path(outdir, "ffobjs") |
|
116 |
-if(!file.exists(ld.path)) dir.create(ld.path) |
|
117 |
-ldPath(ld.path) |
|
118 |
-if(!exists("cnSet2")){ |
|
119 |
- if(!file.exists(file.path(outdir, "cnSet2.rda"))){ |
|
120 |
- cnSet2 <- genotype2(celFiles, cdfName=cdfName, copynumber=TRUE, batch=batch) |
|
121 |
- save(cnSet2, file=file.path(outdir, "cnSet2.rda")) |
|
122 |
- } else load(file.path(outdir, "cnSet2.rda")) |
|
162 |
+ldPath(outdir) |
|
163 |
+ocProbesets(50000) |
|
164 |
+ocSamples(200) |
|
165 |
+@ |
|
166 |
+ |
|
167 |
+With LDS now enabled, we preprocess and genotype 180 samples from the |
|
168 |
+CEPH and Yoruban populations using the \R{} function |
|
169 |
+\Robject{genotypeLD}. Users that only want the genotypes at |
|
170 |
+polymorphic loci (and not the copy number estimates) should instead |
|
171 |
+use the \R{} function \Rfunction{crlmm} or \Rfunction{crlmm2}. Again, |
|
172 |
+we wrap the call to \Rfunction{genotypeLD} in \Rfunction{checkExists} |
|
173 |
+so that subsequent calls to \verb@Sweave@ can be run interactively. |
|
174 |
+ |
|
175 |
+ |
|
176 |
+<<LDS_genotype>>= |
|
177 |
+if(!file.exists(file.path(outdir, "cnSet.assayData_ff.rda"))){ |
|
178 |
+ gtSet.assayData_ff <- checkExists("gtSet.assayData_ff", |
|
179 |
+ .path=outdir, |
|
180 |
+ .FUN=genotypeLD, |
|
181 |
+ filenames=celFiles, |
|
182 |
+ cdfName=cdfName, |
|
183 |
+ batch=batch) |
|
184 |
+ class(calls(gtSet.assayData_ff)) |
|
123 | 185 |
} |
124 | 186 |
@ |
125 | 187 |
|
126 |
-The objects returned by the \Rfunction{genotype} and |
|
127 |
-\Rfunction{genotype2} differ in size as the former returns an object |
|
128 |
-with matrices in the assay data slot, whereas the latter return an |
|
129 |
-object with pointers to files on disk. The functions \Rfunction{open} |
|
130 |
-and \Rfunction{close} can be used to open and close the connections |
|
131 |
-stored in the assay data of the \Robject{cnSet2} object, respectively. |
|
132 |
-Subsetting the \Robject{ff} object pulls the data from disk into |
|
133 |
-memory. As subsetting operations pull the data from disk to memory, |
|
134 |
-these operations must be performed with care. In particular, |
|
135 |
-subsetting the \Robject{cnSet2} will subset each assay data element |
|
136 |
-and this can be slow. The preferred approach would be to extract the |
|
137 |
-assay data element that is needed, and then subset this function |
|
138 |
-object as illustrated below for the genotype calls. |
|
188 |
+The analogous function for copy number estimation follows. In |
|
189 |
+practice, once the object \Robject{cnSet.assayData_ff} is created the |
|
190 |
+\Robject{gtSet.assayData_ff} is no longer needed and can be removed |
|
191 |
+from the \Robject{outdir}. |
|
192 |
+ |
|
193 |
+<<LDS_copynumber>>= |
|
194 |
+cnSet.assayData_ff <- checkExists("cnSet.assayData_ff", |
|
195 |
+ .path=outdir, |
|
196 |
+ .FUN=crlmmCopynumberLD, |
|
197 |
+ filenames=celFiles, |
|
198 |
+ cdfName=cdfName, |
|
199 |
+ batch=batch) |
|
200 |
+if(file.exists(file.path(outdir, "gtSet.assayData_ff.rda"))) |
|
201 |
+ unlink(file.path(outdir, "gtSet.assayData_ff.rda")) |
|
202 |
+gc() |
|
203 |
+@ |
|
204 |
+ |
|
205 |
+The objects returned by the \Rfunction{genotypeLD} and |
|
206 |
+\Rfunction{crlmmCopynumberLD} have assay data elements that are |
|
207 |
+pointers to \Robject{ff} objects stored in the directory |
|
208 |
+\Robject{outdir}. The functions \Rfunction{open} and |
|
209 |
+\Rfunction{close} open and close the connections to the |
|
210 |
+\Robject{assayData} elements. Subsetting an \Robject{ff} object pulls |
|
211 |
+the data from disk into memory and should therefore be used with |
|
212 |
+caution. In particular, subsetting the \Robject{gt.assayData_ff} would |
|
213 |
+subset each elements in the \Robject{assayData} slot, returning an |
|
214 |
+object of the same class but with \Robject{assayData} elements that |
|
215 |
+are matrices. Such an operation can be exceedingly slow and require |
|
216 |
+subsantial RAM. The preferred approach is to extract only the assay |
|
217 |
+data element that is needed. In the example below, we extract the |
|
218 |
+genotype calls for the the first 50 samples. |
|
139 | 219 |
|
140 | 220 |
<<check>>= |
141 |
-class(calls(cnSet)) |
|
142 |
-dims(cnSet) |
|
143 |
-class(calls(cnSet2)) |
|
144 |
-dims(cnSet2) |
|
145 |
-print(object.size(cnSet), units="Mb") |
|
146 |
-print(object.size(cnSet2), units="Mb") |
|
221 |
+dims(cnSet.assayData_ff) |
|
222 |
+dims(cnSet.assayData_matrix) |
|
223 |
+print(object.size(cnSet.assayData_matrix), units="Mb") |
|
224 |
+print(object.size(cnSet.assayData_ff), units="Mb") |
|
147 | 225 |
if(FALSE){ |
148 |
- replicate(5, system.time(gt1 <- calls(cnSet)[, 1:50])[[1]]) |
|
226 |
+ replicate(5, system.time(gt1 <- calls(cnSet.assayData_matrix)[, 1:50])[[1]]) |
|
149 | 227 |
open(calls(cnSet2)) |
150 |
- replicate(5, system.time(gt2 <- calls(cnSet2)[, 1:50])[[1]]) |
|
228 |
+ replicate(5, system.time(gt2 <- calls(cnSet.assayData_ff)[, 1:50])[[1]]) |
|
151 | 229 |
} |
152 |
-gt1 <- calls(cnSet)[, 1:50] |
|
153 |
-gt2 <- calls(cnSet2)[, 1:50] |
|
230 |
+gt1 <- calls(cnSet.assayData_matrix)[, 1:50] |
|
231 |
+gt2 <- calls(cnSet.assayData_ff)[, 1:50] |
|
154 | 232 |
all.equal(gt1, gt2) |
155 | 233 |
@ |
156 | 234 |
|
157 |
-\noindent With \Robject{ff} objects the additional I/O time required |
|
158 |
-for extracting data is substantial and will increase for larger |
|
159 |
-datasets. Patience is required. |
|
160 |
- |
|
161 | 235 |
Note that for the Affymetrix 6.0 platform the assay data elements each |
162 | 236 |
have a row dimension corresponding to the total number of polymorphic |
163 |
-and nonpolymorphic markers interrogated by the Affymetrix 6.0 platform. |
|
164 |
-A consequence of keeping the rows of the assay data elements the same |
|
165 |
-for all of the statistical summaries is that the matrix used to store |
|
166 |
-genotype calls is larger than necessary. Also, note the additional |
|
167 |
-overhead of some operations when using \Robject{ff} objects. For |
|
168 |
-instance, the posterior probabilities for the CRLMM genotype calls are |
|
169 |
-represented as integers. The accessor \Robject{snpCallProbability} can |
|
170 |
-be used to access these confidence scores. When stored as matrices, |
|
171 |
-converting the integer representation back to the probability scale is |
|
172 |
-straightforward as shown below. However, for the \Robject{ff} objects |
|
173 |
-we must first bring the data into memory. To bring all the scores into |
|
174 |
-memory, one could use the function \Rfunction{[,]} but this could be |
|
175 |
-slow and require a lot of RAM depending on the size of the dataset. |
|
176 |
-Instead, we suggest pulling only the needed rows and columns from |
|
177 |
-memory. In the following example, we convert the integer scores to |
|
178 |
-probabilities for the CEPH samples. As genotype confidence scores are |
|
179 |
-not applicable to the nonpolymorphic markers, we extract only the |
|
237 |
+and nonpolymorphic markers interrogated by the Affymetrix 6.0 |
|
238 |
+platform. A consequence of keeping the rows of the assay data |
|
239 |
+elements the same for all of the statistical summaries is that the |
|
240 |
+matrix used to store genotype calls is larger than necessary. Also, |
|
241 |
+note the additional overhead of some operations when using |
|
242 |
+\Robject{ff} objects. For instance, the posterior probabilities for |
|
243 |
+the CRLMM genotype calls are represented as integers. The accessor |
|
244 |
+\Robject{snpCallProbability} can be used to access these confidence |
|
245 |
+scores. When stored as matrices, converting the integer |
|
246 |
+representation back to the probability scale is straightforward as |
|
247 |
+shown below. However, for the \Robject{ff} objects we must first |
|
248 |
+bring the data into memory. To bring all the scores into memory, one |
|
249 |
+could use the function \Rfunction{[,]} but this could be slow and |
|
250 |
+require a lot of RAM depending on the size of the dataset. Instead, |
|
251 |
+we suggest pulling only the needed rows and columns from memory. In |
|
252 |
+the following example, we convert the integer scores to probabilities |
|
253 |
+for the CEPH samples. As genotype confidence scores are not |
|
254 |
+applicable to the nonpolymorphic markers, we extract only the |
|
180 | 255 |
polymorphic markers using the \Rfunction{isSnp} function. |
181 | 256 |
|
182 | 257 |
<<assayData>>= |
183 | 258 |
rows <- which(isSnp(cnSet)) |
184 | 259 |
cols <- which(batch == "C") |
185 |
-posterior.prob <- tryCatch(integerScoreToProbability(snpCallProbability), |
|
260 |
+posterior.prob <- tryCatch(i2p(snpCallProbability(cnSet.assayData_ff)), |
|
186 | 261 |
error=function(e) print("This will not work for an ff object.")) |
187 |
-posterior.prob1 <- integerScoreToProbability(snpCallProbability(cnSet)[rows, cols]) |
|
188 |
-posterior.prob2 <- integerScoreToProbability(snpCallProbability(cnSet2)[rows, cols]) |
|
262 |
+posterior.prob1 <- i2p(snpCallProbability(cnSet.assayData_matrix)[rows, cols]) |
|
263 |
+posterior.prob2 <- i2p(snpCallProbability(cnSet.assayData_ff)[rows, cols]) |
|
189 | 264 |
all.equal(posterior.prob1, posterior.prob2) |
190 | 265 |
@ |
191 | 266 |
|
... | ... |
@@ -204,15 +279,6 @@ explore the latter. As with the preprocessing and genotyping, the |
204 | 279 |
following code should be submitted as part of the batch job as it is too |
205 | 280 |
slow for interactive analysis. |
206 | 281 |
|
207 |
-<<cn, echo=FALSE, eval=FALSE>>= |
|
208 |
-##Matrix format |
|
209 |
-cnSet <- crlmmCopynumber(cnSet) |
|
210 |
-@ |
|
211 |
- |
|
212 |
-<<save.cnset, echo=FALSE, eval=FALSE>>= |
|
213 |
-save(cnSet, file=file.path(outdir, "cnSet.rda")) |
|
214 |
-@ |
|
215 |
- |
|
216 | 282 |
<<cn.ff, echo=FALSE, eval=FALSE>>= |
217 | 283 |
rm(cnSet); gc() |
218 | 284 |
ocProbesets(75e3) |
... | ... |
@@ -489,5 +555,6 @@ toLatex(sessionInfo()) |
489 | 555 |
\bibliographystyle{plain} |
490 | 556 |
\bibliography{refs} |
491 | 557 |
%\end{bibliography} |
558 |
+ |
|
492 | 559 |
|
493 | 560 |
\end{document} |
... | ... |
@@ -49,13 +49,26 @@ options(continue=" ") |
49 | 49 |
library(ff) |
50 | 50 |
library(crlmm) |
51 | 51 |
crlmm:::validCdfNames() |
52 |
-if(length(grep("development", sessionInfo()[[1]]$status)) == 1){ |
|
53 |
- outdir <- "/thumper/ctsa/snpmicroarray/rs/data/hapmap/crlmmVignette/devel/illumina" |
|
54 |
-} else outdir <- "/thumper/ctsa/snpmicroarray/rs/data/hapmap/crlmmVignette/release/illumina" |
|
55 |
-dir.create(outdir, showWarnings=FALSE, recursive=TRUE) |
|
52 |
+if(getRversion() < "2.12.0"){ |
|
53 |
+ rpath <- getRversion() |
|
54 |
+} else rpath <- "trunk" |
|
55 |
+outdir <- paste("/thumper/ctsa/snpmicroarray/rs/ProcessedData/crlmm/", rpath, "/illumina_vignette", sep="") |
|
56 | 56 |
datadir <- "/thumper/ctsa/snpmicroarray/illumina/IDATS/370k" |
57 | 57 |
@ |
58 | 58 |
|
59 |
+\paragraph{About this vignette.} This vignette was created using |
|
60 |
+Illumina IDAT files that are located in a specific directory on my |
|
61 |
+computer (\Robject{pathToCels}). Long computations are saved in the |
|
62 |
+output directory \Robject{outdir}. Users should modify these |
|
63 |
+variables as appropriate. The following code chunk checks that that |
|
64 |
+these directories exist. |
|
65 |
+ |
|
66 |
+<<checkSetup>>= |
|
67 |
+if(!file.exists(outdir)) stop("Please specify valid directory for storing output") |
|
68 |
+if(!file.exists(pathToCels)) stop("Please specify the correct path to the CEL files") |
|
69 |
+@ |
|
70 |
+ |
|
71 |
+\paragraph{Preprocessing Illumina IDAT files.} |
|
59 | 72 |
To perform copy number analysis on the Illumina platform, several |
60 | 73 |
steps are required. The first step is to read in the IDAT files and |
61 | 74 |
create a container for storing the red and green intensities. These |
... | ... |
@@ -4,11 +4,16 @@ |
4 | 4 |
\alias{CNSetLM} |
5 | 5 |
\alias{CNSetLM-class} |
6 | 6 |
\alias{[,CNSetLM-method} |
7 |
+\alias{corr,CNSetLM,character-method} |
|
7 | 8 |
\alias{lines,CNSetLM-method} |
8 | 9 |
\alias{lM,CNSetLM-method} |
9 | 10 |
\alias{lM<-,CNSetLM,list_or_ffdf-method} |
10 | 11 |
\alias{open,CNSetLM-method} |
12 |
+\alias{nu,CNSetLM,character-method} |
|
13 |
+\alias{phi,CNSetLM,character-method} |
|
11 | 14 |
\alias{show,CNSetLM-method} |
15 |
+\alias{sigma2,CNSetLM,character-method} |
|
16 |
+\alias{tau2,CNSetLM,character-method} |
|
12 | 17 |
|
13 | 18 |
\title{CNSetLM class} |
14 | 19 |
\description{Container for allele-specific copy number and linear model |
... | ... |
@@ -45,10 +50,16 @@ Class \code{"\linkS4class{Versioned}"}, by class "CNSet", distance 6. |
45 | 50 |
\item{lines}{\code{signature(x="CNSetLM")}: for drawing prediction regions on A vs B scatterplots} |
46 | 51 |
\item{lM}{\code{signature(object = "CNSetLM")}: Extract list or |
47 | 52 |
ffdf object containing linear model parameters} |
48 |
- \item{open}{\code{signature(con = "CNSetLM")}: opens file connects |
|
53 |
+ \item{nu}{\code{signature(object = "CNSetLM"), \code{signature(allele="character")}}: |
|
54 |
+ intercept for linear model. See \code{\link{nu}}.} |
|
55 |
+ \item{open}{\code{signature(con = "CNSetLM")}: opens file connects |
|
49 | 56 |
to ff objects for assayData elements and linear model parameters} |
57 |
+ \item{phi}{\code{signature(object = "CNSetLM"), \code{signature(allele="character")}}: |
|
58 |
+ slope for linear model. See \code{\link{phi}}.} |
|
50 | 59 |
\item{show}{\code{signature(object = "CNSetLM")}: print method |
51 | 60 |
for the class } |
61 |
+ \item{sigma2}{\code{signature(object = "CNSetLM"), \code{signature(allele="character")}}: |
|
62 |
+ The within genotype } |
|
52 | 63 |
} |
53 | 64 |
} |
54 | 65 |
\author{ R. Scharpf} |
55 | 66 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,106 @@ |
1 |
+\name{linearModelParams} |
|
2 |
+\alias{corr} |
|
3 |
+\alias{nu} |
|
4 |
+\alias{phi} |
|
5 |
+\alias{sigma2} |
|
6 |
+\alias{tau2} |
|
7 |
+\title{ |
|
8 |
+ Accessors for linear model parameters. |
|
9 |
+} |
|
10 |
+\description{ |
|
11 |
+ |
|
12 |
+ Accessors for linear model parameters. |
|
13 |
+ |
|
14 |
+} |
|
15 |
+ |
|
16 |
+\usage{ |
|
17 |
+lM(object) |
|
18 |
+corr(object, allele) |
|
19 |
+nu(object, allele) |
|
20 |
+phi(object, allele) |
|
21 |
+sigma2(object, allele) |
|
22 |
+tau2(object, allele) |
|
23 |
+} |
|
24 |
+ |
|
25 |
+\arguments{ |
|
26 |
+ \item{object}{ An object of class \code{CNSetLM}} |
|
27 |
+ \item{allele}{ |
|
28 |
+ Character string. Valid entries for most accessors are 'A' or 'B'. |
|
29 |
+ For the \code{corr} accessor, valid entries are 'AA', 'AB', or 'BB'. |
|
30 |
+ } |
|
31 |
+} |
|
32 |
+ |
|
33 |
+\details{ |
|
34 |
+ |
|
35 |
+\item{\code{lM}: Extracts entire list of linear model parameters.} |
|
36 |
+ |
|
37 |
+\item{\code{corr}: The within-genotype correlation of log2(A) and log2(B) intensities.} |
|
38 |
+ |
|
39 |
+\item{\code{nu}: The intercept for the linear model. The linear model is |
|
40 |
+fit to the A and B alleles independently.} |
|
41 |
+ |
|
42 |
+\item{\code{phi}: The slope for the linear model. The linear model is fit |
|
43 |
+independently to the A and B alleles.} |
|
44 |
+ |
|
45 |
+\item{\code{sigma2}: For allele A, sigma2 is calculated as the squared MAD |
|
46 |
+of the log2(intensity) for allele 'A' among subjects with genotype AA. |
|
47 |
+For allele B, sigma2 is calculated as the squared MAD of the |
|
48 |
+log2(intensity) for allele 'B' among subjects with genotype BB. |
|
49 |
+sigma2 can be interpreted as a robust estimate of the signal variance.} |
|
50 |
+ |
|
51 |
+\item{\code{tau2}: For allele A, tau2 is calculated as the squared MAD of |
|
52 |
+the log2(intensity) for allele 'A' among subjects with genotype BB. |
|
53 |
+For allele B, tau2 is calculated as the squared MAD of the |
|
54 |
+log2(intensity) for allele 'B' among subjects with genotype AA. tau2 |
|
55 |
+can be interpeted as a robust estimate of the background variance.} |
|
56 |
+ |
|
57 |
+} |
|
58 |
+ |
|
59 |
+\value{ |
|
60 |
+ |
|
61 |
+ A matrix or \code{ff} object. |
|
62 |
+ |
|
63 |
+} |
|
64 |
+ |
|
65 |
+\author{ |
|
66 |
+R. Scharpf |
|
67 |
+} |
|
68 |
+ |
|
69 |
+\seealso{ |
|
70 |
+ \code{\link{CNSetLM-class}} |
|
71 |
+} |
|
72 |
+\examples{ |
|
73 |
+## object with ff class |
|
74 |
+data(sample.CNSetLMff) |
|
75 |
+invisible(open(sample.CNSetLMff)) |
|
76 |
+class(lM(sample.CNSetLMff)) |
|
77 |
+params <- lM(sample.CNSetLMff) |
|
78 |
+nuA <- nu(sample.CNSetLMff, "A") |
|
79 |
+nuB <- nu(sample.CNSetLMff, "B") |
|
80 |
+phA <- phi(sample.CNSetLMff, "A") |
|
81 |
+phB <- phi(sample.CNSetLMff, "B") |
|
82 |
+sig2A <- sigma2(sample.CNSetLMff, "A") |
|
83 |
+sig2B <- sigma2(sample.CNSetLMff, "B") |
|
84 |
+tau2A <- tau2(sample.CNSetLMff, "A") |
|
85 |
+tau2B <- tau2(sample.CNSetLMff, "B") |
|
86 |
+corrAA <- corr(sample.CNSetLMff, "AA") |
|
87 |
+corrBB <- corr(sample.CNSetLMff, "BB") |
|
88 |
+corrAB <- corr(sample.CNSetLMff, "AB") |
|
89 |
+invisible(close(sample.CNSetLMff)) |
|
90 |
+## object with matrix class |
|
91 |
+data(sample.CNSetLM) |
|
92 |
+class(lM(sample.CNSetlM)) |
|
93 |
+nuA <- nu(sample.CNSetLM, "A") |
|
94 |
+nuB <- nu(sample.CNSetLM, "B") |
|
95 |
+phA <- phi(sample.CNSetLM, "A") |
|
96 |
+phB <- phi(sample.CNSetLM, "B") |
|
97 |
+sig2A <- sigma2(sample.CNSetLM, "A") |
|
98 |
+sig2B <- sigma2(sample.CNSetLM, "B") |
|
99 |
+tau2A <- tau2(sample.CNSetLM, "A") |
|
100 |
+tau2B <- tau2(sample.CNSetLM, "B") |
|
101 |
+corrAA <- corr(sample.CNSetLM, "AA") |
|
102 |
+corrBB <- corr(sample.CNSetLM, "BB") |
|
103 |
+corrAB <- corr(sample.CNSetLM, "AB") |
|
104 |
+} |
|
105 |
+\keyword{manip} |
|
106 |
+ |