git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@54261 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -1,7 +1,7 @@ |
1 | 1 |
Package: crlmm |
2 | 2 |
Type: Package |
3 | 3 |
Title: Genotype Calling (CRLMM) and Copy Number Analysis tool for Affymetrix SNP 5.0 and 6.0 and Illumina arrays. |
4 |
-Version: 1.9.22 |
|
4 |
+Version: 1.9.23 |
|
5 | 5 |
Date: 2010-12-10 |
6 | 6 |
Author: Benilton S Carvalho <Benilton.Carvalho@cancer.org.uk>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.edu.au>, Ingo Ruczinski <iruczins@jhsph.edu>, Rafael A Irizarry |
7 | 7 |
Maintainer: Benilton S Carvalho <Benilton.Carvalho@cancer.org.uk>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.EDU.AU> |
... | ... |
@@ -99,22 +99,7 @@ construct <- function(filenames, |
99 | 99 |
return(cnSet) |
100 | 100 |
} |
101 | 101 |
|
102 |
-genotype <- function(filenames, |
|
103 |
- cdfName, |
|
104 |
- batch, |
|
105 |
- mixtureSampleSize=10^5, |
|
106 |
- eps=0.1, |
|
107 |
- verbose=TRUE, |
|
108 |
- seed=1, |
|
109 |
- sns, |
|
110 |
- probs=rep(1/3, 3), |
|
111 |
- DF=6, |
|
112 |
- SNRMin=5, |
|
113 |
- recallMin=10, |
|
114 |
- recallRegMin=1000, |
|
115 |
- gender=NULL, |
|
116 |
- returnParams=TRUE, |
|
117 |
- badSNP=0.7){ |
|
102 |
+constructAffy <- function(filenames, sns, cdfName, batch, verbose=TRUE){ |
|
118 | 103 |
is.lds <- ifelse(isPackageLoaded("ff"), TRUE, FALSE) |
119 | 104 |
if(!is.lds) stop("this function now requires that the ff package be loaded") |
120 | 105 |
if(missing(cdfName)) stop("must specify cdfName") |
... | ... |
@@ -139,8 +124,8 @@ genotype <- function(filenames, |
139 | 124 |
gns <- getVarInEnv("gns") |
140 | 125 |
rm(list=obj, envir=.crlmmPkgEnv) |
141 | 126 |
rm(obj) |
142 |
- if(verbose) message("Initializing objects.") |
|
143 |
- mixtureParams <- initializeBigMatrix("crlmmMixt-", 4, length(filenames), "double") |
|
127 |
+ if(verbose) message("Initializing ff objects.") |
|
128 |
+ ##mixtureParams <- initializeBigMatrix("crlmmMixt-", 4, length(filenames), "double") |
|
144 | 129 |
SNR <- initializeBigVector("crlmmSNR-", length(filenames), "double") |
145 | 130 |
SKW <- initializeBigVector("crlmmSKW-", length(filenames), "double") |
146 | 131 |
genderff <- initializeBigVector("gender", length(filenames), "integer") |
... | ... |
@@ -148,30 +133,16 @@ genotype <- function(filenames, |
148 | 133 |
nr <- nrow(featureData); nc <- length(sns) |
149 | 134 |
A <- initializeBigMatrix("crlmmA-", nr, length(filenames), "integer") |
150 | 135 |
B <- initializeBigMatrix("crlmmB-", nr, length(filenames), "integer") |
136 |
+ call <- initializeBigMatrix("call-", nr, length(filenames), "integer") |
|
137 |
+ callPr <- initializeBigMatrix("callPr-", nr, length(filenames), "integer") |
|
151 | 138 |
rownames(A) <- rownames(B) <- featureNames(featureData) |
152 |
- sampleBatches <- splitIndicesByNode(seq(along=filenames)) |
|
153 |
- if(verbose) message("Processing ", length(filenames), " files.") |
|
154 |
- ocLapply(sampleBatches, rsprocessCEL, filenames=filenames, |
|
155 |
- fitMixture=TRUE, A=A, B=B, SKW=SKW, SNR=SNR, |
|
156 |
- mixtureParams=mixtureParams, eps=eps, seed=seed, |
|
157 |
- mixtureSampleSize=mixtureSampleSize, pkgname=pkgname, |
|
158 |
- neededPkgs=c("crlmm", pkgname)) |
|
159 |
- ## Now we initialize a CNSet object, cloning A and B |
|
160 |
- if(verbose) message("Cloning A and B matrices to store genotype calls and confidence scores.") |
|
161 |
- ## The clones will be used to store calls and confidence scores |
|
162 |
- open(A) |
|
163 |
- open(B) |
|
164 |
- ##user do |
|
165 |
- ## options("ffbatchbytes"=XXX) to make this efficient |
|
166 |
- call <- clone(A) |
|
167 |
- callProbability=clone(B) |
|
168 |
- close(A) |
|
169 |
- close(B) |
|
139 |
+ rownames(call) <- rownames(callPr) <- featureNames(featureData) |
|
140 |
+ if(verbose) message("Instantiating CNSet container") |
|
170 | 141 |
cnSet <- new("CNSet", |
171 | 142 |
alleleA=A, |
172 | 143 |
alleleB=B, |
173 | 144 |
call=call, |
174 |
- callProbability=callProbability, |
|
145 |
+ callProbability=callPr, |
|
175 | 146 |
featureData=featureData, |
176 | 147 |
annotation=cdfName, |
177 | 148 |
batch=batch) |
... | ... |
@@ -190,26 +161,152 @@ genotype <- function(filenames, |
190 | 161 |
cnSet$SNR <- SNR |
191 | 162 |
cnSet$gender <- genderff |
192 | 163 |
stopifnot(validObject(cnSet)) |
193 |
- snp.I <- isSnp(cnSet) |
|
194 |
- snp.index <- which(snp.I) |
|
195 |
- ##pData(cnSet)$SKW <- SKW |
|
196 |
- ##pData(cnSet)$SNR <- SNR |
|
197 |
- np.index <- which(!snp.I) |
|
198 |
- if(verbose) message("Normalizing nonpolymorphic markers") |
|
199 |
- FUN <- ifelse(is.lds, "cnrma2", "cnrma") |
|
200 |
- if(verbose) message("Quantile normalizing nonpolymorphic markers") |
|
201 |
- cnrma2(A=A(cnSet), |
|
202 |
- filenames=filenames, |
|
203 |
- row.names=featureNames(cnSet)[np.index], |
|
204 |
- cdfName=cdfName, |
|
205 |
- sns=sampleNames(cnSet), |
|
206 |
- seed=seed, |
|
207 |
- verbose=verbose) |
|
164 |
+ return(cnSet) |
|
165 |
+} |
|
166 |
+ |
|
167 |
+snprmaAffy <- function(cnSet, filenames, |
|
168 |
+ mixtureSampleSize=10^5, |
|
169 |
+ eps=0.1, |
|
170 |
+ seed=1, |
|
171 |
+ verbose=TRUE){ |
|
172 |
+ if(verbose) message("Preprocessing ", length(filenames), " files.") |
|
173 |
+ pkgname <- getCrlmmAnnotationName(annotation(cnSet)) |
|
174 |
+ stopifnot(require(pkgname, character.only=TRUE, quietly=!verbose)) |
|
175 |
+ mixtureParams <- initializeBigMatrix("crlmmMixt-", 4, length(filenames), "double") |
|
176 |
+ sampleBatches <- splitIndicesByNode(seq(along=filenames)) |
|
177 |
+ ocLapply(sampleBatches, rsprocessCEL, filenames=filenames, |
|
178 |
+ fitMixture=TRUE, A=A(cnSet), B=B(cnSet), C=calls(cnSet), |
|
179 |
+ D=snpCallProbability(cnSet), |
|
180 |
+ SKW=cnSet$SKW, SNR=cnSet$SNR, |
|
181 |
+ mixtureParams=mixtureParams, eps=eps, seed=seed, |
|
182 |
+ mixtureSampleSize=mixtureSampleSize, pkgname=pkgname, |
|
183 |
+ neededPkgs=c("crlmm", pkgname)) |
|
184 |
+ if(verbose) message("Cloning A and B matrices to store genotype calls and confidence scores.") |
|
185 |
+ return(mixtureParams) |
|
186 |
+} |
|
187 |
+genotype <- function(filenames, |
|
188 |
+ cdfName, |
|
189 |
+ batch, |
|
190 |
+ mixtureSampleSize=10^5, |
|
191 |
+ eps=0.1, |
|
192 |
+ verbose=TRUE, |
|
193 |
+ seed=1, |
|
194 |
+ sns, |
|
195 |
+ probs=rep(1/3, 3), |
|
196 |
+ DF=6, |
|
197 |
+ SNRMin=5, |
|
198 |
+ recallMin=10, |
|
199 |
+ recallRegMin=1000, |
|
200 |
+ gender=NULL, |
|
201 |
+ returnParams=TRUE, |
|
202 |
+ badSNP=0.7){ |
|
203 |
+## is.lds <- ifelse(isPackageLoaded("ff"), TRUE, FALSE) |
|
204 |
+## if(!is.lds) stop("this function now requires that the ff package be loaded") |
|
205 |
+## if(missing(cdfName)) stop("must specify cdfName") |
|
206 |
+## if(!isValidCdfName(cdfName)) stop("cdfName not valid. see validCdfNames") |
|
207 |
+## if(missing(sns)) sns <- basename(filenames) |
|
208 |
+## stopifnot(!missing(batch)) |
|
209 |
+## ##--------------------------------------------------------------------------- |
|
210 |
+## ## |
|
211 |
+## ## from snprma2. Goal is to initialize A and B with |
|
212 |
+## ## appropriate dimension for snps+nps |
|
213 |
+## ## |
|
214 |
+## ##--------------------------------------------------------------------------- |
|
215 |
+## if (missing(sns)) sns <- basename(filenames) |
|
216 |
+## if (missing(cdfName)) |
|
217 |
+## cdfName <- read.celfile.header(filenames[1])[["cdfName"]] |
|
218 |
+## pkgname <- getCrlmmAnnotationName(cdfName) |
|
219 |
+## stopifnot(require(pkgname, character.only=TRUE, quietly=!verbose)) |
|
220 |
+## if(verbose) message("Loading annotations and mixture model parameters.") |
|
221 |
+## obj <- loader("preprocStuff.rda", .crlmmPkgEnv, pkgname) |
|
222 |
+## pnsa <- getVarInEnv("pnsa") |
|
223 |
+## pnsb <- getVarInEnv("pnsb") |
|
224 |
+## gns <- getVarInEnv("gns") |
|
225 |
+## rm(list=obj, envir=.crlmmPkgEnv) |
|
226 |
+## rm(obj) |
|
227 |
+## if(verbose) message("Initializing objects.") |
|
228 |
+## mixtureParams <- initializeBigMatrix("crlmmMixt-", 4, length(filenames), "double") |
|
229 |
+## SNR <- initializeBigVector("crlmmSNR-", length(filenames), "double") |
|
230 |
+## SKW <- initializeBigVector("crlmmSKW-", length(filenames), "double") |
|
231 |
+## genderff <- initializeBigVector("gender", length(filenames), "integer") |
|
232 |
+## featureData <- getFeatureData(cdfName, copynumber=TRUE) |
|
233 |
+## nr <- nrow(featureData); nc <- length(sns) |
|
234 |
+## A <- initializeBigMatrix("crlmmA-", nr, length(filenames), "integer") |
|
235 |
+## B <- initializeBigMatrix("crlmmB-", nr, length(filenames), "integer") |
|
236 |
+## rownames(A) <- rownames(B) <- featureNames(featureData) |
|
237 |
+## sampleBatches <- splitIndicesByNode(seq(along=filenames)) |
|
238 |
+## if(verbose) message("Processing ", length(filenames), " files.") |
|
239 |
+## ocLapply(sampleBatches, rsprocessCEL, filenames=filenames, |
|
240 |
+## fitMixture=TRUE, A=A, B=B, SKW=SKW, SNR=SNR, |
|
241 |
+## mixtureParams=mixtureParams, eps=eps, seed=seed, |
|
242 |
+## mixtureSampleSize=mixtureSampleSize, pkgname=pkgname, |
|
243 |
+## neededPkgs=c("crlmm", pkgname)) |
|
244 |
+## ## Now we initialize a CNSet object, cloning A and B |
|
245 |
+## if(verbose) message("Cloning A and B matrices to store genotype calls and confidence scores.") |
|
246 |
+## ## The clones will be used to store calls and confidence scores |
|
247 |
+## open(A) |
|
248 |
+## open(B) |
|
249 |
+## ##user do |
|
250 |
+## ## options("ffbatchbytes"=XXX) to make this efficient |
|
251 |
+## call <- clone(A) |
|
252 |
+## callProbability=clone(B) |
|
253 |
+## close(A) |
|
254 |
+## close(B) |
|
255 |
+## cnSet <- new("CNSet", |
|
256 |
+## alleleA=A, |
|
257 |
+## alleleB=B, |
|
258 |
+## call=call, |
|
259 |
+## callProbability=callProbability, |
|
260 |
+## featureData=featureData, |
|
261 |
+## annotation=cdfName, |
|
262 |
+## batch=batch) |
|
263 |
+## if(!missing(sns)){ |
|
264 |
+## sampleNames(cnSet) <- sns |
|
265 |
+## } else { |
|
266 |
+## sampleNames(cnSet) <- basename(filenames) |
|
267 |
+## } |
|
268 |
+## protocolData <- getProtocolData.Affy(filenames) |
|
269 |
+## rownames(pData(protocolData)) <- sns |
|
270 |
+## protocolData(cnSet) <- protocolData |
|
271 |
+## ##pd <- data.frame(matrix(NA, nc, 3), row.names=sns) |
|
272 |
+## ##colnames(pd)=c("SKW", "SNR", "gender") |
|
273 |
+## ##phenoData(cnSet) <- new("AnnotatedDataFrame", data=pd) |
|
274 |
+## cnSet$SKW <- SKW |
|
275 |
+## cnSet$SNR <- SNR |
|
276 |
+## cnSet$gender <- genderff |
|
277 |
+## save(cnSet, file=file.path(ldPath(), "cnSet.rda")) |
|
278 |
+## stopifnot(validObject(cnSet)) |
|
279 |
+ cnSet <- constructAffy(filenames=filenames, |
|
280 |
+ cdfName=cdfName, |
|
281 |
+ batch=batch, verbose=verbose) |
|
282 |
+ mixtureParams <- snprmaAffy(cnSet, filenames=filenames, |
|
283 |
+ mixtureSampleSize=mixtureSampleSize, |
|
284 |
+ eps=eps, |
|
285 |
+ seed=seed, |
|
286 |
+ verbose=verbose) |
|
287 |
+ ok <- cnrmaAffy(cnSet=cnSet, filenames=filenames, seed=seed, |
|
288 |
+ verbose=verbose) |
|
289 |
+ stopifnot(ok) |
|
290 |
+ ok <- genotypeAffy(cnSet=cnSet, mixtureParams=mixtureParams, |
|
291 |
+ SNRMin=SNRMin, recallMin=recallMin, |
|
292 |
+ recallRegMin=recallRegMin, |
|
293 |
+ gender=gender, |
|
294 |
+ badSNP=badSNP, |
|
295 |
+ returnParams=returnParams, |
|
296 |
+ verbose=verbose) |
|
297 |
+ return(cnSet) |
|
298 |
+} |
|
299 |
+ |
|
300 |
+genotypeAffy <- function(cnSet, mixtureParams, SNRMin=5, recallMin=10, |
|
301 |
+ recallRegMin=1000, |
|
302 |
+ gender=NULL, badSNP=0.7, returnParams=TRUE, |
|
303 |
+ verbose=TRUE){ |
|
304 |
+ snp.index <- which(isSnp(cnSet)) |
|
208 | 305 |
tmp <- crlmmGT2(A=calls(cnSet), |
209 | 306 |
B=snpCallProbability(cnSet), |
210 |
- SNR=SNR, |
|
307 |
+ SNR=cnSet$SNR, |
|
211 | 308 |
mixtureParams=mixtureParams, |
212 |
- cdfName=cdfName, |
|
309 |
+ cdfName=annotation(cnSet), |
|
213 | 310 |
row.names=NULL, |
214 | 311 |
col.names=sampleNames(cnSet), |
215 | 312 |
SNRMin=SNRMin, |
... | ... |
@@ -224,10 +321,25 @@ genotype <- function(filenames, |
224 | 321 |
open(cnSet$gender) |
225 | 322 |
cnSet$gender[,] <- tmp[["gender"]] |
226 | 323 |
close(cnSet$gender) |
227 |
- return(cnSet) |
|
324 |
+ return(TRUE) |
|
325 |
+} |
|
326 |
+ |
|
327 |
+cnrmaAffy <- function(cnSet, filenames, seed=1, verbose=TRUE){ |
|
328 |
+ snp.I <- isSnp(cnSet) |
|
329 |
+ snp.index <- which(snp.I) |
|
330 |
+ np.index <- which(!snp.I) |
|
331 |
+ if(verbose) message("Quantile normalizing nonpolymorphic markers") |
|
332 |
+ cnrma2(A=A(cnSet), |
|
333 |
+ filenames=filenames, |
|
334 |
+ row.names=featureNames(cnSet)[np.index], |
|
335 |
+ cdfName=cdfName, |
|
336 |
+ sns=sampleNames(cnSet), |
|
337 |
+ seed=seed, |
|
338 |
+ verbose=verbose) |
|
339 |
+ TRUE |
|
228 | 340 |
} |
229 | 341 |
|
230 |
-rsprocessCEL <- function(i, filenames, fitMixture, A, B, SKW, SNR, |
|
342 |
+rsprocessCEL <- function(i, filenames, fitMixture, A, B, C, D, SKW, SNR, |
|
231 | 343 |
mixtureParams, eps, seed, mixtureSampleSize, |
232 | 344 |
pkgname){ |
233 | 345 |
obj1 <- loader("preprocStuff.rda", .crlmmPkgEnv, pkgname) |
... | ... |
@@ -254,6 +366,8 @@ rsprocessCEL <- function(i, filenames, fitMixture, A, B, SKW, SNR, |
254 | 366 |
|
255 | 367 |
open(A) |
256 | 368 |
open(B) |
369 |
+ open(C) |
|
370 |
+ open(D) |
|
257 | 371 |
open(SKW) |
258 | 372 |
open(mixtureParams) |
259 | 373 |
open(SNR) |
... | ... |
@@ -270,6 +384,8 @@ rsprocessCEL <- function(i, filenames, fitMixture, A, B, SKW, SNR, |
270 | 384 |
## RS: add index for row assignment |
271 | 385 |
A[index, k] <- intMedianSummaries(y[aIndex, 1, drop=FALSE], pnsa) |
272 | 386 |
B[index, k] <- intMedianSummaries(y[bIndex, 1, drop=FALSE], pnsb) |
387 |
+ C[index, k] <- intMedianSummaries(y[aIndex, 1, drop=FALSE], pnsa) |
|
388 |
+ D[index, k] <- intMedianSummaries(y[bIndex, 1, drop=FALSE], pnsb) |
|
273 | 389 |
rm(y) |
274 | 390 |
if(fitMixture){ |
275 | 391 |
S <- (log2(A[idx,k])+log2(B[idx, k]))/2 - SMEDIAN |
... | ... |
@@ -25,7 +25,7 @@ crlmmGT2 <- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL, |
25 | 25 |
} |
26 | 26 |
snpBatches <- splitIndicesByLength(index, ocProbesets()) |
27 | 27 |
NR <- length(unlist(snpBatches)) |
28 |
- if(verbose) cat("Calling", NR, "SNPs for recalibration... ") |
|
28 |
+ if(verbose) message("Calling ", NR, " SNPs for recalibration... ") |
|
29 | 29 |
NC <- ncol(A) |
30 | 30 |
## |
31 | 31 |
if(verbose) message("Loading annotations.") |
... | ... |
@@ -61,7 +61,6 @@ crlmmGT2 <- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL, |
61 | 61 |
cIndexes <- list(keepIndex, |
62 | 62 |
keepIndex[which(gender[keepIndex]==2)], |
63 | 63 |
keepIndex[which(gender[keepIndex]==1)]) |
64 |
- if(verbose) cat("Calling", NR, "SNPs for recalibration... ") |
|
65 | 64 |
## call C |
66 | 65 |
fIndex <- which(gender==2) |
67 | 66 |
mIndex <- which(gender==1) |
... | ... |
@@ -101,6 +100,7 @@ crlmmGT2 <- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL, |
101 | 100 |
tmp |
102 | 101 |
} |
103 | 102 |
## |
103 |
+ if(verbose) message("Calling process1") |
|
104 | 104 |
newparamsBatch <- ocLapply(seq(along=snpBatches), process1, |
105 | 105 |
snpBatches=snpBatches, |
106 | 106 |
autosomeIndex=autosomeIndex, XIndex=XIndex, |
... | ... |
@@ -1,10 +1,34 @@ |
1 | 1 |
\name{NEWS} |
2 | 2 |
\title{News for Package 'crlmm'} |
3 | 3 |
|
4 |
-\section{Changes in version 1.10}{ |
|
4 |
+ |
|
5 |
+\section{Changes in version 1.9}{ |
|
5 | 6 |
\subsection{USER VISIBLE CHANGES}{ |
6 | 7 |
\itemize{ |
7 | 8 |
\item Using NEWS.Rd |
9 |
+ |
|
10 |
+ \item batch slot in CNSet objects is class 'character' |
|
11 |
+ (previously, class was 'factor') |
|
12 |
+ |
|
13 |
+ \item the ff package is required for preprocessing and genotyping |
|
14 |
+ prior to copy number analyses with the crlmmCopynumber function. |
|
15 |
+ |
|
16 |
+ \item We have added several vignettes pertaining to copy number |
|
17 |
+ analyses in the crlmm package: CopyNumberOverview, |
|
18 |
+ AffymetrixPreprocessCN, IlluminaPreprocessCN, and Infrastructure. |
|
19 |
+ The 'copynumber' and 'Infrastructure' vignettes are applicable to |
|
20 |
+ both the Illumina and Affymetrix platforms. The CopyNumberOverview |
|
21 |
+ vignette gives a brief summary of the available vignettes for copy |
|
22 |
+ number analyses. |
|
23 |
+ |
|
24 |
+ \item Exporting function constructInf, preprocessInf, and |
|
25 |
+ genotypeInf for preprocessing and genotyping Illumina files prior |
|
26 |
+ to copy number analyses. The genotype.Illumina function is now a |
|
27 |
+ wrapper for these functions. |
|
28 |
+ |
|
29 |
+ \item additional documentation for crlmm is provided in a |
|
30 |
+ compendium: http://www.biostat.jhsph.edu/~rscharpf/crlmmCompendium/index.html |
|
31 |
+ |
|
8 | 32 |
} |
9 | 33 |
} |
10 | 34 |
} |
... | ... |
@@ -37,7 +61,7 @@ |
37 | 61 |
elements of this object, including featureNames, |
38 | 62 |
sampleNames, and '['. |
39 | 63 |
} |
40 |
- |
|
64 |
+ |
|
41 | 65 |
\item 'CopyNumberSet': contains locus-level estimates of copy |
42 | 66 |
number for SNPs and polymorphic probes. |
43 | 67 |
\enumerate{ |
... | ... |
@@ -47,7 +71,7 @@ |
47 | 71 |
|
48 | 72 |
\item For nonpolymorphic probes, the total copy number is |
49 | 73 |
stored in the 'CA' slot and a NA is recorded for the |
50 |
- corresponding row in the CB matrix. |
|
74 |
+ corresponding row in the CB matrix. |
|
51 | 75 |
|
52 | 76 |
\item Useful methods: 'copyNumber', 'ellipse', 'points' |
53 | 77 |
} |
... | ... |
@@ -60,4 +84,5 @@ |
60 | 84 |
'CrlmmSetList' and returns an object of class 'CopyNumberSet'. |
61 | 85 |
} |
62 | 86 |
} |
87 |
+ |
|
63 | 88 |
} |