git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@45348 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -477,3 +477,10 @@ then readIDAT() should work. Thanks to Pierre Cherel who reported this error. |
477 | 477 |
|
478 | 478 |
** a few updates to initializeBigMatrix |
479 | 479 |
** show, [ defined for CNSetLM |
480 |
+ |
|
481 |
+2010-03-18 R.Scharpf committed version 1.5.38 |
|
482 |
+ |
|
483 |
+** import snpCall, snpCallProbability, snpCall<-, snpCallProbability<- |
|
484 |
+ from Biobase |
|
485 |
+** updates to genotype and crlmmIlluminaRS |
|
486 |
+** class union of ff_matrix, matrix, and ffdf |
... | ... |
@@ -1,7 +1,7 @@ |
1 | 1 |
Package: crlmm |
2 | 2 |
Type: Package |
3 | 3 |
Title: Genotype Calling (CRLMM) and Copy Number Analysis tool for Affymetrix SNP 5.0 and 6.0 and Illumina arrays. |
4 |
-Version: 1.5.38 |
|
4 |
+Version: 1.5.39 |
|
5 | 5 |
Date: 2010-02-05 |
6 | 6 |
Author: Rafael A Irizarry, Benilton S Carvalho <bcarvalh@jhsph.edu>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.edu.au> |
7 | 7 |
Maintainer: Benilton S Carvalho <bcarvalh@jhsph.edu>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.EDU.AU> |
... | ... |
@@ -14,7 +14,9 @@ importMethodsFrom(Biobase, annotation, "annotation<-", |
14 | 14 |
fData, featureData, "featureData<-", featureNames, |
15 | 15 |
fvarMetadata, fvarLabels, pData, phenoData, |
16 | 16 |
"phenoData<-", protocolData, "protocolData<-", |
17 |
- pubMedIds, rowMedians, sampleNames, storageMode, |
|
17 |
+ pubMedIds, rowMedians, sampleNames, snpCall, |
|
18 |
+ snpCallProbability, |
|
19 |
+ "snpCall<-", "snpCallProbability<-", storageMode, |
|
18 | 20 |
"storageMode<-", updateObject, varLabels) |
19 | 21 |
|
20 | 22 |
importFrom(Biobase, assayDataElement, assayDataElementNames, |
... | ... |
@@ -2,6 +2,7 @@ setOldClass("ffdf") |
2 | 2 |
setOldClass("ff_matrix") |
3 | 3 |
##setClassUnion("matrix_or_ff", c("matrix", "ff_matrix")) |
4 | 4 |
setClassUnion("list_or_ffdf", c("list", "ffdf")) |
5 |
+setClassUnion("ff_or_matrix", c("ff_matrix", "matrix", "ffdf")) |
|
5 | 6 |
setClass("CNSetLM", contains="CNSet", representation(lM="list_or_ffdf")) |
6 | 7 |
setMethod("initialize", "CNSetLM", function(.Object, CA=new("matrix"), CB=new("matrix"), lM=new("list"), ...){ |
7 | 8 |
.Object <- callNextMethod(.Object, CA=CA, CB=CB, lM=lM, ...) |
... | ... |
@@ -92,10 +92,14 @@ setMethod("open", "AlleleSet", function(con, ...){ |
92 | 92 |
##setReplaceMethod("calls", "SnpSuperSet", function(object, value) assayDataElementReplace(object, "call", value)) |
93 | 93 |
##setReplaceMethod("confs", "SnpSuperSet", function(object, value) assayDataElementReplace(object, "callProbability", value)) |
94 | 94 |
##setMethod("confs", "SnpSuperSet", function(object) assayDataElement(object, "callProbability")) |
95 |
-genotype <- function(filenames, cdfName, mixtureSampleSize=10^5, |
|
96 |
- fitMixture=TRUE, |
|
97 |
- eps=0.1, verbose=TRUE, |
|
98 |
- seed=1, sns, copynumber=FALSE, |
|
95 |
+genotype <- function(filenames, |
|
96 |
+ cdfName, |
|
97 |
+ mixtureSampleSize=10^5, |
|
98 |
+ eps=0.1, |
|
99 |
+ verbose=TRUE, |
|
100 |
+ seed=1, |
|
101 |
+ sns, |
|
102 |
+ copynumber=FALSE, |
|
99 | 103 |
probs=rep(1/3, 3), |
100 | 104 |
DF=6, |
101 | 105 |
SNRMin=5, |
... | ... |
@@ -117,10 +121,12 @@ genotype <- function(filenames, cdfName, mixtureSampleSize=10^5, |
117 | 121 |
mixtureParams <- matrix(NA, 4, length(filenames)) |
118 | 122 |
snp.index <- which(isSnp(callSet)==1) |
119 | 123 |
batches <- splitIndicesByLength(1:ncol(callSet), ocSamples()) |
124 |
+ iter <- 1 |
|
120 | 125 |
for(j in batches){ |
126 |
+ if(verbose) message("Batch ", iter, " of ", length(batches)) |
|
121 | 127 |
snprmaRes <- snprma(filenames=filenames[j], |
122 | 128 |
mixtureSampleSize=mixtureSampleSize, |
123 |
- fitMixture=fitMixture, |
|
129 |
+ fitMixture=TRUE, |
|
124 | 130 |
eps=eps, |
125 | 131 |
verbose=verbose, |
126 | 132 |
seed=seed, |
... | ... |
@@ -129,27 +135,25 @@ genotype <- function(filenames, cdfName, mixtureSampleSize=10^5, |
129 | 135 |
stopifnot(identical(featureNames(callSet)[snp.index], snprmaRes$gns)) |
130 | 136 |
pData(callSet)$SKW[j] <- snprmaRes$SKW |
131 | 137 |
pData(callSet)$SNR[j] <- snprmaRes$SNR |
132 |
- A(callSet)[snp.index, j] <- snprmaRes[["A"]] |
|
133 |
- B(callSet)[snp.index, j] <- snprmaRes[["B"]] |
|
138 |
+ suppressWarnings(A(callSet)[snp.index, j] <- snprmaRes[["A"]]) |
|
139 |
+ suppressWarnings(B(callSet)[snp.index, j] <- snprmaRes[["B"]]) |
|
134 | 140 |
mixtureParams[, j] <- snprmaRes$mixtureParams |
135 | 141 |
rm(snprmaRes); gc() |
136 |
- } |
|
137 |
- ## remove snprmaRes and garbage collect before quantile-normalizing NP probes |
|
138 |
- if(copynumber){ |
|
139 |
- np.index <- which(isSnp(callSet) == 0) |
|
140 |
- cnrmaRes <- cnrma(filenames=filenames[j], |
|
141 |
- cdfName=cdfName, |
|
142 |
- row.names=featureNames(callSet)[np.index], |
|
143 |
- sns=sns[j], |
|
144 |
- seed=seed, |
|
145 |
- verbose=verbose) |
|
146 |
- stopifnot(identical(featureNames(callSet)[np.index], rownames(cnrmaRes))) |
|
147 |
- A(callSet)[np.index, j] <- cnrmaRes |
|
148 |
- rm(cnrmaRes); gc() |
|
149 |
- } |
|
150 |
- for(j in batches){ |
|
151 |
- tmp <- crlmmGT(A=A(callSet)[snp.index, j], |
|
152 |
- B=B(callSet)[snp.index, j], |
|
142 |
+ if(copynumber){ |
|
143 |
+ np.index <- which(isSnp(callSet) == 0) |
|
144 |
+ cnrmaRes <- cnrma(filenames=filenames[j], |
|
145 |
+ cdfName=cdfName, |
|
146 |
+ row.names=featureNames(callSet)[np.index], |
|
147 |
+ sns=sns[j], |
|
148 |
+ seed=seed, |
|
149 |
+ verbose=verbose) |
|
150 |
+ stopifnot(identical(featureNames(callSet)[np.index], rownames(cnrmaRes))) |
|
151 |
+ A(callSet)[np.index, j] <- cnrmaRes |
|
152 |
+ rm(cnrmaRes); gc() |
|
153 |
+ } |
|
154 |
+ ## as.matrix needed when ffdf is used |
|
155 |
+ tmp <- crlmmGT(A=as.matrix(A(callSet)[snp.index, j]), |
|
156 |
+ B=as.matrix(B(callSet)[snp.index, j]), |
|
153 | 157 |
SNR=callSet$SNR[j], |
154 | 158 |
mixtureParams=mixtureParams[, j], |
155 | 159 |
cdfName=annotation(callSet), |
... | ... |
@@ -164,11 +168,11 @@ genotype <- function(filenames, cdfName, mixtureSampleSize=10^5, |
164 | 168 |
verbose=verbose, |
165 | 169 |
returnParams=returnParams, |
166 | 170 |
badSNP=badSNP) |
167 |
- snpCall(callSet)[snp.index, j] <- tmp[["calls"]] |
|
168 |
- snpCallProbability(callSet)[snp.index, j] <- tmp[["confs"]] |
|
171 |
+ suppressWarnings(snpCall(callSet)[snp.index, j] <- tmp[["calls"]]) |
|
172 |
+ suppressWarnings(snpCallProbability(callSet)[snp.index, j] <- tmp[["confs"]]) |
|
169 | 173 |
callSet$gender[j] <- tmp$gender |
170 |
- rm(tmp); gc() |
|
171 |
- } |
|
174 |
+ iter <- iter+1 |
|
175 |
+ } |
|
172 | 176 |
return(callSet) |
173 | 177 |
} |
174 | 178 |
|
... | ... |
@@ -267,17 +271,11 @@ crlmmIlluminaRS <- function(sampleSheet=NULL, |
267 | 271 |
if(missing(cdfName)) stop("must specify cdfName") |
268 | 272 |
if(!isValidCdfName(cdfName)) stop("cdfName not valid. see validCdfNames") |
269 | 273 |
if(missing(sns)) sns <- basename(arrayNames) |
270 |
- RG <- constructRG(filenames=arrayNames, |
|
271 |
- cdfName=cdfName, |
|
272 |
- sns=sns, |
|
273 |
- verbose=verbose, |
|
274 |
- fileExt=fileExt, |
|
275 |
- sep=sep, |
|
276 |
- sampleSheet=sampleSheet, |
|
277 |
- arrayInfoColNames=arrayInfoColNames) |
|
278 | 274 |
batches <- splitIndicesByLength(seq(along=arrayNames), ocSamples()) |
275 |
+ k <- 1 |
|
279 | 276 |
for(j in batches){ |
280 |
- tmp <- readIdatFiles(sampleSheet=sampleSheet[j, ], |
|
277 |
+ if(verbose) message("Batch ", k, " of ", length(batches)) |
|
278 |
+ RG <- readIdatFiles(sampleSheet=sampleSheet[j, ], |
|
281 | 279 |
arrayNames=arrayNames[j], |
282 | 280 |
ids=ids, |
283 | 281 |
path=path, |
... | ... |
@@ -286,55 +284,9 @@ crlmmIlluminaRS <- function(sampleSheet=NULL, |
286 | 284 |
sep=sep, |
287 | 285 |
fileExt=fileExt, |
288 | 286 |
saveDate=TRUE) |
289 |
- if(nrow(tmp) != nrow(RG)){ |
|
290 |
- ##RS: I don't understand why the IDATS for the |
|
291 |
- ##same platform do not have necessarily have |
|
292 |
- ##the same length |
|
293 |
- tmp <- tmp[featureNames(tmp) %in% featureNames(RG), ] |
|
294 |
- } |
|
295 |
- index <- match(featureNames(tmp), featureNames(RG)) |
|
296 |
- assayData(RG)$R[index,j] <- assayData(tmp)$R |
|
297 |
- assayData(RG)$G[index, j] <- assayData(tmp)$G |
|
298 |
- assayData(RG)$zero[index, j] <- assayData(tmp)$zero |
|
299 |
- pData(RG)[j, ] <- pData(tmp) |
|
300 |
- annotation(RG) <- annotation(tmp) |
|
301 |
- pData(protocolData(RG))[j, ] <- pData(protocolData(tmp)) |
|
302 |
- rm(tmp); gc() |
|
303 |
- } |
|
304 |
- message("Saving RG") |
|
305 |
- save(RG, file=file.path(ldPath(), "RG.rda")) |
|
306 |
- k <- 1 |
|
307 |
- for(j in batches){ |
|
308 |
- ##MR: Might want to to nonpolymorphic markers in a |
|
309 |
- ##separate step and make that optional |
|
310 |
- tmp <- RGtoXY(RG[, j], chipType=cdfName) |
|
311 |
- if(k == 1){ |
|
312 |
- ##initialize XYSet |
|
313 |
- nc <- ncol(tmp) |
|
314 |
- nr <- nrow(tmp) |
|
315 |
- XY <- new("NChannelSet", |
|
316 |
- X=initializeBigMatrix(name="X", nr, nc), |
|
317 |
- Y=initializeBigMatrix(name="Y", nr, nc), |
|
318 |
- zero=initializeBigMatrix(name="zero", nr, nc), |
|
319 |
- experimentData=experimentData(RG), |
|
320 |
- phenoData=phenoData(RG), |
|
321 |
- protocolData=protocolData(RG), |
|
322 |
- annotation=annotation(RG)) |
|
323 |
- } |
|
324 |
- storageMode(XY) <- "environment" |
|
325 |
- assayData(XY)$X[, j] <- assayData(tmp)$X |
|
326 |
- assayData(XY)$Y[, j] <- assayData(tmp)$Y |
|
327 |
- assayData(XY)$zero[, j] <- assayData(tmp)$zero |
|
328 |
- k <- k+1 |
|
329 |
- rm(tmp); gc() |
|
330 |
- } |
|
331 |
- annotation(XY) <- cdfName |
|
332 |
- message("Saving XY") |
|
333 |
- save(XY, file=file.path(ldPath(), "XY.rda")) |
|
334 |
- mixtureParams <- matrix(NA, 4, length(filenames)) |
|
335 |
- k <- 1 |
|
336 |
- for(j in batches){ |
|
337 |
- res <- preprocessInfinium2(XY[, j], |
|
287 |
+ XY <- RGtoXY(RG, chipType=cdfName) |
|
288 |
+ rm(RG); gc() |
|
289 |
+ res <- preprocessInfinium2(XY, |
|
338 | 290 |
mixtureSampleSize=mixtureSampleSize, |
339 | 291 |
fitMixture=TRUE, |
340 | 292 |
verbose=verbose, |
... | ... |
@@ -344,40 +296,55 @@ crlmmIlluminaRS <- function(sampleSheet=NULL, |
344 | 296 |
sns=sns[j], |
345 | 297 |
stripNorm=stripNorm, |
346 | 298 |
useTarget=useTarget) |
347 |
- if(k==1){ |
|
348 |
- ## MR: number of rows should be number of SNPs + number of nonpolymorphic markers. |
|
349 |
- ## Here, I'm just using the # of rows returned from the above function |
|
299 |
+ ## MR: number of rows should be number of SNPs + number of nonpolymorphic markers. |
|
300 |
+ ## Here, I'm just using the # of rows returned from the above function |
|
301 |
+ if(k == 1){ |
|
302 |
+ if(verbose) message("Initializing container for alleleA, alleleB, call, callProbability") |
|
350 | 303 |
callSet <- new("SnpSuperSet", |
351 |
- alleleA=initializeBigMatrix(name="A", nr=nrow(res[[1]]), nc=ncol(XY)), |
|
352 |
- alleleB=initializeBigMatrix(name="B", nr=nrow(res[[2]]), nc=ncol(XY)), |
|
353 |
- call=initializeBigMatrix(name="call", nr=nrow(res[[1]]), nc=ncol(XY)), |
|
354 |
- callProbability=initializeBigMatrix(name="callPr", nr=nrow(res[[1]]), nc=ncol(XY)), |
|
355 |
- protocolData=protocolData(XY), |
|
304 |
+ alleleA=initializeBigMatrix(name="A", nr=nrow(res[[1]]), nc=length(sns)), |
|
305 |
+ alleleB=initializeBigMatrix(name="B", nr=nrow(res[[1]]), nc=length(sns)), |
|
306 |
+ call=initializeBigMatrix(name="call", nr=nrow(res[[1]]), nc=length(sns)), |
|
307 |
+ callProbability=initializeBigMatrix(name="callPr", nr=nrow(res[[1]]), nc=length(sns)), |
|
356 | 308 |
experimentData=experimentData(XY), |
357 |
- phenoData=phenoData(XY), |
|
358 | 309 |
annotation=annotation(XY)) |
359 |
- featureNames(callSet) <- res[["gns"]] |
|
360 | 310 |
sampleNames(callSet) <- sns |
311 |
+ phenoData(callSet) <- getPhenoData(sampleSheet=sampleSheet, |
|
312 |
+ arrayNames=sns, |
|
313 |
+ arrayInfoColNames=arrayInfoColNames) |
|
314 |
+ pD <- data.frame(matrix(NA, length(sns), 1), |
|
315 |
+ row.names=sns) |
|
316 |
+ colnames(pD) <- "ScanDate" |
|
317 |
+ protocolData(callSet) <- new("AnnotatedDataFrame", data=pD) |
|
318 |
+ pData(protocolData(callSet))[j, ] <- pData(protocolData(XY)) |
|
319 |
+ featureNames(callSet) <- res[["gns"]] |
|
320 |
+ pData(callSet)$SKW <- rep(NA, length(sns)) |
|
321 |
+ pData(callSet)$SNR <- rep(NA, length(sns)) |
|
322 |
+ pData(callSet)$gender <- rep(NA, length(sns)) |
|
323 |
+ ##sampleNames(callSet) <- sns |
|
324 |
+ } |
|
325 |
+ if(k > 1 & nrow(res[[1]]) != nrow(callSet)){ |
|
326 |
+ ##RS: I don't understand why the IDATS for the |
|
327 |
+ ##same platform potentially have different lengths |
|
328 |
+ res[["A"]] <- res[["A"]][res$gns %in% featureNames(callSet), ] |
|
329 |
+ res[["B"]] <- res[["B"]][res$gns %in% featureNames(callSet), ] |
|
361 | 330 |
} |
362 |
- ## MR: we need to define a snp.index |
|
363 |
- A(callSet)[, j] <- res[["A"]] |
|
364 |
- B(callSet)[, j] <- res[["B"]] |
|
331 |
+ ## MR: we need to define a snp.index vs np.index |
|
332 |
+ snp.index <- match(res$gns, featureNames(callSet)) |
|
333 |
+ suppressWarnings(A(callSet)[snp.index, j] <- res[["A"]]) |
|
334 |
+ suppressWarnings(B(callSet)[snp.index, j] <- res[["B"]]) |
|
365 | 335 |
pData(callSet)$SKW[j] <- res$SKW |
366 | 336 |
pData(callSet)$SNR[j] <- res$SNR |
367 |
- mixtureParams[, j] <- res$mixtureParams |
|
337 |
+ ##mixtureParams[, j] <- res$mixtureParams |
|
338 |
+ mixtureParams <- res$mixtureParams |
|
368 | 339 |
rm(res); gc() |
369 |
- } |
|
370 |
- message("Saving callSe") |
|
371 |
- save(callSet, file=file.path(ldPath(), "callSet.rda")) |
|
372 |
- for(j in batches){ |
|
373 | 340 |
##MR: edit snp.index |
374 |
- snp.index <- 1:nrow(callSet) |
|
375 |
- tmp <- crlmmGT(A=A(callSet)[snp.index, j], |
|
376 |
- B=B(callSet)[snp.index, j], |
|
341 |
+ ##snp.index <- 1:nrow(callSet) |
|
342 |
+ tmp <- crlmmGT(A=as.matrix(A(callSet)[, j]), |
|
343 |
+ B=as.matrix(B(callSet)[, j]), |
|
377 | 344 |
SNR=callSet$SNR[j], |
378 |
- mixtureParams=mixtureParams[, j], |
|
345 |
+ mixtureParams=mixtureParams, |
|
379 | 346 |
cdfName=annotation(callSet), |
380 |
- row.names=featureNames(callSet)[snp.index], |
|
347 |
+ row.names=featureNames(callSet), |
|
381 | 348 |
col.names=sampleNames(callSet)[j], |
382 | 349 |
probs=probs, |
383 | 350 |
DF=DF, |
... | ... |
@@ -388,11 +355,12 @@ crlmmIlluminaRS <- function(sampleSheet=NULL, |
388 | 355 |
verbose=verbose, |
389 | 356 |
returnParams=returnParams, |
390 | 357 |
badSNP=badSNP) |
391 |
- snpCall(callSet)[snp.index, j] <- tmp[["calls"]] |
|
392 |
- ## many zeros here (?) |
|
393 |
- snpCallProbability(callSet)[snp.index, j] <- tmp[["confs"]] |
|
358 |
+ suppressWarnings(snpCall(callSet)[, j] <- tmp[["calls"]]) |
|
359 |
+ ## MR: many zeros in the conf. scores (?) |
|
360 |
+ suppressWarnings(snpCallProbability(callSet)[, j] <- tmp[["confs"]]) |
|
394 | 361 |
callSet$gender[j] <- tmp$gender |
395 | 362 |
rm(tmp); gc() |
363 |
+ k <- k+1 |
|
396 | 364 |
} |
397 | 365 |
return(callSet) |
398 | 366 |
} |
... | ... |
@@ -1,3 +1,15 @@ |
1 |
+setReplaceMethod("snpCall", c("SnpSuperSet", "ff_or_matrix"), |
|
2 |
+ function(object, ..., value) |
|
3 |
+{ |
|
4 |
+ assayDataElementReplace(object, "call", value) |
|
5 |
+}) |
|
6 |
+setReplaceMethod("snpCallProbability", c("SnpSuperSet", "ff_or_matrix"), |
|
7 |
+ function(object, ..., value) |
|
8 |
+{ |
|
9 |
+ assayDataElementReplace(object, "callProbability", value) |
|
10 |
+}) |
|
11 |
+ |
|
12 |
+ |
|
1 | 13 |
## Method("initialize", "AlleleSet", |
2 | 14 |
## function(.Object, |
3 | 15 |
## assayData = assayDataNew(alleleA=alleleA, |