git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@45672 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -489,3 +489,10 @@ then readIDAT() should work. Thanks to Pierre Cherel who reported this error. |
489 | 489 |
|
490 | 490 |
** added parallel/large dataset support to snprma/crlmm |
491 | 491 |
** merged changes on .41 with my local changes |
492 |
+ |
|
493 |
+2010-04-01 R.Scharpf committed version 1.5.44 |
|
494 |
+ |
|
495 |
+** added functions genotype2, cnrma2 for preprocessing and genotyping |
|
496 |
+ with crlmm2 |
|
497 |
+** added crlmmCopynumber2 for parallel support with copy number |
|
498 |
+ est. (needs more checking) |
... | ... |
@@ -1,7 +1,7 @@ |
1 | 1 |
Package: crlmm |
2 | 2 |
Type: Package |
3 | 3 |
Title: Genotype Calling (CRLMM) and Copy Number Analysis tool for Affymetrix SNP 5.0 and 6.0 and Illumina arrays. |
4 |
-Version: 1.5.43 |
|
4 |
+Version: 1.5.44 |
|
5 | 5 |
Date: 2010-02-05 |
6 | 6 |
Author: Rafael A Irizarry, Benilton S Carvalho <carvalho@bclab.org>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.edu.au> |
7 | 7 |
Maintainer: Benilton S Carvalho <carvalho@bclab.org>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.EDU.AU> |
... | ... |
@@ -12,7 +12,7 @@ importMethodsFrom(Biobase, annotation, "annotation<-", |
12 | 12 |
annotatedDataFrameFrom, assayData, "assayData<-", |
13 | 13 |
combine, dims, experimentData, "experimentData<-", |
14 | 14 |
fData, featureData, "featureData<-", featureNames, |
15 |
- fvarMetadata, fvarLabels, pData, phenoData, |
|
15 |
+ fvarMetadata, fvarLabels, pData, "pData<-", phenoData, |
|
16 | 16 |
"phenoData<-", protocolData, "protocolData<-", |
17 | 17 |
pubMedIds, rowMedians, sampleNames, snpCall, |
18 | 18 |
snpCallProbability, |
... | ... |
@@ -67,7 +67,12 @@ export(crlmm, |
67 | 67 |
readIdatFiles, |
68 | 68 |
snprma, |
69 | 69 |
snprma2, |
70 |
- crlmm2) |
|
70 |
+ crlmm2, |
|
71 |
+ genotype2, |
|
72 |
+ cnrma2, |
|
73 |
+ processCEL2, |
|
74 |
+ batch, |
|
75 |
+ crlmmCopynumber2) |
|
71 | 76 |
|
72 | 77 |
export(initializeParamObject) |
73 | 78 |
|
... | ... |
@@ -1,3 +1,4 @@ |
1 |
+setGeneric("batch", function(object) standardGeneric("batch")) |
|
1 | 2 |
setGeneric("getParam", function(object, name, batch) standardGeneric("getParam")) |
2 | 3 |
setGeneric("cnIndex", function(object) standardGeneric("cnIndex")) |
3 | 4 |
setGeneric("cnNames", function(object) standardGeneric("cnNames")) |
... | ... |
@@ -54,19 +54,20 @@ getFeatureData.Affy <- function(cdfName, copynumber=FALSE){ |
54 | 54 |
##crlmmOpts$snpRange <- range(snpIndex) |
55 | 55 |
##crlmmOpts$npRange <- range(npIndex) |
56 | 56 |
} |
57 |
-construct <- function(filenames, cdfName, copynumber=FALSE, sns, verbose=TRUE){ |
|
57 |
+construct <- function(filenames, cdfName, copynumber=FALSE, |
|
58 |
+ sns, verbose=TRUE){ |
|
58 | 59 |
if(verbose) message("Initializing container for assay data elements alleleA, alleleB, call, callProbability") |
59 | 60 |
if(missing(sns)) sns <- basename(filenames) |
60 | 61 |
protocolData <- getProtocolData.Affy(filenames) |
61 | 62 |
featureData <- getFeatureData.Affy(cdfName, copynumber=copynumber) |
62 | 63 |
nr <- nrow(featureData); nc <- length(filenames) |
63 | 64 |
callSet <- new("SnpSuperSet", |
64 |
- alleleA=initializeBigMatrix(name="A", nr, nc), |
|
65 |
- alleleB=initializeBigMatrix(name="B", nr, nc), |
|
66 |
- call=initializeBigMatrix(name="call", nr, nc), |
|
67 |
- callProbability=initializeBigMatrix(name="callPr", nr,nc), |
|
68 |
- featureData=featureData, |
|
69 |
- annotation=cdfName) |
|
65 |
+ alleleA=initializeBigMatrix(name="A", nr, nc), |
|
66 |
+ alleleB=initializeBigMatrix(name="B", nr, nc), |
|
67 |
+ call=initializeBigMatrix(name="call", nr, nc), |
|
68 |
+ callProbability=initializeBigMatrix(name="callPr", nr,nc), |
|
69 |
+ featureData=featureData, |
|
70 |
+ annotation=cdfName) |
|
70 | 71 |
pd <- data.frame(matrix(NA, nc, 3), row.names=sns) |
71 | 72 |
colnames(pd)=c("SKW", "SNR", "gender") |
72 | 73 |
phenoData(callSet) <- new("AnnotatedDataFrame", data=pd) |
... | ... |
@@ -167,14 +168,557 @@ genotype <- function(filenames, |
167 | 168 |
verbose=verbose, |
168 | 169 |
returnParams=returnParams, |
169 | 170 |
badSNP=badSNP) |
170 |
- suppressWarnings(snpCall(callSet)[snp.index, j] <- tmp[["calls"]]) |
|
171 |
- suppressWarnings(snpCallProbability(callSet)[snp.index, j] <- tmp[["confs"]]) |
|
171 |
+ snpCall(callSet)[snp.index, j] <- tmp[["calls"]] |
|
172 |
+ snpCallProbability(callSet)[snp.index, j] <- tmp[["confs"]] |
|
172 | 173 |
callSet$gender[j] <- tmp$gender |
173 | 174 |
iter <- iter+1 |
174 | 175 |
} |
175 | 176 |
return(callSet) |
176 | 177 |
} |
177 | 178 |
|
179 |
+genotype2 <- function(filenames, |
|
180 |
+ cdfName, |
|
181 |
+ batch, |
|
182 |
+ mixtureSampleSize=10^5, |
|
183 |
+ eps=0.1, |
|
184 |
+ verbose=TRUE, |
|
185 |
+ seed=1, |
|
186 |
+ sns, |
|
187 |
+ copynumber=FALSE, |
|
188 |
+ probs=rep(1/3, 3), |
|
189 |
+ DF=6, |
|
190 |
+ SNRMin=5, |
|
191 |
+ recallMin=10, |
|
192 |
+ recallRegMin=1000, |
|
193 |
+ gender=NULL, |
|
194 |
+ returnParams=TRUE, |
|
195 |
+ badSNP=0.7){ |
|
196 |
+ if(!copynumber){ |
|
197 |
+ callSet <- crlmm2(filenames=filenames, |
|
198 |
+ cdfName=cdfName, |
|
199 |
+ mixtureSampleSize=mixtureSampleSize, |
|
200 |
+ eps=eps, |
|
201 |
+ verbose=verbose, |
|
202 |
+ seed=seed, |
|
203 |
+ sns=sns, |
|
204 |
+ probs=probs, |
|
205 |
+ DF=DF, |
|
206 |
+ SNRMin=SNRMin, |
|
207 |
+ recallMin=recallMin, |
|
208 |
+ recallRegMin=recallRegMin, |
|
209 |
+ gender=gender, |
|
210 |
+ returnParams=returnParams, |
|
211 |
+ badSNP=badSNP) |
|
212 |
+ return(callSet) |
|
213 |
+ } |
|
214 |
+ if(missing(cdfName)) stop("must specify cdfName") |
|
215 |
+ if(!isValidCdfName(cdfName)) stop("cdfName not valid. see validCdfNames") |
|
216 |
+ if(missing(batch)){ |
|
217 |
+ warning("The batch variable is not specified. The scan date of the array will be used as a surrogate for batch. The batch variable does not affect the preprocessing or genotyping, but is important for copy number estimation.") |
|
218 |
+ } else { |
|
219 |
+ if(length(batch) != length(filenames)) |
|
220 |
+ stop("batch variable must be the same length as the filenames") |
|
221 |
+ } |
|
222 |
+ if(missing(sns)) sns <- basename(filenames) |
|
223 |
+ ## callSet contains potentially very big matrices |
|
224 |
+ callSet <- construct(filenames=filenames, |
|
225 |
+ cdfName=cdfName, |
|
226 |
+ copynumber=TRUE, |
|
227 |
+ sns=sns, |
|
228 |
+ verbose=verbose) |
|
229 |
+ if(missing(batch)){ |
|
230 |
+ protocolData(callSet)$batch <- as.numeric(as.factor(protocolData(callSet)$ScanDate)) |
|
231 |
+ } |
|
232 |
+ if(!missing(batch)) protocolData(callSet)$batch <- batch |
|
233 |
+ mixtureParams <- matrix(NA, 4, length(filenames)) |
|
234 |
+ snp.index <- which(isSnp(callSet)==1) |
|
235 |
+ snprmaRes <- snprma2(##A=A(callSet), |
|
236 |
+ ##B=B(callSet), |
|
237 |
+ filenames=filenames, |
|
238 |
+ mixtureSampleSize=mixtureSampleSize, |
|
239 |
+ fitMixture=TRUE, |
|
240 |
+ eps=eps, |
|
241 |
+ verbose=verbose, |
|
242 |
+ seed=seed, |
|
243 |
+ cdfName=cdfName, |
|
244 |
+ sns=sns) |
|
245 |
+ if(verbose) message("Finished preprocessing.") |
|
246 |
+ open(snprmaRes[["A"]]) |
|
247 |
+ open(snprmaRes[["B"]]) |
|
248 |
+ open(snprmaRes[["SNR"]]) |
|
249 |
+ open(snprmaRes[["SKW"]]) |
|
250 |
+ open(snprmaRes[["mixtureParams"]]) |
|
251 |
+ if(verbose) message("Updating elements of callSet") |
|
252 |
+## A(callSet) <- snprmaRes[["A"]] |
|
253 |
+## B(callSet) <- snprmaRes[["B"]] |
|
254 |
+ bb = ocProbesets()*ncol(A)*8 |
|
255 |
+ ffrowapply(A(callSet)[i1:i2, ] <- snprmaRes[["A"]][i1:i2, ], X=snprmaRes[["A"]], BATCHBYTES=bb) |
|
256 |
+ ffrowapply(B(callSet)[i1:i2, ] <- snprmaRes[["B"]][i1:i2, ], X=snprmaRes[["B"]], BATCHBYTES=bb) |
|
257 |
+ ##ffrowapply(A(callSet)[i1:i2, ] <- snprmaRes[["A"]][i1:i2, ], X=snprmaRes[["A"]]) |
|
258 |
+ ##ffrowapply(B(callSet)[i1:i2, ] <- snprmaRes[["B"]][i1:i2, ], X=snprmaRes[["B"]]) |
|
259 |
+ if(verbose) message("Finished updating elements of callSet") |
|
260 |
+ stopifnot(identical(featureNames(callSet)[snp.index], snprmaRes$gns)) |
|
261 |
+ pData(callSet)$SKW <- snprmaRes$SKW |
|
262 |
+ pData(callSet)$SNR <- snprmaRes$SNR |
|
263 |
+ mixtureParams <- snprmaRes$mixtureParams |
|
264 |
+ np.index <- which(isSnp(callSet) == 0) |
|
265 |
+ cnrmaRes <- cnrma2(A=A(callSet), |
|
266 |
+ filenames=filenames, |
|
267 |
+ row.names=featureNames(callSet)[np.index], |
|
268 |
+ cdfName=cdfName, |
|
269 |
+ sns=sns, |
|
270 |
+ seed=seed, |
|
271 |
+ verbose=verbose) |
|
272 |
+ rm(cnrmaRes); gc() |
|
273 |
+ ## as.matrix needed when ffdf is used |
|
274 |
+ tmp <- crlmmGT2(A=snprmaRes[["A"]], |
|
275 |
+ B=snprmaRes[["B"]], |
|
276 |
+ SNR=snprmaRes[["SNR"]], |
|
277 |
+ mixtureParams=snprmaRes[["mixtureParams"]], |
|
278 |
+ cdfName=cdfName, |
|
279 |
+ row.names=NULL, ##featureNames(callSet),##[snp.index], |
|
280 |
+ col.names=sampleNames(callSet), |
|
281 |
+ probs=probs, |
|
282 |
+ DF=DF, |
|
283 |
+ SNRMin=SNRMin, |
|
284 |
+ recallMin=recallMin, |
|
285 |
+ recallRegMin=recallRegMin, |
|
286 |
+ gender=gender, |
|
287 |
+ verbose=verbose, |
|
288 |
+ returnParams=returnParams, |
|
289 |
+ badSNP=badSNP) |
|
290 |
+ open(tmp[["calls"]]) |
|
291 |
+ open(tmp[["confs"]]) |
|
292 |
+ ##snpCall(callSet) <- tmp[["calls"]] |
|
293 |
+ ##snpCallProbability(callSet) <- tmp[["confs"]] |
|
294 |
+ bb = ocProbesets()*ncol(A)*8 |
|
295 |
+ ffrowapply(snpCall(callSet)[i1:i2, ] <- tmp[["calls"]][i1:i2, ], X=tmp[["calls"]], BATCHBYTES=bb) |
|
296 |
+ ffrowapply(snpCallProbability(callSet)[i1:i2, ] <- tmp[["confs"]][i1:i2, ], X=tmp[["confs"]], BATCHBYTES=bb) |
|
297 |
+## ffrowapply(snpCall(callSet)[i1:i2, ] <- tmp[["calls"]][i1:i2, ], X=tmp[["calls"]]) |
|
298 |
+## ffrowapply(snpCallProbability(callSet)[i1:i2, ] <- tmp[["confs"]][i1:i2, ], X=tmp[["confs"]]) |
|
299 |
+ callSet$gender <- tmp$gender |
|
300 |
+ cnSet <- as(callSet, "CNSetLM") |
|
301 |
+ return(cnSet) |
|
302 |
+} |
|
303 |
+ |
|
304 |
+snprma2RS <- function(A, B, |
|
305 |
+ filenames, mixtureSampleSize=10^5, fitMixture=FALSE, |
|
306 |
+ eps=0.1, verbose=TRUE, seed=1, cdfName, sns){ |
|
307 |
+ if (missing(sns)) sns <- basename(filenames) |
|
308 |
+ if (missing(cdfName)) |
|
309 |
+ cdfName <- read.celfile.header(filenames[1])[["cdfName"]] |
|
310 |
+ pkgname <- getCrlmmAnnotationName(cdfName) |
|
311 |
+ stopifnot(require(pkgname, character.only=TRUE, quietly=!verbose)) |
|
312 |
+ |
|
313 |
+ if(verbose) message("Loading annotations and mixture model parameters.") |
|
314 |
+ loader("preprocStuff.rda", .crlmmPkgEnv, pkgname) |
|
315 |
+ pnsa <- getVarInEnv("pnsa") |
|
316 |
+ pnsb <- getVarInEnv("pnsb") |
|
317 |
+ gns <- getVarInEnv("gns") |
|
318 |
+ |
|
319 |
+ ##We will read each cel file, summarize, and run EM one by one |
|
320 |
+ ##We will save parameters of EM to use later |
|
321 |
+ if(verbose) message("Initializing objects.") |
|
322 |
+ mixtureParams <- initializeBigMatrix("crlmmMixt-", 4, length(filenames), "double") |
|
323 |
+ SNR <- initializeBigVector("crlmmSNR-", length(filenames), "double") |
|
324 |
+ SKW <- initializeBigVector("crlmmSKW-", length(filenames), "double") |
|
325 |
+ |
|
326 |
+ ## This is the sample for the fitting of splines |
|
327 |
+ ## BC: I like better the idea of the user passing the seed, |
|
328 |
+ ## because this might intefere with other analyses |
|
329 |
+ ## (like what happened to GCRMA) |
|
330 |
+ ##S will hold (A+B)/2 and M will hold A-B |
|
331 |
+ ##NOTE: We actually dont need to save S. Only for pics etc... |
|
332 |
+ ##f is the correction. we save to avoid recomputing |
|
333 |
+## A <- initializeBigMatrix("crlmmA-", length(pnsa), length(filenames), "integer") |
|
334 |
+## B <- initializeBigMatrix("crlmmB-", length(pnsb), length(filenames), "integer") |
|
335 |
+ |
|
336 |
+ sampleBatches <- splitIndicesByNode(seq(along=filenames)) |
|
337 |
+ |
|
338 |
+ if(verbose) message("Processing ", length(filenames), " files.") |
|
339 |
+ |
|
340 |
+ ocLapply(sampleBatches, processCEL_RS, filenames=filenames, |
|
341 |
+ fitMixture=fitMixture, A=A, B=B, SKW=SKW, SNR=SNR, |
|
342 |
+ mixtureParams=mixtureParams, eps=eps, seed=seed, |
|
343 |
+ mixtureSampleSize=mixtureSampleSize, pkgname=pkgname, |
|
344 |
+ neededPkgs=c("crlmm", pkgname)) |
|
345 |
+ |
|
346 |
+ list(A=A, B=B, sns=sns, gns=gns, SNR=SNR, SKW=SKW, mixtureParams=mixtureParams, cdfName=cdfName) |
|
347 |
+} |
|
348 |
+ |
|
349 |
+processCEL_RS <- function(i, filenames, fitMixture, A, B, SKW, SNR, |
|
350 |
+ mixtureParams, eps, seed, mixtureSampleSize, |
|
351 |
+ pkgname){ |
|
352 |
+ |
|
353 |
+ loader("preprocStuff.rda", .crlmmPkgEnv, pkgname) |
|
354 |
+ loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname) |
|
355 |
+ loader("mixtureStuff.rda", .crlmmPkgEnv, pkgname) |
|
356 |
+ autosomeIndex <- getVarInEnv("autosomeIndex") |
|
357 |
+ pnsa <- getVarInEnv("pnsa") |
|
358 |
+ pnsb <- getVarInEnv("pnsb") |
|
359 |
+ fid <- getVarInEnv("fid") |
|
360 |
+ reference <- getVarInEnv("reference") |
|
361 |
+ aIndex <- getVarInEnv("aIndex") |
|
362 |
+ bIndex <- getVarInEnv("bIndex") |
|
363 |
+ SMEDIAN <- getVarInEnv("SMEDIAN") |
|
364 |
+ theKnots <- getVarInEnv("theKnots") |
|
365 |
+ gns <- getVarInEnv("gns") |
|
366 |
+ |
|
367 |
+ ## for mixture |
|
368 |
+ set.seed(seed) |
|
369 |
+ idx <- sort(sample(autosomeIndex, mixtureSampleSize)) |
|
370 |
+ ##for skewness. no need to do everything |
|
371 |
+ idx2 <- sample(length(fid), 10^5) |
|
372 |
+ |
|
373 |
+ open(A) |
|
374 |
+ open(B) |
|
375 |
+ open(SKW) |
|
376 |
+ open(mixtureParams) |
|
377 |
+ open(SNR) |
|
378 |
+ ##RS |
|
379 |
+ iii <- seq(along=pnsa) |
|
380 |
+ |
|
381 |
+ for (k in i){ |
|
382 |
+ y <- as.matrix(read.celfile(filenames[k], intensity.means.only=TRUE)[["INTENSITY"]][["MEAN"]][fid]) |
|
383 |
+ x <- log2(y[idx2]) |
|
384 |
+ SKW[k] <- mean((x-mean(x))^3)/(sd(x)^3) |
|
385 |
+ rm(x) |
|
386 |
+ y <- normalize.quantiles.use.target(y, target=reference) |
|
387 |
+ A[iii, k] <- intMedianSummaries(y[aIndex, 1, drop=FALSE], pnsa) |
|
388 |
+ B[iii, k] <- intMedianSummaries(y[bIndex, 1, drop=FALSE], pnsb) |
|
389 |
+ rm(y) |
|
390 |
+ |
|
391 |
+ if(fitMixture){ |
|
392 |
+ S <- (log2(A[idx,k])+log2(B[idx, k]))/2 - SMEDIAN |
|
393 |
+ M <- log2(A[idx, k])-log2(B[idx, k]) |
|
394 |
+ tmp <- fitAffySnpMixture56(S, M, theKnots, eps=eps) |
|
395 |
+ mixtureParams[, k] <- tmp[["coef"]] |
|
396 |
+ SNR[k] <- tmp[["medF1"]]^2/(tmp[["sigma1"]]^2+tmp[["sigma2"]]^2) |
|
397 |
+ } else { |
|
398 |
+ mixtureParams[, k] <- NA |
|
399 |
+ SNR[k] <- NA |
|
400 |
+ } |
|
401 |
+ } |
|
402 |
+ close(A) |
|
403 |
+ close(B) |
|
404 |
+ close(SKW) |
|
405 |
+ close(mixtureParams) |
|
406 |
+ close(SNR) |
|
407 |
+ TRUE |
|
408 |
+} |
|
409 |
+ |
|
410 |
+crlmmGT2_RS<- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL, |
|
411 |
+ col.names=NULL, probs=c(1/3, 1/3, 1/3), DF=6, |
|
412 |
+ SNRMin=5, recallMin=10, recallRegMin=1000, |
|
413 |
+ gender=NULL, desctrucitve=FALSE, verbose=TRUE, |
|
414 |
+ returnParams=FALSE, badSNP=.7){ |
|
415 |
+ open(SNR) |
|
416 |
+ open(A) |
|
417 |
+ open(B) |
|
418 |
+ open(mixtureParams) |
|
419 |
+ ## expect objects to be ff |
|
420 |
+ |
|
421 |
+ keepIndex <- which( SNR[] > SNRMin) |
|
422 |
+ if(length(keepIndex)==0) stop("No arrays above quality threshold!") |
|
423 |
+ |
|
424 |
+ NC <- ncol(A) |
|
425 |
+ NR <- nrow(A) |
|
426 |
+ |
|
427 |
+ pkgname <- getCrlmmAnnotationName(cdfName) |
|
428 |
+ stopifnot(require(pkgname, character.only=TRUE, quietly=!verbose)) |
|
429 |
+ |
|
430 |
+ if(verbose) message("Loading annotations.") |
|
431 |
+ loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname) |
|
432 |
+ loader("mixtureStuff.rda", .crlmmPkgEnv, pkgname) |
|
433 |
+ ## this is toget rid of the 'no visible binding' notes |
|
434 |
+ ## variable definitions |
|
435 |
+ XIndex <- getVarInEnv("XIndex") |
|
436 |
+ autosomeIndex <- getVarInEnv("autosomeIndex") |
|
437 |
+ YIndex <- getVarInEnv("YIndex") |
|
438 |
+ SMEDIAN <- getVarInEnv("SMEDIAN") |
|
439 |
+ theKnots <- getVarInEnv("theKnots") |
|
440 |
+ regionInfo <- getVarInEnv("regionInfo") |
|
441 |
+ params <- getVarInEnv("params") |
|
442 |
+ ##RS |
|
443 |
+ pnsa <- getVarInEnv("pnsa") |
|
444 |
+ NR <- length(pnsa) |
|
445 |
+ |
|
446 |
+ ##IF gender not provide, we predict |
|
447 |
+ if(is.null(gender)){ |
|
448 |
+ if(verbose) message("Determining gender.") |
|
449 |
+ XMedian <- apply(log2(A[XIndex,, drop=FALSE])+log2(B[XIndex,, drop=FALSE]), 2, median)/2 |
|
450 |
+ if(sum(SNR[] > SNRMin)==1){ |
|
451 |
+ gender <- which.min(c(abs(XMedian-8.9), abs(XMedian-9.5))) |
|
452 |
+ }else{ |
|
453 |
+ gender <- kmeans(XMedian, c(min(XMedian[SNR[]>SNRMin]), max(XMedian[SNR[]>SNRMin])))[["cluster"]] |
|
454 |
+ } |
|
455 |
+ } |
|
456 |
+ |
|
457 |
+ Indexes <- list(autosomeIndex, XIndex, YIndex) |
|
458 |
+ cIndexes <- list(keepIndex, |
|
459 |
+ keepIndex[which(gender[keepIndex]==2)], |
|
460 |
+ keepIndex[which(gender[keepIndex]==1)]) |
|
461 |
+ |
|
462 |
+ if(verbose) cat("Calling", NR, "SNPs for recalibration... ") |
|
463 |
+ |
|
464 |
+ ## call C |
|
465 |
+ fIndex <- which(gender==2) |
|
466 |
+ mIndex <- which(gender==1) |
|
467 |
+ |
|
468 |
+ ## different here |
|
469 |
+ ## use gtypeCallerR in batches |
|
470 |
+ ##RS |
|
471 |
+ snpBatches <- splitIndicesByLength(1:NR, ocProbesets()) |
|
472 |
+ newparamsBatch <- vector("list", length(snpBatches)) |
|
473 |
+ |
|
474 |
+ process1 <- function(idxBatch, snpBatches, autosomeIndex, XIndex, |
|
475 |
+ YIndex, A, B, mixtureParams, fIndex, mIndex, |
|
476 |
+ params, cIndexes, SMEDIAN, theKnots, DF, probs, batchSize){ |
|
477 |
+ open(A) |
|
478 |
+ open(B) |
|
479 |
+ open(mixtureParams) |
|
480 |
+ snps <- snpBatches[[idxBatch]] |
|
481 |
+ rSnps <- range(snps) |
|
482 |
+ last <- (idxBatch-1)*batchSize |
|
483 |
+ IndexesBatch <- list(autosomeIndex[autosomeIndex %in% snps]-last, |
|
484 |
+ XIndex[XIndex %in% snps]-last, |
|
485 |
+ YIndex[YIndex %in% snps]-last) |
|
486 |
+ IndexesBatch <- lapply(IndexesBatch, as.integer) |
|
487 |
+ tmpA <- as.matrix(A[snps,]) |
|
488 |
+ tmpB <- as.matrix(B[snps,]) |
|
489 |
+ ## newparamsBatch[[idxBatch]] |
|
490 |
+ |
|
491 |
+ tmp <- gtypeCallerR(tmpA, tmpB, fIndex, mIndex, |
|
492 |
+ params[["centers"]][snps,], |
|
493 |
+ params[["scales"]][snps,], |
|
494 |
+ params[["N"]][snps,], |
|
495 |
+ IndexesBatch, cIndexes, |
|
496 |
+ sapply(IndexesBatch, length), |
|
497 |
+ sapply(cIndexes, length), SMEDIAN, |
|
498 |
+ theKnots, mixtureParams[], DF, probs, 0.025) |
|
499 |
+ |
|
500 |
+ last <- rSnps[2] |
|
501 |
+ rm(snps, rSnps, IndexesBatch, tmpA, tmpB) |
|
502 |
+ close(A) |
|
503 |
+ close(B) |
|
504 |
+ close(mixtureParams) |
|
505 |
+ tmp |
|
506 |
+ } |
|
507 |
+ |
|
508 |
+ newparamsBatch <- ocLapply(seq(along=snpBatches), process1, |
|
509 |
+ snpBatches=snpBatches, |
|
510 |
+ autosomeIndex=autosomeIndex, XIndex=XIndex, |
|
511 |
+ YIndex=YIndex, A=A, B=B, |
|
512 |
+ mixtureParams=mixtureParams, fIndex=fIndex, |
|
513 |
+ mIndex=mIndex, params=params, |
|
514 |
+ cIndexes=cIndexes, SMEDIAN=SMEDIAN, |
|
515 |
+ theKnots=theKnots, DF=DF, probs=probs, |
|
516 |
+ batchSize=ocProbesets()) |
|
517 |
+## last <- 0 |
|
518 |
+## for (idxBatch in seq(along=snpBatches)){ |
|
519 |
+## snps <- snpBatches[[idxBatch]] |
|
520 |
+## rSnps <- range(snps) |
|
521 |
+## IndexesBatch <- list(autosomeIndex[autosomeIndex %in% snps]-last, |
|
522 |
+## XIndex[XIndex %in% snps]-last, |
|
523 |
+## YIndex[YIndex %in% snps]-last) |
|
524 |
+## IndexesBatch <- lapply(IndexesBatch, as.integer) |
|
525 |
+## tmpA <- A[snps,] |
|
526 |
+## tmpB <- B[snps,] |
|
527 |
+## newparamsBatch[[idxBatch]] <- gtypeCallerR(tmpA, tmpB, fIndex, mIndex, |
|
528 |
+## params[["centers"]][snps,], |
|
529 |
+## params[["scales"]][snps,], |
|
530 |
+## params[["N"]][snps,], |
|
531 |
+## IndexesBatch, cIndexes, |
|
532 |
+## sapply(IndexesBatch, length), |
|
533 |
+## sapply(cIndexes, length), |
|
534 |
+## SMEDIAN, theKnots, |
|
535 |
+## mixtureParams[], DF, probs, 0.025) |
|
536 |
+## last <- rSnps[2] |
|
537 |
+## rm(snps, rSnps, IndexesBatch, tmpA, tmpB) |
|
538 |
+## } |
|
539 |
+## rm(last) |
|
540 |
+ |
|
541 |
+ newparams <- vector("list", 3) |
|
542 |
+ names(newparams) <- c("centers", "scales", "N") |
|
543 |
+ newparams[["centers"]] <- do.call("rbind", lapply(newparamsBatch, "[[", 1)) |
|
544 |
+ newparams[["scales"]] <- do.call("rbind", lapply(newparamsBatch, "[[", 2)) |
|
545 |
+ newparams[["N"]] <- do.call("rbind", lapply(newparamsBatch, "[[", 3)) |
|
546 |
+ rm(newparamsBatch) |
|
547 |
+ if(verbose) message("Done.") |
|
548 |
+ if(verbose) message("Estimating recalibration parameters.") |
|
549 |
+ d <- newparams[["centers"]] - params$centers |
|
550 |
+ |
|
551 |
+ ##regression |
|
552 |
+ Index <- intersect(which(pmin(newparams[["N"]][, 1], |
|
553 |
+ newparams[["N"]][, 2], |
|
554 |
+ newparams[["N"]][, 3]) > recallMin & |
|
555 |
+ !apply(regionInfo, 1, any)), |
|
556 |
+ autosomeIndex) |
|
557 |
+ if(length(Index) < recallRegMin){ |
|
558 |
+ warning("Recalibration not possible. Possible cause: small sample size.") |
|
559 |
+ newparams <- params |
|
560 |
+ dev <- vector("numeric", nrow(newparams[["centers"]])) |
|
561 |
+ SS <- matrix(Inf, 3, 3) |
|
562 |
+ DD <- 0 |
|
563 |
+ }else{ |
|
564 |
+ data4reg <- as.data.frame(newparams[["centers"]][Index,]) |
|
565 |
+ names(data4reg) <- c("AA", "AB", "BB") |
|
566 |
+ regParams <- cbind( coef(lm(AA~AB*BB, data=data4reg)), |
|
567 |
+ c(coef(lm(AB~AA+BB, data=data4reg)), 0), |
|
568 |
+ coef(lm(BB~AA*AB, data=data4reg))) |
|
569 |
+ rownames(regParams) <- c("intercept", "X", "Y", "XY") |
|
570 |
+ rm(data4reg) |
|
571 |
+ |
|
572 |
+ minN <- 3 |
|
573 |
+ newparams[["centers"]][newparams[["N"]] < minN] <- NA |
|
574 |
+ Index <- setdiff(which(rowSums(is.na(newparams[["centers"]]))==1), YIndex) |
|
575 |
+ if(verbose) cat("Filling out empty centers") |
|
576 |
+ for(i in Index){ |
|
577 |
+ if(verbose) if(i%%10000==0)cat(".") |
|
578 |
+ mu <- newparams[["centers"]][i, ] |
|
579 |
+ j <- which(is.na(mu)) |
|
580 |
+ newparams[["centers"]][i, j] <- c(1, mu[-j], prod(mu[-j]))%*%regParams[, j] |
|
581 |
+ } |
|
582 |
+ |
|
583 |
+ ##remaing NAs are made like originals |
|
584 |
+ if(length(YIndex)>0){ |
|
585 |
+ noMoveIndex <- union(setdiff(which(rowSums(is.na(newparams[["centers"]]))>0), YIndex), |
|
586 |
+ YIndex[rowSums(is.na(newparams[["centers"]][YIndex, ])>1)]) |
|
587 |
+ } |
|
588 |
+ snps2ignore <- which(rowSums(is.na(newparams[["centers"]])) > 0) |
|
589 |
+ snps2keep <- setdiff(autosomeIndex, snps2ignore) |
|
590 |
+ rm(snps2ignore) |
|
591 |
+ newparams[["centers"]][is.na(newparams[["centers"]])] <- params[["centers"]][is.na(newparams[["centers"]])] |
|
592 |
+ if(verbose) cat("\n") |
|
593 |
+ |
|
594 |
+ if(verbose) message("Calculating and standardizing size of shift.") |
|
595 |
+ GG <- DD <- newparams[["centers"]] - params[["centers"]] |
|
596 |
+ DD <- sweep(DD, 2, colMeans(DD[autosomeIndex, ])) |
|
597 |
+ SS <- cov(DD[autosomeIndex, ]) |
|
598 |
+ SSI <- solve(SS) |
|
599 |
+ dev <- vector("numeric", nrow(DD)) |
|
600 |
+ if(length(YIndex)){ |
|
601 |
+ dev[-YIndex] <- apply(DD[-YIndex, ], 1, function(x) x%*%SSI%*%x) |
|
602 |
+ dev[-YIndex] <- 1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev[-YIndex]) |
|
603 |
+ ##Now Y (only two params) |
|
604 |
+ SSY <- SS[c(1, 3), c(1, 3)] |
|
605 |
+ SSI <- solve(SSY) |
|
606 |
+ dev[YIndex] <- apply(DD[YIndex, c(1, 3)], 1, function(x) x%*%SSI%*%x) |
|
607 |
+ dev[YIndex] <- 1/sqrt( (2*pi)^2*det(SSY))*exp(-0.5*dev[YIndex]) |
|
608 |
+ } else { |
|
609 |
+ dev=apply(DD,1,function(x) x%*%SSI%*%x) |
|
610 |
+ dev=1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev) |
|
611 |
+ } |
|
612 |
+ } |
|
613 |
+ |
|
614 |
+ ## BC: must keep SD |
|
615 |
+ params[-2] <- newparams[-2] |
|
616 |
+ |
|
617 |
+ rm(newparams);gc(verbose=FALSE) |
|
618 |
+ if(verbose) cat("Calling", NR, "SNPs... ") |
|
619 |
+ ## ################### |
|
620 |
+ ## ## MOVE TO C####### |
|
621 |
+ |
|
622 |
+ ## running in batches |
|
623 |
+ ## snpBatches <- splitIndicesByLength(1:nrow(A), ocProbesets()) |
|
624 |
+ |
|
625 |
+ process2 <- function(idxBatch, snpBatches, autosomeIndex, XIndex, |
|
626 |
+ YIndex, A, B, mixtureParams, fIndex, mIndex, |
|
627 |
+ params, cIndexes, SMEDIAN, theKnots, DF, probs, |
|
628 |
+ regionInfo, batchSize){ |
|
629 |
+ open(A) |
|
630 |
+ open(B) |
|
631 |
+ open(mixtureParams) |
|
632 |
+ snps <- snpBatches[[idxBatch]] |
|
633 |
+ tmpA <- as.matrix(A[snps,]) |
|
634 |
+ tmpB <- as.matrix(B[snps,]) |
|
635 |
+ rSnps <- range(snps) |
|
636 |
+ last <- (idxBatch-1)*batchSize |
|
637 |
+ IndexesBatch <- list(autosomeIndex[autosomeIndex %in% snps]-last, |
|
638 |
+ XIndex[XIndex %in% snps]-last, |
|
639 |
+ YIndex[YIndex %in% snps]-last) |
|
640 |
+ IndexesBatch <- lapply(IndexesBatch, as.integer) |
|
641 |
+ ImNull <- gtypeCallerR2(tmpA, tmpB, fIndex, mIndex, |
|
642 |
+ params[["centers"]][snps,], |
|
643 |
+ params[["scales"]][snps,], |
|
644 |
+ params[["N"]][snps,], |
|
645 |
+ IndexesBatch, cIndexes, |
|
646 |
+ sapply(IndexesBatch, length), |
|
647 |
+ sapply(cIndexes, length), |
|
648 |
+ SMEDIAN, theKnots, mixtureParams[], |
|
649 |
+ DF, probs, 0.025, |
|
650 |
+ which(regionInfo[snps, 2]), |
|
651 |
+ which(regionInfo[snps, 1])) |
|
652 |
+ A[snps,] <- tmpA |
|
653 |
+ B[snps,] <- tmpB |
|
654 |
+ last <- rSnps[2] |
|
655 |
+ rm(tmpA, tmpB, snps, rSnps, IndexesBatch, ImNull) |
|
656 |
+ close(A) |
|
657 |
+ close(B) |
|
658 |
+ close(mixtureParams) |
|
659 |
+ } |
|
660 |
+ |
|
661 |
+ ocLapply(seq(along=snpBatches), process2, snpBatches=snpBatches, |
|
662 |
+ autosomeIndex=autosomeIndex, XIndex=XIndex, YIndex=YIndex, |
|
663 |
+ A=A, B=B, mixtureParams=mixtureParams, fIndex=fIndex, |
|
664 |
+ mIndex=mIndex, params=params, cIndexes=cIndexes, |
|
665 |
+ SMEDIAN=SMEDIAN, theKnots=theKnots, DF=DF, probs=probs, |
|
666 |
+ regionInfo=regionInfo, batchSize=ocProbesets()) |
|
667 |
+ |
|
668 |
+## last <- 0 |
|
669 |
+## for (idxBatch in seq(along=snpBatches)){ |
|
670 |
+## snps <- snpBatches[[idxBatch]] |
|
671 |
+## tmpA <- A[snps,] |
|
672 |
+## tmpB <- B[snps,] |
|
673 |
+## rSnps <- range(snps) |
|
674 |
+## IndexesBatch <- list(autosomeIndex[autosomeIndex %in% snps]-last, |
|
675 |
+## XIndex[XIndex %in% snps]-last, |
|
676 |
+## YIndex[YIndex %in% snps]-last) |
|
677 |
+## IndexesBatch <- lapply(IndexesBatch, as.integer) |
|
678 |
+## ImNull <- gtypeCallerR2(tmpA, tmpB, fIndex, mIndex, |
|
679 |
+## params[["centers"]][snps,], |
|
680 |
+## params[["scales"]][snps,], |
|
681 |
+## params[["N"]][snps,], |
|
682 |
+## IndexesBatch, cIndexes, |
|
683 |
+## sapply(IndexesBatch, length), |
|
684 |
+## sapply(cIndexes, length), |
|
685 |
+## SMEDIAN, theKnots, mixtureParams[], |
|
686 |
+## DF, probs, 0.025, |
|
687 |
+## which(regionInfo[snps, 2]), |
|
688 |
+## which(regionInfo[snps, 1])) |
|
689 |
+## A[snps,] <- tmpA |
|
690 |
+## B[snps,] <- tmpB |
|
691 |
+## last <- rSnps[2] |
|
692 |
+## rm(tmpA, tmpB, snps, rSnps, IndexesBatch, ImNull) |
|
693 |
+## } |
|
694 |
+## |
|
695 |
+## gc(verbose=FALSE) |
|
696 |
+ ## END MOVE TO C####### |
|
697 |
+ ## ################## |
|
698 |
+ |
|
699 |
+ dev <- dev/(dev+1/383) |
|
700 |
+ if(!is.null(row.names)){ rownames(A) <- rownames(B) <- names(dev) <- row.names} |
|
701 |
+ if(!is.null(col.names)){ colnames(A) <- colnames(B) <- col.names} |
|
702 |
+ |
|
703 |
+ if(length(Index) >= recallRegMin){ |
|
704 |
+ tmp4batchQC <- DD[autosomeIndex,]/(params[["N"]][autosomeIndex,]+1) |
|
705 |
+ tmpSnpQc <- dev[autosomeIndex] |
|
706 |
+ SS <- cov(tmp4batchQC[tmpSnpQc < badSNP,]) |
|
707 |
+ batchQC <- mean(diag(SS)) |
|
708 |
+ }else{ |
|
709 |
+ SS <- matrix(0, 3, 3) |
|
710 |
+ batchQC <- Inf |
|
711 |
+ } |
|
712 |
+ |
|
713 |
+ if(verbose) message("Done.") |
|
714 |
+ if (returnParams){ |
|
715 |
+ return(list(calls=A, confs=B, SNPQC=dev, batchQC=batchQC, params=params, DD=DD, covDD=SS, gender=gender, pkgname=pkgname)) |
|
716 |
+ }else{ |
|
717 |
+ return(list(calls=A, confs=B, SNPQC=dev, batchQC=batchQC, DD=DD, covDD=SS, gender=gender, pkgname=pkgname)) |
|
718 |
+ } |
|
719 |
+} |
|
720 |
+ |
|
721 |
+ |
|
178 | 722 |
##--------------------------------------------------------------------------- |
179 | 723 |
##--------------------------------------------------------------------------- |
180 | 724 |
## For Illumina |
... | ... |
@@ -252,6 +796,7 @@ constructRG <- function(filenames, cdfName, sns, verbose, fileExt, sep, sampleSh |
252 | 796 |
} |
253 | 797 |
crlmmIlluminaRS <- function(sampleSheet=NULL, |
254 | 798 |
arrayNames=NULL, |
799 |
+ batch, |
|
255 | 800 |
ids=NULL, |
256 | 801 |
path=".", |
257 | 802 |
arrayInfoColNames=list(barcode="SentrixBarcode_A", position="SentrixPosition_A"), |
... | ... |
@@ -266,10 +811,18 @@ crlmmIlluminaRS <- function(sampleSheet=NULL, |
266 | 811 |
seed=1, save.ab=FALSE, snpFile, cnFile, |
267 | 812 |
mixtureSampleSize=10^5, eps=0.1, verbose=TRUE, |
268 | 813 |
cdfName, sns, recallMin=10, recallRegMin=1000, |
269 |
- returnParams=FALSE, badSNP=.7) { |
|
814 |
+ returnParams=FALSE, badSNP=.7, |
|
815 |
+ copynumber=FALSE, |
|
816 |
+ load.it=TRUE) { |
|
270 | 817 |
if(missing(cdfName)) stop("must specify cdfName") |
271 | 818 |
if(!isValidCdfName(cdfName)) stop("cdfName not valid. see validCdfNames") |
272 | 819 |
if(missing(sns)) sns <- basename(arrayNames) |
820 |
+ if(missing(batch)){ |
|
821 |
+ warning("The batch variable is not specified. The scan date of the array will be used as a surrogate for batch. The batch variable does not affect the preprocessing or genotyping, but is important for copy number estimation.") |
|
822 |
+ } else { |
|
823 |
+ if(length(batch) != length(sns)) |
|
824 |
+ stop("batch variable must be the same length as the filenames") |
|
825 |
+ } |
|
273 | 826 |
batches <- splitIndicesByLength(seq(along=arrayNames), ocSamples()) |
274 | 827 |
k <- 1 |
275 | 828 |
for(j in batches){ |
... | ... |
@@ -300,13 +853,17 @@ crlmmIlluminaRS <- function(sampleSheet=NULL, |
300 | 853 |
## Here, I'm just using the # of rows returned from the above function |
301 | 854 |
if(k == 1){ |
302 | 855 |
if(verbose) message("Initializing container for alleleA, alleleB, call, callProbability") |
303 |
- callSet <- new("SnpSuperSet", |
|
304 |
- alleleA=initializeBigMatrix(name="A", nr=nrow(res[[1]]), nc=length(sns)), |
|
305 |
- alleleB=initializeBigMatrix(name="B", nr=nrow(res[[1]]), nc=length(sns)), |
|
306 |
- call=initializeBigMatrix(name="call", nr=nrow(res[[1]]), nc=length(sns)), |
|
307 |
- callProbability=initializeBigMatrix(name="callPr", nr=nrow(res[[1]]), nc=length(sns)), |
|
308 |
- annotation=cdfName) |
|
309 |
- sampleNames(callSet) <- sns |
|
856 |
+ load.obj <- loadObject("callSet", load.it) |
|
857 |
+ if(!load.obj){ |
|
858 |
+ callSet <- new("SnpSuperSet", |
|
859 |
+ alleleA=initializeBigMatrix(name="A", nr=nrow(res[[1]]), nc=length(sns)), |
|
860 |
+ alleleB=initializeBigMatrix(name="B", nr=nrow(res[[1]]), nc=length(sns)), |
|
861 |
+ call=initializeBigMatrix(name="call", nr=nrow(res[[1]]), nc=length(sns)), |
|
862 |
+ callProbability=initializeBigMatrix(name="callPr", nr=nrow(res[[1]]), nc=length(sns)), |
|
863 |
+ annotation=cdfName) |
|
864 |
+ sampleNames(callSet) <- sns |
|
865 |
+ save(callSet, file=file.path(ldPath(), "callSet.rda")) |
|
866 |
+ } else load(file.path(ldPath(), "callSet.rda")) |
|
310 | 867 |
phenoData(callSet) <- getPhenoData(sampleSheet=sampleSheet, |
311 | 868 |
arrayNames=sns, |
312 | 869 |
arrayInfoColNames=arrayInfoColNames) |
... | ... |
@@ -315,9 +872,21 @@ crlmmIlluminaRS <- function(sampleSheet=NULL, |
315 | 872 |
protocolData(callSet) <- new("AnnotatedDataFrame", data=pD) |
316 | 873 |
pData(protocolData(callSet))[j, ] <- pData(protocolData) |
317 | 874 |
featureNames(callSet) <- res[["gns"]] |
318 |
- pData(callSet)$SKW <- rep(NA, length(sns)) |
|
319 |
- pData(callSet)$SNR <- rep(NA, length(sns)) |
|
875 |
+ pData(callSet)$SNR <- initializeBigVector("crlmmSNR-", length(sns), "double") |
|
876 |
+ pData(callSet)$SKW <- initializeBigVector("crlmmSKW-", length(sns), "double") |
|
877 |
+ ##pData(callSet)$SKW <- rep(NA, length(sns)) |
|
878 |
+ ##pData(callSet)$SNR <- rep(NA, length(sns)) |
|
320 | 879 |
pData(callSet)$gender <- rep(NA, length(sns)) |
880 |
+ mixtureParams <- initializeBigMatrix("crlmmMixt-", nr=4, nc=ncol(callSet), vmode="double") |
|
881 |
+ if(missing(batch)){ |
|
882 |
+ protocolData(callSet)$batch <- rep(NA, length(sns)) |
|
883 |
+ } else{ |
|
884 |
+ protocolData(callSet)$batch <- batch |
|
885 |
+ } |
|
886 |
+ featureData(callSet) <- addFeatureAnnotation(callSet) |
|
887 |
+ open(mixtureParams) |
|
888 |
+ open(callSet$SNR) |
|
889 |
+ open(callSet$SKW) |
|
321 | 890 |
} |
322 | 891 |
if(k > 1 & nrow(res[[1]]) != nrow(callSet)){ |
323 | 892 |
##RS: I don't understand why the IDATS for the |
... | ... |
@@ -325,40 +894,61 @@ crlmmIlluminaRS <- function(sampleSheet=NULL, |
325 | 894 |
res[["A"]] <- res[["A"]][res$gns %in% featureNames(callSet), ] |
326 | 895 |
res[["B"]] <- res[["B"]][res$gns %in% featureNames(callSet), ] |
327 | 896 |
} |
897 |
+ if(missing(batch)){ |
|
898 |
+ protocolData(callSet)$batch[j] <- as.numeric(as.factor(protocolData$ScanDate)) |
|
899 |
+ } |
|
328 | 900 |
## MR: we need to define a snp.index vs np.index |
329 | 901 |
snp.index <- match(res$gns, featureNames(callSet)) |
330 |
- suppressWarnings(A(callSet)[snp.index, j] <- res[["A"]]) |
|
331 |
- suppressWarnings(B(callSet)[snp.index, j] <- res[["B"]]) |
|
902 |
+ A(callSet)[snp.index, j] <- res[["A"]] |
|
903 |
+ B(callSet)[snp.index, j] <- res[["B"]] |
|
332 | 904 |
pData(callSet)$SKW[j] <- res$SKW |
333 | 905 |
pData(callSet)$SNR[j] <- res$SNR |
334 |
- mixtureParams <- res$mixtureParams |
|
906 |
+ mixtureParams[, j] <- res$mixtureParams |
|
335 | 907 |
rm(res); gc() |
336 |
- ##MR: edit snp.index |
|
337 |
- ##snp.index <- 1:nrow(callSet) |
|
338 |
- tmp <- crlmmGT(A=as.matrix(A(callSet)[, j]), |
|
339 |
- B=as.matrix(B(callSet)[, j]), |
|
340 |
- SNR=callSet$SNR[j], |
|
341 |
- mixtureParams=mixtureParams, |
|
342 |
- cdfName=annotation(callSet), |
|
343 |
- row.names=featureNames(callSet), |
|
344 |
- col.names=sampleNames(callSet)[j], |
|
345 |
- probs=probs, |
|
346 |
- DF=DF, |
|
347 |
- SNRMin=SNRMin, |
|
348 |
- recallMin=recallMin, |
|
349 |
- recallRegMin=recallRegMin, |
|
350 |
- gender=gender, |
|
351 |
- verbose=verbose, |
|
352 |
- returnParams=returnParams, |
|
353 |
- badSNP=badSNP) |
|
354 |
- suppressWarnings(snpCall(callSet)[, j] <- tmp[["calls"]]) |
|
355 |
- ## MR: many zeros in the conf. scores (?) |
|
356 |
- suppressWarnings(snpCallProbability(callSet)[, j] <- tmp[["confs"]]) |
|
357 |
- callSet$gender[j] <- tmp$gender |
|
358 |
- rm(tmp); gc() |
|
359 | 908 |
k <- k+1 |
360 | 909 |
} |
361 |
- return(callSet) |
|
910 |
+ tmp <- crlmmGT2(A=A(callSet), |
|
911 |
+ B=B(callSet), |
|
912 |
+ SNR=callSet$SNR, |
|
913 |
+ mixtureParams=mixtureParams, |
|
914 |
+ cdfName=annotation(callSet), |
|
915 |
+ row.names=featureNames(callSet), |
|
916 |
+ col.names=sampleNames(callSet), |
|
917 |
+ probs=probs, |
|
918 |
+ DF=DF, |
|
919 |
+ SNRMin=SNRMin, |
|
920 |
+ recallMin=recallMin, |
|
921 |
+ recallRegMin=recallRegMin, |
|
922 |
+ gender=gender, |
|
923 |
+ verbose=verbose, |
|
924 |
+ returnParams=returnParams, |
|
925 |
+ badSNP=badSNP) |
|
926 |
+ open(tmp[["calls"]]) |
|
927 |
+ open(tmp[["confs"]]) |
|
928 |
+ snpCall(callSet) <- tmp[["calls"]] |
|
929 |
+ ## MR: many zeros in the conf. scores (?) |
|
930 |
+ snpCallProbability(callSet) <- tmp[["confs"]] |
|
931 |
+ callSet$gender <- tmp$gender |
|
932 |
+ if(copynumber){ |
|
933 |
+ load.obj <- loadObject("cnSet", load.it) |
|
934 |
+ if(!load.obj){ |
|
935 |
+ cnSet <- as(callSet, "CNSetLM") |
|
936 |
+ } else { |
|
937 |
+ load(file.path(ldPath(), "cnSet.rda")) |
|
938 |
+ A(cnSet) <- A(callSet) |
|
939 |
+ B(cnSet) <- B(callSet) |
|
940 |
+ snpCall(cnSet) <- snpCall(callSet) |
|
941 |
+ snpCallProbability(cnSet) <- snpCallProbability(callSet) |
|
942 |
+ annotation(cnSet) <- annotation(callSet) |
|
943 |
+ featureData(cnSet) <- featureData(callSet) |
|
944 |
+ protocolData(cnSet) <- protocolData(callSet) |
|
945 |
+ phenoData(cnSet) <- phenoData(callSet) |
|
946 |
+ experimentData(cnSet) <- experimentData(callSet) |
|
947 |
+ } |
|
948 |
+ } |
|
949 |
+ close(mixtureParams) |
|
950 |
+ rm(tmp); gc() |
|
951 |
+ return(cnSet) |
|
362 | 952 |
} |
363 | 953 |
##--------------------------------------------------------------------------- |
364 | 954 |
##--------------------------------------------------------------------------- |
... | ... |
@@ -377,6 +967,29 @@ rowMAD <- function(x, y, ...){ |
377 | 967 |
return(mad) |
378 | 968 |
} |
379 | 969 |
|
970 |
+shrink <- function(x, Ns, DF.PRIOR){ |
|
971 |
+ DF <- Ns-1 |
|
972 |
+ DF[DF < 1] <- 1 |
|
973 |
+ x.0 <- apply(x, 2, median, na.rm=TRUE) |
|
974 |
+ x <- (x*DF + x.0*DF.PRIOR)/(DF.PRIOR + DF) |
|
975 |
+ for(j in 1:ncol(x)) x[is.na(x[, j]), j] <- x.0[j] |
|
976 |
+ return(x) |
|
977 |
+} |
|
978 |
+ |
|
979 |
+applyByGenotype <- function(x, FUN, G){ |
|
980 |
+ FUN <- match.fun(FUN) |
|
981 |
+ tmp <- matrix(NA, nrow(x), 3) |
|
982 |
+ for(j in 1:3){ |
|
983 |
+ GT <- G==j |
|
984 |
+ GT[GT == FALSE] <- NA |
|
985 |
+ gt.x <- GT*x |
|
986 |
+ tmp[, j] <- FUN(gt.x, na.rm=TRUE) |
|
987 |
+ } |
|
988 |
+ tmp |
|
989 |
+} |
|
990 |
+ |
|
991 |
+ |
|
992 |
+ |
|
380 | 993 |
rowCors <- function(x, y, ...){ |
381 | 994 |
N <- rowSums(!is.na(x)) |
382 | 995 |
x <- suppressWarnings(log2(x)) |
... | ... |
@@ -387,6 +1000,15 @@ rowCors <- function(x, y, ...){ |
387 | 1000 |
return(covar/(sd.x*sd.y)) |
388 | 1001 |
} |
389 | 1002 |
|
1003 |
+corByGenotype <- function(A, B, G, Ns, which.cluster=c(1,2,3)[1], DF.PRIOR){ |
|
1004 |
+ x <- A * (G == which.cluster) |
|
1005 |
+ x[x==0] <- NA |
|
1006 |
+ y <- B * (G == which.cluster) |
|
1007 |
+ res <- as.matrix(rowCors(x, y, na.rm=TRUE)) |
|
1008 |
+ cors <- shrink(res, Ns[, which.cluster], DF.PRIOR) |
|
1009 |
+ cors |
|
1010 |
+} |
|
1011 |
+ |
|
390 | 1012 |
generateX <- function(w, X) as.numeric(diag(w) %*% X) |
391 | 1013 |
generateIXTX <- function(x, nrow=3) { |
392 | 1014 |
X <- matrix(x, nrow=nrow) |
... | ... |
@@ -394,6 +1016,61 @@ generateIXTX <- function(x, nrow=3) { |
394 | 1016 |
solve(XTX) |
395 | 1017 |
} |
396 | 1018 |
|
1019 |
+nuphiAllele2 <- function(allele, Ystar, W, Ns){ |
|
1020 |
+ complete <- rowSums(is.na(W)) == 0 |
|
1021 |
+ if(any(!is.finite(W))){## | any(!is.finite(V))){ |
|
1022 |
+ i <- which(rowSums(!is.finite(W)) > 0) |
|
1023 |
+ stop("Possible zeros in the within-genotype estimates of the spread (vA, vB). ") |
|
1024 |
+ } |
|
1025 |
+ NOHET <- mean(Ns[, 2], na.rm=TRUE) < 0.05 |
|
1026 |
+ if(missing(allele)) stop("must specify allele") |
|
1027 |
+ if(allele == "A") X <- cbind(1, 2:0) else X <- cbind(1, 0:2) |
|
1028 |
+ if(NOHET) X <- X[-2, ] ##more than 1 X chromosome, but all homozygous |
|
1029 |
+ ##How to quickly generate Xstar, Xstar = diag(W) %*% X |
|
1030 |
+ Xstar <- apply(W, 1, generateX, X) |
|
1031 |
+ IXTX <- apply(Xstar, 2, generateIXTX, nrow=nrow(X)) |
|
1032 |
+ betahat <- matrix(NA, 2, nrow(Ystar)) |
|
1033 |
+ ses <- matrix(NA, 2, nrow(Ystar)) |
|
1034 |
+ for(i in 1:nrow(Ystar)){ |
|
1035 |
+ betahat[, i] <- crossprod(matrix(IXTX[, i], ncol(X), ncol(X)), crossprod(matrix(Xstar[, i], nrow=nrow(X)), Ystar[i, ])) |
|
1036 |
+ ssr <- sum((Ystar[i, ] - matrix(Xstar[, i], nrow(X), ncol(X)) %*% matrix(betahat[, i], ncol(X), 1))^2) |
|
1037 |
+ ses[, i] <- sqrt(diag(matrix(IXTX[, i], ncol(X), ncol(X)) * ssr)) |
|
1038 |
+ } |
|
1039 |
+ nu <- betahat[1, ] |
|
1040 |
+ phi <- betahat[2, ] |
|
1041 |
+ return(list(nu, phi)) |
|
1042 |
+} |
|
1043 |
+ |
|
1044 |
+nuphiAlleleX <- function(allele, Ystar, W, Ns, chrX=FALSE){ |
|
1045 |
+ complete <- rowSums(is.na(W)) == 0 |
|
1046 |
+ if(any(!is.finite(W))){## | any(!is.finite(V))){ |
|
1047 |
+ i <- which(rowSums(!is.finite(W)) > 0) |
|
1048 |
+ stop("Possible zeros in the within-genotype estimates of the spread (vA, vB). ") |
|
1049 |
+ } |
|
1050 |
+ ##NOHET <- mean(Ns[, 2], na.rm=TRUE) < 0.05 |
|
1051 |
+ if(missing(allele)) stop("must specify allele") |
|
1052 |
+ if(allele == "A") X <- cbind(1, c(1, 0, 2, 1, 0), c(0, 1, 0, 1, 2)) |
|
1053 |
+ if(allele == "B") X <- cbind(1, c(0, 1, 0, 1, 2), c(1, 0, 2, 1, 0)) |
|
1054 |
+ ##if(NOHET) X <- X[-2, ] ##more than 1 X chromosome, but all homozygous |
|
1055 |
+ ##How to quickly generate Xstar, Xstar = diag(W) %*% X |
|
1056 |
+ Xstar <- apply(W, 1, generateX, X) |
|
1057 |
+ IXTX <- apply(Xstar, 2, generateIXTX, nrow=nrow(X)) |
|
1058 |
+ |
|
1059 |
+ betahat <- matrix(NA, 3, nrow(Ystar)) |
|
1060 |
+ ses <- matrix(NA, 3, nrow(Ystar)) |
|
1061 |
+ ##betahat <- matrix(NA, 2, nrow(Ystar)) |
|
1062 |
+ ##ses <- matrix(NA, 2, nrow(Ystar)) |
|
1063 |
+ for(i in 1:nrow(Ystar)){ |
|
1064 |
+ betahat[, i] <- crossprod(matrix(IXTX[, i], ncol(X), ncol(X)), crossprod(matrix(Xstar[, i], nrow=nrow(X)), Ystar[i, ])) |
|
1065 |
+ ssr <- sum((Ystar[i, ] - matrix(Xstar[, i], nrow(X), ncol(X)) %*% matrix(betahat[, i], ncol(X), 1))^2) |
|
1066 |
+ ses[, i] <- sqrt(diag(matrix(IXTX[, i], ncol(X), ncol(X)) * ssr)) |
|
1067 |
+ } |
|
1068 |
+ nu <- betahat[1, ] |
|
1069 |
+ phi <- betahat[2, ] |
|
1070 |
+ phi2 <- betahat[3, ] |
|
1071 |
+ return(list(nu, phi, phi2)) |
|
1072 |
+} |
|
1073 |
+ |
|
397 | 1074 |
|
398 | 1075 |
nuphiAllele <- function(object, allele, Ystar, W, tmp.objects, cnOptions){ |
399 | 1076 |
I <- isSnp(object) |
... | ... |
@@ -410,7 +1087,8 @@ nuphiAllele <- function(object, allele, Ystar, W, tmp.objects, cnOptions){ |
410 | 1087 |
Ns <- Ns[I, ] |
411 | 1088 |
|
412 | 1089 |
CHR <- unique(chromosome(object)) |
413 |
- batch <- unique(object$batch) |
|
1090 |
+ ##batch <- unique(object$batch) |
|
1091 |
+ batch <- unique(batch(object)) |
|
414 | 1092 |
nuA <- getParam(object, "nuA", batch) |
415 | 1093 |
nuB <- getParam(object, "nuB", batch) |
416 | 1094 |
nuA.se <- getParam(object, "nuA.se", batch) |
... | ... |
@@ -518,57 +1196,6 @@ predictGender <- function(res, cdfName="genomewidesnp6", SNRMin=5){ |
518 | 1196 |
return(gender) |
519 | 1197 |
} |
520 | 1198 |
|
521 |
-##combineIntensities <- function(res, NP=NULL, callSet){ |
|
522 |
-## rownames(res$B) <- rownames(res$A) <- res$gns |
|
523 |
-## colnames(res$B) <- colnames(res$A) <- res$sns |
|
524 |
-## if(!is.null(NP)){ |
|
525 |
-## blank <- matrix(NA, nrow(NP), ncol(NP)) |
|
526 |
-## dimnames(blank) <- dimnames(NP) |
|
527 |
-## A <- rbind(res$A, NP) |
|
528 |
-## B <- rbind(res$B, blank) |
|
529 |
-## } else { |
|
530 |
-## A <- res$A |
|
531 |
-## B <- res$B |
|
532 |
-## } |
|
533 |
-## dimnames(B) <- dimnames(A) |
|
534 |
-## index.snps <- match(res$gns, rownames(A)) |
|
535 |
-## callsConfs <- calls <- matrix(NA, nrow(A), ncol(A), dimnames=dimnames(A)) |
|
536 |
-## |
|
537 |
-## calls[index.snps, ] <- calls(callSet) |
|
538 |
-## callsConfs[index.snps, ] <- assayData(callSet)[["callProbability"]] |
|
539 |
-## fd <- data.frame(matrix(NA, nrow(calls), length(fvarLabels(callSet)))) |
|
540 |
-## fd[index.snps, ] <- fData(callSet) |
|
541 |
-## rownames(fd) <- rownames(A) |
|
542 |
-## colnames(fd) <- fvarLabels(callSet) |
|
543 |
-## fD <- new("AnnotatedDataFrame", |
|
544 |
-## data=data.frame(fd), |
|
545 |
-## varMetadata=data.frame(labelDescription=colnames(fd), row.names=colnames(fd))) |
|
546 |
-## superSet <- new("CNSet", |
|
547 |
-## CA=matrix(NA, nrow(A), ncol(A), dimnames=dimnames(A)), |
|
548 |
-## CB=matrix(NA, nrow(A), ncol(A), dimnames=dimnames(A)), |
|
549 |
-## alleleA=A, |
|
550 |
-## alleleB=B, |
|
551 |
-## call=calls, |
|
552 |
-## callProbability=callsConfs, |
|
553 |
-## featureData=fD, |
|
554 |
-## phenoData=phenoData(callSet), |
|
555 |
-## experimentData=experimentData(callSet), |
|
556 |
-## protocolData=protocolData(callSet), |
|
557 |
-## annotation=annotation(callSet)) |
|
558 |
-## return(superSet) |
|
559 |
-##} |
|
560 |
-## |
|
561 |
-harmonizeDimnamesTo <- function(object1, object2){ |
|
562 |
- #object2 should be a subset of object 1 |
|
563 |
- object2 <- object2[featureNames(object2) %in% featureNames(object1), ] |
|
564 |
- object1 <- object1[match(featureNames(object2), featureNames(object1)), ] |
|
565 |
- object1 <- object1[, match(sampleNames(object2), sampleNames(object1))] |
|
566 |
- stopifnot(all.equal(featureNames(object1), featureNames(object2))) |
|
567 |
- stopifnot(all.equal(sampleNames(object1), sampleNames(object2))) |
|
568 |
- return(object1) |
|
569 |
-} |
|
570 |
- |
|
571 |
- |
|
572 | 1199 |
|
573 | 1200 |
crlmmCopynumber <- function(object, |
574 | 1201 |
chromosome=1:23, |
... | ... |
@@ -588,12 +1215,22 @@ crlmmCopynumber <- function(object, |
588 | 1215 |
MIN.PHI=2^3, |
589 | 1216 |
THR.NU.PHI=TRUE, |
590 | 1217 |
thresholdCopynumber=TRUE){ |
591 |
- stopifnot("batch" %in% varLabels(object)) |
|
1218 |
+ if(isPackageLoaded("ff")){ |
|
1219 |
+ open(object) |
|
1220 |
+ open(object$SKW) |
|
1221 |
+ open(object$SNR) |
|
1222 |
+ } |
|
1223 |
+ stopifnot("batch" %in% varLabels(protocolData(object))) |
|
592 | 1224 |
stopifnot("chromosome" %in% fvarLabels(object)) |
593 | 1225 |
stopifnot("position" %in% fvarLabels(object)) |
594 | 1226 |
stopifnot("isSnp" %in% fvarLabels(object)) |
595 |
- batch <- object$batch |
|
596 |
- batches <- split((1:ncol(object))[object$SNR > SNRMin], batch[object$SNR > SNRMin]) |
|
1227 |
+ ##batch <- object$batch |
|
1228 |
+ batch <- batch(object) |
|
1229 |
+ if(isPackageLoaded("ff")) { |
|
1230 |
+ open(object$SNR) |
|
1231 |
+ SNR <- object$SNR[, ] |
|
1232 |
+ } else SNR <- object$SNR |
|
1233 |
+ batches <- split((1:ncol(object))[SNR > SNRMin], batch[SNR > SNRMin]) |
|
597 | 1234 |
if(any(sapply(batches, length) < MIN.SAMPLES)) message("Excluding batches with fewer than ", MIN.SAMPLES, " samples") |
598 | 1235 |
batches <- batches[sapply(batches, length) >= MIN.SAMPLES] |
599 | 1236 |
if(missing(which.batches)) which.batches <- seq(along=batches) |
... | ... |
@@ -618,7 +1255,7 @@ crlmmCopynumber <- function(object, |
618 | 1255 |
annotation=annotation(object)) |
619 | 1256 |
featureData(tmp) <- addFeatureAnnotation(tmp) |
620 | 1257 |
featureData(tmp) <- lm.parameters(tmp, batch=unique(batch[j])) |
621 |
- tmp$batch <- batch[j] |
|
1258 |
+ tmp$batch <- batch(object)[j] |
|
622 | 1259 |
tmp <- computeCopynumber(tmp, |
623 | 1260 |
MIN.OBS=MIN.OBS, |
624 | 1261 |
DF.PRIOR=DF.PRIOR, |
... | ... |
@@ -640,10 +1277,19 @@ crlmmCopynumber <- function(object, |
640 | 1277 |
dimnames=list(featureNames(tmp), sampleNames(tmp))) |
641 | 1278 |
CA(object)[row.index, j] <- CA(tmp) |
642 | 1279 |
CB(object)[row.index, j] <- CB(tmp) |
643 |
- labels.asis <- fvarLabels(tmp) |
|
644 |
- labels.asis <- gsub(paste("_", unique(tmp$batch), sep=""), paste(".", ii, sep=""), labels.asis) |
|
645 |
- k <- match(labels.asis, colnames(lM(object))) |
|
646 |
- lM(object)[row.index, k] <- fData(tmp) |
|
1280 |
+ ##ad-hocery |
|
1281 |
+ batchName <- unique(batch(object)[j]) |
|
1282 |
+ fvarLabels(tmp)[15:17] <- paste(c("corrAB", "corrBB", "corrAA"), batchName, sep=".") |
|
1283 |
+ fvarLabels(tmp)[13:14] <- paste(c("phiPrimeA", "phiPrimeB"), batchName, sep=".") |
|
1284 |
+ fvarLabels(tmp) <- gsub("_", ".", fvarLabels(tmp)) |
|
1285 |
+ ##fvarLabels(tmp) <- gsub("\\.[1-9]", "", fvarLabels(tmp)) |
|
1286 |
+ fData(tmp) <- fData(tmp)[, fvarLabels(tmp) %in% names(physical(lM(object)))] |
|
1287 |
+ jj <- match(fvarLabels(tmp), names(lM(object))) |
|
1288 |
+ lM(object)[row.index, jj] <- fData(tmp) |
|
1289 |
+ ##labels.asis <- fvarLabels(tmp) |
|
1290 |
+ ##labels.asis <- gsub(paste("_", unique(tmp$batch), sep=""), paste(".", ii, sep=""), labels.asis) |
|
1291 |
+ ##k <- match(labels.asis, colnames(lM(object))) |
|
1292 |
+ ##lM(object)[row.index, k] <- fData(tmp) |
|
647 | 1293 |
rm(tmp); gc() |
648 | 1294 |
ii <- ii+1 |
649 | 1295 |
} |
... | ... |
@@ -651,6 +1297,705 @@ crlmmCopynumber <- function(object, |
651 | 1297 |
return(object) |
652 | 1298 |
} |
653 | 1299 |
|
1300 |
+crlmmCopynumber2 <- function(object, |
|
1301 |
+ which.batches, |
|
1302 |
+ MIN.SAMPLES=10, |
|
1303 |
+ SNRMin=5, |
|
1304 |
+ MIN.OBS=1, |
|
1305 |
+ DF.PRIOR=50, |
|
1306 |
+ bias.adj=FALSE, |
|
1307 |
+ prior.prob=rep(1/4,4), |
|
1308 |
+ seed=1, |
|
1309 |
+ verbose=TRUE, |
|
1310 |
+ GT.CONF.THR=0.99, |
|
1311 |
+ PHI.THR=2^6, |
|
1312 |
+ nHOM.THR=5, |
|
1313 |
+ MIN.NU=2^3, |
|
1314 |
+ MIN.PHI=2^3, |
|
1315 |
+ THR.NU.PHI=TRUE, |
|
1316 |
+ thresholdCopynumber=TRUE, |
|
1317 |
+ load.it=TRUE){ |
|
1318 |
+ stopifnot("batch" %in% varLabels(protocolData(object))) |
|
1319 |
+ stopifnot("chromosome" %in% fvarLabels(object)) |
|
1320 |
+ stopifnot("position" %in% fvarLabels(object)) |
|
1321 |
+ stopifnot("isSnp" %in% fvarLabels(object)) |
|
1322 |
+ batch <- batch(object) |
|
1323 |
+ XIndex.snps <- (1:nrow(object))[chromosome(object) == 23 & isSnp(object) & !is.na(chromosome(object))] |
|
1324 |
+ ##YIndex.snps <- (1:nrow(object))[chromosome(object) == 24 & isSnp(object)] |
|
1325 |
+ XIndex.nps <- (1:nrow(object))[chromosome(object) == 23 & !isSnp(object) & !is.na(chromosome(object))] |
|
1326 |
+ autosomeIndex.snps <- (1:nrow(object))[chromosome(object) < 23 & isSnp(object) & !is.na(chromosome(object))] |
|
1327 |
+ autosomeIndex.nps <- (1:nrow(object))[chromosome(object) < 23 & !isSnp(object) & !is.na(chromosome(object))] |
|
1328 |
+ ##Indexes <- list(autosomeIndex, XIndex, YIndex) |
|
1329 |
+ ##snpBatches <- splitIndicesByLength(1:nrow(object), ocProbesets()) |
|
1330 |
+ snpBatches <- splitIndicesByLength(autosomeIndex.snps, ocProbesets()) |
|
1331 |
+ ## Do chromosome X in batches |
|
1332 |
+ Ns <- initializeBigMatrix("Ns", nrow(object), 5) |
|
1333 |
+ colnames(Ns) <- c("A", "B", "AA", "AB", "BB") |
|
1334 |
+ if(!file.exists(file.path(ldPath(), "normal.rda"))){ |
|
1335 |
+ normal <- initializeBigMatrix("normal", nrow(object), ncol(object), vmode="integer") |
|
1336 |
+ normal[,] <- 1L |
|
1337 |
+ save(normal, file=file.path(ldPath(), "normal.rda")) |
|
1338 |
+ } else load(file.path(ldPath(), "normal.rda")) |
|
1339 |
+ if(!file.exists(file.path(ldPath(), "snpflags.rda"))){ |
|
1340 |
+ snpflags <- initializeBigMatrix("snpflags", nrow(object), length(unique(batch(object))), vmode="integer") |
|
1341 |
+ snpflags[,] <- 0L |
|
1342 |
+ save(snpflags, file=file.path(ldPath(), "snpflags.rda")) |
|
1343 |
+ } else{ |
|
1344 |
+ load(file.path(ldPath(), "snpflags.rda")) |
|
1345 |
+ |
|
1346 |
+ } |
|
1347 |
+ if(verbose) message("Estimating allele-specific copy number at autosomal SNPs") |
|
1348 |
+ ocLapply(seq(along=snpBatches), |
|
1349 |
+ fit.lm1, |
|
1350 |
+ autosomeIndex=autosomeIndex.snps, |
|
1351 |
+ object=object, |
|
1352 |
+ Ns=Ns, |
|
1353 |
+ normal=normal, |
|
1354 |
+ snpflags=snpflags, |
|
1355 |
+ snpBatches=snpBatches, |
|
1356 |
+ batchSize=ocProbesets(), |
|
1357 |
+ SNRMin=SNRMin, |
|
1358 |
+ MIN.SAMPLES=MIN.SAMPLES, |
|
1359 |
+ MIN.OBS=MIN.OBS, |
|
1360 |
+ DF=DF.PRIOR, |
|
1361 |
+ GT.CONF.THR=GT.CONF.THR, |
|
1362 |
+ THR.NU.PHI=THR.NU.PHI, |
|
1363 |
+ MIN.NU=MIN.NU, |
|
1364 |
+ MIN.PHI=MIN.PHI, |
|
1365 |
+ verbose=verbose, |
|
1366 |
+ neededPkgs="crlmm") |
|
1367 |
+ ## autosomal NPs |
|
1368 |
+ snpBatches <- splitIndicesByLength(autosomeIndex.nps, ocProbesets()) |
|
1369 |
+ if(verbose) message("Estimating total copy number at nonpolymorphic loci") |
|
1370 |
+ ocLapply(seq(along=snpBatches), |
|
1371 |
+ fit.lm2, |
|
1372 |
+ autosomeIndex=autosomeIndex.nps, |
|
1373 |
+ object=object, |
|
1374 |
+ Ns=Ns, |
|
1375 |
+ normal=normal, |
|
1376 |
+ snpflags=snpflags, |
|
1377 |
+ snpBatches=snpBatches, |
|
1378 |
+ batchSize=ocProbesets(), |
|
1379 |
+ SNRMin=SNRMin, |
|
1380 |
+ MIN.SAMPLES=MIN.SAMPLES, |
|
1381 |
+ MIN.OBS=MIN.OBS, |
|
1382 |
+ DF=DF.PRIOR, |
|
1383 |
+ GT.CONF.THR=GT.CONF.THR, |
|
1384 |
+ THR.NU.PHI=THR.NU.PHI, |
|
1385 |
+ MIN.NU=MIN.NU, |
|
1386 |
+ MIN.PHI=MIN.PHI, |
|
1387 |
+ verbose=verbose, |
|
1388 |
+ neededPkgs="crlmm") |
|
1389 |
+ snpBatches <- splitIndicesByLength(XIndex.snps, ocProbesets()) |
|
1390 |
+ if(verbose) message("Estimating total copy number at polymorphic loci on chromosome X") |
|
1391 |
+ ocLapply(seq(along=snpBatches), |
|
1392 |
+ fit.lm3, |
|
1393 |
+ autosomeIndex=XIndex.snps, |
|
1394 |
+ object=object, |
|
1395 |
+ Ns=Ns, |
|
1396 |
+ normal=normal, |
|
1397 |
+ snpflags=snpflags, |
|
1398 |
+ snpBatches=snpBatches, |
|
1399 |
+ batchSize=ocProbesets(), |
|
1400 |
+ SNRMin=SNRMin, |
|
1401 |
+ MIN.SAMPLES=MIN.SAMPLES, |
|
1402 |
+ MIN.OBS=MIN.OBS, |
|
1403 |
+ DF=DF.PRIOR, |
|
1404 |
+ GT.CONF.THR=GT.CONF.THR, |
|
1405 |
+ THR.NU.PHI=THR.NU.PHI, |
|
1406 |
+ MIN.NU=MIN.NU, |
|
1407 |
+ MIN.PHI=MIN.PHI, |
|
1408 |
+ verbose=verbose, |
|
1409 |
+ neededPkgs="crlmm") |
|
1410 |
+ if(verbose) message("Estimating total copy number for nonpolymorphic loci on chromosome X") |
|
1411 |
+ snpBatches <- splitIndicesByLength(XIndex.nps, ocProbesets()) |
|
1412 |
+ if(verbose) message("Estimating total copy number at nonpolymorphic loci on chromosome X") |
|
1413 |
+ tmp <- ocLapply(seq(along=snpBatches), |
|
1414 |
+ fit.lm4, |
|
1415 |
+ XIndex=XIndex.nps, |
|
1416 |
+ object=object, |
|
1417 |
+ Ns=Ns, |
|
1418 |
+ normal=normal, |
|
1419 |
+ snpflags=snpflags, |
|
1420 |
+ snpBatches=snpBatches, |
|
1421 |
+ batchSize=ocProbesets(), |
|
1422 |
+ SNRMin=SNRMin, |
|
1423 |
+ MIN.SAMPLES=MIN.SAMPLES, |
|
1424 |
+ MIN.OBS=MIN.OBS, |
|
1425 |
+ DF=DF.PRIOR, |
|
1426 |
+ GT.CONF.THR=GT.CONF.THR, |
|
1427 |
+ THR.NU.PHI=THR.NU.PHI, |
|
1428 |
+ MIN.NU=MIN.NU, |
|
1429 |
+ MIN.PHI=MIN.PHI, |
|
1430 |
+ verbose=verbose, |
|
1431 |
+ neededPkgs="crlmm") |
|
1432 |
+ return(object) |
|
1433 |
+} |
|
1434 |
+ |
|
1435 |
+fit.lm1 <- function(idxBatch, |
|
1436 |
+ snpBatches, |
|
1437 |
+ autosomeIndex, |
|
1438 |
+ object, |
|
1439 |
+ Ns, |
|
1440 |
+ normal, |
|
1441 |
+ snpflags, |
|
1442 |
+ batchSize, |
|
1443 |
+ SNRMin, |
|
1444 |
+ MIN.SAMPLES, |
|
1445 |
+ MIN.OBS, |
|
1446 |
+ DF.PRIOR, |
|
1447 |
+ GT.CONF.THR, |
|
1448 |
+ THR.NU.PHI, |
|
1449 |
+ MIN.NU, |
|
1450 |
+ MIN.PHI, |
|
1451 |
+ verbose, ...){ |
|
1452 |
+ ## which.batches, ...){ |
|
1453 |
+ if(verbose) message("Probe batch ", idxBatch, " of ", length(snpBatches)) |
|
1454 |
+ open(object) |
|
1455 |
+ open(snpflags) |
|
1456 |
+ open(normal) |
|
1457 |
+ snps <- snpBatches[[idxBatch]] |
|
1458 |
+ batches <- split(seq(along=batch(object)), batch(object)) |
|
1459 |
+ batches <- batches[sapply(batches, length) >= MIN.SAMPLES] |
|
1460 |
+ corr <- corrA.BB <- corrB.AA <- sig2B <- sig2A <- tau2B <- tau2A <- matrix(NA, length(snps), length(unique(batches))) |
|
1461 |
+ flags <- nuA <- nuB <- phiA <- phiB <- corr |
|
1462 |
+ cB <- cA <- matrix(NA, length(snps), ncol(object)) |
|
1463 |
+ for(k in batches){ |
|
1464 |
+ G <- calls(object)[snps, k] |
|
1465 |
+ NORM <- normal[snps, k] |
|
1466 |
+ xx <- snpCallProbability(object)[snps, k] |
|
1467 |
+ highConf <- (1-exp(-xx/1000)) > GT.CONF.THR |
|
1468 |
+ G <- G*highConf*NORM |
|
1469 |
+ A <- A(object)[snps, k] |
|
1470 |
+ B <- B(object)[snps, k] |
|
1471 |
+ ##index <- GT.B <- GT.A <- vector("list", 3) |
|
1472 |
+ ##names(index) <- names(GT.B) <- names(GT.A) <- c("AA", "AB", "BB") |
|
1473 |
+ Ns <- applyByGenotype(matrix(1, nrow(G), ncol(G)), rowSums, G) |
|
1474 |
+ muA <- applyByGenotype(A, rowMedians, G) |
|
1475 |
+ muB <- applyByGenotype(B, rowMedians, G) |
|
1476 |
+ vA <- applyByGenotype(A, rowMAD, G) |
|
1477 |
+ vB <- applyByGenotype(B, rowMAD, G) |
|
1478 |
+ vA <- shrink(vA, Ns, DF.PRIOR) |
|
1479 |
+ vB <- shrink(vB, Ns, DF.PRIOR) |
|
1480 |
+ ##location and scale |
|
1481 |
+ J <- match(unique(batch(object)[k]), unique(batch(object))) |
|
1482 |
+ ##background variance for alleleA |
|
1483 |
+ taus <- applyByGenotype(log2(A), rowMAD, G)^2 |
|
1484 |
+ tau2A[, J] <- shrink(taus[, 3, drop=FALSE], Ns[, 3], DF.PRIOR) |
|
1485 |
+ sig2A[, J] <- shrink(taus[, 1, drop=FALSE], Ns[, 1], DF.PRIOR) |
|
1486 |
+ taus <- applyByGenotype(log2(B), rowMAD, G)^2 |
|
1487 |
+ tau2B[, J] <- shrink(taus[, 3, drop=FALSE], Ns[, 1], DF.PRIOR) |
|
1488 |
+ sig2B[, J] <- shrink(taus[, 1, drop=FALSE], Ns[, 3], DF.PRIOR) |
|
1489 |
+ |
|
1490 |
+ corr[, J] <- corByGenotype(A=A, B=B, G=G, Ns=Ns, which.cluster=2, DF.PRIOR) |
|
1491 |
+ corrB.AA[, J] <- corByGenotype(A=A, B=B, G=G, Ns=Ns, which.cluster=1, DF.PRIOR) |
|
1492 |
+ corrA.BB[, J] <- corByGenotype(A=A, B=B, G=G, Ns=Ns, which.cluster=3, DF.PRIOR) |
|
1493 |
+ ##formerly oneBatch()... |
|
1494 |
+ ##--------------------------------------------------------------------------- |
|
1495 |
+ ## Impute sufficient statistics for unobserved genotypes (plate-specific) |
|
1496 |
+ ##--------------------------------------------------------------------------- |
|
1497 |
+ index <- apply(Ns, 2, function(x, MIN.OBS) which(x > MIN.OBS), MIN.OBS) |
|
1498 |
+ correct.orderA <- muA[, 1] > muA[, 3] |
|
1499 |
+ correct.orderB <- muB[, 3] > muB[, 1] |
|
1500 |
+ index.complete <- intersect(which(correct.orderA & correct.orderB), intersect(index[[1]], intersect(index[[2]], index[[3]]))) |
|
1501 |
+ size <- min(5000, length(index.complete)) |
|
1502 |
+ if(size == 5000) index.complete <- sample(index.complete, 5000, replace=TRUE) |
|
1503 |
+ if(length(index.complete) < 200){ |
|
1504 |
+ stop("fewer than 200 snps pass criteria for predicting the sufficient statistics") |
|
1505 |
+ } |
|
1506 |
+ index <- vector("list", 3) |
|
1507 |
+ index[[1]] <- which(Ns[, 1] == 0 & (Ns[, 2] >= MIN.OBS & Ns[, 3] >= MIN.OBS)) |
|
1508 |
+ index[[2]] <- which(Ns[, 2] == 0 & (Ns[, 1] >= MIN.OBS & Ns[, 3] >= MIN.OBS)) |
|
1509 |
+ index[[3]] <- which(Ns[, 3] == 0 & (Ns[, 2] >= MIN.OBS & Ns[, 1] >= MIN.OBS)) |
|
1510 |
+ res <- imputeCenter(muA, muB, index.complete, index) |
|
1511 |
+ muA <- res[[1]] |
|
1512 |
+ muB <- res[[2]] |
|
1513 |
+ |
|
1514 |
+ ## Monomorphic SNPs. Mixture model may be better |
|
1515 |
+ ## Improve estimation by borrowing strength across batch |
|
1516 |
+ noAA <- Ns[, 1] < MIN.OBS |
|
1517 |
+ noAB <- Ns[, 2] < MIN.OBS |
|
1518 |
+ noBB <- Ns[, 3] < MIN.OBS |
|
1519 |
+ index[[1]] <- noAA & noAB |
|
1520 |
+ index[[2]] <- noBB & noAB |
|
1521 |
+ index[[3]] <- noAA & noBB |
|
1522 |
+ cols <- c(3, 1, 2) |
|
1523 |
+ for(j in 1:3){ |
|
1524 |
+ if(sum(index[[j]]) == 0) next() |
|
1525 |
+ kk <- cols[j] |
|
1526 |
+ X <- cbind(1, muA[index.complete, kk], muB[index.complete, kk]) |
|
1527 |
+ Y <- cbind(muA[index.complete, -kk], |
|
1528 |
+ muB[index.complete, -kk]) |
|
1529 |
+ betahat <- solve(crossprod(X), crossprod(X,Y)) |
|
1530 |
+ X <- cbind(1, muA[index[[j]], kk], muB[index[[j]], kk]) |
|
1531 |
+ mus <- X %*% betahat |
|
1532 |
+ muA[index[[j]], -kk] <- mus[, 1:2] |
|
1533 |
+ muB[index[[j]], -kk] <- mus[, 3:4] |
|
1534 |
+ } |
|
1535 |
+ negA <- rowSums(muA < 0) > 0 |
|
1536 |
+ negB <- rowSums(muB < 0) > 0 |
|
1537 |
+ flags[, J] <- rowSums(Ns == 0) > 0 |
|
1538 |
+ ##flags[, J] <- index[[1]] | index[[2]] | index[[3]] | rowSums( |
|
1539 |
+ |
|
1540 |
+ ##formerly coefs() |
|
1541 |
+ Np <- Ns |
|
1542 |
+ Np[Np < 1] <- 1 |
|
1543 |
+ vA2 <- vA^2/Np |
|
1544 |
+ vB2 <- vB^2/Np |
|
1545 |
+ wA <- sqrt(1/vA2) |
|
1546 |
+ wB <- sqrt(1/vB2) |
|
1547 |
+ YA <- muA*wA |
|
1548 |
+ YB <- muB*wB |
|
1549 |
+ res <- nuphiAllele2(allele="A", Ystar=YA, W=wA, Ns=Ns) |
|
1550 |
+ nuA[, J] <- res[[1]] |
|
1551 |
+ phiA[, J] <- res[[2]] |
|
1552 |
+ res <- nuphiAllele2(allele="B", Ystar=YB, W=wB, Ns=Ns) |
|
1553 |
+ nuB[, J] <- res[[1]] |
|
1554 |
+ phiB[, J] <- res[[2]] |
|
1555 |
+ if(THR.NU.PHI){ |
|
1556 |
+ nuA[nuA[, J] < MIN.NU, J] <- MIN.NU |
|
1557 |
+ nuB[nuB[, J] < MIN.NU, J] <- MIN.NU |
|
1558 |
+ phiA[phiA[, J] < MIN.PHI, J] <- MIN.PHI |
|
1559 |
+ phiB[phiB[, J] < MIN.PHI, J] <- MIN.PHI |
|
1560 |
+ } |
|
1561 |
+ ## formerly polymorphic(): calculate copy number |
|
1562 |
+ cA[, k] <- matrix((1/phiA[, J]*(A-nuA[, J])), nrow(A), ncol(A)) |
|
1563 |
+ cB[, k] <- matrix((1/phiB[, J]*(B-nuB[, J])), nrow(B), ncol(B)) |
|
1564 |
+ } |
|
1565 |
+ cA <- matrix(as.integer(cA*100), nrow(cA), ncol(cA)) |
|
1566 |
+ cB <- matrix(as.integer(cB*100), nrow(cB), ncol(cB)) |
|
1567 |
+ CA(object)[snps, ] <- cA |
|
1568 |
+ CB(object)[snps, ] <- cB |
|
1569 |
+ snpflags[snps, ] <- flags |
|
1570 |
+ lapply(lM(object), open) |
|
1571 |
+ lM(object)$tau2A[snps, ] <- tau2A |
|
1572 |
+ lM(object)$tau2B[snps, ] <- tau2B |
|
1573 |
+ lM(object)$sig2A[snps, ] <- sig2A |
|
1574 |
+ lM(object)$sig2B[snps, ] <- sig2B |
|
1575 |
+ lM(object)$nuA[snps, ] <- nuA |
|
1576 |
+ lM(object)$nuB[snps, ] <- nuB |
|
1577 |
+ lM(object)$phiA[snps, ] <- phiA |
|
1578 |
+ lM(object)$phiB[snps, ] <- phiB |
|
1579 |
+ lapply(assayData(object), close) |
|
1580 |
+ lapply(lM(object), close) |
|
1581 |
+ TRUE |
|
1582 |
+} |
|
1583 |
+ |
|
1584 |
+fit.lm2 <- function(idxBatch, |
|
1585 |
+ snpBatches, |
|
1586 |
+ autosomeIndex, |
|
1587 |
+ object, |
|
1588 |
+ Ns, |
|
1589 |
+ normal, |
|
1590 |
+ snpflags, |
|
1591 |
+ batchSize, |
|
1592 |
+ SNRMin, |
|
1593 |
+ MIN.SAMPLES, |
|
1594 |
+ MIN.OBS, |
|
1595 |
+ DF.PRIOR, |
|
1596 |
+ GT.CONF.THR, |
|
1597 |
+ THR.NU.PHI, |
|
1598 |
+ MIN.NU, |
|
1599 |
+ MIN.PHI, |
|
1600 |
+ verbose, ...){ |
|
1601 |
+ ## which.batches, ...){ |
|
1602 |
+ if(verbose) message("Probe batch ", idxBatch, " of ", length(snpBatches)) |
|
1603 |
+ open(object) |
|
1604 |
+ open(snpflags) |
|
1605 |
+ open(normal) |
|
1606 |
+ snps <- snpBatches[[idxBatch]] |
|
1607 |
+ batches <- split(seq(along=batch(object)), batch(object)) |
|
1608 |
+ batches <- batches[sapply(batches, length) >= MIN.SAMPLES] |
|
1609 |
+ cA <- matrix(NA, length(snps), ncol(object)) |
|
1610 |
+ ii <- isSnp(object) & chromosome(object) < 23 & !is.na(chromosome(object)) |
|
1611 |
+ flags <- snpflags[, ] |
|
1612 |
+ noflags <- rowSums(flags) == 0 |
|
1613 |
+ ## We do not want to write to discuss for each batch. More efficient to |
|
1614 |
+ ## write to disk after estimating these parameters for all batches. |
|
1615 |
+ nuA.np <- phiA.np <- sig2A.np <- matrix(NA, length(snps), length(unique(batches))) |
|
1616 |
+ ## for imputation, we need the corresponding parameters of the snps |
|
1617 |
+ nuA <- lM(object)$nuA |
|
1618 |
+ phiA <- lM(object)$phiA |
|
1619 |
+ nuB <- lM(object)$nuB |
|
1620 |
+ phiB <- lM(object)$phiB |
|
1621 |
+ snp.ind <- ii & noflags |
|
1622 |
+ NORM.np <- normal[snps, ] |
|
1623 |
+ for(k in batches){ |
|
1624 |
+ ##if(verbose) message("SNP batch ", ii, " of ", length(batches)) |
|
1625 |
+ J <- match(unique(batch(object)[k]), unique(batch(object))) |
|
1626 |
+ snp.index <- snp.ind & nuA[, J] > 20 & nuB[, J] > 20 & phiA[, J] > 20 & phiB[, J] > 20 |
|
1627 |
+ if(sum(snp.index) >= 5000){ |
|
1628 |
+ snp.index <- sample(which(snp.index), 5000) |
|
1629 |
+ } |
|
1630 |
+ phiA.snp <- phiA[snp.index, J] |
|
1631 |
+ phiB.snp <- phiB[snp.index, J] |
|
1632 |
+ A.snp <- A(object)[snp.index, k] |
|
1633 |
+ B.snp <- B(object)[snp.index, k] |
|
1634 |
+ NORM.snp <- normal[snp.index, k] |
|
1635 |
+ G <- calls(object)[snp.index, k] |
|
1636 |
+ xx <- snpCallProbability(object)[snp.index, k] |
|
1637 |
+ highConf <- (1-exp(-xx/1000)) > GT.CONF.THR |
|
1638 |
+ G <- G*highConf*NORM.snp[, k] |
|
1639 |
+ G[G==0] <- NA |
|
1640 |
+ ##nonpolymorphic |
|
1641 |
+ A.np <- A(object)[snps, k] |
|
1642 |
+ Ns <- applyByGenotype(matrix(1, nrow(G), ncol(G)), rowSums, G) |
|
1643 |
+ muA <- applyByGenotype(A.snp, rowMedians, G) |
|
1644 |
+ muB <- applyByGenotype(B.snp, rowMedians, G) |
|
1645 |
+ muA <- muA[, 1] |
|
1646 |
+ muB <- muB[, 3] |
|
1647 |
+ X <- cbind(1, log2(c(muA, muB))) |
|