git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@45171 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -1,7 +1,7 @@ |
1 | 1 |
Package: crlmm |
2 | 2 |
Type: Package |
3 | 3 |
Title: Genotype Calling (CRLMM) and Copy Number Analysis tool for Affymetrix SNP 5.0 and 6.0 and Illumina arrays. |
4 |
-Version: 1.5.33 |
|
4 |
+Version: 1.5.34 |
|
5 | 5 |
Date: 2010-02-05 |
6 | 6 |
Author: Rafael A Irizarry, Benilton S Carvalho <bcarvalh@jhsph.edu>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.edu.au> |
7 | 7 |
Maintainer: Benilton S Carvalho <bcarvalh@jhsph.edu>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.EDU.AU> |
... | ... |
@@ -47,6 +47,8 @@ getFeatureData.Affy <- function(cdfName, copynumber=FALSE){ |
47 | 47 |
M[index, "chromosome"] <- cnProbes[, grep("chr", colnames(cnProbes))] |
48 | 48 |
M[index, "isSnp"] <- 0L |
49 | 49 |
} |
50 |
+ ##A few of the snpProbes do not match -- I think it is chromosome Y. |
|
51 |
+ M[is.na(M[, "isSnp"]), "isSnp"] <- 1L |
|
50 | 52 |
return(new("AnnotatedDataFrame", data=data.frame(M))) |
51 | 53 |
##list(snpIndex, npIndex, fns) |
52 | 54 |
##crlmmOpts$snpRange <- range(snpIndex) |
... | ... |
@@ -68,63 +70,23 @@ construct <- function(filenames, cdfName, copynumber=FALSE, sns){ |
68 | 70 |
sampleNames(callSet) <- sns |
69 | 71 |
return(callSet) |
70 | 72 |
} |
73 |
+setMethod("close", "AlleleSet", function(con, ...){ |
|
74 |
+ object <- con |
|
75 |
+ names <- ls(assayData(object)) |
|
76 |
+ L <- length(names) |
|
77 |
+ for(i in 1:L) close(eval(substitute(assayData(object)[[NAME]], list(NAME=names[i])))) |
|
78 |
+ return() |
|
79 |
+}) |
|
80 |
+setMethod("open", "AlleleSet", function(con, ...){ |
|
81 |
+ object <- con |
|
82 |
+ names <- ls(assayData(object)) |
|
83 |
+ L <- length(names) |
|
84 |
+ for(i in 1:L) open(eval(substitute(assayData(object)[[NAME]], list(NAME=names[i])))) |
|
85 |
+ return() |
|
86 |
+}) |
|
71 | 87 |
setReplaceMethod("calls", "SnpSuperSet", function(object, value) assayDataElementReplace(object, "call", value)) |
72 | 88 |
setReplaceMethod("confs", "SnpSuperSet", function(object, value) assayDataElementReplace(object, "callProbability", value)) |
73 | 89 |
setMethod("confs", "SnpSuperSet", function(object) assayDataElement(object, "callProbability")) |
74 |
-crlmm.batch <- function(object, |
|
75 |
- batchSize, |
|
76 |
- mixtureParams, |
|
77 |
- probs=rep(1/3,3), |
|
78 |
- DF=6, |
|
79 |
- SNRMin=5, |
|
80 |
- recallMin=10, |
|
81 |
- recallRegMin=1000, |
|
82 |
- gender=NULL, |
|
83 |
- desctrucitve=FALSE, |
|
84 |
- verbose=TRUE, |
|
85 |
- returnParams=FALSE, |
|
86 |
- badSNP=0.7){ |
|
87 |
- ##Call in batches to reduce ram |
|
88 |
- BS <- batchSize |
|
89 |
- nc <- ncol(object) |
|
90 |
- if(nc > BS){ |
|
91 |
- N <- ceiling(nc/BS) |
|
92 |
- S <- ceiling(nc/N) |
|
93 |
- colindex <- split(1:nc, rep(1:nc, each=S, length.out=nc)) |
|
94 |
- } else { |
|
95 |
- colindex <- list(1:nc) |
|
96 |
- } |
|
97 |
- if(length(colindex) > 1) |
|
98 |
- message("Calling genotypes in batches of size ", length(colindex[[1]]), " to reduce required RAM") |
|
99 |
- row.index <- which(isSnp(object)==1 | is.na(isSnp(object))) |
|
100 |
- for(i in seq(along=colindex)){ |
|
101 |
- col.index <- colindex[[i]] |
|
102 |
- tmp <- crlmmGT(A=A(object)[row.index, col.index], |
|
103 |
- B=B(object)[row.index, col.index], |
|
104 |
- SNR=object$SNR[col.index], |
|
105 |
- mixtureParams=mixtureParams[, col.index], |
|
106 |
- cdfName=annotation(object), |
|
107 |
- row.names=featureNames(object)[row.index], |
|
108 |
- col.names=sampleNames(object)[col.index], |
|
109 |
- probs=probs, |
|
110 |
- DF=DF, |
|
111 |
- SNRMin=SNRMin, |
|
112 |
- recallMin=recallMin, |
|
113 |
- recallRegMin=recallRegMin, |
|
114 |
- gender=gender, |
|
115 |
- desctrucitve=desctrucitve, |
|
116 |
- verbose=verbose, |
|
117 |
- returnParams=returnParams, |
|
118 |
- badSNP=badSNP) |
|
119 |
- ##ensure matrix is passed |
|
120 |
- calls(object)[row.index, col.index] <- tmp[["calls"]] |
|
121 |
- confs(object)[row.index, col.index] <- tmp[["confs"]] |
|
122 |
- object$gender[col.index] <- tmp$gender |
|
123 |
- rm(tmp); gc() |
|
124 |
- } |
|
125 |
- return(object) |
|
126 |
-} |
|
127 |
- |
|
128 | 90 |
genotype <- function(filenames, cdfName, mixtureSampleSize=10^5, |
129 | 91 |
fitMixture=TRUE, |
130 | 92 |
eps=0.1, verbose=TRUE, |
... | ... |
@@ -143,60 +105,66 @@ genotype <- function(filenames, cdfName, mixtureSampleSize=10^5, |
143 | 105 |
if(missing(sns)) sns <- basename(filenames) |
144 | 106 |
## callSet contains potentially very big matrices |
145 | 107 |
## More big matrices are created within snprma, that will then be removed. |
146 |
- snprmaRes <- snprma(filenames=filenames, |
|
147 |
- mixtureSampleSize=mixtureSampleSize, |
|
148 |
- fitMixture=fitMixture, |
|
149 |
- eps=eps, |
|
150 |
- verbose=verbose, |
|
151 |
- seed=seed, |
|
152 |
- cdfName=cdfName, |
|
153 |
- sns=sns) |
|
154 |
- message("Initializing container for assay data elements alleleA, alleleB, call, callProbability") |
|
155 | 108 |
callSet <- construct(filenames=filenames, |
156 | 109 |
cdfName=cdfName, |
157 | 110 |
copynumber=copynumber, |
158 | 111 |
sns=sns) |
159 |
- sampleStats <- data.frame(SKW=snprmaRes$SKW, |
|
160 |
- SNR=snprmaRes$SNR) |
|
161 |
- pD <- new("AnnotatedDataFrame", |
|
162 |
- data=sampleStats, |
|
163 |
- varMetadata=data.frame(labelDescription=colnames(sampleStats))) |
|
164 |
- sampleNames(pD) <- sampleNames(callSet) |
|
165 |
- phenoData(callSet) <- pD |
|
166 |
- if(!copynumber){ |
|
167 |
- ## A and B are the right size |
|
168 |
- A(callSet) <- snprmaRes[["A"]] ## should work regardless of ff, matrix |
|
169 |
- B(callSet) <- snprmaRes[["B"]] |
|
170 |
- } else { |
|
171 |
- ## A and B are not big enough to hold the nonpolymorphic markers |
|
172 |
- index <- which(fData(callSet)[, "isSnp"] == 1) |
|
173 |
- ## Inefficient. |
|
174 |
- ## write one column at a time (????). (we don't want to bring in the whole matrix if its huge) |
|
175 |
- if(isPackageLoaded("ff")){ |
|
176 |
- for(j in 1:ncol(callSet)){ |
|
177 |
- A(callSet)[index, j] <- snprmaRes[["A"]][, j] |
|
178 |
- B(callSet)[index, j] <- snprmaRes[["B"]][, j] |
|
179 |
- } |
|
180 |
- delete(snprmaRes[["A"]])##removes the file on disk |
|
181 |
- delete(snprmaRes[["B"]])##removes the file on disk |
|
182 |
- } else { |
|
183 |
- A(callSet)[index, ] <- snprmaRes[["A"]] |
|
184 |
- B(callSet)[index, ] <- snprmaRes[["B"]] |
|
185 |
- } |
|
112 |
+ mixtureParams <- matrix(NA, 4, length(filenames)) |
|
113 |
+ snp.index <- which(isSnp(callSet)==1) |
|
114 |
+ batches <- splitIndicesByLength(1:ncol(callSet), ocSamples()) |
|
115 |
+ for(j in batches){ |
|
116 |
+ snprmaRes <- snprma(filenames=filenames[j], |
|
117 |
+ mixtureSampleSize=mixtureSampleSize, |
|
118 |
+ fitMixture=fitMixture, |
|
119 |
+ eps=eps, |
|
120 |
+ verbose=verbose, |
|
121 |
+ seed=seed, |
|
122 |
+ cdfName=cdfName, |
|
123 |
+ sns=sns) |
|
124 |
+ stopifnot(identical(featureNames(callSet)[snp.index], snprmaRes$gns)) |
|
125 |
+ message("Initializing container for assay data elements alleleA, alleleB, call, callProbability") |
|
126 |
+ pData(callSet)$SKW[j] <- snprmaRes$SKW |
|
127 |
+ pData(callSet)$SNR[j] <- snprmaRes$SNR |
|
128 |
+ A(callSet)[snp.index, j] <- snprmaRes[["A"]] |
|
129 |
+ B(callSet)[snp.index, j] <- snprmaRes[["B"]] |
|
130 |
+ mixtureParams[, j] <- snprmaRes$mixtureParams |
|
131 |
+ rm(snprmaRes); gc() |
|
132 |
+ } |
|
133 |
+ ## remove snprmaRes and garbage collect before quantile-normalizing NP probes |
|
134 |
+ if(copynumber){ |
|
135 |
+ np.index <- which(isSnp(callSet) == 0) |
|
136 |
+ cnrmaRes <- cnrma(filenames=filenames[j], |
|
137 |
+ cdfName=cdfName, |
|
138 |
+ row.names=featureNames(callSet)[np.index], |
|
139 |
+ sns=sns, |
|
140 |
+ seed=seed, |
|
141 |
+ verbose=verbose) |
|
142 |
+ stopifnot(identical(featureNames(callSet)[np.index], rownames(cnrmaRes))) |
|
143 |
+ A(callSet)[np.index, j] <- cnrmaRes |
|
144 |
+ rm(cnrmaRes); gc() |
|
186 | 145 |
} |
187 |
- gc() |
|
188 |
- callSet <- crlmm.batch(object=callSet, |
|
189 |
- batchSize=batchSize, |
|
190 |
- mixtureParams=snprmaRes$mixtureParams, |
|
191 |
- probs=probs, |
|
192 |
- DF=DF, |
|
193 |
- SNRMin=SNRMin, |
|
194 |
- recallMin=recallMin, |
|
195 |
- recallRegMin=recallRegMin, |
|
196 |
- gender=gender, |
|
197 |
- verbose=verbose, |
|
198 |
- returnParams=returnParams, |
|
199 |
- badSNP=badSNP) |
|
146 |
+ for(j in batches){ |
|
147 |
+ tmp <- crlmmGT(A=A(callSet)[snp.index, j], |
|
148 |
+ B=B(callSet)[snp.index, j], |
|
149 |
+ SNR=callSet$SNR[j], |
|
150 |
+ mixtureParams=mixtureParams[, j], |
|
151 |
+ cdfName=annotation(callSet), |
|
152 |
+ row.names=featureNames(callSet)[snp.index], |
|
153 |
+ col.names=sampleNames(callSet)[j], |
|
154 |
+ probs=probs, |
|
155 |
+ DF=DF, |
|
156 |
+ SNRMin=SNRMin, |
|
157 |
+ recallMin=recallMin, |
|
158 |
+ recallRegMin=recallRegMin, |
|
159 |
+ gender=gender, |
|
160 |
+ verbose=verbose, |
|
161 |
+ returnParams=returnParams, |
|
162 |
+ badSNP=badSNP) |
|
163 |
+ calls(callSet)[snp.index, j] <- tmp[["calls"]] |
|
164 |
+ confs(callSet)[snp.index, j] <- tmp[["confs"]] |
|
165 |
+ callSet$gender[j] <- tmp$gender |
|
166 |
+ rm(tmp); gc() |
|
167 |
+ } |
|
200 | 168 |
return(callSet) |
201 | 169 |
} |
202 | 170 |
|
... | ... |
@@ -671,26 +639,18 @@ whichPlatform <- function(cdfName){ |
671 | 639 |
|
672 | 640 |
|
673 | 641 |
# steps: quantile normalize hapmap: create 1m_reference_cn.rda object |
674 |
-cnrma <- function(filenames, cdfName, sns, seed=1, verbose=FALSE, outdir){ |
|
642 |
+cnrma <- function(filenames, cdfName, row.names, sns, seed=1, verbose=FALSE){ |
|
675 | 643 |
if(missing(cdfName)) stop("must specify cdfName") |
676 | 644 |
pkgname <- getCrlmmAnnotationName(cdfName) |
677 | 645 |
require(pkgname, character.only=TRUE) || stop("Package ", pkgname, " not available") |
678 | 646 |
if (missing(sns)) sns <- basename(filenames) |
679 | 647 |
loader("npProbesFid.rda", .crlmmPkgEnv, pkgname) |
680 | 648 |
fid <- getVarInEnv("npProbesFid") |
649 |
+ fid <- fid[match(row.names, names(fid))] |
|
681 | 650 |
set.seed(seed) |
682 | 651 |
idx2 <- sample(length(fid), 10^5) ##for skewness. no need to do everything |
683 | 652 |
SKW <- vector("numeric", length(filenames)) |
684 |
-## if(bigmemory){ |
|
685 |
-## NP <- filebacked.big.matrix(length(pnsa), length(filenames), |
|
686 |
-## type="integer", |
|
687 |
-## init=as.integer(0), |
|
688 |
-## backingpath=outdir, |
|
689 |
-## backingfile="NP.bin", |
|
690 |
-## descriptorfile="NP.desc") |
|
691 |
-## } else{ |
|
692 |
- NP <- matrix(NA, length(fid), length(filenames)) |
|
693 |
-## } |
|
653 |
+ NP <- matrix(NA, length(fid), length(filenames)) |
|
694 | 654 |
verbose <- TRUE |
695 | 655 |
if(verbose){ |
696 | 656 |
message("Processing ", length(filenames), " files.") |
... | ... |
@@ -716,9 +676,9 @@ cnrma <- function(filenames, cdfName, sns, seed=1, verbose=FALSE, outdir){ |
716 | 676 |
} |
717 | 677 |
dimnames(NP) <- list(names(fid), sns) |
718 | 678 |
##dimnames(NP) <- list(map[, "man_fsetid"], sns) |
719 |
- res3 <- list(NP=NP, SKW=SKW) |
|
679 |
+ ##res3 <- list(NP=NP, SKW=SKW) |
|
720 | 680 |
cat("\n") |
721 |
- return(res3) |
|
681 |
+ return(NP) |
|
722 | 682 |
} |
723 | 683 |
|
724 | 684 |
getFlags <- function(object, PHI.THR){ |
... | ... |
@@ -42,11 +42,8 @@ snprma <- function(filenames, mixtureSampleSize=10^5, fitMixture=FALSE, eps=0.1, |
42 | 42 |
##S will hold (A+B)/2 and M will hold A-B |
43 | 43 |
##NOTE: We actually dont need to save S. Only for pics etc... |
44 | 44 |
##f is the correction. we save to avoid recomputing |
45 |
- ##**RS** |
|
46 |
- ##A <- matrix(as.integer(0), length(pnsa), length(filenames)) |
|
47 |
- ##B <- matrix(as.integer(0), length(pnsb), length(filenames)) |
|
48 |
- A <- initializeBigMatrix(name="tmpA", length(pnsa), length(filenames)) |
|
49 |
- B <- initializeBigMatrix(name="tmpB", length(pnsa), length(filenames)) |
|
45 |
+ A <- matrix(as.integer(0), length(pnsa), length(filenames)) |
|
46 |
+ B <- matrix(as.integer(0), length(pnsb), length(filenames)) |
|
50 | 47 |
|
51 | 48 |
if(verbose){ |
52 | 49 |
message("Processing ", length(filenames), " files.") |