git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@45165 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -26,7 +26,7 @@ importClassesFrom(oligoClasses, SnpSuperSet, AlleleSet) |
26 | 26 |
|
27 | 27 |
importMethodsFrom(oligoClasses, allele, calls, "calls<-", confs, |
28 | 28 |
"confs<-", cnConfidence, "cnConfidence<-", |
29 |
- isSnp, chromosome, position, CA, "CA<-", CB, "CB<-", A, B) |
|
29 |
+ isSnp, chromosome, position, CA, "CA<-", CB, "CB<-", A, B, "A<-", "B<-") |
|
30 | 30 |
|
31 | 31 |
|
32 | 32 |
importFrom(oligoClasses, chromosome2integer, celfileDate, list.celfiles) |
... | ... |
@@ -52,14 +52,6 @@ importFrom(mvtnorm, dmvnorm) |
52 | 52 |
|
53 | 53 |
importFrom(ellipse, ellipse) |
54 | 54 |
|
55 |
-exportMethods(A, B, copyNumber) |
|
55 |
+exportMethods(copyNumber) |
|
56 | 56 |
export(cnOptions, crlmm, crlmmIllumina, crlmmCopynumber, ellipse, readIdatFiles, snprma, getParam) |
57 |
- |
|
58 |
-##export(##viterbi.CNSet, ##move to VanillaICE |
|
59 |
-## combineIntensities, |
|
60 |
-## whichPlatform, |
|
61 |
-## isValidCdfName, |
|
62 |
-## crlmmWrapper, |
|
63 |
-## withinGenotypeMoments, |
|
64 |
-## locationAndScale, getParam.SnpSuperSet) |
|
65 |
-export(thresholdModelParams, computeCopynumber.CNSet, nuphiAllele, coefs, biasAdjNP) |
|
57 |
+export(genotype) |
... | ... |
@@ -52,21 +52,152 @@ getFeatureData.Affy <- function(cdfName, copynumber=FALSE){ |
52 | 52 |
##crlmmOpts$snpRange <- range(snpIndex) |
53 | 53 |
##crlmmOpts$npRange <- range(npIndex) |
54 | 54 |
} |
55 |
-construct <- function(filenames, cdfName){ |
|
55 |
+construct <- function(filenames, cdfName, copynumber=FALSE, sns){ |
|
56 |
+ if(missing(sns)) sns <- basename(filenames) |
|
56 | 57 |
protocolData <- getProtocolData.Affy(filenames) |
57 |
- M <- getFeatureData.Affy(cdfName) |
|
58 |
- dns <- list(rownames(M), basename(filenames)) |
|
59 |
- nr <- nrow(M) |
|
60 |
- alleleSet <- new("AffymetrixAlleleSet", |
|
61 |
- alleleA=initializeBigMatrix(dns), |
|
62 |
- alleleB=initializeBigMatrix(dns), |
|
63 |
- genomeAnnotation=M, |
|
64 |
- options=crlmmOptions(object), |
|
65 |
- annotation=annotation(object)) |
|
66 |
- protocolData(alleleSet) <- protocolData |
|
67 |
- sampleNames(alleleSet) <- basename(filenames) |
|
68 |
- featureNames(alleleSet) <- dns[[1]] |
|
69 |
- return(alleleSet) |
|
58 |
+ featureData <- getFeatureData.Affy(cdfName, copynumber=copynumber) |
|
59 |
+ nr <- nrow(featureData); nc <- length(filenames) |
|
60 |
+ callSet <- new("SnpSuperSet", |
|
61 |
+ alleleA=initializeBigMatrix(name="A", nr, nc), |
|
62 |
+ alleleB=initializeBigMatrix(name="B", nr, nc), |
|
63 |
+ call=initializeBigMatrix(name="call", nr, nc), |
|
64 |
+ callProbability=initializeBigMatrix(name="callPr", nr,nc), |
|
65 |
+ featureData=featureData, |
|
66 |
+ annotation=cdfName) |
|
67 |
+ protocolData(callSet) <- protocolData |
|
68 |
+ sampleNames(callSet) <- sns |
|
69 |
+ return(callSet) |
|
70 |
+} |
|
71 |
+setReplaceMethod("calls", "SnpSuperSet", function(object, value) assayDataElementReplace(object, "call", value)) |
|
72 |
+setReplaceMethod("confs", "SnpSuperSet", function(object, value) assayDataElementReplace(object, "callProbability", value)) |
|
73 |
+setMethod("confs", "SnpSuperSet", function(object) assayDataElement(object, "callProbability")) |
|
74 |
+crlmm.batch <- function(object, |
|
75 |
+ batchSize, |
|
76 |
+ mixtureParams, |
|
77 |
+ probs=rep(1/3,3), |
|
78 |
+ DF=6, |
|
79 |
+ SNRMin=5, |
|
80 |
+ recallMin=10, |
|
81 |
+ recallRegMin=1000, |
|
82 |
+ gender=NULL, |
|
83 |
+ desctrucitve=FALSE, |
|
84 |
+ verbose=TRUE, |
|
85 |
+ returnParams=FALSE, |
|
86 |
+ badSNP=0.7){ |
|
87 |
+ ##Call in batches to reduce ram |
|
88 |
+ BS <- batchSize |
|
89 |
+ nc <- ncol(object) |
|
90 |
+ if(nc > BS){ |
|
91 |
+ N <- ceiling(nc/BS) |
|
92 |
+ S <- ceiling(nc/N) |
|
93 |
+ colindex <- split(1:nc, rep(1:nc, each=S, length.out=nc)) |
|
94 |
+ } else { |
|
95 |
+ colindex <- list(1:nc) |
|
96 |
+ } |
|
97 |
+ if(length(colindex) > 1) |
|
98 |
+ message("Calling genotypes in batches of size ", length(colindex[[1]]), " to reduce required RAM") |
|
99 |
+ row.index <- which(isSnp(object)==1 | is.na(isSnp(object))) |
|
100 |
+ for(i in seq(along=colindex)){ |
|
101 |
+ col.index <- colindex[[i]] |
|
102 |
+ tmp <- crlmmGT(A=A(object)[row.index, col.index], |
|
103 |
+ B=B(object)[row.index, col.index], |
|
104 |
+ SNR=object$SNR[col.index], |
|
105 |
+ mixtureParams=mixtureParams[, col.index], |
|
106 |
+ cdfName=annotation(object), |
|
107 |
+ row.names=featureNames(object)[row.index], |
|
108 |
+ col.names=sampleNames(object)[col.index], |
|
109 |
+ probs=probs, |
|
110 |
+ DF=DF, |
|
111 |
+ SNRMin=SNRMin, |
|
112 |
+ recallMin=recallMin, |
|
113 |
+ recallRegMin=recallRegMin, |
|
114 |
+ gender=gender, |
|
115 |
+ desctrucitve=desctrucitve, |
|
116 |
+ verbose=verbose, |
|
117 |
+ returnParams=returnParams, |
|
118 |
+ badSNP=badSNP) |
|
119 |
+ ##ensure matrix is passed |
|
120 |
+ calls(object)[row.index, col.index] <- tmp[["calls"]] |
|
121 |
+ confs(object)[row.index, col.index] <- tmp[["confs"]] |
|
122 |
+ object$gender[col.index] <- tmp$gender |
|
123 |
+ rm(tmp); gc() |
|
124 |
+ } |
|
125 |
+ return(object) |
|
126 |
+} |
|
127 |
+ |
|
128 |
+genotype <- function(filenames, cdfName, mixtureSampleSize=10^5, |
|
129 |
+ fitMixture=TRUE, |
|
130 |
+ eps=0.1, verbose=TRUE, |
|
131 |
+ seed=1, sns, copynumber=FALSE, |
|
132 |
+ batchSize=1000, |
|
133 |
+ probs=rep(1/3, 3), |
|
134 |
+ DF=6, |
|
135 |
+ SNRMin=5, |
|
136 |
+ recallMin=10, |
|
137 |
+ recallRegMin=1000, |
|
138 |
+ gender=NULL, |
|
139 |
+ returnParams=TRUE, |
|
140 |
+ badSNP=0.7){ |
|
141 |
+ if(missing(cdfName)) stop("must specify cdfName") |
|
142 |
+ if(!isValidCdfName(cdfName)) stop("cdfName not valid. see validCdfNames") |
|
143 |
+ if(missing(sns)) sns <- basename(filenames) |
|
144 |
+ ## callSet contains potentially very big matrices |
|
145 |
+ ## More big matrices are created within snprma, that will then be removed. |
|
146 |
+ snprmaRes <- snprma(filenames=filenames, |
|
147 |
+ mixtureSampleSize=mixtureSampleSize, |
|
148 |
+ fitMixture=fitMixture, |
|
149 |
+ eps=eps, |
|
150 |
+ verbose=verbose, |
|
151 |
+ seed=seed, |
|
152 |
+ cdfName=cdfName, |
|
153 |
+ sns=sns) |
|
154 |
+ message("Initializing container for assay data elements alleleA, alleleB, call, callProbability") |
|
155 |
+ callSet <- construct(filenames=filenames, |
|
156 |
+ cdfName=cdfName, |
|
157 |
+ copynumber=copynumber, |
|
158 |
+ sns=sns) |
|
159 |
+ sampleStats <- data.frame(SKW=snprmaRes$SKW, |
|
160 |
+ SNR=snprmaRes$SNR) |
|
161 |
+ pD <- new("AnnotatedDataFrame", |
|
162 |
+ data=sampleStats, |
|
163 |
+ varMetadata=data.frame(labelDescription=colnames(sampleStats))) |
|
164 |
+ sampleNames(pD) <- sampleNames(callSet) |
|
165 |
+ phenoData(callSet) <- pD |
|
166 |
+ if(!copynumber){ |
|
167 |
+ ## A and B are the right size |
|
168 |
+ A(callSet) <- snprmaRes[["A"]] ## should work regardless of ff, matrix |
|
169 |
+ B(callSet) <- snprmaRes[["B"]] |
|
170 |
+ } else { |
|
171 |
+ ## A and B are not big enough to hold the nonpolymorphic markers |
|
172 |
+ index <- which(fData(callSet)[, "isSnp"] == 1) |
|
173 |
+ ## Inefficient. |
|
174 |
+ ## write one column at a time (????). (we don't want to bring in the whole matrix if its huge) |
|
175 |
+ if(isPackageLoaded("ff")){ |
|
176 |
+ for(j in 1:ncol(callSet)){ |
|
177 |
+ A(callSet)[index, j] <- snprmaRes[["A"]][, j] |
|
178 |
+ B(callSet)[index, j] <- snprmaRes[["B"]][, j] |
|
179 |
+ } |
|
180 |
+ delete(snprmaRes[["A"]])##removes the file on disk |
|
181 |
+ delete(snprmaRes[["B"]])##removes the file on disk |
|
182 |
+ } else { |
|
183 |
+ A(callSet)[index, ] <- snprmaRes[["A"]] |
|
184 |
+ B(callSet)[index, ] <- snprmaRes[["B"]] |
|
185 |
+ } |
|
186 |
+ } |
|
187 |
+ gc() |
|
188 |
+ callSet <- crlmm.batch(object=callSet, |
|
189 |
+ batchSize=batchSize, |
|
190 |
+ mixtureParams=snprmaRes$mixtureParams, |
|
191 |
+ probs=probs, |
|
192 |
+ DF=DF, |
|
193 |
+ SNRMin=SNRMin, |
|
194 |
+ recallMin=recallMin, |
|
195 |
+ recallRegMin=recallRegMin, |
|
196 |
+ gender=gender, |
|
197 |
+ verbose=verbose, |
|
198 |
+ returnParams=returnParams, |
|
199 |
+ badSNP=badSNP) |
|
200 |
+ return(callSet) |
|
70 | 201 |
} |
71 | 202 |
|
72 | 203 |
|
... | ... |
@@ -42,8 +42,11 @@ snprma <- function(filenames, mixtureSampleSize=10^5, fitMixture=FALSE, eps=0.1, |
42 | 42 |
##S will hold (A+B)/2 and M will hold A-B |
43 | 43 |
##NOTE: We actually dont need to save S. Only for pics etc... |
44 | 44 |
##f is the correction. we save to avoid recomputing |
45 |
- A <- matrix(as.integer(0), length(pnsa), length(filenames)) |
|
46 |
- B <- matrix(as.integer(0), length(pnsb), length(filenames)) |
|
45 |
+ ##**RS** |
|
46 |
+ ##A <- matrix(as.integer(0), length(pnsa), length(filenames)) |
|
47 |
+ ##B <- matrix(as.integer(0), length(pnsb), length(filenames)) |
|
48 |
+ A <- initializeBigMatrix(name="tmpA", length(pnsa), length(filenames)) |
|
49 |
+ B <- initializeBigMatrix(name="tmpB", length(pnsa), length(filenames)) |
|
47 | 50 |
|
48 | 51 |
if(verbose){ |
49 | 52 |
message("Processing ", length(filenames), " files.") |
... | ... |
@@ -155,7 +155,69 @@ celDates <- function(celfiles){ |
155 | 155 |
return(celdts) |
156 | 156 |
} |
157 | 157 |
|
158 |
+validCdfNames <- function(){ |
|
159 |
+ c("genomewidesnp6", |
|
160 |
+ "genomewidesnp5", |
|
161 |
+ "human370v1c", |
|
162 |
+ "human370quadv3c", |
|
163 |
+ "human550v3b", |
|
164 |
+ "human650v3a", |
|
165 |
+ "human610quadv1b", |
|
166 |
+ "human660quadv1a", |
|
167 |
+ "human1mduov3b", |
|
168 |
+ "humanomni1quadv1b") |
|
169 |
+} |
|
170 |
+isValidCdfName <- function(cdfName){ |
|
171 |
+ chipList <- validCdfNames() |
|
172 |
+ result <- cdfName %in% chipList |
|
173 |
+ if(!(result)){ |
|
174 |
+ warning("cdfName must be one of the following: ", |
|
175 |
+ chipList) |
|
176 |
+ } |
|
177 |
+ return(result) |
|
178 |
+} |
|
158 | 179 |
|
180 |
+isPackageLoaded <- function(pkg){ |
|
181 |
+ stopifnot(is.character(pkg)) |
|
182 |
+ pkg <- paste("package:", pkg, sep="") |
|
183 |
+ pkg %in% search() |
|
184 |
+} |
|
159 | 185 |
|
186 |
+initializeBigMatrix <- function(name, nr, nc, vmode="integer"){ |
|
187 |
+ if(isPackageLoaded("ff")){ |
|
188 |
+ if(prod(nr, nc) > 2^31){ |
|
189 |
+ ##Need multiple matrices |
|
190 |
+ ## -- use ffdf |
|
191 |
+ |
|
192 |
+ ## How many samples per ff object |
|
193 |
+ S <- floor(2^31/nr - 1) |
|
194 |
+ |
|
195 |
+ ## How many ff objects |
|
196 |
+ L <- ceiling(nc/S) |
|
197 |
+ name <- paste(name, 1:L, sep="_") |
|
198 |
+ |
|
199 |
+ results <- vector("list", L) |
|
200 |
+ ##resultsB <- vector("list", L) |
|
201 |
+ for(i in 1:(L-1)){ ## the Lth object may have fewer than nc columns |
|
202 |
+ results[[i]] <- createFF(name=name[i], |
|
203 |
+ dim=c(nr, S), |
|
204 |
+ vmode=vmode) |
|
205 |
+ } |
|
206 |
+ ##the Lth element |
|
207 |
+ leftOver <- nc - ((L-1)*S) |
|
208 |
+ results[[L]] <- createFF(name=name[L], |
|
209 |
+ dim=c(nr, leftOver), |
|
210 |
+ vmode=vmode) |
|
211 |
+ resultsff <- do.call(ffdf, results) |
|
212 |
+ ##dimnames(resultsff) <- dns |
|
213 |
+ } else { |
|
214 |
+ resultsff <- createFF(name=name, |
|
215 |
+ dim=c(nr, nc), |
|
216 |
+ vmode=vmode) |
|
217 |
+ } |
|
218 |
+ resultsff[,] <- NA |
|
219 |
+ } else resultsff <- matrix(NA, nr, nc) |
|
220 |
+ return(resultsff) |
|
221 |
+} |
|
160 | 222 |
setMethod("annotatedDataFrameFrom", "ff_matrix", Biobase:::annotatedDataFrameFromMatrix) |
161 | 223 |
setMethod("annotatedDataFrameFrom", "ffdf", Biobase:::annotatedDataFrameFromMatrix) |