Browse code

copied processCEL to rsprocessCEL. Removed call to snprma2 from genotype function.

Goal is to initialize container for A and B of the right dimension and avoid unnecessary read / writes

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@52852 bc3139a8-67e5-0310-9ffc-ced21a209358

Rob Scharp authored on 16/02/2011 15:59:24
Showing2 changed files

... ...
@@ -73,8 +73,8 @@ construct <- function(filenames,
73 73
 	cnSet <- new("CNSet",
74 74
 		     alleleA=initializeBigMatrix(name="A", nr, nc),
75 75
 		     alleleB=initializeBigMatrix(name="B", nr, nc),
76
-##		     call=initializeBigMatrix(name="call", nr, nc),
77
-##		     callProbability=initializeBigMatrix(name="callPr", nr,nc),
76
+		     call=initializeBigMatrix(name="call", nr, nc),
77
+		     callProbability=initializeBigMatrix(name="callPr", nr,nc),
78 78
 		     annotation=cdfName,
79 79
 		     batch=batch)
80 80
 	sampleNames(cnSet) <- sns
... ...
@@ -116,27 +116,64 @@ genotype <- function(filenames,
116 116
 	if(missing(cdfName)) stop("must specify cdfName")
117 117
 	if(!isValidCdfName(cdfName)) stop("cdfName not valid.  see validCdfNames")
118 118
 	if(missing(sns)) sns <- basename(filenames)
119
-	callSet <- construct(filenames=filenames,
120
-			     cdfName=cdfName,
121
-			     copynumber=TRUE,
122
-			     sns=sns,
123
-			     verbose=verbose,
124
-			     batch=batch)
125
-	FUN <- ifelse(is.lds, "snprma2", "snprma")
126
-	snprmaFxn <- function(FUN,...){
127
-		switch(FUN,
128
-		       snprma=snprma(...),
129
-		       snprma2=snprma2(...))
130
-	}
131
-	snprmaRes <- snprmaFxn(FUN,
132
-			       filenames=filenames,
133
-			       mixtureSampleSize=mixtureSampleSize,
134
-			       fitMixture=TRUE,
135
-			       eps=eps,
136
-			       verbose=verbose,
137
-			       seed=seed,
138
-			       cdfName=cdfName,
139
-			       sns=sns)
119
+##	callSet <- construct(filenames=filenames,
120
+##			     cdfName=cdfName,
121
+##			     copynumber=TRUE,
122
+##			     sns=sns,
123
+##			     verbose=verbose,
124
+##			     batch=batch)
125
+##	FUN <- ifelse(is.lds, "snprma2", "snprma")
126
+##	snprmaFxn <- function(FUN,...){
127
+##		switch(FUN,
128
+##		       snprma=snprma(...),
129
+##		       snprma2=snprma2(...))
130
+##	}
131
+##	snprmaRes <- snprmaFxn(FUN,
132
+##			       filenames=filenames,
133
+##			       mixtureSampleSize=mixtureSampleSize,
134
+##			       fitMixture=TRUE,
135
+##			       eps=eps,
136
+##			       verbose=verbose,
137
+##			       seed=seed,
138
+##			       cdfName=cdfName,
139
+##			       sns=sns)
140
+	##---------------------------------------------------------------------------
141
+	##
142
+	## from snprma2.  Goal is to initialize A and B with appropriate dimension for snps+nps
143
+	##
144
+	##---------------------------------------------------------------------------
145
+	if (missing(sns)) sns <- basename(filenames)
146
+	if (missing(cdfName))
147
+		cdfName <- read.celfile.header(filenames[1])[["cdfName"]]
148
+	pkgname <- getCrlmmAnnotationName(cdfName)
149
+	stopifnot(require(pkgname, character.only=TRUE, quietly=!verbose))
150
+	if(verbose) message("Loading annotations and mixture model parameters.")
151
+	obj <- loader("preprocStuff.rda", .crlmmPkgEnv, pkgname)
152
+	pnsa <- getVarInEnv("pnsa")
153
+	pnsb <- getVarInEnv("pnsb")
154
+	gns <- getVarInEnv("gns")
155
+	rm(list=obj, envir=.crlmmPkgEnv)
156
+	rm(obj)
157
+	if(verbose) message("Initializing objects.")
158
+	mixtureParams <- initializeBigMatrix("crlmmMixt-", 4, length(filenames), "double")
159
+	SNR <- initializeBigVector("crlmmSNR-", length(filenames), "double")
160
+	SKW <- initializeBigVector("crlmmSKW-", length(filenames), "double")
161
+##	A <- initializeBigMatrix("crlmmA-", length(pnsa), length(filenames), "integer")
162
+##	B <- initializeBigMatrix("crlmmB-", length(pnsb), length(filenames), "integer")
163
+	featureData <- getFeatureData(cdfName, copynumber=TRUE)
164
+	nr <- nrow(featureData); nc <- length(sns)
165
+	A <- initializeBigMatrix("crlmmA-", nr, length(filenames), "integer")
166
+	B <- initializeBigMatrix("crlmmB-", nr, length(filenames), "integer")
167
+	rownames(A) <- rownames(B) <- featureNames(featureData)
168
+
169
+	sampleBatches <- splitIndicesByNode(seq(along=filenames))
170
+	if(verbose) message("Processing ", length(filenames), " files.")
171
+	ocLapply(sampleBatches, rsprocessCEL, filenames=filenames,
172
+		 fitMixture=fitMixture, A=A, B=B, SKW=SKW, SNR=SNR,
173
+		 mixtureParams=mixtureParams, eps=eps, seed=seed,
174
+		 mixtureSampleSize=mixtureSampleSize, pkgname=pkgname,
175
+		 neededPkgs=c("crlmm", pkgname))
176
+
140 177
 	gns <- snprmaRes[["gns"]]
141 178
 	snp.I <- isSnp(callSet)
142 179
 	is.snp <- which(snp.I)
... ...
@@ -242,6 +279,69 @@ genotype <- function(filenames,
242 279
 	return(callSet)
243 280
 }
244 281
 
282
+rsprocessCEL <- function(i, filenames, fitMixture, A, B, SKW, SNR,
283
+			 mixtureParams, eps, seed, mixtureSampleSize,
284
+			 pkgname){
285
+	obj1 <- loader("preprocStuff.rda", .crlmmPkgEnv, pkgname)
286
+	obj2 <- loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname)
287
+	obj3 <- loader("mixtureStuff.rda", .crlmmPkgEnv, pkgname)
288
+	autosomeIndex <- getVarInEnv("autosomeIndex")
289
+	pnsa <- getVarInEnv("pnsa")
290
+	pnsb <- getVarInEnv("pnsb")
291
+	fid <- getVarInEnv("fid")
292
+	reference <- getVarInEnv("reference")
293
+	aIndex <- getVarInEnv("aIndex")
294
+	bIndex <- getVarInEnv("bIndex")
295
+	SMEDIAN <- getVarInEnv("SMEDIAN")
296
+	theKnots <- getVarInEnv("theKnots")
297
+	gns <- getVarInEnv("gns")
298
+	rm(list=c(obj1, obj2, obj3), envir=.crlmmPkgEnv)
299
+	rm(obj1, obj2, obj3)
300
+
301
+	## for mixture
302
+	set.seed(seed)
303
+	idx <- sort(sample(autosomeIndex, mixtureSampleSize))
304
+	##for skewness. no need to do everything
305
+	idx2 <- sample(length(fid), 10^5)
306
+
307
+	open(A)
308
+	open(B)
309
+	open(SKW)
310
+	open(mixtureParams)
311
+	open(SNR)
312
+
313
+	for (k in i){
314
+		y <- as.matrix(read.celfile(filenames[k], intensity.means.only=TRUE)[["INTENSITY"]][["MEAN"]][fid])
315
+		x <- log2(y[idx2])
316
+		SKW[k] <- mean((x-mean(x))^3)/(sd(x)^3)
317
+		rm(x)
318
+		y <- normalize.quantiles.use.target(y, target=reference)
319
+		A[, k] <- intMedianSummaries(y[aIndex, 1, drop=FALSE], pnsa)
320
+		B[, k] <- intMedianSummaries(y[bIndex, 1, drop=FALSE], pnsb)
321
+		rm(y)
322
+		if(fitMixture){
323
+			S <- (log2(A[idx,k])+log2(B[idx, k]))/2 - SMEDIAN
324
+			M <- log2(A[idx, k])-log2(B[idx, k])
325
+			tmp <- fitAffySnpMixture56(S, M, theKnots, eps=eps)
326
+			rm(S, M)
327
+			mixtureParams[, k] <- tmp[["coef"]]
328
+			SNR[k] <- tmp[["medF1"]]^2/(tmp[["sigma1"]]^2+tmp[["sigma2"]]^2)
329
+			rm(tmp)
330
+		} else {
331
+			mixtureParams[, k] <- NA
332
+			SNR[k] <- NA
333
+		}
334
+	}
335
+	close(A)
336
+	close(B)
337
+	close(SKW)
338
+	close(mixtureParams)
339
+	close(SNR)
340
+	rm(list=ls())
341
+	gc(verbose=FALSE)
342
+	TRUE
343
+}
344
+
245 345
 genotype2 <- function(){
246 346
 	.Defunct(msg="The genotype2 function has been deprecated. The function genotype should be used instead.  genotype will support large data using ff provided that the ff package is loaded.")
247 347
 }
... ...
@@ -15,7 +15,7 @@ snprma <- function(filenames, mixtureSampleSize=10^5, fitMixture=FALSE,
15 15
     message(strwrap(msg))
16 16
     stop("Package ", pkgname, " could not be found.")
17 17
   }
18
-  
18
+
19 19
   if(verbose) message("Loading annotations and mixture model parameters.")
20 20
   loader("preprocStuff.rda", .crlmmPkgEnv, pkgname)
21 21
   loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname)
... ...
@@ -30,7 +30,7 @@ snprma <- function(filenames, mixtureSampleSize=10^5, fitMixture=FALSE,
30 30
   SMEDIAN <- getVarInEnv("SMEDIAN")
31 31
   theKnots <- getVarInEnv("theKnots")
32 32
   gns <- getVarInEnv("gns")
33
-  
33
+
34 34
   ##We will read each cel file, summarize, and run EM one by one
35 35
   ##We will save parameters of EM to use later
36 36
   mixtureParams <- matrix(0, 4, length(filenames))
... ...
@@ -39,7 +39,7 @@ snprma <- function(filenames, mixtureSampleSize=10^5, fitMixture=FALSE,
39 39
 ##   mixtureParams <- initializeBigMatrix("crlmmMixt-", 4, length(filenames))
40 40
 ##   SNR <- initializeBigVector("crlmmSNR-", length(filenames), "numeric")
41 41
 ##   SKW <- initializeBigVector("crlmmSKW-", length(filenames), "numeric")
42
-  
42
+
43 43
   ## This is the sample for the fitting of splines
44 44
   ## BC: I like better the idea of the user passing the seed,
45 45
   ##     because this might intefere with other analyses
... ...
@@ -51,15 +51,15 @@ snprma <- function(filenames, mixtureSampleSize=10^5, fitMixture=FALSE,
51 51
   ##f is the correction. we save to avoid recomputing
52 52
   A <- matrix(as.integer(0), length(pnsa), length(filenames))
53 53
   B <- matrix(as.integer(0), length(pnsb), length(filenames))
54
-  
54
+
55 55
   if(verbose){
56 56
     message("Processing ", length(filenames), " files.")
57 57
     pb <- txtProgressBar(min=0, max=length(filenames), style=3)
58 58
   }
59
-  
59
+
60 60
   ##for skewness. no need to do everything
61 61
   idx2 <- sample(length(fid), 10^5)
62
-  
62
+
63 63
   ##We start looping throug cel files
64 64
   for(i in seq(along=filenames)){
65 65
     y <- as.matrix(read.celfile(filenames[i], intensity.means.only=TRUE)[["INTENSITY"]][["MEAN"]][fid])
... ...
@@ -73,10 +73,10 @@ snprma <- function(filenames, mixtureSampleSize=10^5, fitMixture=FALSE,
73 73
     if(fitMixture){
74 74
       S <- (log2(A[idx, i])+log2(B[idx, i]))/2 - SMEDIAN
75 75
       M <- log2(A[idx, i])-log2(B[idx, i])
76
-      
76
+
77 77
       ##we need to test the choice of eps.. it is not the max diff between funcs
78 78
       tmp <- fitAffySnpMixture56(S, M, theKnots, eps=eps)
79
-      
79
+
80 80
       mixtureParams[, i] <- tmp[["coef"]]
81 81
       SNR[i] <- tmp[["medF1"]]^2/(tmp[["sigma1"]]^2+tmp[["sigma2"]]^2)
82 82
     }
... ...
@@ -97,12 +97,12 @@ fitAffySnpMixture56 <- function(S, M, knots, probs=rep(1/3, 3), eps=.01, maxit=1
97 97
   mus <- append(quantile(M, c(1, 5)/6, names=FALSE), 0, 1)
98 98
   sigmas <- rep(mad(c(M[M<mus[1]]-mus[1], M[M>mus[3]]-mus[3])), 3)
99 99
   sigmas[2] <- sigmas[2]/2
100
- 
100
+
101 101
   weights <- apply(cbind(mus, sigmas), 1, function(p) dnorm(M, p[1], p[2]))
102 102
   previousF1 <- -Inf
103 103
   change <- eps+1
104 104
   it <- 0
105
- 
105
+
106 106
   if(verbose) message("Max change must be under ", eps, ".")
107 107
   matS <- stupidSplineBasis(S, knots)
108 108
   while (change > eps & it < maxit){
... ...
@@ -112,19 +112,19 @@ fitAffySnpMixture56 <- function(S, M, knots, probs=rep(1/3, 3), eps=.01, maxit=1
112 112
     LogLik <- rowSums(z)
113 113
     z <- sweep(z, 1, LogLik, "/")
114 114
     probs <- colMeans(z)
115
- 
115
+
116 116
     ## M
117 117
     fit1 <- crossprod(chol2inv(chol(crossprod(sweep(matS, 1, z[, 1], FUN="*"), matS))), crossprod(matS, z[, 1]*M))
118
- 
118
+
119 119
     fit2 <- sum(z[, 2]*M)/sum(z[, 2])
120 120
     F1 <- matS%*%fit1
121 121
     sigmas[c(1, 3)] <- sqrt(sum(z[, 1]*(M-F1)^2)/sum(z[, 1]))
122 122
     sigmas[2] <- sqrt(sum(z[, 2]*(M-fit2)^2)/sum(z[, 2]))
123
- 
123
+
124 124
     weights[, 1] <- dnorm(M, F1, sigmas[1])
125 125
     weights[, 2] <- dnorm(M, fit2, sigmas[2])
126 126
     weights[, 3] <- dnorm(M, -F1, sigmas[3])
127
-    
127
+
128 128
     change <- max(abs(F1-previousF1))
129 129
     previousF1 <- F1
130 130
     if(verbose) message("Iter ", it, ": ", change, ".")
... ...
@@ -140,7 +140,7 @@ snprma2 <- function(filenames, mixtureSampleSize=10^5, fitMixture=FALSE,
140 140
     cdfName <- read.celfile.header(filenames[1])[["cdfName"]]
141 141
   pkgname <- getCrlmmAnnotationName(cdfName)
142 142
   stopifnot(require(pkgname, character.only=TRUE, quietly=!verbose))
143
-  
143
+
144 144
   if(verbose) message("Loading annotations and mixture model parameters.")
145 145
   obj <- loader("preprocStuff.rda", .crlmmPkgEnv, pkgname)
146 146
   pnsa <- getVarInEnv("pnsa")
... ...
@@ -148,14 +148,14 @@ snprma2 <- function(filenames, mixtureSampleSize=10^5, fitMixture=FALSE,
148 148
   gns <- getVarInEnv("gns")
149 149
   rm(list=obj, envir=.crlmmPkgEnv)
150 150
   rm(obj)
151
-  
151
+
152 152
   ##We will read each cel file, summarize, and run EM one by one
153 153
   ##We will save parameters of EM to use later
154 154
   if(verbose) message("Initializing objects.")
155 155
   mixtureParams <- initializeBigMatrix("crlmmMixt-", 4, length(filenames), "double")
156 156
   SNR <- initializeBigVector("crlmmSNR-", length(filenames), "double")
157 157
   SKW <- initializeBigVector("crlmmSKW-", length(filenames), "double")
158
-  
158
+
159 159
   ## This is the sample for the fitting of splines
160 160
   ## BC: I like better the idea of the user passing the seed,
161 161
   ##     because this might intefere with other analyses
... ...
@@ -180,7 +180,7 @@ snprma2 <- function(filenames, mixtureSampleSize=10^5, fitMixture=FALSE,
180 180
   close(SKW)
181 181
   close(A)
182 182
   close(B)
183
-  
183
+
184 184
   list(A=A, B=B, sns=sns, gns=gns, SNR=SNR, SKW=SKW, mixtureParams=mixtureParams, cdfName=cdfName)
185 185
 }
186 186
 
... ...
@@ -188,7 +188,7 @@ snprma2 <- function(filenames, mixtureSampleSize=10^5, fitMixture=FALSE,
188 188
 processCEL <- function(i, filenames, fitMixture, A, B, SKW, SNR,
189 189
                        mixtureParams, eps, seed, mixtureSampleSize,
190 190
                        pkgname){
191
-  
191
+
192 192
   obj1 <- loader("preprocStuff.rda", .crlmmPkgEnv, pkgname)
193 193
   obj2 <- loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname)
194 194
   obj3 <- loader("mixtureStuff.rda", .crlmmPkgEnv, pkgname)
... ...
@@ -226,7 +226,7 @@ processCEL <- function(i, filenames, fitMixture, A, B, SKW, SNR,
226 226
     A[, k] <- intMedianSummaries(y[aIndex, 1, drop=FALSE], pnsa)
227 227
     B[, k] <- intMedianSummaries(y[bIndex, 1, drop=FALSE], pnsb)
228 228
     rm(y)
229
-    
229
+
230 230
     if(fitMixture){
231 231
       S <- (log2(A[idx,k])+log2(B[idx, k]))/2 - SMEDIAN
232 232
       M <- log2(A[idx, k])-log2(B[idx, k])