Browse code

Merge branch 'collab'

* collab:
Update CNSet objects in data/ with datadir slot and protocolData(object)$filename
update cnrmaAffy. processCEL2 located inside cnrmaAffy function. Uses lexical scope.
Change API for genotypeAffy -- remove mixtureParams argument. Update call to genotypeAffy in genotype function
snprmaAffy no longer initializes mixtureParams object, but accesses this information from the cnSet
constructAffyCNSet initializes mixtureParams slot of the appropriate dimensions
updated cnrmaAffy. Removed cnrma2, cnrma. cnrmaAffy uses lexical scope
Fix bug in crlmmGT2 caused by unequal batch sizes
Moved rsprocessCel inside of snprmaAffy for lexical scope. Moved imputeGender inside crlmmGT2 for lexical scope
Revert imputeGender to original approach for crlmm. Splitting samples across nodes does not work well if there are not a lot of samples in the individual nodes. Probably better to use fewer markers on chr X when large number of samples are processed
contains old process1 call
change gender <- unlist(gender) to gender <- unlist(genderList)
v1.15.15 Fix memory leak in imputeGender step by running this function in sample batches of size ocSamples(). Use lexical scope in calling process1 function in crlmmGT2.
set default values in summarizeNps
depends on v 1.19.39 of oligoClasses
v1.15.14: Export constructAffyCNSet. Used datadir slot in CNSet object added to v 1.19.39 of oligoClasses
update getFeatureData for use with annotation package that contains a number of SNPs not necessarily included in the genotyping. These additional snps are removed when constructing the featureData in constructAffy

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@67826 bc3139a8-67e5-0310-9ffc-ced21a209358

Rob Scharp authored on 19/07/2012 03:57:08
Showing 9 changed files

... ...
@@ -1,12 +1,12 @@
1 1
 Package: crlmm
2 2
 Type: Package
3 3
 Title: Genotype Calling (CRLMM) and Copy Number Analysis tool for Affymetrix SNP 5.0 and 6.0 and Illumina arrays.
4
-Version: 1.15.12
4
+Version: 1.15.18
5 5
 Author: Benilton S Carvalho, Robert Scharpf, Matt Ritchie, Ingo Ruczinski, Rafael A Irizarry
6 6
 Maintainer: Benilton S Carvalho <Benilton.Carvalho@cancer.org.uk>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.EDU.AU>
7 7
 Description: Faster implementation of CRLMM specific to SNP 5.0 and 6.0 arrays, as well as a copy number tool specific to 5.0, 6.0, and Illumina platforms
8 8
 License: Artistic-2.0
9
-Depends: R (>= 2.14.0), oligoClasses (>= 1.19.17)
9
+Depends: R (>= 2.14.0), oligoClasses (>= 1.19.39)
10 10
 Imports: methods,
11 11
          Biobase (>= 2.15.4),
12 12
          BiocGenerics,
... ...
@@ -79,14 +79,19 @@ exportMethods(CA, CB,
79 79
 export(crlmm,
80 80
        crlmmIllumina,
81 81
        crlmmIlluminaV2,
82
+       constructAffyCNSet,
82 83
        genotype,
84
+       genotypeAffy,
83 85
        readIDAT,
84 86
        readIdatFiles,
85 87
        readGenCallOutput,
86 88
        snprma,
87 89
        snprma2,
90
+       cnrmaAffy,
91
+       snprmaAffy,
88 92
        crlmm2,
89 93
        genotype2, genotypeLD,
94
+       genotypeAffy,
90 95
        genotype.Illumina,
91 96
        crlmmCopynumber2, crlmmCopynumberLD, crlmmCopynumber)
92 97
 export(genotypes, totalCopynumber, rawCopynumber, xyplot)
... ...
@@ -40,14 +40,16 @@ getFeatureData <- function(cdfName, copynumber=FALSE, genome=genome){
40 40
 	}
41 41
 	path <- system.file("extdata", package=pkgname)
42 42
 	multiple.builds <- length(grep("hg19", list.files(path)) > 0)
43
-	if(!multiple.builds)
43
+	if(!multiple.builds){
44 44
 		load(file.path(path, "snpProbes.rda"))
45
-	else load(file.path(path, paste("snpProbes_", genome, ".rda", sep="")))
45
+	} else load(file.path(path, paste("snpProbes_", genome, ".rda", sep="")))
46 46
 	snpProbes <- get("snpProbes")
47
+	## if we use a different build we may throw out a number of snps...
48
+	snpProbes <- snpProbes[rownames(snpProbes) %in% gns, ]
47 49
 	if(copynumber){
48
-		if(!multiple.builds)
50
+		if(!multiple.builds){
49 51
 			load(file.path(path, "cnProbes.rda"))
50
-		else load(file.path(path, paste("cnProbes_", genome, ".rda", sep="")))
52
+		} else load(file.path(path, paste("cnProbes_", genome, ".rda", sep="")))
51 53
 		cnProbes <- get("cnProbes")
52 54
 		snpIndex <- seq(along=gns)
53 55
 		npIndex <- seq(along=rownames(cnProbes)) + max(snpIndex)
... ...
@@ -126,7 +128,7 @@ construct <- function(filenames,
126 128
 	return(cnSet)
127 129
 }
128 130
 
129
-constructAffy <- function(filenames, sns, cdfName, batch, verbose=TRUE, genome){
131
+constructAffyCNSet <- function(filenames, sns, cdfName, batch, verbose=TRUE, genome){
130 132
 	##stopifnot(!missing(filenames))
131 133
 	if(missing(sns)) sns <- basename(filenames)
132 134
 	stopifnot(!missing(batch))
... ...
@@ -158,8 +160,14 @@ constructAffy <- function(filenames, sns, cdfName, batch, verbose=TRUE, genome){
158 160
 	B <- initializeBigMatrix("crlmmB-", nr, length(filenames), "integer")
159 161
 	call <- initializeBigMatrix("call-", nr, length(filenames), "integer")
160 162
 	callPr <- initializeBigMatrix("callPr-", nr, length(filenames), "integer")
163
+	mixtureParams <- initializeBigMatrix("crlmmMixt-", 4, length(filenames), "double")
161 164
 	rownames(A) <- rownames(B) <- featureNames(featureData)
162 165
 	rownames(call) <- rownames(callPr) <- featureNames(featureData)
166
+	dirNames <- dirname(filenames)
167
+	unames <- unique(dirNames)
168
+	dirNames <- factor(dirNames, levels=unames)
169
+	dd <- split(seq_len(length(filenames)), dirNames)
170
+	datadir <- list(dirname=names(dd), n=as.integer(sapply(dd, length)))
163 171
 	if(verbose) message("Instantiating CNSet container")
164 172
 	cnSet <- new("CNSet",
165 173
 		     alleleA=A,
... ...
@@ -169,9 +177,12 @@ constructAffy <- function(filenames, sns, cdfName, batch, verbose=TRUE, genome){
169 177
 		     featureData=featureData,
170 178
 		     annotation=cdfName,
171 179
 		     batch=as.character(batch),
172
-		     genome=genome)
180
+		     genome=genome,
181
+		     datadir=datadir)
182
+	cnSet@mixtureParams <- mixtureParams
173 183
 	sampleNames(cnSet) <- sns
174 184
 	protocolData <- getProtocolData.Affy(filenames)
185
+	protocolData$filename <- basename(filenames)
175 186
 	rownames(pData(protocolData)) <- sns
176 187
 	protocolData(cnSet) <- protocolData
177 188
 	cnSet$SKW <- SKW
... ...
@@ -181,33 +192,84 @@ constructAffy <- function(filenames, sns, cdfName, batch, verbose=TRUE, genome){
181 192
 	return(cnSet)
182 193
 }
183 194
 
184
-snprmaAffy <- function(cnSet, filenames,
195
+snprmaAffy <- function(cnSet,
185 196
 		       mixtureSampleSize=10^5,
186 197
 		       eps=0.1,
187 198
 		       seed=1,
188 199
 		       verbose=TRUE){
200
+	filenames <- celfileName(cnSet)
189 201
 	if(verbose) message("Preprocessing ", length(filenames), " files.")
190 202
 	pkgname <- getCrlmmAnnotationName(annotation(cnSet))
191 203
 	stopifnot(require(pkgname, character.only=TRUE, quietly=!verbose))
192
-	mixtureParams <- initializeBigMatrix("crlmmMixt-", 4, length(filenames), "double")
193
-	sampleBatches <- splitIndicesByLength(seq_along(filenames), ocSamples()/getDoParWorkers())
194
-##	if(length(sampleBatches) < getDoParWorkers())
195
-##		sampleBatches <- splitIndicesByNode(seq_along(filenames))
196
-	ocLapply(sampleBatches,
197
-		 rsprocessCEL,
198
-		 filenames=filenames,
199
-		 fitMixture=TRUE,
200
-		 A=A(cnSet), B=B(cnSet), C=calls(cnSet),
201
-		 D=snpCallProbability(cnSet),
202
-		 SKW=cnSet$SKW, SNR=cnSet$SNR,
203
-		 mixtureParams=mixtureParams,
204
-		 eps=eps,
205
-		 seed=seed,
206
-		 mixtureSampleSize=mixtureSampleSize,
207
-		 pkgname=pkgname,
208
-		 neededPkgs=c("crlmm", pkgname))
204
+	sampleBatches <- splitIndicesByNode(seq_along(filenames))
205
+	mixtureParams <- cnSet@mixtureParams
206
+	obj1 <- loader("preprocStuff.rda", .crlmmPkgEnv, pkgname)
207
+	obj2 <- loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname)
208
+	obj3 <- loader("mixtureStuff.rda", .crlmmPkgEnv, pkgname)
209
+	autosomeIndex <- getVarInEnv("autosomeIndex")
210
+	pnsa <- getVarInEnv("pnsa")
211
+	pnsb <- getVarInEnv("pnsb")
212
+	fid <- getVarInEnv("fid")
213
+	reference <- getVarInEnv("reference")
214
+	aIndex <- getVarInEnv("aIndex")
215
+	bIndex <- getVarInEnv("bIndex")
216
+	SMEDIAN <- getVarInEnv("SMEDIAN")
217
+	theKnots <- getVarInEnv("theKnots")
218
+	gns <- getVarInEnv("gns")
219
+	rm(list=c(obj1, obj2, obj3), envir=.crlmmPkgEnv)
220
+	rm(obj1, obj2, obj3)
221
+	## for mixture
222
+	set.seed(seed)
223
+	idx <- sort(sample(autosomeIndex, mixtureSampleSize))
224
+	##for skewness. no need to do everything
225
+	idx2 <- sample(length(fid), 10^5)
226
+	A <- A(cnSet)
227
+	B <- B(cnSet)
228
+	C <- calls(cnSet)
229
+	D <- snpCallProbability(cnSet)
230
+	SKW <- cnSet$SKW; SNR <- cnSet$SNR
231
+	open(A)
232
+	open(B)
233
+	open(C)
234
+	open(D)
235
+	open(SKW)
236
+	open(mixtureParams)
237
+	open(SNR)
209 238
 	if(verbose) message("Cloning A and B matrices to store genotype calls and confidence scores.")
210
-	return(mixtureParams)
239
+	## RS ADDED
240
+	index <- match(gns, rownames(A))
241
+	rsprocessCEL <- function(i){
242
+		for (k in i){
243
+			y <- as.matrix(read.celfile(filenames[k], intensity.means.only=TRUE)[["INTENSITY"]][["MEAN"]][fid])
244
+			x <- log2(y[idx2])
245
+			SKW[k] <- mean((x-mean(x))^3)/(sd(x)^3)
246
+			rm(x)
247
+			y <- normalize.quantiles.use.target(y, target=reference)
248
+			## RS: add index for row assignment
249
+			A[index, k] <- intMedianSummaries(y[aIndex, 1, drop=FALSE], pnsa)
250
+			B[index, k] <- intMedianSummaries(y[bIndex, 1, drop=FALSE], pnsb)
251
+			C[index, k] <- intMedianSummaries(y[aIndex, 1, drop=FALSE], pnsa)
252
+			D[index, k] <- intMedianSummaries(y[bIndex, 1, drop=FALSE], pnsb)
253
+			rm(y)
254
+			S <- (log2(A[idx,k])+log2(B[idx, k]))/2 - SMEDIAN
255
+			M <- log2(A[idx, k])-log2(B[idx, k])
256
+			tmp <- fitAffySnpMixture56(S, M, theKnots, eps=eps)
257
+			rm(S, M)
258
+			mixtureParams[, k] <- tmp[["coef"]]
259
+			SNR[k] <- tmp[["medF1"]]^2/(tmp[["sigma1"]]^2+tmp[["sigma2"]]^2)
260
+			rm(tmp)
261
+		}
262
+		TRUE
263
+	}
264
+	ocLapply(sampleBatches, rsprocessCEL, neededPkgs="crlmm")
265
+	close(A)
266
+	close(B)
267
+	close(C)
268
+	close(D)
269
+	close(SKW)
270
+	close(mixtureParams)
271
+	close(SNR)
272
+	return()
211 273
 }
212 274
 
213 275
 genotype <- function(filenames,
... ...
@@ -231,25 +293,24 @@ genotype <- function(filenames,
231 293
 	if (missing(sns)) sns <- basename(filenames)
232 294
 	if (any(duplicated(sns))) stop("sample identifiers must be unique")
233 295
 	genome <- match.arg(genome)
234
-	cnSet <- constructAffy(filenames=filenames,
235
-			       sns=sns,
236
-			       cdfName=cdfName,
237
-			       batch=batch, verbose=verbose,
238
-			       genome=genome)
296
+	cnSet <- constructAffyCNSet(filenames=filenames,
297
+				    sns=sns,
298
+				    cdfName=cdfName,
299
+				    batch=batch, verbose=verbose,
300
+				    genome=genome)
239 301
 	if(!(is(A(cnSet), "ff") || is(A(cnSet), "ffdf"))){
240 302
 		stop("The ff package is required for this function.")
241 303
 	}
242
-	cnSet@mixtureParams <- snprmaAffy(cnSet, filenames=filenames,
243
-					  mixtureSampleSize=mixtureSampleSize,
244
-					  eps=eps,
245
-					  seed=seed,
246
-					  verbose=verbose)
247
-	ok <- cnrmaAffy(cnSet=cnSet, filenames=filenames,
304
+	cnSet <- snprmaAffy(cnSet,
305
+			    mixtureSampleSize=mixtureSampleSize,
306
+			    eps=eps,
307
+			    seed=seed,
308
+			    verbose=verbose)
309
+	ok <- cnrmaAffy(cnSet=cnSet,
248 310
 			cdfName=annotation(cnSet), seed=seed,
249 311
 			verbose=verbose)
250 312
 	stopifnot(ok)
251 313
 	ok <- genotypeAffy(cnSet=cnSet,
252
-			   mixtureParams=cnSet@mixtureParams,
253 314
 			   SNRMin=SNRMin,
254 315
 			   recallMin=recallMin,
255 316
 			   recallRegMin=recallRegMin,
... ...
@@ -260,15 +321,14 @@ genotype <- function(filenames,
260 321
 	return(cnSet)
261 322
 }
262 323
 
263
-genotypeAffy <- function(cnSet, mixtureParams, SNRMin=5, recallMin=10,
324
+genotypeAffy <- function(cnSet, SNRMin=5, recallMin=10,
264 325
 			 recallRegMin=1000,
265 326
 			 gender=NULL, badSNP=0.7, returnParams=TRUE,
266 327
 			 verbose=TRUE){
267
-	##snp.index <- which(isSnp(cnSet))
268 328
 	tmp <- crlmmGT2(A=calls(cnSet),
269 329
 			B=snpCallProbability(cnSet),
270 330
 			SNR=cnSet$SNR,
271
-			mixtureParams=mixtureParams,
331
+			mixtureParams=cnSet@mixtureParams,
272 332
 			cdfName=annotation(cnSet),
273 333
 			row.names=NULL,
274 334
 			col.names=sampleNames(cnSet),
... ...
@@ -279,7 +339,6 @@ genotypeAffy <- function(cnSet, mixtureParams, SNRMin=5, recallMin=10,
279 339
 			verbose=verbose,
280 340
 			returnParams=returnParams,
281 341
 			badSNP=badSNP)
282
-##			snp.names=featureNames(cnSet)[snp.index])
283 342
 	if(verbose) message("Genotyping finished.  Updating container with genotype calls and confidence scores.")
284 343
 	open(cnSet$gender)
285 344
 	cnSet$gender[,] <- tmp[["gender"]]
... ...
@@ -287,92 +346,48 @@ genotypeAffy <- function(cnSet, mixtureParams, SNRMin=5, recallMin=10,
287 346
 	return(TRUE)
288 347
 }
289 348
 
290
-cnrmaAffy <- function(cnSet, filenames, cdfName, seed=1, verbose=TRUE){
291
-	snp.I <- isSnp(cnSet)
292
-	snp.index <- which(snp.I)
293
-	np.index <- which(!snp.I)
349
+cnrmaAffy <- function(cnSet, seed=1, verbose=TRUE){
350
+	filenames <- celfileName(cnSet)
351
+	np.index <- which(!isSnp(cnSet))
352
+	A <- A(cnSet)
353
+	cdfName <- annotation(cnSet)
294 354
 	if(verbose) message("Quantile normalizing nonpolymorphic markers")
295
-	cnrma2(A=A(cnSet),
296
-	       filenames=filenames,
297
-	       row.names=featureNames(cnSet)[np.index],
298
-	       cdfName=cdfName,
299
-	       sns=sampleNames(cnSet),
300
-	       seed=seed,
301
-	       verbose=verbose)
302
-	TRUE
303
-}
304
-
305
-rsprocessCEL <- function(i, filenames, fitMixture, A, B, C, D, SKW, SNR,
306
-			 mixtureParams, eps, seed, mixtureSampleSize,
307
-			 pkgname){
308
-	obj1 <- loader("preprocStuff.rda", .crlmmPkgEnv, pkgname)
309
-	obj2 <- loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname)
310
-	obj3 <- loader("mixtureStuff.rda", .crlmmPkgEnv, pkgname)
311
-	autosomeIndex <- getVarInEnv("autosomeIndex")
312
-	pnsa <- getVarInEnv("pnsa")
313
-	pnsb <- getVarInEnv("pnsb")
314
-	fid <- getVarInEnv("fid")
355
+	##if(missing(cdfName)) stop("must specify cdfName")
356
+	pkgname <- getCrlmmAnnotationName(cdfName)
357
+	require(pkgname, character.only=TRUE) || stop("Package ", pkgname, " not available")
358
+	sampleBatches <- splitIndicesByNode(seq_along(filenames))
359
+	if(verbose) message("Processing nonpolymorphic probes for ", length(filenames), " files.")
360
+	if(pkgname=="genomewidesnp6Crlmm"){
361
+		loader("1m_reference_cn.rda", .crlmmPkgEnv, pkgname)
362
+	}
363
+	if(pkgname=="genomewidesnp5Crlmm"){
364
+		loader("5.0_reference_cn.rda", .crlmmPkgEnv, pkgname)
365
+	}
315 366
 	reference <- getVarInEnv("reference")
316
-	aIndex <- getVarInEnv("aIndex")
317
-	bIndex <- getVarInEnv("bIndex")
318
-	SMEDIAN <- getVarInEnv("SMEDIAN")
319
-	theKnots <- getVarInEnv("theKnots")
320
-	gns <- getVarInEnv("gns")
321
-	rm(list=c(obj1, obj2, obj3), envir=.crlmmPkgEnv)
322
-	rm(obj1, obj2, obj3)
323
-
324
-	## for mixture
367
+        loader("npProbesFid.rda", .crlmmPkgEnv, pkgname)
368
+	fid <- getVarInEnv("npProbesFid")
369
+	row.names <- featureNames(cnSet)[np.index]
370
+	row.names <- row.names[row.names %in% names(fid)] ##removes chromosome Y
371
+	fid <- fid[match(row.names, names(fid))]
372
+	np.index <- match(row.names, rownames(A))
373
+	gns <- names(fid)
325 374
 	set.seed(seed)
326
-	idx <- sort(sample(autosomeIndex, mixtureSampleSize))
327
-	##for skewness. no need to do everything
328
-	idx2 <- sample(length(fid), 10^5)
329
-
375
+	## updates A
330 376
 	open(A)
331
-	open(B)
332
-	open(C)
333
-	open(D)
334
-	open(SKW)
335
-	open(mixtureParams)
336
-	open(SNR)
337
-
338
-	## RS ADDED
339
-	index <- match(gns, rownames(A))
340
-
341
-	for (k in i){
342
-		y <- as.matrix(read.celfile(filenames[k], intensity.means.only=TRUE)[["INTENSITY"]][["MEAN"]][fid])
343
-		x <- log2(y[idx2])
344
-		SKW[k] <- mean((x-mean(x))^3)/(sd(x)^3)
345
-		rm(x)
346
-		y <- normalize.quantiles.use.target(y, target=reference)
347
-		## RS: add index for row assignment
348
-		A[index, k] <- intMedianSummaries(y[aIndex, 1, drop=FALSE], pnsa)
349
-		B[index, k] <- intMedianSummaries(y[bIndex, 1, drop=FALSE], pnsb)
350
-		C[index, k] <- intMedianSummaries(y[aIndex, 1, drop=FALSE], pnsa)
351
-		D[index, k] <- intMedianSummaries(y[bIndex, 1, drop=FALSE], pnsb)
352
-		rm(y)
353
-		if(fitMixture){
354
-			S <- (log2(A[idx,k])+log2(B[idx, k]))/2 - SMEDIAN
355
-			M <- log2(A[idx, k])-log2(B[idx, k])
356
-			tmp <- fitAffySnpMixture56(S, M, theKnots, eps=eps)
357
-			rm(S, M)
358
-			mixtureParams[, k] <- tmp[["coef"]]
359
-			SNR[k] <- tmp[["medF1"]]^2/(tmp[["sigma1"]]^2+tmp[["sigma2"]]^2)
360
-			rm(tmp)
361
-		} else {
362
-			mixtureParams[, k] <- NA
363
-			SNR[k] <- NA
377
+	processCEL2 <- function(i){
378
+		for (k in i){
379
+			y <- as.matrix(read.celfile(filenames[k], intensity.means.only=TRUE)[["INTENSITY"]][["MEAN"]][fid])
380
+			A[np.index, k] <- as.integer(normalize.quantiles.use.target(y, target=reference))
364 381
 		}
382
+		return(TRUE)
365 383
 	}
384
+	ocLapply(sampleBatches, processCEL2, neededPkgs="crlmm")
366 385
 	close(A)
367
-	close(B)
368
-	close(SKW)
369
-	close(mixtureParams)
370
-	close(SNR)
371
-	rm(list=ls())
372
-	gc(verbose=FALSE)
373 386
 	TRUE
374 387
 }
375 388
 
389
+
390
+
376 391
 genotype2 <- function(){
377 392
 	.Defunct(msg="The genotype2 function has been deprecated. The function genotype should be used instead.  genotype will support large data using ff provided that the ff package is loaded.")
378 393
 }
... ...
@@ -1251,80 +1266,10 @@ whichPlatform <- function(cdfName){
1251 1266
 	return(platform)
1252 1267
 }
1253 1268
 
1254
-cnrma <- function(A, filenames, row.names, verbose=TRUE, seed=1, cdfName, sns){
1255
-	if(missing(cdfName)) stop("must specify cdfName")
1256
-	pkgname <- getCrlmmAnnotationName(cdfName)
1257
-	require(pkgname, character.only=TRUE) || stop("Package ", pkgname, " not available")
1258
-	if (missing(sns)) sns <- basename(filenames)
1259
-	if(verbose) message("Loading annotations for nonpolymorphic probes")
1260
-        loader("npProbesFid.rda", .crlmmPkgEnv, pkgname)
1261
-	fid <- getVarInEnv("npProbesFid")
1262
-	if(cdfName=="genomewidesnp6"){
1263
-		loader("1m_reference_cn.rda", .crlmmPkgEnv, pkgname)
1264
-	}
1265
-	if(cdfName=="genomewidesnp5"){
1266
-		loader("5.0_reference_cn.rda", .crlmmPkgEnv, pkgname)
1267
-	}
1268
-	reference <- getVarInEnv("reference")
1269
-	fid <- fid[match(row.names, names(fid))]
1270
-	np.index <- match(row.names, rownames(A))
1271
-	gns <- names(fid)
1272
-	set.seed(seed)
1273
-	##idx2 <- sample(length(fid), 10^5)
1274
-	for (k in seq_along(filenames)){
1275
-		y <- as.matrix(read.celfile(filenames[k], intensity.means.only=TRUE)[["INTENSITY"]][["MEAN"]][fid])
1276
-		A[np.index, k] <- as.integer(normalize.quantiles.use.target(y, target=reference))
1277
-	}
1278
-	return(A)
1279
-}
1280
-
1281 1269
 cnrma2 <- function(A, filenames, row.names, verbose=TRUE, seed=1, cdfName, sns){
1282
-	if(missing(cdfName)) stop("must specify cdfName")
1283
-	pkgname <- getCrlmmAnnotationName(cdfName)
1284
-	require(pkgname, character.only=TRUE) || stop("Package ", pkgname, " not available")
1285
-	if (missing(sns)) sns <- basename(filenames)
1286
-	sampleBatches <- splitIndicesByLength(seq_along(filenames), ocSamples()/getDoParWorkers())
1287
-	if(verbose) message("Processing nonpolymorphic probes for ", length(filenames), " files.")
1288
-	## updates A
1289
-	ocLapply(sampleBatches,
1290
-		 processCEL2,
1291
-		 row.names=row.names,
1292
-		 filenames=filenames,
1293
-		 A=A,
1294
-		 seed=seed,
1295
-		 pkgname=pkgname,
1296
-		 neededPkgs=c("crlmm", pkgname))
1297
-	##list(sns=sns, gns=row.names, SKW=SKW, cdfName=cdfName)
1298
-	return(A)
1299
-}
1300 1270
 
1301
-processCEL2 <- function(i, filenames, row.names, A, seed, pkgname){
1302
-	if(pkgname=="genomewidesnp6Crlmm"){
1303
-		loader("1m_reference_cn.rda", .crlmmPkgEnv, pkgname)
1304
-	}
1305
-	if(pkgname=="genomewidesnp5Crlmm"){
1306
-		loader("5.0_reference_cn.rda", .crlmmPkgEnv, pkgname)
1307
-	}
1308
-	reference <- getVarInEnv("reference")
1309
-        loader("npProbesFid.rda", .crlmmPkgEnv, pkgname)
1310
-	fid <- getVarInEnv("npProbesFid")
1311
-	row.names <- row.names[row.names %in% names(fid)] ##removes chromosome Y
1312
-	fid <- fid[match(row.names, names(fid))]
1313
-	##stopifnot(all.equal(names(fid), row.names))
1314
-	np.index <- match(row.names, rownames(A))
1315
-	gns <- names(fid)
1316
-	set.seed(seed)
1317
-	open(A)
1318
-	##idx2 <- sample(length(fid), 10^5)
1319
-	for (k in i){
1320
-		y <- as.matrix(read.celfile(filenames[k], intensity.means.only=TRUE)[["INTENSITY"]][["MEAN"]][fid])
1321
-		A[np.index, k] <- as.integer(normalize.quantiles.use.target(y, target=reference))
1322
-	}
1323
-	close(A)
1324
-	return(TRUE)
1325 1271
 }
1326 1272
 
1327
-
1328 1273
 imputeCenter <- function(muA, muB, index.complete, index.missing){
1329 1274
 	index <- index.missing
1330 1275
 	mnA <- muA
... ...
@@ -1618,9 +1563,6 @@ genotypeSummary <- function(object,
1618 1563
 	FUN <- get(myf)
1619 1564
 	if(is.lds){
1620 1565
 		index.list <- splitIndicesByLength(marker.index, ocProbesets())
1621
-##		if(parStatus()){
1622
-##			index.list <- splitIndicesByNode(marker.index)
1623
-##		} else index.list <- splitIndicesByLength(marker.index, ocProbesets())
1624 1566
 		ocLapply(seq(along=index.list),
1625 1567
 			 FUN,
1626 1568
 			 index.list=index.list,
... ...
@@ -1653,7 +1595,7 @@ whichMarkers <- function(type, is.snp, is.autosome, is.annotated, is.X){
1653 1595
 }
1654 1596
 
1655 1597
 summarizeNps <- function(strata, index.list, object, batchSize,
1656
-			 GT.CONF.THR, verbose, CHR.X, ...){
1598
+			 GT.CONF.THR, verbose=TRUE, CHR.X=FALSE, ...){
1657 1599
 	is.lds <- is(calls(object), "ffdf") | is(calls(object), "ff_matrix")
1658 1600
 	if(is.lds) {physical <- get("physical"); open(object)}
1659 1601
 	if(verbose) message("      Probe stratum ", strata, " of ", length(index.list))
... ...
@@ -1707,9 +1649,9 @@ summarizeNps <- function(strata, index.list, object, batchSize,
1707 1649
 summarizeSnps <- function(strata,
1708 1650
 			  index.list,
1709 1651
 			  object,
1710
-			  GT.CONF.THR,
1711
-			  verbose,
1712
-			  CHR.X, ...){
1652
+			  GT.CONF.THR=0.80,
1653
+			  verbose=TRUE,
1654
+			  CHR.X=FALSE, ...){
1713 1655
 ##	if(is.lds) {
1714 1656
 ##		physical <- get("physical")
1715 1657
 ##		open(object)
... ...
@@ -2012,7 +1954,8 @@ crlmmCopynumber <- function(object,
2012 1954
 			    MIN.NU=2^3,
2013 1955
 			    MIN.PHI=2^3,
2014 1956
 			    THR.NU.PHI=TRUE,
2015
-			    type=c("SNP", "NP", "X.SNP", "X.NP")){
1957
+			    type=c("SNP", "NP", "X.SNP", "X.NP"),
1958
+			    fit.linearModel=TRUE){
2016 1959
 	typeof <- paste(type, collapse=",")
2017 1960
 	stopifnot(typeof %in% c("SNP", "NP", "SNP,NP", "SNP,X.SNP", "SNP,X.NP", "SNP,NP,X.SNP", "SNP,NP,X.SNP,X.NP"))
2018 1961
 	if(GT.CONF.THR >= 1 | GT.CONF.THR < 0) stop("GT.CONF.THR must be in [0,1)")
... ...
@@ -2070,33 +2013,34 @@ crlmmCopynumber <- function(object,
2070 2013
 		}
2071 2014
 	}
2072 2015
 	if(verbose) message("Estimating parameters for copy number")##SNPs only
2073
-	for(i in seq_along(type)){
2074
-		marker.type <- type[i]
2075
-		message(paste("...", mylabel(marker.type)))
2076
-		CHR.X <- ifelse(marker.type %in% c("X.SNP", "X.NP"), TRUE, FALSE)
2077
-		marker.index <- whichMarkers(marker.type, is.snp,
2078
-					     is.autosome, is.annotated, is.X)
2079
-		if(length(marker.index) == 0) next()
2080
-		res <- estimateCnParameters(object=object,
2081
-					       type=marker.type,
2082
-					       SNRMin=SNRMin,
2083
-					       DF.PRIOR=DF.PRIOR,
2084
-					       GT.CONF.THR=GT.CONF.THR,
2085
-					       MIN.SAMPLES=MIN.SAMPLES,
2086
-					       MIN.OBS=MIN.OBS,
2087
-					       MIN.NU=MIN.NU,
2088
-					       MIN.PHI=MIN.PHI,
2089
-					       THR.NU.PHI=THR.NU.PHI,
2090
-					       verbose=verbose,
2091
-					       marker.index=marker.index,
2092
-					       is.lds=is.lds,
2093
-					       CHR.X=CHR.X)
2094
-		##if(!is.lds) {object <- res; rm(res); gc()}
2095
-		##if(!is.lds) {object <- res; rm(res); gc()}
2096
-	}
2097
-	close(object)
2016
+	if(fit.linearModel){
2017
+		for(i in seq_along(type)){
2018
+			marker.type <- type[i]
2019
+			message(paste("...", mylabel(marker.type)))
2020
+			CHR.X <- ifelse(marker.type %in% c("X.SNP", "X.NP"), TRUE, FALSE)
2021
+			marker.index <- whichMarkers(marker.type, is.snp,
2022
+						     is.autosome, is.annotated, is.X)
2023
+			if(length(marker.index) == 0) next()
2024
+			res <- estimateCnParameters(object=object,
2025
+						    type=marker.type,
2026
+						    SNRMin=SNRMin,
2027
+						    DF.PRIOR=DF.PRIOR,
2028
+						    GT.CONF.THR=GT.CONF.THR,
2029
+						    MIN.SAMPLES=MIN.SAMPLES,
2030
+						    MIN.OBS=MIN.OBS,
2031
+						    MIN.NU=MIN.NU,
2032
+						    MIN.PHI=MIN.PHI,
2033
+						    THR.NU.PHI=THR.NU.PHI,
2034
+						    verbose=verbose,
2035
+						    marker.index=marker.index,
2036
+						    is.lds=is.lds,
2037
+						    CHR.X=CHR.X)
2038
+		}
2039
+		close(object)
2040
+	}
2098 2041
 	if(is.lds) return(TRUE) else return(object)
2099 2042
 }
2043
+
2100 2044
 crlmmCopynumber2 <- function(){
2101 2045
 	.Defunct(msg="The crlmmCopynumber2 function has been deprecated. The function crlmmCopynumber should be used instead.  crlmmCopynumber will support large data using ff provided that the ff package is loaded.")
2102 2046
 }
... ...
@@ -83,7 +83,6 @@ crlmmGT <- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL,
83 83
   regionInfo <- getVarInEnv("regionInfo")
84 84
   params <- getVarInEnv("params")
85 85
 
86
-  ##IF gender not provide, we predict
87 86
   if(is.null(gender)){
88 87
 	  if(verbose) message("Determining gender.")
89 88
 	  gender <- imputeGender(A, B, XIndex, YIndex, SNR, SNRMin)
... ...
@@ -20,6 +20,7 @@ crlmmGT2 <- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL,
20 20
 	} else {
21 21
 		index <- match(gns, rownames(A))
22 22
 	}
23
+	##snpBatches <- splitIndicesByLength(index, ocProbesets(), balance=TRUE)
23 24
 	snpBatches <- splitIndicesByLength(index, ocProbesets())
24 25
 	NR <- length(unlist(snpBatches))
25 26
 	if(verbose) message("Calling ", NR, " SNPs for recalibration... ")
... ...
@@ -39,14 +40,42 @@ crlmmGT2 <- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL,
39 40
 	params <- getVarInEnv("params")
40 41
 	rm(list=c(obj1, obj2), envir=.crlmmPkgEnv)
41 42
 	rm(obj1, obj2)
42
-	##
43
-	## IF gender not provide, we predict
44
-	## FIXME: XIndex may be greater than ocProbesets()
43
+	## use lexical scope
44
+	imputeGender <- function(XIndex, YIndex){
45
+		if(length(YIndex) > 0){
46
+			a <- log2(A[XIndex,,drop=FALSE])
47
+			b <- log2(B[XIndex,,drop=FALSE])
48
+			meds.X <- (apply(a+b, 2, median))/2
49
+			a <- log2(A[YIndex,,drop=FALSE])
50
+			b <- log2(B[YIndex,,drop=FALSE])
51
+			meds.Y <- (apply(a+b, 2, median))/2
52
+			R <- meds.X - meds.Y
53
+			if(sum(SNR > SNRMin) == 1){
54
+				gender <- ifelse(R[SNR[] > SNRMin] > 0.5, 2L, 1L)
55
+			} else{
56
+				gender <- kmeans(R, c(min(R[SNR[]>SNRMin]), max(R[SNR[]>SNRMin])))[["cluster"]]
57
+			}
58
+		} else {
59
+			XMedian <- apply(log2(A[XIndex,,drop=FALSE])+log2(B[XIndex,, drop=FALSE]), 2, median)/2
60
+			if(sum(SNR > SNRMin) == 1){
61
+				gender <- which.min(c(abs(XMedian-8.9), abs(XMedian-9.5)))
62
+			} else{
63
+				gender <- kmeans(XMedian, c(min(XMedian[SNR[]>SNRMin]), max(XMedian[SNR[]>SNRMin])))[["cluster"]]
64
+			}
65
+		}
66
+		return(gender)
67
+	}
45 68
 	if(is.null(gender)){
46
-		if(verbose) message("Determining gender.")
47
-		gender <- imputeGender(A, B, XIndex, YIndex, SNR, SNRMin)
69
+		if(ocProbesets() < length(XIndex)){
70
+			if(verbose) message("Using ", ocProbesets(), " SNPs on chrom X and Y to assign gender.")
71
+			XIndex2 <- sample(XIndex, ocProbesets(), replace=FALSE)
72
+		} else XIndex2 <- XIndex
73
+		if(ocProbesets() < length(YIndex)){
74
+			YIndex2 <- sample(YIndex, ocProbesets(), replace=FALSE)
75
+		} else YIndex2 <- YIndex
76
+		message("Imputing gender")
77
+		gender <- imputeGender(XIndex=XIndex2, YIndex=YIndex2)
48 78
 	}
49
-	##
50 79
 	Indexes <- list(autosomeIndex, XIndex, YIndex)
51 80
 	cIndexes <- list(keepIndex,
52 81
 			 keepIndex[which(gender[keepIndex]==2)],
... ...
@@ -56,16 +85,13 @@ crlmmGT2 <- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL,
56 85
 	mIndex <- which(gender==1)
57 86
 	## different here
58 87
 	## use gtypeCallerR in batches
59
-	##snpBatches <- splitIndicesByLength(1:nrow(A), ocProbesets())
60
-	newparamsBatch <- vector("list", length(snpBatches))
61
-	process1 <- function(idxBatch, snpBatches, autosomeIndex, XIndex,
62
-			     YIndex, A, B, mixtureParams, fIndex, mIndex,
63
-			     params, cIndexes, SMEDIAN, theKnots, DF, probs, batchSize){
64
-		open(A)
65
-		open(B)
66
-		open(mixtureParams)
88
+	batchSize <- ocProbesets()
89
+	open(A)
90
+	open(B)
91
+	open(mixtureParams)
92
+	process1 <- function(idxBatch){
67 93
 		snps <- snpBatches[[idxBatch]]
68
-		rSnps <- range(snps)
94
+		##rSnps <- range(snps)
69 95
 		last <- (idxBatch-1)*batchSize
70 96
 		IndexesBatch <- list(autosomeIndex[autosomeIndex %in% snps]-last,
71 97
 				     XIndex[XIndex %in% snps]-last,
... ...
@@ -73,7 +99,6 @@ crlmmGT2 <- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL,
73 99
 		IndexesBatch <- lapply(IndexesBatch, as.integer)
74 100
 		tmpA <- as.matrix(A[snps,])
75 101
 		tmpB <- as.matrix(B[snps,])
76
-		## newparamsBatch[[idxBatch]]
77 102
 		tmp <- gtypeCallerR(tmpA, tmpB, fIndex, mIndex,
78 103
 				    params[["centers"]][snps,],
79 104
 				    params[["scales"]][snps,],
... ...
@@ -82,25 +107,13 @@ crlmmGT2 <- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL,
82 107
 				    sapply(IndexesBatch, length),
83 108
 				    sapply(cIndexes, length), SMEDIAN,
84 109
 				    theKnots, mixtureParams[], DF, probs, 0.025)
85
-		rm(snps, rSnps, IndexesBatch, tmpA, tmpB, last)
86
-		gc(verbose=FALSE)
87
-		close(A)
88
-		close(B)
89
-		close(mixtureParams)
90 110
 		tmp
91 111
 	}
92
-	##
93
-	##if(verbose) message("Calling process1")
94
-	newparamsBatch <- ocLapply(seq(along=snpBatches), process1,
95
-				   snpBatches=snpBatches,
96
-				   autosomeIndex=autosomeIndex, XIndex=XIndex,
97
-				   YIndex=YIndex, A=A, B=B,
98
-				   mixtureParams=mixtureParams, fIndex=fIndex,
99
-				   mIndex=mIndex, params=params,
100
-				   cIndexes=cIndexes, SMEDIAN=SMEDIAN,
101
-				   theKnots=theKnots, DF=DF, probs=probs,
102
-				   batchSize=ocProbesets(),
103
-				   neededPkgs="crlmm")
112
+	## Lexical scope
113
+	gc(verbose=FALSE)
114
+	newparamsBatch <- ocLapply(seq(along=snpBatches), process1, neededPkgs="crlmm")
115
+	gc(verbose=FALSE)
116
+	if(verbose) message("finished process1")
104 117
 	newparams <- vector("list", 3)
105 118
 	names(newparams) <- c("centers", "scales", "N")
106 119
 	newparams[["centers"]] <- do.call("rbind", lapply(newparamsBatch, "[[", 1))
... ...
@@ -174,6 +187,7 @@ crlmmGT2 <- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL,
174 187
 			dev=1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev)
175 188
 		}
176 189
 	}
190
+
177 191
 	if (verbose) message("OK")
178 192
 	##
179 193
 	## BC: must keep SD
... ...
@@ -187,13 +201,7 @@ crlmmGT2 <- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL,
187 201
 	## ## MOVE TO C#######
188 202
 	##
189 203
 	## running in batches
190
-	process2 <- function(idxBatch, snpBatches, autosomeIndex, XIndex,
191
-			     YIndex, A, B, mixtureParams, fIndex, mIndex,
192
-			     params, cIndexes, SMEDIAN, theKnots, DF, probs,
193
-			     regionInfo, batchSize){
194
-		open(A)
195
-		open(B)
196
-		open(mixtureParams)
204
+	process2 <- function(idxBatch){
197 205
 		snps <- snpBatches[[idxBatch]]
198 206
 		tmpA <- as.matrix(A[snps,])
199 207
 		tmpB <- as.matrix(B[snps,])
... ...
@@ -216,20 +224,14 @@ crlmmGT2 <- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL,
216 224
 					which(regionInfo[snps, 1]))
217 225
 		A[snps,] <- tmpA
218 226
 		B[snps,] <- tmpB
219
-		rm(tmpA, tmpB, snps, rSnps, IndexesBatch, ImNull, last)
220
-		gc(verbose=FALSE)
221
-		close(A)
222
-		close(B)
223
-		close(mixtureParams)
224 227
 	}
225
-	##
226
-	ocLapply(seq(along=snpBatches), process2, snpBatches=snpBatches,
227
-		 autosomeIndex=autosomeIndex, XIndex=XIndex, YIndex=YIndex,
228
-		 A=A, B=B, mixtureParams=mixtureParams, fIndex=fIndex,
229
-		 mIndex=mIndex, params=params, cIndexes=cIndexes,
230
-		 SMEDIAN=SMEDIAN, theKnots=theKnots, DF=DF, probs=probs,
231
-		 regionInfo=regionInfo, batchSize=ocProbesets(),
232
-		 neededPkgs="crlmm")
228
+	gc(verbose=FALSE)
229
+	ocLapply(seq(along=snpBatches), process2, neededPkgs="crlmm")
230
+	close(A)
231
+	close(B)
232
+	close(mixtureParams)
233
+	gc(verbose=FALSE)
234
+	message("Done with process2")
233 235
 	##  END MOVE TO C#######
234 236
 	## ##################
235 237
 	##
... ...
@@ -11,7 +11,7 @@ changeToCrlmmAnnotationName <- function(x){
11 11
 }
12 12
 
13 13
 getCrlmmAnnotationName <- function(x){
14
-  paste(tolower(gsub("_", "", x)), "Crlmm", sep="")
14
+	paste(tolower(gsub("_", "", x)), "Crlmm", sep="")
15 15
 }
16 16
 
17 17
 medianSummaries <- function(mat, grps)
18 18
Binary files a/data/cnSetExample.rda and b/data/cnSetExample.rda differ
19 19
Binary files a/data/cnSetExample2.rda and b/data/cnSetExample2.rda differ
... ...
@@ -29,9 +29,7 @@ data(cnSetExample2)
29 29
 
30 30
 }
31 31
 \examples{
32
-\dontrun{
33
-data(cnSetExample)
34
-data(cnSetExample2)
35
-}
32
+     data(cnSetExample)
33
+     data(cnSetExample2)
36 34
 }
37 35
 \keyword{datasets}