Browse code

genotype function is a wrapper for constructAffy, snprmaAffy, cnrmaAffy, and genotypeAffy. the *Affy functions are not currently exported.

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@54261 bc3139a8-67e5-0310-9ffc-ced21a209358

Rob Scharp authored on 30/03/2011 19:27:38
Showing 4 changed files

... ...
@@ -1,7 +1,7 @@
1 1
 Package: crlmm
2 2
 Type: Package
3 3
 Title: Genotype Calling (CRLMM) and Copy Number Analysis tool for Affymetrix SNP 5.0 and 6.0 and Illumina arrays.
4
-Version: 1.9.22
4
+Version: 1.9.23
5 5
 Date: 2010-12-10
6 6
 Author: Benilton S Carvalho <Benilton.Carvalho@cancer.org.uk>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.edu.au>, Ingo Ruczinski <iruczins@jhsph.edu>, Rafael A Irizarry
7 7
 Maintainer: Benilton S Carvalho <Benilton.Carvalho@cancer.org.uk>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.EDU.AU>
... ...
@@ -99,22 +99,7 @@ construct <- function(filenames,
99 99
 	return(cnSet)
100 100
 }
101 101
 
102
-genotype <- function(filenames,
103
-		     cdfName,
104
-		     batch,
105
-		     mixtureSampleSize=10^5,
106
-		     eps=0.1,
107
-		     verbose=TRUE,
108
-		     seed=1,
109
-		     sns,
110
-		     probs=rep(1/3, 3),
111
-		     DF=6,
112
-		     SNRMin=5,
113
-		     recallMin=10,
114
-		     recallRegMin=1000,
115
-		     gender=NULL,
116
-		     returnParams=TRUE,
117
-		     badSNP=0.7){
102
+constructAffy <- function(filenames, sns, cdfName, batch, verbose=TRUE){
118 103
 	is.lds <- ifelse(isPackageLoaded("ff"), TRUE, FALSE)
119 104
 	if(!is.lds) stop("this function now requires that the ff package be loaded")
120 105
 	if(missing(cdfName)) stop("must specify cdfName")
... ...
@@ -139,8 +124,8 @@ genotype <- function(filenames,
139 124
 	gns <- getVarInEnv("gns")
140 125
 	rm(list=obj, envir=.crlmmPkgEnv)
141 126
 	rm(obj)
142
-	if(verbose) message("Initializing objects.")
143
-	mixtureParams <- initializeBigMatrix("crlmmMixt-", 4, length(filenames), "double")
127
+	if(verbose) message("Initializing ff objects.")
128
+	##mixtureParams <- initializeBigMatrix("crlmmMixt-", 4, length(filenames), "double")
144 129
 	SNR <- initializeBigVector("crlmmSNR-", length(filenames), "double")
145 130
 	SKW <- initializeBigVector("crlmmSKW-", length(filenames), "double")
146 131
 	genderff <- initializeBigVector("gender", length(filenames), "integer")
... ...
@@ -148,30 +133,16 @@ genotype <- function(filenames,
148 133
 	nr <- nrow(featureData); nc <- length(sns)
149 134
 	A <- initializeBigMatrix("crlmmA-", nr, length(filenames), "integer")
150 135
 	B <- initializeBigMatrix("crlmmB-", nr, length(filenames), "integer")
136
+	call <- initializeBigMatrix("call-", nr, length(filenames), "integer")
137
+	callPr <- initializeBigMatrix("callPr-", nr, length(filenames), "integer")
151 138
 	rownames(A) <- rownames(B) <- featureNames(featureData)
152
-	sampleBatches <- splitIndicesByNode(seq(along=filenames))
153
-	if(verbose) message("Processing ", length(filenames), " files.")
154
-	ocLapply(sampleBatches, rsprocessCEL, filenames=filenames,
155
-		 fitMixture=TRUE, A=A, B=B, SKW=SKW, SNR=SNR,
156
-		 mixtureParams=mixtureParams, eps=eps, seed=seed,
157
-		 mixtureSampleSize=mixtureSampleSize, pkgname=pkgname,
158
-		 neededPkgs=c("crlmm", pkgname))
159
-	## Now we initialize a CNSet object, cloning A and B
160
-	if(verbose) message("Cloning A and B matrices to store genotype calls and confidence scores.")
161
-	## The clones will be used to store calls and confidence scores
162
-	open(A)
163
-	open(B)
164
-	##user do
165
-	## options("ffbatchbytes"=XXX) to make this efficient
166
-	call <- clone(A)
167
-	callProbability=clone(B)
168
-	close(A)
169
-	close(B)
139
+	rownames(call) <- rownames(callPr) <- featureNames(featureData)
140
+	if(verbose) message("Instantiating CNSet container")
170 141
 	cnSet <- new("CNSet",
171 142
 		     alleleA=A,
172 143
 		     alleleB=B,
173 144
 		     call=call,
174
-		     callProbability=callProbability,
145
+		     callProbability=callPr,
175 146
 		     featureData=featureData,
176 147
 		     annotation=cdfName,
177 148
 		     batch=batch)
... ...
@@ -190,26 +161,152 @@ genotype <- function(filenames,
190 161
 	cnSet$SNR <- SNR
191 162
 	cnSet$gender <- genderff
192 163
 	stopifnot(validObject(cnSet))
193
-	snp.I <- isSnp(cnSet)
194
-	snp.index <- which(snp.I)
195
-	##pData(cnSet)$SKW <- SKW
196
-	##pData(cnSet)$SNR <- SNR
197
-	np.index <- which(!snp.I)
198
-	if(verbose) message("Normalizing nonpolymorphic markers")
199
-	FUN <- ifelse(is.lds, "cnrma2", "cnrma")
200
-	if(verbose) message("Quantile normalizing nonpolymorphic markers")
201
-	cnrma2(A=A(cnSet),
202
-	       filenames=filenames,
203
-	       row.names=featureNames(cnSet)[np.index],
204
-	       cdfName=cdfName,
205
-	       sns=sampleNames(cnSet),
206
-	       seed=seed,
207
-	       verbose=verbose)
164
+	return(cnSet)
165
+}
166
+
167
+snprmaAffy <- function(cnSet, filenames,
168
+		       mixtureSampleSize=10^5,
169
+		       eps=0.1,
170
+		       seed=1,
171
+		       verbose=TRUE){
172
+	if(verbose) message("Preprocessing ", length(filenames), " files.")
173
+	pkgname <- getCrlmmAnnotationName(annotation(cnSet))
174
+	stopifnot(require(pkgname, character.only=TRUE, quietly=!verbose))
175
+	mixtureParams <- initializeBigMatrix("crlmmMixt-", 4, length(filenames), "double")
176
+	sampleBatches <- splitIndicesByNode(seq(along=filenames))
177
+	ocLapply(sampleBatches, rsprocessCEL, filenames=filenames,
178
+		 fitMixture=TRUE, A=A(cnSet), B=B(cnSet), C=calls(cnSet),
179
+		 D=snpCallProbability(cnSet),
180
+		 SKW=cnSet$SKW, SNR=cnSet$SNR,
181
+		 mixtureParams=mixtureParams, eps=eps, seed=seed,
182
+		 mixtureSampleSize=mixtureSampleSize, pkgname=pkgname,
183
+		 neededPkgs=c("crlmm", pkgname))
184
+	if(verbose) message("Cloning A and B matrices to store genotype calls and confidence scores.")
185
+	return(mixtureParams)
186
+}
187
+genotype <- function(filenames,
188
+		     cdfName,
189
+		     batch,
190
+		     mixtureSampleSize=10^5,
191
+		     eps=0.1,
192
+		     verbose=TRUE,
193
+		     seed=1,
194
+		     sns,
195
+		     probs=rep(1/3, 3),
196
+		     DF=6,
197
+		     SNRMin=5,
198
+		     recallMin=10,
199
+		     recallRegMin=1000,
200
+		     gender=NULL,
201
+		     returnParams=TRUE,
202
+		     badSNP=0.7){
203
+##	is.lds <- ifelse(isPackageLoaded("ff"), TRUE, FALSE)
204
+##	if(!is.lds) stop("this function now requires that the ff package be loaded")
205
+##	if(missing(cdfName)) stop("must specify cdfName")
206
+##	if(!isValidCdfName(cdfName)) stop("cdfName not valid.  see validCdfNames")
207
+##	if(missing(sns)) sns <- basename(filenames)
208
+##	stopifnot(!missing(batch))
209
+##	##---------------------------------------------------------------------------
210
+##	##
211
+##	## from snprma2.  Goal is to initialize A and B with
212
+##	## appropriate dimension for snps+nps
213
+##	##
214
+##	##---------------------------------------------------------------------------
215
+##	if (missing(sns)) sns <- basename(filenames)
216
+##	if (missing(cdfName))
217
+##		cdfName <- read.celfile.header(filenames[1])[["cdfName"]]
218
+##	pkgname <- getCrlmmAnnotationName(cdfName)
219
+##	stopifnot(require(pkgname, character.only=TRUE, quietly=!verbose))
220
+##	if(verbose) message("Loading annotations and mixture model parameters.")
221
+##	obj <- loader("preprocStuff.rda", .crlmmPkgEnv, pkgname)
222
+##	pnsa <- getVarInEnv("pnsa")
223
+##	pnsb <- getVarInEnv("pnsb")
224
+##	gns <- getVarInEnv("gns")
225
+##	rm(list=obj, envir=.crlmmPkgEnv)
226
+##	rm(obj)
227
+##	if(verbose) message("Initializing objects.")
228
+##	mixtureParams <- initializeBigMatrix("crlmmMixt-", 4, length(filenames), "double")
229
+##	SNR <- initializeBigVector("crlmmSNR-", length(filenames), "double")
230
+##	SKW <- initializeBigVector("crlmmSKW-", length(filenames), "double")
231
+##	genderff <- initializeBigVector("gender", length(filenames), "integer")
232
+##	featureData <- getFeatureData(cdfName, copynumber=TRUE)
233
+##	nr <- nrow(featureData); nc <- length(sns)
234
+##	A <- initializeBigMatrix("crlmmA-", nr, length(filenames), "integer")
235
+##	B <- initializeBigMatrix("crlmmB-", nr, length(filenames), "integer")
236
+##	rownames(A) <- rownames(B) <- featureNames(featureData)
237
+##	sampleBatches <- splitIndicesByNode(seq(along=filenames))
238
+##	if(verbose) message("Processing ", length(filenames), " files.")
239
+##	ocLapply(sampleBatches, rsprocessCEL, filenames=filenames,
240
+##		 fitMixture=TRUE, A=A, B=B, SKW=SKW, SNR=SNR,
241
+##		 mixtureParams=mixtureParams, eps=eps, seed=seed,
242
+##		 mixtureSampleSize=mixtureSampleSize, pkgname=pkgname,
243
+##		 neededPkgs=c("crlmm", pkgname))
244
+##	## Now we initialize a CNSet object, cloning A and B
245
+##	if(verbose) message("Cloning A and B matrices to store genotype calls and confidence scores.")
246
+##	## The clones will be used to store calls and confidence scores
247
+##	open(A)
248
+##	open(B)
249
+##	##user do
250
+##	## options("ffbatchbytes"=XXX) to make this efficient
251
+##	call <- clone(A)
252
+##	callProbability=clone(B)
253
+##	close(A)
254
+##	close(B)
255
+##	cnSet <- new("CNSet",
256
+##		     alleleA=A,
257
+##		     alleleB=B,
258
+##		     call=call,
259
+##		     callProbability=callProbability,
260
+##		     featureData=featureData,
261
+##		     annotation=cdfName,
262
+##		     batch=batch)
263
+##	if(!missing(sns)){
264
+##		sampleNames(cnSet) <- sns
265
+##	} else {
266
+##		sampleNames(cnSet) <- basename(filenames)
267
+##	}
268
+##	protocolData <- getProtocolData.Affy(filenames)
269
+##	rownames(pData(protocolData)) <- sns
270
+##	protocolData(cnSet) <- protocolData
271
+##	##pd <- data.frame(matrix(NA, nc, 3), row.names=sns)
272
+##	##colnames(pd)=c("SKW", "SNR", "gender")
273
+##	##phenoData(cnSet) <- new("AnnotatedDataFrame", data=pd)
274
+##	cnSet$SKW <- SKW
275
+##	cnSet$SNR <- SNR
276
+##	cnSet$gender <- genderff
277
+##	save(cnSet, file=file.path(ldPath(), "cnSet.rda"))
278
+##	stopifnot(validObject(cnSet))
279
+	cnSet <- constructAffy(filenames=filenames,
280
+			       cdfName=cdfName,
281
+			       batch=batch, verbose=verbose)
282
+	mixtureParams <- snprmaAffy(cnSet, filenames=filenames,
283
+				    mixtureSampleSize=mixtureSampleSize,
284
+				    eps=eps,
285
+				    seed=seed,
286
+				    verbose=verbose)
287
+	ok <- cnrmaAffy(cnSet=cnSet, filenames=filenames, seed=seed,
288
+			verbose=verbose)
289
+	stopifnot(ok)
290
+	ok <- genotypeAffy(cnSet=cnSet, mixtureParams=mixtureParams,
291
+			   SNRMin=SNRMin, recallMin=recallMin,
292
+			   recallRegMin=recallRegMin,
293
+			   gender=gender,
294
+			   badSNP=badSNP,
295
+			   returnParams=returnParams,
296
+			   verbose=verbose)
297
+	return(cnSet)
298
+}
299
+
300
+genotypeAffy <- function(cnSet, mixtureParams, SNRMin=5, recallMin=10,
301
+			 recallRegMin=1000,
302
+			 gender=NULL, badSNP=0.7, returnParams=TRUE,
303
+			 verbose=TRUE){
304
+	snp.index <- which(isSnp(cnSet))
208 305
 	tmp <- crlmmGT2(A=calls(cnSet),
209 306
 			B=snpCallProbability(cnSet),
210
-			SNR=SNR,
307
+			SNR=cnSet$SNR,
211 308
 			mixtureParams=mixtureParams,
212
-			cdfName=cdfName,
309
+			cdfName=annotation(cnSet),
213 310
 			row.names=NULL,
214 311
 			col.names=sampleNames(cnSet),
215 312
 			SNRMin=SNRMin,
... ...
@@ -224,10 +321,25 @@ genotype <- function(filenames,
224 321
 	open(cnSet$gender)
225 322
 	cnSet$gender[,] <- tmp[["gender"]]
226 323
 	close(cnSet$gender)
227
-	return(cnSet)
324
+	return(TRUE)
325
+}
326
+
327
+cnrmaAffy <- function(cnSet, filenames, seed=1, verbose=TRUE){
328
+	snp.I <- isSnp(cnSet)
329
+	snp.index <- which(snp.I)
330
+	np.index <- which(!snp.I)
331
+	if(verbose) message("Quantile normalizing nonpolymorphic markers")
332
+	cnrma2(A=A(cnSet),
333
+	       filenames=filenames,
334
+	       row.names=featureNames(cnSet)[np.index],
335
+	       cdfName=cdfName,
336
+	       sns=sampleNames(cnSet),
337
+	       seed=seed,
338
+	       verbose=verbose)
339
+	TRUE
228 340
 }
229 341
 
230
-rsprocessCEL <- function(i, filenames, fitMixture, A, B, SKW, SNR,
342
+rsprocessCEL <- function(i, filenames, fitMixture, A, B, C, D, SKW, SNR,
231 343
 			 mixtureParams, eps, seed, mixtureSampleSize,
232 344
 			 pkgname){
233 345
 	obj1 <- loader("preprocStuff.rda", .crlmmPkgEnv, pkgname)
... ...
@@ -254,6 +366,8 @@ rsprocessCEL <- function(i, filenames, fitMixture, A, B, SKW, SNR,
254 366
 
255 367
 	open(A)
256 368
 	open(B)
369
+	open(C)
370
+	open(D)
257 371
 	open(SKW)
258 372
 	open(mixtureParams)
259 373
 	open(SNR)
... ...
@@ -270,6 +384,8 @@ rsprocessCEL <- function(i, filenames, fitMixture, A, B, SKW, SNR,
270 384
 		## RS: add index for row assignment
271 385
 		A[index, k] <- intMedianSummaries(y[aIndex, 1, drop=FALSE], pnsa)
272 386
 		B[index, k] <- intMedianSummaries(y[bIndex, 1, drop=FALSE], pnsb)
387
+		C[index, k] <- intMedianSummaries(y[aIndex, 1, drop=FALSE], pnsa)
388
+		D[index, k] <- intMedianSummaries(y[bIndex, 1, drop=FALSE], pnsb)
273 389
 		rm(y)
274 390
 		if(fitMixture){
275 391
 			S <- (log2(A[idx,k])+log2(B[idx, k]))/2 - SMEDIAN
... ...
@@ -25,7 +25,7 @@ crlmmGT2 <- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL,
25 25
 	}
26 26
 	snpBatches <- splitIndicesByLength(index, ocProbesets())
27 27
 	NR <- length(unlist(snpBatches))
28
-	if(verbose) cat("Calling", NR, "SNPs for recalibration... ")
28
+	if(verbose) message("Calling ", NR, " SNPs for recalibration... ")
29 29
 	NC <- ncol(A)
30 30
 	##
31 31
 	if(verbose) message("Loading annotations.")
... ...
@@ -61,7 +61,6 @@ crlmmGT2 <- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL,
61 61
 	cIndexes <- list(keepIndex,
62 62
 			 keepIndex[which(gender[keepIndex]==2)],
63 63
 			 keepIndex[which(gender[keepIndex]==1)])
64
-	if(verbose) cat("Calling", NR, "SNPs for recalibration... ")
65 64
 	## call C
66 65
 	fIndex <- which(gender==2)
67 66
 	mIndex <- which(gender==1)
... ...
@@ -101,6 +100,7 @@ crlmmGT2 <- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL,
101 100
 		tmp
102 101
 	}
103 102
 	##
103
+	if(verbose) message("Calling process1")
104 104
 	newparamsBatch <- ocLapply(seq(along=snpBatches), process1,
105 105
 				   snpBatches=snpBatches,
106 106
 				   autosomeIndex=autosomeIndex, XIndex=XIndex,
... ...
@@ -1,10 +1,34 @@
1 1
 \name{NEWS}
2 2
 \title{News for Package 'crlmm'}
3 3
 
4
-\section{Changes in version 1.10}{
4
+
5
+\section{Changes in version 1.9}{
5 6
   \subsection{USER VISIBLE CHANGES}{
6 7
     \itemize{
7 8
       \item Using NEWS.Rd
9
+
10
+      \item batch slot in CNSet objects is class 'character'
11
+      (previously, class was 'factor')
12
+
13
+      \item the ff package is required for preprocessing and genotyping
14
+      prior to copy number analyses with the crlmmCopynumber function.
15
+
16
+      \item We have added several vignettes pertaining to copy number
17
+      analyses in the crlmm package: CopyNumberOverview,
18
+      AffymetrixPreprocessCN, IlluminaPreprocessCN, and Infrastructure.
19
+      The 'copynumber' and 'Infrastructure' vignettes are applicable to
20
+      both the Illumina and Affymetrix platforms. The CopyNumberOverview
21
+      vignette gives a brief summary of the available vignettes for copy
22
+      number analyses.
23
+
24
+      \item Exporting function constructInf, preprocessInf, and
25
+      genotypeInf for preprocessing and genotyping Illumina files prior
26
+      to copy number analyses.  The genotype.Illumina function is now a
27
+      wrapper for these functions.
28
+
29
+      \item additional documentation for crlmm is provided in a
30
+      compendium: http://www.biostat.jhsph.edu/~rscharpf/crlmmCompendium/index.html
31
+
8 32
     }
9 33
   }
10 34
 }
... ...
@@ -37,7 +61,7 @@
37 61
  	  elements of this object, including featureNames,
38 62
  	  sampleNames, and '['.
39 63
 	}
40
-	
64
+
41 65
         \item 'CopyNumberSet': contains locus-level estimates of copy
42 66
   	number for SNPs and polymorphic probes.
43 67
 	\enumerate{
... ...
@@ -47,7 +71,7 @@
47 71
 
48 72
 	  \item For nonpolymorphic probes, the total copy number is
49 73
           stored in the 'CA' slot and a NA is recorded for the
50
-          corresponding row in the CB matrix. 
74
+          corresponding row in the CB matrix.
51 75
 
52 76
 	  \item Useful methods: 'copyNumber', 'ellipse', 'points'
53 77
 	}
... ...
@@ -60,4 +84,5 @@
60 84
      'CrlmmSetList' and returns an object of class 'CopyNumberSet'.
61 85
     }
62 86
   }
87
+
63 88
 }