Browse code

copynumber vignette in inst/scripts uses hapmap data. added man page for computeCopynumber, updated NAMESPACE

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@37686 bc3139a8-67e5-0310-9ffc-ced21a209358

Rob Scharp authored on 03/03/2009 11:24:38
Showing 4 changed files

... ...
@@ -1,7 +1,7 @@
1 1
 Package: crlmm
2 2
 Type: Package
3 3
 Title: Genotype Calling via CRLMM Algorithm
4
-Version: 1.0.41
4
+Version: 1.0.42
5 5
 Date: 2008-12-28
6 6
 Author: Rafael A Irizarry
7 7
 Maintainer: Benilton S Carvalho <bcarvalh@jhsph.edu>
... ...
@@ -1,5 +1,5 @@
1 1
 useDynLib("crlmm", .registration=TRUE)
2
-export("crlmm", "list.celfiles", "computeCnBatch", "cnrma")
2
+export("crlmm", "list.celfiles", "computeCopynumber", "cnrma")
3 3
 exportMethods("list2crlmmSet", "calls", "confs")
4 4
 exportClasses("crlmmSet")
5 5
 import(Biobase)
... ...
@@ -12,17 +12,6 @@ rowMAD <- function(x, y, ...){
12 12
 	return(mad)
13 13
 }
14 14
 
15
-getCnvFid <- function(chromosome, pkg="pd.genomewidesnp.6", max.size=1000){
16
-	##requires the pd.mapping package
17
-	##best to drop this
18
-	require(pkg, character.only=TRUE)
19
-	conn <- db(get(pkg))
20
-	sql <- paste("SELECT featureSetCNV.chrom_start, featureSetCNV.man_fsetid, featureSetCNV.fsetid, fid FROM featureSetCNV, pmfeatureCNV WHERE ",
21
-		     "featureSetCNV.fsetid=pmfeatureCNV.fsetid AND featureSetCNV.chrom IN ('", chromosome, "')", sep="")
22
-	tmp <- dbGetQuery(conn, sql)
23
-	tmp
24
-}
25
-
26 15
 rowCors <- function(x, y, ...){
27 16
 	N <- rowSums(!is.na(x))
28 17
 	x <- suppressWarnings(log2(x))
... ...
@@ -121,16 +110,9 @@ cnrma <- function(filenames, sns, cdfName, seed=1){
121 110
 	}
122 111
 	
123 112
 	##I'm using the pd.mapping package
124
-	map <- list(); j <- 1
125
-	for(chrom in c(1:22, "X")){
126
-		map[[j]] <- getCnvFid(chrom)
127
-		j <- j+1
128
-	}
129
-	map <- lapply(map, function(x) x[order(x[, "chrom_start"]), ])
130
-	map <- do.call("rbind", map)
131
-
132
-	
133
-	fid <- map[, "fid"]
113
+	data(npProbesFid, package=pkgname)
114
+	fid <- npProbesFid
115
+	gc()
134 116
 	set.seed(seed)
135 117
 	idx2 <- sample(length(fid), 10^5) ##for skewness. no need to do everything
136 118
 	SKW <- vector("numeric", length(filenames))
... ...
@@ -141,7 +123,7 @@ cnrma <- function(filenames, sns, cdfName, seed=1){
141 123
 		if (getRversion() > '2.7.0') pb <- txtProgressBar(min=0, max=length(filenames), style=3)
142 124
 	}
143 125
 	##load reference distribution obtained from hapmap
144
-	load("/thumper/ctsa/snpmicroarray/hapmap/processed/affy/1m_reference_cn.rda")
126
+	data(list="1m_reference_cn", pkgname)
145 127
 	for(i in seq(along=filenames)){
146 128
 		y <- as.matrix(read.celfile(filenames[i], intensity.means.only=TRUE)[["INTENSITY"]][["MEAN"]][fid])
147 129
 		x <- log2(y[idx2])
... ...
@@ -155,7 +137,8 @@ cnrma <- function(filenames, sns, cdfName, seed=1){
155 137
 			if (getRversion() > '2.7.0') setTxtProgressBar(pb, i)
156 138
 			else cat(".")
157 139
 	}
158
-	dimnames(NP) <- list(map[, "man_fsetid"], sns)
140
+	dimnames(NP) <- list(names(fid), sns)
141
+	##dimnames(NP) <- list(map[, "man_fsetid"], sns)
159 142
 	res3 <- list(NP=NP, SKW=SKW)
160 143
 	return(res3)
161 144
 }
... ...
@@ -255,17 +238,16 @@ instantiateObjects <- function(calls, NP, plate, envir, chrom){
255 238
 	assign("CB", CB, envir=envir)
256 239
 }
257 240
 
258
-computeCnBatch <- function(A,
259
-			   B,
260
-			   calls,
261
-			   conf,
262
-			   NP,
263
-			   plate,
264
-			   fit.variance=NULL,
265
-			   seed=123,
266
-			   MIN.OBS=3,
267
-			   envir,
268
-			   chrom, P, DF.PRIOR=50, CONF.THR=0.99, trim=0, upperTail=TRUE, ...){
241
+computeCopynumber <- function(A,
242
+			      B,
243
+			      calls,
244
+			      conf,
245
+			      NP,
246
+			      plate,
247
+			      fit.variance=NULL,
248
+			      MIN.OBS=3,
249
+			      envir,
250
+			      chrom, P, DF.PRIOR=50, CONF.THR=0.99, trim=0, upperTail=TRUE, ...){
269 251
 	if(length(ls(envir)) == 0) instantiateObjects(calls, NP, plate, envir, chrom)
270 252
 	##require(genefilter)
271 253
 	require(splines)
... ...
@@ -297,7 +279,7 @@ computeCnBatch <- function(A,
297 279
 	message("\nAllele specific copy number")	
298 280
 	for(p in P){
299 281
 		cat(".")
300
-		copynumber(plateIndex=p,
282
+		polymorphic(plateIndex=p,
301 283
 			   A=A[, plate==uplate[p]],
302 284
 			   B=B[, plate==uplate[p]],
303 285
 			   envir=envir)
... ...
@@ -872,7 +854,7 @@ coefs <- function(plateIndex, conf, MIN.OBS=3, envir, CONF.THR=0.99){
872 854
 	assign("phiB.se", phiB.se, envir)
873 855
 }
874 856
 
875
-copynumber <- function(plateIndex, A, B, envir){
857
+polymorphic <- function(plateIndex, A, B, envir){
876 858
 	p <- plateIndex
877 859
 	plates.completed <- get("plates.completed", envir)
878 860
 	if(!plates.completed[p]) return()
... ...
@@ -1,4 +1,4 @@
1
-%\VignetteIndexEntry{crlmm Vignette - Copy Number}
1
+%\VignetteIndexEntry{crlmm copy number Vignette}
2 2
 %\VignetteDepends{crlmm, genomewidesnp6Crlmm}
3 3
 %\VignetteKeywords{crlmm, SNP 6}
4 4
 %\VignettePackage{crlmm}
... ...
@@ -27,21 +27,14 @@ options(prompt="R> ")
27 27
 
28 28
 \section{Estimating copy number}
29 29
 
30
-General requirements: in addition to the \R{} packages indicated
31
-below, processing a large number of samples requires a high-end
32
-computer with a large amount of RAM. Currently, the maximum number of
33
-samples that can be genotyped at once is approximately 2000 and this
34
-requires roughly 32G of RAM.
35
-
36
-\paragraph{Suggested work flow.} I typically submit code for
37
-preprocessing and genotyping as a batch job to a computing cluster.
38
-When the preprocessing is complete, I pick a chromosome and work
39
-interactively from the cluster to calculate copy number and make some
40
-of the suggested diagnostic plots.
30
+At present, software for copy number estimation is provided only for
31
+the Affymetrix 6.0 platform.  This vignette estimates copy number for
32
+the HapMap samples.
41 33
 
42 34
 \subsection{Preprocess and genotype the samples}
43 35
 
44
-First, load the required libraries.
36
+See the crlmm vignette for additional details on
37
+preprocessing/genotyping.  
45 38
 
46 39
 <<requiredPackages>>=
47 40
 library(crlmm)
... ...
@@ -49,41 +42,49 @@ library(genomewidesnp6Crlmm)
49 42
 library(ellipse)
50 43
 @ 
51 44
 
52
-Specify an output directory and provide the complete path for the CEL
53
-file names.
54
-
55
-<<>>=
56
-outdir <- "/thumper/ctsa/snpmicroarray/rs/data/gain/bipolar/GRU-EA"
57
-datadir <- "/thumper/ctsa/snpmicroarray/GAIN/Bipolar/GRU-EA"
58
-fns <- list.files(datadir, pattern=".CEL", full.names=TRUE)
45
+<<celfiles>>=
46
+celFiles <- list.celfiles("/thumper/ctsa/snpmicroarray/hapmap/raw/affy/1m", full.names=TRUE, pattern=".CEL")
47
+outdir <- "/thumper/ctsa/snpmicroarray/rs/data/hapmap/1m/affy"
59 48
 @ 
60 49
 
61
-Genotype the samples with CRLMM. Submit this job to the cluster and save the
62
-output to \Robject{outdir}.
50
+Specify an output directory for storing the quantile-normalized
51
+intensities from crlmm.
63 52
 
64
-<<genotype, eval=FALSE>>=
65
-res2 <- crlmm(filenames=fns,  save.it=TRUE, intensityFile=file.path(outdir, "intensities.rda"))
66
-save(res2, file=file.path(outdir, "res2.rda"))
53
+<<genotype>>=
54
+if (!exists("crlmmResult")) {
55
+	if (file.exists(file.path(outdir, "crlmmResult.rda"))){
56
+		load(file.path(outdir, "intensities.rda"))
57
+		load(file.path(outdir, "crlmmResult.rda"))
58
+	}
59
+	else {
60
+		crlmmResult <- crlmm(celFiles, save.it=TRUE, intensityFile=file.path(outdir, "intensities.rda"))
61
+		save(crlmmResult, file=file.path(outdir, "crlmmResult.rda"))
62
+	}
63
+}
67 64
 @ 
68 65
 
69
-Quantile normalize the nonpolymorphic probes and save the output:
66
+Quantile normalize the nonpolymorphic probes and save the output.
70 67
 
71
-<<quantileNormalizeCnProbes, eval=FALSE>>=
72
-res3 <- cnrma(fns)
73
-save(res3, file=file.path(outdir, "res3.rda"))
68
+<<cnrma>>=
69
+if(!exists("cnrmaResult")){
70
+	if(file.exists(file.path(outdir, "cnrmaResult.rda"))) load(file.path(outdir, "cnrmaResult.rda"))
71
+	else {
72
+		cnrmaResult <- cnrma(celFiles)
73
+		save(cnrmaResult, file=file.path(outdir, "cnrmaResult.rda"))
74
+	}
75
+}
74 76
 @ 
75 77
 
76 78
 \subsection{Copy number}
77 79
 
78 80
 Copy number can be assessed one chromosome at a time.  Here we specify
79
-chromosome 15 and and load a list of indices used to subset the files
80
-saved in the previous section. The first element in the list
81
-correspond to indices of polymorphic probes on chromosome 15; the
82
-second element corresponds to indices of nonpolymorphic probes on
83
-chromosome 15.
81
+chromosome 15 and and load a list of indices to subset the data. The
82
+first element in the list correspond to indices of polymorphic probes
83
+on chromosome 15; the second element corresponds to indices of
84
+nonpolymorphic probes on chromosome 15.
84 85
 
85 86
 <<chromosomeIndex>>=
86
-CHR <- 15
87
+CHR <- 21
87 88
 CHR_INDEX <- paste(CHR, "index", sep="")
88 89
 data(list=CHR_INDEX, package="genomewidesnp6Crlmm")
89 90
 str(index)
... ...
@@ -105,39 +106,14 @@ number.  Specifically, we require 6 items:
105 106
   
106 107
   These items are extracted as follows:
107 108
 
108
-<<eval=FALSE>>=
109
-load(file.path(outdir, "intensities.rda"))
110
-load(file.path(outdir, "res2.rda"))
111
-load(file.path(outdir, "res3.rda"))
112
-@ 
113
-
114
-Depending on the number of samples, the above objects can be large and
115
-take a while to load.  For the figures in this vignette, I am loading
116
-only 5000 SNP-level summaries from chromosome 22 (28MB) and the
117
-quantile-normalized intensities for 5000 nonpolymorphic probes on
118
-chromosome 22 (8 MB total).
119
-
120
-<<loadTmpFiles, eval=FALSE, echo=FALSE>>=
121
-load("/thumper/ctsa/snpmicroarray/rs/data/gain/bipolar/tmp.rda")
122
-assign("res", tmp, envir=.GlobalEnv)
123
-rm(tmp); gc()
124
-load("/thumper/ctsa/snpmicroarray/rs/data/gain/bipolar/tmpcn.rda")
125
-A <- res$A
126
-B <- res$B
127
-calls <- res$calls
128
-conf <- res$conf
129
-SNR <- res$SNR
130
-NP <- res3$NP
131
-@ 
132
-
133 109
 <<snpAndCnSummaries, eval=FALSE>>=
134 110
 A <- res$A
135 111
 B <- res$B
136
-calls <- res2$calls
137
-conf <- res2$conf
138
-SNR <- res2$SNR
139
-NP <- res3$NP
140
-rm(res, res2, res3)
112
+calls <- crlmmResult$calls
113
+conf <- crlmmResult$conf
114
+SNR <- crlmmResult$SNR
115
+NP <- cnrmaResult$NP
116
+rm(res, crlmmResult, cnrmaResult)
141 117
 gc()
142 118
 @ 
143 119
 
... ...
@@ -157,25 +133,22 @@ B <- B[index[[1]], SNR > 5]
157 133
 calls <- calls[index[[1]], SNR > 5]
158 134
 conf <- conf[index[[1]], SNR > 5]
159 135
 NP <- NP[index[[2]], SNR > 5]
160
-rm(res, res2, res3); gc()
161 136
 @ 
162 137
 
163
-We want to adjust for batch effects.  In our experience, the chemistry
164
-plate is a good surrogate for batch, although this should be assessed
165
-on a study by study basis.  For samples processed by Broad, the plate
166
-is indicated by a capitalized five-letter word and is part of the
167
-sample name.  It is important that the plate (batch) variable is
168
-ordered the same as filenames in the preprocessed data.  For instance,
169
-to indicate plate in this example one can use the command
138
+Batch effects can be very large in the quantile-normalized
139
+intensities.  Often the chemistry plate is a good surrogate for batch
140
+effects, although it can also be the lab, etc.  Here we define batch
141
+by the chemistry plate. For the HapMap data, plate is often confounded
142
+with ancestry.
170 143
 
171 144
 <<specifyBatch>>=
172 145
 sns <- colnames(calls)
173 146
 sns[1]
174
-plates <- sapply(sns, function(x) strsplit(x, "_")[[1]][2])
175
-table(plates)
147
+plate <- substr(basename(sns), 13, 13)
148
+table(plate)
176 149
 @ 
177 150
 
178
-We are now ready to estimate the copy number for each batch.  In the
151
+We are now ready to estimate copy number for each batch.  In the
179 152
 current version of this package, one specifies an environment to which
180 153
 intermediate \R{} objects for copy number estimation are
181 154
 stored. Allele-specific estimates of copy number are also stored in
... ...
@@ -183,13 +156,13 @@ this environment.
183 156
 
184 157
 <<copyNumberByBatch, eval=FALSE>>=
185 158
 e1 <- new.env()
186
-computeCnBatch(A=A,
187
-	       B=B,
188
-	       calls=calls,
189
-	       conf=conf,
190
-	       NP=NP,
191
-	       plate=plates,
192
-	       envir=e1, chrom=CHR, DF.PRIOR=75)
159
+computeCopynumber(A=A,
160
+		  B=B,
161
+		  calls=calls,
162
+		  conf=conf,
163
+		  NP=NP,
164
+		  plate=plates,
165
+		  envir=e1, chrom=CHR, DF.PRIOR=75)
193 166
 @ 
194 167
 
195 168
 The DF.PRIOR indicates how much we will shrink SNP-specific estimates
... ...
@@ -290,8 +263,8 @@ legend("topright", lwd=3, col=c("black", "purple"), legend=c("2 copies", "3 copi
290 263
 \end{figure}
291 264
 
292 265
 Here is one way to identify a SNP with a large plate effect -- look
293
-for shifts in the prediction regions (here I'm lucking for large
294
-shifts in the A direction). 
266
+for shifts in the prediction regions (here I'm looking for large
267
+shifts in the A direction).
295 268
 
296 269
 <<shifts>>=
297 270
 shiftA <- nuA+2*phiA