Browse code

Manually removed conflicts with my branch and Matt Ritchie's commit that had a bug fig for v 1.7.6

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@48613 bc3139a8-67e5-0310-9ffc-ced21a209358

Rob Scharp authored on 02/08/2010 08:51:34
Showing5 changed files

... ...
@@ -8,7 +8,6 @@ setMethod("initialize", "CNSetLM", function(.Object, lM=new("list"), ...){
8 8
 	.Object@lM <- lM
9 9
 	.Object <- callNextMethod(.Object, ...)
10 10
 })
11
-
12 11
 setValidity("CNSetLM",
13 12
 	    function(object){
14 13
 		    if(!"batch" %in% varLabels(protocolData(object)))
... ...
@@ -8,4 +8,5 @@ setGeneric("snpIndex", function(object) standardGeneric("snpIndex"))
8 8
 setGeneric("snpNames", function(object) standardGeneric("snpNames"))
9 9
 setGeneric("lM", function(object) standardGeneric("lM"))
10 10
 setGeneric("lM<-", function(object, value) standardGeneric("lM<-"))
11
+setGeneric("totalCopyNumber", function(object,...) standardGeneric("totalCopyNumber"))
11 12
 
... ...
@@ -87,11 +87,7 @@ readIdatFiles <- function(sampleSheet=NULL,
87 87
 	       cat("reading", arrayNames[i], "\t")
88 88
 	       idsG = idsR = G = R = NULL
89 89
 	       cat(paste(sep, fileExt$green, sep=""), "\t")
90
-	       G <- tryCatch(readIDAT(grnidats[i]), error=function(e) NULL)
91
-	       if(is.null(G)) { 
92
-		       if(i==1) stop("Read in of first IDAT file failed - check your data")
93
-     		   else cat("\n"); next()
94
-		   }
90
+	       G = readIDAT(grnidats[i])
95 91
 	       idsG = rownames(G$Quants)
96 92
 	       headerInfo$nProbes[i] = G$nSNPsRead
97 93
 	       headerInfo$Barcode[i] = G$Barcode
... ...
@@ -265,11 +261,7 @@ readIdatFiles2 <- function(sampleSheet=NULL,
265 261
 	       cat("reading", arrayNames[i], "\t")
266 262
 	       idsG = idsR = G = R = NULL
267 263
 	       cat(paste(sep, fileExt$green, sep=""), "\t")
268
-	       G <- tryCatch(readIDAT(grnidats[i]), error=function(e) NULL)
269
-	       if(is.null(G)) { 
270
-		       if(i==1) stop("Read in of first IDAT file failed - check your data")
271
-     		   else cat("\n"); next()
272
-		   }
264
+	       G = readIDAT(grnidats[i])
273 265
 	       idsG = rownames(G$Quants)
274 266
 	       headerInfo$nProbes[i] = G$nSNPsRead
275 267
 	       headerInfo$Barcode[i] = G$Barcode
276 268
new file mode 100644
... ...
@@ -0,0 +1,2 @@
1
+illumina_copynumber.tex
2
+Makefile
... ...
@@ -5,6 +5,8 @@
5 5
 \documentclass{article}
6 6
 \usepackage{graphicx}
7 7
 \usepackage{natbib}
8
+\usepackage[margin=1in]{geometry}
9
+\newcommand{\crlmm}{\Rpackage{crlmm}}
8 10
 \newcommand{\Rfunction}[1]{{\texttt{#1}}}
9 11
 \newcommand{\Rmethod}[1]{{\texttt{#1}}}
10 12
 \newcommand{\Rcode}[1]{{\texttt{#1}}}
... ...
@@ -16,48 +18,115 @@
16 18
 
17 19
 \begin{document}
18 20
 \title{Copy number estimation and genotype calling with \Rpackage{crlmm}}
19
-\date{Oct, 2009}
21
+\date{\today}
20 22
 \author{Rob Scharpf}
21 23
 \maketitle
22 24
 
25
+Allele-specific copy number estimation in the crlmm package is
26
+available for several Illumina platforms.  As described in the
27
+\texttt{copynumber.pdf} vignette, this algorithm works best when there
28
+are a sufficient number of samples such that AA, AB, and BB genotypes
29
+are observed at most loci. For small studies (e.g., fewer than 50
30
+samples), there will be a large number of SNPs that are
31
+monomorphic. For monomorphic SNPs, the estimation problem becomes more
32
+difficult and alternative strategies that estimate the relative total
33
+copy number (as to absolute allele-specific copy number) may be
34
+preferable.  In the following code, we list the platforms for which
35
+annotation packages are available for computations performed by
36
+\crlmm{}. Next we create a directory where output files will be stored
37
+and indicate the directory that contains the IDAT files that will be
38
+used in our analysis.
39
+
23 40
 <<setup, echo=FALSE, results=hide>>=
24
-options(width=60)
41
+options(width=70)
25 42
 options(continue=" ")
26
-options(prompt="R> ")
27 43
 @ 
28 44
 
29 45
 <<>>=
30 46
 library(crlmm)
31 47
 crlmm:::validCdfNames()
32
-outdir <- "/thumper/ctsa/snpmicroarray/rs/data/hapmap/illumina/HumanCNV370-Duo"
48
+outdir <- "/thumper/ctsa/snpmicroarray/rs/ProcessedData/crlmm/trunk/illumina_vignette"
33 49
 datadir <- "/thumper/ctsa/snpmicroarray/illumina/IDATS/370k"
34 50
 cdfName <- "human370v1c"
35 51
 @ 
36 52
 
53
+To perform copy number analysis on the Illumina platform, several
54
+steps are required.  The first step is to read in the IDAT files and
55
+create a container for storing the red and green intensities. These
56
+intensities are quantile normalized in the function
57
+\Rfunction{crlmmIllumina}, and then genotyped using the crlmm
58
+algorithm.  Details on the crlmm genotyping algorithm are described
59
+elsewhere.  It is important to specify \Robject{save.it = TRUE} and
60
+provide output files to store the quantile normalized intensities. We
61
+will make use of the normalized intensities when we estimate copy
62
+number.  The object returned by \Rfunction{crlmmIllumina} in an
63
+instance of the \Robject{SnpSet} class, a container for storing the
64
+genotype calls and the genotype confidence scores.  The genotype
65
+confidence scores are saved as an integer, and can be converted back
66
+to a $[0, 1]$ probability scale by the transformation
67
+$round(-1000*log2(1-p))$.  At this point, one may want to extract the
68
+scan date of the arrays for later use.  The scan dates can be pulled
69
+from the RG object and added to the \Robject{SnpSet} returned by
70
+\Rfunction{crlmmIllumina} as illustrated below.
71
+
37 72
 <<samplesToProcess>>=
38 73
 samplesheet = read.csv(file.path(datadir, "HumanHap370Duo_Sample_Map.csv"), header=TRUE, as.is=TRUE)
39
-## Excluding a few of the problem samples
40 74
 samplesheet <- samplesheet[-c(28:46,61:75,78:79), ]
41 75
 arrayNames <- file.path(datadir, unique(samplesheet[, "SentrixPosition"]))
42 76
 ##Check that files exist
43
-grnfiles = all(file.exists(paste(arrayNames, ".Grn.dat", sep="")))
44
-redfiles = all(file.exists(paste(arrayNames, ".Red.dat", sep="")))
45
-@ 
46
-
47
-Alternatively, arguments to the readIdatFiles can be passed through the
48
-{\tt ...} argument of the \R{} function \Robject{crlmmWrapper}.
49
-
50
-<<wrapper>>=
51
-crlmmWrapper(sampleSheet=samplesheet,
52
-	     arrayNames=arrayNames,
53
-	     arrayInfoColNames=list(barcode=NULL, position="SentrixPosition"), 
54
-	     saveDate=TRUE,
55
-	     cdfName=cdfName,
56
-	     load.it=FALSE,
57
-	     save.it=FALSE,
58
-	     intensityFile=file.path(outdir, "normalizedIntensities.rda"),
59
-	     crlmmFile=file.path(outdir, "snpsetObject.rda"),
60
-	     rgFile=file.path(outdir, "rgFile.rda"))
77
+grnfiles = all(file.exists(paste(arrayNames, "_Grn.idat", sep="")))
78
+redfiles = all(file.exists(paste(arrayNames, "_Red.idat", sep="")))
79
+myfun <- function(outdir, filenames, samplesheet, arrayInfoColNames, cdfName, cnFile, snpFile){
80
+	if(missing(cnFile)) cnFile <- file.path(outdir, "cnFile.rda")
81
+	if(missing(snpFile)) snpFile <- file.path(outdir, "snpFile.rda")
82
+	RG <- readIdatFiles(sampleSheet=samplesheet, 
83
+			    path=dirname(filenames), 
84
+			    arrayInfoColNames=arrayInfoColNames,
85
+			    saveDate=TRUE)	
86
+	crlmmResult <- crlmmIllumina(RG=RG, 
87
+				     cdfName="human370v1c", 
88
+				     sns=pData(RG)$ID, 
89
+				     returnParams=TRUE,
90
+				     cnFile=cnFile,
91
+				     snpFile=snpFile,
92
+				     save.it=TRUE)	
93
+	protocolData(crlmmResult)$ScanDate <- protocolData(RG)$ScanDate
94
+	range(protocolData(crlmmResult)$ScanDate)
95
+	return(crlmmResult)
96
+}
97
+trace(myfun, browser)
98
+crlmmResult <- checkExists("crlmmResult", path=outdir, FUN=myfun,
99
+			   outdir=outdir,
100
+			   filenames=arrayNames,
101
+			   samplesheet=samplesheet,
102
+			   arrayInfoColNames=list(barcode=NULL, position="SentrixPosition"),
103
+			   cdfName=cdfName)
104
+##if(!exists("crlmmResult")){
105
+##	if(!file.exists(file.path(outdir, "crlmmResult.rda"))){
106
+##		RG <- readIdatFiles(samplesheet, 
107
+##				    path=dirname(arrayNames[1]), 
108
+##				    arrayInfoColNames=list(barcode=NULL, position="SentrixPosition"), 
109
+##				    saveDate=TRUE)
110
+##		crlmmResult <- crlmmIllumina(RG=RG, 
111
+##					     cdfName="human370v1c", 
112
+##					     sns=pData(RG)$ID, 
113
+##					     returnParams=TRUE,
114
+##					     cnFile=file.path(outdir, "cnFile.rda"),
115
+##					     snpFile=file.path(outdir, "snpFile.rda"),
116
+##					     save.it=TRUE)
117
+##		protocolData(crlmmResult)$ScanDate <- protocolData(RG)$ScanDate
118
+##		range(protocolData(crlmmResult)$ScanDate)
119
+##		save(crlmmResult, file=file.path(outdir, "crlmmResult.rda"))
120
+##		rm(RG); gc()
121
+##	} else {
122
+##		message("Loading previously saved crlmm results")
123
+##		load(file.path(outdir, "snpFile.rda"))
124
+##		res <- get("res")
125
+##		load(file.path(outdir, "cnFile.rda"))
126
+##		cnAB <- get("cnAB")
127
+##		load(file.path(outdir, "crlmmResult.rda"))
128
+##	}
129
+##}
61 130
 @ 
62 131
 
63 132
 This creates a \Robject{crlmmSetList} object in the \Robject{outdir}
... ...
@@ -94,6 +163,104 @@ load(filename)
94 163
 cn <- copyNumber(crlmmSetList)
95 164
 @ 
96 165
 
166
+The following helper function can facilitate access to the total copy
167
+number.
168
+
169
+<<copyNumberHelper>>=
170
+trace(totalCopyNumber, browser, signature="CNSet")
171
+cn.total2 <- totalCopyNumber(cnSet, i=which(chromosome(cnSet)==1), j=1:20)
172
+@ 
173
+
174
+A few simple visualizations may be helpful at this point. The first
175
+plot is a histogram of the signal to noise ratio of the sample -- an
176
+overall measure of how well the genotype clusters separate.  (This
177
+statistic tends to be much higher for Illumina than for the Affymetrix
178
+platforms.) The second is a visualization of the total copy number
179
+estimates plotted versus physical position on chromosome 1 for the two
180
+samples with the lowest (top) and highest (bottom) signal to noise
181
+ratios.  
182
+
183
+<<snr, fig=TRUE>>=
184
+hist(cnSet$SNR, breaks=15)
185
+@ 
186
+
187
+<<firstSample, fig=TRUE>>=
188
+low.snr <- which(cnSet$SNR == min(cnSet$SNR))
189
+high.snr <- which(cnSet$SNR == max(cnSet$SNR))
190
+x <- position(cnSet)[chromosome(cnSet) == 1]
191
+par(mfrow=c(2,1), las=1, mar=c(0.5, 4, 0.5, 0.5), oma=c(4, 1, 1,1))
192
+for(j in c(low.snr, high.snr)){
193
+	cn <- copyNumber(cnSet)[, j]/100
194
+	cn[cn < 0.05] <- 0.05
195
+	plot(x,
196
+	     cn[chromosome(cnSet) == 1],
197
+	     pch=".", ylab="copy number", xaxt="n", log="y")
198
+}
199
+axis(1, at=pretty(x), labels=pretty(x/1e6))
200
+mtext("Mb", 1, outer=TRUE, line=2)
201
+@ 
202
+
203
+Here's a very simple approach to handle outliers by applying a running
204
+median using a window of size 3.  Following outlier removal, we
205
+suggest applying a wave correction to adjust for more global waves
206
+followed by a segmentation or hidden markov model.
207
+
208
+<<removeOutliers, fig=TRUE>>=
209
+par(mfrow=c(2,1), las=1, mar=c(0.5, 4, 0.5, 0.5), oma=c(4, 1, 1,1))
210
+for(j in c(low.snr, high.snr)){
211
+	x <- position(cnSet)[chromosome(cnSet)==1]
212
+	cn <- copyNumber(cnSet)[, j]/100
213
+	cn[cn < 0.05] <- 0.05
214
+	##cn <- log(cn)
215
+	x <- x[!is.na(cn)]
216
+	cn <- cn[!is.na(cn)]
217
+	y <- as.numeric(runmed(cn, k=3))
218
+	plot(x,
219
+	     y,
220
+	     pch=".", ylab="copy number", xaxt="n", log="y", ylim=c(0.5,5))
221
+	legend("topright", bty="n", legend=paste("SNR =", round(cnSet$SNR[j], 1)))
222
+	abline(h=2, col="grey70")
223
+}
224
+axis(1, at=pretty(x), labels=pretty(x/1e6))
225
+mtext("Mb", 1, outer=TRUE, line=2)
226
+@ 
227
+
228
+%Next we remove some of the waves using \Rpackage{limma}'s function
229
+%\Rfunction{loessFit}.
230
+%<<wavecorrection, fig=TRUE>>=
231
+%require(limma)
232
+%par(mfrow=c(2,2), las=1, mar=c(0.5, 4, 0.5, 0.5), oma=c(4, 1, 1,1))
233
+%for(j in c(low.snr, high.snr)){
234
+%	y <- copyNumber(cnSet)[chromosome(cnSet) == 1, j]/100
235
+%	missing <- is.na(y)
236
+%	x <- x[!missing]
237
+%	y <- y[!missing]
238
+%	dist <- diff(x)
239
+%	index <- c(0, cumsum(log10(dist) > 6))
240
+%	x.split <- split(x, index)
241
+%	y <- as.numeric(runmed(y, k=3))
242
+%	y.split <- split(y, index)
243
+%	y.smooth <- vector("list", length(y.split))
244
+%	for(i in seq_along(y.smooth)){
245
+%		fit <- loessFit(y=y.split[[i]], x= x.split[[i]], span=0.3)
246
+%		y.smooth[[i]] <- 2+fit$residuals 
247
+%	}
248
+%	plot(unlist(x.split),
249
+%	     unlist(y.split),
250
+%	     pch=".", ylab="copy number", xaxt="n", log="y", ylim=c(0.5,5))
251
+%	plot(unlist(x.split),
252
+%	     unlist(y.smooth),
253
+%	     pch=".", ylab="copy number", xaxt="n", log="y", ylim=c(0.5,5), yaxt="n")
254
+%	legend("topright", bty="n", legend=paste("SNR =", round(cnSet$SNR[j], 1)))
255
+%	abline(h=2, col="grey70")	
256
+%	if(j == high.snr){
257
+%		axis(1, at=pretty(x), labels=pretty(x/1e6))
258
+%		mtext("Mb", 1, outer=TRUE, line=2)
259
+%	}
260
+%}
261
+%@ 
262
+
263
+
97 264
 \end{document}
98 265
 
99 266