git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@48613 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -8,4 +8,5 @@ setGeneric("snpIndex", function(object) standardGeneric("snpIndex")) |
8 | 8 |
setGeneric("snpNames", function(object) standardGeneric("snpNames")) |
9 | 9 |
setGeneric("lM", function(object) standardGeneric("lM")) |
10 | 10 |
setGeneric("lM<-", function(object, value) standardGeneric("lM<-")) |
11 |
+setGeneric("totalCopyNumber", function(object,...) standardGeneric("totalCopyNumber")) |
|
11 | 12 |
|
... | ... |
@@ -87,11 +87,7 @@ readIdatFiles <- function(sampleSheet=NULL, |
87 | 87 |
cat("reading", arrayNames[i], "\t") |
88 | 88 |
idsG = idsR = G = R = NULL |
89 | 89 |
cat(paste(sep, fileExt$green, sep=""), "\t") |
90 |
- G <- tryCatch(readIDAT(grnidats[i]), error=function(e) NULL) |
|
91 |
- if(is.null(G)) { |
|
92 |
- if(i==1) stop("Read in of first IDAT file failed - check your data") |
|
93 |
- else cat("\n"); next() |
|
94 |
- } |
|
90 |
+ G = readIDAT(grnidats[i]) |
|
95 | 91 |
idsG = rownames(G$Quants) |
96 | 92 |
headerInfo$nProbes[i] = G$nSNPsRead |
97 | 93 |
headerInfo$Barcode[i] = G$Barcode |
... | ... |
@@ -265,11 +261,7 @@ readIdatFiles2 <- function(sampleSheet=NULL, |
265 | 261 |
cat("reading", arrayNames[i], "\t") |
266 | 262 |
idsG = idsR = G = R = NULL |
267 | 263 |
cat(paste(sep, fileExt$green, sep=""), "\t") |
268 |
- G <- tryCatch(readIDAT(grnidats[i]), error=function(e) NULL) |
|
269 |
- if(is.null(G)) { |
|
270 |
- if(i==1) stop("Read in of first IDAT file failed - check your data") |
|
271 |
- else cat("\n"); next() |
|
272 |
- } |
|
264 |
+ G = readIDAT(grnidats[i]) |
|
273 | 265 |
idsG = rownames(G$Quants) |
274 | 266 |
headerInfo$nProbes[i] = G$nSNPsRead |
275 | 267 |
headerInfo$Barcode[i] = G$Barcode |
... | ... |
@@ -5,6 +5,8 @@ |
5 | 5 |
\documentclass{article} |
6 | 6 |
\usepackage{graphicx} |
7 | 7 |
\usepackage{natbib} |
8 |
+\usepackage[margin=1in]{geometry} |
|
9 |
+\newcommand{\crlmm}{\Rpackage{crlmm}} |
|
8 | 10 |
\newcommand{\Rfunction}[1]{{\texttt{#1}}} |
9 | 11 |
\newcommand{\Rmethod}[1]{{\texttt{#1}}} |
10 | 12 |
\newcommand{\Rcode}[1]{{\texttt{#1}}} |
... | ... |
@@ -16,48 +18,115 @@ |
16 | 18 |
|
17 | 19 |
\begin{document} |
18 | 20 |
\title{Copy number estimation and genotype calling with \Rpackage{crlmm}} |
19 |
-\date{Oct, 2009} |
|
21 |
+\date{\today} |
|
20 | 22 |
\author{Rob Scharpf} |
21 | 23 |
\maketitle |
22 | 24 |
|
25 |
+Allele-specific copy number estimation in the crlmm package is |
|
26 |
+available for several Illumina platforms. As described in the |
|
27 |
+\texttt{copynumber.pdf} vignette, this algorithm works best when there |
|
28 |
+are a sufficient number of samples such that AA, AB, and BB genotypes |
|
29 |
+are observed at most loci. For small studies (e.g., fewer than 50 |
|
30 |
+samples), there will be a large number of SNPs that are |
|
31 |
+monomorphic. For monomorphic SNPs, the estimation problem becomes more |
|
32 |
+difficult and alternative strategies that estimate the relative total |
|
33 |
+copy number (as to absolute allele-specific copy number) may be |
|
34 |
+preferable. In the following code, we list the platforms for which |
|
35 |
+annotation packages are available for computations performed by |
|
36 |
+\crlmm{}. Next we create a directory where output files will be stored |
|
37 |
+and indicate the directory that contains the IDAT files that will be |
|
38 |
+used in our analysis. |
|
39 |
+ |
|
23 | 40 |
<<setup, echo=FALSE, results=hide>>= |
24 |
-options(width=60) |
|
41 |
+options(width=70) |
|
25 | 42 |
options(continue=" ") |
26 |
-options(prompt="R> ") |
|
27 | 43 |
@ |
28 | 44 |
|
29 | 45 |
<<>>= |
30 | 46 |
library(crlmm) |
31 | 47 |
crlmm:::validCdfNames() |
32 |
-outdir <- "/thumper/ctsa/snpmicroarray/rs/data/hapmap/illumina/HumanCNV370-Duo" |
|
48 |
+outdir <- "/thumper/ctsa/snpmicroarray/rs/ProcessedData/crlmm/trunk/illumina_vignette" |
|
33 | 49 |
datadir <- "/thumper/ctsa/snpmicroarray/illumina/IDATS/370k" |
34 | 50 |
cdfName <- "human370v1c" |
35 | 51 |
@ |
36 | 52 |
|
53 |
+To perform copy number analysis on the Illumina platform, several |
|
54 |
+steps are required. The first step is to read in the IDAT files and |
|
55 |
+create a container for storing the red and green intensities. These |
|
56 |
+intensities are quantile normalized in the function |
|
57 |
+\Rfunction{crlmmIllumina}, and then genotyped using the crlmm |
|
58 |
+algorithm. Details on the crlmm genotyping algorithm are described |
|
59 |
+elsewhere. It is important to specify \Robject{save.it = TRUE} and |
|
60 |
+provide output files to store the quantile normalized intensities. We |
|
61 |
+will make use of the normalized intensities when we estimate copy |
|
62 |
+number. The object returned by \Rfunction{crlmmIllumina} in an |
|
63 |
+instance of the \Robject{SnpSet} class, a container for storing the |
|
64 |
+genotype calls and the genotype confidence scores. The genotype |
|
65 |
+confidence scores are saved as an integer, and can be converted back |
|
66 |
+to a $[0, 1]$ probability scale by the transformation |
|
67 |
+$round(-1000*log2(1-p))$. At this point, one may want to extract the |
|
68 |
+scan date of the arrays for later use. The scan dates can be pulled |
|
69 |
+from the RG object and added to the \Robject{SnpSet} returned by |
|
70 |
+\Rfunction{crlmmIllumina} as illustrated below. |
|
71 |
+ |
|
37 | 72 |
<<samplesToProcess>>= |
38 | 73 |
samplesheet = read.csv(file.path(datadir, "HumanHap370Duo_Sample_Map.csv"), header=TRUE, as.is=TRUE) |
39 |
-## Excluding a few of the problem samples |
|
40 | 74 |
samplesheet <- samplesheet[-c(28:46,61:75,78:79), ] |
41 | 75 |
arrayNames <- file.path(datadir, unique(samplesheet[, "SentrixPosition"])) |
42 | 76 |
##Check that files exist |
43 |
-grnfiles = all(file.exists(paste(arrayNames, ".Grn.dat", sep=""))) |
|
44 |
-redfiles = all(file.exists(paste(arrayNames, ".Red.dat", sep=""))) |
|
45 |
-@ |
|
46 |
- |
|
47 |
-Alternatively, arguments to the readIdatFiles can be passed through the |
|
48 |
-{\tt ...} argument of the \R{} function \Robject{crlmmWrapper}. |
|
49 |
- |
|
50 |
-<<wrapper>>= |
|
51 |
-crlmmWrapper(sampleSheet=samplesheet, |
|
52 |
- arrayNames=arrayNames, |
|
53 |
- arrayInfoColNames=list(barcode=NULL, position="SentrixPosition"), |
|
54 |
- saveDate=TRUE, |
|
55 |
- cdfName=cdfName, |
|
56 |
- load.it=FALSE, |
|
57 |
- save.it=FALSE, |
|
58 |
- intensityFile=file.path(outdir, "normalizedIntensities.rda"), |
|
59 |
- crlmmFile=file.path(outdir, "snpsetObject.rda"), |
|
60 |
- rgFile=file.path(outdir, "rgFile.rda")) |
|
77 |
+grnfiles = all(file.exists(paste(arrayNames, "_Grn.idat", sep=""))) |
|
78 |
+redfiles = all(file.exists(paste(arrayNames, "_Red.idat", sep=""))) |
|
79 |
+myfun <- function(outdir, filenames, samplesheet, arrayInfoColNames, cdfName, cnFile, snpFile){ |
|
80 |
+ if(missing(cnFile)) cnFile <- file.path(outdir, "cnFile.rda") |
|
81 |
+ if(missing(snpFile)) snpFile <- file.path(outdir, "snpFile.rda") |
|
82 |
+ RG <- readIdatFiles(sampleSheet=samplesheet, |
|
83 |
+ path=dirname(filenames), |
|
84 |
+ arrayInfoColNames=arrayInfoColNames, |
|
85 |
+ saveDate=TRUE) |
|
86 |
+ crlmmResult <- crlmmIllumina(RG=RG, |
|
87 |
+ cdfName="human370v1c", |
|
88 |
+ sns=pData(RG)$ID, |
|
89 |
+ returnParams=TRUE, |
|
90 |
+ cnFile=cnFile, |
|
91 |
+ snpFile=snpFile, |
|
92 |
+ save.it=TRUE) |
|
93 |
+ protocolData(crlmmResult)$ScanDate <- protocolData(RG)$ScanDate |
|
94 |
+ range(protocolData(crlmmResult)$ScanDate) |
|
95 |
+ return(crlmmResult) |
|
96 |
+} |
|
97 |
+trace(myfun, browser) |
|
98 |
+crlmmResult <- checkExists("crlmmResult", path=outdir, FUN=myfun, |
|
99 |
+ outdir=outdir, |
|
100 |
+ filenames=arrayNames, |
|
101 |
+ samplesheet=samplesheet, |
|
102 |
+ arrayInfoColNames=list(barcode=NULL, position="SentrixPosition"), |
|
103 |
+ cdfName=cdfName) |
|
104 |
+##if(!exists("crlmmResult")){ |
|
105 |
+## if(!file.exists(file.path(outdir, "crlmmResult.rda"))){ |
|
106 |
+## RG <- readIdatFiles(samplesheet, |
|
107 |
+## path=dirname(arrayNames[1]), |
|
108 |
+## arrayInfoColNames=list(barcode=NULL, position="SentrixPosition"), |
|
109 |
+## saveDate=TRUE) |
|
110 |
+## crlmmResult <- crlmmIllumina(RG=RG, |
|
111 |
+## cdfName="human370v1c", |
|
112 |
+## sns=pData(RG)$ID, |
|
113 |
+## returnParams=TRUE, |
|
114 |
+## cnFile=file.path(outdir, "cnFile.rda"), |
|
115 |
+## snpFile=file.path(outdir, "snpFile.rda"), |
|
116 |
+## save.it=TRUE) |
|
117 |
+## protocolData(crlmmResult)$ScanDate <- protocolData(RG)$ScanDate |
|
118 |
+## range(protocolData(crlmmResult)$ScanDate) |
|
119 |
+## save(crlmmResult, file=file.path(outdir, "crlmmResult.rda")) |
|
120 |
+## rm(RG); gc() |
|
121 |
+## } else { |
|
122 |
+## message("Loading previously saved crlmm results") |
|
123 |
+## load(file.path(outdir, "snpFile.rda")) |
|
124 |
+## res <- get("res") |
|
125 |
+## load(file.path(outdir, "cnFile.rda")) |
|
126 |
+## cnAB <- get("cnAB") |
|
127 |
+## load(file.path(outdir, "crlmmResult.rda")) |
|
128 |
+## } |
|
129 |
+##} |
|
61 | 130 |
@ |
62 | 131 |
|
63 | 132 |
This creates a \Robject{crlmmSetList} object in the \Robject{outdir} |
... | ... |
@@ -94,6 +163,104 @@ load(filename) |
94 | 163 |
cn <- copyNumber(crlmmSetList) |
95 | 164 |
@ |
96 | 165 |
|
166 |
+The following helper function can facilitate access to the total copy |
|
167 |
+number. |
|
168 |
+ |
|
169 |
+<<copyNumberHelper>>= |
|
170 |
+trace(totalCopyNumber, browser, signature="CNSet") |
|
171 |
+cn.total2 <- totalCopyNumber(cnSet, i=which(chromosome(cnSet)==1), j=1:20) |
|
172 |
+@ |
|
173 |
+ |
|
174 |
+A few simple visualizations may be helpful at this point. The first |
|
175 |
+plot is a histogram of the signal to noise ratio of the sample -- an |
|
176 |
+overall measure of how well the genotype clusters separate. (This |
|
177 |
+statistic tends to be much higher for Illumina than for the Affymetrix |
|
178 |
+platforms.) The second is a visualization of the total copy number |
|
179 |
+estimates plotted versus physical position on chromosome 1 for the two |
|
180 |
+samples with the lowest (top) and highest (bottom) signal to noise |
|
181 |
+ratios. |
|
182 |
+ |
|
183 |
+<<snr, fig=TRUE>>= |
|
184 |
+hist(cnSet$SNR, breaks=15) |
|
185 |
+@ |
|
186 |
+ |
|
187 |
+<<firstSample, fig=TRUE>>= |
|
188 |
+low.snr <- which(cnSet$SNR == min(cnSet$SNR)) |
|
189 |
+high.snr <- which(cnSet$SNR == max(cnSet$SNR)) |
|
190 |
+x <- position(cnSet)[chromosome(cnSet) == 1] |
|
191 |
+par(mfrow=c(2,1), las=1, mar=c(0.5, 4, 0.5, 0.5), oma=c(4, 1, 1,1)) |
|
192 |
+for(j in c(low.snr, high.snr)){ |
|
193 |
+ cn <- copyNumber(cnSet)[, j]/100 |
|
194 |
+ cn[cn < 0.05] <- 0.05 |
|
195 |
+ plot(x, |
|
196 |
+ cn[chromosome(cnSet) == 1], |
|
197 |
+ pch=".", ylab="copy number", xaxt="n", log="y") |
|
198 |
+} |
|
199 |
+axis(1, at=pretty(x), labels=pretty(x/1e6)) |
|
200 |
+mtext("Mb", 1, outer=TRUE, line=2) |
|
201 |
+@ |
|
202 |
+ |
|
203 |
+Here's a very simple approach to handle outliers by applying a running |
|
204 |
+median using a window of size 3. Following outlier removal, we |
|
205 |
+suggest applying a wave correction to adjust for more global waves |
|
206 |
+followed by a segmentation or hidden markov model. |
|
207 |
+ |
|
208 |
+<<removeOutliers, fig=TRUE>>= |
|
209 |
+par(mfrow=c(2,1), las=1, mar=c(0.5, 4, 0.5, 0.5), oma=c(4, 1, 1,1)) |
|
210 |
+for(j in c(low.snr, high.snr)){ |
|
211 |
+ x <- position(cnSet)[chromosome(cnSet)==1] |
|
212 |
+ cn <- copyNumber(cnSet)[, j]/100 |
|
213 |
+ cn[cn < 0.05] <- 0.05 |
|
214 |
+ ##cn <- log(cn) |
|
215 |
+ x <- x[!is.na(cn)] |
|
216 |
+ cn <- cn[!is.na(cn)] |
|
217 |
+ y <- as.numeric(runmed(cn, k=3)) |
|
218 |
+ plot(x, |
|
219 |
+ y, |
|
220 |
+ pch=".", ylab="copy number", xaxt="n", log="y", ylim=c(0.5,5)) |
|
221 |
+ legend("topright", bty="n", legend=paste("SNR =", round(cnSet$SNR[j], 1))) |
|
222 |
+ abline(h=2, col="grey70") |
|
223 |
+} |
|
224 |
+axis(1, at=pretty(x), labels=pretty(x/1e6)) |
|
225 |
+mtext("Mb", 1, outer=TRUE, line=2) |
|
226 |
+@ |
|
227 |
+ |
|
228 |
+%Next we remove some of the waves using \Rpackage{limma}'s function |
|
229 |
+%\Rfunction{loessFit}. |
|
230 |
+%<<wavecorrection, fig=TRUE>>= |
|
231 |
+%require(limma) |
|
232 |
+%par(mfrow=c(2,2), las=1, mar=c(0.5, 4, 0.5, 0.5), oma=c(4, 1, 1,1)) |
|
233 |
+%for(j in c(low.snr, high.snr)){ |
|
234 |
+% y <- copyNumber(cnSet)[chromosome(cnSet) == 1, j]/100 |
|
235 |
+% missing <- is.na(y) |
|
236 |
+% x <- x[!missing] |
|
237 |
+% y <- y[!missing] |
|
238 |
+% dist <- diff(x) |
|
239 |
+% index <- c(0, cumsum(log10(dist) > 6)) |
|
240 |
+% x.split <- split(x, index) |
|
241 |
+% y <- as.numeric(runmed(y, k=3)) |
|
242 |
+% y.split <- split(y, index) |
|
243 |
+% y.smooth <- vector("list", length(y.split)) |
|
244 |
+% for(i in seq_along(y.smooth)){ |
|
245 |
+% fit <- loessFit(y=y.split[[i]], x= x.split[[i]], span=0.3) |
|
246 |
+% y.smooth[[i]] <- 2+fit$residuals |
|
247 |
+% } |
|
248 |
+% plot(unlist(x.split), |
|
249 |
+% unlist(y.split), |
|
250 |
+% pch=".", ylab="copy number", xaxt="n", log="y", ylim=c(0.5,5)) |
|
251 |
+% plot(unlist(x.split), |
|
252 |
+% unlist(y.smooth), |
|
253 |
+% pch=".", ylab="copy number", xaxt="n", log="y", ylim=c(0.5,5), yaxt="n") |
|
254 |
+% legend("topright", bty="n", legend=paste("SNR =", round(cnSet$SNR[j], 1))) |
|
255 |
+% abline(h=2, col="grey70") |
|
256 |
+% if(j == high.snr){ |
|
257 |
+% axis(1, at=pretty(x), labels=pretty(x/1e6)) |
|
258 |
+% mtext("Mb", 1, outer=TRUE, line=2) |
|
259 |
+% } |
|
260 |
+%} |
|
261 |
+%@ |
|
262 |
+ |
|
263 |
+ |
|
97 | 264 |
\end{document} |
98 | 265 |
|
99 | 266 |
|