... | ... |
@@ -48,8 +48,9 @@ information used by CRLMM is stored in the \Rpackage{human370v1cCrlmm} package. |
48 | 48 |
The required packages can be installed in the usual way using the \Rfunction{biocLite} function. |
49 | 49 |
|
50 | 50 |
<<echo=TRUE, results=hide, eval=FALSE>>= |
51 |
-source("http://www.bioconductor.org/biocLite.R") |
|
52 |
-biocLite(c("crlmm", "hapmap370k", "human370v1cCrlmm")) |
|
51 |
+if (!requireNamespace("BiocManager", quietly=TRUE)) |
|
52 |
+ install.packages("BiocManager") |
|
53 |
+BiocManager::install(c("crlmm", "hapmap370k", "human370v1cCrlmm")) |
|
53 | 54 |
@ |
54 | 55 |
|
55 | 56 |
\section{Reading in data} |
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@117087 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -62,8 +62,8 @@ options(width=70) |
62 | 62 |
@ |
63 | 63 |
|
64 | 64 |
<<read, results=hide, eval=TRUE>>= |
65 |
-library(Biobase) |
|
66 | 65 |
library(crlmm) |
66 |
+library(ff) |
|
67 | 67 |
library(hapmap370k) |
68 | 68 |
|
69 | 69 |
data.dir = system.file("idatFiles", package="hapmap370k") |
... | ... |
@@ -73,73 +73,31 @@ samples = read.csv(file.path(data.dir, "samples370k.csv"), as.is=TRUE) |
73 | 73 |
samples[1:5,] |
74 | 74 |
@ |
75 | 75 |
|
76 |
-<<read2, results=hide, cache=TRUE>>= |
|
77 |
-# Read in .idats using sampleSheet information |
|
78 |
-RG = readIdatFiles(samples, path=data.dir, |
|
79 |
-arrayInfoColNames=list(barcode=NULL,position="SentrixPosition"),saveDate=TRUE) |
|
80 |
-@ |
|
81 |
- |
|
82 |
-Reading in this data takes approximately 100 seconds and peak memory usage |
|
83 |
-was 0.8 GB of RAM on our linux system. |
|
84 |
-If memory is limiting, load the \Rpackage{ff} package and run the same command. |
|
85 |
-When this package is available, the objects are stored using disk rather then RAM. |
|
86 |
-The \Robject{RG} object is an \Rclass{NChannelSet} which stores the |
|
87 |
-Red and Green intensities, the number of beads and standard errors for |
|
88 |
-each bead-type. |
|
89 |
-The scanning date of each array is stored in \Robject{protocolData}. |
|
90 |
- |
|
91 |
-<<explore>>= |
|
92 |
-class(RG) |
|
93 |
-dim(RG) |
|
94 |
-slotNames(RG) |
|
95 |
-channelNames(RG) |
|
96 |
-exprs(channel(RG, "R"))[1:5,1:5] |
|
97 |
-exprs(channel(RG, "G"))[1:5,1:5] |
|
98 |
-pd = pData(RG) |
|
99 |
-pd[1:5,] |
|
100 |
- |
|
101 |
-scandatetime = strptime(protocolData(RG)[["ScanDate"]], "%m/%d/%Y %H:%M:%S %p") |
|
102 |
-datescanned = substr(scandatetime, 1, 10) |
|
103 |
-scanbatch = factor(datescanned) |
|
104 |
-levels(scanbatch) = 1:16 |
|
105 |
-scanbatch = as.numeric(scanbatch) |
|
106 |
-@ |
|
107 |
- |
|
108 |
-If GenCall output is available instead of idat files, the function \Rfunction{readGenCallOutput} can be |
|
109 |
-used to read in the data. |
|
110 |
-This function assumes the GenCall output is formatted to have samples listed one below the other, |
|
111 |
-and that the columns 'X Raw' and 'Y Raw' are available in the file. |
|
112 |
-The resulting \Robject{NChannelSet} from this function can be used as input to \Rfunction{crlmmIllumina} via the \Robject{XY} argument (instead of the usual \Rfunction{RG} argument used when the data has been read in from idat files). |
|
113 |
- |
|
114 |
-Plots of the summarised data can be easily generated to check for arrays with poor signal. |
|
115 |
- |
|
116 |
-<<boxplots, fig=TRUE, width=8, height=8>>= |
|
117 |
-par(mfrow=c(2,1), mai=c(0.4,0.4,0.4,0.1), oma=c(1,1,0,0)) |
|
118 |
-boxplot(log2(exprs(channel(RG, "R"))), xlab="Array", ylab="", names=1:40, |
|
119 |
-main="Red channel",outline=FALSE,las=2) |
|
120 |
-boxplot(log2(exprs(channel(RG, "G"))), xlab="Array", ylab="", names=1:40, |
|
121 |
-main="Green channel",outline=FALSE,las=2) |
|
122 |
-mtext(expression(log[2](intensity)), side=2, outer=TRUE) |
|
123 |
-mtext("Array", side=1, outer=TRUE) |
|
124 |
-@ |
|
125 |
- |
|
126 | 76 |
\section{Genotyping} |
127 | 77 |
|
128 | 78 |
Next we use the function \Rfunction{crlmmIllumina} which performs preprocessing followed by genotyping using the CRLMM algorithm. |
79 |
+It reads in data from idat files and stores results in a \Rclass{CNSet} object. |
|
129 | 80 |
|
130 | 81 |
<<genotype, results=hide, cache=TRUE>>= |
131 |
-crlmmResult = crlmmIllumina(RG=RG, cdfName="human370v1c", returnParams=TRUE) |
|
132 |
-@ |
|
82 |
+crlmmResult = crlmmIllumina(samples, path=data.dir, |
|
83 |
+ arrayInfoColNames=list(barcode=NULL,position="SentrixPosition"), |
|
84 |
+ saveDate=TRUE, cdfName="human370v1c", verbose=FALSE) |
|
133 | 85 |
|
134 |
-This analysis took 3 minutes to complete and peak memory usage was 1.9 GB on our system. |
|
135 |
-The output stored in \Robject{crlmmResult} is a \Rclass{SnpSet} object. |
|
136 |
-<<explore2>>= |
|
137 | 86 |
class(crlmmResult) |
138 | 87 |
dim(crlmmResult) |
139 | 88 |
slotNames(crlmmResult) |
140 | 89 |
calls(crlmmResult)[1:10, 1:5] |
90 |
+i2p(confs(crlmmResult)[1:10,1:5]) |
|
141 | 91 |
@ |
142 | 92 |
|
93 |
+If GenCall output is available instead of idat files, the function \Rfunction{readGenCallOutput} can be |
|
94 |
+used to read in the data. |
|
95 |
+This function assumes the GenCall output is formatted to have samples listed one below the other, |
|
96 |
+and that the columns 'X Raw' and 'Y Raw' are available in the file. |
|
97 |
+The resulting \Robject{NChannelSet} from this function can be used as input to |
|
98 |
+\Rfunction{crlmmIllumina} (or equivalently \code{genotype.Illumina}) via the \Robject{XY} argument |
|
99 |
+(instead of the usual \Rfunction{RG} argument used when the data has been read in from idat files). |
|
100 |
+ |
|
143 | 101 |
Plotting the {\it SNR} reveals no obvious batch effects in this data set (different symbols are used for |
144 | 102 |
arrays scanned on different days). |
145 | 103 |
|
... | ... |
@@ -148,15 +106,16 @@ plot(crlmmResult[["SNR"]], pch=scanbatch, xlab="Array", ylab="SNR", |
148 | 106 |
main="Signal-to-noise ratio per array",las=2) |
149 | 107 |
@ |
150 | 108 |
|
151 |
-An all-in-one function named \Rfunction{crlmmIlluminaV2} that combines |
|
152 |
-reading of idat files with genotyping is also available. |
|
153 |
- |
|
154 |
-<<readandgenotypeinone, results=hide, cache=TRUE>>= |
|
155 |
-crlmmResult2 <- crlmmIlluminaV2(samples, path=data.dir, |
|
156 |
- arrayInfoColNames=list(barcode=NULL,position="SentrixPosition"), |
|
157 |
- saveDate=TRUE, cdfName="human370v1c", returnParams=TRUE) |
|
109 |
+<<plotsamples, fig=TRUE, width=8, height=8>>= |
|
110 |
+par(mfrow=c(2,2)) |
|
111 |
+plotSamples(crlmmResult, col=1:4) |
|
158 | 112 |
@ |
159 | 113 |
|
114 |
+<<plotsnps, fig=TRUE, width=8, height=8>>= |
|
115 |
+par(mfrow=c(2,2)) |
|
116 |
+plotSNPs(crlmmResult, row=1:4) |
|
117 |
+@ |
|
118 |
+ |
|
160 | 119 |
\section{System information} |
161 | 120 |
|
162 | 121 |
This analysis was carried out on a linux machine with 32GB of RAM |
* collab:
Update AffyGW.pdf and IlluminaPreprocessCN.pdf
crlmmIlluminaV2 calls crlmmGT. comment call to crlmmGT2
Update AffyGW vignette to step through construction of CNSet, normalization, and genotyping
v 1.15.27: Fix bug in crlmmGT2. clean up comments in crlmmIllumina.
update getFeatureData
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@69561 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -29,8 +29,8 @@ |
29 | 29 |
|
30 | 30 |
\textwidth=6.2in |
31 | 31 |
|
32 |
-\bibliographystyle{plainnat} |
|
33 |
- |
|
32 |
+\bibliographystyle{plainnat} |
|
33 |
+ |
|
34 | 34 |
\begin{document} |
35 | 35 |
%\setkeys{Gin}{width=0.55\textwidth} |
36 | 36 |
|
... | ... |
@@ -40,26 +40,26 @@ |
40 | 40 |
|
41 | 41 |
\section{Getting started} |
42 | 42 |
|
43 |
-In this user guide we read in and genotype data from 40 HapMap samples |
|
43 |
+In this user guide we read in and genotype data from 40 HapMap samples |
|
44 | 44 |
which have been analyzed using Illumina's 370k Duo BeadChips. |
45 |
-This data is available in the \Rpackage{hapmap370k} package. |
|
46 |
-Additional chip-specific model parameters and basic SNP annotation |
|
45 |
+This data is available in the \Rpackage{hapmap370k} package. |
|
46 |
+Additional chip-specific model parameters and basic SNP annotation |
|
47 | 47 |
information used by CRLMM is stored in the \Rpackage{human370v1cCrlmm} package. |
48 | 48 |
The required packages can be installed in the usual way using the \Rfunction{biocLite} function. |
49 | 49 |
|
50 | 50 |
<<echo=TRUE, results=hide, eval=FALSE>>= |
51 | 51 |
source("http://www.bioconductor.org/biocLite.R") |
52 | 52 |
biocLite(c("crlmm", "hapmap370k", "human370v1cCrlmm")) |
53 |
-@ |
|
53 |
+@ |
|
54 | 54 |
|
55 |
-\section{Reading in data} |
|
56 |
-The function \Rfunction{readIdatFiles} extracts the Red and Green intensities |
|
55 |
+\section{Reading in data} |
|
56 |
+The function \Rfunction{readIdatFiles} extracts the Red and Green intensities |
|
57 | 57 |
from the binary {\tt idat} files output by Illumina's scanning device. |
58 | 58 |
The file {\tt samples370k.csv} contains information about each sample. |
59 | 59 |
|
60 | 60 |
<<echo=FALSE, results=hide, eval=TRUE>>= |
61 | 61 |
options(width=70) |
62 |
-@ |
|
62 |
+@ |
|
63 | 63 |
|
64 | 64 |
<<read, results=hide, eval=TRUE>>= |
65 | 65 |
library(Biobase) |
... | ... |
@@ -71,21 +71,21 @@ data.dir = system.file("idatFiles", package="hapmap370k") |
71 | 71 |
# Read in sample annotation info |
72 | 72 |
samples = read.csv(file.path(data.dir, "samples370k.csv"), as.is=TRUE) |
73 | 73 |
samples[1:5,] |
74 |
-@ |
|
74 |
+@ |
|
75 | 75 |
|
76 | 76 |
<<read2, results=hide, cache=TRUE>>= |
77 | 77 |
# Read in .idats using sampleSheet information |
78 |
-RG = readIdatFiles(samples, path=data.dir, |
|
78 |
+RG = readIdatFiles(samples, path=data.dir, |
|
79 | 79 |
arrayInfoColNames=list(barcode=NULL,position="SentrixPosition"),saveDate=TRUE) |
80 | 80 |
@ |
81 | 81 |
|
82 |
-Reading in this data takes approximately 100 seconds and peak memory usage |
|
82 |
+Reading in this data takes approximately 100 seconds and peak memory usage |
|
83 | 83 |
was 0.8 GB of RAM on our linux system. |
84 | 84 |
If memory is limiting, load the \Rpackage{ff} package and run the same command. |
85 | 85 |
When this package is available, the objects are stored using disk rather then RAM. |
86 |
-The \Robject{RG} object is an \Rclass{NChannelSet} which stores the |
|
87 |
-Red and Green intensities, the number of beads and standard errors for |
|
88 |
-each bead-type. |
|
86 |
+The \Robject{RG} object is an \Rclass{NChannelSet} which stores the |
|
87 |
+Red and Green intensities, the number of beads and standard errors for |
|
88 |
+each bead-type. |
|
89 | 89 |
The scanning date of each array is stored in \Robject{protocolData}. |
90 | 90 |
|
91 | 91 |
<<explore>>= |
... | ... |
@@ -105,19 +105,19 @@ levels(scanbatch) = 1:16 |
105 | 105 |
scanbatch = as.numeric(scanbatch) |
106 | 106 |
@ |
107 | 107 |
|
108 |
-If GenCall output is available instead of idat files, the function \Rfunction{readGenCallOutput} can be |
|
109 |
-used to read in the data. |
|
108 |
+If GenCall output is available instead of idat files, the function \Rfunction{readGenCallOutput} can be |
|
109 |
+used to read in the data. |
|
110 | 110 |
This function assumes the GenCall output is formatted to have samples listed one below the other, |
111 |
-and that the columns 'X Raw' and 'Y Raw' are available in the file. |
|
111 |
+and that the columns 'X Raw' and 'Y Raw' are available in the file. |
|
112 | 112 |
The resulting \Robject{NChannelSet} from this function can be used as input to \Rfunction{crlmmIllumina} via the \Robject{XY} argument (instead of the usual \Rfunction{RG} argument used when the data has been read in from idat files). |
113 | 113 |
|
114 | 114 |
Plots of the summarised data can be easily generated to check for arrays with poor signal. |
115 | 115 |
|
116 | 116 |
<<boxplots, fig=TRUE, width=8, height=8>>= |
117 | 117 |
par(mfrow=c(2,1), mai=c(0.4,0.4,0.4,0.1), oma=c(1,1,0,0)) |
118 |
-boxplot(log2(exprs(channel(RG, "R"))), xlab="Array", ylab="", names=1:40, |
|
118 |
+boxplot(log2(exprs(channel(RG, "R"))), xlab="Array", ylab="", names=1:40, |
|
119 | 119 |
main="Red channel",outline=FALSE,las=2) |
120 |
-boxplot(log2(exprs(channel(RG, "G"))), xlab="Array", ylab="", names=1:40, |
|
120 |
+boxplot(log2(exprs(channel(RG, "G"))), xlab="Array", ylab="", names=1:40, |
|
121 | 121 |
main="Green channel",outline=FALSE,las=2) |
122 | 122 |
mtext(expression(log[2](intensity)), side=2, outer=TRUE) |
123 | 123 |
mtext("Array", side=1, outer=TRUE) |
... | ... |
@@ -129,36 +129,37 @@ Next we use the function \Rfunction{crlmmIllumina} which performs preprocessing |
129 | 129 |
|
130 | 130 |
<<genotype, results=hide, cache=TRUE>>= |
131 | 131 |
crlmmResult = crlmmIllumina(RG=RG, cdfName="human370v1c", returnParams=TRUE) |
132 |
-@ |
|
132 |
+@ |
|
133 | 133 |
|
134 | 134 |
This analysis took 3 minutes to complete and peak memory usage was 1.9 GB on our system. |
135 |
-The output stored in \Robject{crlmmResult} is a \Rclass{SnpSet} object. |
|
135 |
+The output stored in \Robject{crlmmResult} is a \Rclass{SnpSet} object. |
|
136 | 136 |
<<explore2>>= |
137 | 137 |
class(crlmmResult) |
138 | 138 |
dim(crlmmResult) |
139 | 139 |
slotNames(crlmmResult) |
140 | 140 |
calls(crlmmResult)[1:10, 1:5] |
141 |
-@ |
|
141 |
+@ |
|
142 | 142 |
|
143 |
-Plotting the {\it SNR} reveals no obvious batch effects in this data set (different symbols are used for |
|
143 |
+Plotting the {\it SNR} reveals no obvious batch effects in this data set (different symbols are used for |
|
144 | 144 |
arrays scanned on different days). |
145 | 145 |
|
146 | 146 |
<<snr, fig=TRUE, width=8, height=6>>= |
147 |
-plot(crlmmResult[["SNR"]], pch=scanbatch, xlab="Array", ylab="SNR", |
|
148 |
-main="Signal-to-noise ratio per array",las=2) |
|
147 |
+plot(crlmmResult[["SNR"]], pch=scanbatch, xlab="Array", ylab="SNR", |
|
148 |
+ main="Signal-to-noise ratio per array",las=2) |
|
149 | 149 |
@ |
150 | 150 |
|
151 |
-An all-in-one function named \Rfunction{crlmmIlluminaV2} that combines reading of idat files with genotyping is also available. |
|
151 |
+An all-in-one function named \Rfunction{crlmmIlluminaV2} that combines |
|
152 |
+reading of idat files with genotyping is also available. |
|
152 | 153 |
|
153 | 154 |
<<readandgenotypeinone, results=hide, cache=TRUE>>= |
154 |
-crlmmResult2 = crlmmIlluminaV2(samples, path=data.dir, |
|
155 |
-arrayInfoColNames=list(barcode=NULL,position="SentrixPosition"), |
|
156 |
-saveDate=TRUE, cdfName="human370v1c", returnParams=TRUE) |
|
157 |
-@ |
|
155 |
+crlmmResult2 <- crlmmIlluminaV2(samples, path=data.dir, |
|
156 |
+ arrayInfoColNames=list(barcode=NULL,position="SentrixPosition"), |
|
157 |
+ saveDate=TRUE, cdfName="human370v1c", returnParams=TRUE) |
|
158 |
+@ |
|
158 | 159 |
|
159 | 160 |
\section{System information} |
160 | 161 |
|
161 |
-This analysis was carried out on a linux machine with 32GB of RAM |
|
162 |
+This analysis was carried out on a linux machine with 32GB of RAM |
|
162 | 163 |
using the following packages: |
163 | 164 |
|
164 | 165 |
<<session>>= |
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@59206 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -47,7 +47,7 @@ Additional chip-specific model parameters and basic SNP annotation |
47 | 47 |
information used by CRLMM is stored in the \Rpackage{human370v1cCrlmm} package. |
48 | 48 |
The required packages can be installed in the usual way using the \Rfunction{biocLite} function. |
49 | 49 |
|
50 |
-<<<echo=TRUE, results=hide, eval=FALSE>>= |
|
50 |
+<<echo=TRUE, results=hide, eval=FALSE>>= |
|
51 | 51 |
source("http://www.bioconductor.org/biocLite.R") |
52 | 52 |
biocLite(c("crlmm", "hapmap370k", "human370v1cCrlmm")) |
53 | 53 |
@ |
... | ... |
@@ -57,8 +57,8 @@ The function \Rfunction{readIdatFiles} extracts the Red and Green intensities |
57 | 57 |
from the binary {\tt idat} files output by Illumina's scanning device. |
58 | 58 |
The file {\tt samples370k.csv} contains information about each sample. |
59 | 59 |
|
60 |
-<<<echo=FALSE, results=hide, eval=TRUE>>= |
|
61 |
-options(width=50) |
|
60 |
+<<echo=FALSE, results=hide, eval=TRUE>>= |
|
61 |
+options(width=70) |
|
62 | 62 |
@ |
63 | 63 |
|
64 | 64 |
<<read, results=hide, eval=TRUE>>= |
... | ... |
@@ -75,7 +75,8 @@ samples[1:5,] |
75 | 75 |
|
76 | 76 |
<<read2, results=hide, cache=TRUE>>= |
77 | 77 |
# Read in .idats using sampleSheet information |
78 |
-RG = readIdatFiles(samples, path=data.dir, arrayInfoColNames=list(barcode=NULL, position="SentrixPosition"), saveDate=TRUE) |
|
78 |
+RG = readIdatFiles(samples, path=data.dir, |
|
79 |
+arrayInfoColNames=list(barcode=NULL,position="SentrixPosition"),saveDate=TRUE) |
|
79 | 80 |
@ |
80 | 81 |
|
81 | 82 |
Reading in this data takes approximately 100 seconds and peak memory usage |
... | ... |
@@ -104,13 +105,20 @@ levels(scanbatch) = 1:16 |
104 | 105 |
scanbatch = as.numeric(scanbatch) |
105 | 106 |
@ |
106 | 107 |
|
107 |
-Plots of the summarised data can be easily generated to check for arrays |
|
108 |
-with poor signal. |
|
108 |
+If GenCall output is available instead of idat files, the function \Rfunction{readGenCallOutput} can be |
|
109 |
+used to read in the data. |
|
110 |
+This function assumes the GenCall output is formatted to have samples listed one below the other, |
|
111 |
+and that the columns 'X Raw' and 'Y Raw' are available in the file. |
|
112 |
+The resulting \Robject{NChannelSet} from this function can be used as input to \Rfunction{crlmmIllumina} via the \Robject{XY} argument (instead of the usual \Rfunction{RG} argument used when the data has been read in from idat files). |
|
113 |
+ |
|
114 |
+Plots of the summarised data can be easily generated to check for arrays with poor signal. |
|
109 | 115 |
|
110 | 116 |
<<boxplots, fig=TRUE, width=8, height=8>>= |
111 | 117 |
par(mfrow=c(2,1), mai=c(0.4,0.4,0.4,0.1), oma=c(1,1,0,0)) |
112 |
-boxplot(log2(exprs(channel(RG, "R"))), xlab="Array", ylab="", names=1:40, main="Red channel", outline=FALSE, las=2) |
|
113 |
-boxplot(log2(exprs(channel(RG, "G"))), xlab="Array", ylab="", names=1:40, main="Green channel", outline=FALSE, las=2) |
|
118 |
+boxplot(log2(exprs(channel(RG, "R"))), xlab="Array", ylab="", names=1:40, |
|
119 |
+main="Red channel",outline=FALSE,las=2) |
|
120 |
+boxplot(log2(exprs(channel(RG, "G"))), xlab="Array", ylab="", names=1:40, |
|
121 |
+main="Green channel",outline=FALSE,las=2) |
|
114 | 122 |
mtext(expression(log[2](intensity)), side=2, outer=TRUE) |
115 | 123 |
mtext("Array", side=1, outer=TRUE) |
116 | 124 |
@ |
... | ... |
@@ -120,25 +128,34 @@ mtext("Array", side=1, outer=TRUE) |
120 | 128 |
Next we use the function \Rfunction{crlmmIllumina} which performs preprocessing followed by genotyping using the CRLMM algorithm. |
121 | 129 |
|
122 | 130 |
<<genotype, results=hide, cache=TRUE>>= |
123 |
-crlmmResult = crlmmIllumina(RG=RG, cdfName="human370v1c", sns=pData(RG)$ID, returnParams=TRUE) |
|
131 |
+crlmmResult = crlmmIllumina(RG=RG, cdfName="human370v1c", returnParams=TRUE) |
|
124 | 132 |
@ |
125 | 133 |
|
126 |
-This analysis took 18 minutes to complete and peak memory usage was 2.5 GB on our system. |
|
134 |
+This analysis took 3 minutes to complete and peak memory usage was 1.9 GB on our system. |
|
127 | 135 |
The output stored in \Robject{crlmmResult} is a \Rclass{SnpSet} object. |
128 | 136 |
<<explore2>>= |
129 |
- class(crlmmResult) |
|
130 |
- dim(crlmmResult) |
|
131 |
- slotNames(crlmmResult) |
|
132 |
- calls(crlmmResult)[1:10, 1:5] |
|
137 |
+class(crlmmResult) |
|
138 |
+dim(crlmmResult) |
|
139 |
+slotNames(crlmmResult) |
|
140 |
+calls(crlmmResult)[1:10, 1:5] |
|
133 | 141 |
@ |
134 | 142 |
|
135 | 143 |
Plotting the {\it SNR} reveals no obvious batch effects in this data set (different symbols are used for |
136 | 144 |
arrays scanned on different days). |
137 | 145 |
|
138 | 146 |
<<snr, fig=TRUE, width=8, height=6>>= |
139 |
-plot(crlmmResult[["SNR"]], pch=scanbatch, xlab="Array", ylab="SNR", main="Signal-to-noise ratio per array", las=2) |
|
147 |
+plot(crlmmResult[["SNR"]], pch=scanbatch, xlab="Array", ylab="SNR", |
|
148 |
+main="Signal-to-noise ratio per array",las=2) |
|
140 | 149 |
@ |
141 | 150 |
|
151 |
+An all-in-one function named \Rfunction{crlmmIlluminaV2} that combines reading of idat files with genotyping is also available. |
|
152 |
+ |
|
153 |
+<<readandgenotypeinone, results=hide, cache=TRUE>>= |
|
154 |
+crlmmResult2 = crlmmIlluminaV2(samples, path=data.dir, |
|
155 |
+arrayInfoColNames=list(barcode=NULL,position="SentrixPosition"), |
|
156 |
+saveDate=TRUE, cdfName="human370v1c", returnParams=TRUE) |
|
157 |
+@ |
|
158 |
+ |
|
142 | 159 |
\section{System information} |
143 | 160 |
|
144 | 161 |
This analysis was carried out on a linux machine with 32GB of RAM |
... | ... |
@@ -78,8 +78,8 @@ samples[1:5,] |
78 | 78 |
RG = readIdatFiles(samples, path=data.dir, arrayInfoColNames=list(barcode=NULL, position="SentrixPosition"), saveDate=TRUE) |
79 | 79 |
@ |
80 | 80 |
|
81 |
-Reading in this data takes approximately 90 seconds and peak memory usage |
|
82 |
-was 1.2 GB of RAM on our linux system. |
|
81 |
+Reading in this data takes approximately 100 seconds and peak memory usage |
|
82 |
+was 0.8 GB of RAM on our linux system. |
|
83 | 83 |
If memory is limiting, load the \Rpackage{ff} package and run the same command. |
84 | 84 |
When this package is available, the objects are stored using disk rather then RAM. |
85 | 85 |
The \Robject{RG} object is an \Rclass{NChannelSet} which stores the |
... | ... |
@@ -123,7 +123,7 @@ Next we use the function \Rfunction{crlmmIllumina} which performs preprocessing |
123 | 123 |
crlmmResult = crlmmIllumina(RG=RG, cdfName="human370v1c", sns=pData(RG)$ID, returnParams=TRUE) |
124 | 124 |
@ |
125 | 125 |
|
126 |
-This analysis took 470 seconds to complete and peak memory usage was 3.3 GB on our system. |
|
126 |
+This analysis took 18 minutes to complete and peak memory usage was 2.5 GB on our system. |
|
127 | 127 |
The output stored in \Robject{crlmmResult} is a \Rclass{SnpSet} object. |
128 | 128 |
<<explore2>>= |
129 | 129 |
class(crlmmResult) |
... | ... |
@@ -45,11 +45,11 @@ which have been analyzed using Illumina's 370k Duo BeadChips. |
45 | 45 |
This data is available in the \Rpackage{hapmap370k} package. |
46 | 46 |
Additional chip-specific model parameters and basic SNP annotation |
47 | 47 |
information used by CRLMM is stored in the \Rpackage{human370v1cCrlmm} package. |
48 |
-These packages can be installed in the usual way using the \Rfunction{biocLite} function. |
|
48 |
+The required packages can be installed in the usual way using the \Rfunction{biocLite} function. |
|
49 | 49 |
|
50 | 50 |
<<<echo=TRUE, results=hide, eval=FALSE>>= |
51 | 51 |
source("http://www.bioconductor.org/biocLite.R") |
52 |
-biocLite(c("hapmap370k", "human370v1cCrlmm")) |
|
52 |
+biocLite(c("crlmm", "hapmap370k", "human370v1cCrlmm")) |
|
53 | 53 |
@ |
54 | 54 |
|
55 | 55 |
\section{Reading in data} |
... | ... |
@@ -80,6 +80,8 @@ RG = readIdatFiles(samples, path=data.dir, arrayInfoColNames=list(barcode=NULL, |
80 | 80 |
|
81 | 81 |
Reading in this data takes approximately 90 seconds and peak memory usage |
82 | 82 |
was 1.2 GB of RAM on our linux system. |
83 |
+If memory is limiting, load the \Rpackage{ff} package and run the same command. |
|
84 |
+When this package is available, the objects are stored using disk rather then RAM. |
|
83 | 85 |
The \Robject{RG} object is an \Rclass{NChannelSet} which stores the |
84 | 86 |
Red and Green intensities, the number of beads and standard errors for |
85 | 87 |
each bead-type. |
... | ... |
@@ -44,12 +44,15 @@ In this user guide we read in and genotype data from 40 HapMap samples |
44 | 44 |
which have been analyzed using Illumina's 370k Duo BeadChips. |
45 | 45 |
This data is available in the \Rpackage{hapmap370k} package. |
46 | 46 |
Additional chip-specific model parameters and basic SNP annotation |
47 |
-information used by CRLMM is stored in the \Rpackage{human370v1c} package. |
|
48 |
-These can be downloaded from \href{http://rafalab.jhsph.edu/software.html}{http://rafalab.jhsph.edu/software.html} |
|
49 |
-and must be installed for the following code to work. |
|
47 |
+information used by CRLMM is stored in the \Rpackage{human370v1cCrlmm} package. |
|
48 |
+These packages can be installed in the usual way using the \Rfunction{biocLite} function. |
|
50 | 49 |
|
51 |
-\section{Reading in data} |
|
50 |
+<<<echo=TRUE, results=hide, eval=FALSE>>= |
|
51 |
+source("http://www.bioconductor.org/biocLite.R") |
|
52 |
+biocLite(c("hapmap370k", "human370v1cCrlmm")) |
|
53 |
+@ |
|
52 | 54 |
|
55 |
+\section{Reading in data} |
|
53 | 56 |
The function \Rfunction{readIdatFiles} extracts the Red and Green intensities |
54 | 57 |
from the binary {\tt idat} files output by Illumina's scanning device. |
55 | 58 |
The file {\tt samples370k.csv} contains information about each sample. |
... | ... |
@@ -58,7 +61,7 @@ The file {\tt samples370k.csv} contains information about each sample. |
58 | 61 |
options(width=50) |
59 | 62 |
@ |
60 | 63 |
|
61 |
-<<read>>= |
|
64 |
+<<read, results=hide, eval=TRUE>>= |
|
62 | 65 |
library(Biobase) |
63 | 66 |
library(crlmm) |
64 | 67 |
library(hapmap370k) |
... | ... |
@@ -112,8 +115,7 @@ mtext("Array", side=1, outer=TRUE) |
112 | 115 |
|
113 | 116 |
\section{Genotyping} |
114 | 117 |
|
115 |
-Next we use the function \Rfunction{crlmmIllumina} which performs preprocessing followed by genotyping |
|
116 |
-using the CRLMM algorithm. |
|
118 |
+Next we use the function \Rfunction{crlmmIllumina} which performs preprocessing followed by genotyping using the CRLMM algorithm. |
|
117 | 119 |
|
118 | 120 |
<<genotype, results=hide, cache=TRUE>>= |
119 | 121 |
crlmmResult = crlmmIllumina(RG=RG, cdfName="human370v1c", sns=pData(RG)$ID, returnParams=TRUE) |
... | ... |
@@ -80,7 +80,7 @@ was 1.2 GB of RAM on our linux system. |
80 | 80 |
The \Robject{RG} object is an \Rclass{NChannelSet} which stores the |
81 | 81 |
Red and Green intensities, the number of beads and standard errors for |
82 | 82 |
each bead-type. |
83 |
-The scanning date of each array is stored in the \Robject{protocolData} slot. |
|
83 |
+The scanning date of each array is stored in \Robject{protocolData}. |
|
84 | 84 |
|
85 | 85 |
<<explore>>= |
86 | 86 |
class(RG) |
... | ... |
@@ -104,8 +104,8 @@ with poor signal. |
104 | 104 |
|
105 | 105 |
<<boxplots, fig=TRUE, width=8, height=8>>= |
106 | 106 |
par(mfrow=c(2,1), mai=c(0.4,0.4,0.4,0.1), oma=c(1,1,0,0)) |
107 |
-boxplot(log2(exprs(channel(RG, "R"))), xlab="Array", ylab="", main="Red channel", outline=FALSE, las=2) |
|
108 |
-boxplot(log2(exprs(channel(RG, "G"))), xlab="Array", ylab="", main="Green channel", outline=FALSE, las=2) |
|
107 |
+boxplot(log2(exprs(channel(RG, "R"))), xlab="Array", ylab="", names=1:40, main="Red channel", outline=FALSE, las=2) |
|
108 |
+boxplot(log2(exprs(channel(RG, "G"))), xlab="Array", ylab="", names=1:40, main="Green channel", outline=FALSE, las=2) |
|
109 | 109 |
mtext(expression(log[2](intensity)), side=2, outer=TRUE) |
110 | 110 |
mtext("Array", side=1, outer=TRUE) |
111 | 111 |
@ |
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@40809 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -80,7 +80,7 @@ was 1.2 GB of RAM on our linux system. |
80 | 80 |
The \Robject{RG} object is an \Rclass{NChannelSet} which stores the |
81 | 81 |
Red and Green intensities, the number of beads and standard errors for |
82 | 82 |
each bead-type. |
83 |
-The scanning date of each array is stored in the \Robject{scanDates} slot. |
|
83 |
+The scanning date of each array is stored in the \Robject{protocolData} slot. |
|
84 | 84 |
|
85 | 85 |
<<explore>>= |
86 | 86 |
class(RG) |
... | ... |
@@ -92,7 +92,7 @@ exprs(channel(RG, "G"))[1:5,1:5] |
92 | 92 |
pd = pData(RG) |
93 | 93 |
pd[1:5,] |
94 | 94 |
|
95 |
-scandatetime = strptime(scanDates(RG), "%m/%d/%Y %H:%M:%S %p") |
|
95 |
+scandatetime = strptime(protocolData(RG)[["ScanDate"]], "%m/%d/%Y %H:%M:%S %p") |
|
96 | 96 |
datescanned = substr(scandatetime, 1, 10) |
97 | 97 |
scanbatch = factor(datescanned) |
98 | 98 |
levels(scanbatch) = 1:16 |
1 | 1 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,147 @@ |
1 |
+%\VignetteIndexEntry{crlmm Vignette - Illumina 370k chip} |
|
2 |
+%\VignetteKeywords{genotype, crlmm, Illumina} |
|
3 |
+%\VignettePackage{crlmm} |
|
4 |
+ |
|
5 |
+\documentclass[12pt]{article} |
|
6 |
+ |
|
7 |
+\usepackage{amsmath,pstricks} |
|
8 |
+\usepackage[authoryear,round]{natbib} |
|
9 |
+\usepackage{hyperref} |
|
10 |
+\usepackage{Sweave} |
|
11 |
+ |
|
12 |
+\textwidth=6.2in |
|
13 |
+\textheight=8.5in |
|
14 |
+%\parskip=.3cm |
|
15 |
+\oddsidemargin=.1in |
|
16 |
+\evensidemargin=.1in |
|
17 |
+\headheight=-.3in |
|
18 |
+ |
|
19 |
+\newcommand{\scscst}{\scriptscriptstyle} |
|
20 |
+\newcommand{\scst}{\scriptstyle} |
|
21 |
+ |
|
22 |
+ |
|
23 |
+\newcommand{\Rfunction}[1]{{\texttt{#1}}} |
|
24 |
+\newcommand{\Robject}[1]{{\texttt{#1}}} |
|
25 |
+\newcommand{\Rpackage}[1]{{\textit{#1}}} |
|
26 |
+\newcommand{\Rmethod}[1]{{\texttt{#1}}} |
|
27 |
+\newcommand{\Rfunarg}[1]{{\texttt{#1}}} |
|
28 |
+\newcommand{\Rclass}[1]{{\textit{#1}}} |
|
29 |
+ |
|
30 |
+\textwidth=6.2in |
|
31 |
+ |
|
32 |
+\bibliographystyle{plainnat} |
|
33 |
+ |
|
34 |
+\begin{document} |
|
35 |
+%\setkeys{Gin}{width=0.55\textwidth} |
|
36 |
+ |
|
37 |
+\title{Using \Rpackage{crlmm} to genotype data from Illumina's Infinium BeadChips} |
|
38 |
+\author{Matt Ritchie} |
|
39 |
+\maketitle |
|
40 |
+ |
|
41 |
+\section{Getting started} |
|
42 |
+ |
|
43 |
+In this user guide we read in and genotype data from 40 HapMap samples |
|
44 |
+which have been analyzed using Illumina's 370k Duo BeadChips. |
|
45 |
+This data is available in the \Rpackage{hapmap370k} package. |
|
46 |
+Additional chip-specific model parameters and basic SNP annotation |
|
47 |
+information used by CRLMM is stored in the \Rpackage{human370v1c} package. |
|
48 |
+These can be downloaded from \href{http://rafalab.jhsph.edu/software.html}{http://rafalab.jhsph.edu/software.html} |
|
49 |
+and must be installed for the following code to work. |
|
50 |
+ |
|
51 |
+\section{Reading in data} |
|
52 |
+ |
|
53 |
+The function \Rfunction{readIdatFiles} extracts the Red and Green intensities |
|
54 |
+from the binary {\tt idat} files output by Illumina's scanning device. |
|
55 |
+The file {\tt samples370k.csv} contains information about each sample. |
|
56 |
+ |
|
57 |
+<<<echo=FALSE, results=hide, eval=TRUE>>= |
|
58 |
+options(width=50) |
|
59 |
+@ |
|
60 |
+ |
|
61 |
+<<read>>= |
|
62 |
+library(Biobase) |
|
63 |
+library(crlmm) |
|
64 |
+library(hapmap370k) |
|
65 |
+ |
|
66 |
+data.dir = system.file("idatFiles", package="hapmap370k") |
|
67 |
+ |
|
68 |
+# Read in sample annotation info |
|
69 |
+samples = read.csv(file.path(data.dir, "samples370k.csv"), as.is=TRUE) |
|
70 |
+samples[1:5,] |
|
71 |
+@ |
|
72 |
+ |
|
73 |
+<<read2, results=hide, cache=TRUE>>= |
|
74 |
+# Read in .idats using sampleSheet information |
|
75 |
+RG = readIdatFiles(samples, path=data.dir, arrayInfoColNames=list(barcode=NULL, position="SentrixPosition"), saveDate=TRUE) |
|
76 |
+@ |
|
77 |
+ |
|
78 |
+Reading in this data takes approximately 90 seconds and peak memory usage |
|
79 |
+was 1.2 GB of RAM on our linux system. |
|
80 |
+The \Robject{RG} object is an \Rclass{NChannelSet} which stores the |
|
81 |
+Red and Green intensities, the number of beads and standard errors for |
|
82 |
+each bead-type. |
|
83 |
+The scanning date of each array is stored in the \Robject{scanDates} slot. |
|
84 |
+ |
|
85 |
+<<explore>>= |
|
86 |
+class(RG) |
|
87 |
+dim(RG) |
|
88 |
+slotNames(RG) |
|
89 |
+channelNames(RG) |
|
90 |
+exprs(channel(RG, "R"))[1:5,1:5] |
|
91 |
+exprs(channel(RG, "G"))[1:5,1:5] |
|
92 |
+pd = pData(RG) |
|
93 |
+pd[1:5,] |
|
94 |
+ |
|
95 |
+scandatetime = strptime(scanDates(RG), "%m/%d/%Y %H:%M:%S %p") |
|
96 |
+datescanned = substr(scandatetime, 1, 10) |
|
97 |
+scanbatch = factor(datescanned) |
|
98 |
+levels(scanbatch) = 1:16 |
|
99 |
+scanbatch = as.numeric(scanbatch) |
|
100 |
+@ |
|
101 |
+ |
|
102 |
+Plots of the summarised data can be easily generated to check for arrays |
|
103 |
+with poor signal. |
|
104 |
+ |
|
105 |
+<<boxplots, fig=TRUE, width=8, height=8>>= |
|
106 |
+par(mfrow=c(2,1), mai=c(0.4,0.4,0.4,0.1), oma=c(1,1,0,0)) |
|
107 |
+boxplot(log2(exprs(channel(RG, "R"))), xlab="Array", ylab="", main="Red channel", outline=FALSE, las=2) |
|
108 |
+boxplot(log2(exprs(channel(RG, "G"))), xlab="Array", ylab="", main="Green channel", outline=FALSE, las=2) |
|
109 |
+mtext(expression(log[2](intensity)), side=2, outer=TRUE) |
|
110 |
+mtext("Array", side=1, outer=TRUE) |
|
111 |
+@ |
|
112 |
+ |
|
113 |
+\section{Genotyping} |
|
114 |
+ |
|
115 |
+Next we use the function \Rfunction{crlmmIllumina} which performs preprocessing followed by genotyping |
|
116 |
+using the CRLMM algorithm. |
|
117 |
+ |
|
118 |
+<<genotype, results=hide, cache=TRUE>>= |
|
119 |
+crlmmResult = crlmmIllumina(RG=RG, cdfName="human370v1c", sns=pData(RG)$ID, returnParams=TRUE) |
|
120 |
+@ |
|
121 |
+ |
|
122 |
+This analysis took 470 seconds to complete and peak memory usage was 3.3 GB on our system. |
|
123 |
+The output stored in \Robject{crlmmResult} is a \Rclass{SnpSet} object. |
|
124 |
+<<explore2>>= |
|
125 |
+ class(crlmmResult) |
|
126 |
+ dim(crlmmResult) |
|
127 |
+ slotNames(crlmmResult) |
|
128 |
+ calls(crlmmResult)[1:10, 1:5] |
|
129 |
+@ |
|
130 |
+ |
|
131 |
+Plotting the {\it SNR} reveals no obvious batch effects in this data set (different symbols are used for |
|
132 |
+arrays scanned on different days). |
|
133 |
+ |
|
134 |
+<<snr, fig=TRUE, width=8, height=6>>= |
|
135 |
+plot(crlmmResult[["SNR"]], pch=scanbatch, xlab="Array", ylab="SNR", main="Signal-to-noise ratio per array", las=2) |
|
136 |
+@ |
|
137 |
+ |
|
138 |
+\section{System information} |
|
139 |
+ |
|
140 |
+This analysis was carried out on a linux machine with 32GB of RAM |
|
141 |
+using the following packages: |
|
142 |
+ |
|
143 |
+<<session>>= |
|
144 |
+sessionInfo() |
|
145 |
+@ |
|
146 |
+ |
|
147 |
+\end{document} |