git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@117087 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -1,5 +1,6 @@ |
1 | 1 |
\name{genotype.Illumina} |
2 | 2 |
\alias{genotype.Illumina} |
3 |
+\alias{crlmmIllumina} |
|
3 | 4 |
|
4 | 5 |
\title{ |
5 | 6 |
Preprocessing and genotyping of Illumina Infinium II arrays. |
... | ... |
@@ -10,11 +11,20 @@ |
10 | 11 |
\usage{ |
11 | 12 |
genotype.Illumina(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".", |
12 | 13 |
arrayInfoColNames=list(barcode="SentrixBarcode_A", position="SentrixPosition_A"), |
13 |
- highDensity=FALSE, sep="_", fileExt=list(green="Grn.idat", red="Red.idat"), XY=NULL, |
|
14 |
+ highDensity=FALSE, sep="_", fileExt=list(green="Grn.idat", red="Red.idat"), XY=NULL, anno, genome, |
|
14 | 15 |
call.method="crlmm", trueCalls=NULL, cdfName, copynumber=TRUE, batch=NULL, saveDate=FALSE, stripNorm=TRUE, |
15 |
- useTarget=TRUE, quantile.method="between", mixtureSampleSize=10^5, fitMixture=TRUE, |
|
16 |
- eps =0.1, verbose = TRUE, seed = 1, sns, probs = rep(1/3, 3), DF = 6, SNRMin = 5, |
|
16 |
+ useTarget=TRUE, quantile.method="between", nopackage.norm="quantile", mixtureSampleSize=10^5, fitMixture=TRUE, |
|
17 |
+ eps=0.1, verbose = TRUE, seed = 1, sns, probs = rep(1/3, 3), DF = 6, SNRMin = 5, |
|
17 | 18 |
recallMin = 10, recallRegMin = 1000, gender = NULL, returnParams = TRUE, badSNP = 0.7) |
19 |
+ |
|
20 |
+crlmmIllumina(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".", |
|
21 |
+ arrayInfoColNames=list(barcode="SentrixBarcode_A", position="SentrixPosition_A"), |
|
22 |
+ highDensity=FALSE, sep="_", fileExt=list(green="Grn.idat", red="Red.idat"), XY=NULL, anno, genome, |
|
23 |
+ call.method="crlmm", trueCalls=NULL, cdfName, copynumber=TRUE, batch=NULL, saveDate=FALSE, stripNorm=TRUE, |
|
24 |
+ useTarget=TRUE, quantile.method="between", nopackage.norm="quantile", mixtureSampleSize=10^5, fitMixture=TRUE, |
|
25 |
+ eps=0.1, verbose = TRUE, seed = 1, sns, probs = rep(1/3, 3), DF = 6, SNRMin = 5, |
|
26 |
+ recallMin = 10, recallRegMin = 1000, gender = NULL, returnParams = TRUE, badSNP = 0.7) |
|
27 |
+ |
|
18 | 28 |
} |
19 | 29 |
\arguments{ |
20 | 30 |
\item{sampleSheet}{\code{data.frame} containing Illumina sample sheet |
... | ... |
@@ -42,9 +52,13 @@ genotype.Illumina(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".", |
42 | 52 |
\item{fileExt}{list containing elements 'green' and 'red' which |
43 | 53 |
specify the .idat file extension for the Cy3 and Cy5 channels.} |
44 | 54 |
\item{XY}{\code{NChannelSet} containing X and Y intensities.} |
55 |
+ \item{anno}{data.frame containing SNP annotation information from |
|
56 |
+ manifest and additional columns 'isSnp', 'position', 'chromosome' |
|
57 |
+ and 'featureNames'. For use when \code{cdfName}='nopackage'} |
|
58 |
+ \item{genome}{character string specifying which genome is used in annotation} |
|
45 | 59 |
\item{call.method}{character string specifying the genotype calling algorithm to use ('crlmm' or 'krlmm').} |
46 | 60 |
\item{trueCalls}{matrix specifying known Genotype calls (can contain some NAs) for a subset of samples and features (1 - AA, 2 - AB, 3 - BB).} |
47 |
- \item{cdfName}{ annotation package (see also \code{validCdfNames})} |
|
61 |
+ \item{cdfName}{annotation package (see also \code{validCdfNames}) or 'nopackage' when combined with 'krlmm', an \code{anno} data.frame and \code{genome}.} |
|
48 | 62 |
\item{copynumber}{ 'logical.' Whether to store copy number intensities with SNP output.} |
49 | 63 |
\item{batch}{ character vector indicating the batch variable. Must be |
50 | 64 |
the same length as the number of samples. See details.} |
... | ... |
@@ -54,6 +68,8 @@ genotype.Illumina(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".", |
54 | 68 |
\item{useTarget}{'logical' (only used when \code{stripNorm=TRUE}). |
55 | 69 |
Should the reference HapMap intensities be used in strip-level normalization?} |
56 | 70 |
\item{quantile.method}{character string specifying the quantile normalization method to use ('within' or 'between' channels).} |
71 |
+ \item{nopackage.norm}{character string specifying normalization to be used when \code{cdfName}='nopackage'. |
|
72 |
+ Options are 'none', 'quantile' (within channel, between array) and 'loess'.} |
|
57 | 73 |
\item{mixtureSampleSize}{ Sample size to be used when fitting the mixture model.} |
58 | 74 |
\item{fitMixture}{ 'logical.' Whether to fit per-array mixture model.} |
59 | 75 |
\item{eps}{ Stop criteria.} |
... | ... |
@@ -74,23 +90,15 @@ genotype.Illumina(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".", |
74 | 90 |
|
75 | 91 |
\details{ |
76 | 92 |
|
77 |
- For large datasets it is important to utilize the large data |
|
78 |
- support by installing and loading the ff package before calling |
|
79 |
- the \code{genotype} function. In previous versions of the |
|
80 |
- \code{crlmm} package, we used different functions for |
|
81 |
- genotyping depending on whether the ff package is loaded, namely |
|
82 |
- \code{genotype} and \code{genotype2}. The \code{genotype} |
|
83 |
- function now handles both instances. |
|
84 |
- |
|
85 |
- \code{genotype.Illumina} is a wrapper of the \code{crlmm} |
|
86 |
- function for genotyping. Differences include (1) that the copy |
|
87 |
- number probes (if present) are also quantile-normalized and (2) |
|
88 |
- the class of object returned by this function, \code{CNSet}, is |
|
89 |
- needed for subsequent copy number estimation. Note that the |
|
90 |
- batch variable (a character string) that must be passed to this |
|
91 |
- function has no effect on the normalization or genotyping steps. |
|
92 |
- Rather, \code{batch} is required in order to initialize a |
|
93 |
- \code{CNSet} container with the appropriate dimensions. |
|
93 |
+ \code{genotype.Illumina} (or equivalently \code{crlmmIllumina}) |
|
94 |
+ is a wrapper of the \code{crlmm} function for genotyping. |
|
95 |
+ Differences include (1) that the copy number probes (if present) |
|
96 |
+ are also quantile-normalized and (2) the class of object returned |
|
97 |
+ by this function, \code{CNSet}, is needed for subsequent copy number |
|
98 |
+ estimation. Note that the batch variable (a character string) has |
|
99 |
+ no effect on the normalization or genotyping steps. Rather, \code{batch} |
|
100 |
+ is required in order to initialize a \code{CNSet} container with the |
|
101 |
+ appropriate dimensions. |
|
94 | 102 |
|
95 | 103 |
The new 'krlmm' option is available for certain chip types. Optional |
96 | 104 |
argument \code{trueCalls} matrix contains known Genotype calls |
... | ... |
@@ -103,6 +111,15 @@ genotype.Illumina(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".", |
103 | 111 |
8 clusters. This is configurable by setting up an option named |
104 | 112 |
"krlmm.cores", e.g. options("krlmm.cores" = 16). |
105 | 113 |
|
114 |
+ In general, a chip specific annotation package is required to use the |
|
115 |
+ \code{genotype.Illumina} function. If this is not available (newer chip |
|
116 |
+ types or custom chips often don't have a chip-specific package available |
|
117 |
+ on Bioconductor), consider using \code{cdfName}='nopackage' and specifying |
|
118 |
+ \code{anno} and \code{genome}, which runs 'krlmm' on the samples available. |
|
119 |
+ Here \code{anno} is a data.frame read in from the relevant chip-specific |
|
120 |
+ manifest, which must have additional columns 'isSnp' which is a logical that |
|
121 |
+ indicates whether a probe is polymorphic or not, 'position', 'chromosome' and |
|
122 |
+ 'featureNames' that give the location on the chromosome and SNP name. |
|
106 | 123 |
} |
107 | 124 |
|
108 | 125 |
\value{ A \code{SnpSuperSet} instance.} |
... | ... |
@@ -111,6 +128,10 @@ genotype.Illumina(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".", |
111 | 128 |
R/Bioconductor software for Illumina's Infinium whole-genome |
112 | 129 |
genotyping BeadChips. Bioinformatics. 2009 Oct 1;25(19):2621-3. |
113 | 130 |
|
131 |
+ Liu R, Dai Z, Yeager M, Irizarry RA, Ritchie ME. |
|
132 |
+ KRLMM: an adaptive genotype calling method for common and low frequency variants. |
|
133 |
+ BMC Bioinformatics. 2014 May 23;15:158. |
|
134 |
+ |
|
114 | 135 |
Carvalho B, Bengtsson H, Speed TP, Irizarry RA. Exploration, |
115 | 136 |
normalization, and genotype calls of high-density oligonucleotide SNP |
116 | 137 |
array data. Biostatistics. 2007 Apr;8(2):485-99. Epub 2006 Dec |
... | ... |
@@ -119,19 +140,11 @@ genotype.Illumina(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".", |
119 | 140 |
Carvalho BS, Louis TA, Irizarry RA. |
120 | 141 |
Quantifying uncertainty in genotype calls. |
121 | 142 |
Bioinformatics. 2010 Jan 15;26(2):242-9. |
122 |
- |
|
123 | 143 |
} |
124 | 144 |
|
125 | 145 |
\author{Matt Ritchie, Cynthia Liu, Zhiyin Dai} |
126 | 146 |
|
127 |
- \note{For large datasets, load the 'ff' package prior to genotyping |
|
128 |
-\code{ldPath} and \code{ocSamples}. The function |
|
129 |
-\code{genotype.Illumina} supports parallelization, as the (not run) |
|
130 |
-example below indicates.} |
|
131 |
- |
|
132 | 147 |
\seealso{ |
133 |
- \code{\link{crlmmIlluminaV2}}, |
|
134 | 148 |
\code{\link[oligoClasses]{ocSamples}}, |
135 | 149 |
\code{\link[oligoClasses]{ldOpts}} |
136 | 150 |
} |
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@88085 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -11,7 +11,7 @@ |
11 | 11 |
genotype.Illumina(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".", |
12 | 12 |
arrayInfoColNames=list(barcode="SentrixBarcode_A", position="SentrixPosition_A"), |
13 | 13 |
highDensity=FALSE, sep="_", fileExt=list(green="Grn.idat", red="Red.idat"), XY=NULL, |
14 |
- call.method="crlmm", trueCalls=NULL, cdfName, copynumber=TRUE, batch=NULL, saveDate=TRUE, stripNorm=TRUE, |
|
14 |
+ call.method="crlmm", trueCalls=NULL, cdfName, copynumber=TRUE, batch=NULL, saveDate=FALSE, stripNorm=TRUE, |
|
15 | 15 |
useTarget=TRUE, quantile.method="between", mixtureSampleSize=10^5, fitMixture=TRUE, |
16 | 16 |
eps =0.1, verbose = TRUE, seed = 1, sns, probs = rep(1/3, 3), DF = 6, SNRMin = 5, |
17 | 17 |
recallMin = 10, recallRegMin = 1000, gender = NULL, returnParams = TRUE, badSNP = 0.7) |
* collab:
Update DESCRIPTION
bug fixed for krlmm option, documentation changes
es
# Please enter a commit message to explain why this merge is necessary,
# especially if it merges an updated upstream into a topic branch.
#
# Lines starting with '#' will be ignored, and an empty message aborts
# the commit.
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@81002 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -93,8 +93,16 @@ genotype.Illumina(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".", |
93 | 93 |
\code{CNSet} container with the appropriate dimensions. |
94 | 94 |
|
95 | 95 |
The new 'krlmm' option is available for certain chip types. Optional |
96 |
- \code{trueCalls} matrix contains known Genotype calls (1 - AA, 2 - AB, 3 - BB) |
|
97 |
- for a subset of samples and features. |
|
96 |
+ argument \code{trueCalls} matrix contains known Genotype calls |
|
97 |
+ (1 - AA, 2 - AB, 3 - BB) for a subset of samples and features. This |
|
98 |
+ will be used to compute KRLMM coefficients by calling \code{vglm} function |
|
99 |
+ from \code{VGAM} package. |
|
100 |
+ |
|
101 |
+ The 'krlmm' method makes use of functions provided in \code{parallel} |
|
102 |
+ package to speed up the process. It by default initialises up to |
|
103 |
+ 8 clusters. This is configurable by setting up an option named |
|
104 |
+ "krlmm.cores", e.g. options("krlmm.cores" = 16). |
|
105 |
+ |
|
98 | 106 |
} |
99 | 107 |
|
100 | 108 |
\value{ A \code{SnpSuperSet} instance.} |
... | ... |
@@ -114,7 +122,7 @@ genotype.Illumina(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".", |
114 | 122 |
|
115 | 123 |
} |
116 | 124 |
|
117 |
-\author{Matt Ritchie} |
|
125 |
+\author{Matt Ritchie, Cynthia Liu, Zhiyin Dai} |
|
118 | 126 |
|
119 | 127 |
\note{For large datasets, load the 'ff' package prior to genotyping |
120 | 128 |
-- this will greatly reduce the RAM required for big jobs. See |
... | ... |
@@ -129,6 +137,7 @@ example below indicates.} |
129 | 137 |
} |
130 | 138 |
\examples{ |
131 | 139 |
\dontrun{ |
140 |
+ # example for 'crlmm' option |
|
132 | 141 |
library(ff) |
133 | 142 |
library(crlmm) |
134 | 143 |
## to enable parallelization, set to TRUE |
... | ... |
@@ -151,6 +160,32 @@ example below indicates.} |
151 | 160 |
arrayInfoColNames=arrayInfo, |
152 | 161 |
cdfName="human370v1c", |
153 | 162 |
batch=rep("1", nrow(samplesheet))) |
163 |
+ |
|
164 |
+} |
|
165 |
+\dontrun{ |
|
166 |
+ # example for 'krlmm' option |
|
167 |
+ library(crlmm) |
|
168 |
+ library(ff) |
|
169 |
+ # line below is an optional step for krlmm to initialise 16 workers |
|
170 |
+ # options("krlmm.cores" = 16) |
|
171 |
+ # read in raw X and Y intensities output by GenomeStudio's GenCall genotyping module |
|
172 |
+ XY = readGenCallOutput(c("HumanOmni2-5_4v1_FinalReport_83TUSCAN.csv","HumanOmni2-5_4v1_FinalReport_88CHB-JPT.csv"), |
|
173 |
+ cdfName="humanomni25quadv1b", |
|
174 |
+ verbose=TRUE) |
|
175 |
+ krlmmResult = genotype.Illumina(XY=XY, |
|
176 |
+ cdfName=ThiscdfName, |
|
177 |
+ call.method="krlmm", |
|
178 |
+ verbose=TRUE) |
|
179 |
+ |
|
180 |
+ # example for 'krlmm' option with known genotype call for some SNPs and samples |
|
181 |
+ library(VGAM) |
|
182 |
+ hapmapCalls = load("hapmapCalls.rda") |
|
183 |
+ # hapmapCalls should have rownames and colnames corresponding to XY featureNames and sampleNames |
|
184 |
+ krlmmResult = genotype.Illumina(XY=XY, |
|
185 |
+ cdfName=ThiscdfName, |
|
186 |
+ call.method="krlmm", |
|
187 |
+ trueCalls=hapmapCalls, |
|
188 |
+ verbose=TRUE) |
|
154 | 189 |
} |
155 | 190 |
} |
156 | 191 |
\keyword{classif} |
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@80947 bc3139a8-67e5-0310-9ffc-ced21a209358
1 | 1 |
old mode 100644 |
2 | 2 |
new mode 100755 |
... | ... |
@@ -11,7 +11,7 @@ |
11 | 11 |
genotype.Illumina(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".", |
12 | 12 |
arrayInfoColNames=list(barcode="SentrixBarcode_A", position="SentrixPosition_A"), |
13 | 13 |
highDensity=FALSE, sep="_", fileExt=list(green="Grn.idat", red="Red.idat"), XY=NULL, |
14 |
- call.method="crlmm", cdfName, copynumber=TRUE, batch=NULL, saveDate=TRUE, stripNorm=TRUE, |
|
14 |
+ call.method="crlmm", trueCalls=NULL, cdfName, copynumber=TRUE, batch=NULL, saveDate=TRUE, stripNorm=TRUE, |
|
15 | 15 |
useTarget=TRUE, quantile.method="between", mixtureSampleSize=10^5, fitMixture=TRUE, |
16 | 16 |
eps =0.1, verbose = TRUE, seed = 1, sns, probs = rep(1/3, 3), DF = 6, SNRMin = 5, |
17 | 17 |
recallMin = 10, recallRegMin = 1000, gender = NULL, returnParams = TRUE, badSNP = 0.7) |
... | ... |
@@ -43,6 +43,7 @@ genotype.Illumina(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".", |
43 | 43 |
specify the .idat file extension for the Cy3 and Cy5 channels.} |
44 | 44 |
\item{XY}{\code{NChannelSet} containing X and Y intensities.} |
45 | 45 |
\item{call.method}{character string specifying the genotype calling algorithm to use ('crlmm' or 'krlmm').} |
46 |
+ \item{trueCalls}{matrix specifying known Genotype calls (can contain some NAs) for a subset of samples and features (1 - AA, 2 - AB, 3 - BB).} |
|
46 | 47 |
\item{cdfName}{ annotation package (see also \code{validCdfNames})} |
47 | 48 |
\item{copynumber}{ 'logical.' Whether to store copy number intensities with SNP output.} |
48 | 49 |
\item{batch}{ character vector indicating the batch variable. Must be |
... | ... |
@@ -65,7 +66,7 @@ genotype.Illumina(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".", |
65 | 66 |
out samples.} |
66 | 67 |
\item{recallMin}{Minimum number of samples for recalibration. } |
67 | 68 |
\item{recallRegMin}{Minimum number of SNP's for regression.} |
68 |
- \item{gender}{ integer vector ( male = 1, female =2 ) or missing, |
|
69 |
+ \item{gender}{ integer vector ( male = 1, female = 2 ) or missing, |
|
69 | 70 |
with same length as filenames. If missing, the gender is predicted.} |
70 | 71 |
\item{returnParams}{'logical'. Return recalibrated parameters from crlmm.} |
71 | 72 |
\item{badSNP}{'numeric'. Threshold to flag as bad SNP (affects batchQC)} |
... | ... |
@@ -91,7 +92,9 @@ genotype.Illumina(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".", |
91 | 92 |
Rather, \code{batch} is required in order to initialize a |
92 | 93 |
\code{CNSet} container with the appropriate dimensions. |
93 | 94 |
|
94 |
- The new 'krlmm' option is available for certain chip types. |
|
95 |
+ The new 'krlmm' option is available for certain chip types. Optional |
|
96 |
+ \code{trueCalls} matrix contains known Genotype calls (1 - AA, 2 - AB, 3 - BB) |
|
97 |
+ for a subset of samples and features. |
|
95 | 98 |
} |
96 | 99 |
|
97 | 100 |
\value{ A \code{SnpSuperSet} instance.} |
* collab:
add warning in vignette about NAs with BafLrrSetList function
Added Human Omni Express Exome 8 v1.1b as a supported chip
updated version number of package and man pages to reflect these changes
skeleton for krlmm capability added. genotype.Illumina() can now take an XY object as input
update copynumber.Rnw to use BafLrrSetList
updates to vignettes
update namespace
# Please enter a commit message to explain why this merge is necessary,
# especially if it merges an updated upstream into a topic branch.
#
# Lines starting with '#' will be ignored, and an empty message aborts
# the commit.
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@79138 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -10,11 +10,11 @@ |
10 | 10 |
\usage{ |
11 | 11 |
genotype.Illumina(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".", |
12 | 12 |
arrayInfoColNames=list(barcode="SentrixBarcode_A", position="SentrixPosition_A"), |
13 |
- highDensity=FALSE, sep="_", fileExt=list(green="Grn.idat", red="Red.idat"), |
|
14 |
- cdfName, copynumber=TRUE, batch, saveDate=TRUE, stripNorm=TRUE, useTarget=TRUE, |
|
15 |
- mixtureSampleSize=10^5, fitMixture=TRUE, eps =0.1, verbose = TRUE, seed = 1, |
|
16 |
- sns, probs = rep(1/3, 3), DF = 6, SNRMin = 5, recallMin = 10, recallRegMin = 1000, |
|
17 |
- gender = NULL, returnParams = TRUE, badSNP = 0.7) |
|
13 |
+ highDensity=FALSE, sep="_", fileExt=list(green="Grn.idat", red="Red.idat"), XY=NULL, |
|
14 |
+ call.method="crlmm", cdfName, copynumber=TRUE, batch=NULL, saveDate=TRUE, stripNorm=TRUE, |
|
15 |
+ useTarget=TRUE, quantile.method="between", mixtureSampleSize=10^5, fitMixture=TRUE, |
|
16 |
+ eps =0.1, verbose = TRUE, seed = 1, sns, probs = rep(1/3, 3), DF = 6, SNRMin = 5, |
|
17 |
+ recallMin = 10, recallRegMin = 1000, gender = NULL, returnParams = TRUE, badSNP = 0.7) |
|
18 | 18 |
} |
19 | 19 |
\arguments{ |
20 | 20 |
\item{sampleSheet}{\code{data.frame} containing Illumina sample sheet |
... | ... |
@@ -41,6 +41,8 @@ genotype.Illumina(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".", |
41 | 41 |
names.} |
42 | 42 |
\item{fileExt}{list containing elements 'Green' and 'Red' which |
43 | 43 |
specify the .idat file extension for the Cy3 and Cy5 channels.} |
44 |
+ \item{XY}{\code{NChannelSet} containing X and Y intensities.} |
|
45 |
+ \item{call.method}{character string specifying the genotype calling algorithm to use ('crlmm' or 'krlmm').} |
|
44 | 46 |
\item{cdfName}{ annotation package (see also \code{validCdfNames})} |
45 | 47 |
\item{copynumber}{ 'logical.' Whether to store copy number intensities with SNP output.} |
46 | 48 |
\item{batch}{ character vector indicating the batch variable. Must be |
... | ... |
@@ -50,6 +52,7 @@ genotype.Illumina(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".", |
50 | 52 |
\item{stripNorm}{'logical'. Should the data be strip-level normalized?} |
51 | 53 |
\item{useTarget}{'logical' (only used when \code{stripNorm=TRUE}). |
52 | 54 |
Should the reference HapMap intensities be used in strip-level normalization?} |
55 |
+ \item{quantile.method}{character string specifying the quantile normalization method to use ('within' or 'between' channels).} |
|
53 | 56 |
\item{mixtureSampleSize}{ Sample size to be used when fitting the mixture model.} |
54 | 57 |
\item{fitMixture}{ 'logical.' Whether to fit per-array mixture model.} |
55 | 58 |
\item{eps}{ Stop criteria.} |
... | ... |
@@ -88,8 +91,7 @@ genotype.Illumina(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".", |
88 | 91 |
Rather, \code{batch} is required in order to initialize a |
89 | 92 |
\code{CNSet} container with the appropriate dimensions. |
90 | 93 |
|
91 |
- |
|
92 |
- |
|
94 |
+ The new 'krlmm' option is available for certain chip types. |
|
93 | 95 |
} |
94 | 96 |
|
95 | 97 |
\value{ A \code{SnpSuperSet} instance.} |
* collab:
remove getCluster() calls and replace with parStatus()
update man pages for crlmm and genotype.Illumina with respect to the setup for parallelization
add neededPkgs argument to ocLapply calls in crlmmGT2
bump dependency on oligoClasses
Update R/crlmm-illumina.R
contructInf, preprocessInf and genotypeInf no longer exported
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@64211 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -129,8 +129,10 @@ example below indicates.} |
129 | 129 |
## to enable parallelization, set to TRUE |
130 | 130 |
if(FALSE){ |
131 | 131 |
library(snow) |
132 |
- ## 10 workers |
|
133 |
- setCluster(10, "SOCK") |
|
132 |
+ library(doSNOW) |
|
133 |
+ ## with 10 workers |
|
134 |
+ cl <- makeCluster(10, type="SOCK") |
|
135 |
+ registerDoSNOW(cl) |
|
134 | 136 |
} |
135 | 137 |
## path to idat files |
136 | 138 |
datadir <- "/thumper/ctsa/snpmicroarray/illumina/IDATS/370k" |
* collab:
replace splitIndicesByLength with splitIndicesByNode throughout cnrma-functions.R (check that snow is loaded and getCluster is not null)
add an example to genotype.Illumina that indicates how parallelization would be enabled. The example requires local data and is not run.
change outdir in IlluminaPreprocessCN and AffyGW
Update R/crlmm-illumina.R
bump version for parallelization of genotype.Illumina
Update R/crlmm-illumina.R
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@64151 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -88,6 +88,8 @@ genotype.Illumina(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".", |
88 | 88 |
Rather, \code{batch} is required in order to initialize a |
89 | 89 |
\code{CNSet} container with the appropriate dimensions. |
90 | 90 |
|
91 |
+ |
|
92 |
+ |
|
91 | 93 |
} |
92 | 94 |
|
93 | 95 |
\value{ A \code{SnpSuperSet} instance.} |
... | ... |
@@ -106,10 +108,14 @@ genotype.Illumina(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".", |
106 | 108 |
Bioinformatics. 2010 Jan 15;26(2):242-9. |
107 | 109 |
|
108 | 110 |
} |
111 |
+ |
|
109 | 112 |
\author{Matt Ritchie} |
110 |
-\note{For large datasets, load the 'ff' package prior to genotyping -- |
|
111 |
-this will greatly reduce the RAM required for big jobs. See |
|
112 |
-\code{ldPath} and \code{ocSamples}.} |
|
113 |
+ |
|
114 |
+ \note{For large datasets, load the 'ff' package prior to genotyping |
|
115 |
+-- this will greatly reduce the RAM required for big jobs. See |
|
116 |
+\code{ldPath} and \code{ocSamples}. The function |
|
117 |
+\code{genotype.Illumina} supports parallelization, as the (not run) |
|
118 |
+example below indicates.} |
|
113 | 119 |
|
114 | 120 |
\seealso{ |
115 | 121 |
\code{\link{crlmmIlluminaV2}}, |
... | ... |
@@ -117,6 +123,27 @@ this will greatly reduce the RAM required for big jobs. See |
117 | 123 |
\code{\link[oligoClasses]{ldOpts}} |
118 | 124 |
} |
119 | 125 |
\examples{ |
120 |
- ## |
|
126 |
+\dontrun{ |
|
127 |
+ library(ff) |
|
128 |
+ library(crlmm) |
|
129 |
+ ## to enable parallelization, set to TRUE |
|
130 |
+ if(FALSE){ |
|
131 |
+ library(snow) |
|
132 |
+ ## 10 workers |
|
133 |
+ setCluster(10, "SOCK") |
|
134 |
+ } |
|
135 |
+ ## path to idat files |
|
136 |
+ datadir <- "/thumper/ctsa/snpmicroarray/illumina/IDATS/370k" |
|
137 |
+ ## read in your samplesheet |
|
138 |
+ samplesheet = read.csv(file.path(datadir, "HumanHap370Duo_Sample_Map.csv"), header=TRUE, as.is=TRUE) |
|
139 |
+ samplesheet <- samplesheet[-c(28:46,61:75,78:79), ] |
|
140 |
+ arrayNames <- file.path(datadir, unique(samplesheet[, "SentrixPosition"])) |
|
141 |
+ arrayInfo <- list(barcode=NULL, position="SentrixPosition") |
|
142 |
+ cnSet <- genotype.Illumina(sampleSheet=samplesheet, |
|
143 |
+ arrayNames=arrayNames, |
|
144 |
+ arrayInfoColNames=arrayInfo, |
|
145 |
+ cdfName="human370v1c", |
|
146 |
+ batch=rep("1", nrow(samplesheet))) |
|
147 |
+} |
|
121 | 148 |
} |
122 | 149 |
\keyword{classif} |
* mymac:
add AffyGW.pdf
update vignettes in inst/scripts
Change argument of validCEL to celfiles
Update constructInf to accommodate GenomeDataFrame class for featureData
bump version to 1.13.7
Add doRUnit.R
Add celfile-utils.Rd
Streamlne some of the Rd files
add validCEL function that checks whether all celfiles can be read
getFeatureData returns GenomeAnnotatedDataFrame
Remove imports from methods. Remove pdf of illumina_copynumber.pdf (large file) and copynumber.pdf
getFeatureDAta returns GenomeAnnotatedDataFrame
Remove separate vignette for copy number in inst/scripts. Include copynumber section in both affy and illumina pipelines.
update documentation files for genotype.Illumina, preprocessInf, and genotypeInf (cdfName added as argument. Indicate that 'batch' should be a character string)
pass cdfName to genotypeInf and preprocessInf
add unitTests and cn-functions for 'simple usage'
Combine AffyPreprocess and copynumber. Combine IlluminaPreprocess and copynumber
remove depency on ff to allow installation on my mac
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@62108 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -11,8 +11,8 @@ |
11 | 11 |
genotype.Illumina(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".", |
12 | 12 |
arrayInfoColNames=list(barcode="SentrixBarcode_A", position="SentrixPosition_A"), |
13 | 13 |
highDensity=FALSE, sep="_", fileExt=list(green="Grn.idat", red="Red.idat"), |
14 |
- cdfName, copynumber=TRUE, batch, saveDate=TRUE, stripNorm=TRUE, useTarget=TRUE, |
|
15 |
- mixtureSampleSize=10^5, fitMixture=TRUE, eps =0.1, verbose = TRUE, seed = 1, |
|
14 |
+ cdfName, copynumber=TRUE, batch, saveDate=TRUE, stripNorm=TRUE, useTarget=TRUE, |
|
15 |
+ mixtureSampleSize=10^5, fitMixture=TRUE, eps =0.1, verbose = TRUE, seed = 1, |
|
16 | 16 |
sns, probs = rep(1/3, 3), DF = 6, SNRMin = 5, recallMin = 10, recallRegMin = 1000, |
17 | 17 |
gender = NULL, returnParams = TRUE, badSNP = 0.7) |
18 | 18 |
} |
... | ... |
@@ -42,8 +42,9 @@ genotype.Illumina(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".", |
42 | 42 |
\item{fileExt}{list containing elements 'Green' and 'Red' which |
43 | 43 |
specify the .idat file extension for the Cy3 and Cy5 channels.} |
44 | 44 |
\item{cdfName}{ annotation package (see also \code{validCdfNames})} |
45 |
- \item{copynumber}{ 'logical.' Whether to store copy number intensities with SNP output.} |
|
46 |
- \item{batch}{ batch variable. See details.} |
|
45 |
+ \item{copynumber}{ 'logical.' Whether to store copy number intensities with SNP output.} |
|
46 |
+ \item{batch}{ character vector indicating the batch variable. Must be |
|
47 |
+ the same length as the number of samples. See details.} |
|
47 | 48 |
\item{saveDate}{'logical'. Should the dates from each .idat be saved |
48 | 49 |
with sample information?} |
49 | 50 |
\item{stripNorm}{'logical'. Should the data be strip-level normalized?} |
... | ... |
@@ -82,16 +83,17 @@ genotype.Illumina(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".", |
82 | 83 |
number probes (if present) are also quantile-normalized and (2) |
83 | 84 |
the class of object returned by this function, \code{CNSet}, is |
84 | 85 |
needed for subsequent copy number estimation. Note that the |
85 |
- batch variable that must be passed to this function has no |
|
86 |
- effect on the normalization or genotyping steps. Rather, |
|
87 |
- \code{batch} is required in order to initialize a \code{CNSet} |
|
88 |
- container with the appropriate dimensions. |
|
89 |
-} |
|
86 |
+ batch variable (a character string) that must be passed to this |
|
87 |
+ function has no effect on the normalization or genotyping steps. |
|
88 |
+ Rather, \code{batch} is required in order to initialize a |
|
89 |
+ \code{CNSet} container with the appropriate dimensions. |
|
90 |
+ |
|
91 |
+ } |
|
90 | 92 |
|
91 | 93 |
\value{ A \code{SnpSuperSet} instance.} |
92 | 94 |
\references{ |
93 | 95 |
Ritchie ME, Carvalho BS, Hetrick KN, Tavar\'{e} S, Irizarry RA. |
94 |
- R/Bioconductor software for Illumina's Infinium whole-genome |
|
96 |
+ R/Bioconductor software for Illumina's Infinium whole-genome |
|
95 | 97 |
genotyping BeadChips. Bioinformatics. 2009 Oct 1;25(19):2621-3. |
96 | 98 |
|
97 | 99 |
Carvalho B, Bengtsson H, Speed TP, Irizarry RA. Exploration, |
... | ... |
@@ -110,7 +112,7 @@ this will greatly reduce the RAM required for big jobs. See |
110 | 112 |
\code{ldPath} and \code{ocSamples}.} |
111 | 113 |
|
112 | 114 |
\seealso{ |
113 |
- \code{\link{crlmmIlluminaV2}}, |
|
115 |
+ \code{\link{crlmmIlluminaV2}}, |
|
114 | 116 |
\code{\link[oligoClasses]{ocSamples}}, |
115 | 117 |
\code{\link[oligoClasses]{ldOpts}} |
116 | 118 |
} |
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@54164 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -72,7 +72,7 @@ genotype.Illumina(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".", |
72 | 72 |
For large datasets it is important to utilize the large data |
73 | 73 |
support by installing and loading the ff package before calling |
74 | 74 |
the \code{genotype} function. In previous versions of the |
75 |
- \code{crlmm} package, we useed different functions for |
|
75 |
+ \code{crlmm} package, we used different functions for |
|
76 | 76 |
genotyping depending on whether the ff package is loaded, namely |
77 | 77 |
\code{genotype} and \code{genotype2}. The \code{genotype} |
78 | 78 |
function now handles both instances. |
... | ... |
@@ -11,7 +11,7 @@ |
11 | 11 |
genotype.Illumina(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".", |
12 | 12 |
arrayInfoColNames=list(barcode="SentrixBarcode_A", position="SentrixPosition_A"), |
13 | 13 |
highDensity=FALSE, sep="_", fileExt=list(green="Grn.idat", red="Red.idat"), |
14 |
- cdfName, copynumber=TRUE, batch, outdir=".", saveDate=TRUE, stripNorm=TRUE, useTarget=TRUE, |
|
14 |
+ cdfName, copynumber=TRUE, batch, saveDate=TRUE, stripNorm=TRUE, useTarget=TRUE, |
|
15 | 15 |
mixtureSampleSize=10^5, fitMixture=TRUE, eps =0.1, verbose = TRUE, seed = 1, |
16 | 16 |
sns, probs = rep(1/3, 3), DF = 6, SNRMin = 5, recallMin = 10, recallRegMin = 1000, |
17 | 17 |
gender = NULL, returnParams = TRUE, badSNP = 0.7) |
... | ... |
@@ -43,8 +43,7 @@ genotype.Illumina(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".", |
43 | 43 |
specify the .idat file extension for the Cy3 and Cy5 channels.} |
44 | 44 |
\item{cdfName}{ annotation package (see also \code{validCdfNames})} |
45 | 45 |
\item{copynumber}{ 'logical.' Whether to store copy number intensities with SNP output.} |
46 |
- \item{batch}{ batch variable. See details. } |
|
47 |
- \item{outdir}{character string specifying the location to store large data objects.} |
|
46 |
+ \item{batch}{ batch variable. See details.} |
|
48 | 47 |
\item{saveDate}{'logical'. Should the dates from each .idat be saved |
49 | 48 |
with sample information?} |
50 | 49 |
\item{stripNorm}{'logical'. Should the data be strip-level normalized?} |
1 | 1 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,121 @@ |
1 |
+\name{genotype.Illumina} |
|
2 |
+\alias{genotype.Illumina} |
|
3 |
+ |
|
4 |
+\title{ |
|
5 |
+ Preprocessing and genotyping of Illumina Infinium II arrays. |
|
6 |
+} |
|
7 |
+\description{ |
|
8 |
+ Preprocessing and genotyping of Illumina Infinium II arrays. |
|
9 |
+} |
|
10 |
+\usage{ |
|
11 |
+genotype.Illumina(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".", |
|
12 |
+ arrayInfoColNames=list(barcode="SentrixBarcode_A", position="SentrixPosition_A"), |
|
13 |
+ highDensity=FALSE, sep="_", fileExt=list(green="Grn.idat", red="Red.idat"), |
|
14 |
+ cdfName, copynumber=TRUE, batch, outdir=".", saveDate=TRUE, stripNorm=TRUE, useTarget=TRUE, |
|
15 |
+ mixtureSampleSize=10^5, fitMixture=TRUE, eps =0.1, verbose = TRUE, seed = 1, |
|
16 |
+ sns, probs = rep(1/3, 3), DF = 6, SNRMin = 5, recallMin = 10, recallRegMin = 1000, |
|
17 |
+ gender = NULL, returnParams = TRUE, badSNP = 0.7) |
|
18 |
+} |
|
19 |
+\arguments{ |
|
20 |
+ \item{sampleSheet}{\code{data.frame} containing Illumina sample sheet |
|
21 |
+ information (for required columns, refer to BeadStudio Genotyping |
|
22 |
+ guide - Appendix A).} |
|
23 |
+ \item{arrayNames}{character vector containing names of arrays to be |
|
24 |
+ read in. If \code{NULL}, all arrays that can be found in the |
|
25 |
+ specified working directory will be read in.} |
|
26 |
+ \item{ids}{vector containing ids of probes to be read in. If |
|
27 |
+ \code{NULL} all probes found on the first array are read in.} |
|
28 |
+ \item{path}{character string specifying the location of files to be |
|
29 |
+ read by the function} |
|
30 |
+ \item{arrayInfoColNames}{(used when \code{sampleSheet} is specified) |
|
31 |
+ list containing elements 'barcode' which indicates column names in |
|
32 |
+ the \code{sampleSheet} which contains the arrayNumber/barcode number |
|
33 |
+ and 'position' which indicates the strip number. In older style |
|
34 |
+ sample sheets, this information is combined (usually in a column |
|
35 |
+ named 'SentrixPosition') and this should be specified as |
|
36 |
+ \code{list(barcode=NULL, position="SentrixPosition")}} |
|
37 |
+ \item{highDensity}{logical (used when \code{sampleSheet} is |
|
38 |
+ specified). If \code{TRUE}, array extensions '\_A', '\_B' in |
|
39 |
+ sampleSheet are replaced with 'R01C01', 'R01C02' etc.} |
|
40 |
+ \item{sep}{character string specifying separator used in .idat file |
|
41 |
+ names.} |
|
42 |
+ \item{fileExt}{list containing elements 'green' and 'red' which |
|
43 |
+ specify the .idat file extension for the Cy3 and Cy5 channels.} |
|
44 |
+ \item{cdfName}{ annotation package (see also \code{validCdfNames})} |
|
45 |
+ \item{copynumber}{ 'logical.' Whether to store copy number intensities with SNP output.} |
|
46 |
+ \item{batch}{ batch variable. See details. } |
|
47 |
+ \item{outdir}{character string specifying the location to store large data objects.} |
|
48 |
+ \item{saveDate}{'logical'. Should the dates from each .idat be saved |
|
49 |
+ with sample information?} |
|
50 |
+ \item{stripNorm}{'logical'. Should the data be strip-level normalized?} |
|
51 |
+ \item{useTarget}{'logical' (only used when \code{stripNorm=TRUE}). |
|
52 |
+ Should the reference HapMap intensities be used in strip-level normalization?} |
|
53 |
+ \item{mixtureSampleSize}{ Sample size to be used when fitting the mixture model.} |
|
54 |
+ \item{fitMixture}{ 'logical.' Whether to fit per-array mixture model.} |
|
55 |
+ \item{eps}{ Stop criteria.} |
|
56 |
+ \item{verbose}{ 'logical.' Whether to print descriptive messages during processing.} |
|
57 |
+ \item{seed}{ Seed to be used when sampling. Useful for reproducibility.} |
|
58 |
+ \item{sns}{The sample identifiers. If missing, the default sample names are \code{basename(filenames)}} |
|
59 |
+ \item{probs}{'numeric' vector with priors for AA, AB and BB.} |
|
60 |
+ \item{DF}{'integer' with number of degrees of freedom to use with t-distribution.} |
|
61 |
+ \item{SNRMin}{'numeric' scalar defining the minimum SNR used to filter |
|
62 |
+ out samples.} |
|
63 |
+ \item{recallMin}{Minimum number of samples for recalibration. } |
|
64 |
+ \item{recallRegMin}{Minimum number of SNPs for regression.} |
|
65 |
+ \item{gender}{ integer vector ( male = 1, female =2 ) or missing, |
|
66 |
+ with same length as filenames. If missing, the gender is predicted.} |
|
67 |
+ \item{returnParams}{'logical'. Return recalibrated parameters from crlmm.} |
|
68 |
+ \item{badSNP}{'numeric'. Threshold to flag as bad SNP (affects batchQC)} |
|
69 |
+} |
|
70 |
+ |
|
71 |
+\details{ |
|
72 |
+ |
|
73 |
+ For large datasets it is important to utilize the large data |
|
74 |
+ support by installing and loading the ff package before calling |
|
75 |
+ the \code{genotype} function. In previous versions of the |
|
76 |
+ \code{crlmm} package, we used different functions for |
|
77 |
+ genotyping depending on whether the ff package is loaded, namely |
|
78 |
+ \code{genotype} and \code{genotype2}. The \code{genotype} |
|
79 |
+ function now handles both instances. |
|
80 |
+ |
|
81 |
+ \code{genotype.Illumina} is a wrapper of the \code{crlmm} |
|
82 |
+ function for genotyping. Differences include (1) that the copy |
|
83 |
+ number probes (if present) are also quantile-normalized and (2) |
|
84 |
+ the class of object returned by this function, \code{CNSet}, is |
|
85 |
+ needed for subsequent copy number estimation. Note that the |
|
86 |
+ batch variable that must be passed to this function has no |
|
87 |
+ effect on the normalization or genotyping steps. Rather, |
|
88 |
+ \code{batch} is required in order to initialize a \code{CNSet} |
|
89 |
+ container with the appropriate dimensions. |
|
90 |
+} |
|
91 |
+ |
|
92 |
+\value{ A \code{CNSet} instance (see Details).} |
|
93 |
+\references{ |
|
94 |
+ Ritchie ME, Carvalho BS, Hetrick KN, Tavar\'{e} S, Irizarry RA. |
|
95 |
+ R/Bioconductor software for Illumina's Infinium whole-genome |
|
96 |
+ genotyping BeadChips. Bioinformatics. 2009 Oct 1;25(19):2621-3. |
|
97 |
+ |
|
98 |
+ Carvalho B, Bengtsson H, Speed TP, Irizarry RA. Exploration, |
|
99 |
+ normalization, and genotype calls of high-density oligonucleotide SNP |
|
100 |
+ array data. Biostatistics. 2007 Apr;8(2):485-99. Epub 2006 Dec |
|
101 |
+ 22. PMID: 17189563. |
|
102 |
+ |
|
103 |
+ Carvalho BS, Louis TA, Irizarry RA. |
|
104 |
+ Quantifying uncertainty in genotype calls. |
|
105 |
+ Bioinformatics. 2010 Jan 15;26(2):242-9. |
|
106 |
+ |
|
107 |
+} |
|
108 |
+\author{Matt Ritchie} |
|
109 |
+\note{For large datasets, load the 'ff' package prior to genotyping -- |
|
110 |
+this will greatly reduce the RAM required for big jobs. See |
|
111 |
+\code{ldPath} and \code{ocSamples}.} |
|
112 |
+ |
|
113 |
+\seealso{ |
|
114 |
+ \code{\link{crlmmIlluminaV2}}, |
|
115 |
+ \code{\link[oligoClasses]{ocSamples}}, |
|
116 |
+ \code{\link[oligoClasses]{ldOpts}} |
|
117 |
+} |
|
118 |
+\examples{ |
|
119 |
+ ## |
|
120 |
+} |
|
121 |
+\keyword{classif} |