git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@38297 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -72,3 +72,12 @@ BioC data packages |
72 | 72 |
* cnrma function requires the cdfName |
73 | 73 |
|
74 | 74 |
* update man files for cnrma and computeCopynumber |
75 |
+ |
|
76 |
+2009-03-30 Matt Ritchie - committed version 1.0.66 |
|
77 |
+ |
|
78 |
+* added functions to read in idat files, pre-process and genotype |
|
79 |
+data from Illumina SNP arrays (functions named readIdatFiles() |
|
80 |
+and crlmmIllumina() in crlmm-illumina.R) |
|
81 |
+ |
|
82 |
+* added man pages for these functions |
|
83 |
+ |
... | ... |
@@ -1,8 +1,8 @@ |
1 | 1 |
Package: crlmm |
2 | 2 |
Type: Package |
3 | 3 |
Title: Genotype Calling (CRLMM) and Copy Number Analysis tool for SNP 5.0 and 6.0 arrays. |
4 |
-Version: 1.0.65 |
|
5 |
-Date: 2008-12-28 |
|
4 |
+Version: 1.0.66 |
|
5 |
+Date: 2008-12-30 |
|
6 | 6 |
Author: Rafael A Irizarry, Benilton S Carvalho <bcarvalh@jhsph.edu>, Robert Scharpf <rscharpf@jhsph.edu> |
7 | 7 |
Maintainer: Benilton S Carvalho <bcarvalh@jhsph.edu>, Robert Scharpf <rscharpf@jhsph.edu> |
8 | 8 |
Description: Faster implementation of CRLMM specific to SNP 5.0 and 6.0 arrays, as well as a copy number tool specific to 6.0. |
... | ... |
@@ -13,6 +13,7 @@ Collate: AllClasses.R |
13 | 13 |
methods-SnpSet.R |
14 | 14 |
cnrma-functions.R |
15 | 15 |
crlmm-functions.R |
16 |
+ crlmm-illumina.R |
|
16 | 17 |
snprma-functions.R |
17 | 18 |
utils.R |
18 | 19 |
zzz.R |
... | ... |
@@ -7,5 +7,5 @@ importFrom(stats, coef, cov, dnorm, kmeans, lm, mad, median, quantile, sd) |
7 | 7 |
importFrom(genefilter, rowSds) |
8 | 8 |
importFrom(mvtnorm, dmvnorm) |
9 | 9 |
|
10 |
-export("crlmm", "list.celfiles", "computeCopynumber", "cnrma", "celDates") |
|
10 |
+export("crlmm", "list.celfiles", "computeCopynumber", "cnrma", "celDates", "crlmmIllumina", "readIdatFiles") |
|
11 | 11 |
exportMethods("calls", "confs") |
12 | 12 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,692 @@ |
1 |
+# function below works OK provided all .idat files are in the current working directory |
|
2 |
+# - could add an option to allow files in Illumina directory structure to be handled |
|
3 |
+# or to use the optional 'Path' column in sampleSheet |
|
4 |
+# - there is a lot of header information that is currently discarded - could try and store this somewhere in the resulting NChannelSet |
|
5 |
+ |
|
6 |
+readIdatFiles = function(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".", |
|
7 |
+ arrayInfoColNames=list(barcode="SentrixBarcode_A", position="SentrixPosition_A"), |
|
8 |
+ highDensity=FALSE, sep="_", fileExt=list(green="Grn.idat", red="Red.idat")) { |
|
9 |
+ if(!is.null(sampleSheet)) { # get array info from Illumina's sample sheet |
|
10 |
+ arrayNames=NULL |
|
11 |
+ if(!is.null(arrayInfoColNames$barcode) && (arrayInfoColNames$barcode %in% colnames(sampleSheet))) { |
|
12 |
+ barcode = sampleSheet[,arrayInfoColNames$barcode] |
|
13 |
+ arrayNames=barcode |
|
14 |
+ } |
|
15 |
+ if(!is.null(arrayInfoColNames$position) && (arrayInfoColNames$position %in% colnames(sampleSheet))) { |
|
16 |
+ position = sampleSheet[,arrayInfoColNames$position] |
|
17 |
+ if(is.null(arrayNames)) |
|
18 |
+ arrayNames=position |
|
19 |
+ else |
|
20 |
+ arrayNames = paste(arrayNames, position, sep=sep) |
|
21 |
+ if(highDensity) { |
|
22 |
+ hdExt = list(A="R01C01", B="R01C02", C="R02C01", D="R02C02") |
|
23 |
+ for(i in names(hdExt)) |
|
24 |
+ arrayNames = sub(paste(sep, i, sep=""), paste(sep, hdExt[[i]], sep=""), arrayNames) |
|
25 |
+ } |
|
26 |
+ } |
|
27 |
+# pd = new("AnnotatedDataFrame", data = sampleSheet) |
|
28 |
+ } |
|
29 |
+ if(is.null(arrayNames)) { |
|
30 |
+ arrayNames = gsub(paste(sep, fileExt$green, sep=""), "", dir(pattern=fileExt$green, path=path)) |
|
31 |
+ if(!is.null(sampleSheet)) { |
|
32 |
+ sampleSheet=NULL |
|
33 |
+ cat("Could not find required info in \'sampleSheet\' - ignoring. Check \'sampleSheet\' and/or \'arrayInfoColNames\'\n") |
|
34 |
+ } |
|
35 |
+ } |
|
36 |
+ grnfiles = paste(arrayNames, fileExt$green, sep=sep) |
|
37 |
+ redfiles = paste(arrayNames, fileExt$red, sep=sep) |
|
38 |
+ if(length(grnfiles)==0 || length(redfiles)==0) |
|
39 |
+ stop("Cannot find .idat files") |
|
40 |
+ if(length(grnfiles)!=length(redfiles)) |
|
41 |
+ stop("Cannot find matching .idat files") |
|
42 |
+ if(!all(c(redfiles,grnfiles) %in% dir(path=path))) |
|
43 |
+ stop("Missing .idat files: red\n", paste(redfiles[!(redfiles %in% dir(path=path))], sep=" "), "\n green\n", |
|
44 |
+ paste(grnfiles[!(grnfiles %in% dir(path=path))], sep=" ")) |
|
45 |
+ grnidats = file.path(path, grnfiles) |
|
46 |
+ redidats = file.path(path, redfiles) |
|
47 |
+ |
|
48 |
+ headerInfo = list(nProbes = rep(NA, length(arrayNames)), |
|
49 |
+ Barcode = rep(NA, length(arrayNames)), |
|
50 |
+ ChipType = rep(NA, length(arrayNames)), |
|
51 |
+ Manifest = rep(NA, length(arrayNames)), # not sure about this one - sometimes blank |
|
52 |
+ Position = rep(NA, length(arrayNames))) # this may also vary a bit |
|
53 |
+ # read in the data |
|
54 |
+ for(i in seq(along=arrayNames)) { |
|
55 |
+ cat("reading", arrayNames[i], "\t") |
|
56 |
+ idsG = idsR = G = R = NULL |
|
57 |
+# if(grnfiles[i] %in% dir(path=path)) { |
|
58 |
+ cat(paste(sep, fileExt$green, sep=""), "\t") |
|
59 |
+ G = readIDAT(grnidats[i]) |
|
60 |
+ idsG = rownames(G$Quants) |
|
61 |
+# } |
|
62 |
+# else |
|
63 |
+# stop("Could not find ", grnidats[i]) |
|
64 |
+ |
|
65 |
+# if(redfiles[i] %in% dir(path=path)) { |
|
66 |
+ cat(paste(sep, fileExt$red, sep=""), "\n") |
|
67 |
+ R = readIDAT(redidats[i]) |
|
68 |
+ idsR = rownames(R$Quants) |
|
69 |
+# } |
|
70 |
+# else |
|
71 |
+# stop("Could not find ", redidats[i]) |
|
72 |
+ |
|
73 |
+ headerInfo$nProbes[i] = G$nSNPsRead |
|
74 |
+ headerInfo$Barcode[i] = G$Barcode |
|
75 |
+ headerInfo$ChipType[i] = G$ChipType |
|
76 |
+ headerInfo$Manifest[i] = G$Unknown$MostlyNull |
|
77 |
+ headerInfo$Position[i] = G$Unknowns$MostlyA |
|
78 |
+ |
|
79 |
+ if(headerInfo$ChipType[i]!=headerInfo$ChipType[1] || headerInfo$Manifest[i]!=headerInfo$Manifest[1]) |
|
80 |
+ # || headerInfo$nProbes[i]!=headerInfo$nProbes[1] ## removed this condition as some arrays used the same manifest |
|
81 |
+ # but differed by a few SNPs for some reason - most of the chip was the same though |
|
82 |
+ stop("Chips are not of all of the same type - please check your data") |
|
83 |
+ if(i==1) { |
|
84 |
+ if(is.null(ids) && !is.null(G)) |
|
85 |
+ ids = idsG |
|
86 |
+ else |
|
87 |
+ stop("Could not find probe IDs") |
|
88 |
+ nprobes = length(ids) |
|
89 |
+ narrays = length(arrayNames) |
|
90 |
+ Gintens = Rintens = Gnobeads = Rnobeads = Gstderr = Rstderr = matrix(NA, nprobes, narrays) |
|
91 |
+ rownames(Gintens) = rownames(Rintens) = rownames(Gnobeads) = rownames(Rnobeads) = rownames(Gstderr) = rownames(Rstderr) = ids |
|
92 |
+ if(!is.null(sampleSheet)) |
|
93 |
+ colnames(Gintens) = colnames(Rintens) = colnames(Gnobeads) = colnames(Rnobeads) = colnames(Gstderr) = colnames(Rstderr) = sampleSheet$Sample_ID |
|
94 |
+ else |
|
95 |
+ colnames(Gintens) = colnames(Rintens) = colnames(Gnobeads) = colnames(Rnobeads) = colnames(Gstderr) = colnames(Rstderr) = arrayNames |
|
96 |
+ } |
|
97 |
+ |
|
98 |
+ if(length(ids)==length(idsG)) { |
|
99 |
+ if(sum(ids==idsG)==nprobes) { |
|
100 |
+ Gintens[,i] = G$Quants[, "Mean"] |
|
101 |
+ Gnobeads[,i] = G$Quants[, "NBeads"] |
|
102 |
+ Gstderr[,i] = G$Quants[, "SD"] |
|
103 |
+ } |
|
104 |
+ } |
|
105 |
+ else { |
|
106 |
+ indG = match(ids, idsG) |
|
107 |
+ Gintens[,i] = G$Quants[indG, "Mean"] |
|
108 |
+ Gnobeads[,i] = G$Quants[indG, "NBeads"] |
|
109 |
+ Gstderr[,i] = G$Quants[indG, "SD"] |
|
110 |
+ } |
|
111 |
+ if(length(ids)==length(idsG)) { |
|
112 |
+ if(sum(ids==idsR)==nprobes) { |
|
113 |
+ Rintens[,i] = R$Quants[ ,"Mean"] |
|
114 |
+ Rnobeads[,i] = R$Quants[ ,"NBeads"] |
|
115 |
+ Rstderr[,i] = R$Quants[ ,"SD"] |
|
116 |
+ } |
|
117 |
+ } |
|
118 |
+ else { |
|
119 |
+ indR = match(ids, idsR) |
|
120 |
+ Rintens[,i] = R$Quants[indR, "Mean"] |
|
121 |
+ Rnobeads[,i] = R$Quants[indR, "NBeads"] |
|
122 |
+ Rstderr[,i] = R$Quants[indR, "SD"] |
|
123 |
+ } |
|
124 |
+ } |
|
125 |
+ if(is.null(sampleSheet)) |
|
126 |
+ pd = new("AnnotatedDataFrame", data = data.frame(Sample_ID=arrayNames)) |
|
127 |
+ else |
|
128 |
+ pd = new("AnnotatedDataFrame", data = sampleSheet) |
|
129 |
+ RG = new("NChannelSet", |
|
130 |
+ R=Rintens, G=Gintens, Rnb=Rnobeads, Gnb=Gnobeads, |
|
131 |
+ Rse=Rstderr, Gse=Gstderr, annotation=headerInfo$Manifest[1], |
|
132 |
+ phenoData = pd) |
|
133 |
+} |
|
134 |
+ |
|
135 |
+ |
|
136 |
+# the readIDAT() and readBPM() functions below were provided by Keith Baggerly, 27/8/2008 |
|
137 |
+readIDAT <- function(idatFile){ |
|
138 |
+ |
|
139 |
+ fileSize <- file.info(idatFile)$size |
|
140 |
+ |
|
141 |
+ tempCon <- file(idatFile,"rb") |
|
142 |
+ prefixCheck <- readChar(tempCon,4) |
|
143 |
+ if(prefixCheck != "IDAT"){ |
|
144 |
+ |
|
145 |
+ } |
|
146 |
+ |
|
147 |
+ versionNumber <- readBin(tempCon, "integer", n=1, size=8, |
|
148 |
+ endian="little", signed=FALSE) |
|
149 |
+ |
|
150 |
+ nFields <- readBin(tempCon, "integer", n=1, size=4, |
|
151 |
+ endian="little", signed=FALSE) |
|
152 |
+ |
|
153 |
+ fields <- matrix(0,nFields,3); |
|
154 |
+ colnames(fields) <- c("Field Code", "Byte Offset", "Bytes") |
|
155 |
+ for(i1 in 1:nFields){ |
|
156 |
+ fields[i1,"Field Code"] <- |
|
157 |
+ readBin(tempCon, "integer", n=1, size=2, endian="little", signed=FALSE) |
|
158 |
+ fields[i1,"Byte Offset"] <- |
|
159 |
+ readBin(tempCon, "integer", n=1, size=8, endian="little", signed=FALSE) |
|
160 |
+ } |
|
161 |
+ |
|
162 |
+ knownCodes <- c(1000, 102, 103, 104, 107, 200, 300, 400, |
|
163 |
+ 401, 402, 403, 404, 405, 406, 407, 408, 409) |
|
164 |
+ names(knownCodes) <- |
|
165 |
+ c("nSNPsRead", # 1000 |
|
166 |
+ "IlluminaID", # 102 |
|
167 |
+ "SD", # 103 |
|
168 |
+ "Mean", # 104 |
|
169 |
+ "NBeads", # 107 |
|
170 |
+ "MidBlock", # 200 |
|
171 |
+ "RunInfo", # 300 |
|
172 |
+ "RedGreen", # 400 |
|
173 |
+ "MostlyNull", # 401 |
|
174 |
+ "Barcode", # 402 |
|
175 |
+ "ChipType", # 403 |
|
176 |
+ "MostlyA", # 404 |
|
177 |
+ "Unknown.1", # 405 |
|
178 |
+ "Unknown.2", # 406 |
|
179 |
+ "Unknown.3", # 407 |
|
180 |
+ "Unknown.4", # 408 |
|
181 |
+ "Unknown.5" # 409 |
|
182 |
+ ) |
|
183 |
+ |
|
184 |
+ nNewFields <- 1 |
|
185 |
+ rownames(fields) <- paste("Null", 1:nFields) |
|
186 |
+ for(i1 in 1:nFields){ |
|
187 |
+ temp <- match(fields[i1,"Field Code"], knownCodes) |
|
188 |
+ if(!is.na(temp)){ |
|
189 |
+ rownames(fields)[i1] <- names(knownCodes)[temp] |
|
190 |
+ }else{ |
|
191 |
+ rownames(fields)[i1] <- paste("newField", nNewFields, sep=".") |
|
192 |
+ nNewFields <- nNewFields + 1 |
|
193 |
+ } |
|
194 |
+ } |
|
195 |
+ |
|
196 |
+ seek(tempCon, fields["nSNPsRead", "Byte Offset"]) |
|
197 |
+ nSNPsRead <- readBin(tempCon, "integer", n=1, size=4, |
|
198 |
+ endian="little", signed=FALSE) |
|
199 |
+ |
|
200 |
+ seek(tempCon, fields["IlluminaID", "Byte Offset"]) |
|
201 |
+ IlluminaID <- readBin(tempCon, "integer", n=nSNPsRead, size=4, |
|
202 |
+ endian="little", signed=FALSE) |
|
203 |
+ |
|
204 |
+ seek(tempCon, fields["SD", "Byte Offset"]) |
|
205 |
+ SD <- readBin(tempCon, "integer", n=nSNPsRead, size=2, |
|
206 |
+ endian="little", signed=FALSE) |
|
207 |
+ |
|
208 |
+ seek(tempCon, fields["Mean", "Byte Offset"]) |
|
209 |
+ Mean <- readBin(tempCon, "integer", n=nSNPsRead, size=2, |
|
210 |
+ endian="little", signed=FALSE) |
|
211 |
+ |
|
212 |
+ seek(tempCon, fields["NBeads", "Byte Offset"]) |
|
213 |
+ NBeads <- readBin(tempCon, "integer", n=nSNPsRead, size=1, signed=FALSE) |
|
214 |
+ |
|
215 |
+ seek(tempCon, fields["MidBlock", "Byte Offset"]) |
|
216 |
+ nMidBlockEntries <- readBin(tempCon, "integer", n=1, size=4, |
|
217 |
+ endian="little", signed=FALSE) |
|
218 |
+ MidBlock <- readBin(tempCon, "integer", n=nMidBlockEntries, size=4, |
|
219 |
+ endian="little", signed=FALSE) |
|
220 |
+ |
|
221 |
+ seek(tempCon, fields["RunInfo", "Byte Offset"]) |
|
222 |
+ nRunInfoBlocks <- readBin(tempCon, "integer", n=1, size=4, |
|
223 |
+ endian="little", signed=FALSE) |
|
224 |
+ RunInfo <- matrix(NA, nRunInfoBlocks, 5) |
|
225 |
+ colnames(RunInfo) <- c("RunTime", "BlockType", "BlockPars", |
|
226 |
+ "BlockCode", "CodeVersion") |
|
227 |
+ for(i1 in 1:nRunInfoBlocks){ |
|
228 |
+ for(i2 in 1:5){ |
|
229 |
+ nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE) |
|
230 |
+ RunInfo[i1,i2] <- readChar(tempCon, nChars) |
|
231 |
+ } |
|
232 |
+ } |
|
233 |
+ |
|
234 |
+ seek(tempCon, fields["RedGreen", "Byte Offset"]) |
|
235 |
+ RedGreen <- readBin(tempCon, "numeric", n=1, size=4, |
|
236 |
+ endian="little", signed=FALSE) |
|
237 |
+ #RedGreen <- readBin(tempCon, "integer", n=4, size=1, |
|
238 |
+ # endian="little", signed=FALSE) |
|
239 |
+ |
|
240 |
+ seek(tempCon, fields["MostlyNull", "Byte Offset"]) |
|
241 |
+ nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE) |
|
242 |
+ MostlyNull <- readChar(tempCon, nChars) |
|
243 |
+ |
|
244 |
+ seek(tempCon, fields["Barcode", "Byte Offset"]) |
|
245 |
+ nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE) |
|
246 |
+ Barcode <- readChar(tempCon, nChars) |
|
247 |
+ |
|
248 |
+ seek(tempCon, fields["ChipType", "Byte Offset"]) |
|
249 |
+ nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE) |
|
250 |
+ ChipType <- readChar(tempCon, nChars) |
|
251 |
+ |
|
252 |
+ seek(tempCon, fields["MostlyA", "Byte Offset"]) |
|
253 |
+ nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE) |
|
254 |
+ MostlyA <- readChar(tempCon, nChars) |
|
255 |
+ |
|
256 |
+ seek(tempCon, fields["Unknown.1", "Byte Offset"]) |
|
257 |
+ nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE) |
|
258 |
+ Unknown.1 <- readChar(tempCon, nChars) |
|
259 |
+ |
|
260 |
+ seek(tempCon, fields["Unknown.2", "Byte Offset"]) |
|
261 |
+ nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE) |
|
262 |
+ Unknown.2 <- readChar(tempCon, nChars) |
|
263 |
+ |
|
264 |
+ seek(tempCon, fields["Unknown.3", "Byte Offset"]) |
|
265 |
+ nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE) |
|
266 |
+ Unknown.3 <- readChar(tempCon, nChars) |
|
267 |
+ |
|
268 |
+ seek(tempCon, fields["Unknown.4", "Byte Offset"]) |
|
269 |
+ nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE) |
|
270 |
+ Unknown.4 <- readChar(tempCon, nChars) |
|
271 |
+ |
|
272 |
+ seek(tempCon, fields["Unknown.5", "Byte Offset"]) |
|
273 |
+ nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE) |
|
274 |
+ Unknown.5 <- readChar(tempCon, nChars) |
|
275 |
+ |
|
276 |
+ close(tempCon) |
|
277 |
+ |
|
278 |
+ Unknowns <- |
|
279 |
+ list(MostlyNull=MostlyNull, |
|
280 |
+ MostlyA=MostlyA, |
|
281 |
+ Unknown.1=Unknown.1, |
|
282 |
+ Unknown.2=Unknown.2, |
|
283 |
+ Unknown.3=Unknown.3, |
|
284 |
+ Unknown.4=Unknown.4, |
|
285 |
+ Unknown.5=Unknown.5) |
|
286 |
+ |
|
287 |
+ Quants <- cbind(Mean, SD, NBeads) |
|
288 |
+ colnames(Quants) <- c("Mean", "SD", "NBeads") |
|
289 |
+ rownames(Quants) <- as.character(IlluminaID) |
|
290 |
+ |
|
291 |
+ idatValues <- |
|
292 |
+ list(fileSize=fileSize, |
|
293 |
+ versionNumber=versionNumber, |
|
294 |
+ nFields=nFields, |
|
295 |
+ fields=fields, |
|
296 |
+ nSNPsRead=nSNPsRead, |
|
297 |
+ #IlluminaID=IlluminaID, |
|
298 |
+ #SD=SD, |
|
299 |
+ #Mean=Mean, |
|
300 |
+ #NBeads=NBeads, |
|
301 |
+ Quants=Quants, |
|
302 |
+ MidBlock=MidBlock, |
|
303 |
+ RunInfo=RunInfo, |
|
304 |
+ RedGreen=RedGreen, |
|
305 |
+ Barcode=Barcode, |
|
306 |
+ ChipType=ChipType, |
|
307 |
+ Unknowns=Unknowns) |
|
308 |
+ |
|
309 |
+ idatValues |
|
310 |
+ |
|
311 |
+} |
|
312 |
+ |
|
313 |
+readBPM <- function(bpmFile){ |
|
314 |
+ |
|
315 |
+ ## Reads and parses Illumina BPM files |
|
316 |
+ |
|
317 |
+ fileSize <- file.info(bpmFile)$size |
|
318 |
+ |
|
319 |
+ tempCon <- file(bpmFile,"rb") |
|
320 |
+ |
|
321 |
+ # The first few bytes of the egtFile are some type of |
|
322 |
+ # header, but there's no related byte offset information. |
|
323 |
+ |
|
324 |
+ prefixCheck <- readChar(tempCon,3) ## should be "BPM" |
|
325 |
+ |
|
326 |
+ null.1 <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE) |
|
327 |
+ ## should be 1 |
|
328 |
+ |
|
329 |
+ versionNumber <- |
|
330 |
+ readBin(tempCon, "integer", n=1, size=4, endian="little", signed=FALSE) |
|
331 |
+ ## should be 4 |
|
332 |
+ |
|
333 |
+ nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE) |
|
334 |
+ chipType <- readChar(tempCon, nChars) |
|
335 |
+ |
|
336 |
+ null.2 <- readBin(tempCon, "integer", n=2, size=1, signed=FALSE) |
|
337 |
+ |
|
338 |
+ csvLines <- readLines(tempCon, 22) |
|
339 |
+ |
|
340 |
+ entriesByteOffset <- seek(tempCon); |
|
341 |
+ nEntries <- readBin(tempCon, "integer", n=1, size=4, |
|
342 |
+ endian="little", signed=FALSE) |
|
343 |
+ |
|
344 |
+ if(FALSE){ |
|
345 |
+ |
|
346 |
+ snpIndexByteOffset <- seek(tempCon) |
|
347 |
+ snpIndex <- readBin(tempCon, "integer", n=nEntries, size=4, |
|
348 |
+ endian="little", signed=FALSE) |
|
349 |
+ ## for the 1M array, these are simply in order from 1 to 1072820. |
|
350 |
+ |
|
351 |
+ snpNamesByteOffset <- seek(tempCon) |
|
352 |
+ snpNames <- rep("A", nEntries) |
|
353 |
+ for(i1 in 1:nEntries){ |
|
354 |
+ nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE) |
|
355 |
+ snpNames[i1] <- readChar(tempCon, nChars) |
|
356 |
+ } |
|
357 |
+ |
|
358 |
+ } |
|
359 |
+ |
|
360 |
+ seek(tempCon, 15278138) |
|
361 |
+ |
|
362 |
+ normIDByteOffset <- seek(tempCon) |
|
363 |
+ normID <- readBin(tempCon, "integer", n=nEntries, size=1, signed=FALSE) + 1 |
|
364 |
+ |
|
365 |
+ newBlockByteOffset <- seek(tempCon) |
|
366 |
+ newBlock <- readBin(tempCon, "integer", n=10000, size=1, signed=FALSE) |
|
367 |
+ |
|
368 |
+ close(tempCon) |
|
369 |
+ |
|
370 |
+ byteOffsets <- list(entriesByteOffset=entriesByteOffset, |
|
371 |
+ #snpIndexByteOffset=snpIndexByteOffset, |
|
372 |
+ #snpNamesByteOffset=snpNamesByteOffset, |
|
373 |
+ normIDByteOffset=normIDByteOffset, |
|
374 |
+ newBlockByteOffset=newBlockByteOffset) |
|
375 |
+ |
|
376 |
+ allStuff <- list(prefixCheck=prefixCheck, |
|
377 |
+ null.1=null.1, |
|
378 |
+ versionNumber=versionNumber, |
|
379 |
+ chipType=chipType, |
|
380 |
+ null.2=null.2, |
|
381 |
+ csvLines=csvLines, |
|
382 |
+ nEntries=nEntries, |
|
383 |
+ #snpIndex=snpIndex, |
|
384 |
+ #snpNames=snpNames, |
|
385 |
+ normID=normID, |
|
386 |
+ newBlock=newBlock, |
|
387 |
+ byteOffsets=byteOffsets) |
|
388 |
+ allStuff |
|
389 |
+ |
|
390 |
+} |
|
391 |
+ |
|
392 |
+ |
|
393 |
+RGtoXY = function(RG, chipType, remove=TRUE, verbose=TRUE) { |
|
394 |
+ chipList = c("human1mv1c", # 1M |
|
395 |
+ "human370v1c", # 370CNV |
|
396 |
+ "human650v3a", # 650Y |
|
397 |
+ "human610quadv1b", # 610 quad |
|
398 |
+ "human660quadv1a", # 660 quad |
|
399 |
+ "human370quadv3c", # 370CNV quad |
|
400 |
+ "human550v3b", # 550K |
|
401 |
+ "human1mduov3b") # 1M Duo |
|
402 |
+ if(missing(chipType)) |
|
403 |
+ chipType = match.arg(annotation(RG), chipList) |
|
404 |
+ else |
|
405 |
+ chipType = match.arg(chipType, chipList) |
|
406 |
+ |
|
407 |
+ pkgname <- getCrlmmAnnotationName(chipType) |
|
408 |
+ if(!require(pkgname, character.only=TRUE, quietly=!verbose)){ |
|
409 |
+ suggCall <- paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="") |
|
410 |
+ msg <- paste("If", pkgname, "is installed on an alternative location, please load it manually by using", suggCall) |
|
411 |
+ message(strwrap(msg)) |
|
412 |
+ stop("Package ", pkgname, " could not be found.") |
|
413 |
+ rm(suggCall, msg) |
|
414 |
+ } |
|
415 |
+ if(verbose) message("Loading chip annotation information.") |
|
416 |
+ loader("annotation.rda", .crlmmPkgEnv, pkgname) |
|
417 |
+# data(annotation, package=pkgname, envir=.crlmmPkgEnv) |
|
418 |
+ annot <- getVarInEnv("annot") |
|
419 |
+ |
|
420 |
+ if(remove) |
|
421 |
+ annot = annot[annot[,"ToCall"]==1,] |
|
422 |
+ nsnps = nrow(annot) |
|
423 |
+ narrays = ncol(RG) |
|
424 |
+ aidcol = match("AddressA_ID", colnames(annot)) |
|
425 |
+ if(is.na(aidcol)) |
|
426 |
+ aidcol = match("Address", colnames(annot)) |
|
427 |
+ bidcol = match("AddressB_ID", colnames(annot)) |
|
428 |
+ if(is.na(bidcol)) |
|
429 |
+ bidcol = match("Address2", colnames(annot)) |
|
430 |
+ aids = annot[, aidcol] |
|
431 |
+ bids = annot[, bidcol] |
|
432 |
+ snpids = annot[,"Name"] |
|
433 |
+ snpbase = annot[,"SNP"] |
|
434 |
+ rrgg = !is.na(bids) & bids!=0 |
|
435 |
+ aord = match(aids, featureNames(RG)) # NAs are possible here |
|
436 |
+ bord = match(bids[!rrgg], featureNames(RG)) # and here |
|
437 |
+# argrg = aids[rrgg] |
|
438 |
+# brgrg = bids[rrgg] |
|
439 |
+ X = Y = Xnb = Ynb = Xse = Yse = zero = matrix(0, nsnps, narrays) |
|
440 |
+ rownames(X) = rownames(Y) = rownames(Xnb) = rownames(Ynb) = rownames(Xse) = rownames(Yse) = snpids |
|
441 |
+ colnames(X) = colnames(Y) = colnames(Xnb) = colnames(Ynb) = colnames(Xse) = colnames(Yse) = sampleNames(RG)$G |
|
442 |
+ X[!is.na(aord),] = exprs(channel(RG, "R"))[aord[!is.na(aord)],] # mostly red |
|
443 |
+ Y[!is.na(aord),] = exprs(channel(RG, "G"))[aord[!is.na(aord)],] # mostly green |
|
444 |
+ Xnb[!is.na(aord),] = exprs(channel(RG, "Rnb"))[aord[!is.na(aord)],] |
|
445 |
+ Ynb[!is.na(aord),] = exprs(channel(RG, "Gnb"))[aord[!is.na(aord)],] |
|
446 |
+ Xse[!is.na(aord),] = exprs(channel(RG, "Rse"))[aord[!is.na(aord)],] |
|
447 |
+ Yse[!is.na(aord),] = exprs(channel(RG, "Gse"))[aord[!is.na(aord)],] |
|
448 |
+ |
|
449 |
+ # Important addition needed to get the correct intensities for the transitional Infinium I SNPs |
|
450 |
+ # these SNPs are intergorated by 2 separate bead types (the second bead type is given in the 'AddressB_ID' |
|
451 |
+ # (or 'Address2') column, and will have bid != 0 | !is.na(bid) |
|
452 |
+ # for these it could be that the X signal is the R/G from the A bead and the Y signal the R/G from the B snp |
|
453 |
+ # at present these are all zeroed out, so will presumably be called as hets |
|
454 |
+ # this is not a problem for the training data, since we get the X and Y intensities directly from BeadStudio output |
|
455 |
+ |
|
456 |
+ X[!is.na(bord),] = 0 |
|
457 |
+ Y[!is.na(bord),] = 0 |
|
458 |
+ Xnb[!is.na(bord),] = 0 |
|
459 |
+ Ynb[!is.na(bord),] = 0 |
|
460 |
+ Xse[!is.na(bord),] = 0 |
|
461 |
+ Yse[!is.na(bord),] = 0 |
|
462 |
+ |
|
463 |
+ zero[X==0 | Y==0] = 1 |
|
464 |
+ |
|
465 |
+ rm(annot) |
|
466 |
+ gc() |
|
467 |
+ |
|
468 |
+ XY = new("NChannelSet", |
|
469 |
+ X=X, Y=Y, |
|
470 |
+ Xnb=Xnb, Ynb=Ynb, |
|
471 |
+ Xse=Xse, Yse=Yse, |
|
472 |
+ zero=zero, |
|
473 |
+ annotation=chipType, |
|
474 |
+ phenoData=RG@phenoData) |
|
475 |
+} |
|
476 |
+ |
|
477 |
+stripNormalize = function(XY, useTarget=TRUE, verbose=TRUE) { |
|
478 |
+ pkgname <- getCrlmmAnnotationName(annotation(XY)) |
|
479 |
+ if(!require(pkgname, character.only=TRUE, quietly=!verbose)){ |
|
480 |
+ suggCall <- paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="") |
|
481 |
+ msg <- paste("If", pkgname, "is installed on an alternative location, please load it manually by using", suggCall) |
|
482 |
+ message(strwrap(msg)) |
|
483 |
+ stop("Package ", pkgname, " could not be found.") |
|
484 |
+ rm(suggCall, msg) |
|
485 |
+ } |
|
486 |
+ if(verbose) message("Loading strip and reference normalization information.") |
|
487 |
+ loader("preprocStuff.rda", .crlmmPkgEnv, pkgname) |
|
488 |
+# data(preprocStuff, package=pkgname, envir=.crlmmPkgEnv) |
|
489 |
+ |
|
490 |
+ stripnum <- getVarInEnv("stripnum") |
|
491 |
+ |
|
492 |
+ if(useTarget) |
|
493 |
+ targetdist = getVarInEnv("reference") |
|
494 |
+ |
|
495 |
+ Xqws = Yqws = matrix(0, nrow(XY), ncol(XY)) |
|
496 |
+ colnames(Xqws) = colnames(Yqws) = sampleNames(XY)$X |
|
497 |
+ rownames(Xqws) = rownames(Yqws) = featureNames(XY) |
|
498 |
+ |
|
499 |
+ if(verbose){ |
|
500 |
+ message("Quantile normalizing ", ncol(XY), " arrays by ", max(stripnum), " strips.") |
|
501 |
+ if (getRversion() > '2.7.0') pb <- txtProgressBar(min=0, max=max(stripnum), style=3) |
|
502 |
+ } |
|
503 |
+ for(s in 1:max(stripnum)) { |
|
504 |
+ if(verbose) { |
|
505 |
+ if (getRversion() > '2.7.0') setTxtProgressBar(pb, s) |
|
506 |
+ else cat(".") |
|
507 |
+ } |
|
508 |
+ sel = stripnum==s |
|
509 |
+ subX = exprs(channel(XY, "X"))[sel,] |
|
510 |
+ subY = exprs(channel(XY, "Y"))[sel,] |
|
511 |
+ if(useTarget) |
|
512 |
+ tmp = normalize.quantiles.use.target(as.matrix(cbind(subX, subY)), targetdist[[s]]) |
|
513 |
+ else |
|
514 |
+ tmp = normalize.quantiles(as.matrix(cbind(subX, subY))) |
|
515 |
+ Xqws[sel,] = tmp[,1:(ncol(tmp)/2)] |
|
516 |
+ Yqws[sel,] = tmp[,(ncol(tmp)/2+1):ncol(tmp)] |
|
517 |
+ rm(subX, subY, tmp, sel) |
|
518 |
+ gc() |
|
519 |
+ } |
|
520 |
+ if(verbose) |
|
521 |
+ cat("\n") |
|
522 |
+ XYNorm = new("NChannelSet", |
|
523 |
+ X=Xqws+16, |
|
524 |
+ Y=Yqws+16, |
|
525 |
+ Xnb=exprs(channel(XY, "Xnb")), |
|
526 |
+ Ynb=exprs(channel(XY, "Ynb")), |
|
527 |
+ Xse=exprs(channel(XY, "Xse")), |
|
528 |
+ Yse=exprs(channel(XY, "Yse")), |
|
529 |
+ zero=exprs(channel(XY, "zero")), |
|
530 |
+ annotation=annotation(XY), |
|
531 |
+ phenoData=XY@phenoData) |
|
532 |
+} |
|
533 |
+ |
|
534 |
+ |
|
535 |
+preprocessInfinium2 = function(XY, mixtureSampleSize=10^5, fitMixture=TRUE, eps=0.1, verbose=TRUE, seed=1, |
|
536 |
+ cdfName, sns, stripNorm=TRUE, useTarget=TRUE) { |
|
537 |
+ if(stripNorm) |
|
538 |
+ XY = stripNormalize(XY, useTarget=useTarget, verbose=verbose) |
|
539 |
+ |
|
540 |
+## MR: the code below is mostly straight from snprma.R |
|
541 |
+ if (missing(sns)) sns <- sampleNames(XY) |
|
542 |
+ if(missing(cdfName)) |
|
543 |
+ cdfName <- annotation(XY) |
|
544 |
+## stuffDir <- changeToCrlmmAnnotationName(cdfName) |
|
545 |
+ pkgname <- getCrlmmAnnotationName(cdfName) |
|
546 |
+ if(!require(pkgname, character.only=TRUE, quietly=!verbose)){ |
|
547 |
+ suggCall <- paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="") |
|
548 |
+ msg <- paste("If", pkgname, "is installed on an alternative location, please load it manually by using", suggCall) |
|
549 |
+ message(strwrap(msg)) |
|
550 |
+ stop("Package ", pkgname, " could not be found.") |
|
551 |
+ rm(suggCall, msg) |
|
552 |
+ } |
|
553 |
+ if(verbose) message("Loading snp annotation and mixture model parameters.") |
|
554 |
+ loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname) |
|
555 |
+ loader("mixtureStuff.rda", .crlmmPkgEnv, pkgname) |
|
556 |
+# data(genotypeStuff, mixtureStuff, package=pkgname, envir=.crlmmPkgEnv) |
|
557 |
+ autosomeIndex <- getVarInEnv("autosomeIndex") |
|
558 |
+ # pnsa <- getVarInEnv("pnsa") |
|
559 |
+ # pnsb <- getVarInEnv("pnsb") |
|
560 |
+ # fid <- getVarInEnv("fid") |
|
561 |
+ # reference <- getVarInEnv("reference") |
|
562 |
+ # aIndex <- getVarInEnv("aIndex") |
|
563 |
+ # bIndex <- getVarInEnv("bIndex") |
|
564 |
+ SMEDIAN <- getVarInEnv("SMEDIAN") |
|
565 |
+ theKnots <- getVarInEnv("theKnots") |
|
566 |
+ gns <- getVarInEnv("gns") |
|
567 |
+ |
|
568 |
+ nprobes <- nrow(XY) |
|
569 |
+ narrays <- ncol(XY) |
|
570 |
+ |
|
571 |
+ ##We will read each cel file, summarize, and run EM one by one |
|
572 |
+ ##We will save parameters of EM to use later |
|
573 |
+ mixtureParams <- matrix(0, 4, narrays) |
|
574 |
+ SNR <- vector("numeric", narrays) |
|
575 |
+ SKW <- vector("numeric", narrays) |
|
576 |
+ |
|
577 |
+ ## This is the sample for the fitting of splines |
|
578 |
+ ## BC: I like better the idea of the user passing the seed, |
|
579 |
+ ## because this might intefere with other analyses |
|
580 |
+ ## (like what happened to GCRMA) |
|
581 |
+ set.seed(seed) |
|
582 |
+ |
|
583 |
+ idx <- sort(sample(autosomeIndex, mixtureSampleSize)) |
|
584 |
+ |
|
585 |
+ ##S will hold (A+B)/2 and M will hold A-B |
|
586 |
+ ##NOTE: We actually dont need to save S. Only for pics etc... |
|
587 |
+ ##f is the correction. we save to avoid recomputing |
|
588 |
+ A <- matrix(as.integer(exprs(channel(XY, "X"))), nprobes, narrays) # matrix(as.integer(0), length(pnsa), length(filenames)) |
|
589 |
+ B <- matrix(as.integer(exprs(channel(XY, "Y"))), nprobes, narrays) # matrix(as.integer(0), length(pnsb), length(filenames)) |
|
590 |
+ |
|
591 |
+ # new lines below - useful to keep track of zeroed out SNPs |
|
592 |
+ zero <- matrix(as.integer(exprs(channel(XY, "zero"))), nprobes, narrays) |
|
593 |
+ |
|
594 |
+ colnames(A) <- colnames(B) <- colnames(zero) <- sampleNames(XY)$X |
|
595 |
+ rownames(A) <- rownames(B) <- rownames(zero) <- featureNames(XY) |
|
596 |
+ |
|
597 |
+ if(verbose){ |
|
598 |
+ message("Calibrating ", narrays, " arrays.") |
|
599 |
+ if (getRversion() > '2.7.0') pb <- txtProgressBar(min=0, max=narrays, style=3) |
|
600 |
+ } |
|
601 |
+ |
|
602 |
+ idx2 <- sample(nprobes, 10^5) |
|
603 |
+ for(i in 1:narrays){ |
|
604 |
+ SKW[i] = mean((A[idx2,i]-mean(A[idx2,i]))^3)/(sd(A[idx2,i])^3) |
|
605 |
+ if(fitMixture){ |
|
606 |
+ S <- (log2(A[idx, i])+log2(B[idx, i]))/2 - SMEDIAN |
|
607 |
+ M <- log2(A[idx, i])-log2(B[idx, i]) |
|
608 |
+ |
|
609 |
+ ##we need to test the choice of eps.. it is not the max diff between funcs |
|
610 |
+ tmp <- fitAffySnpMixture56(S, M, theKnots, eps=eps) |
|
611 |
+ |
|
612 |
+ mixtureParams[, i] <- tmp[["coef"]] |
|
613 |
+ SNR[i] <- tmp[["medF1"]]^2/(tmp[["sigma1"]]^2+tmp[["sigma2"]]^2) |
|
614 |
+ } |
|
615 |
+ if(verbose) { |
|
616 |
+ if (getRversion() > '2.7.0') setTxtProgressBar(pb, i) |
|
617 |
+ else cat(".") |
|
618 |
+ } |
|
619 |
+ } |
|
620 |
+ if (verbose) { |
|
621 |
+ if (getRversion() > '2.7.0') close(pb) |
|
622 |
+ else cat("\n") |
|
623 |
+ } |
|
624 |
+ if (!fitMixture) SNR <- mixtureParams <- NA |
|
625 |
+ ## gns comes from preprocStuff.rda |
|
626 |
+ list(A=A, B=B, zero=zero, sns=sns, gns=gns, SNR=SNR, SKW=SKW, mixtureParams=mixtureParams, cdfName=cdfName) |
|
627 |
+} |
|
628 |
+ |
|
629 |
+ |
|
630 |
+## MR: Could add arguments to allow this function to read in idat data as well, although this add a further 7 arguments, which might over-complicate things |
|
631 |
+crlmmIllumina <- function(RG, XY, stripNorm=TRUE, useTarget=TRUE, |
|
632 |
+ row.names=TRUE, col.names=TRUE, |
|
633 |
+ probs=c(1/3, 1/3, 1/3), DF=6, SNRMin=5, gender=NULL, |
|
634 |
+ seed=1, # save.it=FALSE, load.it=FALSE, intensityFile, |
|
635 |
+ mixtureSampleSize=10^5, eps=0.1, verbose=TRUE, |
|
636 |
+ cdfName, sns, recallMin=10, recallRegMin=1000, |
|
637 |
+ returnParams=FALSE, badSNP=.7) { |
|
638 |
+ if(!missing(RG)) { |
|
639 |
+ if(missing(XY)) |
|
640 |
+ XY = RGtoXY(RG, chipType=cdfName) |
|
641 |
+ else |
|
642 |
+ stop("Both RG and XY specified - please use one or the other") |
|
643 |
+ } |
|
644 |
+# if ((load.it | save.it) & missing(intensityFile)) |
|
645 |
+# stop("'intensityFile' is missing, and you chose either load.it or save.it") |
|
646 |
+ if (missing(sns)) sns <- sampleNames(XY) #basename(filenames) |
|
647 |
+# if (!missing(intensityFile)) |
|
648 |
+# if (load.it & !file.exists(intensityFile)){ |
|
649 |
+# load.it <- FALSE |
|
650 |
+# message("File ", intensityFile, " does not exist.") |
|
651 |
+# message("Not loading it, but running SNPRMA from scratch.") |
|
652 |
+# } |
|
653 |
+# if (!load.it){ |
|
654 |
+# res <- snprma(filenames, fitMixture=TRUE, |
|
655 |
+# mixtureSampleSize=mixtureSampleSize, verbose=verbose, |
|
656 |
+# eps=eps, cdfName=cdfName, sns=sns) |
|
657 |
+ res = preprocessInfinium2(XY, mixtureSampleSize=mixtureSampleSize, fitMixture=TRUE, verbose=verbose, |
|
658 |
+ seed=seed, eps=eps, cdfName=cdfName, sns=sns, stripNorm=stripNorm, useTarget=useTarget) |
|
659 |
+# if(save.it){ |
|
660 |
+# t0 <- proc.time() |
|
661 |
+# save(res, file=intensityFile) |
|
662 |
+# t0 <- proc.time()-t0 |
|
663 |
+# if (verbose) message("Used ", t0[3], " seconds to save ", intensityFile, ".") |
|
664 |
+# } |
|
665 |
+# }else{ |
|
666 |
+# if (verbose) message("Loading ", intensityFile, ".") |
|
667 |
+# obj <- load(intensityFile) |
|
668 |
+# if (verbose) message("Done.") |
|
669 |
+# if (obj != "res") |
|
670 |
+# stop("Object in ", intensityFile, " seems to be invalid.") |
|
671 |
+# } |
|
672 |
+ if(row.names) row.names=res$gns else row.names=NULL |
|
673 |
+ if(col.names) col.names=res$sns else col.names=NULL |
|
674 |
+ |
|
675 |
+ res2 <- crlmmGT(res[["A"]], res[["B"]], res[["SNR"]], |
|
676 |
+ res[["mixtureParams"]], res[["cdfName"]], |
|
677 |
+ gender=gender, row.names=row.names, |
|
678 |
+ col.names=col.names, recallMin=recallMin, |
|
679 |
+ recallRegMin=1000, SNRMin=SNRMin, |
|
680 |
+ returnParams=returnParams, badSNP=badSNP, |
|
681 |
+ verbose=verbose) |
|
682 |
+ |
|
683 |
+ res2[["A"]] <- res[["A"]] # added for copy number analysis |
|
684 |
+ res2[["B"]] <- res[["B"]] # added for copy number analysis |
|
685 |
+ res2[["SNR"]] <- res[["SNR"]] |
|
686 |
+ res2[["SKW"]] <- res[["SKW"]] |
|
687 |
+ res2[["zero"]] <- res[["zero"]] |
|
688 |
+ rm(res) |
|
689 |
+ gc() |
|
690 |
+ # MR: FIXME - for consistency with crlmm, need to save results in a 'SnpSet' object |
|
691 |
+ return(res2) |
|
692 |
+} |
0 | 693 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,77 @@ |
1 |
+\name{crlmmIllumina} |
|
2 |
+\alias{crlmmIllumina} |
|
3 |
+\title{Genotype Illumina Infinium II BeadChip data with CRLMM} |
|
4 |
+\description{ |
|
5 |
+ This implementation of the CRLMM is especially designed for |
|
6 |
+ data from Illumina Infinium II BeadChips. |
|
7 |
+} |
|
8 |
+\usage{ |
|
9 |
+ |
|
10 |
+crlmmIllumina(RG, XY, stripNorm=TRUE, useTarget=TRUE, |
|
11 |
+ row.names=TRUE, col.names=TRUE, |
|
12 |
+ probs=c(1/3, 1/3, 1/3), DF=6, SNRMin=5, |
|
13 |
+ gender=NULL, seed=1, mixtureSampleSize=10^5, |
|
14 |
+ eps=0.1, verbose=TRUE, cdfName, sns, recallMin=10, |
|
15 |
+ recallRegMin=1000, returnParams=FALSE, badSNP=0.7) |
|
16 |
+} |
|
17 |
+ |
|
18 |
+\arguments{ |
|
19 |
+ \item{RG}{\code{NChannelSet} containing R and G bead intensities} |
|
20 |
+ \item{XY}{\code{NChannelSet} containing X and Y bead intensities} |
|
21 |
+ \item{stripNorm}{'logical'. Should the data be strip-level normalized?} |
|
22 |
+ \item{useTarget}{'logical' (only used when \code{stripNorm=TRUE}). |
|
23 |
+ Should the reference HapMap intensities be used in strip-level normalization?} |
|
24 |
+ \item{row.names}{'logical'. Use rownames - SNP names?} |
|
25 |
+ \item{col.names}{'logical'. Use colnames - Sample names?} |
|
26 |
+ \item{probs}{'numeric' vector with priors for AA, AB and BB.} |
|
27 |
+ \item{DF}{'integer' with number of degrees of freedom to use with t-distribution.} |
|
28 |
+ \item{SNRMin}{'numeric' scalar defining the minimum SNR used to filter |
|
29 |
+ out samples.} |
|
30 |
+ \item{gender}{'integer' vector, with same length as 'filenames', |
|
31 |
+ defining sex. (1 - male; 2 - female)} |
|
32 |
+ \item{seed}{'integer' scalar for random number generator (used to |
|
33 |
+ sample \code{mixtureSampleSize} SNPs for mixture model.} |
|
34 |
+ \item{mixtureSampleSize}{'integer'. The number of SNP's to be used |
|
35 |
+ when fitting the mixture model.} |
|
36 |
+ \item{eps}{Minimum change for mixture model.} |
|
37 |
+ \item{verbose}{'logical'.} |
|
38 |
+ \item{cdfName}{'character' defining the chip annotation (manifest) to use |
|
39 |
+ ('human370v1c', human550v3b', 'human650v3a', 'human1mv1c', |
|
40 |
+ 'human370quadv3c', 'human610quadv1b', 'human660quadv1a' 'human1mduov3b')} |
|
41 |
+ \item{sns}{'character' vector with sample names to be used.} |
|
42 |
+ \item{recallMin}{'integer'. Minimum number of samples for recalibration.} |
|
43 |
+ \item{recallRegMin}{'integer'. Minimum number of SNP's for regression.} |
|
44 |
+ \item{returnParams}{'logical'. Return recalibrated parameters.} |
|
45 |
+ \item{badSNP}{'numeric'. Threshold to flag as bad SNP (affects batchQC)} |
|
46 |
+} |
|
47 |
+\value{ |
|
48 |
+ \item{A}{normalized A allele intensity} |
|
49 |
+ \item{B}{normalized B allele intensity} |
|
50 |
+ \item{calls}{Genotype calls (1 - AA, 2 - AB, 3 - BB)} |
|
51 |
+ \item{confs}{Confidence scores 'round(-1000*log2(1-p))'} |
|
52 |
+ \item{SNPQC}{SNP Quality Scores} |
|
53 |
+ \item{batchQC}{Batch Quality Score} |
|
54 |
+ \item{params}{Recalibrated parameters} |
|
55 |
+} |
|
56 |
+ |
|
57 |
+\details{ |
|
58 |
+ Note: The user should specify either the \code{RG} or \code{XY} |
|
59 |
+ intensities, not both. |
|
60 |
+} |
|
61 |
+ |
|
62 |
+\references{ |
|
63 |
+ Carvalho B, Bengtsson H, Speed TP, Irizarry RA. Exploration, |
|
64 |
+ normalization, and genotype calls of high-density oligonucleotide SNP |
|
65 |
+ array data. Biostatistics. 2007 Apr;8(2):485-99. Epub 2006 Dec |
|
66 |
+ 22. PMID: 17189563. |
|
67 |
+ |
|
68 |
+ Carvalho B, Louis TA, Irizarry RA. Describing Uncertainty in |
|
69 |
+ Genome-wide Genotype Calling. (in prep) |
|
70 |
+} |
|
71 |
+ |
|
72 |
+\author{Matt Ritchie} |
|
73 |
+ |
|
74 |
+\examples{ |
|
75 |
+## crlmmOut = crlmmIllumina(RG) |
|
76 |
+} |
|
77 |
+\keyword{classif} |
0 | 78 |
new file mode 100755 |
... | ... |
@@ -0,0 +1,71 @@ |
1 |
+\name{readIdatFiles} |
|
2 |
+\alias{readIdatFiles} |
|
3 |
+ |
|
4 |
+\title{Reads Idat Files from Infinium II Illumina BeadChips} |
|
5 |
+ |
|
6 |
+\description{ |
|
7 |
+ Reads intensity information for each bead type from |
|
8 |
+ .idat files of Infinium II genotyping BeadChips} |
|
9 |
+ |
|
10 |
+\usage{ |
|
11 |
+readIdatFiles(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".", |
|
12 |
+ arrayInfoColNames=list(barcode="SentrixBarcode_A", |
|
13 |
+ position="SentrixPosition_A"), |
|
14 |
+ highDensity=FALSE, sep="_", |
|
15 |
+ fileExt=list(green="Grn.idat", red="Red.idat")) |
|
16 |
+} |
|
17 |
+ |
|
18 |
+\arguments{ |
|
19 |
+ \item{sampleSheet}{\code{data.frame} containing Illumina sample sheet |
|
20 |
+ information (for required columns, refer to BeadStudio Genotyping |
|
21 |
+ guide - Appendix A).} |
|
22 |
+ \item{arrayNames}{character vector containing names of arrays to be |
|
23 |
+ read in. If \code{NULL}, all arrays that can be found in the |
|
24 |
+ specified working directory will be read in.} |
|
25 |
+ \item{ids}{vector containing ids of probes to be read in. If |
|
26 |
+ \code{NULL} all probes found on the first array are read in.} |
|
27 |
+ \item{path}{character string specifying the location of files to be |
|
28 |
+ read by the function} |
|
29 |
+ \item{arrayInfoColNames}{(used when \code{sampleSheet} is specified) |
|
30 |
+ list containing elements 'barcode' which indicates column names in |
|
31 |
+ the \code{sampleSheet} which contains the arrayNumber/barcode number |
|
32 |
+ and 'position' which indicates the strip number. In older style |
|
33 |
+ sample sheets, this information is combined (usually in a column |
|
34 |
+ named 'SentrixPosition') and this should be specified as |
|
35 |
+ \code{list(barcode=NULL, position="SentrixPosition")}} |
|
36 |
+ \item{highDensity}{logical (used when \code{sampleSheet} is |
|
37 |
+ specified). If \code{TRUE}, array extensions '_A', '_B' in |
|
38 |
+ sampleSheet are replaced with 'R01C01', 'R01C02' etc.} |
|
39 |
+ \item{sep}{character string specifying separator used in .idat file |
|
40 |
+ names.} |
|
41 |
+ \item{fileExt}{list containing elements 'Green' and 'Red' which |
|
42 |
+ specify the .idat file extension for the Cy3 and Cy5 channels.} |
|
43 |
+} |
|
44 |
+ |
|
45 |
+\details{ |
|
46 |
+The summarised Cy3 (G) and Cy5 (R) intensity, number of beads that |
|
47 |
+were used in each channel and standard errors (all on the orginal scale) |
|
48 |
+are read in from the .idat files. |
|
49 |
+ |
|
50 |
+Where available, a \code{sampleSheet} data.frame, in the same format |
|
51 |
+as used by BeadStudio (columns 'Sample_ID', 'SentrixBarcode_A' and |
|
52 |
+'SentrixPosition_A' are required) which keeps track of sample |
|
53 |
+information can be specified. |
|
54 |
+ |
|
55 |
+Thanks to Keith Baggerly who provided the code to read in the binary .idat files. |
|
56 |
+} |
|
57 |
+ |
|
58 |
+\value{ |
|
59 |
+ NChannelSet with intensity data (\code{R}, \code{G}), number of beads |
|
60 |
+ (\code{Rnb}, \code{Gnb}) and standard errors (\code{Rse}, \code{Gse}) |
|
61 |
+ for each bead type. |
|
62 |
+} |
|
63 |
+ |
|
64 |
+\author{Matt Ritchie} |
|
65 |
+ |
|
66 |
+\examples{ |
|
67 |
+ |
|
68 |
+#RG = readIdatFiles() |
|
69 |
+ |
|
70 |
+} |
|
71 |
+\keyword{IO} |