Browse code

added code to handle illumina snp array data

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@38297 bc3139a8-67e5-0310-9ffc-ced21a209358

unknown authored on 30/03/2009 02:25:09
Showing6 changed files

... ...
@@ -72,3 +72,12 @@ BioC data packages
72 72
 * cnrma function requires the cdfName
73 73
 
74 74
 * update man files for cnrma and computeCopynumber
75
+
76
+2009-03-30 Matt Ritchie - committed version 1.0.66
77
+
78
+* added functions to read in idat files, pre-process and genotype 
79
+data from Illumina SNP arrays (functions named readIdatFiles() 
80
+and crlmmIllumina() in crlmm-illumina.R)
81
+
82
+* added man pages for these functions
83
+
... ...
@@ -1,8 +1,8 @@
1 1
 Package: crlmm
2 2
 Type: Package
3 3
 Title: Genotype Calling (CRLMM) and Copy Number Analysis tool for SNP 5.0 and 6.0 arrays.
4
-Version: 1.0.65
5
-Date: 2008-12-28
4
+Version: 1.0.66
5
+Date: 2008-12-30
6 6
 Author: Rafael A Irizarry, Benilton S Carvalho <bcarvalh@jhsph.edu>, Robert Scharpf <rscharpf@jhsph.edu>
7 7
 Maintainer: Benilton S Carvalho <bcarvalh@jhsph.edu>, Robert Scharpf <rscharpf@jhsph.edu>
8 8
 Description: Faster implementation of CRLMM specific to SNP 5.0 and 6.0 arrays, as well as a copy number tool specific to 6.0.
... ...
@@ -13,6 +13,7 @@ Collate: AllClasses.R
13 13
          methods-SnpSet.R
14 14
 	 cnrma-functions.R
15 15
          crlmm-functions.R
16
+         crlmm-illumina.R
16 17
          snprma-functions.R
17 18
          utils.R
18 19
          zzz.R
... ...
@@ -7,5 +7,5 @@ importFrom(stats, coef, cov, dnorm, kmeans, lm, mad, median, quantile, sd)
7 7
 importFrom(genefilter, rowSds)
8 8
 importFrom(mvtnorm, dmvnorm)
9 9
 
10
-export("crlmm", "list.celfiles", "computeCopynumber", "cnrma", "celDates")
10
+export("crlmm", "list.celfiles", "computeCopynumber", "cnrma", "celDates", "crlmmIllumina", "readIdatFiles")
11 11
 exportMethods("calls", "confs")
12 12
new file mode 100644
... ...
@@ -0,0 +1,692 @@
1
+# function below works OK provided all .idat files are in the current working directory
2
+# - could add an option to allow files in Illumina directory structure to be handled
3
+# or to use the optional 'Path' column in sampleSheet
4
+# - there is a lot of header information that is currently discarded - could try and store this somewhere in the resulting NChannelSet
5
+
6
+readIdatFiles = function(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".",
7
+                                arrayInfoColNames=list(barcode="SentrixBarcode_A", position="SentrixPosition_A"),
8
+                                highDensity=FALSE, sep="_", fileExt=list(green="Grn.idat", red="Red.idat")) {
9
+       if(!is.null(sampleSheet)) { # get array info from Illumina's sample sheet
10
+         arrayNames=NULL
11
+         if(!is.null(arrayInfoColNames$barcode) && (arrayInfoColNames$barcode %in% colnames(sampleSheet))) {
12
+           barcode = sampleSheet[,arrayInfoColNames$barcode]
13
+           arrayNames=barcode
14
+         }
15
+         if(!is.null(arrayInfoColNames$position) && (arrayInfoColNames$position %in% colnames(sampleSheet))) {  
16
+           position = sampleSheet[,arrayInfoColNames$position]
17
+           if(is.null(arrayNames))
18
+             arrayNames=position
19
+           else
20
+             arrayNames = paste(arrayNames, position, sep=sep)
21
+           if(highDensity) {
22
+             hdExt = list(A="R01C01", B="R01C02", C="R02C01", D="R02C02")
23
+             for(i in names(hdExt))
24
+               arrayNames = sub(paste(sep, i, sep=""), paste(sep, hdExt[[i]], sep=""), arrayNames)
25
+            }
26
+         }
27
+#         pd = new("AnnotatedDataFrame", data = sampleSheet)
28
+       }
29
+       if(is.null(arrayNames)) {
30
+         arrayNames = gsub(paste(sep, fileExt$green, sep=""), "", dir(pattern=fileExt$green, path=path))
31
+         if(!is.null(sampleSheet)) {
32
+           sampleSheet=NULL
33
+           cat("Could not find required info in \'sampleSheet\' - ignoring.  Check \'sampleSheet\' and/or \'arrayInfoColNames\'\n")
34
+         }
35
+       }
36
+       grnfiles = paste(arrayNames, fileExt$green, sep=sep)
37
+       redfiles = paste(arrayNames, fileExt$red, sep=sep)
38
+       if(length(grnfiles)==0 || length(redfiles)==0)
39
+         stop("Cannot find .idat files")
40
+       if(length(grnfiles)!=length(redfiles))
41
+         stop("Cannot find matching .idat files")
42
+       if(!all(c(redfiles,grnfiles) %in% dir(path=path)))
43
+         stop("Missing .idat files: red\n", paste(redfiles[!(redfiles %in% dir(path=path))], sep=" "), "\n green\n",
44
+                 paste(grnfiles[!(grnfiles %in% dir(path=path))], sep=" "))
45
+       grnidats = file.path(path, grnfiles)
46
+       redidats = file.path(path, redfiles)
47
+       
48
+       headerInfo = list(nProbes = rep(NA, length(arrayNames)),
49
+                         Barcode = rep(NA, length(arrayNames)),
50
+                         ChipType = rep(NA, length(arrayNames)),
51
+                         Manifest = rep(NA, length(arrayNames)), # not sure about this one - sometimes blank
52
+                         Position = rep(NA, length(arrayNames))) # this may also vary a bit
53
+       # read in the data
54
+       for(i in seq(along=arrayNames)) {
55
+         cat("reading", arrayNames[i], "\t")
56
+         idsG = idsR = G = R = NULL
57
+#         if(grnfiles[i] %in% dir(path=path)) {
58
+         cat(paste(sep, fileExt$green, sep=""), "\t")
59
+         G = readIDAT(grnidats[i])
60
+         idsG = rownames(G$Quants)
61
+#         }
62
+#         else
63
+#           stop("Could not find ", grnidats[i])
64
+         
65
+#         if(redfiles[i] %in% dir(path=path)) {
66
+          cat(paste(sep, fileExt$red, sep=""), "\n")
67
+          R = readIDAT(redidats[i])
68
+          idsR = rownames(R$Quants)
69
+#         }
70
+#         else
71
+#           stop("Could not find ", redidats[i])
72
+         
73
+         headerInfo$nProbes[i] = G$nSNPsRead
74
+         headerInfo$Barcode[i] = G$Barcode
75
+         headerInfo$ChipType[i] = G$ChipType
76
+         headerInfo$Manifest[i] = G$Unknown$MostlyNull
77
+         headerInfo$Position[i] = G$Unknowns$MostlyA
78
+         
79
+         if(headerInfo$ChipType[i]!=headerInfo$ChipType[1] || headerInfo$Manifest[i]!=headerInfo$Manifest[1])
80
+                  # || headerInfo$nProbes[i]!=headerInfo$nProbes[1] ## removed this condition as some arrays used the same manifest
81
+                  # but differed by a few SNPs for some reason - most of the chip was the same though
82
+           stop("Chips are not of all of the same type - please check your data")
83
+         if(i==1) {
84
+           if(is.null(ids) && !is.null(G))
85
+             ids = idsG
86
+           else
87
+             stop("Could not find probe IDs")
88
+           nprobes = length(ids)
89
+           narrays = length(arrayNames)
90
+           Gintens = Rintens = Gnobeads = Rnobeads = Gstderr = Rstderr = matrix(NA, nprobes, narrays)
91
+           rownames(Gintens) = rownames(Rintens) = rownames(Gnobeads) = rownames(Rnobeads) = rownames(Gstderr) = rownames(Rstderr) = ids
92
+           if(!is.null(sampleSheet))
93
+             colnames(Gintens) = colnames(Rintens) =  colnames(Gnobeads) = colnames(Rnobeads) = colnames(Gstderr) = colnames(Rstderr) = sampleSheet$Sample_ID
94
+           else
95
+             colnames(Gintens) = colnames(Rintens) =  colnames(Gnobeads) = colnames(Rnobeads) = colnames(Gstderr) = colnames(Rstderr) = arrayNames
96
+         }
97
+         
98
+         if(length(ids)==length(idsG)) {
99
+           if(sum(ids==idsG)==nprobes) {
100
+             Gintens[,i] = G$Quants[, "Mean"]
101
+             Gnobeads[,i] = G$Quants[, "NBeads"]
102
+             Gstderr[,i] = G$Quants[, "SD"]
103
+           }
104
+         }
105
+         else {
106
+           indG = match(ids, idsG)
107
+           Gintens[,i] = G$Quants[indG, "Mean"]
108
+           Gnobeads[,i] = G$Quants[indG, "NBeads"]
109
+           Gstderr[,i] = G$Quants[indG, "SD"]
110
+         }
111
+         if(length(ids)==length(idsG)) {   
112
+           if(sum(ids==idsR)==nprobes) {
113
+             Rintens[,i] = R$Quants[ ,"Mean"]
114
+             Rnobeads[,i] = R$Quants[ ,"NBeads"]
115
+             Rstderr[,i] = R$Quants[ ,"SD"]
116
+           }
117
+         }
118
+         else {
119
+           indR = match(ids, idsR)
120
+           Rintens[,i] = R$Quants[indR, "Mean"]
121
+           Rnobeads[,i] = R$Quants[indR, "NBeads"]
122
+           Rstderr[,i] = R$Quants[indR, "SD"]
123
+         }
124
+       }
125
+       if(is.null(sampleSheet))
126
+         pd = new("AnnotatedDataFrame", data = data.frame(Sample_ID=arrayNames))
127
+       else
128
+         pd = new("AnnotatedDataFrame", data = sampleSheet) 
129
+       RG = new("NChannelSet",
130
+                R=Rintens, G=Gintens, Rnb=Rnobeads, Gnb=Gnobeads,
131
+                Rse=Rstderr, Gse=Gstderr, annotation=headerInfo$Manifest[1],
132
+                phenoData = pd)
133
+}
134
+
135
+
136
+# the readIDAT() and readBPM() functions below were provided by Keith Baggerly, 27/8/2008
137
+readIDAT <- function(idatFile){
138
+
139
+  fileSize <- file.info(idatFile)$size
140
+
141
+  tempCon <- file(idatFile,"rb")
142
+  prefixCheck <- readChar(tempCon,4)
143
+  if(prefixCheck != "IDAT"){
144
+
145
+  }
146
+
147
+  versionNumber <- readBin(tempCon, "integer", n=1, size=8, 
148
+                           endian="little", signed=FALSE)
149
+  
150
+  nFields <- readBin(tempCon, "integer", n=1, size=4, 
151
+                     endian="little", signed=FALSE)
152
+
153
+  fields <- matrix(0,nFields,3);
154
+  colnames(fields) <- c("Field Code", "Byte Offset", "Bytes")
155
+  for(i1 in 1:nFields){
156
+    fields[i1,"Field Code"] <- 
157
+      readBin(tempCon, "integer", n=1, size=2, endian="little", signed=FALSE)
158
+    fields[i1,"Byte Offset"] <- 
159
+      readBin(tempCon, "integer", n=1, size=8, endian="little", signed=FALSE)
160
+  }
161
+
162
+  knownCodes <- c(1000, 102, 103, 104, 107, 200, 300, 400,
163
+                  401, 402, 403, 404, 405, 406, 407, 408, 409)
164
+  names(knownCodes) <- 
165
+    c("nSNPsRead",  # 1000
166
+      "IlluminaID", #  102
167
+      "SD",         #  103
168
+      "Mean",       #  104
169
+      "NBeads",     #  107
170
+      "MidBlock",   #  200
171
+      "RunInfo",    #  300
172
+      "RedGreen",   #  400
173
+      "MostlyNull", #  401
174
+      "Barcode",    #  402
175
+      "ChipType",   #  403
176
+      "MostlyA",    #  404
177
+      "Unknown.1",  #  405
178
+      "Unknown.2",  #  406
179
+      "Unknown.3",  #  407
180
+      "Unknown.4",  #  408
181
+      "Unknown.5"   #  409
182
+      )
183
+
184
+  nNewFields <- 1
185
+  rownames(fields) <- paste("Null", 1:nFields)
186
+  for(i1 in 1:nFields){
187
+    temp <- match(fields[i1,"Field Code"], knownCodes)
188
+    if(!is.na(temp)){
189
+      rownames(fields)[i1] <- names(knownCodes)[temp]
190
+    }else{
191
+      rownames(fields)[i1] <- paste("newField", nNewFields, sep=".")
192
+      nNewFields <- nNewFields + 1
193
+    }
194
+  }
195
+
196
+  seek(tempCon, fields["nSNPsRead", "Byte Offset"])
197
+  nSNPsRead <- readBin(tempCon, "integer", n=1, size=4, 
198
+                       endian="little", signed=FALSE)
199
+
200
+  seek(tempCon, fields["IlluminaID", "Byte Offset"])
201
+  IlluminaID <- readBin(tempCon, "integer", n=nSNPsRead, size=4, 
202
+                       endian="little", signed=FALSE)
203
+
204
+  seek(tempCon, fields["SD", "Byte Offset"])
205
+  SD <- readBin(tempCon, "integer", n=nSNPsRead, size=2, 
206
+                endian="little", signed=FALSE)
207
+
208
+  seek(tempCon, fields["Mean", "Byte Offset"])
209
+  Mean <- readBin(tempCon, "integer", n=nSNPsRead, size=2, 
210
+                  endian="little", signed=FALSE)
211
+
212
+  seek(tempCon, fields["NBeads", "Byte Offset"])
213
+  NBeads <- readBin(tempCon, "integer", n=nSNPsRead, size=1, signed=FALSE)
214
+
215
+  seek(tempCon, fields["MidBlock", "Byte Offset"])
216
+  nMidBlockEntries <- readBin(tempCon, "integer", n=1, size=4, 
217
+                              endian="little", signed=FALSE)
218
+  MidBlock <- readBin(tempCon, "integer", n=nMidBlockEntries, size=4, 
219
+                      endian="little", signed=FALSE)
220
+
221
+  seek(tempCon, fields["RunInfo", "Byte Offset"])
222
+  nRunInfoBlocks <- readBin(tempCon, "integer", n=1, size=4, 
223
+                            endian="little", signed=FALSE)
224
+  RunInfo <- matrix(NA, nRunInfoBlocks, 5)
225
+  colnames(RunInfo) <- c("RunTime", "BlockType", "BlockPars", 
226
+                         "BlockCode", "CodeVersion")
227
+  for(i1 in 1:nRunInfoBlocks){
228
+    for(i2 in 1:5){
229
+      nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
230
+      RunInfo[i1,i2] <- readChar(tempCon, nChars)
231
+    }
232
+  }
233
+
234
+  seek(tempCon, fields["RedGreen", "Byte Offset"])
235
+  RedGreen <- readBin(tempCon, "numeric", n=1, size=4, 
236
+                      endian="little", signed=FALSE)
237
+  #RedGreen <- readBin(tempCon, "integer", n=4, size=1, 
238
+  #                    endian="little", signed=FALSE)
239
+
240
+  seek(tempCon, fields["MostlyNull", "Byte Offset"])
241
+  nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
242
+  MostlyNull <- readChar(tempCon, nChars) 
243
+
244
+  seek(tempCon, fields["Barcode", "Byte Offset"])
245
+  nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
246
+  Barcode <- readChar(tempCon, nChars) 
247
+
248
+  seek(tempCon, fields["ChipType", "Byte Offset"])
249
+  nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
250
+  ChipType <- readChar(tempCon, nChars) 
251
+
252
+  seek(tempCon, fields["MostlyA", "Byte Offset"])
253
+  nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
254
+  MostlyA <- readChar(tempCon, nChars) 
255
+
256
+  seek(tempCon, fields["Unknown.1", "Byte Offset"])
257
+  nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
258
+  Unknown.1 <- readChar(tempCon, nChars) 
259
+
260
+  seek(tempCon, fields["Unknown.2", "Byte Offset"])
261
+  nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
262
+  Unknown.2 <- readChar(tempCon, nChars) 
263
+
264
+  seek(tempCon, fields["Unknown.3", "Byte Offset"])
265
+  nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
266
+  Unknown.3 <- readChar(tempCon, nChars) 
267
+
268
+  seek(tempCon, fields["Unknown.4", "Byte Offset"])
269
+  nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
270
+  Unknown.4 <- readChar(tempCon, nChars) 
271
+
272
+  seek(tempCon, fields["Unknown.5", "Byte Offset"])
273
+  nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
274
+  Unknown.5 <- readChar(tempCon, nChars) 
275
+
276
+  close(tempCon)
277
+
278
+  Unknowns <- 
279
+    list(MostlyNull=MostlyNull,
280
+         MostlyA=MostlyA,
281
+         Unknown.1=Unknown.1,
282
+         Unknown.2=Unknown.2,
283
+         Unknown.3=Unknown.3,
284
+         Unknown.4=Unknown.4,
285
+         Unknown.5=Unknown.5)
286
+
287
+  Quants <- cbind(Mean, SD, NBeads)
288
+  colnames(Quants) <- c("Mean", "SD", "NBeads")
289
+  rownames(Quants) <- as.character(IlluminaID)
290
+
291
+  idatValues <- 
292
+    list(fileSize=fileSize, 
293
+         versionNumber=versionNumber, 
294
+         nFields=nFields, 
295
+         fields=fields,
296
+         nSNPsRead=nSNPsRead, 
297
+         #IlluminaID=IlluminaID, 
298
+         #SD=SD, 
299
+         #Mean=Mean, 
300
+         #NBeads=NBeads,
301
+         Quants=Quants,
302
+         MidBlock=MidBlock, 
303
+         RunInfo=RunInfo, 
304
+         RedGreen=RedGreen, 
305
+         Barcode=Barcode, 
306
+         ChipType=ChipType,
307
+         Unknowns=Unknowns)
308
+
309
+  idatValues
310
+
311
+}
312
+
313
+readBPM <- function(bpmFile){
314
+
315
+  ## Reads and parses Illumina BPM files
316
+  
317
+  fileSize <- file.info(bpmFile)$size
318
+
319
+  tempCon <- file(bpmFile,"rb")
320
+
321
+  # The first few bytes of the egtFile are some type of
322
+  # header, but there's no related byte offset information. 
323
+
324
+  prefixCheck <- readChar(tempCon,3) ## should be "BPM"
325
+
326
+  null.1 <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
327
+  ## should be 1  
328
+  
329
+  versionNumber <- 
330
+    readBin(tempCon, "integer", n=1, size=4, endian="little", signed=FALSE)
331
+  ## should be 4
332
+
333
+  nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
334
+  chipType <- readChar(tempCon, nChars)
335
+
336
+  null.2 <- readBin(tempCon, "integer", n=2, size=1, signed=FALSE)
337
+
338
+  csvLines <- readLines(tempCon, 22)
339
+
340
+  entriesByteOffset <- seek(tempCon);
341
+  nEntries <- readBin(tempCon, "integer", n=1, size=4,
342
+                      endian="little", signed=FALSE)
343
+    
344
+  if(FALSE){
345
+    
346
+    snpIndexByteOffset <- seek(tempCon)
347
+    snpIndex <- readBin(tempCon, "integer", n=nEntries, size=4,
348
+                        endian="little", signed=FALSE)
349
+    ## for the 1M array, these are simply in order from 1 to 1072820.
350
+    
351
+    snpNamesByteOffset <- seek(tempCon)
352
+    snpNames <- rep("A", nEntries)
353
+    for(i1 in 1:nEntries){
354
+      nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
355
+      snpNames[i1] <- readChar(tempCon, nChars)
356
+    }
357
+
358
+  }
359
+
360
+  seek(tempCon, 15278138)
361
+    
362
+  normIDByteOffset <- seek(tempCon)
363
+  normID <- readBin(tempCon, "integer", n=nEntries, size=1, signed=FALSE) + 1
364
+  
365
+  newBlockByteOffset <- seek(tempCon)
366
+  newBlock <- readBin(tempCon, "integer", n=10000, size=1, signed=FALSE)
367
+  
368
+  close(tempCon)
369
+
370
+  byteOffsets <- list(entriesByteOffset=entriesByteOffset,
371
+                      #snpIndexByteOffset=snpIndexByteOffset, 
372
+                      #snpNamesByteOffset=snpNamesByteOffset,
373
+                      normIDByteOffset=normIDByteOffset,
374
+                      newBlockByteOffset=newBlockByteOffset)
375
+  
376
+  allStuff <- list(prefixCheck=prefixCheck,
377
+                   null.1=null.1,
378
+                   versionNumber=versionNumber,
379
+                   chipType=chipType,
380
+                   null.2=null.2,
381
+                   csvLines=csvLines,
382
+                   nEntries=nEntries,
383
+                   #snpIndex=snpIndex,
384
+                   #snpNames=snpNames,
385
+                   normID=normID,
386
+                   newBlock=newBlock,
387
+                   byteOffsets=byteOffsets)
388
+  allStuff
389
+  
390
+}
391
+
392
+
393
+RGtoXY = function(RG, chipType, remove=TRUE, verbose=TRUE) {
394
+  chipList = c("human1mv1c",             # 1M
395
+               "human370v1c",            # 370CNV
396
+               "human650v3a",            # 650Y
397
+               "human610quadv1b",        # 610 quad
398
+               "human660quadv1a",        # 660 quad
399
+               "human370quadv3c",        # 370CNV quad
400
+               "human550v3b",            # 550K
401
+               "human1mduov3b")          # 1M Duo   
402
+  if(missing(chipType))
403
+     chipType = match.arg(annotation(RG), chipList)
404
+  else
405
+     chipType = match.arg(chipType, chipList)
406
+  
407
+  pkgname <- getCrlmmAnnotationName(chipType)
408
+  if(!require(pkgname, character.only=TRUE, quietly=!verbose)){
409
+     suggCall <- paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="")
410
+     msg <- paste("If", pkgname, "is installed on an alternative location, please load it manually by using", suggCall)
411
+     message(strwrap(msg))
412
+     stop("Package ", pkgname, " could not be found.")
413
+     rm(suggCall, msg)
414
+  }
415
+  if(verbose) message("Loading chip annotation information.")
416
+    loader("annotation.rda", .crlmmPkgEnv, pkgname)
417
+#    data(annotation, package=pkgname, envir=.crlmmPkgEnv)
418
+  annot <- getVarInEnv("annot")
419
+  
420
+  if(remove)
421
+    annot = annot[annot[,"ToCall"]==1,]
422
+  nsnps = nrow(annot)
423
+  narrays = ncol(RG)
424
+  aidcol = match("AddressA_ID", colnames(annot))
425
+  if(is.na(aidcol))
426
+    aidcol = match("Address", colnames(annot))
427
+  bidcol = match("AddressB_ID", colnames(annot))
428
+  if(is.na(bidcol)) 
429
+    bidcol = match("Address2", colnames(annot))
430
+  aids = annot[, aidcol]
431
+  bids = annot[, bidcol]
432
+  snpids = annot[,"Name"]
433
+  snpbase = annot[,"SNP"] 
434
+  rrgg = !is.na(bids) & bids!=0  
435
+  aord = match(aids, featureNames(RG)) # NAs are possible here
436
+  bord = match(bids[!rrgg], featureNames(RG)) # and here
437
+#  argrg = aids[rrgg]
438
+#  brgrg = bids[rrgg]
439
+  X = Y = Xnb = Ynb = Xse = Yse = zero = matrix(0, nsnps, narrays)
440
+  rownames(X) = rownames(Y) = rownames(Xnb) = rownames(Ynb) = rownames(Xse) = rownames(Yse) = snpids
441
+  colnames(X) = colnames(Y) = colnames(Xnb) = colnames(Ynb) = colnames(Xse) = colnames(Yse) = sampleNames(RG)$G
442
+  X[!is.na(aord),] = exprs(channel(RG, "R"))[aord[!is.na(aord)],] # mostly red
443
+  Y[!is.na(aord),] = exprs(channel(RG, "G"))[aord[!is.na(aord)],] # mostly green
444
+  Xnb[!is.na(aord),] = exprs(channel(RG, "Rnb"))[aord[!is.na(aord)],]
445
+  Ynb[!is.na(aord),] = exprs(channel(RG, "Gnb"))[aord[!is.na(aord)],]
446
+  Xse[!is.na(aord),] = exprs(channel(RG, "Rse"))[aord[!is.na(aord)],]
447
+  Yse[!is.na(aord),] = exprs(channel(RG, "Gse"))[aord[!is.na(aord)],]
448
+  
449
+  # Important addition needed to get the correct intensities for the transitional Infinium I SNPs
450
+  # these SNPs are intergorated by 2 separate bead types (the second bead type is given in the 'AddressB_ID'
451
+  # (or 'Address2') column, and will have bid != 0 | !is.na(bid)
452
+  # for these it could be that the X signal is the R/G from the A bead and the Y signal the R/G from the B snp
453
+  # at present these are all zeroed out, so will presumably be called as hets
454
+  # this is not a problem for the training data, since we get the X and Y intensities directly from BeadStudio output
455
+  
456
+  X[!is.na(bord),] = 0
457
+  Y[!is.na(bord),] = 0
458
+  Xnb[!is.na(bord),] = 0
459
+  Ynb[!is.na(bord),] = 0
460
+  Xse[!is.na(bord),] = 0
461
+  Yse[!is.na(bord),] = 0
462
+
463
+  zero[X==0 | Y==0] = 1
464
+  
465
+  rm(annot)
466
+  gc()
467
+  
468
+  XY = new("NChannelSet",
469
+           X=X, Y=Y,
470
+           Xnb=Xnb, Ynb=Ynb,
471
+           Xse=Xse, Yse=Yse,
472
+           zero=zero,
473
+           annotation=chipType,
474
+           phenoData=RG@phenoData)
475
+}
476
+
477
+stripNormalize = function(XY, useTarget=TRUE, verbose=TRUE) {
478
+  pkgname <- getCrlmmAnnotationName(annotation(XY))
479
+  if(!require(pkgname, character.only=TRUE, quietly=!verbose)){
480
+    suggCall <- paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="")
481
+    msg <- paste("If", pkgname, "is installed on an alternative location, please load it manually by using", suggCall)
482
+    message(strwrap(msg))
483
+    stop("Package ", pkgname, " could not be found.")
484
+    rm(suggCall, msg)
485
+  }
486
+  if(verbose) message("Loading strip and reference normalization information.")
487
+  loader("preprocStuff.rda", .crlmmPkgEnv, pkgname)
488
+#  data(preprocStuff, package=pkgname, envir=.crlmmPkgEnv)
489
+
490
+  stripnum <- getVarInEnv("stripnum")
491
+  
492
+  if(useTarget)
493
+    targetdist = getVarInEnv("reference")
494
+  
495
+  Xqws = Yqws = matrix(0, nrow(XY), ncol(XY))
496
+  colnames(Xqws) = colnames(Yqws) = sampleNames(XY)$X
497
+  rownames(Xqws) = rownames(Yqws) = featureNames(XY)
498
+
499
+  if(verbose){
500
+    message("Quantile normalizing ", ncol(XY), " arrays by ", max(stripnum), " strips.")
501
+    if (getRversion() > '2.7.0') pb <- txtProgressBar(min=0, max=max(stripnum), style=3)
502
+  } 
503
+  for(s in 1:max(stripnum)) {
504
+    if(verbose) {
505
+      if (getRversion() > '2.7.0') setTxtProgressBar(pb, s)
506
+      else cat(".")
507
+    }
508
+    sel = stripnum==s
509
+    subX = exprs(channel(XY, "X"))[sel,]
510
+    subY = exprs(channel(XY, "Y"))[sel,]
511
+    if(useTarget)
512
+      tmp = normalize.quantiles.use.target(as.matrix(cbind(subX, subY)), targetdist[[s]])
513
+    else
514
+      tmp = normalize.quantiles(as.matrix(cbind(subX, subY)))
515
+    Xqws[sel,] = tmp[,1:(ncol(tmp)/2)]
516
+    Yqws[sel,] = tmp[,(ncol(tmp)/2+1):ncol(tmp)]
517
+    rm(subX, subY, tmp, sel)
518
+    gc()
519
+  }
520
+  if(verbose)
521
+    cat("\n")
522
+  XYNorm = new("NChannelSet",
523
+                X=Xqws+16,
524
+                Y=Yqws+16,
525
+                Xnb=exprs(channel(XY, "Xnb")),
526
+                Ynb=exprs(channel(XY, "Ynb")),
527
+                Xse=exprs(channel(XY, "Xse")),
528
+                Yse=exprs(channel(XY, "Yse")),
529
+                zero=exprs(channel(XY, "zero")),
530
+                annotation=annotation(XY),
531
+                phenoData=XY@phenoData)
532
+}
533
+
534
+
535
+preprocessInfinium2 = function(XY, mixtureSampleSize=10^5, fitMixture=TRUE, eps=0.1, verbose=TRUE, seed=1,
536
+                               cdfName, sns, stripNorm=TRUE, useTarget=TRUE) {
537
+  if(stripNorm)
538
+    XY = stripNormalize(XY, useTarget=useTarget, verbose=verbose)
539
+  
540
+## MR: the code below is mostly straight from snprma.R
541
+  if (missing(sns)) sns <- sampleNames(XY)
542
+  if(missing(cdfName))
543
+    cdfName <- annotation(XY)
544
+##  stuffDir <- changeToCrlmmAnnotationName(cdfName)
545
+  pkgname <- getCrlmmAnnotationName(cdfName)
546
+  if(!require(pkgname, character.only=TRUE, quietly=!verbose)){
547
+    suggCall <- paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="")
548
+    msg <- paste("If", pkgname, "is installed on an alternative location, please load it manually by using", suggCall)
549
+    message(strwrap(msg))
550
+    stop("Package ", pkgname, " could not be found.")
551
+    rm(suggCall, msg)
552
+  }
553
+  if(verbose) message("Loading snp annotation and mixture model parameters.")
554
+  loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname)
555
+  loader("mixtureStuff.rda", .crlmmPkgEnv, pkgname)  
556
+#  data(genotypeStuff, mixtureStuff, package=pkgname, envir=.crlmmPkgEnv)
557
+  autosomeIndex <- getVarInEnv("autosomeIndex")
558
+  #  pnsa <- getVarInEnv("pnsa")
559
+  #  pnsb <- getVarInEnv("pnsb")
560
+  #  fid <- getVarInEnv("fid")
561
+  #  reference <- getVarInEnv("reference")
562
+  #  aIndex <- getVarInEnv("aIndex")
563
+  #  bIndex <- getVarInEnv("bIndex")
564
+  SMEDIAN <- getVarInEnv("SMEDIAN")
565
+  theKnots <- getVarInEnv("theKnots")
566
+  gns <- getVarInEnv("gns")
567
+
568
+  nprobes <- nrow(XY)
569
+  narrays <- ncol(XY)
570
+  
571
+  ##We will read each cel file, summarize, and run EM one by one
572
+  ##We will save parameters of EM to use later
573
+  mixtureParams <- matrix(0, 4, narrays)
574
+  SNR <- vector("numeric", narrays)
575
+  SKW <- vector("numeric", narrays)
576
+
577
+  ## This is the sample for the fitting of splines
578
+  ## BC: I like better the idea of the user passing the seed,
579
+  ##     because this might intefere with other analyses
580
+  ##     (like what happened to GCRMA)
581
+  set.seed(seed)
582
+
583
+  idx <- sort(sample(autosomeIndex, mixtureSampleSize))
584
+  
585
+  ##S will hold (A+B)/2 and M will hold A-B
586
+  ##NOTE: We actually dont need to save S. Only for pics etc...
587
+  ##f is the correction. we save to avoid recomputing
588
+  A <- matrix(as.integer(exprs(channel(XY, "X"))), nprobes, narrays) # matrix(as.integer(0), length(pnsa), length(filenames))
589
+  B <- matrix(as.integer(exprs(channel(XY, "Y"))), nprobes, narrays) # matrix(as.integer(0), length(pnsb), length(filenames))
590
+
591
+  # new lines below - useful to keep track of zeroed out SNPs
592
+  zero <- matrix(as.integer(exprs(channel(XY, "zero"))), nprobes, narrays)
593
+    
594
+  colnames(A) <- colnames(B) <- colnames(zero) <- sampleNames(XY)$X
595
+  rownames(A) <- rownames(B) <- rownames(zero) <- featureNames(XY)
596
+  
597
+  if(verbose){
598
+     message("Calibrating ", narrays, " arrays.")
599
+     if (getRversion() > '2.7.0') pb <- txtProgressBar(min=0, max=narrays, style=3)
600
+  }
601
+
602
+  idx2 <- sample(nprobes, 10^5)
603
+  for(i in 1:narrays){
604
+    SKW[i] = mean((A[idx2,i]-mean(A[idx2,i]))^3)/(sd(A[idx2,i])^3)
605
+    if(fitMixture){
606
+      S <- (log2(A[idx, i])+log2(B[idx, i]))/2 - SMEDIAN
607
+      M <- log2(A[idx, i])-log2(B[idx, i])
608
+
609
+      ##we need to test the choice of eps.. it is not the max diff between funcs
610
+      tmp <- fitAffySnpMixture56(S, M, theKnots, eps=eps)
611
+
612
+      mixtureParams[, i] <- tmp[["coef"]]
613
+      SNR[i] <- tmp[["medF1"]]^2/(tmp[["sigma1"]]^2+tmp[["sigma2"]]^2)
614
+    }
615
+    if(verbose) {
616
+      if (getRversion() > '2.7.0') setTxtProgressBar(pb, i)
617
+      else cat(".")
618
+    }
619
+  }
620
+  if (verbose) {
621
+    if (getRversion() > '2.7.0') close(pb)
622
+    else cat("\n")
623
+  }
624
+  if (!fitMixture) SNR <- mixtureParams <- NA
625
+  ## gns comes from preprocStuff.rda
626
+  list(A=A, B=B, zero=zero, sns=sns, gns=gns, SNR=SNR, SKW=SKW, mixtureParams=mixtureParams, cdfName=cdfName)
627
+}
628
+
629
+
630
+## MR: Could add arguments to allow this function to read in idat data as well, although this add a further 7 arguments, which might over-complicate things
631
+crlmmIllumina <- function(RG, XY, stripNorm=TRUE, useTarget=TRUE,
632
+                  row.names=TRUE, col.names=TRUE,
633
+                  probs=c(1/3, 1/3, 1/3), DF=6, SNRMin=5, gender=NULL,
634
+                  seed=1, # save.it=FALSE, load.it=FALSE, intensityFile,
635
+                  mixtureSampleSize=10^5, eps=0.1, verbose=TRUE,
636
+                  cdfName, sns, recallMin=10, recallRegMin=1000,
637
+                  returnParams=FALSE, badSNP=.7) {
638
+  if(!missing(RG)) {
639
+    if(missing(XY))
640
+      XY = RGtoXY(RG, chipType=cdfName)
641
+    else
642
+      stop("Both RG and XY specified - please use one or the other")
643
+  }
644
+#  if ((load.it | save.it) & missing(intensityFile))
645
+#    stop("'intensityFile' is missing, and you chose either load.it or save.it")
646
+  if (missing(sns)) sns <- sampleNames(XY) #basename(filenames)
647
+#  if (!missing(intensityFile))
648
+#    if (load.it & !file.exists(intensityFile)){
649
+#      load.it <- FALSE
650
+#      message("File ", intensityFile, " does not exist.")
651
+#      message("Not loading it, but running SNPRMA from scratch.")
652
+# }
653
+#  if (!load.it){
654
+#    res <- snprma(filenames, fitMixture=TRUE,
655
+#                    mixtureSampleSize=mixtureSampleSize, verbose=verbose,
656
+#                    eps=eps, cdfName=cdfName, sns=sns)
657
+  res = preprocessInfinium2(XY, mixtureSampleSize=mixtureSampleSize, fitMixture=TRUE, verbose=verbose,
658
+                        seed=seed, eps=eps, cdfName=cdfName, sns=sns, stripNorm=stripNorm, useTarget=useTarget)
659
+#    if(save.it){
660
+#      t0 <- proc.time()
661
+#      save(res, file=intensityFile)
662
+#      t0 <- proc.time()-t0
663
+#      if (verbose) message("Used ", t0[3], " seconds to save ", intensityFile, ".")
664
+#    }
665
+#      }else{
666
+#            if (verbose) message("Loading ", intensityFile, ".")
667
+#                obj <- load(intensityFile)
668
+#                if (verbose) message("Done.")
669
+#                if (obj != "res")
670
+#                        stop("Object in ", intensityFile, " seems to be invalid.")
671
+#          }
672
+  if(row.names) row.names=res$gns else row.names=NULL
673
+  if(col.names) col.names=res$sns else col.names=NULL
674
+
675
+  res2 <- crlmmGT(res[["A"]], res[["B"]], res[["SNR"]],
676
+                  res[["mixtureParams"]], res[["cdfName"]],
677
+                  gender=gender, row.names=row.names,
678
+                  col.names=col.names, recallMin=recallMin,
679
+                  recallRegMin=1000, SNRMin=SNRMin,
680
+                  returnParams=returnParams, badSNP=badSNP,
681
+                  verbose=verbose)
682
+  
683
+  res2[["A"]] <- res[["A"]]  # added for copy number analysis
684
+  res2[["B"]] <- res[["B"]]  # added for copy number analysis
685
+  res2[["SNR"]] <- res[["SNR"]]
686
+  res2[["SKW"]] <- res[["SKW"]]
687
+  res2[["zero"]] <- res[["zero"]]
688
+  rm(res)
689
+  gc()
690
+  # MR: FIXME - for consistency with crlmm, need to save results in a 'SnpSet' object
691
+  return(res2)
692
+}
0 693
new file mode 100644
... ...
@@ -0,0 +1,77 @@
1
+\name{crlmmIllumina}
2
+\alias{crlmmIllumina}
3
+\title{Genotype Illumina Infinium II BeadChip data with CRLMM}
4
+\description{
5
+  This implementation of the CRLMM is especially designed for
6
+  data from Illumina Infinium II BeadChips.
7
+}
8
+\usage{
9
+
10
+crlmmIllumina(RG, XY, stripNorm=TRUE, useTarget=TRUE,
11
+      row.names=TRUE, col.names=TRUE,
12
+      probs=c(1/3, 1/3, 1/3), DF=6, SNRMin=5,
13
+      gender=NULL, seed=1, mixtureSampleSize=10^5,
14
+      eps=0.1, verbose=TRUE, cdfName, sns, recallMin=10,
15
+      recallRegMin=1000, returnParams=FALSE, badSNP=0.7)
16
+}
17
+
18
+\arguments{
19
+  \item{RG}{\code{NChannelSet} containing R and G bead intensities}
20
+  \item{XY}{\code{NChannelSet} containing X and Y bead intensities}
21
+  \item{stripNorm}{'logical'.  Should the data be strip-level normalized?}
22
+  \item{useTarget}{'logical' (only used when \code{stripNorm=TRUE}).
23
+    Should the reference HapMap intensities be used in strip-level normalization?}
24
+  \item{row.names}{'logical'. Use rownames - SNP names?}
25
+  \item{col.names}{'logical'. Use colnames - Sample names?}
26
+  \item{probs}{'numeric' vector with priors for AA, AB and BB.}
27
+  \item{DF}{'integer' with number of degrees of freedom to use with t-distribution.}
28
+  \item{SNRMin}{'numeric' scalar defining the minimum SNR used to filter
29
+  out samples.}
30
+  \item{gender}{'integer' vector, with same length as 'filenames',
31
+    defining sex. (1 - male; 2 - female)}
32
+  \item{seed}{'integer' scalar for random number generator (used to
33
+    sample \code{mixtureSampleSize} SNPs for mixture model.}
34
+  \item{mixtureSampleSize}{'integer'. The number of SNP's to be used
35
+    when fitting the mixture model.}
36
+  \item{eps}{Minimum change for mixture model.}
37
+  \item{verbose}{'logical'.}
38
+  \item{cdfName}{'character' defining the chip annotation (manifest) to use
39
+    ('human370v1c', human550v3b', 'human650v3a', 'human1mv1c',
40
+    'human370quadv3c', 'human610quadv1b', 'human660quadv1a' 'human1mduov3b')}
41
+  \item{sns}{'character' vector with sample names to be used.}
42
+  \item{recallMin}{'integer'. Minimum number of samples for recalibration.}
43
+  \item{recallRegMin}{'integer'. Minimum number of SNP's for regression.}
44
+  \item{returnParams}{'logical'. Return recalibrated parameters.}
45
+  \item{badSNP}{'numeric'. Threshold to flag as bad SNP (affects batchQC)}
46
+}
47
+\value{
48
+  \item{A}{normalized A allele intensity}
49
+  \item{B}{normalized B allele intensity}
50
+  \item{calls}{Genotype calls (1 - AA, 2 - AB, 3 - BB)}
51
+  \item{confs}{Confidence scores 'round(-1000*log2(1-p))'}
52
+  \item{SNPQC}{SNP Quality Scores}
53
+  \item{batchQC}{Batch Quality Score}
54
+  \item{params}{Recalibrated parameters}
55
+}
56
+
57
+\details{
58
+  Note: The user should specify either the \code{RG} or \code{XY}
59
+  intensities, not both.
60
+}
61
+
62
+\references{
63
+  Carvalho B, Bengtsson H, Speed TP, Irizarry RA. Exploration,
64
+  normalization, and genotype calls of high-density oligonucleotide SNP
65
+  array data. Biostatistics. 2007 Apr;8(2):485-99. Epub 2006 Dec
66
+  22. PMID: 17189563.
67
+
68
+  Carvalho B, Louis TA, Irizarry RA. Describing Uncertainty in
69
+  Genome-wide Genotype Calling. (in prep)
70
+}
71
+
72
+\author{Matt Ritchie}
73
+
74
+\examples{
75
+## crlmmOut = crlmmIllumina(RG)
76
+}
77
+\keyword{classif}
0 78
new file mode 100755
... ...
@@ -0,0 +1,71 @@
1
+\name{readIdatFiles}
2
+\alias{readIdatFiles}
3
+
4
+\title{Reads Idat Files from Infinium II Illumina BeadChips}
5
+
6
+\description{
7
+  Reads intensity information for each bead type from
8
+  .idat files of Infinium II genotyping BeadChips}
9
+
10
+\usage{
11
+readIdatFiles(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".",
12
+              arrayInfoColNames=list(barcode="SentrixBarcode_A",
13
+                                     position="SentrixPosition_A"),
14
+              highDensity=FALSE, sep="_",
15
+              fileExt=list(green="Grn.idat", red="Red.idat"))
16
+}
17
+
18
+\arguments{
19
+  \item{sampleSheet}{\code{data.frame} containing Illumina sample sheet
20
+    information (for required columns, refer to BeadStudio Genotyping
21
+    guide - Appendix A).}
22
+  \item{arrayNames}{character vector containing names of arrays to be
23
+    read in.  If \code{NULL}, all arrays that can be found in the
24
+    specified working directory will be read in.}
25
+  \item{ids}{vector containing ids of probes to be read in.  If
26
+    \code{NULL} all probes found on the first array are read in.}
27
+  \item{path}{character string specifying the location of files to be
28
+    read by the function}
29
+  \item{arrayInfoColNames}{(used when \code{sampleSheet} is specified)
30
+    list containing elements 'barcode' which indicates column names in
31
+    the \code{sampleSheet} which contains the arrayNumber/barcode number
32
+    and 'position' which indicates the strip number.  In older style
33
+    sample sheets, this information is combined (usually in a column
34
+    named 'SentrixPosition') and this should be specified as
35
+    \code{list(barcode=NULL, position="SentrixPosition")}}
36
+  \item{highDensity}{logical (used when \code{sampleSheet} is
37
+    specified). If \code{TRUE}, array extensions '_A', '_B' in
38
+    sampleSheet are replaced with 'R01C01', 'R01C02' etc.}
39
+  \item{sep}{character string specifying separator used in .idat file
40
+    names.}
41
+  \item{fileExt}{list containing elements 'Green' and 'Red' which
42
+    specify the .idat file extension for the Cy3 and Cy5 channels.}
43
+}
44
+
45
+\details{
46
+The summarised Cy3 (G) and Cy5 (R) intensity, number of beads that
47
+were used in each channel and standard errors (all on the orginal scale)
48
+are read in from the .idat files.
49
+
50
+Where available, a \code{sampleSheet} data.frame, in the same format
51
+as used by BeadStudio (columns 'Sample_ID', 'SentrixBarcode_A' and
52
+'SentrixPosition_A' are required) which keeps track of sample
53
+information can be specified.
54
+
55
+Thanks to Keith Baggerly who provided the code to read in the binary .idat files.
56
+}
57
+
58
+\value{
59
+  NChannelSet with intensity data (\code{R}, \code{G}), number of beads
60
+  (\code{Rnb}, \code{Gnb}) and standard errors (\code{Rse}, \code{Gse})
61
+  for each bead type.
62
+}
63
+
64
+\author{Matt Ritchie}
65
+
66
+\examples{
67
+
68
+#RG = readIdatFiles()
69
+
70
+}
71
+\keyword{IO}