Browse code

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@45780 bc3139a8-67e5-0310-9ffc-ced21a209358

unknown authored on 07/04/2010 06:26:50
Showing3 changed files

... ...
@@ -496,3 +496,6 @@ then readIDAT() should work. Thanks to Pierre Cherel who reported this error.
496 496
    with crlmm2
497 497
 ** added crlmmCopynumber2 for parallel support with copy number
498 498
    est. (needs more checking)
499
+
500
+2010-04-07 M. Ritchie committed version 1.5.45
501
+** modified the storage of RG, XY and genotype call data to use the ff package and initializeBigMatrix() function from oligoClasses package.  The genotype call data now includes non-polymorphic probes (which have NA calls).  Functions which use ff storage are named readIdatFiles2(), RGtoXY2(), preprocessInfinium2v2(), crlmmIllumina2() and crlmmIlluminaV2().
... ...
@@ -1,8 +1,8 @@
1 1
 Package: crlmm
2 2
 Type: Package
3 3
 Title: Genotype Calling (CRLMM) and Copy Number Analysis tool for Affymetrix SNP 5.0 and 6.0 and Illumina arrays.
4
-Version: 1.5.44
5
-Date: 2010-02-05
4
+Version: 1.5.45
5
+Date: 2010-02-07
6 6
 Author: Rafael A Irizarry, Benilton S Carvalho <carvalho@bclab.org>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.edu.au>
7 7
 Maintainer: Benilton S Carvalho <carvalho@bclab.org>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.EDU.AU>
8 8
 Description: Faster implementation of CRLMM specific to SNP 5.0 and 6.0 arrays, as well as a copy number tool specific to 5.0, 6.0, and Illumina platforms
... ...
@@ -177,6 +177,187 @@ readIdatFiles <- function(sampleSheet=NULL,
177 177
 }
178 178
 
179 179
 
180
+readIdatFiles2 <- function(sampleSheet=NULL,
181
+			  arrayNames=NULL,
182
+			  ids=NULL,
183
+			  path=".",
184
+			  arrayInfoColNames=list(barcode="SentrixBarcode_A", position="SentrixPosition_A"),
185
+			  highDensity=FALSE,
186
+			  sep="_",
187
+			  fileExt=list(green="Grn.idat", red="Red.idat"),
188
+			  saveDate=FALSE) {
189
+#       if(!is.null(arrayNames)) {
190
+#	       arrayNames = gsub(paste(sep, fileExt$green, sep=""), "", dir(pattern=fileExt$green, path=path))
191
+#	       if(!is.null(sampleSheet)) {
192
+#		       sampleSheet=NULL
193
+#		       cat("Could not find required info in \'sampleSheet\' - ignoring.  Check \'sampleSheet\' and/or \'arrayInfoColNames\'\n")
194
+#	       }
195
+#	       pd = new("AnnotatedDataFrame", data = data.frame(Sample_ID=arrayNames))
196
+#       }
197
+       if(!is.null(arrayNames)) {
198
+               pd = new("AnnotatedDataFrame", data = data.frame(Sample_ID=arrayNames))
199
+       }
200
+       if(!is.null(sampleSheet)) { # get array info from Illumina's sample sheet
201
+	       if(is.null(arrayNames)){
202
+		       ##arrayNames=NULL
203
+		       if(!is.null(arrayInfoColNames$barcode) && (arrayInfoColNames$barcode %in% colnames(sampleSheet))) {
204
+			       barcode = sampleSheet[,arrayInfoColNames$barcode]
205
+			       arrayNames=barcode
206
+		       }
207
+		       if(!is.null(arrayInfoColNames$position) && (arrayInfoColNames$position %in% colnames(sampleSheet))) {  
208
+			       position = sampleSheet[,arrayInfoColNames$position]
209
+			       if(is.null(arrayNames))
210
+				       arrayNames=position
211
+			       else
212
+				       arrayNames = paste(arrayNames, position, sep=sep)
213
+			       if(highDensity) {
214
+				       hdExt = list(A="R01C01", B="R01C02", C="R02C01", D="R02C02")
215
+				       for(i in names(hdExt))
216
+					       arrayNames = sub(paste(sep, i, sep=""), paste(sep, hdExt[[i]], sep=""), arrayNames)
217
+			       }
218
+		       }
219
+	       }
220
+	       pd = new("AnnotatedDataFrame", data = sampleSheet)
221
+	       sampleNames(pd) <- basename(arrayNames)               
222
+       }
223
+       if(is.null(arrayNames)) {
224
+               arrayNames = gsub(paste(sep, fileExt$green, sep=""), "", dir(pattern=fileExt$green, path=path))
225
+               if(!is.null(sampleSheet)) {
226
+                      sampleSheet=NULL
227
+                      cat("Could not find required info in \'sampleSheet\' - ignoring.  Check \'sampleSheet\' and/or \'arrayInfoColNames\'\n")
228
+               }
229
+               pd = new("AnnotatedDataFrame", data = data.frame(Sample_ID=arrayNames))
230
+       }
231
+       
232
+       narrays = length(arrayNames)
233
+       grnfiles = paste(arrayNames, fileExt$green, sep=sep)
234
+       redfiles = paste(arrayNames, fileExt$red, sep=sep)
235
+       if(length(grnfiles)==0 || length(redfiles)==0)
236
+	       stop("Cannot find .idat files")
237
+       if(length(grnfiles)!=length(redfiles))
238
+	       stop("Cannot find matching .idat files")
239
+       if(path[1] != "."){
240
+	       grnidats = file.path(path, grnfiles)
241
+	       redidats = file.path(path, redfiles)
242
+       }  else {
243
+	       message("path arg not set.  Assuming files are in local directory")
244
+	       grnidats <- grnfiles
245
+	       redidats <- redfiles
246
+       }
247
+       if(!all(file.exists(grnidats))) stop("Missing some of the *Grn.idat files")
248
+       if(!all(file.exists(redidats))) stop("Missing some of the *Red.idat files")       
249
+##       if(!all(c(redfiles,grnfiles) %in% dir(path=path))){
250
+##	       stop("Missing .idat files: red\n", paste(redfiles[!(redfiles %in% dir(path=path))], sep=" "), "\n green\n",
251
+##		    paste(grnfiles[!(grnfiles %in% dir(path=path))], sep=" "))
252
+##       }
253
+       headerInfo = list(nProbes = rep(NA, narrays),
254
+                         Barcode = rep(NA, narrays),
255
+                         ChipType = rep(NA, narrays),
256
+                         Manifest = rep(NA, narrays), # not sure about this one - sometimes blank
257
+                         Position = rep(NA, narrays)) # this may also vary a bit
258
+       dates = list(decode=rep(NA, narrays),
259
+                    scan=rep(NA, narrays))
260
+       ## read in the data
261
+       for(i in seq(along=arrayNames)) {
262
+	       cat("reading", arrayNames[i], "\t")
263
+	       idsG = idsR = G = R = NULL
264
+	       cat(paste(sep, fileExt$green, sep=""), "\t")
265
+	       G = readIDAT(grnidats[i])
266
+	       idsG = rownames(G$Quants)
267
+	       headerInfo$nProbes[i] = G$nSNPsRead
268
+	       headerInfo$Barcode[i] = G$Barcode
269
+	       headerInfo$ChipType[i] = G$ChipType
270
+	       headerInfo$Manifest[i] = G$Unknown$MostlyNull
271
+	       headerInfo$Position[i] = G$Unknowns$MostlyA
272
+               if(headerInfo$nProbes[i]>(headerInfo$nProbes[1]+10000) || headerInfo$nProbes[i]<(headerInfo$nProbes[1]-10000)) {
273
+                       ## headerInfo$ChipType[i]!=headerInfo$ChipType[1] || headerInfo$Manifest[i]!=headerInfo$Manifest[1]) {
274
+		       ## || headerInfo$nProbes[i]!=headerInfo$nProbes[1] ## removed this condition as some arrays used the same manifest
275
+		       ## but differed by a few SNPs for some reason - most of the chip was the same though
276
+		       ##           stop("Chips are not of all of the same type - please check your data")
277
+		       warning("Chips are not of the same type.  Skipping ", basename(grnidats[i]), " and ", basename(redidats[i]))
278
+		       next()
279
+	       }
280
+	       dates$decode[i] = G$RunInfo[1, 1]
281
+	       dates$scan[i] = G$RunInfo[2, 1]
282
+	       if(i==1) {
283
+		       if(is.null(ids) && !is.null(G)){
284
+			       ids = idsG
285
+		       } else stop("Could not find probe IDs")
286
+		       nprobes = length(ids)
287
+		       narrays = length(arrayNames)
288
+#		        tmpmat = matrix(NA, nprobes, narrays)
289
+#		        rownames(tmpmat) = ids
290
+#		       fD <- new("AnnotatedDataFrame", data=data.frame(row.names=ids))##, varMetadata=data.frame(labelDescript
291
+		       RG <- new("NChannelSet",
292
+		                 R=initializeBigMatrix(name="R", nr=nprobes, nc=narrays, vmode="integer"),
293
+		                 G=initializeBigMatrix(name="G", nr=nprobes, nc=narrays, vmode="integer"),
294
+		                 zero=initializeBigMatrix(name="zero", nr=nprobes, nc=narrays, vmode="integer"),
295
+#		                 featureData=fD,
296
+#		                 annotation=cdfName)
297
+#				 R=tmpmat,
298
+#				 G=tmpmat,
299
+#				 zero=tmpmat,
300
+#				 Rnb=tmpmat,
301
+#				 Gnb=tmpmat,
302
+#				 Rse=tmpmat,
303
+#				 Gse=tmpmat,
304
+				 annotation=headerInfo$Manifest[1],
305
+				 phenoData=pd,
306
+				 storage.mode="environment")
307
+                       featureNames(RG) = ids
308
+#		       rm(tmpmat)
309
+		       if(!is.null(sampleSheet) && !is.null(sampleSheet$Sample_ID)){
310
+		            sampleNames(RG) = sampleSheet$Sample_ID
311
+		       } else  sampleNames(RG) = arrayNames
312
+		       gc()
313
+	       }
314
+	       if(length(ids)==length(idsG)) {
315
+		       if(sum(ids==idsG)==nprobes) {
316
+			       RG@assayData$G[,i] = G$Quants[, "Mean"]
317
+			       zeroG = G$Quants[, "NBeads"]==0
318
+#			       RG@assayData$Gnb[,i] = G$Quants[, "NBeads"]
319
+#			       RG@assayData$Gse[,i] = G$Quants[, "SD"]
320
+		       }
321
+	       } else {
322
+		       indG = match(ids, idsG)
323
+		       RG@assayData$G[,i] = G$Quants[indG, "Mean"]
324
+		       zeroG = G$Quants[indG, "NBeads"]==0
325
+#		       RG@assayData$Gnb[,i] = G$Quants[indG, "NBeads"]
326
+#		       RG@assayData$Gse[,i] = G$Quants[indG, "SD"]
327
+	       }
328
+	       rm(G)
329
+	       gc()
330
+         
331
+	       cat(paste(sep, fileExt$red, sep=""), "\n")
332
+	       R = readIDAT(redidats[i])
333
+	       idsR = rownames(R$Quants)
334
+         
335
+	       if(length(ids)==length(idsG)) {   
336
+		       if(sum(ids==idsR)==nprobes) {
337
+			       RG@assayData$R[,i] = R$Quants[ ,"Mean"]
338
+		               zeroR = R$Quants[ ,"NBeads"]==0
339
+#			       RG@assayData$Rnb[,i] = R$Quants[ ,"NBeads"]
340
+#			       RG@assayData$Rse[,i] = R$Quants[ ,"SD"]
341
+		       }
342
+	       } else {
343
+		       indR = match(ids, idsR)
344
+		       RG@assayData$R[,i] = R$Quants[indR, "Mean"]
345
+		       zeroR = R$Quants[indR, "NBeads"]==0
346
+#		       RG@assayData$Rnb[,i] = R$Quants[indR, "NBeads"]
347
+#		       RG@assayData$Rse[,i] = R$Quants[indR, "SD"]
348
+	       }
349
+	       RG@assayData$zero[,i] = zeroG | zeroR
350
+	       rm(R, zeroG, zeroR)
351
+	       gc()
352
+       }
353
+       if(saveDate) {
354
+	       protocolData(RG)[["ScanDate"]] = dates$scan
355
+       }
356
+       storageMode(RG) = "lockedEnvironment"
357
+       RG
358
+}
359
+
360
+
180 361
 ## the readIDAT() and readBPM() functions below were provided by Keith Baggerly, 27/8/2008
181 362
 readIDAT <- function(idatFile){
182 363
 
... ...
@@ -433,9 +614,6 @@ readBPM <- function(bpmFile){
433 614
   
434 615
 }
435 616
 
436
-
437
-
438
-
439 617
 RGtoXY = function(RG, chipType, verbose=TRUE) {
440 618
   chipList = c("human1mv1c",             # 1M
441 619
                "human370v1c",            # 370CNV
... ...
@@ -541,6 +719,115 @@ RGtoXY = function(RG, chipType, verbose=TRUE) {
541 719
   XY
542 720
 }
543 721
 
722
+
723
+RGtoXY2 = function(RG, chipType, verbose=TRUE) {
724
+  chipList = c("human1mv1c",             # 1M
725
+               "human370v1c",            # 370CNV
726
+               "human650v3a",            # 650Y
727
+               "human610quadv1b",        # 610 quad
728
+               "human660quadv1a",        # 660 quad
729
+               "human370quadv3c",        # 370CNV quad
730
+               "human550v3b",            # 550K
731
+               "human1mduov3b",          # 1M Duo
732
+               "humanomni1quadv1b")      # Omni1 quad
733
+  if(missing(chipType)){
734
+	  chipType = match.arg(annotation(RG), chipList)
735
+  } else chipType = match.arg(chipType, chipList)
736
+  
737
+  pkgname <- getCrlmmAnnotationName(chipType)
738
+  if(!require(pkgname, character.only=TRUE, quietly=!verbose)){
739
+     suggCall <- paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="")
740
+     msg <- paste("If", pkgname, "is installed on an alternative location, please load it manually by using", suggCall)
741
+     message(strwrap(msg))
742
+     stop("Package ", pkgname, " could not be found.")
743
+     rm(suggCall, msg)
744
+  }
745
+  if(verbose) message("Loading chip annotation information.")
746
+    loader("address.rda", .crlmmPkgEnv, pkgname)
747
+#    data(annotation, package=pkgname, envir=.crlmmPkgEnv)
748
+  aids <- getVarInEnv("addressA") # comes from AddressA_ID or Address column in manifest
749
+  bids <- getVarInEnv("addressB") # comes from AddressB_ID or Address2 column in manifest
750
+  ids <- names(aids)
751
+  snpbase <- getVarInEnv("base")
752
+  
753
+  nsnps = length(aids)
754
+  narrays = ncol(RG)
755
+  
756
+#  aidcol = match("AddressA_ID", colnames(annot))
757
+#  if(is.na(aidcol))
758
+#    aidcol = match("Address", colnames(annot))
759
+#  bidcol = match("AddressB_ID", colnames(annot))
760
+#  if(is.na(bidcol)) 
761
+#    bidcol = match("Address2", colnames(annot))
762
+#  aids = annot[, aidcol]
763
+#  bids = annot[, bidcol]
764
+#  snpids = annot[,"Name"]
765
+#  snpbase = annot[,"SNP"] 
766
+  infI = !is.na(bids) & bids!=0  
767
+  aord = match(aids, featureNames(RG)) # NAs are possible here
768
+  bord = match(bids, featureNames(RG)) # and here
769
+#  argrg = aids[rrgg]
770
+#  brgrg = bids[rrgg]
771
+
772
+#  fD <- new("AnnotatedDataFrame", data=data.frame(row.names=ids))##, varMetadata=data.frame(labelDescript
773
+  XY <- new("NChannelSet",
774
+	     X=initializeBigMatrix(name="X", nr=nsnps, nc=narrays, vmode="integer"),
775
+	     Y=initializeBigMatrix(name="Y", nr=nsnps, nc=narrays, vmode="integer"),
776
+	     zero=initializeBigMatrix(name="zero", nr=nsnps, nc=narrays, vmode="integer"),
777
+	     annotation=chipType, phenoData=RG@phenoData, # featureData=fD
778
+	     protocolData=RG@protocolData, storage.mode="environment")
779
+  featureNames(XY) = ids # featureNames(RG)
780
+  gc()
781
+  
782
+  # First sort out Infinium II SNPs, X -> R (allele A)  and Y -> G (allele B) from the same probe
783
+  XY@assayData$X[!is.na(aord),] = exprs(channel(RG, "R"))[aord[!is.na(aord)],] # mostly red
784
+  XY@assayData$Y[!is.na(aord),] = exprs(channel(RG, "G"))[aord[!is.na(aord)],] # mostly green
785
+  XY@assayData$zero[!is.na(aord),] = exprs(channel(RG, "zero"))[aord[!is.na(aord)],] # mostly green
786
+#  XY@assayData$Xnb[!is.na(aord),] = exprs(channel(RG, "Rnb"))[aord[!is.na(aord)],]
787
+#  XY@assayData$Ynb[!is.na(aord),] = exprs(channel(RG, "Gnb"))[aord[!is.na(aord)],]
788
+#  XY@assayData$Xse[!is.na(aord),] = exprs(channel(RG, "Rse"))[aord[!is.na(aord)],]
789
+#  XY@assayData$Yse[!is.na(aord),] = exprs(channel(RG, "Gse"))[aord[!is.na(aord)],]
790
+  gc()
791
+  
792
+  ## Warning - not 100% sure that the code below is correct - could be more complicated than this
793
+  
794
+  # Next Infinium I where X -> R from allele A probe and Y -> R from allele B probe
795
+#  infIRR = infI & (snpbase=="[A/T]" | snpbase=="[T/A]" | snpbase=="[a/t]" | snpbase=="[t/a]")
796
+  
797
+#  X[infIRR,] = exprs(channel(RG, "R"))[aord[infIRR],] # mostly red
798
+#  Y[infIRR,] = exprs(channel(RG, "R"))[bord[infIRR],] # mostly green
799
+#  Xnb[infIRR,] = exprs(channel(RG, "Rnb"))[aord[infIRR],]
800
+#  Ynb[infIRR,] = exprs(channel(RG, "Rnb"))[bord[infIRR],]
801
+#  Xse[infIRR,] = exprs(channel(RG, "Rse"))[aord[infIRR],]
802
+#  Yse[infIRR,] = exprs(channel(RG, "Rse"))[bord[infIRR],]
803
+  
804
+  # Finally Infinium I where X -> G from allele A probe and Y -> G from allele B probe
805
+#  infIGG = infI & (snpbase=="[C/G]" | snpbase=="[G/C]" | snpbase=="[g/c]" | snpbase=="[c/g]")
806
+
807
+#  X[infIGG,] = exprs(channel(RG, "G"))[aord[infIGG],] # mostly red
808
+#  Y[infIGG,] = exprs(channel(RG, "G"))[bord[infIGG],] # mostly green
809
+#  Xnb[infIGG,] = exprs(channel(RG, "Gnb"))[aord[infIGG],]
810
+#  Ynb[infIGG,] = exprs(channel(RG, "Gnb"))[bord[infIGG],]
811
+#  Xse[infIGG,] = exprs(channel(RG, "Gse"))[aord[infIGG],]
812
+#  Yse[infIGG,] = exprs(channel(RG, "Gse"))[bord[infIGG],]
813
+    
814
+  #  For now zero out Infinium I probes
815
+  XY@assayData$X[infI,] = 0
816
+  XY@assayData$Y[infI,] = 0
817
+  XY@assayData$zero[infI,] = 0  
818
+#  XY@assayData$Xnb[infI,] = 0
819
+#  XY@assayData$Ynb[infI,] = 0
820
+#  XY@assayData$Xse[infI,] = 0
821
+#  XY@assayData$Yse[infI,] = 0
822
+
823
+#  XY@assayData$zero[XY@assayData$X==0 | XY@assayData$Y==0] = 1
824
+  gc()
825
+
826
+#  storageMode(XY) = "lockedEnvironment"
827
+  XY
828
+}
829
+
830
+
544 831
 stripNormalize = function(XY, useTarget=TRUE, verbose=TRUE) {
545 832
   pkgname <- getCrlmmAnnotationName(annotation(XY))
546 833
   if(!require(pkgname, character.only=TRUE, quietly=!verbose)){
... ...
@@ -739,8 +1026,157 @@ preprocessInfinium2 <- function(XY, mixtureSampleSize=10^5,
739 1026
 }
740 1027
 
741 1028
 
742
-## MR: Could add arguments to allow this function to read in idat data as well,
743
-## although this would add a further 7 arguments, which might over-complicate things
1029
+preprocessInfinium2v2 <- function(XY, mixtureSampleSize=10^5,
1030
+				fitMixture=TRUE,
1031
+				eps=0.1,
1032
+				verbose=TRUE,
1033
+				seed=1,
1034
+				cdfName,
1035
+				sns,
1036
+				stripNorm=TRUE,
1037
+				useTarget=TRUE) { #,
1038
+#				save.it=FALSE,
1039
+#				snpFile,
1040
+#				cnFile) {
1041
+  if(stripNorm)
1042
+    XY = stripNormalize(XY, useTarget=useTarget, verbose=verbose)
1043
+
1044
+## MR: the code below is mostly straight from snprma.R
1045
+  if (missing(sns)) sns <- sampleNames(XY) #$X
1046
+  if(missing(cdfName))
1047
+    cdfName <- annotation(XY)
1048
+##  stuffDir <- changeToCrlmmAnnotationName(cdfName)
1049
+  pkgname <- getCrlmmAnnotationName(cdfName)
1050
+  if(!require(pkgname, character.only=TRUE, quietly=!verbose)){
1051
+    suggCall <- paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="")
1052
+    msg <- paste("If", pkgname, "is installed on an alternative location, please load it manually by using", suggCall)
1053
+    message(strwrap(msg))
1054
+    stop("Package ", pkgname, " could not be found.")
1055
+    rm(suggCall, msg)
1056
+  }
1057
+  if(verbose) message("Loading snp annotation and mixture model parameters.")
1058
+  loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname)
1059
+  loader("mixtureStuff.rda", .crlmmPkgEnv, pkgname)
1060
+  loader("snpProbesFid.rda", .crlmmPkgEnv, pkgname)
1061
+  loader("npProbesFid.rda", .crlmmPkgEnv, pkgname)
1062
+#  data(genotypeStuff, mixtureStuff, package=pkgname, envir=.crlmmPkgEnv)
1063
+  autosomeIndex <- getVarInEnv("autosomeIndex")
1064
+  #  pnsa <- getVarInEnv("pnsa")
1065
+  #  pnsb <- getVarInEnv("pnsb")
1066
+  #  fid <- getVarInEnv("fid")
1067
+  #  reference <- getVarInEnv("reference")
1068
+  #  aIndex <- getVarInEnv("aIndex")
1069
+  #  bIndex <- getVarInEnv("bIndex")
1070
+  SMEDIAN <- getVarInEnv("SMEDIAN")
1071
+  theKnots <- getVarInEnv("theKnots")
1072
+  gns <- featureNames(XY) # getVarInEnv("gns") # needs to include np probes - gns is only for snps
1073
+  
1074
+  # separate out copy number probes
1075
+  npIndex = getVarInEnv("npProbesFid")
1076
+#  nprobes = length(npIndex)
1077
+  narrays = ncol(XY)
1078
+#  A <- matrix(as.integer(exprs(channel(XY, "X"))[npIndex,]), nprobes, narrays)
1079
+#  B <- matrix(as.integer(exprs(channel(XY, "Y"))[npIndex,]), nprobes, narrays)
1080
+
1081
+  # new lines below - useful to keep track of zeroed out probes
1082
+#  zero <- matrix(as.integer(exprs(channel(XY, "zero"))[npIndex,]), nprobes, narrays) 
1083
+
1084
+#  colnames(A) <- colnames(B) <- colnames(zero) <- sns
1085
+#  rownames(A) <- rownames(B) <- rownames(zero) <- names(npIndex)
1086
+  
1087
+#  cnAB = list(A=A, B=B, zero=zero, sns=sns, gns=names(npIndex), cdfName=cdfName)
1088
+#  if(save.it & !missing(cnFile)) {
1089
+#    t0 <- proc.time() 
1090
+#    save(cnAB, file=cnFile) 
1091
+#    t0 <- proc.time()-t0
1092
+#    if(verbose) message("Used ", round(t0[3],1), " seconds to save ", cnFile, ".")
1093
+#  }
1094
+#  rm(cnAB, B, zero)
1095
+  
1096
+  # next process snp probes
1097
+  snpIndex = getVarInEnv("snpProbesFid")
1098
+  nprobes <- length(snpIndex)
1099
+  
1100
+  ##We will read each cel file, summarize, and run EM one by one
1101
+  ##We will save parameters of EM to use later
1102
+#  mixtureParams <- initializeBigMatrix("crlmmMixt-", 4, length(narrays), "double")
1103
+#  SNR <- initializeBigVector("crlmmSNR-", narrays, "double")
1104
+#  SKW <- initializeBigVector("crlmmSKW-", narrays, "double") 
1105
+  mixtureParams <- matrix(0, 4, narrays)
1106
+  SNR <- vector("numeric", narrays)
1107
+  SKW <- vector("numeric", narrays)
1108
+
1109
+  ## This is the sample for the fitting of splines
1110
+  ## BC: I like better the idea of the user passing the seed,
1111
+  ##     because this might intefere with other analyses
1112
+  ##     (like what happened to GCRMA)
1113
+  set.seed(seed)
1114
+  idx <- sort(sample(autosomeIndex, mixtureSampleSize))
1115
+  
1116
+  ##S will hold (A+B)/2 and M will hold A-B
1117
+  ##NOTE: We actually dont need to save S. Only for pics etc...
1118
+  ##f is the correction. we save to avoid recomputing
1119
+  A <- exprs(channel(XY, "X"))[,] # ), nrow(XY), narrays) # [snpIndex,]), nprobes, narrays) # matrix(as.integer(0), length(pnsa), length(filenames))
1120
+  B <- exprs(channel(XY, "Y"))[,] # ), nrow(XY), narrays) # [snpIndex,]), nprobes, narrays) # matrix(as.integer(0), length(pnsb), length(filenames))
1121
+
1122
+  # new lines below - useful to keep track of zeroed out SNPs
1123
+  zero <- exprs(channel(XY, "zero"))[,] # )) #[snpIndex,]), nprobes, narrays)
1124
+
1125
+#  if(!is.matrix(A)) {
1126
+#     A = A[,]
1127
+#     B = B[,]
1128
+#     zero = zero[,]
1129
+#  }
1130
+
1131
+  if(!is.integer(A)) {
1132
+     A = matrix(as.integer(A), nrow(A), ncol(A))
1133
+     B = matrix(as.integer(B), nrow(B), ncol(B))
1134
+  }  
1135
+    
1136
+#  colnames(A) <- colnames(B) <- colnames(zero) <- sns
1137
+#  rownames(A) <- rownames(B) <- rownames(zero) <- names(snpIndex) # gns # featureNames(XY)
1138
+  
1139
+  if(verbose){
1140
+     message("Calibrating ", narrays, " arrays.")
1141
+     if (getRversion() > '2.7.0') pb <- txtProgressBar(min=0, max=narrays, style=3)
1142
+  }
1143
+
1144
+  idx2 <- sample(nprobes, 10^5)
1145
+  for(i in 1:narrays){
1146
+    SKW[i] = mean((A[snpIndex,i][idx2]-mean(A[snpIndex,i][idx2]))^3)/(sd(A[snpIndex,i][idx2])^3)
1147
+    if(fitMixture){
1148
+      S <- (log2(A[snpIndex,i][idx])+log2(B[snpIndex,i][idx]))/2 - SMEDIAN
1149
+      M <- log2(A[snpIndex,i][idx])-log2(B[snpIndex,i][idx])
1150
+
1151
+      ##we need to test the choice of eps.. it is not the max diff between funcs
1152
+      tmp <- fitAffySnpMixture56(S, M, theKnots, eps=eps)
1153
+
1154
+      mixtureParams[, i] <- tmp[["coef"]]
1155
+      SNR[i] <- tmp[["medF1"]]^2/(tmp[["sigma1"]]^2+tmp[["sigma2"]]^2)
1156
+    }
1157
+    if(verbose) {
1158
+       if (getRversion() > '2.7.0') setTxtProgressBar(pb, i)
1159
+       else cat(".")
1160
+    }
1161
+  }
1162
+  if (verbose) {
1163
+    if (getRversion() > '2.7.0') close(pb)
1164
+    else cat("\n")
1165
+  }
1166
+  if (!fitMixture) SNR <- mixtureParams <- NA
1167
+  ## gns comes from preprocStuff.rda
1168
+  res = list(A=A, B=B, zero=zero, sns=sns, gns=gns, SNR=SNR, SKW=SKW, mixtureParams=mixtureParams, cdfName=cdfName, snpIndex=snpIndex, npIndex=npIndex)
1169
+
1170
+#  if(save.it & !missing(snpFile)) {
1171
+#    t0 <- proc.time() 
1172
+#    save(res, file=snpFile) 
1173
+#    t0 <- proc.time()-t0
1174
+#    if(verbose) message("Used ", round(t0[3],1), " seconds to save ", snpFile, ".")
1175
+#  }
1176
+  return(res)
1177
+}
1178
+
1179
+
744 1180
 crlmmIllumina <- function(RG, XY, stripNorm=TRUE, useTarget=TRUE,
745 1181
                   row.names=TRUE, col.names=TRUE,
746 1182
                   probs=c(1/3, 1/3, 1/3), DF=6, SNRMin=5, gender=NULL,
... ...
@@ -795,11 +1231,104 @@ crlmmIllumina <- function(RG, XY, stripNorm=TRUE, useTarget=TRUE,
795 1231
   return(list2SnpSet(res2, returnParams=returnParams)) # return(res2)
796 1232
 }
797 1233
 
1234
+
1235
+crlmmIllumina2 <- function(RG, XY, stripNorm=TRUE, useTarget=TRUE,
1236
+                  row.names=TRUE, col.names=TRUE,
1237
+                  probs=c(1/3, 1/3, 1/3), DF=6, SNRMin=5, gender=NULL,
1238
+                  seed=1, # save.it=FALSE, load.it=FALSE, snpFile, cnFile,
1239
+                  mixtureSampleSize=10^5, eps=0.1, verbose=TRUE,
1240
+                  cdfName, sns, recallMin=10, recallRegMin=1000,
1241
+                  returnParams=FALSE, badSNP=.7) {
1242
+#  if (save.it & (missing(snpFile) | missing(cnFile)))
1243
+#    stop("'snpFile' and/or 'cnFile' is missing and you chose to save.it")
1244
+#  if (load.it & missing(snpFile))
1245
+#    stop("'snpFile' is missing and you chose to load.it")
1246
+#  if (!missing(snpFile))
1247
+#    if (load.it & !file.exists(snpFile)){
1248
+#      load.it <- FALSE
1249
+#      message("File ", snpFile, " does not exist.")
1250
+#      stop("Cannot load SNP data.")
1251
+#  }
1252
+#  if (!load.it){
1253
+    if(!missing(RG)) {
1254
+      if(missing(XY))
1255
+        XY = RGtoXY2(RG, chipType=cdfName)
1256
+      else
1257
+        stop("Both RG and XY specified - please use one or the other")
1258
+    }
1259
+    if (missing(sns)) sns <- sampleNames(XY) #$X
1260
+    
1261
+    res = preprocessInfinium2v2(XY, mixtureSampleSize=mixtureSampleSize, fitMixture=TRUE, verbose=verbose,
1262
+                        seed=seed, eps=eps, cdfName=cdfName, sns=sns, stripNorm=stripNorm, useTarget=useTarget) #,
1263
+                      #  save.it=save.it, snpFile=snpFile, cnFile=cnFile)
1264
+
1265
+    fD = featureData(XY)
1266
+    phenD = XY@phenoData
1267
+    protD = XY@protocolData
1268
+    rm(XY)
1269
+    gc()
1270
+    if(verbose) message("Initializing container for alleleA, alleleB, call, callProbability")
1271
+    callSet <- new("SnpSuperSet",
1272
+                    alleleA=initializeBigMatrix(name="A", nr=nrow(res[[1]]), nc=length(sns)),
1273
+                    alleleB=initializeBigMatrix(name="B", nr=nrow(res[[1]]), nc=length(sns)),
1274
+                    call=initializeBigMatrix(name="call", nr=nrow(res[[1]]), nc=length(sns)),
1275
+                    callProbability=initializeBigMatrix(name="callPr", nr=nrow(res[[1]]), nc=length(sns)),
1276
+  	            annotation=cdfName, protocolData=protD, phenoData=phenD, featureData=fD) 
1277
+    sampleNames(callSet) <- sns
1278
+    featureNames(callSet) <- res[["gns"]]
1279
+    pData(callSet)$SKW <- rep(NA, length(sns))
1280
+    pData(callSet)$SNR <- rep(NA, length(sns))
1281
+    pData(callSet)$gender <- rep(NA, length(sns))
1282
+					  
1283
+#  }else{
1284
+#      if(verbose) message("Loading ", snpFile, ".")
1285
+#        obj <- load(snpFile)
1286
+#        if(verbose) message("Done.")
1287
+#        if(!any(obj == "res"))
1288
+#          stop("Object in ", snpFile, " seems to be invalid.")
1289
+#  }
1290
+
1291
+    rm(phenD, protD , fD)
1292
+	
1293
+    snp.index <- res$snpIndex #match(res$gns, featureNames(callSet))                
1294
+    suppressWarnings(A(callSet) <- res[["A"]])
1295
+    suppressWarnings(B(callSet) <- res[["B"]])
1296
+    pData(callSet)$SKW <- res$SKW
1297
+    pData(callSet)$SNR <- res$SNR
1298
+    mixtureParams <- res$mixtureParams
1299
+    rm(res); gc()
1300
+    tmp <- crlmmGT(A=as.matrix(A(callSet)[snp.index,]), # j]),
1301
+                  B=as.matrix(B(callSet)[snp.index,]),  # j]),
1302
+                  SNR=callSet$SNR, # [j],
1303
+                  mixtureParams=mixtureParams,
1304
+                  cdfName=annotation(callSet),
1305
+#                  row.names=featureNames(callSet)[snp.index],
1306
+#                  col.names=sampleNames(callSet), #[j],
1307
+                  probs=probs,
1308
+                  DF=DF,
1309
+                  SNRMin=SNRMin,
1310
+                  recallMin=recallMin,
1311
+                  recallRegMin=recallRegMin,
1312
+                  gender=gender,
1313
+                  verbose=verbose,
1314
+                  returnParams=returnParams,
1315
+                  badSNP=badSNP)
1316
+#    suppressWarnings(snpCall(callSet)[snp.index, j] <- tmp[["calls"]])
1317
+#    suppressWarnings(snpCallProbability(callSet)[snp.index, j] <- tmp[["confs"]])
1318
+#    callSet$gender[j] <- tmp$gender
1319
+    suppressWarnings(snpCall(callSet)[snp.index,] <- tmp[["calls"]])
1320
+    suppressWarnings(snpCallProbability(callSet)[snp.index,] <- tmp[["confs"]])
1321
+    callSet$gender <- tmp$gender
1322
+    rm(tmp); gc()
1323
+    return(callSet)
1324
+}
1325
+
1326
+
798 1327
 ## MR: Below is a more memory efficient version of crlmmIllumina() which 
799 1328
 ## reads in the .idats and genotypes in the one function and removes objects 
800 1329
 ## after they have been used
801 1330
 crlmmIlluminaV2 = function(sampleSheet=NULL,
802
-                          arrayNames=NULL,
1331
+			  arrayNames=NULL,
803 1332
 			  ids=NULL,
804 1333
 			  path=".",
805 1334
 			  arrayInfoColNames=list(barcode="SentrixBarcode_A", position="SentrixPosition_A"),
... ...
@@ -807,53 +1336,129 @@ crlmmIlluminaV2 = function(sampleSheet=NULL,
807 1336
 			  sep="_",
808 1337
 			  fileExt=list(green="Grn.idat", red="Red.idat"),
809 1338
 			  saveDate=FALSE,
810
-              save.rg=FALSE,
811
-              rgFile,
1339
+#                          save.rg=FALSE,
1340
+#                          rgFile,
812 1341
 			  stripNorm=TRUE,
813 1342
 			  useTarget=TRUE,
814
-          	  row.names=TRUE, 
1343
+          	          row.names=TRUE, 
815 1344
 			  col.names=TRUE,
816 1345
 			  probs=c(1/3, 1/3, 1/3), DF=6, SNRMin=5, gender=NULL,
817
-              seed=1, save.ab=FALSE, snpFile, cnFile,
818
-              mixtureSampleSize=10^5, eps=0.1, verbose=TRUE,
819
-              cdfName, sns, recallMin=10, recallRegMin=1000,
820
-              returnParams=FALSE, badSNP=.7) {
1346
+                          seed=1, # save.ab=FALSE, snpFile, cnFile,
1347
+                          mixtureSampleSize=10^5, eps=0.1, verbose=TRUE,
1348
+                          cdfName, sns, recallMin=10, recallRegMin=1000,
1349
+                          returnParams=FALSE, badSNP=.7) {
1350
+
1351
+  if(missing(cdfName)) stop("must specify cdfName")
1352
+  if(!isValidCdfName(cdfName)) stop("cdfName not valid.  see validCdfNames")
1353
+#  if(missing(sns)) sns <- basename(arrayNames)
821 1354
 			  
822
-  if (save.rg & missing(rgFile))
823
-    stop("'rgFile' is missing, and you chose save.rg")
824
-  if (save.ab & (missing(snpFile) | missing(cnFile)))
825
-    stop("'snpFile' or 'cnFile' is missing and you chose save.ab")
826
-				  
827
-  RG = readIdatFiles(sampleSheet=sampleSheet, arrayNames=arrayNames,
1355
+#  if (save.rg & missing(rgFile))
1356
+#    stop("'rgFile' is missing, and you chose save.rg")
1357
+#  if (save.ab & (missing(snpFile) | missing(cnFile)))
1358
+#    stop("'snpFile' or 'cnFile' is missing and you chose save.ab")
1359
+#  batches = NULL
1360
+#  if(!is.null(arrayNames))
1361
+#    batches <- rep(1, length(arrayNames)) # problem here if arrayNames not specified! # splitIndicesByLength(seq(along=arrayNames), ocSamples())
1362
+#  if(!is.null(sampleSheet))
1363
+#    batches <- rep(1, nrow(sampleSheet))
1364
+#  if(is.null(batches))
1365
+#    batches=1
1366
+#  k <- 1
1367
+#  for(j in batches){
1368
+#     if(verbose) message("Batch ", k, " of ", length(batches))
1369
+#     RG = readIdatFiles(sampleSheet=sampleSheet[j,], arrayNames=arrayNames[j],
1370
+#                       ids=ids, path=path, arrayInfoColNames=arrayInfoColNames,
1371
+#                       highDensity=highDensity, sep=sep, fileExt=fileExt, saveDate=saveDate)
1372
+     RG = readIdatFiles2(sampleSheet=sampleSheet, arrayNames=arrayNames,
828 1373
                        ids=ids, path=path, arrayInfoColNames=arrayInfoColNames,
829 1374
                        highDensity=highDensity, sep=sep, fileExt=fileExt, saveDate=saveDate)
830
-  if(save.rg)
831
-	save(RG, file=rgFile)
832 1375
 
833
-  XY = RGtoXY(RG, chipType=cdfName)
834
-  rm(RG)
835
-  gc()
836
-  if (missing(sns)) sns = sampleNames(XY)
837
-    
838
-  res = preprocessInfinium2(XY, mixtureSampleSize=mixtureSampleSize, fitMixture=TRUE, verbose=verbose,
839
-                        seed=seed, eps=eps, cdfName=cdfName, sns=sns, stripNorm=stripNorm, useTarget=useTarget,
840
-                        save.it=save.ab, snpFile=snpFile, cnFile=cnFile)
841
-  rm(XY)
842
-  gc()
843
-  if(row.names) row.names=res$gns else row.names=NULL
844
-  if(col.names) col.names=res$sns else col.names=NULL
1376
+#  if(save.rg)
1377
+#	save(RG, file=rgFile)
845 1378
 
846
-  res2 = crlmmGT(res[["A"]], res[["B"]], res[["SNR"]],
847
-                  res[["mixtureParams"]], res[["cdfName"]],
848
-                  gender=gender, row.names=row.names,
849
-                  col.names=col.names, recallMin=recallMin,
850
-                  recallRegMin=1000, SNRMin=SNRMin,
851
-                  returnParams=returnParams, badSNP=badSNP,
852
-                  verbose=verbose)
853
-  
854
-  res2[["SNR"]] = res[["SNR"]]
855
-  res2[["SKW"]] = res[["SKW"]]
856
-  rm(res)
857
-  gc()
858
-  return(list2SnpSet(res2, returnParams=returnParams))
1379
+    XY = RGtoXY2(RG, chipType=cdfName)
1380
+    rm(RG)
1381
+    gc()
1382
+    if (missing(sns)) { sns = sampleNames(XY) #subsns = sampleNames(XY)
1383
+    } else subsns = sns[j]
1384
+    res = preprocessInfinium2v2(XY, mixtureSampleSize=mixtureSampleSize, fitMixture=TRUE, verbose=verbose,
1385
+                               seed=seed, eps=eps, cdfName=cdfName, sns=sns, stripNorm=stripNorm, useTarget=useTarget) # sns=subsns
1386
+#                          save.it=save.ab, snpFile=snpFile, cnFile=cnFile)
1387
+    fD = featureData(XY)
1388
+    phenD = XY@phenoData
1389
+    protD = XY@protocolData
1390
+    rm(XY)
1391
+    gc()
1392
+#    if(k == 1){
1393
+    if(verbose) message("Initializing container for alleleA, alleleB, call, callProbability")
1394
+    callSet <- new("SnpSuperSet",
1395
+                    alleleA=initializeBigMatrix(name="A", nr=nrow(res[[1]]), nc=length(sns)),
1396
+                    alleleB=initializeBigMatrix(name="B", nr=nrow(res[[1]]), nc=length(sns)),
1397
+                    call=initializeBigMatrix(name="call", nr=nrow(res[[1]]), nc=length(sns)),
1398
+                    callProbability=initializeBigMatrix(name="callPr", nr=nrow(res[[1]]), nc=length(sns)),
1399
+ 		    annotation=cdfName, protocolData=protD, phenoData=phenD, featureData=fD)
1400
+    sampleNames(callSet) <- sns
1401
+#            phenoData(callSet) <- getPhenoData(sampleSheet=sampleSheet,
1402
+#                                    arrayNames=sns,
1403
+#								    arrayInfoColNames=arrayInfoColNames)
1404
+#            pD <- data.frame(matrix(NA, length(sns), 1), row.names=sns)
1405
+#            colnames(pD) <- "ScanDate"
1406
+#            protocolData(callSet) <- pData(protD) # new("AnnotatedDataFrame", data=pD)
1407
+#            pData(protocolData(callSet))[j, ] <- pData(protocolData)
1408
+    featureNames(callSet) <- res[["gns"]]
1409
+    pData(callSet)$SKW <- rep(NA, length(sns))
1410
+    pData(callSet)$SNR <- rep(NA, length(sns))
1411
+    pData(callSet)$gender <- rep(NA, length(sns))			
1412
+#	}
1413
+#	pData(callSet)[j,] <- phenD
1414
+#	pData(protocolData(callSet))[j,] <- protD
1415
+#	pData(callSet) <- phenD
1416
+#	pData(protocolData(callSet)) <- protD
1417
+
1418
+    rm(phenD, protD, fD)
1419
+	
1420
+#    if(k > 1 & nrow(res[[1]]) != nrow(callSet)){
1421
+        ##RS: I don't understand why the IDATS for the
1422
+        ##same platform potentially have different lengths
1423
+#        res[["A"]] <- res[["A"]][res$gns %in% featureNames(callSet), ]
1424
+#        res[["B"]] <- res[["B"]][res$gns %in% featureNames(callSet), ]
1425
+#    }
1426
+
1427
+    snp.index <- res$snpIndex #match(res$gns, featureNames(callSet))                
1428
+#    suppressWarnings(A(callSet)[, j] <- res[["A"]])
1429
+#    suppressWarnings(B(callSet)[, j] <- res[["B"]])
1430
+    suppressWarnings(A(callSet) <- res[["A"]])
1431
+    suppressWarnings(B(callSet) <- res[["B"]])
1432
+#    pData(callSet)$SKW[j] <- res$SKW
1433
+#    pData(callSet)$SNR[j] <- res$SNR
1434
+    pData(callSet)$SKW <- res$SKW
1435
+    pData(callSet)$SNR <- res$SNR
1436
+    mixtureParams <- res$mixtureParams
1437
+    rm(res); gc()
1438
+    tmp <- crlmmGT(A=as.matrix(A(callSet)[snp.index,]), # j]),
1439
+                  B=as.matrix(B(callSet)[snp.index,]),  # j]),
1440
+                  SNR=callSet$SNR, # [j],
1441
+                  mixtureParams=mixtureParams,
1442
+                  cdfName=annotation(callSet),
1443
+                  row.names=featureNames(callSet)[snp.index],
1444
+                  col.names=sampleNames(callSet), #[j],
1445
+                  probs=probs,
1446
+                  DF=DF,
1447
+                  SNRMin=SNRMin,
1448
+                  recallMin=recallMin,
1449
+                  recallRegMin=recallRegMin,
1450
+                  gender=gender,
1451
+                  verbose=verbose,
1452
+                  returnParams=returnParams,
1453
+                  badSNP=badSNP)
1454
+#    suppressWarnings(snpCall(callSet)[snp.index, j] <- tmp[["calls"]])
1455
+#    suppressWarnings(snpCallProbability(callSet)[snp.index, j] <- tmp[["confs"]])
1456
+#    callSet$gender[j] <- tmp$gender
1457
+    suppressWarnings(snpCall(callSet)[snp.index,] <- tmp[["calls"]])
1458
+    suppressWarnings(snpCallProbability(callSet)[snp.index,] <- tmp[["confs"]])
1459
+    callSet$gender <- tmp$gender
1460
+    rm(tmp); gc()
1461
+#    k <- k+1
1462
+#  }
1463
+    return(callSet)
859 1464
 }