Browse code

Added nopackage option for krlmm

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@117087 bc3139a8-67e5-0310-9ffc-ced21a209358

unknown authored on 03/05/2016 23:39:54
Showing16 changed files

... ...
@@ -18,13 +18,13 @@ LinkingTo: preprocessCore (>= 1.17.7)
18 18
 Imports: methods, Biobase (>= 2.15.4), BiocGenerics, affyio (>=
19 19
         1.23.2), illuminaio, ellipse, mvtnorm, splines, stats, SNPchip,
20 20
         utils, lattice, ff, foreach, RcppEigen (>= 0.3.1.2.1),
21
-        matrixStats, VGAM, parallel
21
+        matrixStats, VGAM, parallel, graphics, limma, beanplot
22 22
 Suggests: hapmapsnp6, genomewidesnp6Crlmm (>= 1.0.7), GGdata, snpStats,
23 23
         RUnit
24 24
 Collate: AllGenerics.R AllClasses.R methods-AssayData.R methods-CNSet.R
25 25
         methods-CNSetLM.R methods-eSet.R methods-SnpSuperSet.R
26 26
         methods-PredictionRegion.R cnrma-functions.R cnset-accessors.R
27
-        crlmm-functions.R crlmmGT2.R crlmm-illumina.R krlmm.R
27
+        crlmm-functions.R crlmmGT2.R crlmm-illumina.R krlmm.R plot.R
28 28
         snprma-functions.R utils.R zzz.R test_crlmm_package.R
29 29
 LazyLoad: yes
30 30
 ## Local Variables:
... ...
@@ -13,7 +13,7 @@ import(methods)
13 13
 importFrom(RcppEigen, fastLmPure)
14 14
 ## importClassesFrom(oligoClasses, CNSet, CNSetLM, ff_matrix,
15 15
 ##                   ff_or_matrix, oligoSnpSet)
16
-importClassesFrom(oligoClasses, CNSet, oligoSnpSet, ff_or_matrix)
16
+importClassesFrom(oligoClasses, CNSet, oligoSnpSet, ff_or_matrix, GenomeAnnotatedDataFrame)
17 17
 ##setOldClass(ff_or_matrix)
18 18
 import(matrixStats)
19 19
 
... ...
@@ -72,13 +72,18 @@ importFrom(stats, coef, cov, dnorm, kmeans, lm, mad, median, quantile,
72 72
 importFrom(utils, packageDescription, setTxtProgressBar,
73 73
            txtProgressBar)
74 74
 
75
+importFrom(graphics, plot, smoothScatter, abline)
76
+
77
+importFrom(beanplot, beanplot)
78
+
75 79
 ## foreach
76 80
 import(foreach)
77 81
 
78 82
 importFrom(VGAM, vglm, multinomial, coefficients)
79 83
 
80
-importFrom(parallel, makeCluster, detectCores, parRapply, stopCluster, clusterEvalQ)
84
+importFrom(parallel, makeCluster, detectCores, parRapply, stopCluster, clusterEvalQ, parCapply)
81 85
 
86
+importFrom(limma, loessFit)
82 87
 
83 88
 ##----------------------------------------------------------------------------
84 89
 ## export
... ...
@@ -92,7 +97,6 @@ exportMethods(CA, CB,
92 97
 	      BafLrrSetList)
93 98
 export(crlmm,
94 99
        crlmmIllumina,
95
-       crlmmIlluminaV2,
96 100
        constructAffyCNSet,
97 101
        genotype,
98 102
        genotypeAffy,
... ...
@@ -106,6 +110,7 @@ export(crlmm,
106 110
        genotype2, genotypeLD,
107 111
        genotypeAffy,
108 112
        genotype.Illumina,
113
+       plotSamples, plotSNPs,
109 114
        crlmmCopynumber2, crlmmCopynumberLD, crlmmCopynumber)
110 115
 export(genotypes, totalCopynumber, rawCopynumber, xyplot)
111 116
 export(ABpanel, validCEL, celDates, validCdfNames)
... ...
@@ -12,7 +12,7 @@ readIdatFiles = function(sampleSheet=NULL,
12 12
 			  sep="_",
13 13
 			  fileExt=list(green="Grn.idat", red="Red.idat"),
14 14
 			  saveDate=FALSE, verbose=FALSE) {
15
-	verbose <- FALSE
15
+       verbose <- FALSE
16 16
        if(!is.null(arrayNames)) {
17 17
                pd = new("AnnotatedDataFrame", data = data.frame(Sample_ID=arrayNames))
18 18
        }
... ...
@@ -57,7 +57,7 @@ readIdatFiles = function(sampleSheet=NULL,
57 57
 	       grnidats = file.path(path, grnfiles)
58 58
 	       redidats = file.path(path, redfiles)
59 59
        }  else {
60
-	       message("path arg not set.  Assuming files are in local directory, or that complete path is provided")
60
+	       message("'path' arg not set.  Assuming files are in local directory, or that complete path is provided in 'arrayNames'")
61 61
 	       grnidats = grnfiles
62 62
 	       redidats = redfiles
63 63
        }
... ...
@@ -102,13 +102,23 @@ readIdatFiles = function(sampleSheet=NULL,
102 102
 		       } # else stop("Could not find probe IDs")
103 103
 		       nprobes = length(ids)
104 104
 		       narrays = length(arrayNames)
105
-		       RG = new("NChannelSet",
106
-		                 R=matrix(0, nprobes, narrays),
107
-		                 G=matrix(0, nprobes, narrays),
108
-		                 zero=matrix(1, nprobes, narrays),
105
+#                       if(cdfName=="nopackage") {
106
+    	               RG = new("NChannelSet",
107
+                                 R = initializeBigMatrix(name = "R", nr = nprobes, nc = narrays, vmode = "integer"),
108
+                                 G = initializeBigMatrix(name = "G", nr = nprobes, nc = narrays, vmode = "integer"),
109
+		                 zero = initializeBigMatrix(name = "zero", nr = nprobes, nc = narrays, vmode = "integer"),
109 110
 				 annotation=headerInfo$Manifest[1],
110 111
 				 phenoData=pd, storage.mode="environment")
112
+#                       } else {
113
+#    	               RG = new("NChannelSet",
114
+#                                 R = matrix(0, nprobes, narrays),
115
+#                                 G = matrix(0, nprobes, narrays),
116
+#		                 zero = matrix(1, nprobes, narrays),
117
+#				 annotation=headerInfo$Manifest[1],
118
+#				 phenoData=pd, storage.mode="environment")                           
119
+#                       }
111 120
 		       featureNames(RG) = ids
121
+                       
112 122
 		       if(!is.null(sampleSheet) && !is.null(sampleSheet$Sample_ID)){
113 123
 		            sampleNames(RG) = make.unique(sampleSheet$Sample_ID)
114 124
 		       } else  sampleNames(RG) = make.unique(basename(arrayNames))
... ...
@@ -289,6 +299,9 @@ readGenCallOutput = function(filenames, path=".", cdfName,
289 299
 	    # matching on SNP names? now assume they come in order
290 300
 	}
291 301
 	maxIndex = baseIndex + sampleCounts[i] - 1
302
+        m=match(totSNPNames, rownames(valuesThisFile$Xvalues))
303
+        valuesThisFile$Xvalues=valuesThisFile$Xvalues[m,]
304
+        valuesThisFile$Yvalues=valuesThisFile$Yvalues[m,]
292 305
 	X[, baseIndex:maxIndex] = valuesThisFile$Xvalues
293 306
 	Y[, baseIndex:maxIndex] = valuesThisFile$Yvalues
294 307
         zero[, baseIndex:maxIndex] = (X[, baseIndex:maxIndex] == 0) || (Y[, baseIndex:maxIndex] == 0)
... ...
@@ -311,7 +324,8 @@ readGenCallOutput = function(filenames, path=".", cdfName,
311 324
 
312 325
 
313 326
 
314
-RGtoXY = function(RG, chipType, verbose=TRUE) {
327
+RGtoXY = function(RG, chipType, anno, verbose=TRUE) {
328
+  if(chipType!="nopackage") {
315 329
   needToLoad <- !all(sapply(c('addressA', 'addressB', 'base'), isLoaded))
316 330
   if(needToLoad){
317 331
 	  chipList = c("human1mv1c",# 1M
... ...
@@ -368,7 +382,6 @@ RGtoXY = function(RG, chipType, verbose=TRUE) {
368 382
 #  aidcol = match("AddressA_ID", colnames(annot))
369 383
 #  if(is.na(aidcol))
370 384
 #    aidcol = match("Address", colnames(annot))
371
-#  bidcol = match("AddressB_ID", colnames(annot))
372 385
 #  if(is.na(bidcol))
373 386
 #    bidcol = match("Address2", colnames(annot))
374 387
 #  aids = annot[, aidcol]
... ...
@@ -383,7 +396,9 @@ RGtoXY = function(RG, chipType, verbose=TRUE) {
383 396
   xyPhenoData = AnnotatedDataFrame(data=RG@phenoData@data,varMetadata=RG@phenoData@varMetadata)
384 397
   levels(xyPhenoData@varMetadata$channel) = c("X","Y","zero","_ALL_")
385 398
   XY <- new("NChannelSet",
386
-        assayDataNew(X=matrix(0, nsnps, narrays),Y=matrix(0, nsnps, narrays),zero=matrix(0, nsnps, narrays)),
399
+        assayDataNew(X=initializeBigMatrix(name="X", nr=nsnps, nc=narrays, vmode="integer"),
400
+                     Y=initializeBigMatrix(name="Y", nr=nsnps, nc=narrays, vmode="integer"),
401
+                     zero=initializeBigMatrix(name="zero", nr=nsnps, nc=narrays, vmode="integer")),
387 402
         phenoData=xyPhenoData, protocolData=RG@protocolData, annotation=chipType)
388 403
    storageMode(XY) = "environment"
389 404
 #  XY <- new("NChannelSet",
... ...
@@ -440,7 +455,55 @@ RGtoXY = function(RG, chipType, verbose=TRUE) {
440 455
 #  XY@assayData$Y[infI,] = 0
441 456
 #  XY@assayData$zero[infI,] = 0
442 457
 #  gc(verbose=FALSE)
443
-  XY
458
+  }
459
+  if(chipType=="nopackage" && !is.null(anno)) {
460
+            pkgname=NULL
461
+#            anno=read.csv(annot,header=TRUE, as.is=TRUE, check.names=FALSE,skip=7)
462
+            aids=anno[,"AddressA_ID"]
463
+            bids=anno[,"AddressB_ID"]
464
+            snpbase=anno[,"SNP"]
465
+            names(aids)=ids=anno[,"Name"]
466
+#            m=match(aids,rownames(RG@assayData$R))
467
+#            ARG=RG[m[!is.na(m)],]
468
+#            mm=match(rownames(ARG@assayData$R),aids)
469
+#            aids=aids[mm]
470
+#            bids=bids[mm]
471
+#            snpbas=snpbase[mm]
472
+            nsnps = length(aids)
473
+            narrays = ncol(RG)
474
+            infI = !is.na(bids) & bids != 0
475
+            aord = match(aids, featureNames(RG))
476
+            bord = match(bids, featureNames(RG))
477
+            xyPhenoData = AnnotatedDataFrame(data = RG@phenoData@data, 
478
+                                             varMetadata = RG@phenoData@varMetadata)
479
+            levels(xyPhenoData@varMetadata$channel) = c("X", "Y", "zero", "_ALL_")
480
+            XY <- new("NChannelSet", assayDataNew(X = initializeBigMatrix(name="X", nr=nsnps, nc=narrays, vmode="integer"),
481
+                                                  Y = initializeBigMatrix(name="Y", nr=nsnps, nc=narrays, vmode="integer"),
482
+                                                  zero = initializeBigMatrix(name="zero", nr=nsnps, nc=narrays, vmode="integer")),
483
+                      phenoData = xyPhenoData, protocolData = RG@protocolData)
484
+            storageMode(XY) = "environment"
485
+            featureNames(XY) = names(aids)
486
+            sampleNames(XY) = sampleNames(RG)
487
+            rm(list = c("bord", "infI", "aids", "bids", "snpbase"))
488
+            gc(verbose = FALSE)
489
+            not.na <- !is.na(aord)
490
+            not.na.aord <- aord[not.na]
491
+            r <- as.matrix(assayData(RG)[["R"]][not.na.aord, ])
492
+            XY@assayData$X[not.na, ] <- r
493
+            rm(r)
494
+            gc(verbose = FALSE)
495
+            g <- as.matrix(assayData(RG)[["G"]][not.na.aord, ])
496
+            XY@assayData$Y[not.na, ] <- g
497
+            rm(g)
498
+            gc(verbose = FALSE)
499
+            z <- as.matrix(assayData(RG)[["zero"]][not.na.aord, ])
500
+            XY@assayData$zero[not.na, ] <- z
501
+            rm(z)
502
+            gc(verbose = FALSE)
503
+            rm(RG)
504
+            gc(verbose = FALSE)
505
+   }
506
+   XY
444 507
 }
445 508
 
446 509
 
... ...
@@ -704,186 +767,188 @@ preprocessInfinium2 = function(XY, mixtureSampleSize=10^5,
704 767
   return(res)
705 768
 }
706 769
 
707
-
708
-crlmmIllumina <- function(RG, XY, stripNorm=TRUE, useTarget=TRUE,
709
-			  row.names=TRUE, col.names=TRUE,
710
-			  probs=c(1/3, 1/3, 1/3), DF=6, SNRMin=5, gender=NULL,
711
-			  seed=1,
712
-			  mixtureSampleSize=10^5, eps=0.1, verbose=TRUE,
713
-			  cdfName, sns, recallMin=10, recallRegMin=1000,
714
-			  returnParams=FALSE, badSNP=.7) {
715
-	if(missing(cdfName)) {
716
-		if(!missing(RG))
717
-			cdfName = annotation(RG)
718
-		if(!missing(XY))
719
-			cdfName = annotation(XY)
720
-	}
721
-	if(!isValidCdfName(cdfName))
722
-		stop("cdfName not valid.  see validCdfNames")
723
-	if(!missing(RG)) {
724
-		if(missing(XY))
725
-			XY = RGtoXY(RG, chipType=cdfName)
726
-		else
727
-			stop("Both RG and XY specified - please use one or the other")
728
-	}
729
-	if (missing(sns)) sns = sampleNames(XY)
730
-	res <- preprocessInfinium2(XY, mixtureSampleSize=mixtureSampleSize,
731
-				   fitMixture=TRUE, verbose=verbose,
732
-				   seed=seed, eps=eps, cdfName=cdfName, sns=sns,
733
-				   stripNorm=stripNorm, useTarget=useTarget) #,
734
-	if(row.names) row.names=res$gns else row.names=NULL
735
-	if(col.names) col.names=res$sns else col.names=NULL
736
-	res2 <- crlmmGT(A=res[["A"]],
737
-			B=res[["B"]],
738
-			SNR=res[["SNR"]],
739
-			mixtureParams=res[["mixtureParams"]],
740
-			cdfName=cdfName,
741
-			row.names=row.names,
742
-			col.names=col.names,
743
-			probs=probs,
744
-			DF=DF,
745
-			SNRMin=SNRMin,
746
-			recallMin=recallMin,
747
-			recallRegMin=recallRegMin,
748
-			gender=gender,
749
-			verbose=verbose,
750
-			returnParams=returnParams,
751
-			badSNP=badSNP)
752
-	res2[["SNR"]] = res[["SNR"]]
753
-	res2[["SKW"]] = res[["SKW"]]
754
-	rm(res); gc(verbose=FALSE)
755
-	return(list2SnpSet(res2, returnParams=returnParams))
756
-}
757
-
758
-
770
+## crlmmIllumina and genotype.Illumina are now the same function
771
+## Use genotype.Illumina instead
772
+#crlmmIllumina <- function(RG, XY, stripNorm=TRUE, useTarget=TRUE,
773
+#			  row.names=TRUE, col.names=TRUE,
774
+#			  probs=c(1/3, 1/3, 1/3), DF=6, SNRMin=5, gender=NULL,
775
+#			  seed=1,
776
+#			  mixtureSampleSize=10^5, eps=0.1, verbose=TRUE,
777
+#			  cdfName, sns, recallMin=10, recallRegMin=1000,
778
+#			  returnParams=FALSE, badSNP=.7) {
779
+#	if(missing(cdfName)) {
780
+#		if(!missing(RG))
781
+#			cdfName = annotation(RG)
782
+#		if(!missing(XY))
783
+#			cdfName = annotation(XY)
784
+#	}
785
+#	if(!isValidCdfName(cdfName))
786
+#		stop("cdfName not valid.  see validCdfNames")
787
+#	if(!missing(RG)) {
788
+#		if(missing(XY))
789
+#			XY = RGtoXY(RG, chipType=cdfName)
790
+#		else
791
+#			stop("Both RG and XY specified - please use one or the other")
792
+#	}
793
+#	if (missing(sns)) sns = sampleNames(XY)
794
+#	res <- preprocessInfinium2(XY, mixtureSampleSize=mixtureSampleSize,
795
+#				   fitMixture=TRUE, verbose=verbose,
796
+#				   seed=seed, eps=eps, cdfName=cdfName, sns=sns,
797
+#				   stripNorm=stripNorm, useTarget=useTarget) #,
798
+#	if(row.names) row.names=res$gns else row.names=NULL
799
+#	if(col.names) col.names=res$sns else col.names=NULL
800
+#	res2 <- crlmmGT(A=res[["A"]],
801
+#			B=res[["B"]],
802
+#			SNR=res[["SNR"]],
803
+#			mixtureParams=res[["mixtureParams"]],
804
+#			cdfName=cdfName,
805
+#			row.names=row.names,
806
+#			col.names=col.names,
807
+#			probs=probs,
808
+#			DF=DF,
809
+#			SNRMin=SNRMin,
810
+#			recallMin=recallMin,
811
+#			recallRegMin=recallRegMin,
812
+#			gender=gender,
813
+#			verbose=verbose,
814
+#			returnParams=returnParams,
815
+#			badSNP=badSNP)
816
+#	res2[["SNR"]] = res[["SNR"]]
817
+#	res2[["SKW"]] = res[["SKW"]]
818
+#	rm(res); gc(verbose=FALSE)
819
+#	return(list2SnpSet(res2, returnParams=returnParams))
820
+#}
821
+
822
+
823
+## Deprecate crlmmIlluminaV2 function
759 824
 ## MR: Below is a more memory efficient version of crlmmIllumina() which
760 825
 ## reads in the .idats and genotypes in the one function and removes objects
761 826
 ## after they have been used
762
-crlmmIlluminaV2 = function(sampleSheet=NULL,
763
-			  arrayNames=NULL,
764
-			  ids=NULL,
765
-			  path=".",
766
-			  arrayInfoColNames=list(barcode="SentrixBarcode_A", position="SentrixPosition_A"),
767
-			  highDensity=FALSE,
768
-			  sep="_",
769
-			  fileExt=list(green="Grn.idat", red="Red.idat"),
770
-			  saveDate=FALSE,
771
-			  stripNorm=TRUE,
772
-			  useTarget=TRUE,
773
- #                         outdir=".",
774
-			  row.names=TRUE,
775
-			  col.names=TRUE,
776
-			  probs=c(1/3, 1/3, 1/3), DF=6, SNRMin=5, gender=NULL,
777
-                          seed=1,  # save.it=FALSE, snpFile, cnFile,
778
-                          mixtureSampleSize=10^5, eps=0.1, verbose=TRUE,
779
-                          cdfName, sns, recallMin=10, recallRegMin=1000,
780
-                          returnParams=FALSE, badSNP=.7) {
781
-
782
-    if(missing(cdfName)) stop("must specify cdfName")
783
-    if(!isValidCdfName(cdfName)) stop("cdfName not valid.  see validCdfNames")
784
-
785
-#    is.lds = ifelse(isPackageLoaded("ff"), TRUE, FALSE)
786
-    is.lds=FALSE
787
-    RG = readIdatFiles(sampleSheet=sampleSheet, arrayNames=arrayNames,
788
-                       ids=ids, path=path, arrayInfoColNames=arrayInfoColNames,
789
-                       highDensity=highDensity, sep=sep, fileExt=fileExt, saveDate=saveDate)
790
-
791
-
792
-    XY = RGtoXY(RG, chipType=cdfName)
793
-#    if(is.lds) {
794
-#      open(RG@assayData$R); open(RG@assayData$G); open(RG@assayData$zero)
795
-#      delete(RG@assayData$R); delete(RG@assayData$G); delete(RG@assayData$zero)
796
-#    }
797
-    rm(RG); gc(verbose=FALSE)
798
-
799
-    if (missing(sns)) { sns = sampleNames(XY)
800
-    }
801
-    res = preprocessInfinium2(XY, mixtureSampleSize=mixtureSampleSize, fitMixture=TRUE, verbose=verbose,
802
-                               seed=seed, eps=eps, cdfName=cdfName, sns=sns, stripNorm=stripNorm, useTarget=useTarget) #, outdir=outdir) #,
803
-#                               save.it=save.it, snpFile=snpFile, cnFile=cnFile)
804
-
805
-#    if(is.lds) {
806
-#      open(XY@assayData$X); open(XY@assayData$Y); open(XY@assayData$zero)
807
-#      delete(XY@assayData$X); delete(XY@assayData$Y); delete(XY@assayData$zero)
808
-#    }
809
-    rm(XY); gc(verbose=FALSE)
810
-
811
-    if(row.names) row.names=res$gns else row.names=NULL
812
-    if(col.names) col.names=res$sns else col.names=NULL
813
-##
814
-##  if(is.lds){
815
-##	  res2 <- crlmmGT2(A=res[["A"]],
816
-##			   B=res[["B"]],
817
-##			   SNR=res[["SNR"]],
818
-##			   mixtureParams=res[["mixtureParams"]],
819
-##			   cdfName=cdfName,
820
-##			   row.names=row.names,
821
-##			   col.names=col.names,
822
-##			   probs=probs,
823
-##			   DF=DF,
824
-##			   SNRMin=SNRMin,
825
-##			   recallMin=recallMin,
826
-##			   recallRegMin=recallRegMin,
827
-##			   gender=gender,
828
-##			   verbose=verbose,
829
-##			   returnParams=returnParams,
830
-##			   badSNP=badSNP)
831
-##  } else {
832
-	  res2 <- crlmmGT(A=res[["A"]],
833
-			   B=res[["B"]],
834
-			   SNR=res[["SNR"]],
835
-			   mixtureParams=res[["mixtureParams"]],
836
-			   cdfName=cdfName,
837
-			   row.names=row.names,
838
-			   col.names=col.names,
839
-			   probs=probs,
840
-			   DF=DF,
841
-			   SNRMin=SNRMin,
842
-			   recallMin=recallMin,
843
-			   recallRegMin=recallRegMin,
844
-			   gender=gender,
845
-			   verbose=verbose,
846
-			   returnParams=returnParams,
847
-			   badSNP=badSNP)
848
-##  }
849
-
850
-##    FUN = ifelse(is.lds, "crlmmGT2", "crlmmGT")
851
-##    ## genotyping
852
-##    crlmmGTfxn = function(FUN,...){
853
-##		switch(FUN,
854
-##		       crlmmGT2=crlmmGT2(...),
855
-##		       crlmmGT=crlmmGT(...))
856
-##              }
857
-##    res2 = crlmmGTfxn(FUN,
858
-##                     A=res[["A"]],
859
-##                     B=res[["B"]],
860
-##                     SNR=res[["SNR"]],
861
-##                     mixtureParams=res[["mixtureParams"]],
862
-##                     cdfName=cdfName,
863
-##                     row.names=row.names,
864
-##                     col.names=col.names,
865
-##                     probs=probs,
866
-##                     DF=DF,
867
-##                     SNRMin=SNRMin,
868
-##                     recallMin=recallMin,
869
-##                     recallRegMin=recallRegMin,
870
-##                     gender=gender,
871
-##                     verbose=verbose,
872
-##                     returnParams=returnParams,
873
-##                     badSNP=badSNP)
874
-
875
-#    if(is.lds) {
876
-#      open(res[["SNR"]]); open(res[["SKW"]])
827
+#crlmmIlluminaV2 = function(sampleSheet=NULL,
828
+#			  arrayNames=NULL,
829
+#			  ids=NULL,
830
+#			  path=".",
831
+#			  arrayInfoColNames=list(barcode="SentrixBarcode_A", position="SentrixPosition_A"),
832
+#			  highDensity=FALSE,
833
+#			  sep="_",
834
+#			  fileExt=list(green="Grn.idat", red="Red.idat"),
835
+#			  saveDate=FALSE,
836
+#			  stripNorm=TRUE,
837
+#			  useTarget=TRUE,
838
+# #                         outdir=".",
839
+#			  row.names=TRUE,
840
+#			  col.names=TRUE,
841
+#			  probs=c(1/3, 1/3, 1/3), DF=6, SNRMin=5, gender=NULL,
842
+#                          seed=1,  # save.it=FALSE, snpFile, cnFile,
843
+#                          mixtureSampleSize=10^5, eps=0.1, verbose=TRUE,
844
+#                          cdfName, sns, recallMin=10, recallRegMin=1000,
845
+#                          returnParams=FALSE, badSNP=.7) {
846
+#
847
+#    if(missing(cdfName)) stop("must specify cdfName")
848
+#    if(!isValidCdfName(cdfName)) stop("cdfName not valid.  see validCdfNames")
849
+#
850
+##    is.lds = ifelse(isPackageLoaded("ff"), TRUE, FALSE)
851
+#    is.lds=FALSE
852
+#    RG = readIdatFiles(sampleSheet=sampleSheet, arrayNames=arrayNames,
853
+#                       ids=ids, path=path, arrayInfoColNames=arrayInfoColNames,
854
+#                       highDensity=highDensity, sep=sep, fileExt=fileExt, saveDate=saveDate)
855
+#
856
+#
857
+#    XY = RGtoXY(RG, chipType=cdfName)
858
+##    if(is.lds) {
859
+##      open(RG@assayData$R); open(RG@assayData$G); open(RG@assayData$zero)
860
+##      delete(RG@assayData$R); delete(RG@assayData$G); delete(RG@assayData$zero)
861
+##    }
862
+#    rm(RG); gc(verbose=FALSE)
863
+#
864
+#    if (missing(sns)) { sns = sampleNames(XY)
877 865
 #    }
878
-    res2[["SNR"]] = res[["SNR"]]
879
-    res2[["SKW"]] = res[["SKW"]]
880
- #  if(is.lds) {
881
- #    delete(res[["A"]]); delete(res[["B"]])
882
- #    delete(res[["SKW"]]); delete(res[["SNR"]]); delete(res[["mixtureParams"]])
883
- #  }
884
-    rm(res); gc(verbose=FALSE)
885
-    return(list2SnpSet(res2, returnParams=returnParams))
886
-}
866
+#    res = preprocessInfinium2(XY, mixtureSampleSize=mixtureSampleSize, fitMixture=TRUE, verbose=verbose,
867
+#                               seed=seed, eps=eps, cdfName=cdfName, sns=sns, stripNorm=stripNorm, useTarget=useTarget) #, outdir=outdir) #,
868
+##                               save.it=save.it, snpFile=snpFile, cnFile=cnFile)
869
+#
870
+##    if(is.lds) {
871
+##      open(XY@assayData$X); open(XY@assayData$Y); open(XY@assayData$zero)
872
+##      delete(XY@assayData$X); delete(XY@assayData$Y); delete(XY@assayData$zero)
873
+##    }
874
+#    rm(XY); gc(verbose=FALSE)
875
+#
876
+#    if(row.names) row.names=res$gns else row.names=NULL
877
+#    if(col.names) col.names=res$sns else col.names=NULL
878
+###
879
+###  if(is.lds){
880
+###	  res2 <- crlmmGT2(A=res[["A"]],
881
+###			   B=res[["B"]],
882
+###			   SNR=res[["SNR"]],
883
+###			   mixtureParams=res[["mixtureParams"]],
884
+###			   cdfName=cdfName,
885
+###			   row.names=row.names,
886
+###			   col.names=col.names,
887
+###			   probs=probs,
888
+###			   DF=DF,
889
+###			   SNRMin=SNRMin,
890
+###			   recallMin=recallMin,
891
+###			   recallRegMin=recallRegMin,
892
+###			   gender=gender,
893
+###			   verbose=verbose,
894
+###			   returnParams=returnParams,
895
+###			   badSNP=badSNP)
896
+###  } else {
897
+#	  res2 <- crlmmGT(A=res[["A"]],
898
+#			   B=res[["B"]],
899
+#			   SNR=res[["SNR"]],
900
+#			   mixtureParams=res[["mixtureParams"]],
901
+#			   cdfName=cdfName,
902
+#			   row.names=row.names,
903
+#			   col.names=col.names,
904
+#			   probs=probs,
905
+#			   DF=DF,
906
+#			   SNRMin=SNRMin,
907
+#			   recallMin=recallMin,
908
+#			   recallRegMin=recallRegMin,
909
+#			   gender=gender,
910
+#			   verbose=verbose,
911
+#			   returnParams=returnParams,
912
+#			   badSNP=badSNP)
913
+###  }
914
+#
915
+###    FUN = ifelse(is.lds, "crlmmGT2", "crlmmGT")
916
+###    ## genotyping
917
+###    crlmmGTfxn = function(FUN,...){
918
+###		switch(FUN,
919
+###		       crlmmGT2=crlmmGT2(...),
920
+###		       crlmmGT=crlmmGT(...))
921
+###              }
922
+###    res2 = crlmmGTfxn(FUN,
923
+###                     A=res[["A"]],
924
+###                     B=res[["B"]],
925
+###                     SNR=res[["SNR"]],
926
+###                     mixtureParams=res[["mixtureParams"]],
927
+###                     cdfName=cdfName,
928
+###                     row.names=row.names,
929
+###                     col.names=col.names,
930
+###                     probs=probs,
931
+###                     DF=DF,
932
+###                     SNRMin=SNRMin,
933
+###                     recallMin=recallMin,
934
+###                     recallRegMin=recallRegMin,
935
+###                     gender=gender,
936
+###                     verbose=verbose,
937
+###                     returnParams=returnParams,
938
+###                     badSNP=badSNP)
939
+#
940
+##    if(is.lds) {
941
+##      open(res[["SNR"]]); open(res[["SKW"]])
942
+##    }
943
+#    res2[["SNR"]] = res[["SNR"]]
944
+#    res2[["SKW"]] = res[["SKW"]]
945
+# #  if(is.lds) {
946
+# #    delete(res[["A"]]); delete(res[["B"]])
947
+# #    delete(res[["SKW"]]); delete(res[["SNR"]]); delete(res[["mixtureParams"]])
948
+# #  }
949
+#    rm(res); gc(verbose=FALSE)
950
+#    return(list2SnpSet(res2, returnParams=returnParams))
951
+#}
887 952
 
888 953
 # Functions analogous to Rob's Affy functions to set up container
889 954
 getProtocolData.Illumina = function(filenames, sep="_", fileExt="Grn.idat", verbose=FALSE) {
... ...
@@ -950,6 +1015,8 @@ constructInf <- function(sampleSheet=NULL,
950 1015
 			 fileExt=list(green="Grn.idat", red="Red.idat"),
951 1016
                          XY,
952 1017
 			 cdfName,
1018
+                         anno,
1019
+                         genome,
953 1020
 			 verbose=FALSE,
954 1021
 			 batch=NULL, #fns,
955 1022
 			 saveDate=TRUE) { #, outdir="."){
... ...
@@ -1015,16 +1082,31 @@ constructInf <- function(sampleSheet=NULL,
1015 1082
 	  if(!all(file.exists(redidats))) stop("Missing some of the *Red.idat files")
1016 1083
 
1017 1084
 	  if(verbose) message("Initializing container for genotyping and copy number estimation")
1018
-	  pkgname <- getCrlmmAnnotationName(cdfName)
1019
-	  path <- system.file("extdata", package=pkgname)
1020
-	  genome <- getAvailableIlluminaGenomeBuild(path)
1021
-	  featureData = getFeatureData(cdfName, copynumber=TRUE, genome=genome)
1022
-	  nr = nrow(featureData); nc = narrays
1023
-	  sns <-  if (!is.null(sampleSheet) && !is.null(sampleSheet$Sample_ID)) {
1085
+          if(cdfName!="nopackage") {
1086
+	    pkgname <- getCrlmmAnnotationName(cdfName)
1087
+	    path <- system.file("extdata", package=pkgname)
1088
+	    genome <- getAvailableIlluminaGenomeBuild(path)
1089
+	    featureData = getFeatureData(cdfName, copynumber=TRUE, genome=genome)
1090
+          } else {
1091
+            if(!is.integer(anno$chr)) {
1092
+                  chr = as.character(anno$chromosome)
1093
+                  chr[chr=="X"] = 23
1094
+                  chr[chr=="Y"] = 24
1095
+                  chr[chr=="XY"] = 25
1096
+            }
1097
+            featureData = new("GenomeAnnotatedDataFrame",
1098
+                          isSnp=as.logical(anno$isSnp),
1099
+                          position=as.integer(anno$position),
1100
+                          chromosome=as.integer(chr),
1101
+                          row.names=anno$featureNames)
1102
+          }
1103
+            nr <- nrow(featureData)
1104
+            nc <- narrays
1105
+	    sns <-  if (!is.null(sampleSheet) && !is.null(sampleSheet$Sample_ID)) {
1024 1106
 		make.unique(sampleSheet$Sample_ID)
1025
-          } else{
1107
+            } else{
1026 1108
 		make.unique(basename(arrayNames))
1027
-          }
1109
+            } 
1028 1110
 	  biga <- initializeBigMatrix(name="A", nr, nc)
1029 1111
 	  bigb <- initializeBigMatrix(name="B", nr, nc)
1030 1112
 	  bigc <- initializeBigMatrix(name="call", nr, nc)
... ...
@@ -1067,11 +1149,16 @@ constructInf <- function(sampleSheet=NULL,
1067 1149
               batch = rep("batch1", narrays) # assume only one batch stop("Must specify 'batch'")
1068 1150
           }
1069 1151
           if(is(batch, "factor")) batch = as.character(batch)
1070
-          pkgname <- getCrlmmAnnotationName(cdfName)
1071
-          path <- system.file("extdata", package=pkgname)
1072
-          genome <- getAvailableIlluminaGenomeBuild(path)
1073
-          featureData = getFeatureData(cdfName, copynumber=TRUE, genome=genome)
1074
-          nr = nrow(featureData); nc = narrays
1152
+          if(cdfName!="nopackage") {
1153
+            pkgname <- getCrlmmAnnotationName(cdfName)
1154
+            path <- system.file("extdata", package=pkgname)
1155
+            genome <- getAvailableIlluminaGenomeBuild(path)
1156
+            featureData = getFeatureData(cdfName, copynumber=TRUE, genome=genome)
1157
+          } else {
1158
+            featureData=anno
1159
+          }
1160
+          nr <- nrow(featureData)
1161
+          nc <- narrays
1075 1162
           sns <-  sampleNames(XY)
1076 1163
           biga <- initializeBigMatrix(name="A", nr, nc)
1077 1164
           bigb <- initializeBigMatrix(name="B", nr, nc)
... ...
@@ -1109,6 +1196,7 @@ preprocessInf <- function(cnSet,
1109 1196
 		       sep="_",
1110 1197
 		       fileExt=list(green="Grn.idat", red="Red.idat"),
1111 1198
                        XY,
1199
+                       anno,
1112 1200
 		       saveDate=TRUE,
1113 1201
 		       stripNorm=TRUE,
1114 1202
 		       useTarget=TRUE,
... ...
@@ -1123,7 +1211,7 @@ preprocessInf <- function(cnSet,
1123 1211
 	open(B(cnSet))
1124 1212
 	sns <- sampleNames(cnSet)
1125 1213
 	pkgname = getCrlmmAnnotationName(annotation(cnSet))
1126
-	is.snp = isSnp(cnSet)
1214
+        is.snp = isSnp(cnSet)
1127 1215
 	snp.index = which(is.snp)
1128 1216
 	narrays = ncol(cnSet)
1129 1217
 	sampleBatches <- splitIndicesByLength(seq_len(ncol(cnSet)), ocSamples()/getDoParWorkers())
... ...
@@ -1140,6 +1228,7 @@ preprocessInf <- function(cnSet,
1140 1228
 		 sep=sep,
1141 1229
 		 fileExt=fileExt,
1142 1230
                  XY=XY,
1231
+                 anno=anno,
1143 1232
 		 saveDate=saveDate,
1144 1233
 		 verbose=verbose,
1145 1234
 		 mixtureSampleSize=mixtureSampleSize,
... ...
@@ -1174,6 +1263,7 @@ genotypeInf <- function(cnSet, mixtureParams, probs=rep(1/3,3),
1174 1263
 			gender=NULL,
1175 1264
 			DF=6,
1176 1265
 			cdfName,
1266
+                        nopackage.norm="quantile",
1177 1267
                         call.method="crlmm",
1178 1268
                         trueCalls=NULL){
1179 1269
 	is.snp = isSnp(cnSet)
... ...
@@ -1215,9 +1305,14 @@ genotypeInf <- function(cnSet, mixtureParams, probs=rep(1/3,3),
1215 1305
            cnSet$gender[,] = tmp$gender
1216 1306
            close(cnSet$gender)
1217 1307
         }
1218
-        if(call.method=="krlmm")
1219
-          tmp <- krlmm(cnSet, cdfName, gender=gender, trueCalls=trueCalls, verbose=verbose) # new function required...  cnSet, cdfName and gender are probably the only arguments you need...
1308
+        if(call.method=="krlmm") {
1309
+            if(cdfName=="nopackage") {
1310
+                tmp <- krlmmnopkg(cnSet, offset=16, gender=gender, normalize.method=nopackage.norm, annotation=anno, verbose=verbose)
1311
+            } else {
1312
+              tmp <- krlmm(cnSet, cdfName, gender=gender, trueCalls=trueCalls, verbose=verbose)
1220 1313
 	## snp.names=featureNames(cnSet)[snp.index])
1314
+            }
1315
+        }
1221 1316
 	if(verbose) message("Genotyping finished.") # Updating container with genotype calls and confidence scores.")
1222 1317
 	TRUE
1223 1318
 }
... ...
@@ -1232,6 +1327,8 @@ genotype.Illumina <- function(sampleSheet=NULL,
1232 1327
 			      sep="_",
1233 1328
 			      fileExt=list(green="Grn.idat", red="Red.idat"),
1234 1329
                               XY=NULL,
1330
+                              anno,
1331
+                              genome,
1235 1332
                               call.method="crlmm",
1236 1333
                               trueCalls=NULL,
1237 1334
 			      cdfName,
... ...
@@ -1240,7 +1337,8 @@ genotype.Illumina <- function(sampleSheet=NULL,
1240 1337
 			      saveDate=FALSE,
1241 1338
 			      stripNorm=TRUE,
1242 1339
 			      useTarget=TRUE,
1243
-                              quantile.method="between",                             
1340
+                              quantile.method="between",
1341
+                              nopackage.norm="quantile",
1244 1342
 			      mixtureSampleSize=10^5,
1245 1343
 			      fitMixture=TRUE,
1246 1344
 			      eps=0.1,
... ...
@@ -1261,7 +1359,8 @@ genotype.Illumina <- function(sampleSheet=NULL,
1261 1359
                             "humanomni258v1p1b",      # Omni2.5 8 v1.1 B
1262 1360
                             "humanomni5quadv1b",      # Omni5 quad
1263 1361
 			    "humanexome12v1p2a",      # Exome 12 v1.2 A
1264
-                            "humanomniexpexome8v1p1b") # Omni Express Exome 8 v1.1b
1362
+                            "humanomniexpexome8v1p1b", # Omni Express Exome 8 v1.1b
1363
+                            "nopackage")
1265 1364
         crlmm.supported = c("human1mv1c",             # 1M
1266 1365
                             "human370v1c",            # 370CNV
1267 1366
 	                    "human650v3a",            # 650Y
... ...
@@ -1309,6 +1408,8 @@ genotype.Illumina <- function(sampleSheet=NULL,
1309 1408
 				    sep=sep,
1310 1409
 				    fileExt=fileExt,
1311 1410
                                     XY=XY,
1411
+                                    anno=anno,
1412
+                                    genome=genome,
1312 1413
 				    cdfName=cdfName,
1313 1414
 				    verbose=verbose,
1314 1415
 				    batch=batch,
... ...
@@ -1327,6 +1428,7 @@ genotype.Illumina <- function(sampleSheet=NULL,
1327 1428
 				    fileExt=fileExt,
1328 1429
 				    saveDate=saveDate,
1329 1430
                                     XY=XY,
1431
+                                    anno=anno, 
1330 1432
 				    stripNorm=stripNorm,
1331 1433
 				    useTarget=useTarget,
1332 1434
 				    mixtureSampleSize=mixtureSampleSize,
... ...
@@ -1336,7 +1438,7 @@ genotype.Illumina <- function(sampleSheet=NULL,
1336 1438
 				    verbose=verbose,
1337 1439
 				    seed=seed,
1338 1440
 				    cdfName=cdfName)
1339
-	message("Preprocessing complete.  Begin genotyping...")
1441
+	message("Begin genotyping...")
1340 1442
 	genotypeInf(cnSet=cnSet, mixtureParams=mixtureParams,
1341 1443
 		    probs=probs,
1342 1444
 		    SNRMin=SNRMin,
... ...
@@ -1348,11 +1450,15 @@ genotype.Illumina <- function(sampleSheet=NULL,
1348 1450
 		    gender=gender,
1349 1451
 		    DF=DF,
1350 1452
 		    cdfName=cdfName,
1453
+                    nopackage.norm=nopackage.norm,
1351 1454
                     call.method=call.method,
1352 1455
                     trueCalls=trueCalls)
1353 1456
 	return(cnSet)
1354 1457
 }
1355 1458
 
1459
+crlmmIllumina <- genotype.Illumina
1460
+
1461
+
1356 1462
 
1357 1463
 processIDAT <- function(stratum, sampleBatches, sampleSheet=NULL,
1358 1464
 			arrayNames=NULL,
... ...
@@ -1363,6 +1469,7 @@ processIDAT <- function(stratum, sampleBatches, sampleSheet=NULL,
1363 1469
 			sep="_",
1364 1470
 			fileExt=list(green="Grn.idat", red="Red.idat"),
1365 1471
                         XY,
1472
+                        anno,
1366 1473
 			saveDate=FALSE,
1367 1474
 			verbose=TRUE,
1368 1475
 			mixtureSampleSize=10^5,
... ...
@@ -1385,19 +1492,30 @@ processIDAT <- function(stratum, sampleBatches, sampleSheet=NULL,
1385 1492
           RG = readIdatFiles(sampleSheet=sampleSheet[sel,], arrayNames=arrayNames[sel],
1386 1493
                        ids=ids, path=path, arrayInfoColNames=arrayInfoColNames,
1387 1494
                        highDensity=highDensity, sep=sep, fileExt=fileExt, saveDate=saveDate, verbose=verbose)
1388
-          XY = RGtoXY(RG, chipType=cdfName)
1495
+          XY = RGtoXY(RG, chipType=cdfName, anno=anno)
1389 1496
           rm(RG)
1390 1497
           gc(verbose=FALSE)
1391 1498
           if (missing(sns) || length(sns)!=ncol(XY)) sns = sampleNames(XY)
1392
-          res = preprocessInfinium2(XY, mixtureSampleSize=mixtureSampleSize, fitMixture=TRUE, verbose=verbose,
1499
+          if(cdfName!="nopackage") {
1500
+             res = preprocessInfinium2(XY, mixtureSampleSize=mixtureSampleSize, fitMixture=TRUE, verbose=verbose,
1393 1501
                                seed=seed, eps=eps, cdfName=cdfName, sns=sns, stripNorm=stripNorm, useTarget=useTarget,
1394
-                               quantile.method=quantile.method) #, outdir=outdir)
1395
-          rm(XY)
1396
-        }else{ #XY already available
1397
-          if (missing(sns) || length(sns)!=ncol(XY)) sns = sampleNames(XY)  
1398
-          res = preprocessInfinium2(XY[,sel], mixtureSampleSize=mixtureSampleSize, fitMixture=TRUE, verbose=verbose,
1502
+                               quantile.method=quantile.method) #, outdir=outdir
1503
+             rm(XY)
1504
+          } else {
1505
+              res = list(A=XY@assayData$X, B=XY@assayData$Y, zero=XY@assayData$zero, sns=sns, gns=names(XY), cdfName=cdfName,
1506
+                  SNR=NA, SKW=NA, mixtureParams=NA)
1507
+              rm(XY)
1508
+          }
1509
+          } else{ #XY already available
1510
+          if(missing(sns) || length(sns)!=ncol(XY)) sns = sampleNames(XY)
1511
+          if(cdfName!="nopackage") {
1512
+            res = preprocessInfinium2(XY[,sel], mixtureSampleSize=mixtureSampleSize, fitMixture=TRUE, verbose=verbose,
1399 1513
                                            seed=seed, eps=eps, cdfName=cdfName, sns=sns[sel], stripNorm=stripNorm, useTarget=useTarget,
1400
-                                           quantile.method=quantile.method) 
1514
+                                           quantile.method=quantile.method)
1515
+          } else {
1516
+              res = list(A=XY@assayData$X, B=XY@assayData$Y, zero=XY@assayData$zero, sns=sns, gns=names(XY), cdfName=cdfName,
1517
+                  SNR=NA, SKW=NA, mixtureParams=NA)
1518
+          }
1401 1519
         }
1402 1520
         gc(verbose=FALSE)
1403 1521
 	if(verbose) message("Finished preprocessing.")
... ...
@@ -1414,6 +1532,8 @@ processIDAT <- function(stratum, sampleBatches, sampleSheet=NULL,
1414 1532
 	## calls and call probabilities by the crlmmGT2 function. The A
1415 1533
 	## and B slots will retain the normalized intensities for the
1416 1534
 	## A and B alleles
1535
+        ## The loop below gives rise to warnings: In `[<-.ff_array`(`*tmp*`, snp.index, jj, value = structure(c(6648L,  ... :
1536
+        ## number of elements to replace is not multiple of values for replacement
1417 1537
 	for(j in seq_along(sel)){
1418 1538
 		jj <- sel[j]
1419 1539
 		A[snp.index, jj] <- Amatrix[, j]
... ...
@@ -1444,3 +1564,4 @@ processIDAT <- function(stratum, sampleBatches, sampleSheet=NULL,
1444 1564
 	gc(verbose=FALSE)
1445 1565
         TRUE
1446 1566
       }
1567
+
... ...
@@ -1,10 +1,9 @@
1 1
 krlmm <- function(cnSet, cdfName, gender=NULL, trueCalls=NULL, verbose=TRUE) {
2
-  
3
-    pkgname <- getCrlmmAnnotationName(cdfName) # crlmm:::
2
+    pkgname <- getCrlmmAnnotationName(cdfName)
4 3
     
5 4
 ### pre-processing, output M    
6 5
     M = computeLogRatio(cnSet, verbose)
7
-    message("leaving out novariant SNPs")
6
+    message("Leaving out non-variant SNPs")
8 7
     
9 8
     # For SNPs with less than 3 distinct data point, exclude them from downstream analysis
10 9
     uniqueCount = apply(M[,], 1, function(x){length(unique(x))})
... ...
@@ -32,11 +31,11 @@ krlmm <- function(cnSet, cdfName, gender=NULL, trueCalls=NULL, verbose=TRUE) {
32 31
     krlmmCoefficients = getKrlmmVGLMCoefficients(pkgname, trueCalls, VGLMparameters, verbose, numSample, colnames(M))
33 32
     
34 33
 ### do VGLM fit, to predict k for each SNP
35
-    kPrediction <- predictKwithVGLM(VGLMparameters, krlmmCoefficients, verbose);
34
+    kPrediction <- predictKwithVGLM(VGLMparameters, krlmmCoefficients, verbose)
36 35
     rm(VGLMparameters)
37 36
     
38 37
 ### assign calls
39
-    assignCalls(calls, M, kPrediction, priormeans, numSNP, numSample, verbose);
38
+    assignCalls(calls, M, kPrediction, priormeans, numSNP, numSample, verbose)
40 39
     rm(kPrediction)
41 40
 
42 41
 ### assign confidence scores
... ...
@@ -70,6 +69,208 @@ krlmm <- function(cnSet, cdfName, gender=NULL, trueCalls=NULL, verbose=TRUE) {
70 69
     TRUE
71 70
 }
72 71
 
72
+krlmmnopkg <- function(cnSet, offset, gender=NULL, normalize.method=NULL, annotation=NULL, verbose=TRUE) {
73
+    if(is.null(normalize.method)){
74
+        message("No normalization method specified. Quantile normalization applied")
75
+        M=quantilenormalization(cnSet,verbose,offset=offset)
76
+    }else{
77
+        if(normalize.method=="quantile"){
78
+            M=quantilenormalization(cnSet,verbose,offset=offset)
79
+        }
80
+        if(normalize.method=="loess"){
81
+            M=loessnormalization(cnSet,verbose,offset=offset)
82
+	}
83
+    }
84
+
85
+    message("Leaving out non-variant SNPs")
86
+
87
+    # For SNPs with less than 3 distinct data point, exclude them from downstream analysis
88
+    uniqueCount = apply(M[,], 1, function(x){length(unique(x))})
89
+    SNPtoProcessIndex = uniqueCount >= 3
90
+    noVariantSNPIndex = uniqueCount < 3
91
+    M = M[SNPtoProcessIndex, ]
92
+
93
+    numSNP = nrow(M)
94
+    numSample = ncol(M)
95
+
96
+    calls = oligoClasses::initializeBigMatrix(name="calls", numSNP, numSample, vmode = "integer")
97
+    scores = oligoClasses::initializeBigMatrix(name="scores", numSNP, numSample, vmode = "double")
98
+    open(calls)
99
+    open(scores)
100
+
101
+    rownames(calls) = rownames(M)
102
+    rownames(scores) = rownames(M)
103
+    colnames(calls) = colnames(M)
104
+    colnames(scores) = colnames(M)
105
+
106
+    prepriormeans=calculatePriorcentersC(M,numSample)
107
+
108
+    trueCalls=matrix(NA,nrow(M),ncol(M))
109
+
110
+    for(i in 1:ncol(M)){
111
+      tmp=kmeans(M[,i],centers=prepriormeans,nstart=45)
112
+      trueCalls[,i]=tmp$cluster
113
+    }
114
+    colnames(trueCalls)=colnames(M)
115
+    rownames(trueCalls)=rownames(M)
116
+
117
+    priormeans = calculatePriorValues(M, numSNP, verbose)
118
+    VGLMparameters = calculateParameters(M, priormeans, numSNP, verbose)
119
+
120
+### retrieve or calculate coefficients
121
+
122
+    krlmmCoefficients = getKrlmmVGLMCoefficients(trueCalls=trueCalls,params=VGLMparameters,numSample=ncol(M),samplenames=colnames(M),verbose=T)
123
+### do VGLM fit, to predict k for each SNP
124
+    kPrediction <- predictKwithVGLM(VGLMparameters, krlmmCoefficients, verbose)
125
+    rm(VGLMparameters)
126
+
127
+### assign calls
128
+    assignCalls(calls, M, kPrediction, priormeans, numSNP, numSample, verbose=T)
129
+    rm(kPrediction)
130
+
131
+### assign confidence scores
132
+    computeCallPr(scores, M, calls, numSNP, numSample, verbose)
133
+
134
+    YIndex = seq(1:nrow(cnSet))[chromosome(cnSet)==24 & isSnp(cnSet)]
135
+
136
+### impute gender if gender information not provided
137
+    if (is.null(gender)) {
138
+        gender = krlmmImputeGender(cnSet, annotation, YIndex, verbose, offset=offset)
139
+    }
140
+
141
+### double-check ChrY SNP cluster, impute gender if gender information not provided
142
+    if (!(is.null(gender))) {
143
+        verifyChrYSNPs(cnSet, M, calls, gender, annotation, YIndex, priormeans, verbose)
144
+    }
145
+
146
+#    if (length(YIndex)>0  && !(is.null(gender))) {
147
+#      verifyChrYSNPs(cnSet, M, calls, gender, annotation, YIndex, priormeans, verbose)
148
+#    }
149
+### add back SNPs excluded before
150
+    AddBackNoVarianceSNPs(cnSet, calls, scores, numSNP, numSample, SNPtoProcessIndex, noVariantSNPIndex)
151
+
152
+    close(calls)
153
+    close(scores)
154
+    rm(calls)
155
+    rm(scores)
156
+    rm(M)
157
+    TRUE
158
+}
159
+
160
+
161
+#######################################################################################################
162
+
163
+loessnormalization<- function(cnSet, verbose, offset=16, blockSize = 300000){
164
+    # compute log-ratio in blocksize of 300,000 by default
165
+    message("Start computing log-ratios")
166
+    A <- A(cnSet)
167
+    B <- B(cnSet)
168
+    open(A)
169
+    open(B)
170
+
171
+    numSNP <- nrow(A)
172
+    numSample <- ncol(A)
173
+    library(limma)
174
+
175
+    M <- oligoClasses::initializeBigMatrix(name="M", numSNP, numSample, vmode = "double")
176
+    S <- oligoClasses::initializeBigMatrix(name="S", numSNP, numSample, vmode = "double")
177
+
178
+    rownames(M) = rownames(A)
179
+    colnames(M) = colnames(A)
180
+
181
+    numBlock = ceiling(numSNP / blockSize)
182
+    for (i in 1:numBlock){
183
+        if (verbose) message(" -- Processing segment ", i, " out of ", numBlock)
184
+        subsetA = as.matrix(A[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])+offset
185
+        subsetB = as.matrix(B[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])+offset
186
+        subsetM = .Call("krlmmComputeM", subsetA, subsetB, PACKAGE="crlmm")
187
+        M[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetM[1:nrow(subsetM), ]
188
+        subsetS = .Call("krlmmComputeS", subsetA, subsetB, PACKAGE="crlmm")
189
+        S[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetS[1:nrow(subsetS), ]
190
+
191
+        rm(subsetA, subsetB, subsetM,subsetS)
192
+
193
+    }
194
+    close(A)
195
+    close(B)
196
+    gc()
197
+
198
+    isna=is.na(M[,1])
199
+    prepriormeansL=calculatePriorcentersC(M[,][!isna,],numSample)
200
+    F01=abs(prepriormeansL[1])-prepriormeansL[2]   
201
+    F02=abs(prepriormeansL[3])+prepriormeansL[2]
202
+
203
+    for(i in 1:ncol(M)) {
204
+        isna=is.na(M[,i])
205
+        Msub = M[!isna,i]
206
+        Mna=M[isna,i]
207
+        Ssub = S[!isna,i]
208
+        Sna=S[isna,i]
209
+
210
+        km = kmeans(Msub, prepriormeansL)
211
+        ind1v2 = km$cluster==1
212
+        ind2v2 = km$cluster==2
213
+        ind3v2 = km$cluster==3
214
+
215
+        l1v2 = loessFit(Msub[ind1v2], Ssub[ind1v2])
216
+        l2v2 = loessFit(Msub[ind2v2], Ssub[ind2v2])
217
+        l3v2 = loessFit(Msub[ind3v2], Ssub[ind3v2])
218
+        Msub[ind1v2] = l1v2$residuals+F01
219
+        Msub[ind2v2] = l2v2$residuals
220
+        Msub[ind3v2] = l3v2$residuals-F02
221
+        Mnew=rbind(as.matrix(Msub),as.matrix(Mna))
222
+        rownames(Mnew)=c(rownames(as.matrix(Msub)),rownames(as.matrix(Mna)))
223
+        m=match(rownames(M),rownames(Mnew))
224
+        Mnew=Mnew[m]
225
+        M[,i] = Mnew
226
+        rm(Msub, Ssub, ind1v2, ind2v2, ind3v2, l1v2, l2v2, l3v2)
227
+    }
228
+    return(M)
229
+}
230
+
231
+
232
+#######################################################################################################
233
+
234
+quantilenormalization <- function(cnSet, verbose, offset=16, blockSize = 300000){
235
+    A <- A(cnSet)
236
+    B <- B(cnSet)
237
+    open(A)
238
+    open(B)
239
+
240
+    numSNP <- nrow(A)
241
+    numSample <- ncol(A)
242
+
243
+    M <- oligoClasses::initializeBigMatrix(name="M", numSNP, numSample, vmode = "double")
244
+    X <- oligoClasses::initializeBigMatrix(name="X", numSNP, numSample, vmode = "integer")
245
+    Y <- oligoClasses::initializeBigMatrix(name="Y", numSNP, numSample, vmode = "integer")
246
+    rownames(M) = rownames(X) = rownames(Y) = rownames(A)
247
+    colnames(M) = colnames(X) = colnames(Y) = colnames(A)
248
+
249
+    # This step may chew up a lot of memory...
250
+    X =  normalize.quantiles(as.matrix(A[,]))+offset
251
+    Y =  normalize.quantiles(as.matrix(B[,]))+offset
252
+
253
+    numBlock = ceiling(numSNP / blockSize)
254
+    for (i in 1:numBlock){
255
+        if (verbose) message(" -- Processing segment ", i, " out of ", numBlock)
256
+        subsetXqws = as.matrix(X[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])
257
+        subsetYqws = as.matrix(Y[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])
258
+        subsetM = .Call("krlmmComputeM", subsetXqws, subsetYqws, PACKAGE="crlmm")
259
+        M[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetM[1:nrow(subsetM), ]
260
+        rm(subsetYqws, subsetXqws, subsetM)
261
+    }
262
+
263
+   close(A)
264
+   close(B)
265
+   return(M)
266
+   delete(X)
267
+   delete(Y)
268
+   rm(X)
269
+   rm(Y)
270
+   gc()
271
+}
272
+
273
+
73 274
 #######################################################################################################
74 275
 
75 276
 getNumberOfCores <- function(){
... ...
@@ -80,11 +281,27 @@ getNumberOfCores <- function(){
80 281
 
81 282
 #######################################################################################################
82 283
 
83
-computeLogRatio <- function(cnSet, verbose, blockSize = 500000){
284
+computeLogRatio <- function(cnSet, verbose=FALSE, offset=0, blockSize = 500000, col=NULL, row=NULL){
84 285
     # compute log-ratio in blocksize of 500,000 by default
85
-    message("Start computing log ratio")
86
-    A <- A(cnSet)
87
-    B <- B(cnSet)
286
+    if(verbose)
287
+        message("Start computing log-ratios")
288
+    if(!is.null(col) | !is.null(row)) {
289
+      if(is.null(row)) {
290
+       A <- A(cnSet)[,col]
291
+       B <- B(cnSet)[,col]
292
+      }
293
+      if(is.null(col)) {
294
+       A <- A(cnSet)[row,]
295
+       B <- B(cnSet)[row,]
296
+      }
297
+      if(!is.null(col) & !is.null(row)) {
298
+       A <- A(cnSet)[row,col]
299
+       B <- B(cnSet)[row,col]
300
+      }
301
+    } else {
302
+       A <- A(cnSet)
303
+       B <- B(cnSet)
304
+    }
88 305
     open(A)
89 306
     open(B)
90 307
     
... ...
@@ -97,9 +314,9 @@ computeLogRatio <- function(cnSet, verbose, blockSize = 500000){
97 314
   
98 315
     numBlock = ceiling(numSNP / blockSize)
99 316
     for (i in 1:numBlock){
100
-        if (verbose) message(" -- Processing segment ", i, " out of ", numBlock)
101
-        subsetA = as.matrix(A[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])
102
-        subsetB = as.matrix(B[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])                     
317
+        if(verbose) message(" -- Processing segment ", i, " out of ", numBlock)
318
+        subsetA = as.matrix(A[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])+offset
319
+        subsetB = as.matrix(B[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])+offset              
103 320
         subsetM = .Call("krlmmComputeM", subsetA, subsetB, PACKAGE="crlmm")
104 321
         M[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetM[1:nrow(subsetM), ]
105 322
         rm(subsetA, subsetB, subsetM)
... ...
@@ -107,15 +324,32 @@ computeLogRatio <- function(cnSet, verbose, blockSize = 500000){
107 324
     
108 325
     close(A)
109 326
     close(B)
110
-    if (verbose) message("Done computing log ratio")
327
+    if(verbose)
328
+        message("Done computing log-ratios")
111 329
     return(M)    
112 330
 }
113 331
 
114
-computeAverageLogIntensity <- function(cnSet, verbose, blockSize = 500000){
332
+computeAverageLogIntensity <- function(cnSet, verbose=FALSE, offset=0, blockSize = 500000, col=NULL, row=NULL){
115 333
     # compute average log intensity in blocksize of 500,000 by default
116
-    message("Start computing average log intensity")
117
-    A <- A(cnSet)
118
-    B <- B(cnSet)
334
+    if(verbose)
335
+        message("Start computing average log-intensities")
336
+    if(!is.null(col) | !is.null(row)) {
337
+      if(is.null(row)) {
338
+       A <- A(cnSet)[,col]
339
+       B <- B(cnSet)[,col]
340
+      }
341
+      if(is.null(col)) {
342
+       A <- A(cnSet)[row,]
343
+       B <- B(cnSet)[row,]
344
+      }
345
+      if(!is.null(col) & !is.null(row)) {
346
+       A <- A(cnSet)[row,col]
347
+       B <- B(cnSet)[row,col]
348
+      }
349
+    } else {
350
+       A <- A(cnSet)
351
+       B <- B(cnSet)
352
+    }
119 353
     open(A)
120 354
     open(B)
121 355
     
... ...
@@ -128,9 +362,9 @@ computeAverageLogIntensity <- function(cnSet, verbose, blockSize = 500000){
128 362
       
129 363
     numBlock = ceiling(numSNP / blockSize)
130 364
     for (i in 1:numBlock){
131
-        if (verbose) message(" -- Processing segment ", i, " out of ", numBlock)
132
-        subsetA = as.matrix(A[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])
133
-        subsetB = as.matrix(B[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])                     
365
+        if(verbose) message(" -- Processing segment ", i, " out of ", numBlock)
366
+        subsetA = as.matrix(A[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])+offset
367
+        subsetB = as.matrix(B[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])+offset                     
134 368
         subsetS = .Call("krlmmComputeS", subsetA, subsetB, PACKAGE="crlmm")
135 369
         S[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetS[1:nrow(subsetS), ]
136 370
         rm(subsetA, subsetB, subsetS)
... ...
@@ -138,9 +372,9 @@ computeAverageLogIntensity <- function(cnSet, verbose, blockSize = 500000){
138 372
     
139 373
     close(A)
140 374
     close(B)
141
-    if (verbose) message("Done computing average log intensity")
142
-    return(S)
143
-  
375
+    if(verbose)
376
+        message("Done computing average log-intensities")
377
+    return(S)  
144 378
 }
145 379
 
146 380
 
... ...
@@ -153,7 +387,7 @@ calculatePriorValues <- function(M, numSNP, verbose) {
153 387
         return(sort(tmp$centers, decreasing = T))
154 388
     }
155 389
 
156
-    if (verbose) message("Start calculating Prior Means")
390
+    if (verbose) message("Start calculating prior means")
157 391
     cl <- makeCluster(getNumberOfCores())
158 392
     centers <- parRapply(cl, M, calculateOneKmeans)
159 393
     stopCluster(cl) 
... ...
@@ -163,10 +397,27 @@ calculatePriorValues <- function(M, numSNP, verbose) {
163 397
       checksymmetric= apply(centers,1,function(x){abs(sum(x))})<1
164 398
       priormeans=apply(centers[checksymmetric,],2, FUN="median", na.rm=TRUE)
165 399
     }
166
-    if (verbose) message("Done calculating Prior Means")
400
+    if (verbose) message("Done calculating prior means")
167 401
     return(priormeans)
168 402
 }
169 403
 
404
+calculatePriorcentersC <- function(M, numSample) {
405
+
406
+    calculateOneKmeans <- function(x) {
407
+        tmp = kmeans(x, 3, nstart=45)
408
+        return(sort(tmp$centers, decreasing = T))
409
+    }
410
+
411
+    cl <- makeCluster(getNumberOfCores())
412
+    centers <- parCapply(cl, M, calculateOneKmeans)
413
+    stopCluster(cl) 
414
+    centers <- matrix(centers, numSample, 3, byrow = TRUE)
415
+    prepriormeans = apply(centers, 2, FUN="median", na.rm=TRUE)
416
+    
417
+    return(prepriormeans)
418
+}
419
+
420
+
170 421
 #######################################################################################################
171 422
 
172 423
 calculateKrlmmCoefficients <- function(trueCalls, params, numSample, samplenames){
... ...
@@ -378,7 +629,7 @@ assignCallsOneSNP <- function(x, priormeans, numSample){
378 629
 
379 630
 assignCalls <- function(callsGt, M, a, priormeans, numSNP, numSample, verbose, blockSize=500000){
380 631
     # process by block size of 500,000 by default
381
-    message("Start assign calls")
632
+    message("Start assigning calls")
382 633
        
383 634
     numBlock = ceiling(numSNP / blockSize)
384 635
 
... ...
@@ -402,14 +653,14 @@ assignCalls <- function(callsGt, M, a, priormeans, numSNP, numSample, verbose, b
402 653
         callsGt[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetcalls[1:thisnumSNP, ]
403 654
         rm(subsetM, subseta, subsetcalls)
404 655
     }
405
-    message("Done assign calls")
656
+    message("Done assigning calls")
406 657
 }
407 658
 
408 659
 #######################################################################################################
409 660
 
410 661
 
411 662
 calculateParameters <- function(M, priormeans, numSNP, verbose) {
412
-    if (verbose) message("Start calculating 3-clusters parameters")
663
+    if (verbose) message("Start calculating 3-cluster parameters")
413 664
     params3cluster <- calculateParameters3Cluster(M, priormeans, numSNP, verbose);
414 665
     if (verbose) message("Done calculating 3-cluster parameters")
415 666
 
... ...
@@ -636,7 +887,7 @@ calculateMahalDist1Cluster <- function(centers, sigma, priors, numSNP){
636 887
 
637 888
 computeCallPr <- function(callsPr, M, calls, numSNP, numSample, verbose, blockSize = 500000){
638 889
     # compute log-ratio in blocksize of 500,000 by default
639
-    if (verbose) message("Start computing confidence score")
890
+    if (verbose) message("Start computing confidence scores")
640 891
     
641 892
     numBlock = ceiling(numSNP / blockSize)
642 893
     for (i in 1:numBlock){
... ...
@@ -648,7 +899,7 @@ computeCallPr <- function(callsPr, M, calls, numSNP, numSample, verbose, blockSi
648 899
         rm(subsetM, subsetCalls, subsetCallProb)
649 900
     }
650 901
 
651
-    if (verbose) message("Done computing confidence score")
902
+    if (verbose) message("Done computing confidence scores")
652 903
 }
653 904
 
654 905
 #############################################
... ...
@@ -671,15 +922,15 @@ AddBackNoVarianceSNPs <- function(cnSet, calls, scores, numSNP, numSample, varia
671 922
 
672 923
 #############################################
673 924
 
674
-krlmmImputeGender <- function(cnSet, annotation, YIndex, verbose){
925
+krlmmImputeGender <- function(cnSet, annotation, YIndex, verbose, offset=0){
675 926
     if (verbose) message("Start imputing gender")    
676
-    S = computeAverageLogIntensity(cnSet, verbose) 
927
+    S = computeAverageLogIntensity(cnSet, verbose, offset=offset) 
677 928
 
678 929
     # S is calculated and saved in original SNP order. 
679
-    matchy = match(annotation[YIndex, 2], rownames(S))
930
+    matchy = match(annotation[YIndex, "Name"], rownames(S))
680 931
     matchy = matchy[!is.na(matchy)]
681 932
     if (length(matchy) <= 10){
682
-        predictedGender = rep(NA, ncol(A))
933
+        predictedGender = rep(NA, ncol(S))
683 934
     }
684 935
     Sy = S[matchy,]
685 936
 
... ...
@@ -704,8 +955,8 @@ krlmmImputeGender <- function(cnSet, annotation, YIndex, verbose){
704 955
     priorS = apply(allS, 2, FUN="median", na.rm=TRUE)
705 956
 
706 957
     if (abs(priorS[1] - priorS[2]) <= 1.6) {
707
-        message("Skipping gender prediction step");
708
-        predictedGender = rep(NA, ncol(Sy))        
958
+        message("Separation between clusters too small (samples probabaly all the same gender): skipping gender prediction step");
959
+        predictedGender = rep(NA, ncol(Sy))
709 960
     }
710 961
     
711 962
     meanmatrix = apply(Sy, 2, median)
... ...
@@ -735,7 +986,7 @@ verifyChrYSNPs <- function(cnSet, M, calls, gender, annotation, YIndex, priormea
735 986
     open(callsGt)
736 987
     open(callsPr)
737 988
        
738
-    matchy = match(annotation[YIndex, 2], rownames(M))
989
+    matchy = match(annotation[YIndex, "Name"], rownames(M))
739 990
     matchy = matchy[!is.na(matchy)]
740 991
    
741 992
     MChrY = M[matchy,]
... ...
@@ -173,7 +173,8 @@ illuminaCdfNames <- function(){
173 173
 	  "humanimmuno12v1b",       # Immuno chip 12
174 174
 	  "humancytosnp12v2p1h",    # CytoSNP 12
175 175
           "humanexome12v1p2a",      # Exome 12 v1.2 A
176
-          "humanomniexpexome8v1p1b")
176
+          "humanomniexpexome8v1p1b",
177
+          "nopackage")
177 178
 }
178 179
 
179 180
 affyCdfNames <- function(){
... ...
@@ -62,8 +62,8 @@ options(width=70)
62 62
 @
63 63
 
64 64
 <<read, results=hide, eval=TRUE>>=
65
-library(Biobase)
66 65
 library(crlmm)
66
+library(ff)
67 67
 library(hapmap370k)
68 68
 
69 69
 data.dir = system.file("idatFiles", package="hapmap370k")
... ...
@@ -73,73 +73,31 @@ samples = read.csv(file.path(data.dir, "samples370k.csv"), as.is=TRUE)
73 73
 samples[1:5,]
74 74
 @
75 75
 
76
-<<read2, results=hide, cache=TRUE>>=
77
-# Read in .idats using sampleSheet information
78
-RG = readIdatFiles(samples, path=data.dir,
79
-arrayInfoColNames=list(barcode=NULL,position="SentrixPosition"),saveDate=TRUE)
80
-@
81
-
82
-Reading in this data takes approximately 100 seconds and peak memory usage
83
-was 0.8 GB of RAM on our linux system.
84
-If memory is limiting, load the \Rpackage{ff} package and run the same command.
85
-When this package is available, the objects are stored using disk rather then RAM.
86
-The \Robject{RG} object is an \Rclass{NChannelSet} which stores the
87
-Red and Green intensities, the number of beads and standard errors for
88
-each bead-type.
89
-The scanning date of each array is stored in \Robject{protocolData}.
90
-
91
-<<explore>>=
92
-class(RG)
93
-dim(RG)
94
-slotNames(RG)
95
-channelNames(RG)
96
-exprs(channel(RG, "R"))[1:5,1:5]
97
-exprs(channel(RG, "G"))[1:5,1:5]
98
-pd = pData(RG)
99
-pd[1:5,]
100
-
101
-scandatetime = strptime(protocolData(RG)[["ScanDate"]], "%m/%d/%Y %H:%M:%S %p")
102
-datescanned = substr(scandatetime, 1, 10)
103
-scanbatch =  factor(datescanned)
104
-levels(scanbatch) = 1:16
105
-scanbatch = as.numeric(scanbatch)
106
-@
107
-
108
-If GenCall output is available instead of idat files, the function \Rfunction{readGenCallOutput} can be
109
-used to read in the data.
110
-This function assumes the GenCall output is formatted to have samples listed one below the other,
111
-and that the columns 'X Raw' and 'Y Raw' are available in the file.
112
-The resulting \Robject{NChannelSet} from this function can be used as input to \Rfunction{crlmmIllumina} via the \Robject{XY} argument (instead of the usual \Rfunction{RG} argument used when the data has been read in from idat files).
113
-
114
-Plots of the summarised data can be easily generated to check for arrays with poor signal.
115
-
116
-<<boxplots, fig=TRUE, width=8, height=8>>=
117
-par(mfrow=c(2,1), mai=c(0.4,0.4,0.4,0.1), oma=c(1,1,0,0))
118
-boxplot(log2(exprs(channel(RG, "R"))), xlab="Array", ylab="", names=1:40,
119
-main="Red channel",outline=FALSE,las=2)
120
-boxplot(log2(exprs(channel(RG, "G"))), xlab="Array", ylab="", names=1:40,
121
-main="Green channel",outline=FALSE,las=2)
122
-mtext(expression(log[2](intensity)), side=2, outer=TRUE)
123
-mtext("Array", side=1, outer=TRUE)
124
-@
125
-
126 76
 \section{Genotyping}
127 77
 
128 78
 Next we use the function \Rfunction{crlmmIllumina} which performs preprocessing followed by genotyping using the CRLMM algorithm.
79
+It reads in data from idat files and stores results in a \Rclass{CNSet} object.
129 80
 
130 81
 <<genotype, results=hide, cache=TRUE>>=
131
-crlmmResult = crlmmIllumina(RG=RG, cdfName="human370v1c", returnParams=TRUE)
132
-@
82
+crlmmResult = crlmmIllumina(samples, path=data.dir,
83
+				arrayInfoColNames=list(barcode=NULL,position="SentrixPosition"),
84
+				saveDate=TRUE, cdfName="human370v1c", verbose=FALSE)
133 85
 
134
-This analysis took 3 minutes to complete and peak memory usage was 1.9 GB on our system.
135
-The output stored in \Robject{crlmmResult} is a \Rclass{SnpSet} object.
136
-<<explore2>>=
137 86
 class(crlmmResult)
138 87
 dim(crlmmResult)
139 88
 slotNames(crlmmResult)
140 89
 calls(crlmmResult)[1:10, 1:5]
90
+i2p(confs(crlmmResult)[1:10,1:5])
141 91
 @
142 92
 
93
+If GenCall output is available instead of idat files, the function \Rfunction{readGenCallOutput} can be
94
+used to read in the data.
95
+This function assumes the GenCall output is formatted to have samples listed one below the other,
96
+and that the columns 'X Raw' and 'Y Raw' are available in the file.
97
+The resulting \Robject{NChannelSet} from this function can be used as input to 
98
+\Rfunction{crlmmIllumina} (or equivalently \code{genotype.Illumina}) via the \Robject{XY} argument 
99
+(instead of the usual \Rfunction{RG} argument used when the data has been read in from idat files).
100
+
143 101
 Plotting the {\it SNR} reveals no obvious batch effects in this data set (different symbols are used for
144 102
 arrays scanned on different days).
145 103
 
... ...
@@ -148,15 +106,16 @@ plot(crlmmResult[["SNR"]], pch=scanbatch, xlab="Array", ylab="SNR",
148 106
      main="Signal-to-noise ratio per array",las=2)
149 107
 @
150 108
 
151
-An all-in-one function named \Rfunction{crlmmIlluminaV2} that combines
152
-reading of idat files with genotyping is also available.
153
-
154
-<<readandgenotypeinone, results=hide, cache=TRUE>>=
155
-crlmmResult2 <- crlmmIlluminaV2(samples, path=data.dir,
156
-				arrayInfoColNames=list(barcode=NULL,position="SentrixPosition"),
157
-				saveDate=TRUE, cdfName="human370v1c", returnParams=TRUE)
109
+<<plotsamples, fig=TRUE, width=8, height=8>>=
110
+par(mfrow=c(2,2))
111
+plotSamples(crlmmResult, col=1:4)
158 112
 @
159 113
 
114
+<<plotsnps, fig=TRUE, width=8, height=8>>=
115
+par(mfrow=c(2,2))
116
+plotSNPs(crlmmResult, row=1:4)
117
+@ 
118
+
160 119
 \section{System information}
161 120
 
162 121
 This analysis was carried out on a linux machine with 32GB of RAM
... ...
@@ -11,35 +11,35 @@ genotypingTest <- function(){
11 11
 	checkTrue(all.equal(calls(cnSet), calls(cnSet2)[,]))
12 12
 }
13 13
 
14
-genotypingTestIllumina <- function(){
15
-	setwd("/thumper/ctsa/snpmicroarray/illumina/IDATS/370k/")
16
-	library(crlmm)
17
-	library(ff)
18
-	ldPath(tempdir())
19
-
20
-	samples <- read.csv("samples370k.csv", as.is=TRUE)
21
-	RG <- readIdatFiles(sampleSheet=samples,
22
-			    arrayInfoColNames=list(barcode=NULL, position="SentrixPosition"),
23
-			    saveDate=TRUE)
24
-
25
-	crlmmResult <-  crlmmIllumina(RG=RG,
26
-				      cdfName="human370v1c",
27
-				      returnParams=TRUE)
28
-	checkTrue(is(calls(crlmmResult)[1:5,1], "integer"))
29
-
30
-	crlmmResult2 <- crlmmIlluminaV2(sampleSheet=samples,
31
-					arrayInfoColNames=list(barcode=NULL, position="SentrixPosition"),
32
-					saveDate=TRUE, cdfName="human370v1c", returnParams=TRUE)
33
-	checkTrue(identical(calls(crlmmResult)[1:5, ]),
34
-		  identical(calls(crlmmResult2)[1:5, ]))
35
-
36
-	crlmmResult3 <- genotype.Illumina(sampleSheet=samples,
37
-					  arrayInfoColNames=list(barcode=NULL,
38
-					  position="SentrixPosition"),
39
-					  saveDate=TRUE, cdfName="human370v1c",
40
-					  batch = as.factor(rep(1, nrow(samples))))
41
-
42
-	checkTrue(identical(calls(crlmmResult)[1:5, ]),
43
-		  identical(calls(crlmmResult3)[1:5, ]))
44
-
45
-}
14
+#genotypingTestIllumina <- function(){
15
+#	setwd("/thumper/ctsa/snpmicroarray/illumina/IDATS/370k/")
16
+#	library(crlmm)
17
+#	library(ff)
18
+#	ldPath(tempdir())
19
+#
20
+#	samples <- read.csv("samples370k.csv", as.is=TRUE)
21
+#	RG <- readIdatFiles(sampleSheet=samples,
22
+#			    arrayInfoColNames=list(barcode=NULL, position="SentrixPosition"),
23
+#			    saveDate=TRUE)
24
+#
25
+#	crlmmResult <-  crlmmIllumina(RG=RG,
26
+#				      cdfName="human370v1c",
27
+#				      returnParams=TRUE)
28
+#	checkTrue(is(calls(crlmmResult)[1:5,1], "integer"))
29
+#
30
+#	crlmmResult2 <- crlmmIlluminaV2(sampleSheet=samples,
31
+#					arrayInfoColNames=list(barcode=NULL, position="SentrixPosition"),
32
+#					saveDate=TRUE, cdfName="human370v1c", returnParams=TRUE)
33
+#	checkTrue(identical(calls(crlmmResult)[1:5, ]),
34
+#		  identical(calls(crlmmResult2)[1:5, ]))
35
+#
36
+#	crlmmResult3 <- genotype.Illumina(sampleSheet=samples,
37
+#					  arrayInfoColNames=list(barcode=NULL,
38
+#					  position="SentrixPosition"),
39
+#					  saveDate=TRUE, cdfName="human370v1c",
40
+#					  batch = as.factor(rep(1, nrow(samples))))
41
+#
42
+#	checkTrue(identical(calls(crlmmResult)[1:5, ]),
43
+#		  identical(calls(crlmmResult3)[1:5, ]))
44
+#
45
+#}
... ...
@@ -1,3 +1,4 @@
1
+
1 2
 \name{constructInf}
2 3
 \alias{constructInf}
3 4
 \title{
... ...
@@ -11,7 +12,7 @@
11 12
 \usage{
12 13
 constructInf(sampleSheet = NULL, arrayNames = NULL, path = ".",
13 14
        arrayInfoColNames = list(barcode="SentrixBarcode_A",position="SentrixPosition_A"), highDensity = FALSE, sep = "_",
14
-       fileExt = list(green = "Grn.idat", red = "Red.idat"), XY, cdfName, verbose = FALSE, batch=NULL, saveDate = TRUE)
15
+       fileExt = list(green = "Grn.idat", red = "Red.idat"), XY, cdfName, anno, genome, verbose = FALSE, batch=NULL, saveDate = TRUE)
15 16
 }
16 17
 \arguments{
17 18
   \item{sampleSheet}{\code{data.frame} containing Illumina sample sheet
... ...
@@ -38,7 +39,12 @@ constructInf(sampleSheet = NULL, arrayNames = NULL, path = ".",
38 39
   \item{fileExt}{list containing elements 'Green' and 'Red' which
39 40
     specify the .idat file extension for the Cy3 and Cy5 channels.}
40 41
   \item{XY}{an \code{NChannelSet} containing X and Y intensities.}
41
-  \item{cdfName}{ annotation package  (see also \code{validCdfNames})}
42
+  \item{cdfName}{annotation package (see also \code{validCdfNames}) or 
43
+    'nopackage' when an \code{anno} data.frame and \code{genome} supplied}
44
+  \item{anno}{data.frame containing SNP annotation information from 
45
+    manifest and additional columns 'isSnp', 'position', 'chromosome' 
46
+    and 'featureNames'. For use when \code{cdfName}='nopackage'}
47
+  \item{genome}{character string specifying which genome is used in annotation}
42 48
   \item{verbose}{  'logical.'  Whether to print descriptive messages
43 49
   during processing.}
44 50
  \item{batch}{ batch variable. See details.}
45 51
deleted file mode 100644
... ...
@@ -1,85 +0,0 @@
1
-\name{crlmmIllumina}
2
-\alias{crlmmIllumina}
3
-\title{Genotype Illumina Infinium II BeadChip data with CRLMM}
4
-\description{
5
-  Implementation of the CRLMM algorithm for
6
-  data from Illumina's Infinium II BeadChips.
7
-}
8
-\usage{
9
-
10
-crlmmIllumina(RG, XY, stripNorm=TRUE,
11
-      useTarget=TRUE, row.names=TRUE, col.names=TRUE,
12
-      probs=c(1/3, 1/3, 1/3), DF=6, SNRMin=5,
13
-      gender=NULL, seed=1, mixtureSampleSize=10^5,
14
-      eps=0.1, verbose=TRUE, cdfName, sns, recallMin=10,
15
-      recallRegMin=1000, returnParams=FALSE, badSNP=0.7)
16
-}
17
-
18
-\arguments{
19
-  \item{RG}{\code{NChannelSet} containing R and G bead intensities}
20
-  \item{XY}{\code{NChannelSet} containing X and Y bead intensities}
21
-  \item{stripNorm}{'logical'.  Should the data be strip-level normalized?}
22
-  \item{useTarget}{'logical' (only used when \code{stripNorm=TRUE}).
23
-    Should the reference HapMap intensities be used in strip-level normalization?}
24
-  \item{row.names}{'logical'. Use rownames - SNP names?}
25
-  \item{col.names}{'logical'. Use colnames - Sample names?}
26
-  \item{probs}{'numeric' vector with priors for AA, AB and BB.}
27
-  \item{DF}{'integer' with number of degrees of freedom to use with t-distribution.}
28
-  \item{SNRMin}{'numeric' scalar defining the minimum SNR used to filter
29
-  out samples.}
30
-  \item{gender}{'integer' vector, with same length as 'filenames',
31
-    defining sex. (1 - male; 2 - female)}
32
-  \item{seed}{'integer' scalar for random number generator (used to
33
-    sample \code{mixtureSampleSize} SNPs for mixture model.}
34
-  \item{mixtureSampleSize}{'integer'. The number of SNP's to be used
35
-    when fitting the mixture model.}
36
-  \item{eps}{Minimum change for mixture model.}
37
-  \item{verbose}{'logical'.}
38
-  \item{cdfName}{'character' defining the chip annotation (manifest) to use
39
-    ('human370v1c', human550v3b', 'human650v3a', 'human1mv1c',
40
-    'human370quadv3c', 'human610quadv1b', 'human660quadv1a',
41
-    'human1mduov3b', 'humanomni1quadv1b', 'humanomniexpress12v1b', 'humancytosnp12v2p1h')}
42
-  \item{sns}{'character' vector with sample names to be used.}
43
-  \item{recallMin}{'integer'. Minimum number of samples for recalibration.}
44
-  \item{recallRegMin}{'integer'. Minimum number of SNP's for regression.}
45
-  \item{returnParams}{'logical'. Return recalibrated parameters.}
46
-  \item{badSNP}{'numeric'. Threshold to flag as bad SNP (affects batchQC)}
47
-}
48
-\value{
49
-  A \code{SnpSet} object which contains
50
-  \item{calls}{Genotype calls (1 - AA, 2 - AB, 3 - BB)}
51
-  \item{callProbability}{confidence scores 'round(-1000*log2(1-p))'}
52
-  in the \code{assayData} slot and
53
-  \item{SNPQC}{SNP Quality Scores}
54
-  \item{batchQC}{Batch Quality Scores}
55
-  along with center and scale parameters when \code{returnParams=TRUE}
56
-  in the \code{featureData} slot.
57
-}
58
-
59
-\details{
60
-
61
-  Note: The user should specify either the \code{RG} or \code{XY}
62
-  intensities, not both.
63
-}
64
-
65
-\references{
66
-  Ritchie ME, Carvalho BS, Hetrick KN, Tavar\'{e} S, Irizarry RA.
67
-  R/Bioconductor software for Illumina's Infinium whole-genome
68
-  genotyping BeadChips. Bioinformatics. 2009 Oct 1;25(19):2621-3.
69
-
70
-  Carvalho B, Bengtsson H, Speed TP, Irizarry RA. Exploration,
71
-  normalization, and genotype calls of high-density oligonucleotide SNP
72
-  array data. Biostatistics. 2007 Apr;8(2):485-99. Epub 2006 Dec
73
-  22. PMID: 17189563.
74
-
75
-  Carvalho BS, Louis TA, Irizarry RA.
76
-  Quantifying uncertainty in genotype calls.
77
-  Bioinformatics. 2010 Jan 15;26(2):242-9.
78
-}
79
-
80
-\author{Matt Ritchie}
81
-
82
-\examples{
83
-## crlmmOut = crlmmIllumina(RG)
84
-}
85
-\keyword{classif}
86 0
deleted file mode 100644
... ...
@@ -1,113 +0,0 @@
1
-\name{crlmmIlluminaV2}
2
-\alias{crlmmIlluminaV2}
3
-\title{Read and Genotype Illumina Infinium II BeadChip data with CRLMM}
4
-\description{
5
-  Implementation of the CRLMM algorithm for
6
-  data from Illumina's Infinium II BeadChips.
7
-}
8
-\usage{
9
-
10
-crlmmIlluminaV2(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".",
11
-      arrayInfoColNames=list(barcode="SentrixBarcode_A", position="SentrixPosition_A"),
12
-      highDensity=FALSE, sep="_", fileExt=list(green="Grn.idat", red="Red.idat"),
13
-      saveDate=FALSE, stripNorm=TRUE, useTarget=TRUE,
14
-      row.names=TRUE, col.names=TRUE, probs=c(1/3, 1/3, 1/3),
15
-      DF=6, SNRMin=5, gender=NULL, seed=1, mixtureSampleSize=10^5,
16
-      eps=0.1, verbose=TRUE, cdfName, sns, recallMin=10,
17
-      recallRegMin=1000, returnParams=FALSE, badSNP=.7)
18
-}
19
-
20
-\arguments{
21
-  \item{sampleSheet}{\code{data.frame} containing Illumina sample sheet
22
-    information (for required columns, refer to BeadStudio Genotyping
23
-    guide - Appendix A).}
24
-  \item{arrayNames}{character vector containing names of arrays to be
25
-    read in.  If \code{NULL}, all arrays that can be found in the
26
-    specified working directory will be read in.}
27
-  \item{ids}{vector containing ids of probes to be read in.  If
28
-    \code{NULL} all probes found on the first array are read in.}
29
-  \item{path}{character string specifying the location of files to be
30
-    read by the function}
31
-  \item{arrayInfoColNames}{(used when \code{sampleSheet} is specified)
32
-    list containing elements 'barcode' which indicates column names in
33
-    the \code{sampleSheet} which contains the arrayNumber/barcode number
34
-    and 'position' which indicates the strip number.  In older style
35
-    sample sheets, this information is combined (usually in a column
36
-    named 'SentrixPosition') and this should be specified as
37
-    \code{list(barcode=NULL, position="SentrixPosition")}}
38
-  \item{highDensity}{logical (used when \code{sampleSheet} is
39
-    specified). If \code{TRUE}, array extensions '\_A', '\_B' in
40
-    sampleSheet are replaced with 'R01C01', 'R01C02' etc.}
41
-  \item{sep}{character string specifying separator used in .idat file
42
-    names.}
43
-  \item{fileExt}{list containing elements 'Green' and 'Red' which
44
-    specify the .idat file extension for the Cy3 and Cy5 channels.}
45
-  \item{saveDate}{'logical'.  Should the dates from each .idat be saved
46
-    with sample information?}
47
-  \item{stripNorm}{'logical'.  Should the data be strip-level normalized?}
48
-  \item{useTarget}{'logical' (only used when \code{stripNorm=TRUE}).
49
-    Should the reference HapMap intensities be used in strip-level normalization?}
50
-  \item{row.names}{'logical'. Use rownames - SNP names?}
51
-  \item{col.names}{'logical'. Use colnames - Sample names?}
52
-  \item{probs}{'numeric' vector with priors for AA, AB and BB.}
53
-  \item{DF}{'integer' with number of degrees of freedom to use with t-distribution.}
54
-  \item{SNRMin}{'numeric' scalar defining the minimum SNR used to filter
55
-  out samples.}
56
-  \item{gender}{'integer' vector, with same length as 'filenames',
57
-    defining sex. (1 - male; 2 - female)}
58
-  \item{seed}{'integer' scalar for random number generator (used to
59
-    sample \code{mixtureSampleSize} SNPs for mixture model.}
60
-  \item{mixtureSampleSize}{'integer'. The number of SNP's to be used
61
-    when fitting the mixture model.}
62
-  \item{eps}{Minimum change for mixture model.}
63
-  \item{verbose}{'logical'.}
64
-  \item{cdfName}{'character' defining the chip annotation (manifest) to use
65
-    ('human370v1c', human550v3b', 'human650v3a', 'human1mv1c',
66
-    'human370quadv3c', 'human610quadv1b', 'human660quadv1a',
67
-    'human1mduov3b', 'humanomni1quadv1b', 'humanomniexpress12v1b', 'humancytosnp12v2p1h')}
68
-  \item{sns}{'character' vector with sample names to be used.}
69
-  \item{recallMin}{'integer'. Minimum number of samples for recalibration.}
70
-  \item{recallRegMin}{'integer'. Minimum number of SNP's for regression.}
71
-  \item{returnParams}{'logical'. Return recalibrated parameters.}
72
-  \item{badSNP}{'numeric'. Threshold to flag as bad SNP (affects batchQC)}
73
-}
74
-\value{
75
-  A \code{SnpSet} object which contains
76
-  \item{calls}{Genotype calls (1 - AA, 2 - AB, 3 - BB)}
77
-  \item{callProbability}{confidence scores 'round(-1000*log2(1-p))'}
78
-  in the \code{assayData} slot and
79
-  \item{SNPQC}{SNP Quality Scores}
80
-  \item{batchQC}{Batch Quality Scores}
81
-  along with center and scale parameters when \code{returnParams=TRUE}
82
-  in the \code{featureData} slot.
83
-}
84
-
85
-\details{
86
-  This function combines the reading of data from idat files using
87
-  \code{readIdatFiles} and genotyping to reduce memory usage.
88
-}
89
-
90
-\references{
91
-  Ritchie ME, Carvalho BS, Hetrick KN, Tavar\'{e} S, Irizarry RA.
92
-  R/Bioconductor software for Illumina's Infinium whole-genome
93
-  genotyping BeadChips. Bioinformatics. 2009 Oct 1;25(19):2621-3.
94
-
95
-  Carvalho B, Bengtsson H, Speed TP, Irizarry RA. Exploration,
96
-  normalization, and genotype calls of high-density oligonucleotide SNP
97
-  array data. Biostatistics. 2007 Apr;8(2):485-99. Epub 2006 Dec
98
-  22. PMID: 17189563.
99
-
100
-  Carvalho BS, Louis TA, Irizarry RA.
101
-  Quantifying uncertainty in genotype calls.
102
-  Bioinformatics. 2010 Jan 15;26(2):242-9.
103
-}
104
-
105
-\author{Matt Ritchie}
106
-
107
-\examples{
108
-## crlmmOut = crlmmIlluminaV2(samples,path=path,arrayInfoColNames=list(barcode="Chip",position="Section"),
109
-##                             saveDate=TRUE,cdfName="human370v1c",returnParams=TRUE)
110
-
111
-}
112
-\seealso{\code{\link{crlmmIllumina}}}
113
-\keyword{classif}
... ...
@@ -1,5 +1,6 @@
1 1
 \name{genotype.Illumina}
2 2
 \alias{genotype.Illumina}
3
+\alias{crlmmIllumina}
3 4
 
4 5
 \title{
5 6
 	Preprocessing and genotyping of Illumina Infinium II arrays.
... ...
@@ -10,11 +11,20 @@
10 11
 \usage{
11 12
 genotype.Illumina(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".",
12 13
       arrayInfoColNames=list(barcode="SentrixBarcode_A", position="SentrixPosition_A"),
13
-      highDensity=FALSE, sep="_", fileExt=list(green="Grn.idat", red="Red.idat"), XY=NULL,
14
+      highDensity=FALSE, sep="_", fileExt=list(green="Grn.idat", red="Red.idat"), XY=NULL, anno, genome, 
14 15
       call.method="crlmm", trueCalls=NULL, cdfName, copynumber=TRUE, batch=NULL, saveDate=FALSE, stripNorm=TRUE, 
15
-      useTarget=TRUE, quantile.method="between", mixtureSampleSize=10^5, fitMixture=TRUE,                               
16
-      eps =0.1, verbose = TRUE, seed = 1, sns, probs = rep(1/3, 3), DF = 6, SNRMin = 5, 
16
+      useTarget=TRUE, quantile.method="between", nopackage.norm="quantile", mixtureSampleSize=10^5, fitMixture=TRUE,
17
+      eps=0.1, verbose = TRUE, seed = 1, sns, probs = rep(1/3, 3), DF = 6, SNRMin = 5, 
17 18
       recallMin = 10, recallRegMin = 1000, gender = NULL, returnParams = TRUE, badSNP = 0.7)
19
+
20
+crlmmIllumina(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".",
21
+      arrayInfoColNames=list(barcode="SentrixBarcode_A", position="SentrixPosition_A"),
22
+      highDensity=FALSE, sep="_", fileExt=list(green="Grn.idat", red="Red.idat"), XY=NULL, anno, genome, 
23
+      call.method="crlmm", trueCalls=NULL, cdfName, copynumber=TRUE, batch=NULL, saveDate=FALSE, stripNorm=TRUE, 
24
+      useTarget=TRUE, quantile.method="between", nopackage.norm="quantile", mixtureSampleSize=10^5, fitMixture=TRUE,
25
+      eps=0.1, verbose = TRUE, seed = 1, sns, probs = rep(1/3, 3), DF = 6, SNRMin = 5, 
26
+      recallMin = 10, recallRegMin = 1000, gender = NULL, returnParams = TRUE, badSNP = 0.7)
27
+
18 28
 }
19 29
 \arguments{
20 30
   \item{sampleSheet}{\code{data.frame} containing Illumina sample sheet
... ...
@@ -42,9 +52,13 @@ genotype.Illumina(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".",
42 52
   \item{fileExt}{list containing elements 'Green' and 'Red' which
43 53
     specify the .idat file extension for the Cy3 and Cy5 channels.}
44 54
   \item{XY}{\code{NChannelSet} containing X and Y intensities.}
55
+  \item{anno}{data.frame containing SNP annotation information from 
56
+    manifest and additional columns 'isSnp', 'position', 'chromosome' 
57
+    and 'featureNames'. For use when \code{cdfName}='nopackage'}
58
+  \item{genome}{character string specifying which genome is used in annotation}
45 59
   \item{call.method}{character string specifying the genotype calling algorithm to use ('crlmm' or 'krlmm').}
46 60
   \item{trueCalls}{matrix specifying known Genotype calls(can contain some NAs) for a subset of samples and features (1 - AA, 2 - AB, 3 - BB).}
47
-  \item{cdfName}{ annotation package  (see also \code{validCdfNames})}
61
+  \item{cdfName}{annotation package (see also \code{validCdfNames}) or 'nopackage' when combined with 'krlmm', an \code{anno} data.frame and \code{genome}.}
48 62
   \item{copynumber}{ 'logical.' Whether to store copy number intensities with SNP output.}
49 63
   \item{batch}{ character vector indicating the batch variable. Must be
50 64
 	the same length as the number of samples. See details.}
... ...
@@ -54,6 +68,8 @@ genotype.Illumina(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".",
54 68
   \item{useTarget}{'logical' (only used when \code{stripNorm=TRUE}).
55 69
     Should the reference HapMap intensities be used in strip-level normalization?}
56 70
   \item{quantile.method}{character string specifying the quantile normalization method to use ('within' or 'between' channels).}
71
+  \item{nopackage.norm}{character string specifying normalization to be used when \code{cdfName}='nopackage'.
72
+    Options are 'none', 'quantile' (within channel, between array) and 'loess'.}
57 73
   \item{mixtureSampleSize}{ Sample size to be use when fitting the mixture model.}
58 74
   \item{fitMixture}{ 'logical.' Whether to fit per-array mixture model.}
59 75
   \item{eps}{   Stop criteria.}
... ...
@@ -74,23 +90,15 @@ genotype.Illumina(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".",
74 90
 
75 91
 \details{
76 92
 
77
-	For large datasets it is important to utilize the large data
78
-	support by installing and loading the ff package before calling
79
-	the \code{genotype} function. In previous versions of the
80
-	\code{crlmm} package, we used different functions for
81
-	genotyping depending on whether the ff package is loaded, namely
82
-	\code{genotype} and \code{genotype2}.  The \code{genotype}
83
-	function now handles both instances.
84
-
85
-	\code{genotype.Illumina} is a wrapper of the \code{crlmm}
86
-	function for genotyping.  Differences include (1) that the copy
87
-	number probes (if present) are also quantile-normalized and (2)
88
-	the class of object returned by this function, \code{CNSet}, is
89
-	needed for subsequent copy number estimation.  Note that the
90
-	batch variable (a character string) that must be passed to this
91
-	function has no effect on the normalization or genotyping steps.
92
-	Rather, \code{batch} is required in order to initialize a
93
-	\code{CNSet} container with the appropriate dimensions.
93
+	\code{genotype.Illumina} (or equivalently \code{crlmmIllumina}) 
94
+        is a wrapper of the \code{crlmm} function for genotyping.  
95
+        Differences include (1) that the copy number probes (if present) 
96
+        are also quantile-normalized and (2) the class of object returned 
97
+        by this function, \code{CNSet}, is needed for subsequent copy number 
98
+        estimation.  Note that the batch variable (a character string) has 
99
+        no effect on the normalization or genotyping steps. Rather, \code{batch} 
100
+        is required in order to initialize a \code{CNSet} container with the 
101
+        appropriate dimensions.
94 102
 
95 103
         The new 'krlmm' option is available for certain chip types. Optional 
96 104
 	argument \code{trueCalls} matrix contains known Genotype calls 
... ...
@@ -103,6 +111,15 @@ genotype.Illumina(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".",
103 111
 	8 clusters. This is configurable by setting up an option named 
104 112
 	"krlmm.cores", e.g. options("krlmm.cores" = 16). 
105 113
 
114
+        In general, a chip specific annotation package is required to use the 
115
+        \code{genotype.Illumina} function. If this is not available (newer chip 
116
+        types or custom chips often don't have a chip-specific package available
117
+        on Bioconductor), consider using \code{cdfName}='nopackage' and specifying 
118
+        \code{anno} and \code{genome}, which runs 'krlmm' on the samples available.
119
+        Here \code{anno} is a data.frame read in from the relevant chip-specific
120
+        manifest, which must have additional columns 'isSnp' which is a logical that
121
+        indicates whether a probe is polymorphic or not, 'position', 'chromosome' and
122
+        'featureNames' that give the location on the chromosome and SNP name.
106 123
       }
107 124
 
108 125
 \value{	A \code{SnpSuperSet} instance.}
... ...
@@ -111,6 +128,10 @@ genotype.Illumina(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".",
111 128
   R/Bioconductor software for Illumina's Infinium whole-genome
112 129
   genotyping BeadChips. Bioinformatics. 2009 Oct 1;25(19):2621-3.
113 130
 
131
+  Liu R, Dai Z, Yeager M, Irizarry RA1, Ritchie ME.
132
+  KRLMM: an adaptive genotype calling method for common and low frequency variants.
133
+  BMC Bioinformatics. 2014 May 23;15:158.
134
+
114 135
   Carvalho B, Bengtsson H, Speed TP, Irizarry RA. Exploration,
115 136
   normalization, and genotype calls of high-density oligonucleotide SNP
116 137
   array data. Biostatistics. 2007 Apr;8(2):485-99. Epub 2006 Dec
... ...
@@ -119,19 +140,11 @@ genotype.Illumina(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".",
119 140
   Carvalho BS, Louis TA, Irizarry RA.
120 141
   Quantifying uncertainty in genotype calls.
121 142
   Bioinformatics. 2010 Jan 15;26(2):242-9.
122
-
123 143
 }
124 144
 
125 145
 \author{Matt Ritchie, Cynthia Liu, Zhiyin Dai}
126 146
 
127
-  \note{For large datasets, load the 'ff' package prior to genotyping
128
-\code{ldPath} and \code{ocSamples}.  The function
129
-\code{genotype.Illumina} supports parallelization, as the (not run)
130
-example below indicates.}
131
-
132 147
 \seealso{
133
-	\code{\link{crlmmIlluminaV2}},
134 148
 	\code{\link[oligoClasses]{ocSamples}},
135 149
 	\code{\link[oligoClasses]{ldOpts}}
136 150
 }
... ...
@@ -16,9 +16,9 @@
16 16
 }
17 17
 \usage{
18 18
 genotypeInf(cnSet, mixtureParams, probs = rep(1/3, 3), SNRMin = 5,
19
-recallMin = 10, recallRegMin = 1000, verbose = TRUE, returnParams =
20
-TRUE, badSNP = 0.7, gender = NULL, DF = 6, cdfName, call.method="crlmm", 
21
-trueCalls = NULL)
19
+             recallMin = 10, recallRegMin = 1000, verbose = TRUE, returnParams = TRUE, 
20
+             badSNP = 0.7, gender = NULL, DF = 6, cdfName, nopackage.norm="quantile",
21
+             call.method="crlmm", trueCalls = NULL)
22 22
 }
23 23
 %- maybe also 'usage' for other objects documented here.
24 24
 \arguments{
... ...
@@ -44,6 +44,8 @@ trueCalls = NULL)
44 44
     t-distribution.}
45 45
   \item{cdfName}{\code{character} string indicating which annotation
46 46
     package to load.}
47
+  \item{nopackage.norm}{character string specifying normalization to be used when \code{cdfName}='nopackage'.
48
+    Options are 'none', 'quantile' (within channel, between array) and 'quantileloess'.}
47 49
   \item{call.method}{character string specifying the genotype calling algorithm to use ('crlmm' or 'krlmm').}
48 50
   \item{trueCalls}{ matrix specifying known Genotype calls for a subset of samples and features(1 - AA, 2 - AB, 3 - BB).}
49 51
 }
... ...
@@ -19,7 +19,7 @@
19 19
 preprocessInf(cnSet, sampleSheet=NULL, arrayNames = NULL, ids = NULL,
20 20
 path = ".", arrayInfoColNames = list(barcode = "SentrixBarcode_A",
21 21
 position = "SentrixPosition_A"), highDensity = TRUE, sep = "_", fileExt
22
-= list(green = "Grn.idat", red = "Red.idat"), XY, saveDate = TRUE, stripNorm
22
+= list(green = "Grn.idat", red = "Red.idat"), XY, anno, saveDate = TRUE, stripNorm
23 23
 = TRUE, useTarget = TRUE, mixtureSampleSize = 10^5, fitMixture = TRUE, 
24 24
 quantile.method="between", eps = 0.1, verbose = TRUE, seed = 1, cdfName)
25 25
 }
... ...
@@ -59,6 +59,9 @@ quantile.method="between", eps = 0.1, verbose = TRUE, seed = 1, cdfName)
59 59
   \item{fileExt}{list containing elements 'Green' and 'Red' which
60 60
     specify the .idat file extension for the Cy3 and Cy5 channels.}
61 61
   \item{XY}{an \code{NChannelSet} object containing X and Y intensities.}
62
+  \item{anno}{data.frame containing SNP annotation information from 
63
+    manifest and additional columns 'isSnp', 'position', 'chromosome' 
64
+    and 'featureNames'. For use when \code{cdfName}='nopackage'}
62 65
   \item{saveDate}{'logical'.  Should the dates from each .idat be saved
63 66
     with sample information?}
64 67
   \item{stripNorm}{'logical'.  Should the data be strip-level normalized?}
... ...
@@ -425,28 +425,27 @@ SEXP krlmmComputeM(SEXP A, SEXP B){
425 425
     rowsAB, colsAB: dimensions of A and B, which is number of SNP and number of sample
426 426
   */
427 427
 
428
-  int rowsAB, colsAB;
428
+  long rowsAB, colsAB;
429 429
   rowsAB = INTEGER(getAttrib(A, R_DimSymbol))[0];
430 430
   colsAB = INTEGER(getAttrib(A, R_DimSymbol))[1];
431 431
 
432
-  int i, j;
433
-
434
-  int *ptr2A, *ptr2B;
435
-  ptr2A = INTEGER_POINTER(AS_INTEGER(A));
436
-  ptr2B = INTEGER_POINTER(AS_INTEGER(B));
432
+  A = coerceVector(A, REALSXP);
433
+  B = coerceVector(B, REALSXP);
434
+  
435
+  long i, j;
437 436
 
438 437
   SEXP Rval;
439 438
   PROTECT(Rval = allocMatrix(REALSXP, rowsAB, colsAB));
440 439
 
441 440
   double *ptr2M;
442
-  long ele;
441
+  long elepos;
443 442
   ptr2M = NUMERIC_POINTER(Rval);
444 443
 
445 444
   for (i = 1; i <= rowsAB; i++){
446 445
     for (j = 1; j <= colsAB; j++){
447
-      // elem is an index for A, B and M
448
-      ele = Cmatrix(i, j, rowsAB);
449
-      ptr2M[ele] = (log2(ptr2A[ele])-log2(ptr2B[ele]));
446
+      // elepos is an index for A, B and M
447
+      elepos = CMatrixElementPosition(i, j, rowsAB);
448
+      ptr2M[elepos] = (log2(REAL(A)[elepos])-log2(REAL(B)[elepos]));
450 449
     }
451 450
   }
452 451
   
... ...
@@ -460,7 +459,7 @@ SEXP krlmmComputeS(SEXP A, SEXP B){
460 459
   /*
461 460
     ARGUMENTS
462 461
     ---------
463
-    A: intensity matrix for allele A
462
+    A: intensity matrix for allele A11
464 463
     B: intensity matrix for allele B
465 464
     S: average log-intensity for a particular SNP (outgoing)
466 465
 
... ...
@@ -469,28 +468,26 @@ SEXP krlmmComputeS(SEXP A, SEXP B){
469 468
     rowsAB, colsAB: dimensions of A and B, which is number of SNP and number of sample
470 469
   */
471 470
 
472
-  int rowsAB, colsAB;
471
+  long rowsAB, colsAB;
473 472
   rowsAB = INTEGER(getAttrib(A, R_DimSymbol))[0];
474 473
   colsAB = INTEGER(getAttrib(A, R_DimSymbol))[1];
475 474
 
476
-  int i, j;
477
-
478
-  int *ptr2A, *ptr2B;
479
-  ptr2A = INTEGER_POINTER(AS_INTEGER(A));
480
-  ptr2B = INTEGER_POINTER(AS_INTEGER(B));
475
+  A = coerceVector(A, REALSXP);
476
+  B = coerceVector(B, REALSXP); 
477
+  long i, j;
481 478
 
482 479
   SEXP Rval;
483 480
   PROTECT(Rval = allocMatrix(REALSXP, rowsAB, colsAB));
484 481
 
485 482
   double *ptr2S;
486
-  long ele;
483
+  long elepos;
487 484
   ptr2S = NUMERIC_POINTER(Rval);
488