Browse code

Merge branch 'collab'

* collab:
Update AffyGW.pdf and IlluminaPreprocessCN.pdf
crlmmIlluminaV2 calls crlmmGT. comment call to crlmmGT2
Update AffyGW vignette to step through construction of CNSet, normalization, and genotyping
v 1.15.27: Fix bug in crlmmGT2. clean up comments in crlmmIllumina.
update getFeatureData

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@69561 bc3139a8-67e5-0310-9ffc-ced21a209358

Rob Scharp authored on 19/09/2012 12:31:51
Showing10 changed files

... ...
@@ -1,7 +1,7 @@
1 1
 Package: crlmm
2 2
 Type: Package
3 3
 Title: Genotype Calling (CRLMM) and Copy Number Analysis tool for Affymetrix SNP 5.0 and 6.0 and Illumina arrays.
4
-Version: 1.15.25
4
+Version: 1.15.30
5 5
 Author: Benilton S Carvalho, Robert Scharpf, Matt Ritchie, Ingo Ruczinski, Rafael A Irizarry
6 6
 Maintainer: Benilton S Carvalho <Benilton.Carvalho@cancer.org.uk>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.EDU.AU>
7 7
 Description: Faster implementation of CRLMM specific to SNP 5.0 and 6.0 arrays, as well as a copy number tool specific to 5.0, 6.0, and Illumina platforms
... ...
@@ -21,7 +21,8 @@ getProtocolData.Affy <- function(filenames){
21 21
 ##		  return(gns)
22 22
 ##	  })
23 23
 
24
-getFeatureData <- function(cdfName, copynumber=FALSE, genome=genome){
24
+
25
+getFeatureData <- function(cdfName, copynumber=FALSE, genome){
25 26
 	pkgname <- getCrlmmAnnotationName(cdfName)
26 27
 	if(!require(pkgname, character.only=TRUE)){
27 28
 		suggCall <- paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="")
... ...
@@ -39,17 +40,29 @@ getFeatureData <- function(cdfName, copynumber=FALSE, genome=genome){
39 40
 		pkgname <- paste(pkgname, "Crlmm", sep="")
40 41
 	}
41 42
 	path <- system.file("extdata", package=pkgname)
42
-	multiple.builds <- length(grep("hg19", list.files(path)) > 0)
43
-	if(!multiple.builds){
44
-		load(file.path(path, "snpProbes.rda"))
45
-	} else load(file.path(path, paste("snpProbes_", genome, ".rda", sep="")))
43
+	##multiple.builds <- length(grep("hg19", list.files(path)) > 0)
44
+	if(missing(genome)){
45
+		snp.file <- list.files(path, pattern="snpProbes_hg")
46
+		if(length(snp.file) > 1){
47
+			## use hg19
48
+			message("genome build not specified. Using build hg19 for annotation.")
49
+			snp.file <- snp.file[1]
50
+		}
51
+		genome <- gsub(".rda", "", strsplit(snp.file, "snpProbes_")[[1]][[2]])
52
+	} else snp.file <- paste("snpProbes_", genome, ".rda", sep="")
53
+##	if(!multiple.builds){
54
+##		load(file.path(path, "snpProbes.rda"))
55
+##	} else load(file.path(path, paste("snpProbes_", genome, ".rda", sep="")))
56
+	load(file.path(path, snp.file))
46 57
 	snpProbes <- get("snpProbes")
47 58
 	## if we use a different build we may throw out a number of snps...
48 59
 	snpProbes <- snpProbes[rownames(snpProbes) %in% gns, ]
49 60
 	if(copynumber){
50
-		if(!multiple.builds){
51
-			load(file.path(path, "cnProbes.rda"))
52
-		} else load(file.path(path, paste("cnProbes_", genome, ".rda", sep="")))
61
+		cn.file <- paste("cnProbes_", genome, ".rda", sep="")
62
+		load(file.path(path, cn.file))
63
+		##		if(!multiple.builds){
64
+		##			load(file.path(path, "cnProbes.rda"))
65
+		##		} else load(file.path(path, paste("cnProbes_", genome, ".rda", sep="")))
53 66
 		cnProbes <- get("cnProbes")
54 67
 		snpIndex <- seq(along=gns)
55 68
 		npIndex <- seq(along=rownames(cnProbes)) + max(snpIndex)
... ...
@@ -300,7 +313,7 @@ genotype <- function(filenames,
300 313
 			    seed=seed,
301 314
 			    verbose=verbose)
302 315
 	ok <- cnrmaAffy(cnSet=cnSet,
303
-			cdfName=annotation(cnSet), seed=seed,
316
+			seed=seed,
304 317
 			verbose=verbose)
305 318
 	stopifnot(ok)
306 319
 	ok <- genotypeAffy(cnSet=cnSet,
... ...
@@ -336,6 +349,7 @@ genotypeAffy <- function(cnSet, SNRMin=5, recallMin=10,
336 349
 			badSNP=badSNP,
337 350
 			callsGt=calls(cnSet),
338 351
 			callsPr=snpCallProbability(cnSet))
352
+	cnSet$gender[] <- tmp$gender
339 353
 	if(verbose) message("Genotyping finished.")
340 354
 	return(TRUE)
341 355
 }
... ...
@@ -852,154 +852,54 @@ preprocessInfinium2 = function(XY, mixtureSampleSize=10^5,
852 852
 }
853 853
 
854 854
 
855
-crlmmIllumina = function(RG, XY, stripNorm=TRUE, useTarget=TRUE,
856
-                  row.names=TRUE, col.names=TRUE,
857
-                  probs=c(1/3, 1/3, 1/3), DF=6, SNRMin=5, gender=NULL,
858
-                  seed=1, # save.it=FALSE, load.it=FALSE, snpFile, cnFile,
859
-                  mixtureSampleSize=10^5, eps=0.1, verbose=TRUE,
860
-                  cdfName, sns, recallMin=10, recallRegMin=1000,
861
-                  returnParams=FALSE, badSNP=.7) {
862
-
863
-if(missing(cdfName)) {
864
-   if(!missing(RG))
865
-      cdfName = annotation(RG)
866
-   if(!missing(XY))
867
-      cdfName = annotation(XY)
868
-}
869
-if(!isValidCdfName(cdfName))
870
-   stop("cdfName not valid.  see validCdfNames")
871
-
872
-#  if (save.it & (missing(snpFile) | missing(cnFile)))
873
-#    stop("'snpFile' and/or 'cnFile' is missing and you chose to save.it")
874
-#  if (load.it & missing(snpFile))
875
-#    stop("'snpFile' is missing and you chose to load.it")
876
-#  if (!missing(snpFile))
877
-#    if (load.it & !file.exists(snpFile)){
878
-#      load.it <- FALSE
879
-#      message("File ", snpFile, " does not exist.")
880
-#      stop("Cannot load SNP data.")
881
-#  }
882
-#  if (!load.it){
883
-    if(!missing(RG)) {
884
-      if(missing(XY))
885
-        XY = RGtoXY(RG, chipType=cdfName)
886
-      else
887
-        stop("Both RG and XY specified - please use one or the other")
888
-    }
889
-    if (missing(sns)) sns = sampleNames(XY)
890
-    res <- preprocessInfinium2(XY, mixtureSampleSize=mixtureSampleSize,
891
-			       fitMixture=TRUE, verbose=verbose,
892
-			       seed=seed, eps=eps, cdfName=cdfName, sns=sns,
893
-			       stripNorm=stripNorm, useTarget=useTarget) #,
894
-#                        save.it=save.it, snpFile=snpFile, cnFile=cnFile)
895
-#    is.lds = ifelse(isPackageLoaded("ff"), TRUE, FALSE)
896
-     is.lds = FALSE
897
-
898
-#    if(is.lds) {
899
-#      open(res[["A"]])
900
-#      open(res[["B"]])
901
-#      open(res[["SNR"]])
902
-#      open(res[["mixtureParams"]])
903
-#    }
904
-
905
-#    fD = featureData(XY)
906
-#    phenD = XY@phenoData
907
-#    protD = XY@protocolData
908
-#    rm(XY)
909
-#    gc(verbose=FALSE)
910
-#    if(verbose) message("Initializing container for alleleA, alleleB, call, callProbability")
911
-#    callSet <- new("SnpSuperSet",
912
-#                    alleleA=initializeBigMatrix(name="A", nr=nrow(res[[1]]), nc=length(sns)),
913
-#                    alleleB=initializeBigMatrix(name="B", nr=nrow(res[[1]]), nc=length(sns)),
914
-#                    call=initializeBigMatrix(name="call", nr=nrow(res[[1]]), nc=length(sns)),
915
-#                    callProbability=initializeBigMatrix(name="callPr", nr=nrow(res[[1]]), nc=length(sns)),
916
-#  	            annotation=cdfName, protocolData=protD, phenoData=phenD, featureData=fD)
917
-#    sampleNames(callSet) <- sns
918
-#    featureNames(callSet) <- res[["gns"]]
919
-#    pData(callSet)$SKW <- rep(NA, length(sns))
920
-#    pData(callSet)$SNR <- rep(NA, length(sns))
921
-#    pData(callSet)$gender <- rep(NA, length(sns))
922
-
923
-#  }else{
924
-#      if(verbose) message("Loading ", snpFile, ".")
925
-#        obj <- load(snpFile)
926
-#        if(verbose) message("Done.")
927
-#        if(!any(obj == "res"))
928
-#          stop("Object in ", snpFile, " seems to be invalid.")
929
-#  }
930
-
931
- #   rm(phenD, protD , fD)
932
-
933
-#    snp.index <- res$snpIndex #match(res$gns, featureNames(callSet))
934
-#    suppressWarnings(A(callSet) <- res[["A"]])
935
-#    suppressWarnings(B(callSet) <- res[["B"]])
936
-#    pData(callSet)$SKW <- res$SKW
937
-#    pData(callSet)$SNR <- res$SNR
938
-#    mixtureParams <- res$mixtureParams
939
-#    rm(res); gc(verbose=FALSE)
940
-  if(row.names) row.names=res$gns else row.names=NULL
941
-  if(col.names) col.names=res$sns else col.names=NULL
942
-  FUN = ifelse(is.lds, "crlmmGT2", "crlmmGT")
943
-  ## genotyping
944
-  crlmmGTfxn = function(FUN,...){
945
-		switch(FUN,
946
-		       crlmmGT2=crlmmGT2(...),
947
-		       crlmmGT=crlmmGT(...))
948
-              }
949
-  res2 = crlmmGTfxn(FUN,
950
-                     A=res[["A"]],
951
-                     B=res[["B"]],
952
-                     SNR=res[["SNR"]],
953
-                     mixtureParams=res[["mixtureParams"]],
954
-                     cdfName=cdfName,
955
-                     row.names=row.names,
956
-                     col.names=col.names,
957
-                     probs=probs,
958
-                     DF=DF,
959
-                     SNRMin=SNRMin,
960
-                     recallMin=recallMin,
961
-                     recallRegMin=recallRegMin,
962
-                     gender=gender,
963
-                     verbose=verbose,
964
-                     returnParams=returnParams,
965
-                     badSNP=badSNP)
966
-
967
-#  res2 <- crlmmGT2(A=res[["A"]], #as.matrix(A(callSet)[snp.index,]), # j]),
968
-#                  B=res[["B"]], # as.matrix(B(callSet)[snp.index,]),  # j]),
969
-#                  SNR=res[["SNR"]], # callSet$SNR, # [j],
970
-#                  mixtureParams=res[["mixtureParams"]], #,
971
-#                  cdfName=res[["cdfName"]], # annotation(callSet),
972
-#                  row.names=row.names, # featureNames(callSet)[snp.index],
973
-#                  col.names=col.names, # sampleNames(callSet), #[j],
974
-#                  probs=probs,
975
-#                  DF=DF,
976
-#                  SNRMin=SNRMin,
977
-#                  recallMin=recallMin,
978
-#                  recallRegMin=recallRegMin,
979
-#                  gender=gender,
980
-#                  verbose=verbose,
981
-#                  returnParams=returnParams,
982
-#                  badSNP=badSNP)
983
-#    rm(res); gc(verbose=FALSE)
984
-#    suppressWarnings(snpCall(callSet)[snp.index, j] <- tmp[["calls"]])
985
-#    suppressWarnings(snpCallProbability(callSet)[snp.index, j] <- tmp[["confs"]])
986
-#    callSet$gender[j] <- tmp$gender
987
-#    suppressWarnings(snpCall(callSet)[snp.index,] <- tmp[["calls"]])
988
-#    suppressWarnings(snpCallProbability(callSet)[snp.index,] <- tmp[["confs"]])
989
-#    callSet$gender <- tmp$gender
990
-#    rm(tmp); gc(verbose=FALSE)
991
-#    return(callSet)
992
-
993
-  res2[["SNR"]] = res[["SNR"]]
994
-  res2[["SKW"]] = res[["SKW"]]
995
-#  if(is.lds) {
996
-#    delete(res[["A"]])
997
-#    delete(res[["B"]])
998
-#    delete(res[["SNR"]])
999
-#    delete(res[["mixtureParams"]])
1000
-#  }
1001
-  rm(res); gc(verbose=FALSE)
1002
-  return(list2SnpSet(res2, returnParams=returnParams))
855
+crlmmIllumina <- function(RG, XY, stripNorm=TRUE, useTarget=TRUE,
856
+			  row.names=TRUE, col.names=TRUE,
857
+			  probs=c(1/3, 1/3, 1/3), DF=6, SNRMin=5, gender=NULL,
858
+			  seed=1,
859
+			  mixtureSampleSize=10^5, eps=0.1, verbose=TRUE,
860
+			  cdfName, sns, recallMin=10, recallRegMin=1000,
861
+			  returnParams=FALSE, badSNP=.7) {
862
+	if(missing(cdfName)) {
863
+		if(!missing(RG))
864
+			cdfName = annotation(RG)
865
+		if(!missing(XY))
866
+			cdfName = annotation(XY)
867
+	}
868
+	if(!isValidCdfName(cdfName))
869
+		stop("cdfName not valid.  see validCdfNames")
870
+	if(!missing(RG)) {
871
+		if(missing(XY))
872
+			XY = RGtoXY(RG, chipType=cdfName)
873
+		else
874
+			stop("Both RG and XY specified - please use one or the other")
875
+	}
876
+	if (missing(sns)) sns = sampleNames(XY)
877
+	res <- preprocessInfinium2(XY, mixtureSampleSize=mixtureSampleSize,
878
+				   fitMixture=TRUE, verbose=verbose,
879
+				   seed=seed, eps=eps, cdfName=cdfName, sns=sns,
880
+				   stripNorm=stripNorm, useTarget=useTarget) #,
881
+	if(row.names) row.names=res$gns else row.names=NULL
882
+	if(col.names) col.names=res$sns else col.names=NULL
883
+	res2 <- crlmmGT(A=res[["A"]],
884
+			B=res[["B"]],
885
+			SNR=res[["SNR"]],
886
+			mixtureParams=res[["mixtureParams"]],
887
+			cdfName=cdfName,
888
+			row.names=row.names,
889
+			col.names=col.names,
890
+			probs=probs,
891
+			DF=DF,
892
+			SNRMin=SNRMin,
893
+			recallMin=recallMin,
894
+			recallRegMin=recallRegMin,
895
+			gender=gender,
896
+			verbose=verbose,
897
+			returnParams=returnParams,
898
+			badSNP=badSNP)
899
+	res2[["SNR"]] = res[["SNR"]]
900
+	res2[["SKW"]] = res[["SKW"]]
901
+	rm(res); gc(verbose=FALSE)
902
+	return(list2SnpSet(res2, returnParams=returnParams))
1003 903
 }
1004 904
 
1005 905
 
... ...
@@ -1057,31 +957,67 @@ crlmmIlluminaV2 = function(sampleSheet=NULL,
1057 957
 
1058 958
     if(row.names) row.names=res$gns else row.names=NULL
1059 959
     if(col.names) col.names=res$sns else col.names=NULL
960
+##
961
+##  if(is.lds){
962
+##	  res2 <- crlmmGT2(A=res[["A"]],
963
+##			   B=res[["B"]],
964
+##			   SNR=res[["SNR"]],
965
+##			   mixtureParams=res[["mixtureParams"]],
966
+##			   cdfName=cdfName,
967
+##			   row.names=row.names,
968
+##			   col.names=col.names,
969
+##			   probs=probs,
970
+##			   DF=DF,
971
+##			   SNRMin=SNRMin,
972
+##			   recallMin=recallMin,
973
+##			   recallRegMin=recallRegMin,
974
+##			   gender=gender,
975
+##			   verbose=verbose,
976
+##			   returnParams=returnParams,
977
+##			   badSNP=badSNP)
978
+##  } else {
979
+	  res2 <- crlmmGT(A=res[["A"]],
980
+			   B=res[["B"]],
981
+			   SNR=res[["SNR"]],
982
+			   mixtureParams=res[["mixtureParams"]],
983
+			   cdfName=cdfName,
984
+			   row.names=row.names,
985
+			   col.names=col.names,
986
+			   probs=probs,
987
+			   DF=DF,
988
+			   SNRMin=SNRMin,
989
+			   recallMin=recallMin,
990
+			   recallRegMin=recallRegMin,
991
+			   gender=gender,
992
+			   verbose=verbose,
993
+			   returnParams=returnParams,
994
+			   badSNP=badSNP)
995
+##  }
1060 996
 
1061
-    FUN = ifelse(is.lds, "crlmmGT2", "crlmmGT")
1062
-    ## genotyping
1063
-    crlmmGTfxn = function(FUN,...){
1064
-		switch(FUN,
1065
-		       crlmmGT2=crlmmGT2(...),
1066
-		       crlmmGT=crlmmGT(...))
1067
-              }
1068
-    res2 = crlmmGTfxn(FUN,
1069
-                     A=res[["A"]],
1070
-                     B=res[["B"]],
1071
-                     SNR=res[["SNR"]],
1072
-                     mixtureParams=res[["mixtureParams"]],
1073
-                     cdfName=cdfName,
1074
-                     row.names=row.names,
1075
-                     col.names=col.names,
1076
-                     probs=probs,
1077
-                     DF=DF,
1078
-                     SNRMin=SNRMin,
1079
-                     recallMin=recallMin,
1080
-                     recallRegMin=recallRegMin,
1081
-                     gender=gender,
1082
-                     verbose=verbose,
1083
-                     returnParams=returnParams,
1084
-                     badSNP=badSNP)
997
+##    FUN = ifelse(is.lds, "crlmmGT2", "crlmmGT")
998
+##    ## genotyping
999
+##    crlmmGTfxn = function(FUN,...){
1000
+##		switch(FUN,
1001
+##		       crlmmGT2=crlmmGT2(...),
1002
+##		       crlmmGT=crlmmGT(...))
1003
+##              }
1004
+##    res2 = crlmmGTfxn(FUN,
1005
+##                     A=res[["A"]],
1006
+##                     B=res[["B"]],
1007
+##                     SNR=res[["SNR"]],
1008
+##                     mixtureParams=res[["mixtureParams"]],
1009
+##                     cdfName=cdfName,
1010
+##                     row.names=row.names,
1011
+##                     col.names=col.names,
1012
+##                     probs=probs,
1013
+##                     DF=DF,
1014
+##                     SNRMin=SNRMin,
1015
+##                     recallMin=recallMin,
1016
+##                     recallRegMin=recallRegMin,
1017
+##                     gender=gender,
1018
+##                     verbose=verbose,
1019
+##                     returnParams=returnParams,
1020
+##                     badSNP=badSNP)
1085 1021
 
1086 1022
 #    if(is.lds) {
1087 1023
 #      open(res[["SNR"]]); open(res[["SKW"]])
... ...
@@ -1136,6 +1072,16 @@ getProtocolData.Illumina = function(filenames, sep="_", fileExt="Grn.idat", verb
1136 1072
        return(protocoldata)
1137 1073
 }
1138 1074
 
1075
+getAvailableIlluminaGenomeBuild <- function(path){
1076
+	snp.file <- list.files(path, pattern="snpProbes_hg")
1077
+	if(length(snp.file) > 1){
1078
+		## use hg19
1079
+		message("genome build not specified. Using build hg19 for annotation.")
1080
+		snp.file <- snp.file[1]
1081
+	}
1082
+	genome <- gsub(".rda", "", strsplit(snp.file, "snpProbes_")[[1]][[2]])
1083
+}
1084
+
1139 1085
 
1140 1086
 constructInf <- function(sampleSheet=NULL,
1141 1087
 			 arrayNames=NULL,
... ...
@@ -1209,7 +1155,10 @@ constructInf <- function(sampleSheet=NULL,
1209 1155
 	if(!all(file.exists(redidats))) stop("Missing some of the *Red.idat files")
1210 1156
 
1211 1157
 	if(verbose) message("Initializing container for genotyping and copy number estimation")
1212
-	featureData = getFeatureData(cdfName, copynumber=TRUE)
1158
+	pkgname <- getCrlmmAnnotationName(cdfName)
1159
+	path <- system.file("extdata", package=pkgname)
1160
+	genome <- getAvailableIlluminaGenomeBuild(path)
1161
+	featureData = getFeatureData(cdfName, copynumber=TRUE, genome=genome)
1213 1162
 	nr = nrow(featureData); nc = narrays
1214 1163
 	sns <-  if (!is.null(sampleSheet) && !is.null(sampleSheet$Sample_ID)) {
1215 1164
 		make.unique(sampleSheet$Sample_ID)
... ...
@@ -1228,7 +1177,8 @@ constructInf <- function(sampleSheet=NULL,
1228 1177
 		     callProbability=bigd,
1229 1178
 		     annotation=cdfName,
1230 1179
 		     featureData=featureData,
1231
-		     batch=batch)
1180
+		     batch=batch,
1181
+		     genome=genome)
1232 1182
 ##        if (!is.null(sampleSheet) && !is.null(sampleSheet$Sample_ID)) {
1233 1183
 ##		sampleNames(cnSet) = make.unique(sampleSheet$Sample_ID)
1234 1184
 ##        } else{
... ...
@@ -1338,21 +1288,23 @@ genotypeInf <- function(cnSet, mixtureParams, probs=rep(1/3,3),
1338 1288
 ##	message("Writing complete.  Begin genotyping...")
1339 1289
 ##	close(A(cnSet))
1340 1290
 ##	close(B(cnSet))
1341
-	tmp <- crlmmGT2(A=snpCall(cnSet),
1342
-			  B=snpCallProbability(cnSet),
1343
-			  SNR=cnSet$SNR,
1344
-			  mixtureParams=mixtureParams,
1345
-			  cdfName=cdfName,
1346
-			  col.names=sampleNames(cnSet),
1347
-			  probs=probs,
1348
-			  DF=DF,
1349
-			  SNRMin=SNRMin,
1350
-			  recallMin=recallMin,
1351
-			  recallRegMin=recallRegMin,
1352
-			  gender=gender,
1353
-			  verbose=verbose,
1354
-			  returnParams=returnParams,
1355
-			  badSNP=badSNP)#,
1291
+	tmp <- crlmmGT2(A=A(cnSet),
1292
+			B=B(cnSet),
1293
+			SNR=cnSet$SNR,
1294
+			mixtureParams=mixtureParams,
1295
+			cdfName=cdfName,
1296
+			col.names=sampleNames(cnSet),
1297
+			probs=probs,
1298
+			DF=DF,
1299
+			SNRMin=SNRMin,
1300
+			recallMin=recallMin,
1301
+			recallRegMin=recallRegMin,
1302
+			gender=gender,
1303
+			verbose=verbose,
1304
+			returnParams=returnParams,
1305
+			badSNP=badSNP,
1306
+			callsGt=calls(cnSet),
1307
+			callsPr=snpCallProbability(cnSet))#,
1356 1308
 	##RS:  I changed the API for crlmmGT2 to be consistent with crlmmGT
1357 1309
 	## snp.names=featureNames(cnSet)[snp.index])
1358 1310
 	if(verbose) message("Genotyping finished.  Updating container with genotype calls and confidence scores.")
... ...
@@ -1430,7 +1382,7 @@ genotype.Illumina <- function(sampleSheet=NULL,
1430 1382
 				       cdfName=cdfName)
1431 1383
 	message("Preprocessing complete.  Begin genotyping...")
1432 1384
 	genotypeInf(cnSet=cnSet, mixtureParams=mixtureParams,
1433
-		   probs=probs,
1385
+		    probs=probs,
1434 1386
 		   SNRMin=SNRMin,
1435 1387
 		   recallMin=recallMin,
1436 1388
 		   recallRegMin=recallRegMin,
... ...
@@ -78,7 +78,7 @@ crlmmGT2 <- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL,
78 78
 		} else YIndex2 <- YIndex
79 79
 		message("Imputing gender")
80 80
 		gender <- imputeGender(XIndex=XIndex2, YIndex=YIndex2)
81
-		cnSet$gender[,] <- gender
81
+		##cnSet$gender[,] <- gender
82 82
 	}
83 83
 	Indexes <- list(autosomeIndex, XIndex, YIndex)
84 84
 	cIndexes <- list(keepIndex,
... ...
@@ -115,7 +115,6 @@ celFiles <- celFiles[substr(basename(celFiles), 13, 13) %in% c("C", "Y")]
115 115
 if(exists("file.index")){
116 116
 	celFiles <- celFiles[file.index]
117 117
 }
118
-##celFiles <- celFiles[substr(basename(celFiles), 13, 13) %in% "C"]
119 118
 @
120 119
 
121 120
 Finally, copy number analyses using \crlmm{} require specification of
... ...
@@ -145,27 +144,70 @@ by a median.  For the nonpolymorphic markers on Affymetrix 6.0, only
145 144
 one probe per locus is available and the summarization step is not
146 145
 needed.  After preprocessing the arrays, the \crlmm{} package
147 146
 estimates the genotype using the CRLMM algorithm and provides a
148
-confidence score for the genotype calls.  The function
149
-\Rfunction{genotype} performs both the preprocessing and genotyping.
147
+confidence score for the genotype calls.   To begin, we initialize a
148
+container for the normalized intensities:
150 149
 
151
-<<LDS_genotype, cache=TRUE>>=
152
-cnSet <- genotype(celFiles, batch=plates, cdfName=cdfName, genome="hg19")
150
+<<constructCNSet, cache=TRUE>>=
151
+cnSet <- constructAffyCNSet(celFiles, batch=plates,
152
+			    cdfName="genomewidesnp6",
153
+			    genome="hg19")
153 154
 @
154 155
 
155
-Segment faults that occur with the above step can often be traced to a
156
-corrupt cel file. To check if any of the files are corrupt, try
157
-reading the files in one at a time:
156
+
157
+We quantile normalize the SNPs and nonpolymorphic markers separately.
158
+Since the normalized intensities are ff objects, the functions
159
+\Rfunction{cnrmaAffy} and \Rfunction{snprmaAffy} write the normalized
160
+intensities to disk and nothing is returned.
161
+
162
+<<cnrmaAffy, cache=TRUE>>=
163
+cnrmaAffy(cnSet)
164
+@
165
+
166
+Any segment fault that occurs during the normalization can often be
167
+traced to a corrupt cel file. To check if any of the files are
168
+corrupt, one can use the function \Rfunction{validCEL} that tries to
169
+read each files as in the following unevaluated codechunk:
158 170
 
159 171
 <<checkcorrupt,eval=FALSE>>=
160 172
 validCEL(celFiles)
161 173
 @
162 174
 
175
+<<snprmaAffy, cache=TRUE>>=
176
+snprmaAffy(cnSet)
177
+@
178
+
179
+The function \Rfunction{genotypeAffy} performs performs the genotyping.
180
+
181
+<<genotypeAffy, cache=TRUE>>=
182
+genotypeAffy(cnSet, gender=NULL)
183
+@
184
+
185
+The above function also imputes the gender from the chromosome X and Y
186
+intensities when the argument gender is \texttt{NULL}. The imputed
187
+genders are
163 188
 
164
-The value returned by genotype is an instance of the class
165
-\Robject{CNSet}.  The normalized intensities, genotype calls, and
166
-confidence scores are stored as \verb+ff+ objects in the
167
-\verb+assayData+ slot.  A concise summary of this object can be
168
-obtained throught the \Rfunction{print} or \Rfunction{show} methods.
189
+<<gender>>=
190
+table(c("male", "female")[cnSet$gender[]])
191
+@
192
+
193
+<<genderCheck,echo=FALSE>>=
194
+if(any(is.na(cnSet$gender[]))) stop("missing genders")
195
+@
196
+
197
+%Parameters for determining log R ratios and B allele frequencies are
198
+%estimated using the function \Rfunction{crlmmCopynumber}:
199
+%
200
+%<<crlmmCopynumber>>=
201
+%crlmmCopynumber(cnSet, fit.linearModel=FALSE)
202
+%@
203
+%<<LDS_genotype, cache=TRUE>>=
204
+%cnSet <- genotype(celFiles, batch=plates, cdfName=cdfName, genome="hg19")
205
+%@
206
+
207
+The normalized intensities, genotype calls, and confidence scores are
208
+stored as \verb+ff+ objects in the \verb+assayData+ slot.  A concise
209
+summary of this object can be obtained throught the \Rfunction{print}
210
+or \Rfunction{show} methods.
169 211
 
170 212
 <<show>>=
171 213
 print(cnSet)
... ...
@@ -184,6 +226,14 @@ print(object.size(cnSet), units="Mb")
184 226
 %available accessors.
185 227
 \SweaveInput{copynumber}
186 228
 
229
+A sample-specific estimate of the signal to noise ratio (SNR)
230
+measuring the overall separation of the genotypes provides a measure
231
+of sample quality.  Samples with SNRs below 5 typically indicate poor
232
+quality, and typically have genotypes with lower confidence scores and
233
+noisier copy number estimates.  The SNR is stored in the
234
+\Robject{phenoData} slot of the \Rclass{CNSet} class and can be
235
+accessed using the ``\$" operator.
236
+
187 237
 
188 238
 \section{Session information}
189 239
 <<sessionInfo, results=tex>>=
... ...
@@ -196,15 +246,14 @@ toLatex(sessionInfo())
196 246
   \includegraphics[width=0.6\textwidth]{AffyGW-snr.pdf}
197 247
   \caption{The signal to noise ratio (SNR) for 180 HapMap samples. For
198 248
     Affymetrix platforms, SNR values below 5 can indicate possible
199
-    problems with sample quality.  In some circumstances, it may be
200
-    more helpful to exclude samples with poor DNA quality.}
249
+    problems with sample quality.  }
201 250
 \end{center}
202 251
 \end{figure}
203 252
 
204 253
 
205
-\begin{bibliography}
254
+%\begin{bibliography}
206 255
   \bibliographystyle{plain}
207 256
   \bibliography{refs}
208
-\end{bibliography}
257
+%\end{bibliography}
209 258
 
210 259
 \end{document}
211 260
Binary files a/inst/scripts/AffyGW.pdf and b/inst/scripts/AffyGW.pdf differ
212 261
Binary files a/inst/scripts/IlluminaPreprocessCN.pdf and b/inst/scripts/IlluminaPreprocessCN.pdf differ
... ...
@@ -63,7 +63,7 @@ object of class \Rclass{CNSet}.  See the appropriate
63 63
 preprocessing/genotyping vignette for the construction of an object of
64 64
 class \Rclass{CNSet}.
65 65
 <<LDS_copynumber,cache=TRUE>>=
66
-(cnSet.updated <- crlmmCopynumber(cnSet))
66
+crlmmCopynumber(cnSet)
67 67
 @
68 68
 
69 69
 The following steps were performed by the \Rfunction{crlmmCopynumber}
... ...
@@ -29,8 +29,8 @@
29 29
 
30 30
 \textwidth=6.2in
31 31
 
32
-\bibliographystyle{plainnat} 
33
- 
32
+\bibliographystyle{plainnat}
33
+
34 34
 \begin{document}
35 35
 %\setkeys{Gin}{width=0.55\textwidth}
36 36
 
... ...
@@ -40,26 +40,26 @@
40 40
 
41 41
 \section{Getting started}
42 42
 
43
-In this user guide we read in and genotype data from 40 HapMap samples 
43
+In this user guide we read in and genotype data from 40 HapMap samples
44 44
 which have been analyzed using Illumina's 370k Duo BeadChips.
45
-This data is available in the \Rpackage{hapmap370k} package.  
46
-Additional chip-specific model parameters and basic SNP annotation 
45
+This data is available in the \Rpackage{hapmap370k} package.
46
+Additional chip-specific model parameters and basic SNP annotation
47 47
 information used by CRLMM is stored in the \Rpackage{human370v1cCrlmm} package.
48 48
 The required packages can be installed in the usual way using the \Rfunction{biocLite} function.
49 49
 
50 50
 <<echo=TRUE, results=hide, eval=FALSE>>=
51 51
 source("http://www.bioconductor.org/biocLite.R")
52 52
 biocLite(c("crlmm", "hapmap370k", "human370v1cCrlmm"))
53
-@ 
53
+@
54 54
 
55
-\section{Reading in data} 
56
-The function \Rfunction{readIdatFiles} extracts the Red and Green intensities 
55
+\section{Reading in data}
56
+The function \Rfunction{readIdatFiles} extracts the Red and Green intensities
57 57
 from the binary {\tt idat} files output by Illumina's scanning device.
58 58
 The file {\tt samples370k.csv} contains information about each sample.
59 59
 
60 60
 <<echo=FALSE, results=hide, eval=TRUE>>=
61 61
 options(width=70)
62
-@ 
62
+@
63 63
 
64 64
 <<read, results=hide, eval=TRUE>>=
65 65
 library(Biobase)
... ...
@@ -71,21 +71,21 @@ data.dir = system.file("idatFiles", package="hapmap370k")
71 71
 # Read in sample annotation info
72 72
 samples = read.csv(file.path(data.dir, "samples370k.csv"), as.is=TRUE)
73 73
 samples[1:5,]
74
-@ 
74
+@
75 75
 
76 76
 <<read2, results=hide, cache=TRUE>>=
77 77
 # Read in .idats using sampleSheet information
78
-RG = readIdatFiles(samples, path=data.dir, 
78
+RG = readIdatFiles(samples, path=data.dir,
79 79
 arrayInfoColNames=list(barcode=NULL,position="SentrixPosition"),saveDate=TRUE)
80 80
 @
81 81
 
82
-Reading in this data takes approximately 100 seconds and peak memory usage 
82
+Reading in this data takes approximately 100 seconds and peak memory usage
83 83
 was 0.8 GB of RAM on our linux system.
84 84
 If memory is limiting, load the \Rpackage{ff} package and run the same command.
85 85
 When this package is available, the objects are stored using disk rather then RAM.
86
-The \Robject{RG} object is an \Rclass{NChannelSet} which stores the 
87
-Red and Green intensities, the number of beads and standard errors for 
88
-each bead-type.  
86
+The \Robject{RG} object is an \Rclass{NChannelSet} which stores the
87
+Red and Green intensities, the number of beads and standard errors for
88
+each bead-type.
89 89
 The scanning date of each array is stored in \Robject{protocolData}.
90 90
 
91 91
 <<explore>>=
... ...
@@ -105,19 +105,19 @@ levels(scanbatch) = 1:16
105 105
 scanbatch = as.numeric(scanbatch)
106 106
 @
107 107
 
108
-If GenCall output is available instead of idat files, the function \Rfunction{readGenCallOutput} can be 
109
-used to read in the data.  
108
+If GenCall output is available instead of idat files, the function \Rfunction{readGenCallOutput} can be
109
+used to read in the data.
110 110
 This function assumes the GenCall output is formatted to have samples listed one below the other,
111
-and that the columns 'X Raw' and 'Y Raw' are available in the file.  
111
+and that the columns 'X Raw' and 'Y Raw' are available in the file.
112 112
 The resulting \Robject{NChannelSet} from this function can be used as input to \Rfunction{crlmmIllumina} via the \Robject{XY} argument (instead of the usual \Rfunction{RG} argument used when the data has been read in from idat files).
113 113
 
114 114
 Plots of the summarised data can be easily generated to check for arrays with poor signal.
115 115
 
116 116
 <<boxplots, fig=TRUE, width=8, height=8>>=
117 117
 par(mfrow=c(2,1), mai=c(0.4,0.4,0.4,0.1), oma=c(1,1,0,0))
118
-boxplot(log2(exprs(channel(RG, "R"))), xlab="Array", ylab="", names=1:40, 
118
+boxplot(log2(exprs(channel(RG, "R"))), xlab="Array", ylab="", names=1:40,
119 119
 main="Red channel",outline=FALSE,las=2)
120
-boxplot(log2(exprs(channel(RG, "G"))), xlab="Array", ylab="", names=1:40, 
120
+boxplot(log2(exprs(channel(RG, "G"))), xlab="Array", ylab="", names=1:40,
121 121
 main="Green channel",outline=FALSE,las=2)
122 122
 mtext(expression(log[2](intensity)), side=2, outer=TRUE)
123 123
 mtext("Array", side=1, outer=TRUE)
... ...
@@ -129,36 +129,37 @@ Next we use the function \Rfunction{crlmmIllumina} which performs preprocessing
129 129
 
130 130
 <<genotype, results=hide, cache=TRUE>>=
131 131
 crlmmResult = crlmmIllumina(RG=RG, cdfName="human370v1c", returnParams=TRUE)
132
-@ 
132
+@
133 133
 
134 134
 This analysis took 3 minutes to complete and peak memory usage was 1.9 GB on our system.
135
-The output stored in \Robject{crlmmResult} is a \Rclass{SnpSet} object.                                                                                                                                                         
135
+The output stored in \Robject{crlmmResult} is a \Rclass{SnpSet} object.
136 136
 <<explore2>>=
137 137
 class(crlmmResult)
138 138
 dim(crlmmResult)
139 139
 slotNames(crlmmResult)
140 140
 calls(crlmmResult)[1:10, 1:5]
141
-@ 
141
+@
142 142
 
143
-Plotting the {\it SNR} reveals no obvious batch effects in this data set (different symbols are used for 
143
+Plotting the {\it SNR} reveals no obvious batch effects in this data set (different symbols are used for
144 144
 arrays scanned on different days).
145 145
 
146 146
 <<snr,  fig=TRUE, width=8, height=6>>=
147
-plot(crlmmResult[["SNR"]], pch=scanbatch, xlab="Array", ylab="SNR", 
148
-main="Signal-to-noise ratio per array",las=2)
147
+plot(crlmmResult[["SNR"]], pch=scanbatch, xlab="Array", ylab="SNR",
148
+     main="Signal-to-noise ratio per array",las=2)
149 149
 @
150 150
 
151
-An all-in-one function named \Rfunction{crlmmIlluminaV2} that combines reading of idat files with genotyping is also available.
151
+An all-in-one function named \Rfunction{crlmmIlluminaV2} that combines
152
+reading of idat files with genotyping is also available.
152 153
 
153 154
 <<readandgenotypeinone, results=hide, cache=TRUE>>=
154
-crlmmResult2 = crlmmIlluminaV2(samples, path=data.dir, 
155
-arrayInfoColNames=list(barcode=NULL,position="SentrixPosition"),
156
-saveDate=TRUE, cdfName="human370v1c", returnParams=TRUE)
157
-@ 
155
+crlmmResult2 <- crlmmIlluminaV2(samples, path=data.dir,
156
+				arrayInfoColNames=list(barcode=NULL,position="SentrixPosition"),
157
+				saveDate=TRUE, cdfName="human370v1c", returnParams=TRUE)
158
+@
158 159
 
159 160
 \section{System information}
160 161
 
161
-This analysis was carried out on a linux machine with 32GB of RAM 
162
+This analysis was carried out on a linux machine with 32GB of RAM
162 163
 using the following packages:
163 164
 
164 165
 <<session>>=
... ...
@@ -6,6 +6,7 @@ vignettes: AffyGW.tex
6 6
 	cp ../inst/scripts/IlluminaPreprocessCN.pdf .
7 7
 	cp ../inst/scripts/Infrastructure.pdf .
8 8
 	cp ../inst/scripts/gtypeDownstream.pdf .
9
+	cp ../inst/scripts/crlmmIllumina.pdf .
9 10
 	texi2dvi --pdf genotyping.tex
10 11
 	texi2dvi --pdf CopyNumberOverview.tex
11 12