Browse code

Merge branch 'collab'

* collab:
replace splitIndicesByLength with splitIndicesByNode throughout cnrma-functions.R (check that snow is loaded and getCluster is not null)
add an example to genotype.Illumina that indicates how parallelization would be enabled. The example requires local data and is not run.
change outdir in IlluminaPreprocessCN and AffyGW
Update R/crlmm-illumina.R
bump version for parallelization of genotype.Illumina
Update R/crlmm-illumina.R

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@64151 bc3139a8-67e5-0310-9ffc-ced21a209358

Rob Scharp authored on 20/03/2012 13:55:50
Showing 6 changed files

... ...
@@ -1,7 +1,7 @@
1 1
 Package: crlmm
2 2
 Type: Package
3 3
 Title: Genotype Calling (CRLMM) and Copy Number Analysis tool for Affymetrix SNP 5.0 and 6.0 and Illumina arrays.
4
-Version: 1.13.10
4
+Version: 1.13.11
5 5
 Date: 2010-12-10
6 6
 Author: Benilton S Carvalho <Benilton.Carvalho@cancer.org.uk>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.edu.au>, Ingo Ruczinski <iruczins@jhsph.edu>, Rafael A Irizarry
7 7
 Maintainer: Benilton S Carvalho <Benilton.Carvalho@cancer.org.uk>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.EDU.AU>
... ...
@@ -1540,7 +1540,10 @@ shrinkSummary <- function(object,
1540 1540
 		marker.index <- whichMarkers(type[[1]], is.snp, is.autosome, is.annotated, is.X)
1541 1541
 	}
1542 1542
 	if(is.lds){
1543
-		index.list <- splitIndicesByLength(marker.index, ocProbesets())
1543
+		##index.list <- splitIndicesByLength(marker.index, ocProbesets())
1544
+		if(!is.null(getCluster()) & isPackageLoaded("snow")){
1545
+			index.list <- splitIndicesByNode(marker.index)
1546
+		} else index.list <- splitIndicesByLength(marker.index, ocProbesets())
1544 1547
 		ocLapply(seq(along=index.list),
1545 1548
 			 shrinkGenotypeSummaries,
1546 1549
 			 index.list=index.list,
... ...
@@ -1591,7 +1594,10 @@ genotypeSummary <- function(object,
1591 1594
 	myf <- summaryFxn(type[[1]])
1592 1595
 	FUN <- get(myf)
1593 1596
 	if(is.lds){
1594
-		index.list <- splitIndicesByLength(marker.index, ocProbesets())
1597
+		##index.list <- splitIndicesByLength(marker.index, ocProbesets())
1598
+		if(!is.null(getCluster()) & isPackageLoaded("snow")){
1599
+			index.list <- splitIndicesByNode(marker.index)
1600
+		} else index.list <- splitIndicesByLength(marker.index, ocProbesets())
1595 1601
 		ocLapply(seq(along=index.list),
1596 1602
 			 FUN,
1597 1603
 			 index.list=index.list,
... ...
@@ -2110,7 +2116,9 @@ estimateCnParameters <- function(object,
2110 2116
 	myfun <- lmFxn(type[[1]])
2111 2117
 	FUN <- get(myfun)
2112 2118
 	if(is.lds){
2113
-		index.list <- splitIndicesByLength(marker.index, ocProbesets())
2119
+		if(!is.null(getCluster()) & isPackageLoaded("snow")){
2120
+			index.list <- splitIndicesByNode(marker.index)
2121
+		} else index.list <- splitIndicesByLength(marker.index, ocProbesets())
2114 2122
 		ocLapply(seq(along=index.list),
2115 2123
 			 FUN,
2116 2124
 			 index.list=index.list,
... ...
@@ -108,7 +108,7 @@ readIdatFiles = function(sampleSheet=NULL,
108 108
 		       if(!is.null(sampleSheet) && !is.null(sampleSheet$Sample_ID)){
109 109
 		            sampleNames(RG) = sampleSheet$Sample_ID
110 110
 		       } else  sampleNames(RG) = arrayNames
111
-		       gc()
111
+		       gc(verbose=FALSE)
112 112
 	       }
113 113
 	       if(length(ids)==length(idsG)) {
114 114
 		       if(sum(ids==idsG)==nprobes) {
... ...
@@ -121,7 +121,7 @@ readIdatFiles = function(sampleSheet=NULL,
121 121
 		       zeroG = G$Quants[indG, "NBeads"]==0
122 122
 	       }
123 123
 	       rm(G)
124
-	       gc()
124
+	       gc(verbose=FALSE)
125 125
 	       if(verbose) {
126 126
                       cat(paste(sep, fileExt$red, sep=""), "\n")
127 127
 	       }
... ...
@@ -140,7 +140,7 @@ readIdatFiles = function(sampleSheet=NULL,
140 140
 	       }
141 141
 	       RG@assayData$zero[,i] = zeroG | zeroR
142 142
 	       rm(R, zeroG, zeroR)
143
-	       gc()
143
+	       gc(verbose=FALSE)
144 144
        }
145 145
        if(saveDate) {
146 146
 	       protocolData(RG)[["ScanDate"]] = dates$scan
... ...
@@ -500,7 +500,7 @@ readGenCallOutput = function(file, path=".", cdfName,
500 500
             X[,i]= dat$"XRaw"[ind][m]
501 501
             Y[,i]= dat$"YRaw"[ind][m]
502 502
         }
503
-        gc()
503
+        gc(verbose=FALSE)
504 504
     }
505 505
 
506 506
     zeroes=(X=="0"|Y=="0")
... ...
@@ -597,7 +597,8 @@ RGtoXY = function(RG, chipType, verbose=TRUE) {
597 597
   sampleNames(XY) = sampleNames(RG)
598 598
   ## RS
599 599
   rm(list=c("bord", "infI", "aids", "bids", "ids", "snpbase"))
600
-  print(gc())
600
+#  print(gc())
601
+  gc(verbose=FALSE)
601 602
   # Need to initialize - matrices filled with NAs to begin with
602 603
 #  XY@assayData$X[1:nsnps,] = 0
603 604
 #  XY@assayData$Y[1:nsnps,] = 0
... ...
@@ -611,17 +612,18 @@ RGtoXY = function(RG, chipType, verbose=TRUE) {
611 612
   ##r <- as.matrix(exprs(channel(RG, "R"))[not.na.aord,]) # mostly red
612 613
   r <- as.matrix(assayData(RG)[["R"]][not.na.aord, ])
613 614
   XY@assayData$X[not.na,] <- r
614
-  rm(r);gc()
615
+  rm(r);gc(verbose=FALSE)
615 616
   g <- as.matrix(assayData(RG)[["G"]][not.na.aord,]) # mostly green
616 617
   XY@assayData$Y[not.na,] <- g
617
-  rm(g); gc()
618
+  rm(g); gc(verbose=FALSE)
618 619
   ##z <- as.matrix(exprs(channel(RG, "zero"))[not.na.aord,]) # mostly green
619 620
   z <- as.matrix(assayData(RG)[["zero"]][not.na.aord, ])
620 621
   XY@assayData$zero[not.na,] <- z
621
-  rm(z); gc()
622
+  rm(z); gc(verbose=FALSE)
622 623
   ##RS added
623 624
   rm(RG)
624
-  print(gc())
625
+#  print(gc())
626
+  gc(verbose=FALSE)
625 627
   ## Warning - not 100% sure that the code below is correct - could be more complicated than this
626 628
 #  infIRR = infI & (snpbase=="[A/T]" | snpbase=="[T/A]" | snpbase=="[a/t]" | snpbase=="[t/a]")
627 629
 
... ...
@@ -638,7 +640,7 @@ RGtoXY = function(RG, chipType, verbose=TRUE) {
638 640
 #  XY@assayData$X[infI,] = 0
639 641
 #  XY@assayData$Y[infI,] = 0
640 642
 #  XY@assayData$zero[infI,] = 0
641
-#  gc()
643
+#  gc(verbose=FALSE)
642 644
   XY
643 645
 }
644 646
 
... ...
@@ -685,7 +687,7 @@ stripNormalize = function(XY, useTarget=TRUE, verbose=TRUE) {
685 687
     XY@assayData$X[sel,] = matrix(as.integer(tmp[,1:(ncol(tmp)/2)]+16), nrow(tmp), ncol(tmp)/2)
686 688
     XY@assayData$Y[sel,] = matrix(as.integer(tmp[,(ncol(tmp)/2+1):ncol(tmp)]+16), nrow(tmp), ncol(tmp)/2)
687 689
     rm(subX, subY, tmp, sel)
688
-    gc()
690
+    gc(verbose=FALSE)
689 691
   }
690 692
   if(verbose)
691 693
     cat("\n")
... ...
@@ -755,7 +757,8 @@ preprocessInfinium2 = function(XY, mixtureSampleSize=10^5,
755 757
 		##      t0 <- proc.time()-t0
756 758
 		##      if(verbose) message("Used ", round(t0[3],1), " seconds to save ", cnFile, ".")
757 759
 		rm(A, B, zero)
758
-		print(gc())
760
+#		print(gc())
761
+		gc(verbose=FALSE)
759 762
 	}
760 763
   # next process snp probes
761 764
   nprobes = length(snpIndex)
... ...
@@ -805,7 +808,7 @@ preprocessInfinium2 = function(XY, mixtureSampleSize=10^5,
805 808
 		  else cat(".")
806 809
 	  }
807 810
 	  ## run garbage collection every now and then
808
-	  if(i %% 100 == 0) gc();
811
+	  if(i %% 100 == 0) gc(verbose=FALSE);
809 812
   }
810 813
   if (verbose) {
811 814
 	  if (getRversion() > '2.7.0') close(pb)
... ...
@@ -888,8 +891,8 @@ if(!isValidCdfName(cdfName))
888 891
 			       seed=seed, eps=eps, cdfName=cdfName, sns=sns,
889 892
 			       stripNorm=stripNorm, useTarget=useTarget) #,
890 893
 #                        save.it=save.it, snpFile=snpFile, cnFile=cnFile)
891
-    is.lds = ifelse(isPackageLoaded("ff"), TRUE, FALSE)
892
-
894
+#    is.lds = ifelse(isPackageLoaded("ff"), TRUE, FALSE)
895
+     is.lds = FALSE
893 896
 
894 897
 #    if(is.lds) {
895 898
 #      open(res[["A"]])
... ...
@@ -902,7 +905,7 @@ if(!isValidCdfName(cdfName))
902 905
 #    phenD = XY@phenoData
903 906
 #    protD = XY@protocolData
904 907
 #    rm(XY)
905
-#    gc()
908
+#    gc(verbose=FALSE)
906 909
 #    if(verbose) message("Initializing container for alleleA, alleleB, call, callProbability")
907 910
 #    callSet <- new("SnpSuperSet",
908 911
 #                    alleleA=initializeBigMatrix(name="A", nr=nrow(res[[1]]), nc=length(sns)),
... ...
@@ -932,7 +935,7 @@ if(!isValidCdfName(cdfName))
932 935
 #    pData(callSet)$SKW <- res$SKW
933 936
 #    pData(callSet)$SNR <- res$SNR
934 937
 #    mixtureParams <- res$mixtureParams
935
-#    rm(res); gc()
938
+#    rm(res); gc(verbose=FALSE)
936 939
   if(row.names) row.names=res$gns else row.names=NULL
937 940
   if(col.names) col.names=res$sns else col.names=NULL
938 941
   FUN = ifelse(is.lds, "crlmmGT2", "crlmmGT")
... ...
@@ -976,14 +979,14 @@ if(!isValidCdfName(cdfName))
976 979
 #                  verbose=verbose,
977 980
 #                  returnParams=returnParams,
978 981
 #                  badSNP=badSNP)
979
-#    rm(res); gc()
982
+#    rm(res); gc(verbose=FALSE)
980 983
 #    suppressWarnings(snpCall(callSet)[snp.index, j] <- tmp[["calls"]])
981 984
 #    suppressWarnings(snpCallProbability(callSet)[snp.index, j] <- tmp[["confs"]])
982 985
 #    callSet$gender[j] <- tmp$gender
983 986
 #    suppressWarnings(snpCall(callSet)[snp.index,] <- tmp[["calls"]])
984 987
 #    suppressWarnings(snpCallProbability(callSet)[snp.index,] <- tmp[["confs"]])
985 988
 #    callSet$gender <- tmp$gender
986
-#    rm(tmp); gc()
989
+#    rm(tmp); gc(verbose=FALSE)
987 990
 #    return(callSet)
988 991
 
989 992
   res2[["SNR"]] = res[["SNR"]]
... ...
@@ -994,7 +997,7 @@ if(!isValidCdfName(cdfName))
994 997
 #    delete(res[["SNR"]])
995 998
 #    delete(res[["mixtureParams"]])
996 999
 #  }
997
-  rm(res); gc()
1000
+  rm(res); gc(verbose=FALSE)
998 1001
   return(list2SnpSet(res2, returnParams=returnParams))
999 1002
 }
1000 1003
 
... ...
@@ -1037,7 +1040,7 @@ crlmmIlluminaV2 = function(sampleSheet=NULL,
1037 1040
 #      open(RG@assayData$R); open(RG@assayData$G); open(RG@assayData$zero)
1038 1041
 #      delete(RG@assayData$R); delete(RG@assayData$G); delete(RG@assayData$zero)
1039 1042
 #    }
1040
-    rm(RG); gc()
1043
+    rm(RG); gc(verbose=FALSE)
1041 1044
 
1042 1045
     if (missing(sns)) { sns = sampleNames(XY)
1043 1046
     }
... ...
@@ -1049,7 +1052,7 @@ crlmmIlluminaV2 = function(sampleSheet=NULL,
1049 1052
 #      open(XY@assayData$X); open(XY@assayData$Y); open(XY@assayData$zero)
1050 1053
 #      delete(XY@assayData$X); delete(XY@assayData$Y); delete(XY@assayData$zero)
1051 1054
 #    }
1052
-    rm(XY); gc()
1055
+    rm(XY); gc(verbose=FALSE)
1053 1056
 
1054 1057
     if(row.names) row.names=res$gns else row.names=NULL
1055 1058
     if(col.names) col.names=res$sns else col.names=NULL
... ...
@@ -1088,7 +1091,7 @@ crlmmIlluminaV2 = function(sampleSheet=NULL,
1088 1091
  #    delete(res[["A"]]); delete(res[["B"]])
1089 1092
  #    delete(res[["SKW"]]); delete(res[["SNR"]]); delete(res[["mixtureParams"]])
1090 1093
  #  }
1091
-    rm(res); gc()
1094
+    rm(res); gc(verbose=FALSE)
1092 1095
     return(list2SnpSet(res2, returnParams=returnParams))
1093 1096
 }
1094 1097
 
... ...
@@ -1123,7 +1126,7 @@ getProtocolData.Illumina = function(filenames, sep="_", fileExt="Grn.idat", verb
1123 1126
 	       scanDates$ScanDate[i] = G$RunInfo[1, 1]
1124 1127
 	       scanDates$DecodeDate[i] = G$RunInfo[2, 1]
1125 1128
 	       rm(G)
1126
-	       gc()
1129
+	       gc(verbose=FALSE)
1127 1130
        }
1128 1131
        protocoldata = new("AnnotatedDataFrame",
1129 1132
 			    data=scanDates,
... ...
@@ -1260,7 +1263,8 @@ preprocessInf <- function(cnSet,
1260 1263
 	is.snp = isSnp(cnSet)
1261 1264
 	snp.index = which(is.snp)
1262 1265
 	narrays = ncol(cnSet)
1263
-	sampleBatches <- splitIndicesByLength(seq(length=ncol(cnSet)), ocSamples())
1266
+#	sampleBatches <- splitIndicesByLength(seq(length=ncol(cnSet)), ocSamples())
1267
+	sampleBatches <- splitIndicesByNode(seq(length=ncol(cnSet)))
1264 1268
 	mixtureParams = initializeBigMatrix("crlmmMixt-", 4, narrays, "double")
1265 1269
 	ocLapply(seq_along(sampleBatches),
1266 1270
 		 processIDAT,
... ...
@@ -1460,12 +1464,12 @@ processIDAT <- function(stratum, sampleBatches, sampleSheet=NULL,
1460 1464
                        highDensity=highDensity, sep=sep, fileExt=fileExt, saveDate=saveDate, verbose=verbose)
1461 1465
         XY = RGtoXY(RG, chipType=cdfName)
1462 1466
         rm(RG)
1463
-        gc()
1467
+        gc(verbose=FALSE)
1464 1468
         if (missing(sns) || length(sns)!=ncol(XY)) sns = sampleNames(XY)
1465 1469
         res = preprocessInfinium2(XY, mixtureSampleSize=mixtureSampleSize, fitMixture=TRUE, verbose=verbose,
1466 1470
                                seed=seed, eps=eps, cdfName=cdfName, sns=sns, stripNorm=stripNorm, useTarget=useTarget) #, outdir=outdir)
1467 1471
         rm(XY)
1468
-        gc()
1472
+        gc(verbose=FALSE)
1469 1473
 	if(verbose) message("Finished preprocessing.")
1470 1474
         snp.index = which(is.snp)
1471 1475
 	np.index = which(!is.snp)
... ...
@@ -1507,6 +1511,6 @@ processIDAT <- function(stratum, sampleBatches, sampleSheet=NULL,
1507 1511
         close(SNR); close(SKW)
1508 1512
         close(mixtureParams)
1509 1513
         rm(res)
1510
-	gc()
1514
+	gc(verbose=FALSE)
1511 1515
         TRUE
1512 1516
       }
... ...
@@ -72,10 +72,7 @@ files to the path indicated by \verb+outdir+.
72 72
 
73 73
 <<setup>>=
74 74
 pathToCels <- "/thumper/ctsa/snpmicroarray/hapmap/raw/affy/1m"
75
-if(getRversion() < "2.13.0"){
76
-	rpath <- getRversion()
77
-} else rpath <- "trunk"
78
-outdir <- paste("/amber1/archive/oncbb/rscharpf/crlmm_vignette/", rpath, "/gw6/", sep="")
75
+outdir <- paste("/local_data/r00/crlmm/", getRversion(), "/affy_vignette", sep="")
79 76
 dir.create(outdir, recursive=TRUE, showWarnings=FALSE)
80 77
 @
81 78
 
... ...
@@ -60,10 +60,7 @@ calls.
60 60
 
61 61
 <<ldpath,results=hide>>=
62 62
 library(ff)
63
-if(getRversion() < "2.13.0"){
64
-	rpath <- getRversion()
65
-} else rpath <- "trunk"
66
-outdir <- paste("/thumper/ctsa/snpmicroarray/rs/ProcessedData/crlmm/", rpath, "/illumina_vignette", sep="")
63
+outdir <- paste("/local_data/r00/crlmm/", getRversion(), "/illumina_vignette", sep="")
67 64
 ldPath(outdir)
68 65
 dir.create(outdir, recursive=TRUE, showWarnings=FALSE)
69 66
 @
... ...
@@ -88,6 +88,8 @@ genotype.Illumina(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".",
88 88
 	Rather, \code{batch} is required in order to initialize a
89 89
 	\code{CNSet} container with the appropriate dimensions.
90 90
 
91
+
92
+
91 93
       }
92 94
 
93 95
 \value{	A \code{SnpSuperSet} instance.}
... ...
@@ -106,10 +108,14 @@ genotype.Illumina(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".",
106 108
   Bioinformatics. 2010 Jan 15;26(2):242-9.
107 109
 
108 110
 }
111
+
109 112
 \author{Matt Ritchie}
110
-\note{For large datasets, load the 'ff' package prior to genotyping --
111
-this will greatly reduce the RAM required for big jobs.  See
112
-\code{ldPath} and \code{ocSamples}.}
113
+
114
+  \note{For large datasets, load the 'ff' package prior to genotyping
115
+-- this will greatly reduce the RAM required for big jobs.  See
116
+\code{ldPath} and \code{ocSamples}.  The function
117
+\code{genotype.Illumina} supports parallelization, as the (not run)
118
+example below indicates.}
113 119
 
114 120
 \seealso{
115 121
 	\code{\link{crlmmIlluminaV2}},
... ...
@@ -117,6 +123,27 @@ this will greatly reduce the RAM required for big jobs.  See
117 123
 	\code{\link[oligoClasses]{ldOpts}}
118 124
 }
119 125
 \examples{
120
-  ##
126
+\dontrun{
127
+	library(ff)
128
+	library(crlmm)
129
+	## to enable paralellization, set to TRUE
130
+	if(FALSE){
131
+		library(snow)
132
+		## 10 workers
133
+		setCluster(10, "SOCK")
134
+	}
135
+	## path to idat files
136
+	datadir <- "/thumper/ctsa/snpmicroarray/illumina/IDATS/370k"
137
+	## read in your samplesheet
138
+	samplesheet = read.csv(file.path(datadir, "HumanHap370Duo_Sample_Map.csv"), header=TRUE, as.is=TRUE)
139
+	samplesheet <- samplesheet[-c(28:46,61:75,78:79), ]
140
+	arrayNames <- file.path(datadir, unique(samplesheet[, "SentrixPosition"]))
141
+	arrayInfo <- list(barcode=NULL, position="SentrixPosition")
142
+	cnSet <- genotype.Illumina(sampleSheet=samplesheet,
143
+				   arrayNames=arrayNames,
144
+				   arrayInfoColNames=arrayInfo,
145
+				   cdfName="human370v1c",
146
+				   batch=rep("1", nrow(samplesheet)))
147
+}
121 148
 }
122 149
 \keyword{classif}