Browse code

make gender an ff object in genotype function for Affy

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@54170 bc3139a8-67e5-0310-9ffc-ced21a209358

Rob Scharp authored on 30/03/2011 02:40:42
Showing6 changed files

... ...
@@ -54,6 +54,12 @@ importClassesFrom(oligoClasses, ff_matrix, ffdf)
54 54
 ## Important to export these classes
55 55
 ##exportClasses(ff_or_matrix, ff_matrix, ffdf)
56 56
 ##exportClasses(ff_or_matrix)
57
+##---------------------------------------------------------------------------
58
+## lattice imports
59
+##---------------------------------------------------------------------------
60
+##import(panel.number, panel.grid, panel.xyplot, lpoints, lsegments, lrect, ltext)
61
+
62
+
57 63
 exportMethods(lines)
58 64
 exportMethods(CA, CB)
59 65
 export(crlmm,
... ...
@@ -143,6 +143,7 @@ genotype <- function(filenames,
143 143
 	mixtureParams <- initializeBigMatrix("crlmmMixt-", 4, length(filenames), "double")
144 144
 	SNR <- initializeBigVector("crlmmSNR-", length(filenames), "double")
145 145
 	SKW <- initializeBigVector("crlmmSKW-", length(filenames), "double")
146
+	genderff <- initializeBigVector("gender", length(filenames), "integer")
146 147
 	featureData <- getFeatureData(cdfName, copynumber=TRUE)
147 148
 	nr <- nrow(featureData); nc <- length(sns)
148 149
 	A <- initializeBigMatrix("crlmmA-", nr, length(filenames), "integer")
... ...
@@ -182,15 +183,17 @@ genotype <- function(filenames,
182 183
 	protocolData <- getProtocolData.Affy(filenames)
183 184
 	rownames(pData(protocolData)) <- sns
184 185
 	protocolData(cnSet) <- protocolData
185
-	##protocolData(cnSet) <- protocolData
186
-	pd <- data.frame(matrix(NA, nc, 3), row.names=sns)
187
-	colnames(pd)=c("SKW", "SNR", "gender")
188
-	phenoData(cnSet) <- new("AnnotatedDataFrame", data=pd)
186
+	##pd <- data.frame(matrix(NA, nc, 3), row.names=sns)
187
+	##colnames(pd)=c("SKW", "SNR", "gender")
188
+	##phenoData(cnSet) <- new("AnnotatedDataFrame", data=pd)
189
+	cnSet$SKW <- SKW
190
+	cnSet$SNR <- SNR
191
+	cnSet$gender <- genderff
189 192
 	stopifnot(validObject(cnSet))
190 193
 	snp.I <- isSnp(cnSet)
191 194
 	snp.index <- which(snp.I)
192
-	pData(cnSet)$SKW <- SKW
193
-	pData(cnSet)$SNR <- SNR
195
+	##pData(cnSet)$SKW <- SKW
196
+	##pData(cnSet)$SNR <- SNR
194 197
 	np.index <- which(!snp.I)
195 198
 	if(verbose) message("Normalizing nonpolymorphic markers")
196 199
 	FUN <- ifelse(is.lds, "cnrma2", "cnrma")
... ...
@@ -203,22 +206,24 @@ genotype <- function(filenames,
203 206
 	       seed=seed,
204 207
 	       verbose=verbose)
205 208
 	tmp <- crlmmGT2(A=calls(cnSet),
206
-			  B=snpCallProbability(cnSet),
207
-			  SNR=SNR,
208
-			  mixtureParams=mixtureParams,
209
-			  cdfName=cdfName,
210
-			  row.names=NULL,
211
-			  col.names=sampleNames(cnSet),
212
-			  SNRMin=SNRMin,
213
-			  recallMin=recallMin,
214
-			  recallRegMin=recallRegMin,
215
-			  gender=gender,
216
-			  verbose=verbose,
217
-			  returnParams=returnParams,
218
-			  badSNP=badSNP,
219
-			  snp.names=featureNames(cnSet)[snp.index])
209
+			B=snpCallProbability(cnSet),
210
+			SNR=SNR,
211
+			mixtureParams=mixtureParams,
212
+			cdfName=cdfName,
213
+			row.names=NULL,
214
+			col.names=sampleNames(cnSet),
215
+			SNRMin=SNRMin,
216
+			recallMin=recallMin,
217
+			recallRegMin=recallRegMin,
218
+			gender=gender,
219
+			verbose=verbose,
220
+			returnParams=returnParams,
221
+			badSNP=badSNP,
222
+			snp.names=featureNames(cnSet)[snp.index])
220 223
 	if(verbose) message("Genotyping finished.  Updating container with genotype calls and confidence scores.")
221
-	cnSet$gender <- tmp[["gender"]]
224
+	open(cnSet$gender)
225
+	cnSet$gender[,] <- tmp[["gender"]]
226
+	close(cnSet$gender)
222 227
 	return(cnSet)
223 228
 }
224 229
 
... ...
@@ -1180,8 +1180,8 @@ genotypeInf <- function(cnSet, mixtureParams, probs=rep(1/3,3),
1180 1180
 			badSNP=0.7,
1181 1181
 			gender=NULL,
1182 1182
 			DF=6){
1183
-##	is.snp = isSnp(cnSet)
1184
-##	snp.index = which(is.snp)
1183
+	is.snp = isSnp(cnSet)
1184
+	snp.index = which(is.snp)
1185 1185
 ##	narrays = ncol(cnSet)
1186 1186
 ##	open(A(cnSet))
1187 1187
 ##	open(B(cnSet))
... ...
@@ -57,42 +57,42 @@ linesCNSet <- function(x, y, batch, copynumber, x.axis="A", grid=FALSE, ...){
57 57
 }
58 58
 
59 59
 
60
-cnPanel <- function(x, y, ..., pch.cols, gt, cbs.segs, hmm.segs=NULL, shades, subscripts, add.ideogram=TRUE){
61
-	##if(panel.number() == 2) browser()
62
-	add.ideogram <- add.ideogram[[panel.number()]]
63
-	##cbs.segs <- cbs.segs[[panel.number()]]
64
-	draw.hmm.states <- ifelse(panel.number() <= length(hmm.segs), TRUE, FALSE)
65
-	panel.grid(h=6, v=10)
66
-	which.hom <- which(gt[subscripts] == 1 | gt[subscripts]==3)
67
-	which.het <- which(gt[subscripts] == 2)
68
-	panel.xyplot(x, y, col="grey60", ...)
69
-	lpoints(x[which.hom], y[which.hom], col=pch.cols[1], ...)
70
-	lpoints(x[which.het], y[which.het], col=pch.cols[2], ...)
71
-	lsegments(x0=start(cbs.segs)/1e6, x1=end(cbs.segs)/1e6,
72
-		  y0=cbs.segs$seg.mean,
73
-		  y1=cbs.segs$seg.mean, ...)
74
-	if(draw.hmm.states){
75
-		hmm.segs <- hmm.segs[order(width(hmm.segs), decreasing=TRUE), ]
76
-		lrect(xleft=start(hmm.segs)/1e6,
77
-		      xright=end(hmm.segs)/1e6,
78
-		      ybottom=-0.4,
79
-		      ytop=0,
80
-		      border=shades[hmm.segs$state],
81
-		      col=shades[hmm.segs$state])
82
-	}
83
-	ltext(130, 5, paste("MAD:", round(mad(y, na.rm=TRUE), 2)))
84
-	if(add.ideogram){
85
-		pathto <- system.file("hg18", package="SNPchip")
86
-		cytoband <- read.table(file.path(pathto, "cytoBand.txt"), as.is=TRUE)
87
-		cytoband$V2 <- cytoband$V2/1e6
88
-		cytoband$V3 <- cytoband$V3/1e6
89
-		colnames(cytoband) <- c("chrom", "start", "end", "name", "gieStain")
90
-		plotCytoband(unique(hmm.segs$chrom),
91
-			     cytoband=cytoband,
92
-			     cytoband.ycoords=c(5.6, 5.9),
93
-			     new=FALSE,
94
-			     label.cytoband=FALSE,
95
-			     build="hg18",
96
-			     use.lattice=TRUE)
97
-	}
98
-}
60
+##cnPanel <- function(x, y, ..., pch.cols, gt, cbs.segs, hmm.segs=NULL, shades, subscripts, add.ideogram=TRUE){
61
+##	##if(panel.number() == 2) browser()
62
+##	add.ideogram <- add.ideogram[[panel.number()]]
63
+##	##cbs.segs <- cbs.segs[[panel.number()]]
64
+##	draw.hmm.states <- ifelse(panel.number() <= length(hmm.segs), TRUE, FALSE)
65
+##	panel.grid(h=6, v=10)
66
+##	which.hom <- which(gt[subscripts] == 1 | gt[subscripts]==3)
67
+##	which.het <- which(gt[subscripts] == 2)
68
+##	panel.xyplot(x, y, col="grey60", ...)
69
+##	lpoints(x[which.hom], y[which.hom], col=pch.cols[1], ...)
70
+##	lpoints(x[which.het], y[which.het], col=pch.cols[2], ...)
71
+##	lsegments(x0=start(cbs.segs)/1e6, x1=end(cbs.segs)/1e6,
72
+##		  y0=cbs.segs$seg.mean,
73
+##		  y1=cbs.segs$seg.mean, ...)
74
+##	if(draw.hmm.states){
75
+##		hmm.segs <- hmm.segs[order(width(hmm.segs), decreasing=TRUE), ]
76
+##		lrect(xleft=start(hmm.segs)/1e6,
77
+##		      xright=end(hmm.segs)/1e6,
78
+##		      ybottom=-0.4,
79
+##		      ytop=0,
80
+##		      border=shades[hmm.segs$state],
81
+##		      col=shades[hmm.segs$state])
82
+##	}
83
+##	ltext(130, 5, paste("MAD:", round(mad(y, na.rm=TRUE), 2)))
84
+##	if(add.ideogram){
85
+##		pathto <- system.file("hg18", package="SNPchip")
86
+##		cytoband <- read.table(file.path(pathto, "cytoBand.txt"), as.is=TRUE)
87
+##		cytoband$V2 <- cytoband$V2/1e6
88
+##		cytoband$V3 <- cytoband$V3/1e6
89
+##		colnames(cytoband) <- c("chrom", "start", "end", "name", "gieStain")
90
+##		plotCytoband(unique(hmm.segs$chrom),
91
+##			     cytoband=cytoband,
92
+##			     cytoband.ycoords=c(5.6, 5.9),
93
+##			     new=FALSE,
94
+##			     label.cytoband=FALSE,
95
+##			     build="hg18",
96
+##			     use.lattice=TRUE)
97
+##	}
98
+##}
... ...
@@ -1,8 +1,13 @@
1
-All: copynumber crlmmDownstream clean
1
+All: copynumber illumina_copynumber crlmmDownstream clean
2 2
 
3 3
 copynumber: copynumber.tex
4
-	cp -p ../scripts/copynumber.pdf .
5
-	cp -p ../scripts/illumina_copynumber.pdf .
4
+#	cp  -p ../scripts/copynumber.pdf .
5
+#	cp  -p ../scripts/illumina_copynumber.pdf .
6
+	cp  ../scripts/copynumber.pdf .
7
+	cp  ../scripts/illumina_copynumber.pdf .
8
+
9
+illumina_copynumber: illumina_copynumber.tex
10
+	cp ../scripts/illumina_copynumber.pdf .
6 11
 
7 12
 crlmmDownstream: crlmmDownstream.tex
8 13
 	cp -p ../scripts/crlmmDownstream.pdf .
... ...
@@ -36,7 +36,7 @@
36 36
   to the copy number vignette.
37 37
 \end{abstract}
38 38
 
39
-\section{Infrastructure}
39
+\section{Setup}
40 40
 
41 41
 <<crlmm, results=hide, echo=FALSE>>=
42 42
 library(crlmm)
... ...
@@ -44,29 +44,16 @@ options(width=70)
44 44
 options(continue=" ")
45 45
 @
46 46
 
47
-\textbf{Supported platforms:} The supported Infinium platforms are
48
-those for which a corresponding annotation package is available.  The
49
-annotation packages contain information on the markers, such as
50
-physical position and chromosome, as well as pre-computed parameters
51
-estimated from HapMap used during the preprocessing and genotyping
52
-steps. Currently supported Infinium platforms are listed in the
53
-following code chunk.
54
-
55
-<<supportedPlatforms>>=
56
-pkgs <- annotationPackages()
57
-crlmm.pkgs <- pkgs[grep("Crlmm", pkgs)]
58
-crlmm.pkgs[grep("human", crlmm.pkgs)]
59
-@
60
-
61
-\textbf{Large data:} In order to reduce \crlmm{}'s memory footprint,
62
-we require the \Rpackage{ff} for copy number estimation.  The
63
-\Rpackage{ff} package provides infrastructure for accessing and
64
-writing data to disk instead of keeping data in memory.  Each element
65
-of the \verb+assayData+ and \verb+batchStatistics+ slot of the
66
-\Rclass{CNSet} class are \Robject{ff} objects.  \Robject{ff} objects
67
-in the \R{} workspace contain pointers to several files with the
68
-\verb+.ff+ extension.  The location of where the data is stored on
69
-disk is specified by use of the \Rfunction{ldPath} function.
47
+%\textbf{Supported platforms:} The supported Infinium platforms are
48
+%those for which a corresponding annotation package is available.  The
49
+%annotation packages contain information on the markers, such as
50
+%physical position and chromosome, as well as pre-computed parameters
51
+%estimated from HapMap used during the preprocessing and genotyping
52
+%steps. Currently supported Infinium platforms are listed in the
53
+%following code chunk.
54
+The following codechunk declares a directory for saving \Robject{ff}
55
+files that will contain the normalized intensities and the genotype
56
+calls.
70 57
 
71 58
 <<ldpath,results=hide>>=
72 59
 library(ff)
... ...
@@ -78,41 +65,23 @@ ldPath(outdir)
78 65
 dir.create(outdir, recursive=TRUE, showWarnings=FALSE)
79 66
 @
80 67
 
81
-Users should not move or rename this directory.  If only output files
82
-are stored in \verb+outdir+, one can either remove the entire
83
-directory prior to rerunning the analysis or all of the '.ff' files.
84
-Otherwise, one would accumulate a large number of '.ff' files on disk
85
-that are no longer in use.
86
-
87
-As none of the functions for preprocessing, genotyping, or copy number
88
-estimation simultaneously require all samples and all probes in
89
-memory, memory-usage by \crlmm{} can be fine tuned by reading in and
90
-processing subsets of the markers and/or samples. The functions
91
-\Rfunction{ocSamples} and \Rfunction{ocProbesets} in the
92
-\Rpackage{oligoClasses} package can be used to declare how many
93
-markers and samples to read at once.  In general, specifying smaller
94
-values should reduce the RAM required for a particular job, but would
95
-be expected to have an increased run-time. In the following
96
-code-chunk, we declare that \crlmm{} should process 150,000 markers at
97
-a time (when possible) and 500 samples at a time.  (As our dataset in
98
-this vignette only contains 43 samples, the \Rfunction{ocSamples}
99
-option would not have any effect.)  One can view the current settings
100
-for these commands, by typing the functions without an argument.
101
-
102
-<<ram>>=
103
-ocSamples()
104
-ocProbesets()
105
-ocProbesets(150e3)
106
-ocSamples(500)
107
-ocSamples()
108
-ocProbesets()
109
-@
68
+We will also store cached computations in the directory \verb+outdir+.
110 69
 
111 70
 <<cacheSweave, echo=FALSE, results=hide>>=
112 71
 library(cacheSweave)
113 72
 setCacheDir(outdir)
114 73
 @
115 74
 
75
+We declare that \crlmm{} should process 150,000 markers at a time
76
+(when possible) and/or 500 samples at a time.  As our example dataset
77
+in this vignette contains fewer than 500 samples, all samples will be
78
+processed simultaneously.
79
+
80
+<<ram>>=
81
+ocProbesets(150e3)
82
+ocSamples(500)
83
+@
84
+
116 85
 \textbf{Limitations:} There is no minimum number of samples required
117 86
 for preprocessing and genotyping.  However, for copy number estimation
118 87
 the \crlmm{} package currently requires at least 10 samples per batch.
... ...
@@ -134,7 +103,7 @@ platforms, several steps are required.
134 103
 We begin by specifying the path containing the raw IDAT files for a
135 104
 set of samples from the Infinium 370k platform.
136 105
 
137
-<libraries>>=
106
+<<datadir>>=
138 107
 datadir <- "/thumper/ctsa/snpmicroarray/illumina/IDATS/370k"
139 108
 @
140 109