Browse code

Update summarizeNps function for illumina platform

- A scatterplot of the A and B channels at nonpolymophic loci shows
that one channel appears to be background and the other channel
appears to be signal.

- A quick fix for CN estimation is to replace the normalized
intensities at nonpolymorphic loci for the A allele with the average
from both channels on the intensity scale ((A+B)/2)

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@50353 bc3139a8-67e5-0310-9ffc-ced21a209358

Rob Scharp authored on 20/10/2010 13:43:06
Showing4 changed files

... ...
@@ -13,3 +13,6 @@ extdata*
13 13
 copynumber-*
14 14
 copynumber.log
15 15
 *.log
16
+*.png
17
+*~
18
+relocating*
... ...
@@ -74,4 +74,4 @@ export(totalCopynumber)
74 74
 exportMethods(A, B, nuA, nuB, phiA, phiB, corr, tau2, Ns, medians, mads)
75 75
 
76 76
 
77
-
77
+##export(summarizeNps, genotypeSummary, fit.lm2)
... ...
@@ -119,7 +119,6 @@ genotype <- function(filenames,
119 119
 			     sns=sns,
120 120
 			     verbose=verbose,
121 121
 			     batch=batch)
122
-	##save(callSet, file=file.path(outdir, "callSet.rda"))
123 122
 	if(is.lds) open(callSet)
124 123
 	mixtureParams <- matrix(NA, 4, length(filenames))
125 124
 	is.snp <- isSnp(callSet)
... ...
@@ -130,7 +129,8 @@ genotype <- function(filenames,
130 129
 		       snprma=snprma(...),
131 130
 		       snprma2=snprma2(...))
132 131
 	}
133
-	snprmaRes <- snprmaFxn(FUN, filenames=filenames,
132
+	snprmaRes <- snprmaFxn(FUN,
133
+			       filenames=filenames,
134 134
 			       mixtureSampleSize=mixtureSampleSize,
135 135
 			       fitMixture=TRUE,
136 136
 			       eps=eps,
... ...
@@ -138,22 +138,27 @@ genotype <- function(filenames,
138 138
 			       seed=seed,
139 139
 			       cdfName=cdfName,
140 140
 			       sns=sns)
141
+	##message("Saving snprmaRes file")
142
+	##save(snprmaRes, file=file.path(outdir, "snprmaRes.rda"))
141 143
 	if(verbose) message("Finished preprocessing.")
142 144
 	if(is.lds){
143 145
 		open(snprmaRes[["A"]])
144 146
 		open(snprmaRes[["B"]])
147
+		open(snprmaRes[["SNR"]])
148
+		open(snprmaRes[["mixtureParams"]])
145 149
 		##bb <- getOption("ffbatchbytes")
150
+		message("Writing normalized intensities to callSet")
146 151
 		ffrowapply(A(callSet)[i1:i2, ] <- snprmaRes[["A"]][i1:i2, ], X=snprmaRes[["A"]])##, BATCHBYTES=bb)
147 152
 		ffrowapply(B(callSet)[i1:i2, ] <- snprmaRes[["B"]][i1:i2, ], X=snprmaRes[["B"]])##, BATCHBYTES=bb)
153
+		pData(callSet)$SKW <- snprmaRes[["SKW"]]
154
+		pData(callSet)$SNR <- snprmaRes[["SNR"]]
148 155
 	} else{
149 156
 		A(callSet)[snp.index, ] <- snprmaRes[["A"]]
150 157
 		B(callSet)[snp.index, ] <- snprmaRes[["B"]]
158
+		pData(callSet)$SKW <- snprmaRes[["SKW"]]
159
+		pData(callSet)$SNR <- snprmaRes[["SNR"]]
151 160
 	}
152
-	pData(callSet)$SKW <- snprmaRes[["SKW"]]
153
-	pData(callSet)$SNR <- snprmaRes[["SNR"]]
154
-	mixtureParams <- snprmaRes$mixtureParams
155 161
 	np.index <- which(!is.snp)
156
-	if(verbose) message("Normalizing nonpolymorphic markers")
157 162
 	FUN <- ifelse(is.lds, "cnrma2", "cnrma")
158 163
 	## main purpose is to update 'alleleA'
159 164
 	cnrmaFxn <- function(FUN,...){
... ...
@@ -170,10 +175,7 @@ genotype <- function(filenames,
170 175
 		       sns=sns,
171 176
 		       seed=seed,
172 177
 		       verbose=verbose)
173
-	if(verbose) message("Saving callSet.rda")
174
-	##save(callSet, file=file.path(outdir, "callSet.rda"))
175 178
 	if(!is.lds) A(callSet) <- AA
176
-	## otherwise, the normalized values were written to file... no need to do anything
177 179
 	rm(AA)
178 180
 	FUN <- ifelse(is.lds, "crlmmGT2", "crlmmGT")
179 181
 	## genotyping
... ...
@@ -182,7 +184,7 @@ genotype <- function(filenames,
182 184
 		       crlmmGT2=crlmmGT2(...),
183 185
 		       crlmmGT=crlmmGT(...))
184 186
 	}
185
-	if(verbose) message("Call genotypes at polymorphic markers")
187
+	if(verbose) message("Running crlmmGT2")
186 188
 	tmp <- crlmmGTfxn(FUN,
187 189
 			  A=snprmaRes[["A"]],
188 190
 			  B=snprmaRes[["B"]],
... ...
@@ -191,8 +193,6 @@ genotype <- function(filenames,
191 193
 			  cdfName=cdfName,
192 194
 			  row.names=NULL,
193 195
 			  col.names=sampleNames(callSet),
194
-			  probs=probs,
195
-			  DF=DF,
196 196
 			  SNRMin=SNRMin,
197 197
 			  recallMin=recallMin,
198 198
 			  recallRegMin=recallRegMin,
... ...
@@ -206,16 +206,24 @@ genotype <- function(filenames,
206 206
 		open(tmp[["confs"]])
207 207
 		ffrowapply(snpCall(callSet)[i1:i2, ] <- tmp[["calls"]][i1:i2, ], X=tmp[["calls"]])#, BATCHBYTES=bb)
208 208
 		ffrowapply(snpCallProbability(callSet)[i1:i2, ] <- tmp[["confs"]][i1:i2, ], X=tmp[["confs"]])#, BATCHBYTES=bb)
209
-		close(tmp[["calls"]])
210
-		close(tmp[["confs"]])
211 209
 	} else {
212 210
 		calls(callSet)[snp.index, ] <- tmp[["calls"]]
213 211
 		snpCallProbability(callSet)[snp.index, ] <- tmp[["confs"]]
214 212
 	}
213
+	message("Finished updating.  Cleaning up.")
215 214
 	callSet$gender <- tmp$gender
216 215
 	close(callSet)
216
+	if(is.lds){
217
+		delete(snprmaRes[["A"]])
218
+		delete(snprmaRes[["B"]])
219
+		##delete(snprmaRes[["SNR"]]) -- would need to do something like callSet$SNR <- snprmaRes[["SNR"]][,]
220
+		##delete(snprmaRes[["SKW"]])
221
+		delete(snprmaRes[["mixtureParams"]])
222
+		rm(snprmaRes)
223
+	}
217 224
 	return(callSet)
218 225
 }
226
+
219 227
 genotype2 <- function(){
220 228
 	.Defunct(msg="The genotype2 function has been deprecated. The function genotype should be used instead.  genotype will support large data using ff provided that the ff package is loaded.")
221 229
 }
... ...
@@ -494,6 +502,7 @@ fit.lm1 <- function(strata,
494 502
 	}
495 503
 }
496 504
 
505
+## nonpolymorphic markers
497 506
 fit.lm2 <- function(strata,
498 507
 		    index.list,
499 508
 		    object,
... ...
@@ -532,7 +541,8 @@ fit.lm2 <- function(strata,
532 541
 		X <- cbind(1, log2(c(medianA.AA[, k], medianB.BB[, k])))
533 542
 		Y <- log2(c(phiA.snp[, k], phiB.snp[, k]))
534 543
 		betahat <- solve(crossprod(X), crossprod(X, Y))
535
-		crosshyb <- max(median(medianA.AA[, k]) - median(medianA.AA.np[, k]), 0)
544
+		##crosshyb <- max(median(medianA.AA[, k]) - median(medianA.AA.np[, k]), 0)
545
+		crosshyb <- 0
536 546
 		X <- cbind(1, log2(medianA.AA.np[, k] + crosshyb))
537 547
 		logPhiT <- X %*% betahat
538 548
 		phiA.np[, k] <- 2^(logPhiT)
... ...
@@ -1616,19 +1626,30 @@ summarizeNps <- function(strata, index.list, object, batchSize,
1616 1626
 	nr <- length(index)
1617 1627
 	nc <- length(batchnames)
1618 1628
 	N.AA <- medianA.AA <- madA.AA <- tau2A.AA <- matrix(NA, nr, nc)
1629
+	is.illumina <- whichPlatform(annotation(object)) == "illumina"
1619 1630
 	AA <- as.matrix(A(object)[index, ])
1631
+	if(is.illumina){
1632
+		BB <- as.matrix(B(object)[index, ])
1633
+		AVG <- (AA+BB)/2
1634
+		A(object)[index, ] <- AVG
1635
+		AA <- AVG
1636
+		rm(AVG, BB)
1637
+	}
1620 1638
 	for(k in seq_along(batches)){
1621 1639
 		B <- batches[[k]]
1622 1640
 		N.AA[, k] <- length(B)
1623 1641
 		this.batch <- unique(as.character(batch(object)[B]))
1624 1642
 		j <- match(this.batch, batchnames)
1625
-		##NORM <- normal.index[, k]
1626
-		A <- AA[, B]
1627
-		medianA.AA[, k] <- rowMedians(A, na.rm=TRUE)
1628
-		madA.AA[, k] <- rowMAD(A, na.rm=TRUE)
1643
+		I.A <- AA[, B]
1644
+##		if(is.illumina){
1645
+##			I.B <- BB[, B]
1646
+##			I.A <- I.A + I.B
1647
+##		}
1648
+		medianA.AA[, k] <- rowMedians(I.A, na.rm=TRUE)
1649
+		madA.AA[, k] <- rowMAD(I.A, na.rm=TRUE)
1629 1650
 		## log2 Transform Intensities
1630
-		A <- log2(A)
1631
-		tau2A.AA[, k] <- rowMAD(A, na.rm=TRUE)^2
1651
+		I.A <- log2(I.A)
1652
+		tau2A.AA[, k] <- rowMAD(I.A, na.rm=TRUE)^2
1632 1653
 	}
1633 1654
 	N.AA(object)[index,] <- N.AA
1634 1655
 	medianA.AA(object)[index,] <- medianA.AA
... ...
@@ -1674,8 +1695,8 @@ summarizeSnps <- function(strata,
1674 1695
 		##NORM <- normal.index[, k]
1675 1696
 		xx <- CP[, B]
1676 1697
 		highConf <- (1-exp(-xx/1000)) > GT.CONF.THR
1677
-		##G <- G*highConf*NORM
1678 1698
 		G <- G*highConf
1699
+		##G <- G*highConf*NORM
1679 1700
 		A <- AA[, B]
1680 1701
 		B <- BB[, B]
1681 1702
 		## this can be time consuming...do only once
... ...
@@ -139,8 +139,9 @@ container <- checkExists("container", .path=outdir,
139 139
 cnSet <- checkExists("cnSet", .path=outdir, .FUN=crlmmCopynumber, object=container, .load.it=load.it)
140 140
 @
141 141
 
142
+
142 143
 <<copynumberObject>>=
143
-marker.index <- which(chromosome(cnSet) <= 22 & isSnp(cnSet))
144
+marker.index <- which(chromosome(cnSet) <= 22)## & isSnp(cnSet))
144 145
 sample.index <- 1:5 ## first five samples
145 146
 invisible(open(cnSet))
146 147
 copynumberSet <- as(cnSet[marker.index, 1:5], "CopyNumberSet")