Browse code

updated biasAdjNP and biasAdj functions

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@40036 bc3139a8-67e5-0310-9ffc-ced21a209358

Rob Scharp authored on 09/06/2009 14:23:02
Showing 3 changed files

... ...
@@ -159,3 +159,7 @@ is decoded and scanned
159 159
 2009-06-08 B Carvalho - committed version 1.3.3
160 160
 
161 161
 * Added batchQC to phenoData
162
+
163
+2009-06-08 R Scharpf - committed version 1.3.4
164
+
165
+* fixed bug in the biasAdjNP function.  updated biasAdj function
... ...
@@ -316,6 +316,10 @@ computeCopynumber <- function(chrom,
316 316
 			      cdfName="genomewidesnp6",
317 317
 			      verbose=TRUE, ...){
318 318
 	require(paste(cdfName, "Crlmm", sep=""), character.only=TRUE) || stop(paste("cdf ", cdfName, "Crlmm", " not available.", sep=""))
319
+	if(!missing(plate)){
320
+		if(length(plate) != ncol(A))
321
+			stop("plate must the same length as the number of columns of A")
322
+	}
319 323
 	set.seed(seed)
320 324
 	if(missing(chrom)) stop("must specify chromosome")
321 325
 	if(length(ls(envir)) == 0) {
... ...
@@ -517,6 +521,7 @@ nonpolymorphic <- function(plateIndex, NP, envir, CONF.THR=0.99, DF.PRIOR=50, pk
517 521
 	##---------------------------------------------------------------------------
518 522
 }
519 523
 
524
+##sufficient statistics on the intensity scale
520 525
 withinGenotypeMoments <- function(p, A, B, calls, conf, CONF.THR, DF.PRIOR, envir){
521 526
 	CHR <- envir[["chrom"]]
522 527
 	Ns <- envir[["Ns"]]
... ...
@@ -714,14 +719,14 @@ oneBatch <- function(plateIndex,
714 719
 	snpflags[, p] <- index[[1]] | index[[2]] | index[[3]]
715 720
 
716 721
 	##---------------------------------------------------------------------------
717
-	## Two genotype clusters not observed -- would sequence help? (didn't seem that helpful)
718
-	## 1 extract index of complete data
719
-	## 2 Regress  mu1,mu3 ~ sequence + mu2
720
-	## 3 Predict mu1*, mu3* for missing genotypes
722
+	## Two genotype clusters not observed -- would sequence help? (didn't seem to)
723
+	## 1. extract index of complete data
724
+	## 2. Regress  mu_missing ~ sequence + mu_observed
725
+	## 3. solve for nu assuming the median is 2
721 726
 	##---------------------------------------------------------------------------
722 727
 	cols <- c(3, 1, 2)
723 728
 	for(j in 1:3){
724
-		if(sum(index[[1]]) == 0) next()
729
+		if(sum(index[[j]]) == 0) next()
725 730
 		k <- cols[j]
726 731
 		X <- cbind(1, mnA[index.complete, k], mnB[index.complete, k])
727 732
 		Y <- cbind(mnA[index.complete,  -k],
... ...
@@ -732,7 +737,6 @@ oneBatch <- function(plateIndex,
732 737
 		muA[index[[j]], p, -c(1, 2, k+2)] <- mus[, 1:2]
733 738
 		muB[index[[j]], p, -c(1, 2, k+2)] <- mus[, 3:4]
734 739
 	}
735
-	
736 740
 	negA <- rowSums(muA[, p, ] < 0) > 0
737 741
 	negB <- rowSums(muB[, p, ] < 0) > 0	
738 742
 	snpflags[, p] <- snpflags[, p] | negA | negB | rowSums(is.na(muA[, p, 3:5]), na.rm=TRUE) > 0
... ...
@@ -748,6 +752,7 @@ oneBatch <- function(plateIndex,
748 752
 	envir[["plates.completed"]] <- plates.completed
749 753
 }
750 754
 
755
+##Estimate tau2, sigma2, and correlation
751 756
 locationAndScale <- function(p, GT.A, GT.B, index, envir, DF.PRIOR){
752 757
 	tau2A <- envir[["tau2A"]]
753 758
 	tau2B <- envir[["tau2B"]]
... ...
@@ -997,20 +1002,18 @@ biasAdj <- function(plateIndex, envir, priorProb, PROP=0.75){
997 1002
 			if(CA > 0 & CB > 0) rho <- corr[, p]
998 1003
 			if(CHR == 23){
999 1004
 				means <- cbind(suppressWarnings(log2(nuA[, p]+CA*phiA[, p] + CB*phiAx[, p])), suppressWarnings(log2(nuB[, p]+CB*phiB[, p] + CA*phiBx[, p])))
1005
+				browser()
1000 1006
 			} else{
1001 1007
 				means <- cbind(suppressWarnings(log2(nuA[, p]+CA*phiA[, p])), suppressWarnings(log2(nuB[, p]+CB*phiB[, p])))
1008
+				meanA <- suppressWarnings(log2(nuA[, p]+CA*phiA[, p]))
1009
+				meanB <- suppressWarnings(log2(nuB[, p]+CB*phiB[, p]))
1010
+				covs <- rho*A.scale*B.scale
1011
+				A.scale2 <- A.scale^2
1012
+				B.scale2 <- B.scale^2
1002 1013
 			}
1003
-			covs <- rho*A.scale*B.scale
1004
-			A.scale2 <- A.scale^2
1005
-			B.scale2 <- B.scale^2
1006
-			m <- 1			
1007
-			for(i in 1:nrow(A)){
1008
-				Sigma <- matrix(c(A.scale2[i], covs[i], covs[i], B.scale2[i]), 2,2)
1009
-				xx <- matrix(X[i, ], ncol=2)
1010
-				tmp <- dmvnorm(xx, mean=means[i, ], sigma=Sigma) 
1011
-				emit[m, , counter] <- tmp
1012
-				m <- m+1				
1013
-			}
1014
+			Q.x.y <- 1/(1-rho^2)*(((lA - meanA)/A.scale)^2 + ((lB - meanB)/B.scale)^2 - 2*rho*((lA - meanA)*(lB - meanB))/(A.scale*B.scale))
1015
+			f.x.y <- 1/(2*pi*A.scale*B.scale*sqrt(1-rho^2))*exp(-0.5*Q.x.y)
1016
+			emit[, , counter] <- f.x.y
1014 1017
 			counter <- counter+1
1015 1018
 		}
1016 1019
 	}
... ...
@@ -1019,60 +1022,63 @@ biasAdj <- function(plateIndex, envir, priorProb, PROP=0.75){
1019 1022
 	norm <- priorProb[3]*emit[, , 4:6]
1020 1023
 	amp <- priorProb[4]*emit[, , 7:10]
1021 1024
 	##sum over the different combinations within each copy number state
1022
-	hemDel <- apply(hemDel, c(1,2), sum)
1023
-	norm <- apply(norm, c(1, 2), sum)
1024
-	amp <- apply(amp, c(1,2), sum)
1025
+	hemDel <- hemDel[, , 1] + hemDel[, , 2]
1026
+	norm <- norm[, , 1] + norm[, , 2] + norm[, , 3]
1027
+	amp <- amp[, , 1] + amp[, , 2] + amp[ , , 3] + amp[, , 4]
1025 1028
 	total <- hemDel + norm + amp
1026 1029
 	hemDel <- hemDel/total
1027 1030
 	norm <- norm/total
1028 1031
 	amp <- amp/total
1029 1032
 	envir[["posteriorProb"]] <- list(hemDel=hemDel, norm=norm, amp=amp)
1030
-	tmp <- array(NA, dim=c(nrow(A), ncol(A), 4))
1031
-	tmp[, , 1] <- homDel
1032
-	tmp[, , 2] <- hemDel
1033
-	tmp[, , 3] <- norm
1034
-	tmp[, , 4] <- amp
1035
-	tmp2 <- apply(tmp, c(1, 2), function(x) order(x, decreasing=TRUE)[1])
1033
+	posteriorProb <- array(NA, dim=c(nrow(A), ncol(A), 4))
1034
+	posteriorProb[, , 1] <- homDel
1035
+	posteriorProb[, , 2] <- hemDel
1036
+	posteriorProb[, , 3] <- norm
1037
+	posteriorProb[, , 4] <- amp
1038
+	mostLikelyState <- apply(posteriorProb, c(1, 2), function(x) order(x, decreasing=TRUE)[1])
1036 1039
 	##Adjust for SNPs that have less than 80% of the samples in an altered state
1037 1040
 	##flag the remainder?
1038 1041
 	if(CHR != 23){
1039
-		tmp3 <- tmp2 != 3
1042
+		proportionSamplesAltered <- rowMeans(mostLikelyState != 3)
1040 1043
 	}else{
1041 1044
 		##should also consider pseud
1045
+		browser()
1042 1046
 		gender <- envir[["gender"]]
1043 1047
 		tmp3 <- tmp2
1044 1048
 		tmp3[, gender=="male"] <- tmp2[, gender=="male"] != 2
1045 1049
 		tmp3[, gender=="female"] <- tmp2[, gender=="female"] != 3
1046 1050
 	}
1047 1051
 	##Those near 1 have NaNs for nu and phi.  this occurs by NaNs in the muA[,, "A"] or muA[, , "B"] for X chromosome
1048
-	propAlt <- rowMeans(tmp3)##prop normal
1049
-	##ii <- propAlt < PROP
1050
-	ii <- propAlt > 0.05
1052
+	##ii <- proportionSamplesAltered < PROP
1053
+	##ii <- proportionSamplesAltered > 0.05
1054
+	##Figure out why the proportion altered can be near 1...
1055
+	ii <- proportionSamplesAltered < 0.8 & proportionSamplesAltered > 0.01
1051 1056
 	##only exclude observations from one tail, depending on
1052 1057
 	##whether more are up or down
1053 1058
 	if(CHR != 23){
1054
-		moreup <- rowSums(tmp2 > 3) > rowSums(tmp2 < 3) ##3 is normal
1055
-		down <-  tmp2[ii & moreup, ] <= 3
1056
-		up <- tmp2[ii & !moreup, ] >= 3
1057
-		##What if we just keep X% of the ones with the highest
1058
-		##posterior probability for normal
1059
-##		prNorm <- tmp[, , 3]
1060
-##		rankNorm <- t(apply(prNorm, 1, order, decreasing=TRUE))
1061
-
1062
-		rankUP <- t(apply(tmp[moreup & ii, , 4]/tmp[moreup & ii, , 3], 1, order, decreasing=TRUE))
1063
-		rankDown <- t(apply(tmp[!moreup & ii, , 2]/tmp[!moreup & ii, , 3], 1, order, decreasing=TRUE))		
1064
-		maxNumberToDrop <- round(0.25*ncol(tmp2),0)
1065
-##		NORM <- rankNorm <= percentage
1066
-		NORM <- matrix(TRUE, nrow(A), ncol(A))
1067
-		NORM[ii & moreup, ] <- rankUP >= maxNumberToDrop
1068
-		NORM[ii & !moreup, ] <- rankDown >= maxNumberToDrop
1059
+		moreup <- rowSums(mostLikelyState > 3) > rowSums(mostLikelyState < 3) ##3 is normal
1060
+		NORM <- matrix(FALSE, nrow(A), ncol(A))
1061
+		NORM[proportionSamplesAltered > 0.8, ] <- FALSE
1062
+		##big and greater than 1 if copy number 3 is more likely
1063
+		ratioUp <- posteriorProb[, , 4]/posteriorProb[, , 3]
1064
+		##large values will have small ranks
1065
+		##rankUP <- t(apply(ratio, 1, rank))
1066
+		## drop small ranks up to the maximum number, unless the ratio is greater than 1 
1067
+		##NORM[ii & moreup, ] <- !(rankUP <= maxNumberToDrop) | (ratio < 1)
1068
+		NORM[ii & moreup, ] <- ratioUp[moreup & ii] < 1  ##normal more likely
1069
+		##big and greater than 1 if copy number 1 is more likely than 2
1070
+		ratioDown <- posteriorProb[, , 2]/posteriorProb[, , 3]
1071
+		##big values have small ranks
1072
+		##rankDown <- t(apply(ratio, 1, rank))		
1073
+		##NORM[ii & !moreup, ] <- !(rankDown <= maxNumberToDrop) | (ratio < 1)
1074
+		NORM[ii & !moreup, ] <- ratioDown[!moreup & ii] < 1  ##normal more likely
1069 1075
 ##		NORM[ii & !moreup, ] <- up
1070 1076
 		##Define NORM so that we can iterate this step
1071 1077
 		##NA's in the previous iteration (normal) will be propogated
1072 1078
 		normal <- NORM*normal
1073 1079
 	} else{
1074
-		fem <- tmp2[, gender=="female"]
1075
-		mal <- tmp2[, gender=="male"]
1080
+		fem <- mostLikelyState[, gender=="female"]
1081
+		mal <- mostLikelyState[, gender=="male"]
1076 1082
 		moreupF <- rowSums(fem > 3) > rowSums(fem < 3)
1077 1083
 		moreupM <- rowSums(mal > 2) > rowSums(mal < 2)
1078 1084
 		notUpF <-  fem[ii & moreupF, ] <= 3
... ...
@@ -1089,7 +1095,7 @@ biasAdj <- function(plateIndex, envir, priorProb, PROP=0.75){
1089 1095
 		normal[, gender=="female"] <- normalF
1090 1096
 		normal[, gender=="male"] <- normalM
1091 1097
 	}
1092
-	flagAltered <- which(propAlt > 0.5)
1098
+	flagAltered <- which(proportionSamplesAltered > 0.5)
1093 1099
 	envir[["flagAltered"]] <- flagAltered
1094 1100
 	normal[normal == FALSE] <- NA
1095 1101
 	envir[["normal"]] <- normal
... ...
@@ -1285,10 +1291,6 @@ expectedC <- function(plateIndex, envir, priorProb, cnStates=0:6){
1285 1291
 	return(posteriorMode)
1286 1292
 }
1287 1293
 
1288
-
1289
-
1290
-
1291
-
1292 1294
 biasAdjNP <- function(plateIndex, envir, priorProb){
1293 1295
 	p <- plateIndex
1294 1296
 	normalNP <- envir[["normalNP"]]
... ...
@@ -1297,56 +1299,62 @@ biasAdjNP <- function(plateIndex, envir, priorProb){
1297 1299
 	plate <- envir[["plate"]]
1298 1300
 	uplate <- envir[["uplate"]]
1299 1301
 	sig2T <- envir[["sig2T"]]
1302
+	normalNP <- normalNP[, plate==uplate[p]]	
1300 1303
 	NP <- NP[, plate==uplate[p]]
1301
-	sig2T <- sig2T[, plate==uplate[p]]
1304
+	##sig2T <- sig2T[, plate==uplate[p]]
1305
+	sig2T <- sig2T[, p]
1306
+
1302 1307
 
1303 1308
 	##Assume that on the log-scale, that the background variance is the same...
1304 1309
 	tau2T <- sig2T	
1305 1310
 	nuT <- envir[["nuT"]]
1311
+	nuT <- nuT[, p]
1306 1312
 	phiT <- envir[["phiT"]]
1307
-	p <- plateIndex
1308
-	plate <- envir[["plate"]]
1313
+	phiT <- phiT[, p]
1314
+	
1309 1315
 	if(missing(priorProb)) priorProb <- rep(1/4, 4) ##uniform
1310 1316
 	emit <- array(NA, dim=c(nrow(NP), ncol(NP), 4))##SNPs x sample x 'truth'	
1311 1317
 	lT <- log2(NP)
1312 1318
 	counter <- 1 ##state counter
1313 1319
 	for(CT in 0:3){
1314 1320
 		sds <- sqrt(tau2T*(CT==0) + sig2T*(CT > 0))
1315
-		means <- suppressWarnings(log2(nuT[, p]+CT*phiT[, p]))
1321
+		means <- suppressWarnings(log2(nuT+CT*phiT))
1316 1322
 		tmp <- dnorm(lT, mean=means, sd=sds)
1317 1323
 		emit[, , counter] <- tmp
1318 1324
 		counter <- counter+1
1319 1325
 	}
1320
-	tmp2 <- apply(emit, c(1, 2), function(x) order(x, decreasing=TRUE)[1])
1326
+	mostLikelyState <- apply(emit, c(1, 2), function(x) order(x, decreasing=TRUE)[1])
1321 1327
 	##Adjust for SNPs that have less than 80% of the samples in an altered state
1322 1328
 	##flag the remainder?
1323 1329
 	if(CHR != 23){
1324
-		tmp3 <- tmp2 != 3
1330
+		tmp3 <- mostLikelyState != 3
1325 1331
 	}else{
1326 1332
 		browser()
1327 1333
 		##should also consider pseudoautosomal
1328 1334
 		gender <- envir[["gender"]]
1329
-		tmp3 <- tmp2
1330
-		tmp3[, gender=="male"] <- tmp2[, gender=="male"] != 2
1331
-		tmp3[, gender=="female"] <- tmp2[, gender=="female"] != 3
1335
+		tmp3 <- mostLikelyState
1336
+		tmp3[, gender=="male"] <- mostLikelyState[, gender=="male"] != 2
1337
+		tmp3[, gender=="female"] <- mostLikelyState[, gender=="female"] != 3
1332 1338
 	}
1333 1339
 	##Those near 1 have NaNs for nu and phi.  this occurs by NaNs in the muA[,, "A"] or muA[, , "B"] for X chromosome
1334
-	propAlt <- rowMeans(tmp3)##prop normal
1335
-	ii <- propAlt < 0.75
1340
+	proportionSamplesAltered <- rowMeans(tmp3)##prop normal
1341
+	ii <- proportionSamplesAltered < 0.75
1336 1342
 	##only exclude observations from one tail, depending on
1337 1343
 	##whether more are up or down
1338 1344
 	if(CHR != 23){
1339
-		moreup <- rowSums(tmp2 > 3) > rowSums(tmp2 < 3)
1340
-		notUp <-  tmp2[ii & moreup, ] <= 3
1341
-		notDown <- tmp2[ii & !moreup, ] >= 3
1345
+		moreup <- rowSums(mostLikelyState > 3) > rowSums(mostLikelyState < 3)
1346
+		notUp <-  mostLikelyState[ii & moreup, ] <= 3
1347
+		notDown <- mostLikelyState[ii & !moreup, ] >= 3
1342 1348
 		NORM <- matrix(TRUE, nrow(NP), ncol(NP))
1343 1349
 		NORM[ii & moreup, ] <- notUp
1344 1350
 		NORM[ii & !moreup, ] <- notDown
1345 1351
 		normalNP <- normalNP*NORM
1346 1352
 	}
1347
-	flagAltered <- which(propAlt > 0.5)
1353
+	flagAltered <- which(proportionSamplesAltered > 0.5)
1348 1354
 	envir[["flagAlteredNP"]] <- flagAltered
1349 1355
 	normalNP[normalNP == FALSE] <- NA
1350
-	envir[["normalNP"]] <- normalNP
1356
+	tmp <- envir[["normalNP"]]
1357
+	tmp[, plate==uplate[p]] <- normalNP
1358
+	envir[["normalNP"]] <- tmp
1351 1359
 }
1352 1360
 
... ...
@@ -14,8 +14,8 @@
14 14
 \newcommand{\R}{\textsf{R}}
15 15
 
16 16
 \begin{document}
17
-\title{Estimating copy number for Affymetrix 6.0  with the crlmm Package}
18
-\date{February, 2009}
17
+\title{Trisomy analysis}
18
+\date{May, 2009}
19 19
 \author{Rob Scharpf}
20 20
 \maketitle
21 21