git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@40036 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -316,6 +316,10 @@ computeCopynumber <- function(chrom, |
316 | 316 |
cdfName="genomewidesnp6", |
317 | 317 |
verbose=TRUE, ...){ |
318 | 318 |
require(paste(cdfName, "Crlmm", sep=""), character.only=TRUE) || stop(paste("cdf ", cdfName, "Crlmm", " not available.", sep="")) |
319 |
+ if(!missing(plate)){ |
|
320 |
+ if(length(plate) != ncol(A)) |
|
321 |
+ stop("plate must the same length as the number of columns of A") |
|
322 |
+ } |
|
319 | 323 |
set.seed(seed) |
320 | 324 |
if(missing(chrom)) stop("must specify chromosome") |
321 | 325 |
if(length(ls(envir)) == 0) { |
... | ... |
@@ -517,6 +521,7 @@ nonpolymorphic <- function(plateIndex, NP, envir, CONF.THR=0.99, DF.PRIOR=50, pk |
517 | 521 |
##--------------------------------------------------------------------------- |
518 | 522 |
} |
519 | 523 |
|
524 |
+##sufficient statistics on the intensity scale |
|
520 | 525 |
withinGenotypeMoments <- function(p, A, B, calls, conf, CONF.THR, DF.PRIOR, envir){ |
521 | 526 |
CHR <- envir[["chrom"]] |
522 | 527 |
Ns <- envir[["Ns"]] |
... | ... |
@@ -714,14 +719,14 @@ oneBatch <- function(plateIndex, |
714 | 719 |
snpflags[, p] <- index[[1]] | index[[2]] | index[[3]] |
715 | 720 |
|
716 | 721 |
##--------------------------------------------------------------------------- |
717 |
- ## Two genotype clusters not observed -- would sequence help? (didn't seem that helpful) |
|
718 |
- ## 1 extract index of complete data |
|
719 |
- ## 2 Regress mu1,mu3 ~ sequence + mu2 |
|
720 |
- ## 3 Predict mu1*, mu3* for missing genotypes |
|
722 |
+ ## Two genotype clusters not observed -- would sequence help? (didn't seem to) |
|
723 |
+ ## 1. extract index of complete data |
|
724 |
+ ## 2. Regress mu_missing ~ sequence + mu_observed |
|
725 |
+ ## 3. solve for nu assuming the median is 2 |
|
721 | 726 |
##--------------------------------------------------------------------------- |
722 | 727 |
cols <- c(3, 1, 2) |
723 | 728 |
for(j in 1:3){ |
724 |
- if(sum(index[[1]]) == 0) next() |
|
729 |
+ if(sum(index[[j]]) == 0) next() |
|
725 | 730 |
k <- cols[j] |
726 | 731 |
X <- cbind(1, mnA[index.complete, k], mnB[index.complete, k]) |
727 | 732 |
Y <- cbind(mnA[index.complete, -k], |
... | ... |
@@ -732,7 +737,6 @@ oneBatch <- function(plateIndex, |
732 | 737 |
muA[index[[j]], p, -c(1, 2, k+2)] <- mus[, 1:2] |
733 | 738 |
muB[index[[j]], p, -c(1, 2, k+2)] <- mus[, 3:4] |
734 | 739 |
} |
735 |
- |
|
736 | 740 |
negA <- rowSums(muA[, p, ] < 0) > 0 |
737 | 741 |
negB <- rowSums(muB[, p, ] < 0) > 0 |
738 | 742 |
snpflags[, p] <- snpflags[, p] | negA | negB | rowSums(is.na(muA[, p, 3:5]), na.rm=TRUE) > 0 |
... | ... |
@@ -748,6 +752,7 @@ oneBatch <- function(plateIndex, |
748 | 752 |
envir[["plates.completed"]] <- plates.completed |
749 | 753 |
} |
750 | 754 |
|
755 |
+##Estimate tau2, sigma2, and correlation |
|
751 | 756 |
locationAndScale <- function(p, GT.A, GT.B, index, envir, DF.PRIOR){ |
752 | 757 |
tau2A <- envir[["tau2A"]] |
753 | 758 |
tau2B <- envir[["tau2B"]] |
... | ... |
@@ -997,20 +1002,18 @@ biasAdj <- function(plateIndex, envir, priorProb, PROP=0.75){ |
997 | 1002 |
if(CA > 0 & CB > 0) rho <- corr[, p] |
998 | 1003 |
if(CHR == 23){ |
999 | 1004 |
means <- cbind(suppressWarnings(log2(nuA[, p]+CA*phiA[, p] + CB*phiAx[, p])), suppressWarnings(log2(nuB[, p]+CB*phiB[, p] + CA*phiBx[, p]))) |
1005 |
+ browser() |
|
1000 | 1006 |
} else{ |
1001 | 1007 |
means <- cbind(suppressWarnings(log2(nuA[, p]+CA*phiA[, p])), suppressWarnings(log2(nuB[, p]+CB*phiB[, p]))) |
1008 |
+ meanA <- suppressWarnings(log2(nuA[, p]+CA*phiA[, p])) |
|
1009 |
+ meanB <- suppressWarnings(log2(nuB[, p]+CB*phiB[, p])) |
|
1010 |
+ covs <- rho*A.scale*B.scale |
|
1011 |
+ A.scale2 <- A.scale^2 |
|
1012 |
+ B.scale2 <- B.scale^2 |
|
1002 | 1013 |
} |
1003 |
- covs <- rho*A.scale*B.scale |
|
1004 |
- A.scale2 <- A.scale^2 |
|
1005 |
- B.scale2 <- B.scale^2 |
|
1006 |
- m <- 1 |
|
1007 |
- for(i in 1:nrow(A)){ |
|
1008 |
- Sigma <- matrix(c(A.scale2[i], covs[i], covs[i], B.scale2[i]), 2,2) |
|
1009 |
- xx <- matrix(X[i, ], ncol=2) |
|
1010 |
- tmp <- dmvnorm(xx, mean=means[i, ], sigma=Sigma) |
|
1011 |
- emit[m, , counter] <- tmp |
|
1012 |
- m <- m+1 |
|
1013 |
- } |
|
1014 |
+ Q.x.y <- 1/(1-rho^2)*(((lA - meanA)/A.scale)^2 + ((lB - meanB)/B.scale)^2 - 2*rho*((lA - meanA)*(lB - meanB))/(A.scale*B.scale)) |
|
1015 |
+ f.x.y <- 1/(2*pi*A.scale*B.scale*sqrt(1-rho^2))*exp(-0.5*Q.x.y) |
|
1016 |
+ emit[, , counter] <- f.x.y |
|
1014 | 1017 |
counter <- counter+1 |
1015 | 1018 |
} |
1016 | 1019 |
} |
... | ... |
@@ -1019,60 +1022,63 @@ biasAdj <- function(plateIndex, envir, priorProb, PROP=0.75){ |
1019 | 1022 |
norm <- priorProb[3]*emit[, , 4:6] |
1020 | 1023 |
amp <- priorProb[4]*emit[, , 7:10] |
1021 | 1024 |
##sum over the different combinations within each copy number state |
1022 |
- hemDel <- apply(hemDel, c(1,2), sum) |
|
1023 |
- norm <- apply(norm, c(1, 2), sum) |
|
1024 |
- amp <- apply(amp, c(1,2), sum) |
|
1025 |
+ hemDel <- hemDel[, , 1] + hemDel[, , 2] |
|
1026 |
+ norm <- norm[, , 1] + norm[, , 2] + norm[, , 3] |
|
1027 |
+ amp <- amp[, , 1] + amp[, , 2] + amp[ , , 3] + amp[, , 4] |
|
1025 | 1028 |
total <- hemDel + norm + amp |
1026 | 1029 |
hemDel <- hemDel/total |
1027 | 1030 |
norm <- norm/total |
1028 | 1031 |
amp <- amp/total |
1029 | 1032 |
envir[["posteriorProb"]] <- list(hemDel=hemDel, norm=norm, amp=amp) |
1030 |
- tmp <- array(NA, dim=c(nrow(A), ncol(A), 4)) |
|
1031 |
- tmp[, , 1] <- homDel |
|
1032 |
- tmp[, , 2] <- hemDel |
|
1033 |
- tmp[, , 3] <- norm |
|
1034 |
- tmp[, , 4] <- amp |
|
1035 |
- tmp2 <- apply(tmp, c(1, 2), function(x) order(x, decreasing=TRUE)[1]) |
|
1033 |
+ posteriorProb <- array(NA, dim=c(nrow(A), ncol(A), 4)) |
|
1034 |
+ posteriorProb[, , 1] <- homDel |
|
1035 |
+ posteriorProb[, , 2] <- hemDel |
|
1036 |
+ posteriorProb[, , 3] <- norm |
|
1037 |
+ posteriorProb[, , 4] <- amp |
|
1038 |
+ mostLikelyState <- apply(posteriorProb, c(1, 2), function(x) order(x, decreasing=TRUE)[1]) |
|
1036 | 1039 |
##Adjust for SNPs that have less than 80% of the samples in an altered state |
1037 | 1040 |
##flag the remainder? |
1038 | 1041 |
if(CHR != 23){ |
1039 |
- tmp3 <- tmp2 != 3 |
|
1042 |
+ proportionSamplesAltered <- rowMeans(mostLikelyState != 3) |
|
1040 | 1043 |
}else{ |
1041 | 1044 |
##should also consider pseud |
1045 |
+ browser() |
|
1042 | 1046 |
gender <- envir[["gender"]] |
1043 | 1047 |
tmp3 <- tmp2 |
1044 | 1048 |
tmp3[, gender=="male"] <- tmp2[, gender=="male"] != 2 |
1045 | 1049 |
tmp3[, gender=="female"] <- tmp2[, gender=="female"] != 3 |
1046 | 1050 |
} |
1047 | 1051 |
##Those near 1 have NaNs for nu and phi. this occurs by NaNs in the muA[,, "A"] or muA[, , "B"] for X chromosome |
1048 |
- propAlt <- rowMeans(tmp3)##prop normal |
|
1049 |
- ##ii <- propAlt < PROP |
|
1050 |
- ii <- propAlt > 0.05 |
|
1052 |
+ ##ii <- proportionSamplesAltered < PROP |
|
1053 |
+ ##ii <- proportionSamplesAltered > 0.05 |
|
1054 |
+ ##Figure out why the proportion altered can be near 1... |
|
1055 |
+ ii <- proportionSamplesAltered < 0.8 & proportionSamplesAltered > 0.01 |
|
1051 | 1056 |
##only exclude observations from one tail, depending on |
1052 | 1057 |
##whether more are up or down |
1053 | 1058 |
if(CHR != 23){ |
1054 |
- moreup <- rowSums(tmp2 > 3) > rowSums(tmp2 < 3) ##3 is normal |
|
1055 |
- down <- tmp2[ii & moreup, ] <= 3 |
|
1056 |
- up <- tmp2[ii & !moreup, ] >= 3 |
|
1057 |
- ##What if we just keep X% of the ones with the highest |
|
1058 |
- ##posterior probability for normal |
|
1059 |
-## prNorm <- tmp[, , 3] |
|
1060 |
-## rankNorm <- t(apply(prNorm, 1, order, decreasing=TRUE)) |
|
1061 |
- |
|
1062 |
- rankUP <- t(apply(tmp[moreup & ii, , 4]/tmp[moreup & ii, , 3], 1, order, decreasing=TRUE)) |
|
1063 |
- rankDown <- t(apply(tmp[!moreup & ii, , 2]/tmp[!moreup & ii, , 3], 1, order, decreasing=TRUE)) |
|
1064 |
- maxNumberToDrop <- round(0.25*ncol(tmp2),0) |
|
1065 |
-## NORM <- rankNorm <= percentage |
|
1066 |
- NORM <- matrix(TRUE, nrow(A), ncol(A)) |
|
1067 |
- NORM[ii & moreup, ] <- rankUP >= maxNumberToDrop |
|
1068 |
- NORM[ii & !moreup, ] <- rankDown >= maxNumberToDrop |
|
1059 |
+ moreup <- rowSums(mostLikelyState > 3) > rowSums(mostLikelyState < 3) ##3 is normal |
|
1060 |
+ NORM <- matrix(FALSE, nrow(A), ncol(A)) |
|
1061 |
+ NORM[proportionSamplesAltered > 0.8, ] <- FALSE |
|
1062 |
+ ##big and greater than 1 if copy number 3 is more likely |
|
1063 |
+ ratioUp <- posteriorProb[, , 4]/posteriorProb[, , 3] |
|
1064 |
+ ##large values will have small ranks |
|
1065 |
+ ##rankUP <- t(apply(ratio, 1, rank)) |
|
1066 |
+ ## drop small ranks up to the maximum number, unless the ratio is greater than 1 |
|
1067 |
+ ##NORM[ii & moreup, ] <- !(rankUP <= maxNumberToDrop) | (ratio < 1) |
|
1068 |
+ NORM[ii & moreup, ] <- ratioUp[moreup & ii] < 1 ##normal more likely |
|
1069 |
+ ##big and greater than 1 if copy number 1 is more likely than 2 |
|
1070 |
+ ratioDown <- posteriorProb[, , 2]/posteriorProb[, , 3] |
|
1071 |
+ ##big values have small ranks |
|
1072 |
+ ##rankDown <- t(apply(ratio, 1, rank)) |
|
1073 |
+ ##NORM[ii & !moreup, ] <- !(rankDown <= maxNumberToDrop) | (ratio < 1) |
|
1074 |
+ NORM[ii & !moreup, ] <- ratioDown[!moreup & ii] < 1 ##normal more likely |
|
1069 | 1075 |
## NORM[ii & !moreup, ] <- up |
1070 | 1076 |
##Define NORM so that we can iterate this step |
1071 | 1077 |
##NA's in the previous iteration (normal) will be propogated |
1072 | 1078 |
normal <- NORM*normal |
1073 | 1079 |
} else{ |
1074 |
- fem <- tmp2[, gender=="female"] |
|
1075 |
- mal <- tmp2[, gender=="male"] |
|
1080 |
+ fem <- mostLikelyState[, gender=="female"] |
|
1081 |
+ mal <- mostLikelyState[, gender=="male"] |
|
1076 | 1082 |
moreupF <- rowSums(fem > 3) > rowSums(fem < 3) |
1077 | 1083 |
moreupM <- rowSums(mal > 2) > rowSums(mal < 2) |
1078 | 1084 |
notUpF <- fem[ii & moreupF, ] <= 3 |
... | ... |
@@ -1089,7 +1095,7 @@ biasAdj <- function(plateIndex, envir, priorProb, PROP=0.75){ |
1089 | 1095 |
normal[, gender=="female"] <- normalF |
1090 | 1096 |
normal[, gender=="male"] <- normalM |
1091 | 1097 |
} |
1092 |
- flagAltered <- which(propAlt > 0.5) |
|
1098 |
+ flagAltered <- which(proportionSamplesAltered > 0.5) |
|
1093 | 1099 |
envir[["flagAltered"]] <- flagAltered |
1094 | 1100 |
normal[normal == FALSE] <- NA |
1095 | 1101 |
envir[["normal"]] <- normal |
... | ... |
@@ -1285,10 +1291,6 @@ expectedC <- function(plateIndex, envir, priorProb, cnStates=0:6){ |
1285 | 1291 |
return(posteriorMode) |
1286 | 1292 |
} |
1287 | 1293 |
|
1288 |
- |
|
1289 |
- |
|
1290 |
- |
|
1291 |
- |
|
1292 | 1294 |
biasAdjNP <- function(plateIndex, envir, priorProb){ |
1293 | 1295 |
p <- plateIndex |
1294 | 1296 |
normalNP <- envir[["normalNP"]] |
... | ... |
@@ -1297,56 +1299,62 @@ biasAdjNP <- function(plateIndex, envir, priorProb){ |
1297 | 1299 |
plate <- envir[["plate"]] |
1298 | 1300 |
uplate <- envir[["uplate"]] |
1299 | 1301 |
sig2T <- envir[["sig2T"]] |
1302 |
+ normalNP <- normalNP[, plate==uplate[p]] |
|
1300 | 1303 |
NP <- NP[, plate==uplate[p]] |
1301 |
- sig2T <- sig2T[, plate==uplate[p]] |
|
1304 |
+ ##sig2T <- sig2T[, plate==uplate[p]] |
|
1305 |
+ sig2T <- sig2T[, p] |
|
1306 |
+ |
|
1302 | 1307 |
|
1303 | 1308 |
##Assume that on the log-scale, that the background variance is the same... |
1304 | 1309 |
tau2T <- sig2T |
1305 | 1310 |
nuT <- envir[["nuT"]] |
1311 |
+ nuT <- nuT[, p] |
|
1306 | 1312 |
phiT <- envir[["phiT"]] |
1307 |
- p <- plateIndex |
|
1308 |
- plate <- envir[["plate"]] |
|
1313 |
+ phiT <- phiT[, p] |
|
1314 |
+ |
|
1309 | 1315 |
if(missing(priorProb)) priorProb <- rep(1/4, 4) ##uniform |
1310 | 1316 |
emit <- array(NA, dim=c(nrow(NP), ncol(NP), 4))##SNPs x sample x 'truth' |
1311 | 1317 |
lT <- log2(NP) |
1312 | 1318 |
counter <- 1 ##state counter |
1313 | 1319 |
for(CT in 0:3){ |
1314 | 1320 |
sds <- sqrt(tau2T*(CT==0) + sig2T*(CT > 0)) |
1315 |
- means <- suppressWarnings(log2(nuT[, p]+CT*phiT[, p])) |
|
1321 |
+ means <- suppressWarnings(log2(nuT+CT*phiT)) |
|
1316 | 1322 |
tmp <- dnorm(lT, mean=means, sd=sds) |
1317 | 1323 |
emit[, , counter] <- tmp |
1318 | 1324 |
counter <- counter+1 |
1319 | 1325 |
} |
1320 |
- tmp2 <- apply(emit, c(1, 2), function(x) order(x, decreasing=TRUE)[1]) |
|
1326 |
+ mostLikelyState <- apply(emit, c(1, 2), function(x) order(x, decreasing=TRUE)[1]) |
|
1321 | 1327 |
##Adjust for SNPs that have less than 80% of the samples in an altered state |
1322 | 1328 |
##flag the remainder? |
1323 | 1329 |
if(CHR != 23){ |
1324 |
- tmp3 <- tmp2 != 3 |
|
1330 |
+ tmp3 <- mostLikelyState != 3 |
|
1325 | 1331 |
}else{ |
1326 | 1332 |
browser() |
1327 | 1333 |
##should also consider pseudoautosomal |
1328 | 1334 |
gender <- envir[["gender"]] |
1329 |
- tmp3 <- tmp2 |
|
1330 |
- tmp3[, gender=="male"] <- tmp2[, gender=="male"] != 2 |
|
1331 |
- tmp3[, gender=="female"] <- tmp2[, gender=="female"] != 3 |
|
1335 |
+ tmp3 <- mostLikelyState |
|
1336 |
+ tmp3[, gender=="male"] <- mostLikelyState[, gender=="male"] != 2 |
|
1337 |
+ tmp3[, gender=="female"] <- mostLikelyState[, gender=="female"] != 3 |
|
1332 | 1338 |
} |
1333 | 1339 |
##Those near 1 have NaNs for nu and phi. this occurs by NaNs in the muA[,, "A"] or muA[, , "B"] for X chromosome |
1334 |
- propAlt <- rowMeans(tmp3)##prop normal |
|
1335 |
- ii <- propAlt < 0.75 |
|
1340 |
+ proportionSamplesAltered <- rowMeans(tmp3)##prop normal |
|
1341 |
+ ii <- proportionSamplesAltered < 0.75 |
|
1336 | 1342 |
##only exclude observations from one tail, depending on |
1337 | 1343 |
##whether more are up or down |
1338 | 1344 |
if(CHR != 23){ |
1339 |
- moreup <- rowSums(tmp2 > 3) > rowSums(tmp2 < 3) |
|
1340 |
- notUp <- tmp2[ii & moreup, ] <= 3 |
|
1341 |
- notDown <- tmp2[ii & !moreup, ] >= 3 |
|
1345 |
+ moreup <- rowSums(mostLikelyState > 3) > rowSums(mostLikelyState < 3) |
|
1346 |
+ notUp <- mostLikelyState[ii & moreup, ] <= 3 |
|
1347 |
+ notDown <- mostLikelyState[ii & !moreup, ] >= 3 |
|
1342 | 1348 |
NORM <- matrix(TRUE, nrow(NP), ncol(NP)) |
1343 | 1349 |
NORM[ii & moreup, ] <- notUp |
1344 | 1350 |
NORM[ii & !moreup, ] <- notDown |
1345 | 1351 |
normalNP <- normalNP*NORM |
1346 | 1352 |
} |
1347 |
- flagAltered <- which(propAlt > 0.5) |
|
1353 |
+ flagAltered <- which(proportionSamplesAltered > 0.5) |
|
1348 | 1354 |
envir[["flagAlteredNP"]] <- flagAltered |
1349 | 1355 |
normalNP[normalNP == FALSE] <- NA |
1350 |
- envir[["normalNP"]] <- normalNP |
|
1356 |
+ tmp <- envir[["normalNP"]] |
|
1357 |
+ tmp[, plate==uplate[p]] <- normalNP |
|
1358 |
+ envir[["normalNP"]] <- tmp |
|
1351 | 1359 |
} |
1352 | 1360 |
|