git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@37630 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -1,14 +1,14 @@ |
1 | 1 |
Package: crlmm |
2 | 2 |
Type: Package |
3 | 3 |
Title: Genotype Calling via CRLMM Algorithm |
4 |
-Version: 1.0.34 |
|
4 |
+Version: 1.0.35 |
|
5 | 5 |
Date: 2008-12-28 |
6 | 6 |
Author: Rafael A Irizarry |
7 | 7 |
Maintainer: Benilton S Carvalho <bcarvalh@jhsph.edu> |
8 | 8 |
Description: Faster implementation of CRLMM specific to SNP 5.0 and 6.0 arrays. |
9 |
-Imports: affyio, preprocessCore, utils, stats |
|
9 |
+Imports: affyio, preprocessCore, utils, stats, genefilter, splines, Biobase |
|
10 | 10 |
License: Artistic-2.0 |
11 |
-Suggests: hapmapsnp5, genomewidesnp5Crlmm, methods |
|
11 |
+Suggests: hapmapsnp5, genomewidesnp5Crlmm, genomewidesnp6Crlmm, methods |
|
12 | 12 |
Collate: crlmmGT.R |
13 | 13 |
crlmm.R |
14 | 14 |
fitAffySnpMixture56.R |
... | ... |
@@ -17,4 +17,5 @@ Collate: crlmmGT.R |
17 | 17 |
zzz.R |
18 | 18 |
crlmmGTnm.R |
19 | 19 |
crlmmnm.R |
20 |
+ functions.R |
|
20 | 21 |
LazyLoad: yes |
... | ... |
@@ -1,6 +1,9 @@ |
1 | 1 |
useDynLib("crlmm", .registration=TRUE) |
2 |
-export("crlmm", "list.celfiles") |
|
2 |
+export("crlmm", "list.celfiles", "computeCnBatch") |
|
3 | 3 |
importFrom(affyio, read.celfile.header, read.celfile) |
4 | 4 |
importFrom(preprocessCore, normalize.quantiles.use.target) |
5 | 5 |
importFrom(utils, data, packageDescription, setTxtProgressBar, txtProgressBar) |
6 | 6 |
importFrom(stats, coef, cov, dnorm, kmeans, lm, mad, median, quantile, sd) |
7 |
+##RS |
|
8 |
+importFrom(Biobase, rowMedians) |
|
9 |
+importFrom(genefilter, rowSds) |
... | ... |
@@ -5,7 +5,7 @@ crlmm <- function(filenames, row.names=TRUE, col.names=TRUE, |
5 | 5 |
cdfName, sns, recallMin=10, recallRegMin=1000, |
6 | 6 |
returnParams=FALSE, badSNP=.7){ |
7 | 7 |
if ((load.it | save.it) & missing(intensityFile)) |
8 |
- stop("'intensityFile' is missing, and you chose either load.it or save.it") |
|
8 |
+ stop("'intensityFile' is missing, and you chose either load.it or save.it") |
|
9 | 9 |
|
10 | 10 |
if (missing(sns)) sns <- basename(filenames) |
11 | 11 |
if (!missing(intensityFile)) |
... | ... |
@@ -33,7 +33,6 @@ crlmm <- function(filenames, row.names=TRUE, col.names=TRUE, |
33 | 33 |
} |
34 | 34 |
if(row.names) row.names=res$gns else row.names=NULL |
35 | 35 |
if(col.names) col.names=res$sns else col.names=NULL |
36 |
- |
|
37 | 36 |
res2 <- crlmmGT(res[["A"]], res[["B"]], res[["SNR"]], |
38 | 37 |
res[["mixtureParams"]], res[["cdfName"]], |
39 | 38 |
gender=gender, row.names=row.names, |
... | ... |
@@ -36,13 +36,13 @@ crlmmGT <- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL, |
36 | 36 |
|
37 | 37 |
##IF gender not provide, we predict |
38 | 38 |
if(is.null(gender)){ |
39 |
- if(verbose) message("Determining gender.") |
|
40 |
- XMedian <- apply(log2(A[XIndex,, drop=FALSE])+log2(B[XIndex,, drop=FALSE]), 2, median)/2 |
|
41 |
- if(sum(SNR>SNRMin)==1){ |
|
42 |
- gender <- which.min(c(abs(XMedian-8.9), abs(XMedian-9.5))) |
|
43 |
- }else{ |
|
44 |
- gender <- kmeans(XMedian, c(min(XMedian[SNR>SNRMin]), max(XMedian[SNR>SNRMin])))[["cluster"]] |
|
45 |
- } |
|
39 |
+ if(verbose) message("Determining gender.") |
|
40 |
+ XMedian <- apply(log2(A[XIndex,, drop=FALSE])+log2(B[XIndex,, drop=FALSE]), 2, median)/2 |
|
41 |
+ if(sum(SNR>SNRMin)==1){ |
|
42 |
+ gender <- which.min(c(abs(XMedian-8.9), abs(XMedian-9.5))) |
|
43 |
+ }else{ |
|
44 |
+ gender <- kmeans(XMedian, c(min(XMedian[SNR>SNRMin]), max(XMedian[SNR>SNRMin])))[["cluster"]] |
|
45 |
+ } |
|
46 | 46 |
} |
47 | 47 |
|
48 | 48 |
Indexes <- list(autosomeIndex, XIndex, YIndex) |
49 | 49 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,984 @@ |
1 |
+rowCovs <- function(x, y, ...){ |
|
2 |
+ notna <- !is.na(x) |
|
3 |
+ N <- rowSums(notna) |
|
4 |
+ x <- suppressWarnings(log2(x)) |
|
5 |
+ if(!missing(y)) y <- suppressWarnings(log2(y)) |
|
6 |
+ return(rowSums((x - rowMeans(x, ...)) * (y - rowMeans(y, ...)), ...)/(N-1)) |
|
7 |
+} |
|
8 |
+ |
|
9 |
+rowMAD <- function(x, y, ...){ |
|
10 |
+ notna <- !is.na(x) |
|
11 |
+ mad <- 1.4826*rowMedians(abs(x-rowMedians(x, ...)), ...) |
|
12 |
+ return(mad) |
|
13 |
+} |
|
14 |
+ |
|
15 |
+rowCors <- function(x, y, ...){ |
|
16 |
+ N <- rowSums(!is.na(x)) |
|
17 |
+ x <- suppressWarnings(log2(x)) |
|
18 |
+ y <- suppressWarnings(log2(y)) |
|
19 |
+ sd.x <- rowSds(x, ...) |
|
20 |
+ sd.y <- rowSds(y, ...) |
|
21 |
+ covar <- rowSums((x - rowMeans(x, ...)) * (y - rowMeans(y, ...)), ...)/(N-1) |
|
22 |
+ return(covar/(sd.x*sd.y)) |
|
23 |
+} |
|
24 |
+ |
|
25 |
+nuphiAllele <- function(allele, Ystar, W, envir, p){ |
|
26 |
+ Ns <- get("Ns", envir) |
|
27 |
+ NOHET <- mean(Ns[, p, 2], na.rm=TRUE) < 0.05 |
|
28 |
+ chrom <- get("chrom", envir) |
|
29 |
+ if(missing(allele)) stop("must specify allele") |
|
30 |
+ if(chrom == 23){ |
|
31 |
+ gender <- get("gender", envir) |
|
32 |
+ if(allele == "A" & gender == "female") |
|
33 |
+ X <- cbind(1, 2:0) |
|
34 |
+ if(allele == "B" & gender == "female") |
|
35 |
+ X <- cbind(1, 0:2) |
|
36 |
+ if(allele == "A" & gender == "male") |
|
37 |
+ X <- cbind(1, 1:0) |
|
38 |
+ if(allele == "B" & gender == "male") |
|
39 |
+ X <- cbind(1, 0:1) |
|
40 |
+ } else {##autosome |
|
41 |
+ if(allele == "A") X <- cbind(1, 2:0) else X <- cbind(1, 0:2) |
|
42 |
+ if(NOHET) X <- X[-2, ] ##more than 1 X chromosome, but all homozygous |
|
43 |
+ } |
|
44 |
+ if(any(!is.finite(W))){## | any(!is.finite(V))){ |
|
45 |
+ i <- which(rowSums(!is.finite(W)) > 0) |
|
46 |
+ browser() |
|
47 |
+ stop("Inf values in W or V") |
|
48 |
+ } |
|
49 |
+ ##How to quickly generate Xstar, Xstar = diag(W) %*% X |
|
50 |
+ generateX <- function(w, X) as.numeric(diag(w) %*% X) |
|
51 |
+ ##Not instant |
|
52 |
+ Xstar <- apply(W, 1, generateX, X) |
|
53 |
+ generateIXTX <- function(x, nrow=3) { |
|
54 |
+ X <- matrix(x, nrow=nrow) |
|
55 |
+ XTX <- crossprod(X) |
|
56 |
+ solve(XTX) |
|
57 |
+ } |
|
58 |
+ ##a little slow |
|
59 |
+ IXTX <- apply(Xstar, 2, generateIXTX, nrow=nrow(X)) |
|
60 |
+ betahat <- matrix(NA, 2, nrow(Ystar)) |
|
61 |
+ ses <- matrix(NA, 2, nrow(Ystar)) |
|
62 |
+ for(i in 1:nrow(Ystar)){ |
|
63 |
+ betahat[, i] <- crossprod(matrix(IXTX[, i], 2, 2), crossprod(matrix(Xstar[, i], nrow=nrow(X)), Ystar[i, ])) |
|
64 |
+ ssr <- sum((Ystar[i, ] - matrix(Xstar[, i], nrow(X), 2) %*% matrix(betahat[, i], 2, 1))^2) |
|
65 |
+ ses[, i] <- sqrt(diag(matrix(IXTX[, i], 2, 2) * ssr)) |
|
66 |
+ } |
|
67 |
+ nu <- betahat[1, ] |
|
68 |
+ phi <- betahat[2, ] |
|
69 |
+ nu.se <- ses[1,] |
|
70 |
+ phi.se <- ses[2,] |
|
71 |
+ ##dimnames(nu) <- dimnames(phi) <- dimnames(nu.ses) <- dimnames(phi.ses) |
|
72 |
+ assign("nu", nu, envir=parent.frame(1)) |
|
73 |
+ assign("phi", phi, envir=parent.frame(1)) |
|
74 |
+ assign("nu.se", nu.se, envir=parent.frame(1)) |
|
75 |
+ assign("phi.se", phi.se, envir=parent.frame(1)) |
|
76 |
+} |
|
77 |
+ |
|
78 |
+ |
|
79 |
+ |
|
80 |
+celDatesFrom <- function(celfiles, path=""){ |
|
81 |
+ require(affyio) |
|
82 |
+ celdates <- vector("character", length(celfiles)) |
|
83 |
+ celtimes <- vector("character", length(celfiles)) |
|
84 |
+ for(i in seq(along=celfiles)){ |
|
85 |
+ if(i %% 100 == 0) cat(".") |
|
86 |
+ tmp <- read.celfile.header(file.path(path, celfiles[i]), info="full")$DatHeader |
|
87 |
+ tmp <- strsplit(tmp, "\ +") |
|
88 |
+ celdates[i] <- tmp[[1]][6] |
|
89 |
+ celtimes[i] <- tmp[[1]][7] |
|
90 |
+ } |
|
91 |
+ tmp <- paste(celdates, celtimes) |
|
92 |
+ celdts <- strptime(tmp, "%m/%d/%y %H:%M:%S") |
|
93 |
+ return(celdts) |
|
94 |
+} |
|
95 |
+ |
|
96 |
+ |
|
97 |
+cnrma <- function(filenames, sns, cdfName, seed=1){ |
|
98 |
+## require(preprocessCore) || stop("Package preprocessCore not available") |
|
99 |
+ if (missing(sns)) sns <- basename(filenames) |
|
100 |
+ if (missing(cdfName)) |
|
101 |
+ cdfName <- read.celfile.header(filenames[1])$cdfName |
|
102 |
+ ## stuffDir <- changeToCrlmmAnnotationName(cdfName) |
|
103 |
+ pkgname <- crlmm:::getCrlmmAnnotationName(cdfName) |
|
104 |
+ if(!require(pkgname, character.only=TRUE, quietly=!verbose)){ |
|
105 |
+ suggCall <- paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="") |
|
106 |
+ msg <- paste("If", pkgname, "is installed on an alternative location, please load it manually by using", suggCall) |
|
107 |
+ message(strwrap(msg)) |
|
108 |
+ stop("Package ", pkgname, " could not be found.") |
|
109 |
+ rm(suggCall, msg) |
|
110 |
+ } |
|
111 |
+ |
|
112 |
+ ##I'm using the pd.mapping package |
|
113 |
+ map <- list(); j <- 1 |
|
114 |
+ for(chrom in c(1:22, "X")){ |
|
115 |
+ map[[j]] <- getCnvFid(chrom) |
|
116 |
+ j <- j+1 |
|
117 |
+ } |
|
118 |
+ map <- lapply(map, function(x) x[order(x[, "chrom_start"]), ]) |
|
119 |
+ map <- do.call("rbind", map) |
|
120 |
+ |
|
121 |
+ |
|
122 |
+ fid <- map[, "fid"] |
|
123 |
+ set.seed(seed) |
|
124 |
+ idx2 <- sample(length(fid), 10^5) ##for skewness. no need to do everything |
|
125 |
+ SKW <- vector("numeric", length(filenames)) |
|
126 |
+ NP <- matrix(NA, nrow(map), length(filenames)) |
|
127 |
+ verbose <- TRUE |
|
128 |
+ if(verbose){ |
|
129 |
+ message("Processing ", length(filenames), " files.") |
|
130 |
+ if (getRversion() > '2.7.0') pb <- txtProgressBar(min=0, max=length(filenames), style=3) |
|
131 |
+ } |
|
132 |
+ ##load reference distribution obtained from hapmap |
|
133 |
+ load("/thumper/ctsa/snpmicroarray/hapmap/processed/affy/1m_reference_cn.rda") |
|
134 |
+ for(i in seq(along=filenames)){ |
|
135 |
+ y <- as.matrix(read.celfile(filenames[i], intensity.means.only=TRUE)[["INTENSITY"]][["MEAN"]][fid]) |
|
136 |
+ x <- log2(y[idx2]) |
|
137 |
+ SKW[i] <- mean((x-mean(x))^3)/(sd(x)^3) |
|
138 |
+ rm(x) |
|
139 |
+ NP[, i] <- as.integer(normalize.quantiles.use.target(y, target=reference)) |
|
140 |
+ ##A[, i] <- intMedianSummaries(y[aIndex, 1, drop=FALSE], pnsa) |
|
141 |
+ ##B[, i] <- intMedianSummaries(y[bIndex, 1, drop=FALSE], pnsb) |
|
142 |
+ ##Now to fit the EM |
|
143 |
+ if (verbose) |
|
144 |
+ if (getRversion() > '2.7.0') setTxtProgressBar(pb, i) |
|
145 |
+ else cat(".") |
|
146 |
+ } |
|
147 |
+ dimnames(NP) <- list(map[, "man_fsetid"], sns) |
|
148 |
+ res3 <- list(NP=NP, SKW=SKW) |
|
149 |
+ return(res3) |
|
150 |
+} |
|
151 |
+ |
|
152 |
+getFlags <- function(phi.thr, envir){ |
|
153 |
+ nuA <- get("nuA", envir) |
|
154 |
+ nuB <- get("nuB", envir) |
|
155 |
+ phiA <- get("phiA", envir) |
|
156 |
+ phiB <- get("phiB", envir) |
|
157 |
+ |
|
158 |
+ negativeNus <- nuA < 1 | nuB < 1 |
|
159 |
+ negativeNus <- negativeNus > 0 |
|
160 |
+ |
|
161 |
+ negativePhis <- phiA < phi.thr | phiB < phi.thr |
|
162 |
+ negativePhis <- negativePhis > 0 |
|
163 |
+ negativeCoef <- negativeNus | negativePhis |
|
164 |
+ |
|
165 |
+ notfinitePhi <- !is.finite(phiA) | !is.finite(phiB) |
|
166 |
+ notfinitePhi <- notfinitePhi > 0 |
|
167 |
+ flags <- negativeCoef | notfinitePhi |
|
168 |
+} |
|
169 |
+ |
|
170 |
+goodSnps <- function(phi.thr, envir, fewAA=20, fewBB=20){ |
|
171 |
+ Ns <- get("Ns", envir) |
|
172 |
+ flags <- getFlags(phi.thr=phi.thr, envir) |
|
173 |
+ fewAA <- Ns[, , 1] < fewAA |
|
174 |
+ fewBB <- Ns[, , 3] < fewBB |
|
175 |
+ flagsA <- flags | fewAA |
|
176 |
+ flagsB <- flags | fewBB |
|
177 |
+ flags <- list(A=flagsA, B=flagsB) |
|
178 |
+ return(flags) |
|
179 |
+} |
|
180 |
+ |
|
181 |
+instantiateObjects <- function(calls, NP, plate, envir, chrom){ |
|
182 |
+ if(missing(chrom)) stop("must specify chrom") |
|
183 |
+ assign("chrom", chrom, envir) |
|
184 |
+ snps <- rownames(calls) |
|
185 |
+ cnvs <- rownames(NP) |
|
186 |
+ sns <- basename(colnames(calls)) |
|
187 |
+ stopifnot(identical(colnames(calls), colnames(NP))) |
|
188 |
+ assign("sns", sns, envir) |
|
189 |
+ assign("snps", snps, envir) |
|
190 |
+ assign("cnvs", cnvs, envir) |
|
191 |
+ |
|
192 |
+ CA <- CB <- matrix(NA, nrow(calls), ncol(calls)) |
|
193 |
+ assign("plate", plate, envir) |
|
194 |
+ uplate <- unique(plate) |
|
195 |
+ Ns <- array(NA, dim=c(nrow(calls), length(uplate), 3)) |
|
196 |
+ dimnames(Ns)[[3]] <- c("AA", "AB", "BB") |
|
197 |
+ muA.AA <- matrix(NA, nrow=nrow(calls), length(uplate)) |
|
198 |
+ ##Sigma.AA <- array(NA, dim=c(nrow(calls), 3, length(uplate))) |
|
199 |
+ ##dimnames(Sigma.AA)[2:3] <- list(c("varA", "varB", "cov"), uplate) |
|
200 |
+ |
|
201 |
+ NP.CT <- matrix(NA, nrow(NP), ncol(NP)) |
|
202 |
+ NP.sds <- matrix(NA, nrow(NP), ncol(NP)) |
|
203 |
+ assign("NP.CT", NP.CT, envir) |
|
204 |
+ assign("NP.sds", NP.sds, envir) |
|
205 |
+ nus <- matrix(NA, nrow(NP), length(uplate)) |
|
206 |
+ assign("nus", nus, envir=envir) |
|
207 |
+ assign("phis", nus, envir=envir) |
|
208 |
+ |
|
209 |
+ plates.completed <- rep(FALSE, length(uplate)) |
|
210 |
+ assign("plates.completed", plates.completed, envir) |
|
211 |
+ |
|
212 |
+ fit.variance <- NULL |
|
213 |
+ snpflags <- NULL |
|
214 |
+ assign("fit.variance", fit.variance, envir=envir) |
|
215 |
+ assign("snpflags", snpflags, envir=envir) |
|
216 |
+ |
|
217 |
+ assign("Ns", Ns, envir=envir) |
|
218 |
+ assign("uplate", uplate, envir=envir) |
|
219 |
+ assign("muA.AA", muA.AA, envir=envir) |
|
220 |
+ assign("muA.AB", muA.AA, envir=envir) |
|
221 |
+ assign("muA.BB", muA.AA, envir=envir) |
|
222 |
+ assign("muB.AA", muA.AA, envir=envir) |
|
223 |
+ assign("muB.AB", muA.AA, envir=envir) |
|
224 |
+ assign("muB.BB", muA.AA, envir=envir) |
|
225 |
+ |
|
226 |
+ assign("tau2A", muA.AA, envir=envir) |
|
227 |
+ assign("tau2B", muA.AA, envir=envir) |
|
228 |
+ assign("sig2A", muA.AA, envir=envir) |
|
229 |
+ assign("sig2B", muA.AA, envir=envir) |
|
230 |
+ assign("corr", muA.AA, envir=envir) |
|
231 |
+ assign("corrA.BB", muA.AA, envir=envir) |
|
232 |
+ assign("corrB.AA", muA.AA, envir=envir) |
|
233 |
+ |
|
234 |
+ assign("nuA", muA.AA, envir=envir) |
|
235 |
+ assign("nuB", muA.AA, envir=envir) |
|
236 |
+ assign("phiA", muA.AA, envir=envir) |
|
237 |
+ assign("phiB", muA.AA, envir=envir) |
|
238 |
+ assign("nuA.se", muA.AA, envir=envir) |
|
239 |
+ assign("nuB.se", muA.AA, envir=envir) |
|
240 |
+ assign("phiA.se", muA.AA, envir=envir) |
|
241 |
+ assign("phiB.se", muA.AA, envir=envir) |
|
242 |
+ assign("sd.CT", CA, envir=envir) |
|
243 |
+ assign("CA", CA, envir=envir) |
|
244 |
+ assign("CB", CB, envir=envir) |
|
245 |
+} |
|
246 |
+ |
|
247 |
+computeCnBatch <- function(A, |
|
248 |
+ B, |
|
249 |
+ calls, |
|
250 |
+ conf, |
|
251 |
+ NP, |
|
252 |
+ plate, |
|
253 |
+ fit.variance=NULL, |
|
254 |
+ seed=123, |
|
255 |
+ MIN.OBS=3, |
|
256 |
+ envir, |
|
257 |
+ chrom, P, DF.PRIOR=50, CONF.THR=0.99, trim, upperTail, ...){ |
|
258 |
+ if(length(ls(envir)) == 0) instantiateObjects(calls, NP, plate, envir, chrom) |
|
259 |
+ ##require(genefilter) |
|
260 |
+ require(splines) |
|
261 |
+ require(Biobase)##needs rowMedians |
|
262 |
+ ##will be updating these objects |
|
263 |
+ uplate <- get("uplate", envir) |
|
264 |
+ ##AAA <- A |
|
265 |
+ ##BBB <- B |
|
266 |
+ message("Sufficient statistics") |
|
267 |
+ if(missing(P)) P <- seq(along=uplate) |
|
268 |
+ for(p in P){ |
|
269 |
+ cat(".") |
|
270 |
+ if(sum(plate == uplate[p]) < 10) next() |
|
271 |
+ oneBatch(plateIndex=p, |
|
272 |
+ G=calls[, plate==uplate[p]], |
|
273 |
+ A=A[, plate==uplate[p]], |
|
274 |
+ B=B[, plate==uplate[p]], |
|
275 |
+ envir=envir, |
|
276 |
+ MIN.OBS=MIN.OBS, |
|
277 |
+ DF.PRIOR=DF.PRIOR, |
|
278 |
+ trim=trim, upperTail=upperTail) |
|
279 |
+ } |
|
280 |
+ message("\nEstimating coefficients") |
|
281 |
+ for(p in P){ |
|
282 |
+ cat(".") |
|
283 |
+ coefs(plateIndex=p, conf=conf[, plate==uplate[p]], |
|
284 |
+ envir=envir, CONF.THR=CONF.THR) |
|
285 |
+ } |
|
286 |
+ message("\nAllele specific copy number") |
|
287 |
+ for(p in P){ |
|
288 |
+ cat(".") |
|
289 |
+ copynumber(plateIndex=p, |
|
290 |
+ A=A[, plate==uplate[p]], |
|
291 |
+ B=B[, plate==uplate[p]], |
|
292 |
+ envir=envir) |
|
293 |
+ } |
|
294 |
+ message("\nCopy number for nonpolymorphic probes...") |
|
295 |
+ for(p in P){ |
|
296 |
+ cat(".") |
|
297 |
+ nonpolymorphic(plateIndex=p, |
|
298 |
+ NP=NP[, plate==uplate[p]], |
|
299 |
+ envir=envir) |
|
300 |
+ } |
|
301 |
+} |
|
302 |
+ |
|
303 |
+nonpolymorphic <- function(plateIndex, NP, envir){ |
|
304 |
+ require(genefilter) |
|
305 |
+ p <- plateIndex |
|
306 |
+ plate <- get("plate", envir) |
|
307 |
+ uplate <- get("uplate", envir) |
|
308 |
+ plates.completed <- get("plates.completed", envir) |
|
309 |
+ if(!plates.completed[p]) return() |
|
310 |
+ ##snpflags <- get("snpflags", envir) |
|
311 |
+ ##if(is.null(snpflags)){ |
|
312 |
+ snpflags <- goodSnps(phi.thr=10, envir=envir, fewAA=10, fewBB=10) |
|
313 |
+ ##assign("snpflags", snpflags, envir=envir) |
|
314 |
+ flagsA <- snpflags$A[, p] |
|
315 |
+ flagsB <- snpflags$B[, p] |
|
316 |
+ if(all(flagsA) | all(flagsB)) stop("all snps are flagged") |
|
317 |
+ nuA <- get("nuA", envir) |
|
318 |
+ nuB <- get("nuB", envir) |
|
319 |
+ phiA <- get("phiA", envir) |
|
320 |
+ phiB <- get("phiB", envir) |
|
321 |
+ uplate <- get("uplate", envir) |
|
322 |
+ sns <- get("sns", envir) |
|
323 |
+ muA.AA <- get("muA.AA", envir) |
|
324 |
+ muA.AB <- get("muA.AB", envir) |
|
325 |
+ muA.BB <- get("muA.BB", envir) |
|
326 |
+ muB.AA <- get("muB.AA", envir) |
|
327 |
+ muB.AB <- get("muB.AB", envir) |
|
328 |
+ muB.BB <- get("muB.BB", envir) |
|
329 |
+ NP.CT <- get("NP.CT", envir) |
|
330 |
+ nus <- get("nus", envir) |
|
331 |
+ phis <- get("phis", envir) |
|
332 |
+ NP.sds <- get("NP.sds", envir) |
|
333 |
+ ##fit.variance <- get("fit.variance", envir) |
|
334 |
+ NP.CT <- get("NP.CT", envir) |
|
335 |
+ ##--------------------------------------------------------------------------- |
|
336 |
+ ## Train on unflagged SNPs |
|
337 |
+ ##--------------------------------------------------------------------------- |
|
338 |
+ plateInd <- plate == uplate[p] |
|
339 |
+ muA.AAp <- muA.AA[!flagsA, p] |
|
340 |
+ muB.BBp <- muB.BB[!flagsB, p] |
|
341 |
+ |
|
342 |
+ ##From SNP data |
|
343 |
+ X <- cbind(1, log(c(muA.AAp, muB.BBp))) |
|
344 |
+ Y <- log(c(phiA[!flagsA, p], phiB[!flagsB, p])) |
|
345 |
+ ##Y.nu <- log(c(nuA[!snpflags$A, p], nuB[!snpflags$B, p])) |
|
346 |
+ ##Y <- cbind(Y.nu, Y.phi) |
|
347 |
+ betahat <- solve(crossprod(X), crossprod(X, Y)) |
|
348 |
+ ##Prediction |
|
349 |
+ mus <- rowMedians(NP, na.rm=TRUE) |
|
350 |
+ X <- cbind(1, log(mus)) |
|
351 |
+ Yhat <- as.numeric(X %*% betahat) |
|
352 |
+ ##NP.nus[, p] <- exp(Yhat[, 1]) |
|
353 |
+ ##NP.phis[, p] <- exp(Yhat[, 2]) |
|
354 |
+ phi <- exp(Yhat) |
|
355 |
+ nu <- mus - 2*phi |
|
356 |
+ |
|
357 |
+ phis[, p] <- as.integer(phi) |
|
358 |
+ nus[, p] <- as.integer(nu) |
|
359 |
+ ##plot(NP.phis[, p], NP.nus[, p], pch=".") |
|
360 |
+ CT <- 1/phi*(NP-nu) |
|
361 |
+ tmp <- matrix(as.integer(CT*100), nrow(CT), ncol(CT)) |
|
362 |
+ NP.CT[, plateInd] <- tmp |
|
363 |
+ if(FALSE){ |
|
364 |
+ ##should be centered at 2 |
|
365 |
+ tmp <- rowMeans(NP.CT[, plateInd]) |
|
366 |
+ hist(tmp/100, breaks=200) |
|
367 |
+ } |
|
368 |
+ |
|
369 |
+ ##--------------------------------------------------------------------------- |
|
370 |
+ ## For NA SNPs, treat as nonpolymorphic |
|
371 |
+ ##--------------------------------------------------------------------------- |
|
372 |
+ CA <- get("CA", envir) |
|
373 |
+ CB <- get("CB", envir) |
|
374 |
+ tmpA <- CA[, plate==uplate[p]] |
|
375 |
+ tmpB <- CB[, plate==uplate[p]] |
|
376 |
+ indexA <- which(rowSums(is.na(tmpA)) > 0) |
|
377 |
+ indexB <- which(rowSums(is.na(tmpB)) > 0) |
|
378 |
+ index <- union(indexA, indexB) |
|
379 |
+ if(length(index) > 0) browser() |
|
380 |
+ |
|
381 |
+ ##--------------------------------------------------------------------------- |
|
382 |
+ ## this part could stand improvement |
|
383 |
+ ## - estimate the uncertainty |
|
384 |
+ ##--------------------------------------------------------------------------- |
|
385 |
+ |
|
386 |
+ ##--------------------------------------------------------------------------- |
|
387 |
+ ## Estimate variance for copy number probes |
|
388 |
+ ## VAR(CT) = var(1/phi * (NP - nu)) |
|
389 |
+ ## = 1/phi^2 * VAR(NP) |
|
390 |
+ ## = 1/phi^2 * f(mu) * sigma |
|
391 |
+ ## ** Phi is predicted from a regression using the row medians as predictors |
|
392 |
+ ## -- this may underestimate the uncertainty |
|
393 |
+ ##--------------------------------------------------------------------------- |
|
394 |
+## log.mus <- log2(mus) |
|
395 |
+## log.var <- log2(rowVars(NP)) |
|
396 |
+## f <- predict(fit.variance, newdata=data.frame(log.mus=log.mus)) |
|
397 |
+## resid <- log.var - f |
|
398 |
+## sds <- 1/phi*sqrt(2^(predict(fit.variance, newdata=data.frame(log.mus=log.mus+resid)))) |
|
399 |
+## |
|
400 |
+## robustSD <- function(X) diff(quantile(X, probs=c(0.16, (1-0.16)), na.rm=TRUE))/2 |
|
401 |
+## sample.sds <- apply(CT, 2, robustSD) |
|
402 |
+## sample.sds <- sample.sds/median(sample.sds, na.rm=TRUE) |
|
403 |
+## NP.sds[, plate==uplate[p]] <- sds %*% t(sample.sds) |
|
404 |
+ assign("phis", phis, envir) |
|
405 |
+ assign("nus", nus, envir) |
|
406 |
+ assign("NP.CT", NP.CT, envir) |
|
407 |
+## assign("NP.sds", NP.sds, envir) |
|
408 |
+## firstPass.NP <- list(phis=phis, nus=nus, CT=NP.CT, sds=sds, |
|
409 |
+## gns=gns, sns=sns, bns=bns) |
|
410 |
+## firstPass.NP |
|
411 |
+} |
|
412 |
+ |
|
413 |
+oneBatch <- function(plateIndex, G, A, B, MIN.OBS=3, DF.PRIOR, envir, trim, upperTail, ...){ |
|
414 |
+ p <- plateIndex |
|
415 |
+ plate <- get("plate", envir) |
|
416 |
+ AA <- G == 1 |
|
417 |
+ AB <- G == 2 |
|
418 |
+ BB <- G == 3 |
|
419 |
+ Ns <- get("Ns", envir) |
|
420 |
+ Ns[, p, "AA"] <- rowSums(AA) |
|
421 |
+ Ns[, p, "AB"] <- rowSums(AB) |
|
422 |
+ Ns[, p, "BB"] <- rowSums(BB) |
|
423 |
+ AA[AA == FALSE] <- NA |
|
424 |
+ AB[AB == FALSE] <- NA |
|
425 |
+ BB[BB == FALSE] <- NA |
|
426 |
+ Ip <- array(NA, dim=c(nrow(G), ncol(G), 3, 2)) |
|
427 |
+ dimnames(Ip) <- list(rownames(G), colnames(G), c("AA", "AB", "BB"), c("A", "B")) |
|
428 |
+ Ip[, , "AA", "A"] <- AA*A |
|
429 |
+ Ip[, , "AB", "A"] <- AB*A |
|
430 |
+ Ip[, , "BB", "A"] <- BB*A |
|
431 |
+ Ip[, , "AA", "B"] <- AA*B |
|
432 |
+ Ip[, , "AB", "B"] <- AB*B |
|
433 |
+ Ip[, , "BB", "B"] <- BB*B |
|
434 |
+ ##--------------------------------------------------------------------------- |
|
435 |
+ ## Sufficient statistics (plate-specific) |
|
436 |
+ ##--------------------------------------------------------------------------- |
|
437 |
+ ##These matrices instantiated in the calling function are updated |
|
438 |
+ muA.AA <- get("muA.AA", envir) |
|
439 |
+ muA.AB <- get("muA.AB", envir) |
|
440 |
+ muA.BB <- get("muA.BB", envir) |
|
441 |
+ muB.AA <- get("muB.AA", envir) |
|
442 |
+ muB.AB <- get("muB.AB", envir) |
|
443 |
+ muB.BB <- get("muB.BB", envir) |
|
444 |
+ tau2A <- get("tau2A", envir) |
|
445 |
+ tau2B <- get("tau2B", envir) |
|
446 |
+ sig2A <- get("sig2A", envir) |
|
447 |
+ sig2B <- get("sig2B", envir) |
|
448 |
+ corr <- get("corr", envir) |
|
449 |
+ corrA.BB <- get("corrA.BB", envir) ## B allele |
|
450 |
+ corrB.AA <- get("corrB.AA", envir) |
|
451 |
+ |
|
452 |
+ AA.A <- AA*A |
|
453 |
+ AB.A <- AB*A |
|
454 |
+ BB.A <- BB*A |
|
455 |
+ AA.B <- AA*B |
|
456 |
+ AB.B <- AB*B |
|
457 |
+ BB.B <- BB*B |
|
458 |
+ if(trim > 0){ |
|
459 |
+ ##rowMedians is not robust enough when a variant is common |
|
460 |
+ ##Try a trimmed rowMedian, trimming only one tail (must specify which) |
|
461 |
+ ## - for genotypes with 3 or more observations, exclude the upper X% |
|
462 |
+ ## - one way to do this is to replace these observations with NAs |
|
463 |
+ ## e.g, exclude round(Ns * X%, 0) observations |
|
464 |
+ ## - for genotypes with fewer than 10 observations, |
|
465 |
+ ## - recalculate rowMedians |
|
466 |
+ replaceWithNAs <- function(x, trim, upperTail){ |
|
467 |
+ ##put NA's last if trimming the upperTail |
|
468 |
+ if(upperTail) decreasing <- TRUE else decreasing <- FALSE |
|
469 |
+ NN <- round(sum(!is.na(x)) * trim, 0) |
|
470 |
+ ix <- order(x, decreasing=decreasing, na.last=TRUE)[1:NN] |
|
471 |
+ x[ix] <- NA |
|
472 |
+ return(x) |
|
473 |
+ } |
|
474 |
+ ##which rows should be trimmed |
|
475 |
+ rowsToTrim <- which(round(Ns[, p, "AA"] * trim, 0) > 0) |
|
476 |
+ ##replace values in the tail of A with NAs |
|
477 |
+ AA.A[rowsToTrim, ] <- t(apply(AA.A[rowsToTrim, ], 1, replaceWithNAs, trim=trim, upperTail=upperTail)) |
|
478 |
+ AA.B[rowsToTrim, ] <- t(apply(AA.B[rowsToTrim, ], 1, replaceWithNAs, trim=trim, upperTail=upperTail)) |
|
479 |
+ rowsToTrim <- which(round(Ns[, p, "AB"] * trim, 0) > 0) |
|
480 |
+ AB.A[rowsToTrim, ] <- t(apply(AB.A[rowsToTrim, ], 1, replaceWithNAs, trim=trim, upperTail=upperTail)) |
|
481 |
+ AB.B[rowsToTrim, ] <- t(apply(AB.B[rowsToTrim, ], 1, replaceWithNAs, trim=trim, upperTail=upperTail)) |
|
482 |
+ rowsToTrim <- which(round(Ns[, p, "BB"] * trim, 0) > 0) |
|
483 |
+ BB.A[rowsToTrim, ] <- t(apply(BB.A[rowsToTrim, ], 1, replaceWithNAs, trim=trim, upperTail=upperTail)) |
|
484 |
+ BB.B[rowsToTrim, ] <- t(apply(BB.B[rowsToTrim, ], 1, replaceWithNAs, trim=trim, upperTail=upperTail)) |
|
485 |
+ |
|
486 |
+ ##Should probably recompute the Ns -- change the |
|
487 |
+ ##degrees of freedom |
|
488 |
+ Ns[, p, "AA"] <- rowSums(!is.na(AA.A)) |
|
489 |
+ Ns[, p, "AB"] <- rowSums(!is.na(AB.A)) |
|
490 |
+ Ns[, p, "BB"] <- rowSums(!is.na(BB.A)) |
|
491 |
+ } |
|
492 |
+ muA.AA[, p] <- rowMedians(AA.A, na.rm=TRUE) |
|
493 |
+ muA.AB[, p] <- rowMedians(AB.A, na.rm=TRUE) |
|
494 |
+ muA.BB[, p] <- rowMedians(BB.A, na.rm=TRUE) |
|
495 |
+ muB.AA[, p] <- rowMedians(AA.B, na.rm=TRUE) |
|
496 |
+ muB.AB[, p] <- rowMedians(AB.B, na.rm=TRUE) |
|
497 |
+ muB.BB[, p] <- rowMedians(BB.B, na.rm=TRUE) |
|
498 |
+ sigmaA <- matrix(NA, nrow(AA), 3) |
|
499 |
+ sigmaB <- matrix(NA, nrow(AA), 3) |
|
500 |
+ colnames(sigmaB) <- colnames(sigmaA) <- c("AA", "AB", "BB") |
|
501 |
+ |
|
502 |
+ |
|
503 |
+ sigmaA[, "AA"] <- rowMAD(AA.A, na.rm=TRUE) |
|
504 |
+ sigmaA[, "AB"] <- rowMAD(AB.A, na.rm=TRUE) |
|
505 |
+ sigmaA[, "BB"] <- rowMAD(BB.A, na.rm=TRUE) |
|
506 |
+ sigmaB[, "AA"] <- rowMAD(AA.B, na.rm=TRUE) |
|
507 |
+ sigmaB[, "AB"] <- rowMAD(AB.B, na.rm=TRUE) |
|
508 |
+ sigmaB[, "BB"] <- rowMAD(BB.B, na.rm=TRUE) |
|
509 |
+ |
|
510 |
+## sigmaA[, "AA"] <- rowSds(AA.A, na.rm=TRUE) |
|
511 |
+## sigmaA[, "AB"] <- rowSds(AB.A, na.rm=TRUE) |
|
512 |
+## sigmaA[, "BB"] <- rowSds(BB*A, na.rm=TRUE) |
|
513 |
+## sigmaB[, "AA"] <- rowSds(AA*B, na.rm=TRUE) |
|
514 |
+## sigmaB[, "AB"] <- rowSds(AB*B, na.rm=TRUE) |
|
515 |
+## sigmaB[, "BB"] <- rowSds(BB*B, na.rm=TRUE) |
|
516 |
+ ##shrink |
|
517 |
+ DF <- Ns[, p, ]-1 |
|
518 |
+ DF[DF < 1] <- 1 |
|
519 |
+ medsA <- apply(sigmaA, 2, "median", na.rm=TRUE) |
|
520 |
+ medsB <- apply(sigmaB, 2, "median", na.rm=TRUE) |
|
521 |
+ for(m in 1:3){ |
|
522 |
+ sigmaA[, m] <- (sigmaA[, m]*DF[, m] + medsA[m]*DF.PRIOR)/(DF.PRIOR+DF[, m]) |
|
523 |
+ sigmaA[is.na(sigmaA[, m]), m] <- medsA[m] |
|
524 |
+ sigmaB[, m] <- (sigmaB[, m]*DF[, m] + medsB[m]*DF.PRIOR)/(DF.PRIOR+DF[, m]) |
|
525 |
+ sigmaB[is.na(sigmaB[, m]), m] <- medsB[m] |
|
526 |
+ } |
|
527 |
+ index.AA <- which(Ns[, p, "AA"] >= 2) |
|
528 |
+ index.AB <- which(Ns[, p, "AB"] >= 2) |
|
529 |
+ index.BB <- which(Ns[, p, "BB"] >= 2) |
|
530 |
+ |
|
531 |
+ x <- Ip[index.BB, , "BB", "A"] |
|
532 |
+ ##tau2A[index.BB, p] <- rowVars(x, na.rm=TRUE)##var(log(IA)| BB) |
|
533 |
+ tau2A[index.BB, p] <- rowMAD(log2(x), log2(x), na.rm=TRUE)^2 |
|
534 |
+ DF <- Ns[, p, "BB"]-1 |
|
535 |
+ DF[DF < 1] <- 1 |
|
536 |
+ med <- median(tau2A[, p], na.rm=TRUE) |
|
537 |
+ tau2A[, p] <- (tau2A[, p] * DF + med * DF.PRIOR)/(DF.PRIOR + DF) |
|
538 |
+ tau2A[is.na(tau2A[, p]), p] <- med |
|
539 |
+ |
|
540 |
+ x <- Ip[index.BB, , "BB", "B"] |
|
541 |
+## sig2B[index.BB, p] <- rowVars(log2(x), na.rm=TRUE) ##var(log(IB)|BB) |
|
542 |
+ sig2B[index.BB, p] <- rowMAD(log2(x), log2(x), na.rm=TRUE)^2 |
|
543 |
+ ##system.time(tmp <- rowCovs(x, x, na.rm=TRUE)) ##var(log(IB)|BB) |
|
544 |
+ ##system.time(tmp2 <- rowVars(log2(x), na.rm=TRUE)) |
|
545 |
+ med <- median(sig2B[, p], na.rm=TRUE) |
|
546 |
+ sig2B[, p] <- (sig2B[, p] * DF + med * DF.PRIOR)/(DF.PRIOR + DF) |
|
547 |
+ sig2B[is.na(sig2B[, p]), p] <- med |
|
548 |
+ |
|
549 |
+ x <- Ip[index.AA, , "AA", "B"] |
|
550 |
+ ##tau2B[index.AA, p] <- rowCovs(x, x, na.rm=TRUE)##var(log(IB)| AA) |
|
551 |
+ tau2B[index.AA, p] <- rowMAD(log2(x), log2(x), na.rm=TRUE)^2 |
|
552 |
+ DF <- Ns[, p, "AA"]-1 |
|
553 |
+ DF[DF < 1] <- 1 |
|
554 |
+ med <- median(tau2B[, p], na.rm=TRUE) |
|
555 |
+ tau2B[, p] <- (tau2B[, p] * DF + med * DF.PRIOR)/(DF.PRIOR + DF) |
|
556 |
+ tau2B[is.na(tau2B[, p]), p] <- med |
|
557 |
+ |
|
558 |
+ x <- Ip[index.AA, , "AA", "A"] |
|
559 |
+ ##sig2A[index.AA, p] <- rowVars(log2(x), na.rm=TRUE)##var(log(IA)|AA) |
|
560 |
+ sig2A[index.AA, p] <- rowMAD(log2(x), log2(x), na.rm=TRUE)^2##var(log(IA)|AA) |
|
561 |
+ ##sig2A[index.AA, p] <- rowCovs(x, x, na.rm=TRUE) ##var(log(IA)|AA) |
|
562 |
+ med <- median(sig2A[, p], na.rm=TRUE) |
|
563 |
+ sig2A[, p] <- (sig2A[, p]*DF + med * DF.PRIOR)/(DF.PRIOR + DF) |
|
564 |
+ sig2A[is.na(sig2A[, p]), p] <- med |
|
565 |
+ |
|
566 |
+ ##estimate the correlation where there is at least 1 or more copies (AB genotypes) |
|
567 |
+ if(length(index.AB) > 0){ ##all homozygous is possible |
|
568 |
+ x <- Ip[index.AB, , "AB", "A"] |
|
569 |
+ y <- Ip[index.AB, , "AB", "B"] |
|
570 |
+ corr[index.AB, p] <- rowCors(x, y, na.rm=TRUE) |
|
571 |
+ corr[corr < 0] <- 0 |
|
572 |
+ DF <- Ns[, p, "AB"]-1 |
|
573 |
+ DF[DF<1] <- 1 |
|
574 |
+ med <- median(corr[, p], na.rm=TRUE) |
|
575 |
+ corr[, p] <- (corr[, p]*DF + med * DF.PRIOR)/(DF.PRIOR + DF) |
|
576 |
+ corr[is.na(corr[, p]), p] <- med |
|
577 |
+ } |
|
578 |
+ ##estimate the correlation of the background errors and AA, BB genotypes |
|
579 |
+ backgroundB <- Ip[index.AA, , "AA", "B"] |
|
580 |
+ signalA <- Ip[index.AA, , "AA", "A"] |
|
581 |
+ corrB.AA[index.AA, p] <- rowCors(backgroundB, signalA, na.rm=TRUE) |
|
582 |
+ DF <- Ns[, p, "AA"]-1 |
|
583 |
+ DF[DF < 1] <- 1 |
|
584 |
+ med <- median(corrB.AA[, p], na.rm=TRUE) |
|
585 |
+ corrB.AA[, p] <- (corrB.AA[, p]*DF + med*DF.PRIOR)/(DF.PRIOR + DF) |
|
586 |
+ corrB.AA[is.na(corrB.AA[, p]), p] <- med |
|
587 |
+ |
|
588 |
+ backgroundA <- Ip[index.BB, , "BB", "A"] |
|
589 |
+ signalB <- Ip[index.BB, , "BB", "B"] |
|
590 |
+ corrA.BB[index.BB, p] <- rowCors(backgroundA, signalB, na.rm=TRUE) |
|
591 |
+ DF <- Ns[, p, "BB"]-1 |
|
592 |
+ DF[DF < 1] <- 1 |
|
593 |
+ med <- median(corrA.BB[, p], na.rm=TRUE) |
|
594 |
+ corrA.BB[, p] <- (corrA.BB[, p]*DF + med*DF.PRIOR)/(DF.PRIOR + DF) |
|
595 |
+ corrA.BB[is.na(corrA.BB[, p]), p] <- med |
|
596 |
+ |
|
597 |
+ ##--------------------------------------------------------------------------- |
|
598 |
+ ## Predict sufficient statistics for unobserved genotypes (plate-specific) |
|
599 |
+ ##--------------------------------------------------------------------------- |
|
600 |
+ correct.orderA <- muA.AA[, p] > muA.BB[, p] |
|
601 |
+ correct.orderB <- muB.BB[, p] > muB.AA[, p] |
|
602 |
+ if(length(index.AB) > 0){ |
|
603 |
+ nobs <- rowSums(Ns[, p, ] >= MIN.OBS) == 3 |
|
604 |
+ } else nobs <- rowSums(Ns[, p, c(1,3)] >= MIN.OBS) == 2 |
|
605 |
+ index.complete <- which(correct.orderA & correct.orderB & nobs) ##be selective here |
|
606 |
+ size <- min(5000, length(index.complete)) |
|
607 |
+ if(size == 5000) index.complete <- sample(index.complete, 5000) |
|
608 |
+ if(length(index.complete) < 200){ |
|
609 |
+ warning("too few snps pass my criteria for predicting the sufficient statistics") |
|
610 |
+ ##message("Plate has too few samples to call many genotypes...can we use prior information to predict") |
|
611 |
+ browser() |
|
612 |
+ } |
|
613 |
+ index.AA <- which(Ns[, p, "AA"] == 0 & Ns[, p, "AB"] >= MIN.OBS) |
|
614 |
+ index.AB <- which(Ns[, p, "AB"] == 0 & (Ns[, p, "AA"] >= MIN.OBS & Ns[, p, "BB"] >= MIN.OBS)) |
|
615 |
+ index.BB <- which(Ns[, p, "BB"] == 0 & Ns[, p, "AB"] >= MIN.OBS) |
|
616 |
+ if(FALSE){ |
|
617 |
+ png("figures/predictedMusBB%02d.png", width=800, height=500) |
|
618 |
+ par(mfrow=c(1, 2), las=1, mar=c(3, 1, 1, 1), oma=c(1, 2, 1, 1), pty="s") |
|
619 |
+ plot(log(muA.BB[index.complete, 1]), log(muB.BB[index.complete, 1]), pch=".", ylim=c(5,12), xlim=c(5,12), |
|
620 |
+ xlab="A", ylab="B", main="good") |
|
621 |
+ points(log(muA.AB[index.complete, 1]), log(muB.AB[index.complete, 1]), pch=".") |
|
622 |
+ points(log(muA.AA[index.complete, 1]), log(muB.AA[index.complete, 1]), pch=".") |
|
623 |
+ plot(log(muA.BB[index.BB, 1]), log(muB.BB[index.BB, 1]), pch=".", ylim=c(5,12), xlim=c(5,12), |
|
624 |
+ xlab="A", ylab="B", main="missing BB") |
|
625 |
+ points(log(muA.AB[index.BB, 1]), log(muB.AB[index.BB, 1]), pch=".") |
|
626 |
+ points(log(muA.AA[index.BB, 1]), log(muB.AA[index.BB, 1]), pch=".") |
|
627 |
+ } |
|
628 |
+ if(length(index.AA) > 0){ |
|
629 |
+ X.AA <- cbind(1, muA.AB[index.complete, p], muA.BB[index.complete, p], |
|
630 |
+ muB.AB[index.complete, p], muB.BB[index.complete, p]) |
|
631 |
+ Y <- cbind(muA.AA[index.complete, p], muB.AA[index.complete, p]) |
|
632 |
+ XtY <- crossprod(X.AA, Y) |
|
633 |
+ betahat <- solve(crossprod(X.AA), XtY) |
|
634 |
+ X.AA <- cbind(1, muA.AB[index.AA, p], muA.BB[index.AA, p], |
|
635 |
+ muB.AB[index.AA, p], muB.BB[index.AA, p]) |
|
636 |
+ musAA <- X.AA %*% betahat |
|
637 |
+ musAA[musAA <= 100] <- 100 |
|
638 |
+ muA.AA[index.AA, p] <- musAA[, 1] |
|
639 |
+ muB.AA[index.AA, p] <- musAA[, 2] |
|
640 |
+ } |
|
641 |
+ if(length(index.AB) > 0){ |
|
642 |
+ X.AB <- cbind(1, muA.AA[index.complete, p], muA.BB[index.complete, p], |
|
643 |
+ muB.AA[index.complete, p], muB.BB[index.complete, p]) |
|
644 |
+ Y <- cbind(muA.AB[index.complete, p], muB.AB[index.complete, p]) |
|
645 |
+ XtY <- crossprod(X.AB, Y) |
|
646 |
+ betahat <- solve(crossprod(X.AB), XtY) |
|
647 |
+ X.AB <- cbind(1, muA.AA[index.AB, p], muA.BB[index.AB, p], |
|
648 |
+ muB.AA[index.AB, p], muB.BB[index.AB, p]) |
|
649 |
+ musAB <- X.AB %*% betahat |
|
650 |
+ musAB[musAB <= 100] <- 100 |
|
651 |
+ muA.AB[index.AB, p] <- musAB[, 1] |
|
652 |
+ muB.AB[index.AB, p] <- musAB[, 2] |
|
653 |
+ } |
|
654 |
+ if(length(index.BB) > 0){ |
|
655 |
+ X.BB <- cbind(1, muA.AA[index.complete, p], muA.AB[index.complete, p], |
|
656 |
+ muB.AA[index.complete, p], muB.AB[index.complete, p]) |
|
657 |
+ Y <- cbind(muA.BB[index.complete, p], muB.BB[index.complete, p]) |
|
658 |
+ XtY <- crossprod(X.BB, Y) |
|
659 |
+ betahat <- solve(crossprod(X.BB), XtY) |
|
660 |
+ X.BB <- cbind(1, muA.AA[index.BB, p], muA.AB[index.BB, p], |
|
661 |
+ muB.AA[index.BB, p], muB.AB[index.BB, p]) |
|
662 |
+ musBB <- X.BB %*% betahat |
|
663 |
+ musBB[musBB <= 100] <- 100 |
|
664 |
+ muA.BB[index.BB, p] <- musBB[, 1] |
|
665 |
+ muB.BB[index.BB, p] <- musBB[, 2] |
|
666 |
+ } |
|
667 |
+ if(FALSE){ |
|
668 |
+ points(log(muA.BB[index.BB, 1]), log(muB.BB[index.BB, 1]), pch=".", col="blue") |
|
669 |
+ dev.off() |
|
670 |
+ } |
|
671 |
+ ##missing two genotypes |
|
672 |
+ noAA <- Ns[, p, "AA"] <= 2 |
|
673 |
+ noAB <- Ns[, p, "AB"] <= 2 |
|
674 |
+ noBB <- Ns[, p, "BB"] <= 2 |
|
675 |
+ ##--------------------------------------------------------------------------- |
|
676 |
+ ## Two genotype clusters not observed -- would sequence help? (didn't seem that helpful) |
|
677 |
+ ## 1 extract index of complete data |
|
678 |
+ ## 2 Regress mu1,mu3 ~ sequence + mu2 |
|
679 |
+ ## 3 Predict mu1*, mu3* for missing genotypes |
|
680 |
+ ##--------------------------------------------------------------------------- |
|
681 |
+ if(sum(noAA & noAB) > 0){ |
|
682 |
+ ##predict other cluster centers using only the observed genotypes |
|
683 |
+ ##predict AA: |
|
684 |
+ X <- cbind(1, muA.BB[index.complete, p], muB.BB[index.complete, p]) |
|
685 |
+ Y <- cbind(muA.AA[index.complete, p], |
|
686 |
+ muB.AA[index.complete, p], |
|
687 |
+ muA.AB[index.complete, p], |
|
688 |
+ muB.AB[index.complete, p]) |
|
689 |
+ XtY <- crossprod(X, Y) |
|
690 |
+ betahat <- solve(crossprod(X), XtY) |
|
691 |
+ X <- cbind(1, muA.BB[noAA & noAB, p], muB.BB[noAA & noAB, p]) |
|
692 |
+ mus <- X %*% betahat |
|
693 |
+ mus[mus <= 100] <- 100 |
|
694 |
+ muA.AA[noAA & noAB, p] <- mus[, 1] |
|
695 |
+ muB.AA[noAA & noAB, p] <- mus[, 2] |
|
696 |
+ muA.AB[noAA & noAB, p] <- mus[, 3] |
|
697 |
+ muB.AB[noAA & noAB, p] <- mus[, 4] |
|
698 |
+ } |
|
699 |
+ if(FALSE){ |
|
700 |
+ i <- which(noAA & noAB) |
|
701 |
+ par(mfrow=c(1, 1), las=1, mar=c(3, 3, 1, 1), pty="s") |
|
702 |
+ plot(log(muA.BB[i, 1]), log(muB.BB[i, 1]), pch=".", ylim=c(5,12), xlim=c(5,12), |
|
703 |
+ xlab="A", ylab="B", main="missing BB") |
|
704 |
+ points(log(muA.AB[i, 1]), log(muB.AB[i, 1]), pch=".", col="blue") |
|
705 |
+ points(log(muA.AA[i, 1]), log(muB.AA[i, 1]), pch=".", col="red") |
|
706 |
+ } |
|
707 |
+ if(sum(noBB & noAB) > 0){ |
|
708 |
+ ##predict other cluster centers using only the observed genotypes |
|
709 |
+ X <- cbind(1, muA.AA[index.complete, p], muB.AA[index.complete, p]) |
|
710 |
+ Y <- cbind(muA.AB[index.complete, p], |
|
711 |
+ muB.AB[index.complete, p], |
|
712 |
+ muA.BB[index.complete, p], #muA.AB[index.complete, p], |
|
713 |
+ muB.BB[index.complete, p])#, muB.AB[index.complete, p]) |
|
714 |
+ XtY <- crossprod(X, Y) |
|
715 |
+ betahat <- solve(crossprod(X), XtY) |
|
716 |
+ X <- cbind(1, muA.AA[noBB & noAB, p], muB.AA[noBB & noAB, p]) |
|
717 |
+ mus <- X %*% betahat |
|
718 |
+ mus[mus <= 100] <- 100 |
|
719 |
+ muA.AB[noBB & noAB, p] <- mus[, 1] |
|
720 |
+ muB.AB[noBB & noAB, p] <- mus[, 2] |
|
721 |
+ muA.BB[noBB & noAB, p] <- mus[, 3] |
|
722 |
+ muB.BB[noBB & noAB, p] <- mus[, 4] |
|
723 |
+ if(FALSE){ |
|
724 |
+ i <- which(noBB & noAB) |
|
725 |
+ par(mfrow=c(1, 1), las=1, mar=c(3, 3, 1, 1), pty="s") |
|
726 |
+ plot(log(muA.BB[i, 1]), log(muB.BB[i, 1]), pch=".", ylim=c(5,12), xlim=c(5,12), |
|
727 |
+ xlab="A", ylab="B", main="missing BB") |
|
728 |
+ points(log(muA.AB[i, 1]), log(muB.AB[i, 1]), pch=".", col="blue") |
|
729 |
+ points(log(muA.AA[i, 1]), log(muB.AA[i, 1]), pch=".", col="red") |
|
730 |
+ } |
|
731 |
+ } |
|
732 |
+ dn.Ns <- dimnames(Ns) |
|
733 |
+ Ns <- array(as.integer(Ns), dim=dim(Ns)) |
|
734 |
+ dimnames(Ns)[[3]] <- dn.Ns[[3]] |
|
735 |
+ |
|
736 |
+ assign("Ns", Ns, envir) |
|
737 |
+ assign("muA.AA", muA.AA, envir) |
|
738 |
+ assign("muA.AB", muA.AB, envir) |
|
739 |
+ assign("muA.BB", muA.BB, envir) |
|
740 |
+ assign("muB.AA", muB.AA, envir) |
|
741 |
+ assign("muB.AB", muB.AB, envir) |
|
742 |
+ assign("muB.BB", muB.BB, envir) |
|
743 |
+ assign("sigmaA", sigmaA, envir) |
|
744 |
+ assign("sigmaB", sigmaB, envir) |
|
745 |
+ assign("tau2A", tau2A, envir) |
|
746 |
+ assign("tau2B", tau2B, envir) |
|
747 |
+ assign("sig2A", sig2A, envir) |
|
748 |
+ assign("sig2B", sig2B, envir) |
|
749 |
+ assign("corr", corr, envir) |
|
750 |
+ assign("corrB.AA", corrB.AA, envir) |
|
751 |
+ assign("corrA.BB", corrA.BB, envir) |
|
752 |
+ plates.completed <- get("plates.completed", envir) |
|
753 |
+ plates.completed[p] <- TRUE |
|
754 |
+ assign("plates.completed", plates.completed, envir) |
|
755 |
+} |
|
756 |
+ |
|
757 |
+coefs <- function(plateIndex, conf, MIN.OBS=3, envir, CONF.THR=0.99){ |
|
758 |
+ p <- plateIndex |
|
759 |
+ ##if(p == 5) browser() |
|
760 |
+ plates.completed <- get("plates.completed", envir) |
|
761 |
+ if(!plates.completed[p]) return() |
|
762 |
+ plate <- get("plate", envir) |
|
763 |
+ nuA <- get("nuA", envir) |
|
764 |
+ nuB <- get("nuB", envir) |
|
765 |
+ nuA.se <- get("nuA.se", envir) |
|
766 |
+ nuB.se <- get("nuB.se", envir) |
|
767 |
+ phiA <- get("phiA", envir) |
|
768 |
+ phiB <- get("phiB", envir) |
|
769 |
+ phiA.se <- get("phiA.se", envir) |
|
770 |
+ phiB.se <- get("phiB.se", envir) |
|
771 |
+ muA.AA <- get("muA.AA", envir) |
|
772 |
+ muA.AB <- get("muA.AB", envir) |
|
773 |
+ muA.BB <- get("muA.BB", envir) |
|
774 |
+ muB.AA <- get("muB.AA", envir) |
|
775 |
+ muB.AB <- get("muB.AB", envir) |
|
776 |
+ muB.BB <- get("muB.BB", envir) |
|
777 |
+ Ns <- get("Ns", envir) |
|
778 |
+ uplate <- get("uplate", envir) |
|
779 |
+ sigmaA <- get("sigmaA", envir) |
|
780 |
+ sigmaB <- get("sigmaB", envir) |
|
781 |
+ ##fit.variance <- get("fit.variance", envir) |
|
782 |
+ IA <- cbind(muA.AA[, p], muA.AB[, p], muA.BB[, p]) |
|
783 |
+ IB <- cbind(muB.AA[, p], muB.AB[, p], muB.BB[, p]) |
|
784 |
+ NOHET <- mean(Ns[, p, 2], na.rm=TRUE) < 0.05 |
|
785 |
+ ##predict missing variance (do only once) |
|
786 |
+ is.complete <- rowSums(Ns[, p, ] >= 1) == 3 |
|
787 |
+ if(NOHET) is.complete <- rowSums(Ns[, p, c(1,3)] >= MIN.OBS) == 2 |
|
788 |
+ correct.orderA <- muA.AA[, p] > muA.BB[, p] |
|
789 |
+ correct.orderB <- muB.BB[, p] > muB.AA[, p] |
|
790 |
+ highConf <- 1-exp(-conf/1000) |
|
791 |
+ confInd <- rowMeans(highConf) > CONF.THR |
|
792 |
+ keep <- confInd & is.complete & correct.orderA & correct.orderB |
|
793 |
+## if(is.null(fit.variance)){ |
|
794 |
+## log.sigma2 <- as.numeric(log2(sigma2A[keep, ])) |
|
795 |
+## log.mus <- as.numeric(log2(IA[keep, ])) |
|
796 |
+## ##log.mus <- as.numeric(log(mus[, p, , "A"])) |
|
797 |
+## nodups <- which(!duplicated(log.sigma2)) |
|
798 |
+## i <- sample(nodups, 5000) |
|
799 |
+## include <- intersect(which(!is.na(log.sigma2) & !is.na(log.mus)), nodups) |
|
800 |
+## fit.variance <- lm(log.sigma2 ~ ns(log.mus, 3), subset=sample(include, 5000)) |
|
801 |
+## assign("fit.variance", fit.variance, envir) |
|
802 |
+## } |
|
803 |
+## i <- which(rowSums(is.na(sigma2A)) > 0) |
|
804 |
+## log.mus <- as.numeric(log2(IA[i, ])) |
|
805 |
+## predictS2 <- predict(fit.variance, newdata=data.frame(log.mus=log.mus)) |
|
806 |
+## predictS2 <- matrix(2^(predictS2), ncol=3) |
|
807 |
+## sigma2A[i, ] <- predictS2 |
|
808 |
+ ##--------------------------------------------------------------------------- |
|
809 |
+ ## Estimate nu and phi |
|
810 |
+ ##--------------------------------------------------------------------------- |
|
811 |
+ Np <- Ns[, p, ] |
|
812 |
+ sigma2A <- sigmaA^2 |
|
813 |
+ sigma2B <- sigmaB^2 |
|
814 |
+ if(NOHET){ |
|
815 |
+ ##only homozygous |
|
816 |
+ Np <- Np[, -2] |
|
817 |
+ Np[Np < 1] <- 1 |
|
818 |
+ IA <- IA[, c(1, 3)] |
|
819 |
+ sigma2A <- sigma2A[, c(1,3)] |
|
820 |
+ }else Np[Np < 1] <- 1 |
|
821 |
+ ##Using MAD instead of |
|
822 |
+ V <- sigma2A/Np |
|
823 |
+ W <- sqrt(1/V) |
|
824 |
+ Ystar <- IA*W |
|
825 |
+ is.complete <- rowSums(is.na(W)) == 0 |
|
826 |
+ ##is.complete <- is.complete & correct.orderA & correct.orderB & confInd & notmissing |
|
827 |
+ nuphiAllele(allele="A", Ystar=Ystar[is.complete, ], W=W[is.complete, ], envir=envir, p=p) |
|
828 |
+ nuA[is.complete, p] <- nu |
|
829 |
+ phiA[is.complete, p] <- phi |
|
830 |
+ nuA.se[is.complete, p] <- nu.se |
|
831 |
+ phiA.se[is.complete, p] <- phi.se |
|
832 |
+ |
|
833 |
+ if(NOHET){ |
|
834 |
+ IB <- IB[, c(1, 3)] |
|
835 |
+ sigma2B <- sigma2B[, c(1,3)] |
|
836 |
+ } |
|
837 |
+## i <- which(rowSums(is.na(sigma2B)) > 0) |
|
838 |
+## log.mus <- as.numeric(log2(IB[i, ])) |
|
839 |
+## predictS2 <- predict(fit.variance, newdata=data.frame(log.mus=log.mus)) |
|
840 |
+## predictS2 <- matrix(2^(predictS2), ncol=ncol(sigma2B)) |
|
841 |
+## sigma2B[i, ] <- predictS2 |
|
842 |
+ V <- sigma2B/Np |
|
843 |
+ W <- sqrt(1/V) |
|
844 |
+ Ystar <- IB*W |
|
845 |
+ is.complete <- rowSums(is.na(W)) == 0 |
|
846 |
+ nuphiAllele(allele="B", Ystar=Ystar[is.complete, ], W=W[is.complete, ], envir=envir, p=p) |
|
847 |
+ nuB[is.complete, p] <- nu |
|
848 |
+ phiB[is.complete, p] <- phi |
|
849 |
+ nuB.se[is.complete, p] <- nu.se |
|
850 |
+ phiB.se[is.complete, p] <- phi.se |
|
851 |
+ phiA <- matrix(as.integer(phiA), nrow(phiA), ncol(phiA)) |
|
852 |
+ phiB <- matrix(as.integer(phiB), nrow(phiA), ncol(phiA)) |
|
853 |
+ |
|
854 |
+ assign("nuA", nuA, envir) |
|
855 |
+ assign("nuB", nuB, envir) |
|
856 |
+ assign("phiA", phiA, envir) |
|
857 |
+ assign("phiB", phiB, envir) |
|
858 |
+ assign("nuA.se", nuA.se, envir) |
|
859 |
+ assign("nuB.se", nuB.se, envir) |
|
860 |
+ assign("phiA.se", phiA.se, envir) |
|
861 |
+ assign("phiB.se", phiB.se, envir) |
|
862 |
+} |
|
863 |
+ |
|
864 |
+copynumber <- function(plateIndex, A, B, envir){ |
|
865 |
+ p <- plateIndex |
|
866 |
+ plates.completed <- get("plates.completed", envir) |
|
867 |
+ if(!plates.completed[p]) return() |
|
868 |
+ plate <- get("plate", envir) |
|
869 |
+ nuA <- get("nuA", envir) |
|
870 |
+ nuB <- get("nuB", envir) |
|
871 |
+ phiA <- get("phiA", envir) |
|
872 |
+ phiB <- get("phiB", envir) |
|
873 |
+ uplate <- get("uplate", envir) |
|
874 |
+ muA.AA <- get("muA.AA", envir) |
|
875 |
+ muA.AB <- get("muA.AB", envir) |
|
876 |
+ muA.BB <- get("muA.BB", envir) |
|
877 |
+ muB.AA <- get("muB.AA", envir) |
|
878 |
+ muB.AB <- get("muB.AB", envir) |
|
879 |
+ muB.BB <- get("muB.BB", envir) |
|
880 |
+ sd.CT <- get("sd.CT", envir) |
|
881 |
+ sigmaA <- get("sigmaA", envir) |
|
882 |
+ sigmaB <- get("sigmaB", envir) |
|
883 |
+ Ns <- get("Ns", envir) |
|
884 |
+ CA <- get("CA", envir) |
|
885 |
+ CB <- get("CB", envir) |
|
886 |
+ |
|
887 |
+ NOHET <- mean(Ns[, p, 2], na.rm=TRUE) < 0.05 |
|
888 |
+ if(NOHET){ |
|
889 |
+ IA <- cbind(muA.AA[, p], muA.BB[, p]) |
|
890 |
+ IB <- cbind(muB.AA[, p], muB.BB[, p]) |
|
891 |
+ sigma2A <- cbind(sigmaA[, c(1, 3)]^2) |
|
892 |
+ sigma2B <- cbind(sigmaB[, c(1, 3)]^2) |
|
893 |
+## sigma2A <- cbind(Sigma.AA[, "varA", p], Sigma.BB[, "varA", p]) |
|
894 |
+## colnames(sigma2A) <- c("AA", "BB") |
|
895 |
+## sigma2B <- cbind(Sigma.AA[, "varB", p], Sigma.BB[, "varB", p]) |
|
896 |
+## colnames(sigma2B) <- c("AA", "BB") |
|
897 |
+ } else{ |
|
898 |
+ IA <- cbind(muA.AA[, p], muA.AB[, p], muA.BB[, p]) |
|
899 |
+ IB <- cbind(muB.AA[, p], muB.AB[, p], muB.BB[, p]) |
|
900 |
+ sigma2A <- sigmaA^2 |
|
901 |
+ sigma2B <- sigmaB^2 |
|
902 |
+## sigma2A <- cbind(Sigma.AA[, "varA", p], Sigma.AB[, "varA", p], Sigma.BB[, "varA", p]) |
|
903 |
+## colnames(sigma2A) <- c("AA", "AB", "BB") |
|
904 |
+## sigma2B <- cbind(Sigma.AA[, "varB", p], Sigma.AB[, "varB", p], Sigma.BB[, "varB", p]) |
|
905 |
+## colnames(sigma2B) <- c("AA", "AB", "BB") |
|
906 |
+ } |
|
907 |
+ ##sigma2A <- get("sigma2A", envir) |
|
908 |
+ ##sigma2B <- get("sigma2B", envir) |
|
909 |
+## fit.variance <- get("fit.variance", envir) |
|
910 |
+ |
|
911 |
+ ##--------------------------------------------------------------------------- |
|
912 |
+ ## Estimate CA, CB |
|
913 |
+ ##--------------------------------------------------------------------------- |
|
914 |
+ phiA[phiA < 1] <- 1 |
|
915 |
+ tmp <- (1/phiA[, p])*(A - nuA[, p]) |
|
916 |
+ tmp <- matrix(as.integer(tmp*100), nrow(tmp), ncol(tmp)) |
|
917 |
+ if(FALSE) hist(rowMeans(tmp/100), breaks=1000, xlim=c(-2, 8)) |
|
918 |
+ CA[, plate==uplate[p]] <- tmp |
|
919 |
+ phiB[phiB < 1] <- 1 |
|
920 |
+ tmp <- (1/phiB[, p])*(B - nuB[, p]) |
|
921 |
+ tmp <- matrix(as.integer(tmp*100), nrow(tmp), ncol(tmp)) |
|
922 |
+ if(FALSE) hist(rowMeans(tmp/100), breaks=1000, xlim=c(-2, 8)) |
|
923 |
+ CB[, plate==uplate[p]] <- tmp |
|
924 |
+ |
|
925 |
+ nuA[nuA < 1] <- 1 |
|
926 |
+ nuB[nuB < 1] <- 1 |
|
927 |
+ |
|
928 |
+ tmpA <- CA[, plate==uplate[p]] |
|
929 |
+ tmpB <- CB[, plate==uplate[p]] |
|
930 |
+ indexA <- which(rowSums(is.na(tmpA)) > 0) |
|
931 |
+ indexB <- which(rowSums(is.na(tmpB)) > 0) |
|
932 |
+ index <- union(indexA, indexB) |
|
933 |
+ if(length(index) > 0) browser() |
|
934 |
+ ##--------------------------------------------------------------------------- |
|
935 |
+ ## Estimate var(CA), var(CB), var(CA+CB) |
|
936 |
+ ##--------------------------------------------------------------------------- |
|
937 |
+ ##var(CA) = 1/phiA^2*var(IA - nuA) |
|
938 |
+ ## = 1/phiA^2*var(IA) |
|
939 |
+ ## = 1/phiA^2*f(mu) |
|
940 |
+## log.musA <- as.numeric(log2(IA)) |
|
941 |
+## log.sigma2A <- as.numeric(log2(sigma2A)) |
|
942 |
+## log.musB <- as.numeric(log2(IB)) |
|
943 |
+## log.sigma2B <- as.numeric(log2(sigma2B)) |
|
944 |
+## f <- predict(fit.variance, newdata=data.frame(log.mus=log.musA)) |
|
945 |
+## resid <- matrix(log.sigma2A - f, ncol=ncol(sigma2A)) |
|
946 |
+## ls2 <- rowMedians(resid, na.rm=TRUE) |
|
947 |
+## log.musA <- matrix(log.musA, ncol=ncol(sigma2A)) |
|
948 |
+## tmp <- apply(log.musA + ls2, 2, function(log.mus, fit) sqrt(2^(predict(fit, newdata=data.frame(log.mus=log.mus)))), fit=fit.variance) |
|
949 |
+## sdA <- 1/phiA[, p]*rowMedians(tmp) |
|
950 |
+## ##If all three points are below the spline, the 3 residuals |
|
951 |
+## ##will be negative. adding the median residual to the log mus |
|
952 |
+## ##protects against over/under estimating the variance |
|
953 |
+## ##sdA[tmpIndex, p, ] <- apply(log.musA[tmpIndex, ] + ls2, 2, |
|
954 |
+## ##function(log.mus, fit) sqrt(exp(predict(fit.logVariance, |
|
955 |
+## ##newdata=data.frame(log.mus=log.mus))))) |
|
956 |
+## robustSD <- function(X) diff(quantile(X, probs=c(0.16, (1-0.16)), na.rm=TRUE))/2 |
|
957 |
+## sample.sd <- apply(CA[, plate==uplate[p]]/100, 2, robustSD) |
|
958 |
+## sd.ca <- sdA %*% t(sample.sd) |
|
959 |
+## ##where missing, use the residual from the genotype with the most observations |
|
960 |
+## f <- predict(fit.variance, newdata=data.frame(log.mus=log.musB)) |
|
961 |
+## resid <- matrix(log.sigma2B - f, ncol=ncol(sigma2B)) |
|
962 |
+## ls2 <- rowMedians(resid, na.rm=TRUE) |
|
963 |
+## log.musB <- matrix(log.musB, ncol=ncol(sigma2B)) |
|
964 |
+## tmp <- apply(log.musB + ls2, 2, function(log.mus, fit) sqrt(2^(predict(fit, newdata=data.frame(log.mus=log.mus)))), fit=fit.variance) |
|
965 |
+## sdB <- 1/phiB[, p]*rowMedians(tmp) |
|
966 |
+## sample.sd <- apply(CB[, plate==uplate[p]]/100, 2, robustSD) |
|
967 |
+## sd.cb <- sdB %*% t(sample.sd) |
|
968 |
+## covar <- rowCovs(1/phiA[, p]*log.musA, 1/phiB[, p]*log.musB) |
|
969 |
+## ##tmp1 <- sqrt(sdA^2 + sdB^2 - 2*covar) ##probe-specific variance |
|
970 |
+## tmp <- sqrt(sd.ca^2 + sd.cb^2 - 2*covar) ##probe-specific variance |
|
971 |
+## sd.CT[, plate==uplate[p]] <- matrix(as.integer(100*tmp), nrow(tmp), ncol(tmp)) |
|
972 |
+ ##another approach: |
|
973 |
+ ##tmp <- apply(log(A), 2, function(log.mus, fit) sqrt(exp(predict(fit, newdata=data.frame(log.mus=log.mus)))), fit=fit.variance) |
|
974 |
+ ##tmp2 <- 1/phiA[, p]*tmp |
|
975 |
+## CA <- matrix(as.integer(CA*100), nrow(CA), ncol(CA)) |
|
976 |
+## CB <- matrix(as.integer(CB*100), nrow(CB), ncol(CB)) |
|
977 |
+## sd.CT <- matrix(as.integer(sd.CT*100), nrow(sd.CT), ncol(sd.CT)) |
|
978 |
+ assign("CA", CA, envir) |
|
979 |
+ assign("CB", CB, envir) |
|
980 |
+## assign("sd.CT", sd.CT, envir) |
|
981 |
+ ##--------------------------------------------------------------------------- |
|
982 |
+ ## nonpolymorphic probes |
|
983 |
+ ##--------------------------------------------------------------------------- |
|
984 |
+} |
... | ... |
@@ -11,22 +11,22 @@ changeToCrlmmAnnotationName <- function(x){ |
11 | 11 |
} |
12 | 12 |
|
13 | 13 |
getCrlmmAnnotationName <- function(x){ |
14 |
- paste(tolower(gsub("_", "", x)), "Crlmm", sep="") |
|
14 |
+ paste(tolower(gsub("_", "", x)), "Crlmm", sep="") |
|
15 | 15 |
} |
16 | 16 |
|
17 | 17 |
medianSummaries <- function(mat, grps) |
18 | 18 |
.Call("R_subColSummarize_median", mat, grps, PACKAGE = "preprocessCore") |
19 | 19 |
|
20 | 20 |
intMedianSummaries <- function(mat, grps) |
21 |
- as.integer(medianSummaries(mat, grps)) |
|
21 |
+ as.integer(medianSummaries(mat, grps)) |
|
22 | 22 |
|
23 | 23 |
testProb <- function(p) |
24 | 24 |
.Call("test", p) |
25 | 25 |
|
26 | 26 |
|
27 | 27 |
list.celfiles <- function(...){ |
28 |
- files <- list.files(...) |
|
29 |
- return(files[grep("\\.[cC][eE][lL]$", files)]) |
|
28 |
+ files <- list.files(...) |
|
29 |
+ return(files[grep("\\.[cC][eE][lL]$", files)]) |
|
30 | 30 |
} |
31 | 31 |
|
32 | 32 |
## .crlmmPkgEnv is an enviroment that will |
... | ... |
@@ -36,10 +36,9 @@ list.celfiles <- function(...){ |
36 | 36 |
## R CMD check |
37 | 37 |
|
38 | 38 |
isLoaded <- function(dataset, environ=.crlmmPkgEnv) |
39 |
- exists(dataset, envir=environ) |
|
40 |
- |
|
39 |
+ exists(dataset, envir=environ) |
|
41 | 40 |
getVarInEnv <- function(dataset, environ=.crlmmPkgEnv){ |
42 |
- if (!isLoaded(dataset)) |
|
43 |
- stop("Variable ", dataset, " not found in .crlmmPkgEnv") |
|
44 |
- environ[[dataset]] |
|
41 |
+ if (!isLoaded(dataset)) |
|
42 |
+ stop("Variable ", dataset, " not found in .crlmmPkgEnv") |
|
43 |
+ environ[[dataset]] |
|
45 | 44 |
} |
17 | 16 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,378 @@ |
1 |
+%\VignetteIndexEntry{crlmm copy number Vignette} |
|
2 |
+%\VignetteDepends{crlmm, genomewidesnp6Crlmm} |
|
3 |
+%\VignetteKeywords{crlmm, SNP 6} |
|
4 |
+%\VignettePackage{crlmm} |
|
5 |
+\documentclass{article} |
|
6 |
+\usepackage{graphicx} |
|
7 |
+\newcommand{\Rfunction}[1]{{\texttt{#1}}} |
|
8 |
+\newcommand{\Rmethod}[1]{{\texttt{#1}}} |
|
9 |
+\newcommand{\Rcode}[1]{{\texttt{#1}}} |
|
10 |
+\newcommand{\Robject}[1]{{\texttt{#1}}} |
|
11 |
+\newcommand{\Rpackage}[1]{{\textsf{#1}}} |
|
12 |
+\newcommand{\Rclass}[1]{{\textit{#1}}} |
|
13 |
+\newcommand{\oligo}{\Rpackage{oligo }} |
|
14 |
+\newcommand{\R}{\textsf{R}} |
|
15 |
+ |
|
16 |
+\begin{document} |
|
17 |
+\title{Estimating copy number for Affymetrix 6.0 with the crlmm Package} |
|
18 |
+\date{February, 2009} |
|
19 |
+\author{Rob Scharpf} |
|
20 |
+\maketitle |
|
21 |
+ |
|
22 |
+<<setup, echo=FALSE, results=hide>>= |
|
23 |
+options(width=60) |
|
24 |
+options(continue=" ") |
|
25 |
+options(prompt="R> ") |
|
26 |
+@ |
|
27 |
+ |
|
28 |
+\section{Estimating copy number} |
|
29 |
+ |
|
30 |
+General requirements: in addition to the \R{} packages indicated |
|
31 |
+below, processing a large number of samples requires a high-end |
|
32 |
+computer with a large amount of RAM. Currently, the maximum number of |
|
33 |
+samples that can be genotyped at once is approximately 2000 and this |
|
34 |
+requires roughly 32G of RAM. |
|
35 |
+ |
|
36 |
+\paragraph{Suggested work flow.} I typically submit code for |
|
37 |
+preprocessing and genotyping as a batch job to a computing cluster. |
|
38 |
+When the preprocessing is complete, I pick a chromosome and work |
|
39 |
+interactively from the cluster to calculate copy number and make some |
|
40 |
+of the suggested diagnostic plots. |
|
41 |
+ |
|
42 |
+\subsection{Preprocess and genotype the samples} |
|
43 |
+ |
|
44 |
+First, load the required libraries. |
|
45 |
+ |
|
46 |
+<<requiredPackages>>= |
|
47 |
+library(crlmm) |
|
48 |
+library(genomewidesnp6Crlmm) |
|
49 |
+library(ellipse) |
|
50 |
+@ |
|
51 |
+ |
|
52 |
+Specify an output directory and provide the complete path for the CEL |
|
53 |
+file names. |
|
54 |
+ |
|
55 |
+<<>>= |
|
56 |
+outdir <- "/thumper/ctsa/snpmicroarray/rs/data/gain/bipolar/GRU-EA" |
|
57 |
+datadir <- "/thumper/ctsa/snpmicroarray/GAIN/Bipolar/GRU-EA" |
|
58 |
+fns <- list.files(datadir, pattern=".CEL", full.names=TRUE) |
|
59 |
+@ |
|
60 |
+ |
|
61 |
+Genotype the samples with CRLMM. Submit this job to the cluster and save the |
|
62 |
+output to \Robject{outdir}. |
|
63 |
+ |
|
64 |
+<<genotype, eval=FALSE>>= |
|
65 |
+res2 <- crlmm(filenames=fns, save.it=TRUE, intensityFile=file.path(outdir, "intensities.rda")) |
|
66 |
+save(res2, file=file.path(outdir, "res2.rda")) |
|
67 |
+@ |
|
68 |
+ |
|
69 |
+Quantile normalize the nonpolymorphic probes and save the output: |
|
70 |
+ |
|
71 |
+<<quantileNormalizeCnProbes, eval=FALSE>>= |
|
72 |
+res3 <- cnrma(fns) |
|
73 |
+save(res3, file=file.path(outdir, "res3.rda")) |
|
74 |
+@ |
|
75 |
+ |
|
76 |
+\subsection{Copy number} |
|
77 |
+ |
|
78 |
+Copy number can be assessed one chromosome at a time. Here we specify |
|
79 |
+chromosome 15 and and load a list of indices used to subset the files |
|
80 |
+saved in the previous section. The first element in the list |
|
81 |
+correspond to indices of polymorphic probes on chromosome 15; the |
|
82 |
+second element corresponds to indices of nonpolymorphic probes on |
|
83 |
+chromosome 15. |
|
84 |
+ |
|
85 |
+<<chromosomeIndex>>= |
|
86 |
+CHR <- 15 |
|
87 |
+CHR_INDEX <- paste(CHR, "index", sep="") |
|
88 |
+data(list=CHR_INDEX, package="genomewidesnp6Crlmm") |
|
89 |
+str(index) |
|
90 |
+@ |
|
91 |
+ |
|
92 |
+Next we load 3 files that were saved from the preprocessing step and |
|
93 |
+then subset these lists using the above indices to extract the |
|
94 |
+preprocessed intensities and genotypes needed for estimating copy |
|
95 |
+number. Specifically, we require 6 items: |
|
96 |
+ |
|
97 |
+\begin{itemize} |
|
98 |
+ \item quantile-normalized A intensities (I1 x J) |
|
99 |
+ \item quantile-normalized B intensities (I1 x J) |
|
100 |
+ \item quantile-normalized intensities from nonpolymorphic (NP) probes (I2 x J) |
|
101 |
+ \item genotype calls (I1 x J) |
|
102 |
+ \item confidence scores of the genotype calls (I1 x J) |
|
103 |
+ \item signal to noise ratio (SNR) of the samples (J) |
|
104 |
+ \end{itemize} |
|
105 |
+ |
|
106 |
+ These items are extracted as follows: |
|
107 |
+ |
|
108 |
+<<eval=FALSE>>= |
|
109 |
+load(file.path(outdir, "intensities.rda")) |
|
110 |
+load(file.path(outdir, "res2.rda")) |
|
111 |
+load(file.path(outdir, "res3.rda")) |
|
112 |
+@ |
|
113 |
+ |
|
114 |
+Depending on the number of samples, the above objects can be large and |
|
115 |
+take a while to load. For the figures in this vignette, I am loading |
|
116 |
+only 5000 SNP-level summaries from chromosome 22 (28MB) and the |
|
117 |
+quantile-normalized intensities for 5000 nonpolymorphic probes on |
|
118 |
+chromosome 22 (8 MB total). |
|
119 |
+ |
|
120 |
+<<loadTmpFiles, eval=FALSE, echo=FALSE>>= |
|
121 |
+load("/thumper/ctsa/snpmicroarray/rs/data/gain/bipolar/tmp.rda") |
|
122 |
+assign("res", tmp, envir=.GlobalEnv) |
|
123 |
+rm(tmp); gc() |
|
124 |
+load("/thumper/ctsa/snpmicroarray/rs/data/gain/bipolar/tmpcn.rda") |
|
125 |
+A <- res$A |
|
126 |
+B <- res$B |
|
127 |
+calls <- res$calls |
|
128 |
+conf <- res$conf |
|
129 |
+SNR <- res$SNR |
|
130 |
+NP <- res3$NP |
|
131 |
+@ |
|
132 |
+ |
|
133 |
+<<snpAndCnSummaries, eval=FALSE>>= |
|
134 |
+A <- res$A |
|
135 |
+B <- res$B |
|
136 |
+calls <- res2$calls |
|
137 |
+conf <- res2$conf |
|
138 |
+SNR <- res2$SNR |
|
139 |
+NP <- res3$NP |
|
140 |
+rm(res, res2, res3) |
|
141 |
+gc() |
|
142 |
+@ |
|
143 |
+ |
|
144 |
+Make a histogram of the signal to noise ratio for these samples: |
|
145 |
+ |
|
146 |
+<<plotSnr, fig=TRUE>>= |
|
147 |
+hist(SNR, xlab="SNR", main="") |
|
148 |
+@ |
|
149 |
+ |
|
150 |
+We suggest excluding samples with a signal to noise ratio less than 5. |
|
151 |
+We then extract the probes (rows) that are on chromosome 15 and the |
|
152 |
+samples (columns) that have a suitable signal to noise ratio. |
|
153 |
+ |
|
154 |
+<<subsetCrlmmOutput, eval=FALSE>>= |
|
155 |
+A <- A[index[[1]], SNR > 5] |
|
156 |
+B <- B[index[[1]], SNR > 5] |
|
157 |
+calls <- calls[index[[1]], SNR > 5] |
|
158 |
+conf <- conf[index[[1]], SNR > 5] |
|
159 |
+NP <- NP[index[[2]], SNR > 5] |
|
160 |
+rm(res, res2, res3); gc() |
|
161 |
+@ |
|
162 |
+ |
|
163 |
+We want to adjust for batch effects. In our experience, the chemistry |
|
164 |
+plate is a good surrogate for batch, although this should be assessed |
|
165 |
+on a study by study basis. For samples processed by Broad, the plate |
|
166 |
+is indicated by a capitalized five-letter word and is part of the |
|
167 |
+sample name. It is important that the plate (batch) variable is |
|
168 |
+ordered the same as filenames in the preprocessed data. For instance, |
|
169 |
+to indicate plate in this example one can use the command |
|
170 |
+ |
|
171 |
+<<specifyBatch>>= |
|
172 |
+sns <- colnames(calls) |
|
173 |
+sns[1] |
|
174 |
+plates <- sapply(sns, function(x) strsplit(x, "_")[[1]][2]) |
|
175 |
+table(plates) |
|
176 |
+@ |
|
177 |
+ |
|
178 |
+We are now ready to estimate the copy number for each batch. In the |
|
179 |
+current version of this package, one specifies an environment to which |
|
180 |
+intermediate \R{} objects for copy number estimation are |
|
181 |
+stored. Allele-specific estimates of copy number are also stored in |
|
182 |
+this environment. |
|
183 |
+ |
|
184 |
+<<copyNumberByBatch, eval=FALSE>>= |
|
185 |
+e1 <- new.env() |
|
186 |
+computeCnBatch(A=A, |
|
187 |
+ B=B, |
|
188 |
+ calls=calls, |
|
189 |
+ conf=conf, |
|
190 |
+ NP=NP, |
|
191 |
+ plate=plates, |
|
192 |
+ envir=e1, chrom=CHR, DF.PRIOR=75) |
|
193 |
+@ |
|
194 |
+ |
|
195 |
+The DF.PRIOR indicates how much we will shrink SNP-specific estimates |
|
196 |
+of the variance and correlation. |
|
197 |
+ |
|
198 |
+<<accessingEstimates>>= |
|
199 |
+copyA <- get("CA", e1) |
|
200 |
+copyB <- get("CB", e1) |
|
201 |
+copyT <- (copyA+copyB)/100 |
|
202 |
+copyT[copyT < 0] <- 0 |
|
203 |
+copyT[copyT > 6] <- 6 |
|
204 |
+@ |
|
205 |
+ |
|
206 |
+\section{Suggested plots} |
|
207 |
+ |
|
208 |
+\paragraph{One sample at a time} |
|
209 |
+ |
|
210 |
+<<annotation>>= |
|
211 |
+data(snpProbes, package="genomewidesnp6Crlmm") |
|
212 |
+data(cnProbes, package="genomewidesnp6Crlmm") |
|
213 |
+position <- snpProbes[match(rownames(calls), rownames(snpProbes)), "position"] |
|
214 |
+@ |
|
215 |
+ |
|
216 |
+To plot physical position versus copy number for the first sample: |
|
217 |
+ |
|
218 |
+<<oneSample, fig=TRUE, width=8, height=4, include=FALSE>>= |
|
219 |
+require(SNPchip) |
|
220 |
+par(las=1) |
|
221 |
+plotCytoband(as.character(CHR), ylim=c(0,7), cytoband.ycoords=c(6.5,7), label.cytoband=FALSE, main="Chr 15") |
|
222 |
+points(position, copyT[, 1], pch=".", cex=2, , xaxt="n", col="grey70") |
|
223 |
+axis(1, at=pretty(range(position)), labels=pretty(range(position))/1e6) |
|
224 |
+axis(2, at=0:5, labels=0:5) |
|
225 |
+mtext("position (Mb)", 1, line=2) |
|
226 |
+mtext(expression(C[A] + C[B]), 2, line=2, las=3) |
|
227 |
+@ |
|
228 |
+ |
|
229 |
+\begin{figure} |
|
230 |
+ \includegraphics[width=0.9\textwidth]{copynumber-oneSample} |
|
231 |
+ \caption{Total copy number (y-axis) for chromosome 22 plotted |
|
232 |
+ against physical position (x-axis) for one sample. } |
|
233 |
+\end{figure} |
|
234 |
+ |
|
235 |
+\paragraph{One SNP at a time} |
|
236 |
+ |
|
237 |
+<<intermediateFiles>>= |
|
238 |
+tau2A <- get("tau2A", e1) |
|
239 |
+tau2B <- get("tau2B", e1) |
|
240 |
+sig2A <- get("sig2A", e1) |
|
241 |
+sig2B <- get("sig2B", e1) |
|
242 |
+nuA <- get("nuA", e1) |
|
243 |
+phiA <- get("phiA", e1) |
|
244 |
+nuB <- get("nuB", e1) |
|
245 |
+phiB <- get("phiB", e1) |
|
246 |
+corr <- get("corr", e1) |
|
247 |
+corrA.BB <- get("corrA.BB", e1) |
|
248 |
+corrB.AA <- get("corrA.BB", e1) |
|
249 |
+A <- get("A", e1) |
|
250 |
+B <- get("B", e1) |
|
251 |
+@ |
|
252 |
+ |
|
253 |
+Here, we plot the prediction regions for total copy number 2 and 3 for |
|
254 |
+the first plate. Black plotting symbols are estimates from the first |
|
255 |
+plate; light grey are points from other plates. (You could also draw |
|
256 |
+prediction regions for 0-4 copies, but it gets crowded). Notice that |
|
257 |
+there is little evidence of a plate effect for this SNP. |
|
258 |
+ |
|
259 |
+<<predictionRegion, fig=TRUE, width=8, height=8, include=FALSE>>= |
|
260 |
+par(las=1) |
|
261 |
+p <- 1 ##indicates plate |
|
262 |
+J <- grep(unique(plates)[p], sns) ##sample indices for this plate |
|
263 |
+ylim <- c(6.5,13) |
|
264 |
+I <- which(phiA > 10 & phiB > 10) |
|
265 |
+i <- I[1] |
|
266 |
+plot(log2(A[i, ]), log2(B[i, ]), pch=as.character(calls[i, ]), col="grey60", cex=0.9, ylim=ylim, xlim=ylim, xlab="A", ylab="B") |
|
267 |
+points(log2(A[i, J]), log2(B[i, J]), col="black", pch=as.character(calls[i, J])) |
|
268 |
+for(CT in 2:3){ |
|
269 |
+ if(CT == 2) ellipse.col <- "brown" else ellipse.col <- "purple" |
|
270 |
+ for(CA in 0:CT){ |
|
271 |
+ CB <- CT-CA |
|
272 |
+ A.scale <- sqrt(tau2A[i, p]*(CA==0) + sig2A[i, p]*(CA > 0)) |
|
273 |
+ B.scale <- sqrt(tau2B[i, p]*(CB==0) + sig2B[i, p]*(CB > 0)) |
|
274 |
+ scale <- c(A.scale, B.scale) |
|
275 |
+ if(CA == 0 & CB > 0) rho <- corrA.BB[i, p] |
|