git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@36653 bc3139a8-67e5-0310-9ffc-ced21a209358
1 | 1 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,19 @@ |
1 |
+Package: crlmm |
|
2 |
+Type: Package |
|
3 |
+Title: Genotype Calling via CRLMM Algorithm |
|
4 |
+Version: 1.24 |
|
5 |
+Date: 2008-12-28 |
|
6 |
+Author: Rafael A Irizarry |
|
7 |
+Maintainer: Benilton S Carvalho <bcarvalh@jhsph.edu> |
|
8 |
+Description: Faster implementation of CRLMM specific to SNP 5.0 and 6.0 arrays. |
|
9 |
+License: Artistic-2.0 |
|
10 |
+Depends: methods, affyio, preprocessCore, utils |
|
11 |
+Collate: crlmmGT.R |
|
12 |
+ crlmm.R |
|
13 |
+ fitAffySnpMixture56.R |
|
14 |
+ snprma.R |
|
15 |
+ utils.R |
|
16 |
+ zzz.R |
|
17 |
+ crlmmGTnm.R |
|
18 |
+ crlmmnm.R |
|
19 |
+LazyLoad: yes |
0 | 5 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,117 @@ |
1 |
+crlmm <- function(filenames, row.names=TRUE, col.names=TRUE, |
|
2 |
+ probs=c(1/3, 1/3, 1/3), DF=6, SNRMin=5, gender=NULL, |
|
3 |
+ save.it=FALSE, load.it=FALSE, intensityFile, |
|
4 |
+ mixtureSampleSize=10^5, eps=0.1, verbose=TRUE, |
|
5 |
+ cdfName, sns, recallMin=10, recallRegMin=1000, |
|
6 |
+ returnParams=FALSE, badSNP=.7){ |
|
7 |
+ if ((load.it | save.it) & missing(intensityFile)) |
|
8 |
+ stop("'intensityFile' is missing, and you chose either load.it or save.it") |
|
9 |
+ |
|
10 |
+ if (missing(sns)) sns <- basename(filenames) |
|
11 |
+ if (load.it & !file.exists(intensityFile)){ |
|
12 |
+ load.it <- FALSE |
|
13 |
+ message("File ", intensityFile, " does not exist.") |
|
14 |
+ message("Not loading it, but running SNPRMA from scratch.") |
|
15 |
+ } |
|
16 |
+ if (!load.it){ |
|
17 |
+ res <- snprma(filenames, fitMixture=TRUE, |
|
18 |
+ mixtureSampleSize=mixtureSampleSize, verbose=verbose, |
|
19 |
+ eps=eps, cdfName=cdfName, sns=sns) |
|
20 |
+ if(save.it){ |
|
21 |
+ t0 <- proc.time() |
|
22 |
+ save(res, file=intensityFile) |
|
23 |
+ t0 <- proc.time()-t0 |
|
24 |
+ if (verbose) message("Used ", t0[3], " seconds to save ", intensityFile, ".") |
|
25 |
+ } |
|
26 |
+ }else{ |
|
27 |
+ if (verbose) message("Loading ", intensityFile, ".") |
|
28 |
+ obj <- load(intensityFile) |
|
29 |
+ if (verbose) message("Done.") |
|
30 |
+ if (obj != "res") |
|
31 |
+ stop("Object in ", intensityFile, " seems to be invalid.") |
|
32 |
+ } |
|
33 |
+ if(row.names) row.names=res$gns else row.names=NULL |
|
34 |
+ if(col.names) col.names=res$sns else col.names=NULL |
|
35 |
+ |
|
36 |
+ res2 <- crlmmGT(res[["A"]], res[["B"]], res[["SNR"]], |
|
37 |
+ res[["mixtureParams"]], res[["cdfName"]], |
|
38 |
+ gender=gender, row.names=row.names, |
|
39 |
+ col.names=col.names, recallMin=recallMin, |
|
40 |
+ recallRegMin=1000, SNRMin=SNRMin, |
|
41 |
+ returnParams=returnParams, badSNP=badSNP) |
|
42 |
+ |
|
43 |
+ res2[["SNR"]] <- res[["SNR"]] |
|
44 |
+ res2[["SKW"]] <- res[["SKW"]] |
|
45 |
+ return(res2) |
|
46 |
+} |
|
47 |
+ |
|
48 |
+ |
|
49 |
+############################### |
|
50 |
+####### THIS IS TEMPORARY NOT OFFICIALLY USED |
|
51 |
+##################################### |
|
52 |
+ |
|
53 |
+crlmmTNoN <- function(filenames, row.names=TRUE, col.names=TRUE, |
|
54 |
+ probs=c(1/3, 1/3, 1/3), DF=6, SNRMin=5, gender=NULL, |
|
55 |
+ save.it=FALSE, load.it=FALSE, |
|
56 |
+ intensityFile="tmpcrlmmintensities.rda", |
|
57 |
+ desctrucitve=FALSE, mixtureSampleSize=10^5, eps=0.1, |
|
58 |
+ verbose=TRUE){ |
|
59 |
+ if (load.it & !file.exists(intensityFile)){ |
|
60 |
+ load.it <- FALSE |
|
61 |
+ message("File ", intensityFile, " does not exist.") |
|
62 |
+ message("Not loading it, but running SNPRMA from scratch.") |
|
63 |
+ } |
|
64 |
+ if (!load.it){ |
|
65 |
+ res <- snprma(filenames, fitMixture=TRUE, |
|
66 |
+ mixtureSampleSize=mixtureSampleSize, verbose=verbose, |
|
67 |
+ eps=eps) |
|
68 |
+ if(save.it) save(res, file=intensityFile) |
|
69 |
+ }else{ |
|
70 |
+ message("Loading ", intensityFile, ".") |
|
71 |
+ obj <- load(intensityFile) |
|
72 |
+ message("Done.") |
|
73 |
+ if (obj != "res") |
|
74 |
+ stop("Object in ", intensityFile, " seems to be invalid.") |
|
75 |
+ } |
|
76 |
+ if(row.names) row.names=res$gns else row.names=NULL |
|
77 |
+ if(col.names) col.names=res$sns else col.names=NULL |
|
78 |
+ res2 <- crlmmGTTNoN(res[["A"]], res[["B"]], res[["SNR"]], |
|
79 |
+ res[["mixtureParams"]], res[["cdfName"]], |
|
80 |
+ gender=gender, row.names=row.names, |
|
81 |
+ col.names=col.names) |
|
82 |
+ res2[["SNR"]] <- res[["SNR"]] |
|
83 |
+ return(res2) |
|
84 |
+} |
|
85 |
+ |
|
86 |
+crlmmNormalNoN <- function(filenames, row.names=TRUE, col.names=TRUE, |
|
87 |
+ probs=c(1/3, 1/3, 1/3), DF=6, SNRMin=5, gender=NULL, |
|
88 |
+ save.it=FALSE, load.it=FALSE, |
|
89 |
+ intensityFile="tmpcrlmmintensities.rda", |
|
90 |
+ desctrucitve=FALSE, mixtureSampleSize=10^5, eps=0.1, |
|
91 |
+ verbose=TRUE){ |
|
92 |
+ if (load.it & !file.exists(intensityFile)){ |
|
93 |
+ load.it <- FALSE |
|
94 |
+ message("File ", intensityFile, " does not exist.") |
|
95 |
+ message("Not loading it, but running SNPRMA from scratch.") |
|
96 |
+ } |
|
97 |
+ if (!load.it){ |
|
98 |
+ res <- snprma(filenames, fitMixture=TRUE, |
|
99 |
+ mixtureSampleSize=mixtureSampleSize, verbose=verbose, |
|
100 |
+ eps=eps) |
|
101 |
+ if(save.it) save(res, file=intensityFile) |
|
102 |
+ }else{ |
|
103 |
+ message("Loading ", intensityFile, ".") |
|
104 |
+ obj <- load(intensityFile) |
|
105 |
+ message("Done.") |
|
106 |
+ if (obj != "res") |
|
107 |
+ stop("Object in ", intensityFile, " seems to be invalid.") |
|
108 |
+ } |
|
109 |
+ if(row.names) row.names=res$gns else row.names=NULL |
|
110 |
+ if(col.names) col.names=res$sns else col.names=NULL |
|
111 |
+ res2 <- crlmmGTNormalNoN(res[["A"]], res[["B"]], res[["SNR"]], |
|
112 |
+ res[["mixtureParams"]], res[["cdfName"]], |
|
113 |
+ gender=gender, row.names=row.names, |
|
114 |
+ col.names=col.names) |
|
115 |
+ res2[["SNR"]] <- res[["SNR"]] |
|
116 |
+ return(res2) |
|
117 |
+} |
0 | 118 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,547 @@ |
1 |
+crlmmGT <- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL, |
|
2 |
+ col.names=NULL, probs=c(1/3, 1/3, 1/3), DF=6, |
|
3 |
+ SNRMin=5, recallMin=10, recallRegMin=1000, |
|
4 |
+ gender=NULL, desctrucitve=FALSE, verbose=TRUE, |
|
5 |
+ returnParams=FALSE, badSNP=.7){ |
|
6 |
+ |
|
7 |
+ keepIndex <- which(SNR>SNRMin) |
|
8 |
+ if(length(keepIndex)==0) stop("No arrays above quality threshold!") |
|
9 |
+ |
|
10 |
+ NC <- ncol(A) |
|
11 |
+ NR <- nrow(A) |
|
12 |
+ |
|
13 |
+ pkgname <- getCrlmmAnnotationName(cdfName) |
|
14 |
+ if(!require(pkgname, character.only=TRUE)){ |
|
15 |
+ suggCall <- paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="") |
|
16 |
+ msg <- paste("If", pkgname, "is installed on an alternative location, please load it manually by using", suggCall) |
|
17 |
+ message(strwrap(msg)) |
|
18 |
+ stop("Package ", pkgname, " could not be found.") |
|
19 |
+ rm(suggCall, msg) |
|
20 |
+ } |
|
21 |
+ |
|
22 |
+ if(verbose) message("Loading annotations.") |
|
23 |
+ data(genotypeStuff, mixtureStuff, package=pkgname, envir=.crlmmPkgEnv) |
|
24 |
+ |
|
25 |
+ ## this is toget rid of the 'no visible binding' notes |
|
26 |
+ ## variable definitions |
|
27 |
+ XIndex <- getVarInEnv("XIndex") |
|
28 |
+ autosomeIndex <- getVarInEnv("autosomeIndex") |
|
29 |
+ YIndex <- getVarInEnv("YIndex") |
|
30 |
+ SMEDIAN <- getVarInEnv("SMEDIAN") |
|
31 |
+ theKnots <- getVarInEnv("theKnots") |
|
32 |
+ regionInfo <- getVarInEnv("regionInfo") |
|
33 |
+ |
|
34 |
+ ##IF gender not provide, we predict |
|
35 |
+ if(is.null(gender)){ |
|
36 |
+ if(verbose) message("Determining gender.") |
|
37 |
+ XMedian <- apply(log2(A[XIndex,, drop=FALSE])+log2(B[XIndex,, drop=FALSE]), 2, median)/2 |
|
38 |
+ if(sum(SNR>SNRMin)==1){ |
|
39 |
+ gender <- which.min(c(abs(XMedian-8.9), abs(XMedian-9.5))) |
|
40 |
+ }else{ |
|
41 |
+ gender <- kmeans(XMedian, c(min(XMedian[SNR>SNRMin]), max(XMedian[SNR>SNRMin])))[["cluster"]] |
|
42 |
+ } |
|
43 |
+ } |
|
44 |
+ |
|
45 |
+ Indexes <- list(autosomeIndex, XIndex, YIndex) |
|
46 |
+ cIndexes <- list(keepIndex, |
|
47 |
+ keepIndex[which(gender[keepIndex]==2)], |
|
48 |
+ keepIndex[which(gender[keepIndex]==1)]) |
|
49 |
+ |
|
50 |
+ if(verbose) cat("Calling", NR, "SNPs for recalibration") |
|
51 |
+ |
|
52 |
+ ## call C |
|
53 |
+ fIndex <- which(gender==2) |
|
54 |
+ mIndex <- which(gender==1) |
|
55 |
+ newparams <- gtypeCallerR(A, B, fIndex, mIndex, |
|
56 |
+ params[["centers"]], params[["scales"]], params[["N"]], |
|
57 |
+ Indexes, cIndexes, |
|
58 |
+ sapply(Indexes, length), sapply(cIndexes, length), |
|
59 |
+ SMEDIAN, theKnots, |
|
60 |
+ mixtureParams, DF, probs, 0.025) |
|
61 |
+ gc(verbose=FALSE) |
|
62 |
+ names(newparams) <- c("centers", "scales", "N") |
|
63 |
+ |
|
64 |
+ if(verbose) message("Done.") |
|
65 |
+ if(verbose) message("Estimating recalibration parameters.") |
|
66 |
+ d <- newparams[["centers"]] - params$centers |
|
67 |
+ |
|
68 |
+ ##regression |
|
69 |
+ Index <- intersect(which(pmin(newparams[["N"]][, 1], |
|
70 |
+ newparams[["N"]][, 2], |
|
71 |
+ newparams[["N"]][, 3]) > recallMin & |
|
72 |
+ !apply(regionInfo, 1, any)), |
|
73 |
+ autosomeIndex) |
|
74 |
+ |
|
75 |
+ if(length(Index) < recallRegMin){ |
|
76 |
+ warning("Recallibration not possible.") |
|
77 |
+ newparams <- params |
|
78 |
+ dev <- vector("numeric", nrow(newparams[["centers"]])) |
|
79 |
+ SS <- matrix(Inf, 3, 3) |
|
80 |
+ DD <- 0 |
|
81 |
+ }else{ |
|
82 |
+ data4reg <- as.data.frame(newparams[["centers"]][Index,]) |
|
83 |
+ names(data4reg) <- c("AA", "AB", "BB") |
|
84 |
+ regParams <- cbind( coef(lm(AA~AB*BB, data=data4reg)), |
|
85 |
+ c(coef(lm(AB~AA+BB, data=data4reg)), 0), |
|
86 |
+ coef(lm(BB~AA*AB, data=data4reg))) |
|
87 |
+ rownames(regParams) <- c("intercept", "X", "Y", "XY") |
|
88 |
+ rm(data4reg) |
|
89 |
+ |
|
90 |
+ minN <- 3 |
|
91 |
+ newparams[["centers"]][newparams[["N"]] < minN] <- NA |
|
92 |
+ Index <- setdiff(which(rowSums(is.na(newparams[["centers"]]))==1), YIndex) |
|
93 |
+ if(verbose) cat("Filling out empty centers") |
|
94 |
+ for(i in Index){ |
|
95 |
+ if(verbose) if(i%%10000==0)cat(".") |
|
96 |
+ mu <- newparams[["centers"]][i, ] |
|
97 |
+ j <- which(is.na(mu)) |
|
98 |
+ newparams[["centers"]][i, j] <- c(1, mu[-j], prod(mu[-j]))%*%regParams[, j] |
|
99 |
+ } |
|
100 |
+ |
|
101 |
+ ##remaing NAs are made like originals |
|
102 |
+ if(length(YIndex)>0){ |
|
103 |
+ noMoveIndex <- union(setdiff(which(rowSums(is.na(newparams[["centers"]]))>0), YIndex), |
|
104 |
+ YIndex[rowSums(is.na(newparams[["centers"]][YIndex, ])>1)]) |
|
105 |
+ } |
|
106 |
+ newparams[["centers"]][is.na(newparams[["centers"]])] <- params[["centers"]][is.na(newparams[["centers"]])] |
|
107 |
+ if(verbose) cat("\n") |
|
108 |
+ |
|
109 |
+ if(verbose) message("Calculating and standardizing size of shift.") |
|
110 |
+ DD <- newparams[["centers"]] - params[["centers"]] |
|
111 |
+ DD <- sweep(DD, 2, colMeans(DD[autosomeIndex, ])) |
|
112 |
+ SS <- cov(DD[autosomeIndex, ]) |
|
113 |
+ SSI <- solve(SS) |
|
114 |
+ dev <- vector("numeric", nrow(DD)) |
|
115 |
+ if(length(YIndex)){ |
|
116 |
+ dev[-YIndex] <- apply(DD[-YIndex, ], 1, function(x) x%*%SSI%*%x) |
|
117 |
+ dev[-YIndex] <- 1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev[-YIndex]) |
|
118 |
+ ##Now Y (only two params) |
|
119 |
+ SSY <- SS[c(1, 3), c(1, 3)] |
|
120 |
+ SSI <- solve(SSY) |
|
121 |
+ dev[YIndex] <- apply(DD[YIndex, c(1, 3)], 1, function(x) x%*%SSI%*%x) |
|
122 |
+ dev[YIndex] <- 1/sqrt( (2*pi)^2*det(SSY))*exp(-0.5*dev[YIndex]) |
|
123 |
+ } else { |
|
124 |
+ dev=apply(DD,1,function(x) x%*%SSI%*%x) |
|
125 |
+ dev=1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev) |
|
126 |
+ } |
|
127 |
+ } |
|
128 |
+ |
|
129 |
+ ## BC: must keep SD |
|
130 |
+ params[-2] <- newparams[-2] |
|
131 |
+ |
|
132 |
+ rm(newparams);gc(verbose=FALSE) |
|
133 |
+ if(verbose) cat("Calling", NR, "SNPs") |
|
134 |
+ ## ################### |
|
135 |
+ ## ## MOVE TO C####### |
|
136 |
+ ImNull <- gtypeCallerR2(A, B, fIndex, mIndex, params[["centers"]], |
|
137 |
+ params[["scales"]], params[["N"]], Indexes, |
|
138 |
+ cIndexes, sapply(Indexes, length), |
|
139 |
+ sapply(cIndexes, length), SMEDIAN, theKnots, |
|
140 |
+ mixtureParams, DF, probs, 0.025, |
|
141 |
+ which(regionInfo[,2]), |
|
142 |
+ which(regionInfo[,1])) |
|
143 |
+ gc(verbose=FALSE) |
|
144 |
+ ## END MOVE TO C####### |
|
145 |
+ ## ################## |
|
146 |
+ |
|
147 |
+ dev <- dev/(dev+1/383) |
|
148 |
+ if(!is.null(row.names)){ rownames(A) <- rownames(B) <- names(dev) <- row.names} |
|
149 |
+ if(!is.null(col.names)){ colnames(A) <- colnames(B) <- col.names} |
|
150 |
+ |
|
151 |
+ if(length(Index) >= recallRegMin){ |
|
152 |
+ tmp4batchQC <- DD[autosomeIndex,]/(params[["N"]][autosomeIndex,]+1) |
|
153 |
+ tmpSnpQc <- dev[autosomeIndex] |
|
154 |
+ SS <- cov(tmp4batchQC[tmpSnpQc < badSNP,]) |
|
155 |
+ batchQC <- mean(diag(SS)) |
|
156 |
+ }else{ |
|
157 |
+ batchQC <- Inf |
|
158 |
+ } |
|
159 |
+ |
|
160 |
+ if(verbose) message("Done.") |
|
161 |
+ if (returnParams){ |
|
162 |
+ return(list(calls=A, confs=B, SNPQC=dev, batchQC=batchQC, params=params, DD=DD)) |
|
163 |
+ }else{ |
|
164 |
+ return(list(calls=A, confs=B, SNPQC=dev, batchQC=batchQC, DD=DD)) |
|
165 |
+ } |
|
166 |
+} |
|
167 |
+ |
|
168 |
+ |
|
169 |
+gtypeCallerR <- function(A, B, fIndex, mIndex, theCenters, theScales, |
|
170 |
+ theNs, Indexes, cIndexes, nIndexes, |
|
171 |
+ ncIndexes, SMEDIAN, knots, params, dft, |
|
172 |
+ probs, trim){ |
|
173 |
+ |
|
174 |
+ stopifnot(!missing(A), !missing(B), dim(A)==dim(B), |
|
175 |
+ nrow(A)==nrow(theCenters), nrow(A)==nrow(theScales), |
|
176 |
+ nrow(A) == nrow(theNs), length(knots)==3, nrow(params)==4, |
|
177 |
+ ncol(params)==ncol(A), length(trim)==1, length(probs)==3) |
|
178 |
+ |
|
179 |
+ ## make code robust |
|
180 |
+ ## check types before passing to C |
|
181 |
+ |
|
182 |
+ .Call("gtypeCallerPart1", A, B, |
|
183 |
+ as.integer(fIndex), as.integer(mIndex), |
|
184 |
+ as.numeric(theCenters), as.numeric(theScales), |
|
185 |
+ as.integer(theNs), lapply(Indexes, as.integer), lapply(cIndexes, as.integer), as.integer(nIndexes), as.integer(ncIndexes), |
|
186 |
+ as.numeric(SMEDIAN), as.numeric(knots), as.numeric(params), |
|
187 |
+ as.integer(dft), as.numeric(probs), as.numeric(trim), |
|
188 |
+ PACKAGE="crlmm") |
|
189 |
+ |
|
190 |
+} |
|
191 |
+ |
|
192 |
+gtypeCallerR2 <- function(A, B, fIndex, mIndex, theCenters, theScales, |
|
193 |
+ theNs, Indexes, cIndexes, nIndexes, |
|
194 |
+ ncIndexes, SMEDIAN, knots, params, dft, |
|
195 |
+ probs, trim, noTraining, noInfo){ |
|
196 |
+ |
|
197 |
+ stopifnot(!missing(A), !missing(B), dim(A)==dim(B), |
|
198 |
+ nrow(A)==nrow(theCenters), nrow(A)==nrow(theScales), |
|
199 |
+ nrow(A) == nrow(theNs), length(knots)==3, nrow(params)==4, |
|
200 |
+ ncol(params)==ncol(A), length(trim)==1, length(probs)==3) |
|
201 |
+ |
|
202 |
+ .Call("gtypeCallerPart2", A, B, |
|
203 |
+ as.integer(fIndex), as.integer(mIndex), |
|
204 |
+ as.numeric(theCenters), as.numeric(theScales), |
|
205 |
+ as.integer(theNs), Indexes, cIndexes, nIndexes, ncIndexes, |
|
206 |
+ as.numeric(SMEDIAN), as.numeric(knots), as.numeric(params), |
|
207 |
+ as.integer(dft), as.numeric(probs), as.numeric(trim), |
|
208 |
+ as.integer(noTraining), as.integer(noInfo), PACKAGE="crlmm") |
|
209 |
+ |
|
210 |
+} |
|
211 |
+ |
|
212 |
+ |
|
213 |
+ |
|
214 |
+ |
|
215 |
+ |
|
216 |
+################## |
|
217 |
+################## |
|
218 |
+### THIS IS TEMPORARY NOT OFFICIALLY USED |
|
219 |
+################## |
|
220 |
+#################### |
|
221 |
+crlmmGTTNoN <- function(A, B, SNR, mixtureParams, cdfName, |
|
222 |
+ row.names=NULL, col.names=NULL, probs=c(1/3, |
|
223 |
+ 1/3, 1/3), DF=6, SNRMin=6, gender=NULL, |
|
224 |
+ desctrucitve=FALSE, verbose=TRUE){ |
|
225 |
+ keepIndex <- which(SNR>SNRMin) |
|
226 |
+ if(length(keepIndex)==0) stop("No arrays above quality threshold!") |
|
227 |
+ |
|
228 |
+ NC <- ncol(A) |
|
229 |
+ NR <- nrow(A) |
|
230 |
+ |
|
231 |
+ if(verbose) message("Loading annotations.") |
|
232 |
+ data(genotypeStuff, mixtureStuff, package=pkgname, envir=.crlmmPkgEnv) |
|
233 |
+ |
|
234 |
+ ## this is toget rid of the 'no visible binding' notes |
|
235 |
+ ## variable definitions |
|
236 |
+ XIndex <- getVarInEnv("XIndex") |
|
237 |
+ autosomeIndex <- getVarInEnv("autosomeIndex") |
|
238 |
+ YIndex <- getVarInEnv("YIndex") |
|
239 |
+ SMEDIAN <- getVarInEnv("SMEDIAN") |
|
240 |
+ theKnots <- getVarInEnv("theKnots") |
|
241 |
+ regionInfo <- getVarInEnv("regionInfo") |
|
242 |
+ |
|
243 |
+ ##IF gender not provide, we predict |
|
244 |
+ if(is.null(gender)){ |
|
245 |
+ if(verbose) message("Determining gender.") |
|
246 |
+ XMedian <- apply(log2(A[XIndex,, drop=FALSE])+log2(B[XIndex,, drop=FALSE]), 2, median)/2 |
|
247 |
+ if(sum(SNR>SNRMin)==1) gender <- which.min(c(abs(XMedian-8.9), abs(XMedian-9.5))) else gender <- kmeans(XMedian, c(min(XMedian[SNR>SNRMin]), max(XMedian[SNR>SNRMin])))[["cluster"]] |
|
248 |
+ } |
|
249 |
+ |
|
250 |
+ Indexes <- list(autosomeIndex, XIndex, YIndex) |
|
251 |
+ cIndexes <- list(keepIndex, |
|
252 |
+ keepIndex[which(gender[keepIndex]==2)], |
|
253 |
+ keepIndex[which(gender[keepIndex]==1)]) |
|
254 |
+ |
|
255 |
+ if(verbose) cat("Calling", NR, "SNPs for recalibration") |
|
256 |
+ |
|
257 |
+ ## call C |
|
258 |
+ fIndex <- which(gender==2) |
|
259 |
+ mIndex <- which(gender==1) |
|
260 |
+ t0 <- proc.time() |
|
261 |
+ newparams <- gtypeCallerRTNoN(A, B, fIndex, mIndex, |
|
262 |
+ params[["centers"]], params[["scales"]], params[["N"]], |
|
263 |
+ Indexes, cIndexes, |
|
264 |
+ sapply(Indexes, length), sapply(cIndexes, length), |
|
265 |
+ SMEDIAN, theKnots, |
|
266 |
+ mixtureParams, DF, probs, 0.025) |
|
267 |
+ t0 <- proc.time()-t0 |
|
268 |
+ message("Part 1 took ", t0[3], " seconds.") |
|
269 |
+ names(newparams) <- c("centers", "scales", "N") |
|
270 |
+ |
|
271 |
+ if(verbose) message("Done.") |
|
272 |
+ if(verbose) message("Estimating recalibration parameters.") |
|
273 |
+ d <- newparams[["centers"]] - params$centers |
|
274 |
+ |
|
275 |
+ ##regression |
|
276 |
+ MIN <- 10 |
|
277 |
+ Index <- intersect(which(pmin(newparams[["N"]][, 1], newparams[["N"]][, 2], newparams[["N"]][, 3])>MIN & !apply(regionInfo, 1, any)), autosomeIndex) |
|
278 |
+ data4reg <- as.data.frame(newparams[["centers"]][Index,]) |
|
279 |
+ names(data4reg) <- c("AA", "AB", "BB") |
|
280 |
+ regParams <- cbind( coef(lm(AA~AB*BB, data=data4reg)), |
|
281 |
+ c(coef(lm(AB~AA+BB, data=data4reg)), 0), |
|
282 |
+ coef(lm(BB~AA*AB, data=data4reg))) |
|
283 |
+ rownames(regParams) <- c("intercept", "X", "Y", "XY") |
|
284 |
+ rm(data4reg) |
|
285 |
+ |
|
286 |
+ minN <- 3 |
|
287 |
+ newparams[["centers"]][newparams[["N"]]<minN] <- NA |
|
288 |
+ Index <- setdiff(which(rowSums(is.na(newparams[["centers"]]))==1), YIndex) |
|
289 |
+ if(verbose) cat("Filling out empty centers") |
|
290 |
+ for(i in Index){ |
|
291 |
+ if(verbose) if(i%%10000==0)cat(".") |
|
292 |
+ mu <- newparams[["centers"]][i, ] |
|
293 |
+ j <- which(is.na(mu)) |
|
294 |
+ newparams[["centers"]][i, j] <- c(1, mu[-j], prod(mu[-j]))%*%regParams[, j] |
|
295 |
+ } |
|
296 |
+ ##remaing NAs are made like originals |
|
297 |
+ if(length(YIndex)>0){ |
|
298 |
+ noMoveIndex <- union(setdiff(which(rowSums(is.na(newparams[["centers"]]))>0), YIndex), |
|
299 |
+ YIndex[rowSums(is.na(newparams[["centers"]][YIndex, ])>1)]) |
|
300 |
+ } |
|
301 |
+ newparams[["centers"]][is.na(newparams[["centers"]])] <- params[["centers"]][is.na(newparams[["centers"]])] |
|
302 |
+ if(verbose) cat("\n") |
|
303 |
+ |
|
304 |
+ if(verbose) message("Calculating and standardizing size of shift.") |
|
305 |
+ DD <- newparams[["centers"]] - params[["centers"]] |
|
306 |
+ MM <- colMeans(DD[autosomeIndex, ]) |
|
307 |
+ DD <- sweep(DD, 2, MM) |
|
308 |
+ SS <- cov(DD[autosomeIndex, ]) |
|
309 |
+ SSI <- solve(SS) |
|
310 |
+ dev <- vector("numeric", nrow(DD)) |
|
311 |
+ if(length(YIndex)){ |
|
312 |
+ dev[-YIndex] <- apply(DD[-YIndex, ], 1, function(x) x%*%SSI%*%x) |
|
313 |
+ dev[-YIndex] <- 1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev[-YIndex]) |
|
314 |
+ ##Now Y (only two params) |
|
315 |
+ SSY <- SS[c(1, 3), c(1, 3)] |
|
316 |
+ SSI <- solve(SSY) |
|
317 |
+ dev[YIndex] <- apply(DD[YIndex, c(1, 3)], 1, function(x) x%*%SSI%*%x) |
|
318 |
+ dev[YIndex] <- 1/sqrt( (2*pi)^2*det(SSY))*exp(-0.5*dev[YIndex]) |
|
319 |
+ } else { |
|
320 |
+ dev=apply(DD,1,function(x) x%*%SSI%*%x) |
|
321 |
+ dev=1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev) |
|
322 |
+ } |
|
323 |
+ |
|
324 |
+ ## BC: must keep SD |
|
325 |
+ params[-2] <- newparams[-2] |
|
326 |
+ rm(newparams);gc(verbose=FALSE) |
|
327 |
+ if(verbose) cat("Calling", NR, "SNPs") |
|
328 |
+ ## ################### |
|
329 |
+ ## ## MOVE TO C####### |
|
330 |
+ t0 <- proc.time() |
|
331 |
+ ImNull <- gtypeCallerR2TNoN(A, B, fIndex, mIndex, params[["centers"]], |
|
332 |
+ params[["scales"]], params[["N"]], Indexes, |
|
333 |
+ cIndexes, sapply(Indexes, length), |
|
334 |
+ sapply(cIndexes, length), SMEDIAN, theKnots, |
|
335 |
+ mixtureParams, DF, probs, 0.025, |
|
336 |
+ which(regionInfo[,2]), |
|
337 |
+ which(regionInfo[,1])) |
|
338 |
+ t0 <- proc.time()-t0 |
|
339 |
+ message("Part 2 took ", t0[3], " seconds.") |
|
340 |
+ ## END MOVE TO C####### |
|
341 |
+ ## ################## |
|
342 |
+ |
|
343 |
+ dev <- dev/(dev+1/383) |
|
344 |
+ if(!is.null(row.names)){ rownames(A) <- rownames(B) <- names(dev) <- row.names} |
|
345 |
+ if(!is.null(col.names)){ colnames(A) <- colnames(B) <- col.names} |
|
346 |
+ |
|
347 |
+ if(verbose) message("Done.") |
|
348 |
+ return(list(calls=A, confs=B, SNPQC=dev, batchQC=mean(diag(SS)))) |
|
349 |
+} |
|
350 |
+ |
|
351 |
+ |
|
352 |
+gtypeCallerRTNoN <- function(A, B, fIndex, mIndex, theCenters, theScales, |
|
353 |
+ theNs, Indexes, cIndexes, nIndexes, |
|
354 |
+ ncIndexes, SMEDIAN, knots, params, dft, |
|
355 |
+ probs, trim){ |
|
356 |
+ |
|
357 |
+ stopifnot(!missing(A), !missing(B), dim(A)==dim(B), |
|
358 |
+ nrow(A)==nrow(theCenters), nrow(A)==nrow(theScales), |
|
359 |
+ nrow(A) == nrow(theNs), length(knots)==3, nrow(params)==4, |
|
360 |
+ ncol(params)==ncol(A), length(trim)==1, length(probs)==3) |
|
361 |
+ |
|
362 |
+ .Call("gtypeCallerPart1TNoN", A, B, fIndex, mIndex, theCenters, |
|
363 |
+ theScales, theNs, Indexes, cIndexes, nIndexes, ncIndexes, |
|
364 |
+ SMEDIAN, knots, params, dft, probs, trim, PACKAGE="crlmm") |
|
365 |
+ |
|
366 |
+} |
|
367 |
+ |
|
368 |
+gtypeCallerR2TNoN <- function(A, B, fIndex, mIndex, theCenters, theScales, |
|
369 |
+ theNs, Indexes, cIndexes, nIndexes, |
|
370 |
+ ncIndexes, SMEDIAN, knots, params, dft, |
|
371 |
+ probs, trim, noTraining, noInfo){ |
|
372 |
+ |
|
373 |
+ stopifnot(!missing(A), !missing(B), dim(A)==dim(B), |
|
374 |
+ nrow(A)==nrow(theCenters), nrow(A)==nrow(theScales), |
|
375 |
+ nrow(A) == nrow(theNs), length(knots)==3, nrow(params)==4, |
|
376 |
+ ncol(params)==ncol(A), length(trim)==1, length(probs)==3) |
|
377 |
+ |
|
378 |
+ .Call("gtypeCallerPart2TNoN", A, B, fIndex, mIndex, theCenters, |
|
379 |
+ theScales, theNs, Indexes, cIndexes, nIndexes, ncIndexes, |
|
380 |
+ SMEDIAN, knots, params, dft, probs, trim, noTraining, noInfo, PACKAGE="crlmm") |
|
381 |
+ |
|
382 |
+} |
|
383 |
+ |
|
384 |
+crlmmGTNormalNoN <- function(A, B, SNR, mixtureParams, cdfName, |
|
385 |
+ row.names=NULL, col.names=NULL, probs=c(1/3, |
|
386 |
+ 1/3, 1/3), DF=6, SNRMin=6, gender=NULL, |
|
387 |
+ desctrucitve=FALSE, verbose=TRUE){ |
|
388 |
+ keepIndex <- which(SNR>SNRMin) |
|
389 |
+ if(length(keepIndex)==0) stop("No arrays above quality threshold!") |
|
390 |
+ |
|
391 |
+ NC <- ncol(A) |
|
392 |
+ NR <- nrow(A) |
|
393 |
+ |
|
394 |
+ if(verbose) message("Loading annotations.") |
|
395 |
+ data(genotypeStuff, mixtureStuff, package=pkgname, envir=.crlmmPkgEnv) |
|
396 |
+ |
|
397 |
+ ## this is toget rid of the 'no visible binding' notes |
|
398 |
+ ## variable definitions |
|
399 |
+ XIndex <- getVarInEnv("XIndex") |
|
400 |
+ autosomeIndex <- getVarInEnv("autosomeIndex") |
|
401 |
+ YIndex <- getVarInEnv("YIndex") |
|
402 |
+ SMEDIAN <- getVarInEnv("SMEDIAN") |
|
403 |
+ theKnots <- getVarInEnv("theKnots") |
|
404 |
+ regionInfo <- getVarInEnv("regionInfo") |
|
405 |
+ |
|
406 |
+ ##IF gender not provide, we predict |
|
407 |
+ if(is.null(gender)){ |
|
408 |
+ if(verbose) message("Determining gender.") |
|
409 |
+ XMedian <- apply(log2(A[XIndex,, drop=FALSE])+log2(B[XIndex,, drop=FALSE]), 2, median)/2 |
|
410 |
+ if(sum(SNR>SNRMin)==1) gender <- which.min(c(abs(XMedian-8.9), abs(XMedian-9.5))) else gender <- kmeans(XMedian, c(min(XMedian[SNR>SNRMin]), max(XMedian[SNR>SNRMin])))[["cluster"]] |
|
411 |
+ } |
|
412 |
+ |
|
413 |
+ Indexes <- list(autosomeIndex, XIndex, YIndex) |
|
414 |
+ cIndexes <- list(keepIndex, |
|
415 |
+ keepIndex[which(gender[keepIndex]==2)], |
|
416 |
+ keepIndex[which(gender[keepIndex]==1)]) |
|
417 |
+ |
|
418 |
+ if(verbose) cat("Calling", NR, "SNPs for recalibration") |
|
419 |
+ |
|
420 |
+ ## call C |
|
421 |
+ fIndex <- which(gender==2) |
|
422 |
+ mIndex <- which(gender==1) |
|
423 |
+ t0 <- proc.time() |
|
424 |
+ newparams <- gtypeCallerRNormalNoN(A, B, fIndex, mIndex, |
|
425 |
+ params[["centers"]], params[["scales"]], params[["N"]], |
|
426 |
+ Indexes, cIndexes, |
|
427 |
+ sapply(Indexes, length), sapply(cIndexes, length), |
|
428 |
+ SMEDIAN, theKnots, |
|
429 |
+ mixtureParams, DF, probs, 0.025) |
|
430 |
+ t0 <- proc.time()-t0 |
|
431 |
+ message("Part 1 took ", t0[3], " seconds.") |
|
432 |
+ names(newparams) <- c("centers", "scales", "N") |
|
433 |
+ |
|
434 |
+ if(verbose) message("Done.") |
|
435 |
+ if(verbose) message("Estimating recalibration parameters.") |
|
436 |
+ d <- newparams[["centers"]] - params$centers |
|
437 |
+ |
|
438 |
+ ##regression |
|
439 |
+ MIN <- 10 |
|
440 |
+ Index <- intersect(which(pmin(newparams[["N"]][, 1], newparams[["N"]][, 2], newparams[["N"]][, 3])>MIN & !apply(regionInfo, 1, any)), autosomeIndex) |
|
441 |
+ data4reg <- as.data.frame(newparams[["centers"]][Index,]) |
|
442 |
+ names(data4reg) <- c("AA", "AB", "BB") |
|
443 |
+ regParams <- cbind( coef(lm(AA~AB*BB, data=data4reg)), |
|
444 |
+ c(coef(lm(AB~AA+BB, data=data4reg)), 0), |
|
445 |
+ coef(lm(BB~AA*AB, data=data4reg))) |
|
446 |
+ rownames(regParams) <- c("intercept", "X", "Y", "XY") |
|
447 |
+ rm(data4reg) |
|
448 |
+ |
|
449 |
+ minN <- 3 |
|
450 |
+ newparams[["centers"]][newparams[["N"]]<minN] <- NA |
|
451 |
+ Index <- setdiff(which(rowSums(is.na(newparams[["centers"]]))==1), YIndex) |
|
452 |
+ if(verbose) cat("Filling out empty centers") |
|
453 |
+ for(i in Index){ |
|
454 |
+ if(verbose) if(i%%10000==0)cat(".") |
|
455 |
+ mu <- newparams[["centers"]][i, ] |
|
456 |
+ j <- which(is.na(mu)) |
|
457 |
+ newparams[["centers"]][i, j] <- c(1, mu[-j], prod(mu[-j]))%*%regParams[, j] |
|
458 |
+ } |
|
459 |
+ ##remaing NAs are made like originals |
|
460 |
+ if(length(YIndex)>0){ |
|
461 |
+ noMoveIndex <- union(setdiff(which(rowSums(is.na(newparams[["centers"]]))>0), YIndex), |
|
462 |
+ YIndex[rowSums(is.na(newparams[["centers"]][YIndex, ])>1)]) |
|
463 |
+ } |
|
464 |
+ newparams[["centers"]][is.na(newparams[["centers"]])] <- params[["centers"]][is.na(newparams[["centers"]])] |
|
465 |
+ if(verbose) cat("\n") |
|
466 |
+ |
|
467 |
+ if(verbose) message("Calculating and standardizing size of shift.") |
|
468 |
+ DD <- newparams[["centers"]] - params[["centers"]] |
|
469 |
+ MM <- colMeans(DD[autosomeIndex, ]) |
|
470 |
+ DD <- sweep(DD, 2, MM) |
|
471 |
+ SS <- cov(DD[autosomeIndex, ]) |
|
472 |
+ SSI <- solve(SS) |
|
473 |
+ dev <- vector("numeric", nrow(DD)) |
|
474 |
+ if(length(YIndex)){ |
|
475 |
+ dev[-YIndex] <- apply(DD[-YIndex, ], 1, function(x) x%*%SSI%*%x) |
|
476 |
+ dev[-YIndex] <- 1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev[-YIndex]) |
|
477 |
+ ##Now Y (only two params) |
|
478 |
+ SSY <- SS[c(1, 3), c(1, 3)] |
|
479 |
+ SSI <- solve(SSY) |
|
480 |
+ dev[YIndex] <- apply(DD[YIndex, c(1, 3)], 1, function(x) x%*%SSI%*%x) |
|
481 |
+ dev[YIndex] <- 1/sqrt( (2*pi)^2*det(SSY))*exp(-0.5*dev[YIndex]) |
|
482 |
+ } else { |
|
483 |
+ dev=apply(DD,1,function(x) x%*%SSI%*%x) |
|
484 |
+ dev=1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev) |
|
485 |
+ } |
|
486 |
+ |
|
487 |
+ ## BC: must keep SD |
|
488 |
+ params[-2] <- newparams[-2] |
|
489 |
+ rm(newparams);gc(verbose=FALSE) |
|
490 |
+ if(verbose) cat("Calling", NR, "SNPs") |
|
491 |
+ ## ################### |
|
492 |
+ ## ## MOVE TO C####### |
|
493 |
+ t0 <- proc.time() |
|
494 |
+ ImNull <- gtypeCallerR2NormalNoN(A, B, fIndex, mIndex, params[["centers"]], |
|
495 |
+ params[["scales"]], params[["N"]], Indexes, |
|
496 |
+ cIndexes, sapply(Indexes, length), |
|
497 |
+ sapply(cIndexes, length), SMEDIAN, theKnots, |
|
498 |
+ mixtureParams, DF, probs, 0.025, |
|
499 |
+ which(regionInfo[,2]), |
|
500 |
+ which(regionInfo[,1])) |
|
501 |
+ t0 <- proc.time()-t0 |
|
502 |
+ message("Part 2 took ", t0[3], " seconds.") |
|
503 |
+ ## END MOVE TO C####### |
|
504 |
+ ## ################## |
|
505 |
+ |
|
506 |
+ dev <- dev/(dev+1/383) |
|
507 |
+ if(!is.null(row.names)){ rownames(A) <- rownames(B) <- names(dev) <- row.names} |
|
508 |
+ if(!is.null(col.names)){ colnames(A) <- colnames(B) <- col.names} |
|
509 |
+ |
|
510 |
+ if(verbose) message("Done.") |
|
511 |
+ return(list(calls=A, confs=B, SNPQC=dev, batchQC=mean(diag(SS)))) |
|
512 |
+} |
|
513 |
+ |
|
514 |
+ |
|
515 |
+gtypeCallerRNormalNoN <- function(A, B, fIndex, mIndex, theCenters, theScales, |
|
516 |
+ theNs, Indexes, cIndexes, nIndexes, |
|
517 |
+ ncIndexes, SMEDIAN, knots, params, dft, |
|
518 |
+ probs, trim){ |
|
519 |
+ |
|
520 |
+ stopifnot(!missing(A), !missing(B), dim(A)==dim(B), |
|
521 |
+ nrow(A)==nrow(theCenters), nrow(A)==nrow(theScales), |
|
522 |
+ nrow(A) == nrow(theNs), length(knots)==3, nrow(params)==4, |
|
523 |
+ ncol(params)==ncol(A), length(trim)==1, length(probs)==3) |
|
524 |
+ |
|
525 |
+ .Call("gtypeCallerPart1NormalNoN", A, B, fIndex, mIndex, theCenters, |
|
526 |
+ theScales, theNs, Indexes, cIndexes, nIndexes, ncIndexes, |
|
527 |
+ SMEDIAN, knots, params, dft, probs, trim, PACKAGE="crlmm") |
|
528 |
+ |
|
529 |
+} |
|
530 |
+ |
|
531 |
+gtypeCallerR2NormalNoN <- function(A, B, fIndex, mIndex, theCenters, theScales, |
|
532 |
+ theNs, Indexes, cIndexes, nIndexes, |
|
533 |
+ ncIndexes, SMEDIAN, knots, params, dft, |
|
534 |
+ probs, trim, noTraining, noInfo){ |
|
535 |
+ |
|
536 |
+ stopifnot(!missing(A), !missing(B), dim(A)==dim(B), |
|
537 |
+ nrow(A)==nrow(theCenters), nrow(A)==nrow(theScales), |
|
538 |
+ nrow(A) == nrow(theNs), length(knots)==3, nrow(params)==4, |
|
539 |
+ ncol(params)==ncol(A), length(trim)==1, length(probs)==3) |
|
540 |
+ |
|
541 |
+ .Call("gtypeCallerPart2NormalNoN", A, B, fIndex, mIndex, theCenters, |
|
542 |
+ theScales, theNs, Indexes, cIndexes, nIndexes, ncIndexes, |
|
543 |
+ SMEDIAN, knots, params, dft, probs, trim, noTraining, noInfo, PACKAGE="crlmm") |
|
544 |
+ |
|
545 |
+} |
|
546 |
+ |
|
547 |
+ |
0 | 548 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,207 @@ |
1 |
+crlmmGTnm <- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL, |
|
2 |
+ col.names=NULL, probs=c(1/3, 1/3, 1/3), DF=6, |
|
3 |
+ SNRMin=5, recallMin=10, recallRegMin=1000, |
|
4 |
+ gender=NULL, desctrucitve=FALSE, verbose=TRUE, |
|
5 |
+ returnParams=FALSE){ |
|
6 |
+ |
|
7 |
+ keepIndex <- which(SNR>SNRMin) |
|
8 |
+ if(length(keepIndex)==0) stop("No arrays above quality threshold!") |
|
9 |
+ |
|
10 |
+ NC <- ncol(A) |
|
11 |
+ NR <- nrow(A) |
|
12 |
+ |
|
13 |
+ pkgname <- getCrlmmAnnotationName(cdfName) |
|
14 |
+ if(!require(pkgname, character.only=TRUE)){ |
|
15 |
+ suggCall <- paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="") |
|
16 |
+ msg <- paste("If", pkgname, "is installed on an alternative location, please load it manually by using", suggCall) |
|
17 |
+ message(strwrap(msg)) |
|
18 |
+ stop("Package ", pkgname, " could not be found.") |
|
19 |
+ rm(suggCall, msg) |
|
20 |
+ } |
|
21 |
+ |
|
22 |
+ if(verbose) message("Loading annotations.") |
|
23 |
+ data(genotypeStuff, mixtureStuff, package=pkgname, envir=.crlmmPkgEnv) |
|
24 |
+ |
|
25 |
+ ## this is toget rid of the 'no visible binding' notes |
|
26 |
+ ## variable definitions |
|
27 |
+ XIndex <- getVarInEnv("XIndex") |
|
28 |
+ autosomeIndex <- getVarInEnv("autosomeIndex") |
|
29 |
+ YIndex <- getVarInEnv("YIndex") |
|
30 |
+ SMEDIAN <- getVarInEnv("SMEDIAN") |
|
31 |
+ theKnots <- getVarInEnv("theKnots") |
|
32 |
+ regionInfo <- getVarInEnv("regionInfo") |
|
33 |
+ |
|
34 |
+ ##IF gender not provide, we predict |
|
35 |
+ if(is.null(gender)){ |
|
36 |
+ if(verbose) message("Determining gender.") |
|
37 |
+ XMedian <- apply(log2(A[XIndex,, drop=FALSE])+log2(B[XIndex,, drop=FALSE]), 2, median)/2 |
|
38 |
+ if(sum(SNR>SNRMin)==1){ |
|
39 |
+ gender <- which.min(c(abs(XMedian-8.9), abs(XMedian-9.5))) |
|
40 |
+ }else{ |
|
41 |
+ gender <- kmeans(XMedian, c(min(XMedian[SNR>SNRMin]), max(XMedian[SNR>SNRMin])))[["cluster"]] |
|
42 |
+ } |
|
43 |
+ } |
|
44 |
+ |
|
45 |
+ Indexes <- list(autosomeIndex, XIndex, YIndex) |
|
46 |
+ cIndexes <- list(keepIndex, |
|
47 |
+ keepIndex[which(gender[keepIndex]==2)], |
|
48 |
+ keepIndex[which(gender[keepIndex]==1)]) |
|
49 |
+ |
|
50 |
+ if(verbose) cat("Calling", NR, "SNPs for recalibration") |
|
51 |
+ |
|
52 |
+ ## call C |
|
53 |
+ fIndex <- which(gender==2) |
|
54 |
+ mIndex <- which(gender==1) |
|
55 |
+ t0 <- proc.time() |
|
56 |
+ newparams <- gtypeCallerRnm(A, B, fIndex, mIndex, |
|
57 |
+ params[["centers"]], params[["scales"]], params[["N"]], |
|
58 |
+ Indexes, cIndexes, |
|
59 |
+ sapply(Indexes, length), sapply(cIndexes, length), |
|
60 |
+ SMEDIAN, theKnots, |
|
61 |
+ mixtureParams, DF, probs, 0.025) |
|
62 |
+ t0 <- proc.time()-t0 |
|
63 |
+ message("Part 1 took ", t0[3], " seconds.") |
|
64 |
+ gc(verbose=FALSE) |
|
65 |
+ names(newparams) <- c("centers", "scales", "N") |
|
66 |
+ |
|
67 |
+ if(verbose) message("Done.") |
|
68 |
+ if(verbose) message("Estimating recalibration parameters.") |
|
69 |
+ d <- newparams[["centers"]] - params$centers |
|
70 |
+ |
|
71 |
+ ##regression |
|
72 |
+ Index <- intersect(which(pmin(newparams[["N"]][, 1], |
|
73 |
+ newparams[["N"]][, 2], |
|
74 |
+ newparams[["N"]][, 3]) > recallMin & |
|
75 |
+ !apply(regionInfo, 1, any)), |
|
76 |
+ autosomeIndex) |
|
77 |
+ |
|
78 |
+ if(length(Index) < recallRegMin){ |
|
79 |
+ warning("Recallibration not possible.") |
|
80 |
+ newparams <- params |
|
81 |
+ dev <- vector("numeric", nrow(newparams[["centers"]])) |
|
82 |
+ SS <- matrix(Inf, 3, 3) |
|
83 |
+ }else{ |
|
84 |
+ data4reg <- as.data.frame(newparams[["centers"]][Index,]) |
|
85 |
+ names(data4reg) <- c("AA", "AB", "BB") |
|
86 |
+ regParams <- cbind( coef(lm(AA~AB*BB, data=data4reg)), |
|
87 |
+ c(coef(lm(AB~AA+BB, data=data4reg)), 0), |
|
88 |
+ coef(lm(BB~AA*AB, data=data4reg))) |
|
89 |
+ rownames(regParams) <- c("intercept", "X", "Y", "XY") |
|
90 |
+ rm(data4reg) |
|
91 |
+ |
|
92 |
+ minN <- 3 |
|
93 |
+ newparams[["centers"]][newparams[["N"]] < minN] <- NA |
|
94 |
+ Index <- setdiff(which(rowSums(is.na(newparams[["centers"]]))==1), YIndex) |
|
95 |
+ if(verbose) cat("Filling out empty centers") |
|
96 |
+ for(i in Index){ |
|
97 |
+ if(verbose) if(i%%10000==0)cat(".") |
|
98 |
+ mu <- newparams[["centers"]][i, ] |
|
99 |
+ j <- which(is.na(mu)) |
|
100 |
+ newparams[["centers"]][i, j] <- c(1, mu[-j], prod(mu[-j]))%*%regParams[, j] |
|
101 |
+ } |
|
102 |
+ |
|
103 |
+ ##remaing NAs are made like originals |
|
104 |
+ if(length(YIndex)>0){ |
|
105 |
+ noMoveIndex <- union(setdiff(which(rowSums(is.na(newparams[["centers"]]))>0), YIndex), |
|
106 |
+ YIndex[rowSums(is.na(newparams[["centers"]][YIndex, ])>1)]) |
|
107 |
+ } |
|
108 |
+ newparams[["centers"]][is.na(newparams[["centers"]])] <- params[["centers"]][is.na(newparams[["centers"]])] |
|
109 |
+ if(verbose) cat("\n") |
|
110 |
+ |
|
111 |
+ if(verbose) message("Calculating and standardizing size of shift.") |
|
112 |
+ DD <- newparams[["centers"]] - params[["centers"]] |
|
113 |
+ DD <- sweep(DD, 2, colMeans(DD[autosomeIndex, ])) |
|
114 |
+ SS <- cov(DD[autosomeIndex, ]) |
|
115 |
+ SSI <- solve(SS) |
|
116 |
+ dev <- vector("numeric", nrow(DD)) |
|
117 |
+ if(length(YIndex)){ |
|
118 |
+ dev[-YIndex] <- apply(DD[-YIndex, ], 1, function(x) x%*%SSI%*%x) |
|
119 |
+ dev[-YIndex] <- 1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev[-YIndex]) |
|
120 |
+ ##Now Y (only two params) |
|
121 |
+ SSY <- SS[c(1, 3), c(1, 3)] |
|
122 |
+ SSI <- solve(SSY) |
|
123 |
+ dev[YIndex] <- apply(DD[YIndex, c(1, 3)], 1, function(x) x%*%SSI%*%x) |
|
124 |
+ dev[YIndex] <- 1/sqrt( (2*pi)^2*det(SSY))*exp(-0.5*dev[YIndex]) |
|
125 |
+ } else { |
|
126 |
+ dev=apply(DD,1,function(x) x%*%SSI%*%x) |
|
127 |
+ dev=1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev) |
|
128 |
+ } |
|
129 |
+ } |
|
130 |
+ |
|
131 |
+ ## BC: must keep SD |
|
132 |
+ params[-2] <- newparams[-2] |
|
133 |
+ |
|
134 |
+ rm(newparams);gc(verbose=FALSE) |
|
135 |
+ if(verbose) cat("Calling", NR, "SNPs") |
|
136 |
+ ## ################### |
|
137 |
+ ## ## MOVE TO C####### |
|
138 |
+ t0 <- proc.time() |
|
139 |
+ ImNull <- gtypeCallerR2nm(A, B, fIndex, mIndex, params[["centers"]], |
|
140 |
+ params[["scales"]], params[["N"]], Indexes, |
|
141 |
+ cIndexes, sapply(Indexes, length), |
|
142 |
+ sapply(cIndexes, length), SMEDIAN, theKnots, |
|
143 |
+ mixtureParams, DF, probs, 0.025, |
|
144 |
+ which(regionInfo[,2]), |
|
145 |
+ which(regionInfo[,1])) |
|
146 |
+ t0 <- proc.time()-t0 |
|
147 |
+ gc(verbose=FALSE) |
|
148 |
+ message("\n Part 2 took ", t0[3], " seconds.") |
|
149 |
+ ## END MOVE TO C####### |
|
150 |
+ ## ################## |
|
151 |
+ |
|
152 |
+ dev <- dev/(dev+1/383) |
|
153 |
+ if(!is.null(row.names)){ rownames(A) <- rownames(B) <- names(dev) <- row.names} |
|
154 |
+ if(!is.null(col.names)){ colnames(A) <- colnames(B) <- col.names} |
|
155 |
+ |
|
156 |
+ if(verbose) message("Done.") |
|
157 |
+ if (returnParams){ |
|
158 |
+ return(list(calls=A, confs=B, SNPQC=dev, batchQC=mean(diag(SS)), params=params)) |
|
159 |
+ }else{ |
|
160 |
+ return(list(calls=A, confs=B, SNPQC=dev, batchQC=mean(diag(SS)))) |
|
161 |
+ } |
|
162 |
+} |
|
163 |
+ |
|
164 |
+ |
|
165 |
+gtypeCallerRnm <- function(A, B, fIndex, mIndex, theCenters, theScales, |
|
166 |
+ theNs, Indexes, cIndexes, nIndexes, |
|
167 |
+ ncIndexes, SMEDIAN, knots, params, dft, |
|
168 |
+ probs, trim){ |
|
169 |
+ |
|
170 |
+ stopifnot(!missing(A), !missing(B), dim(A)==dim(B), |
|
171 |
+ nrow(A)==nrow(theCenters), nrow(A)==nrow(theScales), |
|
172 |
+ nrow(A) == nrow(theNs), length(knots)==3, nrow(params)==4, |
|
173 |
+ ncol(params)==ncol(A), length(trim)==1, length(probs)==3) |
|
174 |
+ |
|
175 |
+ ## make code robust |
|
176 |
+ ## check types before passing to C |
|
177 |
+ |
|
178 |
+ .Call("gtypeCallerPart1nm", A, B, |
|
179 |
+ as.integer(fIndex), as.integer(mIndex), |
|
180 |
+ as.numeric(theCenters), as.numeric(theScales), |
|
181 |
+ as.integer(theNs), lapply(Indexes, as.integer), lapply(cIndexes, as.integer), as.integer(nIndexes), as.integer(ncIndexes), |
|
182 |
+ as.numeric(SMEDIAN), as.numeric(knots), as.numeric(params), |
|
183 |
+ as.integer(dft), as.numeric(probs), as.numeric(trim), |
|
184 |
+ PACKAGE="crlmm") |
|
185 |
+ |
|
186 |
+} |
|
187 |
+ |
|
188 |
+gtypeCallerR2nm <- function(A, B, fIndex, mIndex, theCenters, theScales, |
|
189 |
+ theNs, Indexes, cIndexes, nIndexes, |
|
190 |
+ ncIndexes, SMEDIAN, knots, params, dft, |
|
191 |
+ probs, trim, noTraining, noInfo){ |
|
192 |
+ |
|
193 |
+ stopifnot(!missing(A), !missing(B), dim(A)==dim(B), |
|
194 |
+ nrow(A)==nrow(theCenters), nrow(A)==nrow(theScales), |
|
195 |
+ nrow(A) == nrow(theNs), length(knots)==3, nrow(params)==4, |
|
196 |
+ ncol(params)==ncol(A), length(trim)==1, length(probs)==3) |
|
197 |
+ |
|
198 |
+ .Call("gtypeCallerPart2nm", A, B, |
|
199 |
+ as.integer(fIndex), as.integer(mIndex), |
|
200 |
+ as.numeric(theCenters), as.numeric(theScales), |
|
201 |
+ as.integer(theNs), Indexes, cIndexes, nIndexes, ncIndexes, |
|
202 |
+ as.numeric(SMEDIAN), as.numeric(knots), as.numeric(params), |
|
203 |
+ as.integer(dft), as.numeric(probs), as.numeric(trim), |
|
204 |
+ as.integer(noTraining), as.integer(noInfo), PACKAGE="crlmm") |
|
205 |
+ |
|
206 |
+} |
|
207 |
+ |
0 | 208 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,116 @@ |
1 |
+crlmmNM <- function(filenames, row.names=TRUE, col.names=TRUE, |
|
2 |
+ probs=c(1/3, 1/3, 1/3), DF=6, SNRMin=5, gender=NULL, |
|
3 |
+ save.it=FALSE, load.it=FALSE, |
|
4 |
+ intensityFile="tmpcrlmmintensities.rda", |
|
5 |
+ desctrucitve=FALSE, mixtureSampleSize=10^5, eps=0.1, |
|
6 |
+ verbose=TRUE, cdfName, sns, recallMin=10, |
|
7 |
+ recallRegMin=1000, returnParams=FALSE){ |
|
8 |
+ |
|
9 |
+ if (missing(sns)) sns <- basename(filenames) |
|
10 |
+ if (load.it & !file.exists(intensityFile)){ |
|
11 |
+ load.it <- FALSE |
|
12 |
+ message("File ", intensityFile, " does not exist.") |
|
13 |
+ message("Not loading it, but running SNPRMA from scratch.") |
|
14 |
+ } |
|
15 |
+ if (!load.it){ |
|
16 |
+ res <- snprma(filenames, fitMixture=TRUE, |
|
17 |
+ mixtureSampleSize=mixtureSampleSize, verbose=verbose, |
|
18 |
+ eps=eps, cdfName=cdfName, sns=sns) |
|
19 |
+ if(save.it){ |
|
20 |
+ t0 <- proc.time() |
|
21 |
+ save(res, file=intensityFile) |
|
22 |
+ t0 <- proc.time()-t0 |
|
23 |
+ message("Used ", t0[3], " seconds to save ", intensityFile, ".") |
|
24 |
+ } |
|
25 |
+ }else{ |
|
26 |
+ message("Loading ", intensityFile, ".") |
|
27 |
+ obj <- load(intensityFile) |
|
28 |
+ message("Done.") |
|
29 |
+ if (obj != "res") |
|
30 |
+ stop("Object in ", intensityFile, " seems to be invalid.") |
|
31 |
+ } |
|
32 |
+ if(row.names) row.names=res$gns else row.names=NULL |
|
33 |
+ if(col.names) col.names=res$sns else col.names=NULL |
|
34 |
+ |
|
35 |
+ res2 <- crlmmGTnm(res[["A"]], res[["B"]], res[["SNR"]], |
|
36 |
+ res[["mixtureParams"]], res[["cdfName"]], |
|
37 |
+ gender=gender, row.names=row.names, |
|
38 |
+ col.names=col.names, recallMin=recallMin, |
|
39 |
+ recallRegMin=1000, SNRMin=SNRMin, |
|
40 |
+ returnParams=returnParams) |
|
41 |
+ |
|
42 |
+ res2[["SNR"]] <- res[["SNR"]] |
|
43 |
+ |
|
44 |
+ return(res2) |
|
45 |
+} |
|
46 |
+ |
|
47 |
+ |
|
48 |
+############################### |
|
49 |
+####### THIS IS TEMPORARY NOT OFFICIALLY USED |
|
50 |
+##################################### |
|
51 |
+ |
|
52 |
+crlmmTNoN <- function(filenames, row.names=TRUE, col.names=TRUE, |
|
53 |
+ probs=c(1/3, 1/3, 1/3), DF=6, SNRMin=5, gender=NULL, |
|
54 |
+ save.it=FALSE, load.it=FALSE, |
|
55 |
+ intensityFile="tmpcrlmmintensities.rda", |
|
56 |
+ desctrucitve=FALSE, mixtureSampleSize=10^5, eps=0.1, |
|
57 |
+ verbose=TRUE){ |
|
58 |
+ if (load.it & !file.exists(intensityFile)){ |
|
59 |
+ load.it <- FALSE |
|
60 |
+ message("File ", intensityFile, " does not exist.") |
|
61 |
+ message("Not loading it, but running SNPRMA from scratch.") |
|
62 |
+ } |
|
63 |
+ if (!load.it){ |
|
64 |
+ res <- snprma(filenames, fitMixture=TRUE, |
|
65 |
+ mixtureSampleSize=mixtureSampleSize, verbose=verbose, |
|
66 |
+ eps=eps) |
|
67 |
+ if(save.it) save(res, file=intensityFile) |
|
68 |
+ }else{ |
|
69 |
+ message("Loading ", intensityFile, ".") |
|
70 |
+ obj <- load(intensityFile) |
|
71 |
+ message("Done.") |
|
72 |
+ if (obj != "res") |
|
73 |
+ stop("Object in ", intensityFile, " seems to be invalid.") |
|
74 |
+ } |
|
75 |
+ if(row.names) row.names=res$gns else row.names=NULL |
|
76 |
+ if(col.names) col.names=res$sns else col.names=NULL |
|
77 |
+ res2 <- crlmmGTTNoN(res[["A"]], res[["B"]], res[["SNR"]], |
|
78 |
+ res[["mixtureParams"]], res[["cdfName"]], |
|
79 |
+ gender=gender, row.names=row.names, |
|
80 |
+ col.names=col.names) |
|
81 |
+ res2[["SNR"]] <- res[["SNR"]] |
|
82 |
+ return(res2) |
|
83 |
+} |
|
84 |
+ |
|
85 |
+crlmmNormalNoN <- function(filenames, row.names=TRUE, col.names=TRUE, |
|
86 |
+ probs=c(1/3, 1/3, 1/3), DF=6, SNRMin=5, gender=NULL, |
|
87 |
+ save.it=FALSE, load.it=FALSE, |
|
88 |
+ intensityFile="tmpcrlmmintensities.rda", |
|
89 |
+ desctrucitve=FALSE, mixtureSampleSize=10^5, eps=0.1, |
|
90 |
+ verbose=TRUE){ |
|
91 |
+ if (load.it & !file.exists(intensityFile)){ |
|
92 |
+ load.it <- FALSE |
|
93 |
+ message("File ", intensityFile, " does not exist.") |
|
94 |
+ message("Not loading it, but running SNPRMA from scratch.") |
|
95 |
+ } |
|
96 |
+ if (!load.it){ |
|
97 |
+ res <- snprma(filenames, fitMixture=TRUE, |
|
98 |
+ mixtureSampleSize=mixtureSampleSize, verbose=verbose, |
|
99 |
+ eps=eps) |
|
100 |
+ if(save.it) save(res, file=intensityFile) |
|
101 |
+ }else{ |
|
102 |
+ message("Loading ", intensityFile, ".") |
|
103 |
+ obj <- load(intensityFile) |
|
104 |
+ message("Done.") |
|
105 |
+ if (obj != "res") |
|
106 |
+ stop("Object in ", intensityFile, " seems to be invalid.") |
|
107 |
+ } |
|
108 |
+ if(row.names) row.names=res$gns else row.names=NULL |
|
109 |
+ if(col.names) col.names=res$sns else col.names=NULL |
|
110 |
+ res2 <- crlmmGTNormalNoN(res[["A"]], res[["B"]], res[["SNR"]], |
|
111 |
+ res[["mixtureParams"]], res[["cdfName"]], |
|
112 |
+ gender=gender, row.names=row.names, |
|
113 |
+ col.names=col.names) |
|
114 |
+ res2[["SNR"]] <- res[["SNR"]] |
|
115 |
+ return(res2) |
|
116 |
+} |
0 | 117 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,45 @@ |
1 |
+fitAffySnpMixture56 <- function(S, M, knots, probs=rep(1/3, 3), eps=.01, maxit=10, verbose=FALSE){ |
|
2 |
+ ##56 stands for 5 and 6 arrays but will also work for Illumina |
|
3 |
+ ##Note the unfortunate choice of numbering: |
|
4 |
+ ##1 is BB, 2 AB, and 3 AA. Opposite to everything else! |
|
5 |
+ ##this is legacy code I decided not to change. |
|
6 |
+ ## this why at the end we report -coefs: F1 is the negative f |
|
7 |
+ mus <- append(quantile(M, c(1, 5)/6, names=FALSE), 0, 1) |
|
8 |
+ sigmas <- rep(mad(c(M[M<mus[1]]-mus[1], M[M>mus[3]]-mus[3])), 3) |
|
9 |
+ sigmas[2] <- sigmas[2]/2 |
|
10 |
+ |
|
11 |
+ weights <- apply(cbind(mus, sigmas), 1, function(p) dnorm(M, p[1], p[2])) |
|
12 |
+ previousF1 <- -Inf |
|
13 |
+ change <- eps+1 |
|
14 |
+ it <- 0 |
|
15 |
+ |
|
16 |
+ if(verbose) message("Max change must be under ", eps, ".") |
|
17 |
+ matS <- stupidSplineBasis(S, knots) |
|
18 |
+ while (change > eps & it < maxit){ |
|
19 |
+ it <- it+1 |
|
20 |
+ ## E |
|
21 |
+ z <- sweep(weights, 2, probs, "*") |
|
22 |
+ LogLik <- rowSums(z) |
|
23 |
+ z <- sweep(z, 1, LogLik, "/") |
|
24 |
+ probs <- colMeans(z) |
|
25 |
+ |
|
26 |
+ ## M |
|
27 |
+ fit1 <- crossprod(chol2inv(chol(crossprod(sweep(matS, 1, z[, 1], FUN="*"), matS))), crossprod(matS, z[, 1]*M)) |
|
28 |
+ |
|
29 |
+ fit2 <- sum(z[, 2]*M)/sum(z[, 2]) |
|
30 |
+ F1 <- matS%*%fit1 |
|
31 |
+ sigmas[c(1, 3)] <- sqrt(sum(z[, 1]*(M-F1)^2)/sum(z[, 1])) |
|
32 |
+ sigmas[2] <- sqrt(sum(z[, 2]*(M-fit2)^2)/sum(z[, 2])) |
|
33 |
+ |
|
34 |
+ weights[, 1] <- dnorm(M, F1, sigmas[1]) |
|
35 |
+ weights[, 2] <- dnorm(M, fit2, sigmas[2]) |
|
36 |
+ weights[, 3] <- dnorm(M, -F1, sigmas[3]) |
|
37 |
+ |
|
38 |
+ change <- max(abs(F1-previousF1)) |
|
39 |
+ previousF1 <- F1 |
|
40 |
+ if(verbose) message("Iter ", it, ": ", change, ".") |
|
41 |
+ } |
|
42 |
+ medF1 <- median(-F1) |
|
43 |
+ return(list(coef= -fit1, medF1=medF1, sigma1=sigmas[1], sigma2=sigmas[2])) |
|
44 |
+} |
|
45 |
+ |
0 | 46 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,80 @@ |
1 |
+snprma <- function(filenames, mixtureSampleSize=10^5, fitMixture=FALSE, eps=0.1, verbose=TRUE, seed=1, cdfName, sns){ |
|
2 |
+ ##ADD CHECK TO SEE IF LOADED |
|
3 |
+ if (missing(cdfName)) |
|
4 |
+ cdfName <- read.celfile.header(filenames[1])$cdfName |
|
5 |
+## stuffDir <- changeToCrlmmAnnotationName(cdfName) |
|
6 |
+ pkgname <- getCrlmmAnnotationName(cdfName) |
|
7 |
+ if(!require(pkgname, character.only=TRUE)){ |
|
8 |
+ suggCall <- paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="") |
|
9 |
+ msg <- paste("If", pkgname, "is installed on an alternative location, please load it manually by using", suggCall) |
|
10 |
+ message(strwrap(msg)) |
|
11 |
+ stop("Package ", pkgname, " could not be found.") |
|
12 |
+ rm(suggCall, msg) |
|
13 |
+ } |
|
14 |
+ |
|
15 |
+ if(verbose) message("Loading annotations and mixture model parameters.") |
|
16 |
+ data(preprocStuff, genotypeStuff, mixtureStuff, package=pkgname, envir=.crlmmPkgEnv) |
|
17 |
+ autosomeIndex <- getVarInEnv("autosomeIndex") |
|
18 |
+ pnsa <- getVarInEnv("pnsa") |
|
19 |
+ pnsb <- getVarInEnv("pnsb") |
|
20 |
+ fid <- getVarInEnv("fid") |
|
21 |
+ reference <- getVarInEnv("reference") |
|
22 |
+ aIndex <- getVarInEnv("aIndex") |
|
23 |
+ bIndex <- getVarInEnv("bIndex") |
|
24 |
+ SMEDIAN <- getVarInEnv("SMEDIAN") |
|
25 |
+ theKnots <- getVarInEnv("theKnots") |
|
26 |
+ gns <- getVarInEnv("gns") |
|
27 |
+ |
|
28 |
+ ##We will read each cel file, summarize, and run EM one by one |
|
29 |
+ ##We will save parameters of EM to use later |
|
30 |
+ mixtureParams <- matrix(0, 4, length(filenames)) |
|
31 |
+ SNR <- vector("numeric", length(filenames)) |
|
32 |
+ SKW <- vector("numeric", length(filenames)) |
|
33 |
+ |
|
34 |
+ ## This is the sample for the fitting of splines |
|
35 |
+ ## BC: I like better the idea of the user passing the seed, |
|
36 |
+ ## because this might intefere with other analyses |
|
37 |
+ ## (like what happened to GCRMA) |
|
38 |
+ set.seed(seed) |
|
39 |
+ |
|
40 |
+ idx <- sort(sample(autosomeIndex, mixtureSampleSize)) |
|
41 |
+ |
|
42 |
+ ##S will hold (A+B)/2 and M will hold A-B |
|
43 |
+ ##NOTE: We actually dont need to save S. Only for pics etc... |
|
44 |
+ ##f is the correction. we save to avoid recomputing |
|
45 |
+ A <- matrix(as.integer(0), length(pnsa), length(filenames)) |
|
46 |
+ B <- matrix(as.integer(0), length(pnsb), length(filenames)) |
|
47 |
+ |
|
48 |
+ if(verbose){ |
|
49 |
+ message("Processing ", length(filenames), " files.") |
|
50 |
+ pb <- txtProgressBar(min=0, max=length(filenames), style=3) |
|
51 |
+ } |
|
52 |
+ ##We start looping throug cel files |
|
53 |
+ idx2 <- sample(length(fid), 10^5) ##for skewness. no need to do everything |
|
54 |
+ for(i in seq(along=filenames)){ |
|
55 |
+ y <- as.matrix(read.celfile(filenames[i], intensity.means.only=TRUE)[["INTENSITY"]][["MEAN"]][fid]) |
|
56 |
+ x <- log2(y[idx2]) |
|
57 |
+ SKW[i] <- mean((x-mean(x))^3)/(sd(x)^3) |
|
58 |
+ rm(x) |
|
59 |
+ y <- normalize.quantiles.use.target(y, target=reference) |
|
60 |
+ A[, i] <- intMedianSummaries(y[aIndex, 1, drop=FALSE], pnsa) |
|
61 |
+ B[, i] <- intMedianSummaries(y[bIndex, 1, drop=FALSE], pnsb) |
|
62 |
+ |
|
63 |
+ ##Now to fit the EM |
|
64 |
+ if(fitMixture){ |
|
65 |
+ S <- (log2(A[idx, i])+log2(B[idx, i]))/2 - SMEDIAN |
|
66 |
+ M <- log2(A[idx, i])-log2(B[idx, i]) |
|
67 |
+ |
|
68 |
+ ##we need to test the choice of eps.. it is not the max diff between funcs |
|
69 |
+ tmp <- fitAffySnpMixture56(S, M, theKnots, eps=eps) |
|
70 |
+ |
|
71 |
+ mixtureParams[, i] <- tmp[["coef"]] |
|
72 |
+ SNR[i] <- tmp[["medF1"]]^2/(tmp[["sigma1"]]^2+tmp[["sigma2"]]^2) |
|
73 |
+ } |
|
74 |
+ if (verbose) setTxtProgressBar(pb, i) |
|
75 |
+ } |
|
76 |
+ close(pb) |
|
77 |
+ if (!fitMixture) SNR <- mixtureParams <- NA |
|
78 |
+ ## gns comes from preprocStuff.rda |
|
79 |
+ list(A=A, B=B, sns=sns, gns=gns, SNR=SNR, SKW=SKW, mixtureParams=mixtureParams, cdfName=cdfName) |
|
80 |
+} |
0 | 81 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,45 @@ |
1 |
+stupidSplineBasis <- function(x,knots){ |
|
2 |
+ x <- pmin(x,knots[3]) |
|
3 |
+ x <- pmax(x,knots[1]) |
|
4 |
+ cbind(1, x, x^2, pmax(0, (x-knots[2]))^2) |
|
5 |
+} |
|
6 |
+ |
|
7 |
+changeToCrlmmAnnotationName <- function(x){ |
|
8 |
+ pkgDir <- system.file(package=THISPKG) |
|
9 |
+ wanted <- paste(tolower(gsub("_", "", x)), "crlmm.stuff", sep=".") |
|
10 |
+ file.path(pkgDir, "extdata", wanted) |
|
11 |
+} |
|
12 |
+ |
|
13 |
+getCrlmmAnnotationName <- function(x){ |
|
14 |
+ paste(tolower(gsub("_", "", x)), "Crlmm", sep="") |
|
15 |
+} |
|
16 |
+ |
|
17 |
+medianSummaries <- function(mat, grps) |
|
18 |
+ .Call("R_subColSummarize_median", mat, grps, PACKAGE = "preprocessCore") |
|
19 |
+ |
|
20 |
+intMedianSummaries <- function(mat, grps) |
|
21 |
+ as.integer(medianSummaries(mat, grps)) |
|
22 |
+ |
|
23 |
+testProb <- function(p) |
|
24 |
+ .Call("test", p) |
|
25 |
+ |
|
26 |
+ |
|
27 |
+list.celfiles <- function(...){ |
|
28 |
+ files <- list.files(...) |
|
29 |
+ return(files[grep("\\.[cC][eE][lL]$", files)]) |
|
30 |
+} |
|
31 |
+ |
|
32 |
+## .crlmmPkgEnv is an enviroment that will |
|
33 |
+## store all the variables used by the pkg. |
|
34 |
+## it's meant to not overwrite user's variables |
|
35 |
+## and get rid of the NOTES generated by |
|
36 |
+## R CMD check |
|
37 |
+ |
|
38 |
+isLoaded <- function(dataset, environ=.crlmmPkgEnv) |
|
39 |
+ exists(dataset, envir=environ) |
|
40 |
+ |
|
41 |
+getVarInEnv <- function(dataset, environ=.crlmmPkgEnv){ |
|
42 |
+ if (!isLoaded(dataset)) |
|
43 |
+ stop("Variable ", dataset, " not found in .crlmmPkgEnv") |
|
44 |
+ environ[[dataset]] |
|
45 |
+} |
0 | 46 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,16 @@ |
1 |
+# Loading required libraries |
|
2 |
+THISPKG <- "crlmm" |
|
3 |
+ |
|
4 |
+.onLoad <- function(libname, pkgname) { |
|
5 |
+ require("methods") |
|
6 |
+} |
|
7 |
+ |
|
8 |
+.onAttach <- function(libname, pkgname) { |
|
9 |
+ message("Welcome to crlmm version ", packageDescription(THISPKG, field="Version")) |
|
10 |
+} |
|
11 |
+ |
|
12 |
+.onUnload <- function( libpath ){ |
|
13 |
+ library.dynam.unload(THISPKG, libpath) |
|
14 |
+} |
|
15 |
+ |
|
16 |
+ .crlmmPkgEnv <- new.env(parent=emptyenv()) |
0 | 17 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,32 @@ |
1 |
+\name{crlmm-package} |
|
2 |
+\alias{crlmm-package} |
|
3 |
+\docType{package} |
|
4 |
+\title{ |
|
5 |
+Genotype Calling via CRLMM Algorithm |
|
6 |
+} |
|
7 |
+\description{ |
|
8 |
+Faster implementation of CRLMM specific to SNP 5.0 and 6.0 arrays. |
|
9 |
+} |
|
10 |
+\details{ |
|
11 |
+Index: |
|
12 |
+\preformatted{ |
|
13 |
+crlmm Genotype SNP 5.0 or 6.0 samples. |
|
14 |
+crlmm-package New implementation of the CRLMM Algorithm. |
|
15 |
+} |
|
16 |
+The 'crlmm' package reimplements the CRLMM algorithm present in the |
|
17 |
+'oligo' package. This implementation primes for efficient genotyping of |
|
18 |
+samples on SNP 5.0 and SNP 6.0 Affymetrix arrays. |
|
19 |
+ |
|
20 |
+To use this package, the user must have additional data packages: |
|
21 |
+'genomewidesnp5Crlmm' - SNP 5.0 arrays |
|
22 |
+'genomewidesnp6Crlmm' - SNP 6.0 arrays |
|
23 |
+} |
|
24 |
+\author{ |
|
25 |
+Rafael A Irizarry |
|
26 |
+Maintainer: Benilton S Carvalho <bcarvalh@jhsph.edu> |
|
27 |
+} |
|
28 |
+\references{ |
|
29 |
+ Carvalho B, Louis TA, Irizarry RA. Describing Uncertainty in |
|
30 |
+ Genome-wide Genotype Calling. (in prep) |
|
31 |
+} |
|
32 |
+\keyword{ package } |
0 | 33 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,63 @@ |
1 |
+\name{crlmm} |
|
2 |
+\alias{crlmm} |
|
3 |
+%- Also NEED an '\alias' for EACH other topic documented here. |
|
4 |
+\title{Genotype oligonucleotide arrays with CRLMM} |
|
5 |
+\description{ |
|
6 |
+ This is a faster and more efficient implementation of the CRLMM |
|
7 |
+ algorithm, especially designed for Affymetrix SNP 5 and 6 arrays (to |
|
8 |
+ be soon extended to other platforms). |
|
9 |
+} |
|
10 |
+\usage{ |
|
11 |
+ |
|
12 |
+crlmm(filenames, row.names = TRUE, col.names = TRUE, probs = c(1/3, 1/3, |
|
13 |
+1/3), DF = 6, SNRMin = 5, gender = NULL, save.it = FALSE, load.it = |
|
14 |
+FALSE, intensityFile, mixtureSampleSize = |
|
15 |
+10^5, eps = 0.1, verbose = TRUE, cdfName, sns, recallMin = 10, |
|
16 |
+recallRegMin = 1000, returnParams = FALSE, badSNP = 0.7) |
|
17 |
+ |
|
18 |
+} |
|
19 |
+%- maybe also 'usage' for other objects documented here. |
|
20 |
+\arguments{ |
|
21 |
+ \item{filenames}{'character' vector with CEL files to be genotyped.} |
|
22 |
+ \item{row.names}{'logical'. Use rownames - SNP names?} |
|
23 |
+ \item{col.names}{'logical'. Use colnames - Sample names?} |
|
24 |
+ \item{probs}{'numeric' vector with priors for AA, AB and BB.} |
|
25 |
+ \item{DF}{'integer' with number of degrees of freedom to use with t-distribution.} |
|
26 |
+ \item{SNRMin}{'numeric' scalar defining the minimum SNR used to filter |
|
27 |
+ out samples.} |
|
28 |
+ \item{gender}{'integer' vector, with same length as 'filenames', |
|
29 |
+ defining gender. (1 - male; 2 - female)} |
|
30 |
+ \item{save.it}{'logical'. Save preprocessed data?} |
|
31 |
+ \item{load.it}{'logical'. Load preprocessed data to speed up analysis?} |
|
32 |
+ \item{intensityFile}{'character' with filename to be saved/loaded - |
|
33 |
+ preprocessed data.} |
|
34 |
+ \item{mixtureSampleSize}{Number of SNP's to be used with the mixture model.} |
|
35 |
+ \item{eps}{Minimum change for mixture model.} |
|
36 |
+ \item{verbose}{'logical'.} |
|
37 |
+ \item{cdfName}{'character' defining the CDF name to use |
|
38 |
+ ('GenomeWideSnp5', 'GenomeWideSnp6')} |
|
39 |
+ \item{sns}{'character' vector with sample names to be used.} |
|
40 |
+ \item{recallMin}{Minimum number of samples for recalibration.} |
|
41 |
+ \item{recallRegMin}{Minimum number of SNP's for regression.} |
|
42 |
+ \item{returnParams}{'logical'. Return recalibrated parameters.} |
|
43 |
+ \item{badSNP}{'numeric'. Threshold to flag as bad SNP (affects batchQC)} |
|
44 |
+} |
|
45 |
+\value{ |
|
46 |
+ \item{calls}{Genotype calls (1 - AA, 2 - AB, 3 - BB)} |
|
47 |
+ \item{confs}{Confidence scores 'round(-1000*log2(1-p))'} |
|
48 |
+ \item{SNPQC}{SNP Quality Scores} |
|
49 |
+ \item{batchQC}{Batch Quality Score} |
|
50 |
+ \item{params}{Recalibrated parameters} |
|
51 |
+} |
|
52 |
+\references{ |
|
53 |
+ Carvalho B, Bengtsson H, Speed TP, Irizarry RA. Exploration, |
|
54 |
+ normalization, and genotype calls of high-density oligonucleotide SNP |
|
55 |
+ array data. Biostatistics. 2007 Apr;8(2):485-99. Epub 2006 Dec |
|
56 |
+ 22. PMID: 17189563. |
|
57 |
+ |
|
58 |
+ Carvalho B, Louis TA, Irizarry RA. Describing Uncertainty in |
|
59 |
+ Genome-wide Genotype Calling. (in prep) |
|
60 |
+} |
|
61 |
+ |
|
62 |
+\keyword{classif} |
|
63 |
+ |
0 | 64 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,29 @@ |
1 |
+\name{list.celfiles} |
|
2 |
+\alias{list.celfiles} |
|
3 |
+ |
|
4 |
+\title{List CEL files.} |
|
5 |
+\description{ |
|
6 |
+ Function used to get a list of CEL files. |
|
7 |
+} |
|
8 |
+\usage{ |
|
9 |
+list.celfiles(...) |
|
10 |
+} |
|
11 |
+ |
|
12 |
+\arguments{ |
|
13 |
+ \item{\dots}{Same arguments of \code{\link{list.files}}} |
|
14 |
+} |
|
15 |
+\details{ |
|
16 |
+ For the moment, this function returns only uncompressed CEL files (ie, |
|
17 |
+ no CEL.gz) |
|
18 |
+} |
|
19 |
+\value{ |
|
20 |
+ Character vector with filenames. |
|
21 |
+} |
|
22 |
+ |
|
23 |
+\note{ |
|
24 |
+ Quite often users want to use this function to pass filenames to other |
|
25 |
+ methods. In this situations, it is safer to use the argument 'full.names=TRUE'. |
|
26 |
+} |
|
27 |
+\seealso{\code{\link{list.files}}} |
|
28 |
+\keyword{IO} |
|
29 |
+\keyword{utilities} |
0 | 30 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,413 @@ |
1 |
+#include <math.h> |
|
2 |
+#include <R.h> |
|
3 |
+#include <Rdefines.h> |
|
4 |
+#include <Rmath.h> |
|
5 |
+#include <Rinternals.h> |
|
6 |
+ |
|
7 |
+#include "utils.h" |
|
8 |
+ |
|
9 |
+static double mydt(double x, int df){ |
|
10 |
+ return(pow(1.0+pow(x, 2.0)/ (double)df, -((double)df+1.0)/2.0)); |
|
11 |
+} |
|
12 |
+ |
|
13 |
+SEXP gtypeCallerPart1(SEXP A, SEXP B, SEXP fIndex, SEXP mIndex, |
|
14 |
+ SEXP theCenters, SEXP theScales, SEXP theNs, |
|
15 |
+ SEXP Indexes, SEXP cIndexes, SEXP nIndexes, |
|
16 |
+ SEXP ncIndexes, SEXP SMEDIAN, |
|
17 |
+ SEXP knots, SEXP mixtureParams, SEXP df, |
|
18 |
+ SEXP probs, SEXP trim){ |
|
19 |
+ /* |
|
20 |
+ ARGUMENTS |
|
21 |
+ --------- |
|
22 |
+ A: intensity matrix for allele A |
|
23 |
+ B: intensity matrix for allele B |
|
24 |
+ fIndex: indexes for females (columns in A/B for females) |
|
25 |
+ mIndex: indexes for males (columns in A/B for males) |
|
26 |
+ theCenters: matrix with SNP-specific centers (3 columns: AA/AB/BB) |
|
27 |
+ theScales: matrix with SNP-specific scales (3 columns: AA/AB/BB) |
|
28 |
+ theNs: matrix with SNP-specific counts (3 columns: AA/AB/BB) |
|
29 |
+ Indexes: list with 3 elements (autosomeIndex, XIndex, YIndex) for SNPs |
|
30 |
+ cIndexes: list with 3 elements (keepIndex, keepIndexFemale, keepIndexMale) for arrays |
|
31 |
+ SMEDIAN: scalar (median S) |
|
32 |
+ knots: knots for mixture |
|
33 |
+ mixtureParams: mixture parameters |
|
34 |
+ probs: genotype priors (1/3) for *ALL* SNPs. It's a vector of length 3 |
|
35 |
+ trim: drop rate to estimate means |
|
36 |