Browse code

Adding crlmm package

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@36653 bc3139a8-67e5-0310-9ffc-ced21a209358

Benilton Carvalho authored on 14/01/2009 22:00:52
Showing 21 changed files

1 1
new file mode 100644
... ...
@@ -0,0 +1,19 @@
1
+Package: crlmm
2
+Type: Package
3
+Title: Genotype Calling via CRLMM Algorithm
4
+Version: 1.24
5
+Date: 2008-12-28
6
+Author: Rafael A Irizarry
7
+Maintainer: Benilton S Carvalho <bcarvalh@jhsph.edu>
8
+Description: Faster implementation of CRLMM specific to SNP 5.0 and 6.0 arrays.
9
+License: Artistic-2.0
10
+Depends: methods, affyio, preprocessCore, utils
11
+Collate: crlmmGT.R
12
+         crlmm.R
13
+         fitAffySnpMixture56.R
14
+         snprma.R
15
+         utils.R
16
+         zzz.R
17
+         crlmmGTnm.R
18
+         crlmmnm.R
19
+LazyLoad: yes
0 20
new file mode 100644
... ...
@@ -0,0 +1,4 @@
1
+useDynLib("crlmm")
2
+import(methods)
3
+export("crlmm",
4
+       "list.celfiles")
0 5
new file mode 100644
... ...
@@ -0,0 +1,117 @@
1
+crlmm <- function(filenames, row.names=TRUE, col.names=TRUE,
2
+                  probs=c(1/3, 1/3, 1/3), DF=6, SNRMin=5, gender=NULL,
3
+                  save.it=FALSE, load.it=FALSE, intensityFile,
4
+                  mixtureSampleSize=10^5, eps=0.1, verbose=TRUE,
5
+                  cdfName, sns, recallMin=10, recallRegMin=1000,
6
+                  returnParams=FALSE, badSNP=.7){
7
+  if ((load.it | save.it) & missing(intensityFile))
8
+    stop("'intensityFile' is missing, and you chose either load.it or save.it")
9
+  
10
+  if (missing(sns)) sns <- basename(filenames)
11
+  if (load.it & !file.exists(intensityFile)){
12
+    load.it <- FALSE
13
+    message("File ", intensityFile, " does not exist.")
14
+    message("Not loading it, but running SNPRMA from scratch.")
15
+  }
16
+  if (!load.it){
17
+    res <- snprma(filenames, fitMixture=TRUE,
18
+                  mixtureSampleSize=mixtureSampleSize, verbose=verbose,
19
+                  eps=eps, cdfName=cdfName, sns=sns)
20
+    if(save.it){
21
+      t0 <- proc.time()
22
+      save(res, file=intensityFile)
23
+      t0 <- proc.time()-t0
24
+      if (verbose) message("Used ", t0[3], " seconds to save ", intensityFile, ".")
25
+    }
26
+  }else{
27
+    if (verbose) message("Loading ", intensityFile, ".")
28
+    obj <- load(intensityFile)
29
+    if (verbose) message("Done.")
30
+    if (obj != "res")
31
+      stop("Object in ", intensityFile, " seems to be invalid.")
32
+  }
33
+  if(row.names) row.names=res$gns else row.names=NULL
34
+  if(col.names) col.names=res$sns else col.names=NULL
35
+
36
+  res2 <- crlmmGT(res[["A"]], res[["B"]], res[["SNR"]],
37
+                  res[["mixtureParams"]], res[["cdfName"]],
38
+                  gender=gender, row.names=row.names,
39
+                  col.names=col.names, recallMin=recallMin,
40
+                  recallRegMin=1000, SNRMin=SNRMin,
41
+                  returnParams=returnParams, badSNP=badSNP)
42
+
43
+  res2[["SNR"]] <- res[["SNR"]]
44
+  res2[["SKW"]] <- res[["SKW"]]
45
+  return(res2)
46
+}
47
+
48
+
49
+###############################
50
+####### THIS IS TEMPORARY NOT OFFICIALLY USED
51
+#####################################
52
+
53
+crlmmTNoN <- function(filenames, row.names=TRUE, col.names=TRUE,
54
+                  probs=c(1/3, 1/3, 1/3), DF=6, SNRMin=5, gender=NULL,
55
+                  save.it=FALSE, load.it=FALSE,
56
+                  intensityFile="tmpcrlmmintensities.rda",
57
+                  desctrucitve=FALSE, mixtureSampleSize=10^5, eps=0.1,
58
+                  verbose=TRUE){
59
+  if (load.it & !file.exists(intensityFile)){
60
+    load.it <- FALSE
61
+    message("File ", intensityFile, " does not exist.")
62
+    message("Not loading it, but running SNPRMA from scratch.")
63
+  }
64
+  if (!load.it){
65
+    res <- snprma(filenames, fitMixture=TRUE,
66
+                  mixtureSampleSize=mixtureSampleSize, verbose=verbose,
67
+                  eps=eps)
68
+    if(save.it) save(res, file=intensityFile)
69
+  }else{
70
+    message("Loading ", intensityFile, ".")
71
+    obj <- load(intensityFile)
72
+    message("Done.")
73
+    if (obj != "res")
74
+      stop("Object in ", intensityFile, " seems to be invalid.")
75
+  }
76
+  if(row.names) row.names=res$gns else row.names=NULL
77
+  if(col.names) col.names=res$sns else col.names=NULL
78
+  res2 <- crlmmGTTNoN(res[["A"]], res[["B"]], res[["SNR"]],
79
+                  res[["mixtureParams"]], res[["cdfName"]],
80
+                  gender=gender, row.names=row.names,
81
+                  col.names=col.names)
82
+  res2[["SNR"]] <- res[["SNR"]]
83
+  return(res2)
84
+}
85
+
86
+crlmmNormalNoN <- function(filenames, row.names=TRUE, col.names=TRUE,
87
+                  probs=c(1/3, 1/3, 1/3), DF=6, SNRMin=5, gender=NULL,
88
+                  save.it=FALSE, load.it=FALSE,
89
+                  intensityFile="tmpcrlmmintensities.rda",
90
+                  desctrucitve=FALSE, mixtureSampleSize=10^5, eps=0.1,
91
+                  verbose=TRUE){
92
+  if (load.it & !file.exists(intensityFile)){
93
+    load.it <- FALSE
94
+    message("File ", intensityFile, " does not exist.")
95
+    message("Not loading it, but running SNPRMA from scratch.")
96
+  }
97
+  if (!load.it){
98
+    res <- snprma(filenames, fitMixture=TRUE,
99
+                  mixtureSampleSize=mixtureSampleSize, verbose=verbose,
100
+                  eps=eps)
101
+    if(save.it) save(res, file=intensityFile)
102
+  }else{
103
+    message("Loading ", intensityFile, ".")
104
+    obj <- load(intensityFile)
105
+    message("Done.")
106
+    if (obj != "res")
107
+      stop("Object in ", intensityFile, " seems to be invalid.")
108
+  }
109
+  if(row.names) row.names=res$gns else row.names=NULL
110
+  if(col.names) col.names=res$sns else col.names=NULL
111
+  res2 <- crlmmGTNormalNoN(res[["A"]], res[["B"]], res[["SNR"]],
112
+                  res[["mixtureParams"]], res[["cdfName"]],
113
+                  gender=gender, row.names=row.names,
114
+                  col.names=col.names)
115
+  res2[["SNR"]] <- res[["SNR"]]
116
+  return(res2)
117
+}
0 118
new file mode 100644
... ...
@@ -0,0 +1,547 @@
1
+crlmmGT <- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL,
2
+                    col.names=NULL, probs=c(1/3, 1/3, 1/3), DF=6,
3
+                    SNRMin=5, recallMin=10, recallRegMin=1000,
4
+                    gender=NULL, desctrucitve=FALSE, verbose=TRUE,
5
+                    returnParams=FALSE, badSNP=.7){
6
+  
7
+  keepIndex <- which(SNR>SNRMin)
8
+  if(length(keepIndex)==0) stop("No arrays above quality threshold!")
9
+  
10
+  NC <- ncol(A)
11
+  NR <- nrow(A)
12
+  
13
+  pkgname <- getCrlmmAnnotationName(cdfName)
14
+  if(!require(pkgname, character.only=TRUE)){
15
+    suggCall <- paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="")
16
+    msg <- paste("If", pkgname, "is installed on an alternative location, please load it manually by using", suggCall)
17
+    message(strwrap(msg))
18
+    stop("Package ", pkgname, " could not be found.")
19
+    rm(suggCall, msg)
20
+  }
21
+
22
+  if(verbose) message("Loading annotations.")
23
+  data(genotypeStuff, mixtureStuff, package=pkgname, envir=.crlmmPkgEnv)
24
+
25
+  ## this is toget rid of the 'no visible binding' notes
26
+  ## variable definitions
27
+  XIndex <- getVarInEnv("XIndex")
28
+  autosomeIndex <- getVarInEnv("autosomeIndex")
29
+  YIndex <- getVarInEnv("YIndex")
30
+  SMEDIAN <- getVarInEnv("SMEDIAN")
31
+  theKnots <- getVarInEnv("theKnots")
32
+  regionInfo <- getVarInEnv("regionInfo")
33
+  
34
+  ##IF gender not provide, we predict
35
+  if(is.null(gender)){
36
+    if(verbose) message("Determining gender.")
37
+    XMedian <- apply(log2(A[XIndex,, drop=FALSE])+log2(B[XIndex,, drop=FALSE]), 2, median)/2
38
+    if(sum(SNR>SNRMin)==1){
39
+      gender <- which.min(c(abs(XMedian-8.9), abs(XMedian-9.5)))
40
+    }else{
41
+      gender <- kmeans(XMedian, c(min(XMedian[SNR>SNRMin]), max(XMedian[SNR>SNRMin])))[["cluster"]]
42
+    }
43
+  }
44
+  
45
+  Indexes <- list(autosomeIndex, XIndex, YIndex)
46
+  cIndexes <- list(keepIndex, 
47
+                   keepIndex[which(gender[keepIndex]==2)], 
48
+                   keepIndex[which(gender[keepIndex]==1)])
49
+  
50
+  if(verbose) cat("Calling", NR, "SNPs for recalibration")
51
+
52
+  ## call C
53
+  fIndex <- which(gender==2)
54
+  mIndex <- which(gender==1)
55
+  newparams <- gtypeCallerR(A, B, fIndex, mIndex,
56
+                            params[["centers"]], params[["scales"]], params[["N"]],
57
+                            Indexes, cIndexes,
58
+                            sapply(Indexes, length), sapply(cIndexes, length),
59
+                            SMEDIAN, theKnots,
60
+                            mixtureParams, DF, probs, 0.025)
61
+  gc(verbose=FALSE)
62
+  names(newparams) <- c("centers", "scales", "N")
63
+  
64
+  if(verbose) message("Done.")
65
+  if(verbose) message("Estimating recalibration parameters.")
66
+  d <- newparams[["centers"]] - params$centers
67
+
68
+  ##regression 
69
+  Index <- intersect(which(pmin(newparams[["N"]][, 1],
70
+                                newparams[["N"]][, 2],
71
+                                newparams[["N"]][, 3]) > recallMin &
72
+                                !apply(regionInfo, 1, any)),
73
+                                autosomeIndex)
74
+  
75
+  if(length(Index) < recallRegMin){
76
+    warning("Recallibration not possible.")
77
+    newparams <- params
78
+    dev <- vector("numeric", nrow(newparams[["centers"]]))
79
+    SS <- matrix(Inf, 3, 3)
80
+    DD <- 0
81
+  }else{
82
+    data4reg <- as.data.frame(newparams[["centers"]][Index,])
83
+    names(data4reg) <- c("AA", "AB", "BB")
84
+    regParams <- cbind(  coef(lm(AA~AB*BB, data=data4reg)),
85
+                       c(coef(lm(AB~AA+BB, data=data4reg)), 0), 
86
+                       coef(lm(BB~AA*AB, data=data4reg)))
87
+    rownames(regParams) <- c("intercept", "X", "Y", "XY")
88
+    rm(data4reg)
89
+  
90
+    minN <- 3
91
+    newparams[["centers"]][newparams[["N"]] < minN] <- NA
92
+    Index <- setdiff(which(rowSums(is.na(newparams[["centers"]]))==1), YIndex)
93
+    if(verbose) cat("Filling out empty centers")
94
+    for(i in Index){
95
+      if(verbose) if(i%%10000==0)cat(".")
96
+      mu <- newparams[["centers"]][i, ]
97
+      j <- which(is.na(mu))
98
+      newparams[["centers"]][i, j] <- c(1, mu[-j], prod(mu[-j]))%*%regParams[, j]
99
+    }
100
+    
101
+    ##remaing NAs are made like originals
102
+    if(length(YIndex)>0){
103
+      noMoveIndex <- union(setdiff(which(rowSums(is.na(newparams[["centers"]]))>0), YIndex), 
104
+                           YIndex[rowSums(is.na(newparams[["centers"]][YIndex, ])>1)])
105
+    }
106
+    newparams[["centers"]][is.na(newparams[["centers"]])] <- params[["centers"]][is.na(newparams[["centers"]])]
107
+    if(verbose) cat("\n")
108
+  
109
+    if(verbose) message("Calculating and standardizing size of shift.")
110
+    DD <- newparams[["centers"]] - params[["centers"]]
111
+    DD <- sweep(DD, 2, colMeans(DD[autosomeIndex, ]))
112
+    SS <- cov(DD[autosomeIndex, ])
113
+    SSI <- solve(SS)
114
+    dev <- vector("numeric", nrow(DD))
115
+    if(length(YIndex)){
116
+      dev[-YIndex] <- apply(DD[-YIndex, ], 1, function(x) x%*%SSI%*%x)
117
+      dev[-YIndex] <- 1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev[-YIndex])
118
+      ##Now Y (only two params)
119
+      SSY <- SS[c(1, 3), c(1, 3)]
120
+      SSI <- solve(SSY) 
121
+      dev[YIndex] <- apply(DD[YIndex, c(1, 3)], 1, function(x) x%*%SSI%*%x)
122
+      dev[YIndex] <- 1/sqrt( (2*pi)^2*det(SSY))*exp(-0.5*dev[YIndex])
123
+    } else {
124
+      dev=apply(DD,1,function(x) x%*%SSI%*%x)
125
+      dev=1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev)
126
+    }
127
+  }
128
+    
129
+  ## BC: must keep SD
130
+  params[-2] <- newparams[-2]
131
+  
132
+  rm(newparams);gc(verbose=FALSE)  
133
+  if(verbose) cat("Calling", NR, "SNPs")
134
+  ## ###################
135
+  ## ## MOVE TO C#######
136
+  ImNull <- gtypeCallerR2(A, B, fIndex, mIndex, params[["centers"]],
137
+                          params[["scales"]], params[["N"]], Indexes,
138
+                          cIndexes, sapply(Indexes, length),
139
+                          sapply(cIndexes, length), SMEDIAN, theKnots,
140
+                          mixtureParams, DF, probs, 0.025,
141
+                          which(regionInfo[,2]),
142
+                          which(regionInfo[,1]))
143
+  gc(verbose=FALSE)
144
+  ##  END MOVE TO C#######
145
+  ## ##################
146
+  
147
+  dev <- dev/(dev+1/383)
148
+  if(!is.null(row.names)){ rownames(A) <- rownames(B) <- names(dev) <- row.names}
149
+  if(!is.null(col.names)){ colnames(A) <- colnames(B) <- col.names}
150
+
151
+  if(length(Index) >= recallRegMin){
152
+    tmp4batchQC <- DD[autosomeIndex,]/(params[["N"]][autosomeIndex,]+1)
153
+    tmpSnpQc <- dev[autosomeIndex]
154
+    SS <- cov(tmp4batchQC[tmpSnpQc < badSNP,])
155
+    batchQC <- mean(diag(SS))
156
+  }else{
157
+    batchQC <- Inf
158
+  }
159
+  
160
+  if(verbose) message("Done.")
161
+  if (returnParams){
162
+    return(list(calls=A, confs=B, SNPQC=dev, batchQC=batchQC, params=params, DD=DD))
163
+  }else{
164
+    return(list(calls=A, confs=B, SNPQC=dev, batchQC=batchQC, DD=DD))
165
+  }
166
+}
167
+
168
+
169
+gtypeCallerR <- function(A, B, fIndex, mIndex, theCenters, theScales,
170
+                         theNs, Indexes, cIndexes, nIndexes,
171
+                         ncIndexes, SMEDIAN, knots, params, dft,
172
+                         probs, trim){
173
+
174
+  stopifnot(!missing(A), !missing(B), dim(A)==dim(B),
175
+            nrow(A)==nrow(theCenters), nrow(A)==nrow(theScales),
176
+            nrow(A) == nrow(theNs), length(knots)==3, nrow(params)==4,
177
+            ncol(params)==ncol(A), length(trim)==1, length(probs)==3)
178
+
179
+  ## make code robust
180
+  ## check types before passing to C
181
+  
182
+  .Call("gtypeCallerPart1", A, B,
183
+        as.integer(fIndex), as.integer(mIndex),
184
+        as.numeric(theCenters), as.numeric(theScales),
185
+        as.integer(theNs), lapply(Indexes, as.integer), lapply(cIndexes, as.integer), as.integer(nIndexes), as.integer(ncIndexes),
186
+        as.numeric(SMEDIAN), as.numeric(knots), as.numeric(params),
187
+        as.integer(dft), as.numeric(probs), as.numeric(trim),
188
+        PACKAGE="crlmm")
189
+  
190
+}
191
+
192
+gtypeCallerR2 <- function(A, B, fIndex, mIndex, theCenters, theScales,
193
+                         theNs, Indexes, cIndexes, nIndexes,
194
+                         ncIndexes, SMEDIAN, knots, params, dft,
195
+                         probs, trim, noTraining, noInfo){
196
+
197
+  stopifnot(!missing(A), !missing(B), dim(A)==dim(B),
198
+            nrow(A)==nrow(theCenters), nrow(A)==nrow(theScales),
199
+            nrow(A) == nrow(theNs), length(knots)==3, nrow(params)==4,
200
+            ncol(params)==ncol(A), length(trim)==1, length(probs)==3)
201
+
202
+  .Call("gtypeCallerPart2", A, B,
203
+        as.integer(fIndex), as.integer(mIndex),
204
+        as.numeric(theCenters), as.numeric(theScales),
205
+        as.integer(theNs), Indexes, cIndexes, nIndexes, ncIndexes,
206
+        as.numeric(SMEDIAN), as.numeric(knots), as.numeric(params),
207
+        as.integer(dft), as.numeric(probs), as.numeric(trim),
208
+        as.integer(noTraining), as.integer(noInfo), PACKAGE="crlmm")
209
+  
210
+}
211
+
212
+
213
+
214
+
215
+
216
+##################
217
+##################
218
+### THIS IS TEMPORARY NOT OFFICIALLY USED
219
+##################
220
+####################
221
+crlmmGTTNoN <- function(A, B, SNR, mixtureParams, cdfName,
222
+                         row.names=NULL, col.names=NULL, probs=c(1/3,
223
+                         1/3, 1/3), DF=6, SNRMin=6, gender=NULL,
224
+                         desctrucitve=FALSE, verbose=TRUE){
225
+  keepIndex <- which(SNR>SNRMin)
226
+  if(length(keepIndex)==0) stop("No arrays above quality threshold!")
227
+  
228
+  NC <- ncol(A)
229
+  NR <- nrow(A)
230
+  
231
+  if(verbose) message("Loading annotations.")
232
+  data(genotypeStuff, mixtureStuff, package=pkgname, envir=.crlmmPkgEnv)
233
+
234
+  ## this is toget rid of the 'no visible binding' notes
235
+  ## variable definitions
236
+  XIndex <- getVarInEnv("XIndex")
237
+  autosomeIndex <- getVarInEnv("autosomeIndex")
238
+  YIndex <- getVarInEnv("YIndex")
239
+  SMEDIAN <- getVarInEnv("SMEDIAN")
240
+  theKnots <- getVarInEnv("theKnots")
241
+  regionInfo <- getVarInEnv("regionInfo")
242
+  
243
+  ##IF gender not provide, we predict
244
+  if(is.null(gender)){
245
+    if(verbose) message("Determining gender.")
246
+    XMedian <- apply(log2(A[XIndex,, drop=FALSE])+log2(B[XIndex,, drop=FALSE]), 2, median)/2
247
+    if(sum(SNR>SNRMin)==1) gender <- which.min(c(abs(XMedian-8.9), abs(XMedian-9.5))) else  gender <- kmeans(XMedian, c(min(XMedian[SNR>SNRMin]), max(XMedian[SNR>SNRMin])))[["cluster"]]
248
+  }
249
+  
250
+  Indexes <- list(autosomeIndex, XIndex, YIndex)
251
+  cIndexes <- list(keepIndex, 
252
+                   keepIndex[which(gender[keepIndex]==2)], 
253
+                   keepIndex[which(gender[keepIndex]==1)])
254
+  
255
+  if(verbose) cat("Calling", NR, "SNPs for recalibration")
256
+
257
+  ## call C
258
+  fIndex <- which(gender==2)
259
+  mIndex <- which(gender==1)
260
+  t0 <- proc.time()
261
+  newparams <- gtypeCallerRTNoN(A, B, fIndex, mIndex,
262
+                            params[["centers"]], params[["scales"]], params[["N"]],
263
+                            Indexes, cIndexes,
264
+                            sapply(Indexes, length), sapply(cIndexes, length),
265
+                            SMEDIAN, theKnots,
266
+                            mixtureParams, DF, probs, 0.025)
267
+  t0 <- proc.time()-t0
268
+  message("Part 1 took ", t0[3], " seconds.")
269
+  names(newparams) <- c("centers", "scales", "N")
270
+  
271
+  if(verbose) message("Done.")
272
+  if(verbose) message("Estimating recalibration parameters.")
273
+  d <- newparams[["centers"]] - params$centers
274
+
275
+  ##regression 
276
+  MIN <- 10
277
+  Index <- intersect(which(pmin(newparams[["N"]][, 1], newparams[["N"]][, 2], newparams[["N"]][, 3])>MIN & !apply(regionInfo, 1, any)), autosomeIndex)
278
+  data4reg <- as.data.frame(newparams[["centers"]][Index,])
279
+  names(data4reg) <- c("AA", "AB", "BB")
280
+  regParams <- cbind(  coef(lm(AA~AB*BB, data=data4reg)),
281
+                     c(coef(lm(AB~AA+BB, data=data4reg)), 0), 
282
+                       coef(lm(BB~AA*AB, data=data4reg)))
283
+  rownames(regParams) <- c("intercept", "X", "Y", "XY")
284
+  rm(data4reg)
285
+  
286
+  minN <- 3
287
+  newparams[["centers"]][newparams[["N"]]<minN] <- NA
288
+  Index <- setdiff(which(rowSums(is.na(newparams[["centers"]]))==1), YIndex)
289
+  if(verbose) cat("Filling out empty centers")
290
+  for(i in Index){
291
+    if(verbose) if(i%%10000==0)cat(".")
292
+    mu <- newparams[["centers"]][i, ]
293
+    j <- which(is.na(mu))
294
+    newparams[["centers"]][i, j] <- c(1, mu[-j], prod(mu[-j]))%*%regParams[, j]
295
+  }
296
+  ##remaing NAs are made like originals
297
+  if(length(YIndex)>0){
298
+    noMoveIndex <- union(setdiff(which(rowSums(is.na(newparams[["centers"]]))>0), YIndex), 
299
+                         YIndex[rowSums(is.na(newparams[["centers"]][YIndex, ])>1)])
300
+  }
301
+  newparams[["centers"]][is.na(newparams[["centers"]])] <- params[["centers"]][is.na(newparams[["centers"]])]
302
+  if(verbose) cat("\n")
303
+  
304
+  if(verbose) message("Calculating and standardizing size of shift.")
305
+  DD <- newparams[["centers"]] - params[["centers"]]
306
+  MM <- colMeans(DD[autosomeIndex, ])
307
+  DD <- sweep(DD, 2, MM)
308
+  SS <- cov(DD[autosomeIndex, ])
309
+  SSI <- solve(SS)
310
+  dev <- vector("numeric", nrow(DD))
311
+  if(length(YIndex)){
312
+    dev[-YIndex] <- apply(DD[-YIndex, ], 1, function(x) x%*%SSI%*%x)
313
+    dev[-YIndex] <- 1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev[-YIndex])
314
+    ##Now Y (only two params)
315
+    SSY <- SS[c(1, 3), c(1, 3)]
316
+    SSI <- solve(SSY) 
317
+    dev[YIndex] <- apply(DD[YIndex, c(1, 3)], 1, function(x) x%*%SSI%*%x)
318
+    dev[YIndex] <- 1/sqrt( (2*pi)^2*det(SSY))*exp(-0.5*dev[YIndex])
319
+  } else {
320
+    dev=apply(DD,1,function(x) x%*%SSI%*%x)
321
+    dev=1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev)
322
+  } 
323
+
324
+  ## BC: must keep SD
325
+  params[-2] <- newparams[-2]
326
+  rm(newparams);gc(verbose=FALSE)  
327
+  if(verbose) cat("Calling", NR, "SNPs")
328
+  ## ###################
329
+  ## ## MOVE TO C#######
330
+  t0 <- proc.time()
331
+  ImNull <- gtypeCallerR2TNoN(A, B, fIndex, mIndex, params[["centers"]],
332
+                          params[["scales"]], params[["N"]], Indexes,
333
+                          cIndexes, sapply(Indexes, length),
334
+                          sapply(cIndexes, length), SMEDIAN, theKnots,
335
+                          mixtureParams, DF, probs, 0.025,
336
+                          which(regionInfo[,2]),
337
+                          which(regionInfo[,1]))
338
+  t0 <- proc.time()-t0
339
+  message("Part 2 took ", t0[3], " seconds.")
340
+  ##  END MOVE TO C#######
341
+  ## ##################
342
+  
343
+  dev <- dev/(dev+1/383)
344
+  if(!is.null(row.names)){ rownames(A) <- rownames(B) <- names(dev) <- row.names}
345
+  if(!is.null(col.names)){ colnames(A) <- colnames(B) <- col.names}
346
+  
347
+  if(verbose) message("Done.")
348
+  return(list(calls=A, confs=B, SNPQC=dev, batchQC=mean(diag(SS))))
349
+}
350
+
351
+
352
+gtypeCallerRTNoN <- function(A, B, fIndex, mIndex, theCenters, theScales,
353
+                         theNs, Indexes, cIndexes, nIndexes,
354
+                         ncIndexes, SMEDIAN, knots, params, dft,
355
+                         probs, trim){
356
+
357
+  stopifnot(!missing(A), !missing(B), dim(A)==dim(B),
358
+            nrow(A)==nrow(theCenters), nrow(A)==nrow(theScales),
359
+            nrow(A) == nrow(theNs), length(knots)==3, nrow(params)==4,
360
+            ncol(params)==ncol(A), length(trim)==1, length(probs)==3)
361
+  
362
+  .Call("gtypeCallerPart1TNoN", A, B, fIndex, mIndex, theCenters,
363
+        theScales, theNs, Indexes, cIndexes, nIndexes, ncIndexes,
364
+        SMEDIAN, knots, params, dft, probs, trim, PACKAGE="crlmm")
365
+  
366
+}
367
+
368
+gtypeCallerR2TNoN <- function(A, B, fIndex, mIndex, theCenters, theScales,
369
+                         theNs, Indexes, cIndexes, nIndexes,
370
+                         ncIndexes, SMEDIAN, knots, params, dft,
371
+                         probs, trim, noTraining, noInfo){
372
+
373
+  stopifnot(!missing(A), !missing(B), dim(A)==dim(B),
374
+            nrow(A)==nrow(theCenters), nrow(A)==nrow(theScales),
375
+            nrow(A) == nrow(theNs), length(knots)==3, nrow(params)==4,
376
+            ncol(params)==ncol(A), length(trim)==1, length(probs)==3)
377
+  
378
+  .Call("gtypeCallerPart2TNoN", A, B, fIndex, mIndex, theCenters,
379
+        theScales, theNs, Indexes, cIndexes, nIndexes, ncIndexes,
380
+        SMEDIAN, knots, params, dft, probs, trim, noTraining, noInfo, PACKAGE="crlmm")
381
+  
382
+}
383
+
384
+crlmmGTNormalNoN <- function(A, B, SNR, mixtureParams, cdfName,
385
+                         row.names=NULL, col.names=NULL, probs=c(1/3,
386
+                         1/3, 1/3), DF=6, SNRMin=6, gender=NULL,
387
+                         desctrucitve=FALSE, verbose=TRUE){
388
+  keepIndex <- which(SNR>SNRMin)
389
+  if(length(keepIndex)==0) stop("No arrays above quality threshold!")
390
+  
391
+  NC <- ncol(A)
392
+  NR <- nrow(A)
393
+  
394
+  if(verbose) message("Loading annotations.")
395
+  data(genotypeStuff, mixtureStuff, package=pkgname, envir=.crlmmPkgEnv)
396
+
397
+  ## this is toget rid of the 'no visible binding' notes
398
+  ## variable definitions
399
+  XIndex <- getVarInEnv("XIndex")
400
+  autosomeIndex <- getVarInEnv("autosomeIndex")
401
+  YIndex <- getVarInEnv("YIndex")
402
+  SMEDIAN <- getVarInEnv("SMEDIAN")
403
+  theKnots <- getVarInEnv("theKnots")
404
+  regionInfo <- getVarInEnv("regionInfo")
405
+  
406
+  ##IF gender not provide, we predict
407
+  if(is.null(gender)){
408
+    if(verbose) message("Determining gender.")
409
+    XMedian <- apply(log2(A[XIndex,, drop=FALSE])+log2(B[XIndex,, drop=FALSE]), 2, median)/2
410
+    if(sum(SNR>SNRMin)==1) gender <- which.min(c(abs(XMedian-8.9), abs(XMedian-9.5))) else  gender <- kmeans(XMedian, c(min(XMedian[SNR>SNRMin]), max(XMedian[SNR>SNRMin])))[["cluster"]]
411
+  }
412
+  
413
+  Indexes <- list(autosomeIndex, XIndex, YIndex)
414
+  cIndexes <- list(keepIndex, 
415
+                   keepIndex[which(gender[keepIndex]==2)], 
416
+                   keepIndex[which(gender[keepIndex]==1)])
417
+  
418
+  if(verbose) cat("Calling", NR, "SNPs for recalibration")
419
+
420
+  ## call C
421
+  fIndex <- which(gender==2)
422
+  mIndex <- which(gender==1)
423
+  t0 <- proc.time()
424
+  newparams <- gtypeCallerRNormalNoN(A, B, fIndex, mIndex,
425
+                            params[["centers"]], params[["scales"]], params[["N"]],
426
+                            Indexes, cIndexes,
427
+                            sapply(Indexes, length), sapply(cIndexes, length),
428
+                            SMEDIAN, theKnots,
429
+                            mixtureParams, DF, probs, 0.025)
430
+  t0 <- proc.time()-t0
431
+  message("Part 1 took ", t0[3], " seconds.")
432
+  names(newparams) <- c("centers", "scales", "N")
433
+  
434
+  if(verbose) message("Done.")
435
+  if(verbose) message("Estimating recalibration parameters.")
436
+  d <- newparams[["centers"]] - params$centers
437
+
438
+  ##regression 
439
+  MIN <- 10
440
+  Index <- intersect(which(pmin(newparams[["N"]][, 1], newparams[["N"]][, 2], newparams[["N"]][, 3])>MIN & !apply(regionInfo, 1, any)), autosomeIndex)
441
+  data4reg <- as.data.frame(newparams[["centers"]][Index,])
442
+  names(data4reg) <- c("AA", "AB", "BB")
443
+  regParams <- cbind(  coef(lm(AA~AB*BB, data=data4reg)),
444
+                     c(coef(lm(AB~AA+BB, data=data4reg)), 0), 
445
+                       coef(lm(BB~AA*AB, data=data4reg)))
446
+  rownames(regParams) <- c("intercept", "X", "Y", "XY")
447
+  rm(data4reg)
448
+  
449
+  minN <- 3
450
+  newparams[["centers"]][newparams[["N"]]<minN] <- NA
451
+  Index <- setdiff(which(rowSums(is.na(newparams[["centers"]]))==1), YIndex)
452
+  if(verbose) cat("Filling out empty centers")
453
+  for(i in Index){
454
+    if(verbose) if(i%%10000==0)cat(".")
455
+    mu <- newparams[["centers"]][i, ]
456
+    j <- which(is.na(mu))
457
+    newparams[["centers"]][i, j] <- c(1, mu[-j], prod(mu[-j]))%*%regParams[, j]
458
+  }
459
+  ##remaing NAs are made like originals
460
+  if(length(YIndex)>0){
461
+    noMoveIndex <- union(setdiff(which(rowSums(is.na(newparams[["centers"]]))>0), YIndex), 
462
+                         YIndex[rowSums(is.na(newparams[["centers"]][YIndex, ])>1)])
463
+  }
464
+  newparams[["centers"]][is.na(newparams[["centers"]])] <- params[["centers"]][is.na(newparams[["centers"]])]
465
+  if(verbose) cat("\n")
466
+  
467
+  if(verbose) message("Calculating and standardizing size of shift.")
468
+  DD <- newparams[["centers"]] - params[["centers"]]
469
+  MM <- colMeans(DD[autosomeIndex, ])
470
+  DD <- sweep(DD, 2, MM)
471
+  SS <- cov(DD[autosomeIndex, ])
472
+  SSI <- solve(SS)
473
+  dev <- vector("numeric", nrow(DD))
474
+  if(length(YIndex)){
475
+    dev[-YIndex] <- apply(DD[-YIndex, ], 1, function(x) x%*%SSI%*%x)
476
+    dev[-YIndex] <- 1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev[-YIndex])
477
+    ##Now Y (only two params)
478
+    SSY <- SS[c(1, 3), c(1, 3)]
479
+    SSI <- solve(SSY) 
480
+    dev[YIndex] <- apply(DD[YIndex, c(1, 3)], 1, function(x) x%*%SSI%*%x)
481
+    dev[YIndex] <- 1/sqrt( (2*pi)^2*det(SSY))*exp(-0.5*dev[YIndex])
482
+  } else {
483
+    dev=apply(DD,1,function(x) x%*%SSI%*%x)
484
+    dev=1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev)
485
+  } 
486
+
487
+  ## BC: must keep SD
488
+  params[-2] <- newparams[-2]
489
+  rm(newparams);gc(verbose=FALSE)  
490
+  if(verbose) cat("Calling", NR, "SNPs")
491
+  ## ###################
492
+  ## ## MOVE TO C#######
493
+  t0 <- proc.time()
494
+  ImNull <- gtypeCallerR2NormalNoN(A, B, fIndex, mIndex, params[["centers"]],
495
+                          params[["scales"]], params[["N"]], Indexes,
496
+                          cIndexes, sapply(Indexes, length),
497
+                          sapply(cIndexes, length), SMEDIAN, theKnots,
498
+                          mixtureParams, DF, probs, 0.025,
499
+                          which(regionInfo[,2]),
500
+                          which(regionInfo[,1]))
501
+  t0 <- proc.time()-t0
502
+  message("Part 2 took ", t0[3], " seconds.")
503
+  ##  END MOVE TO C#######
504
+  ## ##################
505
+  
506
+  dev <- dev/(dev+1/383)
507
+  if(!is.null(row.names)){ rownames(A) <- rownames(B) <- names(dev) <- row.names}
508
+  if(!is.null(col.names)){ colnames(A) <- colnames(B) <- col.names}
509
+  
510
+  if(verbose) message("Done.")
511
+  return(list(calls=A, confs=B, SNPQC=dev, batchQC=mean(diag(SS))))
512
+}
513
+
514
+
515
+gtypeCallerRNormalNoN <- function(A, B, fIndex, mIndex, theCenters, theScales,
516
+                         theNs, Indexes, cIndexes, nIndexes,
517
+                         ncIndexes, SMEDIAN, knots, params, dft,
518
+                         probs, trim){
519
+
520
+  stopifnot(!missing(A), !missing(B), dim(A)==dim(B),
521
+            nrow(A)==nrow(theCenters), nrow(A)==nrow(theScales),
522
+            nrow(A) == nrow(theNs), length(knots)==3, nrow(params)==4,
523
+            ncol(params)==ncol(A), length(trim)==1, length(probs)==3)
524
+  
525
+  .Call("gtypeCallerPart1NormalNoN", A, B, fIndex, mIndex, theCenters,
526
+        theScales, theNs, Indexes, cIndexes, nIndexes, ncIndexes,
527
+        SMEDIAN, knots, params, dft, probs, trim, PACKAGE="crlmm")
528
+  
529
+}
530
+
531
+gtypeCallerR2NormalNoN <- function(A, B, fIndex, mIndex, theCenters, theScales,
532
+                         theNs, Indexes, cIndexes, nIndexes,
533
+                         ncIndexes, SMEDIAN, knots, params, dft,
534
+                         probs, trim, noTraining, noInfo){
535
+
536
+  stopifnot(!missing(A), !missing(B), dim(A)==dim(B),
537
+            nrow(A)==nrow(theCenters), nrow(A)==nrow(theScales),
538
+            nrow(A) == nrow(theNs), length(knots)==3, nrow(params)==4,
539
+            ncol(params)==ncol(A), length(trim)==1, length(probs)==3)
540
+  
541
+  .Call("gtypeCallerPart2NormalNoN", A, B, fIndex, mIndex, theCenters,
542
+        theScales, theNs, Indexes, cIndexes, nIndexes, ncIndexes,
543
+        SMEDIAN, knots, params, dft, probs, trim, noTraining, noInfo, PACKAGE="crlmm")
544
+  
545
+}
546
+
547
+
0 548
new file mode 100644
... ...
@@ -0,0 +1,207 @@
1
+crlmmGTnm <- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL,
2
+                    col.names=NULL, probs=c(1/3, 1/3, 1/3), DF=6,
3
+                    SNRMin=5, recallMin=10, recallRegMin=1000,
4
+                    gender=NULL, desctrucitve=FALSE, verbose=TRUE,
5
+                    returnParams=FALSE){
6
+  
7
+  keepIndex <- which(SNR>SNRMin)
8
+  if(length(keepIndex)==0) stop("No arrays above quality threshold!")
9
+  
10
+  NC <- ncol(A)
11
+  NR <- nrow(A)
12
+  
13
+  pkgname <- getCrlmmAnnotationName(cdfName)
14
+  if(!require(pkgname, character.only=TRUE)){
15
+    suggCall <- paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="")
16
+    msg <- paste("If", pkgname, "is installed on an alternative location, please load it manually by using", suggCall)
17
+    message(strwrap(msg))
18
+    stop("Package ", pkgname, " could not be found.")
19
+    rm(suggCall, msg)
20
+  }
21
+
22
+  if(verbose) message("Loading annotations.")
23
+  data(genotypeStuff, mixtureStuff, package=pkgname, envir=.crlmmPkgEnv)
24
+
25
+  ## this is toget rid of the 'no visible binding' notes
26
+  ## variable definitions
27
+  XIndex <- getVarInEnv("XIndex")
28
+  autosomeIndex <- getVarInEnv("autosomeIndex")
29
+  YIndex <- getVarInEnv("YIndex")
30
+  SMEDIAN <- getVarInEnv("SMEDIAN")
31
+  theKnots <- getVarInEnv("theKnots")
32
+  regionInfo <- getVarInEnv("regionInfo")
33
+  
34
+  ##IF gender not provide, we predict
35
+  if(is.null(gender)){
36
+    if(verbose) message("Determining gender.")
37
+    XMedian <- apply(log2(A[XIndex,, drop=FALSE])+log2(B[XIndex,, drop=FALSE]), 2, median)/2
38
+    if(sum(SNR>SNRMin)==1){
39
+      gender <- which.min(c(abs(XMedian-8.9), abs(XMedian-9.5)))
40
+    }else{
41
+      gender <- kmeans(XMedian, c(min(XMedian[SNR>SNRMin]), max(XMedian[SNR>SNRMin])))[["cluster"]]
42
+    }
43
+  }
44
+  
45
+  Indexes <- list(autosomeIndex, XIndex, YIndex)
46
+  cIndexes <- list(keepIndex, 
47
+                   keepIndex[which(gender[keepIndex]==2)], 
48
+                   keepIndex[which(gender[keepIndex]==1)])
49
+  
50
+  if(verbose) cat("Calling", NR, "SNPs for recalibration")
51
+
52
+  ## call C
53
+  fIndex <- which(gender==2)
54
+  mIndex <- which(gender==1)
55
+  t0 <- proc.time()
56
+  newparams <- gtypeCallerRnm(A, B, fIndex, mIndex,
57
+                            params[["centers"]], params[["scales"]], params[["N"]],
58
+                            Indexes, cIndexes,
59
+                            sapply(Indexes, length), sapply(cIndexes, length),
60
+                            SMEDIAN, theKnots,
61
+                            mixtureParams, DF, probs, 0.025)
62
+  t0 <- proc.time()-t0
63
+  message("Part 1 took ", t0[3], " seconds.")
64
+  gc(verbose=FALSE)
65
+  names(newparams) <- c("centers", "scales", "N")
66
+  
67
+  if(verbose) message("Done.")
68
+  if(verbose) message("Estimating recalibration parameters.")
69
+  d <- newparams[["centers"]] - params$centers
70
+
71
+  ##regression 
72
+  Index <- intersect(which(pmin(newparams[["N"]][, 1],
73
+                                newparams[["N"]][, 2],
74
+                                newparams[["N"]][, 3]) > recallMin &
75
+                                !apply(regionInfo, 1, any)),
76
+                                autosomeIndex)
77
+  
78
+  if(length(Index) < recallRegMin){
79
+    warning("Recallibration not possible.")
80
+    newparams <- params
81
+    dev <- vector("numeric", nrow(newparams[["centers"]]))
82
+    SS <- matrix(Inf, 3, 3)
83
+  }else{
84
+    data4reg <- as.data.frame(newparams[["centers"]][Index,])
85
+    names(data4reg) <- c("AA", "AB", "BB")
86
+    regParams <- cbind(  coef(lm(AA~AB*BB, data=data4reg)),
87
+                       c(coef(lm(AB~AA+BB, data=data4reg)), 0), 
88
+                       coef(lm(BB~AA*AB, data=data4reg)))
89
+    rownames(regParams) <- c("intercept", "X", "Y", "XY")
90
+    rm(data4reg)
91
+  
92
+    minN <- 3
93
+    newparams[["centers"]][newparams[["N"]] < minN] <- NA
94
+    Index <- setdiff(which(rowSums(is.na(newparams[["centers"]]))==1), YIndex)
95
+    if(verbose) cat("Filling out empty centers")
96
+    for(i in Index){
97
+      if(verbose) if(i%%10000==0)cat(".")
98
+      mu <- newparams[["centers"]][i, ]
99
+      j <- which(is.na(mu))
100
+      newparams[["centers"]][i, j] <- c(1, mu[-j], prod(mu[-j]))%*%regParams[, j]
101
+    }
102
+    
103
+    ##remaing NAs are made like originals
104
+    if(length(YIndex)>0){
105
+      noMoveIndex <- union(setdiff(which(rowSums(is.na(newparams[["centers"]]))>0), YIndex), 
106
+                           YIndex[rowSums(is.na(newparams[["centers"]][YIndex, ])>1)])
107
+    }
108
+    newparams[["centers"]][is.na(newparams[["centers"]])] <- params[["centers"]][is.na(newparams[["centers"]])]
109
+    if(verbose) cat("\n")
110
+  
111
+    if(verbose) message("Calculating and standardizing size of shift.")
112
+    DD <- newparams[["centers"]] - params[["centers"]]
113
+    DD <- sweep(DD, 2, colMeans(DD[autosomeIndex, ]))
114
+    SS <- cov(DD[autosomeIndex, ])
115
+    SSI <- solve(SS)
116
+    dev <- vector("numeric", nrow(DD))
117
+    if(length(YIndex)){
118
+      dev[-YIndex] <- apply(DD[-YIndex, ], 1, function(x) x%*%SSI%*%x)
119
+      dev[-YIndex] <- 1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev[-YIndex])
120
+      ##Now Y (only two params)
121
+      SSY <- SS[c(1, 3), c(1, 3)]
122
+      SSI <- solve(SSY) 
123
+      dev[YIndex] <- apply(DD[YIndex, c(1, 3)], 1, function(x) x%*%SSI%*%x)
124
+      dev[YIndex] <- 1/sqrt( (2*pi)^2*det(SSY))*exp(-0.5*dev[YIndex])
125
+    } else {
126
+      dev=apply(DD,1,function(x) x%*%SSI%*%x)
127
+      dev=1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev)
128
+    }
129
+  }
130
+    
131
+  ## BC: must keep SD
132
+  params[-2] <- newparams[-2]
133
+  
134
+  rm(newparams);gc(verbose=FALSE)  
135
+  if(verbose) cat("Calling", NR, "SNPs")
136
+  ## ###################
137
+  ## ## MOVE TO C#######
138
+  t0 <- proc.time()
139
+  ImNull <- gtypeCallerR2nm(A, B, fIndex, mIndex, params[["centers"]],
140
+                          params[["scales"]], params[["N"]], Indexes,
141
+                          cIndexes, sapply(Indexes, length),
142
+                          sapply(cIndexes, length), SMEDIAN, theKnots,
143
+                          mixtureParams, DF, probs, 0.025,
144
+                          which(regionInfo[,2]),
145
+                          which(regionInfo[,1]))
146
+  t0 <- proc.time()-t0
147
+  gc(verbose=FALSE)
148
+  message("\n Part 2 took ", t0[3], " seconds.")
149
+  ##  END MOVE TO C#######
150
+  ## ##################
151
+  
152
+  dev <- dev/(dev+1/383)
153
+  if(!is.null(row.names)){ rownames(A) <- rownames(B) <- names(dev) <- row.names}
154
+  if(!is.null(col.names)){ colnames(A) <- colnames(B) <- col.names}
155
+  
156
+  if(verbose) message("Done.")
157
+  if (returnParams){
158
+    return(list(calls=A, confs=B, SNPQC=dev, batchQC=mean(diag(SS)), params=params))
159
+  }else{
160
+    return(list(calls=A, confs=B, SNPQC=dev, batchQC=mean(diag(SS))))
161
+  }
162
+}
163
+
164
+
165
+gtypeCallerRnm <- function(A, B, fIndex, mIndex, theCenters, theScales,
166
+                         theNs, Indexes, cIndexes, nIndexes,
167
+                         ncIndexes, SMEDIAN, knots, params, dft,
168
+                         probs, trim){
169
+
170
+  stopifnot(!missing(A), !missing(B), dim(A)==dim(B),
171
+            nrow(A)==nrow(theCenters), nrow(A)==nrow(theScales),
172
+            nrow(A) == nrow(theNs), length(knots)==3, nrow(params)==4,
173
+            ncol(params)==ncol(A), length(trim)==1, length(probs)==3)
174
+
175
+  ## make code robust
176
+  ## check types before passing to C
177
+  
178
+  .Call("gtypeCallerPart1nm", A, B,
179
+        as.integer(fIndex), as.integer(mIndex),
180
+        as.numeric(theCenters), as.numeric(theScales),
181
+        as.integer(theNs), lapply(Indexes, as.integer), lapply(cIndexes, as.integer), as.integer(nIndexes), as.integer(ncIndexes),
182
+        as.numeric(SMEDIAN), as.numeric(knots), as.numeric(params),
183
+        as.integer(dft), as.numeric(probs), as.numeric(trim),
184
+        PACKAGE="crlmm")
185
+  
186
+}
187
+
188
+gtypeCallerR2nm <- function(A, B, fIndex, mIndex, theCenters, theScales,
189
+                         theNs, Indexes, cIndexes, nIndexes,
190
+                         ncIndexes, SMEDIAN, knots, params, dft,
191
+                         probs, trim, noTraining, noInfo){
192
+
193
+  stopifnot(!missing(A), !missing(B), dim(A)==dim(B),
194
+            nrow(A)==nrow(theCenters), nrow(A)==nrow(theScales),
195
+            nrow(A) == nrow(theNs), length(knots)==3, nrow(params)==4,
196
+            ncol(params)==ncol(A), length(trim)==1, length(probs)==3)
197
+
198
+  .Call("gtypeCallerPart2nm", A, B,
199
+        as.integer(fIndex), as.integer(mIndex),
200
+        as.numeric(theCenters), as.numeric(theScales),
201
+        as.integer(theNs), Indexes, cIndexes, nIndexes, ncIndexes,
202
+        as.numeric(SMEDIAN), as.numeric(knots), as.numeric(params),
203
+        as.integer(dft), as.numeric(probs), as.numeric(trim),
204
+        as.integer(noTraining), as.integer(noInfo), PACKAGE="crlmm")
205
+  
206
+}
207
+
0 208
new file mode 100644
... ...
@@ -0,0 +1,116 @@
1
+crlmmNM <- function(filenames, row.names=TRUE, col.names=TRUE,
2
+                  probs=c(1/3, 1/3, 1/3), DF=6, SNRMin=5, gender=NULL,
3
+                  save.it=FALSE, load.it=FALSE,
4
+                  intensityFile="tmpcrlmmintensities.rda",
5
+                  desctrucitve=FALSE, mixtureSampleSize=10^5, eps=0.1,
6
+                  verbose=TRUE, cdfName, sns, recallMin=10,
7
+                  recallRegMin=1000, returnParams=FALSE){
8
+  
9
+  if (missing(sns)) sns <- basename(filenames)
10
+  if (load.it & !file.exists(intensityFile)){
11
+    load.it <- FALSE
12
+    message("File ", intensityFile, " does not exist.")
13
+    message("Not loading it, but running SNPRMA from scratch.")
14
+  }
15
+  if (!load.it){
16
+    res <- snprma(filenames, fitMixture=TRUE,
17
+                  mixtureSampleSize=mixtureSampleSize, verbose=verbose,
18
+                  eps=eps, cdfName=cdfName, sns=sns)
19
+    if(save.it){
20
+      t0 <- proc.time()
21
+      save(res, file=intensityFile)
22
+      t0 <- proc.time()-t0
23
+      message("Used ", t0[3], " seconds to save ", intensityFile, ".")
24
+    }
25
+  }else{
26
+    message("Loading ", intensityFile, ".")
27
+    obj <- load(intensityFile)
28
+    message("Done.")
29
+    if (obj != "res")
30
+      stop("Object in ", intensityFile, " seems to be invalid.")
31
+  }
32
+  if(row.names) row.names=res$gns else row.names=NULL
33
+  if(col.names) col.names=res$sns else col.names=NULL
34
+
35
+  res2 <- crlmmGTnm(res[["A"]], res[["B"]], res[["SNR"]],
36
+                  res[["mixtureParams"]], res[["cdfName"]],
37
+                  gender=gender, row.names=row.names,
38
+                  col.names=col.names, recallMin=recallMin,
39
+                  recallRegMin=1000, SNRMin=SNRMin,
40
+                  returnParams=returnParams)
41
+
42
+  res2[["SNR"]] <- res[["SNR"]]
43
+  
44
+  return(res2)
45
+}
46
+
47
+
48
+###############################
49
+####### THIS IS TEMPORARY NOT OFFICIALLY USED
50
+#####################################
51
+
52
+crlmmTNoN <- function(filenames, row.names=TRUE, col.names=TRUE,
53
+                  probs=c(1/3, 1/3, 1/3), DF=6, SNRMin=5, gender=NULL,
54
+                  save.it=FALSE, load.it=FALSE,
55
+                  intensityFile="tmpcrlmmintensities.rda",
56
+                  desctrucitve=FALSE, mixtureSampleSize=10^5, eps=0.1,
57
+                  verbose=TRUE){
58
+  if (load.it & !file.exists(intensityFile)){
59
+    load.it <- FALSE
60
+    message("File ", intensityFile, " does not exist.")
61
+    message("Not loading it, but running SNPRMA from scratch.")
62
+  }
63
+  if (!load.it){
64
+    res <- snprma(filenames, fitMixture=TRUE,
65
+                  mixtureSampleSize=mixtureSampleSize, verbose=verbose,
66
+                  eps=eps)
67
+    if(save.it) save(res, file=intensityFile)
68
+  }else{
69
+    message("Loading ", intensityFile, ".")
70
+    obj <- load(intensityFile)
71
+    message("Done.")
72
+    if (obj != "res")
73
+      stop("Object in ", intensityFile, " seems to be invalid.")
74
+  }
75
+  if(row.names) row.names=res$gns else row.names=NULL
76
+  if(col.names) col.names=res$sns else col.names=NULL
77
+  res2 <- crlmmGTTNoN(res[["A"]], res[["B"]], res[["SNR"]],
78
+                  res[["mixtureParams"]], res[["cdfName"]],
79
+                  gender=gender, row.names=row.names,
80
+                  col.names=col.names)
81
+  res2[["SNR"]] <- res[["SNR"]]
82
+  return(res2)
83
+}
84
+
85
+crlmmNormalNoN <- function(filenames, row.names=TRUE, col.names=TRUE,
86
+                  probs=c(1/3, 1/3, 1/3), DF=6, SNRMin=5, gender=NULL,
87
+                  save.it=FALSE, load.it=FALSE,
88
+                  intensityFile="tmpcrlmmintensities.rda",
89
+                  desctrucitve=FALSE, mixtureSampleSize=10^5, eps=0.1,
90
+                  verbose=TRUE){
91
+  if (load.it & !file.exists(intensityFile)){
92
+    load.it <- FALSE
93
+    message("File ", intensityFile, " does not exist.")
94
+    message("Not loading it, but running SNPRMA from scratch.")
95
+  }
96
+  if (!load.it){
97
+    res <- snprma(filenames, fitMixture=TRUE,
98
+                  mixtureSampleSize=mixtureSampleSize, verbose=verbose,
99
+                  eps=eps)
100
+    if(save.it) save(res, file=intensityFile)
101
+  }else{
102
+    message("Loading ", intensityFile, ".")
103
+    obj <- load(intensityFile)
104
+    message("Done.")
105
+    if (obj != "res")
106
+      stop("Object in ", intensityFile, " seems to be invalid.")
107
+  }
108
+  if(row.names) row.names=res$gns else row.names=NULL
109
+  if(col.names) col.names=res$sns else col.names=NULL
110
+  res2 <- crlmmGTNormalNoN(res[["A"]], res[["B"]], res[["SNR"]],
111
+                  res[["mixtureParams"]], res[["cdfName"]],
112
+                  gender=gender, row.names=row.names,
113
+                  col.names=col.names)
114
+  res2[["SNR"]] <- res[["SNR"]]
115
+  return(res2)
116
+}
0 117
new file mode 100644
... ...
@@ -0,0 +1,45 @@
1
+fitAffySnpMixture56 <- function(S, M, knots, probs=rep(1/3, 3), eps=.01, maxit=10, verbose=FALSE){
2
+  ##56 stands for 5 and 6 arrays but will also work for Illumina
3
+  ##Note the unfortunate choice of numbering:
4
+  ##1 is BB, 2 AB, and 3 AA. Opposite to everything else!
5
+  ##this is legacy code I decided not to change.
6
+  ## this why at the end we report -coefs: F1 is the negative f
7
+  mus <- append(quantile(M, c(1, 5)/6, names=FALSE), 0, 1)
8
+  sigmas <- rep(mad(c(M[M<mus[1]]-mus[1], M[M>mus[3]]-mus[3])), 3)
9
+  sigmas[2] <- sigmas[2]/2
10
+ 
11
+  weights <- apply(cbind(mus, sigmas), 1, function(p) dnorm(M, p[1], p[2]))
12
+  previousF1 <- -Inf
13
+  change <- eps+1
14
+  it <- 0
15
+ 
16
+  if(verbose) message("Max change must be under ", eps, ".")
17
+  matS <- stupidSplineBasis(S, knots)
18
+  while (change > eps & it < maxit){
19
+    it <- it+1
20
+    ## E
21
+    z <- sweep(weights, 2, probs, "*")
22
+    LogLik <- rowSums(z)
23
+    z <- sweep(z, 1, LogLik, "/")
24
+    probs <- colMeans(z)
25
+ 
26
+    ## M
27
+    fit1 <- crossprod(chol2inv(chol(crossprod(sweep(matS, 1, z[, 1], FUN="*"), matS))), crossprod(matS, z[, 1]*M))
28
+ 
29
+    fit2 <- sum(z[, 2]*M)/sum(z[, 2])
30
+    F1 <- matS%*%fit1
31
+    sigmas[c(1, 3)] <- sqrt(sum(z[, 1]*(M-F1)^2)/sum(z[, 1]))
32
+    sigmas[2] <- sqrt(sum(z[, 2]*(M-fit2)^2)/sum(z[, 2]))
33
+ 
34
+    weights[, 1] <- dnorm(M, F1, sigmas[1])
35
+    weights[, 2] <- dnorm(M, fit2, sigmas[2])
36
+    weights[, 3] <- dnorm(M, -F1, sigmas[3])
37
+    
38
+    change <- max(abs(F1-previousF1))
39
+    previousF1 <- F1
40
+    if(verbose) message("Iter ", it, ": ", change, ".")
41
+  }
42
+  medF1 <- median(-F1)
43
+ return(list(coef= -fit1, medF1=medF1, sigma1=sigmas[1], sigma2=sigmas[2]))
44
+}
45
+
0 46
new file mode 100644
... ...
@@ -0,0 +1,80 @@
1
+snprma <- function(filenames, mixtureSampleSize=10^5, fitMixture=FALSE, eps=0.1, verbose=TRUE, seed=1, cdfName, sns){
2
+  ##ADD CHECK TO SEE IF LOADED
3
+  if (missing(cdfName))
4
+    cdfName <- read.celfile.header(filenames[1])$cdfName
5
+##  stuffDir <- changeToCrlmmAnnotationName(cdfName)
6
+  pkgname <- getCrlmmAnnotationName(cdfName)
7
+  if(!require(pkgname, character.only=TRUE)){
8
+    suggCall <- paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="")
9
+    msg <- paste("If", pkgname, "is installed on an alternative location, please load it manually by using", suggCall)
10
+    message(strwrap(msg))
11
+    stop("Package ", pkgname, " could not be found.")
12
+    rm(suggCall, msg)
13
+  }
14
+  
15
+  if(verbose) message("Loading annotations and mixture model parameters.")
16
+  data(preprocStuff, genotypeStuff, mixtureStuff, package=pkgname, envir=.crlmmPkgEnv)
17
+  autosomeIndex <- getVarInEnv("autosomeIndex")
18
+  pnsa <- getVarInEnv("pnsa")
19
+  pnsb <- getVarInEnv("pnsb")
20
+  fid <- getVarInEnv("fid")
21
+  reference <- getVarInEnv("reference")
22
+  aIndex <- getVarInEnv("aIndex")
23
+  bIndex <- getVarInEnv("bIndex")
24
+  SMEDIAN <- getVarInEnv("SMEDIAN")
25
+  theKnots <- getVarInEnv("theKnots")
26
+  gns <- getVarInEnv("gns")
27
+
28
+  ##We will read each cel file, summarize, and run EM one by one
29
+  ##We will save parameters of EM to use later
30
+  mixtureParams <- matrix(0, 4, length(filenames))
31
+  SNR <- vector("numeric", length(filenames))
32
+  SKW <- vector("numeric", length(filenames))
33
+
34
+  ## This is the sample for the fitting of splines
35
+  ## BC: I like better the idea of the user passing the seed,
36
+  ##     because this might intefere with other analyses
37
+  ##     (like what happened to GCRMA)
38
+  set.seed(seed)
39
+  
40
+  idx <- sort(sample(autosomeIndex, mixtureSampleSize))
41
+
42
+  ##S will hold (A+B)/2 and M will hold A-B
43
+  ##NOTE: We actually dont need to save S. Only for pics etc...
44
+  ##f is the correction. we save to avoid recomputing
45
+  A <- matrix(as.integer(0), length(pnsa), length(filenames))
46
+  B <- matrix(as.integer(0), length(pnsb), length(filenames))
47
+  
48
+  if(verbose){
49
+    message("Processing ", length(filenames), " files.")
50
+    pb <- txtProgressBar(min=0, max=length(filenames), style=3)
51
+  }
52
+  ##We start looping throug cel files
53
+  idx2 <- sample(length(fid), 10^5) ##for skewness. no need to do everything
54
+  for(i in seq(along=filenames)){
55
+    y <- as.matrix(read.celfile(filenames[i], intensity.means.only=TRUE)[["INTENSITY"]][["MEAN"]][fid])
56
+    x <- log2(y[idx2])
57
+    SKW[i] <- mean((x-mean(x))^3)/(sd(x)^3)
58
+    rm(x)
59
+    y <- normalize.quantiles.use.target(y, target=reference)
60
+    A[, i] <- intMedianSummaries(y[aIndex, 1, drop=FALSE], pnsa)
61
+    B[, i] <- intMedianSummaries(y[bIndex, 1, drop=FALSE], pnsb)
62
+    
63
+    ##Now to fit the EM
64
+    if(fitMixture){
65
+      S <- (log2(A[idx, i])+log2(B[idx, i]))/2 - SMEDIAN
66
+      M <- log2(A[idx, i])-log2(B[idx, i])
67
+      
68
+      ##we need to test the choice of eps.. it is not the max diff between funcs
69
+      tmp <- fitAffySnpMixture56(S, M, theKnots, eps=eps)
70
+      
71
+      mixtureParams[, i] <- tmp[["coef"]]
72
+      SNR[i] <- tmp[["medF1"]]^2/(tmp[["sigma1"]]^2+tmp[["sigma2"]]^2)
73
+    }
74
+    if (verbose) setTxtProgressBar(pb, i)
75
+  }
76
+  close(pb)
77
+  if (!fitMixture) SNR <- mixtureParams <- NA
78
+  ## gns comes from preprocStuff.rda
79
+  list(A=A, B=B, sns=sns, gns=gns, SNR=SNR, SKW=SKW, mixtureParams=mixtureParams, cdfName=cdfName)
80
+}
0 81
new file mode 100644
... ...
@@ -0,0 +1,45 @@
1
+stupidSplineBasis <- function(x,knots){
2
+  x <- pmin(x,knots[3])
3
+  x <- pmax(x,knots[1])
4
+  cbind(1, x, x^2, pmax(0, (x-knots[2]))^2)
5
+}
6
+
7
+changeToCrlmmAnnotationName <- function(x){
8
+  pkgDir <- system.file(package=THISPKG)
9
+  wanted <- paste(tolower(gsub("_", "", x)), "crlmm.stuff", sep=".")
10
+  file.path(pkgDir, "extdata", wanted)
11
+}
12
+
13
+getCrlmmAnnotationName <- function(x){
14
+  paste(tolower(gsub("_", "", x)), "Crlmm", sep="")
15
+}
16
+
17
+medianSummaries <- function(mat, grps)
18
+  .Call("R_subColSummarize_median", mat, grps, PACKAGE = "preprocessCore")
19
+
20
+intMedianSummaries <- function(mat, grps)
21
+  as.integer(medianSummaries(mat, grps))
22
+
23
+testProb <- function(p)
24
+  .Call("test", p)
25
+
26
+
27
+list.celfiles <-   function(...){
28
+  files <- list.files(...)
29
+  return(files[grep("\\.[cC][eE][lL]$", files)])
30
+}
31
+
32
+## .crlmmPkgEnv is an enviroment that will
33
+## store all the variables used by the pkg.
34
+## it's meant to not overwrite user's variables
35
+## and get rid of the NOTES generated by
36
+## R CMD check
37
+
38
+isLoaded <- function(dataset, environ=.crlmmPkgEnv)
39
+  exists(dataset, envir=environ)
40
+
41
+getVarInEnv <- function(dataset, environ=.crlmmPkgEnv){
42
+  if (!isLoaded(dataset))
43
+    stop("Variable ", dataset, " not found in .crlmmPkgEnv")
44
+  environ[[dataset]]
45
+}
0 46
new file mode 100644
... ...
@@ -0,0 +1,16 @@
1
+# Loading required libraries
2
+THISPKG <- "crlmm"
3
+
4
+.onLoad <- function(libname, pkgname) {
5
+  require("methods")
6
+}
7
+
8
+.onAttach <- function(libname, pkgname) {
9
+  message("Welcome to crlmm version ", packageDescription(THISPKG, field="Version"))
10
+}
11
+
12
+.onUnload <- function( libpath ){
13
+  library.dynam.unload(THISPKG, libpath)
14
+}
15
+
16
+  .crlmmPkgEnv <- new.env(parent=emptyenv())
0 17
new file mode 100644
... ...
@@ -0,0 +1,32 @@
1
+\name{crlmm-package}
2
+\alias{crlmm-package}
3
+\docType{package}
4
+\title{
5
+Genotype Calling via CRLMM Algorithm
6
+}
7
+\description{
8
+Faster implementation of CRLMM specific to SNP 5.0 and 6.0 arrays.
9
+}
10
+\details{
11
+Index:
12
+\preformatted{
13
+crlmm                   Genotype SNP 5.0 or 6.0 samples.
14
+crlmm-package           New implementation of the CRLMM Algorithm.
15
+}
16
+The 'crlmm' package reimplements the CRLMM algorithm present in the
17
+'oligo' package. This implementation primes for efficient genotyping of
18
+samples on SNP 5.0 and SNP 6.0 Affymetrix arrays.
19
+
20
+To use this package, the user must have additional data packages:
21
+'genomewidesnp5Crlmm' - SNP 5.0 arrays
22
+'genomewidesnp6Crlmm' - SNP 6.0 arrays
23
+}
24
+\author{
25
+Rafael A Irizarry
26
+Maintainer: Benilton S Carvalho <bcarvalh@jhsph.edu>
27
+}
28
+\references{
29
+  Carvalho B, Louis TA, Irizarry RA. Describing Uncertainty in
30
+  Genome-wide Genotype Calling. (in prep)
31
+}
32
+\keyword{ package }
0 33
new file mode 100644
... ...
@@ -0,0 +1,63 @@
1
+\name{crlmm}
2
+\alias{crlmm}
3
+%- Also NEED an '\alias' for EACH other topic documented here.
4
+\title{Genotype oligonucleotide arrays with CRLMM}
5
+\description{
6
+  This is a faster and more efficient implementation of the CRLMM
7
+  algorithm, especially designed for Affymetrix SNP 5 and 6 arrays (to
8
+  be soon extended to other platforms).
9
+}
10
+\usage{
11
+
12
+crlmm(filenames, row.names = TRUE, col.names = TRUE, probs = c(1/3, 1/3,
13
+1/3), DF = 6, SNRMin = 5, gender = NULL, save.it = FALSE, load.it =
14
+FALSE, intensityFile, mixtureSampleSize =
15
+10^5, eps = 0.1, verbose = TRUE, cdfName, sns, recallMin = 10,
16
+recallRegMin = 1000, returnParams = FALSE, badSNP = 0.7)
17
+
18
+}
19
+%- maybe also 'usage' for other objects documented here.
20
+\arguments{
21
+  \item{filenames}{'character' vector with CEL files to be genotyped.}
22
+  \item{row.names}{'logical'. Use rownames - SNP names?}
23
+  \item{col.names}{'logical'. Use colnames - Sample names?}
24
+  \item{probs}{'numeric' vector with priors for AA, AB and BB.}
25
+  \item{DF}{'integer' with number of degrees of freedom to use with t-distribution.}
26
+  \item{SNRMin}{'numeric' scalar defining the minimum SNR used to filter
27
+  out samples.}
28
+  \item{gender}{'integer' vector, with same length as 'filenames',
29
+    defining gender. (1 - male; 2 - female)}
30
+  \item{save.it}{'logical'. Save preprocessed data?}
31
+  \item{load.it}{'logical'. Load preprocessed data to speed up analysis?}
32
+  \item{intensityFile}{'character' with filename to be saved/loaded -
33
+    preprocessed data.}
34
+  \item{mixtureSampleSize}{Number of SNP's to be used with the mixture model.}
35
+  \item{eps}{Minimum change for mixture model.}
36
+  \item{verbose}{'logical'.}
37
+  \item{cdfName}{'character' defining the CDF name to use
38
+    ('GenomeWideSnp5', 'GenomeWideSnp6')}
39
+  \item{sns}{'character' vector with sample names to be used.}
40
+  \item{recallMin}{Minimum number of samples for recalibration.}
41
+  \item{recallRegMin}{Minimum number of SNP's for regression.}
42
+  \item{returnParams}{'logical'. Return recalibrated parameters.}
43
+  \item{badSNP}{'numeric'. Threshold to flag as bad SNP (affects batchQC)}
44
+}
45
+\value{
46
+  \item{calls}{Genotype calls (1 - AA, 2 - AB, 3 - BB)}
47
+  \item{confs}{Confidence scores 'round(-1000*log2(1-p))'}
48
+  \item{SNPQC}{SNP Quality Scores}
49
+  \item{batchQC}{Batch Quality Score}
50
+  \item{params}{Recalibrated parameters}
51
+}
52
+\references{
53
+  Carvalho B, Bengtsson H, Speed TP, Irizarry RA. Exploration,
54
+  normalization, and genotype calls of high-density oligonucleotide SNP
55
+  array data. Biostatistics. 2007 Apr;8(2):485-99. Epub 2006 Dec
56
+  22. PMID: 17189563.
57
+
58
+  Carvalho B, Louis TA, Irizarry RA. Describing Uncertainty in
59
+  Genome-wide Genotype Calling. (in prep)
60
+}
61
+
62
+\keyword{classif}
63
+
0 64
new file mode 100644
... ...
@@ -0,0 +1,29 @@
1
+\name{list.celfiles}
2
+\alias{list.celfiles}
3
+
4
+\title{List CEL files.}
5
+\description{
6
+  Function used to get a list of CEL files.
7
+}
8
+\usage{
9
+list.celfiles(...)
10
+}
11
+
12
+\arguments{
13
+  \item{\dots}{Same arguments of \code{\link{list.files}}}
14
+}
15
+\details{
16
+  For the moment, this function returns only uncompressed CEL files (ie,
17
+  no CEL.gz)
18
+}
19
+\value{
20
+  Character vector with filenames.
21
+}
22
+
23
+\note{
24
+  Quite often users want to use this function to pass filenames to other
25
+  methods. In this situations, it is safer to use the argument 'full.names=TRUE'.
26
+}
27
+\seealso{\code{\link{list.files}}}
28
+\keyword{IO}
29
+\keyword{utilities}
0 30
new file mode 100644
... ...
@@ -0,0 +1,413 @@
1
+#include <math.h>
2
+#include <R.h>
3
+#include <Rdefines.h>
4
+#include <Rmath.h>
5
+#include <Rinternals.h>
6
+
7
+#include "utils.h"
8
+
9
+static double mydt(double x, int df){
10
+  return(pow(1.0+pow(x, 2.0)/ (double)df, -((double)df+1.0)/2.0));
11
+}
12
+
13
+SEXP gtypeCallerPart1(SEXP A, SEXP B, SEXP fIndex, SEXP mIndex,
14
+		      SEXP theCenters, SEXP theScales, SEXP theNs,
15
+		      SEXP Indexes, SEXP cIndexes, SEXP nIndexes,
16
+		      SEXP ncIndexes, SEXP SMEDIAN,
17
+		      SEXP knots, SEXP mixtureParams, SEXP df,
18
+		      SEXP probs, SEXP trim){
19
+  /*
20
+    ARGUMENTS
21
+    ---------
22
+    A: intensity matrix for allele A
23
+    B: intensity matrix for allele B
24
+    fIndex: indexes for females (columns in A/B for females)
25
+    mIndex: indexes for males (columns in A/B for males)
26
+    theCenters: matrix with SNP-specific centers (3 columns: AA/AB/BB)
27
+    theScales: matrix with SNP-specific scales (3 columns: AA/AB/BB)
28
+    theNs: matrix with SNP-specific counts (3 columns: AA/AB/BB)
29
+    Indexes: list with 3 elements (autosomeIndex, XIndex, YIndex) for SNPs
30
+    cIndexes: list with 3 elements (keepIndex, keepIndexFemale, keepIndexMale) for arrays
31
+    SMEDIAN: scalar (median S)
32
+    knots: knots for mixture
33
+    mixtureParams: mixture parameters
34
+    probs: genotype priors (1/3) for *ALL* SNPs. It's a vector of length 3
35
+    trim: drop rate to estimate means
36 </