Browse code

merge with collab branch containing fix for dqrls and bug-fix for computeRBaf that can misalign sample index with batch index (when batch is not in alphabetical order)

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@71295 bc3139a8-67e5-0310-9ffc-ced21a209358

Rob Scharp authored on 17/11/2012 15:24:46
Showing10 changed files

... ...
@@ -1,12 +1,13 @@
1 1
 Package: crlmm
2 2
 Type: Package
3 3
 Title: Genotype Calling (CRLMM) and Copy Number Analysis tool for Affymetrix SNP 5.0 and 6.0 and Illumina arrays.
4
-Version: 1.17.1
4
+Version: 1.17.9
5 5
 Author: Benilton S Carvalho, Robert Scharpf, Matt Ritchie, Ingo Ruczinski, Rafael A Irizarry
6 6
 Maintainer: Benilton S Carvalho <Benilton.Carvalho@cancer.org.uk>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.EDU.AU>
7 7
 Description: Faster implementation of CRLMM specific to SNP 5.0 and 6.0 arrays, as well as a copy number tool specific to 5.0, 6.0, and Illumina platforms
8 8
 License: Artistic-2.0
9
-Depends: R (>= 2.14.0), oligoClasses (>= 1.19.39)
9
+Depends: R (>= 2.14.0), oligoClasses (>= 1.19.39), preprocessCore (>= 1.17.7)
10
+LinkingTo: preprocessCore (>= 1.17.7)
10 11
 Imports: methods,
11 12
          Biobase (>= 2.15.4),
12 13
          BiocGenerics,
... ...
@@ -14,14 +15,14 @@ Imports: methods,
14 15
          ellipse,
15 16
          genefilter (>= 1.37.1),
16 17
          mvtnorm,
17
-         preprocessCore (>= 1.17.7),
18 18
          splines,
19 19
          stats,
20 20
          SNPchip,
21 21
          utils,
22 22
 	 lattice,
23 23
 	 ff,
24
-	 foreach
24
+	 foreach,
25
+         RcppEigen
25 26
 Suggests: hapmapsnp6,
26 27
           genomewidesnp6Crlmm (>= 1.0.4),
27 28
           GGdata,
... ...
@@ -5,7 +5,7 @@ importClassesFrom(Biobase, AssayData, eSet)
5 5
 importClassesFrom(methods, ANY, character, formula, integer, list,
6 6
                   matrix, oldClass)
7 7
 importFrom(methods, setOldClass)
8
-
8
+importFrom(RcppEigen, fastLmPure)
9 9
 ## importClassesFrom(oligoClasses, CNSet, CNSetLM, ff_matrix,
10 10
 ##                   ff_or_matrix, oligoSnpSet)
11 11
 importClassesFrom(oligoClasses, CNSet, oligoSnpSet, ff_or_matrix)
... ...
@@ -56,7 +56,7 @@ importFrom(oligoClasses, celfileDate, chromosome2integer, i2p,
56 56
            splitIndicesByLength, splitIndicesByNode, AssayDataList)
57 57
 
58 58
 importFrom(preprocessCore, normalize.quantiles,
59
-           normalize.quantiles.use.target)
59
+           normalize.quantiles.use.target, subColSummarizeMedian)
60 60
 
61 61
 importFrom(stats, coef, cov, dnorm, kmeans, lm, mad, median, quantile,
62 62
            sd)
... ...
@@ -41,15 +41,22 @@ getFeatureData <- function(cdfName, copynumber=FALSE, genome){
41 41
 	}
42 42
 	path <- system.file("extdata", package=pkgname)
43 43
 	##multiple.builds <- length(grep("hg19", list.files(path)) > 0)
44
-	if(missing(genome)){
45
-		snp.file <- list.files(path, pattern="snpProbes_hg")
46
-		if(length(snp.file) > 1){
47
-			## use hg19
48
-			message("genome build not specified. Using build hg19 for annotation.")
49
-			snp.file <- snp.file[1]
50
-		}
51
-		genome <- gsub(".rda", "", strsplit(snp.file, "snpProbes_")[[1]][[2]])
52
-	} else snp.file <- paste("snpProbes_", genome, ".rda", sep="")
44
+	snp.file <- list.files(path, pattern="snpProbes_hg")
45
+	if(length(snp.file)==0){
46
+		no.build <- TRUE
47
+		snp.file <- list.files(path, pattern="snpProbes.rda")
48
+	} else {
49
+		no.build <- FALSE
50
+		if(missing(genome)){
51
+			snp.file <- list.files(path, pattern="snpProbes_hg")
52
+			if(length(snp.file) > 1){
53
+				## use hg19
54
+				message("genome build not specified. Using build hg19 for annotation.")
55
+				snp.file <- snp.file[1]
56
+			}
57
+			genome <- gsub(".rda", "", strsplit(snp.file, "snpProbes_")[[1]][[2]])
58
+		} else snp.file <- paste("snpProbes_", genome, ".rda", sep="")
59
+	}
53 60
 ##	if(!multiple.builds){
54 61
 ##		load(file.path(path, "snpProbes.rda"))
55 62
 ##	} else load(file.path(path, paste("snpProbes_", genome, ".rda", sep="")))
... ...
@@ -58,7 +65,9 @@ getFeatureData <- function(cdfName, copynumber=FALSE, genome){
58 65
 	## if we use a different build we may throw out a number of snps...
59 66
 	snpProbes <- snpProbes[rownames(snpProbes) %in% gns, ]
60 67
 	if(copynumber){
61
-		cn.file <- paste("cnProbes_", genome, ".rda", sep="")
68
+		if(!no.build){
69
+			cn.file <- paste("cnProbes_", genome, ".rda", sep="")
70
+		} else cn.file <- "cnProbes.rda"
62 71
 		load(file.path(path, cn.file))
63 72
 		##		if(!multiple.builds){
64 73
 		##			load(file.path(path, "cnProbes.rda"))
... ...
@@ -307,11 +316,11 @@ genotype <- function(filenames,
307 316
 	if(!(is(A(cnSet), "ff") || is(A(cnSet), "ffdf"))){
308 317
 		stop("The ff package is required for this function.")
309 318
 	}
310
-	cnSet <- snprmaAffy(cnSet,
311
-			    mixtureSampleSize=mixtureSampleSize,
312
-			    eps=eps,
313
-			    seed=seed,
314
-			    verbose=verbose)
319
+	ok <- snprmaAffy(cnSet,
320
+			 mixtureSampleSize=mixtureSampleSize,
321
+			 eps=eps,
322
+			 seed=seed,
323
+			 verbose=verbose)
315 324
 	ok <- cnrmaAffy(cnSet=cnSet,
316 325
 			seed=seed,
317 326
 			verbose=verbose)
... ...
@@ -434,17 +443,20 @@ rowCors <- function(x, y, ...){
434 443
 	return(covar/(sd.x*sd.y))
435 444
 }
436 445
 
437
-dqrlsWrapper <- function(x, y, wts, tol=1e-7){
438
-	n <- NROW(y)
439
-	p <- ncol(x)
440
-	ny <- NCOL(y)
441
-	.Fortran("dqrls", qr=x*wts, n=n, p=p, y=y * wts, ny=ny,
442
-		 tol=as.double(tol), coefficients=mat.or.vec(p, ny),
443
-		 residuals=y, effects=mat.or.vec(n, ny),
444
-		 rank=integer(1L), pivot=1L:p, qraux=double(p),
445
-		 work=double(2 * p), PACKAGE="base")[["coefficients"]]
446
-}
446
+## dqrlsWrapper <- function(x, y, wts, tol=1e-7){
447
+## 	n <- NROW(y)
448
+## 	p <- ncol(x)
449
+## 	ny <- NCOL(y)
450
+## 	.Fortran("dqrls", qr=x*wts, n=n, p=p, y=y * wts, ny=ny,
451
+## 		 tol=as.double(tol), coefficients=mat.or.vec(p, ny),
452
+## 		 residuals=y, effects=mat.or.vec(n, ny),
453
+## 		 rank=integer(1L), pivot=1L:p, qraux=double(p),
454
+## 		 work=double(2 * p), PACKAGE="base")[["coefficients"]]
455
+## }
456
+
447 457
 
458
+dqrlsWrapper <- function(x, y, wts, ...)
459
+    fastLmPure(X=x*wts, y=y*wts, method=3)[['coefficients']]
448 460
 
449 461
 fit.wls <- function(NN, sigma, allele, Y, autosome, X){
450 462
 	Np <- NN
... ...
@@ -730,7 +742,8 @@ fit.lm2 <- function(strata,
730 742
 		warning("All features in one of the probeset batches were flagged. The data is processed in probeset batches to reduce RAM, but the error suggests that increasing the size of the batches may help.  For example, ocProbesets(4*ocProbesets()) would increase the current size of the probeset batches by 4.")
731 743
 		fns.noflags <- featureNames(object)[marker.index] ## allow processing to continue
732 744
 	}
733
-	snp.index <- sample(match(fns.noflags, featureNames(object)), 5000)
745
+	index.flag <- match(fns.noflags, featureNames(object))
746
+	snp.index <- sample(index.flag, min(c(length(index.flag), 5000)))
734 747
 	##
735 748
 	nuA.np <- as.matrix(nuA(object)[marker.index, batch.index])
736 749
 	phiA.np <- as.matrix(phiA(object)[marker.index, batch.index])
... ...
@@ -1191,7 +1204,8 @@ fit.lm4 <- function(strata,
1191 1204
 		warning("All features in one of the probeset batches were flagged. The data is processed in probeset batches to reduce RAM, but the error suggests that increasing the size of the batches may help.  For example, ocProbesets(4*ocProbesets()) would increase the current size of the probeset batches by 4.")
1192 1205
 		fns.noflags <- featureNames(object)[marker.index] ## allow processing to continue
1193 1206
 	}
1194
-	snp.index <- sample(match(fns.noflags, featureNames(object)), 10000)
1207
+	index.flag <- match(fns.noflags, featureNames(object))
1208
+	snp.index <- sample(index.flag, min(c(length(index.flag), 10000)))
1195 1209
 	##
1196 1210
 	N.AA <- as.matrix(N.AA(object)[snp.index, batch.index, drop=FALSE])
1197 1211
 	N.AB <- as.matrix(N.AB(object)[snp.index, batch.index, drop=FALSE])
... ...
@@ -1079,7 +1079,10 @@ getAvailableIlluminaGenomeBuild <- function(path){
1079 1079
 		message("genome build not specified. Using build hg19 for annotation.")
1080 1080
 		snp.file <- snp.file[1]
1081 1081
 	}
1082
-	genome <- gsub(".rda", "", strsplit(snp.file, "snpProbes_")[[1]][[2]])
1082
+	if(length(snp.file) == 1)
1083
+		genome <- gsub(".rda", "", strsplit(snp.file, "snpProbes_")[[1]][[2]])
1084
+	if(length(snp.file)==0) genome <- ""
1085
+	genome
1083 1086
 }
1084 1087
 
1085 1088
 
... ...
@@ -208,7 +208,7 @@ crlmmGT2 <- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL,
208 208
 	callsGt.present <- !missing(callsGt)
209 209
 	callsPr.present <- !missing(callsPr)
210 210
 	overwriteAB <- !callsGt.present & !callsPr.present
211
-	if(overwriteAB){
211
+	if(!overwriteAB){
212 212
 		open(callsGt)
213 213
 		open(callsPr)
214 214
 	}
... ...
@@ -246,7 +246,7 @@ crlmmGT2 <- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL,
246 246
 	close(A)
247 247
 	close(B)
248 248
 	close(mixtureParams)
249
-	if(overwriteAB){
249
+	if(!overwriteAB){
250 250
 		close(callsGt)
251 251
 		close(callsPr)
252 252
 	}
... ...
@@ -548,7 +548,7 @@ calculateRBafCNSet <- function(object, batch.name, chrom){
548 548
 	indexlist <- split(index, chr[index])
549 549
 	J <- which(batch(object) %in% batch.name)
550 550
 	sns <- sampleNames(object)[J]
551
-	sampleindex <- split(J, batch(object)[J])
551
+	sampleindex <- split(J, factor(batch(object)[J], levels=unique(batch(object)[J])))
552 552
 	if(!all(valid.chrs)) warning("Only computing log R ratios and BAFs for autosomes and chr X")
553 553
 	## if ff package is loaded, these will be ff objects
554 554
 	chr <- names(indexlist)
... ...
@@ -14,8 +14,11 @@ getCrlmmAnnotationName <- function(x){
14 14
 	paste(tolower(gsub("_", "", x)), "Crlmm", sep="")
15 15
 }
16 16
 
17
+## medianSummaries <- function(mat, grps)
18
+##   .Call("R_subColSummarize_median", mat, grps, PACKAGE = "preprocessCore")
19
+
17 20
 medianSummaries <- function(mat, grps)
18
-  .Call("R_subColSummarize_median", mat, grps, PACKAGE = "preprocessCore")
21
+.Call("subColSummarizeMedianPP", mat, grps)
19 22
 
20 23
 intMedianSummaries <- function(mat, grps)
21 24
   as.integer(medianSummaries(mat, grps))
... ...
@@ -110,7 +110,7 @@ the purposes of this vignette, we only analyze the CEPH ('C') and
110 110
 Yoruban ('Y') samples.
111 111
 
112 112
 <<celfiles>>=
113
-celFiles <- list.celfiles(pathToCels, full.names=TRUE, pattern=".CEL")
113
+celFiles <- list.celfiles(pathToCels, full.names=TRUE, pattern=".CEL")[1:10]
114 114
 celFiles <- celFiles[substr(basename(celFiles), 13, 13) %in% c("C", "Y")]
115 115
 if(exists("file.index")){
116 116
 	celFiles <- celFiles[file.index]
... ...
@@ -2,7 +2,7 @@
2 2
   Affymetrix 5.0 and 6.0 platforms, as well as several Illumina
3 3
   platforms.  This vignette assumes that the arrays have already been
4 4
   successfully preprocessed and genotyped as per the instructions in
5
-  the \verb+AffymetrixPreprocessCN+ and \verb+IlluminaPreprocessCN+
5
+  the \verb+AffyGW+ and \verb+IlluminaPreprocessCN+
6 6
   vignettes for the Affymetrix and Illumina platforms,
7 7
   respectively. While this vignette uses Affymetrix 6.0 arrays for
8 8
   illustration, the steps at this point are identical for both
... ...
@@ -1,6 +1,8 @@
1 1
 #include <R.h>
2 2
 #include <R_ext/Rdynload.h>
3 3
 #include "crlmm.h"
4
+#include <Rinternals.h>
5
+#include <R_ext/Rdynload.h>
4 6
 
5 7
 static const R_CallMethodDef CallEntries[] = {
6 8
     {"gtypeCallerPart1", (DL_FUNC)&gtypeCallerPart1, 17},
... ...
@@ -12,3 +14,11 @@ static const R_CallMethodDef CallEntries[] = {
12 14
 void R_init_crlmm(DllInfo *dll){
13 15
     R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
14 16
 }
17
+
18
+
19
+SEXP subColSummarizeMedianPP(SEXP RMatrix, SEXP R_rowIndexList){
20
+  static SEXP(*fun)(SEXP, SEXP) = NULL;
21
+  if (fun == NULL)
22
+    fun =  (SEXP(*)(SEXP, SEXP))R_GetCCallable("preprocessCore","R_subColSummarize_median");
23
+  return fun(RMatrix, R_rowIndexList);
24
+}