Browse code

Merge branch 'collab'

* collab: (34 commits)
revert change to IlluminaPreprocessCN
fix bug in isValidCdfName
print warning when all features in a batch of probes are flagged, but allow processing to continue
add utility cleancdfnames
Add validCdfNames.Rd
export validCdfNames
imputeGender fix when chromosome Y not available
Use splitIndicesByLength(index, ocSamples/getDoParWorkers())
Can not allocate vector of size XG with genotype.Illumina. Use splitIndicesByNode() only if the length of the list is greater than the split from splitIndicesByLength(). Otherwise, split by length using ocSamples()
update .gitignore
Add make.unique for sampleSheet$Sample_ID in readIdatFiles
bug in description
ensure sample ids stored in samplesheet are unique when constructing cnSet object
update oligoClasses dependency
update unit test for genotype.Illumina
revert change in constructInf call from genotype.Illumina
Update genotype.Rd
edit ACN function
1.15.6 use make.unique(basename(arrayNames)) to allow processing of Illumina samples with duplicated barcodes
check that sample identifies are unique in crlmm function
...

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@67435 bc3139a8-67e5-0310-9ffc-ced21a209358

Rob Scharp authored on 08/07/2012 19:00:03
Showing 29 changed files

... ...
@@ -1,12 +1,12 @@
1 1
 Package: crlmm
2 2
 Type: Package
3 3
 Title: Genotype Calling (CRLMM) and Copy Number Analysis tool for Affymetrix SNP 5.0 and 6.0 and Illumina arrays.
4
-Version: 1.15.0
4
+Version: 1.15.12
5 5
 Author: Benilton S Carvalho, Robert Scharpf, Matt Ritchie, Ingo Ruczinski, Rafael A Irizarry
6 6
 Maintainer: Benilton S Carvalho <Benilton.Carvalho@cancer.org.uk>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.EDU.AU>
7 7
 Description: Faster implementation of CRLMM specific to SNP 5.0 and 6.0 arrays, as well as a copy number tool specific to 5.0, 6.0, and Illumina platforms
8 8
 License: Artistic-2.0
9
-Depends: R (>= 2.14.0), oligoClasses (>= 1.17.36)
9
+Depends: R (>= 2.14.0), oligoClasses (>= 1.19.17)
10 10
 Imports: methods,
11 11
          Biobase (>= 2.15.4),
12 12
          BiocGenerics,
... ...
@@ -20,7 +20,8 @@ Imports: methods,
20 20
          SNPchip,
21 21
          utils,
22 22
 	 lattice,
23
-	 ff
23
+	 ff,
24
+	 foreach
24 25
 Suggests: hapmapsnp6,
25 26
           genomewidesnp6Crlmm (>= 1.0.4),
26 27
           GGdata,
... ...
@@ -53,7 +53,7 @@ importFrom(oligoClasses, celfileDate, chromosome2integer, i2p,
53 53
 	   isPackageLoaded,
54 54
            ldPath, ocLapply, ocProbesets, ocSamples,
55 55
 	   parStatus,
56
-           splitIndicesByLength, splitIndicesByNode)
56
+           splitIndicesByLength, splitIndicesByNode, AssayDataList)
57 57
 
58 58
 importFrom(preprocessCore, normalize.quantiles,
59 59
            normalize.quantiles.use.target)
... ...
@@ -64,11 +64,14 @@ importFrom(stats, coef, cov, dnorm, kmeans, lm, mad, median, quantile,
64 64
 importFrom(utils, packageDescription, setTxtProgressBar,
65 65
            txtProgressBar)
66 66
 
67
+## foreach
68
+import(foreach)
69
+
67 70
 ##----------------------------------------------------------------------------
68 71
 ## export
69 72
 ##----------------------------------------------------------------------------
70 73
 exportClasses(PredictionRegion)
71
-exportMethods(CA, CB, coerce,
74
+exportMethods(CA, CB,
72 75
 	      A, B, corr, nuA, nuB, phiA, phiB,
73 76
 	      predictionRegion, posteriorProbability,
74 77
 	      tau2, Ns, medians, mads,
... ...
@@ -87,4 +90,4 @@ export(crlmm,
87 90
        genotype.Illumina,
88 91
        crlmmCopynumber2, crlmmCopynumberLD, crlmmCopynumber)
89 92
 export(genotypes, totalCopynumber, rawCopynumber, xyplot)
90
-export(ABpanel, validCEL, celDates)
93
+export(ABpanel, validCEL, celDates, validCdfNames)
... ...
@@ -104,4 +104,4 @@ setGeneric("predictionRegion", function(object, copyNumber=0:4)
104 104
 	   standardGeneric("predictionRegion"))
105 105
 setGeneric("xyplot", useAsDefault=function(x, data, ...) lattice::xyplot(x, data,...))
106 106
 setGeneric("xyplotcrlmm", function(x, data, predictRegion,...) standardGeneric("xyplotcrlmm"))
107
-setGeneric("calculateRBaf", function(object, batch.name) standardGeneric("calculateRBaf"))
107
+setGeneric("calculateRBaf", function(object, batch.name, chrom) standardGeneric("calculateRBaf"))
... ...
@@ -21,7 +21,7 @@ getProtocolData.Affy <- function(filenames){
21 21
 ##		  return(gns)
22 22
 ##	  })
23 23
 
24
-getFeatureData <- function(cdfName, copynumber=FALSE){
24
+getFeatureData <- function(cdfName, copynumber=FALSE, genome=genome){
25 25
 	pkgname <- getCrlmmAnnotationName(cdfName)
26 26
 	if(!require(pkgname, character.only=TRUE)){
27 27
 		suggCall <- paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="")
... ...
@@ -39,10 +39,15 @@ getFeatureData <- function(cdfName, copynumber=FALSE){
39 39
 		pkgname <- paste(pkgname, "Crlmm", sep="")
40 40
 	}
41 41
 	path <- system.file("extdata", package=pkgname)
42
-	load(file.path(path, "snpProbes.rda"))
42
+	multiple.builds <- length(grep("hg19", list.files(path)) > 0)
43
+	if(!multiple.builds)
44
+		load(file.path(path, "snpProbes.rda"))
45
+	else load(file.path(path, paste("snpProbes_", genome, ".rda", sep="")))
43 46
 	snpProbes <- get("snpProbes")
44 47
 	if(copynumber){
45
-		load(file.path(path, "cnProbes.rda"))
48
+		if(!multiple.builds)
49
+			load(file.path(path, "cnProbes.rda"))
50
+		else load(file.path(path, paste("cnProbes_", genome, ".rda", sep="")))
46 51
 		cnProbes <- get("cnProbes")
47 52
 		snpIndex <- seq(along=gns)
48 53
 		npIndex <- seq(along=rownames(cnProbes)) + max(snpIndex)
... ...
@@ -121,8 +126,8 @@ construct <- function(filenames,
121 126
 	return(cnSet)
122 127
 }
123 128
 
124
-constructAffy <- function(filenames, sns, cdfName, batch, verbose=TRUE){
125
-	stopifnot(!missing(filenames))
129
+constructAffy <- function(filenames, sns, cdfName, batch, verbose=TRUE, genome){
130
+	##stopifnot(!missing(filenames))
126 131
 	if(missing(sns)) sns <- basename(filenames)
127 132
 	stopifnot(!missing(batch))
128 133
 	##---------------------------------------------------------------------------
... ...
@@ -131,7 +136,6 @@ constructAffy <- function(filenames, sns, cdfName, batch, verbose=TRUE){
131 136
 	## appropriate dimension for snps+nps
132 137
 	##
133 138
 	##---------------------------------------------------------------------------
134
-	if (missing(sns)) sns <- basename(filenames)
135 139
 	if (missing(cdfName))
136 140
 		cdfName <- read.celfile.header(filenames[1])[["cdfName"]]
137 141
 	pkgname <- getCrlmmAnnotationName(cdfName)
... ...
@@ -148,7 +152,7 @@ constructAffy <- function(filenames, sns, cdfName, batch, verbose=TRUE){
148 152
 	SNR <- initializeBigVector("crlmmSNR-", length(filenames), "double")
149 153
 	SKW <- initializeBigVector("crlmmSKW-", length(filenames), "double")
150 154
 	genderff <- initializeBigVector("gender", length(filenames), "integer")
151
-	featureData <- getFeatureData(cdfName, copynumber=TRUE)
155
+	featureData <- getFeatureData(cdfName, copynumber=TRUE, genome=genome)
152 156
 	nr <- nrow(featureData); nc <- length(sns)
153 157
 	A <- initializeBigMatrix("crlmmA-", nr, length(filenames), "integer")
154 158
 	B <- initializeBigMatrix("crlmmB-", nr, length(filenames), "integer")
... ...
@@ -164,18 +168,12 @@ constructAffy <- function(filenames, sns, cdfName, batch, verbose=TRUE){
164 168
 		     callProbability=callPr,
165 169
 		     featureData=featureData,
166 170
 		     annotation=cdfName,
167
-		     batch=as.character(batch))
168
-	if(!missing(sns)){
169
-		sampleNames(cnSet) <- sns
170
-	} else {
171
-		sampleNames(cnSet) <- basename(filenames)
172
-	}
171
+		     batch=as.character(batch),
172
+		     genome=genome)
173
+	sampleNames(cnSet) <- sns
173 174
 	protocolData <- getProtocolData.Affy(filenames)
174 175
 	rownames(pData(protocolData)) <- sns
175 176
 	protocolData(cnSet) <- protocolData
176
-	##pd <- data.frame(matrix(NA, nc, 3), row.names=sns)
177
-	##colnames(pd)=c("SKW", "SNR", "gender")
178
-	##phenoData(cnSet) <- new("AnnotatedDataFrame", data=pd)
179 177
 	cnSet$SKW <- SKW
180 178
 	cnSet$SNR <- SNR
181 179
 	cnSet$gender <- genderff
... ...
@@ -192,7 +190,9 @@ snprmaAffy <- function(cnSet, filenames,
192 190
 	pkgname <- getCrlmmAnnotationName(annotation(cnSet))
193 191
 	stopifnot(require(pkgname, character.only=TRUE, quietly=!verbose))
194 192
 	mixtureParams <- initializeBigMatrix("crlmmMixt-", 4, length(filenames), "double")
195
-	sampleBatches <- splitIndicesByNode(seq(along=filenames))
193
+	sampleBatches <- splitIndicesByLength(seq_along(filenames), ocSamples()/getDoParWorkers())
194
+##	if(length(sampleBatches) < getDoParWorkers())
195
+##		sampleBatches <- splitIndicesByNode(seq_along(filenames))
196 196
 	ocLapply(sampleBatches,
197 197
 		 rsprocessCEL,
198 198
 		 filenames=filenames,
... ...
@@ -225,19 +225,25 @@ genotype <- function(filenames,
225 225
 		     recallRegMin=1000,
226 226
 		     gender=NULL,
227 227
 		     returnParams=TRUE,
228
-		     badSNP=0.7){
229
-	stopifnot(isPackageLoaded("ff"))
228
+		     badSNP=0.7,
229
+		     genome=c("hg19", "hg18")){
230
+	if(!isPackageLoaded("ff")) stop("ff package must be loaded")
231
+	if (missing(sns)) sns <- basename(filenames)
232
+	if (any(duplicated(sns))) stop("sample identifiers must be unique")
233
+	genome <- match.arg(genome)
230 234
 	cnSet <- constructAffy(filenames=filenames,
235
+			       sns=sns,
231 236
 			       cdfName=cdfName,
232
-			       batch=batch, verbose=verbose)
237
+			       batch=batch, verbose=verbose,
238
+			       genome=genome)
233 239
 	if(!(is(A(cnSet), "ff") || is(A(cnSet), "ffdf"))){
234 240
 		stop("The ff package is required for this function.")
235 241
 	}
236 242
 	cnSet@mixtureParams <- snprmaAffy(cnSet, filenames=filenames,
237
-				    mixtureSampleSize=mixtureSampleSize,
238
-				    eps=eps,
239
-				    seed=seed,
240
-				    verbose=verbose)
243
+					  mixtureSampleSize=mixtureSampleSize,
244
+					  eps=eps,
245
+					  seed=seed,
246
+					  verbose=verbose)
241 247
 	ok <- cnrmaAffy(cnSet=cnSet, filenames=filenames,
242 248
 			cdfName=annotation(cnSet), seed=seed,
243 249
 			verbose=verbose)
... ...
@@ -697,6 +703,10 @@ fit.lm2 <- function(strata,
697 703
 	flags <- as.matrix(flags(object)[ii, batch.index])
698 704
 	fns <- featureNames(object)[ii]
699 705
 	fns.noflags <- fns[rowSums(flags, na.rm=T) == 0]
706
+	if(length(fns.noflags) == 0) {
707
+		warning("All features in one of the probeset batches were flagged. The data is processed in probeset batches to reduce RAM, but the error suggests that increasing the size of the batches may help.  For example, ocProbesets(4*ocProbesets()) would increase the current size of the probeset batches by 4.")
708
+		fns.noflags <- featureNames(object)[marker.index] ## allow processing to continue
709
+	}
700 710
 	snp.index <- sample(match(fns.noflags, featureNames(object)), 5000)
701 711
 	##
702 712
 	nuA.np <- as.matrix(nuA(object)[marker.index, batch.index])
... ...
@@ -1089,6 +1099,15 @@ fit.lm3 <- function(strata,
1089 1099
 	phiB(object)[marker.index, batch.index] <- phiB
1090 1100
 	phiPrimeA(object)[marker.index, batch.index] <- phiA2
1091 1101
 	phiPrimeB(object)[marker.index, batch.index] <- phiB2
1102
+	##
1103
+	if(enough.women){
1104
+		medianA.AA(object)[marker.index, batch.index] <- do.call("cbind", lapply(medianA.Flist, function(x) x[, 1]))
1105
+		medianA.AB(object)[marker.index, batch.index] <- do.call("cbind", lapply(medianA.Flist, function(x) x[, 2]))
1106
+		medianA.BB(object)[marker.index, batch.index] <- do.call("cbind", lapply(medianA.Flist, function(x) x[, 3]))
1107
+		medianB.AA(object)[marker.index, batch.index] <- do.call("cbind", lapply(medianB.Flist, function(x) x[, 1]))
1108
+		medianB.AB(object)[marker.index, batch.index] <- do.call("cbind", lapply(medianB.Flist, function(x) x[, 2]))
1109
+		medianB.BB(object)[marker.index, batch.index] <- do.call("cbind", lapply(medianB.Flist, function(x) x[, 3]))
1110
+	}
1092 1111
 	if(is.lds) {close(object); return(TRUE)} else return(object)
1093 1112
 }
1094 1113
 
... ...
@@ -1145,6 +1164,10 @@ fit.lm4 <- function(strata,
1145 1164
 	fns <- featureNames(object)[ii]
1146 1165
 	flags <- as.matrix(flags(object)[ii, batch.index])
1147 1166
 	fns.noflags <- fns[rowSums(flags, na.rm=T) == 0]
1167
+	if(length(fns.noflags) == 0){
1168
+		warning("All features in one of the probeset batches were flagged. The data is processed in probeset batches to reduce RAM, but the error suggests that increasing the size of the batches may help.  For example, ocProbesets(4*ocProbesets()) would increase the current size of the probeset batches by 4.")
1169
+		fns.noflags <- featureNames(object)[marker.index] ## allow processing to continue
1170
+	}
1148 1171
 	snp.index <- sample(match(fns.noflags, featureNames(object)), 10000)
1149 1172
 	##
1150 1173
 	N.AA <- as.matrix(N.AA(object)[snp.index, batch.index, drop=FALSE])
... ...
@@ -1260,7 +1283,7 @@ cnrma2 <- function(A, filenames, row.names, verbose=TRUE, seed=1, cdfName, sns){
1260 1283
 	pkgname <- getCrlmmAnnotationName(cdfName)
1261 1284
 	require(pkgname, character.only=TRUE) || stop("Package ", pkgname, " not available")
1262 1285
 	if (missing(sns)) sns <- basename(filenames)
1263
-	sampleBatches <- splitIndicesByNode(seq(along=filenames))
1286
+	sampleBatches <- splitIndicesByLength(seq_along(filenames), ocSamples()/getDoParWorkers())
1264 1287
 	if(verbose) message("Processing nonpolymorphic probes for ", length(filenames), " files.")
1265 1288
 	## updates A
1266 1289
 	ocLapply(sampleBatches,
... ...
@@ -1540,10 +1563,10 @@ shrinkSummary <- function(object,
1540 1563
 		marker.index <- whichMarkers(type[[1]], is.snp, is.autosome, is.annotated, is.X)
1541 1564
 	}
1542 1565
 	if(is.lds){
1543
-		##index.list <- splitIndicesByLength(marker.index, ocProbesets())
1544
-		if(parStatus()){
1545
-			index.list <- splitIndicesByNode(marker.index)
1546
-		} else index.list <- splitIndicesByLength(marker.index, ocProbesets())
1566
+		index.list <- splitIndicesByLength(marker.index, ocProbesets())
1567
+##		if(parStatus()){
1568
+##			index.list <- splitIndicesByNode(marker.index)
1569
+##		} else index.list <- splitIndicesByLength(marker.index, ocProbesets())
1547 1570
 		ocLapply(seq(along=index.list),
1548 1571
 			 shrinkGenotypeSummaries,
1549 1572
 			 index.list=index.list,
... ...
@@ -1594,10 +1617,10 @@ genotypeSummary <- function(object,
1594 1617
 	myf <- summaryFxn(type[[1]])
1595 1618
 	FUN <- get(myf)
1596 1619
 	if(is.lds){
1597
-		##index.list <- splitIndicesByLength(marker.index, ocProbesets())
1598
-		if(parStatus()){
1599
-			index.list <- splitIndicesByNode(marker.index)
1600
-		} else index.list <- splitIndicesByLength(marker.index, ocProbesets())
1620
+		index.list <- splitIndicesByLength(marker.index, ocProbesets())
1621
+##		if(parStatus()){
1622
+##			index.list <- splitIndicesByNode(marker.index)
1623
+##		} else index.list <- splitIndicesByLength(marker.index, ocProbesets())
1601 1624
 		ocLapply(seq(along=index.list),
1602 1625
 			 FUN,
1603 1626
 			 index.list=index.list,
... ...
@@ -2116,9 +2139,10 @@ estimateCnParameters <- function(object,
2116 2139
 	myfun <- lmFxn(type[[1]])
2117 2140
 	FUN <- get(myfun)
2118 2141
 	if(is.lds){
2119
-		if(parStatus()){
2120
-			index.list <- splitIndicesByNode(marker.index)
2121
-		} else index.list <- splitIndicesByLength(marker.index, ocProbesets())
2142
+		index.list <- splitIndicesByLength(marker.index, ocProbesets())
2143
+##		if(parStatus()){
2144
+##			index.list <- splitIndicesByNode(marker.index)
2145
+##		} else index.list <- splitIndicesByLength(marker.index, ocProbesets())
2122 2146
 		ocLapply(seq(along=index.list),
2123 2147
 			 FUN,
2124 2148
 			 index.list=index.list,
... ...
@@ -2784,89 +2808,73 @@ ABpanel <- function(x, y, predictRegion,
2784 2808
 	}
2785 2809
 }
2786 2810
 
2787
-calculateRTheta <- function(cnSet, genotype=c("AA", "AB", "BB"), batch.name){
2788
-	genotype <- match.arg(genotype)
2789
-	j <- match(batch.name, batchNames(cnSet))
2790
-	##centers <- medians(cnSet, i=snp.index, j)
2791
-	centers <- medians(cnSet, i=seq_len(nrow(cnSet)), j)
2792
-	theta <- matrix(NA, nrow(cnSet), 2)
2793
-	colnames(theta) <- c("theta", "R")
2794
-	x <- centers[, "A", genotype, j]
2795
-	y <- centers[, "B", genotype, j]
2796
-	## small imputed values -- should fix imputeCenter
2797
-	x[x < 64] <- 64
2798
-	y[y < 64] <- 64
2799
-	theta[, "theta"] <- atan2(y, x)*2/pi
2800
-##	if(any(is.na(theta[, "theta"]))){
2801
-##		##stop("NA's in thetas.  Can not compute theta / R values for batches with fewer than 10 samples")
2802
-##	}
2803
-	## so that 'R' is available for NP probes
2804
-	y[is.na(y)] <- 0
2805
-	theta[, "R"] <- x+y
2806
-	theta[!isSnp(cnSet), 1] <- 1
2807
-	if(any(is.na(theta[, "theta"]))){
2808
-		stop("NA's in thetas.  Can not compute theta / R values for batches with fewer than 10 samples")
2809
-	}
2810
-	return(theta)
2811
-}
2812
-
2813
-
2814
-
2815
-
2816
-calculateBAFandLRR <- function(cnSet) {
2817
-  ## Calculates B allele frequency and log2 R ratio for all snps for a given cnSet
2818
-  ## Returns a 3-D array of size Num Of Snps x Num of Samples x 2
2819
-  ## where result[,,1] is B allele frequency and result[,,2] is log2 R ratio
2820
-
2821
-  snp.index <- which(isSnp(cnSet))
2822
-  b <- (B(cnSet))[snp.index, ]
2823
-  a <- (A(cnSet))[snp.index, ]
2824
-  centers<-medians(cnSet, i=snp.index, j=1:length(unique(batch(cnSet))))
2825
-  bafAndLrr <- array(NA_integer_, dim=c(length(snp.index), length(sampleNames(cnSet)), 2), dimnames=list(featureNames(cnSet)[snp.index], sampleNames(cnSet),c("baf", "lrr")))
2826
-
2827
-  for (batch in unique(batch(cnSet))) {
2828
-
2829
-    ## get batch- and snp-specific centers
2830
-    center.aa.x <- centers[, 1, 1, batch]
2831
-    center.aa.y <- centers[, 2, 1, batch]
2832
-    center.ab.x <- centers[, 1, 2, batch]
2833
-    center.ab.y <- centers[, 2, 2, batch]
2834
-    center.bb.x <- centers[, 1, 3, batch]
2835
-    center.bb.y <- centers[, 2, 3, batch]
2836
-    theta.aa <- atan2(center.aa.y, center.aa.x) * 2 / pi
2837
-    r.aa <- center.aa.x + center.aa.y
2838
-    theta.ab <- atan2(center.ab.y, center.ab.x) * 2 / pi
2839
-    r.ab <- center.ab.x + center.ab.y
2840
-    theta.bb <- atan2(center.bb.y, center.bb.x) * 2 / pi
2841
-    r.bb <- center.bb.x + center.bb.y
2842
-
2843
-    index.sample <- which(batch(cnSet) %in% batch)
2844
-    for (i in index.sample) {
2845
-      theta <- atan2(b[, i], a[, i]) * 2 / pi
2846
-      r <- a[, i] + b[, i]
2847
-
2848
-      baf <- vector(mode="numeric", length=length(theta))
2849
-      baf[theta < theta.aa] <- 0
2850
-      baf[theta >= theta.bb] <- 1
2851
-      idx.ab <- which(theta >= theta.aa & theta < theta.ab)
2852
-      baf[idx.ab] <- .5 * (theta[idx.ab] - theta.aa[idx.ab]) / (theta.ab[idx.ab] - theta.aa[idx.ab])
2853
-      idx.bb <- which(theta >= theta.ab & theta < theta.bb)
2854
-      baf[idx.bb] <- .5 * (theta[idx.bb] - theta.ab[idx.bb]) / (theta.bb[idx.bb] - theta.ab[idx.bb]) + .5
2855
-      bafAndLrr[, i ,1] <- baf
2856
-
2857
-      r.expected <- vector(mode="numeric", length=length(r))
2858
-      r.expected[theta < theta.aa] <- r.aa[theta < theta.aa]
2859
-      r.expected[theta >= theta.bb] <- r.bb[theta >= theta.bb]
2860
-
2861
-      idx.ab <- which(theta >= theta.aa & theta < theta.ab)
2862
-      r.expected[idx.ab] <- (theta[idx.ab] - theta.aa[idx.ab]) * (r.ab[idx.ab]-r.aa[idx.ab]) /
2863
-        (theta.ab[idx.ab] - theta.aa[idx.ab]) + r.aa[idx.ab]
2864
-      idx.bb <- which(theta >= theta.ab & theta < theta.bb)
2865
-      r.expected[idx.bb] <- (theta[idx.bb] - theta.ab[idx.bb]) * (r.bb[idx.bb]-r.ab[idx.bb])/
2866
-        (theta.bb[idx.bb] - theta.ab[idx.bb]) + r.ab[idx.bb]
2867
-      bafAndLrr[, i, 2] <- log2(r / r.expected)
2868
-
2869
-    }
2870
-  }
2871
-  return(bafAndLrr)
2811
+calculateRTheta <- function(object, ##genotype=c("AA", "AB", "BB"),
2812
+			    batch.name,
2813
+			    feature.index){
2814
+	index.np <- which(!(isSnp(object)[feature.index]))
2815
+	j <- match(batch.name, batchNames(object))
2816
+	if(missing(j)) stop("batch.name did not match in batchNames(object)")
2817
+	CHR <- unique(chromosome(object)[feature.index])
2818
+	isX <- CHR == "X"
2819
+	centers <- getMedians(batchStatistics(object))
2820
+	lapply(centers, open)
2821
+	centersMatrix <- lapply(centers, function(x, i, j) x[i, j], j=j, i=feature.index)
2822
+	lapply(centers, close)
2823
+	centersMatrix <- do.call(cbind, centersMatrix)
2824
+	centersA <- centersMatrix[, 1:3]
2825
+	centersB <- centersMatrix[, 4:6]
2826
+	rm(centers, centersMatrix); gc()
2827
+	centersA[centersA < 64] <- 64
2828
+	centersB[centersB < 64] <- 64
2829
+	theta <- atan2(centersB, centersA) * 2/pi
2830
+	centersA[is.na(centersA)] <- 0
2831
+	centersB[is.na(centersB)] <- 0
2832
+	if(length(index.np) > 0) theta[index.np, ] <- 1
2833
+	##
2834
+	r <- centersA+centersB
2835
+	J <- which(batch(object)==batch.name)
2836
+	open(A(object))
2837
+	open(B(object))
2838
+	a <- as.matrix(A(object)[feature.index, J, drop=FALSE])
2839
+	b <- as.matrix(B(object)[feature.index, J, drop=FALSE])
2840
+	close(A(object))
2841
+	close(B(object))
2842
+	if(length(index.np) > 0) b[index.np, ] <- 0L
2843
+	obs.theta <- atan2(b, a)*2/pi
2844
+	calculateBandR <- function(o, r, theta, not.na, M){
2845
+		lessAA <- o < theta[, 1]
2846
+		lessAB <- o < theta[, 2]
2847
+		lessBB <- o < theta[, 3]
2848
+		ii <- which(lessAA & not.na)
2849
+		jj <- which(!lessBB & not.na)
2850
+		i <- which(!lessAA & lessAB & not.na)
2851
+		j <- which(!lessAB & lessBB & not.na)
2852
+		M[i, 1] <- 0.5*(o[i]-theta[i,1])/(theta[i,2]-theta[i,1])
2853
+		M[j, 1] <- 0.5*(o[j]-theta[j,2])/(theta[j,3]-theta[j,2]) + 0.5
2854
+		M[ii, 1] <- 0
2855
+		M[jj, 1] <- 1
2856
+		M[i, 2] <- (o[i]-theta[i,1])*(r[i,2]-r[i,1])/(theta[i,2]-theta[i,1]) + r[i, 1]
2857
+		M[j, 2] <- (o[j]-theta[j,2])*(r[j,3]-r[j,2])/(theta[j,3]-theta[j,2]) + r[j, 2]
2858
+		M[ii, 2] <- r[ii, 1]
2859
+		M[jj, 2] <- r[jj, 3]
2860
+		return(M)
2861
+	}
2862
+	not.na <- !is.na(theta[,1])
2863
+	r.expected <- bf <- matrix(NA, length(feature.index), ncol(a))
2864
+	M <- matrix(NA, length(feature.index), 2)
2865
+	for(j in seq_len(ncol(a))){
2866
+		tmp <- calculateBandR(obs.theta[, j], r=r, theta=theta, not.na=not.na, M=M)
2867
+		bf[, j] <- tmp[,1]
2868
+		r.expected[,j] <- tmp[,2]
2869
+	}
2870
+	bf[bf < 0] <- 0
2871
+	bf[bf > 1] <- 1
2872
+	if(length(index.np) > 0) r.expected[index.np, ] <- centersA[index.np, 1]
2873
+	obs.r <- a+b
2874
+	lrr <- log2(obs.r/r.expected)
2875
+	dimnames(bf) <- dimnames(lrr) <- list(featureNames(object)[feature.index],
2876
+					      sampleNames(object)[J])
2877
+	bf <- integerMatrix(bf, 1000)
2878
+	lrr <- integerMatrix(lrr, 100)
2879
+	return(list(baf=bf, lrr=lrr))
2872 2880
 }
... ...
@@ -7,6 +7,7 @@ crlmm <- function(filenames, row.names=TRUE, col.names=TRUE,
7 7
   if ((load.it | save.it) & missing(intensityFile))
8 8
 	  stop("'intensityFile' is missing, and you chose either load.it or save.it")
9 9
   if (missing(sns)) sns <- basename(filenames)
10
+  if (any(duplicated(sns))) stop("sample identifiers are not unique")
10 11
   if (!missing(intensityFile))
11 12
     if (load.it & !file.exists(intensityFile)){
12 13
       load.it <- FALSE
... ...
@@ -312,24 +313,26 @@ crlmm2 <- function(filenames, row.names=TRUE, col.names=TRUE,
312 313
 }
313 314
 
314 315
 imputeGender <- function(A, B, XIndex, YIndex, SNR, SNRMin){
315
-	##XMedian <- apply(log2(A[XIndex,,
316
-	##drop=FALSE])+log2(B[XIndex,, drop=FALSE]), 2, median)/2
317
-	a <- log2(A[XIndex,,drop=FALSE])
318
-	b <- log2(B[XIndex,,drop=FALSE])
319
-	meds.X <- (apply(a+b, 2, median))/2
320
-	##YIndex <- which(isSnp(gtSet) & chromosome(gtSet)==24)
321
-	a <- log2(A[YIndex,,drop=FALSE])
322
-	b <- log2(B[YIndex,,drop=FALSE])
323
-	meds.Y <- (apply(a+b, 2, median))/2
324
-	R <- meds.X - meds.Y
325
-	##SNR <- gtSet$SNR[]
326
-	##SNRMin=5
327
-	if(sum(SNR > SNRMin) == 1){
328
-		##gender <- which.min(c(abs(XMedian-8.9), abs(XMedian-9.5)))
329
-		gender <- ifelse(R[SNR[] > SNRMin] > 0.5, 2L, 1L)
330
-	} else{
331
-		##gender <- kmeans(XMedian, c(min(XMedian[SNR>SNRMin]), max(XMedian[SNR>SNRMin])))[["cluster"]]
332
-		gender <- kmeans(R, c(min(R[SNR[]>SNRMin]), max(R[SNR[]>SNRMin])))[["cluster"]]
316
+	if(length(YIndex) > 0){
317
+		a <- log2(A[XIndex,,drop=FALSE])
318
+		b <- log2(B[XIndex,,drop=FALSE])
319
+		meds.X <- (apply(a+b, 2, median))/2
320
+		a <- log2(A[YIndex,,drop=FALSE])
321
+		b <- log2(B[YIndex,,drop=FALSE])
322
+		meds.Y <- (apply(a+b, 2, median))/2
323
+		R <- meds.X - meds.Y
324
+		if(sum(SNR > SNRMin) == 1){
325
+			gender <- ifelse(R[SNR[] > SNRMin] > 0.5, 2L, 1L)
326
+		} else{
327
+			gender <- kmeans(R, c(min(R[SNR[]>SNRMin]), max(R[SNR[]>SNRMin])))[["cluster"]]
328
+		}
329
+	} else {
330
+		XMedian <- apply(log2(A[XIndex,,drop=FALSE])+log2(B[XIndex,, drop=FALSE]), 2, median)/2
331
+		if(sum(SNR > SNRMin) == 1){
332
+			gender <- which.min(c(abs(XMedian-8.9), abs(XMedian-9.5)))
333
+		} else{
334
+			gender <- kmeans(XMedian, c(min(XMedian[SNR[]>SNRMin]), max(XMedian[SNR[]>SNRMin])))[["cluster"]]
335
+		}
333 336
 	}
334 337
 	return(gender)
335 338
 }
... ...
@@ -1,4 +1,4 @@
1
-# function below works OK provided all .idat files are in the current working directory
1
+					# function below works OK provided all .idat files are in the current working directory
2 2
 # - could add an option to allow files in Illumina directory structure to be handled
3 3
 # or to use the optional 'Path' column in sampleSheet
4 4
 # - there is a lot of header information that is currently discarded - could try and store this somewhere in the resulting NChannelSet
... ...
@@ -36,7 +36,7 @@ readIdatFiles = function(sampleSheet=NULL,
36 36
 		       }
37 37
 	       }
38 38
 	       pd = new("AnnotatedDataFrame", data = sampleSheet)
39
-	       sampleNames(pd) = basename(arrayNames)
39
+	       sampleNames(pd) = make.unique(basename(arrayNames))
40 40
        }
41 41
        if(is.null(arrayNames)) {
42 42
                arrayNames = gsub(paste(sep, fileExt$green, sep=""), "", dir(pattern=fileExt$green, path=path))
... ...
@@ -95,7 +95,7 @@ readIdatFiles = function(sampleSheet=NULL,
95 95
 	       if(i==1) {
96 96
 		       if(is.null(ids) && !is.null(G)){
97 97
 			       ids = idsG
98
-		       } else stop("Could not find probe IDs")
98
+		       } # else stop("Could not find probe IDs")
99 99
 		       nprobes = length(ids)
100 100
 		       narrays = length(arrayNames)
101 101
 		       RG = new("NChannelSet",
... ...
@@ -106,8 +106,8 @@ readIdatFiles = function(sampleSheet=NULL,
106 106
 				 phenoData=pd, storage.mode="environment")
107 107
 		       featureNames(RG) = ids
108 108
 		       if(!is.null(sampleSheet) && !is.null(sampleSheet$Sample_ID)){
109
-		            sampleNames(RG) = sampleSheet$Sample_ID
110
-		       } else  sampleNames(RG) = arrayNames
109
+		            sampleNames(RG) = make.unique(sampleSheet$Sample_ID)
110
+		       } else  sampleNames(RG) = make.unique(basename(arrayNames))
111 111
 		       gc(verbose=FALSE)
112 112
 	       }
113 113
 	       if(length(ids)==length(idsG)) {
... ...
@@ -524,7 +524,6 @@ readGenCallOutput = function(file, path=".", cdfName,
524 524
 
525 525
 
526 526
 RGtoXY = function(RG, chipType, verbose=TRUE) {
527
-
528 527
   needToLoad <- !all(sapply(c('addressA', 'addressB', 'base'), isLoaded))
529 528
   if(needToLoad){
530 529
 	  chipList = c("human1mv1c",# 1M
... ...
@@ -537,12 +536,14 @@ RGtoXY = function(RG, chipType, verbose=TRUE) {
537 536
 	  "human1mduov3b",          # 1M Duo
538 537
 	  "humanomni1quadv1b",      # Omni1 quad
539 538
 	  "humanomni25quadv1b",     # Omni2.5 quad
539
+	  "humanomni258v1a",        # Omni2.5 8 sample
540 540
 	  "humanomniexpress12v1b",  # Omni express 12
541 541
 	  "humanimmuno12v1b",       # Immuno chip 12
542 542
           "humancytosnp12v2p1h")    # CytoSNP 12
543
+	  ## RS: added cleancdfname()
543 544
 	  if(missing(chipType)){
544
-		  chipType = match.arg(annotation(RG), chipList)
545
-	  } else chipType = match.arg(chipType, chipList)
545
+		  chipType = match.arg(cleancdfname(annotation(RG)), chipList)
546
+	  } else chipType = match.arg(cleancdfname(chipType), chipList)
546 547
 	  pkgname = getCrlmmAnnotationName(chipType)
547 548
 	  if(!require(pkgname, character.only=TRUE, quietly=!verbose)){
548 549
 		  suggCall = paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="")
... ...
@@ -1106,7 +1107,7 @@ getProtocolData.Illumina = function(filenames, sep="_", fileExt="Grn.idat", verb
1106 1107
                          Position = rep(NA, narrays))
1107 1108
 
1108 1109
        scanDates = data.frame(ScanDate=rep(NA, narrays), DecodeDate=rep(NA, narrays))
1109
-       rownames(scanDates) = gsub(paste(sep, fileExt, sep=""), "", filenames)
1110
+       rownames(scanDates) <- make.unique(gsub(paste(sep, fileExt, sep=""), "", filenames))
1110 1111
        ## read in the data
1111 1112
        for(i in seq_along(filenames)) {
1112 1113
                if(verbose)
... ...
@@ -1132,7 +1133,7 @@ getProtocolData.Illumina = function(filenames, sep="_", fileExt="Grn.idat", verb
1132 1133
 			    data=scanDates,
1133 1134
 			    varMetadata=data.frame(labelDescription=colnames(scanDates),
1134 1135
 			                           row.names=colnames(scanDates)))
1135
-	return(protocoldata)
1136
+       return(protocoldata)
1136 1137
 }
1137 1138
 
1138 1139
 
... ...
@@ -1171,7 +1172,7 @@ constructInf <- function(sampleSheet=NULL,
1171 1172
 			}
1172 1173
 		}
1173 1174
 		pd = new("AnnotatedDataFrame", data = sampleSheet)
1174
-		sampleNames(pd) = basename(arrayNames)
1175
+		sampleNames(pd) = make.unique(basename(arrayNames))
1175 1176
 	}
1176 1177
 	if(is.null(arrayNames)) {
1177 1178
 		arrayNames = gsub(paste(sep, fileExt$green, sep=""), "", dir(pattern=fileExt$green, path=path))
... ...
@@ -1210,17 +1211,29 @@ constructInf <- function(sampleSheet=NULL,
1210 1211
 	if(verbose) message("Initializing container for genotyping and copy number estimation")
1211 1212
 	featureData = getFeatureData(cdfName, copynumber=TRUE)
1212 1213
 	nr = nrow(featureData); nc = narrays
1214
+	sns <-  if (!is.null(sampleSheet) && !is.null(sampleSheet$Sample_ID)) {
1215
+		make.unique(sampleSheet$Sample_ID)
1216
+        } else{
1217
+		make.unique(basename(arrayNames))
1218
+	}
1219
+	biga <- initializeBigMatrix(name="A", nr, nc)
1220
+	bigb <- initializeBigMatrix(name="B", nr, nc)
1221
+	bigc <- initializeBigMatrix(name="call", nr, nc)
1222
+	bigd <- initializeBigMatrix(name="callPr", nr,nc)
1223
+	colnames(biga) <- colnames(bigb) <- colnames(bigc) <- colnames(bigd) <- sns
1213 1224
 	cnSet <- new("CNSet",
1214
-		     alleleA=initializeBigMatrix(name="A", nr, nc),
1215
-		     alleleB=initializeBigMatrix(name="B", nr, nc),
1216
-		     call=initializeBigMatrix(name="call", nr, nc),
1217
-		     callProbability=initializeBigMatrix(name="callPr", nr,nc),
1225
+		     alleleA=biga,
1226
+		     alleleB=bigb,
1227
+		     call=bigc,
1228
+		     callProbability=bigd,
1218 1229
 		     annotation=cdfName,
1219 1230
 		     featureData=featureData,
1220 1231
 		     batch=batch)
1221
-        if (!is.null(sampleSheet) && !is.null(sampleSheet$Sample_ID)) {
1222
-	   	sampleNames(cnSet) = sampleSheet$Sample_ID
1223
-        } else  sampleNames(cnSet) = arrayNames
1232
+##        if (!is.null(sampleSheet) && !is.null(sampleSheet$Sample_ID)) {
1233
+##		sampleNames(cnSet) = make.unique(sampleSheet$Sample_ID)
1234
+##        } else{
1235
+##		sampleNames(cnSet) <- make.unique(basename(arrayNames))
1236
+##	}
1224 1237
 	if(saveDate){
1225 1238
 		protocolData = getProtocolData.Illumina(grnidats, sep=sep, fileExt=fileExt$green, verbose=verbose)
1226 1239
 	} else{
... ...
@@ -1233,7 +1246,7 @@ constructInf <- function(sampleSheet=NULL,
1233 1246
 	cnSet$gender <- initializeBigVector("gender", ncol(cnSet), vmode="integer")
1234 1247
 	cnSet$SNR = initializeBigVector("crlmmSNR-", ncol(cnSet), "double")
1235 1248
 	cnSet$SKW = initializeBigVector("crlmmSKW-", ncol(cnSet), "double")
1236
-	sampleNames(cnSet) <- basename(sampleNames(cnSet))
1249
+	##sampleNames(cnSet) <- basename(sampleNames(cnSet))
1237 1250
 	return(cnSet)
1238 1251
 }
1239 1252
 construct.Illumina <- constructInf
... ...
@@ -1263,8 +1276,7 @@ preprocessInf <- function(cnSet,
1263 1276
 	is.snp = isSnp(cnSet)
1264 1277
 	snp.index = which(is.snp)
1265 1278
 	narrays = ncol(cnSet)
1266
-#	sampleBatches <- splitIndicesByLength(seq(length=ncol(cnSet)), ocSamples())
1267
-	sampleBatches <- splitIndicesByNode(seq(length=ncol(cnSet)))
1279
+	sampleBatches <- splitIndicesByLength(seq_len(ncol(cnSet)), ocSamples()/getDoParWorkers())
1268 1280
 	mixtureParams = initializeBigMatrix("crlmmMixt-", 4, narrays, "double")
1269 1281
 	ocLapply(seq_along(sampleBatches),
1270 1282
 		 crlmm:::processIDAT,
... ...
@@ -50,6 +50,21 @@ setMethod("medians", signature(object="AssayData"),
50 50
 		  return(res)
51 51
 })
52 52
 
53
+getMedians <- function(object){
54
+	medianA.AA <- assayDataElement(object, "medianA.AA")
55
+	medianA.AB <- assayDataElement(object, "medianA.AB")
56
+	medianA.BB <- assayDataElement(object, "medianA.BB")
57
+	medianB.AA <- assayDataElement(object, "medianB.AA")
58
+	medianB.AB <- assayDataElement(object, "medianB.AB")
59
+	medianB.BB <- assayDataElement(object, "medianB.BB")
60
+	list(A.AA=medianA.AA,
61
+	     A.AB=medianA.AB,
62
+	     A.BB=medianA.BB,
63
+	     B.AA=medianB.AA,
64
+	     B.AB=medianB.AB,
65
+	     B.BB=medianB.BB)
66
+}
67
+
53 68
 setMethod("mads", signature(object="AssayData"),
54 69
 	  function(object, ...){
55 70
 		  batchnames <- sampleNames(object)
... ...
@@ -1,63 +1,6 @@
1 1
 setMethod("posteriorMean", signature(object="CNSet"), function(object) assayDataElement(object, "posteriorMean"))
2 2
 setReplaceMethod("posteriorMean", signature(object="CNSet", value="matrix"), function(object, value) assayDataElementReplace(object, "posteriorMean", value))
3 3
 
4
-setAs("CNSet", "oligoSnpSet", function(from, to){
5
-	cnSet2oligoSnpSet(from)
6
-})
7
-
8
-cnSet2oligoSnpSet <- function(object){
9
-	row.index <- seq_len(nrow(object))
10
-	col.index <- seq_len(ncol(object))
11
-	is.lds <- ifelse(is(calls(object), "ff_matrix") | is(calls(object), "ffdf"), TRUE, FALSE)
12
-	if(is.lds) stopifnot(isPackageLoaded("ff"))
13
-	b.r <- calculateRBaf(object)
14
-	b <- integerMatrix(b.r[[1]], 1000)
15
-	r <- integerMatrix(b.r[[2]], 100)
16
-##	if(is.lds){
17
-##		## initialize a big matrix for raw copy number
18
-##		message("creating an ff object for storing total copy number")
19
-##		tcn <- initializeBigMatrix(name="total_cn", nrow(object), ncol(object), vmode="double")
20
-##		for(j in 1:ncol(object)){
21
-##			tcn[, j] <- totalCopynumber(object, i=row.index, j=j)
22
-##		}
23
-##	} else {
24
-##		if(ncol(object) > 5){
25
-##			##this can be memory intensive, so we try to be careful
26
-##			col.index <- splitIndicesByLength(seq(length=ncol(object)), 5)
27
-##			tcn <- matrix(NA, nrow(object), ncol(object))
28
-##			dimnames(tcn) <- list(featureNames(object), sampleNames(object))
29
-##			rows <- 1:nrow(object)
30
-##			for(i in seq_along(col.index)){
31
-##				cat(".")
32
-##				j <- col.index[[i]]
33
-##				cnSet <- object[, j]
34
-##				tcn[, j] <- totalCopynumber(cnSet, i=row.index, j=1:ncol(cnSet))
35
-##				rm(cnSet); gc()
36
-##			}
37
-##			cat("\n")
38
-##		} else {
39
-##			tcn <- totalCopynumber(object, i=row.index, j=col.index)
40
-##		}
41
-##	}
42
-##	message("Transforming copy number to log2 scale")
43
-##	tcn[tcn < 0.1] <- 0.1
44
-##	tcn[tcn > 8] <- 8
45
-##	log.tcn <- log2(tcn)
46
-	tmp <- new("oligoSnpSet",
47
-		   copyNumber=integerMatrix(b.r[[2]], scale=100),
48
-		   call=calls(object),
49
-		   callProbability=snpCallProbability(object),
50
-		   annotation=annotation(object),
51
-		   featureData=featureData(object),
52
-		   phenoData=phenoData(object),
53
-		   experimentData=experimentData(object),
54
-		   protocolData=protocolData(object),
55
-		   is.integer=FALSE)
56
-	tmp <- assayDataElementReplace(tmp, "baf", integerMatrix(b.r[[1]], scale=1000))
57
-	return(tmp)
58
-}
59
-
60
-
61 4
 linearParamElementReplace <- function(obj, elt, value) {
62 5
     storage.mode <- storageMode(batchStatistics(obj))
63 6
     switch(storage.mode,
... ...
@@ -185,7 +128,7 @@ ACN <- function(object, allele, i , j){
185 128
 	}
186 129
 	## Define batch.index and sample.index
187 130
 	if(!missing.j) {
188
-		batches <- unique(as.character(batch(object))[j])
131
+		batches <- unique(as.character(batch(object)[j]))
189 132
 		##batches <- as.character(batch(object)[j])
190 133
 		batch.index <- match(batches, batchNames(object))
191 134
 	} else {
... ...
@@ -587,78 +530,129 @@ setMethod("xyplot", signature(x="formula", data="CNSet"),
587 530
 })
588 531
 
589 532
 setMethod("calculateRBaf", signature(object="CNSet"),
590
-	  function(object, batch.name){
591
-		  all.autosomes <- all(chromosome(object) < 23)
592
-		  if(!all.autosomes){
593
-			  stop("method currently only defined for chromosomes 1-22")
594
-		  }
595
-		  if(missing(batch.name)) batch.name <- batchNames(object)[1]
596
-		  stopifnot(batch.name %in% batchNames(object))
597
-		  if(length(batch.name) > 1){
598
-			  warning("only the first batch in batch.name processed")
599
-			  batch.name <- batch.name[1]
600
-		  }
601
-		  RTheta.aa <- calculateRTheta(object, "AA", batch.name)
602
-		  RTheta.ab <- calculateRTheta(object, "AB", batch.name)
603
-		  RTheta.bb <- calculateRTheta(object, "BB", batch.name)
604
-
605
-		  J <- which(batch(object) == batch.name)
606
-
607
-		  theta.aa <- matrix(RTheta.aa[, "theta"], nrow(object), length(J), byrow=FALSE)
608
-		  theta.ab <- matrix(RTheta.ab[, "theta"], nrow(object), length(J), byrow=FALSE)
609
-		  theta.bb <- matrix(RTheta.bb[, "theta"], nrow(object), length(J), byrow=FALSE)
610
-
611
-		  a <- A(object)[, J, drop=FALSE]
612
-		  b <- B(object)[, J, drop=FALSE] ## NA's for b where nonpolymorphic
613
-		  is.np <- !isSnp(object)
614
-		  ##b[is.np, ] <- a[is.np, ]
615
-		  b[is.np, ] <- 0L
616
-		  dns <- dimnames(a)
617
-		  dimnames(a) <- dimnames(b) <- NULL
618
-		  obs.theta <- atan2(b, a)*2/pi
619
-
620
-		  lessAA <- obs.theta < theta.aa
621
-		  lessAB <- obs.theta < theta.ab
622
-		  lessBB <- obs.theta < theta.bb
623
-		  grAA <- !lessAA
624
-		  grAB <- !lessAB
625
-		  grBB <- !lessBB
626
-		  not.na <- !is.na(theta.aa)
627
-		  I1 <- grAA & lessAB & not.na
628
-		  I2 <- grAB & lessBB & not.na
629
-
630
-		  bf <- matrix(NA, nrow(object), ncol(a))
631
-		  bf[I1] <- 0.5 * ((obs.theta-theta.aa)/(theta.ab-theta.aa))[I1]
632
-		  bf[I2] <- (.5 * (obs.theta - theta.ab) / (theta.bb - theta.ab))[I2] + 0.5
633
-		  bf[lessAA] <- 0
634
-		  bf[grBB] <- 1
635
-
636
-		  r.expected <- matrix(NA, nrow(object), ncol(a))
637
-		  r.aa <- matrix(RTheta.aa[, "R"], nrow(object), length(J), byrow=FALSE)
638
-		  r.ab <- matrix(RTheta.ab[, "R"], nrow(object), length(J), byrow=FALSE)
639
-		  r.bb <- matrix(RTheta.bb[, "R"], nrow(object), length(J), byrow=FALSE)
640
-		  rm(RTheta.aa, RTheta.ab, RTheta.bb); gc()
641
-		  obs.r <- a+b
642
-
643
-		  lessAA <- lessAA & not.na
644
-		  grBB <- grBB & not.na
645
-		  tmp <- ((obs.theta - theta.aa) * (r.ab-r.aa)/(theta.ab-theta.aa))[I1] + r.aa[I1]
646
-		  r.expected[I1] <- tmp
647
-		  tmp <- ((obs.theta - theta.ab) * (r.bb - r.ab)/(theta.bb-theta.ab))[I2] + r.ab[I2]
648
-		  r.expected[I2] <- tmp
649
-		  r.expected[lessAA] <- r.aa[lessAA]
650
-		  r.expected[grBB] <- r.bb[grBB]
533
+	  function(object, batch.name, chrom){
534
+		  calculateRBafCNSet(object, batch.name, chrom)
535
+	  })
651 536
 
652
-		  index.np <- which(is.np)
653
-		  if(length(index.np) > 0){
654
-			  a.np <- A(object)[index.np, J]
655
-			  meds <- rowMedians(a.np, na.rm=TRUE)
656
-			  r.expected[index.np, ] <- matrix(meds, length(index.np), ncol(a.np))
657
-		  }
658
-		  lrr <- log2(obs.r/r.expected)
537
+calculateRBafCNSet <- function(object, batch.name, chrom){
538
+	if(missing(batch.name)) batch.name <- batchNames(object)
539
+	if(missing(chrom)) chrom <- unique(chromosome(object))
540
+	if(!(all(batch.name %in% batchNames(object)))) stop("batch.name must be belong to batchNames(object)")
541
+	chr <- chromosome(object)
542
+	valid.chrs <- chr <= 23 & chr %in% chrom
543
+	index <- which(valid.chrs)
544
+	indexlist <- split(index, chr[index])
545
+	J <- which(batch(object) %in% batch.name)
546
+	sns <- sampleNames(object)[J]
547
+	sampleindex <- split(J, batch(object)[J])
548
+	if(!all(valid.chrs)) warning("Only computing log R ratios and BAFs for autosomes and chr X")
549
+	## if ff package is loaded, these will be ff objects
550
+	chr <- names(indexlist)
551
+	rlist <- blist <- vector("list", length(indexlist))
552
+	for(i in seq_along(indexlist)){
553
+		I <- indexlist[[i]]
554
+		nr <- length(I)
555
+		CHR <- names(indexlist)[i]
556
+		bafname <- paste("baf_chr", CHR, sep="")
557
+		rname <- paste("lrr_chr", CHR, sep="")
558
+		bmatrix <- initializeBigMatrix(bafname, nr=nr, nc=length(sns), vmode="integer")
559
+		rmatrix <- initializeBigMatrix(rname, nr=nr, nc=length(sns), vmode="integer")
560
+		colnames(rmatrix) <- colnames(bmatrix) <- sns
561
+		## put rownames in order of physical position
562
+		ix <- order(position(object)[I])
563
+		I <- I[ix]
564
+		rownames(rmatrix) <- rownames(bmatrix) <- featureNames(object)[I]
565
+		for(j in seq_along(sampleindex)){
566
+			bname <- batch.name[j]
567
+			J <- sampleindex[[j]]
568
+			res <- calculateRTheta(object=object,
569
+					       batch.name=bname,
570
+					       feature.index=I)
571
+			k <- match(sampleNames(object)[J], sns)
572
+			bmatrix[, k] <- res[["baf"]]
573
+			rmatrix[, k] <- res[["lrr"]]
574
+			##colnames(bmatrix)[J] <- colnames(rmatrix)[J] <- sampleNames(object)[J]
575
+		}
576
+		blist[[i]] <- bmatrix
577
+		rlist[[i]] <- rmatrix
578
+	}
579
+	res <- list(baf=blist,
580
+		    lrr=rlist)
581
+	return(res)
582
+}
659 583
 
660
-		  dimnames(bf) <- dimnames(lrr) <- dns
661
-		  res <- list(baf=bf,
662
-			      lrr=lrr)
663
-		  return(res)
664
-	  })
584
+##setMethod("calculateRBaf", signature(object="CNSet"),
585
+##	  function(object, batch.name){
586
+##		  all.autosomes <- all(chromosome(object) < 23)
587
+##		  if(!all.autosomes){
588
+##			  stop("method currently only defined for chromosomes 1-22")
589
+##		  }
590
+##		  if(missing(batch.name)) batch.name <- batchNames(object)[1]
591
+##		  stopifnot(batch.name %in% batchNames(object))
592
+##		  if(length(batch.name) > 1){
593
+##			  warning("only the first batch in batch.name processed")
594
+##			  batch.name <- batch.name[1]
595
+##		  }
596
+##		  RTheta.aa <- calculateRTheta(object, "AA", batch.name)
597
+##		  RTheta.ab <- calculateRTheta(object, "AB", batch.name)
598
+##		  RTheta.bb <- calculateRTheta(object, "BB", batch.name)
599
+##
600
+##		  J <- which(batch(object) == batch.name)
601
+##
602
+##		  theta.aa <- matrix(RTheta.aa[, "theta"], nrow(object), length(J), byrow=FALSE)
603
+##		  theta.ab <- matrix(RTheta.ab[, "theta"], nrow(object), length(J), byrow=FALSE)
604
+##		  theta.bb <- matrix(RTheta.bb[, "theta"], nrow(object), length(J), byrow=FALSE)
605
+##
606
+##		  a <- A(object)[, J, drop=FALSE]
607
+##		  b <- B(object)[, J, drop=FALSE] ## NA's for b where nonpolymorphic
608
+##		  is.np <- !isSnp(object)
609
+##		  ##b[is.np, ] <- a[is.np, ]
610
+##		  b[is.np, ] <- 0L
611
+##		  dns <- dimnames(a)
612
+##		  dimnames(a) <- dimnames(b) <- NULL
613
+##		  obs.theta <- atan2(b, a)*2/pi
614
+##
615
+##		  lessAA <- obs.theta < theta.aa
616
+##		  lessAB <- obs.theta < theta.ab
617
+##		  lessBB <- obs.theta < theta.bb
618
+##		  grAA <- !lessAA
619
+##		  grAB <- !lessAB
620
+##		  grBB <- !lessBB
621
+##		  not.na <- !is.na(theta.aa)
622
+##		  I1 <- grAA & lessAB & not.na
623
+##		  I2 <- grAB & lessBB & not.na
624
+##
625
+##		  bf <- matrix(NA, nrow(object), ncol(a))
626
+##		  bf[I1] <- 0.5 * ((obs.theta-theta.aa)/(theta.ab-theta.aa))[I1]
627
+##		  bf[I2] <- (.5 * (obs.theta - theta.ab) / (theta.bb - theta.ab))[I2] + 0.5
628
+##		  bf[lessAA] <- 0
629
+##		  bf[grBB] <- 1
630
+##
631
+##		  r.expected <- matrix(NA, nrow(object), ncol(a))
632
+##		  r.aa <- matrix(RTheta.aa[, "R"], nrow(object), length(J), byrow=FALSE)
633
+##		  r.ab <- matrix(RTheta.ab[, "R"], nrow(object), length(J), byrow=FALSE)
634
+##		  r.bb <- matrix(RTheta.bb[, "R"], nrow(object), length(J), byrow=FALSE)
635
+##		  rm(RTheta.aa, RTheta.ab, RTheta.bb); gc()
636
+##		  obs.r <- a+b
637
+##
638
+##		  lessAA <- lessAA & not.na
639
+##		  grBB <- grBB & not.na
640
+##		  tmp <- ((obs.theta - theta.aa) * (r.ab-r.aa)/(theta.ab-theta.aa))[I1] + r.aa[I1]
641
+##		  r.expected[I1] <- tmp
642
+##		  tmp <- ((obs.theta - theta.ab) * (r.bb - r.ab)/(theta.bb-theta.ab))[I2] + r.ab[I2]
643
+##		  r.expected[I2] <- tmp
644
+##		  r.expected[lessAA] <- r.aa[lessAA]
645
+##		  r.expected[grBB] <- r.bb[grBB]
646
+##		  index.np <- which(is.np)
647
+##		  if(length(index.np) > 0){
648
+##			  a.np <- A(object)[index.np, J, drop=FALSE]
649
+##			  meds <- rowMedians(a.np, na.rm=TRUE)
650
+##			  r.expected[index.np, ] <- matrix(meds, length(index.np), ncol(a.np))
651
+##		  }
652
+##		  lrr <- log2(obs.r/r.expected)
653
+##
654
+##		  dimnames(bf) <- dimnames(lrr) <- dns
655
+##		  res <- list(baf=bf,
656
+##			      lrr=lrr)
657
+##		  return(res)
658
+##	  })
... ...
@@ -152,30 +152,39 @@ celDates <- function(celfiles){
152 152
 	return(celdts)
153 153
 }
154 154
 
155
-validCdfNames <- function(){
155
+illuminaCdfNames <- function(){
156
+	c("human1mv1c",# 1M
157
+	  "human370v1c",            # 370CNV
158
+	  "human650v3a",            # 650Y
159
+	  "human610quadv1b",        # 610 quad
160
+	  "human660quadv1a",        # 660 quad
161
+	  "human370quadv3c",        # 370CNV quad
162
+	  "human550v3b",            # 550K
163
+	  "human1mduov3b",          # 1M Duo
164
+	  "humanomni1quadv1b",      # Omni1 quad
165
+	  "humanomni25quadv1b",     # Omni2.5 quad
166
+	  "humanomni258v1a",        # Omni2.5 8 sample
167
+	  "humanomniexpress12v1b",  # Omni express 12
168
+	  "humanimmuno12v1b",       # Immuno chip 12
169
+	  "humancytosnp12v2p1h")    # CytoSNP 12
170
+}
171
+
172
+affyCdfNames <- function(){
156 173
 	c("genomewidesnp6",
157
-	  "genomewidesnp5",
158
-	  "human370v1c",
159
-	  "human370quadv3c",
160
-	  "human550v3b",
161
-	  "human650v3a",
162
-	  "human610quadv1b",
163
-	  "human660quadv1a",
164
-	  "human1mduov3b",
165
-	  "humanomni1quadv1b",
166
-          "humanomniexpress12v1b",
167
-	  "humanomni25quadv1b",
168
-          "humanimmuno12v1b",
169
-          "humancytosnp12v2p1h")
174
+	  "genomewidesnp5")
170 175
 }
176
+
177
+validCdfNames <- function(){
178
+	c(affyCdfNames(),
179
+	  illuminaCdfNames())
180
+}
181
+
182
+cleancdfname <- function(x) strsplit(x, "Crlmm")[[1]][[1]]
183
+
171 184
 isValidCdfName <- function(cdfName){
172 185
 	chipList <- validCdfNames()
173
-	result <- cdfName %in% chipList
174
-	if(!(result)){
175
-		warning("cdfName must be one of the following: ",
176
-			chipList)
177
-	}
178
-	return(result)
186
+	match.arg(cleancdfname(cdfName), chipList)
187
+	return(TRUE)
179 188
 }
180 189
 
181 190
 isPackageLoaded <- function(pkg){
182 191
Binary files a/data/cnSetExample.rda and b/data/cnSetExample.rda differ
183 192
Binary files a/data/cnSetExample2.rda and b/data/cnSetExample2.rda differ
... ...
@@ -38,12 +38,12 @@ citEntry(entry="article",
38 38
 		 "Aravinda Chakravarti and",
39 39
 		 "Rafael A Irizarry"),
40 40
   journal = "Biostatistics",
41
-  year = "2010",
42
-  volume="",
43
-  pages="",
41
+  year = "2011",
42
+  volume="12",
43
+  pages="33--50",
44 44
   textVersion = paste("Scharpf RB, Ruczinski I, Carvalho B, Doan B, Chakravarti A, Irizarry RA",
45 45
                       "A multilevel model to address batch effects in copy number estimation using SNP arrays",
46
-		      "Biostatistics. 2010."))
46
+		      "Biostatistics. 2011."))
47 47
 
48 48
 citEntry(entry = "Article",
49 49
   title        = "Using the {R} Package {crlmm} for Genotyping and Copy Number Estimation",
... ...
@@ -1,3 +1,4 @@
1 1
 illumina_copynumber.tex
2 2
 run*.R
3
+*.pdf
3 4
 
... ...
@@ -47,9 +47,11 @@ options(continue=" ", width=70)
47 47
 \section{Set up}
48 48
 
49 49
 <<cdfname, results=hide>>=
50
-library(crlmm)
51
-library(ff)
52
-library(cacheSweave)
50
+library(oligoClasses)
51
+library2(crlmm)
52
+library2(ff)
53
+if(!exists("useCache")) useCache <- TRUE
54
+if(useCache) library2(cacheSweave)
53 55
 @
54 56
 
55 57
 This vignette analyzes HapMap samples assayed on the Affymetrix 6.0
... ...
@@ -67,8 +69,11 @@ cdfName <- "genomewidesnp6"
67 69
 
68 70
 The HapMap CEL files are stored in a local directory assigned to
69 71
 \verb+pathToCels+ in the following code. The genotyping step will
70
-create several files with \verb+ff+ extensions. We will store these
71
-files to the path indicated by \verb+outdir+.
72
+create several files with \verb+ff+ extensions. The ff objects contain
73
+the low-level, normalized intensities as well as parameters used to
74
+subsequently estimate copy number and B allele frequencies. These
75
+files should not be deleted or moved.  We will store these files to
76
+the path indicated by \verb+outdir+.
72 77
 
73 78
 <<setup>>=
74 79
 pathToCels <- "/thumper/ctsa/snpmicroarray/hapmap/raw/affy/1m"
... ...
@@ -86,7 +91,7 @@ ldPath(outdir)
86 91
 
87 92
 % only needed if cacheing
88 93
 <<cachedir, echo=FALSE>>=
89
-setCacheDir(outdir)
94
+if(useCache) setCacheDir(outdir)
90 95
 @
91 96
 
92 97
 The \R{} functions \Rfunction{ocProbesets} and \Rfunction{ocSamples}
... ...
@@ -107,6 +112,10 @@ Yoruban ('Y') samples.
107 112
 <<celfiles>>=
108 113
 celFiles <- list.celfiles(pathToCels, full.names=TRUE, pattern=".CEL")
109 114
 celFiles <- celFiles[substr(basename(celFiles), 13, 13) %in% c("C", "Y")]
115
+if(exists("file.index")){
116
+	celFiles <- celFiles[file.index]
117
+}
118
+##celFiles <- celFiles[substr(basename(celFiles), 13, 13) %in% "C"]
110 119
 @
111 120
 
112 121
 Finally, copy number analyses using \crlmm{} require specification of
... ...
@@ -140,7 +149,7 @@ confidence score for the genotype calls.  The function
140 149
 \Rfunction{genotype} performs both the preprocessing and genotyping.
141 150
 
142 151
 <<LDS_genotype, cache=TRUE>>=
143
-cnSet <- genotype(celFiles, batch=plates, cdfName=cdfName)
152
+cnSet <- genotype(celFiles, batch=plates, cdfName=cdfName, genome="hg19")
144 153
 @
145 154
 
146 155
 Segment faults that occur with the above step can often be traced to a
... ...
@@ -169,22 +178,10 @@ genotype calls are stored on disk rather than in active memory.
169 178
 print(object.size(cnSet), units="Mb")
170 179
 @
171 180
 
172
-We save the \Robject{cnSet} object in a local directory for subsequent
173
-copy number analysis in the \verb+copynumber+ vignette.
174
-
175
-<<save,cache=TRUE>>=
176
-saveObject <- function(outdir, cnSet) {
177
-	save(cnSet, file=file.path(outdir, "cnSet.rda"))
178
-	TRUE
179
-}
180
-(cnset.saved <- saveObject(outdir, cnSet))
181
-@
182
-
183 181
 %Users can proceed to the \verb+copynumber+ vignette for copy number
184 182
 %analyses.  See the \verb+Infrastructure+ vignette for additional
185 183
 %details on the \Robject{CNSet} class, including an overview of the
186 184
 %available accessors.
187
-
188 185
 \SweaveInput{copynumber}
189 186
 
190 187
 
191 188
Binary files a/inst/scripts/AffyGW.pdf and b/inst/scripts/AffyGW.pdf differ
... ...
@@ -6,6 +6,7 @@
6 6
 \usepackage{graphicx}
7 7
 \usepackage{natbib}
8 8
 \usepackage{amsmath}
9
+\usepackage{url}
9 10
 \usepackage[margin=1in]{geometry}
10 11
 \newcommand{\crlmm}{\Rpackage{crlmm}}
11 12
 \newcommand{\Rfunction}[1]{{\texttt{#1}}}
... ...
@@ -233,10 +234,10 @@ wrapper to the above functions and returns an object of class
233 234
 
234 235
 <<genotype.Illumina,cache=TRUE,results=hide>>=
235 236
 cnSet <- genotype.Illumina(sampleSheet=samplesheet,
236
-			    arrayNames=arrayNames,
237
-			    arrayInfoColNames=arrayInfo,
238
-			    cdfName="human370v1c",
239
-			    batch=batch)
237
+			   arrayNames=arrayNames,
238
+			   arrayInfoColNames=arrayInfo,
239
+			   cdfName="human370v1c",
240
+			   batch=batch)
240 241
 @
241 242
 
242 243
 
243 244
Binary files a/inst/scripts/IlluminaPreprocessCN.pdf and b/inst/scripts/IlluminaPreprocessCN.pdf differ
... ...
@@ -129,11 +129,9 @@ directory to store output files and setting the \Rfunction{ldPath}
129 129
 function with this path.
130 130
 
131 131
 <<path>>=
132
-if(getRversion() < "2.13.0"){
133
-	rpath <- getRversion()
134
-} else rpath <- "trunk"
135
-outdir <- paste("/thumper/ctsa/snpmicroarray/rs/ProcessedData/crlmm/", rpath, "/copynumber_vignette", sep="")
132
+outdir <- paste("/local_data/r00/crlmm/", getRversion(), "/infrastructure", sep="")
136 133
 ldPath(outdir)
134
+if(!file.exists(outdir)) dir.create(outdir)
137 135
 @
138 136
 
139 137
 
... ...
@@ -258,6 +256,7 @@ is a SNP can be accessed through accessors defined for the
258 256
 \verb+featureData+.
259 257
 
260 258
 <<featuredataAccessors>>=
259
+library(Biobase)
261 260
 fvarLabels(exampleSet)
262 261
 position(exampleSet)[1:10]
263 262
 chromosome(exampleSet)[1:10]
... ...
@@ -554,7 +553,7 @@ estimates are missing for a given SNP.
554 553
 
555 554
 <<NAchromosomeX>>=
556 555
 ## start with first batch
557
-sample.index <- 1:90
556
+sample.index <- which(batch(cnSet)==batch(cnSet)[1])
558 557
 X.index <- which(isSnp(cnSet) & chromosome(cnSet) == 23)
559 558
 ca.X <- CA(cnSet, i=X.index, j=sample.index)
560 559
 missing.caX <- is.na(ca.X)
561 560
Binary files a/inst/scripts/Infrastructure.pdf and b/inst/scripts/Infrastructure.pdf differ
... ...
@@ -154,7 +154,7 @@ formally, \newcommand{\A}{A} \newcommand{\B}{B}
154 154
     \phi}_{k,ip}}\left(I_{k,ijp}-{\hat \nu}_{k,ip}\right), ~0\right\}
155 155
 \mbox{~for~} k \in \{\A, \B\}.
156 156
 \end{eqnarray}
157
-\noindent See \cite{Scharpf2010} for details.
157
+\noindent See \cite{Scharpf2011} for details.
158 158
 
159 159
 The function \Rfunction{totalCopynumber} translates the normalized
160 160
 intensities to an estimate of raw copy number by adding the
... ...
@@ -175,42 +175,49 @@ dim(tmp2)
175 175
 
176 176
 Alternatively, the functions \Rfunction{CA} and \Rfunction{CB} compute
177 177
 the allele-specific copy number.  For instance, the following code
178
-chunk computes the allele-specific summaries at all polymorphic loci.
178
+chunk computes the allele-specific summaries at all polymorphic loci
179
+for the first 2 samples.
179 180
 
180 181
 <<ca>>=
181 182
 snp.index <- which(isSnp(cnSet) & !is.na(chromosome(cnSet)))
182
-ca <- CA(cnSet, i=snp.index, j=seq_len(ncol(cnSet)))
183
-cb <- CB(cnSet, i=snp.index, j=seq_len(ncol(cnSet)))
183
+ca <- CA(cnSet, i=snp.index, j=1:2)
184
+cb <- CB(cnSet, i=snp.index, j=1:2)
184 185
 @
185 186
 
186 187
 
187
-\subsection{Container for raw copy number}
188
+\subsection{Container for log R ratios and B allele frequencies}
188 189
 
189 190
 A useful container for storing the \crlmm{} genotypes, genotype
190
-confidence scores, and the total copy number at each marker is the
191
-\Rclass{oligoSnpSet} class.  Coercion of a \Rclass{CNSet} object to a
192
-\Rfunction{oligoSnpSet} object can be acheived by using the method
193
-\Rfunction{as} (as illustrated below). Users should note that if the
194
-\verb+assayData+ elements in the \Rclass{CNSet} instance are
195
-\Rpackage{ff} objects, the \verb+assayData+ elements of the
196
-instantiated \Rclass{oligoSnpSet} will also be \Rpackage{ff}-dervied
197
-objects (a new \verb+total_cn*.ff+ file will be created in the
198
-\verb+ldPath()+ directory).
191
+confidence scores, and the total or relative copy number at each
192
+marker is the \Rclass{oligoSetList} class.  Coercion of a
193
+\Rclass{CNSet} object to a \Rfunction{oligoSnpSet} object can be
194
+acheived by using the function \Rfunction{constructOligoSetFrom} as
195
+illustrated below. Users should note that if the \verb+assayData+
196
+elements in the \Rclass{CNSet} instance are \Rpackage{ff} objects, the
197
+\verb+assayData+ elements of each element in the \Rclass{oligoSetList}
198
+object will be \Rpackage{ff}-dervied objects (a new
199
+\verb+total_cn*.ff+ file will be created in the \verb+ldPath()+
200
+directory).
199 201
 
200 202
 <<oligoSnpSet>>=
203
+library(VanillaICE)
201 204
 open(cnSet3)
202
-oligoSet <- as(cnSet3, "oligoSnpSet")
205
+oligoSetList <- constructOligoSetListFrom(cnSet3, batch.name=batch(cnSet3)[1])
203 206
 close(cnSet3)
204
-class(copyNumber(oligoSet))
207
+show(oligoSetList)
208
+class(oligoSetList)
209
+## oligoSnpSet of first chromosome
210
+oligoSetList[[1]]
205 211
 @
206 212
 
207
-\noindent Note that the raw copy number estimates stored in the
208
-\Robject{oligoSnpSet} object can be retrieved by the
209
-\Rfunction{copyNumber} accessor and is equivalent to that returned by
210
-the \Rfunction{totalCopynumber} function defined over the same row and
211
-column indices.
213
+\noindent Note that log R ratios stored in the \Robject{oligoSnpSet}
214
+object can be retrieved by the \Rfunction{copyNumber} accessor.  B
215
+allele frequences are retrieved by the \Rfunction{baf} accessor.
212 216
 
213 217
 <<testEqual>>=
214
-total.cn3 <- totalCopynumber(cnSet3, i=1:nrow(cnSet3), j=seq_len(ncol(cnSet3)))
215
-all.equal(copyNumber(oligoSet), total.cn3)
216
-@
217 218
\ No newline at end of file
219
+lrrList <- copyNumber(oligoSetList)
220
+class(lrrList)
221
+dim(lrrList[[1]]) ## log R ratios for chromosome 1.
222
+bafList <- baf(oligoSetList)
223
+dim(bafList[[1]]) ## B allele frequencies for chromosome 1
224
+@
... ...
@@ -8,7 +8,7 @@ all: inst test
8 8
 inst: # Install package
9 9
 
10 10
 	cd ${TOP}/..;\
11
-	R CMD INSTALL ${PKG}
11
+	R CMD INSTALL --no-lock ${PKG}
12 12
 
13 13
 test: # Run unit tests
14 14
 	export RCMDCHECK=FALSE;\
15 15
new file mode 100644
... ...
@@ -0,0 +1,39 @@
1
+test_crlmm <- function(){
2
+	library(oligoClasses)
3
+	if (require(genomewidesnp6Crlmm) & require(hapmapsnp6)){
4
+		path <- system.file("celFiles", package="hapmapsnp6")
5
+		cels <- list.celfiles(path, full.names=TRUE)
6
+		(crlmmOutput <- crlmm(cels))
7
+	}
8
+}
9
+
10
+test_duplicates <- function(){
11
+	library(crlmm);library(RUnit); library(oligoClasses)
12
+	if (require(genomewidesnp6Crlmm) & require(hapmapsnp6)){
13
+		path <- system.file("celFiles", package="hapmapsnp6")
14
+		cels <- list.celfiles(path, full.names=TRUE)
15
+		## Quickly fails because sample identifiers are not unique
16
+		checkException(crlmmOutput <- crlmm(cels[c(1,1, 2)]), silent=FALSE)
17
+
18
+	}
19
+	if(FALSE){
20
+		library2(ff)
21
+		datadir <- "/thumper/ctsa/snpmicroarray/illumina/IDATS/370k"
22
+		## read in your samplesheet
23
+		samplesheet <- read.csv(file.path(datadir, "HumanHap370Duo_Sample_Map.csv"), header=TRUE, as.is=TRUE)
24
+		samplesheet <- samplesheet[-c(28:46,61:75,78:79), ]
25
+		arrayNames <- file.path(datadir, unique(samplesheet[, "SentrixPosition"]))
26
+		index <- c(1,1,2,2,3,4)
27
+		any(duplicate(arrayNames[index]))
28
+		arrayNames <- arrayNames[index]
29
+		ss <- samplesheet[index, ]
30
+		arrayInfo <- list(barcode=NULL, position="SentrixPosition")
31
+		cnSet <- genotype.Illumina(sampleSheet=ss,
32
+					   arrayNames=arrayNames,
33
+					   arrayInfoColNames=arrayInfo,
34
+					   cdfName="human370v1c",
35
+					   batch=rep("1", nrow(ss)))
36
+		checkTrue(validObject(cnSet))
37
+	}
38
+
39
+}
... ...
@@ -5,45 +5,63 @@
5 5
 \description{Calculate log R ratios and B allele frequencies from
6 6
   a \code{CNSet} object}
7 7
 \usage{
8
-calculateRBaf(object, batch.name)
8
+calculateRBaf(object, batch.name, chrom)
9 9
 }
10 10
 
11 11
 \arguments{
12
+
12 13
   \item{object}{A \code{CNSet} object.}
13
-  \item{batch.name}{A character string.  See details.}
14
+
15
+  \item{batch.name}{A character string indicating the batch.  If
16
+  missing, log R ratios and B allele frequencies are calculated for all
17
+  batches in the \code{object}.}
18
+
19
+  \item{chrom}{Integer indicating which chromosome to process. If
20
+  missing, B allele frequencies and log R ratios are calculated for all
21
+  autosomal chromosomes and chromosome X that are included in
22
+  \code{object}.}
23
+
14 24
 }
15
-\details{\code{batch.name} must be a value in
16
-  \code{batch(object)}. Currently, one must specify a single
17
-  \code{batch.name}. If a character vector for \code{batch.name} is
18
-  supplied, only the first is evaluated.
25
+
26
+\details{
27
+
28
+  \code{batch.name} must be a value in \code{batch(object)}. Currently,
29
+  one must specify a single \code{batch.name}. If a character vector for
30
+  \code{batch.name} is supplied, only the first is evaluated.
19 31
 
20 32
   TODO: A description of how these values are calculated.
21 33
 
22 34
 }
23 35
 
24 36
 \value{
25
-  A list.
37
+  A named list.
26 38
 
27
-  \code{baf}: A matrix of B allele frequencies.
39
+  \code{baf}: Each element in the baf list is a matrix of B allele
40
+  frequencies (one matrix for each chromosome).
28 41
 
29
-  \code{lrr}: A matrix of log R ratios.
42
+  \code{lrr}: Each element in the lrr list is a matrix of log R ratios
43
+  (one matrix for each chromosome).
44
+
45
+  The log R ratios were scaled by a factor of 100 and stored as an
46
+  integer.  B allele frequencies were scaled by a factor of 1000 and
47
+  stored as an integer.
30 48
 
31 49
 }
32 50
 
33 51
 \references{
34 52
 
35
-  Reference for BAFs, LRRs.
53
+  Peiffer et al., High-resolution genomic profiling of chromosomal aberrations using
54
+  Infinium whole-genome genotyping (2006), Genome Research
36 55
 
37 56
 }
57
+
38 58
 \author{Lynn Mireless}
39 59
 
40 60
 \examples{
41 61
 data(cnSetExample)
42 62
 baf.lrr <- calculateRBaf(cnSetExample, "SHELF")
43
-hist(baf.lrr[["baf"]], breaks=100)
44
-hist(baf.lrr[["lrr"]], breaks=100)
63
+hist(baf.lrr[["baf"]][[1]]/1000, breaks=100)
64
+hist(baf.lrr[["lrr"]][[1]]/100, breaks=100)
45 65
 }
46
-% Add one or more standard keywords, see file 'KEYWORDS' in the
47
-% R documentation directory.
48 66
 \keyword{list}
49
-\keyword{methods}
50 67
\ No newline at end of file
68
+\keyword{methods}
... ...
@@ -12,7 +12,7 @@
12 12
 genotype(filenames, cdfName, batch, mixtureSampleSize = 10^5, eps =0.1,
13 13
          verbose = TRUE, seed = 1, sns, probs = rep(1/3, 3),
14 14
          DF = 6, SNRMin = 5, recallMin = 10, recallRegMin = 1000,
15
-         gender = NULL, returnParams = TRUE, badSNP = 0.7)
15
+         gender = NULL, returnParams = TRUE, badSNP = 0.7,  genome=c("hg19", "hg18"))
16 16
 }
17 17
 \arguments{
18 18
   \item{filenames}{ complete path to CEL files}
... ...
@@ -37,7 +37,10 @@ genotype(filenames, cdfName, batch, mixtureSampleSize = 10^5, eps =0.1,
37 37
   with same length as filenames.  If missing, the gender is predicted.}
38 38
   \item{returnParams}{'logical'. Return recalibrated parameters from
39 39
     crlmm.}
40
-  \item{badSNP}{'numeric'. Threshold to flag as bad SNP (affects batchQC)}
40
+  \item{badSNP}{'numeric'. Threshold to flag as bad SNP (affects
41
+    batchQC)}
42
+  \item{genome}{character string indicating the UCSC genome build for
43
+    the SNP annotation}
41 44
 }
42 45
 
43 46
 \details{
44 47
new file mode 100644
... ...
@@ -0,0 +1,26 @@
1
+\name{validCdfNames}
2
+\alias{validCdfNames}
3
+