* collab: (34 commits)
revert change to IlluminaPreprocessCN
fix bug in isValidCdfName
print warning when all features in a batch of probes are flagged, but allow processing to continue
add utility cleancdfnames
Add validCdfNames.Rd
export validCdfNames
imputeGender fix when chromosome Y not available
Use splitIndicesByLength(index, ocSamples/getDoParWorkers())
Can not allocate vector of size XG with genotype.Illumina. Use splitIndicesByNode() only if the length of the list is greater than the split from splitIndicesByLength(). Otherwise, split by length using ocSamples()
update .gitignore
Add make.unique for sampleSheet$Sample_ID in readIdatFiles
bug in description
ensure sample ids stored in samplesheet are unique when constructing cnSet object
update oligoClasses dependency
update unit test for genotype.Illumina
revert change in constructInf call from genotype.Illumina
Update genotype.Rd
edit ACN function
1.15.6 use make.unique(basename(arrayNames)) to allow processing of Illumina samples with duplicated barcodes
check that sample identifies are unique in crlmm function
...
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@67435 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -1,12 +1,12 @@ |
1 | 1 |
Package: crlmm |
2 | 2 |
Type: Package |
3 | 3 |
Title: Genotype Calling (CRLMM) and Copy Number Analysis tool for Affymetrix SNP 5.0 and 6.0 and Illumina arrays. |
4 |
-Version: 1.15.0 |
|
4 |
+Version: 1.15.12 |
|
5 | 5 |
Author: Benilton S Carvalho, Robert Scharpf, Matt Ritchie, Ingo Ruczinski, Rafael A Irizarry |
6 | 6 |
Maintainer: Benilton S Carvalho <Benilton.Carvalho@cancer.org.uk>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.EDU.AU> |
7 | 7 |
Description: Faster implementation of CRLMM specific to SNP 5.0 and 6.0 arrays, as well as a copy number tool specific to 5.0, 6.0, and Illumina platforms |
8 | 8 |
License: Artistic-2.0 |
9 |
-Depends: R (>= 2.14.0), oligoClasses (>= 1.17.36) |
|
9 |
+Depends: R (>= 2.14.0), oligoClasses (>= 1.19.17) |
|
10 | 10 |
Imports: methods, |
11 | 11 |
Biobase (>= 2.15.4), |
12 | 12 |
BiocGenerics, |
... | ... |
@@ -20,7 +20,8 @@ Imports: methods, |
20 | 20 |
SNPchip, |
21 | 21 |
utils, |
22 | 22 |
lattice, |
23 |
- ff |
|
23 |
+ ff, |
|
24 |
+ foreach |
|
24 | 25 |
Suggests: hapmapsnp6, |
25 | 26 |
genomewidesnp6Crlmm (>= 1.0.4), |
26 | 27 |
GGdata, |
... | ... |
@@ -53,7 +53,7 @@ importFrom(oligoClasses, celfileDate, chromosome2integer, i2p, |
53 | 53 |
isPackageLoaded, |
54 | 54 |
ldPath, ocLapply, ocProbesets, ocSamples, |
55 | 55 |
parStatus, |
56 |
- splitIndicesByLength, splitIndicesByNode) |
|
56 |
+ splitIndicesByLength, splitIndicesByNode, AssayDataList) |
|
57 | 57 |
|
58 | 58 |
importFrom(preprocessCore, normalize.quantiles, |
59 | 59 |
normalize.quantiles.use.target) |
... | ... |
@@ -64,11 +64,14 @@ importFrom(stats, coef, cov, dnorm, kmeans, lm, mad, median, quantile, |
64 | 64 |
importFrom(utils, packageDescription, setTxtProgressBar, |
65 | 65 |
txtProgressBar) |
66 | 66 |
|
67 |
+## foreach |
|
68 |
+import(foreach) |
|
69 |
+ |
|
67 | 70 |
##---------------------------------------------------------------------------- |
68 | 71 |
## export |
69 | 72 |
##---------------------------------------------------------------------------- |
70 | 73 |
exportClasses(PredictionRegion) |
71 |
-exportMethods(CA, CB, coerce, |
|
74 |
+exportMethods(CA, CB, |
|
72 | 75 |
A, B, corr, nuA, nuB, phiA, phiB, |
73 | 76 |
predictionRegion, posteriorProbability, |
74 | 77 |
tau2, Ns, medians, mads, |
... | ... |
@@ -87,4 +90,4 @@ export(crlmm, |
87 | 90 |
genotype.Illumina, |
88 | 91 |
crlmmCopynumber2, crlmmCopynumberLD, crlmmCopynumber) |
89 | 92 |
export(genotypes, totalCopynumber, rawCopynumber, xyplot) |
90 |
-export(ABpanel, validCEL, celDates) |
|
93 |
+export(ABpanel, validCEL, celDates, validCdfNames) |
... | ... |
@@ -104,4 +104,4 @@ setGeneric("predictionRegion", function(object, copyNumber=0:4) |
104 | 104 |
standardGeneric("predictionRegion")) |
105 | 105 |
setGeneric("xyplot", useAsDefault=function(x, data, ...) lattice::xyplot(x, data,...)) |
106 | 106 |
setGeneric("xyplotcrlmm", function(x, data, predictRegion,...) standardGeneric("xyplotcrlmm")) |
107 |
-setGeneric("calculateRBaf", function(object, batch.name) standardGeneric("calculateRBaf")) |
|
107 |
+setGeneric("calculateRBaf", function(object, batch.name, chrom) standardGeneric("calculateRBaf")) |
... | ... |
@@ -21,7 +21,7 @@ getProtocolData.Affy <- function(filenames){ |
21 | 21 |
## return(gns) |
22 | 22 |
## }) |
23 | 23 |
|
24 |
-getFeatureData <- function(cdfName, copynumber=FALSE){ |
|
24 |
+getFeatureData <- function(cdfName, copynumber=FALSE, genome=genome){ |
|
25 | 25 |
pkgname <- getCrlmmAnnotationName(cdfName) |
26 | 26 |
if(!require(pkgname, character.only=TRUE)){ |
27 | 27 |
suggCall <- paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="") |
... | ... |
@@ -39,10 +39,15 @@ getFeatureData <- function(cdfName, copynumber=FALSE){ |
39 | 39 |
pkgname <- paste(pkgname, "Crlmm", sep="") |
40 | 40 |
} |
41 | 41 |
path <- system.file("extdata", package=pkgname) |
42 |
- load(file.path(path, "snpProbes.rda")) |
|
42 |
+ multiple.builds <- length(grep("hg19", list.files(path)) > 0) |
|
43 |
+ if(!multiple.builds) |
|
44 |
+ load(file.path(path, "snpProbes.rda")) |
|
45 |
+ else load(file.path(path, paste("snpProbes_", genome, ".rda", sep=""))) |
|
43 | 46 |
snpProbes <- get("snpProbes") |
44 | 47 |
if(copynumber){ |
45 |
- load(file.path(path, "cnProbes.rda")) |
|
48 |
+ if(!multiple.builds) |
|
49 |
+ load(file.path(path, "cnProbes.rda")) |
|
50 |
+ else load(file.path(path, paste("cnProbes_", genome, ".rda", sep=""))) |
|
46 | 51 |
cnProbes <- get("cnProbes") |
47 | 52 |
snpIndex <- seq(along=gns) |
48 | 53 |
npIndex <- seq(along=rownames(cnProbes)) + max(snpIndex) |
... | ... |
@@ -121,8 +126,8 @@ construct <- function(filenames, |
121 | 126 |
return(cnSet) |
122 | 127 |
} |
123 | 128 |
|
124 |
-constructAffy <- function(filenames, sns, cdfName, batch, verbose=TRUE){ |
|
125 |
- stopifnot(!missing(filenames)) |
|
129 |
+constructAffy <- function(filenames, sns, cdfName, batch, verbose=TRUE, genome){ |
|
130 |
+ ##stopifnot(!missing(filenames)) |
|
126 | 131 |
if(missing(sns)) sns <- basename(filenames) |
127 | 132 |
stopifnot(!missing(batch)) |
128 | 133 |
##--------------------------------------------------------------------------- |
... | ... |
@@ -131,7 +136,6 @@ constructAffy <- function(filenames, sns, cdfName, batch, verbose=TRUE){ |
131 | 136 |
## appropriate dimension for snps+nps |
132 | 137 |
## |
133 | 138 |
##--------------------------------------------------------------------------- |
134 |
- if (missing(sns)) sns <- basename(filenames) |
|
135 | 139 |
if (missing(cdfName)) |
136 | 140 |
cdfName <- read.celfile.header(filenames[1])[["cdfName"]] |
137 | 141 |
pkgname <- getCrlmmAnnotationName(cdfName) |
... | ... |
@@ -148,7 +152,7 @@ constructAffy <- function(filenames, sns, cdfName, batch, verbose=TRUE){ |
148 | 152 |
SNR <- initializeBigVector("crlmmSNR-", length(filenames), "double") |
149 | 153 |
SKW <- initializeBigVector("crlmmSKW-", length(filenames), "double") |
150 | 154 |
genderff <- initializeBigVector("gender", length(filenames), "integer") |
151 |
- featureData <- getFeatureData(cdfName, copynumber=TRUE) |
|
155 |
+ featureData <- getFeatureData(cdfName, copynumber=TRUE, genome=genome) |
|
152 | 156 |
nr <- nrow(featureData); nc <- length(sns) |
153 | 157 |
A <- initializeBigMatrix("crlmmA-", nr, length(filenames), "integer") |
154 | 158 |
B <- initializeBigMatrix("crlmmB-", nr, length(filenames), "integer") |
... | ... |
@@ -164,18 +168,12 @@ constructAffy <- function(filenames, sns, cdfName, batch, verbose=TRUE){ |
164 | 168 |
callProbability=callPr, |
165 | 169 |
featureData=featureData, |
166 | 170 |
annotation=cdfName, |
167 |
- batch=as.character(batch)) |
|
168 |
- if(!missing(sns)){ |
|
169 |
- sampleNames(cnSet) <- sns |
|
170 |
- } else { |
|
171 |
- sampleNames(cnSet) <- basename(filenames) |
|
172 |
- } |
|
171 |
+ batch=as.character(batch), |
|
172 |
+ genome=genome) |
|
173 |
+ sampleNames(cnSet) <- sns |
|
173 | 174 |
protocolData <- getProtocolData.Affy(filenames) |
174 | 175 |
rownames(pData(protocolData)) <- sns |
175 | 176 |
protocolData(cnSet) <- protocolData |
176 |
- ##pd <- data.frame(matrix(NA, nc, 3), row.names=sns) |
|
177 |
- ##colnames(pd)=c("SKW", "SNR", "gender") |
|
178 |
- ##phenoData(cnSet) <- new("AnnotatedDataFrame", data=pd) |
|
179 | 177 |
cnSet$SKW <- SKW |
180 | 178 |
cnSet$SNR <- SNR |
181 | 179 |
cnSet$gender <- genderff |
... | ... |
@@ -192,7 +190,9 @@ snprmaAffy <- function(cnSet, filenames, |
192 | 190 |
pkgname <- getCrlmmAnnotationName(annotation(cnSet)) |
193 | 191 |
stopifnot(require(pkgname, character.only=TRUE, quietly=!verbose)) |
194 | 192 |
mixtureParams <- initializeBigMatrix("crlmmMixt-", 4, length(filenames), "double") |
195 |
- sampleBatches <- splitIndicesByNode(seq(along=filenames)) |
|
193 |
+ sampleBatches <- splitIndicesByLength(seq_along(filenames), ocSamples()/getDoParWorkers()) |
|
194 |
+## if(length(sampleBatches) < getDoParWorkers()) |
|
195 |
+## sampleBatches <- splitIndicesByNode(seq_along(filenames)) |
|
196 | 196 |
ocLapply(sampleBatches, |
197 | 197 |
rsprocessCEL, |
198 | 198 |
filenames=filenames, |
... | ... |
@@ -225,19 +225,25 @@ genotype <- function(filenames, |
225 | 225 |
recallRegMin=1000, |
226 | 226 |
gender=NULL, |
227 | 227 |
returnParams=TRUE, |
228 |
- badSNP=0.7){ |
|
229 |
- stopifnot(isPackageLoaded("ff")) |
|
228 |
+ badSNP=0.7, |
|
229 |
+ genome=c("hg19", "hg18")){ |
|
230 |
+ if(!isPackageLoaded("ff")) stop("ff package must be loaded") |
|
231 |
+ if (missing(sns)) sns <- basename(filenames) |
|
232 |
+ if (any(duplicated(sns))) stop("sample identifiers must be unique") |
|
233 |
+ genome <- match.arg(genome) |
|
230 | 234 |
cnSet <- constructAffy(filenames=filenames, |
235 |
+ sns=sns, |
|
231 | 236 |
cdfName=cdfName, |
232 |
- batch=batch, verbose=verbose) |
|
237 |
+ batch=batch, verbose=verbose, |
|
238 |
+ genome=genome) |
|
233 | 239 |
if(!(is(A(cnSet), "ff") || is(A(cnSet), "ffdf"))){ |
234 | 240 |
stop("The ff package is required for this function.") |
235 | 241 |
} |
236 | 242 |
cnSet@mixtureParams <- snprmaAffy(cnSet, filenames=filenames, |
237 |
- mixtureSampleSize=mixtureSampleSize, |
|
238 |
- eps=eps, |
|
239 |
- seed=seed, |
|
240 |
- verbose=verbose) |
|
243 |
+ mixtureSampleSize=mixtureSampleSize, |
|
244 |
+ eps=eps, |
|
245 |
+ seed=seed, |
|
246 |
+ verbose=verbose) |
|
241 | 247 |
ok <- cnrmaAffy(cnSet=cnSet, filenames=filenames, |
242 | 248 |
cdfName=annotation(cnSet), seed=seed, |
243 | 249 |
verbose=verbose) |
... | ... |
@@ -697,6 +703,10 @@ fit.lm2 <- function(strata, |
697 | 703 |
flags <- as.matrix(flags(object)[ii, batch.index]) |
698 | 704 |
fns <- featureNames(object)[ii] |
699 | 705 |
fns.noflags <- fns[rowSums(flags, na.rm=T) == 0] |
706 |
+ if(length(fns.noflags) == 0) { |
|
707 |
+ warning("All features in one of the probeset batches were flagged. The data is processed in probeset batches to reduce RAM, but the error suggests that increasing the size of the batches may help. For example, ocProbesets(4*ocProbesets()) would increase the current size of the probeset batches by 4.") |
|
708 |
+ fns.noflags <- featureNames(object)[marker.index] ## allow processing to continue |
|
709 |
+ } |
|
700 | 710 |
snp.index <- sample(match(fns.noflags, featureNames(object)), 5000) |
701 | 711 |
## |
702 | 712 |
nuA.np <- as.matrix(nuA(object)[marker.index, batch.index]) |
... | ... |
@@ -1089,6 +1099,15 @@ fit.lm3 <- function(strata, |
1089 | 1099 |
phiB(object)[marker.index, batch.index] <- phiB |
1090 | 1100 |
phiPrimeA(object)[marker.index, batch.index] <- phiA2 |
1091 | 1101 |
phiPrimeB(object)[marker.index, batch.index] <- phiB2 |
1102 |
+ ## |
|
1103 |
+ if(enough.women){ |
|
1104 |
+ medianA.AA(object)[marker.index, batch.index] <- do.call("cbind", lapply(medianA.Flist, function(x) x[, 1])) |
|
1105 |
+ medianA.AB(object)[marker.index, batch.index] <- do.call("cbind", lapply(medianA.Flist, function(x) x[, 2])) |
|
1106 |
+ medianA.BB(object)[marker.index, batch.index] <- do.call("cbind", lapply(medianA.Flist, function(x) x[, 3])) |
|
1107 |
+ medianB.AA(object)[marker.index, batch.index] <- do.call("cbind", lapply(medianB.Flist, function(x) x[, 1])) |
|
1108 |
+ medianB.AB(object)[marker.index, batch.index] <- do.call("cbind", lapply(medianB.Flist, function(x) x[, 2])) |
|
1109 |
+ medianB.BB(object)[marker.index, batch.index] <- do.call("cbind", lapply(medianB.Flist, function(x) x[, 3])) |
|
1110 |
+ } |
|
1092 | 1111 |
if(is.lds) {close(object); return(TRUE)} else return(object) |
1093 | 1112 |
} |
1094 | 1113 |
|
... | ... |
@@ -1145,6 +1164,10 @@ fit.lm4 <- function(strata, |
1145 | 1164 |
fns <- featureNames(object)[ii] |
1146 | 1165 |
flags <- as.matrix(flags(object)[ii, batch.index]) |
1147 | 1166 |
fns.noflags <- fns[rowSums(flags, na.rm=T) == 0] |
1167 |
+ if(length(fns.noflags) == 0){ |
|
1168 |
+ warning("All features in one of the probeset batches were flagged. The data is processed in probeset batches to reduce RAM, but the error suggests that increasing the size of the batches may help. For example, ocProbesets(4*ocProbesets()) would increase the current size of the probeset batches by 4.") |
|
1169 |
+ fns.noflags <- featureNames(object)[marker.index] ## allow processing to continue |
|
1170 |
+ } |
|
1148 | 1171 |
snp.index <- sample(match(fns.noflags, featureNames(object)), 10000) |
1149 | 1172 |
## |
1150 | 1173 |
N.AA <- as.matrix(N.AA(object)[snp.index, batch.index, drop=FALSE]) |
... | ... |
@@ -1260,7 +1283,7 @@ cnrma2 <- function(A, filenames, row.names, verbose=TRUE, seed=1, cdfName, sns){ |
1260 | 1283 |
pkgname <- getCrlmmAnnotationName(cdfName) |
1261 | 1284 |
require(pkgname, character.only=TRUE) || stop("Package ", pkgname, " not available") |
1262 | 1285 |
if (missing(sns)) sns <- basename(filenames) |
1263 |
- sampleBatches <- splitIndicesByNode(seq(along=filenames)) |
|
1286 |
+ sampleBatches <- splitIndicesByLength(seq_along(filenames), ocSamples()/getDoParWorkers()) |
|
1264 | 1287 |
if(verbose) message("Processing nonpolymorphic probes for ", length(filenames), " files.") |
1265 | 1288 |
## updates A |
1266 | 1289 |
ocLapply(sampleBatches, |
... | ... |
@@ -1540,10 +1563,10 @@ shrinkSummary <- function(object, |
1540 | 1563 |
marker.index <- whichMarkers(type[[1]], is.snp, is.autosome, is.annotated, is.X) |
1541 | 1564 |
} |
1542 | 1565 |
if(is.lds){ |
1543 |
- ##index.list <- splitIndicesByLength(marker.index, ocProbesets()) |
|
1544 |
- if(parStatus()){ |
|
1545 |
- index.list <- splitIndicesByNode(marker.index) |
|
1546 |
- } else index.list <- splitIndicesByLength(marker.index, ocProbesets()) |
|
1566 |
+ index.list <- splitIndicesByLength(marker.index, ocProbesets()) |
|
1567 |
+## if(parStatus()){ |
|
1568 |
+## index.list <- splitIndicesByNode(marker.index) |
|
1569 |
+## } else index.list <- splitIndicesByLength(marker.index, ocProbesets()) |
|
1547 | 1570 |
ocLapply(seq(along=index.list), |
1548 | 1571 |
shrinkGenotypeSummaries, |
1549 | 1572 |
index.list=index.list, |
... | ... |
@@ -1594,10 +1617,10 @@ genotypeSummary <- function(object, |
1594 | 1617 |
myf <- summaryFxn(type[[1]]) |
1595 | 1618 |
FUN <- get(myf) |
1596 | 1619 |
if(is.lds){ |
1597 |
- ##index.list <- splitIndicesByLength(marker.index, ocProbesets()) |
|
1598 |
- if(parStatus()){ |
|
1599 |
- index.list <- splitIndicesByNode(marker.index) |
|
1600 |
- } else index.list <- splitIndicesByLength(marker.index, ocProbesets()) |
|
1620 |
+ index.list <- splitIndicesByLength(marker.index, ocProbesets()) |
|
1621 |
+## if(parStatus()){ |
|
1622 |
+## index.list <- splitIndicesByNode(marker.index) |
|
1623 |
+## } else index.list <- splitIndicesByLength(marker.index, ocProbesets()) |
|
1601 | 1624 |
ocLapply(seq(along=index.list), |
1602 | 1625 |
FUN, |
1603 | 1626 |
index.list=index.list, |
... | ... |
@@ -2116,9 +2139,10 @@ estimateCnParameters <- function(object, |
2116 | 2139 |
myfun <- lmFxn(type[[1]]) |
2117 | 2140 |
FUN <- get(myfun) |
2118 | 2141 |
if(is.lds){ |
2119 |
- if(parStatus()){ |
|
2120 |
- index.list <- splitIndicesByNode(marker.index) |
|
2121 |
- } else index.list <- splitIndicesByLength(marker.index, ocProbesets()) |
|
2142 |
+ index.list <- splitIndicesByLength(marker.index, ocProbesets()) |
|
2143 |
+## if(parStatus()){ |
|
2144 |
+## index.list <- splitIndicesByNode(marker.index) |
|
2145 |
+## } else index.list <- splitIndicesByLength(marker.index, ocProbesets()) |
|
2122 | 2146 |
ocLapply(seq(along=index.list), |
2123 | 2147 |
FUN, |
2124 | 2148 |
index.list=index.list, |
... | ... |
@@ -2784,89 +2808,73 @@ ABpanel <- function(x, y, predictRegion, |
2784 | 2808 |
} |
2785 | 2809 |
} |
2786 | 2810 |
|
2787 |
-calculateRTheta <- function(cnSet, genotype=c("AA", "AB", "BB"), batch.name){ |
|
2788 |
- genotype <- match.arg(genotype) |
|
2789 |
- j <- match(batch.name, batchNames(cnSet)) |
|
2790 |
- ##centers <- medians(cnSet, i=snp.index, j) |
|
2791 |
- centers <- medians(cnSet, i=seq_len(nrow(cnSet)), j) |
|
2792 |
- theta <- matrix(NA, nrow(cnSet), 2) |
|
2793 |
- colnames(theta) <- c("theta", "R") |
|
2794 |
- x <- centers[, "A", genotype, j] |
|
2795 |
- y <- centers[, "B", genotype, j] |
|
2796 |
- ## small imputed values -- should fix imputeCenter |
|
2797 |
- x[x < 64] <- 64 |
|
2798 |
- y[y < 64] <- 64 |
|
2799 |
- theta[, "theta"] <- atan2(y, x)*2/pi |
|
2800 |
-## if(any(is.na(theta[, "theta"]))){ |
|
2801 |
-## ##stop("NA's in thetas. Can not compute theta / R values for batches with fewer than 10 samples") |
|
2802 |
-## } |
|
2803 |
- ## so that 'R' is available for NP probes |
|
2804 |
- y[is.na(y)] <- 0 |
|
2805 |
- theta[, "R"] <- x+y |
|
2806 |
- theta[!isSnp(cnSet), 1] <- 1 |
|
2807 |
- if(any(is.na(theta[, "theta"]))){ |
|
2808 |
- stop("NA's in thetas. Can not compute theta / R values for batches with fewer than 10 samples") |
|
2809 |
- } |
|
2810 |
- return(theta) |
|
2811 |
-} |
|
2812 |
- |
|
2813 |
- |
|
2814 |
- |
|
2815 |
- |
|
2816 |
-calculateBAFandLRR <- function(cnSet) { |
|
2817 |
- ## Calculates B allele frequency and log2 R ratio for all snps for a given cnSet |
|
2818 |
- ## Returns a 3-D array of size Num Of Snps x Num of Samples x 2 |
|
2819 |
- ## where result[,,1] is B allele frequency and result[,,2] is log2 R ratio |
|
2820 |
- |
|
2821 |
- snp.index <- which(isSnp(cnSet)) |
|
2822 |
- b <- (B(cnSet))[snp.index, ] |
|
2823 |
- a <- (A(cnSet))[snp.index, ] |
|
2824 |
- centers<-medians(cnSet, i=snp.index, j=1:length(unique(batch(cnSet)))) |
|
2825 |
- bafAndLrr <- array(NA_integer_, dim=c(length(snp.index), length(sampleNames(cnSet)), 2), dimnames=list(featureNames(cnSet)[snp.index], sampleNames(cnSet),c("baf", "lrr"))) |
|
2826 |
- |
|
2827 |
- for (batch in unique(batch(cnSet))) { |
|
2828 |
- |
|
2829 |
- ## get batch- and snp-specific centers |
|
2830 |
- center.aa.x <- centers[, 1, 1, batch] |
|
2831 |
- center.aa.y <- centers[, 2, 1, batch] |
|
2832 |
- center.ab.x <- centers[, 1, 2, batch] |
|
2833 |
- center.ab.y <- centers[, 2, 2, batch] |
|
2834 |
- center.bb.x <- centers[, 1, 3, batch] |
|
2835 |
- center.bb.y <- centers[, 2, 3, batch] |
|
2836 |
- theta.aa <- atan2(center.aa.y, center.aa.x) * 2 / pi |
|
2837 |
- r.aa <- center.aa.x + center.aa.y |
|
2838 |
- theta.ab <- atan2(center.ab.y, center.ab.x) * 2 / pi |
|
2839 |
- r.ab <- center.ab.x + center.ab.y |
|
2840 |
- theta.bb <- atan2(center.bb.y, center.bb.x) * 2 / pi |
|
2841 |
- r.bb <- center.bb.x + center.bb.y |
|
2842 |
- |
|
2843 |
- index.sample <- which(batch(cnSet) %in% batch) |
|
2844 |
- for (i in index.sample) { |
|
2845 |
- theta <- atan2(b[, i], a[, i]) * 2 / pi |
|
2846 |
- r <- a[, i] + b[, i] |
|
2847 |
- |
|
2848 |
- baf <- vector(mode="numeric", length=length(theta)) |
|
2849 |
- baf[theta < theta.aa] <- 0 |
|
2850 |
- baf[theta >= theta.bb] <- 1 |
|
2851 |
- idx.ab <- which(theta >= theta.aa & theta < theta.ab) |
|
2852 |
- baf[idx.ab] <- .5 * (theta[idx.ab] - theta.aa[idx.ab]) / (theta.ab[idx.ab] - theta.aa[idx.ab]) |
|
2853 |
- idx.bb <- which(theta >= theta.ab & theta < theta.bb) |
|
2854 |
- baf[idx.bb] <- .5 * (theta[idx.bb] - theta.ab[idx.bb]) / (theta.bb[idx.bb] - theta.ab[idx.bb]) + .5 |
|
2855 |
- bafAndLrr[, i ,1] <- baf |
|
2856 |
- |
|
2857 |
- r.expected <- vector(mode="numeric", length=length(r)) |
|
2858 |
- r.expected[theta < theta.aa] <- r.aa[theta < theta.aa] |
|
2859 |
- r.expected[theta >= theta.bb] <- r.bb[theta >= theta.bb] |
|
2860 |
- |
|
2861 |
- idx.ab <- which(theta >= theta.aa & theta < theta.ab) |
|
2862 |
- r.expected[idx.ab] <- (theta[idx.ab] - theta.aa[idx.ab]) * (r.ab[idx.ab]-r.aa[idx.ab]) / |
|
2863 |
- (theta.ab[idx.ab] - theta.aa[idx.ab]) + r.aa[idx.ab] |
|
2864 |
- idx.bb <- which(theta >= theta.ab & theta < theta.bb) |
|
2865 |
- r.expected[idx.bb] <- (theta[idx.bb] - theta.ab[idx.bb]) * (r.bb[idx.bb]-r.ab[idx.bb])/ |
|
2866 |
- (theta.bb[idx.bb] - theta.ab[idx.bb]) + r.ab[idx.bb] |
|
2867 |
- bafAndLrr[, i, 2] <- log2(r / r.expected) |
|
2868 |
- |
|
2869 |
- } |
|
2870 |
- } |
|
2871 |
- return(bafAndLrr) |
|
2811 |
+calculateRTheta <- function(object, ##genotype=c("AA", "AB", "BB"), |
|
2812 |
+ batch.name, |
|
2813 |
+ feature.index){ |
|
2814 |
+ index.np <- which(!(isSnp(object)[feature.index])) |
|
2815 |
+ j <- match(batch.name, batchNames(object)) |
|
2816 |
+ if(missing(j)) stop("batch.name did not match in batchNames(object)") |
|
2817 |
+ CHR <- unique(chromosome(object)[feature.index]) |
|
2818 |
+ isX <- CHR == "X" |
|
2819 |
+ centers <- getMedians(batchStatistics(object)) |
|
2820 |
+ lapply(centers, open) |
|
2821 |
+ centersMatrix <- lapply(centers, function(x, i, j) x[i, j], j=j, i=feature.index) |
|
2822 |
+ lapply(centers, close) |
|
2823 |
+ centersMatrix <- do.call(cbind, centersMatrix) |
|
2824 |
+ centersA <- centersMatrix[, 1:3] |
|
2825 |
+ centersB <- centersMatrix[, 4:6] |
|
2826 |
+ rm(centers, centersMatrix); gc() |
|
2827 |
+ centersA[centersA < 64] <- 64 |
|
2828 |
+ centersB[centersB < 64] <- 64 |
|
2829 |
+ theta <- atan2(centersB, centersA) * 2/pi |
|
2830 |
+ centersA[is.na(centersA)] <- 0 |
|
2831 |
+ centersB[is.na(centersB)] <- 0 |
|
2832 |
+ if(length(index.np) > 0) theta[index.np, ] <- 1 |
|
2833 |
+ ## |
|
2834 |
+ r <- centersA+centersB |
|
2835 |
+ J <- which(batch(object)==batch.name) |
|
2836 |
+ open(A(object)) |
|
2837 |
+ open(B(object)) |
|
2838 |
+ a <- as.matrix(A(object)[feature.index, J, drop=FALSE]) |
|
2839 |
+ b <- as.matrix(B(object)[feature.index, J, drop=FALSE]) |
|
2840 |
+ close(A(object)) |
|
2841 |
+ close(B(object)) |
|
2842 |
+ if(length(index.np) > 0) b[index.np, ] <- 0L |
|
2843 |
+ obs.theta <- atan2(b, a)*2/pi |
|
2844 |
+ calculateBandR <- function(o, r, theta, not.na, M){ |
|
2845 |
+ lessAA <- o < theta[, 1] |
|
2846 |
+ lessAB <- o < theta[, 2] |
|
2847 |
+ lessBB <- o < theta[, 3] |
|
2848 |
+ ii <- which(lessAA & not.na) |
|
2849 |
+ jj <- which(!lessBB & not.na) |
|
2850 |
+ i <- which(!lessAA & lessAB & not.na) |
|
2851 |
+ j <- which(!lessAB & lessBB & not.na) |
|
2852 |
+ M[i, 1] <- 0.5*(o[i]-theta[i,1])/(theta[i,2]-theta[i,1]) |
|
2853 |
+ M[j, 1] <- 0.5*(o[j]-theta[j,2])/(theta[j,3]-theta[j,2]) + 0.5 |
|
2854 |
+ M[ii, 1] <- 0 |
|
2855 |
+ M[jj, 1] <- 1 |
|
2856 |
+ M[i, 2] <- (o[i]-theta[i,1])*(r[i,2]-r[i,1])/(theta[i,2]-theta[i,1]) + r[i, 1] |
|
2857 |
+ M[j, 2] <- (o[j]-theta[j,2])*(r[j,3]-r[j,2])/(theta[j,3]-theta[j,2]) + r[j, 2] |
|
2858 |
+ M[ii, 2] <- r[ii, 1] |
|
2859 |
+ M[jj, 2] <- r[jj, 3] |
|
2860 |
+ return(M) |
|
2861 |
+ } |
|
2862 |
+ not.na <- !is.na(theta[,1]) |
|
2863 |
+ r.expected <- bf <- matrix(NA, length(feature.index), ncol(a)) |
|
2864 |
+ M <- matrix(NA, length(feature.index), 2) |
|
2865 |
+ for(j in seq_len(ncol(a))){ |
|
2866 |
+ tmp <- calculateBandR(obs.theta[, j], r=r, theta=theta, not.na=not.na, M=M) |
|
2867 |
+ bf[, j] <- tmp[,1] |
|
2868 |
+ r.expected[,j] <- tmp[,2] |
|
2869 |
+ } |
|
2870 |
+ bf[bf < 0] <- 0 |
|
2871 |
+ bf[bf > 1] <- 1 |
|
2872 |
+ if(length(index.np) > 0) r.expected[index.np, ] <- centersA[index.np, 1] |
|
2873 |
+ obs.r <- a+b |
|
2874 |
+ lrr <- log2(obs.r/r.expected) |
|
2875 |
+ dimnames(bf) <- dimnames(lrr) <- list(featureNames(object)[feature.index], |
|
2876 |
+ sampleNames(object)[J]) |
|
2877 |
+ bf <- integerMatrix(bf, 1000) |
|
2878 |
+ lrr <- integerMatrix(lrr, 100) |
|
2879 |
+ return(list(baf=bf, lrr=lrr)) |
|
2872 | 2880 |
} |
... | ... |
@@ -7,6 +7,7 @@ crlmm <- function(filenames, row.names=TRUE, col.names=TRUE, |
7 | 7 |
if ((load.it | save.it) & missing(intensityFile)) |
8 | 8 |
stop("'intensityFile' is missing, and you chose either load.it or save.it") |
9 | 9 |
if (missing(sns)) sns <- basename(filenames) |
10 |
+ if (any(duplicated(sns))) stop("sample identifiers are not unique") |
|
10 | 11 |
if (!missing(intensityFile)) |
11 | 12 |
if (load.it & !file.exists(intensityFile)){ |
12 | 13 |
load.it <- FALSE |
... | ... |
@@ -312,24 +313,26 @@ crlmm2 <- function(filenames, row.names=TRUE, col.names=TRUE, |
312 | 313 |
} |
313 | 314 |
|
314 | 315 |
imputeGender <- function(A, B, XIndex, YIndex, SNR, SNRMin){ |
315 |
- ##XMedian <- apply(log2(A[XIndex,, |
|
316 |
- ##drop=FALSE])+log2(B[XIndex,, drop=FALSE]), 2, median)/2 |
|
317 |
- a <- log2(A[XIndex,,drop=FALSE]) |
|
318 |
- b <- log2(B[XIndex,,drop=FALSE]) |
|
319 |
- meds.X <- (apply(a+b, 2, median))/2 |
|
320 |
- ##YIndex <- which(isSnp(gtSet) & chromosome(gtSet)==24) |
|
321 |
- a <- log2(A[YIndex,,drop=FALSE]) |
|
322 |
- b <- log2(B[YIndex,,drop=FALSE]) |
|
323 |
- meds.Y <- (apply(a+b, 2, median))/2 |
|
324 |
- R <- meds.X - meds.Y |
|
325 |
- ##SNR <- gtSet$SNR[] |
|
326 |
- ##SNRMin=5 |
|
327 |
- if(sum(SNR > SNRMin) == 1){ |
|
328 |
- ##gender <- which.min(c(abs(XMedian-8.9), abs(XMedian-9.5))) |
|
329 |
- gender <- ifelse(R[SNR[] > SNRMin] > 0.5, 2L, 1L) |
|
330 |
- } else{ |
|
331 |
- ##gender <- kmeans(XMedian, c(min(XMedian[SNR>SNRMin]), max(XMedian[SNR>SNRMin])))[["cluster"]] |
|
332 |
- gender <- kmeans(R, c(min(R[SNR[]>SNRMin]), max(R[SNR[]>SNRMin])))[["cluster"]] |
|
316 |
+ if(length(YIndex) > 0){ |
|
317 |
+ a <- log2(A[XIndex,,drop=FALSE]) |
|
318 |
+ b <- log2(B[XIndex,,drop=FALSE]) |
|
319 |
+ meds.X <- (apply(a+b, 2, median))/2 |
|
320 |
+ a <- log2(A[YIndex,,drop=FALSE]) |
|
321 |
+ b <- log2(B[YIndex,,drop=FALSE]) |
|
322 |
+ meds.Y <- (apply(a+b, 2, median))/2 |
|
323 |
+ R <- meds.X - meds.Y |
|
324 |
+ if(sum(SNR > SNRMin) == 1){ |
|
325 |
+ gender <- ifelse(R[SNR[] > SNRMin] > 0.5, 2L, 1L) |
|
326 |
+ } else{ |
|
327 |
+ gender <- kmeans(R, c(min(R[SNR[]>SNRMin]), max(R[SNR[]>SNRMin])))[["cluster"]] |
|
328 |
+ } |
|
329 |
+ } else { |
|
330 |
+ XMedian <- apply(log2(A[XIndex,,drop=FALSE])+log2(B[XIndex,, drop=FALSE]), 2, median)/2 |
|
331 |
+ if(sum(SNR > SNRMin) == 1){ |
|
332 |
+ gender <- which.min(c(abs(XMedian-8.9), abs(XMedian-9.5))) |
|
333 |
+ } else{ |
|
334 |
+ gender <- kmeans(XMedian, c(min(XMedian[SNR[]>SNRMin]), max(XMedian[SNR[]>SNRMin])))[["cluster"]] |
|
335 |
+ } |
|
333 | 336 |
} |
334 | 337 |
return(gender) |
335 | 338 |
} |
... | ... |
@@ -1,4 +1,4 @@ |
1 |
-# function below works OK provided all .idat files are in the current working directory |
|
1 |
+ # function below works OK provided all .idat files are in the current working directory |
|
2 | 2 |
# - could add an option to allow files in Illumina directory structure to be handled |
3 | 3 |
# or to use the optional 'Path' column in sampleSheet |
4 | 4 |
# - there is a lot of header information that is currently discarded - could try and store this somewhere in the resulting NChannelSet |
... | ... |
@@ -36,7 +36,7 @@ readIdatFiles = function(sampleSheet=NULL, |
36 | 36 |
} |
37 | 37 |
} |
38 | 38 |
pd = new("AnnotatedDataFrame", data = sampleSheet) |
39 |
- sampleNames(pd) = basename(arrayNames) |
|
39 |
+ sampleNames(pd) = make.unique(basename(arrayNames)) |
|
40 | 40 |
} |
41 | 41 |
if(is.null(arrayNames)) { |
42 | 42 |
arrayNames = gsub(paste(sep, fileExt$green, sep=""), "", dir(pattern=fileExt$green, path=path)) |
... | ... |
@@ -95,7 +95,7 @@ readIdatFiles = function(sampleSheet=NULL, |
95 | 95 |
if(i==1) { |
96 | 96 |
if(is.null(ids) && !is.null(G)){ |
97 | 97 |
ids = idsG |
98 |
- } else stop("Could not find probe IDs") |
|
98 |
+ } # else stop("Could not find probe IDs") |
|
99 | 99 |
nprobes = length(ids) |
100 | 100 |
narrays = length(arrayNames) |
101 | 101 |
RG = new("NChannelSet", |
... | ... |
@@ -106,8 +106,8 @@ readIdatFiles = function(sampleSheet=NULL, |
106 | 106 |
phenoData=pd, storage.mode="environment") |
107 | 107 |
featureNames(RG) = ids |
108 | 108 |
if(!is.null(sampleSheet) && !is.null(sampleSheet$Sample_ID)){ |
109 |
- sampleNames(RG) = sampleSheet$Sample_ID |
|
110 |
- } else sampleNames(RG) = arrayNames |
|
109 |
+ sampleNames(RG) = make.unique(sampleSheet$Sample_ID) |
|
110 |
+ } else sampleNames(RG) = make.unique(basename(arrayNames)) |
|
111 | 111 |
gc(verbose=FALSE) |
112 | 112 |
} |
113 | 113 |
if(length(ids)==length(idsG)) { |
... | ... |
@@ -524,7 +524,6 @@ readGenCallOutput = function(file, path=".", cdfName, |
524 | 524 |
|
525 | 525 |
|
526 | 526 |
RGtoXY = function(RG, chipType, verbose=TRUE) { |
527 |
- |
|
528 | 527 |
needToLoad <- !all(sapply(c('addressA', 'addressB', 'base'), isLoaded)) |
529 | 528 |
if(needToLoad){ |
530 | 529 |
chipList = c("human1mv1c",# 1M |
... | ... |
@@ -537,12 +536,14 @@ RGtoXY = function(RG, chipType, verbose=TRUE) { |
537 | 536 |
"human1mduov3b", # 1M Duo |
538 | 537 |
"humanomni1quadv1b", # Omni1 quad |
539 | 538 |
"humanomni25quadv1b", # Omni2.5 quad |
539 |
+ "humanomni258v1a", # Omni2.5 8 sample |
|
540 | 540 |
"humanomniexpress12v1b", # Omni express 12 |
541 | 541 |
"humanimmuno12v1b", # Immuno chip 12 |
542 | 542 |
"humancytosnp12v2p1h") # CytoSNP 12 |
543 |
+ ## RS: added cleancdfname() |
|
543 | 544 |
if(missing(chipType)){ |
544 |
- chipType = match.arg(annotation(RG), chipList) |
|
545 |
- } else chipType = match.arg(chipType, chipList) |
|
545 |
+ chipType = match.arg(cleancdfname(annotation(RG)), chipList) |
|
546 |
+ } else chipType = match.arg(cleancdfname(chipType), chipList) |
|
546 | 547 |
pkgname = getCrlmmAnnotationName(chipType) |
547 | 548 |
if(!require(pkgname, character.only=TRUE, quietly=!verbose)){ |
548 | 549 |
suggCall = paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="") |
... | ... |
@@ -1106,7 +1107,7 @@ getProtocolData.Illumina = function(filenames, sep="_", fileExt="Grn.idat", verb |
1106 | 1107 |
Position = rep(NA, narrays)) |
1107 | 1108 |
|
1108 | 1109 |
scanDates = data.frame(ScanDate=rep(NA, narrays), DecodeDate=rep(NA, narrays)) |
1109 |
- rownames(scanDates) = gsub(paste(sep, fileExt, sep=""), "", filenames) |
|
1110 |
+ rownames(scanDates) <- make.unique(gsub(paste(sep, fileExt, sep=""), "", filenames)) |
|
1110 | 1111 |
## read in the data |
1111 | 1112 |
for(i in seq_along(filenames)) { |
1112 | 1113 |
if(verbose) |
... | ... |
@@ -1132,7 +1133,7 @@ getProtocolData.Illumina = function(filenames, sep="_", fileExt="Grn.idat", verb |
1132 | 1133 |
data=scanDates, |
1133 | 1134 |
varMetadata=data.frame(labelDescription=colnames(scanDates), |
1134 | 1135 |
row.names=colnames(scanDates))) |
1135 |
- return(protocoldata) |
|
1136 |
+ return(protocoldata) |
|
1136 | 1137 |
} |
1137 | 1138 |
|
1138 | 1139 |
|
... | ... |
@@ -1171,7 +1172,7 @@ constructInf <- function(sampleSheet=NULL, |
1171 | 1172 |
} |
1172 | 1173 |
} |
1173 | 1174 |
pd = new("AnnotatedDataFrame", data = sampleSheet) |
1174 |
- sampleNames(pd) = basename(arrayNames) |
|
1175 |
+ sampleNames(pd) = make.unique(basename(arrayNames)) |
|
1175 | 1176 |
} |
1176 | 1177 |
if(is.null(arrayNames)) { |
1177 | 1178 |
arrayNames = gsub(paste(sep, fileExt$green, sep=""), "", dir(pattern=fileExt$green, path=path)) |
... | ... |
@@ -1210,17 +1211,29 @@ constructInf <- function(sampleSheet=NULL, |
1210 | 1211 |
if(verbose) message("Initializing container for genotyping and copy number estimation") |
1211 | 1212 |
featureData = getFeatureData(cdfName, copynumber=TRUE) |
1212 | 1213 |
nr = nrow(featureData); nc = narrays |
1214 |
+ sns <- if (!is.null(sampleSheet) && !is.null(sampleSheet$Sample_ID)) { |
|
1215 |
+ make.unique(sampleSheet$Sample_ID) |
|
1216 |
+ } else{ |
|
1217 |
+ make.unique(basename(arrayNames)) |
|
1218 |
+ } |
|
1219 |
+ biga <- initializeBigMatrix(name="A", nr, nc) |
|
1220 |
+ bigb <- initializeBigMatrix(name="B", nr, nc) |
|
1221 |
+ bigc <- initializeBigMatrix(name="call", nr, nc) |
|
1222 |
+ bigd <- initializeBigMatrix(name="callPr", nr,nc) |
|
1223 |
+ colnames(biga) <- colnames(bigb) <- colnames(bigc) <- colnames(bigd) <- sns |
|
1213 | 1224 |
cnSet <- new("CNSet", |
1214 |
- alleleA=initializeBigMatrix(name="A", nr, nc), |
|
1215 |
- alleleB=initializeBigMatrix(name="B", nr, nc), |
|
1216 |
- call=initializeBigMatrix(name="call", nr, nc), |
|
1217 |
- callProbability=initializeBigMatrix(name="callPr", nr,nc), |
|
1225 |
+ alleleA=biga, |
|
1226 |
+ alleleB=bigb, |
|
1227 |
+ call=bigc, |
|
1228 |
+ callProbability=bigd, |
|
1218 | 1229 |
annotation=cdfName, |
1219 | 1230 |
featureData=featureData, |
1220 | 1231 |
batch=batch) |
1221 |
- if (!is.null(sampleSheet) && !is.null(sampleSheet$Sample_ID)) { |
|
1222 |
- sampleNames(cnSet) = sampleSheet$Sample_ID |
|
1223 |
- } else sampleNames(cnSet) = arrayNames |
|
1232 |
+## if (!is.null(sampleSheet) && !is.null(sampleSheet$Sample_ID)) { |
|
1233 |
+## sampleNames(cnSet) = make.unique(sampleSheet$Sample_ID) |
|
1234 |
+## } else{ |
|
1235 |
+## sampleNames(cnSet) <- make.unique(basename(arrayNames)) |
|
1236 |
+## } |
|
1224 | 1237 |
if(saveDate){ |
1225 | 1238 |
protocolData = getProtocolData.Illumina(grnidats, sep=sep, fileExt=fileExt$green, verbose=verbose) |
1226 | 1239 |
} else{ |
... | ... |
@@ -1233,7 +1246,7 @@ constructInf <- function(sampleSheet=NULL, |
1233 | 1246 |
cnSet$gender <- initializeBigVector("gender", ncol(cnSet), vmode="integer") |
1234 | 1247 |
cnSet$SNR = initializeBigVector("crlmmSNR-", ncol(cnSet), "double") |
1235 | 1248 |
cnSet$SKW = initializeBigVector("crlmmSKW-", ncol(cnSet), "double") |
1236 |
- sampleNames(cnSet) <- basename(sampleNames(cnSet)) |
|
1249 |
+ ##sampleNames(cnSet) <- basename(sampleNames(cnSet)) |
|
1237 | 1250 |
return(cnSet) |
1238 | 1251 |
} |
1239 | 1252 |
construct.Illumina <- constructInf |
... | ... |
@@ -1263,8 +1276,7 @@ preprocessInf <- function(cnSet, |
1263 | 1276 |
is.snp = isSnp(cnSet) |
1264 | 1277 |
snp.index = which(is.snp) |
1265 | 1278 |
narrays = ncol(cnSet) |
1266 |
-# sampleBatches <- splitIndicesByLength(seq(length=ncol(cnSet)), ocSamples()) |
|
1267 |
- sampleBatches <- splitIndicesByNode(seq(length=ncol(cnSet))) |
|
1279 |
+ sampleBatches <- splitIndicesByLength(seq_len(ncol(cnSet)), ocSamples()/getDoParWorkers()) |
|
1268 | 1280 |
mixtureParams = initializeBigMatrix("crlmmMixt-", 4, narrays, "double") |
1269 | 1281 |
ocLapply(seq_along(sampleBatches), |
1270 | 1282 |
crlmm:::processIDAT, |
... | ... |
@@ -50,6 +50,21 @@ setMethod("medians", signature(object="AssayData"), |
50 | 50 |
return(res) |
51 | 51 |
}) |
52 | 52 |
|
53 |
+getMedians <- function(object){ |
|
54 |
+ medianA.AA <- assayDataElement(object, "medianA.AA") |
|
55 |
+ medianA.AB <- assayDataElement(object, "medianA.AB") |
|
56 |
+ medianA.BB <- assayDataElement(object, "medianA.BB") |
|
57 |
+ medianB.AA <- assayDataElement(object, "medianB.AA") |
|
58 |
+ medianB.AB <- assayDataElement(object, "medianB.AB") |
|
59 |
+ medianB.BB <- assayDataElement(object, "medianB.BB") |
|
60 |
+ list(A.AA=medianA.AA, |
|
61 |
+ A.AB=medianA.AB, |
|
62 |
+ A.BB=medianA.BB, |
|
63 |
+ B.AA=medianB.AA, |
|
64 |
+ B.AB=medianB.AB, |
|
65 |
+ B.BB=medianB.BB) |
|
66 |
+} |
|
67 |
+ |
|
53 | 68 |
setMethod("mads", signature(object="AssayData"), |
54 | 69 |
function(object, ...){ |
55 | 70 |
batchnames <- sampleNames(object) |
... | ... |
@@ -1,63 +1,6 @@ |
1 | 1 |
setMethod("posteriorMean", signature(object="CNSet"), function(object) assayDataElement(object, "posteriorMean")) |
2 | 2 |
setReplaceMethod("posteriorMean", signature(object="CNSet", value="matrix"), function(object, value) assayDataElementReplace(object, "posteriorMean", value)) |
3 | 3 |
|
4 |
-setAs("CNSet", "oligoSnpSet", function(from, to){ |
|
5 |
- cnSet2oligoSnpSet(from) |
|
6 |
-}) |
|
7 |
- |
|
8 |
-cnSet2oligoSnpSet <- function(object){ |
|
9 |
- row.index <- seq_len(nrow(object)) |
|
10 |
- col.index <- seq_len(ncol(object)) |
|
11 |
- is.lds <- ifelse(is(calls(object), "ff_matrix") | is(calls(object), "ffdf"), TRUE, FALSE) |
|
12 |
- if(is.lds) stopifnot(isPackageLoaded("ff")) |
|
13 |
- b.r <- calculateRBaf(object) |
|
14 |
- b <- integerMatrix(b.r[[1]], 1000) |
|
15 |
- r <- integerMatrix(b.r[[2]], 100) |
|
16 |
-## if(is.lds){ |
|
17 |
-## ## initialize a big matrix for raw copy number |
|
18 |
-## message("creating an ff object for storing total copy number") |
|
19 |
-## tcn <- initializeBigMatrix(name="total_cn", nrow(object), ncol(object), vmode="double") |
|
20 |
-## for(j in 1:ncol(object)){ |
|
21 |
-## tcn[, j] <- totalCopynumber(object, i=row.index, j=j) |
|
22 |
-## } |
|
23 |
-## } else { |
|
24 |
-## if(ncol(object) > 5){ |
|
25 |
-## ##this can be memory intensive, so we try to be careful |
|
26 |
-## col.index <- splitIndicesByLength(seq(length=ncol(object)), 5) |
|
27 |
-## tcn <- matrix(NA, nrow(object), ncol(object)) |
|
28 |
-## dimnames(tcn) <- list(featureNames(object), sampleNames(object)) |
|
29 |
-## rows <- 1:nrow(object) |
|
30 |
-## for(i in seq_along(col.index)){ |
|
31 |
-## cat(".") |
|
32 |
-## j <- col.index[[i]] |
|
33 |
-## cnSet <- object[, j] |
|
34 |
-## tcn[, j] <- totalCopynumber(cnSet, i=row.index, j=1:ncol(cnSet)) |
|
35 |
-## rm(cnSet); gc() |
|
36 |
-## } |
|
37 |
-## cat("\n") |
|
38 |
-## } else { |
|
39 |
-## tcn <- totalCopynumber(object, i=row.index, j=col.index) |
|
40 |
-## } |
|
41 |
-## } |
|
42 |
-## message("Transforming copy number to log2 scale") |
|
43 |
-## tcn[tcn < 0.1] <- 0.1 |
|
44 |
-## tcn[tcn > 8] <- 8 |
|
45 |
-## log.tcn <- log2(tcn) |
|
46 |
- tmp <- new("oligoSnpSet", |
|
47 |
- copyNumber=integerMatrix(b.r[[2]], scale=100), |
|
48 |
- call=calls(object), |
|
49 |
- callProbability=snpCallProbability(object), |
|
50 |
- annotation=annotation(object), |
|
51 |
- featureData=featureData(object), |
|
52 |
- phenoData=phenoData(object), |
|
53 |
- experimentData=experimentData(object), |
|
54 |
- protocolData=protocolData(object), |
|
55 |
- is.integer=FALSE) |
|
56 |
- tmp <- assayDataElementReplace(tmp, "baf", integerMatrix(b.r[[1]], scale=1000)) |
|
57 |
- return(tmp) |
|
58 |
-} |
|
59 |
- |
|
60 |
- |
|
61 | 4 |
linearParamElementReplace <- function(obj, elt, value) { |
62 | 5 |
storage.mode <- storageMode(batchStatistics(obj)) |
63 | 6 |
switch(storage.mode, |
... | ... |
@@ -185,7 +128,7 @@ ACN <- function(object, allele, i , j){ |
185 | 128 |
} |
186 | 129 |
## Define batch.index and sample.index |
187 | 130 |
if(!missing.j) { |
188 |
- batches <- unique(as.character(batch(object))[j]) |
|
131 |
+ batches <- unique(as.character(batch(object)[j])) |
|
189 | 132 |
##batches <- as.character(batch(object)[j]) |
190 | 133 |
batch.index <- match(batches, batchNames(object)) |
191 | 134 |
} else { |
... | ... |
@@ -587,78 +530,129 @@ setMethod("xyplot", signature(x="formula", data="CNSet"), |
587 | 530 |
}) |
588 | 531 |
|
589 | 532 |
setMethod("calculateRBaf", signature(object="CNSet"), |
590 |
- function(object, batch.name){ |
|
591 |
- all.autosomes <- all(chromosome(object) < 23) |
|
592 |
- if(!all.autosomes){ |
|
593 |
- stop("method currently only defined for chromosomes 1-22") |
|
594 |
- } |
|
595 |
- if(missing(batch.name)) batch.name <- batchNames(object)[1] |
|
596 |
- stopifnot(batch.name %in% batchNames(object)) |
|
597 |
- if(length(batch.name) > 1){ |
|
598 |
- warning("only the first batch in batch.name processed") |
|
599 |
- batch.name <- batch.name[1] |
|
600 |
- } |
|
601 |
- RTheta.aa <- calculateRTheta(object, "AA", batch.name) |
|
602 |
- RTheta.ab <- calculateRTheta(object, "AB", batch.name) |
|
603 |
- RTheta.bb <- calculateRTheta(object, "BB", batch.name) |
|
604 |
- |
|
605 |
- J <- which(batch(object) == batch.name) |
|
606 |
- |
|
607 |
- theta.aa <- matrix(RTheta.aa[, "theta"], nrow(object), length(J), byrow=FALSE) |
|
608 |
- theta.ab <- matrix(RTheta.ab[, "theta"], nrow(object), length(J), byrow=FALSE) |
|
609 |
- theta.bb <- matrix(RTheta.bb[, "theta"], nrow(object), length(J), byrow=FALSE) |
|
610 |
- |
|
611 |
- a <- A(object)[, J, drop=FALSE] |
|
612 |
- b <- B(object)[, J, drop=FALSE] ## NA's for b where nonpolymorphic |
|
613 |
- is.np <- !isSnp(object) |
|
614 |
- ##b[is.np, ] <- a[is.np, ] |
|
615 |
- b[is.np, ] <- 0L |
|
616 |
- dns <- dimnames(a) |
|
617 |
- dimnames(a) <- dimnames(b) <- NULL |
|
618 |
- obs.theta <- atan2(b, a)*2/pi |
|
619 |
- |
|
620 |
- lessAA <- obs.theta < theta.aa |
|
621 |
- lessAB <- obs.theta < theta.ab |
|
622 |
- lessBB <- obs.theta < theta.bb |
|
623 |
- grAA <- !lessAA |
|
624 |
- grAB <- !lessAB |
|
625 |
- grBB <- !lessBB |
|
626 |
- not.na <- !is.na(theta.aa) |
|
627 |
- I1 <- grAA & lessAB & not.na |
|
628 |
- I2 <- grAB & lessBB & not.na |
|
629 |
- |
|
630 |
- bf <- matrix(NA, nrow(object), ncol(a)) |
|
631 |
- bf[I1] <- 0.5 * ((obs.theta-theta.aa)/(theta.ab-theta.aa))[I1] |
|
632 |
- bf[I2] <- (.5 * (obs.theta - theta.ab) / (theta.bb - theta.ab))[I2] + 0.5 |
|
633 |
- bf[lessAA] <- 0 |
|
634 |
- bf[grBB] <- 1 |
|
635 |
- |
|
636 |
- r.expected <- matrix(NA, nrow(object), ncol(a)) |
|
637 |
- r.aa <- matrix(RTheta.aa[, "R"], nrow(object), length(J), byrow=FALSE) |
|
638 |
- r.ab <- matrix(RTheta.ab[, "R"], nrow(object), length(J), byrow=FALSE) |
|
639 |
- r.bb <- matrix(RTheta.bb[, "R"], nrow(object), length(J), byrow=FALSE) |
|
640 |
- rm(RTheta.aa, RTheta.ab, RTheta.bb); gc() |
|
641 |
- obs.r <- a+b |
|
642 |
- |
|
643 |
- lessAA <- lessAA & not.na |
|
644 |
- grBB <- grBB & not.na |
|
645 |
- tmp <- ((obs.theta - theta.aa) * (r.ab-r.aa)/(theta.ab-theta.aa))[I1] + r.aa[I1] |
|
646 |
- r.expected[I1] <- tmp |
|
647 |
- tmp <- ((obs.theta - theta.ab) * (r.bb - r.ab)/(theta.bb-theta.ab))[I2] + r.ab[I2] |
|
648 |
- r.expected[I2] <- tmp |
|
649 |
- r.expected[lessAA] <- r.aa[lessAA] |
|
650 |
- r.expected[grBB] <- r.bb[grBB] |
|
533 |
+ function(object, batch.name, chrom){ |
|
534 |
+ calculateRBafCNSet(object, batch.name, chrom) |
|
535 |
+ }) |
|
651 | 536 |
|
652 |
- index.np <- which(is.np) |
|
653 |
- if(length(index.np) > 0){ |
|
654 |
- a.np <- A(object)[index.np, J] |
|
655 |
- meds <- rowMedians(a.np, na.rm=TRUE) |
|
656 |
- r.expected[index.np, ] <- matrix(meds, length(index.np), ncol(a.np)) |
|
657 |
- } |
|
658 |
- lrr <- log2(obs.r/r.expected) |
|
537 |
+calculateRBafCNSet <- function(object, batch.name, chrom){ |
|
538 |
+ if(missing(batch.name)) batch.name <- batchNames(object) |
|
539 |
+ if(missing(chrom)) chrom <- unique(chromosome(object)) |
|
540 |
+ if(!(all(batch.name %in% batchNames(object)))) stop("batch.name must be belong to batchNames(object)") |
|
541 |
+ chr <- chromosome(object) |
|
542 |
+ valid.chrs <- chr <= 23 & chr %in% chrom |
|
543 |
+ index <- which(valid.chrs) |
|
544 |
+ indexlist <- split(index, chr[index]) |
|
545 |
+ J <- which(batch(object) %in% batch.name) |
|
546 |
+ sns <- sampleNames(object)[J] |
|
547 |
+ sampleindex <- split(J, batch(object)[J]) |
|
548 |
+ if(!all(valid.chrs)) warning("Only computing log R ratios and BAFs for autosomes and chr X") |
|
549 |
+ ## if ff package is loaded, these will be ff objects |
|
550 |
+ chr <- names(indexlist) |
|
551 |
+ rlist <- blist <- vector("list", length(indexlist)) |
|
552 |
+ for(i in seq_along(indexlist)){ |
|
553 |
+ I <- indexlist[[i]] |
|
554 |
+ nr <- length(I) |
|
555 |
+ CHR <- names(indexlist)[i] |
|
556 |
+ bafname <- paste("baf_chr", CHR, sep="") |
|
557 |
+ rname <- paste("lrr_chr", CHR, sep="") |
|
558 |
+ bmatrix <- initializeBigMatrix(bafname, nr=nr, nc=length(sns), vmode="integer") |
|
559 |
+ rmatrix <- initializeBigMatrix(rname, nr=nr, nc=length(sns), vmode="integer") |
|
560 |
+ colnames(rmatrix) <- colnames(bmatrix) <- sns |
|
561 |
+ ## put rownames in order of physical position |
|
562 |
+ ix <- order(position(object)[I]) |
|
563 |
+ I <- I[ix] |
|
564 |
+ rownames(rmatrix) <- rownames(bmatrix) <- featureNames(object)[I] |
|
565 |
+ for(j in seq_along(sampleindex)){ |
|
566 |
+ bname <- batch.name[j] |
|
567 |
+ J <- sampleindex[[j]] |
|
568 |
+ res <- calculateRTheta(object=object, |
|
569 |
+ batch.name=bname, |
|
570 |
+ feature.index=I) |
|
571 |
+ k <- match(sampleNames(object)[J], sns) |
|
572 |
+ bmatrix[, k] <- res[["baf"]] |
|
573 |
+ rmatrix[, k] <- res[["lrr"]] |
|
574 |
+ ##colnames(bmatrix)[J] <- colnames(rmatrix)[J] <- sampleNames(object)[J] |
|
575 |
+ } |
|
576 |
+ blist[[i]] <- bmatrix |
|
577 |
+ rlist[[i]] <- rmatrix |
|
578 |
+ } |
|
579 |
+ res <- list(baf=blist, |
|
580 |
+ lrr=rlist) |
|
581 |
+ return(res) |
|
582 |
+} |
|
659 | 583 |
|
660 |
- dimnames(bf) <- dimnames(lrr) <- dns |
|
661 |
- res <- list(baf=bf, |
|
662 |
- lrr=lrr) |
|
663 |
- return(res) |
|
664 |
- }) |
|
584 |
+##setMethod("calculateRBaf", signature(object="CNSet"), |
|
585 |
+## function(object, batch.name){ |
|
586 |
+## all.autosomes <- all(chromosome(object) < 23) |
|
587 |
+## if(!all.autosomes){ |
|
588 |
+## stop("method currently only defined for chromosomes 1-22") |
|
589 |
+## } |
|
590 |
+## if(missing(batch.name)) batch.name <- batchNames(object)[1] |
|
591 |
+## stopifnot(batch.name %in% batchNames(object)) |
|
592 |
+## if(length(batch.name) > 1){ |
|
593 |
+## warning("only the first batch in batch.name processed") |
|
594 |
+## batch.name <- batch.name[1] |
|
595 |
+## } |
|
596 |
+## RTheta.aa <- calculateRTheta(object, "AA", batch.name) |
|
597 |
+## RTheta.ab <- calculateRTheta(object, "AB", batch.name) |
|
598 |
+## RTheta.bb <- calculateRTheta(object, "BB", batch.name) |
|
599 |
+## |
|
600 |
+## J <- which(batch(object) == batch.name) |
|
601 |
+## |
|
602 |
+## theta.aa <- matrix(RTheta.aa[, "theta"], nrow(object), length(J), byrow=FALSE) |
|
603 |
+## theta.ab <- matrix(RTheta.ab[, "theta"], nrow(object), length(J), byrow=FALSE) |
|
604 |
+## theta.bb <- matrix(RTheta.bb[, "theta"], nrow(object), length(J), byrow=FALSE) |
|
605 |
+## |
|
606 |
+## a <- A(object)[, J, drop=FALSE] |
|
607 |
+## b <- B(object)[, J, drop=FALSE] ## NA's for b where nonpolymorphic |
|
608 |
+## is.np <- !isSnp(object) |
|
609 |
+## ##b[is.np, ] <- a[is.np, ] |
|
610 |
+## b[is.np, ] <- 0L |
|
611 |
+## dns <- dimnames(a) |
|
612 |
+## dimnames(a) <- dimnames(b) <- NULL |
|
613 |
+## obs.theta <- atan2(b, a)*2/pi |
|
614 |
+## |
|
615 |
+## lessAA <- obs.theta < theta.aa |
|
616 |
+## lessAB <- obs.theta < theta.ab |
|
617 |
+## lessBB <- obs.theta < theta.bb |
|
618 |
+## grAA <- !lessAA |
|
619 |
+## grAB <- !lessAB |
|
620 |
+## grBB <- !lessBB |
|
621 |
+## not.na <- !is.na(theta.aa) |
|
622 |
+## I1 <- grAA & lessAB & not.na |
|
623 |
+## I2 <- grAB & lessBB & not.na |
|
624 |
+## |
|
625 |
+## bf <- matrix(NA, nrow(object), ncol(a)) |
|
626 |
+## bf[I1] <- 0.5 * ((obs.theta-theta.aa)/(theta.ab-theta.aa))[I1] |
|
627 |
+## bf[I2] <- (.5 * (obs.theta - theta.ab) / (theta.bb - theta.ab))[I2] + 0.5 |
|
628 |
+## bf[lessAA] <- 0 |
|
629 |
+## bf[grBB] <- 1 |
|
630 |
+## |
|
631 |
+## r.expected <- matrix(NA, nrow(object), ncol(a)) |
|
632 |
+## r.aa <- matrix(RTheta.aa[, "R"], nrow(object), length(J), byrow=FALSE) |
|
633 |
+## r.ab <- matrix(RTheta.ab[, "R"], nrow(object), length(J), byrow=FALSE) |
|
634 |
+## r.bb <- matrix(RTheta.bb[, "R"], nrow(object), length(J), byrow=FALSE) |
|
635 |
+## rm(RTheta.aa, RTheta.ab, RTheta.bb); gc() |
|
636 |
+## obs.r <- a+b |
|
637 |
+## |
|
638 |
+## lessAA <- lessAA & not.na |
|
639 |
+## grBB <- grBB & not.na |
|
640 |
+## tmp <- ((obs.theta - theta.aa) * (r.ab-r.aa)/(theta.ab-theta.aa))[I1] + r.aa[I1] |
|
641 |
+## r.expected[I1] <- tmp |
|
642 |
+## tmp <- ((obs.theta - theta.ab) * (r.bb - r.ab)/(theta.bb-theta.ab))[I2] + r.ab[I2] |
|
643 |
+## r.expected[I2] <- tmp |
|
644 |
+## r.expected[lessAA] <- r.aa[lessAA] |
|
645 |
+## r.expected[grBB] <- r.bb[grBB] |
|
646 |
+## index.np <- which(is.np) |
|
647 |
+## if(length(index.np) > 0){ |
|
648 |
+## a.np <- A(object)[index.np, J, drop=FALSE] |
|
649 |
+## meds <- rowMedians(a.np, na.rm=TRUE) |
|
650 |
+## r.expected[index.np, ] <- matrix(meds, length(index.np), ncol(a.np)) |
|
651 |
+## } |
|
652 |
+## lrr <- log2(obs.r/r.expected) |
|
653 |
+## |
|
654 |
+## dimnames(bf) <- dimnames(lrr) <- dns |
|
655 |
+## res <- list(baf=bf, |
|
656 |
+## lrr=lrr) |
|
657 |
+## return(res) |
|
658 |
+## }) |
... | ... |
@@ -152,30 +152,39 @@ celDates <- function(celfiles){ |
152 | 152 |
return(celdts) |
153 | 153 |
} |
154 | 154 |
|
155 |
-validCdfNames <- function(){ |
|
155 |
+illuminaCdfNames <- function(){ |
|
156 |
+ c("human1mv1c",# 1M |
|
157 |
+ "human370v1c", # 370CNV |
|
158 |
+ "human650v3a", # 650Y |
|
159 |
+ "human610quadv1b", # 610 quad |
|
160 |
+ "human660quadv1a", # 660 quad |
|
161 |
+ "human370quadv3c", # 370CNV quad |
|
162 |
+ "human550v3b", # 550K |
|
163 |
+ "human1mduov3b", # 1M Duo |
|
164 |
+ "humanomni1quadv1b", # Omni1 quad |
|
165 |
+ "humanomni25quadv1b", # Omni2.5 quad |
|
166 |
+ "humanomni258v1a", # Omni2.5 8 sample |
|
167 |
+ "humanomniexpress12v1b", # Omni express 12 |
|
168 |
+ "humanimmuno12v1b", # Immuno chip 12 |
|
169 |
+ "humancytosnp12v2p1h") # CytoSNP 12 |
|
170 |
+} |
|
171 |
+ |
|
172 |
+affyCdfNames <- function(){ |
|
156 | 173 |
c("genomewidesnp6", |
157 |
- "genomewidesnp5", |
|
158 |
- "human370v1c", |
|
159 |
- "human370quadv3c", |
|
160 |
- "human550v3b", |
|
161 |
- "human650v3a", |
|
162 |
- "human610quadv1b", |
|
163 |
- "human660quadv1a", |
|
164 |
- "human1mduov3b", |
|
165 |
- "humanomni1quadv1b", |
|
166 |
- "humanomniexpress12v1b", |
|
167 |
- "humanomni25quadv1b", |
|
168 |
- "humanimmuno12v1b", |
|
169 |
- "humancytosnp12v2p1h") |
|
174 |
+ "genomewidesnp5") |
|
170 | 175 |
} |
176 |
+ |
|
177 |
+validCdfNames <- function(){ |
|
178 |
+ c(affyCdfNames(), |
|
179 |
+ illuminaCdfNames()) |
|
180 |
+} |
|
181 |
+ |
|
182 |
+cleancdfname <- function(x) strsplit(x, "Crlmm")[[1]][[1]] |
|
183 |
+ |
|
171 | 184 |
isValidCdfName <- function(cdfName){ |
172 | 185 |
chipList <- validCdfNames() |
173 |
- result <- cdfName %in% chipList |
|
174 |
- if(!(result)){ |
|
175 |
- warning("cdfName must be one of the following: ", |
|
176 |
- chipList) |
|
177 |
- } |
|
178 |
- return(result) |
|
186 |
+ match.arg(cleancdfname(cdfName), chipList) |
|
187 |
+ return(TRUE) |
|
179 | 188 |
} |
180 | 189 |
|
181 | 190 |
isPackageLoaded <- function(pkg){ |
... | ... |
@@ -38,12 +38,12 @@ citEntry(entry="article", |
38 | 38 |
"Aravinda Chakravarti and", |
39 | 39 |
"Rafael A Irizarry"), |
40 | 40 |
journal = "Biostatistics", |
41 |
- year = "2010", |
|
42 |
- volume="", |
|
43 |
- pages="", |
|
41 |
+ year = "2011", |
|
42 |
+ volume="12", |
|
43 |
+ pages="33--50", |
|
44 | 44 |
textVersion = paste("Scharpf RB, Ruczinski I, Carvalho B, Doan B, Chakravarti A, Irizarry RA", |
45 | 45 |
"A multilevel model to address batch effects in copy number estimation using SNP arrays", |
46 |
- "Biostatistics. 2010.")) |
|
46 |
+ "Biostatistics. 2011.")) |
|
47 | 47 |
|
48 | 48 |
citEntry(entry = "Article", |
49 | 49 |
title = "Using the {R} Package {crlmm} for Genotyping and Copy Number Estimation", |
... | ... |
@@ -47,9 +47,11 @@ options(continue=" ", width=70) |
47 | 47 |
\section{Set up} |
48 | 48 |
|
49 | 49 |
<<cdfname, results=hide>>= |
50 |
-library(crlmm) |
|
51 |
-library(ff) |
|
52 |
-library(cacheSweave) |
|
50 |
+library(oligoClasses) |
|
51 |
+library2(crlmm) |
|
52 |
+library2(ff) |
|
53 |
+if(!exists("useCache")) useCache <- TRUE |
|
54 |
+if(useCache) library2(cacheSweave) |
|
53 | 55 |
@ |
54 | 56 |
|
55 | 57 |
This vignette analyzes HapMap samples assayed on the Affymetrix 6.0 |
... | ... |
@@ -67,8 +69,11 @@ cdfName <- "genomewidesnp6" |
67 | 69 |
|
68 | 70 |
The HapMap CEL files are stored in a local directory assigned to |
69 | 71 |
\verb+pathToCels+ in the following code. The genotyping step will |
70 |
-create several files with \verb+ff+ extensions. We will store these |
|
71 |
-files to the path indicated by \verb+outdir+. |
|
72 |
+create several files with \verb+ff+ extensions. The ff objects contain |
|
73 |
+the low-level, normalized intensities as well as parameters used to |
|
74 |
+subsequently estimate copy number and B allele frequencies. These |
|
75 |
+files should not be deleted or moved. We will store these files to |
|
76 |
+the path indicated by \verb+outdir+. |
|
72 | 77 |
|
73 | 78 |
<<setup>>= |
74 | 79 |
pathToCels <- "/thumper/ctsa/snpmicroarray/hapmap/raw/affy/1m" |
... | ... |
@@ -86,7 +91,7 @@ ldPath(outdir) |
86 | 91 |
|
87 | 92 |
% only needed if cacheing |
88 | 93 |
<<cachedir, echo=FALSE>>= |
89 |
-setCacheDir(outdir) |
|
94 |
+if(useCache) setCacheDir(outdir) |
|
90 | 95 |
@ |
91 | 96 |
|
92 | 97 |
The \R{} functions \Rfunction{ocProbesets} and \Rfunction{ocSamples} |
... | ... |
@@ -107,6 +112,10 @@ Yoruban ('Y') samples. |
107 | 112 |
<<celfiles>>= |
108 | 113 |
celFiles <- list.celfiles(pathToCels, full.names=TRUE, pattern=".CEL") |
109 | 114 |
celFiles <- celFiles[substr(basename(celFiles), 13, 13) %in% c("C", "Y")] |
115 |
+if(exists("file.index")){ |
|
116 |
+ celFiles <- celFiles[file.index] |
|
117 |
+} |
|
118 |
+##celFiles <- celFiles[substr(basename(celFiles), 13, 13) %in% "C"] |
|
110 | 119 |
@ |
111 | 120 |
|
112 | 121 |
Finally, copy number analyses using \crlmm{} require specification of |
... | ... |
@@ -140,7 +149,7 @@ confidence score for the genotype calls. The function |
140 | 149 |
\Rfunction{genotype} performs both the preprocessing and genotyping. |
141 | 150 |
|
142 | 151 |
<<LDS_genotype, cache=TRUE>>= |
143 |
-cnSet <- genotype(celFiles, batch=plates, cdfName=cdfName) |
|
152 |
+cnSet <- genotype(celFiles, batch=plates, cdfName=cdfName, genome="hg19") |
|
144 | 153 |
@ |
145 | 154 |
|
146 | 155 |
Segment faults that occur with the above step can often be traced to a |
... | ... |
@@ -169,22 +178,10 @@ genotype calls are stored on disk rather than in active memory. |
169 | 178 |
print(object.size(cnSet), units="Mb") |
170 | 179 |
@ |
171 | 180 |
|
172 |
-We save the \Robject{cnSet} object in a local directory for subsequent |
|
173 |
-copy number analysis in the \verb+copynumber+ vignette. |
|
174 |
- |
|
175 |
-<<save,cache=TRUE>>= |
|
176 |
-saveObject <- function(outdir, cnSet) { |
|
177 |
- save(cnSet, file=file.path(outdir, "cnSet.rda")) |
|
178 |
- TRUE |
|
179 |
-} |
|
180 |
-(cnset.saved <- saveObject(outdir, cnSet)) |
|
181 |
-@ |
|
182 |
- |
|
183 | 181 |
%Users can proceed to the \verb+copynumber+ vignette for copy number |
184 | 182 |
%analyses. See the \verb+Infrastructure+ vignette for additional |
185 | 183 |
%details on the \Robject{CNSet} class, including an overview of the |
186 | 184 |
%available accessors. |
187 |
- |
|
188 | 185 |
\SweaveInput{copynumber} |
189 | 186 |
|
190 | 187 |
|
... | ... |
@@ -6,6 +6,7 @@ |
6 | 6 |
\usepackage{graphicx} |
7 | 7 |
\usepackage{natbib} |
8 | 8 |
\usepackage{amsmath} |
9 |
+\usepackage{url} |
|
9 | 10 |
\usepackage[margin=1in]{geometry} |
10 | 11 |
\newcommand{\crlmm}{\Rpackage{crlmm}} |
11 | 12 |
\newcommand{\Rfunction}[1]{{\texttt{#1}}} |
... | ... |
@@ -233,10 +234,10 @@ wrapper to the above functions and returns an object of class |
233 | 234 |
|
234 | 235 |
<<genotype.Illumina,cache=TRUE,results=hide>>= |
235 | 236 |
cnSet <- genotype.Illumina(sampleSheet=samplesheet, |
236 |
- arrayNames=arrayNames, |
|
237 |
- arrayInfoColNames=arrayInfo, |
|
238 |
- cdfName="human370v1c", |
|
239 |
- batch=batch) |
|
237 |
+ arrayNames=arrayNames, |
|
238 |
+ arrayInfoColNames=arrayInfo, |
|
239 |
+ cdfName="human370v1c", |
|
240 |
+ batch=batch) |
|
240 | 241 |
@ |
241 | 242 |
|
242 | 243 |
|
... | ... |
@@ -129,11 +129,9 @@ directory to store output files and setting the \Rfunction{ldPath} |
129 | 129 |
function with this path. |
130 | 130 |
|
131 | 131 |
<<path>>= |
132 |
-if(getRversion() < "2.13.0"){ |
|
133 |
- rpath <- getRversion() |
|
134 |
-} else rpath <- "trunk" |
|
135 |
-outdir <- paste("/thumper/ctsa/snpmicroarray/rs/ProcessedData/crlmm/", rpath, "/copynumber_vignette", sep="") |
|
132 |
+outdir <- paste("/local_data/r00/crlmm/", getRversion(), "/infrastructure", sep="") |
|
136 | 133 |
ldPath(outdir) |
134 |
+if(!file.exists(outdir)) dir.create(outdir) |
|
137 | 135 |
@ |
138 | 136 |
|
139 | 137 |
|
... | ... |
@@ -258,6 +256,7 @@ is a SNP can be accessed through accessors defined for the |
258 | 256 |
\verb+featureData+. |
259 | 257 |
|
260 | 258 |
<<featuredataAccessors>>= |
259 |
+library(Biobase) |
|
261 | 260 |
fvarLabels(exampleSet) |
262 | 261 |
position(exampleSet)[1:10] |
263 | 262 |
chromosome(exampleSet)[1:10] |
... | ... |
@@ -554,7 +553,7 @@ estimates are missing for a given SNP. |
554 | 553 |
|
555 | 554 |
<<NAchromosomeX>>= |
556 | 555 |
## start with first batch |
557 |
-sample.index <- 1:90 |
|
556 |
+sample.index <- which(batch(cnSet)==batch(cnSet)[1]) |
|
558 | 557 |
X.index <- which(isSnp(cnSet) & chromosome(cnSet) == 23) |
559 | 558 |
ca.X <- CA(cnSet, i=X.index, j=sample.index) |
560 | 559 |
missing.caX <- is.na(ca.X) |
... | ... |
@@ -154,7 +154,7 @@ formally, \newcommand{\A}{A} \newcommand{\B}{B} |
154 | 154 |
\phi}_{k,ip}}\left(I_{k,ijp}-{\hat \nu}_{k,ip}\right), ~0\right\} |
155 | 155 |
\mbox{~for~} k \in \{\A, \B\}. |
156 | 156 |
\end{eqnarray} |
157 |
-\noindent See \cite{Scharpf2010} for details. |
|
157 |
+\noindent See \cite{Scharpf2011} for details. |
|
158 | 158 |
|
159 | 159 |
The function \Rfunction{totalCopynumber} translates the normalized |
160 | 160 |
intensities to an estimate of raw copy number by adding the |
... | ... |
@@ -175,42 +175,49 @@ dim(tmp2) |
175 | 175 |
|
176 | 176 |
Alternatively, the functions \Rfunction{CA} and \Rfunction{CB} compute |
177 | 177 |
the allele-specific copy number. For instance, the following code |
178 |
-chunk computes the allele-specific summaries at all polymorphic loci. |
|
178 |
+chunk computes the allele-specific summaries at all polymorphic loci |
|
179 |
+for the first 2 samples. |
|
179 | 180 |
|
180 | 181 |
<<ca>>= |
181 | 182 |
snp.index <- which(isSnp(cnSet) & !is.na(chromosome(cnSet))) |
182 |
-ca <- CA(cnSet, i=snp.index, j=seq_len(ncol(cnSet))) |
|
183 |
-cb <- CB(cnSet, i=snp.index, j=seq_len(ncol(cnSet))) |
|
183 |
+ca <- CA(cnSet, i=snp.index, j=1:2) |
|
184 |
+cb <- CB(cnSet, i=snp.index, j=1:2) |
|
184 | 185 |
@ |
185 | 186 |
|
186 | 187 |
|
187 |
-\subsection{Container for raw copy number} |
|
188 |
+\subsection{Container for log R ratios and B allele frequencies} |
|
188 | 189 |
|
189 | 190 |
A useful container for storing the \crlmm{} genotypes, genotype |
190 |
-confidence scores, and the total copy number at each marker is the |
|
191 |
-\Rclass{oligoSnpSet} class. Coercion of a \Rclass{CNSet} object to a |
|
192 |
-\Rfunction{oligoSnpSet} object can be acheived by using the method |
|
193 |
-\Rfunction{as} (as illustrated below). Users should note that if the |
|
194 |
-\verb+assayData+ elements in the \Rclass{CNSet} instance are |
|
195 |
-\Rpackage{ff} objects, the \verb+assayData+ elements of the |
|
196 |
-instantiated \Rclass{oligoSnpSet} will also be \Rpackage{ff}-dervied |
|
197 |
-objects (a new \verb+total_cn*.ff+ file will be created in the |
|
198 |
-\verb+ldPath()+ directory). |
|
191 |
+confidence scores, and the total or relative copy number at each |
|
192 |
+marker is the \Rclass{oligoSetList} class. Coercion of a |
|
193 |
+\Rclass{CNSet} object to a \Rfunction{oligoSnpSet} object can be |
|
194 |
+acheived by using the function \Rfunction{constructOligoSetFrom} as |
|
195 |
+illustrated below. Users should note that if the \verb+assayData+ |
|
196 |
+elements in the \Rclass{CNSet} instance are \Rpackage{ff} objects, the |
|
197 |
+\verb+assayData+ elements of each element in the \Rclass{oligoSetList} |
|
198 |
+object will be \Rpackage{ff}-dervied objects (a new |
|
199 |
+\verb+total_cn*.ff+ file will be created in the \verb+ldPath()+ |
|
200 |
+directory). |
|
199 | 201 |
|
200 | 202 |
<<oligoSnpSet>>= |
203 |
+library(VanillaICE) |
|
201 | 204 |
open(cnSet3) |
202 |
-oligoSet <- as(cnSet3, "oligoSnpSet") |
|
205 |
+oligoSetList <- constructOligoSetListFrom(cnSet3, batch.name=batch(cnSet3)[1]) |
|
203 | 206 |
close(cnSet3) |
204 |
-class(copyNumber(oligoSet)) |
|
207 |
+show(oligoSetList) |
|
208 |
+class(oligoSetList) |
|
209 |
+## oligoSnpSet of first chromosome |
|
210 |
+oligoSetList[[1]] |
|
205 | 211 |
@ |
206 | 212 |
|
207 |
-\noindent Note that the raw copy number estimates stored in the |
|
208 |
-\Robject{oligoSnpSet} object can be retrieved by the |
|
209 |
-\Rfunction{copyNumber} accessor and is equivalent to that returned by |
|
210 |
-the \Rfunction{totalCopynumber} function defined over the same row and |
|
211 |
-column indices. |
|
213 |
+\noindent Note that log R ratios stored in the \Robject{oligoSnpSet} |
|
214 |
+object can be retrieved by the \Rfunction{copyNumber} accessor. B |
|
215 |
+allele frequences are retrieved by the \Rfunction{baf} accessor. |
|
212 | 216 |
|
213 | 217 |
<<testEqual>>= |
214 |
-total.cn3 <- totalCopynumber(cnSet3, i=1:nrow(cnSet3), j=seq_len(ncol(cnSet3))) |
|
215 |
-all.equal(copyNumber(oligoSet), total.cn3) |
|
216 |
-@ |
|
217 | 218 |
\ No newline at end of file |
219 |
+lrrList <- copyNumber(oligoSetList) |
|
220 |
+class(lrrList) |
|
221 |
+dim(lrrList[[1]]) ## log R ratios for chromosome 1. |
|
222 |
+bafList <- baf(oligoSetList) |
|
223 |
+dim(bafList[[1]]) ## B allele frequencies for chromosome 1 |
|
224 |
+@ |
15 | 15 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,39 @@ |
1 |
+test_crlmm <- function(){ |
|
2 |
+ library(oligoClasses) |
|
3 |
+ if (require(genomewidesnp6Crlmm) & require(hapmapsnp6)){ |
|
4 |
+ path <- system.file("celFiles", package="hapmapsnp6") |
|
5 |
+ cels <- list.celfiles(path, full.names=TRUE) |
|
6 |
+ (crlmmOutput <- crlmm(cels)) |
|
7 |
+ } |
|
8 |
+} |
|
9 |
+ |
|
10 |
+test_duplicates <- function(){ |
|
11 |
+ library(crlmm);library(RUnit); library(oligoClasses) |
|
12 |
+ if (require(genomewidesnp6Crlmm) & require(hapmapsnp6)){ |
|
13 |
+ path <- system.file("celFiles", package="hapmapsnp6") |
|
14 |
+ cels <- list.celfiles(path, full.names=TRUE) |
|
15 |
+ ## Quickly fails because sample identifiers are not unique |
|
16 |
+ checkException(crlmmOutput <- crlmm(cels[c(1,1, 2)]), silent=FALSE) |
|
17 |
+ |
|
18 |
+ } |
|
19 |
+ if(FALSE){ |
|
20 |
+ library2(ff) |
|
21 |
+ datadir <- "/thumper/ctsa/snpmicroarray/illumina/IDATS/370k" |
|
22 |
+ ## read in your samplesheet |
|
23 |
+ samplesheet <- read.csv(file.path(datadir, "HumanHap370Duo_Sample_Map.csv"), header=TRUE, as.is=TRUE) |
|
24 |
+ samplesheet <- samplesheet[-c(28:46,61:75,78:79), ] |
|
25 |
+ arrayNames <- file.path(datadir, unique(samplesheet[, "SentrixPosition"])) |
|
26 |
+ index <- c(1,1,2,2,3,4) |
|
27 |
+ any(duplicate(arrayNames[index])) |
|
28 |
+ arrayNames <- arrayNames[index] |
|
29 |
+ ss <- samplesheet[index, ] |
|
30 |
+ arrayInfo <- list(barcode=NULL, position="SentrixPosition") |
|
31 |
+ cnSet <- genotype.Illumina(sampleSheet=ss, |
|
32 |
+ arrayNames=arrayNames, |
|
33 |
+ arrayInfoColNames=arrayInfo, |
|
34 |
+ cdfName="human370v1c", |
|
35 |
+ batch=rep("1", nrow(ss))) |
|
36 |
+ checkTrue(validObject(cnSet)) |
|
37 |
+ } |
|
38 |
+ |
|
39 |
+} |
... | ... |
@@ -5,45 +5,63 @@ |
5 | 5 |
\description{Calculate log R ratios and B allele frequencies from |
6 | 6 |
a \code{CNSet} object} |
7 | 7 |
\usage{ |
8 |
-calculateRBaf(object, batch.name) |
|
8 |
+calculateRBaf(object, batch.name, chrom) |
|
9 | 9 |
} |
10 | 10 |
|
11 | 11 |
\arguments{ |
12 |
+ |
|
12 | 13 |
\item{object}{A \code{CNSet} object.} |
13 |
- \item{batch.name}{A character string. See details.} |
|
14 |
+ |
|
15 |
+ \item{batch.name}{A character string indicating the batch. If |
|
16 |
+ missing, log R ratios and B allele frequencies are calculated for all |
|
17 |
+ batches in the \code{object}.} |
|
18 |
+ |
|
19 |
+ \item{chrom}{Integer indicating which chromosome to process. If |
|
20 |
+ missing, B allele frequencies and log R ratios are calculated for all |
|
21 |
+ autosomal chromosomes and chromosome X that are included in |
|
22 |
+ \code{object}.} |
|
23 |
+ |
|
14 | 24 |
} |
15 |
-\details{\code{batch.name} must be a value in |
|
16 |
- \code{batch(object)}. Currently, one must specify a single |
|
17 |
- \code{batch.name}. If a character vector for \code{batch.name} is |
|
18 |
- supplied, only the first is evaluated. |
|
25 |
+ |
|
26 |
+\details{ |
|
27 |
+ |
|
28 |
+ \code{batch.name} must be a value in \code{batch(object)}. Currently, |
|
29 |
+ one must specify a single \code{batch.name}. If a character vector for |
|
30 |
+ \code{batch.name} is supplied, only the first is evaluated. |
|
19 | 31 |
|
20 | 32 |
TODO: A description of how these values are calculated. |
21 | 33 |
|
22 | 34 |
} |
23 | 35 |
|
24 | 36 |
\value{ |
25 |
- A list. |
|
37 |
+ A named list. |
|
26 | 38 |
|
27 |
- \code{baf}: A matrix of B allele frequencies. |
|
39 |
+ \code{baf}: Each element in the baf list is a matrix of B allele |
|
40 |
+ frequencies (one matrix for each chromosome). |
|
28 | 41 |
|
29 |
- \code{lrr}: A matrix of log R ratios. |
|
42 |
+ \code{lrr}: Each element in the lrr list is a matrix of log R ratios |
|
43 |
+ (one matrix for each chromosome). |
|
44 |
+ |
|
45 |
+ The log R ratios were scaled by a factor of 100 and stored as an |
|
46 |
+ integer. B allele frequencies were scaled by a factor of 1000 and |
|
47 |
+ stored as an integer. |
|
30 | 48 |
|
31 | 49 |
} |
32 | 50 |
|
33 | 51 |
\references{ |
34 | 52 |
|
35 |
- Reference for BAFs, LRRs. |
|
53 |
+ Peiffer et al., High-resolution genomic profiling of chromosomal aberrations using |
|
54 |
+ Infinium whole-genome genotyping (2006), Genome Research |
|
36 | 55 |
|
37 | 56 |
} |
57 |
+ |
|
38 | 58 |
\author{Lynn Mireless} |
39 | 59 |
|
40 | 60 |
\examples{ |
41 | 61 |
data(cnSetExample) |
42 | 62 |
baf.lrr <- calculateRBaf(cnSetExample, "SHELF") |
43 |
-hist(baf.lrr[["baf"]], breaks=100) |
|
44 |
-hist(baf.lrr[["lrr"]], breaks=100) |
|
63 |
+hist(baf.lrr[["baf"]][[1]]/1000, breaks=100) |
|
64 |
+hist(baf.lrr[["lrr"]][[1]]/100, breaks=100) |
|
45 | 65 |
} |
46 |
-% Add one or more standard keywords, see file 'KEYWORDS' in the |
|
47 |
-% R documentation directory. |
|
48 | 66 |
\keyword{list} |
49 |
-\keyword{methods} |
|
50 | 67 |
\ No newline at end of file |
68 |
+\keyword{methods} |
... | ... |
@@ -12,7 +12,7 @@ |
12 | 12 |
genotype(filenames, cdfName, batch, mixtureSampleSize = 10^5, eps =0.1, |
13 | 13 |
verbose = TRUE, seed = 1, sns, probs = rep(1/3, 3), |
14 | 14 |
DF = 6, SNRMin = 5, recallMin = 10, recallRegMin = 1000, |
15 |
- gender = NULL, returnParams = TRUE, badSNP = 0.7) |
|
15 |
+ gender = NULL, returnParams = TRUE, badSNP = 0.7, genome=c("hg19", "hg18")) |
|
16 | 16 |
} |
17 | 17 |
\arguments{ |
18 | 18 |
\item{filenames}{ complete path to CEL files} |
... | ... |
@@ -37,7 +37,10 @@ genotype(filenames, cdfName, batch, mixtureSampleSize = 10^5, eps =0.1, |
37 | 37 |
with same length as filenames. If missing, the gender is predicted.} |
38 | 38 |
\item{returnParams}{'logical'. Return recalibrated parameters from |
39 | 39 |
crlmm.} |
40 |
- \item{badSNP}{'numeric'. Threshold to flag as bad SNP (affects batchQC)} |
|
40 |
+ \item{badSNP}{'numeric'. Threshold to flag as bad SNP (affects |
|
41 |
+ batchQC)} |
|
42 |
+ \item{genome}{character string indicating the UCSC genome build for |
|
43 |
+ the SNP annotation} |
|
41 | 44 |
} |
42 | 45 |
|
43 | 46 |
\details{ |