git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@54164 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -1,7 +1,7 @@ |
1 | 1 |
Package: crlmm |
2 | 2 |
Type: Package |
3 | 3 |
Title: Genotype Calling (CRLMM) and Copy Number Analysis tool for Affymetrix SNP 5.0 and 6.0 and Illumina arrays. |
4 |
-Version: 1.9.19 |
|
4 |
+Version: 1.9.20 |
|
5 | 5 |
Date: 2010-12-10 |
6 | 6 |
Author: Benilton S Carvalho <Benilton.Carvalho@cancer.org.uk>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.edu.au>, Ingo Ruczinski <iruczins@jhsph.edu>, Rafael A Irizarry |
7 | 7 |
Maintainer: Benilton S Carvalho <Benilton.Carvalho@cancer.org.uk>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.EDU.AU> |
... | ... |
@@ -100,21 +100,21 @@ construct <- function(filenames, |
100 | 100 |
} |
101 | 101 |
|
102 | 102 |
genotype <- function(filenames, |
103 |
- cdfName, |
|
104 |
- batch, |
|
105 |
- mixtureSampleSize=10^5, |
|
106 |
- eps=0.1, |
|
107 |
- verbose=TRUE, |
|
108 |
- seed=1, |
|
109 |
- sns, |
|
110 |
- probs=rep(1/3, 3), |
|
111 |
- DF=6, |
|
112 |
- SNRMin=5, |
|
113 |
- recallMin=10, |
|
114 |
- recallRegMin=1000, |
|
103 |
+ cdfName, |
|
104 |
+ batch, |
|
105 |
+ mixtureSampleSize=10^5, |
|
106 |
+ eps=0.1, |
|
107 |
+ verbose=TRUE, |
|
108 |
+ seed=1, |
|
109 |
+ sns, |
|
110 |
+ probs=rep(1/3, 3), |
|
111 |
+ DF=6, |
|
112 |
+ SNRMin=5, |
|
113 |
+ recallMin=10, |
|
114 |
+ recallRegMin=1000, |
|
115 | 115 |
gender=NULL, |
116 |
- returnParams=TRUE, |
|
117 |
- badSNP=0.7){ |
|
116 |
+ returnParams=TRUE, |
|
117 |
+ badSNP=0.7){ |
|
118 | 118 |
is.lds <- ifelse(isPackageLoaded("ff"), TRUE, FALSE) |
119 | 119 |
if(!is.lds) stop("this function now requires that the ff package be loaded") |
120 | 120 |
if(missing(cdfName)) stop("must specify cdfName") |
... | ... |
@@ -498,6 +498,7 @@ fit.lm1 <- function(strata, |
498 | 498 |
MIN.PHI, |
499 | 499 |
verbose, is.lds, |
500 | 500 |
CHR.X, ...){ |
501 |
+ open(object$gender) |
|
501 | 502 |
if(is.lds) {physical <- get("physical"); open(object)} |
502 | 503 |
if(verbose) message(" Probe stratum ", strata, " of ", length(index.list)) |
503 | 504 |
snps <- index.list[[strata]] |
... | ... |
@@ -606,6 +607,7 @@ fit.lm2 <- function(strata, |
606 | 607 |
MIN.PHI, |
607 | 608 |
verbose, is.lds, CHR.X, ...){ |
608 | 609 |
if(is.lds) {physical <- get("physical"); open(object)} |
610 |
+ open(object$gender) |
|
609 | 611 |
if(verbose) message(" Probe stratum ", strata, " of ", length(index.list)) |
610 | 612 |
marker.index <- index.list[[strata]] |
611 | 613 |
batches <- split(seq_along(batch(object)), as.character(batch(object))) |
... | ... |
@@ -663,7 +665,9 @@ summarizeMaleXNps <- function(marker.index, |
663 | 665 |
nr <- length(marker.index) |
664 | 666 |
nc <- length(batchNames(object)) |
665 | 667 |
NN.Mlist <- imputed.medianA <- imputed.medianB <- shrink.madA <- shrink.madB <- vector("list", nc) |
666 |
- gender <- object$gender |
|
668 |
+ open(object$gender) |
|
669 |
+ gender <- object$gender[] |
|
670 |
+ open(A(object)) |
|
667 | 671 |
AA <- as.matrix(A(object)[marker.index, gender==1]) |
668 | 672 |
madA.AA <- medianA.AA <- matrix(NA, nr, nc) |
669 | 673 |
colnames(madA.AA) <- colnames(medianA.AA) <- batchNames(object) |
... | ... |
@@ -697,10 +701,11 @@ summarizeXGenotypes <- function(marker.index, |
697 | 701 |
DF.PRIOR, |
698 | 702 |
gender="male", ...){ |
699 | 703 |
I <- unlist(batches) |
704 |
+ open(object$gender) |
|
700 | 705 |
if(gender == "male"){ |
701 |
- gender.index <- which(object$gender == 1) |
|
706 |
+ gender.index <- which(object$gender[] == 1) |
|
702 | 707 |
} else { |
703 |
- gender.index <- which(object$gender == 2) |
|
708 |
+ gender.index <- which(object$gender[] == 2) |
|
704 | 709 |
} |
705 | 710 |
batches <- lapply(batches, function(x, gender.index) intersect(x, gender.index), gender.index) |
706 | 711 |
batch.names <- names(batches) |
... | ... |
@@ -709,6 +714,10 @@ summarizeXGenotypes <- function(marker.index, |
709 | 714 |
nc <- length(batch.index) |
710 | 715 |
NN.Mlist <- medianA <- medianB <- shrink.madA <- shrink.madB <- vector("list", nc) |
711 | 716 |
names(NN.Mlist) <- names(medianA) <- names(medianB) <- names(shrink.madA) <- names(shrink.madB) <- batch.names |
717 |
+ open(calls(object)) |
|
718 |
+ open(snpCallProbability(object)) |
|
719 |
+ open(A(object)) |
|
720 |
+ open(B(object)) |
|
712 | 721 |
GG <- as.matrix(calls(object)[marker.index, ]) |
713 | 722 |
CP <- as.matrix(snpCallProbability(object)[marker.index, ]) |
714 | 723 |
AA <- as.matrix(A(object)[marker.index, ]) |
... | ... |
@@ -819,6 +828,10 @@ summarizeXGenotypes <- function(marker.index, |
819 | 828 |
medianA[[k]] <- res[[1]] |
820 | 829 |
medianB[[k]] <- res[[2]] |
821 | 830 |
} |
831 |
+ close(calls(object)) |
|
832 |
+ close(snpCallProbability(object)) |
|
833 |
+ close(A(object)) |
|
834 |
+ close(B(object)) |
|
822 | 835 |
return(list(madA=shrink.madA, |
823 | 836 |
madB=shrink.madB, |
824 | 837 |
NN=NN.Mlist, |
... | ... |
@@ -841,7 +854,8 @@ fit.lm3 <- function(strata, |
841 | 854 |
verbose, is.lds, CHR.X, ...){ |
842 | 855 |
##if(is.lds) {physical <- get("physical"); open(object)} |
843 | 856 |
if(verbose) message(" Probe stratum ", strata, " of ", length(index.list)) |
844 |
- gender <- object$gender |
|
857 |
+ open(object$gender) |
|
858 |
+ gender <- object$gender[] |
|
845 | 859 |
enough.males <- sum(gender==1) >= MIN.SAMPLES |
846 | 860 |
enough.females <- sum(gender==2) >= MIN.SAMPLES |
847 | 861 |
if(!enough.males & !enough.females){ |
... | ... |
@@ -1011,7 +1025,8 @@ fit.lm4 <- function(strata, |
1011 | 1025 |
verbose, is.lds=TRUE, ...){ |
1012 | 1026 |
##if(is.lds) {physical <- get("physical"); open(object)} |
1013 | 1027 |
## exclude batches that have fewer than MIN.SAMPLES |
1014 |
- gender <- object$gender |
|
1028 |
+ open(object$gender) |
|
1029 |
+ gender <- object$gender[] |
|
1015 | 1030 |
enough.males <- sum(gender==1) >= MIN.SAMPLES |
1016 | 1031 |
enough.females <- sum(gender==2) >= MIN.SAMPLES |
1017 | 1032 |
if(!enough.males & !enough.females){ |
... | ... |
@@ -1434,7 +1449,7 @@ shrinkSummary <- function(object, |
1434 | 1449 |
DF.PRIOR=50, |
1435 | 1450 |
verbose=TRUE, |
1436 | 1451 |
marker.index, |
1437 |
- is.lds){ |
|
1452 |
+ is.lds=TRUE){ |
|
1438 | 1453 |
stopifnot(type[[1]] == "SNP") |
1439 | 1454 |
CHR.X <- FALSE ## this is no longer needed |
1440 | 1455 |
if(missing(marker.index)){ |
... | ... |
@@ -1469,7 +1484,7 @@ shrinkSummary <- function(object, |
1469 | 1484 |
DF.PRIOR=DF.PRIOR, |
1470 | 1485 |
is.lds=is.lds) |
1471 | 1486 |
} |
1472 |
- return(object) |
|
1487 |
+ TRUE |
|
1473 | 1488 |
} |
1474 | 1489 |
|
1475 | 1490 |
genotypeSummary <- function(object, |
... | ... |
@@ -1522,7 +1537,7 @@ genotypeSummary <- function(object, |
1522 | 1537 |
is.lds=is.lds, |
1523 | 1538 |
CHR.X=CHR.X) |
1524 | 1539 |
} |
1525 |
- return(object) |
|
1540 |
+ TRUE |
|
1526 | 1541 |
} |
1527 | 1542 |
|
1528 | 1543 |
whichMarkers <- function(type, is.snp, is.autosome, is.annotated, is.X){ |
... | ... |
@@ -1552,6 +1567,8 @@ summarizeNps <- function(strata, index.list, object, batchSize, |
1552 | 1567 |
nc <- length(batchnames) |
1553 | 1568 |
N.AA <- medianA.AA <- madA.AA <- tau2A.AA <- matrix(NA, nr, nc) |
1554 | 1569 |
is.illumina <- whichPlatform(annotation(object)) == "illumina" |
1570 |
+ open(A(object)) |
|
1571 |
+ open(object$gender) |
|
1555 | 1572 |
AA <- as.matrix(A(object)[index, ]) |
1556 | 1573 |
if(is.illumina){ |
1557 | 1574 |
BB <- as.matrix(B(object)[index, ]) |
... | ... |
@@ -1593,7 +1610,6 @@ summarizeSnps <- function(strata, |
1593 | 1610 |
## physical <- get("physical") |
1594 | 1611 |
## open(object) |
1595 | 1612 |
## } |
1596 |
- open(object) |
|
1597 | 1613 |
if(verbose) message(" Probe stratum ", strata, " of ", length(index.list)) |
1598 | 1614 |
index <- index.list[[strata]] |
1599 | 1615 |
batches <- split(seq_along(batch(object)), as.character(batch(object))) |
... | ... |
@@ -1789,6 +1805,7 @@ crlmmCopynumber <- function(object, |
1789 | 1805 |
is.lds <- is(calls(object), "ffdf") | is(calls(object), "ff_matrix") |
1790 | 1806 |
if(is.lds) stopifnot(isPackageLoaded("ff")) |
1791 | 1807 |
samplesPerBatch <- table(as.character(batch(object))) |
1808 |
+ open(object) |
|
1792 | 1809 |
if(any(samplesPerBatch < MIN.SAMPLES)){ |
1793 | 1810 |
tmp <- paste(names(samplesPerBatch)[samplesPerBatch < MIN.SAMPLES], collapse=", ") |
1794 | 1811 |
message("The following batches have fewer than ", MIN.SAMPLES, " samples: ", tmp) |
... | ... |
@@ -1810,25 +1827,25 @@ crlmmCopynumber <- function(object, |
1810 | 1827 |
##if(verbose) message(paste("Computing summary statistics for ", mylabel(marker.type), " genotype clusters for each batch") |
1811 | 1828 |
marker.index <- whichMarkers(marker.type, is.snp, |
1812 | 1829 |
is.autosome, is.annotated, is.X) |
1813 |
- object <- genotypeSummary(object=object, |
|
1814 |
- GT.CONF.THR=GT.CONF.THR, |
|
1815 |
- type=marker.type, |
|
1816 |
- verbose=verbose, |
|
1817 |
- marker.index=marker.index, |
|
1818 |
- is.lds=is.lds) |
|
1830 |
+ genotypeSummary(object=object, |
|
1831 |
+ GT.CONF.THR=GT.CONF.THR, |
|
1832 |
+ type=marker.type, |
|
1833 |
+ verbose=verbose, |
|
1834 |
+ marker.index=marker.index, |
|
1835 |
+ is.lds=is.lds) |
|
1819 | 1836 |
} |
1820 | 1837 |
if(verbose) message("Imputing unobserved genotype medians and shrinking the variances (within-batch, across loci) ")##SNPs only |
1821 | 1838 |
if("SNP" %in% type){ |
1822 | 1839 |
marker.index <- whichMarkers("SNP", is.snp, |
1823 | 1840 |
is.autosome, is.annotated, is.X) |
1824 |
- object <- shrinkSummary(object=object, |
|
1825 |
- MIN.OBS=MIN.OBS, |
|
1826 |
- MIN.SAMPLES=MIN.SAMPLES, |
|
1827 |
- DF.PRIOR=DF.PRIOR, |
|
1828 |
- type="SNP", |
|
1829 |
- verbose=verbose, |
|
1830 |
- marker.index=marker.index, |
|
1831 |
- is.lds=is.lds) |
|
1841 |
+ shrinkSummary(object=object, |
|
1842 |
+ MIN.OBS=MIN.OBS, |
|
1843 |
+ MIN.SAMPLES=MIN.SAMPLES, |
|
1844 |
+ DF.PRIOR=DF.PRIOR, |
|
1845 |
+ type="SNP", |
|
1846 |
+ verbose=verbose, |
|
1847 |
+ marker.index=marker.index, |
|
1848 |
+ is.lds=is.lds) |
|
1832 | 1849 |
} |
1833 | 1850 |
if(verbose) message("Estimating parameters for copy number")##SNPs only |
1834 | 1851 |
for(i in seq_along(type)){ |
... | ... |
@@ -1852,6 +1869,7 @@ crlmmCopynumber <- function(object, |
1852 | 1869 |
is.lds=is.lds, |
1853 | 1870 |
CHR.X=CHR.X) |
1854 | 1871 |
} |
1872 |
+ close(object) |
|
1855 | 1873 |
TRUE |
1856 | 1874 |
} |
1857 | 1875 |
crlmmCopynumber2 <- function(){ |
... | ... |
@@ -1008,16 +1008,14 @@ getProtocolData.Illumina = function(filenames, sep="_", fileExt="Grn.idat", verb |
1008 | 1008 |
} |
1009 | 1009 |
|
1010 | 1010 |
|
1011 |
-construct.Illumina <- function(sampleSheet=NULL, |
|
1011 |
+constructInf <- function(sampleSheet=NULL, |
|
1012 | 1012 |
arrayNames=NULL, |
1013 |
- ids=NULL, |
|
1014 | 1013 |
path=".", |
1015 | 1014 |
arrayInfoColNames=list(barcode="SentrixBarcode_A", position="SentrixPosition_A"), |
1016 | 1015 |
highDensity=FALSE, |
1017 | 1016 |
sep="_", |
1018 | 1017 |
fileExt=list(green="Grn.idat", red="Red.idat"), |
1019 | 1018 |
cdfName, |
1020 |
- copynumber=TRUE, |
|
1021 | 1019 |
verbose=FALSE, |
1022 | 1020 |
batch, #fns, |
1023 | 1021 |
saveDate=TRUE) { #, outdir="."){ |
... | ... |
@@ -1082,14 +1080,8 @@ construct.Illumina <- function(sampleSheet=NULL, |
1082 | 1080 |
if(!all(file.exists(redidats))) stop("Missing some of the *Red.idat files") |
1083 | 1081 |
|
1084 | 1082 |
if(verbose) message("Initializing container for genotyping and copy number estimation") |
1085 |
- featureData = getFeatureData(cdfName, copynumber=copynumber) |
|
1086 |
-# if(!missing(fns)){ |
|
1087 |
-# index = match(fns, featureNames(featureData)) |
|
1088 |
-# if(all(is.na(index))) stop("fns not in featureNames") |
|
1089 |
-# featureData = featureData[index, ] |
|
1090 |
-# } |
|
1083 |
+ featureData = getFeatureData(cdfName, copynumber=TRUE) |
|
1091 | 1084 |
nr = nrow(featureData); nc = narrays |
1092 |
-# ldPath(outdir) |
|
1093 | 1085 |
cnSet <- new("CNSet", |
1094 | 1086 |
alleleA=initializeBigMatrix(name="A", nr, nc), |
1095 | 1087 |
alleleB=initializeBigMatrix(name="B", nr, nc), |
... | ... |
@@ -1114,13 +1106,12 @@ construct.Illumina <- function(sampleSheet=NULL, |
1114 | 1106 |
cnSet$SNR = initializeBigVector("crlmmSNR-", ncol(cnSet), "double") |
1115 | 1107 |
cnSet$SKW = initializeBigVector("crlmmSKW-", ncol(cnSet), "double") |
1116 | 1108 |
sampleNames(cnSet) <- basename(sampleNames(cnSet)) |
1117 |
- ##pd = data.frame(matrix(NA, nc, 3), row.names=sampleNames(cnSet)) |
|
1118 |
- ##colnames(pd)=c("SKW", "SNR", "gender") |
|
1119 |
- ##phenoData(cnSet) = new("AnnotatedDataFrame", data=pd) |
|
1120 | 1109 |
return(cnSet) |
1121 | 1110 |
} |
1111 |
+construct.Illumina <- constructInf |
|
1122 | 1112 |
|
1123 |
-preprocess <- function(cnSet, |
|
1113 |
+preprocessInf <- function(cnSet, |
|
1114 |
+ sampleSheet=NULL, |
|
1124 | 1115 |
arrayNames=NULL, |
1125 | 1116 |
ids=NULL, |
1126 | 1117 |
path=".", |
... | ... |
@@ -1176,16 +1167,17 @@ preprocess <- function(cnSet, |
1176 | 1167 |
neededPkgs=c("crlmm", pkgname)) # outdir=outdir, |
1177 | 1168 |
return(mixtureParams) |
1178 | 1169 |
} |
1170 |
+preprocess <- preprocessInf |
|
1179 | 1171 |
|
1180 |
-genotypeRS <- function(cnSet, mixtureParams, probs=rep(1/3,3), |
|
1181 |
- SNRMin=5, |
|
1182 |
- recallMin=10, |
|
1183 |
- recallRegMin=1000, |
|
1184 |
- verbose=TRUE, |
|
1185 |
- returnParams=TRUE, |
|
1186 |
- badSNP=0.7, |
|
1187 |
- gender=NULL, |
|
1188 |
- DF=6){ |
|
1172 |
+genotypeInf <- function(cnSet, mixtureParams, probs=rep(1/3,3), |
|
1173 |
+ SNRMin=5, |
|
1174 |
+ recallMin=10, |
|
1175 |
+ recallRegMin=1000, |
|
1176 |
+ verbose=TRUE, |
|
1177 |
+ returnParams=TRUE, |
|
1178 |
+ badSNP=0.7, |
|
1179 |
+ gender=NULL, |
|
1180 |
+ DF=6){ |
|
1189 | 1181 |
is.snp = isSnp(cnSet) |
1190 | 1182 |
snp.index = which(is.snp) |
1191 | 1183 |
narrays = ncol(cnSet) |
... | ... |
@@ -1256,91 +1248,49 @@ genotype.Illumina <- function(sampleSheet=NULL, |
1256 | 1248 |
stopifnot(is.lds) |
1257 | 1249 |
if(missing(cdfName)) stop("must specify cdfName") |
1258 | 1250 |
if(!isValidCdfName(cdfName)) stop("cdfName not valid. see validCdfNames") |
1259 |
- pkgname = getCrlmmAnnotationName(cdfName) |
|
1260 |
- callSet <- construct.Illumina(sampleSheet=sampleSheet, arrayNames=arrayNames, |
|
1261 |
- ids=ids, path=path, arrayInfoColNames=arrayInfoColNames, |
|
1262 |
- highDensity=highDensity, sep=sep, fileExt=fileExt, |
|
1263 |
- cdfName=cdfName, copynumber=copynumber, verbose=verbose, batch=batch, # fns=fns, |
|
1264 |
- saveDate=saveDate) #, outdir=outdir) |
|
1265 |
- ## Basic checks |
|
1266 |
- ##check that mixtureParams provided |
|
1267 |
- if(missing(mixtureParams)) |
|
1268 |
- stop("preprocess is FALSE but mixtureParams were not provided") |
|
1269 |
- if(ncol(callSet) != length(arrayNames)) |
|
1270 |
- stop("callSet provided but number of samples is not the same as the length of arrayNames") |
|
1271 |
- snr <- callSet$SNR[] |
|
1272 |
- if(any(is.na(snr))) stop("missing values in callSet$SNR") |
|
1273 |
- if(missing(sns)) sns = basename(sampleNames(callSet)) |
|
1274 |
- open(A(callSet)) |
|
1275 |
- open(B(callSet)) |
|
1276 |
- is.snp = isSnp(callSet) |
|
1277 |
- snp.index = which(is.snp) |
|
1278 |
- narrays = ncol(callSet) |
|
1279 |
- sampleBatches <- splitIndicesByLength(seq(length=ncol(callSet)), ocSamples()) |
|
1280 |
- mixtureParams = initializeBigMatrix("crlmmMixt-", 4, narrays, "double") |
|
1281 |
- SNR = initializeBigVector("crlmmSNR-", narrays, "double") |
|
1282 |
- SKW = initializeBigVector("crlmmSKW-", narrays, "double") |
|
1283 |
- ocLapply(seq_along(sampleBatches), processIDAT, sampleBatches=sampleBatches, |
|
1284 |
- sampleSheet=sampleSheet, arrayNames=arrayNames, |
|
1285 |
- ids=ids, path=path, arrayInfoColNames=arrayInfoColNames, highDensity=highDensity, |
|
1286 |
- sep=sep, fileExt=fileExt, saveDate=saveDate, verbose=verbose, mixtureSampleSize=mixtureSampleSize, |
|
1287 |
- fitMixture=fitMixture, eps=eps, seed=seed, cdfName=cdfName, sns=sns, stripNorm=stripNorm, |
|
1288 |
- useTarget=useTarget, A=A(callSet), B=B(callSet), SKW=SKW, SNR=SNR, |
|
1289 |
- mixtureParams=mixtureParams, is.snp=is.snp, neededPkgs=c("crlmm", pkgname)) # outdir=outdir, |
|
1290 |
- open(SKW) |
|
1291 |
- open(SNR) |
|
1292 |
- pData(callSet)$SKW = SKW |
|
1293 |
- pData(callSet)$SNR = SNR |
|
1294 |
- save(callSet, file=file.path(ldPath(), "callSet.rda")) |
|
1295 |
- close(SNR) |
|
1296 |
- close(SKW) |
|
1297 |
- open(A(callSet)) |
|
1298 |
- open(B(callSet)) |
|
1299 |
- tmpA = initializeBigMatrix(name="tmpA", length(snp.index), narrays) |
|
1300 |
- tmpB = initializeBigMatrix(name="tmpB", length(snp.index), narrays) |
|
1301 |
- for(j in seq(length=ncol(callSet))){ |
|
1302 |
- tmpA[, j] <- as.integer(A(callSet)[snp.index, j]) |
|
1303 |
- tmpB[, j] <- as.integer(B(callSet)[snp.index, j]) |
|
1304 |
- } |
|
1305 |
- close(A(callSet)) |
|
1306 |
- close(B(callSet)) |
|
1307 |
- close(tmpA) |
|
1308 |
- close(tmpB) |
|
1309 |
- save(tmpA, file=file.path(ldPath(), "tmpA.rda")) |
|
1310 |
- save(tmpB, file=file.path(ldPath(), "tmpB.rda")) |
|
1311 |
- save(SNR, file=file.path(ldPath(), "SNR.rda")) |
|
1312 |
- save(SKW, file=file.path(ldPath(), "SKW.rda")) |
|
1313 |
- save(mixtureParams, file=file.path(ldPath(), "mixtureParams.rda")) |
|
1314 |
- message("Begin genotyping") |
|
1315 |
- tmp <- crlmmGT2(A=tmpA, |
|
1316 |
- B=tmpB, |
|
1317 |
- SNR=SNR, |
|
1318 |
- mixtureParams=mixtureParams, |
|
1319 |
- cdfName=cdfName, |
|
1320 |
- row.names=NULL, |
|
1321 |
- col.names=sampleNames(callSet), |
|
1322 |
- probs=probs, |
|
1323 |
- DF=DF, |
|
1324 |
- SNRMin=SNRMin, |
|
1325 |
- recallMin=recallMin, |
|
1326 |
- recallRegMin=recallRegMin, |
|
1327 |
- gender=gender, |
|
1328 |
- verbose=verbose, |
|
1329 |
- returnParams=returnParams, |
|
1330 |
- badSNP=badSNP) |
|
1331 |
- save(tmp, file=file.path(ldPath(), "tmp.rda")) |
|
1332 |
- if(verbose) message("Genotyping finished. Updating container with genotype calls and confidence scores.") |
|
1333 |
- open(tmpA); open(tmpB) |
|
1334 |
- for(j in 1:ncol(callSet)){ |
|
1335 |
- snpCall(callSet)[snp.index, j] <- as.integer(tmpA[, j]) |
|
1336 |
- snpCallProbability(callSet)[snp.index, j] <- as.integer(tmpB[, j]) |
|
1337 |
- } |
|
1338 |
- close(tmpA); close(tmpB) |
|
1339 |
- delete(tmpA); delete(tmpB); |
|
1340 |
- callSet$gender = tmp$gender |
|
1341 |
- rm(tmp) |
|
1342 |
- close(callSet) |
|
1343 |
- return(callSet) |
|
1251 |
+ stopifnot(!missing(batch)) |
|
1252 |
+ pkgname <- getCrlmmAnnotationName(cdfName) |
|
1253 |
+ message("Instantiate CNSet container.") |
|
1254 |
+ cnSet <- constructInf(sampleSheet=sampleSheet, |
|
1255 |
+ arrayNames=arrayNames, |
|
1256 |
+ path=path, |
|
1257 |
+ arrayInfoColNames=arrayInfoColNames, |
|
1258 |
+ highDensity=highDensity, |
|
1259 |
+ sep=sep, |
|
1260 |
+ fileExt=fileExt, |
|
1261 |
+ cdfName=cdfName, |
|
1262 |
+ verbose=verbose, |
|
1263 |
+ batch=batch, |
|
1264 |
+ saveDate=saveDate) |
|
1265 |
+ mixtureParams <- preprocessInf(cnSet=cnSet, |
|
1266 |
+ sampleSheet=sampleSheet, |
|
1267 |
+ arrayNames=arrayNames, |
|
1268 |
+ ids=ids, |
|
1269 |
+ path=path, |
|
1270 |
+ arrayInfoColNames=arrayInfoColNames, |
|
1271 |
+ highDensity=highDensity, |
|
1272 |
+ sep=sep, |
|
1273 |
+ fileExt=fileExt, |
|
1274 |
+ saveDate=saveDate, |
|
1275 |
+ stripNorm=stripNorm, |
|
1276 |
+ useTarget=useTarget, |
|
1277 |
+ mixtureSampleSize=mixtureSampleSize, |
|
1278 |
+ fitMixture=fitMixture, |
|
1279 |
+ eps=eps, |
|
1280 |
+ verbose=verbose, |
|
1281 |
+ seed=seed) |
|
1282 |
+ message("Preprocessing complete. Begin genotyping...") |
|
1283 |
+ genotypeInf(cnSet=cnSet, mixtureParams=mixtureParams, |
|
1284 |
+ probs=probs, |
|
1285 |
+ SNRMin=SNRMin, |
|
1286 |
+ recallMin=recallMin, |
|
1287 |
+ recallRegMin=recallRegMin, |
|
1288 |
+ verbose=verbose, |
|
1289 |
+ returnParams=returnParams, |
|
1290 |
+ badSNP=badSNP, |
|
1291 |
+ gender=gender, |
|
1292 |
+ DF=DF) |
|
1293 |
+ return(cnSet) |
|
1344 | 1294 |
} |
1345 | 1295 |
|
1346 | 1296 |
|
... | ... |
@@ -55,3 +55,44 @@ linesCNSet <- function(x, y, batch, copynumber, x.axis="A", grid=FALSE, ...){ |
55 | 55 |
} |
56 | 56 |
} |
57 | 57 |
} |
58 |
+ |
|
59 |
+ |
|
60 |
+cnPanel <- function(x, y, ..., pch.cols, gt, cbs.segs, hmm.segs=NULL, shades, subscripts, add.ideogram=TRUE){ |
|
61 |
+ ##if(panel.number() == 2) browser() |
|
62 |
+ add.ideogram <- add.ideogram[[panel.number()]] |
|
63 |
+ ##cbs.segs <- cbs.segs[[panel.number()]] |
|
64 |
+ draw.hmm.states <- ifelse(panel.number() <= length(hmm.segs), TRUE, FALSE) |
|
65 |
+ panel.grid(h=6, v=10) |
|
66 |
+ which.hom <- which(gt[subscripts] == 1 | gt[subscripts]==3) |
|
67 |
+ which.het <- which(gt[subscripts] == 2) |
|
68 |
+ panel.xyplot(x, y, col="grey60", ...) |
|
69 |
+ lpoints(x[which.hom], y[which.hom], col=pch.cols[1], ...) |
|
70 |
+ lpoints(x[which.het], y[which.het], col=pch.cols[2], ...) |
|
71 |
+ lsegments(x0=start(cbs.segs)/1e6, x1=end(cbs.segs)/1e6, |
|
72 |
+ y0=cbs.segs$seg.mean, |
|
73 |
+ y1=cbs.segs$seg.mean, ...) |
|
74 |
+ if(draw.hmm.states){ |
|
75 |
+ hmm.segs <- hmm.segs[order(width(hmm.segs), decreasing=TRUE), ] |
|
76 |
+ lrect(xleft=start(hmm.segs)/1e6, |
|
77 |
+ xright=end(hmm.segs)/1e6, |
|
78 |
+ ybottom=-0.4, |
|
79 |
+ ytop=0, |
|
80 |
+ border=shades[hmm.segs$state], |
|
81 |
+ col=shades[hmm.segs$state]) |
|
82 |
+ } |
|
83 |
+ ltext(130, 5, paste("MAD:", round(mad(y, na.rm=TRUE), 2))) |
|
84 |
+ if(add.ideogram){ |
|
85 |
+ pathto <- system.file("hg18", package="SNPchip") |
|
86 |
+ cytoband <- read.table(file.path(pathto, "cytoBand.txt"), as.is=TRUE) |
|
87 |
+ cytoband$V2 <- cytoband$V2/1e6 |
|
88 |
+ cytoband$V3 <- cytoband$V3/1e6 |
|
89 |
+ colnames(cytoband) <- c("chrom", "start", "end", "name", "gieStain") |
|
90 |
+ plotCytoband(unique(hmm.segs$chrom), |
|
91 |
+ cytoband=cytoband, |
|
92 |
+ cytoband.ycoords=c(5.6, 5.9), |
|
93 |
+ new=FALSE, |
|
94 |
+ label.cytoband=FALSE, |
|
95 |
+ build="hg18", |
|
96 |
+ use.lattice=TRUE) |
|
97 |
+ } |
|
98 |
+} |
... | ... |
@@ -15,6 +15,7 @@ |
15 | 15 |
\newcommand{\Rclass}[1]{{\textit{#1}}} |
16 | 16 |
\newcommand{\oligo}{\Rpackage{oligo }} |
17 | 17 |
\newcommand{\R}{\textsf{R}} |
18 |
+\newcommand{\ff}{\Rpackage{ff}} |
|
18 | 19 |
|
19 | 20 |
\begin{document} |
20 | 21 |
\title{Using \crlmm{} for copy number estimation and genotype calling |
... | ... |
@@ -27,619 +28,291 @@ |
27 | 28 |
|
28 | 29 |
|
29 | 30 |
\begin{abstract} |
30 |
- This vignette illustrates the steps necessary for obtaining |
|
31 |
- marker-level estimates of allele-specific copy number from the raw |
|
32 |
- Illumina IDAT files. Hidden markov models or segmentation |
|
33 |
- algorithms can be applied to the marker-level estimates. Examples |
|
34 |
- of both are illustrated. |
|
31 |
+ This vignette illustrates the steps required prior to copy number |
|
32 |
+ analysis for Infinium platforms. Specifically, we require |
|
33 |
+ construction of a container to store processed forms of the raw |
|
34 |
+ data, preprocessing to normalize the arrays, and genotyping using |
|
35 |
+ the CRLMM algorithm. After completing these steps, users can refer |
|
36 |
+ to the copy number vignette. |
|
35 | 37 |
\end{abstract} |
36 | 38 |
|
37 |
-\section{About this vignette} |
|
39 |
+\section{Infrastructure} |
|
40 |
+ |
|
38 | 41 |
<<crlmm, results=hide, echo=FALSE>>= |
39 | 42 |
library(crlmm) |
43 |
+options(width=70) |
|
44 |
+options(continue=" ") |
|
40 | 45 |
@ |
41 | 46 |
|
42 |
-%The vignette for copy number estimation for illumina platforms is |
|
43 |
-%under development. Please use the latest stable release of crlmm. |
|
44 |
- |
|
45 |
-%\end{document} |
|
47 |
+\textbf{Supported platforms:} The supported Infinium platforms are |
|
48 |
+those for which a corresponding annotation package is available. The |
|
49 |
+annotation packages contain information on the markers, such as |
|
50 |
+physical position and chromosome, as well as pre-computed parameters |
|
51 |
+estimated from HapMap used during the preprocessing and genotyping |
|
52 |
+steps. Currently supported Infinium platforms are listed in the |
|
53 |
+following code chunk. |
|
46 | 54 |
|
47 |
-Allele-specific copy number estimation in the crlmm package is |
|
48 |
-available for several Illumina platforms. As described in the |
|
49 |
-\texttt{copynumber} vignette, copy number estimation in \crlmm{} works |
|
50 |
-best when there are a sufficient number of samples such that AA, AB, |
|
51 |
-and BB genotypes are observed at most loci. For small studies (e.g., |
|
52 |
-fewer than 50 samples), there will be a large number of SNPs that are |
|
53 |
-monomorphic. For monomorphic SNPs, the estimation problem becomes more |
|
54 |
-difficult and alternative strategies that estimate the relative total |
|
55 |
-copy number may be preferable. In addition to installing \crlmm{}, |
|
56 |
-one must also install the appropriate annotation package for the |
|
57 |
-Illumina platform. In the following code, we list the platforms for |
|
58 |
-which annotation packages are currently available. Next we create a |
|
59 |
-directory where output files will be stored and indicate the directory |
|
60 |
-that contains the IDAT files that will be used in our analysis. |
|
61 |
- |
|
62 |
- |
|
63 |
-<<setup, echo=FALSE, results=hide>>= |
|
64 |
-options(width=70) |
|
65 |
-options(continue=" ") |
|
66 |
-library(cacheSweave) |
|
55 |
+<<supportedPlatforms>>= |
|
56 |
+pkgs <- annotationPackages() |
|
57 |
+crlmm.pkgs <- pkgs[grep("Crlmm", pkgs)] |
|
58 |
+crlmm.pkgs[grep("human", crlmm.pkgs)] |
|
67 | 59 |
@ |
68 | 60 |
|
69 |
-<<libraries>>= |
|
61 |
+\textbf{Large data:} In order to reduce \crlmm{}'s memory footprint, |
|
62 |
+we require the \Rpackage{ff} for copy number estimation. The |
|
63 |
+\Rpackage{ff} package provides infrastructure for accessing and |
|
64 |
+writing data to disk instead of keeping data in memory. Each element |
|
65 |
+of the \verb+assayData+ and \verb+batchStatistics+ slot of the |
|
66 |
+\Rclass{CNSet} class are \Robject{ff} objects. \Robject{ff} objects |
|
67 |
+in the \R{} workspace contain pointers to several files with the |
|
68 |
+\verb+.ff+ extension. The location of where the data is stored on |
|
69 |
+disk is specified by use of the \Rfunction{ldPath} function. |
|
70 |
+ |
|
71 |
+<<ldpath,results=hide>>= |
|
70 | 72 |
library(ff) |
71 |
-pkgs <- annotationPackages() |
|
72 |
-pkgs[grep("Crlmm", pkgs)] |
|
73 | 73 |
if(getRversion() < "2.13.0"){ |
74 | 74 |
rpath <- getRversion() |
75 | 75 |
} else rpath <- "trunk" |
76 | 76 |
outdir <- paste("/thumper/ctsa/snpmicroarray/rs/ProcessedData/crlmm/", rpath, "/illumina_vignette", sep="") |
77 |
+ldPath(outdir) |
|
77 | 78 |
dir.create(outdir, recursive=TRUE, showWarnings=FALSE) |
78 |
-datadir <- "/thumper/ctsa/snpmicroarray/illumina/IDATS/370k" |
|
79 |
-setCacheDir(outdir) |
|
80 | 79 |
@ |
81 | 80 |
|
82 |
-%Options for controlling RAM with the \Rpackage{ff} package. |
|
83 |
- |
|
84 |
-<<ldOptions>>= |
|
85 |
-ldPath(outdir) |
|
81 |
+Users should not move or rename this directory. If only output files |
|
82 |
+are stored in \verb+outdir+, one can either remove the entire |
|
83 |
+directory prior to rerunning the analysis or all of the '.ff' files. |
|
84 |
+Otherwise, one would accumulate a large number of '.ff' files on disk |
|
85 |
+that are no longer in use. |
|
86 |
+ |
|
87 |
+As none of the functions for preprocessing, genotyping, or copy number |
|
88 |
+estimation simultaneously require all samples and all probes in |
|
89 |
+memory, memory-usage by \crlmm{} can be fine tuned by reading in and |
|
90 |
+processing subsets of the markers and/or samples. The functions |
|
91 |
+\Rfunction{ocSamples} and \Rfunction{ocProbesets} in the |
|
92 |
+\Rpackage{oligoClasses} package can be used to declare how many |
|
93 |
+markers and samples to read at once. In general, specifying smaller |
|
94 |
+values should reduce the RAM required for a particular job, but would |
|
95 |
+be expected to have an increased run-time. In the following |
|
96 |
+code-chunk, we declare that \crlmm{} should process 150,000 markers at |
|
97 |
+a time (when possible) and 500 samples at a time. (As our dataset in |
|
98 |
+this vignette only contains 43 samples, the \Rfunction{ocSamples} |
|
99 |
+option would not have any effect.) One can view the current settings |
|
100 |
+for these commands, by typing the functions without an argument. |
|
101 |
+ |
|
102 |
+<<ram>>= |
|
103 |
+ocSamples() |
|
104 |
+ocProbesets() |
|
86 | 105 |
ocProbesets(150e3) |
87 |
-ocSamples(200) |
|
106 |
+ocSamples(500) |
|
107 |
+ocSamples() |
|
108 |
+ocProbesets() |
|
88 | 109 |
@ |
89 | 110 |
|
90 |
-This vignette was created using Illumina IDAT files that are located |
|
91 |
-in a specific directory on my computer (\Robject{pathToCels}). Long |
|
92 |
-computations are saved in the output directory \Robject{outdir}. |
|
93 |
-Towards this end, we make repeated use of the \Rfunction{checkExists} |
|
94 |
-simply checks that a variable is in the workspace. If not, the |
|
95 |
-variable is loaded from the indicated path. If the file (with '.rda' |
|
96 |
-extension) does not exist in this path, the function indicated by the |
|
97 |
-.FUN argument is executed. Alternatively, if \Robject{.load.it} is |
|
98 |
-\Robject{FALSE} the function will be executed regardless of whether |
|
99 |
-the file exists. Users should see the \Rpackage{weaver} package for a |
|
100 |
-more 'correct' approach for cacheing long computations. The following |
|
101 |
-code chunk checks that that the variables specific to the directory |
|
102 |
-structure on my computer exist. Users should modify the |
|
103 |
-\Robject{outdir} and \Robject{datadir} variables as appropriate. |
|
104 |
- |
|
105 |
-<<checkSetup>>= |
|
106 |
-if(!file.exists(outdir)) stop("Please specify valid directory for storing output") |
|
107 |
-if(!file.exists(datadir)) stop("Please specify the correct path to the CEL files") |
|
111 |
+<<cacheSweave, echo=FALSE, results=hide>>= |
|
112 |
+library(cacheSweave) |
|
113 |
+setCacheDir(outdir) |
|
108 | 114 |
@ |
109 | 115 |
|
110 |
-\section{Preprocessing Illumina IDAT files, genotyping, and copy number |
|
111 |
- estimation} |
|
112 |
-To perform copy number analysis on the Illumina platform, several |
|
113 |
-steps are required. The first step is to read in the IDAT files and |
|
114 |
-create a container for storing the red and green intensities. These |
|
115 |
-intensities are quantile normalized in the function |
|
116 |
-\Rfunction{crlmmIllumina}, and then genotyped using the crlmm |
|
117 |
-algorithm. Details on the crlmm genotyping algorithm are described |
|
118 |
-elsewhere. We will make use of the normalized intensities when we |
|
119 |
-estimate copy number. The object returned by the |
|
120 |
-\Rfunction{genotype.Illumina} function is an instance of the |
|
121 |
-\Robject{CNSet} class, a container for storing the genotype calls, |
|
122 |
-genotype confidence scores, the normalized intensities for the A and B |
|
123 |
-channels, and summary statistics on the batches. The batch variable |
|
124 |
-can be the date on which the array was scanned or the 96 well |
|
125 |
-microarray chemistry plate on which the samples were processed. Both |
|
126 |
-date and chemistry plate are useful surrogates for experimental |
|
127 |
-factors that effect subgroups of the probe intensities. (Quantile |
|
128 |
-normalization does not remove batch effects.) The genotype confidence |
|
129 |
-scores are saved as an integer, and can be converted back to a $[0, |
|
130 |
-1]$ probability scale by the transformation $round(-1000*log2(1-p))$. |
|
116 |
+\textbf{Limitations:} There is no minimum number of samples required |
|
117 |
+for preprocessing and genotyping. However, for copy number estimation |
|
118 |
+the \crlmm{} package currently requires at least 10 samples per batch. |
|
119 |
+The parameter estimates for copy number and the corresponding |
|
120 |
+estimates of raw copy number will tend to be more noisy for batches |
|
121 |
+with small sample sizes (e.g., $<$ 50). Chemistry plate or scan date |
|
122 |
+are often useful surrogates for batch. |
|
123 |
+ |
|
124 |
+\section{Initializing a container for storing processed data} |
|
125 |
+ |
|
126 |
+This section will initialize a container for storing processed forms |
|
127 |
+of the data, including the normalized intensities for the A and B |
|
128 |
+alleles and the CRLMM genotype calls and confidence scores. In |
|
129 |
+addition, the container will store information on the markers |
|
130 |
+(physical position, chromosome, and a SNP indicator), the batch, and |
|
131 |
+the samples (e.g., gender). To construct this container for Infinium |
|
132 |
+platforms, several steps are required. |
|
133 |
+ |
|
134 |
+We begin by specifying the path containing the raw IDAT files for a |
|
135 |
+set of samples from the Infinium 370k platform. |
|
136 |
+ |
|
137 |
+<libraries>>= |
|
138 |
+datadir <- "/thumper/ctsa/snpmicroarray/illumina/IDATS/370k" |
|
139 |
+@ |
|
140 |
+ |
|
141 |
+For Infinium platforms, an Illumina sample sheet containing |
|
142 |
+information for reading the raw IDAT files is required. Please refer |
|
143 |
+to the BeadStudio Genotyping guide, Appendix A, for additional |
|
144 |
+information. The following code reads in the samplesheet for the IDAT |
|
145 |
+files on our local server. For our dataset, the file extensions are |
|
146 |
+`Grn.dat' and `Red.idat'. We store the complete path to the filename |
|
147 |
+without the file extension in the \Robject{arrayNames} and check that |
|
148 |
+all of the green and red IDAT files exists. |
|
131 | 149 |
|
132 | 150 |
<<samplesheet>>= |
133 | 151 |
samplesheet = read.csv(file.path(datadir, "HumanHap370Duo_Sample_Map.csv"), header=TRUE, as.is=TRUE) |
134 | 152 |
samplesheet <- samplesheet[-c(28:46,61:75,78:79), ] |
135 | 153 |
arrayNames <- file.path(datadir, unique(samplesheet[, "SentrixPosition"])) |
136 |
-grnfiles = all(file.exists(paste(arrayNames, "_Grn.idat", sep=""))) |
|
137 |
-redfiles = all(file.exists(paste(arrayNames, "_Red.idat", sep=""))) |
|
138 |
-load.it <- TRUE |
|
139 |
-if(!load.it){ |
|
140 |
- rmFiles <- list.files(outdir, pattern=".ff", full.names=TRUE) |
|
141 |
- unlink(rmFiles) |
|
142 |
-} |
|
143 |
-plate <- samplesheet$Plate |
|
144 |
-table(plate) |
|
145 |
-## too few samples to run the plates as separate batches |
|
146 |
-batch <- rep("1", nrow(samplesheet)) |
|
154 |
+all(file.exists(paste(arrayNames, "_Grn.idat", sep=""))) |
|
155 |
+all(file.exists(paste(arrayNames, "_Red.idat", sep=""))) |
|
156 |
+arrayInfo <- list(barcode=NULL, position="SentrixPosition") |
|
147 | 157 |
@ |
148 | 158 |
|
149 |
-<<genotype,cache=TRUE>>= |
|
150 |
-container <- genotype.Illumina(sampleSheet=samplesheet, |
|
151 |
- path=dirname(arrayNames[1]), |
|
152 |
- arrayInfoColNames=list(barcode=NULL, position="SentrixPosition"), |
|
153 |
- cdfName="human370v1c", |
|
154 |
- batch=batch) |
|
155 |
-@ |
|
159 |
+As discussed previously, all supported platforms have a corresponding |
|
160 |
+annotation package. The appropriate annotation package is specified |
|
161 |
+by the platform identifier without the \verb+Crlmm+ postfix. |
|
156 | 162 |
|
157 |
-<<copynumber,cache=TRUE>>= |
|
158 |
-GT.CONF.THR <- 0.8 |
|
159 |
-cnSet <- crlmmCopynumber(container, GT.CONF.THR=GT.CONF.THR) |
|
163 |
+<<cdfname>>= |
|
164 |
+cdfName <- "human370v1c" |
|
160 | 165 |
@ |
161 | 166 |
|
162 |
-The \Robject{cnSet} returned by the \Rfunction{crlmmCopynumber} |
|
163 |
-function contains all of the parameters used to compute |
|
164 |
-allele-specific copy number. In order to reduce I/O and speed |
|
165 |
-computation, the allele-specific copy number estimates are not written |
|
166 |
-to file nor saved in the \Robject{CNSet} object. Rather, |
|
167 |
-allele-specific estimates are computed on the fly by the functions |
|
168 |
-\Rfunction{CA}, \Rfunction{CB}, or \Rfunction{totalCopynumber}. The |
|
169 |
-following codechunks provide a few examples. |
|
170 |
- |
|
171 |
-Copy number for polymorphic markers on chromosomes 1 - 22. |
|
172 |
- |
|
173 |
-<<ca>>= |
|
174 |
-snp.index <- which(isSnp(cnSet) & !is.na(chromosome(cnSet)) & chromosome(cnSet) < 22) |
|
175 |
-ca <- CA(cnSet, i=snp.index, j=1:5) |
|
176 |
-cb <- CB(cnSet, i=snp.index, j=1:5) |
|
177 |
-ct <- ca+cb |
|
178 |
-@ |
|
167 |
+Next, we construct a character vector that specifies the batch for |
|
168 |
+each of the \Sexpr{length(arrayNames)} arrays. Here, we have a small |
|
169 |
+dataset and process the samples in a single batch. Processing the |
|
170 |
+samples as a single batch is generally reasonable if the samples were |
|
171 |
+processed at similar times (e.g., within a few weeks). |
|
179 | 172 |
|
180 |
-Alternatively, total copy number can be obtained by |
|
181 |
-<<totalCopynumber.snps>>= |
|
182 |
-ct2 <- totalCopynumber(cnSet, i=snp.index, j=1:5) |
|
183 |
-stopifnot(all.equal(ct, ct2)) |
|
173 |
+<<batch>>= |
|
174 |
+batch <- rep("1", nrow(samplesheet)) |
|
184 | 175 |
@ |
185 | 176 |
|
186 |
-Missing values can arise at polymorphic when the confidence score of |
|
187 |
-the genotype calls are below the threshold indicated by the threshold |
|
188 |
-\texttt{GT.CONF.THR} in \Robject{crlmmCopynumber}. See |
|
189 |
-\Robject{?crlmmCopynumber} for additional details. In the following |
|
190 |
-codechunk, we compute the number of samples that had confidence scores |
|
191 |
-below \Sexpr{GT.CONF.THR} at loci for which ${\hat CA}$ and ${\hat CB}$ |
|
192 |
-are missing. |
|
193 |
- |
|
194 |
-<<NAs.snps>>= |
|
195 |
-missing.copynumber <- which(rowSums(is.na(ct)) > 0) |
|
196 |
-invisible(open(snpCallProbability(cnSet))) |
|
197 |
-gt.confidence <- i2p(snpCallProbability(cnSet)[snp.index, ]) |
|
198 |
-n.below.threshold <- rowSums(gt.confidence < 0.95) |
|
199 |
-unique(n.below.threshold[missing.copynumber]) |
|
200 |
-@ |
|
201 |
-\noindent For loci with missing copy number, the confidence scores |
|
202 |
-were all below the threshold. |
|
203 |
- |
|
204 |
-At nonpolymorphic loci, either the \Rfunction{CA} or |
|
205 |
-\Rfunction{totalCopynumber} functions can be used to obtain estimates |
|
206 |
-of total copy number. |
|
207 |
- |
|
208 |
-<<nonpolymorphicAutosomes>>= |
|
209 |
-marker.index <- which(!isSnp(cnSet) & chromosome(cnSet) < 23) |
|
210 |
-ct <- CA(cnSet, i=marker.index, j=1:5) |
|
211 |
-## all zeros |
|
212 |
-stopifnot(all(CB(cnSet, i=marker.index, j=1:5) == 0)) |
|
213 |
-ct2 <- totalCopynumber(cnSet, i=marker.index, j=1:5) |
|
214 |
-stopifnot(all.equal(ct, ct2)) |
|
177 |
+Finally, we initialize an object of class \Robject{CNSet} using the |
|
178 |
+function \Rfunction{constructInf}. |
|
179 |
+ |
|
180 |
+<<container,cache=TRUE>>= |
|
181 |
+cnSet <- constructInf(sampleSheet=samplesheet, |
|
182 |
+ arrayNames=arrayNames, |
|
183 |
+ batch=batch, |
|
184 |
+ arrayInfoColNames=arrayInfo, |
|
185 |
+ cdfName=cdfName, |
|
186 |
+ verbose=TRUE, |
|
187 |
+ saveDate=TRUE) |
|
215 | 188 |
@ |
216 | 189 |
|
190 |
+A concise summary of the object's contents can be viewed with the |
|
191 |
+\Rfunction{print} function. |
|
217 | 192 |
|
218 |
-\begin{figure}[t!] |
|
219 |
- Nonpolymorphic markers on chromosome X: |
|
220 |
- \centering |
|
221 |
-<<nonpolymorphicX, fig=TRUE, width=8, height=4>>= |
|
222 |
-npx.index <- which(chromosome(cnSet)==23 & !isSnp(cnSet)) |
|
223 |
-M <- sample(which(cnSet$gender==1), 5) |
|
224 |
-F <- sample(which(cnSet$gender==2), 5) |
|
225 |
-cn.M <- CA(cnSet, i=npx.index, j=M) |
|
226 |
-cn.F <- CA(cnSet, i=npx.index, j=F) |
|
227 |
-boxplot(data.frame(cbind(cn.M, cn.F)), pch=".", col="grey60", outline=FALSE) |
|
228 |
-@ |
|
229 |
-\caption{Copy number estimates for nonpolymorphic loci on chromosome |
|
230 |
- X (5 men, 5 women). crlmm assumes that the median copy number |
|
231 |
- across samples at a given marker on X is 1 for men and 2 for women.} |
|
232 |
-\end{figure} |
|
233 |
- |
|
234 |
- |
|
235 |
- |
|
236 |
- |
|
237 |
-\begin{figure}[t!] |
|
238 |
- \centering |
|
239 |
-<<polymorphicX, fig=TRUE, width=8, height=4>>= |
|
240 |
-## copy number estimates on X for SNPs are biased towards small values. |
|
241 |
-X.markers <- which(isSnp(cnSet) & chromosome(cnSet) == 23) |
|
242 |
-ca.M <- CA(cnSet, i=X.markers, j=M) |
|
243 |
-cb.M <- CB(cnSet, i=X.markers, j=M) |
|
244 |
-ca.F <- CA(cnSet, i=X.markers, j=F) |
|
245 |
-cb.F <- CB(cnSet, i=X.markers, j=F) |
|
246 |
-cn.M <- ca.M+cb.M |
|
247 |
-cn.F <- ca.F+cb.F |
|
248 |
-boxplot(data.frame(cbind(cn.M, cn.F)), pch=".", outline=FALSE, col="grey60") |
|
249 |
-cn2 <- totalCopynumber(cnSet, i=X.markers, j=c(M, F)) |
|
250 |
-stopifnot(all.equal(cbind(cn.M, cn.F), cn2)) |
|
251 |
-@ |
|
252 |
-\caption{Copy number estimates for polymorphic markers on chromosome |
|
253 |
- X. } |
|
254 |
-\end{figure} |
|
255 |
- |
|
256 |
-Often it is of interest to smooth the total copy number estimates (the |
|
257 |
-sum of the allele-specific copy number) as a function of the physical |
|
258 |
-position. The following code chunk can be used to create an instance |
|
259 |
-of \Robject{CopyNumberSet} that contains the CA + CB at polymorphic |
|
260 |
-markers and 'CA' at nonpolymorphic markers. |
|
261 |
- |
|
262 |
-The \Robject{cnSet} returned by the \Rfunction{crlmmCopynumber} |
|
263 |
-function contains all of the parameters used to compute |
|
264 |
-allele-specific copy number. In order to reduce I/O and speed |
|
265 |
-computation, the allele-specific copy number estimates are not written |
|
266 |
-to file nor saved in the \Robject{CNSet} object. Rather, the |
|
267 |
-estimates are computed on the fly when the user calls one of the |
|
268 |
-helper functions that uses this information. For instance, often it is |
|
269 |
-of interest to smooth the total copy number estimates (the sum of the |
|
270 |
-allele-specific copy number) as a function of the physical position. |
|
271 |
-The following code chunk can be used to create an instance of |
|
272 |
-\Robject{CopyNumberSet} that contains the CA + CB at polymorphic |
|
273 |
-markers and 'CA' at nonpolymorphic markers. |
|
274 |
- |
|
275 |
-Alternatively, total copy number can be obtained by |
|
276 |
-<<totalCopynumber.snps>>= |
|
277 |
-ct2 <- totalCopynumber(cnSet, i=snp.index, j=1:5) |
|
278 |
-stopifnot(all.equal(ct, ct2)) |
|
193 |
+<<cnset>>= |
|
194 |
+print(cnSet) |
|
279 | 195 |
@ |
280 | 196 |
|
281 |
-Missing values can arise at polymorphic when the confidence score of |
|
282 |
-the genotype calls are below the threshold indicated by the threshold |
|
283 |
-\texttt{GT.CONF.THR} in \Robject{crlmmCopynumber}. See |
|
284 |
-\Robject{?crlmmCopynumber} for additional details. In the following |
|
285 |
-codechunk, we compute the number of samples that had confidence scores |
|
286 |
-below \Sexpr{GT.CONF.THR} at loci for which ${\hat CA}$ and ${\hat CB}$ |
|
287 |
-are missing. |
|
197 |
+Note that the above object does not yet contain any processed data |
|
198 |
+(only \verb+NA+'s) and several \verb+.ff+ files now appear in the |
|
199 |
+\verb+outdir+. Again, these files should not be removed. |
|
288 | 200 |
|
289 |
-<<NAs.snps>>= |
|
290 |
-missing.copynumber <- which(rowSums(is.na(ct)) > 0) |
|
291 |
-invisible(open(snpCallProbability(cnSet))) |
|
292 |
-gt.confidence <- i2p(snpCallProbability(cnSet)[snp.index, ]) |
|
293 |
-n.below.threshold <- rowSums(gt.confidence < 0.95) |
|
294 |
-unique(n.below.threshold[missing.copynumber]) |
|
201 |
+<<listff>>= |
|
202 |
+list.files(outdir, pattern=".ff")[1:5] |
|
295 | 203 |
@ |
296 |
-\noindent For loci with missing copy number, the confidence scores |
|
297 |
-were all below the threshold. |
|
298 |
- |
|
299 |
-At nonpolymorphic loci, either the \Rfunction{CA} or |
|
300 |
-\Rfunction{totalCopynumber} functions can be used to obtain estimates |
|
301 |
-of total copy number. |
|
302 |
- |
|
303 |
-<<nonpolymorphicAutosomes>>= |
|
304 |
-marker.index <- which(!isSnp(cnSet) & chromosome(cnSet) < 23) |
|
305 |
-ct <- CA(cnSet, i=marker.index, j=1:5) |
|
306 |
-## all zeros |
|
307 |
-stopifnot(all(CB(cnSet, i=marker.index, j=1:5) == 0)) |
|
308 |
-ct2 <- totalCopynumber(cnSet, i=marker.index, j=1:5) |
|
309 |
-stopifnot(all.equal(ct, ct2)) |
|
310 |
-@ |
|
311 |
- |
|
312 | 204 |
|
313 |
-\begin{figure}[t!] |
|
314 |
- Nonpolymorphic markers on chromosome X: |
|
315 |
- \centering |
|
316 |
-<<nonpolymorphicX, fig=TRUE, width=8, height=4>>= |
|
317 |
-npx.index <- which(chromosome(cnSet)==23 & !isSnp(cnSet)) |
|
318 |
-M <- sample(which(cnSet$gender==1), 5) |
|
319 |
-F <- sample(which(cnSet$gender==2), 5) |
|
320 |
-cn.M <- CA(cnSet, i=npx.index, j=M) |
|
321 |
-cn.F <- CA(cnSet, i=npx.index, j=F) |
|
322 |
-boxplot(data.frame(cbind(cn.M, cn.F)), pch=".", col="grey60", outline=FALSE) |
|
323 |
-@ |
|
324 |
-\caption{Copy number estimates for nonpolymorphic loci on chromosome |
|
325 |
- X (5 men, 5 women). crlmm assumes that the median copy number |
|
326 |
- across samples at a given marker on X is 1 for men and 2 for women.} |
|
327 |
-\end{figure} |
|
328 |
- |
|
329 |
- |
|
330 |
- |
|
331 |
- |
|
332 |
-\begin{figure}[t!] |
|
333 |
- \centering |
|
334 |
-<<polymorphicX, fig=TRUE, width=8, height=4>>= |
|
335 |
-## copy number estimates on X for SNPs are biased towards small values. |
|
336 |
-X.markers <- which(isSnp(cnSet) & chromosome(cnSet) == 23) |
|
337 |
-ca.M <- CA(cnSet, i=X.markers, j=M) |
|
338 |
-cb.M <- CB(cnSet, i=X.markers, j=M) |
|
339 |
-ca.F <- CA(cnSet, i=X.markers, j=F) |
|
340 |
-cb.F <- CB(cnSet, i=X.markers, j=F) |
|
341 |
-cn.M <- ca.M+cb.M |
|
342 |
-cn.F <- ca.F+cb.F |
|
343 |
-boxplot(data.frame(cbind(cn.M, cn.F)), pch=".", outline=FALSE, col="grey60") |
|
344 |
-cn2 <- totalCopynumber(cnSet, i=X.markers, j=c(M, F)) |
|
345 |
-stopifnot(all.equal(cbind(cn.M, cn.F), cn2)) |
|
346 |
-@ |
|
347 |
-\caption{Copy number estimates for polymorphic markers on chromosome |
|
348 |
- X. } |
|
349 |
-\end{figure} |
|
350 |
- |
|
351 |
-Often it is of interest to smooth the total copy number estimates (the |
|
352 |
-sum of the allele-specific copy number) as a function of the physical |
|
353 |
-position. The following code chunk can be used to create an instance |
|
354 |
-of \Robject{CopyNumberSet} that contains the CA + CB at polymorphic |
|
355 |
-markers and 'CA' at nonpolymorphic markers. |
|
356 |
- |
|
357 |
-<<copynumberObject>>= |
|
358 |
-marker.index <- which(chromosome(cnSet) <= 22)## & isSnp(cnSet)) |
|
359 |
-sample.index <- 1:5 ## first five samples |
|
360 |
-invisible(open(cnSet)) |
|
361 |
-copynumberSet <- as(cnSet[marker.index, 1:5], "CopyNumberSet") |
|
362 |
-invisible(close(cnSet)) |
|
363 |
-copynumberSet <- copynumberSet[order(chromosome(copynumberSet), position(copynumberSet)), ] |
|
364 |
-indices <- split(1:nrow(copynumberSet), chromosome(copynumberSet)) |
|
365 |
-dup.index <- unlist(sapply(indices, function(i, position) i[duplicated(position[i])], position=position(copynumberSet))) |
|
366 |
-if(length(dup.index) > 0) copynumberSet <- copynumberSet[-dup.index, ] |
|
367 |
-## exclude any rows that are all missing |
|
368 |
-missing.index <- which(rowSums(is.na(copyNumber(copynumberSet))) == ncol(copynumberSet)) |
|
369 |
-table(chromosome(copynumberSet)[missing.index]) |
|
370 |
-copynumberSet <- copynumberSet[-missing.index, ] |
|
371 |
-@ |
|
205 |
+Finally, note that the elements of the \verb+assayData+ slot are |
|
206 |
+\Robject{ff} objects and not matrices. For the most part, the |
|
207 |
+\emph{appearance} that the data is stored in memory is preserved. |
|
372 | 208 |
|
373 | 209 |
|
374 |
-We suggest plotting the total copy number as a function of the |
|
375 |
-physical position to evaluate whether a wave-correction is needed. |
|
376 |
- |
|
377 |
- |
|
378 |
-<<plotTotalCn, fig=TRUE, width=8, height=8>>= |
|
379 |
-require(SNPchip) |
|
380 |
-data(chromosomeAnnotation) |
|
381 |
-chrAnn <- chromosomeAnnotation[1,] |
|
382 |
-layout(matrix(1:5, 5,1), heights=c(1, 1, 1, 1, 1)) |
|
383 |
-par(mar=c(0.5, 0.5, 0.5, 0.5), las=1, oma=c(4, 4, 4, 4)) |
|
384 |
-i <- which(chromosome(copynumberSet) == 1) |
|
385 |
-for(j in 1:5){ |
|
386 |
- plot(position(copynumberSet)[i], copyNumber(copynumberSet)[i, j], pch=".", cex=0.6,col="grey60", xaxt="n", |
|
387 |
- ylim=c(-1,6), ylab="total copy number", |
|
388 |
- yaxt="n", |
|
389 |
- xlim=c(0, max(position(copynumberSet)[i]))) |
|
390 |
- legend("topleft", legend=paste("SNR:", round(copynumberSet$SNR[j], 1))) |
|
391 |
- axis(2, at=0:6, labels=0:6) |
|
392 |
- if(j == 5){ |
|
393 |
- mtext("physical position (Mb)", 1, outer=TRUE, line=2) |
|
394 |
- mtext("Chromosome 8", 3, line=0, outer=TRUE) |
|
395 |
- at <- pretty(position(copynumberSet)[i], n=8) |
|
396 |
- } |
|
397 |
- ## draw centromere |
|
398 |
- xx <- c(chrAnn[1:2], chrAnn[2:1]) |
|
399 |
- yy <- c(-0.5, -0.5, 5, 5) |
|
400 |
- polygon(xx, yy, col="bisque") |
|
401 |
-} |
|
210 |
+<<assaydataelements>>= |
|
211 |
+sapply(assayData(cnSet), function(x) class(x)[1]) |
|
402 | 212 |
@ |
403 | 213 |
|
404 |
-\section{Smoothing marker-level estimates of total copy number} |
|
214 |
+\section{Preprocessing} |
|
405 | 215 |
|
406 |
-In this section, we show how the total copy number can be smoothed |
|
407 |
-using either the hidden Markov model implemented in the \R{} package |
|
408 |
-\Rpackage{VanillaICE} or circular binary segmentation implemented in |
|
409 |
-the \R{} package{DNAcopy}. |
|
216 |
+The raw intensities from the Infinium IDAT files are read and |
|
217 |
+normalized using the function \Rfunction{preprocessInf}. The function |
|
218 |
+\Rfunction{preprocessInf} returns a \Robject{ff} object containing the |
|
219 |
+parameters for the mixture model used by the CRLMM genotyping |
|
220 |
+algorithm. |
|
410 | 221 |
|
411 |
-\paragraph{Smoothing via a hidden Markov model.} |
|
412 |
- |
|
413 |
-<<hmm>>= |
|
414 |
-if(require(VanillaICE)){ |
|
415 |
- hmmOpts <- hmm.setup(copynumberSet, c("hom-del", "hem-del", "normal", "amp"), |
|
416 |
- copynumberStates=0:3, normalIndex=3, |
|
417 |
- log.initialP=rep(log(1/4), 4)) |
|
418 |
- timing <- system.time(fit.cn <- hmm(copynumberSet, hmmOpts, TAUP=1e10, verbose=FALSE)) |
|
419 |
- hmm.df <- as.data.frame(fit.cn) |
|
420 |
- print(hmm.df[1:5, c(2:4,7:9)]) |
|
421 |
-} |
|
422 |
-@ |
|
423 |
- |
|
424 |
-\paragraph{Smoothing via segmentation.} |
|
425 |
- |
|
426 |
-<<cbs>>= |
|
427 |
-library("DNAcopy") |
|
428 |
-CNA.object <- CNA(genomdat=copyNumber(copynumberSet), |
|
429 |
- chrom=chromosome(copynumberSet), |
|
430 |
- maploc=position(copynumberSet), |
|
431 |
- data.type="logratio", |
|
432 |
- sampleid=sampleNames(copynumberSet)) |
|
433 |
-smu.object <- smooth.CNA(CNA.object) |
|
434 |
-if(!load.it) unlink(file.path(outdir, "cbs.segments.rda")) |
|
435 |
-cbs.segments <- checkExists("cbs.segments", .path=outdir, .FUN=segment, x=smu.object, .load.it=load.it) |
|
436 |
-cbs.segments <- cbind(cbs.segments$output, cbs.segments$segRows) |
|
437 |
-cbs.segments$call <- rep(3, nrow(cbs.segments)) |
|
438 |
-cbs.segments$call[cbs.segments$seg.mean > 2.5] <- 4 |
|
439 |
-cbs.segments$call[cbs.segments$seg.mean < 1.25 & cbs.segments$seg.mean > 0.75] <- 2 |
|
440 |
-cbs.segments$call[cbs.segments$seg.mean < 0.75] <- 1 |
|
441 |
-cbs.ir <- RangedData(IRanges(cbs.segments$loc.start, cbs.segments$loc.end), |
|
442 |
- chrom=cbs.segments$chrom, |
|
443 |
- numMarkers=cbs.segments$num.mark, |
|
444 |
- seg.mean=cbs.segments$seg.mean, |
|
445 |
- cnCall=cbs.segments$call, |
|
446 |
- sampleId=substr(cbs.segments$ID, 2, 13)) |
|
222 |
+<<preprocess,cache=TRUE>>= |
|
223 |
+mixtureParams <- preprocessInf(cnSet=cnSet, sampleSheet=samplesheet, arrayNames=arrayNames, arrayInfoColNames=arrayInfo) |
|
224 |
+invisible(open(mixtureParams)) |
|
225 |
+str(mixtureParams[]) |
|
226 |
+invisible(close(mixtureParams)) |
|
447 | 227 |
@ |
448 | 228 |
|
449 |
-\paragraph{Visualizing inferred CNV.} |
|
450 |
- |
|
451 |
-In the following code chunk, we plot the total copy number versus |
|
452 |
-physical position for chromosome 1 and overlay the copy number |
|
453 |
-predictions from the HMM and a CNV call from the segmentation obtained |
|
454 |
-by thresholding the segment means. We restrict our focus to regions |
|
455 |
-that have 3 or more markers. |
|
456 |
- |
|
457 |
-<<smoothFig, fig=TRUE, width=8, height=8>>= |
|
458 |
-require(SNPchip) |
|
459 |
-CHR <- 1 |
|
460 |
-cnSet2 <- copynumberSet |
|
461 |
-## require at least 3 markers |
|
462 |
-fit.cn2 <- fit.cn[fit.cn$numMarkers >= 3, ] |
|
463 |
-cbs.ir2 <- cbs.ir[cbs.ir$numMarkers >= 3, ] |
|
464 |
- |
|
465 |
-copynumberSet2 <- copynumberSet[which(chromosome(copynumberSet) == CHR), ] |
|
466 |
-layout(matrix(1:5, 5,1), heights=c(1, 1, 1, 1, 1)) |
|
467 |
-par(mar=c(0.5, 0.5, 0.5, 0.5), las=1, oma=c(4, 4, 4, 4)) |
|
468 |
-for(j in 1:5){ |
|
469 |
- fit.cn3 <- fit.cn2[fit.cn2$chrom==CHR & fit.cn2$sampleId == sampleNames(copynumberSet2)[j], ] |
|
470 |
- cbs.ir3 <- cbs.ir2[cbs.ir2$chrom==CHR & cbs.ir2$sampleId == sampleNames(copynumberSet2)[j], ] |
|
471 |
- plot(position(copynumberSet2), copyNumber(copynumberSet2)[, j], pch=".", cex=0.6,col="grey60", xaxt="n", |
|
472 |
- ylim=c(-1,6), ylab="total copy number", |
|
473 |
- yaxt="n", |
|
474 |
- xlim=c(0, max(position(copynumberSet2)))) |
|
475 |
- |
|
476 |
- legend("topleft", legend=paste("SNR:", round(copynumberSet2$SNR[j], 1))) |
|
477 |
- axis(2, at=0:6, labels=0:6) |
|
478 |
- if(j == 5){ |
|
479 |
- mtext("physical position (Mb)", 1, outer=TRUE, line=2) |
|
480 |
- mtext("Chromosome 8", 3, line=0, outer=TRUE) |
|
481 |
- at <- pretty(position(copynumberSet2)) |
|
482 |
- } |
|
483 |
- ##Use polygon at bottom to show predictions |
|
484 |
- w <- width(fit.cn3) |
|
485 |
- fit.cn3 <- fit.cn3[order(w, decreasing=TRUE), ] |
|
486 |
- ##fit.cn3 <- fit.cn3[fit.cn3$numMarkers > 5, ] |
|
487 |
- col <- c("blue", "black", "grey80", "red") |
|
488 |
- x <- c(min(start(fit.cn3)), max(end(fit.cn3))) |
|
489 |
- xx <- c(x, rev(x)) |
|
490 |
- y <- rep(c(-0.3, 0), each=2) |
|
491 |
- polygon(xx, y, col="white") |
|
492 |
- for(i in seq_along(w)){ |
|
493 |
- x <- c(start(fit.cn3)[i], end(fit.cn3)[i]) |
|
494 |
- xx <- c(x, rev(x)) |
|
495 |
- statecolor <- col[fit.cn3$state[i]] |
|
496 |
- polygon(xx, y, col=statecolor, border=statecolor) |
|
497 |
- } |
|
498 |
- w <- width(cbs.ir3) |
|
499 |
- cbs.ir3 <- cbs.ir3[order(w, decreasing=TRUE), ] |
|
500 |
- ##cbs.ir3 <- cbs.ir3[cbs.ir3$numMarkers > 5, ] |
|
501 |
- y <- rep(c(-1, -0.7), each=2) |
|
502 |
- for(i in seq_along(w)){ |
|
503 |
- x <- c(start(cbs.ir3)[i], end(cbs.ir3)[i]) |
|
504 |
- xx <- c(x, rev(x)) |
|
505 |
- statecolor <- col[cbs.ir3$cnCall[i]] |
|
506 |
- polygon(xx, y, col=statecolor, border=statecolor) |
|
507 |
- } |
|
508 |
- axis(4, at=c(-0.9, -0.1), c("CBS", "HMM"), cex=0.8) |
|
509 |
- if(j==5){ |
|
510 |
- legend("topright", fill=col, legend=c(0, 1, 2, expression(3+.))) |
|
511 |
-## invisible(plotCytoband(8, label.cytoband=FALSE, xlim=c(0,max(position(copynumberSet2))))) |
|
512 |
- labels <- at/1e6 |
|
513 |
- axis(1, at=at, labels=labels, outer=T, line=0) |
|
514 |
- } |
|
515 |
- ## draw centromere |
|
516 |
- xx <- c(chrAnn[1:2], chrAnn[2:1]) |
|
517 |
- yy <- c(-0.5, -0.5, 5, 5) |
|
518 |
- polygon(xx, yy, col="bisque") |
|
519 |
-} |
|
229 |
+Note that the normalized intensities for the A and B alleles are no |
|
230 |
+longer \verb+NA+s and can now be accessed and inspected using the |
|
231 |
+methods \Rfunction{A} and \Rfunction{B}, respectively. |
|
232 |
+ |
|
233 |
+<<intensities>>= |
|
234 |
+invisible(open(A(cnSet))) |
|
235 |
+invisible(open(B(cnSet))) |
|
236 |
+as.matrix(A(cnSet)[1:5, 1:5]) |
|
237 |
+as.matrix(B(cnSet)[1:5, 1:5]) |
|
238 |
+invisible(close(A(cnSet))) |
|
239 |
+invisible(close(B(cnSet))) |
|
520 | 240 |
@ |
521 | 241 |
|
522 |
-TODO: Samples 4 and 5 have a sparse band of CN-estimates near 1 that |
|
523 |
-is uniformly distributed across chromosome 1. This suggests markers |
|
524 |
-that are either not well fit by the linear model, or a problem with |
|
525 |
-the genotyping. Need to check. |
|
526 |
- |
|
527 |
- |
|
528 |
-%Make a 4 x 4 table of the called states for each sample. |
|
529 |
-% NA estimates are not counted in CBS -- need to recompute the number |
|
530 |
-% of markers in each segment in order to make the 4 x 4 table |
|
531 |
- |
|
532 |
-<<smooth, eval=FALSE, echo=FALSE>>= |
|
533 |
-nm.hmm <- sapply(split(fit.cn$numMarkers, fit.cn$sampleId), sum) |
|
534 |
-stopifnot(length(unique(nm.hmm)) == 1) |
|
535 |
-nm.cbs <- sapply(split(cbs.ir$numMarkers, cbs.ir$sampleId), sum) |
|
536 |
-stopifnot(length(unique(nm.cbs)) == 1) |
|
537 |
-stopifnot(all.equal(nm.cbs, nm.hmm)) |
|
538 |
-hmm.states <- rep(fit.cn$state, fit.cn$numMarkers) |
|
539 |
-sample.id <- rep(fit.cn$sampleId, fit.cn$numMarkers) |
|
540 |
-hmm.states <- split(hmm.states, sample.id) |
|
541 |
-cbs.states <- rep(cbs.ir$cnCall, cbs.ir$numMarkers) |
|
542 |
-sample.id <- rep(cbs.ir$sampleId, cbs.ir$numMarkers) |
|
543 |
-cbs.states <- split(cbs.states, sample.id) |
|
544 |
-tabs <- vector("list", length(cbs.states)) |
|
545 |
-for(i in seq_along(cbs.states)) tabs[[i]] <- table(cbs.states[[i]], hmm.states[[i]]) |
|
546 |
-names(tabs) <- names(cbs.states) |
|
547 |
-@ |
|
548 | 242 |
|
243 |
+\section{Genotyping} |
|
549 | 244 |
|
550 |
-\section{Accessors for extracting the locus-level copy number |
|
551 |
- estimates.} |
|
552 | 245 |
|
553 |
-As an example of how to use accessors to obtain the allele-specific CN |
|
554 |
-estimates, the following code chunk extracts the allele-specific copy |
|
555 |
-number for polymorphic markers on chromosome 21. |
|
246 |
+CRLMM genotype calls and confidence scores are estimated using the |
|
247 |
+function \Rfunction{genotypeInf}. |
|
556 | 248 |
|
557 |
-<<acn-accessor>>= |
|
558 |
-marker.index <- which(chromosome(cnSet) == 21 & isSnp(cnSet)) |
|
559 |
-ca <- CA(cnSet, i=marker.index) |
|
560 |
-cb <- CB(cnSet, i=marker.index) |
|
561 |
-missing.index <- which(rowSums(is.na(ca))==ncol(cnSet)) |
|
562 |
-ca <- ca[-missing.index, ] |
|
563 |
-cb <- cb[-missing.index, ] |
|
564 |
-@ |
|
565 |
-\noindent Negating the \Robject{isSnp} function could be used to |
|
566 |
-extract the estimates at nonpolymorphic markers. For instance, |
|
567 |
-<<monomorphic-accessor>>= |
|
568 |
-cn.monomorphic <- CA(cnSet, i=which(chromosome(cnSet) == 21 & !isSnp(cnSet))) |
|
249 |
+<<genotype,cache=TRUE>>= |
|
250 |
+updated <- genotypeInf(cnSet, mixtureParams=mixtureParams) |
|
251 |
+print(updated) |
|
569 | 252 |
@ |
570 | 253 |
|
571 |
-At polymorphic loci, the total copy number is the sum of the number of |
|
572 |
-copies of the A allele and the number of copies for the B allele. At |
|
573 |
-nonpolymorphic loci, the total copy number is the number of copies for |
|
574 |
-the A allele. The helper function totalCopyNumber can be used to |
|
575 |
-extract the total copy number for all polymorphic and nonpolymorphic |
|
576 |
-markers. Documentation of the \Rfunction{totalCopyNumber} will be |
|
577 |
-available in the devel version of the \Rpackage{oligoClasses}. |
|
578 |
- |
|
579 |
-<<copyNumberHelper>>= |
|
580 |
-cn.total <- ca+cb |
|
581 |
-cn.total2 <- totalCopynumber(cnSet, i=marker.index) |
|
582 |
-cn.total2 <- cn.total2[-missing.index, ] |
|
583 |
-all.equal(cn.total2, cn.total) |
|
584 |
-@ |
|
254 |
+The posterior probabilities for the |
|
255 |
+genotype calls in the \verb+callProbability+ element of the |
|
256 |
+\verb+assayData+ are stored as integers to reduce the file size on |
|
257 |
+disk. However, the scores can be easily transformed back to the |
|
258 |
+probability scale using the \Rfunction{i2p} function as illustrated in |
|
259 |
+the following code chunk. |
|
585 | 260 |
|
586 |
-A few simple visualizations may be helpful at this point. The first |
|
587 |
-plot is a histogram of the signal to noise ratio of the sample -- an |
|
588 |
-overall measure of how well the genotype clusters separate. (This |
|
589 |
-statistic tends to be much higher for Illumina than for the Affymetrix |
|
590 |
-platforms.) The second is a visualization of the total copy number |
|
591 |
-estimates plotted versus physical position on chromosome 1 for the two |
|
592 |
-samples with the lowest (top) and highest (bottom) signal to noise |
|
593 |
-ratios. |
|
594 |
- |
|
595 |
-<<snr, fig=TRUE>>= |
|
596 |
-open(cnSet$SNR) |
|
597 |
-snr <- cnSet$SNR[,] |
|
598 |
-hist(snr, breaks=15) |
|
261 |
+<<callprobs>>= |
|
262 |
+invisible(open(snpCallProbability(cnSet))) |
|
263 |
+callProbs <- as.matrix(snpCallProbability(cnSet)[1:5, 1:5]) |
|
264 |
+i2p(callProbs) |
|
265 |
+invisible(close(snpCallProbability(cnSet))) |
|
599 | 266 |
@ |
600 | 267 |
|
601 |
- |
|
602 |
-We plot the total copy number estimates versus physical position the |
|
603 |
-two samples with the lowest (top) and highest (bottom) signal to noise |
|
604 |
-ratios. |
|
605 |
- |
|
606 |
-\begin{figure} |
|
607 |
- \centering |
|
608 |
-<<firstSample, fig=TRUE>>= |
|
609 |
-low.snr <- which(snr == min(snr)) |
|
610 |
-high.snr <- which(snr == max(snr)) |
|
611 |
-marker.index <- which(chromosome(cnSet) == 1) |
|
612 |
-x <- position(cnSet)[marker.index] |
|
613 |
-cn.total <- totalCopynumber(cnSet, i=marker.index, j=c(low.snr, high.snr)) |
|
614 |
-##x <- x[-missing.index] |
|
615 |
-snrs <- round(snr[c(low.snr, high.snr)], 2) |
|
616 |
-par(mfrow=c(2,1), las=1, mar=c(0.5, 4, 0.5, 0.5), oma=c(4, 1, 1,1)) |
|
617 |
-for(j in 1:2){ |
|
618 |
- cn <- cn.total[, j] |
|
619 |
- plot(x, cn, pch=".", ylab="copy number", xaxt="n", ylim=c(0, 6), |
|
620 |
- col="grey50") |
|
621 |
- legend("topleft", legend=paste("SNR", snrs[j])) |
|
622 |
-} |
|
623 |
-axis(1, at=pretty(x), labels=pretty(x/1e6)) |
|
624 |
-mtext("Mb", 1, outer=TRUE, line=2) |
|
268 |
+%As described in the \texttt{copynumber} vignette, copy number |
|
269 |
+%estimation in \crlmm{} works best when there are a sufficient number |
|
270 |
+%of samples such that AA, AB, and BB genotypes are observed at most |
|
271 |
+%loci. For small studies (e.g., fewer than 50 samples), there will be a |
|
272 |
+%large number of SNPs that are monomorphic. For monomorphic SNPs, the |
|
273 |
+%estimation problem becomes more difficult and alternative strategies |
|
274 |
+%that estimate the relative total copy number may be preferable. In |
|
275 |
+%addition to installing \crlmm{}, one must also install the appropriate |
|
276 |
+%annotation package for the Illumina platform. In the following code, |
|
277 |
+%we list the platforms for which annotation packages are currently |
|
278 |
+%available. Next we create a directory where output files will be |
|
279 |
+%stored and indicate the directory that contains the IDAT files that |
|
280 |
+%will be used in our analysis. |
|
281 |
+ |
|
282 |
+\textbf{Wrapper:} The construction of a \Rclass{CNSet} object, |
|
283 |
+preprocessing, and genotype calling are wrapped into a convenience |
|
284 |
+function called \Rfunction{genotype.Illumina}. |
|
285 |
+ |
|
286 |
+<<genotype.Illumina,cache=TRUE,results=hide>>= |
|
287 |
+cnSet2 <- genotype.Illumina(sampleSheet=samplesheet, |
|
288 |
+ arrayNames=arrayNames, |
|
289 |
+ arrayInfoColNames=arrayInfo, |
|
290 |
+ cdfName="human370v1c", |
|
291 |
+ batch=batch) |
|
625 | 292 |
@ |
626 | 293 |
|
627 |
-\caption{Copy number estimates for samples with low signal to noise |
|
628 |
- ratio (top) and high signal to noise ratio (bottom). Note the |
|
629 |
- slight wave in the bottom figure. Such waves are typically |
|
630 |
- correlated with GC-content and can be removed by scatterplot |
|
631 |
- smoothers or any of the published wave-correction tools for |
|
632 |
- genotyping arrays.} |
|
633 |
-\end{figure} |
|
634 |
- |
|
635 |
-\section{Session information} |
|
636 |
-<<sessionInfo, results=tex>>= |
|
637 |
-toLatex(sessionInfo()) |
|
294 |
+<<checkIdenticalCalls>>= |
|
295 |
+invisible(open(calls(cnSet))) |
|
296 |
+invisible(open(calls(cnSet2))) |
|
297 |
+snp.index <- which(isSnp(cnSet)) |
|
298 |
+identical(calls(cnSet)[snp.index, 1:20], calls(cnSet2)[snp.index, 1:20]) |
|
299 |
+invisible(close(calls(cnSet))) |
|
300 |
+invisible(close(calls(cnSet2))) |
|
638 | 301 |
@ |
639 | 302 |
|
303 |
+To fully remove the data associated with the \Robject{cnSet2} object, |
|
304 |
+one should use the \Rfunction{delete} function in the \Rpackage{ff} |
|
305 |
+package followed by the \Rfunction{rm} function. The following code |
|
306 |
+is not evaluated is it would change the results of the cached |
|
307 |
+computations in the previous code chunk. |
|
308 |
+ |
|
309 |
+<<delete, eval=FALSE>>= |
|
310 |
+lapply(assayData(cnSet2), delete) |
|
311 |
+lapply(batchStatistics(cnSet2), delete) |
|
312 |
+delete(cnSet2$gender) |
|
313 |
+delete(cnSet2$SNR) |
|
314 |
+delete(cnSet2$SKW) |
|
315 |
+rm(cnSet2) |
|
316 |
+@ |
|
640 | 317 |
|
641 | 318 |
\end{document} |
642 |
- |
|
643 |
- |
|
644 |
- |
646 | 320 |
deleted file mode 100644 |
... | ... |
@@ -1,44 +0,0 @@ |
1 |
-\name{constructIlluminaCNSet} |
|
2 |
-\alias{constructIlluminaCNSet} |
|
3 |
-\title{ |
|
4 |
- Construct an instance of CNSetLM after preprocessing Illumina files |
|
5 |
-} |
|
6 |
-\description{ |
|
7 |
- |
|
8 |
- Assemble the preprocessed data and genotype calls from |
|
9 |
- \code{crlmmIllumina} to initialize a \code{CNSetLM} object. |
|
10 |
- |
|
11 |
-} |
|
12 |
-\usage{ |
|
13 |
-constructIlluminaCNSet(crlmmResult, path, snpFile, cnFile) |
|
14 |
-} |
|
15 |
-\arguments{ |
|
16 |
- \item{crlmmResult}{ |
|
17 |
- |
|
18 |
- A \code{SnpSet} object returned by function \code{crlmmIllumina} or |
|
19 |
- \code{crlmmIllumina2}. |
|
20 |
- |
|
21 |
-} |
|
22 |
- |
|
23 |
- \item{path}{path to files created by \code{crlmmIllumina}} |
|
24 |
- |
|
25 |
- \item{snpFile}{ |
|
26 |
- The \code{snpFile} filename specified in \code{crlmmIllumina}. |
|
27 |
-} |
|
28 |
- \item{cnFile}{ |
|
29 |
- The \code{cnFile} filename specified in \code{crlmmIllumina}. |
|
30 |
-} |
|
31 |
-} |
|
32 |
- |
|
33 |
-\value{ |
|
34 |
- An object of class \code{CNSetLM}. |
|
35 |
-} |
|
36 |
-\author{ |
|
37 |
-R. Scharpf |
|
38 |
-} |
|
39 |
-\seealso{ |
|
40 |
- \code{\link{CNSet-class}}, \code{\link{crlmmIllumina}} |
|
41 |
-} |
|
42 |
- |
|
43 |
-\keyword{manip} |
|
44 |
- |