... | ... |
@@ -1,14 +1,14 @@ |
1 | 1 |
Package: AneuFinder |
2 | 2 |
Type: Package |
3 | 3 |
Title: Analysis of Copy Number Variation in Single-Cell-Sequencing Data |
4 |
-Version: 0.99.1 |
|
5 |
-Date: 2015-09-19 |
|
4 |
+Version: 0.99.3 |
|
5 |
+Date: 2016-03 |
|
6 | 6 |
Author: Aaron Taudt, Bjorn Bakker, David Porubsky |
7 | 7 |
Maintainer: Aaron Taudt <aaron.taudt@gmail.com> |
8 | 8 |
Description: This package implements functions for CNV calling, plotting, export |
9 | 9 |
and analysis from whole-genome single cell sequencing data. |
10 | 10 |
Depends: |
11 |
- R (>= 3.2.3), |
|
11 |
+ R (>= 3.3.0), |
|
12 | 12 |
GenomicRanges, |
13 | 13 |
cowplot, |
14 | 14 |
AneuFinderData |
... | ... |
@@ -25,6 +25,7 @@ |
25 | 25 |
#' @param bw Bandwidth for SCE hotspot detection (see \code{\link{hotspotter}} for further details). |
26 | 26 |
#' @param pval P-value for SCE hotspot detection (see \code{\link{hotspotter}} for further details). |
27 | 27 |
#' @param cluster.plots A logical indicating whether plots should be clustered by similarity. |
28 |
+#' @return \code{NULL} |
|
28 | 29 |
#' @author Aaron Taudt |
29 | 30 |
#' @import foreach |
30 | 31 |
#' @import doParallel |
... | ... |
@@ -216,7 +216,7 @@ binReads <- function(file, format, assembly, ID=basename(file), bamindex=file, c |
216 | 216 |
} |
217 | 217 |
|
218 | 218 |
### Quality measures ### |
219 |
- qualityInfo <- list(complexity=complexity, coverage=coverage, spikyness=qc.spikyness(binned.data$counts), shannon.entropy=qc.entropy(binned.data$counts)) |
|
219 |
+ qualityInfo <- list(complexity=complexity, coverage=coverage, spikiness=qc.spikiness(binned.data$counts), shannon.entropy=qc.entropy(binned.data$counts)) |
|
220 | 220 |
attr(binned.data, 'qualityInfo') <- qualityInfo |
221 | 221 |
attr(binned.data, 'min.mapq') <- min.mapq |
222 | 222 |
|
... | ... |
@@ -63,7 +63,7 @@ correctMappability <- function(binned.data.list, same.binsize, reference, format |
63 | 63 |
|
64 | 64 |
### Quality measures ### |
65 | 65 |
## Spikyness |
66 |
- attr(binned.data, 'spikyness') <- qc.spikyness(binned.data$counts) |
|
66 |
+ attr(binned.data, 'spikiness') <- qc.spikiness(binned.data$counts) |
|
67 | 67 |
## Shannon entropy |
68 | 68 |
attr(binned.data, 'shannon.entropy') <- qc.entropy(binned.data$counts) |
69 | 69 |
|
... | ... |
@@ -200,7 +200,7 @@ correctGC <- function(binned.data.list, GC.BSgenome, same.binsize=FALSE) { |
200 | 200 |
|
201 | 201 |
### Quality measures ### |
202 | 202 |
## Spikyness |
203 |
- attr(binned.data, 'spikyness') <- qc.spikyness(binned.data$counts) |
|
203 |
+ attr(binned.data, 'spikiness') <- qc.spikiness(binned.data$counts) |
|
204 | 204 |
## Shannon entropy |
205 | 205 |
attr(binned.data, 'shannon.entropy') <- qc.entropy(binned.data$counts) |
206 | 206 |
|
... | ... |
@@ -8,6 +8,7 @@ |
8 | 8 |
#' Use \code{exportReadCounts} to export the binned read counts from an \code{\link{aneuHMM}} object in WIGGLE format. |
9 | 9 |
#' Use \code{exportGRanges} to export a \code{\link{GRanges}} object in BED format. |
10 | 10 |
#' |
11 |
+#' @return \code{NULL} |
|
11 | 12 |
#' @name export |
12 | 13 |
#' @author Aaron Taudt |
13 | 14 |
#'@examples |
... | ... |
@@ -84,7 +84,7 @@ findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", max.time=-1 |
84 | 84 |
univariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", max.time=-1, max.iter=-1, num.trials=1, eps.try=NULL, num.threads=1, count.cutoff.quantile=0.999, strand='*', states=c("zero-inflation",paste0(0:10,"-somy")), most.frequent.state="2-somy", algorithm="EM", initial.params=NULL) { |
85 | 85 |
|
86 | 86 |
### Define cleanup behaviour ### |
87 |
- on.exit(.C("R_univariate_cleanup")) |
|
87 |
+ on.exit(.C("C_univariate_cleanup", PACKAGE = 'AneuFinder')) |
|
88 | 88 |
|
89 | 89 |
## Intercept user input |
90 | 90 |
if (class(binned.data) != 'GRanges') { |
... | ... |
@@ -150,7 +150,7 @@ univariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", |
150 | 150 |
result$ID <- ID |
151 | 151 |
result$bins <- binned.data |
152 | 152 |
## Quality info |
153 |
- qualityInfo <- list(shannon.entropy=qc.entropy(counts), spikyness=qc.spikyness(counts), complexity=attr(result$bins,'qualityInfo')$complexity$preseqR, bhattacharyya=NA) |
|
153 |
+ qualityInfo <- list(shannon.entropy=qc.entropy(counts), spikiness=qc.spikiness(counts), complexity=attr(result$bins,'qualityInfo')$complexity$preseqR, bhattacharyya=NA) |
|
154 | 154 |
result$qualityInfo <- qualityInfo |
155 | 155 |
## Convergence info |
156 | 156 |
convergenceInfo <- list(eps=eps, loglik=NA, loglik.delta=NA, num.iterations=NA, time.sec=NA, error=NA) |
... | ... |
@@ -251,7 +251,7 @@ univariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", |
251 | 251 |
prob.initial[index] <- 0.5 |
252 | 252 |
} |
253 | 253 |
|
254 |
- hmm <- .C("R_univariate_hmm", |
|
254 |
+ hmm <- .C("C_univariate_hmm", |
|
255 | 255 |
counts = as.integer(counts), # int* O |
256 | 256 |
num.bins = as.integer(numbins), # int* T |
257 | 257 |
num.states = as.integer(numstates), # int* N |
... | ... |
@@ -275,7 +275,8 @@ univariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", |
275 | 275 |
num.threads = as.integer(num.threads), # int* num_threads |
276 | 276 |
error = as.integer(0), # int* error (error handling) |
277 | 277 |
count.cutoff = as.integer(count.cutoff), # int* count.cutoff |
278 |
- algorithm = as.integer(algorithm) # int* algorithm |
|
278 |
+ algorithm = as.integer(algorithm), # int* algorithm |
|
279 |
+ PACKAGE = 'AneuFinder' |
|
279 | 280 |
) |
280 | 281 |
|
281 | 282 |
hmm$eps <- eps.try |
... | ... |
@@ -322,7 +323,7 @@ univariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", |
322 | 323 |
|
323 | 324 |
# Rerun the HMM with different epsilon and initial parameters from trial run |
324 | 325 |
message(paste0("Rerunning trial ",index2use," with eps = ",eps)) |
325 |
- hmm <- .C("R_univariate_hmm", |
|
326 |
+ hmm <- .C("C_univariate_hmm", |
|
326 | 327 |
counts = as.integer(counts), # int* O |
327 | 328 |
num.bins = as.integer(numbins), # int* T |
328 | 329 |
num.states = as.integer(numstates), # int* N |
... | ... |
@@ -346,7 +347,8 @@ univariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", |
346 | 347 |
num.threads = as.integer(num.threads), # int* num_threads |
347 | 348 |
error = as.integer(0), # int* error (error handling) |
348 | 349 |
count.cutoff = as.integer(count.cutoff), # int* count.cutoff |
349 |
- algorithm = as.integer(algorithm) |
|
350 |
+ algorithm = as.integer(algorithm), # int* algorithm |
|
351 |
+ PACKAGE = 'AneuFinder' |
|
350 | 352 |
) |
351 | 353 |
} |
352 | 354 |
|
... | ... |
@@ -411,7 +413,7 @@ univariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", |
411 | 413 |
convergenceInfo <- list(eps=eps, loglik=hmm$loglik, loglik.delta=hmm$loglik.delta, num.iterations=hmm$num.iterations, time.sec=hmm$time.sec, error=hmm$error) |
412 | 414 |
result$convergenceInfo <- convergenceInfo |
413 | 415 |
## Quality info |
414 |
- qualityInfo <- list(shannon.entropy=qc.entropy(counts), spikyness=qc.spikyness(counts), complexity=attr(result$bins,'qualityInfo')$complexity$preseqR, bhattacharyya=qc.bhattacharyya(result)) |
|
416 |
+ qualityInfo <- list(shannon.entropy=qc.entropy(counts), spikiness=qc.spikiness(counts), complexity=attr(result$bins,'qualityInfo')$complexity$preseqR, bhattacharyya=qc.bhattacharyya(result)) |
|
415 | 417 |
result$qualityInfo <- qualityInfo |
416 | 418 |
} else if (hmm$error == 1) { |
417 | 419 |
warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": A NaN occurred during the Baum-Welch! Parameter estimation terminated prematurely. Check your library! The following factors are known to cause this error: 1) Your read counts contain very high numbers. Try again with a lower value for 'count.cutoff.quantile'. 2) Your library contains too few reads in each bin. 3) Your library contains reads for a different genome than it was aligned to.")) |
... | ... |
@@ -497,7 +499,7 @@ bivariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", m |
497 | 499 |
result$ID <- ID |
498 | 500 |
result$bins <- binned.data |
499 | 501 |
## Quality info |
500 |
- qualityInfo <- list(shannon.entropy=qc.entropy(counts), spikyness=qc.spikyness(counts), complexity=attr(result$bins,'qualityInfo')$complexity$preseqR, bhattacharyya=NA) |
|
502 |
+ qualityInfo <- list(shannon.entropy=qc.entropy(counts), spikiness=qc.spikiness(counts), complexity=attr(result$bins,'qualityInfo')$complexity$preseqR, bhattacharyya=NA) |
|
501 | 503 |
result$qualityInfo <- qualityInfo |
502 | 504 |
|
503 | 505 |
# Check if there are counts in the data, otherwise HMM will blow up |
... | ... |
@@ -712,11 +714,11 @@ bivariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", m |
712 | 714 |
stopTimedMessage(ptm) |
713 | 715 |
|
714 | 716 |
### Define cleanup behaviour ### |
715 |
- on.exit(.C("R_multivariate_cleanup", as.integer(num.comb.states))) |
|
717 |
+ on.exit(.C("C_multivariate_cleanup", as.integer(num.comb.states), PACKAGE = 'AneuFinder')) |
|
716 | 718 |
|
717 | 719 |
### Run the multivariate HMM |
718 | 720 |
# Call the C function |
719 |
- hmm <- .C("R_multivariate_hmm", |
|
721 |
+ hmm <- .C("C_multivariate_hmm", |
|
720 | 722 |
densities = as.double(densities), # double* D |
721 | 723 |
num.bins = as.integer(num.bins), # int* T |
722 | 724 |
num.comb.states = as.integer(num.comb.states), # int* N |
... | ... |
@@ -734,7 +736,8 @@ bivariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", m |
734 | 736 |
use.initial.params = as.logical(use.initial), # bool* use_initial_params |
735 | 737 |
num.threads = as.integer(num.threads), # int* num_threads |
736 | 738 |
error = as.integer(0), # error handling |
737 |
- algorithm = as.integer(algorithm) # int* algorithm |
|
739 |
+ algorithm = as.integer(algorithm), # int* algorithm |
|
740 |
+ PACKAGE = 'AneuFinder' |
|
738 | 741 |
) |
739 | 742 |
|
740 | 743 |
### Check convergence ### |
... | ... |
@@ -811,7 +814,7 @@ bivariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", m |
811 | 814 |
convergenceInfo <- list(eps=eps, loglik=hmm$loglik, loglik.delta=hmm$loglik.delta, num.iterations=hmm$num.iterations, time.sec=hmm$time.sec) |
812 | 815 |
result$convergenceInfo <- convergenceInfo |
813 | 816 |
## Quality info |
814 |
- qualityInfo <- list(shannon.entropy=qc.entropy(counts), spikyness=qc.spikyness(counts), complexity=attr(result$bins,'qualityInfo')$complexity$preseqR, bhattacharyya=qc.bhattacharyya(result)) |
|
817 |
+ qualityInfo <- list(shannon.entropy=qc.entropy(counts), spikiness=qc.spikiness(counts), complexity=attr(result$bins,'qualityInfo')$complexity$preseqR, bhattacharyya=qc.bhattacharyya(result)) |
|
815 | 818 |
result$qualityInfo <- qualityInfo |
816 | 819 |
## Univariate infos |
817 | 820 |
univariateParams <- list(transitionProbs=uni.transitionProbs, startProbs=uni.startProbs, distributions=distributions[[1]], weights=uni.weights) |
... | ... |
@@ -79,8 +79,8 @@ findSCEs <- function(binned.data, ID=NULL, eps=0.1, init="standard", max.time=-1 |
79 | 79 |
#' @export |
80 | 80 |
#'@examples |
81 | 81 |
#'## Load an HMM |
82 |
-#'file <- system.file("extdata", "primary-lung", "hmms", "AvdB150303_I_001.bam.RData", |
|
83 |
-#' package="AneuFinderData") |
|
82 |
+#'file <- list.files(system.file("extdata", "primary-lung", "hmms", |
|
83 |
+#' package="AneuFinderData"), full.names=TRUE) |
|
84 | 84 |
#'hmm <- loadHmmsFromFiles(file)[[1]] |
85 | 85 |
#'## Check number of segments before and after filtering |
86 | 86 |
#'length(hmm$segments) |
... | ... |
@@ -258,7 +258,7 @@ plotBivariateHistograms <- function(bihmm) { |
258 | 258 |
binned.data.plus$counts <- binned.data.plus$pcounts |
259 | 259 |
binned.data.plus$counts.gc <- binned.data.plus$pcounts.gc |
260 | 260 |
binned.data.stacked <- c(binned.data.minus, binned.data.plus) |
261 |
- mask.attributes <- c(grep('complexity', names(attributes(binned.data)), value=TRUE), 'spikyness', 'shannon.entropy') |
|
261 |
+ mask.attributes <- c(grep('complexity', names(attributes(binned.data)), value=TRUE), 'spikiness', 'shannon.entropy') |
|
262 | 262 |
attributes(binned.data.stacked)[mask.attributes] <- attributes(binned.data)[mask.attributes] |
263 | 263 |
|
264 | 264 |
## Make fake uni.hmm and plot |
... | ... |
@@ -369,10 +369,10 @@ plotUnivariateHistogram <- function(model, state=NULL, strand='*', chromosome=NU |
369 | 369 |
|
370 | 370 |
# Quality info |
371 | 371 |
if (is.null(model$qualityInfo$complexity)) { model$qualityInfo$complexity <- NA } |
372 |
- if (is.null(model$qualityInfo$spikyness)) { model$qualityInfo$spikyness <- NA } |
|
372 |
+ if (is.null(model$qualityInfo$spikiness)) { model$qualityInfo$spikiness <- NA } |
|
373 | 373 |
if (is.null(model$qualityInfo$shannon.entropy)) { model$qualityInfo$shannon.entropy <- NA } |
374 | 374 |
if (is.null(model$qualityInfo$bhattacharyya)) { model$qualityInfo$bhattacharyya <- NA } |
375 |
- quality.string <- paste0('reads = ',round(sum(model$bins$counts)/1e6,2),'M, complexity = ',round(model$qualityInfo$complexity),', spikyness = ',round(model$qualityInfo$spikyness,2),', entropy = ',round(model$qualityInfo$shannon.entropy,2),', bhattacharyya = ',round(model$qualityInfo$bhattacharyya,2), ', num.segments = ',length(model$segments)) |
|
375 |
+ quality.string <- paste0('reads = ',round(sum(model$bins$counts)/1e6,2),'M, complexity = ',round(model$qualityInfo$complexity),', spikiness = ',round(model$qualityInfo$spikiness,2),', entropy = ',round(model$qualityInfo$shannon.entropy,2),', bhattacharyya = ',round(model$qualityInfo$bhattacharyya,2), ', num.segments = ',length(model$segments)) |
|
376 | 376 |
|
377 | 377 |
# Plot the histogram |
378 | 378 |
ggplt <- ggplot(data.frame(counts)) + geom_histogram(aes_string(x='counts', y='..density..'), binwidth=1, color='black', fill='white') + coord_cartesian(xlim=c(0,rightxlim)) + theme_bw() + xlab("read count") + ggtitle(bquote(atop(.(model$ID), atop(.(quality.string),'')))) |
... | ... |
@@ -453,7 +453,7 @@ plotKaryogram <- function(model, both.strands=FALSE, plot.SCE=FALSE, file=NULL) |
453 | 453 |
model <- list() |
454 | 454 |
model$ID <- '' |
455 | 455 |
model$bins <- binned.data |
456 |
- model$qualityInfo <- list(shannon.entropy=qc.entropy(binned.data$counts), spikyness=qc.spikyness(binned.data$counts), complexity=attr(binned.data, 'complexity.preseqR'), bhattacharyya=NA) |
|
456 |
+ model$qualityInfo <- list(shannon.entropy=qc.entropy(binned.data$counts), spikiness=qc.spikiness(binned.data$counts), complexity=attr(binned.data, 'complexity.preseqR'), bhattacharyya=NA) |
|
457 | 457 |
plot.karyogram(model, both.strands=both.strands, file=file) |
458 | 458 |
} else if (class(model)==class.univariate.hmm) { |
459 | 459 |
plot.karyogram(model, both.strands=both.strands, file=file) |
... | ... |
@@ -496,10 +496,10 @@ plot.karyogram <- function(model, both.strands=FALSE, plot.SCE=FALSE, file=NULL) |
496 | 496 |
|
497 | 497 |
# Quality info |
498 | 498 |
if (is.null(model$qualityInfo$complexity)) { model$qualityInfo$complexity <- NA } |
499 |
- if (is.null(model$qualityInfo$spikyness)) { model$qualityInfo$spikyness <- NA } |
|
499 |
+ if (is.null(model$qualityInfo$spikiness)) { model$qualityInfo$spikiness <- NA } |
|
500 | 500 |
if (is.null(model$qualityInfo$shannon.entropy)) { model$qualityInfo$shannon.entropy <- NA } |
501 | 501 |
if (is.null(model$qualityInfo$bhattacharyya)) { model$qualityInfo$bhattacharyya <- NA } |
502 |
- quality.string <- paste0('reads = ',round(sum(model$bins$counts)/1e6,2),'M, complexity = ',round(model$qualityInfo$complexity),', spikyness = ',round(model$qualityInfo$spikyness,2),', entropy = ',round(model$qualityInfo$shannon.entropy,2),', bhattacharyya = ',round(model$qualityInfo$bhattacharyya,2), ', num.segments = ',length(model$segments)) |
|
502 |
+ quality.string <- paste0('reads = ',round(sum(model$bins$counts)/1e6,2),'M, complexity = ',round(model$qualityInfo$complexity),', spikiness = ',round(model$qualityInfo$spikiness,2),', entropy = ',round(model$qualityInfo$shannon.entropy,2),', bhattacharyya = ',round(model$qualityInfo$bhattacharyya,2), ', num.segments = ',length(model$segments)) |
|
503 | 503 |
|
504 | 504 |
## Get SCE coordinates |
505 | 505 |
if (plot.SCE) { |
... | ... |
@@ -918,7 +918,7 @@ plotProfile <- function(model, both.strands=FALSE, plot.SCE=TRUE, file=NULL) { |
918 | 918 |
model <- list() |
919 | 919 |
model$ID <- '' |
920 | 920 |
model$bins <- binned.data |
921 |
- model$qualityInfo <- list(shannon.entropy=qc.entropy(binned.data$counts), spikyness=qc.spikyness(binned.data$counts), complexity=attr(binned.data, 'complexity.preseqR')) |
|
921 |
+ model$qualityInfo <- list(shannon.entropy=qc.entropy(binned.data$counts), spikiness=qc.spikiness(binned.data$counts), complexity=attr(binned.data, 'complexity.preseqR')) |
|
922 | 922 |
plot.profile(model, both.strands=both.strands, plot.SCE=FALSE, file=file) |
923 | 923 |
} else if (class(model)==class.univariate.hmm) { |
924 | 924 |
plot.profile(model, both.strands=FALSE, plot.SCE=FALSE, file=file) |
... | ... |
@@ -1041,10 +1041,10 @@ plot.profile <- function(model, both.strands=FALSE, plot.SCE=TRUE, file=NULL) { |
1041 | 1041 |
ggplt <- ggplt + scale_x_continuous(breaks=seqlengths(model$bins)/2+cum.seqlengths.0[as.character(seqlevels(model$bins))], labels=seqlevels(model$bins)) |
1042 | 1042 |
# Quality info |
1043 | 1043 |
if (is.null(model$qualityInfo$complexity)) { model$qualityInfo$complexity <- NA } |
1044 |
- if (is.null(model$qualityInfo$spikyness)) { model$qualityInfo$spikyness <- NA } |
|
1044 |
+ if (is.null(model$qualityInfo$spikiness)) { model$qualityInfo$spikiness <- NA } |
|
1045 | 1045 |
if (is.null(model$qualityInfo$shannon.entropy)) { model$qualityInfo$shannon.entropy <- NA } |
1046 | 1046 |
if (is.null(model$qualityInfo$bhattacharyya)) { model$qualityInfo$bhattacharyya <- NA } |
1047 |
- quality.string <- paste0('reads = ',round(sum(model$bins$counts)/1e6,2),'M, complexity = ',round(model$qualityInfo$complexity),', spikyness = ',round(model$qualityInfo$spikyness,2),', entropy = ',round(model$qualityInfo$shannon.entropy,2),', bhattacharyya = ',round(model$qualityInfo$bhattacharyya,2), ', num.segments = ',length(model$segments)) |
|
1047 |
+ quality.string <- paste0('reads = ',round(sum(model$bins$counts)/1e6,2),'M, complexity = ',round(model$qualityInfo$complexity),', spikiness = ',round(model$qualityInfo$spikiness,2),', entropy = ',round(model$qualityInfo$shannon.entropy,2),', bhattacharyya = ',round(model$qualityInfo$bhattacharyya,2), ', num.segments = ',length(model$segments)) |
|
1048 | 1048 |
ggplt <- ggplt + ylab('read count') + ggtitle(bquote(atop(.(model$ID), atop(.(quality.string),'')))) |
1049 | 1049 |
|
1050 | 1050 |
if (!is.null(file)) { |
... | ... |
@@ -15,12 +15,12 @@ |
15 | 15 |
#' @author Aaron Taudt |
16 | 16 |
NULL |
17 | 17 |
|
18 |
-#' @describeIn qualityControl Calculate the spikyness of a library |
|
19 |
-qc.spikyness <- function(counts) { |
|
18 |
+#' @describeIn qualityControl Calculate the spikiness of a library |
|
19 |
+qc.spikiness <- function(counts) { |
|
20 | 20 |
counts <- as.vector(counts) |
21 | 21 |
sum.counts <- sum(counts) |
22 |
- spikyness <- sum(abs(diff(counts))) / sum.counts |
|
23 |
- return(spikyness) |
|
22 |
+ spikiness <- sum(abs(diff(counts))) / sum.counts |
|
23 |
+ return(spikiness) |
|
24 | 24 |
} |
25 | 25 |
|
26 | 26 |
#' @describeIn qualityControl Calculate the Shannon entropy of a library |
... | ... |
@@ -59,7 +59,7 @@ getQC <- function(hmms) { |
59 | 59 |
qframe[[i1]] <- data.frame(total.read.count=sum(hmm$bins$counts), |
60 | 60 |
binsize=width(hmm$bins)[1], |
61 | 61 |
avg.read.count=mean(hmm$bins$counts), |
62 |
- spikyness=hmm$qualityInfo$spikyness, |
|
62 |
+ spikiness=hmm$qualityInfo$spikiness, |
|
63 | 63 |
entropy=hmm$qualityInfo$shannon.entropy, |
64 | 64 |
complexity=hmm$qualityInfo$complexity, |
65 | 65 |
loglik=hmm$convergenceInfo$loglik, |
... | ... |
@@ -79,7 +79,7 @@ getQC <- function(hmms) { |
79 | 79 |
#' |
80 | 80 |
#' The employed quality measures are: |
81 | 81 |
#' \itemize{ |
82 |
-#' \item Spikyness |
|
82 |
+#' \item Spikiness |
|
83 | 83 |
#' \item Entropy |
84 | 84 |
#' \item Number of segments |
85 | 85 |
#' \item Bhattacharrya distance |
... | ... |
@@ -89,8 +89,8 @@ getQC <- function(hmms) { |
89 | 89 |
#' @param hmms A list of \code{\link{aneuHMM}} objects or a list of files that contain such objects. |
90 | 90 |
#' @param G An integer vector specifying the number of clusters that are compared. See \code{\link[mclust:Mclust]{Mclust}} for details. |
91 | 91 |
#' @param itmax The maximum number of outer and inner iterations for the \code{\link[mclust:Mclust]{Mclust}} function. See \code{\link[mclust:emControl]{emControl}} for details. |
92 |
-#' @param measures The quality measures that are used for the clustering. Supported is any combination of \code{c('spikyness','entropy','num.segments','bhattacharyya','loglik','complexity','avg.read.count','total.read.count','binsize')}. |
|
93 |
-#' @param orderBy The quality measure to order the clusters by. Default is \code{'spikyness'}. |
|
92 |
+#' @param measures The quality measures that are used for the clustering. Supported is any combination of \code{c('spikiness','entropy','num.segments','bhattacharyya','loglik','complexity','avg.read.count','total.read.count','binsize')}. |
|
93 |
+#' @param orderBy The quality measure to order the clusters by. Default is \code{'spikiness'}. |
|
94 | 94 |
#' @param reverseOrder Logical indicating whether the ordering by \code{orderBy} is reversed. |
95 | 95 |
#' @return A \code{list} with the classification, parameters and the \code{\link[mclust]{Mclust}} fit. |
96 | 96 |
#' @author Aaron Taudt |
... | ... |
@@ -108,7 +108,7 @@ getQC <- function(hmms) { |
108 | 108 |
#'## Select files from the best 2 clusters for further processing |
109 | 109 |
#'best.files <- unlist(cl$classification[1:2]) |
110 | 110 |
#' |
111 |
-clusterByQuality <- function(hmms, G=1:9, itmax=c(100,100), measures=c('spikyness','entropy','num.segments','bhattacharyya','loglik'), orderBy='spikyness', reverseOrder=FALSE) { |
|
111 |
+clusterByQuality <- function(hmms, G=1:9, itmax=c(100,100), measures=c('spikiness','entropy','num.segments','bhattacharyya','loglik'), orderBy='spikiness', reverseOrder=FALSE) { |
|
112 | 112 |
|
113 | 113 |
hmms <- loadHmmsFromFiles(hmms) |
114 | 114 |
df <- getQC(hmms) |
... | ... |
@@ -41,6 +41,7 @@ readConfig <- function(configfile) { |
41 | 41 |
#' |
42 | 42 |
#' @param conf A list structure with parameter values. Each entry will be written in one line. |
43 | 43 |
#' @param configfile Filename of the outputfile. |
44 |
+#' @return \code{NULL} |
|
44 | 45 |
#' @author Aaron Taudt |
45 | 46 |
writeConfig <- function(conf, configfile) { |
46 | 47 |
|
... | ... |
@@ -85,6 +85,9 @@ Aneufinder(inputfolder, outputfolder, format, configfile = NULL, numCPU = 1, |
85 | 85 |
|
86 | 86 |
\item{cluster.plots}{A logical indicating whether plots should be clustered by similarity.} |
87 | 87 |
} |
88 |
+\value{ |
|
89 |
+\code{NULL} |
|
90 |
+} |
|
88 | 91 |
\description{ |
89 | 92 |
This function is an easy-to-use wrapper to \link[AneuFinder:binning]{bin the data}, \link[AneuFinder:findCNVs]{find copy-number-variations}, \link[AneuFinder:findSCEs]{find sister-chromatid-exchange} events, plot \link[AneuFinder:heatmapGenomewide]{genomewide heatmaps}, \link[AneuFinder:plot.aneuHMM]{distributions, profiles and karyograms}. |
90 | 93 |
} |
... | ... |
@@ -5,8 +5,8 @@ |
5 | 5 |
\title{Cluster based on quality variables} |
6 | 6 |
\usage{ |
7 | 7 |
clusterByQuality(hmms, G = 1:9, itmax = c(100, 100), |
8 |
- measures = c("spikyness", "entropy", "num.segments", "bhattacharyya", |
|
9 |
- "loglik"), orderBy = "spikyness", reverseOrder = FALSE) |
|
8 |
+ measures = c("spikiness", "entropy", "num.segments", "bhattacharyya", |
|
9 |
+ "loglik"), orderBy = "spikiness", reverseOrder = FALSE) |
|
10 | 10 |
} |
11 | 11 |
\arguments{ |
12 | 12 |
\item{hmms}{A list of \code{\link{aneuHMM}} objects or a list of files that contain such objects.} |
... | ... |
@@ -15,9 +15,9 @@ clusterByQuality(hmms, G = 1:9, itmax = c(100, 100), |
15 | 15 |
|
16 | 16 |
\item{itmax}{The maximum number of outer and inner iterations for the \code{\link[mclust:Mclust]{Mclust}} function. See \code{\link[mclust:emControl]{emControl}} for details.} |
17 | 17 |
|
18 |
-\item{measures}{The quality measures that are used for the clustering. Supported is any combination of \code{c('spikyness','entropy','num.segments','bhattacharyya','loglik','complexity','avg.read.count','total.read.count','binsize')}.} |
|
18 |
+\item{measures}{The quality measures that are used for the clustering. Supported is any combination of \code{c('spikiness','entropy','num.segments','bhattacharyya','loglik','complexity','avg.read.count','total.read.count','binsize')}.} |
|
19 | 19 |
|
20 |
-\item{orderBy}{The quality measure to order the clusters by. Default is \code{'spikyness'}.} |
|
20 |
+\item{orderBy}{The quality measure to order the clusters by. Default is \code{'spikiness'}.} |
|
21 | 21 |
|
22 | 22 |
\item{reverseOrder}{Logical indicating whether the ordering by \code{orderBy} is reversed.} |
23 | 23 |
} |
... | ... |
@@ -30,7 +30,7 @@ This function uses the \pkg{\link{mclust}} package to cluster the input samples |
30 | 30 |
\details{ |
31 | 31 |
The employed quality measures are: |
32 | 32 |
\itemize{ |
33 |
-\item Spikyness |
|
33 |
+\item Spikiness |
|
34 | 34 |
\item Entropy |
35 | 35 |
\item Number of segments |
36 | 36 |
\item Bhattacharrya distance |
... | ... |
@@ -40,6 +40,9 @@ exportGRanges(gr, filename, header = TRUE, trackname = NULL, score = NULL, |
40 | 40 |
|
41 | 41 |
\item{chromosome.format}{A character specifying the format of the chromosomes if \code{assembly} is specified. Either 'NCBI' for (1,2,3 ...) or 'UCSC' for (chr1,chr2,chr3 ...).#' @importFrom utils write.table} |
42 | 42 |
} |
43 |
+\value{ |
|
44 |
+\code{NULL} |
|
45 |
+} |
|
43 | 46 |
\description{ |
44 | 47 |
Export copy-number-variation state or read counts as genome browser viewable file |
45 | 48 |
} |
... | ... |
@@ -19,8 +19,8 @@ The input \code{model} with adjusted segments. |
19 | 19 |
} |
20 | 20 |
\examples{ |
21 | 21 |
## Load an HMM |
22 |
-file <- system.file("extdata", "primary-lung", "hmms", "AvdB150303_I_001.bam.RData", |
|
23 |
- package="AneuFinderData") |
|
22 |
+file <- list.files(system.file("extdata", "primary-lung", "hmms", |
|
23 |
+ package="AneuFinderData"), full.names=TRUE) |
|
24 | 24 |
hmm <- loadHmmsFromFiles(file)[[1]] |
25 | 25 |
## Check number of segments before and after filtering |
26 | 26 |
length(hmm$segments) |
... | ... |
@@ -3,11 +3,11 @@ |
3 | 3 |
\name{qualityControl} |
4 | 4 |
\alias{qc.bhattacharyya} |
5 | 5 |
\alias{qc.entropy} |
6 |
-\alias{qc.spikyness} |
|
6 |
+\alias{qc.spikiness} |
|
7 | 7 |
\alias{qualityControl} |
8 | 8 |
\title{Quality control measures for binned read counts} |
9 | 9 |
\usage{ |
10 |
-qc.spikyness(counts) |
|
10 |
+qc.spikiness(counts) |
|
11 | 11 |
|
12 | 12 |
qc.entropy(counts) |
13 | 13 |
|
... | ... |
@@ -31,7 +31,7 @@ Spikyness is defined as \eqn{K = sum(abs(diff(counts))) / sum(counts)}. |
31 | 31 |
} |
32 | 32 |
\section{Functions}{ |
33 | 33 |
\itemize{ |
34 |
-\item \code{qc.spikyness}: Calculate the spikyness of a library |
|
34 |
+\item \code{qc.spikiness}: Calculate the spikiness of a library |
|
35 | 35 |
|
36 | 36 |
\item \code{qc.entropy}: Calculate the Shannon entropy of a library |
37 | 37 |
|
... | ... |
@@ -9,8 +9,7 @@ static double** multiD; |
9 | 9 |
// =================================================================================================================================================== |
10 | 10 |
// This function takes parameters from R, creates a univariate HMM object, creates the distributions, runs the EM and returns the result to R. |
11 | 11 |
// =================================================================================================================================================== |
12 |
-extern "C" { |
|
13 |
-void R_univariate_hmm(int* O, int* T, int* N, int* state_labels, double* size, double* prob, int* maxiter, int* maxtime, double* eps, int* states, double* A, double* proba, double* loglik, double* weights, int* distr_type, double* initial_size, double* initial_prob, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error, int* read_cutoff, int* algorithm) |
|
12 |
+void univariate_hmm(int* O, int* T, int* N, int* state_labels, double* size, double* prob, int* maxiter, int* maxtime, double* eps, int* states, double* A, double* proba, double* loglik, double* weights, int* distr_type, double* initial_size, double* initial_prob, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error, int* read_cutoff, int* algorithm) |
|
14 | 13 |
{ |
15 | 14 |
|
16 | 15 |
// Define logging level |
... | ... |
@@ -209,14 +208,12 @@ void R_univariate_hmm(int* O, int* T, int* N, int* state_labels, double* size, d |
209 | 208 |
delete hmm; |
210 | 209 |
hmm = NULL; // assign NULL to defuse the additional delete in on.exit() call |
211 | 210 |
} |
212 |
-} //extern |
|
213 | 211 |
|
214 | 212 |
|
215 | 213 |
// ===================================================================================================================================================== |
216 | 214 |
// This function takes parameters from R, creates a multivariate HMM object, runs the EM and returns the result to R. |
217 | 215 |
// ===================================================================================================================================================== |
218 |
-extern "C" { |
|
219 |
-void R_multivariate_hmm(double* D, int* T, int* N, int *Nmod, int* comb_states, int* maxiter, int* maxtime, double* eps, int* states, double* A, double* proba, double* loglik, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error, int* algorithm) |
|
216 |
+void multivariate_hmm(double* D, int* T, int* N, int *Nmod, int* comb_states, int* maxiter, int* maxtime, double* eps, int* states, double* A, double* proba, double* loglik, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error, int* algorithm) |
|
220 | 217 |
{ |
221 | 218 |
|
222 | 219 |
// Define logging level {"ERROR", "WARNING", "INFO", "ITERATION", "DEBUG", "DEBUG1", "DEBUG2", "DEBUG3", "DEBUG4"} |
... | ... |
@@ -351,25 +348,20 @@ void R_multivariate_hmm(double* D, int* T, int* N, int *Nmod, int* comb_states, |
351 | 348 |
hmm = NULL; // assign NULL to defuse the additional delete in on.exit() call |
352 | 349 |
// FreeDoubleMatrix(multiD, *N); |
353 | 350 |
} |
354 |
-} //extern |
|
355 | 351 |
|
356 | 352 |
|
357 | 353 |
// ======================================================= |
358 | 354 |
// This function make a cleanup if anything was left over |
359 | 355 |
// ======================================================= |
360 |
-extern "C" { |
|
361 |
-void R_univariate_cleanup() |
|
356 |
+void univariate_cleanup() |
|
362 | 357 |
{ |
363 | 358 |
// //FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; // This message will be shown if interrupt happens before start of C-code |
364 | 359 |
delete hmm; |
365 | 360 |
} |
366 |
-} //extern |
|
367 | 361 |
|
368 |
-extern "C" { |
|
369 |
-void R_multivariate_cleanup(int* N) |
|
362 |
+void multivariate_cleanup(int* N) |
|
370 | 363 |
{ |
371 | 364 |
delete hmm; |
372 | 365 |
FreeDoubleMatrix(multiD, *N); |
373 | 366 |
} |
374 |
-} //extern |
|
375 | 367 |
|
... | ... |
@@ -12,11 +12,15 @@ |
12 | 12 |
// #include <omp.h> // parallelization options |
13 | 13 |
// #endif |
14 | 14 |
|
15 |
-// void R_univariate_hmm(int* O, int* T, int* N, int* state_labels, double* size, double* prob, int* maxiter, int* maxtime, double* eps, int* states, double* A, double* proba, double* loglik, double* weights, int* distr_type, double* initial_size, double* initial_prob, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error, int* read_cutoff, int* algorithm); |
|
16 |
-// |
|
17 |
-// void R_multivariate_hmm(double* D, int* T, int* N, int *Nmod, int* comb_states, int* maxiter, int* maxtime, double* eps, int* states, double* A, double* proba, double* loglik, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error, int* algorithm); |
|
18 |
-// |
|
19 |
-// void R_univariate_cleanup(); |
|
20 |
-// |
|
21 |
-// void R_multivariate_cleanup(int* N); |
|
22 |
-// |
|
15 |
+extern "C" |
|
16 |
+void univariate_hmm(int* O, int* T, int* N, int* state_labels, double* size, double* prob, int* maxiter, int* maxtime, double* eps, int* states, double* A, double* proba, double* loglik, double* weights, int* distr_type, double* initial_size, double* initial_prob, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error, int* read_cutoff, int* algorithm); |
|
17 |
+ |
|
18 |
+extern "C" |
|
19 |
+void multivariate_hmm(double* D, int* T, int* N, int *Nmod, int* comb_states, int* maxiter, int* maxtime, double* eps, int* states, double* A, double* proba, double* loglik, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error, int* algorithm); |
|
20 |
+ |
|
21 |
+extern "C" |
|
22 |
+void univariate_cleanup(); |
|
23 |
+ |
|
24 |
+extern "C" |
|
25 |
+void multivariate_cleanup(int* N); |
|
26 |
+ |
... | ... |
@@ -1,21 +1,22 @@ |
1 |
-// #include <Rinternals.h> |
|
2 |
-// #include <R_ext/Rdynload.h> |
|
3 |
-// #include <R_ext/Visibility.h> |
|
4 |
-// #include "R_interface.h" |
|
5 |
-// |
|
6 |
-// |
|
7 |
-// static const R_CMethodDef CEntries[] = { |
|
8 |
-// {"R_univariate_hmm", (DL_FUNC) &R_univariate_hmm, 24}, |
|
9 |
-// {"R_multivariate_hmm", (DL_FUNC) &R_multivariate_hmm, 18}, |
|
10 |
-// {"R_univariate_cleanup", (DL_FUNC) &R_univariate_cleanup, 0}, |
|
11 |
-// {"R_multivariate_cleanup", (DL_FUNC) &R_multivariate_cleanup, 1}, |
|
12 |
-// {NULL, NULL, 0} |
|
13 |
-// }; |
|
14 |
-// |
|
15 |
-// |
|
16 |
-// void attribute_visible R_init_aneufinder(DllInfo *dll) |
|
17 |
-// { |
|
18 |
-// R_registerRoutines(dll, CEntries, NULL, NULL, NULL); |
|
19 |
-// R_useDynamicSymbols(dll, FALSE); |
|
20 |
-// R_forceSymbols(dll, TRUE); |
|
21 |
-// } |
|
1 |
+#include <Rinternals.h> |
|
2 |
+#include <R_ext/Rdynload.h> |
|
3 |
+#include "R_interface.h" |
|
4 |
+ |
|
5 |
+ |
|
6 |
+static const R_CMethodDef CEntries[] = { |
|
7 |
+ {"C_univariate_hmm", (DL_FUNC) &univariate_hmm, 24, (R_NativePrimitiveArgType[24]) {INTSXP, INTSXP, INTSXP, INTSXP, REALSXP, REALSXP, INTSXP, INTSXP, REALSXP, INTSXP, REALSXP, REALSXP, REALSXP, REALSXP, INTSXP, REALSXP, REALSXP, REALSXP, REALSXP, LGLSXP, INTSXP, INTSXP, INTSXP, INTSXP}}, |
|
8 |
+ {"C_multivariate_hmm", (DL_FUNC) &multivariate_hmm, 18, (R_NativePrimitiveArgType[18]) {REALSXP, INTSXP, INTSXP, INTSXP, INTSXP, INTSXP, INTSXP, REALSXP, INTSXP, REALSXP, REALSXP, REALSXP, REALSXP, REALSXP, LGLSXP, INTSXP, INTSXP, INTSXP}}, |
|
9 |
+ {"C_univariate_cleanup", (DL_FUNC) &univariate_cleanup, 0, NULL}, |
|
10 |
+ {"C_multivariate_cleanup", (DL_FUNC) &multivariate_cleanup, 1, (R_NativePrimitiveArgType[1]) {INTSXP}}, |
|
11 |
+ {NULL, NULL, 0, NULL} |
|
12 |
+}; |
|
13 |
+ |
|
14 |
+ |
|
15 |
+extern "C" { |
|
16 |
+void R_init_AneuFinder(DllInfo *dll) |
|
17 |
+{ |
|
18 |
+ R_registerRoutines(dll, CEntries, NULL, NULL, NULL); |
|
19 |
+ R_useDynamicSymbols(dll, FALSE); |
|
20 |
+// R_forceSymbols(dll, TRUE); |
|
21 |
+} |
|
22 |
+} |
... | ... |
@@ -12,7 +12,7 @@ |
12 | 12 |
|
13 | 13 |
/* custom error handling class */ |
14 | 14 |
// extern statement to avoid 'multiple definition' errors |
15 |
-extern class exception_nan: public std::exception |
|
15 |
+const class exception_nan: public std::exception |
|
16 | 16 |
{ |
17 | 17 |
virtual const char* what() const throw() |
18 | 18 |
{ |
... | ... |
@@ -23,7 +23,7 @@ |
23 | 23 |
\tableofcontents |
24 | 24 |
\clearpage |
25 | 25 |
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
26 |
-<<options,echo=FALSE, results='hide', message=FALSE, eval=TRUE>>= |
|
26 |
+<<options, echo=FALSE, results='hide', message=FALSE, eval=TRUE>>= |
|
27 | 27 |
options(fig.width=120) |
28 | 28 |
library(AneuFinder) |
29 | 29 |
@ |
... | ... |
@@ -51,7 +51,7 @@ Aneufinder(inputfolder='folder-with-BAM', outputfolder='output-directory', |
51 | 51 |
|
52 | 52 |
Although in most cases the above command will produce reasonably good results, it might be worthwile to adjust the default parameters to improve performance and the quality of the results (see section \ref{sec:workflow}). You can get a description of all available parameters by typing |
53 | 53 |
\begin{scriptsize} |
54 |
-<<>>== |
|
54 |
+<<eval=TRUE>>== |
|
55 | 55 |
?Aneufinder |
56 | 56 |
@ |
57 | 57 |
\end{scriptsize} |
... | ... |
@@ -90,32 +90,35 @@ heatmapGenomewide(cl$classification[[1]]) |
90 | 90 |
|
91 | 91 |
\section{\label{sec:workflow}A detailed workflow} |
92 | 92 |
|
93 |
-\subsection{Mappability correction} |
|
93 |
+\subsection{\label{sec:mapcor}Mappability correction} |
|
94 | 94 |
The first step of your workflow should be the production of a reference file for mappability correction. Mappability correction is done via a variable-width binning approach (as compared to fixed-width bins) and requires a euploid reference. You can either simulate this reference file or take a real euploid reference. For optimal results we suggest to use a real reference, e.g. by merging BAM files of single cells from a euploid reference tissue. This can be achieved with the 'samtools merge' command (not part of R). Be careful: All CNVs that are present in the reference will lead to artifacts in the analysis later. This includes sex-chromosomes that are present in one copy only, so we advice to use a female reference and to exclude the Y-chromosome from the analysis. If you have no reference available, you can simulate one with the \Rfunction{simulateReads} command: |
95 | 95 |
\begin{scriptsize} |
96 |
-<<eval=FALSE>>== |
|
96 |
+<<eval=TRUE, message=FALSE, warning=FALSE>>== |
|
97 | 97 |
## Load human genome |
98 | 98 |
library(BSgenome.Hsapiens.UCSC.hg19) |
99 | 99 |
|
100 |
-## Get a BAM file for the estimation of quality scores |
|
100 |
+## Get a BAM file for the estimation of quality scores (adjust this to your experiment) |
|
101 | 101 |
bamfile <- system.file("extdata", "BB150803_IV_074.bam", package="AneuFinderData") |
102 | 102 |
|
103 | 103 |
## Simulate reads of length 51bp for human genome |
104 |
-outputfile <- tempfile() # replace this with your destination file |
|
104 |
+# We simulate reads every 5000bp for demonstration purposes, for a real |
|
105 |
+# application you should use a much denser spacing (e.g. 500bp or less) |
|
106 |
+simulatedReads.file <- tempfile() # replace this with your destination file |
|
105 | 107 |
simulateReads(BSgenome.Hsapiens.UCSC.hg19, readLength=51, bamfile=bamfile, |
106 |
- file=outputfile, every.X.bp=500) |
|
108 |
+ file=simulatedReads.file, every.X.bp=5000) |
|
107 | 109 |
@ |
108 | 110 |
\end{scriptsize} |
109 | 111 |
|
110 | 112 |
This simulated FASTQ file must then be aligned with your aligner of choice (ideally the same that you used for your other samples) and given as reference in the \Rfunction{Aneufinder} function (option \Roption{variable.width.reference}). |
111 | 113 |
|
112 |
-\subsection{Blacklisting} |
|
114 |
+\subsection{\label{sec:blacklist}Blacklisting} |
|
113 | 115 |
To further improve the quality of the results and remove artifacts caused by high mappability repeat regions, e.g. near centromers, a blacklist can be used in option \Roption{blacklist} of the \Rfunction{Aneufinder} function. All reads falling into the regions specified by the blacklist will be discarded when importing the read files. |
114 | 116 |
You can either download a blacklist from the UCSC genome browser, e.g. the ``DAC Blacklisted Regions from ENCODE/DAC(Kundaje)'' mappability track, or make your own. For optimal results, we advice to make your own blacklist from a euploid reference. The following code chunck takes a euploid reference and makes fixed-width bins of 100kb. Bins with read count above and beloow the 0.999 and 0.05 quantile are taken as blacklist: |
115 | 117 |
\begin{scriptsize} |
116 | 118 |
<<eval=TRUE, warning=FALSE, message=FALSE, fig.height=3>>== |
117 |
-## Get a euploid reference |
|
119 |
+## Get a euploid reference (adjust this to your experiment) |
|
118 | 120 |
bedfile <- system.file("extdata", "hg19_diploid.bam.bed.gz", package="AneuFinderData") |
121 |
+ |
|
119 | 122 |
## Make 100kb fixed-width bins |
120 | 123 |
bins <- binReads(bedfile, format='bed', assembly='hg19', |
121 | 124 |
binsizes=100e3, chromosomes=c(1:22,'X'))[[1]] |
... | ... |
@@ -128,21 +131,85 @@ p <- p + geom_hline(aes(yintercept=ucutoff), color='red') |
128 | 131 |
print(p) |
129 | 132 |
blacklist <- bins[bins$counts < ucutoff & bins$counts > lcutoff] |
130 | 133 |
## Write blacklist to file |
131 |
-exportGRanges(blacklist, filename=tempfile(), header=FALSE, |
|
134 |
+blacklist.file <- tempfile() |
|
135 |
+exportGRanges(blacklist, filename=blacklist.file, header=FALSE, |
|
132 | 136 |
chromosome.format='NCBI') |
133 | 137 |
@ |
134 | 138 |
\end{scriptsize} |
135 | 139 |
|
136 | 140 |
\subsection{Running Aneufinder} |
141 |
+The function \Rfunction{Aneufinder} takes an input folder with BAM or BED files and produces an output folder with results, plots and browserfiles. The following code is an example of how to run \Rfunction{Aneufinder} with variable-width bins (see section \ref{sec:mapcor}), blacklist (see section \ref{sec:blacklist}) and GC-correction. Results will be stored in 'outputfolder/hmms' as RData objects for further processing such as quality filtering and customized plotting. |
|
142 |
+ |
|
143 |
+\begin{scriptsize} |
|
144 |
+<<eval=TRUE>>== |
|
145 |
+## First, get some data and reference files (adjust this to your experiment) |
|
146 |
+var.width.ref <- system.file("extdata", "hg19_diploid.bam.bed.gz", package="AneuFinderData") |
|
147 |
+blacklist <- system.file("extdata", "blacklist-hg19.bed.gz", package="AneuFinderData") |
|
148 |
+datafolder <- system.file("extdata", "B-ALL-B", package = "AneuFinderData") |
|
149 |
+list.files(datafolder) # only 3 cells for demonstration purposes |
|
150 |
+ |
|
151 |
+## Library for GC correction |
|
152 |
+library(BSgenome.Hsapiens.UCSC.hg19) |
|
153 |
+ |
|
154 |
+## Produce output files |
|
155 |
+cnv.states <- c('zero-inflation', paste0(1:10, '-somy')) |
|
156 |
+outputfolder <- tempdir() |
|
157 |
+Aneufinder(inputfolder = datafolder, outputfolder = outputfolder, format = 'bed', |
|
158 |
+ numCPU = 3, binsizes = c(5e5, 1e6), variable.width.reference = var.width.ref, |
|
159 |
+ chromosomes = c(1:22,'X','Y'), blacklist = blacklist, states = cnv.states, |
|
160 |
+ correction.method = 'GC', GC.BSgenome = BSgenome.Hsapiens.UCSC.hg19, |
|
161 |
+ num.trials = 1, assembly = 'hg19') |
|
162 |
+@ |
|
163 |
+\end{scriptsize} |
|
137 | 164 |
|
138 | 165 |
\subsection{Quality control} |
166 |
+Once the function \Rfunction{Aneufinder} has completed, results will be accessible as .RData files under 'outputfolder/hmms'. Single cell sequencing is prone to noise and therefore it is a good idea to filter the results by quality to retain only high-quality cells. We found that simple filtering procedures such as cutoffs on the total number of reads etc., are insufficient to distinguish good from bad-quality libraries. Therefore, we have implemented a multivariate clustering approach that works on multiple quality metrics (see ?clusterByQuality for details) to increase robustness of the filtering. Here is an example demonstrating the usage of \Rfunction{clusterByQuality}. |
|
167 |
+ |
|
168 |
+\begin{scriptsize} |
|
169 |
+<<eval=TRUE, message=FALSE>>== |
|
170 |
+## Get some pre-produced results (adjust this to your experiment) |
|
171 |
+results <- system.file("extdata", "primary-lung", "hmms", package="AneuFinderData") |
|
172 |
+files <- list.files(results, full.names=TRUE) |
|
173 |
+ |
|
174 |
+## Cluster by quality |
|
175 |
+cl <- clusterByQuality(files) |
|
176 |
+plot(cl$Mclust, what='classification') |
|
177 |
+print(cl$parameters) |
|
178 |
+## Apparently, the third cluster corresponds to failed libraries |
|
179 |
+## while the first cluster contains high-quality libraries |
|
180 |
+ |
|
181 |
+## Select the best cluster and plot it |
|
182 |
+selected.files <- unlist(cl$classification[[1]]) |
|
183 |
+heatmapGenomewide(selected.files) |
|
184 |
+@ |
|
185 |
+\end{scriptsize} |
|
139 | 186 |
|
140 | 187 |
\subsection{Karyotype measures} |
188 |
+This package implements two measures to quantify karyotype heterogeneity, an \emph{aneuploidy} and a \emph{heterogeneity} score. Both measures are independent of the number of cells, the length of the genome, and take into account every position in the genome. The following example compares the heterogeneity and aneuploidy between a primary lung cancer and the corresponding liver metastasis. |
|
141 | 189 |
|
190 |
+\begin{scriptsize} |
|
191 |
+<<eval=TRUE, message=FALSE>>== |
|
192 |
+## Get some pre-produced results (adjust this to your experiment) |
|
193 |
+results <- system.file("extdata", "primary-lung", "hmms", package="AneuFinderData") |
|
194 |
+files.lung <- list.files(results, full.names=TRUE) |
|
195 |
+results <- system.file("extdata", "metastasis-liver", "hmms", package="AneuFinderData") |
|
196 |
+files.liver <- list.files(results, full.names=TRUE) |
|
197 |
+ |
|
198 |
+## Get karyotype measures |
|
199 |
+k.lung <- karyotypeMeasures(files.lung) |
|
200 |
+k.liver <- karyotypeMeasures(files.liver) |
|
201 |
+ |
|
202 |
+## Print the scores in one data.frame |
|
203 |
+df <- rbind(lung = k.lung$genomewide, liver = k.liver$genomewide) |
|
204 |
+print(df) |
|
205 |
+## While the aneuploidy is similar between both cancers, the heterogeneity is |
|
206 |
+## nearly twice as high for the primary lung cancer. |
|
207 |
+@ |
|
208 |
+\end{scriptsize} |
|
142 | 209 |
|
143 | 210 |
\section{Session Info} |
144 | 211 |
\begin{scriptsize} |
145 |
-<<>>= |
|
212 |
+<<eval=TRUE>>= |
|
146 | 213 |
sessionInfo() |
147 | 214 |
warnings() |
148 | 215 |
@ |