1 | 1 |
new file mode 100755 |
... | ... |
@@ -0,0 +1,109 @@ |
1 |
+Package: GRaNIE |
|
2 |
+Title: GRaNIE: Reconstruction of cell-type specific gene regulatory networks including enhancers using chromatin accessibility and RNA-seq data
|
3 |
+Version: 0.99.0 |
|
4 |
+Encoding: UTF-8 |
|
5 |
+Authors@R: c(person("Christian", "Arnold", email = |
|
6 |
+ "chrarnold@web.de", role = c("cre","aut")), |
|
7 |
+ person("Judith", "Zaugg", email = |
|
8 |
+ "judith.zaugg@embl.de", role = c("aut")), |
|
9 |
+ person("Rim", "Moussa", email = |
|
10 |
+ "rim.moussa01@gmail.com", role = "aut"), |
|
11 |
+ person("Armando", "Reyes-Palomares", email = |
|
12 |
+ "armandorp@gmail.com", role = "ctb"), |
|
13 |
+ person("Giovanni", "Palla", email = |
|
14 |
+ "giov.pll@gmail.com", role = "ctb"), |
|
15 |
+ person("Maksim", "Kholmatov", email = |
|
16 |
+ "maksim.kholmatov@embl.de", role = "ctb")) |
|
17 |
+Description: Genetic variants associated with diseases often affect non-coding regions, thus likely having a regulatory role. To understand the effects of genetic variants in these regulatory regions, identifying genes that are modulated by specific regulatory elements (REs) is crucial. The effect of gene regulatory elements, such as enhancers, is often cell-type specific, likely because the combinations of transcription factors (TFs) that are regulating a given enhancer have celltype specific activity. This TF activity can be quantified with existing tools such as diffTF and captures differences in binding of a TF in open chromatin regions. Collectively, this forms a gene regulatory network (GRN) with cell-type and data-specific TF-RE and RE-gene links. Here, we reconstruct such a GRN using bulk RNAseq and open chromatin (e.g., using ATACseq or ChIPseq for open chromatin marks) and optionally TF activity data. Our network contains different types of links, connecting TFs to regulatory elements, the latter of which is connected to genes in the vicinity or within the same chromatin domain (TAD). We use a statistical framework to assign empirical FDRs and weights to all links using a permutation-based approach. |
|
18 |
+Imports: |
|
19 |
+ futile.logger, |
|
20 |
+ checkmate, |
|
21 |
+ patchwork, |
|
22 |
+ reshape2, |
|
23 |
+ data.table, |
|
24 |
+ matrixStats, |
|
25 |
+ Matrix, |
|
26 |
+ GenomicRanges, |
|
27 |
+ RColorBrewer, |
|
28 |
+ colorspace, |
|
29 |
+ ComplexHeatmap, |
|
30 |
+ DESeq2, |
|
31 |
+ csaw, |
|
32 |
+ circlize, |
|
33 |
+ robust, |
|
34 |
+ progress, |
|
35 |
+ utils, |
|
36 |
+ methods, |
|
37 |
+ stringr, |
|
38 |
+ scales, |
|
39 |
+ BiocManager, |
|
40 |
+ BiocParallel, |
|
41 |
+ igraph, |
|
42 |
+ S4Vectors, |
|
43 |
+ ggplot2, |
|
44 |
+ rlang, |
|
45 |
+ Biostrings, |
|
46 |
+ GenomeInfoDb, |
|
47 |
+ IRanges, |
|
48 |
+ SummarizedExperiment, |
|
49 |
+ forcats, |
|
50 |
+ gridExtra, |
|
51 |
+ limma, |
|
52 |
+ purrr, |
|
53 |
+ tidyselect, |
|
54 |
+ readr, |
|
55 |
+ grid, |
|
56 |
+ tidyr, |
|
57 |
+ dplyr, |
|
58 |
+ stats, |
|
59 |
+ grDevices, |
|
60 |
+ graphics, |
|
61 |
+ magrittr, |
|
62 |
+ tibble, |
|
63 |
+ viridis |
|
64 |
+Depends: |
|
65 |
+ R (>= 4.2.0), |
|
66 |
+ tidyverse, |
|
67 |
+ topGO |
|
68 |
+Suggests: |
|
69 |
+ GRaNIEData, |
|
70 |
+ knitr, |
|
71 |
+ BSgenome.Hsapiens.UCSC.hg19, |
|
72 |
+ BSgenome.Hsapiens.UCSC.hg38, |
|
73 |
+ BSgenome.Mmusculus.UCSC.mm10, |
|
74 |
+ BSgenome.Mmusculus.UCSC.mm9, |
|
75 |
+ TxDb.Hsapiens.UCSC.hg19.knownGene, |
|
76 |
+ TxDb.Hsapiens.UCSC.hg38.knownGene, |
|
77 |
+ TxDb.Mmusculus.UCSC.mm10.knownGene, |
|
78 |
+ TxDb.Mmusculus.UCSC.mm9.knownGene, |
|
79 |
+ org.Hs.eg.db, |
|
80 |
+ org.Mm.eg.db, |
|
81 |
+ IHW, |
|
82 |
+ biomaRt, |
|
83 |
+ clusterProfiler, |
|
84 |
+ ReactomePA, |
|
85 |
+ DOSE, |
|
86 |
+ ChIPseeker, |
|
87 |
+ testthat (>= 3.0.0), |
|
88 |
+ BiocStyle |
|
89 |
+VignetteBuilder: knitr |
|
90 |
+biocViews: |
|
91 |
+ Software, |
|
92 |
+ GeneExpression, |
|
93 |
+ GeneRegulation, |
|
94 |
+ NetworkInference, |
|
95 |
+ GeneSetEnrichment, |
|
96 |
+ BiomedicalInformatics, |
|
97 |
+ Genetics, |
|
98 |
+ Transcriptomics, |
|
99 |
+ ATACSeq, |
|
100 |
+ RNASeq, |
|
101 |
+ Network, |
|
102 |
+ Transcription, |
|
103 |
+ ChIPSeq |
|
104 |
+License: Artistic-2.0 |
|
105 |
+LazyData: false |
|
106 |
+URL: https://grp-zaugg.embl-community.io/GRaNIE |
|
107 |
+BugReports: https://git.embl.de/grp-zaugg/GRaNIE/issues |
|
108 |
+RoxygenNote: 7.1.2 |
|
109 |
+Config/testthat/edition: 3 |
0 | 110 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,62 @@ |
1 |
+# Generated by roxygen2: do not edit by hand |
|
2 |
+ |
|
3 |
+export(AR_classification_wrapper) |
|
4 |
+export(addConnections_TF_peak) |
|
5 |
+export(addConnections_peak_gene) |
|
6 |
+export(addData) |
|
7 |
+export(addTFBS) |
|
8 |
+export(add_TF_gene_correlation) |
|
9 |
+export(build_eGRN_graph) |
|
10 |
+export(calculateCommunitiesEnrichment) |
|
11 |
+export(calculateCommunitiesStats) |
|
12 |
+export(calculateGeneralEnrichment) |
|
13 |
+export(calculateTFEnrichment) |
|
14 |
+export(deleteIntermediateData) |
|
15 |
+export(filterData) |
|
16 |
+export(filterGRNAndConnectGenes) |
|
17 |
+export(generateStatsSummary) |
|
18 |
+export(getCounts) |
|
19 |
+export(getGRNConnections) |
|
20 |
+export(getParameters) |
|
21 |
+export(getTopNodes) |
|
22 |
+export(initializeGRN) |
|
23 |
+export(loadExampleObject) |
|
24 |
+export(nGenes) |
|
25 |
+export(nPeaks) |
|
26 |
+export(overlapPeaksAndTFBS) |
|
27 |
+export(performAllNetworkAnalyses) |
|
28 |
+export(plotCommunitiesEnrichment) |
|
29 |
+export(plotCommunitiesStats) |
|
30 |
+export(plotDiagnosticPlots_TFPeaks) |
|
31 |
+export(plotDiagnosticPlots_peakGene) |
|
32 |
+export(plotGeneralEnrichment) |
|
33 |
+export(plotGeneralGraphStats) |
|
34 |
+export(plotPCA_all) |
|
35 |
+export(plotTFEnrichment) |
|
36 |
+export(plot_stats_connectionSummary) |
|
37 |
+import(BiocManager) |
|
38 |
+import(GenomicRanges) |
|
39 |
+import(checkmate) |
|
40 |
+import(ggplot2) |
|
41 |
+import(grDevices) |
|
42 |
+import(graphics) |
|
43 |
+import(patchwork) |
|
44 |
+import(tibble) |
|
45 |
+import(tidyverse) |
|
46 |
+import(topGO) |
|
47 |
+import(utils) |
|
48 |
+importFrom(GenomicRanges,mcols) |
|
49 |
+importFrom(IRanges,width) |
|
50 |
+importFrom(circlize,colorRamp2) |
|
51 |
+importFrom(grid,gpar) |
|
52 |
+importFrom(magrittr,`%>%`) |
|
53 |
+importFrom(methods,new) |
|
54 |
+importFrom(rlang,.data) |
|
55 |
+importFrom(rlang,`:=`) |
|
56 |
+importFrom(stats,cor) |
|
57 |
+importFrom(stats,cor.test) |
|
58 |
+importFrom(stats,median) |
|
59 |
+importFrom(stats,quantile) |
|
60 |
+importFrom(stats,sd) |
|
61 |
+importFrom(utils,packageVersion) |
|
62 |
+importFrom(utils,tail) |
0 | 63 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,90 @@ |
1 |
+# GRaNIE 0.15-0.17 (2021-12-13) |
|
2 |
+ |
|
3 |
+## Major changes |
|
4 |
+ |
|
5 |
+- all enrichment analyses have been extended and improved, we added additional ontologies (KEGG, DO, and Reactome), more information in the resulting summary plots |
|
6 |
+- all plotting functions have been homogenized and extended, PDF width and height can now be set for all exported plot functions. Also, plotting not to a PDF but instead to the currently active graphics device is now possible. Lastly, setting a different filename is finally possible. Collectively, this provides ultimate flexibility for customizing file names, the output devices used and PDF sizes
|
7 |
+- we added a function to build the eGRN network that can be parameterized and that facilitates future development. Now, network-specific parameters can be changed, such as whether loops should be allowed
|
8 |
+- we removed the GRaNIEdev package, the development now happens in a separate branch rather than a different package |
|
9 |
+- we added Leiden clustering for community clustering (see https://www.nature.com/articles/s41598-019-41695-z for a comparison with louvain) |
|
10 |
+- extensive vignette updates |
|
11 |
+ |
|
12 |
+## Bug fixes |
|
13 |
+ |
|
14 |
+- various minor bug fixes |
|
15 |
+ |
|
16 |
+## Minor changes |
|
17 |
+ |
|
18 |
+- changed the object structure slightly (graph slot and structure within the stats$enrichment slot) |
|
19 |
+ |
|
20 |
+ |
|
21 |
+# GRaNIE 0.9-0.14 (2021-12-13) |
|
22 |
+ |
|
23 |
+## Major changes |
|
24 |
+ |
|
25 |
+- major overhaul and continuous work on peak-gene QC plots |
|
26 |
+- the *filterData* function now has more filter parameters, such as filtering for CV. Also, all filters uniformly have a *min* and *max* filter.
|
27 |
+- integrated network statistics and various enrichment analyses |
|
28 |
+- handling of edge cases and rare events in various functions |
|
29 |
+- packages have been renamed to *GRaNIE* as basename (before: *GRN*) |
|
30 |
+ |
|
31 |
+## Bug fixes |
|
32 |
+ |
|
33 |
+- various minor bug fixes |
|
34 |
+ |
|
35 |
+## Minor changes |
|
36 |
+ |
|
37 |
+- changed the object structure slightly and moved some gene and peak annotation data (such as mean, CV) to the appropriate annotation slot |
|
38 |
+ |
|
39 |
+ |
|
40 |
+# GRaNIE 0.8 (2021-05-07) |
|
41 |
+ |
|
42 |
+## Major changes |
|
43 |
+ |
|
44 |
+- improved PCA plotting, PCA plots are now produced for both raw and normalized data |
|
45 |
+- new filters for the function *filterGRaNIEAndConnectGenes* (*peak_gene.maxDistance*) as well as more flexibility how to adjust the peak-gene raw p-values for multiple testing (including the possibility to use IHW - experimental) |
|
46 |
+- new function *plotDiagnosticPlots_TFPeaks* for plotting (this function was previously called only internally, but is now properly exported), in analogy to *plotDiagnosticPlots_peakGene* |
|
47 |
+ |
|
48 |
+## Bug fixes |
|
49 |
+ |
|
50 |
+- various minor bug fixes (PCA plotting, compatibility when providing pre-normalized data) |
|
51 |
+ |
|
52 |
+## Minor changes |
|
53 |
+ |
|
54 |
+- changed the object structure slightly and cleaned the config slot, for example |
|
55 |
+- some functions have been added / renamed to make the workflow more clear and streamlined, see Vignette for details |
|
56 |
+- some default parameters changed |
|
57 |
+ |
|
58 |
+# GRaNIE 0.7 (2021-03-12) |
|
59 |
+ |
|
60 |
+## Major changes |
|
61 |
+ |
|
62 |
+- improved PCA plotting, also works for pre-normalized counts now when provided as input originally |
|
63 |
+- more flexibility for data normalization |
|
64 |
+- homogenized wordings, function calls and workflow clarity, removed unnecessary warnings when plotting peak-gene diagnostic plots, added more R help documentation |
|
65 |
+- added IHW (Independent Hypothesis Weighting) as a multiple testing procedure for peak-gene p-values in addition to now allowing all methods that are supported by p.adjust |
|
66 |
+ |
|
67 |
+## Bug fixes |
|
68 |
+ |
|
69 |
+- various minor bug fixes |
|
70 |
+ |
|
71 |
+## Minor changes |
|
72 |
+ |
|
73 |
+ |
|
74 |
+# GRaNIE 0.6 (2021-02-09) |
|
75 |
+ |
|
76 |
+## Major changes |
|
77 |
+ |
|
78 |
+- significant speed improvements for the peak-FDR calculations and subsequent plotting |
|
79 |
+- TF-peak diagnostic plots now also show negatively correlated TF-peak statistics irrespective of whether they have been filtered out in the object / pipeline. This may be useful for diagnostic purposes to check whether excluding them is a sensible choice and to confirm the numbers are low |
|
80 |
+ |
|
81 |
+## Bug fixes |
|
82 |
+ |
|
83 |
+- Numbers for connections per correlation bin in the TF-peak diagnostic plots were wrong as they did not correctly differentiate between the different connection types in case multiple ones had been specified (e.g., expression and TF activity). This has been fixed. |
|
84 |
+ |
|
85 |
+## Minor changes |
|
86 |
+ |
|
87 |
+ |
|
88 |
+# GRaNIE 0.5 (2021-02-02) |
|
89 |
+ |
|
90 |
+first published package version |
1 | 6 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,14 @@ |
1 |
+%2Fg%2Fscb2%2Fzaugg%2Fcarnold%2FProjects%2FGRN_pipeline%2Fsrc%2Fall_dev.R="D948B376" |
|
2 |
+%2Fg%2Fscb2%2Fzaugg%2Fcarnold%2FProjects%2FGRN_pipeline%2Fsrc%2Fpackage%2FDESCRIPTION="3B3582E1" |
|
3 |
+%2Fg%2Fscb2%2Fzaugg%2Fcarnold%2FProjects%2FGRN_pipeline%2Fsrc%2Fpackage%2FNAMESPACE="BA0B5400" |
|
4 |
+%2Fg%2Fscb2%2Fzaugg%2Fcarnold%2FProjects%2FGRN_pipeline%2Fsrc%2Fpackage%2FR%2FDataClasses.R="87F91275" |
|
5 |
+%2Fg%2Fscb2%2Fzaugg%2Fcarnold%2FProjects%2FGRN_pipeline%2Fsrc%2Fpackage%2FR%2FSNPhood.R="AE624CEF" |
|
6 |
+%2Fg%2Fscb2%2Fzaugg%2Fcarnold%2FProjects%2FGRN_pipeline%2Fsrc%2Fpackage%2FR%2Fcore.R="CB7DFE75" |
|
7 |
+%2Fg%2Fscb2%2Fzaugg%2Fcarnold%2FProjects%2FGRN_pipeline%2Fsrc%2Fpackage%2FR%2Fmisc.R="CED2D161" |
|
8 |
+%2Fg%2Fscb2%2Fzaugg%2Fcarnold%2FProjects%2FGRN_pipeline%2Fsrc%2Fpackage%2FR%2Fvisualize.R="06FD2867" |
|
9 |
+%2Fg%2Fscb2%2Fzaugg%2Fcarnold%2FProjects%2FGRN_pipeline%2Fsrc%2Fpackage%2FR%2FworkflowIdeas.R="38498D80" |
|
10 |
+%2Fg%2Fscb2%2Fzaugg%2Fcarnold%2FProjects%2FGRN_pipeline%2Fsrc%2Fpackage%2FR%2Fzzz.R="2EB5D733" |
|
11 |
+%2Fg%2Fscb2%2Fzaugg%2Fcarnold%2FProjects%2Fmisc%2FDESeq2_sourceCode%2FDESeq2%2FR%2FAllClasses.R="7F57437D" |
|
12 |
+%2Fg%2Fscb2%2Fzaugg%2Fcarnold%2FProjects%2Fmisc%2FDESeq2_sourceCode%2FDESeq2%2FR%2FAllGenerics.R="55FD7801" |
|
13 |
+%2Fg%2Fscb2%2Fzaugg%2Fcarnold%2FProjects%2Fmisc%2FDESeq2_sourceCode%2FDESeq2%2FR%2Fcore.R="518159FC" |
|
14 |
+%2Fg%2Fscb2%2Fzaugg%2Fcarnold%2FProjects%2Fmisc%2FDESeq2_sourceCode%2FDESeq2%2FR%2Fwrappers.R="29E11859" |
0 | 15 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,24 @@ |
1 |
+{ |
|
2 |
+ "id": "057AAB76", |
|
3 |
+ "path": "/g/scb2/zaugg/carnold/Projects/GRN_pipeline/src/package/R/misc.R", |
|
4 |
+ "project_path": "misc.R", |
|
5 |
+ "type": "r_source", |
|
6 |
+ "hash": "0", |
|
7 |
+ "contents": "", |
|
8 |
+ "dirty": false, |
|
9 |
+ "created": 1599662405635.0, |
|
10 |
+ "source_on_save": false, |
|
11 |
+ "relative_order": 5, |
|
12 |
+ "properties": { |
|
13 |
+ "cursorPosition": "923,51", |
|
14 |
+ "scrollLine": "911" |
|
15 |
+ }, |
|
16 |
+ "folds": "", |
|
17 |
+ "lastKnownWriteTime": 1501860404, |
|
18 |
+ "encoding": "UTF-8", |
|
19 |
+ "collab_server": "", |
|
20 |
+ "source_window": "", |
|
21 |
+ "last_content_update": 1501860404, |
|
22 |
+ "read_only": false, |
|
23 |
+ "read_only_alternatives": [] |
|
24 |
+} |
|
0 | 25 |
\ No newline at end of file |
1 | 26 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,1275 @@ |
1 |
#' @import checkmate
# @import data.table
#' @importFrom data.table data.table :=
#' @importFrom stats binom.test
# Vectorized wrapper around stats::binom.test.
# For each (successes, failures) pair, a binomial test against probSuccess is
# run and a single component of the "htest" result is extracted (selected via
# returnType; indexResult picks the element for vector-valued components such
# as "conf.int", where 1 = lower and 2 = upper bound).
#
# Grouping by (successes, failures) in the data.table `by` clause means each
# DISTINCT count pair is tested only once; the per-group result is then
# recycled back to all input rows of that group, so the returned vector is
# parallel to the input vectors.
#
# @param successes  Non-negative integerish vector of success counts.
# @param failures   Non-negative integerish vector of failure counts
#                   (recycled against successes by data.table).
# @param probSuccess Numeric(1) in [0,1]. Hypothesized success probability.
# @param returnType Character(1). Which component of the htest object to return.
# @param indexResult Integer(1) >= 1. Element index within that component.
# @param conf.level Numeric(1) in [0,1]. Confidence level for conf.int.
# @return Vector of the selected htest component, one value per input pair.
.calcBinomTestVector <- function(successes,
                                 failures,
                                 probSuccess = 0.5,
                                 returnType="p.value",
                                 indexResult= 1,
                                 conf.level = 0.95) {

    # Check types and validity of arguments
    assertIntegerish(successes, lower = 0)
    assertIntegerish(failures, lower = 0)
    assertChoice(returnType, c("p.value","conf.int", "statistic", "parameter",
                               "estimate", "null.value", "alternative", "method", "data.name"))
    assertInt(indexResult, lower = 1)
    assertNumber(conf.level, lower = 0, upper = 1)
    assertNumber(probSuccess, lower = 0, upper = 1)

    # binom.test(c(x, y)) treats the two-element vector as (successes, failures),
    # hence n = NULL. The [returnType][[1]][indexResult] chain subsets the htest
    # list down to a single scalar per group.
    dt = data.table(successes, failures)
    dt[, pp := binom.test(c(successes, failures),
                          n = NULL,
                          p = probSuccess,
                          conf.level = conf.level)
       [returnType][[1]][indexResult], by = list(successes, failures)]$pp
}
|
28 |
+ |
|
29 |
+ |
|
30 |
#' @import checkmate
#' @importFrom Rsamtools indexBam
# Ensure a BAM index (.bai) exists next to the given BAM file.
# If no readable index is found, a warning is issued and
# Rsamtools::indexBam() is called to create one as a side effect.
#
# Fixes: corrected the typo "Attemtping" in the warning message and removed
# the spurious `sep = ""` argument — warning() has no sep parameter, so the
# empty string was silently being appended to the message via `...`.
#
# @param BAMfile Character(1). Path to an existing, readable BAM file.
# @return Invisibly, the result of indexBam() if an index was created,
#   otherwise NULL; called for its side effect.
.checkAndCreateIndexFile <- function(BAMfile) {

    # Check types and validity of arguments
    assertFileExists(BAMfile, access = "r")

    indexFile = paste0(BAMfile, ".bai")

    if (!testFile(indexFile, access = "r")) {
        warning("Could not find index file ", indexFile, " for BAM file ", BAMfile,
                ". Attempting to produce it automatically...")
        indexBam(BAMfile)
    }

}
|
46 |
+ |
|
47 |
#' @import checkmate
#' @import Rsamtools
#' @importFrom Rsamtools scanBamFlag
# Validated wrapper around Rsamtools::scanBamFlag().
# Every argument is a logical flag where TRUE/FALSE filters reads on that
# SAM-flag property and NA means "do not filter on this property".
# Returns the integer flag vector produced by scanBamFlag(), suitable for
# the `flag` argument of ScanBamParam().
#
# Bug fix: `isSecondaryAlignment` was asserted but never forwarded to
# scanBamFlag(), so filtering on secondary alignments silently had no
# effect. It is now passed through.
.constructScanBamFlagsGen <- function(isPaired = NA,
                                      isProperPair = NA,
                                      isUnmappedQuery = NA,
                                      hasUnmappedMate = NA,
                                      isMinusStrand = NA,
                                      isMateMinusStrand = NA,
                                      isFirstMateRead = NA,
                                      isSecondMateRead = NA,
                                      isSecondaryAlignment = NA,
                                      isNotPassingQualityControls = NA,
                                      isDuplicate = NA ) {


    # Check types and validity of arguments (NA is explicitly allowed:
    # it means "keep reads regardless of this flag")
    assertFlag(isPaired, na.ok = TRUE)
    assertFlag(isProperPair, na.ok = TRUE)
    assertFlag(isUnmappedQuery, na.ok = TRUE)
    assertFlag(hasUnmappedMate, na.ok = TRUE)
    assertFlag(isMinusStrand, na.ok = TRUE)
    assertFlag(isMateMinusStrand, na.ok = TRUE)
    assertFlag(isFirstMateRead, na.ok = TRUE)
    assertFlag(isSecondMateRead, na.ok = TRUE)
    assertFlag(isSecondaryAlignment, na.ok = TRUE)
    assertFlag(isNotPassingQualityControls, na.ok = TRUE)
    assertFlag(isDuplicate, na.ok = TRUE)

    flags = scanBamFlag(isPaired = isPaired,
                        isProperPair = isProperPair,
                        isUnmappedQuery = isUnmappedQuery,
                        hasUnmappedMate = hasUnmappedMate,
                        isMinusStrand = isMinusStrand,
                        isMateMinusStrand = isMateMinusStrand,
                        isFirstMateRead = isFirstMateRead,
                        isSecondMateRead = isSecondMateRead,
                        isSecondaryAlignment = isSecondaryAlignment,
                        isNotPassingQualityControls = isNotPassingQualityControls,
                        isDuplicate = isDuplicate
    )

    flags

}
|
90 |
+ |
|
91 |
+ |
|
92 |
#' @import checkmate
# Collect a vector of BAM files to process, either from a single
# directory/pattern string or from an explicit vector of file paths.
#
# If patternFiles has length 1 it is interpreted as dirname(pattern) +
# basename(pattern) and list.files() is used; otherwise each element is
# treated as an explicit path that must exist, be readable, and end in
# ".bam". In both modes, any ".bai" index files picked up are removed
# from the result.
#
# @param patternFiles Character vector, non-missing, each element non-empty.
#   Length 1: a path pattern; length > 1: explicit BAM file paths.
# @param recursive Logical(1). Passed to list.files() in pattern mode.
# @param ignoreCase Logical(1). Case-insensitive matching in pattern mode.
# @param verbose Logical(1). Emit progress messages.
# @return Character vector of files to process (possibly empty, with a
#   warning, in pattern mode).
.collectFilesGen <- function(patternFiles,
                             recursive = FALSE,
                             ignoreCase = TRUE,
                             verbose) {

    # Check types and validity of arguments
    assertCharacter(patternFiles, any.missing = FALSE, min.chars = 1, min.len = 1)
    assertFlag(recursive)
    assertFlag(ignoreCase)
    assertFlag(verbose)

    patternGiven = ifelse(length(patternFiles) == 1, TRUE, FALSE)

    if (patternGiven) {
        directory = dirname(patternFiles)
        patternFilesRed = basename(patternFiles)

        ####################################################
        # Identify the BAM files that need to be processed #
        ####################################################

        if (verbose)
            message("Search for files with the pattern '", patternFilesRed,
                    "' in directory ", directory,
                    " (recursive: ", recursive,", case-sensitive:",!ignoreCase,")")

        filesToProcess.vec = list.files(path = directory,
                                        pattern = patternFilesRed,
                                        full.names = TRUE,
                                        recursive = recursive,
                                        ignore.case = ignoreCase)

    } else {

        filesToProcess.vec = unique(patternFiles)

        # Check if readable and existent
        for (i in seq_len(length(filesToProcess.vec))) {
            assertFileExists(filesToProcess.vec[i], access = "r")
        }

        # Check if they are all BAM files
        # NOTE(review): the pattern "*.bam$" works here because "*" after
        # nothing is a harmless quantifier, but the intended regex is
        # likely "\\.bam$" — verify.
        if (!all(grepl(filesToProcess.vec, pattern = "*.bam$", ignore.case = TRUE))) {
            stop("All specified files (", paste0(filesToProcess.vec, collapse = ","),
                 ") must be BAM files, but at least one file did not end with \".bam\"", sep = "")
        }


    }

    # Make sure only files ending with the correct ending are processed
    # NOTE(review): "." is unescaped, so this matches any character before
    # "bai" (e.g. "xbai"), not only ".bai" — confirm whether that is intended.
    index = which(grepl(paste0(".","bai","$"), filesToProcess.vec, perl = TRUE))
    if (length(index) > 0) {
        filesToProcess.vec = filesToProcess.vec[-index]
    }


    if (verbose)
        message("Found the following files:\n", paste(filesToProcess.vec, collapse = "\n "))


    if (patternGiven) {
        nFilesToProcess = length(filesToProcess.vec)

        if (nFilesToProcess == 0) {
            warning(paste0("No files to process in folder ", directory,
                           " that fulfill the desired criteria."))
        } else {

            # NOTE(review): this branch is empty and `directory` (from
            # dirname()) is never NULL here, so the message in the else
            # branch below is unreachable — the condition looks inverted
            # or left over from a refactor; confirm intent.
            if (!is.null(directory)) {

            } else {
                if (verbose)
                    message("The following ", nFilesToProcess," file will be processed:\n ",
                            paste(filesToProcess.vec, collapse = "\n "))

            }
        }
    }
    filesToProcess.vec

}
|
175 |
+ |
|
176 |
+ |
|
177 |
#' @import checkmate
#' @import Rsamtools
# @importFrom Rsamtools ScanBamParam scanBam
# Extract reads overlapping the given regions from a BAM file via
# Rsamtools::scanBam().
#
# A single ScanBamParam is assembled from the shared arguments; when
# readGroupSpecificity is TRUE the "RG" tag is additionally requested so
# callers can filter by read group afterwards. A minMapQ of 0 is mapped
# to NA_integer_ (i.e., no mapping-quality filter).
#
# Strand information is NOT restricted here: ScanBamParam coerces `which`
# to an IRangesList, which carries no strand — use the flag argument (or
# post-filtering) for strand-specific extraction.
#
# @param regions.gr GRanges of regions to extract.
# @param file Path to an existing, readable BAM file.
# @param fieldsToRetrieve Subset of scanBamWhat() fields to return.
# @param flags Integer flag vector (e.g. from scanBamFlag()).
# @param readGroupSpecificity Logical(1). Request the RG tag?
# @param simpleCigar,reverseComplement Passed through to ScanBamParam.
# @param minMapQ Integer(1) >= 0. Minimum mapping quality (0 = no filter).
# @param verbose Logical(1). Currently unused beyond validation.
# @return The list returned by scanBam(), one element per region.
.extractFromBAMGen <- function(regions.gr,
                               file,
                               fieldsToRetrieve = scanBamWhat(),
                               flags,
                               readGroupSpecificity = TRUE,
                               simpleCigar = FALSE,
                               reverseComplement = FALSE,
                               minMapQ = 0,
                               verbose = TRUE) {

    assertClass(regions.gr, "GRanges")
    assertFileExists(file, access = "r")
    assertSubset(fieldsToRetrieve, scanBamWhat(), empty.ok = FALSE)
    assertInteger(flags, min.len = 1, any.missing = FALSE)
    assertInt(minMapQ, lower = 0)
    assertFlag(readGroupSpecificity)
    assertFlag(simpleCigar)
    assertFlag(reverseComplement)
    assertFlag(verbose)

    # ScanBamParam interprets NA as "no mapq filtering"
    if (minMapQ == 0) {
        minMapQ = NA_integer_
    }

    # Assemble the parameter object once; the two previous call sites only
    # differed in whether the RG tag was requested.
    paramArgs.l = list(which = regions.gr,
                       what = fieldsToRetrieve,
                       flag = flags,
                       reverseComplement = reverseComplement,
                       simpleCigar = simpleCigar,
                       mapqFilter = minMapQ)

    if (readGroupSpecificity) {
        paramArgs.l$tag = "RG"
    }

    param = do.call(ScanBamParam, paramArgs.l)

    return(scanBam(file, param = param))

}
|
232 |
+ |
|
233 |
+ |
|
234 |
#' @import checkmate
# Post-filter reads returned by scanBam() by strand and/or read group.
#
# Strand filtering cannot be done in ScanBamParam because `which` is
# coerced to an IRangesList (no strand), and the global strand flags
# cannot differ per region — so reads are removed here, per region.
#
# @param bamObj Non-empty list as returned by scanBam(): one element per
#   region, each with $strand, $pos and optionally $qwidth, $seq, $tag$RG.
# @param strand.vec Character vector of "+"/"-"/"*", one entry per region
#   in bamObj (parallel indexing by i is assumed — TODO confirm lengths
#   match at the call sites).
# @param alleleCur Character(1). Read group to keep, or "allReadGroups"
#   to skip read-group filtering.
# @param strandPar One of "both", "sense", "antisense".
# @return List with $filteredReads (total number of removed reads across
#   all regions) and $bamObj (the filtered object).
.filterReads <- function(bamObj,
                         strand.vec,
                         alleleCur,
                         strandPar) {

    assertList(bamObj, any.missing = FALSE, min.len = 1)
    assertSubset(strand.vec, c("+","-","*"))
    assertCharacter(alleleCur, min.chars = 1, len = 1)
    assertSubset(strandPar, c("both", "sense", "antisense"))

    nReadsDeleted = 0


    for (i in seq_len(length(bamObj))) {

        # Indices (within region i) of reads to drop; accumulated across
        # the strand filter and the read-group filter below.
        indexToDelete = c()

        # Filter strand
        if (!is.na(strand.vec[i]) & strand.vec[i] != "*" & strandPar != "both") {
            # Filter only those with the desired and specific strand.
            # This cannot be done before because in ScanBamParam, data types are coerced to IRangesList,
            # which does not include strand information.
            # The flags are also not suitable because the strand information flag is set globally
            # and cannot be individual for each entry.

            if (strandPar == "sense") {
                indexToDelete = which(strand.vec[i] != bamObj[[i]]$strand)
            } else {
                indexToDelete = which(strand.vec[i] == bamObj[[i]]$strand)
            }

        }

        # Filter read group
        if (!is.na(alleleCur) & alleleCur != "allReadGroups") {

            # Check if read groups are actually defined in the file. If not, there is nothing to do.
            # Important: the where parameter here
            if (exists("tag", where = bamObj[[i]])) {
                if (exists("RG", where = bamObj[[i]]$tag)) {


                    # Drop reads whose RG tag differs from the requested one
                    index = which(bamObj[[i]]$tag$RG != alleleCur)
                    if (length(index) > 0) indexToDelete = c(indexToDelete, index)
                } else {
                    warning("Read groups cannot be extracted from BAM file (no tag RG found) for region ",i,".")

                }

            } else {

                warning("Read groups cannot be extracted from BAM file (no tag RG found) for region ",i,".")
            }

        }

        if (length(indexToDelete) > 0) {
            # Delete the reads completely in strand, pos, and qwidth
            # Potential improvement: Unlist bam[[1]] and create
            # indexToDeleteSignal to delete all information in one step
            # NOTE(review): indexToDelete may contain duplicates when a read
            # fails both filters; negative indexing tolerates this, but all
            # slots must be subset with the SAME vector to stay aligned.
            bamObj[[i]]$strand = bamObj[[i]]$strand[-indexToDelete]
            bamObj[[i]]$pos = bamObj[[i]]$pos [-indexToDelete]

            if (exists("qwidth", where = bamObj[[i]]))
                bamObj[[i]]$qwidth = bamObj[[i]]$qwidth[-indexToDelete]
            if (exists("seq" , where = bamObj[[i]]))
                bamObj[[i]]$seq = bamObj[[i]]$seq [-indexToDelete]

            if (exists("tag", where = bamObj[[i]])) {
                if (exists("RG", where = bamObj[[i]]$tag)) {
                    bamObj[[i]]$tag$RG = bamObj[[i]]$tag$RG[-indexToDelete]
                }
            }
            #bamObj[[i]]$rname = bamObj[[i]]$rname [- indexToDelete]

            # NOTE(review): duplicated indices inflate this count slightly;
            # confirm whether an exact count is required by callers.
            nReadsDeleted = nReadsDeleted + length(indexToDelete)
        }

    }

    list(filteredReads = nReadsDeleted,
         bamObj = bamObj)

}
|
319 |
+ |
|
320 |
#' @import checkmate
# Default SAM-flag filter settings for read extraction.
#
# Returns a named list of readFlag_* entries (TRUE/FALSE = filter on that
# property, NA = ignore it). For single-end libraries, every pairing-related
# flag is set to NA since those SAM flags are meaningless without mates.
#
# @param pairedEndReads Logical(1). TRUE for paired-end libraries.
# @return Named list of 11 readFlag_* values.
.generateDefaultReadFlags <- function(pairedEndReads = TRUE) {

    assertFlag(pairedEndReads)

    # Pairing-related flags: enforced for paired-end data, ignored otherwise.
    requirePaired   = if (pairedEndReads) TRUE  else NA
    forbidForPaired = if (pairedEndReads) FALSE else NA

    list(
        "readFlag_isPaired"                    = requirePaired,
        "readFlag_isProperPair"                = requirePaired,
        "readFlag_isUnmappedQuery"             = FALSE,
        "readFlag_hasUnmappedMate"             = forbidForPaired,
        "readFlag_isMinusStrand"               = NA,
        "readFlag_isMateMinusStrand"           = NA,
        "readFlag_isFirstMateRead"             = NA,
        "readFlag_isSecondMateRead"            = NA,
        "readFlag_isNotPrimaryRead"            = forbidForPaired,
        "readFlag_isNotPassingQualityControls" = FALSE,
        "readFlag_isDuplicate"                 = FALSE
    )
}
|
355 |
+ |
|
356 |
#' @import checkmate
#' @importFrom GenomeInfoDb fetchExtendedChromInfoFromUCSC
# Retrieve chromosome sizes for a supported genome assembly.
#
# Primary path: fetchExtendedChromInfoFromUCSC(), restricted to
# assembled molecules, optionally dropping chrM.
# Fallback path (on any error, e.g. no network): hard-coded size tables
# for hg19/hg38/mm9/mm10 only.
#
# NOTE(review): fetchExtendedChromInfoFromUCSC is deprecated/removed in
# recent GenomeInfoDb releases (replaced by getChromInfoFromUCSC) — the
# fallback will then always be taken; verify against the package's
# minimum GenomeInfoDb version.
#
# @param genome Character(1): one of "hg38","hg19","hg18","mm10","mm9",
#   "dm3","sacCer3","sacCer2" (fallback supports only hg19/hg38/mm9/mm10).
# @param includeChrM Logical(1). Keep the mitochondrial chromosome?
# @return data.frame with columns `chr` and `size`, rownames = chr names.
.getGenomeData <- function(genome,
                           includeChrM = FALSE){


    # See ?fetchExtendedChromInfoFromUCSC for a list of supported chromosome sizes
    assertChoice(genome, c("hg38", "hg19", "hg18", "mm10", "mm9", "dm3", "sacCer3", "sacCer2"))
    assertFlag(includeChrM)

    # library("BSgenome") may also be used together with seqlengths(NFKB)=seqlengths(Hsapiens),
    # but this requires the download of hundreds of data for a particular genome

    # use a try-catch construct in case the default approach to determine chromosome sizes fails
    result = tryCatch( {
        chromInfo.df = suppressWarnings(fetchExtendedChromInfoFromUCSC(genome))
        # Keep only real chromosomes (no scaffolds/patches)
        chromInfo.df = chromInfo.df[which(chromInfo.df$SequenceRole == "assembled-molecule"),]

        rowChrM = which(chromInfo.df$UCSC_seqlevel == "chrM")
        if (!includeChrM & length(rowChrM) == 1) {
            chromInfo.df = chromInfo.df[-rowChrM,]
        }
        # Quickly transform to a regular data frame
        chrSizes.df = data.frame(chr = chromInfo.df$UCSC_seqlevel,
                                 size = as.numeric(chromInfo.df$UCSC_seqlength))

        rownames(chrSizes.df) = chrSizes.df$chr

        return(chrSizes.df)

    }, error = function(e) {

        warning("Could not obtain chromosome sizes using GenomeInfoDb, try fallback implementation...\n")
        # The hard-coded fallback only covers these four assemblies
        assertChoice(genome, c("hg19","hg38", "mm10", "mm9"))

        if (genome == "hg19") {
            chrSizes = list(
                "chr1" = 249250621,
                "chr2" = 243199373,
                "chr3" = 198022430,
                "chr4" = 191154276,
                "chr5" = 180915260,
                "chr6" = 171115067,
                "chr7" = 159138663,
                "chrX" = 155270560,
                "chr8" = 146364022,
                "chr9" = 141213431,
                "chr10" = 135534747,
                "chr11" = 135006516,
                "chr12" = 133851895,
                "chr13" = 115169878,
                "chr14" = 107349540,
                "chr15" = 102531392,
                "chr16" = 90354753,
                "chr17" = 81195210,
                "chr18" = 78077248,
                "chr20" = 63025520,
                "chrY" = 59373566,
                "chr19" = 59128983,
                "chr22" = 51304566,
                "chr21" = 48129895,
                "chrM" = 16571
            )
        } else if (genome == "hg38") {
            chrSizes = list(
                "chr1" = 248956422,
                "chr2" = 242193529,
                "chr3" = 198295559,
                "chr4" = 190214555,
                "chr5" = 181538259,
                "chr6" = 170805979,
                "chr7" = 159345973,
                "chrX" = 156040895,
                "chr8" = 145138636,
                "chr9" = 138394717,
                "chr11" = 135086622,
                "chr10" = 133797422,
                "chr12" = 133275309,
                "chr13" = 114364328,
                "chr14" = 107043718,
                "chr15" = 101991189,
                "chr16" = 90338345,
                "chr17" = 83257441,
                "chr18" = 80373285,
                "chr20" = 64444167,
                "chr19" = 58617616,
                "chrY" = 57227415,
                "chr22" = 50818468,
                "chr21" = 46709983,
                "chrM" = 16569
            )

        } else if (genome == "mm9") {
            chrSizes = list(
                "chr1" = 197195432,
                "chr2" = 181748087,
                "chr3" = 159599783,
                "chr4" = 155630120,
                "chr5" = 152537259,
                "chr6" = 149517037,
                "chr7" = 152524553,
                "chr8" = 131738871,
                "chr9" = 124076172,
                "chr10" = 129993255,
                "chr11" = 121843856,
                "chr12" = 121257530,
                "chr13" = 120284312,
                "chr14" = 125194864,
                "chr15" = 103494974,
                "chr16" = 98319150,
                "chr17" = 95272651,
                "chr18" = 90772031,
                "chr19" = 61342430,
                "chrX" = 166650296,
                "chrY" = 15902555
            )


        } else if (genome == "mm10") {
            chrSizes = list(
                "chr1" = 195471971,
                "chr2" = 182113224,
                "chr3" = 160039680,
                "chr4" = 156508116,
                "chr5" = 151834684,
                "chr6" = 149736546,
                "chr7" = 145441459,
                "chr8" = 129401213,
                "chr9" = 124595110,
                "chr10" = 130694993,
                "chr11" = 122082543,
                "chr12" = 120129022,
                "chr13" = 120421639,
                "chr14" = 124902244,
                "chr15" = 104043685,
                "chr16" = 98207768,
                "chr17" = 94987271,
                "chr18" = 90702639,
                "chr19" = 61431566,
                "chrX" = 171031299,
                "chrY" = 91744698
            )

        }

        # NOTE(review): mm9/mm10 lists contain no chrM entry, so this is a
        # no-op for mouse genomes — confirm that is intended.
        if (!includeChrM) chrSizes[["chrM"]] = NULL
        # Quickly transform to a regular data frame
        chrSizes.df = data.frame(chr = names(chrSizes),
                                 size = as.numeric(as.data.frame(chrSizes)[1,]))

        rownames(chrSizes.df) = chrSizes.df$chr
        chrSizes.df
    }

    )

    return(result)


}
|
517 |
+ |
|
518 |
+ |
|
519 |
#' @import checkmate
# Build a lookup table (parameter name -> expected type) from a named list of
# valid parameters. With parName = NULL the whole table is returned; otherwise
# the type string of that single parameter is returned.
.getParameters <- function(parsValid.l,
                           parName = NULL) {

    assertList(parsValid.l, min.len = 1, all.missing = FALSE)
    assert(checkNull(parName),
           checkCharacter(parName, any.missing = FALSE, min.chars = 1, len = 1))

    # One row per parameter: its name and its (character) type specification
    lookup.df = data.frame(parName = names(parsValid.l),
                           parType = as.character(unlist(parsValid.l)),
                           stringsAsFactors = FALSE,
                           row.names = names(parsValid.l))

    # No specific parameter requested: hand back the full table
    if (is.null(parName)) {
        return(lookup.df)
    }

    matchingRow = which(lookup.df$parName == parName)
    if (length(matchingRow) != 1) {
        stop("Could not find value of parameter ", parName," in data frame.")
    }

    lookup.df$parType[matchingRow]
}
|
549 |
+ |
|
550 |
+ |
|
551 |
#' @import checkmate
#' @importFrom utils read.table
# Parse a BED-like file with 2 to 6 columns (chr, start[, end, annotation,
# score, strand]) and normalize it to a 6-column data frame.
#
# path_bedFile: path to the file (must exist and be readable)
# headerLine:   does the file contain a header line?
# linesToParse: -1 to parse all lines, otherwise a positive number of lines
# verbose:      print progress messages
#
# Returns a list of two elements: the normalized data frame with columns
# (chr, start, end, annotation, score, strand) and the original column count.
.parseBed6File <- function(path_bedFile,
                           headerLine = FALSE,
                           linesToParse = -1,
                           verbose = FALSE) {

    assertFileExists(path_bedFile, access = "r")
    assertFlag(headerLine)
    # linesToParse is either exactly -1 (parse everything) or a positive count
    assert(checkInt(linesToParse, lower = -1, upper = -1),
           checkInt(linesToParse, lower = 1))
    assertFlag(verbose)

    userRegions.df = read.table(file = path_bedFile,
                                header = headerLine,
                                nrows = linesToParse,
                                stringsAsFactors = FALSE)

    if (nrow(userRegions.df) == 0) {
        # BUGFIX: stop() has no sep argument; the former sep = "" was silently
        # pasted into the message and has been removed
        stop("No entries found in file ", path_bedFile, "\n")
    }

    nCols = ncol(userRegions.df)
    nColsOrig = nCols

    # Scalar condition: use short-circuit || instead of vectorized |
    if (nCols < 2 || nCols > 6) {
        # BUGFIX: "found only" was misleading for nCols > 6, and the
        # parenthesis in the message was never closed
        stop("Wrong number of columns in file ", path_bedFile,
             ": Expected 2-6, found ", nCols,
             ". At least two columns are needed (chromosome, start).\n")
    }

    # Check column 1 (chromosome), always present: integers or names with "chr"
    assert(checkInteger(userRegions.df[,1], lower = 1, any.missing = FALSE, min.len = 1),
           checkCharacter(userRegions.df[,1], min.chars = 4, pattern = "chr", any.missing = FALSE, min.len = 1))

    # Column 2: (start) position, always present
    assertInteger(userRegions.df[,2], lower = 0, any.missing = FALSE)

    # Set the column names and validate the optional columns
    if (nCols == 2) {
        colnames(userRegions.df) = c("chr","start")
    }
    if (nCols == 3) {
        colnames(userRegions.df) = c("chr","start","annotation")
        assertCharacter(as.character(userRegions.df[,3]), min.chars = 1, any.missing = FALSE, min.len = 1)
    }
    if (nCols == 4) {
        colnames(userRegions.df) = c("chr","start","end","annotation")
        assertInteger(userRegions.df[,3], lower = 0, any.missing = FALSE)
        assertCharacter(userRegions.df[,4], min.chars = 1, any.missing = FALSE, min.len = 1)
    }
    if (nCols == 5) {
        colnames(userRegions.df) = c("chr","start","end","annotation","strand")
        assertInteger(userRegions.df[,3], lower = 0, any.missing = FALSE)
        assertCharacter(userRegions.df[,4], min.chars = 1, any.missing = FALSE, min.len = 1)
        assertSubset(userRegions.df[,5], c("+","-","*","."))
    }
    if (nCols == 6) {
        colnames(userRegions.df) = c("chr","start","end","annotation","score","strand")
        assertInteger(userRegions.df[,3], lower = 0, any.missing = FALSE)
        assertCharacter(userRegions.df[,4], min.chars = 1, any.missing = FALSE, min.len = 1)
        assertInteger(userRegions.df[,5], any.missing = FALSE)
        assertSubset(userRegions.df[,6], c("+","-","*","."))
    }

    # Fill in columns the user did not provide with sensible defaults
    if (nCols < 3) {
        # Auto-generate unique annotation IDs from row number and position
        userRegions.df$annotation =
            paste0("seq", seq_len(nrow(userRegions.df)),":",
                   userRegions.df[,1],"_", userRegions.df[,2])
    }

    if (nCols < 6) {
        userRegions.df$score = rep(".", nrow(userRegions.df))
    }

    if (nCols < 5) {
        userRegions.df$strand = rep("*", nrow(userRegions.df))
        if (verbose)
            message(" Add strand information for all regions. Assume strand is irrelevant (*)...")
    }

    if (nCols < 4) {
        # Without an explicit end column, assume single-position regions
        userRegions.df$end = as.numeric(userRegions.df[,2])
    }

    userRegions.df$annotation = as.character(userRegions.df$annotation)

    # Do a few more tests to see if the content is valid
    userRegions.df$strand[which(userRegions.df$strand == ".")] = "*"

    # Check if the chromosome names conform to the "standard" chr{Nr}.
    # BUGFIX: previously "chr" was prepended to ALL rows whenever at least one
    # row lacked the prefix, producing names like "chrchr1" for mixed files.
    # Now only the rows without the prefix are modified.
    rowsWithoutPrefix = which(!grepl("^chr", userRegions.df$chr, perl = TRUE))
    if (length(rowsWithoutPrefix) > 0) {
        if (verbose) message(" Modify chromosome names because they do not start with \"chr\"")
        userRegions.df$chr[rowsWithoutPrefix] = paste0("chr", userRegions.df$chr[rowsWithoutPrefix])
    }

    # Test if start and end are integral (a stray header line would make them not)
    if (!all(floor(userRegions.df$end) == userRegions.df$end) ||
        !all(floor(userRegions.df$start) == userRegions.df$start)) {

        stop("Either start or end positions do not contain valid numbers. ",
             "Please check if you, for example, accidentally include the header line. ",
             "In that case, adjust the parameter headerLine accordingly.")
    }

    # IRanges does not support coordinates beyond this limit
    if (max(userRegions.df$end) > 536870912) {
        stop("At least one end position is larger than 536870912 (", max(userRegions.df$end),
             "), which is not supported by the IRange package\n")
    }

    userRegions.df = userRegions.df[, c("chr","start","end","annotation","score","strand")]

    list(userRegions.df, nColsOrig)
}
|
674 |
+ |
|
675 |
#' @import checkmate
#' @importFrom GenomeInfoDb sortSeqlevels
#' @import GenomicRanges
# @importFrom GenomicRanges GRanges
#' @importFrom IRanges IRanges
# Parse the user regions (SNP) file, normalize all coordinates to the
# one-based, fully-closed convention used by GRanges, validate them against
# the chromosome sizes, extend each SNP by par.l$regionSize on both sides,
# and return a sorted GRanges object (metadata columns: annotation, SNPPos).
#
# par.l:       parameter list (path_userRegions, headerLine, linesToParse,
#              zeroBasedCoordinates, endOpen, startOpen, regionSize, ...)
# chrSizes.df: data frame with columns chr and size, rownames = chr
# verbose:     print progress messages
.parseAndProcessUserRegions <- function(par.l,
                                        chrSizes.df,
                                        verbose = TRUE) {

    assertList(par.l, min.len = 1, all.missing = FALSE)
    assertDataFrame(chrSizes.df, min.rows = 1, min.cols = 1)
    assertFlag(verbose)

    # FIX: announce the parsing before doing it (the message used to appear
    # only after .parseBed6File had already finished)
    if (verbose) message(" Parse the user regions file")

    listFunction = .parseBed6File(par.l$path_userRegions,
                                  par.l$headerLine, par.l$linesToParse,
                                  verbose = verbose)

    userRegions.df = listFunction[[1]]
    nColOrig = listFunction[[2]]

    nRegions = nrow(userRegions.df)

    if (verbose)
        message(" Finished parsing. Number of entries processed: ", nRegions)

    if (par.l$linesToParse != -1) {
        warning("The parameter \"linesToParse\" has been set to a non-default value. ",
                "Only a subset of all lines have been parsed\n")
    }

    # Check if the user wants to use unassembled chromosomes. If yes, remove them.
    regionsUnassembled = which(!userRegions.df$chr %in% as.character(chrSizes.df$chr))
    if (length(regionsUnassembled) > 0) {
        warning(length(regionsUnassembled),
                " user regions originate from unassembled or unknown chromosomes. ",
                "They have been deleted.\n")

        userRegions.df = userRegions.df[-regionsUnassembled,]
    }

    # Normalize everything to fully closed one-based coordinates.
    # GRange coordinates are fully closed, and 1 based!
    # See http://master.bioconductor.org/help/course-materials/2014/SeattleFeb2014/Ranges.pdf.

    # Adjust positions by one if zero-based coordinates
    if (par.l$zeroBasedCoordinates) {
        if (verbose)
            message(" Increase start and end coordinates by one because they are zero-based")
        userRegions.df$start = userRegions.df$start + 1
        userRegions.df$end = userRegions.df$end + 1
    }

    # Similar to the BED standard with half open intervals; only adjust when
    # the user actually specified end coordinates (more than 2 columns)
    if (par.l$endOpen && nColOrig != 2) {
        if (verbose)
            message(" Decrease end coordinates by one because coordinates have to be closed at the end")
        userRegions.df$end = userRegions.df$end - 1
    }

    if (par.l$startOpen) {
        # BUGFIX: the message used to claim the start is DECREASED although the
        # code (correctly) increases it to close a start-open interval
        if (verbose)
            message(" Increase start coordinates by one because coordinates have to be closed at ",
                    "both start and end")
        userRegions.df$start = userRegions.df$start + 1
    }

    # Check if some of the positions are larger than the chromosome sizes
    index.vec = which(userRegions.df$start > chrSizes.df[userRegions.df$chr,]$size)
    if (length(index.vec) > 0) {
        stop(length(index.vec), " regions (starting with annotations ",
             paste(userRegions.df$annotation[head(index.vec)], collapse = ","),
             ") have invalid start positions that exceed the lengths of the corresponding chromosomes. ",
             "Has the genome assembly version been specified correctly?")
    }
    index.vec = which(userRegions.df$end > chrSizes.df[userRegions.df$chr,]$size)
    if (length(index.vec) > 0) {
        stop(length(index.vec), " regions (starting with annotations ",
             paste(userRegions.df$annotation[head(index.vec)], collapse = ","),
             ") have invalid end positions that exceed the lengths of the corresponding chromosomes. ",
             "Has the genome assembly version been specified correctly?")
    }

    # Only single-base regions (SNPs) are supported
    if (!all((userRegions.df$end - userRegions.df$start + 1) == 1)) {
        stop("Only SNPs are supported, not regions of length > 1. At least one region has a length >1.")
    }

    userRegions.df$SNPPos = userRegions.df$start

    # Add the regionSize parameter to the boundaries of the regions
    userRegions.df$start = userRegions.df$start - par.l$regionSize
    userRegions.df$end = userRegions.df$end + par.l$regionSize

    # BUGFIX: end already includes regionSize at this point, so it must not be
    # added a second time when checking chromosome boundaries. Also, in
    # one-based coordinates the smallest valid start is 1, not 0.
    indexRegions = unique(c(which(userRegions.df$start < 1),
                            which(userRegions.df$end > chrSizes.df[userRegions.df$chr,]$size)))

    if (length(indexRegions) > 0) {

        invalidRegions = paste0(userRegions.df$chr[indexRegions], ":",
                                userRegions.df$start[indexRegions], "-",
                                userRegions.df$end[indexRegions], "(",
                                userRegions.df$annotation[indexRegions], ")", collapse = ", ")

        stop("For the following ", length(indexRegions), " user regions, either the chromosome starts or ends have been exceeded ",
             "after incorporating the regionSize parameter:", invalidRegions,
             ". Delete or modify these regions from your regions file or adjust your regionSize parameter accordingly.")
    }

    userRegions.df$length = userRegions.df$end - userRegions.df$start + 1 # Because intervals are fully-closed

    # Check for duplicate rows and filter them
    nDuplicateRows = length(which(duplicated(userRegions.df)))
    if (nDuplicateRows > 0) {
        userRegions.df = userRegions.df[!duplicated(userRegions.df),]
        warning(nDuplicateRows, " duplicate region removed in file ", par.l$path_userRegions,
                " out of ", nRegions," regions. New number of regions: ", nrow(userRegions.df))
    }

    # Check if the IDs are unique. If not, rename duplicates by suffixing _2, _3, ...
    if (length(unique(userRegions.df$annotation)) != length(userRegions.df$annotation)) {
        warning("All IDs from the user regions file must be unique. However, some of the IDs ",
                "were identical and have been changed by adding \"_1\", \"_2\", etc to make them unique")
        duplicatedIDs = duplicated(userRegions.df$annotation)

        # Change the IDs so that they are all unique
        currentDuplicationIndex = 1
        for (i in seq_len(nrow(userRegions.df))) {

            if (duplicatedIDs[i]) {

                currentDuplicationIndex = currentDuplicationIndex + 1
                userRegions.df$annotation[i] = paste0(userRegions.df$annotation[i],
                                                      "_",
                                                      currentDuplicationIndex)

            } else {
                currentDuplicationIndex = 1
            }
        }
    }

    # Final safety net: the renaming above handles consecutive duplicates; if
    # any collisions remain (e.g. "a" and a renamed "a_2"), fail here
    assertCharacter(userRegions.df$annotation, unique = TRUE, any.missing = FALSE)

    # Finally, order the user regions by chromosome and strand.
    # This is important for the scanBam function so that it does not change the order of the regions
    gr = GRanges(seqnames = userRegions.df$chr,
                 ranges = IRanges(start = userRegions.df$start, end = userRegions.df$end),
                 strand = userRegions.df$strand,
                 annotation = userRegions.df$annotation,
                 SNPPos = userRegions.df$SNPPos
    )

    gr = sortSeqlevels(gr)
    gr = sort(gr)

    gr
}
|
850 |
+ |
|
851 |
#' @import checkmate
#' @importFrom DESeq2 DESeqDataSetFromMatrix estimateSizeFactors counts sizeFactors
# Scale raw count libraries using DESeq2 size-factor (median-of-ratios)
# normalization.
#
# rawCounts.m: numeric matrix of raw counts (rows = genes, columns = samples)
# verbose:     print progress messages
#
# Returns a list with:
#   adjCounts   - matrix of size-factor-normalized counts
#   sizeFactors - named vector of per-sample size factors
.scaleLibraries <- function(rawCounts.m,
                            verbose = TRUE) {

    # Check types and validity of arguments
    assertMatrix(rawCounts.m, min.rows = 1, min.cols = 1)
    assertFlag(verbose)

    # Use DESeq2 to compute the size factors and scale the libraries.
    # Make sure we have column names
    if (testNull(colnames(rawCounts.m))) {
        colnames(rawCounts.m) = paste0("sample",seq_len(ncol(rawCounts.m)))
    }

    # Column names don't matter because of the specific design matrix (only intercept fitting)
    colData = data.frame(condition = colnames(rawCounts.m))

    dds = DESeqDataSetFromMatrix(countData = rawCounts.m, colData = colData, design = ~1)

    # Test how many genes will be skipped for determining the size factors:
    # DESeq2's median-of-ratios method only uses genes with no zero counts
    log.vec = apply(rawCounts.m, 1, function(x) {any(x == 0)})
    nGenesNorm = length(which(!log.vec))
    if (verbose)
        message("Number of genes on which library normalization using DeSeq2 is based on out of ",
                nrow(rawCounts.m),":", nGenesNorm)

    if (nGenesNorm == 0) {
        # BUGFIX: stop() has no sep argument; the former sep = "" was removed
        stop("All genes have at least one zero count in one of the samples. ",
             "Size factor normalization cannot be performed.")
    }

    # Print a warning if the number of rows with at least one zero is too high,
    # as the size factors are only based on rows with no zeroes altogether
    percThreshold = 0.5
    if (nGenesNorm < (percThreshold * nrow(rawCounts.m)) || nGenesNorm < 50) {
        warning("The proportion of genes that can be used for normalization among all samples is lower than ",
                percThreshold," or 50 altogether. Size factors will still be determined but may be inaccurate.")
    }

    # Only size-factor estimation is needed, not the full DESeq2 workflow
    dds = estimateSizeFactors(dds)

    res.l = list()
    res.l$adjCounts = counts(dds, normalized = TRUE)
    res.l$sizeFactors = sizeFactors(dds)

    names(res.l$sizeFactors) = colnames(rawCounts.m)

    res.l
}
|
904 |
+ |
|
905 |
+ |
|
906 |
+ |
|
907 |
#' @import checkmate
#' @importFrom stats sd
# Row-wise z-score normalization of a matrix for clustering and plotting.
# Rows with zero standard deviation receive a small pseudo-SD so that the
# division is well-defined.
.normalizeMatrixForClustering <- function(target.m,
                                          verbose = TRUE) {

    assertMatrix(target.m, any.missing = FALSE, min.rows = 1, min.cols = 1)
    assertFlag(verbose)

    pseudoSd = 0.01

    # Per-row standard deviations
    rowSd.vec = apply(target.m, 1, sd)
    nZeroSd = sum(rowSd.vec == 0)

    if (nZeroSd > 0) {
        if (verbose) message(nZeroSd," regions had a standard deviation of 0 across all bins. ",
                             "For normalization and plotting purposes, a small value of ",
                             pseudoSd," was added")

        rowSd.vec[rowSd.vec == 0] = pseudoSd
    }

    # Center each row at 0 and scale by its (pseudo-)standard deviation
    (target.m - rowMeans(target.m)) / rowSd.vec
}
|
933 |
+ |
|
934 |
+ |
|
935 |
#' @import checkmate
#' @importFrom cluster clara silhouette sortSilhouette
#' @importFrom reshape2 melt
#' @importFrom stats complete.cases
# Run CLARA (PAM-like) clustering on the rows of target.m for every requested
# number of clusters and render one cluster plot per clustering.
# Returns (invisibly) a list with elements clusteringMatrix (one data frame
# per k), averageSilhouette (one value per k) and plots (one plot per k).
.pamClustering = function(target.m,
                          nCols,
                          binAnnotation,
                          verticalLinePos,
                          rownamesPlot,
                          xAxisLabels,
                          nClustersVec,
                          ...) {

    assertMatrix(target.m, any.missing = FALSE, min.rows = 1, min.cols = 1)
    assertIntegerish(nClustersVec, lower = 2, any.missing = FALSE, min.len = 1)

    # Drop rows containing NA values; clara cannot handle them
    completeRows.m = target.m[complete.cases(target.m),]
    if (nrow(target.m) > nrow(completeRows.m)) {
        warning((nrow(target.m) - nrow(completeRows.m)),
                " rows in the matrix contained NAs and they have been removed.")
    }
    target.m = completeRows.m

    # Containers for the per-k results
    clusterplot.list = vector("list", length(nClustersVec))
    avgSilInfo = c()
    plots.l = vector("list", length(nClustersVec))

    for (clusterIndex in seq_along(nClustersVec)) {

        # Local result object (deliberately NOT reusing the function's name,
        # which the previous version shadowed)
        claraRes = clara(target.m, nClustersVec[clusterIndex])

        # One-column data frame: the cluster assignment per row of target.m
        clusterAssignments.df = as.data.frame(as.matrix(claraRes$clustering),
                                              stringsAsFactors = FALSE)

        clusterplot.df = merge(x = clusterAssignments.df, y = target.m, by = "row.names")
        clusterplot.df = clusterplot.df[, -1]

        colnames(clusterplot.df) = c("cluster", binAnnotation)
        clusterplot.df = clusterplot.df[order(clusterplot.df$cluster, decreasing = TRUE),]

        # Collect results
        avgSilInfo = c(avgSilInfo, claraRes$silinfo$avg.width)
        clusterplot.list[[clusterIndex]] = clusterplot.df

        plots.l[[clusterIndex]] = .generateClusterPlot(clusterplot.df,
                                                       rownamesPlot,
                                                       nCols,
                                                       verticalLinePos,
                                                       xAxisLabels,
                                                       clustersToPlot = NULL,
                                                       ...)
    }

    res.l = list(clusterplot.list, avgSilInfo, plots.l)
    names(res.l) = c("clusteringMatrix","averageSilhouette","plots")

    return(invisible(res.l))
}
|
1000 |
+ |
|
1001 |
+ |
|
1002 |
#' @importFrom lattice levelplot panel.levelplot panel.abline
#' @importFrom grDevices gray
#' @import checkmate
# Render a lattice levelplot of the clustered region-by-bin matrix, with a
# dashed red vertical line at verticalLinePos.
# clusterplot.df: data frame whose first column is the cluster assignment,
#                 followed by nColumns data columns
# clustersToPlot: subset of clusters to show (NULL = all)
.generateClusterPlot <- function(clusterplot.df,
                                 rownamesPlot,
                                 nColumns,
                                 verticalLinePos,
                                 xAxisLabels,
                                 clustersToPlot = NULL,
                                 ...) {

    assertInt(nColumns, lower = 1)

    assertDataFrame(clusterplot.df, min.rows = 1, ncols = nColumns + 1)
    assert(checkNull(clustersToPlot),
           checkSubset(clustersToPlot, unique(clusterplot.df$cluster)))

    # By default, plot every cluster present in the data
    if (testNull(clustersToPlot)) {
        clustersToPlot = unique(clusterplot.df$cluster)
    }

    # Restrict to the requested clusters and order them (descending)
    keepRows = which(clusterplot.df$cluster %in% clustersToPlot)
    clusterplot.df = clusterplot.df[keepRows,]
    clusterplot.df = clusterplot.df[order(clusterplot.df$cluster, decreasing = TRUE),]
    # Drop the cluster column and transpose: rows become bins, columns regions
    clusterplot.df = t(clusterplot.df[, -1])

    rownames(clusterplot.df) = rownamesPlot
    colnames(clusterplot.df) = seq_len(ncol(clusterplot.df))

    # Show roughly nNumbersYAxis tick labels on the y axis
    nNumbersYAxis = 5
    tickStep = max(floor(ncol(clusterplot.df) / nNumbersYAxis), 1)
    yPos = seq(1, ncol(clusterplot.df), tickStep)

    shades = gray(100:0/100) # white for low read counts
    #shades = gray(0:100/100) # black for low read counts

    levelplot(as.matrix(clusterplot.df), xlab = xAxisLabels , ylab = "User regions",
              scales = list(y = list(at = yPos)),
              col.regions = shades, aspect = "fill",
              panel = function(...) {
                  panel.levelplot(...)
                  panel.abline(v = verticalLinePos , type = "o", pch = 22, lty = 2, col = "red")
              })
}
|
1049 |
+ |
|
1050 |
+ |
|
1051 |
#' @importFrom BiocParallel multicoreWorkers MulticoreParam
#' @import checkmate
# Create a MulticoreParam parallel backend with at most the number of
# available workers.
#
# nWorkers: requested number of CPU cores (clamped to multicoreWorkers()
#           with a warning when too large)
#
# Returns a BiocParallel::MulticoreParam with a progress bar that stops on
# the first error.
.initBiocParallel <- function(nWorkers) {

    assertInt(nWorkers, lower = 1)

    # Query the number of available workers once instead of three times
    nWorkersAvailable = multicoreWorkers()

    if (nWorkers > nWorkersAvailable) {
        warning("Requested ", nWorkers, " CPUs, but only ", nWorkersAvailable,
                " are available and can be used.")
        nWorkers = nWorkersAvailable
    }

    MulticoreParam(workers = nWorkers,
                   progressBar = TRUE,
                   stop.on.error = TRUE)
}
|
1067 |
+ |
|
1068 |
+#' @importFrom BiocParallel bplapply bpok |
|
1069 |
+#' @import checkmate |
|
1070 |
+# .execInParallelGen <- function(nCores, |
|
1071 |
+# returnAsList, |
|
1072 |
+# iteration, |
|
1073 |
+# verbose = TRUE, |
|
1074 |
+# functionName, |
|
1075 |
+# ...) { |
|
1076 |
+# |
|
1077 |
+# start.time <- Sys.time() |
|
1078 |
+# |
|
1079 |
+# assertInt(nCores, lower = 1) |
|
1080 |
+# assertFlag(returnAsList) |
|
1081 |
+# assertFunction(functionName) |
|
1082 |
+# assertVector(iteration, any.missing = FALSE, min.len = 1) |
|
1083 |
+# |
|
1084 |
+# res.l = list() |
|
1085 |
+# |
|
1086 |
+# if (nCores > multicoreWorkers()) { |
|
1087 |
+# nCores = multicoreWorkers() |
|
1088 |
+# } |
|
1089 |
+# |
|
1090 |
+# if (nCores > 1) { |
|
1091 |
+# |
|
1092 |
+# res.l = tryCatch( { |
|
1093 |
+# bplapply(iteration, functionName, ..., BPPARAM = .initBiocParallel(nCores)) |
|
1094 |
+# |
|
1095 |
+# }, error = function(e) { |
|
1096 |
+# warning("An error occured while executing the function with ", |
|
1097 |
+# nCores," CPUs. Try one last time in parallel...") |
|
1098 |
+# #lapply(iteration, functionName, ...) |
|
1099 |
+# } |
|
1100 |
+# ) |
|
1101 |
+# |
|
1102 |
+# failedTasks = which(!bpok(res.l)) |
|
1103 |
+# if (length(failedTasks) > 0) { |
|
1104 |
+# warning("At least one task failed while executing in parallel, ", |
|
1105 |
+# "attempting to rerun those that failed: ", |
|
1106 |
+# res.l[[failedTasks[1]]]) |
|
1107 |
+# |
|
1108 |
+# res.l = tryCatch( { |
|
1109 |
+# bplapply(iteration, |
|
1110 |
+# functionName, |
|
1111 |
+# ..., |
|
1112 |
+# BPPARAM = .initBiocParallel(nCores), |
|
1113 |
+# BPREDO = res.l) |
|
1114 |
+# |
|
1115 |
+# }, error = function(e) { |
|
1116 |
+# warning("Once again, an error occured while executing the function ", |
|
1117 |
+# "with multiple CPUs. Trying again using only only one CPU...") |
|
1118 |
+# lapply(iteration, functionName, ...) |
|
1119 |
+# } |
|
1120 |
+# ) |
|
1121 |
+# } |
|
1122 |
+# |
|
1123 |
+# } else { |
|
1124 |
+# res.l = lapply(iteration, functionName, ...) |
|
1125 |
+# } |
|
1126 |
+# |
|
1127 |
+# |
|
1128 |
+# if (verbose) message(" Finished execution using ",nCores," cores.") |
|
1129 |
+# .printExecutionTime(start.time, verbose = verbose) |
|
1130 |
+# |
|
1131 |
+# if (!returnAsList) { |
|
1132 |
+# return(unlist(res.l)) |
|
1133 |
+# } |
|
1134 |
+# |
|
1135 |
+# res.l |
|
1136 |
+# |
|
1137 |
+# } |
|
1138 |
+ |
|
1139 |
#' @importFrom BiocParallel bplapply bpok multicoreWorkers
#' @import checkmate
# Run functionName over all elements of iteration, in parallel when
# nCores > 1 (via BiocParallel) and sequentially (lapply) otherwise.
#
# nCores:                number of CPU cores to use (1 = sequential)
# returnAsList:          if FALSE, the result list is unlist()ed on return
# listNames:             optional names for the result list (same length as iteration)
# iteration:             vector of elements to iterate over
# abortIfErrorParallel:  if TRUE, stop() as soon as a parallel task fails
# verbose:               print the total running time at the end
# functionName:          the function to apply to each element
# ...:                   additional arguments forwarded to functionName
.execInParallelGen <- function(nCores, returnAsList = TRUE, listNames = NULL, iteration, abortIfErrorParallel = TRUE, verbose = FALSE, functionName, ...) {

    start.time <- Sys.time()

    assertInt(nCores, lower = 1)
    assertFlag(returnAsList)
    assertFunction(functionName)
    assertVector(iteration, any.missing = FALSE, min.len = 1)
    assert(checkNull(listNames), checkCharacter(listNames, len = length(iteration)))

    res.l = list()

    if (nCores > 1) {

        # The tryCatch wrapper here had no handlers left (they were commented
        # out) and was therefore a no-op; call bplapply directly. Partial
        # failures are detected below via bpok().
        res.l = bplapply(iteration, functionName, ..., BPPARAM = .initBiocParallel(nCores))

        failedTasks = which(!bpok(res.l))
        if (length(failedTasks) > 0) {
            warning("At least one task failed while executing in parallel, attempting to rerun those that failed: ",res.l[[failedTasks[1]]])
            if (abortIfErrorParallel) stop()

            # Retry only the failed tasks (BPREDO); on a second failure fall
            # back to sequential execution
            res.l = tryCatch( {
                bplapply(iteration, functionName, ..., BPPARAM = .initBiocParallel(nCores), BPREDO = res.l)

            }, error = function(e) {
                # BUGFIX: message previously read "occured ... only only one CPU"
                warning("Once again, an error occurred while executing the function with multiple CPUs. Trying again using only one CPU...")
                if (abortIfErrorParallel) stop()
                lapply(iteration, functionName, ...)
            }
            )
        }

    } else {
        res.l = lapply(iteration, functionName, ...)
    }

    end.time <- Sys.time()

    # Clamp for the report only; the actual clamping of the worker count
    # happens inside .initBiocParallel
    if (nCores > multicoreWorkers()) {
        nCores = multicoreWorkers()
    }

    if (verbose) message(" Finished execution using ",nCores," cores. TOTAL RUNNING TIME: ", round(end.time - start.time, 1), " ", units(end.time - start.time),"\n")

    if (!returnAsList) {
        return(unlist(res.l))
    }

    if (!is.null(listNames)) {
        names(res.l) = listNames
    }

    res.l
}
|
1202 |
+ |
|
1203 |
+ |
|
1204 |
#' @importFrom Rsamtools countBam scanBam ScanBamParam BamFile
#' @importFrom BiocGenerics width
#' @import checkmate
# Estimate the genome-wide average background read count per bin for a BAM
# file, correcting for the effective (uniquely mappable) genome size.
# BUGFIX: BamFile is used below but was missing from the Rsamtools import
# list; the width() generic was not imported anywhere either.
#
# fileCur:       path to the BAM file
# par.l:         parameter list (readFlag_* settings, assemblyVersion, binSize)
# nReadsToCheck: number of reads sampled to estimate the read length
# verbose:       print progress messages
#
# Returns the expected number of reads per bin under a uniform background.
.calculateGenomeWideBackground <- function(fileCur,
                                           par.l,
                                           nReadsToCheck = 1000,
                                           verbose = TRUE) {

    if (verbose) message(" Calculate background average")

    # Read BAM file and automatically determine the read size or at least a good proxy for it
    param = ScanBamParam(flag = .constructScanBamFlags(par.l),
                         reverseComplement = par.l$readFlag_reverseComplement,
                         simpleCigar = par.l$readFlag_simpleCigar,
                         mapqFilter = par.l$readFlag_minMapQ,
                         what = "seq")

    res = scanBam(BamFile(fileCur, yieldSize = nReadsToCheck),
                  param = param)

    # Use the longest sampled read as a proxy for the read length
    readLengthBAM = max(width(res[[1]]$seq))

    # Fraction of the genome that is uniquely mappable for this read length
    effectiveGenomeSizeProp = .getUniqueMappabilityData(par.l$assemblyVersion,
                                                        readLengthBAM,
                                                        verbose = verbose)

    genomeSizeTotal = sum(.getGenomeData(par.l$assemblyVersion)$size)

    # Obtain total number of reads in BAM file.
    # Subject the reads to the same constraint as for the signal files
    nReadsTotal = countBam(fileCur, param = param)$records

    # Reads per mappable base, scaled to the effective bin width (a read
    # overlaps a bin if it starts within binSize + readLength - 1 bases)
    avgBackground = (nReadsTotal / (genomeSizeTotal * effectiveGenomeSizeProp)) *
        (par.l$binSize + readLengthBAM - 1)

    if (verbose) message(" Average background: ", avgBackground)

    avgBackground
}
|
1244 |
+ |
|
1245 |
+ |
|
1246 |
# Report the elapsed wall-clock time since startTime via message().
# startTime: a timestamp, e.g. from Sys.time()
# verbose:   if FALSE, print nothing
.printExecutionTime <- function(startTime,
                                verbose = TRUE) {

    # difftime keeps its natural unit; report value and unit separately
    elapsed <- Sys.time() - startTime
    if (verbose) {
        message(" Execution time: ", round(elapsed, 1), " ", units(elapsed))
    }
}
|
1254 |
+ |
|
1255 |
#' @importFrom utils object.size
# Print the memory footprint of an object and, when the optional pryr package
# is installed, the total memory used by the R session.
#
# object:  any R object
# verbose: if FALSE, print nothing. BUGFIX: the "pryr is not available"
#          message was previously printed even when verbose = FALSE,
#          inconsistent with every other branch.
.getMemoryProfile <- function(object,
                              verbose = TRUE) {

    if (verbose)
        message(" |Size of object: ", format(object.size(object), units = "Mb"))

    if (requireNamespace("pryr", quietly = TRUE)) {

        if (verbose) {
            message(" |Total memory used by R according to pryr::mem_used: ",
                    round(as.numeric(pryr::mem_used()) / 1024 / 1024,2), " Mb")
        }
    } else {

        # Respect the verbose flag here as well
        if (verbose)
            message(" The package pryr is not available, cannot compute memory consumption... ")
    }
}
|
1275 |
+ |
0 | 1276 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,2874 @@ |
1 |
+############################################################ |
|
2 |
+# |
|
3 |
+# DESeq2 organization of R files |
|
4 |
+# |
|
5 |
+# core ........... most of the statistical code (example call below) |
|
6 |
+# fitNbinomGLMs .. three functions for fitting NB GLMs |
|
7 |
+# methods ........ the S4 methods (estimateSizeFactors, etc.) |
|
8 |
+# AllClasses ..... class definitions and object constructors |
|
9 |
+# AllGenerics .... the generics defined in DESeq2 |
|
10 |
+# results ........ results() function and helpers |
|
11 |
+# plots .......... all plotting functions |
|
12 |
+# lfcShrink ...... log2 fold change shrinkage |
|
13 |
+# helper ......... unmix, collapseReplicates, fpkm, fpm, DESeqParallel |
|
14 |
+# expanded ....... helpers for dealing with expanded model matrices |
|
15 |
+# wrappers ....... the R wrappers for the C++ functions (mine) |
|
16 |
+# RcppExports .... the R wrappers for the C++ functions (auto) |
|
17 |
+# |
|
18 |
+# rlogTransformation ... rlog |
|
19 |
+# varianceStabilizingTransformation ... VST |
|
20 |
+# |
|
21 |
+# general outline of the internal function calls. |
|
22 |
+# note: not all of these functions are exported. |
|
23 |
+# |
|
24 |
+# DESeq |
|
25 |
+# |- estimateSizeFactors |
|
26 |
+# |- estimateSizeFactorsForMatrix |
|
27 |
+# |- estimateDispersions |
|
28 |
+# |- estimateDispersionsGeneEst |
|
29 |
+# |- fitNbinomGLMs |
|
30 |
+# |- fitBeta (C++) |
|
31 |
+# |- fitDisp (C++) |
|
32 |
+# |- estimateDispersionsFit |
|
33 |
+# |- estimateDispersionsMAP |
|
34 |
+# |- estimateDispersionPriorVar |
|
35 |
+# |- fitDisp (C++) |
|
36 |
+# |- nbinomWaldTest |
|
37 |
+# |- fitGLMsWithPrior |
|
38 |
+# |- fitNbinomGLMs |
|
39 |
+# |- fitBeta (C++) |
|
40 |
+# |- estimateBetaPriorVar |
|
41 |
+# |- fitNbinomGLMs |
|
42 |
+# |- fitBeta (C++) |
|
43 |
+# |
|
44 |
+############################################################ |
|
45 |
+ |
|
46 |
+ |
|
47 |
+#' DESeq2 package for differential analysis of count data |
|
48 |
+#' |
|
49 |
+#' The DESeq2 package is designed for normalization, |
|
50 |
+#' visualization, and differential analysis of high-dimensional |
|
51 |
+#' count data. It makes use of empirical Bayes techniques |
|
52 |
+#' to estimate priors for log fold change and dispersion, and |
|
53 |
+#' to calculate posterior estimates for these quantities. |
|
54 |
+#' |
|
55 |
+#' The main functions are: |
|
56 |
+#' |
|
57 |
+#' \itemize{ |
|
58 |
+#' \item \code{\link{DESeqDataSet}} - build the dataset, see tximeta & tximport packages for preparing input |
|
59 |
+#' \item \code{\link{DESeq}} - perform differential analysis |
|
60 |
+#' \item \code{\link{results}} - build a results table |
|
61 |
+#' \item \code{\link{lfcShrink}} - estimate shrunken LFC (posterior estimates) using apeglm & ashr packages |
|
62 |
+#' \item \code{\link{vst}} - apply variance stabilizing transformation, e.g. for PCA or sample clustering |
|
63 |
+#' \item Plots, e.g.: \code{\link{plotPCA}}, \code{\link{plotMA}}, \code{\link{plotCounts}} |
|
64 |
+#' } |
|
65 |
+#' |
|
66 |
+#' For detailed information on usage, see the package vignette, by typing |
|
67 |
+#' \code{vignette("DESeq2")}, or the workflow linked to on the first page |
|
68 |
+#' of the vignette. |
|
69 |
+#' |
|
70 |
+#' All software-related questions should be posted to the Bioconductor Support Site: |
|
71 |
+#' |
|
72 |
+#' \url{https://support.bioconductor.org} |
|
73 |
+#' |
|
74 |
+#' The code can be viewed at the GitHub repository, |
|
75 |
+#' which also lists the contributor code of conduct: |
|
76 |
+#' |
|
77 |
+#' \url{https://github.com/mikelove/DESeq2} |
|
78 |
+#' |
|
79 |
+#' @references |
|
80 |
+#' |
|
81 |
+#' Love, M.I., Huber, W., Anders, S. (2014) |
|
82 |
+#' Moderated estimation of fold change and dispersion |
|
83 |
+#' for RNA-seq data with DESeq2. Genome Biology, 15:550. |
|
84 |
+#' \url{https://doi.org/10.1186/s13059-014-0550-8} |
|
85 |
+#' |
|
86 |
+#' @author Michael Love, Wolfgang Huber, Simon Anders |
|
87 |
+#' |
|
88 |
+#' @docType package |
|
89 |
+#' @name DESeq2-package |
|
90 |
+#' @aliases DESeq2-package |
|
91 |
+#' @keywords package |
|
92 |
+NULL |
|
93 |
+ |
|
94 |
+#' Differential expression analysis based on the Negative Binomial (a.k.a. Gamma-Poisson) distribution |
|
95 |
+#' |
|
96 |
+#' This function performs a default analysis through the steps: |
|
97 |
+#' \enumerate{ |
|
98 |
+#' \item estimation of size factors: \code{\link{estimateSizeFactors}} |
|
99 |
+#' \item estimation of dispersion: \code{\link{estimateDispersions}} |
|
100 |
+#' \item Negative Binomial GLM fitting and Wald statistics: \code{\link{nbinomWaldTest}} |
|
101 |
+#' } |
|
102 |
+#' For complete details on each step, see the manual pages of the respective |
|
103 |
+#' functions. After the \code{DESeq} function returns a DESeqDataSet object, |
|
104 |
+#' results tables (log2 fold changes and p-values) can be generated |
|
105 |
+#' using the \code{\link{results}} function. |
|
106 |
+#' Shrunken LFC can then be generated using the \code{\link{lfcShrink}} function. |
|
107 |
+#' All support questions should be posted to the Bioconductor |
|
108 |
+#' support site: \url{https://support.bioconductor.org}. |
|
109 |
+#' |
|
110 |
+#' The differential expression analysis uses a generalized linear model of the form: |
|
111 |
+#' |
|
112 |
+#' \deqn{ K_{ij} \sim \textrm{NB}( \mu_{ij}, \alpha_i) }{ K_ij ~ NB(mu_ij, alpha_i) } |
|
113 |
+#' \deqn{ \mu_{ij} = s_j q_{ij} }{ mu_ij = s_j q_ij } |
|
114 |
+#' \deqn{ \log_2(q_{ij}) = x_{j.} \beta_i }{ log2(q_ij) = x_j. beta_i } |
|
115 |
+#' |
|
116 |
+#' where counts \eqn{K_{ij}}{K_ij} for gene i, sample j are modeled using |
|
117 |
+#' a Negative Binomial distribution with fitted mean \eqn{\mu_{ij}}{mu_ij} |
|
118 |
+#' and a gene-specific dispersion parameter \eqn{\alpha_i}{alpha_i}. |
|
119 |
+#' The fitted mean is composed of a sample-specific size factor |
|
120 |
+#' \eqn{s_j}{s_j} and a parameter \eqn{q_{ij}}{q_ij} proportional to the |
|
121 |
+#' expected true concentration of fragments for sample j. |
|
122 |
+#' The coefficients \eqn{\beta_i}{beta_i} give the log2 fold changes for gene i for each |
|
123 |
+#' column of the model matrix \eqn{X}{X}. |
|
124 |
+#' The sample-specific size factors can be replaced by |
|
125 |
+#' gene-specific normalization factors for each sample using |
|
126 |
+#' \code{\link{normalizationFactors}}. |
|
127 |
+#' |
|
128 |
+#' For details on the fitting of the log2 fold changes and calculation of p-values, |
|
129 |
+#' see \code{\link{nbinomWaldTest}} if using \code{test="Wald"}, |
|
130 |
+#' or \code{\link{nbinomLRT}} if using \code{test="LRT"}. |
|
131 |
+#' |
|
132 |
+#' Experiments without replicates do not allow for estimation of the dispersion |
|
133 |
+#' of counts around the expected value for each group, which is critical for |
|
134 |
+#' differential expression analysis. Analysis without replicates was deprecated |
|
135 |
+#' in v1.20 and is no longer supported since v1.22. |
|
136 |
+#' |
|
137 |
+#' The argument \code{minReplicatesForReplace} is used to decide which samples |
|
138 |
+#' are eligible for automatic replacement in the case of extreme Cook's distance. |
|
139 |
+#' By default, \code{DESeq} will replace outliers if the Cook's distance is |
|
140 |
+#' large for a sample which has 7 or more replicates (including itself). |
|
141 |
+#' This replacement is performed by the \code{\link{replaceOutliers}} |
|
142 |
+#' function. This default behavior helps to prevent filtering genes |
|
143 |
+#' based on Cook's distance when there are many degrees of freedom. |
|
144 |
+#' See \code{\link{results}} for more information about filtering using |
|
145 |
+#' Cook's distance, and the 'Dealing with outliers' section of the vignette. |
|
146 |
+#' Unlike the behavior of \code{\link{replaceOutliers}}, here original counts are |
|
147 |
+#' kept in the matrix returned by \code{\link{counts}}, original Cook's |
|
148 |
+#' distances are kept in \code{assays(dds)[["cooks"]]}, and the replacement |
|
149 |
+#' counts used for fitting are kept in \code{assays(dds)[["replaceCounts"]]}. |
|
150 |
+#' |
|
151 |
+#' Note that if a log2 fold change prior is used (betaPrior=TRUE) |
|
152 |
+#' then expanded model matrices will be used in fitting. These are |
|
153 |
+#' described in \code{\link{nbinomWaldTest}} and in the vignette. The |
|
154 |
+#' \code{contrast} argument of \code{\link{results}} should be used for |
|
155 |
+#' generating results tables. |
|
156 |
+#' |
|
157 |
+#' @return a \code{\link{DESeqDataSet}} object with results stored as |
|
158 |
+#' metadata columns. These results should be accessed by calling the \code{\link{results}} |
|
159 |
+#' function. By default this will return the log2 fold changes and p-values for the last |
|
160 |
+#' variable in the design formula. See \code{\link{results}} for how to access results |
|
161 |
+#' for other variables. |
|
162 |
+#' |
|
163 |
+#' @param object a DESeqDataSet object, see the constructor functions |
|
164 |
+#' \code{\link{DESeqDataSet}}, |
|
165 |
+#' \code{\link{DESeqDataSetFromMatrix}}, |
|
166 |
+#' \code{\link{DESeqDataSetFromHTSeqCount}}. |
|
167 |
+#' @param test either "Wald" or "LRT", which will then use either |
|
168 |
+#' Wald significance tests (defined by \code{\link{nbinomWaldTest}}), |
|
169 |
+#' or the likelihood ratio test on the difference in deviance between a |
|
170 |
+#' full and reduced model formula (defined by \code{\link{nbinomLRT}}) |
|
171 |
+#' @param fitType either "parametric", "local", "mean", or "glmGamPoi" |
|
172 |
+#' for the type of fitting of dispersions to the mean intensity. |
|
173 |
+#' See \code{\link{estimateDispersions}} for description. |
|
174 |
+#' @param sfType either "ratio", "poscounts", or "iterate" |
|
175 |
+#' for the type of size factor estimation. See |
|
176 |
+#' \code{\link{estimateSizeFactors}} for description. |
|
177 |
+#' @param betaPrior whether or not to put a zero-mean normal prior on |
|
178 |
+#' the non-intercept coefficients |
|
179 |
+#' See \code{\link{nbinomWaldTest}} for description of the calculation |
|
180 |
+#' of the beta prior. In versions \code{>=1.16}, the default is set |
|
181 |
+#' to \code{FALSE}, and shrunken LFCs are obtained afterwards using |
|
182 |
+#' \code{\link{lfcShrink}}. |
|
183 |
+#' @param full for \code{test="LRT"}, the full model formula, |
|
184 |
+#' which is restricted to the formula in \code{design(object)}. |
|
185 |
+#' alternatively, it can be a model matrix constructed by the user. |
|
186 |
+#' advanced use: specifying a model matrix for full and \code{test="Wald"} |
|
187 |
+#' is possible if \code{betaPrior=FALSE} |
|
188 |
+#' @param reduced for \code{test="LRT"}, a reduced formula to compare against, |
|
189 |
+#' i.e., the full formula with the term(s) of interest removed. |
|
190 |
+#' alternatively, it can be a model matrix constructed by the user |
|
191 |
+#' @param quiet whether to print messages at each step |
|
192 |
+#' @param minReplicatesForReplace the minimum number of replicates required |
|
193 |
+#' in order to use \code{\link{replaceOutliers}} on a |
|
194 |
+#' sample. If there are samples with so many replicates, the model will |
|
195 |
+#' be refit after replacing these outliers, flagged by Cook's distance. |
|
196 |
+#' Set to \code{Inf} in order to never replace outliers. |
|
197 |
+#' @param modelMatrixType either "standard" or "expanded", which describe |
|
198 |
+#' how the model matrix, X of the GLM formula is formed. |
|
199 |
+#' "standard" is as created by \code{model.matrix} using the |
|
200 |
+#' design formula. "expanded" includes an indicator variable for each |
|
201 |
+#' level of factors in addition to an intercept. for more information |
|
202 |
+#' see the Description of \code{\link{nbinomWaldTest}}. |
|
203 |
+#' betaPrior must be set to TRUE in order for expanded model matrices |
|
204 |
+#' to be fit. |
|
205 |
+#' @param useT logical, passed to \code{\link{nbinomWaldTest}}, default is FALSE, |
|
206 |
+#' where Wald statistics are assumed to follow a standard Normal |
|
207 |
+#' @param minmu lower bound on the estimated count for fitting gene-wise dispersion |
|
208 |
+#' and for use with \code{nbinomWaldTest} and \code{nbinomLRT}. |
|
209 |
+#' If \code{fitType="glmGamPoi"}, then 1e-6 will be used |
|
210 |
+#' (as this fitType is optimized for single cell data, where a lower |
|
211 |
+#' minmu is recommended), otherwise the default value |
|
212 |
+#' as evaluated on bulk datasets is 0.5 |
|
213 |
+#' @param parallel if FALSE, no parallelization. if TRUE, parallel |
|
214 |
+#' execution using \code{BiocParallel}, see next argument \code{BPPARAM}. |
|
215 |
+#' A note on running in parallel using \code{BiocParallel}: it may be |
|
216 |
+#' advantageous to remove large, unneeded objects from your current |
|
217 |
+#' R environment before calling \code{DESeq}, |
|
218 |