git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/msa@127901 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -1,8 +1,8 @@ |
1 | 1 |
Package: msa |
2 | 2 |
Type: Package |
3 | 3 |
Title: Multiple Sequence Alignment |
4 |
-Version: 1.7.0 |
|
5 |
-Date: 2016-08-09 |
|
4 |
+Version: 1.7.1 |
|
5 |
+Date: 2017-03-31 |
|
6 | 6 |
Author: Enrico Bonatesta, Christoph Horejs-Kainrath, Ulrich Bodenhofer |
7 | 7 |
Maintainer: Ulrich Bodenhofer <bodenhofer@bioinf.jku.at> |
8 | 8 |
Description: The 'msa' package provides a unified R/Bioconductor interface to |
... | ... |
@@ -16,7 +16,7 @@ Description: The 'msa' package provides a unified R/Bioconductor interface to |
16 | 16 |
URL: http://www.bioinf.jku.at/software/msa/ |
17 | 17 |
License: GPL (>= 2) |
18 | 18 |
Copyright: See file inst/COPYRIGHT |
19 |
-Depends: R (>= 3.1.0), methods, Biostrings (>= 2.30.0) |
|
19 |
+Depends: R (>= 3.1.0), methods, Biostrings (>= 2.40.0) |
|
20 | 20 |
Imports: Rcpp (>= 0.11.1), BiocGenerics, IRanges (>= 1.20.0), |
21 | 21 |
S4Vectors, tools |
22 | 22 |
Suggests: Biobase, knitr, seqinr, ape, phangorn |
... | ... |
@@ -28,6 +28,7 @@ Collate: AllClasses.R AllGenerics.R params-methods.R version-methods.R |
28 | 28 |
helperFunctions.R inputChecks.R convertRows.R msaPrettyPrint.R |
29 | 29 |
print-methods.R show-methods.R msa.R msaMuscle.R msaClustalW.R |
30 | 30 |
msaClustalOmega.R msaConvert.R msaCheckNames.R |
31 |
+ msaConsensusSequence-methods.R msaConservationScore-methods.R |
|
31 | 32 |
biocViews: MultipleSequenceAlignment, Alignment, MultipleComparison, |
32 | 33 |
Sequencing |
33 | 34 |
NeedsCompilation: yes |
... | ... |
@@ -14,5 +14,5 @@ export(msa, msaMuscle, msaClustalW, msaClustalOmega, msaPrettyPrint, |
14 | 14 |
|
15 | 15 |
exportClasses(MsaDNAMultipleAlignment, MsaRNAMultipleAlignment, |
16 | 16 |
MsaAAMultipleAlignment, MsaMetaData) |
17 |
-exportMethods(params, version, show, print) |
|
18 |
- |
|
17 |
+exportMethods(params, version, show, print, msaConservationScore, |
|
18 |
+ msaConsensusSequence) |
... | ... |
@@ -1 +1,7 @@ |
1 | 1 |
setGeneric("version", function(object) standardGeneric("version")) |
2 |
+ |
|
3 |
+setGeneric("msaConsensusSequence", |
|
4 |
+ function(x, ...) standardGeneric("msaConsensusSequence")) |
|
5 |
+ |
|
6 |
+setGeneric("msaConservationScore", |
|
7 |
+ function(x, ...) standardGeneric("msaConservationScore")) |
2 | 8 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,122 @@ |
1 |
+msaConsensusSequence.matrix <- function(x, type=c("Biostrings", "upperlower"), |
|
2 |
+ thresh=c(80, 20), ignoreGaps=FALSE, |
|
3 |
+ ...) |
|
4 |
+{ |
|
5 |
+ type <- match.arg(type) |
|
6 |
+ |
|
7 |
+ if (is.null(rownames(x)) || |
|
8 |
+ any(!(rownames(x) %in% c(LETTERS, "-", "+", ".")))) |
|
9 |
+ stop("consensus matrix 'x' is not in proper format") |
|
10 |
+ |
|
11 |
+ sel <- match(c("+", "."), rownames(x)) |
|
12 |
+ sel <- sel[which(!is.na(sel))] |
|
13 |
+ if (length(sel) > 0) |
|
14 |
+ x <- x[-sel, ] |
|
15 |
+ |
|
16 |
+ if (!is.numeric(thresh) || length(thresh) !=2 || |
|
17 |
+ thresh[1] > 100 || thresh[2] < 0 || thresh[1] < thresh[2]) |
|
18 |
+ stop("'thresh' must be a decreasing sequence of two numbers ", |
|
19 |
+ "between 0 and 100") |
|
20 |
+ |
|
21 |
+ thresh <- thresh / 100 |
|
22 |
+ |
|
23 |
+ if (type == "Biostrings") |
|
24 |
+ { |
|
25 |
+ cs <- colSums(x) |
|
26 |
+ |
|
27 |
+ if (any(is.na(cs))) |
|
28 |
+ { |
|
29 |
+ res <- rep.int("#", ncol(x)) |
|
30 |
+ |
|
31 |
+ sel <- which(!is.na(cs)) |
|
32 |
+ |
|
33 |
+ if (length(sel) > 0) |
|
34 |
+ { |
|
35 |
+ sstr <- consensusString(x[, sel, drop=FALSE], ...) |
|
36 |
+ res[sel] <- unlist(strsplit(sstr, "")) |
|
37 |
+ } |
|
38 |
+ |
|
39 |
+ out <- paste(res, collapse="") |
|
40 |
+ } |
|
41 |
+ else |
|
42 |
+ out <- consensusString(x, ...) |
|
43 |
+ } |
|
44 |
+ else |
|
45 |
+ { |
|
46 |
+ if (ignoreGaps) |
|
47 |
+ { |
|
48 |
+ sel <- match("-", rownames(x)) |
|
49 |
+ |
|
50 |
+ if (!is.na(sel)) |
|
51 |
+ sel <- (1:nrow(x))[-sel] |
|
52 |
+ else |
|
53 |
+ sel <- (1:nrow(x)) |
|
54 |
+ |
|
55 |
+ perColFunc <- function(y) |
|
56 |
+ { |
|
57 |
+ if (any(is.na(y))) |
|
58 |
+ char <- "#" |
|
59 |
+ else |
|
60 |
+ { |
|
61 |
+ y <- y[sel] / length(sel) |
|
62 |
+ |
|
63 |
+ maxw <-which.max(y[sel]) |
|
64 |
+ maxi <- y[sel[maxw]] |
|
65 |
+ |
|
66 |
+ char <- rownames(x)[sel[maxw]] |
|
67 |
+ |
|
68 |
+ if (maxi < thresh[1]) |
|
69 |
+ { |
|
70 |
+ if (maxi >= thresh[2]) |
|
71 |
+ char <- tolower(char) |
|
72 |
+ else |
|
73 |
+ char <- "." |
|
74 |
+ } |
|
75 |
+ } |
|
76 |
+ |
|
77 |
+ char |
|
78 |
+ } |
|
79 |
+ } |
|
80 |
+ else |
|
81 |
+ { |
|
82 |
+ perColFunc <- function(y) |
|
83 |
+ { |
|
84 |
+ if (any(is.na(y))) |
|
85 |
+ char <- "#" |
|
86 |
+ else |
|
87 |
+ { |
|
88 |
+ y <- y / length(y) |
|
89 |
+ |
|
90 |
+ maxw <- which.max(y) |
|
91 |
+ maxi <- y[maxw] |
|
92 |
+ |
|
93 |
+ char <- rownames(x)[maxw] |
|
94 |
+ |
|
95 |
+ if (maxi < thresh[1]) |
|
96 |
+ { |
|
97 |
+ if (maxi >= thresh[2]) |
|
98 |
+ char <- tolower(char) |
|
99 |
+ else |
|
100 |
+ char <- "." |
|
101 |
+ } |
|
102 |
+ } |
|
103 |
+ |
|
104 |
+ char |
|
105 |
+ } |
|
106 |
+ } |
|
107 |
+ |
|
108 |
+ out <- paste(apply(x, 2, perColFunc), collapse="") |
|
109 |
+ } |
|
110 |
+ |
|
111 |
+ out |
|
112 |
+} |
|
113 |
+ |
|
114 |
+setMethod("msaConsensusSequence", signature("matrix"), msaConsensusSequence.matrix) |
|
115 |
+ |
|
116 |
+ |
|
117 |
+setMethod("msaConsensusSequence", signature("MultipleAlignment"), |
|
118 |
+ function(x, ...) |
|
119 |
+ { |
|
120 |
+ mat <- consensusMatrix(x) |
|
121 |
+ msaConsensusSequence(mat, ...) |
|
122 |
+ }) |
0 | 123 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,60 @@ |
1 |
+msaConservationScore.matrix <- function(x, substitutionMatrix, gapVsGap=NULL, |
|
2 |
+ ...) |
|
3 |
+{ |
|
4 |
+ if (!is.matrix(substitutionMatrix) || |
|
5 |
+ nrow(substitutionMatrix) != ncol(substitutionMatrix) || |
|
6 |
+ any(!(rownames(substitutionMatrix) %in% c(LETTERS, "-", "*"))) || |
|
7 |
+ any(!(colnames(substitutionMatrix) %in% c(LETTERS, "-", "*"))) || |
|
8 |
+ any(rownames(substitutionMatrix) != colnames(substitutionMatrix))) |
|
9 |
+ stop("substitution matrix is not in proper format") |
|
10 |
+ |
|
11 |
+ if (is.null(rownames(x)) || |
|
12 |
+ any(!(rownames(x) %in% c(LETTERS, "-", "+", ".")))) |
|
13 |
+ stop("consensus matrix 'x' is not in proper format") |
|
14 |
+ |
|
15 |
+ sel <- match(c("+", "."), rownames(x)) |
|
16 |
+ sel <- sel[which(!is.na(sel))] |
|
17 |
+ if (length(sel) > 0) |
|
18 |
+ x <- x[-sel, ] |
|
19 |
+ |
|
20 |
+ sel <- match(c("*", "-"), rownames(substitutionMatrix)) |
|
21 |
+ |
|
22 |
+ if (is.na(sel[2]) && !is.na(sel[1])) |
|
23 |
+ { |
|
24 |
+ rownames(substitutionMatrix)[sel[1]] <- "-" |
|
25 |
+ colnames(substitutionMatrix)[sel[1]] <- "-" |
|
26 |
+ } |
|
27 |
+ |
|
28 |
+ sel <- match(rownames(x), rownames(substitutionMatrix)) |
|
29 |
+ |
|
30 |
+ if (any(is.na(sel))) |
|
31 |
+ stop("some letters occurring in alignment 'x' are missing ", |
|
32 |
+ "in substitution matrix") |
|
33 |
+ |
|
34 |
+ substitutionMatrix <- substitutionMatrix[sel, sel] |
|
35 |
+ |
|
36 |
+ if (!is.null(gapVsGap)) |
|
37 |
+ { |
|
38 |
+ if (!is.numeric(gapVsGap) || length(gapVsGap) != 1) |
|
39 |
+ stop("'gapVsGap' must be NULL or a single number") |
|
40 |
+ |
|
41 |
+ substitutionMatrix["-", "-"] <- gapVsGap |
|
42 |
+ } |
|
43 |
+ |
|
44 |
+ out <- drop(apply(x, 2, function(y) crossprod(y, substitutionMatrix %*% y))) |
|
45 |
+ |
|
46 |
+ names(out) <- unlist(strsplit(msaConsensusSequence(x, ...), "")) |
|
47 |
+ |
|
48 |
+ out |
|
49 |
+} |
|
50 |
+ |
|
51 |
+setMethod("msaConservationScore", signature("matrix"), |
|
52 |
+ msaConservationScore.matrix) |
|
53 |
+ |
|
54 |
+ |
|
55 |
+setMethod("msaConservationScore", signature("MultipleAlignment"), |
|
56 |
+ function(x, ...) |
|
57 |
+ { |
|
58 |
+ mat <- consensusMatrix(x) |
|
59 |
+ msaConservationScore.matrix(mat, ...) |
|
60 |
+ }) |
... | ... |
@@ -2,7 +2,8 @@ msaConvert <- function(x, type=c("seqinr::alignment", |
2 | 2 |
"bios2mds::align", |
3 | 3 |
"ape::AAbin", |
4 | 4 |
"ape::DNAbin", |
5 |
- "phangorn::phyDat")) |
|
5 |
+ "phangorn::phyDat", |
|
6 |
+ "bio3d::fasta")) |
|
6 | 7 |
{ |
7 | 8 |
type <- match.arg(type) |
8 | 9 |
|
... | ... |
@@ -57,6 +58,14 @@ msaConvert <- function(x, type=c("seqinr::alignment", |
57 | 58 |
else |
58 | 59 |
stop("conversion to 'phyDat' requires package 'phangorn'") |
59 | 60 |
} |
61 |
+ else if (type == "bio3d::fasta") |
|
62 |
+ { |
|
63 |
+ out <- list(id=names(xn), |
|
64 |
+ ali=.Call("SplitCharVector2Matrix", xn, "-"), |
|
65 |
+ call=x@call) |
|
66 |
+ rownames(out$ali) <- out$id |
|
67 |
+ class(out) <- "fasta" |
|
68 |
+ } |
|
60 | 69 |
|
61 | 70 |
out |
62 | 71 |
} |
... | ... |
@@ -79,7 +79,7 @@ print.MsaMultipleAlignmentChunk <- function(str, names=NULL, halfNrow=9, pos="", |
79 | 79 |
|
80 | 80 |
print.MsaMultipleAlignment <- function(x, show=c("alignment", "version", "call"), |
81 | 81 |
showNames=TRUE, showConsensus=TRUE, |
82 |
- halfNrow=9, nameWidth=20) |
|
82 |
+ halfNrow=9, nameWidth=20, ...) |
|
83 | 83 |
{ |
84 | 84 |
show <- match.arg(show, |
85 | 85 |
choices=c("alignment", "complete", "version", "call", |
... | ... |
@@ -154,7 +154,7 @@ print.MsaMultipleAlignment <- function(x, show=c("alignment", "version", "call") |
154 | 154 |
|
155 | 155 |
if (showConsensus) |
156 | 156 |
{ |
157 |
- cons <- consensusString(consensusMatrix(unmasked(x))) |
|
157 |
+ cons <- msaConsensusSequence(x, ...) |
|
158 | 158 |
strings <- c(strings, cons) |
159 | 159 |
|
160 | 160 |
if (length(names(strings)) > 0) |
... | ... |
@@ -1,6 +1,25 @@ |
1 | 1 |
Change history of package msa: |
2 | 2 |
============================== |
3 | 3 |
|
4 |
+Version 1.7.1: |
|
5 |
+- additional conversions implemented for msaConvert() function |
|
6 |
+- added a new method msaConsensusSequence() that extends the |
|
7 |
+ functionality provided by Biostring's consensusString() method |
|
8 |
+- added a new method msaConservationScore() |
|
9 |
+- print() method extended such that it now also allows for |
|
10 |
+ customization of the consensus sequence (via the new |
|
11 |
+ msaConsensusSequence() method) |
|
12 |
+- package now depends on Biostrings version >= 2.40.0 in order |
|
13 |
+ to make sure that consensusMatrix() also works correctly |
|
14 |
+ for masked alignments |
|
15 |
+- corresponding changes in documentation and vignette |
|
16 |
+ |
|
17 |
+Version 1.7.0: |
|
18 |
+- new branch for Bioconductor 3.5 devel |
|
19 |
+ |
|
20 |
+Version 1.6.0: |
|
21 |
+- release as part of Bioconductor 3.4 |
|
22 |
+ |
|
4 | 23 |
Version 1.5.5: |
5 | 24 |
- fixes in ClustalOmega source code to ensure Windows compatibility of |
6 | 25 |
GCC6 compatibility fix |
... | ... |
@@ -46,7 +46,7 @@ |
46 | 46 |
\section{Methods}{ |
47 | 47 |
\describe{ |
48 | 48 |
\item{\code{print(x, show=c("alignment", "version", "call"), |
49 |
- showNames=TRUE, showConsensus=TRUE, halfNrow=9, nameWidth=20)}:}{ |
|
49 |
+ showNames=TRUE, showConsensus=TRUE, halfNrow=9, nameWidth=20, ...)}:}{ |
|
50 | 50 |
prints information about the object \code{x}; the \code{show} |
51 | 51 |
argument allows for determining what should be printed. |
52 | 52 |
The \code{show} must be a character vector and may contain any |
... | ... |
@@ -95,6 +95,9 @@ |
95 | 95 |
where the masked sequences/positions are shown as hash marks. |
96 | 96 |
However, the consensus sequences are computed from the |
97 | 97 |
complete, unmasked alignment and displayed as such. |
98 |
+ Additional arguments are passed on to |
|
99 |
+ \code{\link{msaConsensusSequence}} for customizing how the |
|
100 |
+ consensus sequence is computed. |
|
98 | 101 |
} |
99 | 102 |
\item{\code{show(object)}:}{displays the alignment along with |
100 | 103 |
metadata; synonymous to calling \code{print} with default |
... | ... |
@@ -138,12 +141,14 @@ show(myAlignment) |
138 | 141 |
## print the results |
139 | 142 |
print(myAlignment, show="alignment") |
140 | 143 |
print(myAlignment, show="alignment", showConsensus=FALSE) |
141 |
-print(myAlignment, show="complete") |
|
142 | 144 |
print(myAlignment, show=c("alignment", "version")) |
143 | 145 |
print(myAlignment, show="standardParams") |
144 | 146 |
print(myAlignment, show="algParams") |
145 | 147 |
print(myAlignment, show=c("call", "version")) |
146 | 148 |
|
149 |
+## print results with custom consensus sequence |
|
150 |
+print(myAlignment, show="complete", type="upperlower", thresh=c(50, 20)) |
|
151 |
+ |
|
147 | 152 |
## show the params |
148 | 153 |
params(myAlignment) |
149 | 154 |
} |
150 | 155 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,120 @@ |
1 |
+\name{msaConsensusSequence} |
|
2 |
+\alias{msaConsensusSequence} |
|
3 |
+\alias{msaConsensusSequence,MultipleAlignment-method} |
|
4 |
+\alias{msaConsensusSequence,matrix-method} |
|
5 |
+\title{Computation of Consensus Sequence from Multiple Alignment} |
|
6 |
+\description{ |
|
7 |
+ This method computes a consensus sequence from a multiple alignment |
|
8 |
+ or a previously computed consensus matrix. Currently, two different |
|
9 |
+ ways of these computations are available. |
|
10 |
+} |
|
11 |
+\usage{ |
|
12 |
+\S4method{msaConsensusSequence}{matrix}(x, type=c("Biostrings", "upperlower"), |
|
13 |
+ thresh=c(80, 20), ignoreGaps=FALSE, ...) |
|
14 |
+\S4method{msaConsensusSequence}{MultipleAlignment}(x, ...) |
|
15 |
+} |
|
16 |
+\arguments{ |
|
17 |
+ \item{x}{an object of class \code{\linkS4class{MultipleAlignment}} |
|
18 |
+ (which includes objects of classes |
|
19 |
+ \code{\linkS4class{MsaAAMultipleAlignment}}, |
|
20 |
+ \code{\linkS4class{MsaDNAMultipleAlignment}}, and |
|
21 |
+ \code{\linkS4class{MsaRNAMultipleAlignment}}) or a previously |
|
22 |
+ computed consensus matrix (see details below).} |
|
23 |
+ \item{type}{a character string specifying how to compute the consensus |
|
24 |
+ sequence. Currently, types \code{"Biostrings"} and |
|
25 |
+ \code{"upperlower"} are available (see details below).} |
|
26 |
+ \item{thresh}{a decreasing two-element numeric vector of numbers |
|
27 |
+ between 0 and 100 corresponding to the two conservation thresholds. |
|
28 |
+ Only relevant for \code{type="upperlower"} (see details below), |
|
29 |
+ otherwise ignored.} |
|
30 |
+ \item{ignoreGaps}{a logical (default: \code{FALSE}) |
|
31 |
+ indicating whether gaps should be |
|
32 |
+ considered when computing the consensus sequence. |
|
33 |
+ Only relevant for \code{type="upperlower"} (see details below), |
|
34 |
+ otherwise ignored.} |
|
35 |
+ \item{...}{when the method is called for a |
|
36 |
+ \code{\linkS4class{MultipleAlignment}} object, the consensus matrix is |
|
37 |
+ computed and, including all further arguments, passed |
|
38 |
+ passed on to the \code{msaConsensusSequence} method with signature |
|
39 |
+ \code{matrix}. The method with signature \code{matrix} forwards |
|
40 |
+ additional arguments to the \code{\link{consensusString}} method |
|
41 |
+ from the \pkg{Biostrings} package if \code{type="Biostrings"}.} |
|
42 |
+ } |
|
43 |
+\details{ |
|
44 |
+ The method takes a \code{\linkS4class{MultipleAlignment}} object or a |
|
45 |
+ previously computed consensus matrix and computes a consensus |
|
46 |
+ sequence. For \code{type="Biostrings"}, the method |
|
47 |
+ \code{\link{consensusString}} from the \pkg{Biostrings} package is |
|
48 |
+ used to compute the consensus sequence. For \code{type="upperlower"}, |
|
49 |
+ two thresholds (argument \code{thresh}, see above) are used to |
|
50 |
+ compute the consensus sequence: |
|
51 |
+ \itemize{ |
|
52 |
+ \item{If the relative frequency of the most frequent letter at a |
|
53 |
+ given position is at least as large as the first threshold (default: |
|
54 |
+ 80\%), then this most frequent letter is used for the consensus |
|
55 |
+ sequence at this position as it is.} |
|
56 |
+ \item{If the relative frequency of the most frequent letter at a |
|
57 |
+ given position is smaller than the first threshold, but at least |
|
58 |
+ as large as the second threshold (default: 20\%), then this |
|
59 |
+ most frequent letter is used for the consensus sequence at this |
|
60 |
+ position, but converted to lower case.} |
|
61 |
+ \item{If the relative frequency of the most frequent letter in a |
|
62 |
+ column is even smaller than the second threshold, then a dot is |
|
63 |
+ used for the consensus sequence at this position.} |
|
64 |
+ \item{If \code{ignoreGaps=FALSE} (which is the default), |
|
65 |
+ gaps are treated like all other |
|
66 |
+ letters except for the fact that obviously no lowercase conversion |
|
67 |
+ takes place in case that the relative frequency is between the |
|
68 |
+ two thresholds. If \code{ignoreGaps=TRUE}, gaps are ignored |
|
69 |
+ completely, and the consensus sequence is computed from the |
|
70 |
+ non-gap letters only.} |
|
71 |
+ } |
|
72 |
+ |
|
73 |
+ If the consensus matrix of a multiple alignment of nucleotide |
|
74 |
+ sequences contains rows labeled \sQuote{+} and/or \sQuote{.}, these |
|
75 |
+ rows are ignored. |
|
76 |
+} |
|
77 |
+\value{ |
|
78 |
+ The function returns a character string with the consensus sequence. |
|
79 |
+} |
|
80 |
+\author{Ulrich Bodenhofer <msa@bioinf.jku.at> |
|
81 |
+} |
|
82 |
+\references{ |
|
83 |
+ \url{http://www.bioinf.jku.at/software/msa} |
|
84 |
+ |
|
85 |
+ U. Bodenhofer, E. Bonatesta, C. Horejs-Kainrath, and S. Hochreiter |
|
86 |
+ (2015). msa: an R package for multiple sequence alignment. |
|
87 |
+ \emph{Bioinformatics} \bold{31}(24):3997-3999. DOI: |
|
88 |
+ \href{http://dx.doi.org/10.1093/bioinformatics/btv494}{10.1093/bioinformatics/btv494}. |
|
89 |
+} |
|
90 |
+\seealso{\code{\link{msa}}, \code{\linkS4class{MsaAAMultipleAlignment}}, |
|
91 |
+ \code{\linkS4class{MsaDNAMultipleAlignment}}, |
|
92 |
+ \code{\linkS4class{MsaRNAMultipleAlignment}}, |
|
93 |
+ \code{\linkS4class{MsaMetaData}}, |
|
94 |
+ \code{\linkS4class{MultipleAlignment}}, |
|
95 |
+ \code{\link{consensusString}} |
|
96 |
+} |
|
97 |
+\examples{ |
|
98 |
+## read sequences |
|
99 |
+filepath <- system.file("examples", "HemoglobinAA.fasta", package="msa") |
|
100 |
+mySeqs <- readAAStringSet(filepath) |
|
101 |
+ |
|
102 |
+## perform multiple alignment |
|
103 |
+myAlignment <- msa(mySeqs) |
|
104 |
+ |
|
105 |
+## regular consensus sequence using consensusString() method from the |
|
106 |
+## 'Biostrings' package |
|
107 |
+msaConsensusSequence(myAlignment) |
|
108 |
+ |
|
109 |
+## use the other method |
|
110 |
+msaConsensusSequence(myAlignment, type="upperlower") |
|
111 |
+ |
|
112 |
+## use the other method with custom parameters |
|
113 |
+msaConsensusSequence(myAlignment, type="upperlower", thresh=c(50, 20), |
|
114 |
+ ignoreGaps=TRUE) |
|
115 |
+ |
|
116 |
+## compute a consensus matrix first |
|
117 |
+conMat <- consensusMatrix(myAlignment) |
|
118 |
+msaConsensusSequence(conMat) |
|
119 |
+} |
|
120 |
+\keyword{manip} |
0 | 121 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,139 @@ |
1 |
+\name{msaConservationScore} |
|
2 |
+\alias{msaConservationScore} |
|
3 |
+\alias{msaConservationScore,MultipleAlignment-method} |
|
4 |
+\alias{msaConservationScore,matrix-method} |
|
5 |
+\title{Computation of Conservation Scores from Multiple Alignment} |
|
6 |
+\description{ |
|
7 |
+ This method computes a vector of conservation scores from a |
|
8 |
+ multiple alignment or a previously computed consensus matrix. |
|
9 |
+} |
|
10 |
+\usage{ |
|
11 |
+\S4method{msaConservationScore}{matrix}(x, substitutionMatrix, gapVsGap=NULL, |
|
12 |
+ ...) |
|
13 |
+\S4method{msaConservationScore}{MultipleAlignment}(x, ...) |
|
14 |
+} |
|
15 |
+\arguments{ |
|
16 |
+ \item{x}{an object of class \code{\linkS4class{MultipleAlignment}} |
|
17 |
+ (which includes objects of classes |
|
18 |
+ \code{\linkS4class{MsaAAMultipleAlignment}}, |
|
19 |
+ \code{\linkS4class{MsaDNAMultipleAlignment}}, and |
|
20 |
+ \code{\linkS4class{MsaRNAMultipleAlignment}}) or a previously |
|
21 |
+ computed consensus matrix (see details below).} |
|
22 |
+ \item{substitutionMatrix}{substitution matrix (see |
|
23 |
+ details below).} |
|
24 |
+ \item{gapVsGap}{score to use for aligning gaps versus gaps (see |
|
25 |
+ details below).} |
|
26 |
+ \item{...}{when the method is called for a |
|
27 |
+ \code{\linkS4class{MultipleAlignment}} object, the consensus matrix is |
|
28 |
+ computed and, including all further arguments, |
|
29 |
+ passed on to the \code{msaConservationScore} method with signature |
|
30 |
+ \code{matrix}. This method passes all further arguments on to the |
|
31 |
+ \code{\link{msaConsensusSequence}} method to customize the way |
|
32 |
+ the consensus sequence is computed.} |
|
33 |
+ } |
|
34 |
+\details{ |
|
35 |
+ The method takes a \code{\linkS4class{MultipleAlignment}} object or a |
|
36 |
+ previously computed consensus matrix and computes the sum of pairwise |
|
37 |
+ scores for all positions of the alignment. For computing these scores, |
|
38 |
+ it is compulsory to specify a substitution/scoring matrix. This matrix |
|
39 |
+ must be provided as a \code{\link{matrix}} object. This can either be |
|
40 |
+ one of the ready-made matrices provided by the \pkg{Biostrings} |
|
41 |
+ package (e.g. \code{\link{BLOSUM62}}) or any other hand-crafted |
|
42 |
+ matrix. In the latter case, the following restrictions apply: |
|
43 |
+ \itemize{ |
|
44 |
+ \item{The matrix must be quadratic.} |
|
45 |
+ \item{For reasonable results, the matrix should be symmetric (note |
|
46 |
+ that this is not checked).} |
|
47 |
+ \item{Rows and columns must be named and the order of letters/symbols |
|
48 |
+ in row names and column names must be identical.} |
|
49 |
+ \item{All letters/symbols occurring in the multiple alignment, |
|
50 |
+ including gaps \sQuote{-}, must also be found in the row/column |
|
51 |
+ names of the substitution matrix. For consistency with the |
|
52 |
+ matrices from the \pkg{Biostrings} package, the |
|
53 |
+ row/column corresponding to gap penalties |
|
54 |
+ may also be labeled \sQuote{*} instead of \sQuote{-}.} |
|
55 |
+ } |
|
56 |
+ So, nucleotide substitution matrices created by |
|
57 |
+ \code{\link{nucleotideSubstitutionMatrix}} can be used for multiple |
|
58 |
+ alignments of nucleotide sequences, but must be |
|
59 |
+ completed with gap penalty rows and columns (see example below). |
|
60 |
+ |
|
61 |
+ If the consensus matrix of a multiple alignment of nucleotide |
|
62 |
+ sequences contains rows labeled \sQuote{+} and/or \sQuote{.}, these |
|
63 |
+ rows are ignored. |
|
64 |
+ |
|
65 |
+ The parameter \code{gapVsGap} can be used to control how |
|
66 |
+ pairs of gaps are scored. If \code{gapVsGap=NULL} (default), the |
|
67 |
+ corresponding diagonal entry of the substitution matrix is used as is. |
|
68 |
+ In the BLOSUM matrices, this is usually a positive value, which may |
|
69 |
+ not make sense under all circumstances. Therefore, the parameter |
|
70 |
+ \code{gapVsGap} can be set to an alternative value, e.g. 0 for |
|
71 |
+ ignoring gap-gap pairs. |
|
72 |
+ |
|
73 |
+ The method, in any case, returns a vector of scores that is as long |
|
74 |
+ as the alignment/consensus matrix has columns. The names of the vector |
|
75 |
+ entries are the corresponding positions of the consensus sequence of |
|
76 |
+ the alignment. How this consensus sequence is computed, can be |
|
77 |
+ controlled with additional arguments that are passed on to the |
|
78 |
+ \code{\link{msaConsensusSequence}} method. |
|
79 |
+} |
|
80 |
+\value{ |
|
81 |
+ The function returns a vector as long as the alignment/consensus |
|
82 |
+ matrix has columns. The vector is named with the consensus sequence |
|
83 |
+ (see details above). |
|
84 |
+} |
|
85 |
+\author{Ulrich Bodenhofer <msa@bioinf.jku.at> |
|
86 |
+} |
|
87 |
+\references{ |
|
88 |
+ \url{http://www.bioinf.jku.at/software/msa} |
|
89 |
+ |
|
90 |
+ U. Bodenhofer, E. Bonatesta, C. Horejs-Kainrath, and S. Hochreiter |
|
91 |
+ (2015). msa: an R package for multiple sequence alignment. |
|
92 |
+ \emph{Bioinformatics} \bold{31}(24):3997-3999. DOI: |
|
93 |
+ \href{http://dx.doi.org/10.1093/bioinformatics/btv494}{10.1093/bioinformatics/btv494}. |
|
94 |
+} |
|
95 |
+\seealso{\code{\link{msa}}, \code{\linkS4class{MsaAAMultipleAlignment}}, |
|
96 |
+ \code{\linkS4class{MsaDNAMultipleAlignment}}, |
|
97 |
+ \code{\linkS4class{MsaRNAMultipleAlignment}}, |
|
98 |
+ \code{\linkS4class{MsaMetaData}}, |
|
99 |
+ \code{\linkS4class{MultipleAlignment}}, |
|
100 |
+ \code{\link{msaConsensusSequence}} |
|
101 |
+} |
|
102 |
+\examples{ |
|
103 |
+## read sequences |
|
104 |
+filepath <- system.file("examples", "HemoglobinAA.fasta", package="msa") |
|
105 |
+mySeqs <- readAAStringSet(filepath) |
|
106 |
+ |
|
107 |
+## perform multiple alignment |
|
108 |
+myAlignment <- msa(mySeqs) |
|
109 |
+ |
|
110 |
+## compute consensus scores using the BLOSUM62 matrix |
|
111 |
+data(BLOSUM62) |
|
112 |
+msaConservationScore(myAlignment, BLOSUM62) |
|
113 |
+ |
|
114 |
+## compute consensus scores using the BLOSUM62 matrix |
|
115 |
+## without scoring gap-gap pairs and using a different consensus sequence |
|
116 |
+msaConservationScore(myAlignment, BLOSUM62, gapVsGap=0, |
|
117 |
+ type="upperlower") |
|
118 |
+ |
|
119 |
+## compute a consensus matrix first |
|
120 |
+conMat <- consensusMatrix(myAlignment) |
|
121 |
+data(PAM250) |
|
122 |
+msaConservationScore(conMat, PAM250, gapVsGap=0) |
|
123 |
+ |
|
124 |
+ |
|
125 |
+## DNA example |
|
126 |
+filepath <- system.file("examples", "exampleDNA.fasta", package="msa") |
|
127 |
+mySeqs <- readDNAStringSet(filepath) |
|
128 |
+ |
|
129 |
+## perform multiple alignment |
|
130 |
+myAlignment <- msa(mySeqs) |
|
131 |
+ |
|
132 |
+## create substitution matrix with gap penalty -8 |
|
133 |
+mat <- nucleotideSubstitutionMatrix(4, -1) |
|
134 |
+mat <- cbind(rbind(mat, "-"=-8), "-"=-8) |
|
135 |
+ |
|
136 |
+## compute consensus scores using this matrix |
|
137 |
+msaConservationScore(myAlignment, mat, gapVsGap=0) |
|
138 |
+} |
|
139 |
+\keyword{manip} |
... | ... |
@@ -9,7 +9,7 @@ |
9 | 9 |
msaConvert(x, |
10 | 10 |
type=c("seqinr::alignment", "bios2mds::align", |
11 | 11 |
"ape::AAbin", "ape::DNAbin", |
12 |
- "phangorn::phyDat")) |
|
12 |
+ "phangorn::phyDat", "bio3d::fasta")) |
|
13 | 13 |
} |
14 | 14 |
\arguments{ |
15 | 15 |
\item{x}{an object of class \code{\linkS4class{MultipleAlignment}} |
... | ... |
@@ -20,8 +20,8 @@ |
20 | 20 |
\item{type}{a character string specifying to which type of object |
21 | 21 |
\code{x} should be converted; currently, the |
22 | 22 |
values \code{"seqinr::alignment"}, \code{"bios2mds::align"}, |
23 |
- \code{"ape::AAbin"}, \code{"ape::DNAbin"}, and |
|
24 |
- \code{"phangorn::phyDat"}.} |
|
23 |
+ \code{"ape::AAbin"}, \code{"ape::DNAbin"}, |
|
24 |
+ \code{"phangorn::phyDat"}, and \code{"bio3d::fasta"}.} |
|
25 | 25 |
} |
26 | 26 |
\details{ |
27 | 27 |
The function converts \code{x} to the class of object as |
... | ... |
@@ -382,8 +382,8 @@ both as setter and getter functions. To set row or column masks, an |
382 | 382 |
\verb+IRanges+ object must be supplied: |
383 | 383 |
<<maskExample>>= |
384 | 384 |
myMaskedAlignment <- myFirstAlignment |
385 |
-rowM <- IRanges(start=1, end=2) |
|
386 |
-rowmask(myMaskedAlignment) <- rowM |
|
385 |
+colM <- IRanges(start=1, end=100) |
|
386 |
+colmask(myMaskedAlignment) <- colM |
|
387 | 387 |
myMaskedAlignment |
388 | 388 |
@ |
389 | 389 |
|
... | ... |
@@ -400,16 +400,26 @@ conMat <- consensusMatrix(myFirstAlignment) |
400 | 400 |
dim(conMat) |
401 | 401 |
conMat[, 101:110] |
402 | 402 |
@ |
403 |
-Note that \verb+consensusMatrix()+ cannot handle |
|
404 |
-alignments with active masks. So, the masks in multiple alignment objects must |
|
405 |
-must be removed prior to the computation of the consensus matrix: |
|
403 |
+If called on a masked alignment, \verb+consensusMatrix()+ only uses those |
|
404 |
+sequences/rows that are not masked. If there are masked columns, |
|
405 |
+the matrix contains \verb+NA+'s in those columns: |
|
406 | 406 |
<<consensusExample2>>= |
407 |
-conMat <- consensusMatrix(unmasked(myMaskedAlignment)) |
|
407 |
+conMat <- consensusMatrix(myMaskedAlignment) |
|
408 |
+conMat[, 95:104] |
|
408 | 409 |
@ |
409 | 410 |
|
410 |
-Consensus strings can be computed from consensus matrices: |
|
411 |
+Multiple alignments also inherit the \verb+consensusString()+ method from |
|
412 |
+the \verb+Biostrings+ package. However, for more flexibility and consistency, |
|
413 |
+we rather advise users to use the method \verb+msaConsensusSequence()+ |
|
414 |
+method (see below). |
|
415 |
+ |
|
416 |
+\subsection{Consensus Sequences and Conservation Scores} |
|
417 |
+ |
|
418 |
+With version 1.7.1 of \MSA, new methods have been provided that allow for |
|
419 |
+the computation of consensus sequences and conservation scores. |
|
420 |
+By default, the \verb+msaConsensusSequence()+ method is a wrapper around the |
|
421 |
+\verb+consensusString()+ method from the \verb+Biostrings+: |
|
411 | 422 |
<<consensusExample3>>= |
412 |
-## auxiliary function for splitting a string into displayable portions |
|
413 | 423 |
printSplitString <- function(x, width=getOption("width") - 1) |
414 | 424 |
{ |
415 | 425 |
starts <- seq(from=1, to=nchar(x), by=width) |
... | ... |
@@ -418,20 +428,55 @@ printSplitString <- function(x, width=getOption("width") - 1) |
418 | 428 |
cat(substr(x, starts[i], starts[i] + width - 1), "\n") |
419 | 429 |
} |
420 | 430 |
|
421 |
-printSplitString(consensusString(conMat)) |
|
431 |
+printSplitString(msaConsensusSequence(myFirstAlignment)) |
|
422 | 432 |
@ |
423 |
-\noindent Consensus sequences can also be computed directly without computing |
|
424 |
-intermediate consensus matrices. However, the \verb+consensusString()+ |
|
425 |
-function cannot handle the |
|
426 |
-masks contained in the multiple alignment objects (no matter whether |
|
427 |
-there are active masks or not). Therefore, it is necessary to remove |
|
428 |
-the masks beforehand: |
|
433 |
+However, there is also a second method for computing consensus sequence that |
|
434 |
+has been implemented in line with a consensus sequence method implemented |
|
435 |
+in \TeXshade\ that allows for specify an upper and a lower conservation threshold |
|
436 |
+(see example below). This method can be accessed via the argument |
|
437 |
+\verb+type="upperlower"+. Additional customizations are available, too: |
|
429 | 438 |
<<consensusExample4>>= |
430 |
-printSplitString(consensusString(unmasked(myFirstAlignment))) |
|
431 |
-printSplitString(consensusString(unmasked(myMaskedAlignment))) |
|
439 |
+printSplitString(msaConsensusSequence(myFirstAlignment, type="upperlower", |
|
440 |
+ thresh=c(40, 20))) |
|
441 |
+@ |
|
442 |
+ |
|
443 |
+Regardless of which method is used, masks are taken into account: masked |
|
444 |
+rows/sequences are neglected and masked columns are shown as ``\verb+#+'' in |
|
445 |
+the consensus sequence: |
|
446 |
+<<consensusExample5>>= |
|
447 |
+printSplitString(msaConsensusSequence(myMaskedAlignment, type="upperlower", |
|
448 |
+ thresh=c(40, 20))) |
|
449 |
+@ |
|
450 |
+ |
|
451 |
+The main purpose of consensus sequences is to get an impression of conservation |
|
452 |
+at individual positions/columns of a multiple alignment. The \MSA\ package also |
|
453 |
+provides another means of analyzing conservation: the method |
|
454 |
+\verb+msaConservationScore()+ computes sums of pairwise scores for a given |
|
455 |
+substitution/scoring matrix. Thereby, conservation can also be analyzed in a |
|
456 |
+more sensible way than by only taking relative frequencies of letters into |
|
457 |
+account as \verb+msaConsensusSequence()+ does. |
|
458 |
+<<conservationExample1>>= |
|
459 |
+data(BLOSUM62) |
|
460 |
+msaConservationScore(myFirstAlignment, BLOSUM62) |
|
461 |
+@ |
|
462 |
+As the above example shows, a substitution matrix must be provided. The result |
|
463 |
+is obviously a vector as long as the alignment has columns. The entries of the |
|
464 |
+vector are labeled by the consensus sequence. The way the consensus sequence is |
|
465 |
+computed can be customized: |
|
466 |
+<<conservationExample2>>= |
|
467 |
+msaConservationScore(myFirstAlignment, BLOSUM62, gapVsGap=0, |
|
468 |
+ type="upperlower", thresh=c(40, 20)) |
|
469 |
+@ |
|
470 |
+The additional argument \verb+gapVsGap+ allows for controlling how pairs of |
|
471 |
+gap are taken into account when computing pairwise scores (see |
|
472 |
+\verb+?msaConservationScore+ for more details). |
|
473 |
+ |
|
474 |
+Conservation scores can also be computed from masked alignments. For masked |
|
475 |
+columns, \verb+NA+'s are returned: |
|
476 |
+<<conservationExample3>>= |
|
477 |
+msaConservationScore(myMaskedAlignment, BLOSUM62, gapVsGap=0, |
|
478 |
+ type="upperlower", thresh=c(40, 20)) |
|
432 | 479 |
@ |
433 |
-\noindent Actually, the \verb+print()+ method (see Section~\ref{sec:msaPrint} above) |
|
434 |
-uses this function to compute the consensus sequence. |
|
435 | 480 |
|
436 | 481 |
\subsection{Interfacing to Other Packages} |
437 | 482 |
|
... | ... |
@@ -795,6 +840,16 @@ source package tarball, untar it, comment/uncomment the corresponding line in |
795 | 840 |
\verb+msa/src/ClustalOmega/msaMakefile+ (see first six lines), and |
796 | 841 |
build/install the package from source. |
797 | 842 |
|
843 |
+\subsubsection*{Build/installation issues} |
|
844 |
+ |
|
845 |
+Some users have reported compiler and linker errors when building \MSA\ from |
|
846 |
+source on Linux systems. In almost all cases, these could have been tracked down |
|
847 |
+to issues with the \R\ setup on those systems (e.g.\ a \verb+Rprofile.site+ file |
|
848 |
+that makes changes to the \R\ environment that are not compatible with \MSA's |
|
849 |
+Makefiles).\footnote{See, e.g., \url{https://support.bioconductor.org/p/90735/}} |
|
850 |
+In most cases, these issues can be avoided by installing \MSA\ in a ``vanilla \R\ session'', |
|
851 |
+i.e.\ starting \R\ with option \verb+--vanilla+ when installing \MSA. |
|
852 |
+ |
|
798 | 853 |
\section{Future Extensions}\label{sec:future} |
799 | 854 |
|
800 | 855 |
We envision the following changes/extensions in future versions of the package: |
... | ... |
@@ -834,6 +889,21 @@ bibliography below). |
834 | 889 |
\section{Change Log} |
835 | 890 |
|
836 | 891 |
\begin{description} |
892 |
+\item[Version 1.7.1:] \mbox{ } \begin{itemize} |
|
893 |
+ \item additional conversions implemented for \verb+msaConvert()+ function |
|
894 |
+ \item added a new method \verb+msaConsensusSequence()+ that extends the |
|
895 |
+ functionality provided by \verb+Biostring+'s \verb+consensusString()+ method |
|
896 |
+ \item added a new method \verb+msaConservationScore()+ |
|
897 |
+ \item \verb+print()+ method extended such that it now also allows for |
|
898 |
+ customization of the consensus sequence (via the new |
|
899 |
+ \verb+msaConsensusSequence()+ method) |
|
900 |
+ \item package now depends on \verb+Biostrings+ version $\geq$2.40.0 in order |
|
901 |
+ to make sure that \verb+consensusMatrix()+ also works correctly |
|
902 |
+ for masked alignments |
|
903 |
+ \item corresponding changes in documentation and vignette |
|
904 |
+ \end{itemize} |
|
905 |
+\item[Version 1.7.0:] new branch for Bioconductor 3.5 devel |
|
906 |
+\item[Version 1.6.0:] release as part of Bioconductor 3.4 |
|
837 | 907 |
\item[Version 1.5.5:] \mbox{ } \begin{itemize} |
838 | 908 |
\item fixes in ClustalOmega source code to ensure Windows compatibility of |
839 | 909 |
GCC6 compatibility fix |