Browse code

New functionality for computing consensus sequences and conservation scores; version number bumped to 1.7.1

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/msa@127901 bc3139a8-67e5-0310-9ffc-ced21a209358

Ulrich Bodenhofer authored on 31/03/2017 11:35:08
Showing13 changed files

... ...
@@ -1,8 +1,8 @@
1 1
 Package: msa
2 2
 Type: Package
3 3
 Title: Multiple Sequence Alignment
4
-Version: 1.7.0
5
-Date: 2016-08-09
4
+Version: 1.7.1
5
+Date: 2017-03-31
6 6
 Author: Enrico Bonatesta, Christoph Horejs-Kainrath, Ulrich Bodenhofer
7 7
 Maintainer: Ulrich Bodenhofer <bodenhofer@bioinf.jku.at>
8 8
 Description: The 'msa' package provides a unified R/Bioconductor interface to
... ...
@@ -16,7 +16,7 @@ Description: The 'msa' package provides a unified R/Bioconductor interface to
16 16
 URL: http://www.bioinf.jku.at/software/msa/
17 17
 License: GPL (>= 2)
18 18
 Copyright: See file inst/COPYRIGHT
19
-Depends: R (>= 3.1.0), methods, Biostrings (>= 2.30.0)
19
+Depends: R (>= 3.1.0), methods, Biostrings (>= 2.40.0)
20 20
 Imports: Rcpp (>= 0.11.1), BiocGenerics, IRanges (>= 1.20.0),
21 21
         S4Vectors, tools
22 22
 Suggests: Biobase, knitr, seqinr, ape, phangorn
... ...
@@ -28,6 +28,7 @@ Collate: AllClasses.R AllGenerics.R params-methods.R version-methods.R
28 28
         helperFunctions.R inputChecks.R convertRows.R msaPrettyPrint.R
29 29
         print-methods.R show-methods.R msa.R msaMuscle.R msaClustalW.R
30 30
         msaClustalOmega.R msaConvert.R msaCheckNames.R
31
+	msaConsensusSequence-methods.R msaConservationScore-methods.R
31 32
 biocViews: MultipleSequenceAlignment, Alignment, MultipleComparison,
32 33
         Sequencing
33 34
 NeedsCompilation: yes
... ...
@@ -14,5 +14,5 @@ export(msa, msaMuscle, msaClustalW, msaClustalOmega, msaPrettyPrint,
14 14
 
15 15
 exportClasses(MsaDNAMultipleAlignment, MsaRNAMultipleAlignment,
16 16
               MsaAAMultipleAlignment, MsaMetaData)
17
-exportMethods(params, version, show, print)
18
-
17
+exportMethods(params, version, show, print, msaConservationScore,
18
+              msaConsensusSequence)
... ...
@@ -1 +1,7 @@
1 1
 setGeneric("version", function(object) standardGeneric("version"))
2
+
3
+setGeneric("msaConsensusSequence",
4
+           function(x, ...) standardGeneric("msaConsensusSequence"))
5
+
6
+setGeneric("msaConservationScore",
7
+           function(x, ...) standardGeneric("msaConservationScore"))
2 8
new file mode 100644
... ...
@@ -0,0 +1,122 @@
1
+msaConsensusSequence.matrix <- function(x, type=c("Biostrings", "upperlower"),
2
+                                        thresh=c(80, 20), ignoreGaps=FALSE,
3
+                                        ...)
4
+{
5
+    type <- match.arg(type)
6
+
7
+    if (is.null(rownames(x)) ||
8
+        any(!(rownames(x) %in% c(LETTERS, "-", "+", "."))))
9
+        stop("consensus matrix 'x' is not in proper format")
10
+
11
+    sel <- match(c("+", "."), rownames(x))
12
+    sel <- sel[which(!is.na(sel))]
13
+    if (length(sel) > 0)
14
+        x <- x[-sel, ]
15
+
16
+    if (!is.numeric(thresh) || length(thresh) !=2 ||
17
+        thresh[1] > 100 || thresh[2] < 0 || thresh[1] < thresh[2])
18
+        stop("'thresh' must be a decreasing sequence of two numbers ",
19
+             "between 0 and 100")
20
+
21
+    thresh <- thresh / 100
22
+
23
+    if (type == "Biostrings")
24
+    {
25
+        cs <- colSums(x)
26
+
27
+        if (any(is.na(cs)))
28
+        {
29
+            res <- rep.int("#", ncol(x))
30
+
31
+            sel <- which(!is.na(cs))
32
+
33
+            if (length(sel) > 0)
34
+            {
35
+                sstr <- consensusString(x[, sel, drop=FALSE], ...)
36
+                res[sel] <- unlist(strsplit(sstr, ""))
37
+            }
38
+
39
+            out <- paste(res, collapse="")
40
+        }
41
+        else
42
+            out <- consensusString(x, ...)
43
+    }
44
+    else
45
+    {
46
+        if (ignoreGaps)
47
+        {
48
+            sel <- match("-", rownames(x))
49
+
50
+            if (!is.na(sel))
51
+                sel <- (1:nrow(x))[-sel]
52
+            else
53
+                sel <- (1:nrow(x))
54
+
55
+            perColFunc <- function(y)
56
+            {
57
+                if (any(is.na(y)))
58
+                    char <- "#"
59
+                else
60
+                {
61
+                    y <- y[sel] / length(sel)
62
+
63
+                    maxw <-which.max(y[sel])
64
+                    maxi <- y[sel[maxw]]
65
+
66
+                    char <- rownames(x)[sel[maxw]]
67
+
68
+                    if (maxi < thresh[1])
69
+                    {
70
+                        if (maxi >= thresh[2])
71
+                            char <- tolower(char)
72
+                        else
73
+                            char <- "."
74
+                    }
75
+                }
76
+
77
+                char
78
+            }
79
+        }
80
+        else
81
+        {
82
+            perColFunc <- function(y)
83
+            {
84
+                if (any(is.na(y)))
85
+                    char <- "#"
86
+                else
87
+                {
88
+                    y <- y / length(y)
89
+
90
+                    maxw <- which.max(y)
91
+                    maxi <- y[maxw]
92
+
93
+                    char <- rownames(x)[maxw]
94
+
95
+                    if (maxi < thresh[1])
96
+                    {
97
+                        if (maxi >= thresh[2])
98
+                            char <- tolower(char)
99
+                        else
100
+                            char <- "."
101
+                    }
102
+                }
103
+
104
+                char
105
+            }
106
+        }
107
+
108
+        out <- paste(apply(x, 2, perColFunc), collapse="")
109
+    }
110
+
111
+    out
112
+}
113
+
114
+setMethod("msaConsensusSequence", signature("matrix"), msaConsensusSequence.matrix)
115
+
116
+
117
+setMethod("msaConsensusSequence", signature("MultipleAlignment"),
118
+          function(x, ...)
119
+          {
120
+              mat <- consensusMatrix(x)
121
+              msaConsensusSequence(mat, ...)
122
+          })
0 123
new file mode 100644
... ...
@@ -0,0 +1,60 @@
1
+msaConservationScore.matrix <- function(x, substitutionMatrix, gapVsGap=NULL,
2
+                                        ...)
3
+{
4
+    if (!is.matrix(substitutionMatrix) ||
5
+        nrow(substitutionMatrix) != ncol(substitutionMatrix) ||
6
+        any(!(rownames(substitutionMatrix) %in% c(LETTERS, "-", "*"))) ||
7
+        any(!(colnames(substitutionMatrix) %in% c(LETTERS, "-", "*"))) ||
8
+        any(rownames(substitutionMatrix) != colnames(substitutionMatrix)))
9
+        stop("substitution matrix is not in proper format")
10
+
11
+    if (is.null(rownames(x)) ||
12
+        any(!(rownames(x) %in% c(LETTERS, "-", "+", "."))))
13
+        stop("consensus matrix 'x' is not in proper format")
14
+
15
+    sel <- match(c("+", "."), rownames(x))
16
+    sel <- sel[which(!is.na(sel))]
17
+    if (length(sel) > 0)
18
+        x <- x[-sel, ]
19
+
20
+    sel <- match(c("*", "-"), rownames(substitutionMatrix))
21
+
22
+    if (is.na(sel[2]) && !is.na(sel[1]))
23
+    {
24
+        rownames(substitutionMatrix)[sel[1]] <- "-"
25
+        colnames(substitutionMatrix)[sel[1]] <- "-"
26
+    }
27
+
28
+    sel <- match(rownames(x), rownames(substitutionMatrix))
29
+
30
+    if (any(is.na(sel)))
31
+        stop("some letters occurring in alignment 'x' are missing ",
32
+             "in substitution matrix")
33
+
34
+    substitutionMatrix <- substitutionMatrix[sel, sel]
35
+
36
+    if (!is.null(gapVsGap))
37
+    {
38
+        if (!is.numeric(gapVsGap) || length(gapVsGap) != 1)
39
+            stop("'gapVsGap' must be NULL or a single number")
40
+
41
+        substitutionMatrix["-", "-"] <- gapVsGap
42
+    }
43
+
44
+    out <- drop(apply(x, 2, function(y) crossprod(y, substitutionMatrix %*% y)))
45
+
46
+    names(out) <- unlist(strsplit(msaConsensusSequence(x, ...), ""))
47
+
48
+    out
49
+}
50
+
51
+setMethod("msaConservationScore", signature("matrix"),
52
+          msaConservationScore.matrix)
53
+
54
+
55
+setMethod("msaConservationScore", signature("MultipleAlignment"),
56
+          function(x, ...)
57
+          {
58
+              mat <- consensusMatrix(x)
59
+              msaConservationScore.matrix(mat, ...)
60
+          })
... ...
@@ -2,7 +2,8 @@ msaConvert <- function(x, type=c("seqinr::alignment",
2 2
                                  "bios2mds::align",
3 3
                                  "ape::AAbin",
4 4
                                  "ape::DNAbin",
5
-                                 "phangorn::phyDat"))
5
+                                 "phangorn::phyDat",
6
+                                 "bio3d::fasta"))
6 7
 {
7 8
     type <- match.arg(type)
8 9
 
... ...
@@ -57,6 +58,14 @@ msaConvert <- function(x, type=c("seqinr::alignment",
57 58
         else
58 59
             stop("conversion to 'phyDat' requires package 'phangorn'")
59 60
     }
61
+    else if (type == "bio3d::fasta")
62
+    {
63
+        out <- list(id=names(xn),
64
+                    ali=.Call("SplitCharVector2Matrix", xn, "-"),
65
+                    call=x@call)
66
+        rownames(out$ali) <- out$id
67
+        class(out) <- "fasta"
68
+    }
60 69
 
61 70
     out
62 71
 }
... ...
@@ -79,7 +79,7 @@ print.MsaMultipleAlignmentChunk <- function(str, names=NULL, halfNrow=9, pos="",
79 79
 
80 80
 print.MsaMultipleAlignment <- function(x, show=c("alignment", "version", "call"),
81 81
                                        showNames=TRUE, showConsensus=TRUE,
82
-                                       halfNrow=9, nameWidth=20)
82
+                                       halfNrow=9, nameWidth=20, ...)
83 83
 {
84 84
     show <- match.arg(show,
85 85
                       choices=c("alignment", "complete", "version", "call",
... ...
@@ -154,7 +154,7 @@ print.MsaMultipleAlignment <- function(x, show=c("alignment", "version", "call")
154 154
 
155 155
             if (showConsensus)
156 156
             {
157
-                cons <- consensusString(consensusMatrix(unmasked(x)))
157
+                cons <- msaConsensusSequence(x, ...)
158 158
                 strings <- c(strings, cons)
159 159
 
160 160
                 if (length(names(strings)) > 0)
... ...
@@ -1,6 +1,25 @@
1 1
 Change history of package msa:
2 2
 ==============================
3 3
 
4
+Version 1.7.1:
5
+- additional conversions implemented for msaConvert() function
6
+- added a new method msaConsensusSequence() that extends the
7
+  functionality provided by Biostring's consensusString() method
8
+- added a new method msaConservationScore()
9
+- print() method extended such that it now also allows for
10
+  customization of the consensus sequence (via the new
11
+  msaConsensusSequence() method)
12
+- package now depends on Biostrings version >= 2.40.0 in order
13
+  to make sure that consensusMatrix() also works correctly
14
+  for masked alignments
15
+- corresponding changes in documentation and vignette
16
+
17
+Version 1.7.0:
18
+- new branch for Bioconductor 3.5 devel
19
+
20
+Version 1.6.0:
21
+- release as part of Bioconductor 3.4
22
+
4 23
 Version 1.5.5:
5 24
 - fixes in ClustalOmega source code to ensure Windows compatibility of
6 25
   GCC6 compatibility fix
... ...
@@ -46,7 +46,7 @@
46 46
 \section{Methods}{
47 47
   \describe{
48 48
     \item{\code{print(x, show=c("alignment", "version", "call"),
49
-      showNames=TRUE, showConsensus=TRUE, halfNrow=9, nameWidth=20)}:}{
49
+      showNames=TRUE, showConsensus=TRUE, halfNrow=9, nameWidth=20, ...)}:}{
50 50
       prints information about the object \code{x}; the \code{show}
51 51
       argument allows for determining what should be printed.
52 52
       The \code{show} must be a character vector and may contain any
... ...
@@ -95,6 +95,9 @@
95 95
       where the masked sequences/positions are shown as hash marks.
96 96
       However, the consensus sequences are computed from the
97 97
       complete, unmasked alignment and displayed as such.
98
+      Additional arguments are passed on to
99
+      \code{\link{msaConsensusSequence}} for customizing how the
100
+      consensus sequence is computed.
98 101
     }
99 102
     \item{\code{show(object)}:}{displays the alignment along with
100 103
       metadata; synonymous to calling \code{print} with default
... ...
@@ -138,12 +141,14 @@ show(myAlignment)
138 141
 ## print the results
139 142
 print(myAlignment, show="alignment")
140 143
 print(myAlignment, show="alignment", showConsensus=FALSE)
141
-print(myAlignment, show="complete")
142 144
 print(myAlignment, show=c("alignment", "version"))
143 145
 print(myAlignment, show="standardParams")
144 146
 print(myAlignment, show="algParams")
145 147
 print(myAlignment, show=c("call", "version"))
146 148
 
149
+## print results with custom consensus sequence
150
+print(myAlignment, show="complete", type="upperlower", thresh=c(50, 20))
151
+
147 152
 ## show the params
148 153
 params(myAlignment)
149 154
 }
150 155
new file mode 100644
... ...
@@ -0,0 +1,120 @@
1
+\name{msaConsensusSequence}
2
+\alias{msaConsensusSequence}
3
+\alias{msaConsensusSequence,MultipleAlignment-method}
4
+\alias{msaConsensusSequence,matrix-method}
5
+\title{Computation of Consensus Sequence from Multiple Alignment}
6
+\description{
7
+  This method computes a consensus sequence from a multiple alignment
8
+  or a previously computed consensus matrix. Currently, two different
9
+  ways of these computations are available.
10
+}
11
+\usage{
12
+\S4method{msaConsensusSequence}{matrix}(x, type=c("Biostrings", "upperlower"),
13
+    thresh=c(80, 20), ignoreGaps=FALSE, ...)
14
+\S4method{msaConsensusSequence}{MultipleAlignment}(x, ...)
15
+}
16
+\arguments{
17
+  \item{x}{an object of class \code{\linkS4class{MultipleAlignment}}
18
+    (which includes objects of classes
19
+    \code{\linkS4class{MsaAAMultipleAlignment}},
20
+    \code{\linkS4class{MsaDNAMultipleAlignment}}, and
21
+    \code{\linkS4class{MsaRNAMultipleAlignment}}) or a previously
22
+    computed consensus matrix (see details below).}
23
+  \item{type}{a character string specifying how to compute the consensus
24
+    sequence. Currently, types \code{"Biostrings"} and
25
+    \code{"upperlower"} are available (see details below).}
26
+  \item{thresh}{a decreasing two-element numeric vector of numbers
27
+    between 0 and 100 corresponding to the two conservation thresholds.
28
+    Only relevant for \code{type="upperlower"} (see details below),
29
+    otherwise ignored.}
30
+  \item{ignoreGaps}{a logical (default: \code{FALSE})
31
+    indicating whether gaps should be
32
+    considered when computing the consensus sequence. 
33
+    Only relevant for \code{type="upperlower"} (see details below),
34
+    otherwise ignored.}
35
+  \item{...}{when the method is called for a
36
+    \code{\linkS4class{MultipleAlignment}} object, the consensus matrix is
37
+    computed and, including all further arguments, passed
38
+    passed on to the \code{msaConsensusSequence} method with signature
39
+    \code{matrix}. The method with signature \code{matrix} forwards
40
+    additional arguments to the \code{\link{consensusString}} method
41
+    from the \pkg{Biostrings} package if \code{type="Biostrings"}.}
42
+  }
43
+\details{
44
+  The method takes a \code{\linkS4class{MultipleAlignment}} object or a
45
+  previously computed consensus matrix and computes a consensus
46
+  sequence. For \code{type="Biostrings"}, the method
47
+  \code{\link{consensusString}} from the \pkg{Biostrings} package is
48
+  used to compute the consensus sequence. For \code{type="upperlower"},
49
+  two thresholds (argument \code{thresh}, see above) are used to
50
+  compute the consensus sequence:
51
+  \itemize{
52
+    \item{If the relative frequency of the most frequent letter at a
53
+      given position is at least as large as the first threshold (default:
54
+      80\%), then this most frequent letter is used for the consensus
55
+      sequence at this position as it is.}
56
+    \item{If the relative frequency of the most frequent letter at a
57
+      given position is smaller than the first threshold, but at least
58
+      as large as the second threshold (default: 20\%), then this
59
+      most frequent letter is used for the consensus sequence at this
60
+      position, but converted to lower case.}
61
+    \item{If the relative frequency of the most frequent letter in a
62
+      column is even smaller than the second threshold, then a dot is
63
+      used for the consensus sequence at this position.}
64
+    \item{If \code{ignoreGaps=FALSE} (which is the default),
65
+      gaps are treated like all other
66
+      letters except for the fact that obviously no lowercase conversion
67
+      takes place in case that the relative frequency is between the
68
+      two thresholds. If \code{ignoreGaps=TRUE}, gaps are ignored
69
+      completely, and the consensus sequence is computed from the
70
+      non-gap letters only.}
71
+  }
72
+
73
+  If the consensus matrix of a multiple alignment of nucleotide
74
+  sequences contains rows labeled \sQuote{+} and/or \sQuote{.}, these
75
+  rows are ignored.
76
+}
77
+\value{
78
+  The function returns a character string with the consensus sequence.
79
+}
80
+\author{Ulrich Bodenhofer <msa@bioinf.jku.at>
81
+}
82
+\references{
83
+  \url{http://www.bioinf.jku.at/software/msa}
84
+  
85
+  U. Bodenhofer, E. Bonatesta, C. Horejs-Kainrath, and S. Hochreiter
86
+  (2015). msa: an R package for multiple sequence alignment. 
87
+  \emph{Bioinformatics} \bold{31}(24):3997-3999. DOI:
88
+  \href{http://dx.doi.org/10.1093/bioinformatics/btv494}{10.1093/bioinformatics/btv494}.
89
+}
90
+\seealso{\code{\link{msa}}, \code{\linkS4class{MsaAAMultipleAlignment}},
91
+  \code{\linkS4class{MsaDNAMultipleAlignment}},
92
+  \code{\linkS4class{MsaRNAMultipleAlignment}},
93
+  \code{\linkS4class{MsaMetaData}},
94
+  \code{\linkS4class{MultipleAlignment}},
95
+  \code{\link{consensusString}}
96
+}
97
+\examples{
98
+## read sequences
99
+filepath <- system.file("examples", "HemoglobinAA.fasta", package="msa")
100
+mySeqs <- readAAStringSet(filepath)
101
+
102
+## perform multiple alignment
103
+myAlignment <- msa(mySeqs)
104
+
105
+## regular consensus sequence using consensusString() method from the
106
+## 'Biostrings' package
107
+msaConsensusSequence(myAlignment)
108
+
109
+## use the other method
110
+msaConsensusSequence(myAlignment, type="upperlower")
111
+
112
+## use the other method with custom parameters
113
+msaConsensusSequence(myAlignment, type="upperlower", thresh=c(50, 20),
114
+                     ignoreGaps=TRUE)
115
+
116
+## compute a consensus matrix first
117
+conMat <- consensusMatrix(myAlignment)
118
+msaConsensusSequence(conMat)
119
+}
120
+\keyword{manip}
0 121
new file mode 100644
... ...
@@ -0,0 +1,139 @@
1
+\name{msaConservationScore}
2
+\alias{msaConservationScore}
3
+\alias{msaConservationScore,MultipleAlignment-method}
4
+\alias{msaConservationScore,matrix-method}
5
+\title{Computation of Conservation Scores from Multiple Alignment}
6
+\description{
7
+  This method computes a vector of conservation scores from a
8
+  multiple alignment or a previously computed consensus matrix.
9
+}
10
+\usage{
11
+\S4method{msaConservationScore}{matrix}(x, substitutionMatrix, gapVsGap=NULL,
12
+    ...)
13
+\S4method{msaConservationScore}{MultipleAlignment}(x, ...)
14
+}
15
+\arguments{
16
+  \item{x}{an object of class \code{\linkS4class{MultipleAlignment}}
17
+    (which includes objects of classes
18
+    \code{\linkS4class{MsaAAMultipleAlignment}},
19
+    \code{\linkS4class{MsaDNAMultipleAlignment}}, and
20
+    \code{\linkS4class{MsaRNAMultipleAlignment}}) or a previously
21
+    computed consensus matrix (see details below).}
22
+  \item{substitutionMatrix}{substitution matrix (see
23
+    details below).}
24
+  \item{gapVsGap}{score to use for aligning gaps versus gaps (see
25
+    details below).}
26
+  \item{...}{when the method is called for a
27
+    \code{\linkS4class{MultipleAlignment}} object, the consensus matrix is
28
+    computed and, including all further arguments,
29
+    passed on to the \code{msaConservationScore} method with signature
30
+    \code{matrix}. This method passes all further arguments on to the
31
+    \code{\link{msaConsensusSequence}} method to customize the way
32
+    the consensus sequence is computed.}
33
+  }
34
+\details{
35
+  The method takes a \code{\linkS4class{MultipleAlignment}} object or a
36
+  previously computed consensus matrix and computes the sum of pairwise
37
+  scores for all positions of the alignment. For computing these scores,
38
+  it is compulsory to specify a substitution/scoring matrix. This matrix
39
+  must be provided as a \code{\link{matrix}} object. This can either be
40
+  one of the ready-made matrices provided by the \pkg{Biostrings}
41
+  package (e.g. \code{\link{BLOSUM62}}) or any other hand-crafted
42
+  matrix. In the latter case, the following restrictions apply:
43
+  \itemize{
44
+    \item{The matrix must be quadratic.}
45
+    \item{For reasonable results, the matrix should be symmetric (note
46
+      that this is not checked).}
47
+    \item{Rows and columns must be named and the order of letters/symbols
48
+      in row names and column names must be identical.}
49
+    \item{All letters/symbols occurring in the multiple alignment,
50
+      including gaps \sQuote{-}, must also be found in the row/column
51
+      names of the substitution matrix. For consistency with the
52
+      matrices from the \pkg{Biostrings} package, the
53
+      row/column corresponding to gap penalties
54
+      may also be labeled \sQuote{*} instead of \sQuote{-}.}
55
+  }
56
+  So, nucleotide substitution matrices created by
57
+  \code{\link{nucleotideSubstitutionMatrix}} can be used for multiple
58
+  alignments of nucleotide sequences, but must be
59
+  completed with gap penalty rows and columns (see example below).
60
+
61
+  If the consensus matrix of a multiple alignment of nucleotide
62
+  sequences contains rows labeled \sQuote{+} and/or \sQuote{.}, these
63
+  rows are ignored.
64
+  
65
+  The parameter \code{gapVsGap} can be used to control how
66
+  pairs of gaps are scored. If \code{gapVsGap=NULL} (default), the
67
+  corresponding diagonal entry of the substitution matrix is used as is.
68
+  In the BLOSUM matrices, this is usually a positive value, which may
69
+  not make sense under all circumstances. Therefore, the parameter
70
+  \code{gapVsGap} can be set to an alternative value, e.g. 0 for
71
+  ignoring gap-gap pairs.
72
+
73
+  The method, in any case, returns a vector of scores that is as long
74
+  as the alignment/consensus matrix has columns. The names of the vector
75
+  entries are the corresponding positions of the consensus sequence of
76
+  the alignment. How this consensus sequence is computed, can be
77
+  controlled with additional arguments that are passed on to the
78
+  \code{\link{msaConsensusSequence}} method.
79
+}
80
+\value{
81
+  The function returns a vector as long as the alignment/consensus
82
+  matrix has columns. The vector is named with the consensus sequence
83
+  (see details above).
84
+}
85
+\author{Ulrich Bodenhofer <msa@bioinf.jku.at>
86
+}
87
+\references{
88
+  \url{http://www.bioinf.jku.at/software/msa}
89
+  
90
+  U. Bodenhofer, E. Bonatesta, C. Horejs-Kainrath, and S. Hochreiter
91
+  (2015). msa: an R package for multiple sequence alignment. 
92
+  \emph{Bioinformatics} \bold{31}(24):3997-3999. DOI:
93
+  \href{http://dx.doi.org/10.1093/bioinformatics/btv494}{10.1093/bioinformatics/btv494}.
94
+}
95
+\seealso{\code{\link{msa}}, \code{\linkS4class{MsaAAMultipleAlignment}},
96
+  \code{\linkS4class{MsaDNAMultipleAlignment}},
97
+  \code{\linkS4class{MsaRNAMultipleAlignment}},
98
+  \code{\linkS4class{MsaMetaData}},
99
+  \code{\linkS4class{MultipleAlignment}},
100
+  \code{\link{msaConsensusSequence}}
101
+}
102
+\examples{
103
+## read sequences
104
+filepath <- system.file("examples", "HemoglobinAA.fasta", package="msa")
105
+mySeqs <- readAAStringSet(filepath)
106
+
107
+## perform multiple alignment
108
+myAlignment <- msa(mySeqs)
109
+
110
+## compute consensus scores using the BLOSUM62 matrix
111
+data(BLOSUM62)
112
+msaConservationScore(myAlignment, BLOSUM62)
113
+
114
+## compute consensus scores using the BLOSUM62 matrix
115
+## without scoring gap-gap pairs and using a different consensus sequence
116
+msaConservationScore(myAlignment, BLOSUM62, gapVsGap=0,
117
+                     type="upperlower")
118
+
119
+## compute a consensus matrix first
120
+conMat <- consensusMatrix(myAlignment)
121
+data(PAM250)
122
+msaConservationScore(conMat, PAM250, gapVsGap=0)
123
+
124
+
125
+## DNA example
126
+filepath <- system.file("examples", "exampleDNA.fasta", package="msa")
127
+mySeqs <- readDNAStringSet(filepath)
128
+
129
+## perform multiple alignment
130
+myAlignment <- msa(mySeqs)
131
+
132
+## create substitution matrix with gap penalty -8
133
+mat <- nucleotideSubstitutionMatrix(4, -1)
134
+mat <- cbind(rbind(mat, "-"=-8), "-"=-8)
135
+
136
+## compute consensus scores using this matrix
137
+msaConservationScore(myAlignment, mat, gapVsGap=0)
138
+}
139
+\keyword{manip}
... ...
@@ -9,7 +9,7 @@
9 9
     msaConvert(x,
10 10
                type=c("seqinr::alignment", "bios2mds::align",
11 11
                       "ape::AAbin", "ape::DNAbin",
12
-                      "phangorn::phyDat"))
12
+                      "phangorn::phyDat", "bio3d::fasta"))
13 13
 }
14 14
 \arguments{
15 15
   \item{x}{an object of class \code{\linkS4class{MultipleAlignment}}
... ...
@@ -20,8 +20,8 @@
20 20
   \item{type}{a character string specifying to which type of object
21 21
     \code{x} should be converted; currently, the
22 22
     values \code{"seqinr::alignment"}, \code{"bios2mds::align"},
23
-    \code{"ape::AAbin"}, \code{"ape::DNAbin"}, and
24
-    \code{"phangorn::phyDat"}.}
23
+    \code{"ape::AAbin"}, \code{"ape::DNAbin"},
24
+    \code{"phangorn::phyDat"}, and \code{"bio3d::fasta"}.}
25 25
   }
26 26
 \details{
27 27
   The function converts \code{x} to the class of object as
... ...
@@ -382,8 +382,8 @@ both as setter and getter functions. To set row or column masks, an
382 382
 \verb+IRanges+ object must be supplied:
383 383
 <<maskExample>>=
384 384
 myMaskedAlignment <- myFirstAlignment
385
-rowM <- IRanges(start=1, end=2)
386
-rowmask(myMaskedAlignment) <- rowM
385
+colM <- IRanges(start=1, end=100)
386
+colmask(myMaskedAlignment) <- colM
387 387
 myMaskedAlignment
388 388
 @
389 389
 
... ...
@@ -400,16 +400,26 @@ conMat <- consensusMatrix(myFirstAlignment)
400 400
 dim(conMat)
401 401
 conMat[, 101:110]
402 402
 @
403
-Note that \verb+consensusMatrix()+ cannot handle
404
-alignments with active masks. So, the masks in multiple alignment objects must
405
-must be removed prior to the computation of the consensus matrix:
403
+If called on a masked alignment, \verb+consensusMatrix()+ only uses those
404
+sequences/rows that are not masked. If there are masked columns,
405
+the matrix contains \verb+NA+'s in those columns:
406 406
 <<consensusExample2>>=
407
-conMat <- consensusMatrix(unmasked(myMaskedAlignment))
407
+conMat <- consensusMatrix(myMaskedAlignment)
408
+conMat[, 95:104]
408 409
 @
409 410
 
410
-Consensus strings can be computed from consensus matrices:
411
+Multiple alignments also inherit the \verb+consensusString()+ method from
412
+the \verb+Biostrings+ package. However, for more flexibility and consistency,
413
+we rather advise users to use the method \verb+msaConsensusSequence()+
414
+method (see below).
415
+
416
+\subsection{Consensus Sequences and Conservation Scores}
417
+
418
+With version 1.7.1 of \MSA, new methods have been provided that allow for
419
+the computation of consensus sequences and conservation scores.
420
+By default, the \verb+msaConsensusSequence()+ method is a wrapper around the
421
+\verb+consensusString()+ method from the \verb+Biostrings+:
411 422
 <<consensusExample3>>=
412
-## auxiliary function for splitting a string into displayable portions
413 423
 printSplitString <- function(x, width=getOption("width") - 1)
414 424
 {
415 425
     starts <- seq(from=1, to=nchar(x), by=width)
... ...
@@ -418,20 +428,55 @@ printSplitString <- function(x, width=getOption("width") - 1)
418 428
         cat(substr(x, starts[i], starts[i] + width - 1), "\n")
419 429
 }
420 430
 
421
-printSplitString(consensusString(conMat))
431
+printSplitString(msaConsensusSequence(myFirstAlignment))
422 432
 @
423
-\noindent Consensus sequences can also be computed directly without computing
424
-intermediate consensus matrices. However, the \verb+consensusString()+
425
-function cannot handle the
426
-masks contained in the multiple alignment objects (no matter whether
427
-there are active masks or not). Therefore, it is necessary to remove
428
-the masks beforehand:
433
+However, there is also a second method for computing consensus sequence that
434
+has been implemented in line with a consensus sequence method implemented
435
+in \TeXshade\ that allows for specify an upper and a lower conservation threshold
436
+(see example below). This method can be accessed via the argument
437
+\verb+type="upperlower"+. Additional customizations are available, too:
429 438
 <<consensusExample4>>=
430
-printSplitString(consensusString(unmasked(myFirstAlignment)))
431
-printSplitString(consensusString(unmasked(myMaskedAlignment)))
439
+printSplitString(msaConsensusSequence(myFirstAlignment, type="upperlower",
440
+                                      thresh=c(40, 20)))
441
+@
442
+
443
+Regardless of which method is used, masks are taken into account: masked
444
+rows/sequences are neglected and masked columns are shown as ``\verb+#+'' in
445
+the consensus sequence:
446
+<<consensusExample5>>=
447
+printSplitString(msaConsensusSequence(myMaskedAlignment, type="upperlower",
448
+                                      thresh=c(40, 20)))
449
+@
450
+
451
+The main purpose of consensus sequences is to get an impression of conservation
452
+at individual positions/columns of a multiple alignment. The \MSA\ package also
453
+provides another means of analyzing conservation: the method
454
+\verb+msaConservationScore()+ computes sums of pairwise scores for a given
455
+substitution/scoring matrix. Thereby, conservation can also be analyzed in a
456
+more sensible way than by only taking relative frequencies of letters into
457
+account as \verb+msaConsensusSequence()+ does.
458
+<<conservationExample1>>=
459
+data(BLOSUM62)
460
+msaConservationScore(myFirstAlignment, BLOSUM62)
461
+@
462
+As the above example shows, a substitution matrix must be provided. The result
463
+is obviously a vector as long as the alignment has columns. The entries of the
464
+vector are labeled by the consensus sequence. The way the consensus sequence is
465
+computed can be customized:
466
+<<conservationExample2>>=
467
+msaConservationScore(myFirstAlignment, BLOSUM62, gapVsGap=0,
468
+                     type="upperlower", thresh=c(40, 20))
469
+@
470
+The additional argument \verb+gapVsGap+ allows for controlling how pairs of
471
+gap are taken into account when computing pairwise scores (see
472
+\verb+?msaConservationScore+ for more details).
473
+
474
+Conservation scores can also be computed from masked alignments. For masked
475
+columns, \verb+NA+'s are returned:
476
+<<conservationExample3>>=
477
+msaConservationScore(myMaskedAlignment, BLOSUM62, gapVsGap=0,
478
+                     type="upperlower", thresh=c(40, 20))
432 479
 @
433
-\noindent Actually, the \verb+print()+ method (see Section~\ref{sec:msaPrint} above)
434
-uses this function to compute the consensus sequence.
435 480
 
436 481
 \subsection{Interfacing to Other Packages}
437 482
 
... ...
@@ -795,6 +840,16 @@ source package tarball, untar it, comment/uncomment the corresponding line in
795 840
 \verb+msa/src/ClustalOmega/msaMakefile+ (see first six lines), and
796 841
 build/install the package from source.
797 842
 
843
+\subsubsection*{Build/installation issues}
844
+
845
+Some users have reported compiler and linker errors when building \MSA\ from
846
+source on Linux systems. In almost all cases, these could have been tracked down
847
+to issues with the \R\ setup on those systems (e.g.\ a \verb+Rprofile.site+ file
848
+that makes changes to the \R\ environment that are not compatible with \MSA's
849
+Makefiles).\footnote{See, e.g., \url{https://support.bioconductor.org/p/90735/}}
850
+In most cases, these issues can be avoided by installing \MSA\ in a ``vanilla \R\ session'',
851
+i.e.\ starting \R\ with option \verb+--vanilla+ when installing \MSA.
852
+
798 853
 \section{Future Extensions}\label{sec:future}
799 854
 
800 855
 We envision the following changes/extensions in future versions of the package:
... ...
@@ -834,6 +889,21 @@ bibliography below).
834 889
 \section{Change Log}
835 890
 
836 891
 \begin{description}
892
+\item[Version 1.7.1:] \mbox{ }  \begin{itemize}
893
+   \item additional conversions implemented for \verb+msaConvert()+ function
894
+   \item added a new method \verb+msaConsensusSequence()+ that extends the
895
+     functionality provided by \verb+Biostring+'s \verb+consensusString()+ method
896
+   \item added a new method \verb+msaConservationScore()+
897
+   \item \verb+print()+ method extended such that it now also allows for
898
+     customization of the consensus sequence (via the new
899
+     \verb+msaConsensusSequence()+ method)
900
+   \item package now depends on \verb+Biostrings+ version $\geq$2.40.0 in order
901
+     to make sure that \verb+consensusMatrix()+ also works correctly
902
+     for masked alignments
903
+   \item corresponding changes in documentation and vignette
904
+  \end{itemize}
905
+\item[Version 1.7.0:] new branch for Bioconductor 3.5 devel
906
+\item[Version 1.6.0:] release as part of Bioconductor 3.4
837 907
 \item[Version 1.5.5:] \mbox{ }  \begin{itemize}
838 908
    \item fixes in ClustalOmega source code to ensure Windows compatibility of
839 909
    GCC6 compatibility fix