git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/msa@115880 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -1,8 +1,8 @@ |
1 | 1 |
Package: msa |
2 | 2 |
Type: Package |
3 | 3 |
Title: Multiple Sequence Alignment |
4 |
-Version: 1.3.3 |
|
5 |
-Date: 2016-02-25 |
|
4 |
+Version: 1.3.4 |
|
5 |
+Date: 2016-04-06 |
|
6 | 6 |
Author: Enrico Bonatesta, Christoph Horejs-Kainrath, Ulrich Bodenhofer |
7 | 7 |
Maintainer: Ulrich Bodenhofer <bodenhofer@bioinf.jku.at> |
8 | 8 |
Description: This package provides a unified R/Bioconductor interface to the |
... | ... |
@@ -27,7 +27,7 @@ LazyLoad: yes |
27 | 27 |
Collate: AllClasses.R AllGenerics.R params-methods.R version-methods.R |
28 | 28 |
helperFunctions.R inputChecks.R convertRows.R msaPrettyPrint.R |
29 | 29 |
print-methods.R show-methods.R msa.R msaMuscle.R msaClustalW.R |
30 |
- msaClustalOmega.R msaConvert.R |
|
30 |
+ msaClustalOmega.R msaConvert.R msaCheckNames.R |
|
31 | 31 |
biocViews: MultipleSequenceAlignment, Alignment, MultipleComparison, |
32 | 32 |
Sequencing |
33 | 33 |
NeedsCompilation: yes |
... | ... |
@@ -7,9 +7,10 @@ import(Biostrings) |
7 | 7 |
import(S4Vectors) |
8 | 8 |
import(IRanges) |
9 | 9 |
importFrom(tools, texi2dvi) |
10 |
+importFrom(utils, capture.output) |
|
10 | 11 |
|
11 | 12 |
export(msa, msaMuscle, msaClustalW, msaClustalOmega, msaPrettyPrint, |
12 |
- msaConvert) |
|
13 |
+ msaConvert, msaCheckNames) |
|
13 | 14 |
|
14 | 15 |
exportClasses(MsaDNAMultipleAlignment, MsaRNAMultipleAlignment, |
15 | 16 |
MsaAAMultipleAlignment, MsaMetaData) |
16 | 17 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,20 @@ |
1 |
## Check the sequence (row) names of a multiple alignment object for
## characters outside a fixed whitelist and replace them.
##
## Arguments:
##   x            object for which is(x, "MultipleAlignment") holds
##   replacement  string substituted for every non-whitelisted character
##   verbose      if TRUE, a message is emitted when such characters are found
##
## Value: invisibly, a copy of 'x' whose row names have been cleaned; 'x'
## itself is returned (invisibly) if no name matches the pattern.
msaCheckNames <- function(x, replacement=" ", verbose=TRUE)
{
    if (!is(x, "MultipleAlignment"))
        stop("x must be a multiple alignment object")

    ## negated character class: matches anything that is NOT a letter, digit,
    ## space, or one of  , ; : . ? ! - ( ) ' "
    pattern <- "[^a-zA-Z0-9,;:.?!\\-\\(\\)\\'\" ]"

    seqNames <- rownames(x)

    ## nothing to fix -> hand back the input unchanged
    if (!any(grepl(pattern, seqNames, perl=TRUE)))
        return(invisible(x))

    if (verbose)
        message("sequence names contain invalid characters")

    ## 'x' is a local copy here, so the caller's object is not modified
    rownames(x) <- gsub(pattern, replacement, seqNames, perl=TRUE)

    invisible(x)
}
... | ... |
@@ -1,6 +1,13 @@ |
1 | 1 |
Change history of package msa: |
2 | 2 |
============================== |
3 | 3 |
|
4 |
+Version 1.3.4: |
|
5 |
+- added function for checking and fixing sequence names for |
|
6 |
+ possibly problematic characters that could lead to LaTeX |
|
7 |
+ errors when using msaPrettyPrint() |
|
8 |
+- corresponding changes in documentation |
|
9 |
+- minor namespace fix |
|
10 |
+ |
|
4 | 11 |
Version 1.3.3: |
5 | 12 |
- added function for converting multiple sequence alignments for |
6 | 13 |
use with other sequence alignment packages |
7 | 14 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,91 @@ |
1 |
+\name{msaCheckNames} |
|
2 |
+\alias{msaCheckNames} |
|
3 |
+\title{Check and fix sequence names} |
|
4 |
+\description{ |
|
5 |
+ This function checks and fixes sequence names of multiple |
|
6 |
+ alignment objects if they contain characters |
|
7 |
+ that might lead to LaTeX problems when using |
|
8 |
+ \code{\link{msaPrettyPrint}}. |
|
9 |
+} |
|
10 |
+\usage{ |
|
11 |
+ msaCheckNames(x, replacement=" ", verbose=TRUE) |
|
12 |
+} |
|
13 |
+\arguments{ |
|
14 |
+ \item{x}{an object of class \code{\linkS4class{MultipleAlignment}} |
|
15 |
+ (which includes objects of classes |
|
16 |
+ \code{\linkS4class{MsaAAMultipleAlignment}}, |
|
17 |
+ \code{\linkS4class{MsaDNAMultipleAlignment}}, and |
|
18 |
+ \code{\linkS4class{MsaRNAMultipleAlignment}})} |
|
19 |
+ \item{replacement}{a character string specifying with which |
|
20 |
+ character(s) potentially problematic characters should be replaced.} |
|
21 |
+ \item{verbose}{if \code{TRUE} (default), a message is shown |
|
22 |
+ if potentially problematic characters are found. Otherwise, |
|
23 |
+ the function silently replaces these characters (see details below).} |
|
24 |
+} |
|
25 |
+\details{ |
|
26 |
+ The \pkg{Biostrings} package does not impose any restrictions on |
|
27 |
+ the names of sequences. Consequently, \pkg{msa} also allows all |
|
28 |
+ possible ASCII strings as sequence (row) names in multiple alignments. |
|
29 |
+ As soon as \code{\link{msaPrettyPrint}} is used for pretty-printing |
|
30 |
+ multiple sequence alignments, however, the sequence names are |
|
31 |
+ interpreted as plain LaTeX source code. Consequently, LaTeX errors |
|
32 |
+ may arise because of characters or words in the sequence names that LaTeX |
|
33 |
+ does not or cannot interpret as plain text correctly. This |
|
34 |
+ particularly includes appearances of special characters and backslash |
|
35 |
+ characters in the sequence names. |
|
36 |
+ |
|
37 |
+ The \code{msaCheckNames} function takes a multiple alignment object |
|
38 |
+ and checks sequence names for possibly problematic characters, which |
|
39 |
+ are all characters but letters (upper and lower case), digits, |
|
40 |
+ spaces, commas, colons, semicolons, periods, question and exclamation |
|
41 |
+ marks, dashes, parentheses, single quotes, and double quotes. |
|
42 |
+ All other characters are |
|
43 |
+ considered problematic. The function allows for both checking and |
|
44 |
+ fixing the sequence names. If called with \code{verbose=TRUE} |
|
45 |
+ (default), the function prints a message if a problematic character is |
|
46 |
+ found. At the same time, regardless of the \code{verbose} argument, |
|
47 |
+ the function invisibly returns a copy of \code{x} in whose sequence |
|
48 |
+ names all problematic characters have been replaced by the string |
|
49 |
+ that is supplied via the \code{replacement} argument (the default is |
|
50 |
+ a single space). |
|
51 |
+ |
|
52 |
+ In any case, the best solution is to check sequence names carefully |
|
53 |
+ and to avoid problematic sequence names from the beginning. |
|
54 |
+} |
|
55 |
+\value{ |
|
56 |
+ The function invisibly returns a copy of the argument \code{x} |
|
57 |
+ (therefore, an object of the same class as \code{x}), but |
|
58 |
+ with modified sequence/row names (see details above). |
|
59 |
+} |
|
60 |
+\author{Ulrich Bodenhofer <msa@bioinf.jku.at> |
|
61 |
+} |
|
62 |
+\references{ |
|
63 |
+ \url{http://www.bioinf.jku.at/software/msa} |
|
64 |
+ |
|
65 |
+ U. Bodenhofer, E. Bonatesta, C. Horejs-Kainrath, and S. Hochreiter |
|
66 |
+ (2015). msa: an R package for multiple sequence alignment. |
|
67 |
+ \emph{Bioinformatics} \bold{31}(24):3997-3999. DOI: |
|
68 |
+ \href{http://dx.doi.org/10.1093/bioinformatics/btv494}{10.1093/bioinformatics/btv494}. |
|
69 |
+} |
|
70 |
+\seealso{\code{\link{msaPrettyPrint}}, |
|
71 |
+ \code{\linkS4class{MsaAAMultipleAlignment}}, |
|
72 |
+ \code{\linkS4class{MsaDNAMultipleAlignment}}, |
|
73 |
+ \code{\linkS4class{MsaRNAMultipleAlignment}} |
|
74 |
+} |
|
75 |
+\examples{ |
|
76 |
+## create toy example |
|
77 |
+mySeqs <- DNAStringSet(c("ACGATCGATC", "ACGACGATC", "ACGATCCCCC")) |
|
78 |
+names(mySeqs) <- c("Seq. #1", "Seq. \\2", "Seq. ~3") |
|
79 |
+ |
|
80 |
+## perform multiple alignment |
|
81 |
+myAlignment <- msa(mySeqs) |
|
82 |
+myAlignment |
|
83 |
+ |
|
84 |
+## check names |
|
85 |
+msaCheckNames(myAlignment) |
|
86 |
+ |
|
87 |
+## fix names |
|
88 |
+myAlignment <- msaCheckNames(myAlignment, replacement="", verbose=FALSE) |
|
89 |
+myAlignment |
|
90 |
+} |
|
91 |
+\keyword{manip} |
... | ... |
@@ -666,10 +666,28 @@ the \shade\ package must be loaded in the preamble: |
666 | 666 |
\end{verbatim} |
667 | 667 |
\end{quote} |
668 | 668 |
|
669 |
+\subsection{Sequence Names} |
|
670 |
+ |
|
671 |
+The \verb+Biostrings+ package does not impose any restrictions on the names of |
|
672 |
+sequences. Consequently, \MSA\ also allows all possible ASCII strings as |
|
673 |
+sequence (row) names in multiple alignments. As soon as \verb+msaPrettyPrint()+ |
|
674 |
+is used for pretty-printing multiple sequence alignments, however, the sequence |
|
675 |
+names are interpreted as plain \LaTeX\ source code. Consequently, \LaTeX\ errors |
|
676 |
+may arise because of characters or words in the sequence names that \LaTeX\ |
|
677 |
+does not or cannot interpret as plain text correctly. This particularly includes |
|
678 |
+appearances of special characters and backslash characters in the sequence names. |
|
679 |
+ |
|
680 |
+The \MSA\ package offers a function \verb+msaCheckNames()+ which allows for finding |
|
681 |
+and replacing potentially problematic characters in the sequence names of |
|
682 |
+multiple alignment objects (see \verb+?msaCheckNames+). However, the best solution |
|
683 |
+is to check sequence names carefully and to avoid problematic sequence names from the beginning. |
|
684 |
+Note, moreover, that overly long sequence names will lead to less appealing output, |
|
685 |
+so users are generally advised to consider sequence names carefully. |
|
686 |
+ |
|
669 | 687 |
\subsection{Further Caveats} |
670 | 688 |
|
671 | 689 |
\begin{itemize} |
672 |
- \item Note that \verb+texi2dvi()+ and \verb+ttexi2pdf()+ always |
|
690 |
+ \item Note that \verb+texi2dvi()+ and \verb+texi2pdf()+ always |
|
673 | 691 |
save the resulting DVI/PDF files to the current working directory, |
674 | 692 |
even if the \LaTeX\ source file is in a different directory. |
675 | 693 |
That is also the reason why the temporary file is created in the |
... | ... |
@@ -764,6 +782,13 @@ bibliography below). |
764 | 782 |
\section{Change Log} |
765 | 783 |
|
766 | 784 |
\begin{description} |
785 |
+\item[Version 1.3.4:] \mbox{ } \begin{itemize} |
|
786 |
+ \item added function for checking and fixing sequence names for |
|
787 |
+ possibly problematic characters that could lead to \LaTeX\ errors |
|
788 |
+ when using \verb+msaPrettyPrint()+ |
|
789 |
+ \item corresponding changes in documentation |
|
790 |
+ \item minor namespace fix |
|
791 |
+ \end{itemize} |
|
767 | 792 |
\item[Version 1.3.3:] \mbox{ } \begin{itemize} |
768 | 793 |
\item added function for converting multiple sequence alignments for |
769 | 794 |
use with other sequence alignment packages |