Browse code

Added a new function msaCheckNames; version number bumped to 1.3.4

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/msa@115880 bc3139a8-67e5-0310-9ffc-ced21a209358

Ulrich Bodenhofer authored on 06/04/2016 12:24:49
Showing 6 changed files

... ...
@@ -1,8 +1,8 @@
1 1
 Package: msa
2 2
 Type: Package
3 3
 Title: Multiple Sequence Alignment
4
-Version: 1.3.3
5
-Date: 2016-02-25
4
+Version: 1.3.4
5
+Date: 2016-04-06
6 6
 Author: Enrico Bonatesta, Christoph Horejs-Kainrath, Ulrich Bodenhofer
7 7
 Maintainer: Ulrich Bodenhofer <bodenhofer@bioinf.jku.at>
8 8
 Description: This package provides a unified R/Bioconductor interface to the
... ...
@@ -27,7 +27,7 @@ LazyLoad: yes
27 27
 Collate: AllClasses.R AllGenerics.R params-methods.R version-methods.R
28 28
         helperFunctions.R inputChecks.R convertRows.R msaPrettyPrint.R
29 29
         print-methods.R show-methods.R msa.R msaMuscle.R msaClustalW.R
30
-        msaClustalOmega.R msaConvert.R
30
+        msaClustalOmega.R msaConvert.R msaCheckNames.R
31 31
 biocViews: MultipleSequenceAlignment, Alignment, MultipleComparison,
32 32
         Sequencing
33 33
 NeedsCompilation: yes
... ...
@@ -7,9 +7,10 @@ import(Biostrings)
7 7
 import(S4Vectors)
8 8
 import(IRanges)
9 9
 importFrom(tools, texi2dvi)
10
+importFrom(utils, capture.output)
10 11
 
11 12
 export(msa, msaMuscle, msaClustalW, msaClustalOmega, msaPrettyPrint,
12
-       msaConvert)
13
+       msaConvert, msaCheckNames)
13 14
 
14 15
 exportClasses(MsaDNAMultipleAlignment, MsaRNAMultipleAlignment,
15 16
               MsaAAMultipleAlignment, MsaMetaData)
16 17
new file mode 100644
... ...
@@ -0,0 +1,20 @@
1
+msaCheckNames <- function(x, replacement=" ", verbose=TRUE)  # check/fix row names of a multiple alignment
2
+{
3
+    if (!is(x, "MultipleAlignment"))
4
+        stop("x must be a multiple alignment object")
5
+    # x itself is never modified; all changes go into the copy 'out'
6
+    out <- x
7
+    # negated character class: matches any character that is NOT a letter, digit, space, or one of , ; : . ? ! - ( ) ' "
8
+    pattern <- "[^a-zA-Z0-9,;:.?!\\-\\(\\)\\'\" ]"
9
+    # only touch the row names if at least one of them contains a matching character
10
+    if (length(grep(pattern, rownames(x), perl=TRUE)) > 0)
11
+    {
12
+        if (verbose)
13
+            message("sequence names contain invalid characters")
14
+        # every matching character is substituted by 'replacement' (not just the first one per name)
15
+        rownames(out) <- gsub(pattern, replacement, rownames(x),
16
+                              perl=TRUE)
17
+    }
18
+    # returned invisibly, so a bare call at top level does not print the alignment
19
+    invisible(out)
20
+}
... ...
@@ -1,6 +1,13 @@
1 1
 Change history of package msa:
2 2
 ==============================
3 3
 
4
+Version 1.3.4:
5
+- added function for checking and fixing sequence names for
6
+  possibly problematic characters that could lead to LaTeX
7
+  errors when using msaPrettyPrint()
8
+- corresponding changes in documentation
9
+- minor namespace fix
10
+
4 11
 Version 1.3.3:
5 12
 - added function for converting multiple sequence alignments for
6 13
   use with other sequence alignment packages
7 14
new file mode 100644
... ...
@@ -0,0 +1,91 @@
1
+\name{msaCheckNames}
2
+\alias{msaCheckNames}
3
+\title{Check and fix sequence names}
4
+\description{
5
+  This function checks and fixes sequence names of multiple
6
+  alignment objects if they contain characters
7
+  that might lead to LaTeX problems when using
8
+  \code{\link{msaPrettyPrint}}.
9
+}
10
+\usage{
11
+    msaCheckNames(x, replacement=" ", verbose=TRUE)
12
+}
13
+\arguments{
14
+  \item{x}{an object of class \code{\linkS4class{MultipleAlignment}}
15
+    (which includes objects of classes
16
+    \code{\linkS4class{MsaAAMultipleAlignment}},
17
+    \code{\linkS4class{MsaDNAMultipleAlignment}}, and
18
+    \code{\linkS4class{MsaRNAMultipleAlignment}})}
19
+  \item{replacement}{a character string specifying with which
20
+    character(s) potentially problematic characters should be replaced.}
21
+  \item{verbose}{if \code{TRUE} (default), a message is shown
22
+    if potentially problematic characters are found. Otherwise,
23
+    the function silently replaces these characters (see details below).}
24
+}
25
+\details{
26
+  The \pkg{Biostrings} package does not impose any restrictions on
27
+  the names of sequences. Consequently, \pkg{msa} also allows all
28
+  possible ASCII strings as sequence (row) names in multiple alignments.
29
+  As soon as \code{\link{msaPrettyPrint}} is used for pretty-printing
30
+  multiple sequence alignments, however, the sequence names are
31
+  interpreted as plain LaTeX source code. Consequently, LaTeX errors
32
+  may arise because of characters or words in the sequence names that LaTeX
33
+  does not or cannot interpret as plain text correctly. This
34
+  particularly includes appearances of special characters and backslash
35
+  characters in the sequence names.
36
+
37
+  The \code{msaCheckNames} function takes a multiple alignment object
38
+  and checks sequence names for possibly problematic characters, which
39
+  are all characters but letters (upper and lower case), digits,
40
+  spaces, commas, colons, semicolons, periods, question and exclamation
41
+  marks, dashes, parentheses, single quotes, and double quotes.
42
+  All other characters are
43
+  considered problematic. The function allows for both checking and
44
+  fixing the sequence names. If called with \code{verbose=TRUE}
45
+  (default), the function prints a message if a problematic character is
46
+  found. At the same time, regardless of the \code{verbose} argument,
47
+  the function invisibly returns a copy of \code{x} in whose sequence
48
+  names all problematic characters have been replaced by the string
49
+  that is supplied via the \code{replacement} argument (the default is
50
+  a single space).
51
+
52
+  In any case, the best solution is to check sequence names carefully
53
+  and to avoid problematic sequence names from the beginning.
54
+}
55
+\value{
56
+  The function invisibly returns a copy of the argument \code{x}
57
+  (therefore, an object of the same class as \code{x}), but
58
+  with modified sequence/row names (see details above).
59
+}
60
+\author{Ulrich Bodenhofer <msa@bioinf.jku.at>
61
+}
62
+\references{
63
+  \url{http://www.bioinf.jku.at/software/msa}
64
+  
65
+  U. Bodenhofer, E. Bonatesta, C. Horejs-Kainrath, and S. Hochreiter
66
+  (2015). msa: an R package for multiple sequence alignment. 
67
+  \emph{Bioinformatics} \bold{31}(24):3997-3999. DOI:
68
+  \href{http://dx.doi.org/10.1093/bioinformatics/btv494}{10.1093/bioinformatics/btv494}.
69
+}
70
+\seealso{\code{\link{msaPrettyPrint}},
71
+  \code{\linkS4class{MsaAAMultipleAlignment}},
72
+  \code{\linkS4class{MsaDNAMultipleAlignment}},
73
+  \code{\linkS4class{MsaRNAMultipleAlignment}}
74
+}
75
+\examples{
76
+## create toy example
77
+mySeqs <- DNAStringSet(c("ACGATCGATC", "ACGACGATC", "ACGATCCCCC"))
78
+names(mySeqs) <- c("Seq. #1", "Seq. \\2", "Seq. ~3")
79
+
80
+## perform multiple alignment
81
+myAlignment <- msa(mySeqs)
82
+myAlignment
83
+
84
+## check names
85
+msaCheckNames(myAlignment)
86
+
87
+## fix names
88
+myAlignment <- msaCheckNames(myAlignment, replacement="", verbose=FALSE)
89
+myAlignment
90
+}
91
+\keyword{manip}
... ...
@@ -666,10 +666,28 @@ the \shade\ package must be loaded in the preamble:
666 666
 \end{verbatim}
667 667
 \end{quote}
668 668
 
669
+\subsection{Sequence Names}
670
+
671
+The \verb+Biostrings+ package does not impose any restrictions on the names of
672
+sequences. Consequently, \MSA\ also allows all possible ASCII strings as
673
+sequence (row) names in multiple alignments. As soon as \verb+msaPrettyPrint()+
674
+is used for pretty-printing multiple sequence alignments, however, the sequence
675
+names are interpreted as plain \LaTeX\ source code. Consequently, \LaTeX\ errors
676
+may arise because of characters or words in the sequence names that \LaTeX\
677
+does not or cannot interpret as plain text correctly. This particularly includes
678
+appearances of special characters and backslash characters in the sequence names.
679
+
680
+The \MSA\ package offers a function \verb+msaCheckNames()+ which allows for finding
681
+and replacing potentially problematic characters in the sequence names of
682
+multiple alignment objects (see \verb+?msaCheckNames+). However, the best solution
683
+is to check sequence names carefully and to avoid problematic sequence names from the beginning.
684
+Note, moreover, that overly long sequence names will lead to less appealing outputs,
685
+so users are generally advised to consider sequence names carefully.
686
+
669 687
 \subsection{Further Caveats}
670 688
 
671 689
 \begin{itemize}
672
-  \item Note that \verb+texi2dvi()+ and \verb+ttexi2pdf()+ always
690
+  \item Note that \verb+texi2dvi()+ and \verb+texi2pdf()+ always
673 691
     save the resulting DVI/PDF files to the current working directory,
674 692
     even if the \LaTeX\ source file is in a different directory.
675 693
     That is also the reason why the temporary file is created in the
... ...
@@ -764,6 +782,13 @@ bibliography below).
764 782
 \section{Change Log}
765 783
 
766 784
 \begin{description}
785
+\item[Version 1.3.4:] \mbox{ }  \begin{itemize}
786
+   \item added function for checking and fixing sequence names for
787
+     possibly problematic characters that could lead to \LaTeX\ errors
788
+     when using \verb+msaPrettyPrint()+
789
+   \item corresponding changes in documentation
790
+   \item minor namespace fix
791
+  \end{itemize}
767 792
 \item[Version 1.3.3:] \mbox{ }  \begin{itemize}
768 793
    \item added function for converting multiple sequence alignments for
769 794
      use with other sequence alignment packages