Browse code

update to latest gstruct; brings faster bam_tally (for high coverage regions) and read-group filtering support in bam_tally

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/gmapR@80611 bc3139a8-67e5-0310-9ffc-ced21a209358

Michael Lawrence authored on 20/09/2013 02:05:42
Showing 26 changed files

... ...
@@ -10,7 +10,7 @@ Description: GSNAP and GMAP are a pair of tools to align short-read
10 10
     to work with GMAP and GSNAP from within R. In addition, it provides 
11 11
     methods to tally alignment results on a per-nucleotide basis using 
12 12
     the bam_tally tool.
13
-Version: 1.3.9
13
+Version: 1.3.10
14 14
 Depends: R (>= 2.15.0), methods, GenomicRanges
15 15
 Imports: IRanges, Rsamtools (>= 1.7.4), rtracklayer (>= 1.17.15), GenomicRanges,
16 16
          GenomicFeatures, Biostrings, VariantAnnotation (>= 1.7.34), tools,
... ...
@@ -8,6 +8,7 @@
8 8
 setClass("BamTallyParam",
9 9
          representation(genome = "GmapGenome",
10 10
                         which = "GenomicRanges",
11
+                        desired_read_group = "characterORNULL",
11 12
                         minimum_mapq = "integer",
12 13
                         concordant_only = "logical",
13 14
                         unique_only = "logical",
... ...
@@ -32,13 +33,15 @@ normArgWhich <- function(x, genome) {
32 33
 }
33 34
 
34 35
 BamTallyParam <- function(genome, which = GRanges(),
35
-                          minimum_mapq = 0L,
36
+                          desired_read_group = NULL, minimum_mapq = 0L,
36 37
                           concordant_only = FALSE, unique_only = FALSE,
37 38
                           primary_only = FALSE, ignore_duplicates = FALSE,
38 39
                           min_depth = 0L, variant_strand = 0L,
39 40
                           ignore_query_Ns = FALSE,
40 41
                           indels = FALSE)
41 42
 {
43
+  if (!is.null(desired_read_group) && !isSingleString(desired_read_group))
44
+    stop("'desired_read_group' must be NULL or a single, non-NA string")
42 45
   if (!isSingleNumber(minimum_mapq) || minimum_mapq < 0)
43 46
     stop("minimum_mapq must be a single, non-negative, non-NA number")
44 47
   if (!isTRUEorFALSE(concordant_only))
... ...
@@ -120,11 +120,11 @@ normalizeIndelAlleles <- function(x, genome) {
120 120
   is.indel <- nchar(ref(x)) == 0L | nchar(alt(x)) == 0L
121 121
   if (any(is.indel)) {
122 122
     indels <- x[is.indel]
123
-    indels <- flank(indels, 1)
124
-    anchor <- getSeq(genome, indels)
125
-    ref(indels) <- paste0(anchor, ref(indels))
126
-    alt(indels) <- paste0(anchor, alt(indels))
127
-    x[is.indel] <- indels
123
+    flanks <- flank(indels, 1)
124
+    anchor <- getSeq(genome, flanks)
125
+    ref(x)[is.indel] <- paste0(anchor, ref(indels))
126
+    alt(x)[is.indel] <- paste0(anchor, alt(indels))
127
+    ranges(x)[is.indel] <- resize(ranges(indels), nchar(ref(x)[is.indel]), "end")
128 128
   }
129 129
   x
130 130
 }
... ...
@@ -136,7 +136,7 @@ normalizeIndelAlleles <- function(x, genome) {
136 136
 normArgSingleInteger <- function(x) {
137 137
   name <- deparse(substitute(x))
138 138
   x <- as.integer(x)
139
-  if (!IRanges:::isSingleInteger(x))
139
+  if (!isSingleInteger(x))
140 140
     stop("'", name, "' should be a single, non-NA integer")
141 141
   x
142 142
 }
... ...
@@ -149,7 +149,8 @@ normArgTRUEorFALSE <- function(x) {
149 149
 
150 150
 .bam_tally_C <- function(bamreader, genome_dir = NULL, db = NULL,
151 151
                          which = NULL, read_pos_breaks = NULL,
152
-                         high_base_quality = 0L, alloclength = 200000L,
152
+                         high_base_quality = 0L, desired_read_group = NULL,
153
+                         alloclength = 200000L,
153 154
                          minimum_mapq = 0L, good_unique_mapq = 35L,
154 155
                          maximum_nhits = 1000000L,
155 156
                          concordant_only = FALSE, unique_only = FALSE,
... ...
@@ -164,10 +165,12 @@ normArgTRUEorFALSE <- function(x) {
164 165
   if (length(which) > 0L) {
165 166
     which <- list(as.character(seqnames(which)), start(which), end(which))
166 167
   } else which <- NULL
167
-  if (!is.null(genome_dir) && !IRanges:::isSingleString(genome_dir))
168
+  if (!is.null(genome_dir) && !isSingleString(genome_dir))
168 169
     stop("'genome_dir' must be NULL or a single, non-NA string")
169
-  if (!is.null(db) && !IRanges:::isSingleString(db))
170
+  if (!is.null(db) && !isSingleString(db))
170 171
     stop("'db' must be NULL or a single, non-NA string")
172
+  if (!is.null(desired_read_group) && !isSingleString(desired_read_group))
173
+    stop("'desired_read_group' must be NULL or a single, non-NA string")
171 174
   if (!is.null(read_pos_breaks)) {
172 175
     read_pos_breaks <- as.integer(read_pos_breaks)
173 176
     if (any(is.na(read_pos_breaks)))
... ...
@@ -178,6 +181,7 @@ normArgTRUEorFALSE <- function(x) {
178 181
       stop("'read_pos_breaks' must be sorted")
179 182
   }
180 183
   .Call(R_Bamtally_iit, bamreader@.extptr, genome_dir, db, which,
184
+        desired_read_group,
181 185
         normArgSingleInteger(alloclength),
182 186
         normArgSingleInteger(minimum_mapq),
183 187
         normArgSingleInteger(good_unique_mapq),
... ...
@@ -14,14 +14,16 @@
14 14
 }
15 15
 
16 16
 \usage{
17
-  BamTallyParam(genome, which = RangesList(), read_pos_breaks = NULL,
17
+  BamTallyParam(genome, which = RangesList(),
18
+                desired_read_group = NULL,
19
+                read_pos_breaks = NULL,
18 20
                 high_base_quality = 0L,
19 21
                 minimum_mapq = 0L,
20 22
                 concordant_only = FALSE, unique_only = FALSE,
21 23
                 primary_only = FALSE, ignore_duplicates = FALSE,
22 24
                 min_depth = 0L, variant_strand = 0L,
23 25
                 ignore_query_Ns = FALSE,
24
-                indels = FALSE)
26
+                indels = TRUE)
25 27
 }
26 28
 \arguments{
27 29
   \item{genome}{A \code{GmapGenome} object, or something coercible to one.}
... ...
@@ -29,6 +31,8 @@
29 31
     one that limits the tally to that range or set of ranges. By
30 32
     default, the entire genome is processed.
31 33
   }
34
+  \item{desired_read_group}{The name of the read group to which to limit
35
+    the tallying; if not NULL, must be a single, non-NA string.}
32 36
   \item{read_pos_breaks}{The breaks, like those passed to \code{\link{cut}}
33 37
     for aggregating the per-read position counts. If \code{NULL}, no per-cycle
34 38
     counts are returned.}
... ...
@@ -54,7 +58,12 @@
54 58
     is a good way to save resources.}
55 59
   \item{ignore_query_Ns}{Whether to ignore the N base pairs when
56 60
     counting. Can save a lot of resources when processing low quality data.}
57
-  \item{indels}{Whether to return indel counts; not supported yet.}
61
+  \item{indels}{Whether to return indel counts. The \code{ref} and
62
+    \code{alt} columns in the returned \code{VRanges} conform to VCF
63
+    conventions; i.e., the first base upstream is included. The range
64
+    always spans the sequence in \code{ref}; so e.g. a deletion extends
65
+    one nt upstream of the actual deleted sequence.
66
+  }
58 67
 }
59 68
 
60 69
 \seealso{
... ...
@@ -49,7 +49,7 @@ gstruct/Makefile: gstruct/configure ${R_SRC_DIR}/samtools
49 49
 	            --prefix=${PREFIX} --includedir=${GSTRUCT_INCLUDE_DIR} \
50 50
 	            --libdir=${PREFIX}/${LIBnn} \
51 51
 	            --with-samtools=${R_SRC_DIR}/samtools \
52
-		    --disable-binaries --disable-maintainer-mode #\
52
+		    --disable-binaries #--disable-maintainer-mode #\
53 53
 ## does not appear to be a true dependency yet
54 54
 ##	            --with-gmap=${PREFIX}/bin
55 55
 
... ...
@@ -7,7 +7,7 @@
7 7
 static const R_CallMethodDef callMethods[] = {
8 8
 
9 9
   /* bamtally.c */
10
-  CALLMETHOD_DEF(R_Bamtally_iit, 18),
10
+  CALLMETHOD_DEF(R_Bamtally_iit, 19),
11 11
   CALLMETHOD_DEF(R_tally_iit_parse, 4),
12 12
   
13 13
   /* bamreader.c */
... ...
@@ -13,7 +13,7 @@
13 13
 
14 14
 SEXP
15 15
 R_Bamtally_iit (SEXP bamreader_R, SEXP genome_dir_R, SEXP db_R,
16
-                SEXP which_R,
16
+                SEXP which_R, SEXP desired_read_group_R,
17 17
                 SEXP alloclength_R,
18 18
                 SEXP minimum_mapq_R, SEXP good_unique_mapq_R,
19 19
                 SEXP maximum_nhits_R,
... ...
@@ -30,6 +30,9 @@ R_Bamtally_iit (SEXP bamreader_R, SEXP genome_dir_R, SEXP db_R,
30 30
     genome_dir_R == R_NilValue ? NULL : CHAR(asChar(genome_dir_R));
31 31
   const char *db = CHAR(asChar(db_R));
32 32
   int alloclength = asInteger(alloclength_R);
33
+  const char *desired_read_group =
34
+    desired_read_group_R == R_NilValue ? NULL :
35
+    CHAR(asChar(desired_read_group_R));
33 36
   int minimum_mapq = asInteger(minimum_mapq_R);
34 37
   int good_unique_mapq = asInteger(good_unique_mapq_R);
35 38
   int maximum_nhits = asInteger(maximum_nhits_R);
... ...
@@ -57,9 +60,12 @@ R_Bamtally_iit (SEXP bamreader_R, SEXP genome_dir_R, SEXP db_R,
57 60
     end = asInteger(VECTOR_ELT(which_R, 2));
58 61
   }
59 62
 
60
-  IIT_T tally_iit = Bamtally_iit(bamreader, (char *)chr, start, end,
61
-                                 genome, chromosome_iit,
62
-                                 alloclength, minimum_mapq, good_unique_mapq,
63
+  IIT_T tally_iit = Bamtally_iit(bamreader, (char *)chr, 
64
+                                 /* TODO: bam_lacks_chr */ NULL,
65
+                                 start, end,
66
+                                 genome, chromosome_iit, alloclength,
67
+                                 (char *)desired_read_group,
68
+                                 minimum_mapq, good_unique_mapq,
63 69
                                  maximum_nhits, need_concordant_p,
64 70
                                  need_unique_p, need_primary_p,
65 71
                                  ignore_duplicates_p,
... ...
@@ -10,7 +10,7 @@ R_Bamread_new (SEXP bamfile_R);
10 10
 
11 11
 SEXP
12 12
 R_Bamtally_iit (SEXP bamreader_R, SEXP genome_dir_R, SEXP db_R,
13
-                SEXP which_R,
13
+                SEXP which_R, SEXP desired_read_group_R,
14 14
                 SEXP alloclength_R,
15 15
                 SEXP minimum_mapq_R, SEXP good_unique_mapq_R,
16 16
                 SEXP maximum_nhits_R,
... ...
@@ -51,13 +51,13 @@ build_triplet = @build@
51 51
 host_triplet = @host@
52 52
 target_triplet = @target@
53 53
 subdir = .
54
-DIST_COMMON = INSTALL NEWS README AUTHORS ChangeLog \
54
+DIST_COMMON = INSTALL NEWS README AUTHORS ChangeLog COPYING \
55 55
 	$(srcdir)/Makefile.in $(srcdir)/Makefile.am \
56 56
 	$(top_srcdir)/configure $(am__configure_deps) \
57
-	$(top_srcdir)/lib/gstruct.pc.in COPYING config/compile \
58
-	config/config.guess config/config.sub config/depcomp \
59
-	config/install-sh config/missing config/ltmain.sh \
60
-	$(top_srcdir)/config/compile $(top_srcdir)/config/config.guess \
57
+	$(top_srcdir)/lib/gstruct.pc.in config/compile \
58
+	config/config.guess config/config.sub config/install-sh \
59
+	config/missing config/ltmain.sh $(top_srcdir)/config/compile \
60
+	$(top_srcdir)/config/config.guess \
61 61
 	$(top_srcdir)/config/config.sub \
62 62
 	$(top_srcdir)/config/install-sh $(top_srcdir)/config/ltmain.sh \
63 63
 	$(top_srcdir)/config/missing
... ...
@@ -231,7 +231,6 @@ LIBTOOL = @LIBTOOL@
231 231
 LIPO = @LIPO@
232 232
 LN_S = @LN_S@
233 233
 LTLIBOBJS = @LTLIBOBJS@
234
-MAINT = @MAINT@
235 234
 MAKEINFO = @MAKEINFO@
236 235
 MANIFEST_TOOL = @MANIFEST_TOOL@
237 236
 MKDIR_P = @MKDIR_P@
... ...
@@ -329,7 +328,7 @@ all: all-recursive
329 328
 .SUFFIXES:
330 329
 am--refresh: Makefile
331 330
 	@:
332
-$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am  $(am__configure_deps)
331
+$(srcdir)/Makefile.in:  $(srcdir)/Makefile.am  $(am__configure_deps)
333 332
 	@for dep in $?; do \
334 333
 	  case '$(am__configure_deps)' in \
335 334
 	    *$$dep*) \
... ...
@@ -356,9 +355,9 @@ Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
356 355
 $(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
357 356
 	$(SHELL) ./config.status --recheck
358 357
 
359
-$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
358
+$(top_srcdir)/configure:  $(am__configure_deps)
360 359
 	$(am__cd) $(srcdir) && $(AUTOCONF)
361
-$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
360
+$(ACLOCAL_M4):  $(am__aclocal_m4_deps)
362 361
 	$(am__cd) $(srcdir) && $(ACLOCAL) $(ACLOCAL_AMFLAGS)
363 362
 $(am__aclocal_m4_deps):
364 363
 lib/gstruct-${LIBGSTRUCT_API_VERSION}.pc: $(top_builddir)/config.status $(top_srcdir)/lib/gstruct.pc.in
... ...
@@ -1,22 +1,2 @@
1 1
 
2 2
 
3
-
4
-Usage:
5
-
6
-bam_singlemap -d <genome> <bam> | samtools view -Sb - > <new bam>
7
-
8
-bam_tally -d <genome> <bam>
9
-bam_gstruct -d <genome> <bam> | iit_store -o <pred>
10
-bam_print -d <genome> -m <genes>,<pred> <bam> <coords>
11
-
12
-
13
-genecompare -d <genome> -m <genes> <pred.iit> > x, or
14
-genecompare <goldstd.iit> <pred.iit> > x
15
-
16
-
17
-Depends on samtools and gmap packages.
18
-
19
-gfusion depends on gmap and qsub
20
-
21
-
22
-
... ...
@@ -1 +1 @@
1
-2013-02-14
2 1
\ No newline at end of file
2
+2013-09-12
3 3
\ No newline at end of file
... ...
@@ -596,42 +596,6 @@ fi
596 596
 rmdir .tst 2>/dev/null
597 597
 AC_SUBST([am__leading_dot])])
598 598
 
599
-# Add --enable-maintainer-mode option to configure.         -*- Autoconf -*-
600
-# From Jim Meyering
601
-
602
-# Copyright (C) 1996-2013 Free Software Foundation, Inc.
603
-#
604
-# This file is free software; the Free Software Foundation
605
-# gives unlimited permission to copy and/or distribute it,
606
-# with or without modifications, as long as this notice is preserved.
607
-
608
-# AM_MAINTAINER_MODE([DEFAULT-MODE])
609
-# ----------------------------------
610
-# Control maintainer-specific portions of Makefiles.
611
-# Default is to disable them, unless 'enable' is passed literally.
612
-# For symmetry, 'disable' may be passed as well.  Anyway, the user
613
-# can override the default with the --enable/--disable switch.
614
-AC_DEFUN([AM_MAINTAINER_MODE],
615
-[m4_case(m4_default([$1], [disable]),
616
-       [enable], [m4_define([am_maintainer_other], [disable])],
617
-       [disable], [m4_define([am_maintainer_other], [enable])],
618
-       [m4_define([am_maintainer_other], [enable])
619
-        m4_warn([syntax], [unexpected argument to AM@&t@_MAINTAINER_MODE: $1])])
620
-AC_MSG_CHECKING([whether to enable maintainer-specific portions of Makefiles])
621
-  dnl maintainer-mode's default is 'disable' unless 'enable' is passed
622
-  AC_ARG_ENABLE([maintainer-mode],
623
-    [AS_HELP_STRING([--]am_maintainer_other[-maintainer-mode],
624
-      am_maintainer_other[ make rules and dependencies not useful
625
-      (and sometimes confusing) to the casual installer])],
626
-    [USE_MAINTAINER_MODE=$enableval],
627
-    [USE_MAINTAINER_MODE=]m4_if(am_maintainer_other, [enable], [no], [yes]))
628
-  AC_MSG_RESULT([$USE_MAINTAINER_MODE])
629
-  AM_CONDITIONAL([MAINTAINER_MODE], [test $USE_MAINTAINER_MODE = yes])
630
-  MAINT=$MAINTAINER_MODE_TRUE
631
-  AC_SUBST([MAINT])dnl
632
-]
633
-)
634
-
635 599
 # Check to see how 'make' treats includes.	            -*- Autoconf -*-
636 600
 
637 601
 # Copyright (C) 2001-2013 Free Software Foundation, Inc.
... ...
@@ -1,6 +1,6 @@
1 1
 #! /bin/sh
2 2
 # Guess values for system-dependent variables and create Makefiles.
3
-# Generated by GNU Autoconf 2.69 for gstruct 2013-02-14.
3
+# Generated by GNU Autoconf 2.69 for gstruct 2013-09-12.
4 4
 #
5 5
 # Report bugs to <Thomas Wu <twu@gene.com>>.
6 6
 #
... ...
@@ -590,8 +590,8 @@ MAKEFLAGS=
590 590
 # Identity of this package.
591 591
 PACKAGE_NAME='gstruct'
592 592
 PACKAGE_TARNAME='gstruct'
593
-PACKAGE_VERSION='2013-02-14'
594
-PACKAGE_STRING='gstruct 2013-02-14'
593
+PACKAGE_VERSION='2013-09-12'
594
+PACKAGE_STRING='gstruct 2013-09-12'
595 595
 PACKAGE_BUGREPORT='Thomas Wu <twu@gene.com>'
596 596
 PACKAGE_URL=''
597 597
 
... ...
@@ -684,9 +684,6 @@ MAINTAINER_FALSE
684 684
 MAINTAINER_TRUE
685 685
 FULLDIST_FALSE
686 686
 FULLDIST_TRUE
687
-MAINT
688
-MAINTAINER_MODE_FALSE
689
-MAINTAINER_MODE_TRUE
690 687
 AM_BACKSLASH
691 688
 AM_DEFAULT_VERBOSITY
692 689
 AM_DEFAULT_V
... ...
@@ -789,7 +786,6 @@ enable_option_checking
789 786
 enable_largefile
790 787
 enable_dependency_tracking
791 788
 enable_silent_rules
792
-enable_maintainer_mode
793 789
 enable_fulldist
794 790
 enable_maintainer
795 791
 enable_static_linking
... ...
@@ -1355,7 +1351,7 @@ if test "$ac_init_help" = "long"; then
1355 1351
   # Omit some internal or obsolete options to make the list less imposing.
1356 1352
   # This message is too long to be a string in the A/UX 3.1 sh.
1357 1353
   cat <<_ACEOF
1358
-\`configure' configures gstruct 2013-02-14 to adapt to many kinds of systems.
1354
+\`configure' configures gstruct 2013-09-12 to adapt to many kinds of systems.
1359 1355
 
1360 1356
 Usage: $0 [OPTION]... [VAR=VALUE]...
1361 1357
 
... ...
@@ -1426,7 +1422,7 @@ fi
1426 1422
 
1427 1423
 if test -n "$ac_init_help"; then
1428 1424
   case $ac_init_help in
1429
-     short | recursive ) echo "Configuration of gstruct 2013-02-14:";;
1425
+     short | recursive ) echo "Configuration of gstruct 2013-09-12:";;
1430 1426
    esac
1431 1427
   cat <<\_ACEOF
1432 1428
 
... ...
@@ -1441,9 +1437,6 @@ Optional Features:
1441 1437
                           speeds up one-time build
1442 1438
   --enable-silent-rules   less verbose build output (undo: "make V=1")
1443 1439
   --disable-silent-rules  verbose build output (undo: "make V=0")
1444
-  --disable-maintainer-mode
1445
-                          disable make rules and dependencies not useful (and
1446
-                          sometimes confusing) to the casual installer
1447 1440
   --enable-fulldist       For use by program maintainer
1448 1441
   --enable-maintainer     For use by program maintainer
1449 1442
   --enable-static-linking Link binaries statically (default=no)
... ...
@@ -1545,7 +1538,7 @@ fi
1545 1538
 test -n "$ac_init_help" && exit $ac_status
1546 1539
 if $ac_init_version; then
1547 1540
   cat <<\_ACEOF
1548
-gstruct configure 2013-02-14
1541
+gstruct configure 2013-09-12
1549 1542
 generated by GNU Autoconf 2.69
1550 1543
 
1551 1544
 Copyright (C) 2012 Free Software Foundation, Inc.
... ...
@@ -2151,7 +2144,7 @@ cat >config.log <<_ACEOF
2151 2144
 This file contains any messages produced by compilers while
2152 2145
 running configure, to aid debugging if configure makes a mistake.
2153 2146
 
2154
-It was created by gstruct $as_me 2013-02-14, which was
2147
+It was created by gstruct $as_me 2013-09-12, which was
2155 2148
 generated by GNU Autoconf 2.69.  Invocation command line was
2156 2149
 
2157 2150
   $ $0 $@
... ...
@@ -2501,8 +2494,8 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu
2501 2494
 
2502 2495
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking package version" >&5
2503 2496
 $as_echo_n "checking package version... " >&6; }
2504
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: 2013-02-14" >&5
2505
-$as_echo "2013-02-14" >&6; }
2497
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: 2013-09-12" >&5
2498
+$as_echo "2013-09-12" >&6; }
2506 2499
 
2507 2500
 ### Read defaults
2508 2501
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking CONFIG_SITE" >&5
... ...
@@ -4256,8 +4249,9 @@ fi
4256 4249
 
4257 4250
 
4258 4251
 # Define the identity of the package.
4259
- PACKAGE='gstruct'
4260
- VERSION='2013-02-14'
4252
+
4253
+ PACKAGE=gstruct
4254
+ VERSION=2013-09-12
4261 4255
 
4262 4256
 
4263 4257
 cat >>confdefs.h <<_ACEOF
... ...
@@ -4432,29 +4426,6 @@ fi
4432 4426
 
4433 4427
 
4434 4428
 
4435
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to enable maintainer-specific portions of Makefiles" >&5
4436
-$as_echo_n "checking whether to enable maintainer-specific portions of Makefiles... " >&6; }
4437
-    # Check whether --enable-maintainer-mode was given.
4438
-if test "${enable_maintainer_mode+set}" = set; then :
4439
-  enableval=$enable_maintainer_mode; USE_MAINTAINER_MODE=$enableval
4440
-else
4441
-  USE_MAINTAINER_MODE=yes
4442
-fi
4443
-
4444
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $USE_MAINTAINER_MODE" >&5
4445
-$as_echo "$USE_MAINTAINER_MODE" >&6; }
4446
-   if test $USE_MAINTAINER_MODE = yes; then
4447
-  MAINTAINER_MODE_TRUE=
4448
-  MAINTAINER_MODE_FALSE='#'
4449
-else
4450
-  MAINTAINER_MODE_TRUE='#'
4451
-  MAINTAINER_MODE_FALSE=
4452
-fi
4453
-
4454
-  MAINT=$MAINTAINER_MODE_TRUE
4455
-
4456
-
4457
-
4458 4429
  if test "x$enable_fulldist" = xyes; then
4459 4430
   FULLDIST_TRUE=
4460 4431
   FULLDIST_FALSE='#'
... ...
@@ -15457,10 +15428,6 @@ else
15457 15428
   am__EXEEXT_FALSE=
15458 15429
 fi
15459 15430
 
15460
-if test -z "${MAINTAINER_MODE_TRUE}" && test -z "${MAINTAINER_MODE_FALSE}"; then
15461
-  as_fn_error $? "conditional \"MAINTAINER_MODE\" was never defined.
15462
-Usually this means the macro was only invoked conditionally." "$LINENO" 5
15463
-fi
15464 15431
 if test -z "${FULLDIST_TRUE}" && test -z "${FULLDIST_FALSE}"; then
15465 15432
   as_fn_error $? "conditional \"FULLDIST\" was never defined.
15466 15433
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
... ...
@@ -15875,7 +15842,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
15875 15842
 # report actual input values of CONFIG_FILES etc. instead of their
15876 15843
 # values after options handling.
15877 15844
 ac_log="
15878
-This file was extended by gstruct $as_me 2013-02-14, which was
15845
+This file was extended by gstruct $as_me 2013-09-12, which was
15879 15846
 generated by GNU Autoconf 2.69.  Invocation command line was
15880 15847
 
15881 15848
   CONFIG_FILES    = $CONFIG_FILES
... ...
@@ -15941,7 +15908,7 @@ _ACEOF
15941 15908
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
15942 15909
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
15943 15910
 ac_cs_version="\\
15944
-gstruct config.status 2013-02-14
15911
+gstruct config.status 2013-09-12
15945 15912
 configured by $0, generated by GNU Autoconf 2.69,
15946 15913
   with options \\"\$ac_cs_config\\"
15947 15914
 
... ...
@@ -63,8 +63,7 @@ AC_SYS_LARGEFILE
63 63
 AC_ARG_PROGRAM
64 64
 
65 65
 #AM_INIT_AUTOMAKE([no-dependencies])
66
-AM_INIT_AUTOMAKE([1.10.0])
67
-AM_MAINTAINER_MODE([enable])
66
+AM_INIT_AUTOMAKE(AC_PACKAGE_NAME, AC_PACKAGE_VERSION)
68 67
 
69 68
 AM_CONDITIONAL(FULLDIST,test "x$enable_fulldist" = xyes)
70 69
 AC_ARG_ENABLE([fulldist],
... ...
@@ -360,7 +360,6 @@ LIBTOOL = @LIBTOOL@
360 360
 LIPO = @LIPO@
361 361
 LN_S = @LN_S@
362 362
 LTLIBOBJS = @LTLIBOBJS@
363
-MAINT = @MAINT@
364 363
 MAKEINFO = @MAKEINFO@
365 364
 MANIFEST_TOOL = @MANIFEST_TOOL@
366 365
 MKDIR_P = @MKDIR_P@
... ...
@@ -705,7 +704,7 @@ all: config.h
705 704
 
706 705
 .SUFFIXES:
707 706
 .SUFFIXES: .c .lo .o .obj
708
-$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am  $(am__configure_deps)
707
+$(srcdir)/Makefile.in:  $(srcdir)/Makefile.am  $(am__configure_deps)
709 708
 	@for dep in $?; do \
710 709
 	  case '$(am__configure_deps)' in \
711 710
 	    *$$dep*) \
... ...
@@ -730,9 +729,9 @@ Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
730 729
 $(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
731 730
 	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
732 731
 
733
-$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
732
+$(top_srcdir)/configure:  $(am__configure_deps)
734 733
 	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
735
-$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
734
+$(ACLOCAL_M4):  $(am__aclocal_m4_deps)
736 735
 	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
737 736
 $(am__aclocal_m4_deps):
738 737
 
... ...
@@ -743,7 +742,7 @@ config.h: stamp-h1
743 742
 stamp-h1: $(srcdir)/config.h.in $(top_builddir)/config.status
744 743
 	@rm -f stamp-h1
745 744
 	cd $(top_builddir) && $(SHELL) ./config.status src/config.h
746
-$(srcdir)/config.h.in: @MAINTAINER_MODE_TRUE@ $(am__configure_deps) 
745
+$(srcdir)/config.h.in:  $(am__configure_deps) 
747 746
 	($(am__cd) $(top_srcdir) && $(AUTOHEADER))
748 747
 	rm -f stamp-h1
749 748
 	touch $@
... ...
@@ -1,4 +1,4 @@
1
-static char rcsid[] = "$Id: bam_fastq.c 87713 2013-03-01 18:32:34Z twu $";
1
+static char rcsid[] = "$Id: bam_fastq.c 108654 2013-09-19 23:11:00Z twu $";
2 2
 #ifdef HAVE_CONFIG_H
3 3
 #include <config.h>
4 4
 #endif
... ...
@@ -57,6 +57,7 @@ static char rcsid[] = "$Id: bam_fastq.c 87713 2013-03-01 18:32:34Z twu $";
57 57
 static char *output_root = "out";
58 58
 static char *error_root = NULL;
59 59
 static bool fastap = false;
60
+static bool add_bounds_p = false;
60 61
 static bool allp = true;
61 62
 static bool single_end_output_p = false; /* print paired-end BAM as single-end file */
62 63
 static int terminal_minlength = -1;
... ...
@@ -75,6 +76,7 @@ static struct option long_options[] = {
75 76
   {"output", required_argument, 0, 'o'},	/* output_root */
76 77
   {"error", required_argument, 0, 'e'},	/* error_root */
77 78
   {"fasta", no_argument, 0, 0},			/* fastap */
79
+  {"add-bounds", no_argument, 0, 0},		/* add_bounds_p */
78 80
   {"all", no_argument, 0, 'A'}, /* allp */
79 81
   {"single-end-output", no_argument, 0, '1'}, /* single_end_output_p */
80 82
   {"spliced", no_argument, 0, 0}, /* splicedp */
... ...
@@ -109,6 +111,7 @@ Usage: bam_fastq [options] [-o <output>] <bam file...>\n\
109 111
 \n\
110 112
 Options:\n\
111 113
   --fasta                         Writes to stdout in FASTA format\n\
114
+  --add-lh                        Add chromosomal low/high designation for concordant paired-end alignments\n\
112 115
   -o, --output=STRING             Writes FASTQ to <output>_1.fq and <output>_2.fq (default=out)\n\
113 116
   -1, --single-end-output         Prints reads as single-end (even if BAM files have paired-end reads)\n\
114 117
   -S, --terminal-minlength=INT    Print only reads with terminal ends (cigar S length) equal to\n\
... ...
@@ -197,11 +200,35 @@ struct T {
197 200
   char *queryseq;
198 201
   char *quality_string;
199 202
   bool splicedp;
203
+
204
+  char querystart_strand;
205
+  char *querystart_chr;
206
+  Genomicpos_T querystart_chrpos;
207
+
208
+  char querybreak1_strand;
209
+  char *querybreak1_chr;
210
+  Genomicpos_T querybreak1_chrpos;
211
+
212
+  char querybreak2_strand;
213
+  char *querybreak2_chr;
214
+  Genomicpos_T querybreak2_chrpos;
215
+
216
+  char queryend_strand;
217
+  char *queryend_chr;
218
+  Genomicpos_T queryend_chrpos;
200 219
 };
201 220
 
202 221
 
203 222
 static void
204 223
 Read_free (T *old) {
224
+  if ((*old)->querybreak1_chrpos != 0) {
225
+    FREE((*old)->querybreak1_chr);
226
+  }
227
+  if ((*old)->querybreak2_chrpos != 0) {
228
+    FREE((*old)->querybreak2_chr);
229
+  }
230
+  FREE((*old)->queryend_chr);
231
+  FREE((*old)->querystart_chr);
205 232
   FREE((*old)->queryseq);
206 233
   if ((*old)->quality_string != NULL) {
207 234
     FREE((*old)->quality_string);
... ...
@@ -211,6 +238,102 @@ Read_free (T *old) {
211 238
 }
212 239
 
213 240
 
241
+static void
242
+Read_add_chrinfo_querystart (Read_T read, int flag, char *chr, Genomicpos_T chrpos_low, Genomicpos_T chrpos_high) {
243
+
244
+  if (chr == NULL) {
245
+    read->querystart_strand = '*';
246
+    read->querystart_chr = (char *) CALLOC(1,sizeof(char));
247
+    read->querystart_chr[0] = '\0';
248
+    read->querystart_chrpos = 0;
249
+  } else {
250
+    read->querystart_chr = (char *) CALLOC(strlen(chr)+1,sizeof(char));
251
+    strcpy(read->querystart_chr,chr);
252
+    if (flag & QUERY_MINUSP) {
253
+      read->querystart_strand = '-';
254
+      read->querystart_chrpos = chrpos_high;
255
+    } else {
256
+      read->querystart_strand = '+';
257
+      read->querystart_chrpos = chrpos_low;
258
+    }
259
+  }
260
+
261
+  return;
262
+}
263
+
264
+static void
265
+Read_add_chrinfo_querybreak2 (Read_T read, int flag, char *chr, Genomicpos_T chrpos_low, Genomicpos_T chrpos_high) {
266
+
267
+  if (chr == NULL) {
268
+    abort();
269
+    read->querybreak2_strand = '*';
270
+    read->querybreak2_chr = (char *) CALLOC(1,sizeof(char));
271
+    read->querybreak2_chr[0] = '\0';
272
+    read->querybreak2_chrpos = 0;
273
+  } else {
274
+    read->querybreak2_chr = (char *) CALLOC(strlen(chr)+1,sizeof(char));
275
+    strcpy(read->querybreak2_chr,chr);
276
+    if (flag & QUERY_MINUSP) {
277
+      read->querybreak2_strand = '-';
278
+      read->querybreak2_chrpos = chrpos_high;
279
+    } else {
280
+      read->querybreak2_strand = '+';
281
+      read->querybreak2_chrpos = chrpos_low;
282
+    }
283
+  }
284
+
285
+  return;
286
+}
287
+
288
+
289
+static void
290
+Read_add_chrinfo_queryend (Read_T read, int flag, char *chr, Genomicpos_T chrpos_low, Genomicpos_T chrpos_high) {
291
+
292
+  if (chr == NULL) {
293
+    read->queryend_strand = '*';
294
+    read->queryend_chr = (char *) CALLOC(1,sizeof(char));
295
+    read->queryend_chr[0] = '\0';
296
+    read->queryend_chrpos = 0;
297
+  } else {
298
+    read->queryend_chr = (char *) CALLOC(strlen(chr)+1,sizeof(char));
299
+    strcpy(read->queryend_chr,chr);
300
+    if (flag & QUERY_MINUSP) {
301
+      read->queryend_strand = '-';
302
+      read->queryend_chrpos = chrpos_low;
303
+    } else {
304
+      read->queryend_strand = '+';
305
+      read->queryend_chrpos = chrpos_high;
306
+    }
307
+  }
308
+
309
+  return;
310
+}
311
+
312
+static void
313
+Read_add_chrinfo_querybreak1 (Read_T read, int flag, char *chr, Genomicpos_T chrpos_low, Genomicpos_T chrpos_high) {
314
+
315
+  if (chr == NULL) {
316
+    abort();
317
+    read->querybreak1_strand = '*';
318
+    read->querybreak1_chr = (char *) CALLOC(1,sizeof(char));
319
+    read->querybreak1_chr[0] = '\0';
320
+    read->querybreak1_chrpos = 0;
321
+  } else {
322
+    read->querybreak1_chr = (char *) CALLOC(strlen(chr)+1,sizeof(char));
323
+    strcpy(read->querybreak1_chr,chr);
324
+    if (flag & QUERY_MINUSP) {
325
+      read->querybreak1_strand = '-';
326
+      read->querybreak1_chrpos = chrpos_low;
327
+    } else {
328
+      read->querybreak1_strand = '+';
329
+      read->querybreak1_chrpos = chrpos_high;
330
+    }
331
+  }
332
+
333
+  return;
334
+}
335
+
336
+
214 337
 static T
215 338
 Read_create_bam (bool *completep, T read, Bamline_T bamline) {
216 339
   unsigned int flag;
... ...
@@ -220,6 +343,7 @@ Read_create_bam (bool *completep, T read, Bamline_T bamline) {
220 343
   char strand;
221 344
   int type;
222 345
   int cigar_querylength, mlength, querypos, i;
346
+  /* bool insidep; */
223 347
 
224 348
   flag = Bamline_flag(bamline);
225 349
   shortread = Bamline_read(bamline);
... ...
@@ -235,6 +359,9 @@ Read_create_bam (bool *completep, T read, Bamline_T bamline) {
235 359
       strcpy(read->quality_string,quality_string);
236 360
     }
237 361
     read->splicedp = true;
362
+    read->querybreak1_chrpos = read->querybreak2_chrpos = 0;
363
+    Read_add_chrinfo_querystart(read,flag,Bamline_chr(bamline),Bamline_chrpos_low(bamline),Bamline_chrpos_high(bamline));
364
+    Read_add_chrinfo_queryend(read,flag,Bamline_chr(bamline),Bamline_chrpos_low(bamline),Bamline_chrpos_high(bamline));
238 365
     *completep = true;
239 366
     return read;
240 367
   }
... ...
@@ -244,6 +371,7 @@ Read_create_bam (bool *completep, T read, Bamline_T bamline) {
244 371
   cigar_querylength = Bamline_cigar_querylength(bamline);
245 372
 
246 373
   if (read == NULL) {
374
+    /* Initialize */
247 375
     read = (T) MALLOC(sizeof(struct T));
248 376
     read->queryseq = (char *) CALLOC(cigar_querylength+1,sizeof(char));
249 377
     for (i = 0; i < cigar_querylength; i++) {
... ...
@@ -255,24 +383,42 @@ Read_create_bam (bool *completep, T read, Bamline_T bamline) {
255 383
       read->quality_string = (char *) CALLOC(cigar_querylength+1,sizeof(char));
256 384
     }
257 385
     read->splicedp = false;
386
+    read->querybreak1_chrpos = read->querybreak2_chrpos = 0;
258 387
   }
259 388
 
260 389
   if (flag & QUERY_MINUSP) {
261 390
     strand = '-';
262 391
     querypos = cigar_querylength - 1;
392
+    if (Intlist_head(types) == 'H') {
393
+      Read_add_chrinfo_querybreak1(read,flag,Bamline_chr(bamline),Bamline_chrpos_low(bamline),Bamline_chrpos_high(bamline));
394
+    } else {
395
+      Read_add_chrinfo_queryend(read,flag,Bamline_chr(bamline),Bamline_chrpos_low(bamline),Bamline_chrpos_high(bamline));
396
+    }
263 397
   } else {
264 398
     strand = '+';
265 399
     querypos = 0;
400
+    if (Intlist_head(types) == 'H') {
401
+      Read_add_chrinfo_querybreak2(read,flag,Bamline_chr(bamline),Bamline_chrpos_low(bamline),Bamline_chrpos_high(bamline));
402
+    } else {
403
+      Read_add_chrinfo_querystart(read,flag,Bamline_chr(bamline),Bamline_chrpos_low(bamline),Bamline_chrpos_high(bamline));
404
+    }
266 405
   }
267 406
 
407
+
408
+
268 409
   p = shortread;
269 410
   q = quality_string;
411
+  /* insidep = true; */
270 412
   for (a = types, b = npositions; a != NULL; a = Intlist_next(a), b = Uintlist_next(b)) {
271 413
     type = Intlist_head(a);
272 414
     mlength = Uintlist_head(b);
273 415
 
274 416
     if (type == 'H') {
275
-      querypos += ((strand == '+') ? +mlength : -mlength);
417
+      if (strand == '+') {
418
+	querypos += mlength;
419
+      } else {
420
+	querypos -= mlength;
421
+      }
276 422
 
277 423
     } else if (type == 'N') {
278 424
       /* Ignore */
... ...
@@ -328,6 +474,20 @@ Read_create_bam (bool *completep, T read, Bamline_T bamline) {
328 474
     }
329 475
   }
330 476
   
477
+  if (type == 'H') {
478
+    if (flag & QUERY_MINUSP) {
479
+      Read_add_chrinfo_querybreak2(read,flag,Bamline_chr(bamline),Bamline_chrpos_low(bamline),Bamline_chrpos_high(bamline));
480
+    } else {
481
+      Read_add_chrinfo_querybreak1(read,flag,Bamline_chr(bamline),Bamline_chrpos_low(bamline),Bamline_chrpos_high(bamline));
482
+    }
483
+  } else {
484
+    if (flag & QUERY_MINUSP) {
485
+      Read_add_chrinfo_querystart(read,flag,Bamline_chr(bamline),Bamline_chrpos_low(bamline),Bamline_chrpos_high(bamline));
486
+    } else {
487
+      Read_add_chrinfo_queryend(read,flag,Bamline_chr(bamline),Bamline_chrpos_low(bamline),Bamline_chrpos_high(bamline));
488
+    }
489
+  }
490
+
331 491
   *completep = true;
332 492
   for (i = 0; i < cigar_querylength; i++) {
333 493
     if (read->queryseq[i] == '*') {
... ...
@@ -335,6 +495,16 @@ Read_create_bam (bool *completep, T read, Bamline_T bamline) {
335 495
     }
336 496
   }
337 497
 
498
+#if 0
499
+  if (insidep == true && Bamline_paired_read_p(bamline) == true) {
500
+    if (Bamline_lowend_p(bamline) == true) {
501
+      read->lowp = true;
502
+    } else {
503
+      read->highp = true;
504
+    }
505
+  }
506
+#endif
507
+
338 508
   return read;
339 509
 }
340 510
 
... ...
@@ -349,14 +519,20 @@ Read_create_sam (bool *completep, T read, char *line) {
349 519
   int type;
350 520
   int cigar_querylength, mlength, querypos, i;
351 521
 
352
-  Genomicpos_T chrpos_low;
353
-  char *chr, *acc, *cigar, *auxinfo;
522
+  Genomicpos_T chrpos_low, chrpos_high, mate_chrpos_low;
523
+  char *chr, *mate_chr, *acc, *cigar, *auxinfo;
354 524
   int mapq;
355 525
   int readlength;
526
+  /* bool insidep; */
356 527
 
357
-  auxinfo = Samread_parse_line(&acc,&flag,&mapq,&chr,&chrpos_low,&cigar,&readlength,&shortread,&quality_string,line);
528
+  auxinfo = Samread_parse_line(&acc,&flag,&mapq,&chr,&chrpos_low,&cigar,
529
+			       &mate_chr,&mate_chrpos_low,&readlength,
530
+			       &shortread,&quality_string,line);
531
+  types = Samread_parse_cigar(&npositions,&cigar_querylength,cigar);
532
+  chrpos_high = Samread_chrpos_high(types,npositions,chrpos_low);
533
+  FREE(mate_chr);
534
+  FREE(cigar);
358 535
   FREE(acc);
359
-  FREE(chr);
360 536
 
361 537
   if (flag & QUERY_UNMAPPED) {
362 538
     read = (T) MALLOC(sizeof(struct T));
... ...
@@ -369,19 +545,24 @@ Read_create_sam (bool *completep, T read, char *line) {
369 545
       strcpy(read->quality_string,quality_string);
370 546
     }
371 547
     read->splicedp = true;
548
+    read->querybreak1_chrpos = read->querybreak2_chrpos = 0;
549
+    Read_add_chrinfo_querystart(read,flag,chr,chrpos_low,chrpos_high);
550
+    Read_add_chrinfo_queryend(read,flag,chr,chrpos_low,chrpos_high);
372 551
     *completep = true;
373 552
 
553
+    FREE(chr);
374 554
     FREE(cigar);
375 555
     FREE(quality_string);
376 556
     FREE(shortread);
377 557
 
558
+    Uintlist_free(&npositions);
559
+    Intlist_free(&types);
378 560
     return read;
379 561
   }
380 562
 
381
-  types = Samread_parse_cigar(&npositions,&cigar_querylength,cigar);
382
-  FREE(cigar);
383 563
 
384 564
   if (read == NULL) {
565
+    /* Initialize */
385 566
     read = (T) MALLOC(sizeof(struct T));
386 567
     read->queryseq = (char *) CALLOC(cigar_querylength+1,sizeof(char));
387 568
     for (i = 0; i < cigar_querylength; i++) {
... ...
@@ -393,24 +574,52 @@ Read_create_sam (bool *completep, T read, char *line) {
393 574
       read->quality_string = (char *) CALLOC(cigar_querylength+1,sizeof(char));
394 575
     }
395 576
     read->splicedp = false;
577
+    read->querybreak1_chrpos = read->querybreak2_chrpos = 0;
396 578
   }
397 579
 
398 580
   if (flag & QUERY_MINUSP) {
399 581
     strand = '-';
400 582
     querypos = cigar_querylength - 1;
583
+    if (Intlist_head(types) == 'H') {
584
+      Read_add_chrinfo_querybreak1(read,flag,chr,chrpos_low,chrpos_high);
585
+    } else {
586
+      Read_add_chrinfo_queryend(read,flag,chr,chrpos_low,chrpos_high);
587
+    }
401 588
   } else {
402 589
     strand = '+';
403 590
     querypos = 0;
591
+    if (Intlist_head(types) == 'H') {
592
+      Read_add_chrinfo_querybreak2(read,flag,chr,chrpos_low,chrpos_high);
593
+    } else {
594
+      Read_add_chrinfo_querystart(read,flag,chr,chrpos_low,chrpos_high);
595
+    }
404 596
   }
405 597
 
406 598
   p = shortread;
407 599
   q = quality_string;
600
+  /* insidep = true; */
408 601
   for (a = types, b = npositions; a != NULL; a = Intlist_next(a), b = Uintlist_next(b)) {
409 602
     type = Intlist_head(a);
410 603
     mlength = Uintlist_head(b);
411 604
 
412 605
     if (type == 'H') {
413
-      querypos += ((strand == '+') ? +mlength : -mlength);
606
+      if (strand == '+') {
607
+#if 0
608
+	if (querypos > 0) {
609
+	  /* Already filled in beginning of read and skipping end of read */
610
+	  insidep = false;
611
+	}
612
+#endif
613
+	querypos += mlength;
614
+      } else {
615
+#if 0
616
+	if (querypos == cigar_querylength - 1) {
617
+	  /* About to skip end of read */
618
+	  insidep = false;
619
+	}
620
+#endif
621
+	querypos -= mlength;
622
+      }
414 623
 
415 624
     } else if (type == 'N') {
416 625
       /* Ignore */
... ...
@@ -471,7 +680,20 @@ Read_create_sam (bool *completep, T read, char *line) {
471 680
   FREE(quality_string);
472 681
   FREE(shortread);
473 682
 
474
-  
683
+  if (type == 'H') {
684
+    if (flag & QUERY_MINUSP) {
685
+      Read_add_chrinfo_querybreak2(read,flag,chr,chrpos_low,chrpos_high);
686
+    } else {
687
+      Read_add_chrinfo_querybreak1(read,flag,chr,chrpos_low,chrpos_high);
688
+    }
689
+  } else {
690
+    if (flag & QUERY_MINUSP) {
691
+      Read_add_chrinfo_querystart(read,flag,chr,chrpos_low,chrpos_high);
692
+    } else {
693
+      Read_add_chrinfo_queryend(read,flag,chr,chrpos_low,chrpos_high);
694
+    }
695
+  }
696
+
475 697
   *completep = true;
476 698
   for (i = 0; i < cigar_querylength; i++) {
477 699
     if (read->queryseq[i] == '*') {
... ...
@@ -479,6 +701,18 @@ Read_create_sam (bool *completep, T read, char *line) {
479 701
     }
480 702
   }
481 703
 
704
+#if 0
705
+  if (insidep == true && mate_chr == NULL) {
706
+    if (chrpos_low < mate_chrpos_low) {
707
+      read->lowp = true;
708
+    } else if (chrpos_low > mate_chrpos_low) {
709
+      read->highp = true;
710
+    }
711
+  }
712
+#endif
713
+
714
+  FREE(chr);
715
+
482 716
   return read;
483 717
 }
484 718
 
... ...
@@ -567,36 +801,99 @@ print_upper_case (FILE *fp, char *queryseq) {
567 801
 static void
568 802
 print_fasta_single (FILE *fp, char *acc, T read) {
569 803
   if (want_splicedp == true && read->splicedp == true) {
570
-    fprintf(fp,">%s\n",acc);
804
+    fprintf(fp,">%s",acc);
805
+    if (add_bounds_p == true) {
806
+      fprintf(fp,"(%c%s:%u",read->querystart_strand,read->querystart_chr,read->querystart_chrpos);
807
+      if (read->querybreak1_chrpos != 0) {
808
+	fprintf(fp,",%c%s:%u",read->querybreak1_strand,read->querybreak1_chr,read->querybreak1_chrpos);
809
+	fprintf(fp,",%c%s:%u",read->querybreak2_strand,read->querybreak2_chr,read->querybreak2_chrpos);
810
+      }
811
+      fprintf(fp,",%c%s:%u)",read->queryend_strand,read->queryend_chr,read->queryend_chrpos);
812
+    }
813
+    fprintf(fp,"\n");
571 814
     print_upper_case(fp,read->queryseq);
572 815
   } else if (terminal_minlength < 0 || terminal_length(read->queryseq) >= terminal_minlength) {
573
-    fprintf(fp,">%s\n",acc);
816
+    fprintf(fp,">%s",acc);
817
+    if (add_bounds_p == true) {
818
+      fprintf(fp,"(%c%s:%u",read->querystart_strand,read->querystart_chr,read->querystart_chrpos);
819
+      if (read->querybreak1_chrpos != 0) {
820
+	fprintf(fp,",%c%s:%u",read->querybreak1_strand,read->querybreak1_chr,read->querybreak1_chrpos);
821
+	fprintf(fp,",%c%s:%u",read->querybreak2_strand,read->querybreak2_chr,read->querybreak2_chrpos);
822
+      }
823
+      fprintf(fp,",%c%s:%u)",read->queryend_strand,read->queryend_chr,read->queryend_chrpos);
824
+    }
825
+    fprintf(fp,"\n");
574 826
     fprintf(fp,"%s\n",read->queryseq);
575 827
   }
576 828
   return;
577 829
 }
578 830
 
579 831
 
832
+/* Bounds are from given read querystart to mate querystart */
580 833
 static void
581 834
 print_fasta_paired (FILE *fp, char *acc, T read1, T read2) {
582 835
   int terminal_length_1, terminal_length_2;
583 836
 
584 837
   if (single_end_output_p == true) {
585 838
     /* Important to add the /1 and /2 endings for the remove-intragenic program, which also requires unique alignments */
839
+    /* First read.  Mate is read2 */
586 840
     if (want_splicedp == true && read1->splicedp == true) {
587
-      fprintf(fp,">%s/1\n",acc);
841
+      fprintf(fp,">%s",acc);
842
+      if (add_bounds_p == true) {
843
+	fprintf(fp,"(%c%s:%u",read1->querystart_strand,read1->querystart_chr,read1->querystart_chrpos);
844
+	if (read1->querybreak1_chrpos != 0) {
845
+	  fprintf(fp,",%c%s:%u",read1->querybreak1_strand,read1->querybreak1_chr,read1->querybreak1_chrpos);
846
+	  fprintf(fp,",%c%s:%u",read1->querybreak2_strand,read1->querybreak2_chr,read1->querybreak2_chrpos);
847
+	}
848
+	fprintf(fp,",%c%s:%u",read1->queryend_strand,read1->queryend_chr,read1->queryend_chrpos);
849
+	fprintf(fp,"..%c%s:%u)",read2->querystart_strand,read2->querystart_chr,read2->querystart_chrpos);
850
+      }
851
+      fprintf(fp,"/1\n");
588 852
       print_upper_case(fp,read1->queryseq);
589 853
     } else if (terminal_minlength < 0 || terminal_length(read1->queryseq) >= terminal_minlength) {
590
-      fprintf(fp,">%s/1\n",acc);
854
+      fprintf(fp,">%s",acc);
855
+      if (add_bounds_p == true) {
856
+	fprintf(fp,"(%c%s:%u",read1->querystart_strand,read1->querystart_chr,read1->querystart_chrpos);
857
+	if (read1->querybreak1_chrpos != 0) {
858
+	  fprintf(fp,",%c%s:%u",read1->querybreak1_strand,read1->querybreak1_chr,read1->querybreak1_chrpos);
859
+	  fprintf(fp,",%c%s:%u",read1->querybreak2_strand,read1->querybreak2_chr,read1->querybreak2_chrpos);
860
+	}
861
+	fprintf(fp,",%c%s:%u",read1->queryend_strand,read1->queryend_chr,read1->queryend_chrpos);
862
+	fprintf(fp,"..%c%s:%u)",read2->querystart_strand,read2->querystart_chr,read2->querystart_chrpos);
863
+      }
864
+      fprintf(fp,"/1\n");
591 865
       fprintf(fp,"%s\n",read1->queryseq);
592 866
     }
867
+
868
+    /* Second read.  Mate is read1 */
593 869
     if (want_splicedp == true && read2->splicedp == true) {
594
-      fprintf(fp,">%s/2\n",acc);
870
+      fprintf(fp,">%s",acc);
871
+      if (add_bounds_p == true) {
872
+	fprintf(fp,"(%c%s:%u",read2->querystart_strand,read2->querystart_chr,read2->querystart_chrpos);
873
+	if (read2->querybreak1_chrpos != 0) {
874
+	  fprintf(fp,",%c%s:%u",read2->querybreak1_strand,read2->querybreak1_chr,read2->querybreak1_chrpos);
875
+	  fprintf(fp,",%c%s:%u",read2->querybreak2_strand,read2->querybreak2_chr,read2->querybreak2_chrpos);
876
+	}
877
+	fprintf(fp,",%c%s:%u",read2->queryend_strand,read2->queryend_chr,read2->queryend_chrpos);
878
+	fprintf(fp,"..%c%s:%u)",read1->querystart_strand,read1->querystart_chr,read1->querystart_chrpos);
879
+      }
880
+      fprintf(fp,"/2\n");
595 881
       print_upper_case(fp,read2->queryseq);
596 882
     } else if (terminal_minlength < 0 || terminal_length(read2->queryseq) >= terminal_minlength) {
597
-      fprintf(fp,">%s/2\n",acc);
883
+      fprintf(fp,">%s",acc);
884
+      if (add_bounds_p == true) {
885
+	fprintf(fp,"(%c%s:%u",read2->querystart_strand,read2->querystart_chr,read2->querystart_chrpos);
886
+	if (read2->querybreak1_chrpos != 0) {
887
+	  fprintf(fp,",%c%s:%u",read2->querybreak1_strand,read2->querybreak1_chr,read2->querybreak1_chrpos);
888
+	  fprintf(fp,",%c%s:%u",read2->querybreak2_strand,read2->querybreak2_chr,read2->querybreak2_chrpos);
889
+	}
890
+	fprintf(fp,",%c%s:%u",read2->queryend_strand,read2->queryend_chr,read2->queryend_chrpos);
891
+	fprintf(fp,"..%c%s:%u)",read1->querystart_strand,read1->querystart_chr,read1->querystart_chrpos);
892
+      }
893
+      fprintf(fp,"/2\n");
598 894
       fprintf(fp,"%s\n",read2->queryseq);
599 895
     }
896
+
600 897
   } else {
601 898
     if (want_splicedp == true && (read1->splicedp == true || read2->splicedp == true)) {
602 899
       fprintf(fp,">%s\n",acc);
... ...
@@ -789,9 +1086,9 @@ parse_bam_input (FILE *fp1, FILE *fp2, Bamreader_T bamreader, Table_T read_table
789 1086
   bool found1p, found2p, complete1p, complete2p;
790 1087
   bool copyp;
791 1088
 
792
-  while ((bamline = Bamread_next_bamline(bamreader,/*minimum_mapq*/0,/*good_unique_mapq*/0,/*maximum_nhits*/1000000,
793
-					 /*need_unique_p*/false,/*need_primary_p*/true,/*ignore_duplicates_p*/false,
794
-					 /*need_concordant_p*/false)) != NULL) {
1089
+  while ((bamline = Bamread_next_bamline(bamreader,/*desired_read_group*/NULL,/*minimum_mapq*/0,/*good_unique_mapq*/0,
1090
+					 /*maximum_nhits*/1000000,/*need_unique_p*/false,/*need_primary_p*/true,
1091
+					 /*ignore_duplicates_p*/false,/*need_concordant_p*/false)) != NULL) {
795 1092
     acc = Bamline_acc(bamline);
796 1093
     acc = strip_illumina_acc_ending(&copyp,acc);
797 1094
     debug(printf("Acc is %s",acc));
... ...
@@ -1050,6 +1347,9 @@ main (int argc, char *argv[]) {
1050 1347
       } else if (!strcmp(long_name,"fasta")) {
1051 1348
 	fastap = true;
1052 1349
 
1350
+      } else if (!strcmp(long_name,"add-bounds")) {
1351
+	add_bounds_p = true;
1352
+
1053 1353
       } else if (!strcmp(long_name,"terminal-minlength")) {
1054 1354
 	terminal_minlength = atoi(optarg);
1055 1355
 
... ...
@@ -1,4 +1,4 @@
1
-static char rcsid[] = "$Id: bam_get.c 87713 2013-03-01 18:32:34Z twu $";
1
+static char rcsid[] = "$Id: bam_get.c 108654 2013-09-19 23:11:00Z twu $";
2 2
 #ifdef HAVE_CONFIG_H
3 3
 #include <config.h>
4 4
 #endif
... ...
@@ -31,6 +31,7 @@ static char rcsid[] = "$Id: bam_get.c 87713 2013-03-01 18:32:34Z twu $";
31 31
 
32 32
 
33 33
 /* Filters */
34
+static char *desired_read_group = NULL;
34 35
 static int minimum_mapq = 0;
35 36
 static int good_unique_mapq = 35;
36 37
 static int maximum_nhits = 1000000;
... ...
@@ -50,6 +51,7 @@ static struct option long_options[] = {
50 51
   {"primary", required_argument, 0, 'P'}, /* need_primary_p */
51 52
   {"ignore-duplicates", no_argument, 0, 0}, /* ignore_duplicates_p */
52 53
   {"allow-duplicates", no_argument, 0, 0}, /* ignore_duplicates_p */
54
+  {"read-group", required_argument, 0, 0}, /* desired_read_group */
53 55
 
54 56
   /* Help options */
55 57
   {"version", no_argument, 0, 0}, /* print_program_version */
... ...
@@ -111,6 +113,9 @@ main (int argc, char *argv[]) {
111 113
       } else if (!strcmp(long_name,"allow-duplicates")) {
112 114
 	ignore_duplicates_p = false;
113 115
 
116
+      } else if (!strcmp(long_name,"read-group")) {
117
+	desired_read_group = optarg;
118
+
114 119
       } else {
115 120
 	/* Shouldn't reach here */
116 121
 	fprintf(stderr,"Don't recognize option %s.  For usage, run 'gsnap --help'",long_name);
... ...
@@ -164,7 +169,7 @@ main (int argc, char *argv[]) {
164 169
 
165 170
   Bamread_write_header(bamreader);
166 171
   Bamread_limit_region(bamreader,chromosome,chrstart,chrend);
167
-  while ((bamline = Bamread_next_bamline(bamreader,minimum_mapq,good_unique_mapq,maximum_nhits,
172
+  while ((bamline = Bamread_next_bamline(bamreader,desired_read_group,minimum_mapq,good_unique_mapq,maximum_nhits,
168 173
 					 need_unique_p,need_primary_p,ignore_duplicates_p,
169 174
 					 /*need_concordant_p*/true)) != NULL) {
170 175
     Bamline_print(stdout,bamline,Bamline_flag(bamline),quality_score_adj);
... ...
@@ -212,6 +217,8 @@ where\n\
212 217
          or startposition+length (+ strand)\n\
213 218
 \n\
214 219
 Filtering options\n\
220
+  --read-group=STRING            Require alignments to have this read group in the RG field of\n\
221
+                                   the BAM line\n\
215 222
   -q, --min-mapq=INT             Require alignments to have this mapping quality and higher\n\
216 223
                                    (default 0)\n\
217 224
   -n, --nhits=INT                Require alignments to have this many hits or fewer\n\
... ...
@@ -1,10 +1,11 @@
1
-static char rcsid[] = "$Id: bamread.c 87713 2013-03-01 18:32:34Z twu $";
1
+static char rcsid[] = "$Id: bamread.c 108654 2013-09-19 23:11:00Z twu $";
2 2
 #ifdef HAVE_CONFIG_H
3 3
 #include <config.h>
4 4
 #endif
5 5
 
6 6
 #include "bamread.h"
7 7
 #include <stdlib.h>
8
+#include "assert.h"
8 9
 #include "mem.h"
9 10
 #include "complement.h"
10 11
 #include "bool.h"
... ...
@@ -366,7 +367,7 @@ static void
366 367
 parse_line (T this, char **acc, unsigned int *flag, int *mapq, char **chr, Genomicpos_T *chrpos,
367 368
 	    char **mate_chr, Genomicpos_T *mate_chrpos, int *insert_length,
368 369
 	    Intlist_T *cigartypes, Uintlist_T *cigarlengths, int *cigarlength,
369
-	    int *readlength, char **read, char **quality_string) {
370
+	    int *readlength, char **read, char **quality_string, char **read_group) {
370 371
   int type;
371 372
   int i;
372 373
   unsigned int length;
... ...
@@ -424,6 +425,13 @@ parse_line (T this, char **acc, unsigned int *flag, int *mapq, char **chr, Genom
424 425
     *quality_string = (char *) ptr;
425 426
   }
426 427
 
428
+  ptr = bam_aux_get(this->bam,"RG");
429
+  if (ptr == NULL) {
430
+    *read_group = (char *) NULL;
431
+  } else {
432
+    *read_group = bam_aux2Z(ptr);
433
+  }
434
+
427 435
   /* Cigar */
428 436
   *cigarlength = 0;
429 437
   *cigartypes = (Intlist_T) NULL;
... ...
@@ -464,7 +472,7 @@ int
464 472
 Bamread_next_line (T this, char **acc, unsigned int *flag, int *mapq, char **chr, Genomicpos_T *chrpos,
465 473
 		   char **mate_chr, Genomicpos_T *mate_chrpos,
466 474
 		   Intlist_T *cigartypes, Uintlist_T *cigarlengths, int *cigarlength,
467
-		   int *readlength, char **read, char **quality_string) {
475
+		   int *readlength, char **read, char **quality_string, char **read_group) {
468 476
   int insert_length;
469 477
 
470 478
 #ifndef HAVE_SAMTOOLS
... ...
@@ -477,7 +485,7 @@ Bamread_next_line (T this, char **acc, unsigned int *flag, int *mapq, char **chr
477 485
       parse_line(this,&(*acc),&(*flag),&(*mapq),&(*chr),&(*chrpos),
478 486
 		 &(*mate_chr),&(*mate_chrpos),&insert_length,
479 487
 		 &(*cigartypes),&(*cigarlengths),&(*cigarlength),
480
-		 &(*readlength),&(*read),&(*quality_string));
488
+		 &(*readlength),&(*read),&(*quality_string),&(*read_group));
481 489
       return 1;
482 490
     }
483 491
   } else {
... ...
@@ -487,7 +495,7 @@ Bamread_next_line (T this, char **acc, unsigned int *flag, int *mapq, char **chr
487 495
       parse_line(this,&(*acc),&(*flag),&(*mapq),&(*chr),&(*chrpos),
488 496
 		 &(*mate_chr),&(*mate_chrpos),&insert_length,
489 497
 		 &(*cigartypes),&(*cigarlengths),&(*cigarlength),
490
-		 &(*readlength),&(*read),&(*quality_string));
498
+		 &(*readlength),&(*read),&(*quality_string),&(*read_group));
491 499
       return 1;
492 500
     }
493 501
   }
... ...
@@ -653,6 +661,29 @@ Bamread_print_cigar (FILE *fp, Bamline_T this) {
653 661
   return;
654 662
 }
655 663
 
664
+char *
665
+Bamline_cigar_string (Bamline_T this) {
666
+  char *string, number[12];
667
+  Intlist_T p;
668
+  Uintlist_T q;
669
+  int string_length = 0;
670
+
671
+  for (p = this->cigar_types, q = this->cigar_npositions; p != NULL; p = Intlist_next(p), q = Uintlist_next(q)) {
672
+    sprintf(number,"%u",Uintlist_head(q));
673
+    string_length += strlen(number) + 1;
674
+  }
675
+  string = (char *) CALLOC(string_length+1,sizeof(char));
676
+
677
+  string_length = 0;
678
+  for (p = this->cigar_types, q = this->cigar_npositions; p != NULL; p = Intlist_next(p), q = Uintlist_next(q)) {
679
+    sprintf(number,"%u",Uintlist_head(q));
680
+    sprintf(&(string[string_length]),"%u%c",Uintlist_head(q),Intlist_head(p));
681
+    string_length += strlen(number) + 1;
682
+  }
683
+  
684
+  return string;
685
+}
686
+
656 687
 int
657 688
 Bamline_cigar_querylength (Bamline_T this) {
658 689
   return this->cigar_querylength;
... ...
@@ -1232,7 +1263,7 @@ Bamline_new (char *acc, unsigned int flag, int nhits, bool good_unique_p, int ma
1232 1263
 
1233 1264
 
1234 1265
 Bamline_T
1235
-Bamread_next_bamline (T this, int minimum_mapq, int good_unique_mapq, int maximum_nhits,
1266
+Bamread_next_bamline (T this, char *desired_read_group, int minimum_mapq, int good_unique_mapq, int maximum_nhits,
1236 1267
 		      bool need_unique_p, bool need_primary_p, bool ignore_duplicates_p,
1237 1268
 		      bool need_concordant_p) {
1238 1269
   char *acc, *chr, *mate_chr, splice_strand;
... ...
@@ -1247,6 +1278,7 @@ Bamread_next_bamline (T this, int minimum_mapq, int good_unique_mapq, int maximu
1247 1278
   int cigar_querylength, readlength;
1248 1279
   char *read;
1249 1280
   char *quality_string;
1281
+  char *read_group;
1250 1282
 
1251 1283
 #ifndef HAVE_SAMTOOLS
1252 1284
   return (Bamline_T) NULL;
... ...
@@ -1258,8 +1290,13 @@ Bamread_next_bamline (T this, int minimum_mapq, int good_unique_mapq, int maximu
1258 1290
       parse_line(this,&acc,&flag,&mapq,&chr,&chrpos_low,
1259 1291
 		 &mate_chr,&mate_chrpos_low,&insert_length,
1260 1292
 		 &cigar_types,&cigarlengths,&cigar_querylength,
1261
-		 &readlength,&read,&quality_string);
1262
-      if (mapq < minimum_mapq) {
1293
+		 &readlength,&read,&quality_string,&read_group);
1294
+      if (desired_read_group != NULL && (read_group == NULL || strcmp(desired_read_group,read_group))) {
1295
+	debug1(fprintf(stderr,"Fails because doesn't have desired read group %s\n",desired_read_group));
1296
+	Intlist_free(&cigar_types);
1297
+	Uintlist_free(&cigarlengths);
1298
+	FREE(read);
1299
+      } else if (mapq < minimum_mapq) {
1263 1300
 	debug1(fprintf(stderr,"Fails because mapq %d < minimum %d\n",mapq,minimum_mapq));
1264 1301
 	Intlist_free(&cigar_types);
1265 1302
 	Uintlist_free(&cigarlengths);
... ...
@@ -1308,7 +1345,7 @@ Bamread_next_bamline (T this, int minimum_mapq, int good_unique_mapq, int maximu
1308 1345
       parse_line(this,&acc,&flag,&mapq,&chr,&chrpos_low,
1309 1346
 		 &mate_chr,&mate_chrpos_low,&insert_length,
1310 1347
 		 &cigar_types,&cigarlengths,&cigar_querylength,
1311
-		 &readlength,&read,&quality_string);
1348
+		 &readlength,&read,&quality_string,&read_group);
1312 1349
       if (mapq >= minimum_mapq &&
1313 1350
 	  (nhits = aux_nhits(this)) <= maximum_nhits &&
1314 1351
 	  (need_unique_p == false || nhits == 1) &&
... ...
@@ -1349,7 +1386,7 @@ Bamread_get_acc (T this, char *desired_chr, Genomicpos_T desired_chrpos, char *d
1349 1386
   int cigar_querylength, readlength;
1350 1387
   char *read;
1351 1388
   char *quality_string;
1352
-
1389
+  char *read_group;
1353 1390
 
1354 1391
   Bamread_limit_region(this,desired_chr,desired_chrpos,desired_chrpos);
1355 1392
   while (bam_iter_read(this->fp,this->iter,this->bam) >= 0) {
... ...
@@ -1357,7 +1394,7 @@ Bamread_get_acc (T this, char *desired_chr, Genomicpos_T desired_chrpos, char *d
1357 1394
     parse_line(this,&acc,&flag,&mapq,&chr,&chrpos_low,
1358 1395
 	       &mate_chr,&mate_chrpos_low,&insert_length,
1359 1396
 	       &cigar_types,&cigarlengths,&cigar_querylength,
1360
-	       &readlength,&read,&quality_string);
1397
+	       &readlength,&read,&quality_string,&read_group);
1361 1398
     splice_strand = aux_splice_strand(this);
1362 1399
     if (!strcmp(acc,desired_acc) && chrpos_low == desired_chrpos) {
1363 1400
       Bamread_unlimit_region(this);
... ...
@@ -1653,7 +1690,7 @@ Bampair_details (Uintlist_T *chrpos_lows, Uintlist_T *chrpos_highs,
1653 1690
 
1654 1691
 
1655 1692
 List_T
1656
-Bamread_all_pairs (T bamreader, int minimum_mapq, int good_unique_mapq, int maximum_nhits,
1693
+Bamread_all_pairs (T bamreader, char *desired_read_group, int minimum_mapq, int good_unique_mapq, int maximum_nhits,
1657 1694
 		   bool need_unique_p, bool need_primary_p, bool ignore_duplicates_p,
1658 1695
 		   bool need_concordant_p) {
1659 1696
   List_T lines = NULL, p;
... ...
@@ -1670,7 +1707,7 @@ Bamread_all_pairs (T bamreader, int minimum_mapq, int good_unique_mapq, int maxi
1670 1707
 
1671 1708
   bamstore_chrtable = Table_new(100,Chrom_compare_table,Chrom_hash_table);
1672 1709
 
1673
-  while ((bamline = Bamread_next_bamline(bamreader,minimum_mapq,good_unique_mapq,maximum_nhits,
1710
+  while ((bamline = Bamread_next_bamline(bamreader,desired_read_group,minimum_mapq,good_unique_mapq,maximum_nhits,
1674 1711
 					 need_unique_p,need_primary_p,ignore_duplicates_p,
1675 1712
 					 need_concordant_p)) != NULL) {
1676 1713
     if (Bamline_concordantp(bamline) == false) {
... ...
@@ -1770,7 +1807,7 @@ level_cmp (const void *x, const void *y) {
1770 1807
 int
1771 1808
 Bampair_compute_levels (List_T bampairs, Genomicpos_T mincoord,
1772 1809
 			Genomicpos_T maxcoord, int max_allowed_levels,
1773
-			double xfactor, Genomicpos_T min_pairlength) {
1810
+			double xfactor, Genomicpos_T min_pairlength, bool only_internal_p) {
1774 1811
   int nbampairs, i;
1775 1812
   int maxlevel = -1, level;
1776 1813
   bool donep;
... ...
@@ -1790,6 +1827,8 @@ Bampair_compute_levels (List_T bampairs, Genomicpos_T mincoord,
1790 1827
       bampair = array[i];
1791 1828
       if (bampair->chrpos_high - bampair->chrpos_low < min_pairlength) {
1792 1829
 	bampair->level = -1;
1830
+      } else if (only_internal_p == true && bampair->chrpos_low < mincoord && bampair->chrpos_high > maxcoord) {
1831
+	bampair->level = -1;
1793 1832
       } else {
1794 1833
 	/* Find appropriate level */
1795 1834
 	level = 0;
... ...
@@ -1,4 +1,4 @@
1
-/* $Id: bamread.h 87713 2013-03-01 18:32:34Z twu $ */
1
+/* $Id: bamread.h 108654 2013-09-19 23:11:00Z twu $ */
2 2
 #ifndef BAMREAD_INCLUDED
3 3
 #define BAMREAD_INCLUDED
4 4
 /* Cannot use bool, since it appears to conflict with samtools */
... ...
@@ -43,7 +43,7 @@ extern int
43 43
 Bamread_next_line (T this, char **acc, unsigned int *flag, int *mapq, char **chr, Genomicpos_T *chrpos,
44 44
 		   char **mate_chr, Genomicpos_T *mate_chrpos,
45 45
 		   Intlist_T *cigartypes, Uintlist_T *cigarlengths, int *cigarlength,
46
-		   int *readlength, char **read, char **quality_string);
46
+		   int *readlength, char **read, char **quality_string, char **read_group);
47 47
 
48 48
 typedef struct Bamline_T *Bamline_T;
49 49
 
... ...
@@ -83,12 +83,16 @@ extern int
83 83
 Bamline_cigar_querylength (Bamline_T this);
84 84
 extern void
85 85
 Bamread_print_cigar (FILE *fp, Bamline_T this);
86
+extern char *
87
+Bamline_cigar_string (Bamline_T this);
86 88
 extern int
87 89
 Bamline_readlength (Bamline_T this);
88 90
 extern char *
89 91
 Bamline_read (Bamline_T this);
90 92
 extern char *
91 93
 Bamline_quality_string (Bamline_T this);
94
+extern char *
95
+Bamline_read_group (Bamline_T this);
92 96
 extern void
93 97
 Bamline_print (FILE *fp, Bamline_T this, unsigned int newflag, int quality_score_adj);
94 98
 
... ...
@@ -104,7 +108,7 @@ Bamline_chrpos_high (Bamline_T this);
104 108
 extern void
105 109
 Bamline_free (Bamline_T *old);
106 110
 extern Bamline_T
107
-Bamread_next_bamline (T this, int minimum_mapq, int good_unique_mapq, int maximum_nhits,
111
+Bamread_next_bamline (T this, char *desired_read_group, int minimum_mapq, int good_unique_mapq, int maximum_nhits,
108 112
 		      bool need_unique_p, bool need_primary_p, bool ignore_duplicates_p,
109 113
 		      bool need_concordant_p);
110 114
 extern Bamline_T
... ...
@@ -160,14 +164,14 @@ Bampair_details (Uintlist_T *chrpos_lows, Uintlist_T *chrpos_highs,
160 164
 		 Bampair_T this);
161 165
 
162 166
 extern List_T
163
-Bamread_all_pairs (T bamreader, int minimum_mapq, int good_unique_mapq, int maximum_nhits,
167
+Bamread_all_pairs (T bamreader, char *desired_read_group, int minimum_mapq, int good_unique_mapq, int maximum_nhits,
164 168
 		   bool need_unique_p, bool need_primary_p, bool ignore_duplicates_p,
165 169
 		   bool need_concordant_p);
166 170
 
167 171
 extern int
168 172
 Bampair_compute_levels (List_T bampairs, Genomicpos_T mincoord,
169 173
 			Genomicpos_T maxcoord, int max_allowed_levels,
170
-			double xfactor, Genomicpos_T min_pairlength);
174
+			double xfactor, Genomicpos_T min_pairlength, bool only_internal_p);
171 175
 
172 176
 
173 177
 #undef T
... ...
@@ -1,4 +1,4 @@
1
-static char rcsid[] = "$Id: bamtally.c 87717 2013-03-01 18:34:33Z twu $";
1
+static char rcsid[] = "$Id: bamtally.c 108654 2013-09-19 23:11:00Z twu $";
2 2
 #ifdef HAVE_CONFIG_H
3 3
 #include <config.h>
4 4
 #endif
... ...
@@ -38,6 +38,12 @@ static char rcsid[] = "$Id: bamtally.c 87717 2013-03-01 18:34:33Z twu $";
38 38
 #include "parserange.h"
39 39
 
40 40
 
41
+#define ARRAY_THRESHOLD 20
42
+#define INITIAL_READLENGTH 75
43
+#define MAX_QUALITY_SCORE 40
44
+#define MAX_MAPQ_SCORE 40
45
+
46
+
41 47
 /* Alloc and block control structure */
42 48
 #ifdef DEBUG0
43 49
 #define debug0(x) x
... ...
@@ -619,9 +625,21 @@ struct T {
619 625
   long int n_fromleft_plus; /* Used for reference count for insertions */
620 626
   long int n_fromleft_minus; /* Used for reference count for insertions */
621 627
 
622
-  List_T matches_byshift;
623
-  List_T matches_byquality;
624
-  List_T matches_bymapq;
628
+  bool use_array_p;
629
+  List_T list_matches_byshift;
630
+  List_T list_matches_byquality;
631
+  List_T list_matches_bymapq;
632
+
633
+  int n_matches_byshift_plus;
634
+  int *matches_byshift_plus;
<