git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/gmapR@110593 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -9,7 +9,7 @@ Description: GSNAP and GMAP are a pair of tools to align short-read |
9 | 9 |
methods to work with GMAP and GSNAP from within R. In addition, |
10 | 10 |
it provides methods to tally alignment results on a |
11 | 11 |
per-nucleotide basis using the bam_tally tool. |
12 |
-Version: 1.13.1 |
|
12 |
+Version: 1.13.2 |
|
13 | 13 |
Depends: R (>= 2.15.0), methods, GenomeInfoDb (>= 1.1.3), |
14 | 14 |
GenomicRanges (>= 1.17.12) |
15 | 15 |
Imports: S4Vectors, IRanges, Rsamtools (>= 1.17.8), rtracklayer (>= 1.25.5), |
... | ... |
@@ -24,7 +24,8 @@ setClass("BamTallyParam", |
24 | 24 |
xs = "logical", |
25 | 25 |
read_pos = "logical", |
26 | 26 |
min_base_quality = "integer", |
27 |
- noncovered = "logical")) |
|
27 |
+ noncovered = "logical", |
|
28 |
+ nm = "logical")) |
|
28 | 29 |
|
29 | 30 |
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - |
30 | 31 |
### Constructor |
... | ... |
@@ -107,7 +108,8 @@ BamTallyParam <- function(genome, which = GRanges(), |
107 | 108 |
indels = FALSE, include_soft_clips = 0L, |
108 | 109 |
exon_iit = NULL, IIT_BPPARAM = NULL, |
109 | 110 |
xs = FALSE, read_pos = FALSE, |
110 |
- min_base_quality = 0L, noncovered = FALSE) |
|
111 |
+ min_base_quality = 0L, noncovered = FALSE, |
|
112 |
+ nm = FALSE) |
|
111 | 113 |
{ |
112 | 114 |
if (!is.null(desired_read_group) && !isSingleString(desired_read_group)) |
113 | 115 |
stop("'desired_read_group' must be NULL or a single, non-NA string") |
... | ... |
@@ -139,6 +141,8 @@ BamTallyParam <- function(genome, which = GRanges(), |
139 | 141 |
stop("min_base_quality must be a single, non-negative, non-NA number") |
140 | 142 |
if (!isTRUEorFALSE(noncovered)) |
141 | 143 |
stop("noncovered must be TRUE or FALSE") |
144 |
+ if (!isTRUEorFALSE(nm)) |
|
145 |
+ stop("nm must be TRUE or FALSE") |
|
142 | 146 |
args <- names(formals(sys.function())) |
143 | 147 |
params <- mget(args, environment()) |
144 | 148 |
params$genome <- as(genome, "GmapGenome") |
... | ... |
@@ -11,11 +11,12 @@ setClass("TallyIIT", representation(ptr = "externalptr", |
11 | 11 |
genome = "GmapGenome", |
12 | 12 |
bam = "BamFile", |
13 | 13 |
xs = "logical", |
14 |
- read_pos = "logical")) |
|
14 |
+ read_pos = "logical", |
|
15 |
+ nm = "logical")) |
|
15 | 16 |
|
16 |
-TallyIIT <- function(ptr, genome, bam, xs, read_pos) { |
|
17 |
+TallyIIT <- function(ptr, genome, bam, xs, read_pos, nm) { |
|
17 | 18 |
new("TallyIIT", ptr = ptr, genome = genome, bam = bam, xs = xs, |
18 |
- read_pos = read_pos) |
|
19 |
+ read_pos = read_pos, nm = nm) |
|
19 | 20 |
} |
20 | 21 |
|
21 | 22 |
setMethod("genome", "TallyIIT", function(x) x@genome) |
... | ... |
@@ -72,16 +73,22 @@ setMethod("bam_tally", "GmapBamReader", |
72 | 73 |
|
73 | 74 |
TallyIIT(do.call(.bam_tally_C, c(list(x), param_list)), genome, |
74 | 75 |
as(x, "BamFile"), xs=param_list$xs, |
75 |
- read_pos=param_list$read_pos) |
|
76 |
+ read_pos=param_list$read_pos, nm=param_list$nm) |
|
76 | 77 |
}) |
77 | 78 |
|
78 |
-variantSummary <- function(x, read_pos_breaks = NULL, high_base_quality = 0L, |
|
79 |
- keep_ref_rows = FALSE, read_length = NA_integer_) |
|
79 |
+variantSummary <- function(x, read_pos_breaks = NULL, |
|
80 |
+ keep_ref_rows = FALSE, read_length = NA_integer_, |
|
81 |
+ high_nm_score = NA_integer_) |
|
80 | 82 |
{ |
81 | 83 |
read_length <- as.integer(read_length) |
82 | 84 |
if (length(read_length) != 1L) { |
83 | 85 |
stop("'read_length' must be a single integer") |
84 | 86 |
} |
87 |
+ high_nm_score <- as.integer(high_nm_score) |
|
88 |
+ stopifnot(length(high_nm_score) == 1L) |
|
89 |
+ if (!is.na(high_nm_score) && !x@nm) { |
|
90 |
+ stop("'high_nm_score is not NA but NM scores were not tallied") |
|
91 |
+ } |
|
85 | 92 |
if (length(read_pos_breaks) > 0L) { |
86 | 93 |
if (!x@read_pos) { |
87 | 94 |
stop("'read_pos_breaks' non-empty, but read positions were not tallied") |
... | ... |
@@ -97,25 +104,23 @@ variantSummary <- function(x, read_pos_breaks = NULL, high_base_quality = 0L, |
97 | 104 |
|
98 | 105 |
tally <- .Call(R_tally_iit_parse, x@ptr, |
99 | 106 |
read_pos_breaks, |
100 |
- normArgSingleInteger(high_base_quality), |
|
101 |
- NULL, read_length, x@xs) |
|
107 |
+ NULL, read_length, x@xs, high_nm_score) |
|
102 | 108 |
|
103 | 109 |
tally_names <- c("seqnames", "pos", "ref", "alt", |
104 | 110 |
"n.read.pos", "n.read.pos.ref", |
105 |
- "raw.count", "raw.count.ref", |
|
106 |
- "raw.count.total", |
|
107 |
- "high.quality", "high.quality.ref", |
|
108 |
- "high.quality.total", "mean.quality", |
|
109 |
- "mean.quality.ref", |
|
111 |
+ "count", "count.ref", "count.total", |
|
110 | 112 |
"count.plus", "count.plus.ref", |
111 | 113 |
"count.minus", "count.minus.ref", |
112 | 114 |
"del.count.plus", "del.count.minus", |
113 | 115 |
"read.pos.mean", "read.pos.mean.ref", |
114 | 116 |
"read.pos.var", "read.pos.var.ref", |
115 |
- "mdfne", "mdfne.ref", "codon.dir", |
|
117 |
+ "mdfne", "mdfne.ref", "codon.strand", |
|
116 | 118 |
"count.xs.plus", "count.xs.plus.ref", |
117 |
- "count.xs.minus", "count.xs.minus.ref") |
|
118 |
- |
|
119 |
+ "count.xs.minus", "count.xs.minus.ref", |
|
120 |
+ "count.high.nm", "count.high.nm.ref") |
|
121 |
+ |
|
122 |
+ tally$codon.strand <- structure(tally$codon.strand, class="factor", |
|
123 |
+ levels=strand()) |
|
119 | 124 |
break_names <- character() |
120 | 125 |
if (length(read_pos_breaks) > 0L) { |
121 | 126 |
read_pos_breaks <- as.integer(read_pos_breaks) |
... | ... |
@@ -124,6 +129,7 @@ variantSummary <- function(x, read_pos_breaks = NULL, high_base_quality = 0L, |
124 | 129 |
tally_names <- c(tally_names, break_names) |
125 | 130 |
} |
126 | 131 |
names(tally) <- tally_names |
132 |
+ tally <- Filter(Negate(is.null), tally) |
|
127 | 133 |
|
128 | 134 |
if (!keep_ref_rows) { |
129 | 135 |
variant_rows <- !is.na(tally$alt) |
... | ... |
@@ -131,23 +137,20 @@ variantSummary <- function(x, read_pos_breaks = NULL, high_base_quality = 0L, |
131 | 137 |
tally <- lapply(tally, `[`, variant_rows) |
132 | 138 |
} |
133 | 139 |
|
134 |
- meta_names <- setdiff(tally_names, |
|
135 |
- c("seqnames", "pos", "ref", "alt", "high.quality", |
|
136 |
- "high.quality.ref", "high.quality.total")) |
|
140 |
+ meta_names <- setdiff(names(tally), |
|
141 |
+ c("seqnames", "pos", "ref", "alt", "count", |
|
142 |
+ "count.ref", "count.total")) |
|
137 | 143 |
genome <- genome(x) |
138 | 144 |
indel <- nchar(tally$ref) == 0L | nchar(tally$alt) == 0L |
139 | 145 |
metacols <- DataFrame(tally[meta_names]) |
140 | 146 |
mcols(metacols) <- variantSummaryColumnDescriptions(read_pos_breaks) |
141 | 147 |
|
142 |
- samecols = tally[["ref"]] != tally[["alt"]] |
|
143 | 148 |
gr <- with(tally, |
144 | 149 |
VRanges(seqnames, |
145 | 150 |
IRanges(pos, |
146 | 151 |
width = ifelse(nchar(alt) == 0, nchar(ref), 1L)), |
147 | 152 |
ref, alt, |
148 |
- ifelse(indel, raw.count.total, high.quality.total), |
|
149 |
- ifelse(indel, raw.count.ref, high.quality.ref), |
|
150 |
- ifelse(indel, raw.count, high.quality), |
|
153 |
+ count.total, count.ref, count, |
|
151 | 154 |
seqlengths = seqlengths(genome))) |
152 | 155 |
mcols(gr) <- metacols |
153 | 156 |
checkTallyConsistency(gr) |
... | ... |
@@ -160,11 +163,8 @@ variantSummary <- function(x, read_pos_breaks = NULL, high_base_quality = 0L, |
160 | 163 |
|
161 | 164 |
checkTallyConsistency <- function(x) { |
162 | 165 |
with(mcols(x), { |
163 |
- stopifnot(all(raw.count + raw.count.ref <= raw.count.total, na.rm=TRUE)) |
|
164 |
- stopifnot(all(altDepth(x) <= raw.count, na.rm=TRUE)) |
|
165 |
- stopifnot(all(refDepth(x) <= raw.count.ref, na.rm=TRUE)) |
|
166 |
- stopifnot(all(count.plus + count.minus == raw.count, na.rm=TRUE)) |
|
167 |
- stopifnot(all(count.plus.ref + count.minus.ref == raw.count.ref)) |
|
166 |
+ stopifnot(all(count.plus + count.minus == altDepth(x), na.rm=TRUE)) |
|
167 |
+ stopifnot(all(count.plus.ref + count.minus.ref == refDepth(x))) |
|
168 | 168 |
}) |
169 | 169 |
} |
170 | 170 |
|
... | ... |
@@ -228,7 +228,7 @@ normArgSingleCharacterOrNULL <- function(x) { |
228 | 228 |
blocksize = 1000L, verbosep = FALSE, |
229 | 229 |
include_soft_clips = 0L, |
230 | 230 |
exon_iit = NULL, xs = FALSE, read_pos = FALSE, |
231 |
- min_base_quality = 0L, noncovered = FALSE) |
|
231 |
+ min_base_quality = 0L, noncovered = FALSE, nm = FALSE) |
|
232 | 232 |
{ |
233 | 233 |
if (!is(bamreader, "GmapBamReader")) |
234 | 234 |
stop("'bamreader' must be a GmapBamReader") |
... | ... |
@@ -262,7 +262,8 @@ normArgSingleCharacterOrNULL <- function(x) { |
262 | 262 |
normArgTRUEorFALSE(xs), |
263 | 263 |
normArgTRUEorFALSE(read_pos), |
264 | 264 |
normArgSingleInteger(min_base_quality), |
265 |
- normArgTRUEorFALSE(noncovered)) |
|
265 |
+ normArgTRUEorFALSE(noncovered), |
|
266 |
+ normArgTRUEorFALSE(nm)) |
|
266 | 267 |
} |
267 | 268 |
|
268 | 269 |
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - |
... | ... |
@@ -26,7 +26,8 @@ R_Bamtally_iit (SEXP bamreader_R, SEXP genome_dir_R, SEXP db_R, |
26 | 26 |
SEXP verbosep_R, SEXP max_softclip_R, |
27 | 27 |
SEXP genome_iit_file_R, |
28 | 28 |
SEXP print_xs_scores_p_R, SEXP print_cycles_p_R, |
29 |
- SEXP minimum_quality_score_R, SEXP noncovered_R) |
|
29 |
+ SEXP minimum_quality_score_R, SEXP noncovered_R, |
|
30 |
+ SEXP print_nm_scores_p_R) |
|
30 | 31 |
{ |
31 | 32 |
Bamreader_T bamreader = (Bamreader_T) R_ExternalPtrAddr(bamreader_R); |
32 | 33 |
const char *genome_dir = |
... | ... |
@@ -54,6 +55,7 @@ R_Bamtally_iit (SEXP bamreader_R, SEXP genome_dir_R, SEXP db_R, |
54 | 55 |
bool print_cycles_p = asLogical(print_cycles_p_R); |
55 | 56 |
int minimum_quality_score = asInteger(minimum_quality_score_R); |
56 | 57 |
bool noncovered_p = asLogical(noncovered_R); |
58 |
+ bool print_nm_scores_p = asLogical(print_nm_scores_p_R); |
|
57 | 59 |
|
58 | 60 |
Genome_T genome = createGenome(genome_dir, db); |
59 | 61 |
IIT_T chromosome_iit = readChromosomeIIT(genome_dir, db); |
... | ... |
@@ -88,7 +90,7 @@ R_Bamtally_iit (SEXP bamreader_R, SEXP genome_dir_R, SEXP db_R, |
88 | 90 |
print_indels_p, blocksize, verbosep, |
89 | 91 |
/*readlevel_p*/false, max_softclip, |
90 | 92 |
print_cycles_p, |
91 |
- /*print_nm_scores_p*/ false, |
|
93 |
+ print_nm_scores_p, |
|
92 | 94 |
print_xs_scores_p, noncovered_p); |
93 | 95 |
IIT_free(&chromosome_iit); |
94 | 96 |
Genome_free(&genome); |
... | ... |
@@ -23,7 +23,8 @@ R_Bamtally_iit (SEXP bamreader_R, SEXP genome_dir_R, SEXP db_R, |
23 | 23 |
SEXP verbosep_R, SEXP max_softclip_R, |
24 | 24 |
SEXP genome_iit_file_R, |
25 | 25 |
SEXP print_xs_scores_p_R, SEXP print_cycles_p_R, |
26 |
- SEXP minimum_quality_score_R, SEXP nonconvered_R); |
|
26 |
+ SEXP minimum_quality_score_R, SEXP nonconvered_R, |
|
27 |
+ SEXP print_nm_scores_p_R); |
|
27 | 28 |
|
28 | 29 |
SEXP |
29 | 30 |
R_tally_iit_parse(SEXP tally_iit_R, SEXP cycle_breaks_R, |
... | ... |
@@ -1,4 +1,4 @@ |
1 |
-static char rcsid[] = "$Id: bamtally.c 172388 2015-08-21 20:10:50Z twu $"; |
|
1 |
+static char rcsid[] = "$Id: bamtally.c 178481 2015-11-09 20:55:28Z twu $"; |
|
2 | 2 |
#ifdef HAVE_CONFIG_H |
3 | 3 |
#include <config.h> |
4 | 4 |
#endif |
... | ... |
@@ -859,13 +859,18 @@ pass_variant_filter_p (long int nmatches, long int delcounts_plus, long int delc |
859 | 859 |
if (variant_strands == 0) { |
860 | 860 |
return true; |
861 | 861 |
} else if (variant_strands == 1) { |
862 |
- if (mismatches_by_shift == NULL) { |
|
862 |
+ /* Not sure if we should look at insertions_byshift also */ |
|
863 |
+ if (mismatches_by_shift == NULL && delcounts_plus == 0 && delcounts_minus == 0) { |
|
863 | 864 |
return false; |
864 | 865 |
} else { |
865 | 866 |
return true; |
866 | 867 |
} |
867 | 868 |
|
868 | 869 |
} else { |
870 |
+ if (delcounts_plus > 0 && delcounts_minus > 0) { |
|
871 |
+ return true; |
|
872 |
+ } |
|
873 |
+ |
|
869 | 874 |
plus_strand_p[0] = plus_strand_p[1] = plus_strand_p[2] = plus_strand_p[3] = false; |
870 | 875 |
minus_strand_p[0] = minus_strand_p[1] = minus_strand_p[2] = minus_strand_p[3] = false; |
871 | 876 |
|
... | ... |
@@ -873,64 +878,71 @@ pass_variant_filter_p (long int nmatches, long int delcounts_plus, long int delc |
873 | 878 |
for (ptr = mismatches_by_shift; ptr != NULL; ptr = List_next(ptr)) { |
874 | 879 |
mismatch = (Mismatch_T) List_head(ptr); |
875 | 880 |
if (mismatch->shift > 0) { |
881 |
+ if (delcounts_minus > 0) { |
|
882 |
+ return true; |
|
883 |
+ } |
|
876 | 884 |
switch (mismatch->nt) { |
877 | 885 |
case 'A': |
878 |
- if (minus_strand_p[0] == true) { |
|
886 |
+ if (minus_strand_p[/*A*/0] == true) { |
|
879 | 887 |
return true; |
880 | 888 |
} else { |
881 |
- plus_strand_p[0] = true; |
|
889 |
+ plus_strand_p[/*A*/0] = true; |
|
882 | 890 |
} |
883 | 891 |
break; |
884 | 892 |
case 'C': |
885 |
- if (minus_strand_p[1] == true) { |
|
893 |
+ if (minus_strand_p[/*C*/1] == true) { |
|
886 | 894 |
return true; |
887 | 895 |
} else { |
888 |
- plus_strand_p[1] = true; |
|
896 |
+ plus_strand_p[/*C*/1] = true; |
|
889 | 897 |
} |
890 | 898 |
break; |
891 | 899 |
case 'G': |
892 |
- if (minus_strand_p[2] == true) { |
|
900 |
+ if (minus_strand_p[/*G*/2] == true) { |
|
893 | 901 |
return true; |
894 | 902 |
} else { |
895 |
- plus_strand_p[2] = true; |
|
903 |
+ plus_strand_p[/*G*/2] = true; |
|
896 | 904 |
} |
897 | 905 |
break; |
898 | 906 |
case 'T': |
899 |
- if (minus_strand_p[3] == true) { |
|
907 |
+ if (minus_strand_p[/*T*/3] == true) { |
|
900 | 908 |
return true; |
901 | 909 |
} else { |
902 |
- plus_strand_p[3] = true; |
|
910 |
+ plus_strand_p[/*T*/3] = true; |
|
903 | 911 |
} |
904 | 912 |
break; |
905 | 913 |
} |
914 |
+ |
|
906 | 915 |
} else if (mismatch->shift < 0) { |
916 |
+ if (delcounts_plus > 0) { |
|
917 |
+ return true; |
|
918 |
+ } |
|
907 | 919 |
switch (mismatch->nt) { |
908 | 920 |
case 'A': |
909 |
- if (plus_strand_p[0] == true) { |
|
921 |
+ if (plus_strand_p[/*A*/0] == true) { |
|
910 | 922 |
return true; |
911 | 923 |
} else { |
912 |
- minus_strand_p[0] = true; |
|
924 |
+ minus_strand_p[/*A*/0] = true; |
|
913 | 925 |
} |
914 | 926 |
break; |
915 | 927 |
case 'C': |
916 |
- if (plus_strand_p[1] == true) { |
|
928 |
+ if (plus_strand_p[/*C*/1] == true) { |
|
917 | 929 |
return true; |
918 | 930 |
} else { |
919 |
- minus_strand_p[1] = true; |
|
931 |
+ minus_strand_p[/*C*/1] = true; |
|
920 | 932 |
} |
921 | 933 |
break; |
922 | 934 |
case 'G': |
923 |
- if (plus_strand_p[2] == true) { |
|
935 |
+ if (plus_strand_p[/*G*/2] == true) { |
|
924 | 936 |
return true; |
925 | 937 |
} else { |
926 |
- minus_strand_p[2] = true; |
|
938 |
+ minus_strand_p[/*G*/2] = true; |
|
927 | 939 |
} |
928 | 940 |
break; |
929 | 941 |
case 'T': |
930 |
- if (plus_strand_p[3] == true) { |
|
942 |
+ if (plus_strand_p[/*T*/3] == true) { |
|
931 | 943 |
return true; |
932 | 944 |
} else { |
933 |
- minus_strand_p[3] = true; |
|
945 |
+ minus_strand_p[/*T*/3] = true; |
|
934 | 946 |
} |
935 | 947 |
break; |
936 | 948 |
} |
... | ... |
@@ -5930,6 +5942,11 @@ Bamtally_run (long int **tally_matches, long int **tally_mismatches, |
5930 | 5942 |
ignore_duplicates_p = true; |
5931 | 5943 |
#endif |
5932 | 5944 |
|
5945 |
+ /* |
|
5946 |
+ fprintf(stderr,"Called with alloclength %d, minimum_mapq %d, good_unique_mapq %d, minimum_quality_score %d, maximum_nhits %d, need_concordant_p %d, need_unique_p %d, need_primary_p %d, ignore_duplicates_p %d, min_depth %d, variant_strands %d, ignore_query_Ns_p %d, blocksize %d, verbosep %d, readlevel_p %d, max_softclip %d\n", |
|
5947 |
+ alloclength,minimum_mapq,good_unique_mapq,minimum_quality_score,maximum_nhits,need_concordant_p,need_unique_p,need_primary_p,ignore_duplicates_p,min_depth,variant_strands,ignore_query_Ns_p,blocksize,verbosep,readlevel_p,max_softclip); |
|
5948 |
+ */ |
|
5949 |
+ |
|
5933 | 5950 |
|
5934 | 5951 |
/* Create tally at position N to store n_fromleft */ |
5935 | 5952 |
alloc_tallies_alloc = (Tally_T *) CALLOC(alloclength + 2*max_softclip + 1,sizeof(Tally_T)); |
... | ... |
@@ -1,4 +1,4 @@ |
1 |
-#define DEBUG2 1 |
|
1 |
+#define DEBUG2 0 |
|
2 | 2 |
|
3 | 3 |
#include <stdlib.h> /* for abs */ |
4 | 4 |
#include <string.h> /* for strlen */ |
... | ... |
@@ -7,21 +7,18 @@ |
7 | 7 |
#include "bytestream.h" |
8 | 8 |
|
9 | 9 |
/*I changed the order of these, because the XS spots are optional...*/ |
10 |
-enum { SEQNAMES, POS, REF, READ, N_CYCLES, N_CYCLES_REF, COUNT, COUNT_REF, //8 |
|
11 |
- COUNT_TOTAL, HIGH_QUALITY, HIGH_QUALITY_REF, HIGH_QUALITY_TOTAL, //12 |
|
12 |
- MEAN_QUALITY, MEAN_QUALITY_REF, //14 |
|
13 |
- COUNT_PLUS, COUNT_PLUS_REF, COUNT_MINUS, //17 |
|
14 |
- COUNT_MINUS_REF, DELCOUNT_PLUS, DELCOUNT_MINUS, //21 |
|
15 |
- READ_POS_MEAN, READ_POS_MEAN_REF, READ_POS_VAR, //24 |
|
16 |
- READ_POS_VAR_REF, MDFNE, MDFNE_REF, CODON_STRAND, //28 |
|
17 |
- COUNT_XS_PLUS, COUNT_XS_PLUS_REF, //30 |
|
18 |
- COUNT_XS_MINUS, COUNT_XS_MINUS_REF, N_BASE_COLS }; //33 |
|
19 |
- |
|
20 |
- |
|
21 |
-enum{ CODON_MINUS = -1, |
|
22 |
- NON_CODON, |
|
23 |
- CODON_PLUS}; |
|
24 |
- |
|
10 |
+enum { SEQNAMES, POS, REF, READ, N_CYCLES, N_CYCLES_REF, COUNT, COUNT_REF, |
|
11 |
+ COUNT_TOTAL, COUNT_PLUS, COUNT_PLUS_REF, COUNT_MINUS, |
|
12 |
+ COUNT_MINUS_REF, DELCOUNT_PLUS, DELCOUNT_MINUS, |
|
13 |
+ READ_POS_MEAN, READ_POS_MEAN_REF, READ_POS_VAR, |
|
14 |
+ READ_POS_VAR_REF, MDFNE, MDFNE_REF, CODON_STRAND, |
|
15 |
+ COUNT_XS_PLUS, COUNT_XS_PLUS_REF, |
|
16 |
+ COUNT_XS_MINUS, COUNT_XS_MINUS_REF, |
|
17 |
+ COUNT_HIGH_NM, COUNT_HIGH_NM_REF, N_BASE_COLS }; |
|
18 |
+ |
|
19 |
+enum { CODON_PLUS = 1, |
|
20 |
+ CODON_MINUS, |
|
21 |
+ NON_CODON }; |
|
25 | 22 |
|
26 | 23 |
static char *codon_table[64] = |
27 | 24 |
{"AAA", "AAC", "AAG", "AAT", "ACA", "ACC", "ACG", "ACT", |
... | ... |
@@ -33,10 +30,6 @@ static char *codon_table[64] = |
33 | 30 |
"TAA", "TAC", "TAG", "TAT", "TCA", "TCC", "TCG", "TCT", |
34 | 31 |
"TGA", "TGC", "TGG", "TGT", "TTA", "TTC", "TTG", "TTT"}; |
35 | 32 |
|
36 |
- |
|
37 |
- |
|
38 |
- |
|
39 |
- |
|
40 | 33 |
static int NUM_PTRS = 5; |
41 | 34 |
|
42 | 35 |
typedef struct TallyTable { |
... | ... |
@@ -49,11 +42,6 @@ typedef struct TallyTable { |
49 | 42 |
int *count; |
50 | 43 |
int *count_ref; |
51 | 44 |
int *count_total; |
52 |
- int *high_quality; |
|
53 |
- int *high_quality_ref; |
|
54 |
- int *high_quality_total; |
|
55 |
- double *mean_quality; |
|
56 |
- double *mean_quality_ref; |
|
57 | 45 |
int *count_plus; |
58 | 46 |
int *count_plus_ref; |
59 | 47 |
int *count_minus; |
... | ... |
@@ -70,6 +58,8 @@ typedef struct TallyTable { |
70 | 58 |
int *count_xs_plus_ref; |
71 | 59 |
int *count_xs_minus; |
72 | 60 |
int *count_xs_minus_ref; |
61 |
+ int *count_high_nm; |
|
62 |
+ int *count_high_nm_ref; |
|
73 | 63 |
int *strand; |
74 | 64 |
int **cycle_bins; |
75 | 65 |
} TallyTable; |
... | ... |
@@ -77,16 +67,15 @@ typedef struct TallyTable { |
77 | 67 |
typedef struct TallyParam { |
78 | 68 |
int *cycle_breaks; |
79 | 69 |
int n_cycle_bins; |
80 |
- int high_base_quality; |
|
81 | 70 |
int read_length; |
82 | 71 |
double *mdfne_buf; |
83 | 72 |
bool xs; |
73 |
+ int high_nm_score; |
|
84 | 74 |
} TallyParam; |
85 | 75 |
|
86 | 76 |
static SEXP R_TallyTable_new(int n_rows, int n_cycle_bins, bool xs) { |
87 | 77 |
SEXP tally_R; /* the result list */ |
88 | 78 |
PROTECT(tally_R = allocVector(VECSXP, N_BASE_COLS + n_cycle_bins)); |
89 |
- |
|
90 | 79 |
SET_VECTOR_ELT(tally_R, SEQNAMES, allocVector(STRSXP, n_rows)); |
91 | 80 |
SET_VECTOR_ELT(tally_R, POS, allocVector(INTSXP, n_rows)); |
92 | 81 |
SET_VECTOR_ELT(tally_R, REF, allocVector(STRSXP, n_rows)); |
... | ... |
@@ -96,11 +85,6 @@ static SEXP R_TallyTable_new(int n_rows, int n_cycle_bins, bool xs) { |
96 | 85 |
SET_VECTOR_ELT(tally_R, COUNT, allocVector(INTSXP, n_rows)); |
97 | 86 |
SET_VECTOR_ELT(tally_R, COUNT_REF, allocVector(INTSXP, n_rows)); |
98 | 87 |
SET_VECTOR_ELT(tally_R, COUNT_TOTAL, allocVector(INTSXP, n_rows)); |
99 |
- SET_VECTOR_ELT(tally_R, HIGH_QUALITY, allocVector(INTSXP, n_rows)); |
|
100 |
- SET_VECTOR_ELT(tally_R, HIGH_QUALITY_REF, allocVector(INTSXP, n_rows)); |
|
101 |
- SET_VECTOR_ELT(tally_R, HIGH_QUALITY_TOTAL, allocVector(INTSXP, n_rows)); |
|
102 |
- SET_VECTOR_ELT(tally_R, MEAN_QUALITY, allocVector(REALSXP, n_rows)); |
|
103 |
- SET_VECTOR_ELT(tally_R, MEAN_QUALITY_REF, allocVector(REALSXP, n_rows)); |
|
104 | 88 |
SET_VECTOR_ELT(tally_R, COUNT_PLUS, allocVector(INTSXP, n_rows)); |
105 | 89 |
SET_VECTOR_ELT(tally_R, COUNT_PLUS_REF, allocVector(INTSXP, n_rows)); |
106 | 90 |
SET_VECTOR_ELT(tally_R, COUNT_MINUS, allocVector(INTSXP, n_rows)); |
... | ... |
@@ -114,12 +98,15 @@ static SEXP R_TallyTable_new(int n_rows, int n_cycle_bins, bool xs) { |
114 | 98 |
SET_VECTOR_ELT(tally_R, CODON_STRAND, allocVector(INTSXP, n_rows)); |
115 | 99 |
SET_VECTOR_ELT(tally_R, DELCOUNT_PLUS, allocVector(INTSXP, n_rows)); |
116 | 100 |
SET_VECTOR_ELT(tally_R, DELCOUNT_MINUS, allocVector(INTSXP, n_rows)); |
117 |
- SET_VECTOR_ELT(tally_R, COUNT_XS_PLUS, allocVector(INTSXP, n_rows)); |
|
118 |
- SET_VECTOR_ELT(tally_R, COUNT_XS_PLUS_REF, allocVector(INTSXP, n_rows)); |
|
119 |
- SET_VECTOR_ELT(tally_R, COUNT_XS_MINUS, allocVector(INTSXP, n_rows)); |
|
120 |
- SET_VECTOR_ELT(tally_R, COUNT_XS_MINUS_REF, allocVector(INTSXP, n_rows)); |
|
121 |
- |
|
122 |
- |
|
101 |
+ if (xs) { |
|
102 |
+ SET_VECTOR_ELT(tally_R, COUNT_XS_PLUS, allocVector(INTSXP, n_rows)); |
|
103 |
+ SET_VECTOR_ELT(tally_R, COUNT_XS_PLUS_REF, allocVector(INTSXP, n_rows)); |
|
104 |
+ SET_VECTOR_ELT(tally_R, COUNT_XS_MINUS, allocVector(INTSXP, n_rows)); |
|
105 |
+ SET_VECTOR_ELT(tally_R, COUNT_XS_MINUS_REF, allocVector(INTSXP, n_rows)); |
|
106 |
+ } |
|
107 |
+ SET_VECTOR_ELT(tally_R, COUNT_HIGH_NM, allocVector(INTSXP, n_rows)); |
|
108 |
+ SET_VECTOR_ELT(tally_R, COUNT_HIGH_NM_REF, allocVector(INTSXP, n_rows)); |
|
109 |
+ |
|
123 | 110 |
for (int bin = 0; bin < n_cycle_bins; bin++) { |
124 | 111 |
SEXP cycle_bin_R = allocVector(INTSXP, n_rows); |
125 | 112 |
SET_VECTOR_ELT(tally_R, bin + N_BASE_COLS, cycle_bin_R); |
... | ... |
@@ -130,7 +117,7 @@ static SEXP R_TallyTable_new(int n_rows, int n_cycle_bins, bool xs) { |
130 | 117 |
} |
131 | 118 |
|
132 | 119 |
static TallyTable *TallyTable_new(SEXP tally_R, bool xs) { |
133 |
- TallyTable *tally = (TallyTable *) R_alloc(sizeof(TallyTable), 1); |
|
120 |
+ TallyTable *tally = (TallyTable *) S_alloc(sizeof(TallyTable), 1); |
|
134 | 121 |
int n_cycle_bins = length(tally_R) - N_BASE_COLS; |
135 | 122 |
|
136 | 123 |
tally->seqnames_R = VECTOR_ELT(tally_R, SEQNAMES); |
... | ... |
@@ -142,11 +129,6 @@ static TallyTable *TallyTable_new(SEXP tally_R, bool xs) { |
142 | 129 |
tally->count = INTEGER(VECTOR_ELT(tally_R, COUNT)); |
143 | 130 |
tally->count_ref = INTEGER(VECTOR_ELT(tally_R, COUNT_REF)); |
144 | 131 |
tally->count_total = INTEGER(VECTOR_ELT(tally_R, COUNT_TOTAL)); |
145 |
- tally->high_quality = INTEGER(VECTOR_ELT(tally_R, HIGH_QUALITY)); |
|
146 |
- tally->high_quality_ref = INTEGER(VECTOR_ELT(tally_R, HIGH_QUALITY_REF)); |
|
147 |
- tally->high_quality_total = INTEGER(VECTOR_ELT(tally_R, HIGH_QUALITY_TOTAL)); |
|
148 |
- tally->mean_quality = REAL(VECTOR_ELT(tally_R, MEAN_QUALITY)); |
|
149 |
- tally->mean_quality_ref = REAL(VECTOR_ELT(tally_R, MEAN_QUALITY_REF)); |
|
150 | 132 |
tally->count_plus = INTEGER(VECTOR_ELT(tally_R, COUNT_PLUS)); |
151 | 133 |
tally->count_plus_ref = INTEGER(VECTOR_ELT(tally_R, COUNT_PLUS_REF)); |
152 | 134 |
tally->count_minus = INTEGER(VECTOR_ELT(tally_R, COUNT_MINUS)); |
... | ... |
@@ -161,12 +143,18 @@ static TallyTable *TallyTable_new(SEXP tally_R, bool xs) { |
161 | 143 |
tally->delcount_plus = INTEGER(VECTOR_ELT(tally_R, DELCOUNT_PLUS)); |
162 | 144 |
tally->delcount_minus = INTEGER(VECTOR_ELT(tally_R, DELCOUNT_MINUS)); |
163 | 145 |
tally->cycle_bins = (int **) R_alloc(sizeof(int*), n_cycle_bins); |
164 |
- tally->count_xs_plus = INTEGER(VECTOR_ELT(tally_R, COUNT_XS_PLUS)); |
|
165 |
- tally->count_xs_plus_ref = INTEGER(VECTOR_ELT(tally_R, COUNT_XS_PLUS_REF)); |
|
166 |
- tally->count_xs_minus = INTEGER(VECTOR_ELT(tally_R, COUNT_XS_MINUS)); |
|
167 |
- tally->count_xs_minus_ref = INTEGER(VECTOR_ELT(tally_R, COUNT_XS_MINUS_REF)); |
|
146 |
+ if (xs) { |
|
147 |
+ tally->count_xs_plus = INTEGER(VECTOR_ELT(tally_R, COUNT_XS_PLUS)); |
|
148 |
+ tally->count_xs_plus_ref = INTEGER(VECTOR_ELT(tally_R, |
|
149 |
+ COUNT_XS_PLUS_REF)); |
|
150 |
+ tally->count_xs_minus = INTEGER(VECTOR_ELT(tally_R, COUNT_XS_MINUS)); |
|
151 |
+ tally->count_xs_minus_ref = INTEGER(VECTOR_ELT(tally_R, |
|
152 |
+ COUNT_XS_MINUS_REF)); |
|
153 |
+ } |
|
154 |
+ tally->count_high_nm = INTEGER(VECTOR_ELT(tally_R, COUNT_HIGH_NM)); |
|
155 |
+ tally->count_high_nm_ref = INTEGER(VECTOR_ELT(tally_R, |
|
156 |
+ COUNT_HIGH_NM_REF)); |
|
168 | 157 |
|
169 |
- |
|
170 | 158 |
for (int bin = 0; bin < n_cycle_bins; bin++) { |
171 | 159 |
tally->cycle_bins[bin] = INTEGER(VECTOR_ELT(tally_R, bin + N_BASE_COLS)); |
172 | 160 |
} |
... | ... |
@@ -181,11 +169,30 @@ read_total_counts(unsigned char **bytes, int row, int *count_total) { |
181 | 169 |
count_total[row] = count_total_plus + count_total_minus; |
182 | 170 |
} |
183 | 171 |
|
172 |
+static void |
|
173 |
+read_nm_counts(unsigned char **bytes, int row, int *count_high_nm, |
|
174 |
+ int high_nm_score) |
|
175 |
+{ |
|
176 |
+ int n_nm = read_int(bytes); |
|
177 |
+ count_high_nm[row] = 0; |
|
178 |
+ /* FIXME: currently bam_tally gives us counts whether we want them or not */ |
|
179 |
+ for (int index = 0; index < n_nm; index++) { |
|
180 |
+ int nm = read_int(bytes); |
|
181 |
+ int count = read_int(bytes); |
|
182 |
+ if (nm >= high_nm_score) { |
|
183 |
+ count_high_nm[row] += count; |
|
184 |
+ } |
|
185 |
+ } |
|
186 |
+} |
|
187 |
+ |
|
184 | 188 |
static void |
185 | 189 |
read_xs_counts(unsigned char **bytes, int row, int *count_xs_plus, |
186 | 190 |
int *count_xs_minus) |
187 | 191 |
{ |
188 | 192 |
int n_xs = read_int(bytes); |
193 |
+ if (count_xs_plus == NULL) { |
|
194 |
+ return; |
|
195 |
+ } |
|
189 | 196 |
count_xs_plus[row] = 0; |
190 | 197 |
count_xs_minus[row] = 0; |
191 | 198 |
for (int index = 0; index < n_xs; index++) { |
... | ... |
@@ -197,14 +204,8 @@ read_xs_counts(unsigned char **bytes, int row, int *count_xs_plus, |
197 | 204 |
count_xs_minus[row] = count; |
198 | 205 |
} |
199 | 206 |
} |
200 |
-#ifdef DEBUG2 |
|
201 |
- printf("row %d xs counts: %d plus %d minus", row, count_xs_plus[row], count_xs_minus[row]); |
|
202 |
-#endif |
|
203 | 207 |
} |
204 | 208 |
|
205 |
- |
|
206 |
- |
|
207 |
- |
|
208 | 209 |
static void |
209 | 210 |
read_cycle_counts(unsigned char **bytes, int row, TallyParam param, |
210 | 211 |
int *n_cycles, double *read_pos_mean, double *read_pos_var, |
... | ... |
@@ -274,9 +275,6 @@ parse_indels(unsigned char *bytes, int row, |
274 | 275 |
TallyParam param, TallyTable *tally, bool insertion) |
275 | 276 |
{ |
276 | 277 |
int indel_count = read_int(&bytes); |
277 |
-#ifdef DEBUG2 |
|
278 |
- printf("row %d indel count: %d", row, indel_count); |
|
279 |
-#endif |
|
280 | 278 |
for (int indel = 0; indel < indel_count; indel++, row++) { |
281 | 279 |
for (int b = 0; b < param.n_cycle_bins; b++) { |
282 | 280 |
tally->cycle_bins[b][row] = 0; |
... | ... |
@@ -302,72 +300,48 @@ parse_indels(unsigned char *bytes, int row, |
302 | 300 |
read_cycle_counts(&bytes, row, param, tally->n_cycles, |
303 | 301 |
tally->read_pos_mean, tally->read_pos_var, |
304 | 302 |
tally->mdfne, tally->cycle_bins); |
305 |
- /* quality is per-base and so not relevant for indels */ |
|
306 |
- tally->mean_quality[row] = NA_REAL; |
|
307 |
- tally->mean_quality_ref[row] = NA_REAL; |
|
308 |
- tally->high_quality[row] = NA_INTEGER; |
|
309 |
- tally->high_quality_ref[row] = NA_INTEGER; |
|
310 |
- tally->high_quality_total[row] = NA_INTEGER; |
|
303 |
+ read_nm_counts(&bytes, row, tally->count_high_nm, param.high_nm_score); |
|
304 |
+ read_xs_counts(&bytes, row, tally->count_xs_plus, tally->count_xs_minus); |
|
311 | 305 |
/* no position from which to tabulate the cycles */ |
312 | 306 |
tally->n_cycles_ref[row] = NA_INTEGER; |
313 | 307 |
tally->read_pos_mean_ref[row] = NA_REAL; |
314 | 308 |
tally->read_pos_var_ref[row] = NA_REAL; |
315 | 309 |
tally->mdfne_ref[row] = NA_REAL; |
316 |
- } |
|
317 |
- return indel_count; |
|
318 |
-} |
|
319 |
- |
|
320 |
-static void |
|
321 |
-read_quality_counts(unsigned char **bytes, int row, int *high_quality, |
|
322 |
- double *mean_quality, int high_base_quality) |
|
323 |
-{ |
|
324 |
- int n_qualities = read_int(bytes); |
|
325 |
- int total_quality = 0; |
|
326 |
- int total_quality_weight = 0; |
|
327 |
- high_quality[row] = 0; |
|
328 |
- for (int index = 0; index < n_qualities; index++) { |
|
329 |
- int quality = read_int(bytes); |
|
330 |
- int count = read_int(bytes); |
|
331 |
- if (quality >= high_base_quality) { |
|
332 |
- total_quality += quality * count; |
|
333 |
- total_quality_weight += count; |
|
334 |
- high_quality[row] += count; |
|
310 |
+ /* overlapping deletion count only relevant for SNVs */ |
|
311 |
+ tally->delcount_plus[row] = NA_INTEGER; |
|
312 |
+ tally->delcount_minus[row] = NA_INTEGER; |
|
313 |
+ /* information not available for "ref" allele */ |
|
314 |
+ tally->count_high_nm_ref[row] = NA_INTEGER; |
|
315 |
+ if (param.xs) { |
|
316 |
+ tally->count_xs_plus_ref[row] = NA_INTEGER; |
|
317 |
+ tally->count_xs_minus_ref[row] = NA_INTEGER; |
|
335 | 318 |
} |
336 | 319 |
} |
337 |
- mean_quality[row] = total_quality_weight > 0 ? |
|
338 |
- (double)total_quality / total_quality_weight : R_NaN; |
|
320 |
+ return indel_count; |
|
339 | 321 |
} |
340 | 322 |
|
341 | 323 |
static int |
342 | 324 |
read_allele_counts(unsigned char **bytes, int row, SEXP read_R, |
343 |
- int *count_plus, int *count_minus, int *count, int strand) |
|
325 |
+ int *count_plus, int *count_minus, int *count, |
|
326 |
+ int *delcount_plus, int *delcount_minus, |
|
327 |
+ int strand) |
|
344 | 328 |
{ |
345 | 329 |
int n_alleles = 0; |
346 | 330 |
char allele; |
347 | 331 |
char stop = strand == 0 ? '\0' : (char) 255; |
348 |
-#ifdef DEBUG2 |
|
349 |
- printf("Starting read_allele_counts (codon strand %d) at row %d. Total length of read_R is %d\n", strand, row, LENGTH(read_R)); |
|
350 |
-#endif |
|
332 |
+ |
|
351 | 333 |
while((allele = (char)read_char(bytes)) != stop) { |
352 |
-#ifdef DEBUG2 |
|
353 |
- printf("Parsing counts for allele: %d (%s) row %d \n", allele, &allele, row ); |
|
354 |
-#endif |
|
334 |
+ if (allele == '_') { |
|
335 |
+ *delcount_plus = read_int(bytes); |
|
336 |
+ *delcount_minus = read_int(bytes); |
|
337 |
+ continue; |
|
338 |
+ } |
|
355 | 339 |
if(strand == 0) |
356 | 340 |
SET_STRING_ELT(read_R, row, mkCharLen(&allele, 1)); |
357 | 341 |
else |
358 | 342 |
SET_STRING_ELT(read_R, row, mkCharLen(codon_table[(int)allele], 3)); |
359 |
-#ifdef DEBUG2 |
|
360 |
- printf("Reading allele plus count: "); |
|
361 |
-#endif |
|
362 | 343 |
count_plus[row] = read_int(bytes); |
363 |
-#ifdef DEBUG2 |
|
364 |
- printf("%d\n", count_plus[row]); |
|
365 |
- printf("Reading allele minus count: "); |
|
366 |
-#endif |
|
367 | 344 |
count_minus[row] = read_int(bytes); |
368 |
-#ifdef DEBUG2 |
|
369 |
- printf("%d\n", count_minus[row]); |
|
370 |
-#endif |
|
371 | 345 |
count[row] = count_plus[row] + count_minus[row]; |
372 | 346 |
row++; |
373 | 347 |
n_alleles++; |
... | ... |
@@ -376,69 +350,35 @@ read_allele_counts(unsigned char **bytes, int row, SEXP read_R, |
376 | 350 |
return n_alleles; |
377 | 351 |
} |
378 | 352 |
|
379 |
- |
|
380 |
-//format _(int)(int) |
|
381 |
-static void |
|
382 |
-read_del_counts(unsigned char **bytes, int row, int *delcount_plus, |
|
383 |
- int *delcount_minus) |
|
384 |
-{ |
|
385 |
- |
|
386 |
-//This consumes a byte!!! |
|
387 |
- unsigned char mychar = read_char(bytes); |
|
388 |
-#ifdef DEBUG2 |
|
389 |
- printf("in read_del_counts, mchar is (as int) %d", mychar); |
|
390 |
-#endif |
|
391 |
- |
|
392 |
- if(mychar == '_') { |
|
393 |
- |
|
394 |
- |
|
395 |
- *delcount_plus = read_int(bytes); |
|
396 |
- *delcount_minus = read_int(bytes); |
|
397 |
-#ifdef DEBUG2 |
|
398 |
- printf("read_del_counts: '_' character detected. count plus %d count minus %d\n", *delcount_plus, *delcount_minus); |
|
399 |
-#endif |
|
400 |
- } else { |
|
401 |
- //backtrack char consumption if it wasn't a "_" |
|
402 |
- *bytes = *bytes-1; |
|
403 |
- *delcount_plus = 0; |
|
404 |
- *delcount_minus = 0; |
|
405 |
- } |
|
406 |
- return; |
|
407 |
-} |
|
408 |
- |
|
409 | 353 |
static int |
410 | 354 |
parse_alleles(unsigned char *bytes, int row, int ref_row, |
411 | 355 |
TallyParam param, TallyTable *tally, int strand) |
412 | 356 |
{ |
413 | 357 |
|
414 |
- bool have_ref_row = false; |
|
415 | 358 |
read_total_counts(&bytes, row, tally->count_total); |
359 |
+ int delcount_plus = 0, delcount_minus = 0; |
|
416 | 360 |
int n_alleles = read_allele_counts(&bytes, row, tally->read_R, |
417 | 361 |
tally->count_plus, tally->count_minus, |
418 | 362 |
tally->count, |
363 |
+ &delcount_plus, &delcount_minus, |
|
419 | 364 |
strand); |
420 |
- int delcount_plus, delcount_minus; |
|
421 |
-/* read_del_counts(&bytes, row, &delcount_plus, &delcount_minus);*/ |
|
422 | 365 |
for (int allele = 0; allele < n_alleles; allele++, row++) { |
423 | 366 |
tally->n_cycles[row] = 0; |
424 | 367 |
for (int b = 0; b < param.n_cycle_bins; b++) { |
425 | 368 |
tally->cycle_bins[b][row] = 0; |
426 | 369 |
} |
427 |
- tally->high_quality[row] = 0; |
|
428 |
- tally->mean_quality[row] = R_NaN; |
|
429 | 370 |
tally->strand[row] = strand; |
430 |
- tally->delcount_plus[row] = R_NaN;//delcount_plus; |
|
431 |
- tally->delcount_minus[row] = R_NaN;//delcount_minus; |
|
371 |
+ tally->delcount_plus[row] = delcount_plus; |
|
372 |
+ tally->delcount_minus[row] = delcount_minus; |
|
432 | 373 |
if (tally->count[row] > 0) { |
433 | 374 |
read_cycle_counts(&bytes, row, param, tally->n_cycles, |
434 | 375 |
tally->read_pos_mean, tally->read_pos_var, |
435 | 376 |
tally->mdfne, tally->cycle_bins); |
436 |
- read_quality_counts(&bytes, row, tally->high_quality, tally->mean_quality, |
|
437 |
- param.high_base_quality); |
|
377 |
+ read_nm_counts(&bytes, row, tally->count_high_nm, param.high_nm_score); |
|
438 | 378 |
if (param.xs) { |
439 |
- read_xs_counts(&bytes, row, tally->count_xs_plus, tally->count_xs_minus); |
|
379 |
+ read_xs_counts(&bytes, row, tally->count_xs_plus, |
|
380 |
+ tally->count_xs_minus); |
|
440 | 381 |
} |
441 |
- |
|
442 | 382 |
} else { |
443 | 383 |
tally->read_pos_mean[row] = R_NaN; |
444 | 384 |
tally->read_pos_var[row] = NA_REAL; |
... | ... |
@@ -446,27 +386,19 @@ parse_alleles(unsigned char *bytes, int row, int ref_row, |
446 | 386 |
if (param.xs) { |
447 | 387 |
tally->count_xs_plus[row] = 0; |
448 | 388 |
tally->count_xs_minus[row] = 0; |
449 |
- |
|
450 | 389 |
} |
451 | 390 |
} |
452 |
- have_ref_row = true; |
|
453 |
- } |
|
454 |
- int high_quality_total = 0; |
|
455 |
- for (int r = ref_row; r < row; r++) { |
|
456 |
- high_quality_total += tally->high_quality[r]; |
|
457 | 391 |
} |
458 | 392 |
for (int r = ref_row; r < row; r++) { |
459 | 393 |
tally->n_cycles_ref[r] = tally->n_cycles[ref_row]; |
460 | 394 |
tally->count_total[r] = tally->count_total[ref_row]; |
461 |
- tally->mean_quality_ref[r] = tally->mean_quality[ref_row]; |
|
462 |
- tally->high_quality_ref[r] = tally->high_quality[ref_row]; |
|
463 |
- tally->high_quality_total[r] = high_quality_total; |
|
464 | 395 |
tally->count_plus_ref[r] = tally->count_plus[ref_row]; |
465 | 396 |
tally->count_minus_ref[r] = tally->count_minus[ref_row]; |
466 | 397 |
tally->count_ref[r] = tally->count_plus_ref[r] + tally->count_minus_ref[r]; |
467 | 398 |
tally->read_pos_mean_ref[r] = tally->read_pos_mean[ref_row]; |
468 | 399 |
tally->read_pos_var_ref[r] = tally->read_pos_var[ref_row]; |
469 | 400 |
tally->mdfne_ref[r] = tally->mdfne[ref_row]; |
401 |
+ tally->count_high_nm_ref[r] = tally->count_high_nm[ref_row]; |
|
470 | 402 |
if (param.xs) { |
471 | 403 |
tally->count_xs_plus_ref[r] = tally->count_xs_plus[ref_row]; |
472 | 404 |
tally->count_xs_minus_ref[r] = tally->count_xs_minus[ref_row]; |
... | ... |
@@ -474,12 +406,9 @@ parse_alleles(unsigned char *bytes, int row, int ref_row, |
474 | 406 |
SET_STRING_ELT(tally->ref_R, r, STRING_ELT(tally->read_R, ref_row)); |
475 | 407 |
} |
476 | 408 |
|
477 |
-// if (have_ref_row) { |
|
478 | 409 |
/* clear the 'alt' columns for the 'ref' row with NAs */ |
479 | 410 |
SET_STRING_ELT(tally->read_R, ref_row, NA_STRING); |
480 | 411 |
tally->n_cycles[ref_row] = NA_INTEGER; |
481 |
- tally->mean_quality[ref_row] = NA_REAL; |
|
482 |
- tally->high_quality[ref_row] = NA_REAL; |
|
483 | 412 |
tally->count_plus[ref_row] = NA_INTEGER; |
484 | 413 |
tally->count_minus[ref_row] = NA_INTEGER; |
485 | 414 |
tally->count[ref_row] = NA_INTEGER; |
... | ... |
@@ -491,7 +420,6 @@ parse_alleles(unsigned char *bytes, int row, int ref_row, |
491 | 420 |
tally->count_xs_minus[ref_row] = NA_INTEGER; |
492 | 421 |
} |
493 | 422 |
|
494 |
-// } |
|
495 | 423 |
return n_alleles; |
496 | 424 |
} |
497 | 425 |
|
... | ... |
@@ -504,11 +432,9 @@ static int parse_allele_count(unsigned char *bytes) { |
504 | 432 |
int n_alleles = 1; /* always have a reference */ |
505 | 433 |
bytes += sizeof(int) * 4 + 1; /* skip total and reference */ |
506 | 434 |
while(bytes[0] != '\0') { |
507 |
-#ifdef DEBUG2 |
|
508 |
- printf("Found allele %s\n", &bytes[0]); |
|
509 |
-#endif |
|
510 |
- bytes += sizeof(int) * 2 + 1; |
|
511 |
- n_alleles++; |
|
435 |
+ if (bytes[0] != '_') |
|
436 |
+ n_alleles++; |
|
437 |
+ bytes += sizeof(int) * 2 + 1; |
|
512 | 438 |
} |
513 | 439 |
return n_alleles; |
514 | 440 |
} |
... | ... |
@@ -518,9 +444,6 @@ static int parse_codon_count(unsigned char *bytes) { |
518 | 444 |
int n_alleles = 1; /* always have a reference */ |
519 | 445 |
bytes += sizeof(int) * 4 + 1; /* skip reference */ |
520 | 446 |
while((int) bytes[0] != 255 ){ // && bytes [0] != '\0') { |
521 |
-#ifdef DEBUG2 |
|
522 |
- printf("Found codon %d\n", bytes[0]); |
|
523 |
-#endif |
|
524 | 447 |
bytes += sizeof(int) * 2 + 1; |
525 | 448 |
n_alleles++; |
526 | 449 |
} |
... | ... |
@@ -571,38 +494,12 @@ static int parse_interval(IIT_T tally_iit, int index, |
571 | 494 |
SEXP divstring_R; |
572 | 495 |
PROTECT(divstring_R = mkChar(IIT_divstring_from_index(tally_iit, index))); |
573 | 496 |
for (int position = 0; position < width; position++) { |
574 |
-#ifdef DEBUG2 |
|
575 |
- printf("Reading insertion offset: "); |
|
576 |
-#endif |
|
577 | 497 |
int insertion_offset = read_int(&bytes); |
578 |
-#ifdef DEBUG2 |
|
579 |
- printf("%d\n", insertion_offset); |
|
580 |
- printf("Reading deletion offset: "); |
|
581 |
-#endif |
|
582 | 498 |
int deletion_offset = read_int(&bytes); |
583 |
-#ifdef DEBUG2 |
|
584 |
- printf("%d\n", deletion_offset); |
|
585 |
- printf("Reading allele offset: "); |
|
586 |
-#endif |
|
587 | 499 |
int allele_offset = read_int(&bytes); |
588 |
-#ifdef DEBUG2 |
|
589 |
- printf("%d\n", allele_offset); |
|
590 |
- printf("Reading codon plus offset: "); |
|
591 |
-#endif |
|
592 | 500 |
int codon_plus_offset = read_int(&bytes); |
593 |
-#ifdef DEBUG2 |
|
594 |
- printf("%d\n", codon_plus_offset); |
|
595 |
- printf("Reading codon_minus offset: "); |
|
596 |
-#endif |
|
597 | 501 |
int codon_minus_offset = read_int(&bytes); |
598 |
-#ifdef DEBUG2 |
|
599 |
- printf("%d\n", codon_minus_offset); |
|
600 |
- printf("Reading 'next' offset: "); |
|
601 |
-#endif |
|
602 | 502 |
int next_offset = read_int(&bytes); |
603 |
-#ifdef DEBUG2 |
|
604 |
- printf("%d\n", next_offset); |
|
605 |
-#endif |
|
606 | 503 |
int ref_row = row; |
607 | 504 |
bytes -= 4; /* rewind from read-ahead */ |
608 | 505 |
if (codon_plus_offset - allele_offset > 0) |
... | ... |
@@ -616,12 +513,10 @@ static int parse_interval(IIT_T tally_iit, int index, |
616 | 513 |
param, tally, false); |
617 | 514 |
|
618 | 515 |
if (codon_minus_offset - codon_plus_offset > 0) { |
619 |
-// ref_row = row; |
|
620 | 516 |
row += parse_alleles(base + codon_plus_offset, row, row, |
621 | 517 |
param, tally, CODON_PLUS); |
622 | 518 |
} |
623 | 519 |
if (next_offset - codon_minus_offset > 0) { |
624 |
-// ref_row = row; |
|
625 | 520 |
row += parse_alleles(base + codon_minus_offset, row, row, |
626 | 521 |
param, tally, CODON_MINUS); |
627 | 522 |
} |
... | ... |
@@ -640,17 +535,9 @@ static SEXP parse_all(IIT_T tally_iit, TallyParam param) |
640 | 535 |
int n_rows = 0; |
641 | 536 |
/* loop over the IIT, getting the total number of rows |
642 | 537 |
this is (num alts + 1) for every position. */ |
643 |
-#ifdef DEBUG2 |
|
644 |
- printf("Total number of intervals: %d\n", IIT_total_nintervals(tally_iit)); |
|
645 |
-#endif |
|
646 | 538 |
for (int index = 1; index <= IIT_total_nintervals(tally_iit); index++) { |
647 | 539 |
n_rows += count_rows_for_interval(tally_iit, index); |
648 | 540 |
} |
649 |
- |
|
650 |
-#ifdef DEBUG2 |
|
651 |
- printf("Total number of rows: %d\n", n_rows); |
|
652 |
-#endif |
|
653 |
- |
|
654 | 541 |
|
655 | 542 |
SEXP tally_R; |
656 | 543 |
PROTECT(tally_R = R_TallyTable_new(n_rows, param.n_cycle_bins, param.xs)); |
... | ... |
@@ -699,26 +586,27 @@ static SEXP parse_some(IIT_T tally_iit, TallyParam param, |
699 | 586 |
/* FORMAT |
700 | 587 |
|
701 | 588 |
block header: |
702 |
- [0][ins][del][mm] |
|
589 |
+ [0][ins][del][mm][codon+][codon-] |
|
703 | 590 |
|
704 |
- mismatches for each position: |
|
705 |
- [t+][t-][rn][r+][r-]([an][a+][a-])*[\0]([c#]([cv][cc])*)*([q#]([qv][qc])*)* |
|
591 |
+ mismatches/codons: |
|
592 |
+ [t+][t-][rn][r+][r-]([an][a+][a-])* |
|
593 |
+ mismatches only: |
|
594 |
+ '_'[d+][d-][\0]([c#]([cv][cc])*[nm#]([nmv][nmc])*[xs#]([xsv][xsc])*)* |
|
706 | 595 |
|
707 | 596 |
insertions: |
708 |
- [i#]([seq][t+][t-][c#]([cv][cc])*)* |
|
597 |
+ [i#]([t+][t-][r+][r-][seq][c#]([cv][cc])*[nm#]([nmv][nmc])*[xs#]([xsv][xsc])* |
|
709 | 598 |
|
710 | 599 |
deletions: |
711 |
- [d#]([seq][t+][t-][c#]([cv][cc])*)* |
|
600 |
+ [d#]([t+][t-][r+][r-][seq][c#]([cv][cc])*[nm#]([nmv][nmc])*[xs#]([xsv][xsc])* |
|
712 | 601 |
*/ |
713 | 602 |
|
714 | 603 |
SEXP R_tally_iit_parse(SEXP tally_iit_R, SEXP cycle_breaks_R, |
715 |
- SEXP high_base_quality_R, SEXP which_R, |
|
716 |
- SEXP read_length_R, SEXP xs_R) |
|
604 |
+ SEXP which_R, |
|
605 |
+ SEXP read_length_R, SEXP xs_R, SEXP high_nm_score_R) |
|
717 | 606 |
{ |
718 | 607 |
IIT_T tally_iit = (IIT_T) R_ExternalPtrAddr(tally_iit_R); |
719 | 608 |
SEXP tally_R; |
720 | 609 |
TallyParam param; |
721 |
- param.high_base_quality = asInteger(high_base_quality_R); |
|
722 | 610 |
param.cycle_breaks = |
723 | 611 |
cycle_breaks_R == R_NilValue ? NULL : INTEGER(cycle_breaks_R); |
724 | 612 |
param.n_cycle_bins = |
... | ... |
@@ -730,6 +618,7 @@ SEXP R_tally_iit_parse(SEXP tally_iit_R, SEXP cycle_breaks_R, |
730 | 618 |
param.mdfne_buf = NULL; |
731 | 619 |
} |
732 | 620 |
param.xs = asLogical(xs_R); |
621 |
+ param.high_nm_score = asInteger(high_nm_score_R); |
|
733 | 622 |
|
734 | 623 |
if (which_R == R_NilValue) { |
735 | 624 |
tally_R = parse_all(tally_iit, param); |