Browse code

start on C-level interface to IIT files

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/gmapR@125153 bc3139a8-67e5-0310-9ffc-ced21a209358

Michael Lawrence authored on 14/12/2016 18:20:54
Showing4 changed files

... ...
@@ -24,10 +24,10 @@ Collate: GmapBamReader-class.R GmapGenomeDirectory-class.R
24 24
         GmapGenome-class.R GmapSnpDirectory-class.R GmapSnps-class.R
25 25
         GmapParam-class.R GsnapParam-class.R
26 26
         GsnapOutput-class.R GmapOutput-class.R
27
-        atoiindex-command.R
27
+        atoiindex-command.R iit-format.R
28 28
         BamTallyParam-class.R bam_tally-command.R cmetindex-command.R
29 29
         get-genome-command.R gmap-command.R gmap_build-command.R
30
-        gsnap-command.R iit-format.R iit_store-command.R info.R
30
+        gsnap-command.R iit_store-command.R info.R
31 31
         snpindex-command.R system.R test_gmapR_package.R
32 32
         makeGmapGenomePackage.R TP53Genome.R utils.R asSystemCall.R
33 33
 biocViews: Alignment
... ...
@@ -7,13 +7,13 @@
7 7
 ### Raw tally result
8 8
 ###
9 9
 
10
-setClass("TallyIIT", representation(ptr = "externalptr",
11
-                                    genome = "GmapGenome",
10
+setClass("TallyIIT", representation(genome = "GmapGenome",
12 11
                                     bam = "BamFile",
13 12
                                     xs = "logical",
14 13
                                     read_pos = "logical",
15 14
                                     nm = "logical",
16
-                                    codon = "logical"))
15
+                                    codon = "logical"),
16
+         contains = "IIT")
17 17
 
18 18
 TallyIIT <- function(ptr, genome, bam, xs, read_pos, nm, codon) {
19 19
   new("TallyIIT", ptr = ptr, genome = genome, bam = bam, xs = xs,
... ...
@@ -10,12 +10,63 @@
10 10
 ### IITFile class
11 11
 ###
12 12
 
13
-setClass("IITFile", contains = "RTLFile")
13
+setClass("IIT", representation(ptr = "externalptr"))
14
+setClassUnion("IITORNULL", c("IIT", "NULL"))
15
+
16
+setRefClass("IITFile",
17
+            fields=c(iit="IITORNULL"),
18
+            contains="RTLFile")
14 19
 
15 20
 IITFile <- function(resource) {
16 21
   new("IITFile", resource = resource)
17 22
 }
18 23
 
24
+IIT <- function(ptr) {
25
+    new("IIT", ptr=ptr)
26
+}
27
+
28
+### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
29
+### Accessors
30
+###
31
+
32
+iit <- function(x) x$iit
33
+`iit<-` <- function(x, value) {
34
+    x$iit <- value
35
+    x
36
+}
37
+
38
+fieldNames <- function(x) {
39
+    .Call(R_iit_fieldNames(x), iit(x))
40
+}
41
+
42
+typeNames <- function(x) {
43
+    .Call(R_iit_typeNames(x), iit(x))
44
+}
45
+
46
+### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
47
+### Open/close
48
+###
49
+
50
+open.IITFile <- function(con, ranges = TRUE, labels = TRUE) {
51
+    stopifnot(isTRUEorFALSE(ranges),
52
+              isTRUEorFALSE(labels),
53
+              !isOpen(con))
54
+    iit(con) <- IIT(.Call(R_iit_read, resource(con), ranges, labels))
55
+    invisible(con)
56
+}
57
+
58
+close.IITFile <- function(con) {
59
+    iit(con) <- NULL
60
+    invisible(con)
61
+}
62
+
63
+setMethod("isOpen", "IITFile", function(con, rw=c("", "read", "write")) {
64
+    rw <- match.arg(rw)
65
+    if (rw == "write")
66
+        FALSE
67
+    else !is.null(iit(con))
68
+})
69
+
19 70
 ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
20 71
 ### Export
21 72
 ###
... ...
@@ -36,3 +87,96 @@ setMethod("export", c("ANY", "IITFile"), function(object, con, ...) {
36 87
 ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
37 88
 ### Import
38 89
 ###
90
+
91
+normArgWhich <- function(which) {
92
+    if (!is.null(which) && !is.character(which)) {
93
+        which <- as(which, "GRanges")
94
+        sign <- as.integer(strand(which))
95
+        sign[sign == 1L] <- 4L
96
+        sign <- sign - 3L
97
+        which <- list(as.character(seqnames(which)), start(which), end(which),
98
+                      sign)
99
+    }
100
+    which
101
+}
102
+
103
+openForWhich <- function(con, which) {
104
+    labels <- is.character(which)
105
+    if (labels && anyNA(which))
106
+        stop("query labels cannot be NA")
107
+    ranges <- !labels && !is.null(which)
108
+    if (ranges)
109
+        which <- as(which, "GRanges")
110
+    open(con, ranges=ranges, labels=labels)
111
+    which
112
+}
113
+
114
+normArgFields <- function(con, fields) {
115
+    iit_fields <- fieldNames(con)
116
+    bad_fields <- setdiff(fields, iit_fields)
117
+    if (length(bad_fields) > 0L)
118
+        stop("invalid 'fields' requested: ", paste(bad_fields, collapse=", "))
119
+    fields
120
+}
121
+
122
+normArgType <- function(con, type) {
123
+    if (is.null(type))
124
+        return(NULL)
125
+    if (!isSingleString(type))
126
+        stop("'type' must be NULL or a single, non-NA string")
127
+    iit_types <- typeNames(con)
128
+    if (!(type %in% iit_types))
129
+        stop("'type' must be one of ", paste(iit_types, collapse=", "))
130
+    type
131
+}
132
+
133
+### NOTE: if 'exact=TRUE' and is.null(type), only gets intervals with NO type
134
+setMethod("import", "IITFile",
135
+          function(con, format, text, which = NULL, type = NULL,
136
+                   fields = fieldNames(con),
137
+                   ignore.strand = FALSE, exact = FALSE,
138
+                   as = c("GRanges", "GRangesList", "DataFrame"))
139
+{
140
+    if (!missing(format))
141
+        checkArgFormat(con, format)
142
+    if (!missing(text))
143
+        stop("'text' not supported")
144
+    if (!isOpen(con)) {
145
+        openForWhich(con, which)    
146
+    }
147
+    type <- normArgType(con, type)
148
+    if (!is.null(type) && is.character(which))
149
+        stop("cannot restrict by 'type' when 'which' is character")
150
+    fields <- normArgFields(con, fields)
151
+    stopifnot(isTRUEorFALSE(ignore.strand),
152
+              isTRUEorFALSE(exact))
153
+    as <- match.arg(as)
154
+    exact.strand <- exact && !ignore.strand && !is.null(which)
155
+    include.ranges <- as %in% c("GRanges", "GRangesList") || exact.strand
156
+    
157
+    result <- .Call(R_iit_read, iit(con), which, type, fields, ignore.strand,
158
+                    exact, include.ranges)
159
+    names(ans) <- c(if (include.ranges)
160
+                        c("seqnames", "start", "width", "strand"),
161
+                    "which", "mcols")
162
+    mcols <- DataFrame(result["which"], setNames(result[["mcols"]], fields))
163
+    result[["mcols"]] <- NULL
164
+    result[["which"]] <- NULL
165
+    
166
+    if (as %in% c("GRanges", "GRangesList")) {
167
+        ans <- makeGRangesFromDataFrame(DataFrame(result, mcols))
168
+        if (as == "GRangesList")
169
+            ans <- split(ans, mcols[["which"]])
170
+    } else {
171
+        ans <- mcols
172
+    }
173
+    
174
+    if (exact.strand) {
175
+        which <- which[mcols[["which"]]]
176
+        ans <- extractROWS(ans, strand(which) == "*" |
177
+                                result[["strand"]] == "*" |
178
+                                ans[["strand"]] == strand(which))
179
+    }
180
+
181
+    ans
182
+})
... ...
@@ -1,3 +1,4 @@
1
+#include <stdlib.h>
1 2
 #include <gstruct/iit-read.h>
2 3
 
3 4
 #include "iit.h"
... ...
@@ -14,14 +15,308 @@ SEXP R_IIT_new(IIT_T iit) {
14 15
   return iit_R;
15 16
 }
16 17
 
17
-SEXP R_iit_read(SEXP iitfile_R) {
18
+/*
19
+ IIT API:
20
+ * IIT_read
21
+ * IIT_annotation
22
+ * IIT_interval(iit, which), Interval_low, Interval_length, Interval_sign
23
+
24
+ Coordinate mapping (read divs, not labels):
25
+   Any overlap:
26
+   * IIT_get(&(*nmatches),*iit,*divstring,*coordstart,*coordend,sortp);
27
+   By strand:
28
+   * IIT_get_signed()
29
+   Exact coordinate match:
30
+   * IIT_get_exact_multiple()
31
+ 
32
+ Can restrict by "type" of feature:
33
+ * typeint = IIT_typeint(*iit,typestring)
34
+ * IIT_get_typed(&(*nmatches),*iit,*divstring,*coordstart,*coordend,typeint,
35
+                 sortp);
36
+ * IIT_get_typed_signed()
37
+
38
+ Restrict by label (read labels, not divs):
39
+ * IIT_find(&(*nmatches),iit,query);
40
+
41
+ Read standard annotation:
42
+ * field = IIT_annotation(&restofheader,iit,index,&allocp);
43
+ Or a specific field:
44
+ * fieldint = IIT_fieldint(iit,fieldstring)
45
+ * field = IIT_fieldvalue(iit,index,fieldint);
46
+
47
+ Get labels:
48
+ * IIT_label(iit, index, &allocp)
49
+ 
50
+ Introspection: 
51
+ Metadata:
52
+ * IIT_name
53
+ * IIT_version
54
+ * IIT_total_nintervals
55
+ Types:
56
+ * IIT_ntypes(iit)
57
+ * IIT_typestring (iit, typeint);
58
+ Fields:
59
+ * IIT_nfields(iit)
60
+ * IIT_fieldstring(iit, fieldint);
61
+*/
62
+
63
+typedef struct IITMatches {
64
+    IIT_T iit;
65
+    int **subscripts;
66
+    int *nsubscripts;
67
+    int nqueries;
68
+} IITMatches;
69
+
70
+static IITMatches _new_IITMatches(IIT_T iit, int nqueries) {
71
+    IITMatches matches;
72
+    matches.iit = iit;
73
+    matches.subscripts = (int **) R_alloc(sizeof(int *), nqueries);
74
+    matches.nsubscripts = (int *) R_alloc(sizeof(int), nqueries);
75
+    matches.nqueries = nqueries;
76
+    return matches;
77
+}
78
+
79
+SEXP R_iit_open(SEXP iitfile_R, SEXP divread_R, SEXP labels_read_R) {
18 80
   char *iitfile = (char *) CHAR(asChar(iitfile_R));
81
+  int divread = asLogical(divread_R) ? READ_ALL : READ_NONE;
82
+  bool labels_read = asLogical(labels_read_R);
19 83
   IIT_T iit = IIT_read(iitfile, /*name*/NULL, /*readonlyp*/true,
20
-                       /*divread*/READ_ALL, /*divstring*/NULL,
21
-                       /*add_iit_p*/false, /*labels_read_p*/true);
84
+                       divread, /*divstring*/NULL,
85
+                       /*add_iit_p*/false, labels_read);
22 86
   return R_IIT_new(iit);
23 87
 }
24 88
 
89
+static IITMatches _iit_find(IIT_T iit, SEXP which_R) {
90
+    IITMatches matches = _new_IITMatches(iit, length(which_R));
91
+    for (int i = 0; i < length(which_R); i++) {
92
+	matches.subscripts[i] =
93
+	    IIT_find(matches.nsubscripts + i, iit,
94
+		     (char *)CHAR(STRING_ELT(which_R, i)));
95
+    }
96
+    return matches;    
97
+}
98
+
99
+static IITMatches _iit_get_exact_multiple(IIT_T iit,
100
+					  SEXP chr_R, int *start, int *end,
101
+					  int type)
102
+{
103
+    IITMatches matches = _new_IITMatches(iit, length(chr_R));
104
+    for (int i = 0; i < length(chr_R); i++) {
105
+	matches.subscripts[i] =
106
+	    IIT_get_exact_multiple(matches.nsubscripts + i, iit,
107
+				   (char *)CHAR(STRING_ELT(chr_R, i)),
108
+				   start[i], end[i], type);
109
+    }
110
+    return matches;
111
+}
112
+
113
+static IITMatches _iit_get_typed(IIT_T iit,
114
+				 SEXP chr_R, int *start, int *end,
115
+				 int type)
116
+{
117
+    IITMatches matches = _new_IITMatches(iit, length(chr_R));
118
+    for (int i = 0; i < length(chr_R); i++) {
119
+	matches.subscripts[i] =
120
+	    IIT_get_typed(matches.nsubscripts + i, iit,
121
+			  (char *)CHAR(STRING_ELT(chr_R, i)),
122
+			  start[i], end[i], type,
123
+			  /*sortp*/false);
124
+    }
125
+    return matches;
126
+}
127
+
128
+static IITMatches _iit_get(IIT_T iit, SEXP chr_R, int *start, int *end)
129
+{
130
+    IITMatches matches = _new_IITMatches(iit, length(chr_R));
131
+    for (int i = 0; i < length(chr_R); i++) {
132
+	matches.subscripts[i] =
133
+	    IIT_get(matches.nsubscripts + i, iit,
134
+		    (char *)CHAR(STRING_ELT(chr_R, i)),
135
+		    start[i], end[i],
136
+		    /*sortp*/false);
137
+    }
138
+    return matches;
139
+}
140
+
141
+static IITMatches _iit_get_typed_signed(IIT_T iit,
142
+					SEXP chr_R, int *start, int *end,
143
+					int type, int *sign)
144
+{
145
+    IITMatches matches = _new_IITMatches(iit, length(chr_R));
146
+    for (int i = 0; i < length(chr_R); i++) {
147
+	matches.subscripts[i] =
148
+	    IIT_get_typed_signed(matches.nsubscripts + i, iit,
149
+				 (char *)CHAR(STRING_ELT(chr_R, i)),
150
+				 start[i], end[i], type, sign[i],
151
+				 /*sortp*/false);
152
+    }
153
+    return matches;
154
+}
155
+
156
+static IITMatches _iit_get_signed(IIT_T iit,
157
+				  SEXP chr_R, int *start, int *end,
158
+				  int *sign)
159
+{
160
+    IITMatches matches = _new_IITMatches(iit, length(chr_R));
161
+    for (int i = 0; i < length(chr_R); i++) {
162
+	matches.subscripts[i] =
163
+	    IIT_get_signed(matches.nsubscripts + i, iit,
164
+			   (char *)CHAR(STRING_ELT(chr_R, i)),
165
+			   start[i], end[i], sign[i],
166
+			   /*sortp*/false);
167
+    }
168
+    return matches;
169
+}
170
+
171
+static IITMatches _iit_get_for_coords(IIT_T iit, SEXP which_R, SEXP type_R,
172
+				      SEXP ignore_strand_R, SEXP exact_R)
173
+{
174
+    SEXP chr_R = VECTOR_ELT(which_R, 0);
175
+    int *start = INTEGER(VECTOR_ELT(which_R, 1));
176
+    int *end = INTEGER(VECTOR_ELT(which_R, 2));
177
+    int *sign = INTEGER(VECTOR_ELT(which_R, 3));
178
+    int type = type_R == R_NilValue ? 0 :
179
+	IIT_typeint(iit, (char *)CHAR(asChar(type_R)));
180
+    bool ignore_strand = asLogical(ignore_strand_R);
181
+    bool exact = asLogical(exact_R);
182
+    IITMatches matches;
183
+    
184
+    if (exact) {
185
+	/* sign filtering happens in R */
186
+	matches = _iit_get_exact_multiple(iit, chr_R, start, end, type);
187
+    } else if (ignore_strand) {
188
+	if (type > 0) {
189
+	    matches = _iit_get_typed(iit, chr_R, start, end, type);
190
+	} else {
191
+	    matches = _iit_get(iit, chr_R, start, end);
192
+	}
193
+    } else {
194
+	if (type > 0) {
195
+	    matches = _iit_get_typed_signed(iit, chr_R, start, end, type, sign);
196
+	} else {
197
+	    matches = _iit_get_signed(iit, chr_R, start, end, sign);
198
+	}
199
+    }
200
+    
201
+    return matches;
202
+}
203
+
204
+static IITMatches _iit_get_for_labels(IIT_T iit, SEXP which_R) {
205
+    return _iit_find(iit, which_R);
206
+}
207
+
208
+enum { CHR, START, WIDTH, STRAND, ANNO, ANS_LENGTH };
209
+
210
+static SEXP _convert_matches(IITMatches matches, bool ret_ranges, SEXP fields_R)
211
+{
212
+    SEXP ans, chr_R, start_R, width_R, strand_R, anno_R;
213
+    int nfields = fields_R == R_NilValue ? 1 : length(fields_R);
214
+    int *fields;
215
+    int nmatches = 0;
216
+    IIT_T iit = matches.iit;
217
+
218
+    for (int m = 0; m < matches.nqueries; m++) {
219
+	nmatches += matches.nsubscripts[m];
220
+    }
221
+
222
+    PROTECT(ans = allocVector(VECSXP, ANS_LENGTH));
223
+    if (ret_ranges) {
224
+	chr_R = allocVector(STRSXP, nmatches);
225
+	SET_VECTOR_ELT(ans, CHR, chr_R);
226
+	start_R = allocVector(INTSXP, nmatches);
227
+	SET_VECTOR_ELT(ans, START, start_R);
228
+	width_R = allocVector(INTSXP, nmatches);
229
+	SET_VECTOR_ELT(ans, WIDTH, width_R);
230
+	strand_R = allocVector(INTSXP, nmatches);
231
+	SET_VECTOR_ELT(ans, STRAND, strand_R);
232
+    }
233
+    anno_R = allocVector(VECSXP, nfields);
234
+    SET_VECTOR_ELT(ans, ANNO, anno_R);
235
+
236
+    for (int f = 0; f < nfields; f++) {
237
+	SET_VECTOR_ELT(anno_R, f, allocVector(STRSXP, nmatches));
238
+    }
239
+
240
+    if (fields_R != R_NilValue) {
241
+	fields = (int *)R_alloc(sizeof(int), nfields);
242
+	for (int f = 0; f < nfields; f++) {
243
+	    fields[f] = IIT_fieldint(iit, (char *)STRING_ELT(fields_R, f));
244
+	}
245
+    }
246
+    
247
+    for (int i = 0; i < nmatches; i++) {
248
+	if (ret_ranges) {
249
+	    Interval_T interval = IIT_interval(iit, i);
250
+	    SET_STRING_ELT(chr_R, i, mkChar(IIT_divstring_from_index(iit, i)));
251
+	    INTEGER(start_R)[i] = Interval_low(interval);
252
+	    INTEGER(width_R)[i] = Interval_length(interval);
253
+	    INTEGER(strand_R)[i] = Interval_sign(interval);
254
+	}
255
+	if (fields_R == R_NilValue) {
256
+	    char *restofheader;
257
+	    bool allocp;
258
+	    SET_STRING_ELT(anno_R, i,
259
+			   mkChar(IIT_annotation(&restofheader, iit, i,
260
+						 &allocp)));
261
+	    if (allocp == true) {
262
+		free(restofheader);
263
+	    }
264
+	} else {
265
+	    for (int f = 0; f < nfields; f++) {
266
+		SET_STRING_ELT(VECTOR_ELT(anno_R, f), i,
267
+			       mkChar(IIT_fieldvalue(iit, i, fields[f])));
268
+	    }
269
+	}
270
+    }
271
+
272
+    UNPROTECT(1);
273
+    return ans;
274
+}
275
+
276
+SEXP R_iit_read(SEXP iit_R, SEXP which_R, SEXP type_R, SEXP fields_R,
277
+		SEXP ignore_strand_R, SEXP exact_R, SEXP ret_ranges_R)
278
+{
279
+    IITMatches matches;
280
+    IIT_T iit = R_ExternalPtrAddr(iit_R);
281
+    bool ret_ranges = asLogical(ret_ranges_R);
282
+
283
+    if (TYPEOF(which_R) == VECSXP) {
284
+	matches = _iit_get_for_coords(iit, which_R, type_R,
285
+				      ignore_strand_R, exact_R);
286
+    } else {
287
+	matches = _iit_get_for_labels(iit, which_R);
288
+    }
289
+
290
+    return _convert_matches(matches, ret_ranges, fields_R);
291
+}
292
+
293
+SEXP R_iit_typeNames(SEXP iit_R) {
294
+    IIT_T iit = R_ExternalPtrAddr(iit_R);
295
+    SEXP ans;
296
+    PROTECT(ans = allocVector(STRSXP, IIT_ntypes(iit)));
297
+    for (int i = 0; i < IIT_ntypes(iit); i++) {
298
+	SET_STRING_ELT(ans, i, mkChar(IIT_typestring(iit, i)));
299
+    }
300
+    UNPROTECT(1);
301
+    return ans;
302
+}
303
+
304
+SEXP R_iit_fieldNames(SEXP iit_R) {
305
+    IIT_T iit = R_ExternalPtrAddr(iit_R);
306
+    SEXP ans;
307
+    PROTECT(ans = allocVector(STRSXP, IIT_nfields(iit)));
308
+    for (int i = 0; i < IIT_nfields(iit); i++) {
309
+	SET_STRING_ELT(ans, i, mkChar(IIT_fieldstring(iit, i)));
310
+    }
311
+    UNPROTECT(1);
312
+    return ans;
313
+}
314
+
315
+SEXP R_iit_length(SEXP iit_R) {
316
+    IIT_T iit = R_ExternalPtrAddr(iit_R);
317
+    return ScalarInteger(IIT_total_nintervals(iit));
318
+}
319
+
25 320
 /* 'IIT_output_direct' is not included in the libgstruct binary */
26 321
 
27 322
 /*