Browse code

add universal_set in make_comb_set()

jokergoo authored on 27/03/2019 16:20:24
Showing 2 changed files

... ...
@@ -1,5 +1,7 @@
1 1
 
2
-make_comb_mat_from_matrix = function(x, mode, top_n_sets = Inf, min_set_size = -Inf, complement_size = NULL) {
2
+make_comb_mat_from_matrix = function(x, mode, top_n_sets = Inf, min_set_size = -Inf, 
3
+	universal_set = NULL, complement_size = NULL) {
4
+
3 5
 	# check whether x is a binary matrix
4 6
 	if(is.data.frame(x)) {
5 7
 		lc = sapply(x, function(x) {
... ...
@@ -38,6 +40,18 @@ make_comb_mat_from_matrix = function(x, mode, top_n_sets = Inf, min_set_size = -
38 40
 	set_size = set_size[l]
39 41
 	x = x[, l, drop = FALSE]
40 42
 	x = x[rowSums(x) > 0, , drop = FALSE]
43
+
44
+	if(!is.null(universal_set)) {
45
+		if(is.null(rownames(x))) {
46
+			stop_wrap("`x` should have row names when `universal_set` is set.")
47
+		}
48
+
49
+    	x = x[intersect(rownames(x), universal_set), , drop = FALSE]
50
+    	if(nrow(x) == 0) {
51
+    		stop_wrap("There is no combination set left after intersecting to `universal_set`.")
52
+    	}
53
+    	complement_size = length(setdiff(universal_set, rownames(x)))
54
+    }
41 55
 	
42 56
 	comb_mat = unique(x)
43 57
 	rn = apply(comb_mat, 1, binaryToInt)
... ...
@@ -93,22 +107,47 @@ make_comb_mat_from_matrix = function(x, mode, top_n_sets = Inf, min_set_size = -
93 107
 		})
94 108
 	}
95 109
 
110
+	#normalize comb_mat to 2^n - 1 columns
111
+	n = nrow(comb_mat)
112
+	if(ncol(comb_mat) < 2^n - 1) {
113
+		full_comb_mat = matrix(0, nrow = n, ncol = 2^n - 1)
114
+		j = 1
115
+		for(k in seq_len(n)) {
116
+			comb = combn(n, k)
117
+	        for(i in 1:ncol(comb)) {
118
+	            full_comb_mat[comb[, i], j] = TRUE
119
+	            j = j + 1
120
+	        }
121
+		}
122
+		ind = which(! apply(full_comb_mat, 2, binaryToInt) %in% apply(comb_mat, 2, binaryToInt))
123
+		comb_mat = cbind(comb_mat, full_comb_mat[, ind, drop = FALSE])
124
+		comb_size = c(comb_size, rep(0, length(ind)))
125
+		
126
+	}
127
+
96 128
 	if(!is.null(complement_size)) {
97 129
 		comb_mat = cbind(rep(0, nrow(comb_mat)), comb_mat)
98 130
 		comb_size = c(complement_size, comb_size)
99 131
 	}
100 132
 
133
+	od = order(apply(comb_mat, 2, function(x) sum(x*seq_along(x))))
134
+	comb_mat = comb_mat[, od, drop = FALSE]
135
+	comb_size = comb_size[od]
136
+
101 137
 	attr(comb_mat, "set_size") = set_size
102 138
 	attr(comb_mat, "comb_size") = comb_size
103 139
 	attr(comb_mat, "mode") = mode
104 140
 	attr(comb_mat, "set_on_rows") = TRUE
105 141
 	attr(comb_mat, "x") = x
142
+	attr(comb_mat, "universal_set") = universal_set
106 143
 	class(comb_mat) = c("comb_mat", "matrix")
107 144
 	return(comb_mat)
108 145
 
109 146
 }
110 147
 
111
-make_comb_mat_from_list = function(lt, mode, value_fun = length, top_n_sets = Inf, min_set_size = -Inf, complement_size = NULL) {
148
+make_comb_mat_from_list = function(lt, mode, value_fun = length, top_n_sets = Inf, 
149
+	min_set_size = -Inf, universal_set = NULL, complement_size = NULL) {
150
+
112 151
 	n = length(lt)
113 152
     nm = names(lt)
114 153
     if(is.null(nm)) {
... ...
@@ -145,6 +184,15 @@ make_comb_mat_from_list = function(lt, mode, value_fun = length, top_n_sets = In
145 184
 	lt = lt[l]
146 185
 	n = length(lt)
147 186
     nm = names(lt)
187
+
188
+    if(!is.null(universal_set)) {
189
+    	lt = lapply(lt, function(x) intersect(x, universal_set))
190
+    	complement_set = universal_set
191
+    	for(i in seq_along(lt)) {
192
+    		complement_set = setdiff(complement_set, lt[[i]])
193
+    	}
194
+    	complement_size = value_fun(complement_set)
195
+    }
148 196
     
149 197
     comb_mat = matrix(FALSE, nrow = n, ncol = sum(choose(n, 1:n)))
150 198
     rownames(comb_mat) = nm
... ...
@@ -199,6 +247,7 @@ make_comb_mat_from_list = function(lt, mode, value_fun = length, top_n_sets = In
199 247
 	attr(comb_mat, "mode") = mode
200 248
 	attr(comb_mat, "set_on_rows") = TRUE
201 249
 	attr(comb_mat, "lt") = lt
250
+	attr(comb_mat, "universal_set") = universal_set
202 251
 	class(comb_mat) = c("comb_mat", "matrix")
203 252
 	return(comb_mat)
204 253
 }
... ...
@@ -208,21 +257,28 @@ make_comb_mat_from_list = function(lt, mode, value_fun = length, top_n_sets = In
208 257
 #
209 258
 # == param
210 259
 # -lt A list of vectors.
260
+# -universal_set The universal set.
211 261
 #
212 262
 # == details
213 263
 # It converts the list which have m sets to a binary matrix with n rows and m columns
214
-# where n is the number of union of all sets in the list.
264
+# where n is the size of universal set.
215 265
 #
216 266
 # == example
217 267
 # set.seed(123)
218
-# lt = list(a = sample(letters, 10),
219
-#           b = sample(letters, 15),
220
-#           c = sample(letters, 20))
268
+# lt = list(a = sample(letters, 5),
269
+#           b = sample(letters, 10),
270
+#           c = sample(letters, 15))
221 271
 # list_to_matrix(lt)
222
-list_to_matrix = function(lt) {
223
-	cn = unique(unlist(lt))
224
-	mat = matrix(0, nrow = length(cn), ncol = length(lt))
225
-	rownames(mat) = cn
272
+# list_to_matrix(lt, universal_set = letters)
273
+list_to_matrix = function(lt, universal_set = NULL) {
274
+	if(!is.null(universal_set)) {
275
+		lt = lapply(lt, function(x) intersect(x, universal_set))
276
+	} else {
277
+		universal_set = unique(unlist(lt))
278
+	}
279
+
280
+	mat = matrix(0, nrow = length(universal_set), ncol = length(lt))
281
+	rownames(mat) = sort(universal_set)
226 282
 	colnames(mat) = names(lt)
227 283
 	for(i in seq_along(lt)) {
228 284
 		mat[unique(lt[[i]]), i] = 1
... ...
@@ -239,6 +295,7 @@ list_to_matrix = function(lt) {
239 295
 # -mode The mode for forming the combination set, see Mode section.
240 296
 # -top_n_sets Number of sets with largest size.
241 297
 # -min_set_size Ths minimal set size that is used for generating the combination matrix.
298
+# -universal_set The universal set. It if is specified, ``complement_size`` is ignored.
242 299
 # -complement_size The size for the complement of all sets. If it is specified, the combination
243 300
 #                  set name will be like "00...".
244 301
 # -value_fun For each combination set, how to calculate the size? If it is a scalar set, 
... ...
@@ -329,7 +386,7 @@ list_to_matrix = function(lt) {
329 386
 # m = make_comb_mat(lt)
330 387
 # }
331 388
 make_comb_mat = function(..., mode = c("distinct", "intersect", "union"),
332
-	top_n_sets = Inf, min_set_size = -Inf, complement_size = NULL, value_fun) {
389
+	top_n_sets = Inf, min_set_size = -Inf, universal_set = NULL, complement_size = NULL, value_fun) {
333 390
 
334 391
 	lt = list(...)
335 392
 
... ...
@@ -337,7 +394,8 @@ make_comb_mat = function(..., mode = c("distinct", "intersect", "union"),
337 394
 	if(length(lt) == 1) {
338 395
 		lt = lt[[1]]
339 396
 		if(!is.null(dim(lt))) {
340
-			return(make_comb_mat_from_matrix(lt, mode = mode, top_n_sets = top_n_sets, min_set_size = min_set_size, complement_size = complement_size))
397
+			return(make_comb_mat_from_matrix(lt, mode = mode, top_n_sets = top_n_sets, 
398
+				min_set_size = min_set_size, universal_set = universal_set, complement_size = complement_size))
341 399
 		}
342 400
 	}
343 401
 
... ...
@@ -350,7 +408,8 @@ make_comb_mat = function(..., mode = c("distinct", "intersect", "union"),
350 408
 			value_fun = length
351 409
 		}
352 410
 	}
353
-	make_comb_mat_from_list(lt, value_fun, mode = mode, top_n_sets = top_n_sets, min_set_size = min_set_size, complement_size = complement_size)
411
+	make_comb_mat_from_list(lt, value_fun, mode = mode, top_n_sets = top_n_sets, min_set_size = min_set_size, 
412
+		universal_set = universal_set, complement_size = complement_size)
354 413
 }
355 414
 
356 415
 
... ...
@@ -404,7 +463,7 @@ set_name = function(m) {
404 463
 # m = make_comb_mat(lt)
405 464
 # set_size(m)
406 465
 set_size = function(m) {
407
-	attr(m, "set_size")
466
+	structure(attr(m, "set_size"), names = set_name(m))
408 467
 }
409 468
 
410 469
 # == title
... ...
@@ -424,7 +483,7 @@ set_size = function(m) {
424 483
 # m = make_comb_mat(lt)
425 484
 # comb_size(m)
426 485
 comb_size = function(m) {
427
-	attr(m, "comb_size")
486
+	structure(attr(m, "comb_size"), names = comb_name(m))
428 487
 }
429 488
 
430 489
 # == title
... ...
@@ -481,10 +540,11 @@ comb_name = function(m) {
481 540
 comb_degree = function(m) {
482 541
 	set_on_rows = attr(m, "set_on_rows")
483 542
 	if(set_on_rows) {
484
-		colSums(m)
543
+		d = colSums(m)
485 544
 	} else {
486
-		rowSums(m)
545
+		d = rowSums(m)
487 546
 	}
547
+	structure(d, names = comb_name(m))
488 548
 }
489 549
 
490 550
 # == title
... ...
@@ -506,10 +566,6 @@ comb_degree = function(m) {
506 566
 # extract_comb(m, "110")
507 567
 extract_comb = function(m, comb_name) {
508 568
 
509
-	if(grepl("^0+$", comb_name)) {
510
-		stop_wrap(qq("Cannot extract elements for the complement set '@{comb_name}'."))
511
-	}
512
-
513 569
 	all_comb_names = comb_name(m)
514 570
 	if(!comb_name %in% all_comb_names) {
515 571
 		stop_wrap(paste0("Cannot find a combination name:	", comb_name, ", valid combination name should be in `comb_name(m)`."))
... ...
@@ -519,8 +575,22 @@ extract_comb = function(m, comb_name) {
519 575
 
520 576
 	x = attr(m, "x")
521 577
 	lt = attr(m, "lt")
578
+	universal_set = attr(m, "universal_set")
522 579
 	mode = attr(m, "mode")
580
+
581
+	is_complement_set = function(comb_name) {
582
+		grepl("^0+$", comb_name)
583
+	}
584
+
585
+	if(is.null(universal_set) && is_complement_set(comb_name)) {
586
+		stop_wrap(qq("Cannot extract elements for the complement set '@{comb_name}' since universal set was not set."))
587
+	}
588
+
523 589
 	if(!is.null(x)) {
590
+		if(is_complement_set(comb_name)) {
591
+			return(setdiff(universal_set, rownames(x)))
592
+		}
593
+
524 594
 		if(mode == "distinct") {
525 595
 			l = apply(x, 1, function(y) all(y == query))
526 596
 		} else if(mode == "intersect") {
... ...
@@ -556,6 +626,14 @@ extract_comb = function(m, comb_name) {
556 626
 	    	setdiff = getFromNamespace("setdiff", ns = "BiocGenerics")
557 627
 	    }
558 628
 
629
+	    if(is_complement_set(comb_name)) {
630
+			s = universal_set
631
+			for(i in seq_along(lt)) {
632
+				s = setdiff(s, lt[[i]])
633
+			}
634
+			return(s)
635
+		}
636
+
559 637
 		do_comb = function(lt, mode, do = rep(TRUE, length(lt))) {
560 638
 	        set1_index = which(do)
561 639
 	        set2_index = which(!do)
562 640
new file mode 100644
... ...
@@ -0,0 +1,72 @@
1
+lt = list(
2
+	a = c("h", "t", "j", "u", "w"),
3
+	b = c("b", "n", "v", "m", "k", "u", "j", "w", "x", "z"),
4
+	c = c("x", "g", "b", "h", "u", "s", "n", "m", "r", "l", "q", "i", "o", "d", "z")
5
+)
6
+
7
+
8
+test_that("test default list_to_matrix", {
9
+	m = list_to_matrix(lt)
10
+	expect_that(rownames(m), is_identical_to(sort(unique(unlist(lt)))))
11
+	expect_that(colnames(m), is_identical_to(names(lt)))
12
+	expect_that(unname(m["u", ]), is_identical_to(c(1, 1, 1)))
13
+	expect_that(unname(m["j", ]), is_identical_to(c(1, 1, 0)))
14
+})
15
+
16
+test_that("test list_to_matrix with universal_set", {
17
+	m = list_to_matrix(lt, universal_set = letters)
18
+	expect_that(rownames(m), is_identical_to(letters))
19
+	expect_that(unname(m["y", ]), is_identical_to(c(0, 0, 0)))
20
+})
21
+
22
+test_that("test list_to_matrix with universal_set which is smaller than the input set", {
23
+	m = list_to_matrix(lt, universal_set = letters[1:10])
24
+	expect_that(rownames(m), is_identical_to(letters[1:10]))
25
+	expect_that(unname(m["a", ]), is_identical_to(c(0, 0, 0)))
26
+})
27
+
28
+test_that("test default make_comb_mat", {
29
+	m = make_comb_mat(lt)
30
+
31
+	tb = table(table(unlist(lt)))
32
+	expect_that(tb[names(tb) == 1][[1]], equals(sum(comb_size(m)[comb_degree(m) == 1])))
33
+	expect_that(tb[names(tb) == 2][[1]], equals(sum(comb_size(m)[comb_degree(m) == 2])))
34
+	expect_that(tb[names(tb) == 3][[1]], equals(sum(comb_size(m)[comb_degree(m) == 3])))
35
+
36
+	tb = table(unlist(lt))
37
+	expect_that(extract_comb(m, "111"), is_identical_to(names(tb[tb == 3])))
38
+
39
+	m1 = make_comb_mat(lt)
40
+	m2 = make_comb_mat(list_to_matrix(lt))
41
+	attr(m1, "x") = NULL
42
+	attr(m2, "x") = NULL
43
+	attr(m1, "lt") = NULL
44
+	attr(m2, "lt") = NULL
45
+	expect_that(m1, equals(m2))
46
+	
47
+	m1 = make_comb_mat(lt)
48
+	m2 = make_comb_mat(list_to_matrix(lt))
49
+	expect_that(sort(extract_comb(m1, "111")), is_identical_to(sort(extract_comb(m2, "111"))))
50
+	expect_that(sort(extract_comb(m1, "011")), is_identical_to(sort(extract_comb(m2, "011"))))
51
+})
52
+
53
+
54
+test_that("test default make_comb_mat with universal_set", {
55
+	m = make_comb_mat(lt, universal_set = letters)
56
+
57
+	expect_that(length(comb_size(m)), is_identical_to(8))
58
+	expect_that("000" %in% comb_name(m), is_identical_to(TRUE))
59
+	expect_that(0 %in% comb_degree(m), is_identical_to(TRUE))
60
+
61
+})
62
+
63
+
64
+test_that("test default make_comb_mat with universal_set which is smaller than the input set", {
65
+	m = make_comb_mat(lt, universal_set = letters[1:10])
66
+
67
+	expect_that("000" %in% comb_name(m), is_identical_to(TRUE))
68
+	expect_that(0 %in% comb_degree(m), is_identical_to(TRUE))
69
+
70
+})
71
+
72
+# test GRanges
0 73
\ No newline at end of file