git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/preprocessCore@51771 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -65,6 +65,7 @@ |
65 | 65 |
** Nov 19, 2008 - add *_via_subset code |
66 | 66 |
** Jan 15, 2009 - fix VECTOR_ELT/STRING_ELT issues |
67 | 67 |
** Dec 1, 2010 - change how PTHREAD_STACK_MIN is used |
68 |
+ ** Jan 5, 2011 - use_target issue when target distribution length != nrow(x) fixed |
|
68 | 69 |
** |
69 | 70 |
***********************************************************/ |
70 | 71 |
|
... | ... |
@@ -2612,7 +2613,7 @@ static double linear_interpolate_helper(double v, double *x, double *y, int n) |
2612 | 2613 |
|
2613 | 2614 |
|
2614 | 2615 |
|
2615 |
-void using_target_via_subset(double *data, int *rows, int *cols, int *in_subset, double *target, int *targetrows, int start_col, int end_col){ |
|
2616 |
+static void using_target_via_subset_part1(double *data, int *rows, int *cols, int *in_subset, double *target, int *targetrows, int start_col, int end_col, int subset_count){ |
|
2616 | 2617 |
|
2617 | 2618 |
int i,j,ind,target_ind; |
2618 | 2619 |
|
... | ... |
@@ -2627,91 +2628,94 @@ void using_target_via_subset(double *data, int *rows, int *cols, int *in_subset, |
2627 | 2628 |
int targetnon_na = *targetrows; |
2628 | 2629 |
int non_na = 0; |
2629 | 2630 |
|
2630 |
- int subset_count = 0; |
|
2631 |
- |
|
2632 | 2631 |
double *sample_percentiles; |
2633 | 2632 |
double *datvec; |
2634 | 2633 |
|
2635 |
- |
|
2636 |
- /* Two parts to the algorithm */ |
|
2637 |
- /* Part 1: Adjust the elements not in the "subset" */ |
|
2638 |
- for (i = 0; i < *rows; i++){ |
|
2639 |
- if (in_subset[i] == 1){ |
|
2640 |
- subset_count++; |
|
2641 |
- } |
|
2642 |
- } |
|
2643 | 2634 |
|
2644 |
- if (*rows > subset_count){ |
|
2645 |
- /* We have non subset elements to deal with */ |
|
2646 |
- sample_percentiles = (double *)Calloc(subset_count, double); |
|
2647 |
- datvec = (double *)Calloc(*rows,double); |
|
2648 |
- dimat = (dataitem **)Calloc(1,dataitem *); |
|
2649 |
- dimat[0] = (dataitem *)Calloc(*rows,dataitem); |
|
2635 |
+ sample_percentiles = (double *)Calloc(subset_count, double); |
|
2636 |
+ datvec = (double *)Calloc(*rows,double); |
|
2637 |
+ dimat = (dataitem **)Calloc(1,dataitem *); |
|
2638 |
+ dimat[0] = (dataitem *)Calloc(*rows,dataitem); |
|
2650 | 2639 |
|
2651 |
- for (j = start_col; j <= end_col; j++){ |
|
2652 |
- non_na = 0; |
|
2653 |
- for (i =0; i < *rows; i++){ |
|
2654 |
- if (ISNA(data[j*(*rows) + i]) || (in_subset[i] == 0)){ |
|
2655 |
- |
|
2656 |
- } else { |
|
2657 |
- dimat[0][non_na].data = data[j*(*rows) + i]; |
|
2658 |
- dimat[0][non_na].rank = i; |
|
2659 |
- non_na++; |
|
2660 |
- } |
|
2661 |
- } |
|
2662 |
- qsort(dimat[0],non_na,sizeof(dataitem),sort_fn); |
|
2663 |
- get_ranks(ranks,dimat[0],non_na); |
|
2664 |
- |
|
2665 |
- for (i=0; i < non_na; i++){ |
|
2666 |
- sample_percentiles[i] = (double)(ranks[i] - 1)/(double)(non_na-1); |
|
2667 |
- datvec[i] = dimat[0][i].data; |
|
2640 |
+ for (j = start_col; j <= end_col; j++){ |
|
2641 |
+ |
|
2642 |
+ /* First figure out percentiles of the "subset" data */ |
|
2643 |
+ non_na = 0; |
|
2644 |
+ for (i =0; i < *rows; i++){ |
|
2645 |
+ if (!ISNA(data[j*(*rows) + i]) && (in_subset[i] == 1)){ |
|
2646 |
+ dimat[0][non_na].data = data[j*(*rows) + i]; |
|
2647 |
+ dimat[0][non_na].rank = i; |
|
2648 |
+ non_na++; |
|
2668 | 2649 |
} |
2669 |
- |
|
2670 |
- for (i =0; i < *rows; i++){ |
|
2650 |
+ } |
|
2651 |
+ qsort(dimat[0],non_na,sizeof(dataitem),sort_fn); |
|
2652 |
+ get_ranks(ranks,dimat[0],non_na); |
|
2653 |
+ |
|
2654 |
+ for (i=0; i < non_na; i++){ |
|
2655 |
+ sample_percentiles[i] = (double)(ranks[i] - 1)/(double)(non_na-1); |
|
2656 |
+ datvec[i] = dimat[0][i].data; |
|
2657 |
+ } |
|
2658 |
+ |
|
2659 |
+ /* Now try to estimate what percentile of the "subset" data each datapoint in the "non-subset" data falls */ |
|
2660 |
+ for (i =0; i < *rows; i++){ |
|
2671 | 2661 |
/*Linear interpolate to get sample percentile */ |
2672 |
- if (in_subset[i] == 0 && !ISNA(data[j*(*rows) + i])){ |
|
2673 |
- samplepercentile = linear_interpolate_helper(data[j*(*rows) + i], datvec, sample_percentiles, non_na); |
|
2674 |
- target_ind_double = 1.0 + ((double)(targetnon_na) - 1.0) * samplepercentile; |
|
2675 |
- target_ind_double_floor = floor(target_ind_double + 4*DOUBLE_EPS); |
|
2676 |
- |
|
2677 |
- target_ind_double = target_ind_double - target_ind_double_floor; |
|
2678 |
- |
|
2679 |
- if (fabs(target_ind_double) <= 4*DOUBLE_EPS){ |
|
2680 |
- target_ind_double = 0.0; |
|
2681 |
- } |
|
2682 |
- |
|
2683 |
- |
|
2684 |
- if (target_ind_double == 0.0){ |
|
2685 |
- target_ind = (int)floor(target_ind_double_floor + 0.5); /* nearbyint(target_ind_double_floor); */ |
|
2686 |
- ind = dimat[0][i].rank; |
|
2687 |
- data[j*(*rows) +i] = row_mean[target_ind-1]; |
|
2688 |
- } else if (target_ind_double == 1.0){ |
|
2689 |
- target_ind = (int)floor(target_ind_double_floor + 1.5); /* (int)nearbyint(target_ind_double_floor + 1.0); */ |
|
2690 |
- ind = dimat[0][i].rank; |
|
2691 |
- data[j*(*rows) +i] = row_mean[target_ind-1]; |
|
2662 |
+ if (in_subset[i] == 0 && !ISNA(data[j*(*rows) + i])){ |
|
2663 |
+ samplepercentile = linear_interpolate_helper(data[j*(*rows) + i], datvec, sample_percentiles, non_na); |
|
2664 |
+ target_ind_double = 1.0 + ((double)(targetnon_na) - 1.0) * samplepercentile; |
|
2665 |
+ target_ind_double_floor = floor(target_ind_double + 4*DOUBLE_EPS); |
|
2666 |
+ |
|
2667 |
+ target_ind_double = target_ind_double - target_ind_double_floor; |
|
2668 |
+ |
|
2669 |
+ if (fabs(target_ind_double) <= 4*DOUBLE_EPS){ |
|
2670 |
+ target_ind_double = 0.0; |
|
2671 |
+ } |
|
2672 |
+ if (target_ind_double == 0.0){ |
|
2673 |
+ target_ind = (int)floor(target_ind_double_floor + 0.5); /* nearbyint(target_ind_double_floor); */ |
|
2674 |
+ ind = dimat[0][i].rank; |
|
2675 |
+ data[j*(*rows) +i] = row_mean[target_ind-1]; |
|
2676 |
+ } else if (target_ind_double == 1.0){ |
|
2677 |
+ target_ind = (int)floor(target_ind_double_floor + 1.5); /* (int)nearbyint(target_ind_double_floor + 1.0); */ |
|
2678 |
+ ind = dimat[0][i].rank; |
|
2679 |
+ data[j*(*rows) +i] = row_mean[target_ind-1]; |
|
2680 |
+ } else { |
|
2681 |
+ target_ind = (int)floor(target_ind_double_floor + 0.5); /* nearbyint(target_ind_double_floor); */ |
|
2682 |
+ ind = dimat[0][i].rank; |
|
2683 |
+ if ((target_ind < *targetrows) && (target_ind > 0)){ |
|
2684 |
+ data[j*(*rows) +i] = (1.0- target_ind_double)*row_mean[target_ind-1] + target_ind_double*row_mean[target_ind]; |
|
2685 |
+ } else if (target_ind >= *targetrows){ |
|
2686 |
+ data[j*(*rows) +i] = row_mean[*targetrows-1]; |
|
2692 | 2687 |
} else { |
2693 |
- target_ind = (int)floor(target_ind_double_floor + 0.5); /* nearbyint(target_ind_double_floor); */ |
|
2694 |
- ind = dimat[0][i].rank; |
|
2695 |
- if ((target_ind < *targetrows) && (target_ind > 0)){ |
|
2696 |
- data[j*(*rows) +i] = (1.0- target_ind_double)*row_mean[target_ind-1] + target_ind_double*row_mean[target_ind]; |
|
2697 |
- } else if (target_ind >= *targetrows){ |
|
2698 |
- data[j*(*rows) +i] = row_mean[*targetrows-1]; |
|
2699 |
- } else { |
|
2700 |
- data[j*(*rows) +i] = row_mean[0]; |
|
2701 |
- } |
|
2688 |
+ data[j*(*rows) +i] = row_mean[0]; |
|
2702 | 2689 |
} |
2703 | 2690 |
} |
2704 | 2691 |
} |
2705 |
- } |
|
2706 |
- Free(dimat[0]); |
|
2707 |
- Free(dimat); |
|
2708 |
- Free(datvec); |
|
2709 |
- Free(sample_percentiles); |
|
2692 |
+ } |
|
2693 |
+ } |
|
2694 |
+ Free(dimat[0]); |
|
2695 |
+ Free(dimat); |
|
2696 |
+ Free(datvec); |
|
2697 |
+ Free(sample_percentiles); |
|
2698 |
+} |
|
2710 | 2699 |
|
2711 |
- } |
|
2712 | 2700 |
|
2701 |
+static void using_target_via_subset_part2(double *data, int *rows, int *cols, int *in_subset, double *target, int *targetrows, int start_col, int end_col, int subset_count){ |
|
2702 |
+ |
|
2703 |
+ int i,j,ind,target_ind; |
|
2704 |
+ |
|
2705 |
+ dataitem **dimat; |
|
2706 |
+ |
|
2707 |
+ double *row_mean = target; |
|
2708 |
+ |
|
2709 |
+ double *ranks = (double *)Calloc((*rows),double); |
|
2710 |
+ double samplepercentile; |
|
2711 |
+ double target_ind_double,target_ind_double_floor; |
|
2712 |
+ |
|
2713 |
+ int targetnon_na = *targetrows; |
|
2714 |
+ int non_na = 0; |
|
2715 |
+ |
|
2716 |
+ double *sample_percentiles; |
|
2717 |
+ double *datvec; |
|
2713 | 2718 |
|
2714 |
- /* Part 2: Adjust the elements in the subset*/ |
|
2715 | 2719 |
if (*rows == targetnon_na){ |
2716 | 2720 |
/* now assign back distribution */ |
2717 | 2721 |
/* this is basically the standard story */ |
... | ... |
@@ -2722,9 +2726,7 @@ void using_target_via_subset(double *data, int *rows, int *cols, int *in_subset, |
2722 | 2726 |
for (j = start_col; j <= end_col; j++){ |
2723 | 2727 |
non_na = 0; |
2724 | 2728 |
for (i =0; i < *rows; i++){ |
2725 |
- if (ISNA(data[j*(*rows) + i]) || (in_subset[i] == 0)){ |
|
2726 |
- |
|
2727 |
- } else { |
|
2729 |
+ if (!ISNA(data[j*(*rows) + i]) && (in_subset[i] == 1)){ |
|
2728 | 2730 |
dimat[0][non_na].data = data[j*(*rows) + i]; |
2729 | 2731 |
dimat[0][non_na].rank = i; |
2730 | 2732 |
non_na++; |
... | ... |
@@ -2790,9 +2792,7 @@ void using_target_via_subset(double *data, int *rows, int *cols, int *in_subset, |
2790 | 2792 |
for (j = start_col; j <= end_col; j++){ |
2791 | 2793 |
non_na = 0; |
2792 | 2794 |
for (i =0; i < *rows; i++){ |
2793 |
- if (ISNA(data[j*(*rows) + i])){ |
|
2794 |
- |
|
2795 |
- } else { |
|
2795 |
+ if (!ISNA(data[j*(*rows) + i]) && (in_subset[i] == 1)){ |
|
2796 | 2796 |
dimat[0][non_na].data = data[j*(*rows) + i]; |
2797 | 2797 |
dimat[0][non_na].rank = i; |
2798 | 2798 |
non_na++; |
... | ... |
@@ -2841,6 +2841,47 @@ void using_target_via_subset(double *data, int *rows, int *cols, int *in_subset, |
2841 | 2841 |
Free(dimat[0]); |
2842 | 2842 |
Free(dimat); |
2843 | 2843 |
Free(ranks); |
2844 |
+ |
|
2845 |
+ |
|
2846 |
+} |
|
2847 |
+ |
|
2848 |
+void using_target_via_subset(double *data, int *rows, int *cols, int *in_subset, double *target, int *targetrows, int start_col, int end_col){ |
|
2849 |
+ |
|
2850 |
+ int i,j,ind,target_ind; |
|
2851 |
+ |
|
2852 |
+ dataitem **dimat; |
|
2853 |
+ |
|
2854 |
+ double *row_mean = target; |
|
2855 |
+ |
|
2856 |
+ double *ranks = (double *)Calloc((*rows),double); |
|
2857 |
+ double samplepercentile; |
|
2858 |
+ double target_ind_double,target_ind_double_floor; |
|
2859 |
+ |
|
2860 |
+ int targetnon_na = *targetrows; |
|
2861 |
+ int non_na = 0; |
|
2862 |
+ |
|
2863 |
+ int subset_count = 0; |
|
2864 |
+ |
|
2865 |
+ double *sample_percentiles; |
|
2866 |
+ double *datvec; |
|
2867 |
+ |
|
2868 |
+ |
|
2869 |
+ /* Two parts to the algorithm */ |
|
2870 |
+ |
|
2871 |
+ /* First find out if the enitirety of the data is in the subset */ |
|
2872 |
+ for (i = 0; i < *rows; i++){ |
|
2873 |
+ if (in_subset[i] == 1){ |
|
2874 |
+ subset_count++; |
|
2875 |
+ } |
|
2876 |
+ } |
|
2877 |
+ /* Part 1: Adjust the elements not in the "subset" */ |
|
2878 |
+ if (*rows > subset_count){ |
|
2879 |
+ /* We have non subset elements to deal with */ |
|
2880 |
+ using_target_via_subset_part1(data, rows, cols, in_subset, target, targetrows, start_col, end_col,subset_count); |
|
2881 |
+ } |
|
2882 |
+ |
|
2883 |
+ /* Part 2: Adjust the elements in the "subset"*/ |
|
2884 |
+ using_target_via_subset_part2(data, rows, cols, in_subset, target, targetrows, start_col, end_col,subset_count); |
|
2844 | 2885 |
} |
2845 | 2886 |
|
2846 | 2887 |
|