Browse code

(1.13.4) fix issues with normalize.quantiles.use.target when subset argument supplied and when target distribution length != nrow(x)

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/preprocessCore@51771 bc3139a8-67e5-0310-9ffc-ced21a209358

Ben Bolstad authored on 06/01/2011 03:43:01
Showing2 changed files

... ...
@@ -1,5 +1,5 @@
1 1
 Package: preprocessCore
2
-Version: 1.13.3
2
+Version: 1.13.4
3 3
 Title: A collection of pre-processing functions
4 4
 Author: Benjamin Milo Bolstad <bmb@bmbolstad.com>
5 5
 Maintainer: Benjamin Milo Bolstad <bmb@bmbolstad.com>
... ...
@@ -65,6 +65,7 @@
65 65
  ** Nov 19, 2008 - add *_via_subset code
66 66
  ** Jan 15, 2009 - fix VECTOR_ELT/STRING_ELT issues
67 67
  ** Dec 1, 2010 - change how  PTHREAD_STACK_MIN is used
68
+ ** Jan 5, 2011 - use_target issue when target distribution length != nrow(x) fixed
68 69
  **
69 70
  ***********************************************************/
70 71
 
... ...
@@ -2612,7 +2613,7 @@ static double linear_interpolate_helper(double v, double *x, double *y, int n)
2612 2613
 
2613 2614
 
2614 2615
 
2615
-void using_target_via_subset(double *data, int *rows, int *cols, int *in_subset, double *target, int *targetrows, int start_col, int end_col){
2616
+static void using_target_via_subset_part1(double *data, int *rows, int *cols, int *in_subset, double *target, int *targetrows, int start_col, int end_col, int subset_count){
2616 2617
 
2617 2618
   int i,j,ind,target_ind;
2618 2619
   
... ...
@@ -2627,91 +2628,94 @@ void using_target_via_subset(double *data, int *rows, int *cols, int *in_subset,
2627 2628
   int targetnon_na = *targetrows;
2628 2629
   int non_na = 0;
2629 2630
   
2630
-  int subset_count = 0;
2631
-
2632 2631
   double *sample_percentiles;
2633 2632
   double *datvec;
2634 2633
   
2635
-
2636
-  /* Two parts to the algorithm */
2637
-  /* Part 1: Adjust the elements not in the "subset" */
2638
-  for (i = 0; i <  *rows; i++){
2639
-    if (in_subset[i] == 1){
2640
-      subset_count++;
2641
-    }
2642
-  }
2643 2634
   
2644
-  if (*rows > subset_count){
2645
-    /* We have non subset elements to deal with */
2646
-    sample_percentiles = (double *)Calloc(subset_count, double);
2647
-    datvec = (double *)Calloc(*rows,double);
2648
-    dimat = (dataitem **)Calloc(1,dataitem *);
2649
-    dimat[0] = (dataitem *)Calloc(*rows,dataitem);
2635
+  sample_percentiles = (double *)Calloc(subset_count, double);
2636
+  datvec = (double *)Calloc(*rows,double);
2637
+  dimat = (dataitem **)Calloc(1,dataitem *);
2638
+  dimat[0] = (dataitem *)Calloc(*rows,dataitem);
2650 2639
    
2651
-    for (j = start_col; j <= end_col; j++){
2652
-      non_na = 0;
2653
-      for (i =0; i < *rows; i++){
2654
-	if (ISNA(data[j*(*rows) + i]) || (in_subset[i] == 0)){
2655
-	  
2656
-	} else {
2657
-	  dimat[0][non_na].data = data[j*(*rows) + i];
2658
-	  dimat[0][non_na].rank = i;
2659
-	  non_na++;
2660
-	}
2661
-      }	   
2662
-      qsort(dimat[0],non_na,sizeof(dataitem),sort_fn);
2663
-      get_ranks(ranks,dimat[0],non_na);
2664
-      
2665
-      for (i=0; i < non_na; i++){
2666
-	sample_percentiles[i] = (double)(ranks[i] - 1)/(double)(non_na-1);
2667
-	datvec[i] = dimat[0][i].data;
2640
+  for (j = start_col; j <= end_col; j++){
2641
+    
2642
+    /* First figure out percentiles of the "subset" data */
2643
+    non_na = 0;
2644
+    for (i =0; i < *rows; i++){
2645
+      if (!ISNA(data[j*(*rows) + i]) && (in_subset[i] == 1)){
2646
+	dimat[0][non_na].data = data[j*(*rows) + i];
2647
+	dimat[0][non_na].rank = i;
2648
+	non_na++;
2668 2649
       }
2669
-      
2670
-      for  (i =0; i < *rows; i++){
2650
+    }	   
2651
+    qsort(dimat[0],non_na,sizeof(dataitem),sort_fn);
2652
+    get_ranks(ranks,dimat[0],non_na);
2653
+    
2654
+    for (i=0; i < non_na; i++){
2655
+      sample_percentiles[i] = (double)(ranks[i] - 1)/(double)(non_na-1);
2656
+      datvec[i] = dimat[0][i].data;
2657
+    }
2658
+    
2659
+    /* Now try to estimate what percentile of the "subset" data each datapoint in the "non-subset" data falls */
2660
+    for  (i =0; i < *rows; i++){
2671 2661
       /*Linear interpolate to get sample percentile */
2672
-	if (in_subset[i] == 0 && !ISNA(data[j*(*rows) + i])){
2673
-	  samplepercentile = linear_interpolate_helper(data[j*(*rows) + i], datvec, sample_percentiles, non_na);
2674
-	  target_ind_double = 1.0 + ((double)(targetnon_na) - 1.0) * samplepercentile;
2675
-	  target_ind_double_floor = floor(target_ind_double + 4*DOUBLE_EPS);
2676
-	  
2677
-	  target_ind_double = target_ind_double - target_ind_double_floor;
2678
-	  
2679
-	  if (fabs(target_ind_double) <=  4*DOUBLE_EPS){
2680
-	    target_ind_double = 0.0;
2681
-	  }
2682
-	  
2683
-	  
2684
-	  if (target_ind_double  == 0.0){
2685
-	    target_ind = (int)floor(target_ind_double_floor + 0.5); /* nearbyint(target_ind_double_floor); */	
2686
-	    ind = dimat[0][i].rank;
2687
-	    data[j*(*rows) +i] = row_mean[target_ind-1];
2688
-	  } else if (target_ind_double == 1.0){
2689
-	    target_ind = (int)floor(target_ind_double_floor + 1.5); /* (int)nearbyint(target_ind_double_floor + 1.0); */ 
2690
-	    ind = dimat[0][i].rank;
2691
-	    data[j*(*rows) +i] = row_mean[target_ind-1];
2662
+      if (in_subset[i] == 0 && !ISNA(data[j*(*rows) + i])){
2663
+	samplepercentile = linear_interpolate_helper(data[j*(*rows) + i], datvec, sample_percentiles, non_na);
2664
+	target_ind_double = 1.0 + ((double)(targetnon_na) - 1.0) * samplepercentile;
2665
+	target_ind_double_floor = floor(target_ind_double + 4*DOUBLE_EPS);
2666
+	
2667
+	target_ind_double = target_ind_double - target_ind_double_floor;
2668
+	
2669
+	if (fabs(target_ind_double) <=  4*DOUBLE_EPS){
2670
+	  target_ind_double = 0.0;
2671
+	}
2672
+	if (target_ind_double  == 0.0){
2673
+	  target_ind = (int)floor(target_ind_double_floor + 0.5); /* nearbyint(target_ind_double_floor); */	
2674
+	  ind = dimat[0][i].rank;
2675
+	  data[j*(*rows) +i] = row_mean[target_ind-1];
2676
+	} else if (target_ind_double == 1.0){
2677
+	  target_ind = (int)floor(target_ind_double_floor + 1.5); /* (int)nearbyint(target_ind_double_floor + 1.0); */ 
2678
+	  ind = dimat[0][i].rank;
2679
+	  data[j*(*rows) +i] = row_mean[target_ind-1];
2680
+	} else {
2681
+	  target_ind = (int)floor(target_ind_double_floor + 0.5); /* nearbyint(target_ind_double_floor); */	
2682
+	  ind = dimat[0][i].rank;
2683
+	  if ((target_ind < *targetrows) && (target_ind > 0)){
2684
+	    data[j*(*rows) +i] = (1.0- target_ind_double)*row_mean[target_ind-1] + target_ind_double*row_mean[target_ind];
2685
+	  } else if (target_ind >= *targetrows){
2686
+	    data[j*(*rows) +i] = row_mean[*targetrows-1];
2692 2687
 	  } else {
2693
-	    target_ind = (int)floor(target_ind_double_floor + 0.5); /* nearbyint(target_ind_double_floor); */	
2694
-	    ind = dimat[0][i].rank;
2695
-	    if ((target_ind < *targetrows) && (target_ind > 0)){
2696
-	      data[j*(*rows) +i] = (1.0- target_ind_double)*row_mean[target_ind-1] + target_ind_double*row_mean[target_ind];
2697
-	    } else if (target_ind >= *targetrows){
2698
-	      data[j*(*rows) +i] = row_mean[*targetrows-1];
2699
-	    } else {
2700
-	      data[j*(*rows) +i] = row_mean[0];
2701
-	    }
2688
+	    data[j*(*rows) +i] = row_mean[0];
2702 2689
 	  }
2703 2690
 	}
2704 2691
       }
2705
-    } 
2706
-    Free(dimat[0]);
2707
-    Free(dimat);
2708
-    Free(datvec);
2709
-    Free(sample_percentiles);
2692
+    }
2693
+  } 
2694
+  Free(dimat[0]);
2695
+  Free(dimat);
2696
+  Free(datvec);
2697
+  Free(sample_percentiles);
2698
+}
2710 2699
 
2711
-  }
2712 2700
 
2701
+static void using_target_via_subset_part2(double *data, int *rows, int *cols, int *in_subset, double *target, int *targetrows, int start_col, int end_col, int subset_count){
2702
+
2703
+  int i,j,ind,target_ind;
2704
+  
2705
+  dataitem **dimat;
2706
+
2707
+  double *row_mean = target;
2708
+
2709
+  double *ranks = (double *)Calloc((*rows),double);
2710
+  double samplepercentile;
2711
+  double target_ind_double,target_ind_double_floor;
2712
+
2713
+  int targetnon_na = *targetrows;
2714
+  int non_na = 0;
2715
+  
2716
+  double *sample_percentiles;
2717
+  double *datvec;
2713 2718
 
2714
-  /* Part 2: Adjust the elements in the subset*/
2715 2719
   if (*rows == targetnon_na){
2716 2720
     /* now assign back distribution */
2717 2721
     /* this is basically the standard story */
... ...
@@ -2722,9 +2726,7 @@ void using_target_via_subset(double *data, int *rows, int *cols, int *in_subset,
2722 2726
     for (j = start_col; j <= end_col; j++){
2723 2727
       non_na = 0;
2724 2728
       for (i =0; i < *rows; i++){
2725
-	if (ISNA(data[j*(*rows) + i]) || (in_subset[i] == 0)){
2726
-	  
2727
-	} else {
2729
+	if (!ISNA(data[j*(*rows) + i]) && (in_subset[i] == 1)){
2728 2730
 	  dimat[0][non_na].data = data[j*(*rows) + i];
2729 2731
 	  dimat[0][non_na].rank = i;
2730 2732
 	  non_na++;
... ...
@@ -2790,9 +2792,7 @@ void using_target_via_subset(double *data, int *rows, int *cols, int *in_subset,
2790 2792
     for (j = start_col; j <= end_col; j++){
2791 2793
       non_na = 0;
2792 2794
       for (i =0; i < *rows; i++){	
2793
-	if (ISNA(data[j*(*rows) + i])){
2794
-
2795
-	} else {
2795
+	if (!ISNA(data[j*(*rows) + i]) && (in_subset[i] == 1)){
2796 2796
 	  dimat[0][non_na].data = data[j*(*rows) + i];
2797 2797
 	  dimat[0][non_na].rank = i;
2798 2798
 	  non_na++;
... ...
@@ -2841,6 +2841,47 @@ void using_target_via_subset(double *data, int *rows, int *cols, int *in_subset,
2841 2841
   Free(dimat[0]);
2842 2842
   Free(dimat);
2843 2843
   Free(ranks);
2844
+
2845
+
2846
+}
2847
+
2848
+void using_target_via_subset(double *data, int *rows, int *cols, int *in_subset, double *target, int *targetrows, int start_col, int end_col){
2849
+
2850
+  int i,j,ind,target_ind;
2851
+  
2852
+  dataitem **dimat;
2853
+
2854
+  double *row_mean = target;
2855
+
2856
+  double *ranks = (double *)Calloc((*rows),double);
2857
+  double samplepercentile;
2858
+  double target_ind_double,target_ind_double_floor;
2859
+
2860
+  int targetnon_na = *targetrows;
2861
+  int non_na = 0;
2862
+  
2863
+  int subset_count = 0;
2864
+
2865
+  double *sample_percentiles;
2866
+  double *datvec;
2867
+  
2868
+
2869
+  /* Two parts to the algorithm */
2870
+ 
2871
+  /* First find out if the enitirety of the data is in the subset */
2872
+  for (i = 0; i <  *rows; i++){
2873
+    if (in_subset[i] == 1){
2874
+      subset_count++;
2875
+    }
2876
+  }
2877
+   /* Part 1: Adjust the elements not in the "subset" */
2878
+  if (*rows > subset_count){
2879
+     /* We have non subset elements to deal with */	
2880
+     using_target_via_subset_part1(data, rows, cols, in_subset, target, targetrows, start_col, end_col,subset_count);
2881
+  }
2882
+
2883
+  /* Part 2: Adjust the elements in the "subset"*/
2884
+  using_target_via_subset_part2(data, rows, cols, in_subset, target, targetrows, start_col, end_col,subset_count);
2844 2885
 }
2845 2886
 
2846 2887