Browse code

fix very minor ties issue. Also note that current functions do not handle matrices with NA values (to be fixed in the future)

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/preprocessCore@25783 bc3139a8-67e5-0310-9ffc-ced21a209358

Ben Bolstad authored on 13/07/2007 04:21:15
Showing 5 changed files

... ...
@@ -1,5 +1,5 @@
1 1
 Package: preprocessCore
2
-Version: 0.99.8
2
+Version: 0.99.9
3 3
 Title: A collection of pre-processing functions
4 4
 Author: Benjamin Milo Bolstad <bmb@bmbolstad.com>
5 5
 Maintainer: Benjamin Milo Bolstad <bmb@bmbolstad.com>
... ...
@@ -44,7 +44,7 @@ normalize.quantiles <- function(x,copy=TRUE){
44 44
 
45 45
   #matrix(.C("qnorm_c", as.double(as.vector(x)), as.integer(rows), as.integer(cols))[[1]], rows, cols)
46 46
 
47
-  .Call("R_qnorm_c",x,copy, PACKAGE="preprocessCore");
47
+  .Call("R_qnorm_c_improvedties",x,copy, PACKAGE="preprocessCore");
48 48
 }
49 49
 
50 50
 
... ...
@@ -19,6 +19,8 @@
19 19
   outliers. If you make use of quantile normalization 
20 20
   please cite Bolstad et al, Bioinformatics (2003).
21 21
 
22
+  Note that this function does not handle missing (ie NA) values.
23
+
22 24
 }
23 25
 
24 26
 \value{
... ...
@@ -34,6 +34,9 @@
34 34
   most different from all the other means, \bold{both} removes first
35 35
   extreme variance and then an extreme mean. The option \bold{none} does
36 36
   not remove any chips, but will assign equal weights to all chips.
37
+
38
+  Note that this function does not handle missing values (ie
39
+  NA). Unexpected results might occur in this situation.
37 40
   
38 41
 }
39 42
 \note{This function is still experimental.}
... ...
@@ -54,6 +54,7 @@
54 54
  ** Nov 13, 2006 - remove median code
55 55
  ** May 20, 2007 - move to preprocessCore. clean up code.
56 56
  ** May 26, 2007 - fix memory leak in qnorm_c_determine_target
57
+ ** Jul 12, 2007 - improved ties handling (fixes off by one error)
57 58
  **
58 59
  ***********************************************************/
59 60
 
... ...
@@ -333,7 +334,7 @@ static double med_abs(double *x, int length){
333 334
 /*****************************************************************************************************
334 335
  *****************************************************************************************************
335 336
  **
336
- ** The following block implements the standard quantile normalization function
337
+ ** The following block implements the standard quantile normalization function (aka "classic")
337 338
  **
338 339
  **
339 340
  *****************************************************************************************************
... ...
@@ -348,6 +349,8 @@ static double med_abs(double *x, int length){
348 349
  **
349 350
  ** returns 1 if there is a problem, 0 otherwise
350 351
  **
352
+ ** Note that this function does not handle missing data (ie NA)
353
+ **
351 354
  ********************************************************/
352 355
 
353 356
 int qnorm_c(double *data, int *rows, int *cols){
... ...
@@ -474,6 +477,8 @@ SEXP R_qnorm_c(SEXP X, SEXP copy){
474 477
  **
475 478
  ** This function implements the "robust" quantile normalizer
476 479
  **
480
+ ** Note that this function does not handle NA values.
481
+ **
477 482
  ********************************************************/
478 483
 
479 484
 int qnorm_robust_c(double *data,double *weights, int *rows, int *cols, int *use_median, int *use_log2, int *weight_scheme){
... ...
@@ -1596,3 +1601,123 @@ SEXP R_qnorm_within_blocks(SEXP X,SEXP blocks,SEXP copy){
1596 1601
 
1597 1602
 }
1598 1603
 
1604
+
1605
+
1606
+
1607
+
1608
+
1609
+
1610
+/*********************************************************
1611
+ **
1612
+ ** void qnorm_c_improved(double *data, int *rows, int *cols)
1613
+ **
1614
+ **  this is the function that actually implements the
1615
+ ** quantile normalization algorithm. It is called from R.
1616
+ **
1617
+ ** returns 1 if there is a problem, 0 otherwise
1618
+ **
1619
+ ** Note that this function does not handle missing data (ie NA)
1620
+ **
1621
+ ********************************************************/
1622
+
1623
+int qnorm_c_improvedties(double *data, int *rows, int *cols){
1624
+  int i,j,ind;
1625
+  dataitem **dimat;
1626
+  /*  double sum; */
1627
+  double *row_mean = (double *)Calloc((*rows),double);
1628
+  double *datvec; /* = (double *)Calloc(*cols,double); */
1629
+  double *ranks = (double *)Calloc((*rows),double);
1630
+  
1631
+  datvec = (double *)Calloc(*rows,double);
1632
+  
1633
+  for (i =0; i < *rows; i++){
1634
+    row_mean[i] = 0.0;
1635
+  }
1636
+  
1637
+  /* first find the normalizing distribution */
1638
+  for (j = 0; j < *cols; j++){
1639
+    for (i =0; i < *rows; i++){
1640
+      datvec[i] = data[j*(*rows) + i];
1641
+    }
1642
+    qsort(datvec,*rows,sizeof(double),(int(*)(const void*, const void*))sort_double);
1643
+    for (i =0; i < *rows; i++){
1644
+      row_mean[i] += datvec[i]/((double)*cols);
1645
+    }
1646
+  }
1647
+  
1648
+  /* now assign back distribution */
1649
+  dimat = (dataitem **)Calloc(1,dataitem *);
1650
+  dimat[0] = (dataitem *)Calloc(*rows,dataitem);
1651
+  
1652
+  for (j = 0; j < *cols; j++){
1653
+    for (i =0; i < *rows; i++){
1654
+      dimat[0][i].data = data[j*(*rows) + i];
1655
+      dimat[0][i].rank = i;
1656
+    }
1657
+    qsort(dimat[0],*rows,sizeof(dataitem),sort_fn);
1658
+    get_ranks(ranks,dimat[0],*rows);
1659
+    for (i =0; i < *rows; i++){
1660
+      ind = dimat[0][i].rank;
1661
+      if (ranks[i] - floor(ranks[i]) > 0.4){
1662
+	data[j*(*rows) +ind] = 0.5*(row_mean[(int)floor(ranks[i])-1] + row_mean[(int)floor(ranks[i])]);
1663
+      } else { 
1664
+	data[j*(*rows) +ind] = row_mean[(int)floor(ranks[i])-1];
1665
+      }
1666
+    }
1667
+  }
1668
+  
1669
+  Free(ranks);
1670
+  Free(datvec);
1671
+  Free(dimat[0]);
1672
+  
1673
+  Free(dimat);
1674
+  Free(row_mean);
1675
+  return 0;
1676
+}
1677
+
1678
+
1679
+
1680
+
1681
+/*********************************************************
1682
+ **
1683
+ ** SEXP R_qnorm_c(SEXP X)
1684
+ **
1685
+ ** SEXP X      - a matrix
1686
+ ** SEXP copy   - a flag if TRUE then make copy
1687
+ **               before normalizing, if FALSE work in place
1688
+ **               note that this can be dangerous since
1689
+ **               it will change the original matrix.
1690
+ **
1691
+ ** returns a quantile normalized matrix.
1692
+ **
1693
+ ** This is a .Call() interface for quantile normalization
1694
+ **
1695
+ *********************************************************/
1696
+
1697
+SEXP R_qnorm_c_improvedties(SEXP X, SEXP copy){
1698
+
1699
+  SEXP Xcopy,dim1;
1700
+  double *Xptr;
1701
+  int rows,cols;
1702
+  
1703
+  PROTECT(dim1 = getAttrib(X,R_DimSymbol));
1704
+  rows = INTEGER(dim1)[0];
1705
+  cols = INTEGER(dim1)[1];
1706
+  if (asInteger(copy)){
1707
+    PROTECT(Xcopy = allocMatrix(REALSXP,rows,cols));
1708
+    copyMatrix(Xcopy,X,0);
1709
+  } else {
1710
+    Xcopy = X;
1711
+  }
1712
+  Xptr = NUMERIC_POINTER(AS_NUMERIC(Xcopy));
1713
+  
1714
+  qnorm_c_improvedties(Xptr, &rows, &cols);
1715
+  if (asInteger(copy)){
1716
+    UNPROTECT(2);
1717
+  } else {
1718
+    UNPROTECT(1);
1719
+  }
1720
+  return Xcopy;
1721
+}
1722
+
1723
+