Browse code

normalize.quantiles() now handles missing NA values by assuming that the data is "missing at random"

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/preprocessCore@25822 bc3139a8-67e5-0310-9ffc-ced21a209358

Ben Bolstad authored on 15/07/2007 03:17:38
Showing5 changed files

... ...
@@ -1,5 +1,5 @@
1 1
 Package: preprocessCore
2
-Version: 0.99.11
2
+Version: 0.99.12
3 3
 Title: A collection of pre-processing functions
4 4
 Author: Benjamin Milo Bolstad <bmb@bmbolstad.com>
5 5
 Maintainer: Benjamin Milo Bolstad <bmb@bmbolstad.com>
... ...
@@ -44,7 +44,8 @@ normalize.quantiles <- function(x,copy=TRUE){
44 44
 
45 45
   #matrix(.C("qnorm_c", as.double(as.vector(x)), as.integer(rows), as.integer(cols))[[1]], rows, cols)
46 46
 
47
-  .Call("R_qnorm_c",x,copy, PACKAGE="preprocessCore");
47
+##  .Call("R_qnorm_c",x,copy, PACKAGE="preprocessCore");
48
+  .Call("R_qnorm_c_handleNA",x,copy, PACKAGE="preprocessCore");
48 49
 }
49 50
 
50 51
 
... ...
@@ -12,14 +12,19 @@
12 12
   \item{x}{A matrix of intensities where each column corresponds to a
13 13
     chip and each row is a probe.}
14 14
   \item{copy}{Make a copy of matrix before normalizing. Usually safer to
15
-    work with a copy}
15
+    work with a copy, but in certain situations not making a copy of the
16
+    matrix, but instead normalizing it in place will be more memory friendly.}
16 17
 }
17 18
 \details{This method is based upon the concept of a quantile-quantile
18 19
   plot extended to n dimensions. No special allowances are made for
19 20
   outliers. If you make use of quantile normalization 
20 21
   please cite Bolstad et al, Bioinformatics (2003).
21 22
 
22
-  Note that this function does not handle missing (ie NA) values.
23
+  This functions will handle missing data (ie NA values), based on the
24
+  assumption that the data is missing at random.
25
+
26
+  Note that the current implementation optimizes for better memory usage
27
+  at the cost of some additional run-time.
23 28
 
24 29
 }
25 30
 
... ...
@@ -28,6 +28,10 @@
28 28
   \code{\link[affy]{rma}} or \code{\link[affy]{expresso}}
29 29
   please cite Bolstad et al, Bioinformatics (2003).
30 30
 
31
+
32
+  These functions will handle missing data (ie NA values), based on the
33
+  assumption that the data is missing at random.
34
+  
31 35
 }
32 36
 
33 37
 \value{
... ...
@@ -54,10 +54,22 @@
54 54
  ** Nov 13, 2006 - remove median code
55 55
  ** May 20, 2007 - move to preprocessCore. clean up code.
56 56
  ** May 26, 2007 - fix memory leak in qnorm_c_determine_target
57
- ** Jul 12, 2007 - improved ties handling (fixes off by "half" error which affects odd numbers of ties)
57
+ ** Jul 12, 2007 - improved ties handling (fixes off by "half" error which affects even numbers of ties)
58
+ ** Jul 14, 2007 - add NA handling to qnorm_c_using_target and qnorm_c_determine_target
58 59
  **
59 60
  ***********************************************************/
60 61
 
62
+/*****************************************************************************************************
63
+ *****************************************************************************************************
64
+ **
65
+ ** GENERAL NOTE: Many of the functions take pointers for arguements that are essentially just
66
+ **               int's. This is mostly legacy for when the functions were called via .C() in R rather
67
+ **               than via the .Call() interface.
68
+ **
69
+ *****************************************************************************************************
70
+ *****************************************************************************************************/
71
+
72
+
61 73
 #include <stdio.h>
62 74
 #include <stdlib.h>
63 75
 #include <math.h>
... ...
@@ -1179,7 +1191,6 @@ SEXP R_qnorm_robust_weights(SEXP X, SEXP remove_extreme, SEXP n_remove){
1179 1191
  **                  normalized to)
1180 1192
  ** int *targetrows - length of target distribution vector
1181 1193
  **
1182
- ** Note that it is assumed that there is no NA or Inf type values in the vectors. (ie no missing data)
1183 1194
  **
1184 1195
  ** if targetrows == rows then the standard methodology is used.
1185 1196
  ** 
... ...
@@ -1591,6 +1602,77 @@ SEXP R_qnorm_determine_target(SEXP X, SEXP targetlength){
1591 1602
 
1592 1603
 
1593 1604
 
1605
+
1606
+/*********************************************************
1607
+ **
1608
+ ** void qnorm_c_handleNA(double *data, int *rows, int *cols)
1609
+ **
1610
+ **  this is the function that actually implements the
1611
+ ** quantile normalization algorithm. It is called from R.
1612
+ **
1613
+ ** returns 1 if there is a problem, 0 otherwise
1614
+ **
1615
+ ** Note that this function does not handle missing data (ie NA)
1616
+ **
1617
+ ********************************************************/
1618
+
1619
+
1620
+void qnorm_c_handleNA(double *data, int *rows, int *cols){
1621
+
1622
+  double *target = Calloc(*rows,double);
1623
+    
1624
+  qnorm_c_determine_target(data, rows, cols, target, rows);
1625
+  qnorm_c_using_target(data, rows, cols, target, rows);
1626
+
1627
+  Free(target);
1628
+
1629
+}
1630
+
1631
+
1632
+/*********************************************************
1633
+ **
1634
+ ** SEXP R_qnorm_c_handleNA(SEXP X)
1635
+ **
1636
+ ** SEXP X      - a matrix
1637
+ ** SEXP copy   - a flag if TRUE then make copy
1638
+ **               before normalizing, if FALSE work in place
1639
+ **               note that this can be dangerous since
1640
+ **               it will change the original matrix.
1641
+ **
1642
+ ** returns a quantile normalized matrix.
1643
+ **
1644
+ ** This is a .Call() interface for quantile normalization
1645
+ **
1646
+ *********************************************************/
1647
+
1648
+SEXP R_qnorm_c_handleNA(SEXP X, SEXP copy){
1649
+
1650
+  SEXP Xcopy,dim1;
1651
+  double *Xptr;
1652
+  int rows,cols;
1653
+  
1654
+  PROTECT(dim1 = getAttrib(X,R_DimSymbol));
1655
+  rows = INTEGER(dim1)[0];
1656
+  cols = INTEGER(dim1)[1];
1657
+  if (asInteger(copy)){
1658
+    PROTECT(Xcopy = allocMatrix(REALSXP,rows,cols));
1659
+    copyMatrix(Xcopy,X,0);
1660
+  } else {
1661
+    Xcopy = X;
1662
+  }
1663
+  Xptr = NUMERIC_POINTER(AS_NUMERIC(Xcopy));
1664
+  
1665
+  qnorm_c_handleNA(Xptr, &rows, &cols);
1666
+  if (asInteger(copy)){
1667
+    UNPROTECT(2);
1668
+  } else {
1669
+    UNPROTECT(1);
1670
+  }
1671
+  return Xcopy;
1672
+}
1673
+
1674
+
1675
+
1594 1676
 /*****************************************************************************************************
1595 1677
  *****************************************************************************************************
1596 1678
  **