Merge pull request #59 from dllahr/master

math.tests/fast_cov and fast_corr: added methods for calculating when nan's are present
cmap · Nov 8, 2019 · a08ddeb · a08ddeb
2 parents a149cbc + 908d943
commit a08ddeb
Show file tree

Hide file tree

Showing 4 changed files with 534 additions and 14 deletions.
diff --git a/cmapPy/math/fast_corr.py b/cmapPy/math/fast_corr.py
@@ -25,8 +25,8 @@ def fast_corr(x, y=None, destination=None):
     """
     if y is None:
         y = x
-
-    r = fast_cov.fast_cov(x, y, destination)
+    
+    r = fast_cov.fast_cov(x, y, destination=destination)
 
     std_x = numpy.std(x, axis=0, ddof=1)
     std_y = numpy.std(y, axis=0, ddof=1)
@@ -37,6 +37,68 @@ def fast_corr(x, y=None, destination=None):
     return r
 
 
+def calculate_moments_with_additional_mask(x, mask):
+    """calculate the moments (y, y^2, and variance) of the columns of x, excluding masked within x, for each of the masking columns in mask
+    Number of rows in x and mask must be the same.
+
+    Args:
+        x (numpy.ma.array like)
+        mask (numpy array-like boolean) 
+    """
+    non_mask_overlaps = fast_cov.calculate_non_mask_overlaps(x.mask, mask)
+
+    unmask = 1.0 * ~mask
+
+    expect_x = numpy.ma.dot(x.T, unmask) / non_mask_overlaps
+    expect_x = expect_x.T
+
+    expect_x_squared = numpy.ma.dot(
+        numpy.power(x, 2.0).T, unmask
+    ) / non_mask_overlaps
+    expect_x_squared = expect_x_squared.T
+
+    var_x = (expect_x_squared - numpy.power(expect_x, 2.0)) * non_mask_overlaps.T / (non_mask_overlaps.T - 1)
+
+    return expect_x, expect_x_squared, var_x
+
+
+def nan_fast_corr(x, y=None, destination=None):
+    """calculate the pearson correlation matrix (ignoring nan values) for the columns of x (with dimensions MxN), or optionally, the pearson correlaton matrix
+    between x and y (with dimensions OxP).  If destination is provided, put the results there.  
+    In the language of statistics the columns are the variables and the rows are the observations.
+
+    Args:
+        x (numpy array-like) MxN in shape
+        y (optional, numpy array-like) OxP in shape.  M (# rows in x) must equal O (# rows in y)
+        destination (numpy array-like) optional location where to store the results as they are calculated (e.g. a numpy
+            memmap of a file)
+
+        returns (numpy array-like) array of the covariance values
+            for defaults (y=None), shape is NxN
+            if y is provied, shape is NxP
+    """
+    x_masked = numpy.ma.array(x, mask=numpy.isnan(x))
+
+    if y is None:
+        y_masked = x_masked
+    else:
+        y_masked = numpy.ma.array(y, mask=numpy.isnan(y))
+
+    r = fast_cov.nan_fast_cov(x_masked, y_masked, destination=destination)
+
+    # calculate the standard deviation of the columns of each matrix, given the masking from the other
+    _, _, var_x = calculate_moments_with_additional_mask(x_masked, y_masked.mask)
+    std_x = numpy.sqrt(var_x)
+
+    _, _, var_y = calculate_moments_with_additional_mask(y_masked, x_masked.mask)
+    std_y = numpy.sqrt(var_y)
+
+    numpy.divide(r, std_x.T, out=r)
+    numpy.divide(r, std_y, out=r)
+
+    return r
+
+
 def fast_spearman(x, y=None, destination=None):
     """calculate the spearman correlation matrix for the columns of x (with dimensions MxN), or optionally, the spearman correlaton
     matrix between the columns of x and the columns of y (with dimensions OxP).  If destination is provided, put the results there.
@@ -53,13 +115,42 @@ def fast_spearman(x, y=None, destination=None):
                 for defaults (y=None), shape is NxN
                 if y is provied, shape is NxP
     """
+    r = _fast_spearman(fast_corr, x, y, destination)
+    return r
+
+
+def _fast_spearman(corr_method, x, y, destination):
+    """internal method for calculating spearman correlation, allowing subsititution of methods for calculationg correlation (corr_method),
+    allowing to choose methods that are fast (fast_corr) or tolerant of nan's (nan_fast_corr) to be used
+    """
     logger.debug("x.shape:  {}".format(x.shape))
     if hasattr(y, "shape"):  
         logger.debug("y.shape:  {}".format(y.shape))
 
-    x_ranks = pandas.DataFrame(x).rank(method="average").values
+    x_ranks = pandas.DataFrame(x).rank(method="average", na_option="keep").values
     logger.debug("some min and max ranks of x_ranks:\n{}\n{}".format(numpy.min(x_ranks[:10], axis=0), numpy.max(x_ranks[:10], axis=0)))
 
-    y_ranks = pandas.DataFrame(y).rank(method="average").values if y is not None else None
+    y_ranks = pandas.DataFrame(y).rank(method="average", na_option="keep").values if y is not None else None
+
+    return corr_method(x_ranks, y_ranks, destination=destination)
 
-    return fast_corr(x_ranks, y_ranks, destination)
+
+def nan_fast_spearman(x, y=None, destination=None):
+    """calculate the spearman correlation matrix (ignoring nan values) for the columns of x (with dimensions MxN), or optionally, the spearman correlaton
+    matrix between the columns of x and the columns of y (with dimensions OxP).  If destination is provided, put the results there.
+    In the language of statistics the columns are the variables and the rows are the observations.
+    Note that the ranks will be slightly miscalculated in the masked situations leading to slight errors in the spearman rho value.
+
+    Args:
+        x (numpy array-like) MxN in shape
+        y (optional, numpy array-like) OxP in shape.  M (# rows in x) must equal O (# rows in y)
+        destination (numpy array-like) optional location where to store the results as they are calculated (e.g. a numpy
+            memmap of a file)
+
+        returns:
+            (numpy array-like) array of the covariance values
+                for defaults (y=None), shape is NxN
+                if y is provied, shape is NxP
+    """
+    r = _fast_spearman(nan_fast_corr, x, y, destination)
+    return r
diff --git a/cmapPy/math/fast_cov.py b/cmapPy/math/fast_cov.py
@@ -6,6 +6,36 @@
 logger = logging.getLogger(setup_logger.LOGGER_NAME)
 
 
+def _fast_dot_divide(x, y, destination):
+    """helper method for use within the _fast_cov method - carry out the dot product and subsequent 
+    division to generate the covariance values.  For use when there are no missing values.
+    """
+    numpy.dot(x.T, y, out=destination)
+    numpy.divide(destination, (x.shape[0] - 1), out=destination)
+
+
+def calculate_non_mask_overlaps(x_mask, y_mask):
+    """for two mask arrays (x_mask, y_mask - boolean arrays) determine the number of entries in common there would be for each 
+    entry if their dot product were taken
+    """
+    x_is_not_nan = 1 * ~x_mask
+    y_is_not_nan = 1 * ~y_mask
+
+    r = numpy.dot(x_is_not_nan.T, y_is_not_nan)
+    return r
+
+
+def _nan_dot_divide(x, y, destination):
+    """helper method for use within the _fast_cov method - carry out the dot product and subsequent
+    division to generate the covariance values.  For use when there are missing values.
+    """
+    numpy.ma.dot(x.T, y, out=destination)
+
+    divisor = calculate_non_mask_overlaps(x.mask, y.mask) - 1
+
+    numpy.ma.divide(destination, divisor, out=destination)
+
+
 def fast_cov(x, y=None, destination=None):
     """calculate the covariance matrix for the columns of x (MxN), or optionally, the covariance matrix between the
     columns of x and and the columns of y (MxP).  (In the language of statistics, the columns are variables, the rows
@@ -21,6 +51,12 @@ def fast_cov(x, y=None, destination=None):
             for defaults (y=None), shape is NxN
             if y is provided, shape is NxP
     """
+    r = _fast_cov(numpy.mean, _fast_dot_divide, x, y, destination)
+
+    return r
+
+
+def _fast_cov(mean_method, dot_divide_method, x, y, destination):
     validate_inputs(x, y, destination)
 
     if y is None:
@@ -29,14 +65,13 @@ def fast_cov(x, y=None, destination=None):
     if destination is None:
         destination = numpy.zeros((x.shape[1], y.shape[1]))
 
-    mean_x = numpy.mean(x, axis=0)
-    mean_y = numpy.mean(y, axis=0)
+    mean_x = mean_method(x, axis=0)
+    mean_y = mean_method(y, axis=0)
 
     mean_centered_x = (x - mean_x).astype(destination.dtype)
     mean_centered_y = (y - mean_y).astype(destination.dtype)
 
-    numpy.dot(mean_centered_x.T, mean_centered_y, out=destination)
-    numpy.divide(destination, (x.shape[0] - 1), out=destination)
+    dot_divide_method(mean_centered_x, mean_centered_y, destination)
 
     return destination
 
@@ -71,5 +106,41 @@ def validate_inputs(x, y, destination):
         raise CmapPyMathFastCovInvalidInputXY(error_msg)
 
 
+def nan_fast_cov(x, y=None, destination=None):
+    """calculate the covariance matrix (ignoring nan values) for the columns of x (MxN), or optionally, the covariance matrix between the
+    columns of x and and the columns of y (MxP).  (In the language of statistics, the columns are variables, the rows
+    are observations).
+
+    Args:
+        x (numpy array-like) MxN in shape
+        y (numpy array-like) MxP in shape
+        destination (numpy masked array-like) optional location where to store the results as they are calculated (e.g. a numpy
+            memmap of a file)
+
+        returns (numpy array-like) array of the covariance values
+            for defaults (y=None), shape is NxN
+            if y is provided, shape is NxP
+    """
+    x_masked = numpy.ma.array(x, mask=numpy.isnan(x))
+
+    if y is None:
+        y_masked = x_masked
+    else:
+        y_masked = numpy.ma.array(y, mask=numpy.isnan(y))
+
+    dest_was_None = False
+    if destination is None:
+        destination = numpy.ma.zeros((x_masked.shape[1], y_masked.shape[1]))
+        dest_was_None = True
+
+    r = _fast_cov(numpy.nanmean, _nan_dot_divide, x_masked, y_masked, destination)
+
+    r[numpy.isinf(r)] = numpy.nan
+
+    r = numpy.ma.filled(r, fill_value=numpy.nan) if dest_was_None else r
+
+    return r
+
+
 class CmapPyMathFastCovInvalidInputXY(Exception):
     pass