Skip to content

Commit

Permalink
Merge pull request #59 from dllahr/master
Browse files Browse the repository at this point in the history
math.tests/fast_cov and fast_corr:  added methods for calculating when nan's are present
  • Loading branch information
tnat1031 authored Nov 8, 2019
2 parents a149cbc + 908d943 commit a08ddeb
Show file tree
Hide file tree
Showing 4 changed files with 534 additions and 14 deletions.
101 changes: 96 additions & 5 deletions cmapPy/math/fast_corr.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ def fast_corr(x, y=None, destination=None):
"""
if y is None:
y = x

r = fast_cov.fast_cov(x, y, destination)
r = fast_cov.fast_cov(x, y, destination=destination)

std_x = numpy.std(x, axis=0, ddof=1)
std_y = numpy.std(y, axis=0, ddof=1)
Expand All @@ -37,6 +37,68 @@ def fast_corr(x, y=None, destination=None):
return r


def calculate_moments_with_additional_mask(x, mask):
"""calculate the moments (y, y^2, and variance) of the columns of x, excluding masked within x, for each of the masking columns in mask
Number of rows in x and mask must be the same.
Args:
x (numpy.ma.array like)
mask (numpy array-like boolean)
"""
non_mask_overlaps = fast_cov.calculate_non_mask_overlaps(x.mask, mask)

unmask = 1.0 * ~mask

expect_x = numpy.ma.dot(x.T, unmask) / non_mask_overlaps
expect_x = expect_x.T

expect_x_squared = numpy.ma.dot(
numpy.power(x, 2.0).T, unmask
) / non_mask_overlaps
expect_x_squared = expect_x_squared.T

var_x = (expect_x_squared - numpy.power(expect_x, 2.0)) * non_mask_overlaps.T / (non_mask_overlaps.T - 1)

return expect_x, expect_x_squared, var_x


def nan_fast_corr(x, y=None, destination=None):
"""calculate the pearson correlation matrix (ignoring nan values) for the columns of x (with dimensions MxN), or optionally, the pearson correlaton matrix
between x and y (with dimensions OxP). If destination is provided, put the results there.
In the language of statistics the columns are the variables and the rows are the observations.
Args:
x (numpy array-like) MxN in shape
y (optional, numpy array-like) OxP in shape. M (# rows in x) must equal O (# rows in y)
destination (numpy array-like) optional location where to store the results as they are calculated (e.g. a numpy
memmap of a file)
returns (numpy array-like) array of the covariance values
for defaults (y=None), shape is NxN
if y is provied, shape is NxP
"""
x_masked = numpy.ma.array(x, mask=numpy.isnan(x))

if y is None:
y_masked = x_masked
else:
y_masked = numpy.ma.array(y, mask=numpy.isnan(y))

r = fast_cov.nan_fast_cov(x_masked, y_masked, destination=destination)

# calculate the standard deviation of the columns of each matrix, given the masking from the other
_, _, var_x = calculate_moments_with_additional_mask(x_masked, y_masked.mask)
std_x = numpy.sqrt(var_x)

_, _, var_y = calculate_moments_with_additional_mask(y_masked, x_masked.mask)
std_y = numpy.sqrt(var_y)

numpy.divide(r, std_x.T, out=r)
numpy.divide(r, std_y, out=r)

return r


def fast_spearman(x, y=None, destination=None):
"""calculate the spearman correlation matrix for the columns of x (with dimensions MxN), or optionally, the spearman correlaton
matrix between the columns of x and the columns of y (with dimensions OxP). If destination is provided, put the results there.
Expand All @@ -53,13 +115,42 @@ def fast_spearman(x, y=None, destination=None):
for defaults (y=None), shape is NxN
if y is provied, shape is NxP
"""
r = _fast_spearman(fast_corr, x, y, destination)
return r


def _fast_spearman(corr_method, x, y, destination):
"""internal method for calculating spearman correlation, allowing subsititution of methods for calculationg correlation (corr_method),
allowing to choose methods that are fast (fast_corr) or tolerant of nan's (nan_fast_corr) to be used
"""
logger.debug("x.shape: {}".format(x.shape))
if hasattr(y, "shape"):
logger.debug("y.shape: {}".format(y.shape))

x_ranks = pandas.DataFrame(x).rank(method="average").values
x_ranks = pandas.DataFrame(x).rank(method="average", na_option="keep").values
logger.debug("some min and max ranks of x_ranks:\n{}\n{}".format(numpy.min(x_ranks[:10], axis=0), numpy.max(x_ranks[:10], axis=0)))

y_ranks = pandas.DataFrame(y).rank(method="average").values if y is not None else None
y_ranks = pandas.DataFrame(y).rank(method="average", na_option="keep").values if y is not None else None

return corr_method(x_ranks, y_ranks, destination=destination)

return fast_corr(x_ranks, y_ranks, destination)

def nan_fast_spearman(x, y=None, destination=None):
"""calculate the spearman correlation matrix (ignoring nan values) for the columns of x (with dimensions MxN), or optionally, the spearman correlaton
matrix between the columns of x and the columns of y (with dimensions OxP). If destination is provided, put the results there.
In the language of statistics the columns are the variables and the rows are the observations.
Note that the ranks will be slightly miscalculated in the masked situations leading to slight errors in the spearman rho value.
Args:
x (numpy array-like) MxN in shape
y (optional, numpy array-like) OxP in shape. M (# rows in x) must equal O (# rows in y)
destination (numpy array-like) optional location where to store the results as they are calculated (e.g. a numpy
memmap of a file)
returns:
(numpy array-like) array of the covariance values
for defaults (y=None), shape is NxN
if y is provied, shape is NxP
"""
r = _fast_spearman(nan_fast_corr, x, y, destination)
return r
79 changes: 75 additions & 4 deletions cmapPy/math/fast_cov.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,36 @@
logger = logging.getLogger(setup_logger.LOGGER_NAME)


def _fast_dot_divide(x, y, destination):
"""helper method for use within the _fast_cov method - carry out the dot product and subsequent
division to generate the covariance values. For use when there are no missing values.
"""
numpy.dot(x.T, y, out=destination)
numpy.divide(destination, (x.shape[0] - 1), out=destination)


def calculate_non_mask_overlaps(x_mask, y_mask):
"""for two mask arrays (x_mask, y_mask - boolean arrays) determine the number of entries in common there would be for each
entry if their dot product were taken
"""
x_is_not_nan = 1 * ~x_mask
y_is_not_nan = 1 * ~y_mask

r = numpy.dot(x_is_not_nan.T, y_is_not_nan)
return r


def _nan_dot_divide(x, y, destination):
"""helper method for use within the _fast_cov method - carry out the dot product and subsequent
division to generate the covariance values. For use when there are missing values.
"""
numpy.ma.dot(x.T, y, out=destination)

divisor = calculate_non_mask_overlaps(x.mask, y.mask) - 1

numpy.ma.divide(destination, divisor, out=destination)


def fast_cov(x, y=None, destination=None):
"""calculate the covariance matrix for the columns of x (MxN), or optionally, the covariance matrix between the
columns of x and and the columns of y (MxP). (In the language of statistics, the columns are variables, the rows
Expand All @@ -21,6 +51,12 @@ def fast_cov(x, y=None, destination=None):
for defaults (y=None), shape is NxN
if y is provided, shape is NxP
"""
r = _fast_cov(numpy.mean, _fast_dot_divide, x, y, destination)

return r


def _fast_cov(mean_method, dot_divide_method, x, y, destination):
validate_inputs(x, y, destination)

if y is None:
Expand All @@ -29,14 +65,13 @@ def fast_cov(x, y=None, destination=None):
if destination is None:
destination = numpy.zeros((x.shape[1], y.shape[1]))

mean_x = numpy.mean(x, axis=0)
mean_y = numpy.mean(y, axis=0)
mean_x = mean_method(x, axis=0)
mean_y = mean_method(y, axis=0)

mean_centered_x = (x - mean_x).astype(destination.dtype)
mean_centered_y = (y - mean_y).astype(destination.dtype)

numpy.dot(mean_centered_x.T, mean_centered_y, out=destination)
numpy.divide(destination, (x.shape[0] - 1), out=destination)
dot_divide_method(mean_centered_x, mean_centered_y, destination)

return destination

Expand Down Expand Up @@ -71,5 +106,41 @@ def validate_inputs(x, y, destination):
raise CmapPyMathFastCovInvalidInputXY(error_msg)


def nan_fast_cov(x, y=None, destination=None):
"""calculate the covariance matrix (ignoring nan values) for the columns of x (MxN), or optionally, the covariance matrix between the
columns of x and and the columns of y (MxP). (In the language of statistics, the columns are variables, the rows
are observations).
Args:
x (numpy array-like) MxN in shape
y (numpy array-like) MxP in shape
destination (numpy masked array-like) optional location where to store the results as they are calculated (e.g. a numpy
memmap of a file)
returns (numpy array-like) array of the covariance values
for defaults (y=None), shape is NxN
if y is provided, shape is NxP
"""
x_masked = numpy.ma.array(x, mask=numpy.isnan(x))

if y is None:
y_masked = x_masked
else:
y_masked = numpy.ma.array(y, mask=numpy.isnan(y))

dest_was_None = False
if destination is None:
destination = numpy.ma.zeros((x_masked.shape[1], y_masked.shape[1]))
dest_was_None = True

r = _fast_cov(numpy.nanmean, _nan_dot_divide, x_masked, y_masked, destination)

r[numpy.isinf(r)] = numpy.nan

r = numpy.ma.filled(r, fill_value=numpy.nan) if dest_was_None else r

return r


class CmapPyMathFastCovInvalidInputXY(Exception):
pass
Loading

0 comments on commit a08ddeb

Please sign in to comment.