diff --git a/cmapPy/math/fast_corr.py b/cmapPy/math/fast_corr.py index c6d3fe6..ac50dac 100644 --- a/cmapPy/math/fast_corr.py +++ b/cmapPy/math/fast_corr.py @@ -25,8 +25,8 @@ def fast_corr(x, y=None, destination=None): """ if y is None: y = x - - r = fast_cov.fast_cov(x, y, destination) + + r = fast_cov.fast_cov(x, y, destination=destination) std_x = numpy.std(x, axis=0, ddof=1) std_y = numpy.std(y, axis=0, ddof=1) @@ -37,6 +37,68 @@ def fast_corr(x, y=None, destination=None): return r +def calculate_moments_with_additional_mask(x, mask): + """calculate the moments (y, y^2, and variance) of the columns of x, excluding masked within x, for each of the masking columns in mask + Number of rows in x and mask must be the same. + + Args: + x (numpy.ma.array like) + mask (numpy array-like boolean) + """ + non_mask_overlaps = fast_cov.calculate_non_mask_overlaps(x.mask, mask) + + unmask = 1.0 * ~mask + + expect_x = numpy.ma.dot(x.T, unmask) / non_mask_overlaps + expect_x = expect_x.T + + expect_x_squared = numpy.ma.dot( + numpy.power(x, 2.0).T, unmask + ) / non_mask_overlaps + expect_x_squared = expect_x_squared.T + + var_x = (expect_x_squared - numpy.power(expect_x, 2.0)) * non_mask_overlaps.T / (non_mask_overlaps.T - 1) + + return expect_x, expect_x_squared, var_x + + +def nan_fast_corr(x, y=None, destination=None): + """calculate the pearson correlation matrix (ignoring nan values) for the columns of x (with dimensions MxN), or optionally, the pearson correlaton matrix + between x and y (with dimensions OxP). If destination is provided, put the results there. + In the language of statistics the columns are the variables and the rows are the observations. + + Args: + x (numpy array-like) MxN in shape + y (optional, numpy array-like) OxP in shape. M (# rows in x) must equal O (# rows in y) + destination (numpy array-like) optional location where to store the results as they are calculated (e.g. a numpy + memmap of a file) + + returns (numpy array-like) array of the covariance values + for defaults (y=None), shape is NxN + if y is provied, shape is NxP + """ + x_masked = numpy.ma.array(x, mask=numpy.isnan(x)) + + if y is None: + y_masked = x_masked + else: + y_masked = numpy.ma.array(y, mask=numpy.isnan(y)) + + r = fast_cov.nan_fast_cov(x_masked, y_masked, destination=destination) + + # calculate the standard deviation of the columns of each matrix, given the masking from the other + _, _, var_x = calculate_moments_with_additional_mask(x_masked, y_masked.mask) + std_x = numpy.sqrt(var_x) + + _, _, var_y = calculate_moments_with_additional_mask(y_masked, x_masked.mask) + std_y = numpy.sqrt(var_y) + + numpy.divide(r, std_x.T, out=r) + numpy.divide(r, std_y, out=r) + + return r + + def fast_spearman(x, y=None, destination=None): """calculate the spearman correlation matrix for the columns of x (with dimensions MxN), or optionally, the spearman correlaton matrix between the columns of x and the columns of y (with dimensions OxP). If destination is provided, put the results there. @@ -53,13 +115,42 @@ def fast_spearman(x, y=None, destination=None): for defaults (y=None), shape is NxN if y is provied, shape is NxP """ + r = _fast_spearman(fast_corr, x, y, destination) + return r + + +def _fast_spearman(corr_method, x, y, destination): + """internal method for calculating spearman correlation, allowing subsititution of methods for calculationg correlation (corr_method), + allowing to choose methods that are fast (fast_corr) or tolerant of nan's (nan_fast_corr) to be used + """ logger.debug("x.shape: {}".format(x.shape)) if hasattr(y, "shape"): logger.debug("y.shape: {}".format(y.shape)) - x_ranks = pandas.DataFrame(x).rank(method="average").values + x_ranks = pandas.DataFrame(x).rank(method="average", na_option="keep").values logger.debug("some min and max ranks of x_ranks:\n{}\n{}".format(numpy.min(x_ranks[:10], axis=0), numpy.max(x_ranks[:10], axis=0))) - y_ranks = pandas.DataFrame(y).rank(method="average").values if y is not None else None + y_ranks = pandas.DataFrame(y).rank(method="average", na_option="keep").values if y is not None else None + + return corr_method(x_ranks, y_ranks, destination=destination) - return fast_corr(x_ranks, y_ranks, destination) + +def nan_fast_spearman(x, y=None, destination=None): + """calculate the spearman correlation matrix (ignoring nan values) for the columns of x (with dimensions MxN), or optionally, the spearman correlaton + matrix between the columns of x and the columns of y (with dimensions OxP). If destination is provided, put the results there. + In the language of statistics the columns are the variables and the rows are the observations. + Note that the ranks will be slightly miscalculated in the masked situations leading to slight errors in the spearman rho value. + + Args: + x (numpy array-like) MxN in shape + y (optional, numpy array-like) OxP in shape. M (# rows in x) must equal O (# rows in y) + destination (numpy array-like) optional location where to store the results as they are calculated (e.g. a numpy + memmap of a file) + + returns: + (numpy array-like) array of the covariance values + for defaults (y=None), shape is NxN + if y is provied, shape is NxP + """ + r = _fast_spearman(nan_fast_corr, x, y, destination) + return r diff --git a/cmapPy/math/fast_cov.py b/cmapPy/math/fast_cov.py index 447253a..d26502e 100644 --- a/cmapPy/math/fast_cov.py +++ b/cmapPy/math/fast_cov.py @@ -6,6 +6,36 @@ logger = logging.getLogger(setup_logger.LOGGER_NAME) +def _fast_dot_divide(x, y, destination): + """helper method for use within the _fast_cov method - carry out the dot product and subsequent + division to generate the covariance values. For use when there are no missing values. + """ + numpy.dot(x.T, y, out=destination) + numpy.divide(destination, (x.shape[0] - 1), out=destination) + + +def calculate_non_mask_overlaps(x_mask, y_mask): + """for two mask arrays (x_mask, y_mask - boolean arrays) determine the number of entries in common there would be for each + entry if their dot product were taken + """ + x_is_not_nan = 1 * ~x_mask + y_is_not_nan = 1 * ~y_mask + + r = numpy.dot(x_is_not_nan.T, y_is_not_nan) + return r + + +def _nan_dot_divide(x, y, destination): + """helper method for use within the _fast_cov method - carry out the dot product and subsequent + division to generate the covariance values. For use when there are missing values. + """ + numpy.ma.dot(x.T, y, out=destination) + + divisor = calculate_non_mask_overlaps(x.mask, y.mask) - 1 + + numpy.ma.divide(destination, divisor, out=destination) + + def fast_cov(x, y=None, destination=None): """calculate the covariance matrix for the columns of x (MxN), or optionally, the covariance matrix between the columns of x and and the columns of y (MxP). (In the language of statistics, the columns are variables, the rows @@ -21,6 +51,12 @@ def fast_cov(x, y=None, destination=None): for defaults (y=None), shape is NxN if y is provided, shape is NxP """ + r = _fast_cov(numpy.mean, _fast_dot_divide, x, y, destination) + + return r + + +def _fast_cov(mean_method, dot_divide_method, x, y, destination): validate_inputs(x, y, destination) if y is None: @@ -29,14 +65,13 @@ def fast_cov(x, y=None, destination=None): if destination is None: destination = numpy.zeros((x.shape[1], y.shape[1])) - mean_x = numpy.mean(x, axis=0) - mean_y = numpy.mean(y, axis=0) + mean_x = mean_method(x, axis=0) + mean_y = mean_method(y, axis=0) mean_centered_x = (x - mean_x).astype(destination.dtype) mean_centered_y = (y - mean_y).astype(destination.dtype) - numpy.dot(mean_centered_x.T, mean_centered_y, out=destination) - numpy.divide(destination, (x.shape[0] - 1), out=destination) + dot_divide_method(mean_centered_x, mean_centered_y, destination) return destination @@ -71,5 +106,41 @@ def validate_inputs(x, y, destination): raise CmapPyMathFastCovInvalidInputXY(error_msg) +def nan_fast_cov(x, y=None, destination=None): + """calculate the covariance matrix (ignoring nan values) for the columns of x (MxN), or optionally, the covariance matrix between the + columns of x and and the columns of y (MxP). (In the language of statistics, the columns are variables, the rows + are observations). + + Args: + x (numpy array-like) MxN in shape + y (numpy array-like) MxP in shape + destination (numpy masked array-like) optional location where to store the results as they are calculated (e.g. a numpy + memmap of a file) + + returns (numpy array-like) array of the covariance values + for defaults (y=None), shape is NxN + if y is provided, shape is NxP + """ + x_masked = numpy.ma.array(x, mask=numpy.isnan(x)) + + if y is None: + y_masked = x_masked + else: + y_masked = numpy.ma.array(y, mask=numpy.isnan(y)) + + dest_was_None = False + if destination is None: + destination = numpy.ma.zeros((x_masked.shape[1], y_masked.shape[1])) + dest_was_None = True + + r = _fast_cov(numpy.nanmean, _nan_dot_divide, x_masked, y_masked, destination) + + r[numpy.isinf(r)] = numpy.nan + + r = numpy.ma.filled(r, fill_value=numpy.nan) if dest_was_None else r + + return r + + class CmapPyMathFastCovInvalidInputXY(Exception): pass diff --git a/cmapPy/math/tests/test_fast_corr.py b/cmapPy/math/tests/test_fast_corr.py index 6f36b83..715d918 100644 --- a/cmapPy/math/tests/test_fast_corr.py +++ b/cmapPy/math/tests/test_fast_corr.py @@ -1,6 +1,7 @@ import unittest import logging import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger +import cmapPy.math.fast_cov import cmapPy.math.fast_corr as fast_corr import numpy import pandas @@ -17,11 +18,23 @@ class TestFastCorr(unittest.TestCase): @staticmethod def build_standard_x_y(): x = numpy.array([[1,7,2], [5,3,11]]) - logger.debug("x: {}".format(x)) + logger.debug("x:\n{}".format(x)) logger.debug("x.shape: {}".format(x.shape)) y = numpy.array([[13, 17, 19], [23, 31, 29]]) - logger.debug("y: {}".format(y)) + logger.debug("y:\n{}".format(y)) + logger.debug("y.shape: {}".format(y.shape)) + + return x, y + + @staticmethod + def build_nan_containing_x_y(): + x = numpy.array([[1,numpy.nan,2], [3,5,7], [11,13,17]], dtype=float) + logger.debug("x:\n{}".format(x)) + logger.debug("x.shape: {}".format(x.shape)) + + y = numpy.array([[19, 23, 29], [31, 37, 41], [numpy.nan, 43, 47]], dtype=float) + logger.debug("y:\n{}".format(y)) logger.debug("y.shape: {}".format(y.shape)) return x, y @@ -153,7 +166,7 @@ def test_fast_spearman(self): ex = numpy.array([[1.0, 1.0, 1.0], [-1.0, -1.0, -1.0], [1.0, 1.0, 1.0]]) r = fast_corr.fast_spearman(x, y) - logger.debug("r: {}".format(r)) + logger.debug("r:\n{}".format(r)) self.assertTrue(numpy.allclose(ex, r)) @@ -163,6 +176,193 @@ def test_fast_spearman(self): self.assertIs(dest, r) self.assertTrue(numpy.allclose(ex, dest)) + def test_nan_fast_spearman(self): + x, y = TestFastCorr.build_nan_containing_x_y() + y[:,2] = y[:,2][::-1] + logger.debug("updated y:\n{}".format(y)) + + ex = numpy.array([[1.0, 1.0, -1.0], [numpy.nan, 1.0, -1.0], [1.0, 1.0, -1.0]]) + + r = fast_corr.nan_fast_spearman(x, y) + logger.debug("r:\n{}".format(r)) + + nan_locs = numpy.isnan(ex) + self.assertTrue(all(numpy.isnan(r[nan_locs]))) + + self.assertTrue(numpy.allclose(ex[~nan_locs], r[~nan_locs])) + + def test_calculate_moments_with_additional_mask(self): + x,y = TestFastCorr.build_nan_containing_x_y() + x = numpy.ma.array(x, mask=numpy.isnan(x)) + y = numpy.ma.array(y, mask=numpy.isnan(y)) + + r_x, r_x2, r_var = fast_corr.calculate_moments_with_additional_mask(x, y.mask) + + logger.debug("r_x:\n{}".format(r_x)) + for i in range(y.shape[1]): + locs = ~numpy.isnan(y[:,i]) # locations in column where y is not nan + logger.debug("check for r_x[{},:] - locs: {}".format(i, locs)) + + for j in range(x.shape[1]): + c = numpy.ma.mean(x[locs, j]) + logger.debug("check for r_x[i,j] entry - c: {}".format(i, j, c)) + self.assertTrue(numpy.isclose(c, r_x[i,j])) + + logger.debug("r_x2:\n{}".format(r_x2)) + for i in range(y.shape[1]): + locs = ~numpy.isnan(y[:,i]) # locations in column where y is not nan + logger.debug("check for r_x2[{},:] - locs: {}".format(i, locs)) + + for j in range(x.shape[1]): + c = numpy.ma.mean(numpy.power(x[locs, j], 2.0)) + logger.debug("check for r_x2[{},{}] entry - c: {}".format(i, j, c)) + self.assertTrue(numpy.isclose(c, r_x2[i,j])) + + + logger.debug("r_var:\n{}".format(r_var)) + overlaps = cmapPy.math.fast_cov.calculate_non_mask_overlaps(x.mask, y.mask).T + logger.debug("overlaps:\n{}".format(overlaps)) + locs = (0,0) + n = float(overlaps[locs]) + c = (r_x2[locs] - r_x[locs]**2) * n / (n - 1.) + logger.debug("check for locs: {} n: {} c: {}".format(locs, n, c)) + self.assertTrue(numpy.isclose(c, r_var[locs])) + locs = (1,0) + n = float(overlaps[locs]) + c = (r_x2[locs] - r_x[locs]**2) * n / (n - 1.) + logger.debug("check for locs: {} n: {} c: {}".format(locs, n, c)) + # self.assertTrue(numpy.isclose(c, r_var[locs])) + locs = (2,0) + n = float(overlaps[locs]) + c = (r_x2[locs] - r_x[locs]**2) * n / (n - 1.) + logger.debug("check for locs: {} n: {} c: {}".format(locs, n, c)) + self.assertTrue(numpy.isclose(c, r_var[locs])) + locs = (2,2) + n = float(overlaps[locs]) + c = (r_x2[locs] - r_x[locs]**2) * n / (n - 1.) + logger.debug("check for locs: {} n: {} c: {}".format(locs, n, c)) + self.assertTrue(numpy.isclose(c, r_var[locs])) + + def test_nan_fast_corr_just_x(self): + logger.debug("*************happy path just x") + x, _ = TestFastCorr.build_nan_containing_x_y() + + logger.debug("current matrix - x:\n{}".format(x)) + ex_with_nan = numpy.corrcoef(x, rowvar=False) + logger.debug("expected with nan's - ex_with_nan:\n{}".format(ex_with_nan)) + + r = fast_corr.nan_fast_corr(x) + logger.debug("r:\n{}".format(r)) + + non_nan_locs = ~numpy.isnan(ex_with_nan) + self.assertTrue(numpy.allclose(ex_with_nan[non_nan_locs], r[non_nan_locs])) + + check_nominal_nans = numpy.zeros(x.shape[1]) + + u = x[1:,1] + print(u) + for i in range(x.shape[1]): + t = x[1:, i] + print(t) + c = numpy.corrcoef(t, u)[0,1] + print(c) + check_nominal_nans[i] = c + print(check_nominal_nans - r[:,1]) + logger.debug("calculate entries that would be nan - check_nominal_nans: {}".format(check_nominal_nans)) + self.assertTrue(numpy.allclose(check_nominal_nans, r[:, 1])) + self.assertTrue(numpy.allclose(check_nominal_nans, r[1, :])) + + x[0,1] = x[1,2] + x[1,2] = numpy.nan + logger.debug("current matrix - x:\n{}".format(x)) + ex_with_nan = numpy.corrcoef(x, rowvar=False) + logger.debug("expected with nan's - ex_with_nan:\n{}".format(ex_with_nan)) + + r = fast_corr.nan_fast_corr(x) + logger.debug("r:\n{}".format(r)) + + non_nan_locs = ~numpy.isnan(ex_with_nan) + self.assertTrue(numpy.allclose(ex_with_nan[non_nan_locs], r[non_nan_locs])) + + check_nominal_nans = numpy.zeros(x.shape[1]) + + u = x[(0,2),2] + print(u) + for i in range(x.shape[1]): + t = x[(0,2), i] + print(t) + c = numpy.corrcoef(t, u)[0,1] + print(c) + check_nominal_nans[i] = c + print(check_nominal_nans - r[:,2]) + logger.debug("calculate entries that would be nan - check_nominal_nans: {}".format(check_nominal_nans)) + self.assertTrue(numpy.allclose(check_nominal_nans, r[:, 2])) + self.assertTrue(numpy.allclose(check_nominal_nans, r[2, :])) + + def test_nan_fast_corr_different_shapes(self): + x,y = TestFastCorr.build_standard_x_y() + + z = numpy.zeros((x.shape[0], x.shape[1]+2)) + z[:, :y.shape[1]] = y + for i in range(y.shape[1], x.shape[1]+2): + t = (i+1) * numpy.array(range(1, z.shape[0]+1)) + if i%2 == 0: + z[:, i] = t + else: + z[:, i] = t[::-1] + logger.debug("z:\n{}".format(z)) + logger.debug("z.shape: {}".format(z.shape)) + + combined = numpy.hstack([x, z]) + + raw_ex = numpy.corrcoef(combined, rowvar=False) + logger.debug("raw_ex.shape: {}".format(raw_ex.shape)) + + ex = raw_ex[:x.shape[1], -z.shape[1]:] + logger.debug("ex:\n{}".format(ex)) + logger.debug("ex.shape: {}".format(ex.shape)) + + r = fast_corr.nan_fast_corr(x, z) + logger.debug("r:\n{}".format(r)) + logger.debug("r.shape: {}".format(r.shape)) + + self.assertTrue(numpy.allclose(ex, r)) + + + def test_nan_fast_corr_functional(self): + logger.debug("*************happy path functional test nan_fast_corr using randomly generated matrices") + + # first without any nan's + for i in range(num_iterations_functional_tests): + #the dimension containing the observations must have at least size 2 + x_shape = [numpy.random.randint(2, max_dimension_functional_tests), + numpy.random.randint(1, max_dimension_functional_tests)] + logger.debug("x_shape: {}".format(x_shape)) + + x = numpy.random.rand(x_shape[0], x_shape[1]) * numpy.random.randint(1, multiplier_max_functional_tests, size=1) + logger.debug("x:\n{}".format(x)) + + y_other_shape = numpy.random.randint(1, max_dimension_functional_tests, size=1)[0] + y_shape = (x_shape[0], y_other_shape) + logger.debug("y_shape: {}".format(y_shape)) + y = numpy.random.rand(y_shape[0], y_shape[1]) * numpy.random.randint(1, multiplier_max_functional_tests, size=1) + logger.debug("y:\n{}".format(y)) + + combined = numpy.hstack([x, y]) + + raw_ex = numpy.corrcoef(combined, rowvar=False) + logger.debug("raw_ex.shape: {}".format(raw_ex.shape)) + + ex = raw_ex[:x.shape[1], -y.shape[1]:] + logger.debug("ex:\n{}".format(ex)) + logger.debug("ex.shape: {}".format(ex.shape)) + + r = fast_corr.nan_fast_corr(x, y) + logger.debug("r:\n{}".format(r)) + logger.debug("r.shape: {}".format(r.shape)) + + self.assertTrue(numpy.allclose(ex, r)) + if __name__ == "__main__": setup_logger.setup(verbose=True) diff --git a/cmapPy/math/tests/test_fast_cov.py b/cmapPy/math/tests/test_fast_cov.py index 4a765cd..67396ae 100644 --- a/cmapPy/math/tests/test_fast_cov.py +++ b/cmapPy/math/tests/test_fast_cov.py @@ -13,16 +13,28 @@ class TestFastCov(unittest.TestCase): @staticmethod def build_standard_x_y(): - x = numpy.array([[1,2,3], [5,7,11]]) + x = numpy.array([[1,2,3], [5,7,11]], dtype=float) logger.debug("x: {}".format(x)) logger.debug("x.shape: {}".format(x.shape)) - y = numpy.array([[13, 17, 19], [23, 29, 31]]) + y = numpy.array([[13, 17, 19], [23, 29, 31]], dtype=float) logger.debug("y: {}".format(y)) logger.debug("y.shape: {}".format(y.shape)) return x, y + @staticmethod + def build_nan_containing_x_y(): + x = numpy.array([[1,numpy.nan,2], [3,5,7], [11,13,17]], dtype=float) + logger.debug("x:\n{}".format(x)) + logger.debug("x.shape: {}".format(x.shape)) + + y = numpy.array([[19, 23, 29], [31, 37, 41], [numpy.nan, 43, 47]], dtype=float) + logger.debug("y:\n{}".format(y)) + logger.debug("y.shape: {}".format(y.shape)) + + return x, y + def test_validate_inputs(self): shape = (3,2) @@ -212,6 +224,152 @@ def test_fast_cov_x_and_y_different_shapes(self): self.assertIs(dest, r) self.assertTrue(numpy.allclose(ex, dest)) + def test_calculate_non_mask_overlaps(self): + x = numpy.zeros((3,3)) + x[0,1] = numpy.nan + x = numpy.ma.array(x, mask=numpy.isnan(x)) + logger.debug("happy path x has 1 nan - x:\n{}".format(x)) + + r = fast_cov.calculate_non_mask_overlaps(x.mask, x.mask) + logger.debug("r:\n{}".format(r)) + + expected = numpy.array([[3,2,3], [2,2,2], [3,2,3]], dtype=int) + self.assertTrue(numpy.array_equal(expected, r)) + + def test_nan_fast_cov_just_x(self): + logger.debug("*************happy path just x") + x, _ = TestFastCov.build_nan_containing_x_y() + + ex_with_nan = numpy.cov(x, rowvar=False) + logger.debug("expected with nan's - ex_with_nan:\n{}".format(ex_with_nan)) + + r = fast_cov.nan_fast_cov(x) + logger.debug("r:\n{}".format(r)) + + non_nan_locs = ~numpy.isnan(ex_with_nan) + self.assertTrue(numpy.allclose(ex_with_nan[non_nan_locs], r[non_nan_locs])) + + check_nominal_nans = [] + u = x[1:, 1] + for i in range(3): + t = x[1:, i] + c = numpy.cov(t, u, bias=False)[0,1] + check_nominal_nans.append(c) + logger.debug("calculate entries that would be nan - check_nominal_nans: {}".format(check_nominal_nans)) + self.assertTrue(numpy.allclose(check_nominal_nans, r[:, 1])) + self.assertTrue(numpy.allclose(check_nominal_nans, r[1, :])) + + def test_nan_fast_cov_x_and_y(self): + logger.debug("*************happy path x and y") + x, y = TestFastCov.build_nan_containing_x_y() + + combined = numpy.hstack([x, y]) + logger.debug("combined:\n{}".format(combined)) + logger.debug("combined.shape: {}".format(combined.shape)) + + off_diag_ind = int(combined.shape[1] / 2) + + raw_ex = numpy.cov(combined, rowvar=False) + logger.debug("raw expected produced from numpy.cov on full combined - raw_ex:\n{}".format(raw_ex)) + ex = raw_ex[:off_diag_ind, off_diag_ind:] + logger.debug("expected ex:\n{}".format(ex)) + + r = fast_cov.nan_fast_cov(x, y) + logger.debug("r:\n{}".format(r)) + + non_nan_locs = ~numpy.isnan(ex) + logger.debug("ex[non_nan_locs]: {}".format(ex[non_nan_locs])) + logger.debug("r[non_nan_locs]: {}".format(r[non_nan_locs])) + self.assertTrue(numpy.allclose(ex[non_nan_locs], r[non_nan_locs])) + + check_nominal_nans = [] + t = x[1:, 1] + for i in [1,2]: + u = y[1:, i] + c = numpy.cov(t,u) + check_nominal_nans.append(c[0,1]) + logger.debug("calculate entries that would be nan - check_nominal_nans: {}".format(check_nominal_nans)) + logger.debug("r values to compare to - r[1, 1:]: {}".format(r[1, 1:])) + self.assertTrue(numpy.allclose(check_nominal_nans, r[1, 1:])) + + check_nominal_nans = [] + u = y[:2, 0] + for i in [0, 2]: + t = x[:2, i] + c = numpy.cov(t,u) + check_nominal_nans.append(c[0,1]) + logger.debug("calculate entries that would be nan - check_nominal_nans: {}".format(check_nominal_nans)) + logger.debug("r values to compare to - r[[0,2], 0]: {}".format(r[[0,2], 0])) + self.assertTrue(numpy.allclose(check_nominal_nans, r[[0,2], 0])) + + self.assertTrue(numpy.isnan(r[1,0]), """expect this entry to be nan b/c for the intersection of x[:,1] and y[:,0] + there is only one entry in common, therefore covariance is undefined""") + + def test_nan_fast_cov_x_and_y_different_shapes(self): + logger.debug("*************happy path x and y different shapes") + x, t = TestFastCov.build_nan_containing_x_y() + y = numpy.zeros((t.shape[0], t.shape[1]+1)) + y[:, :t.shape[1]] = t + y[:, t.shape[1]] = [53, 59, 61] + + logger.debug("y.shape: {}".format(y.shape)) + logger.debug("y:\n{}".format(y)) + + combined = numpy.hstack([x, y]) + logger.debug("combined:\n{}".format(combined)) + logger.debug("combined.shape: {}".format(combined.shape)) + + raw_ex = numpy.cov(combined, rowvar=False) + logger.debug("raw expected produced from numpy.cov on full combined - raw_ex:\n{}".format(raw_ex)) + logger.debug("raw_ex.shape: {}".format(raw_ex.shape)) + + ex = raw_ex[:x.shape[1], -y.shape[1]:] + logger.debug("expected ex:\n{}".format(ex)) + logger.debug("ex.shape: {}".format(ex.shape)) + + r = fast_cov.nan_fast_cov(x, y) + logger.debug("r:\n{}".format(r)) + + non_nan_locs = ~numpy.isnan(ex) + logger.debug("ex[non_nan_locs]: {}".format(ex[non_nan_locs])) + logger.debug("r[non_nan_locs]: {}".format(r[non_nan_locs])) + self.assertTrue(numpy.allclose(ex[non_nan_locs], r[non_nan_locs])) + + check_nominal_nans = [] + t = x[1:, 1] + for i in [1,2,3]: + u = y[1:, i] + c = numpy.cov(t,u) + check_nominal_nans.append(c[0,1]) + logger.debug("calculate entries that would be nan - check_nominal_nans: {}".format(check_nominal_nans)) + logger.debug("r values to compare to - r[1, 1:]: {}".format(r[1, 1:])) + self.assertTrue(numpy.allclose(check_nominal_nans, r[1, 1:])) + + check_nominal_nans = [] + u = y[:2, 0] + for i in [0, 2]: + t = x[:2, i] + c = numpy.cov(t,u) + check_nominal_nans.append(c[0,1]) + logger.debug("calculate entries that would be nan - check_nominal_nans: {}".format(check_nominal_nans)) + logger.debug("r values to compare to - r[[0,2], 0]: {}".format(r[[0,2], 0])) + self.assertTrue(numpy.allclose(check_nominal_nans, r[[0,2], 0])) + + self.assertTrue(numpy.isnan(r[1,0]), """expect this entry to be nan b/c for the intersection of x[:,1] and y[:,0] + there is only one entry in common, therefore covariance is undefined""") + + def test_nan_fast_cov_all_nan(self): + x = numpy.zeros(3) + x[:] = numpy.nan + x = x[:, numpy.newaxis] + logger.debug("x:\n{}".format(x)) + + r = fast_cov.nan_fast_cov(x) + logger.debug("r:\n{}".format(r)) + + self.assertEqual(1, numpy.sum(numpy.isnan(r))) + + if __name__ == "__main__": setup_logger.setup(verbose=True)