diff --git a/cellxgene_schema_cli/cellxgene_schema/cli.py b/cellxgene_schema_cli/cellxgene_schema/cli.py index 1254a7ba..22b3e14d 100644 --- a/cellxgene_schema_cli/cellxgene_schema/cli.py +++ b/cellxgene_schema_cli/cellxgene_schema/cli.py @@ -36,7 +36,8 @@ def schema_cli(verbose): type=click.Path(exists=False, dir_okay=False, writable=True), ) @click.option("-i", "--ignore-labels", help="Ignore ontology labels when validating", is_flag=True) -def schema_validate(h5ad_file, add_labels_file, ignore_labels): +@click.option("-n", "--num-workers", help="Number of workers to use for parallel processing", default=1, type=int) +def schema_validate(h5ad_file, add_labels_file, ignore_labels, num_workers): # Imports are very slow so we defer loading until Click arg validation has passed logger.info("Loading dependencies") try: @@ -47,7 +48,7 @@ def schema_validate(h5ad_file, add_labels_file, ignore_labels): logger.info("Loading validator modules") from .validate import validate - is_valid, _, _ = validate(h5ad_file, add_labels_file, ignore_labels=ignore_labels) + is_valid, _, _ = validate(h5ad_file, add_labels_file, ignore_labels=ignore_labels, n_workers=num_workers) if is_valid: sys.exit(0) else: diff --git a/cellxgene_schema_cli/cellxgene_schema/utils.py b/cellxgene_schema_cli/cellxgene_schema/utils.py index e2b558f7..b58cee9e 100644 --- a/cellxgene_schema_cli/cellxgene_schema/utils.py +++ b/cellxgene_schema_cli/cellxgene_schema/utils.py @@ -6,7 +6,10 @@ from typing import Dict, List, Union import anndata as ad +import h5py import numpy as np +from anndata.compat import DaskArray +from anndata.experimental import read_dispatched, read_elem_as_dask from cellxgene_ontology_guide.ontology_parser import OntologyParser from scipy import sparse from xxhash import xxh3_64_intdigest @@ -68,7 +71,7 @@ def remap_deprecated_features(*, adata: ad.AnnData, remapped_features: Dict[str, return adata -def get_matrix_format(adata: ad.AnnData, matrix: Union[np.ndarray, sparse.spmatrix]) -> str: +def get_matrix_format(matrix: DaskArray) -> str: """ Given a matrix, returns the format as one of: csc, csr, coo, dense or unknown. @@ -84,15 +87,11 @@ def get_matrix_format(adata: ad.AnnData, matrix: Union[np.ndarray, sparse.spmatr # >>> return getattr(matrix, "format_str", "dense) # matrix_format = "unknown" - if adata.n_obs == 0 or adata.n_vars == 0: + matrix_slice = matrix[0:1, 0:1].compute() + if isinstance(matrix_slice, sparse.spmatrix): + matrix_format = matrix_slice.format + elif isinstance(matrix_slice, np.ndarray): matrix_format = "dense" - else: - matrix_slice = matrix[0:1, 0:1] - if isinstance(matrix_slice, sparse.spmatrix): - matrix_format = matrix_slice.format - elif isinstance(matrix_slice, np.ndarray): - matrix_format = "dense" - assert matrix_format in ["unknown", "csr", "csc", "coo", "dense"] return matrix_format @@ -116,7 +115,38 @@ def getattr_anndata(adata: ad.AnnData, attr: str = None): return getattr(adata, attr) -def read_h5ad(h5ad_path: Union[str, bytes, os.PathLike]) -> ad.AnnData: +def read_backed(f: h5py.File, chunk_size: int) -> ad.AnnData: + """ + Read an AnnData object from a h5py.File object, reading in matrices (dense or sparse) as dask arrays. Does not + read full matrices into memory. + + :param f: h5py.File object + :param chunk_size: size of chunks to read matrices in + :return: ad.AnnData object + """ + + def callback(func, elem_name: str, elem, iospec): + if "/layers" in elem_name or elem_name == "/X" or elem_name == "/raw/X": + if iospec.encoding_type in ( + "csr_matrix", + "csc_matrix", + ): + n_vars = elem.attrs.get("shape")[1] + return read_elem_as_dask(elem, chunks=(chunk_size, n_vars)) + elif iospec.encoding_type == "array" and len(elem.shape) == 2: + n_vars = elem.shape[1] + return read_elem_as_dask(elem, chunks=(chunk_size, n_vars)) + else: + return func(elem) + else: + return func(elem) + + adata = read_dispatched(f, callback=callback) + + return adata + + +def read_h5ad(h5ad_path: Union[str, bytes, os.PathLike], chunk_size: int = 5000) -> ad.AnnData: """ Reads h5ad into adata :params Union[str, bytes, os.PathLike] h5ad_path: path to h5ad to read @@ -124,13 +154,14 @@ def read_h5ad(h5ad_path: Union[str, bytes, os.PathLike]) -> ad.AnnData: :rtype None """ try: - adata = ad.read_h5ad(h5ad_path, backed="r") + f = h5py.File(h5ad_path) + adata = read_backed(f, chunk_size) # This code, and AnnData in general, is optimized for row access. # Running backed, with CSC, is prohibitively slow. Read the entire # AnnData into memory if it is CSC. - if (get_matrix_format(adata, adata.X) == "csc") or ( - (adata.raw is not None) and (get_matrix_format(adata, adata.raw.X) == "csc") + if (get_matrix_format(adata.X) == "csc") or ( + (adata.raw is not None) and (get_matrix_format(adata.raw.X) == "csc") ): logger.warning("Matrices are in CSC format; loading entire dataset into memory.") adata = adata.to_memory() diff --git a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py index 292cc8ec..7da436e2 100644 --- a/cellxgene_schema_cli/cellxgene_schema/validate.py +++ b/cellxgene_schema_cli/cellxgene_schema/validate.py @@ -1,5 +1,4 @@ import logging -import math import numbers import os import re @@ -7,12 +6,14 @@ from typing import Dict, List, Mapping, Optional, Tuple, Union import anndata +import dask import matplotlib.colors as mcolors import numpy as np import pandas as pd import scipy -from anndata._core.sparse_dataset import SparseDataset +from anndata.compat import DaskArray from cellxgene_ontology_guide.ontology_parser import OntologyParser +from dask.array import map_blocks from scipy import sparse from . import gencode, schema @@ -78,9 +79,6 @@ def reset(self, hi_res_size: Optional[int] = None, true_mat_size: Optional[int] self._hires_max_dimension_size = hi_res_size self._visium_and_is_single_true_matrix_size = true_mat_size - # Matrix (e.g., X, raw.X, ...) number non-zero cache - self.number_non_zero = dict() - @property def adata(self) -> anndata.AnnData: return self._adata @@ -449,43 +447,17 @@ def _validate_feature_id(self, feature_id: str, df_name: str): return @staticmethod - def _chunk_matrix( - matrix: Union[np.ndarray, sparse.spmatrix], - obs_chunk_size: Optional[int] = 10_000, - ): - """ - Iterator which chunks the _named_ or _specified_ matrix by the - first (obs) dimension - - The parameter type restrictions are strictly for ensuring that the - AnnData read fast-path is used (as of AnnData 0.8.0). - - Iterator produces a sequence of tuples, each containing - (chunk, start, end) - """ - start = 0 - n = matrix.shape[0] - for i in range(int(n // obs_chunk_size)): - logger.debug(f"_chunk_matrix [{i} of {math.ceil(n / obs_chunk_size)}]") - end = start + obs_chunk_size - yield (matrix[start:end], start, end) - start = end - if start < n: - yield (matrix[start:n], start, n) - - def _count_matrix_nonzero(self, matrix_name: str, matrix: Union[np.ndarray, sparse.spmatrix]) -> int: - if matrix_name in self.number_non_zero: - return self.number_non_zero[matrix_name] - - logger.debug(f"Counting non-zero values in {matrix_name}") - - nnz = 0 - matrix_format = get_matrix_format(self.adata, matrix) - for matrix_chunk, _, _ in self._chunk_matrix(matrix): - nnz += matrix_chunk.count_nonzero() if matrix_format != "dense" else np.count_nonzero(matrix_chunk) - - self.number_non_zero[matrix_name] = nnz - return nnz + def count_matrix_nonzero(matrix: DaskArray) -> int: + def count_nonzeros(matrix_chunk: Union[np.ndarray, sparse.spmatrix], is_sparse_matrix: bool) -> np.array: + nnz = matrix_chunk.nnz if is_sparse_matrix else np.count_nonzero(matrix_chunk) + return np.array([nnz]) + + is_sparse_matrix = get_matrix_format(matrix) in SPARSE_MATRIX_TYPES + if len(matrix.chunks[0]) > 1: + nonzeros = map_blocks(count_nonzeros, matrix, is_sparse_matrix, drop_axis=1, dtype=int).compute().sum() + else: + nonzeros = count_nonzeros(matrix.compute(), is_sparse_matrix)[0] + return nonzeros def _validate_genetic_ancestry(self): """ @@ -607,20 +579,7 @@ def _validate_column_feature_is_filtered(self, column: pd.Series, column_name: s return if sum(column) > 0: - n_nonzero = 0 - - X_format = get_matrix_format(self.adata, self.adata.X) - if X_format in SPARSE_MATRIX_TYPES: - n_nonzero = self.adata.X[:, column].count_nonzero() - - elif X_format == "dense": - n_nonzero = np.count_nonzero(self.adata.X[:, column]) - - else: - self.errors.append( - f"X matrix is of type {type(self.adata.X)}, validation of 'feature_is_filtered' " - f"cannot be completed." - ) + n_nonzero = self.count_matrix_nonzero(self.adata.X[:, column]) if n_nonzero > 0: self.errors.append( @@ -912,8 +871,7 @@ def _validate_dataframe(self, df_name: str): for column_name in df.columns: column = df[column_name] if column.dtype.name != "category": - # Check for columns with mixed values, which is not supported by anndata 0.8.0 - # TODO: check if this can be removed after upgading to anndata 0.10.0 + # Check for columns with mixed values, which is not supported by anndata value_types = {type(x) for x in column.values} if len(value_types) != 1: self.errors.append( @@ -929,16 +887,14 @@ def _validate_dataframe(self, df_name: str): f"zero observations. These categories will be removed when `--add-labels` flag is present." ) categorical_types = {type(x) for x in column.dtype.categories.values} - # Check for columns that have illegal categories, which are not supported by anndata 0.8.0 - # TODO: check if this can be removed after upgading to anndata 0.10.0 + # Check for columns that have illegal categories, which are not supported by anndata blocked_categorical_types = {bool} illegal_categorical_types = categorical_types & blocked_categorical_types if illegal_categorical_types: self.errors.append( f"Column '{column_name}' in dataframe '{df_name}' contains {illegal_categorical_types=}." ) - # Check for categorical column has mixed types, which is not supported by anndata 0.8.0 - # TODO: check if this can be removed after upgading to anndata 0.10.0 + # Check for categorical column has mixed types, which is not supported by anndata categorical_types = {type(x) for x in column.dtype.categories.values} if len(categorical_types) > 1: self.errors.append( @@ -1060,7 +1016,7 @@ def _validate_sparsity(self): # Check sparsity for x, x_name in to_validate: - matrix_format = get_matrix_format(self.adata, x) + matrix_format = get_matrix_format(x) if matrix_format == "csr": continue assert matrix_format != "unknown" @@ -1072,7 +1028,7 @@ def _validate_sparsity(self): # function is to recommend CSR for _any_ matrix with sparsity beyond # a given limit. - nnz = self._count_matrix_nonzero(x_name, x) + nnz = self.count_matrix_nonzero(x) sparsity = 1 - nnz / np.prod(x.shape) if sparsity > max_sparsity: self.warnings.append( @@ -1259,7 +1215,7 @@ def _has_valid_raw(self, force: bool = False) -> bool: self.errors.append("Raw matrix values must have type numpy.float32.") return self._raw_layer_exists - matrix_format = get_matrix_format(self.adata, x) + matrix_format = get_matrix_format(x) assert matrix_format != "unknown" self._raw_layer_exists = True is_sparse_matrix = matrix_format in SPARSE_MATRIX_TYPES @@ -1285,31 +1241,38 @@ def _has_valid_raw(self, force: bool = False) -> bool: return self._raw_layer_exists - def _validate_raw_data(self, x: Union[np.ndarray, sparse.spmatrix], is_sparse_matrix: bool): + def _validate_raw_data(self, x: DaskArray, is_sparse_matrix: bool): """ Validates the data values in the raw matrix. Matrix size is chunked for large matrices. :param x: raw matrix :param is_sparse_matrix: bool indicating if the matrix is sparse {csc, csr, coo} """ - has_row_of_zeros = False - has_invalid_nonzero_value = False - for matrix_chunk, _, _ in self._chunk_matrix(x): - if not has_row_of_zeros: + + def validate_chunk(matrix_chunk: Union[np.ndarray, sparse.spmatrix], is_sparse_matrix: bool) -> np.array: + chunk_has_row_of_zeros = False + chunk_has_invalid_nonzero_value = False + if not chunk_has_row_of_zeros: if is_sparse_matrix: row_indices, _ = matrix_chunk.nonzero() if len(set(row_indices)) != matrix_chunk.shape[0]: - has_row_of_zeros = True + chunk_has_row_of_zeros = True # else, must be dense matrix, confirm that all rows have at least 1 nonzero value elif not all(np.apply_along_axis(np.any, axis=1, arr=matrix_chunk)): - has_row_of_zeros = True + chunk_has_row_of_zeros = True - if not has_invalid_nonzero_value and self._matrix_has_invalid_nonzero_values(matrix_chunk): - has_invalid_nonzero_value = True + if not chunk_has_invalid_nonzero_value and self._matrix_has_invalid_nonzero_values(matrix_chunk): + chunk_has_invalid_nonzero_value = True - if has_row_of_zeros and has_invalid_nonzero_value: - # Fail fast, exit loop and report - break + return np.array([np.array([chunk_has_row_of_zeros, chunk_has_invalid_nonzero_value], dtype=object)]) + + if len(x.chunks[0]) > 1: + results = map_blocks(validate_chunk, x, is_sparse_matrix, dtype=object).compute() + # Combine the results from all chunks + has_row_of_zeros = any(chunk_result[0] for chunk_result in results) + has_invalid_nonzero_value = any(chunk_result[1] for chunk_result in results) + else: + has_row_of_zeros, has_invalid_nonzero_value = validate_chunk(x.compute(), is_sparse_matrix)[0] if has_row_of_zeros: self._raw_layer_exists = False @@ -1318,34 +1281,68 @@ def _validate_raw_data(self, x: Union[np.ndarray, sparse.spmatrix], is_sparse_ma self._raw_layer_exists = False self.errors.append("All non-zero values in raw matrix must be positive integers of type numpy.float32.") - def _validate_raw_data_with_in_tissue_0( - self, x: Union[np.ndarray, sparse.spmatrix, SparseDataset], is_sparse_matrix: bool - ): + def _validate_raw_data_with_in_tissue_0(self, x: DaskArray, is_sparse_matrix: bool): """ Special case validation checks for Visium data with is_single = True and in_tissue column in obs where in_tissue - has at least one value 0. Static matrix size of 4992 rows, so chunking is not required. + has at least one value 0. :param x: raw matrix - :param is_sparse_matrix: bool indicating if the matrix is sparse {csc, csr, coo} - """ - has_tissue_0_non_zero_row = False - has_tissue_1_zero_row = False - if isinstance(x, SparseDataset): - x = x.to_memory() - if is_sparse_matrix: - nonzero_row_indices, _ = x.nonzero() - else: # must be dense matrix - nonzero_row_indices = np.where(np.any(x != 0, axis=1))[0] - for i in range(x.shape[0]): - if not has_tissue_0_non_zero_row and i in nonzero_row_indices and self.adata.obs["in_tissue"].iloc[i] == 0: - has_tissue_0_non_zero_row = True - elif ( - not has_tissue_1_zero_row and i not in nonzero_row_indices and self.adata.obs["in_tissue"].iloc[i] == 1 - ): - has_tissue_1_zero_row = True - if has_tissue_0_non_zero_row and has_tissue_1_zero_row: - # exit early and report - break + :param is_sparse_matrix: bool indicating if the matrix is sparse + """ + + def validate_chunk( + matrix_chunk: Union[np.ndarray, sparse.spmatrix], is_sparse_matrix: bool, block_info: dict = None + ) -> np.array: + chunk_has_tissue_0_non_zero_row = False + chunk_has_tissue_1_zero_row = False + chunk_has_invalid_nonzero_values = False + chunk_start_row = block_info[0]["array-location"][0][0] if (block_info and block_info.get(0)) else 0 + if self._matrix_has_invalid_nonzero_values(matrix_chunk): + chunk_has_invalid_nonzero_values = True + if is_sparse_matrix: + nonzero_row_indices, _ = matrix_chunk.nonzero() + else: # must be dense matrix + nonzero_row_indices = np.where(np.any(matrix_chunk != 0, axis=1))[0] + for i in range(matrix_chunk.shape[0]): + if chunk_has_tissue_0_non_zero_row and chunk_has_tissue_1_zero_row: + # exit inner loop early + break + unchunked_i = i + chunk_start_row + if ( + not chunk_has_tissue_0_non_zero_row + and i in nonzero_row_indices + and self.adata.obs["in_tissue"].iloc[unchunked_i] == 0 + ): + chunk_has_tissue_0_non_zero_row = True + elif ( + not chunk_has_tissue_1_zero_row + and i not in nonzero_row_indices + and self.adata.obs["in_tissue"].iloc[unchunked_i] == 1 + ): + chunk_has_tissue_1_zero_row = True + return np.array( + [ + np.array( + [ + chunk_has_tissue_0_non_zero_row, + chunk_has_tissue_1_zero_row, + chunk_has_invalid_nonzero_values, + ], + dtype=object, + ) + ] + ) + + if len(x.chunks[0]) > 1: + results = map_blocks(validate_chunk, x, is_sparse_matrix, dtype=object).compute() + # Combine the results from all chunks + has_tissue_0_non_zero_row = any(chunk_result[0] for chunk_result in results) + has_tissue_1_zero_row = any(chunk_result[1] for chunk_result in results) + has_invalid_nonzero_values = any(chunk_result[2] for chunk_result in results) + else: + has_tissue_0_non_zero_row, has_tissue_1_zero_row, has_invalid_nonzero_values = validate_chunk( + x.compute(), is_sparse_matrix + )[0] if not has_tissue_0_non_zero_row: self._raw_layer_exists = False @@ -1359,7 +1356,7 @@ def _validate_raw_data_with_in_tissue_0( "Each observation with obs['in_tissue'] == 1 must have at least one " "non-zero value in its row in the raw matrix." ) - if self._matrix_has_invalid_nonzero_values(x): + if has_invalid_nonzero_values: self._raw_layer_exists = False self.errors.append("All non-zero values in raw matrix must be positive integers of type numpy.float32.") @@ -2113,7 +2110,7 @@ def validate( h5ad_path: Union[str, bytes, os.PathLike], add_labels_file: str = None, ignore_labels: bool = False, - verbose: bool = False, + n_workers: int = 1, ) -> (bool, list, bool): from .write_labels import AnnDataLabelAppender @@ -2133,22 +2130,30 @@ def validate( validator = Validator( ignore_labels=ignore_labels, ) - validator.validate_adata(h5ad_path) - logger.info(f"Validation complete in {datetime.now() - start} with status is_valid={validator.is_valid}") - - # Stop if validation was unsuccessful - if not validator.is_valid: - return False, validator.errors, False - - if add_labels_file: - label_start = datetime.now() - writer = AnnDataLabelAppender(validator) - writer.write_labels(add_labels_file) - logger.info( - f"H5AD label writing complete in {datetime.now() - label_start}, was_writing_successful: " - f"{writer.was_writing_successful}" - ) + with dask.config.set( + { + "num_workers": n_workers, + "threads_per_worker": 1, + "distributed.worker.memory.limit": "6GB", + "scheduler": "threads", + } + ): + validator.validate_adata(h5ad_path) + logger.info(f"Validation complete in {datetime.now() - start} with status is_valid={validator.is_valid}") + + # Stop if validation was unsuccessful + if not validator.is_valid: + return False, validator.errors, False + + if add_labels_file: + label_start = datetime.now() + writer = AnnDataLabelAppender(validator) + writer.write_labels(add_labels_file) + logger.info( + f"H5AD label writing complete in {datetime.now() - label_start}, was_writing_successful: " + f"{writer.was_writing_successful}" + ) - return (validator.is_valid and writer.was_writing_successful, validator.errors + writer.errors, False) + return (validator.is_valid and writer.was_writing_successful, validator.errors + writer.errors, False) - return True, validator.errors, False + return True, validator.errors, False diff --git a/cellxgene_schema_cli/requirements.txt b/cellxgene_schema_cli/requirements.txt index 18b58ada..ac80c18d 100644 --- a/cellxgene_schema_cli/requirements.txt +++ b/cellxgene_schema_cli/requirements.txt @@ -1,7 +1,8 @@ -anndata>=0.8,<0.11 +anndata==0.11.2 cellxgene-ontology-guide==1.3.0 # update before a schema migration click<9 Cython<4 +dask==2024.12.0 numpy<3 pandas>2,<3 PyYAML<7 diff --git a/cellxgene_schema_cli/tests/fixtures/examples_validate.py b/cellxgene_schema_cli/tests/fixtures/examples_validate.py index accbecfc..f6e56778 100644 --- a/cellxgene_schema_cli/tests/fixtures/examples_validate.py +++ b/cellxgene_schema_cli/tests/fixtures/examples_validate.py @@ -5,6 +5,7 @@ import os from scipy import sparse from cellxgene_schema.utils import get_hash_digest_column +from dask.array import from_array # -----------------------------------------------------------------# # General example information @@ -473,8 +474,10 @@ # --- # 4. Creating expression matrix, # X has integer values and non_raw_X has real values -X = numpy.ones([good_obs.shape[0], good_var.shape[0]], dtype=numpy.float32) -non_raw_X = sparse.csr_matrix(X.copy()) +X = from_array(sparse.csr_matrix((good_obs.shape[0], good_var.shape[0]), dtype=numpy.float32)) +X[0, 0] = 1 +X[1, 0] = 1 +non_raw_X = X.copy() non_raw_X[0, 0] = 1.5 # --- @@ -487,14 +490,14 @@ # the unittests # Valid anndata -adata = anndata.AnnData(X=sparse.csr_matrix(X), obs=good_obs, uns=good_uns, obsm=good_obsm, var=good_var) +adata = anndata.AnnData(X=X.copy(), obs=good_obs, uns=good_uns, obsm=good_obsm, var=good_var) adata.raw = adata.copy() adata.X = non_raw_X adata.raw.var.drop("feature_is_filtered", axis=1, inplace=True) # Anndata with "X" and "raw.X" but neither has actual raw values adata_no_raw_values = anndata.AnnData( - X=sparse.csr_matrix(non_raw_X), + X=non_raw_X.copy(), obs=good_obs, uns=good_uns, obsm=good_obsm, @@ -504,11 +507,11 @@ adata_no_raw_values.raw.var.drop("feature_is_filtered", axis=1, inplace=True) # Anndata with no obs nor var -adata_minimal = anndata.AnnData(X=sparse.csr_matrix(X), uns=good_uns, obsm=good_obsm) +adata_minimal = anndata.AnnData(X=X.copy(), uns=good_uns, obsm=good_obsm) # Anndata with a expression matrix that is not raw adata_non_raw = anndata.AnnData( - X=sparse.csr_matrix(non_raw_X), + X=non_raw_X.copy(), obs=good_obs, uns=good_uns, obsm=good_obsm, @@ -517,7 +520,7 @@ # Expected anndata with labels that the validator must write in obs and var adata_with_labels = anndata.AnnData( - X=sparse.csr_matrix(X), + X=X.copy(), obs=pd.concat([good_obs, obs_expected], axis=1), var=var_expected, uns=good_uns_with_labels, @@ -525,20 +528,18 @@ ) # Expected anndata with colors for categorical obs fields -adata_with_colors = anndata.AnnData( - X=sparse.csr_matrix(X), obs=good_obs, uns=good_uns_with_colors, obsm=good_obsm, var=good_var -) +adata_with_colors = anndata.AnnData(X=X.copy(), obs=good_obs, uns=good_uns_with_colors, obsm=good_obsm, var=good_var) # Expected anndata with Visium spatial data adata_visium = anndata.AnnData( - X=sparse.csr_matrix(X), obs=good_obs_visium, uns=good_uns_with_visium_spatial, obsm=good_obsm_spatial, var=good_var + X=X.copy(), obs=good_obs_visium, uns=good_uns_with_visium_spatial, obsm=good_obsm_spatial, var=good_var ) adata_visium.raw = adata_visium.copy() adata_visium.raw.var.drop("feature_is_filtered", axis=1, inplace=True) # Expected anndata with Slide-seqV2 spatial data adata_slide_seqv2 = anndata.AnnData( - X=sparse.csr_matrix(X), + X=X.copy(), obs=good_obs_slide_seqv2, uns=good_uns_with_slide_seqV2_spatial, obsm=good_obsm_spatial, @@ -546,7 +547,7 @@ ) adata_spatial_is_single_false = anndata.AnnData( - X=sparse.csr_matrix(X), + X=X.copy(), obs=good_obs_visium_is_single_false, uns=good_uns_with_is_single_false, obsm=good_obsm_spatial, @@ -619,9 +620,11 @@ ], ) -unmigrated_X = numpy.zeros([unmigrated_obs.shape[0], var_unmigrated.shape[0]], dtype=numpy.float32) +unmigrated_X = from_array( + sparse.csr_matrix(numpy.zeros([unmigrated_obs.shape[0], var_unmigrated.shape[0]], dtype=numpy.float32)) +) adata_with_labels_unmigrated = anndata.AnnData( - X=sparse.csr_matrix(unmigrated_X), + X=unmigrated_X.copy(), obs=unmigrated_obs, uns=good_uns_with_labels, var=var_unmigrated, diff --git a/cellxgene_schema_cli/tests/fixtures/h5ads/example_valid.h5ad b/cellxgene_schema_cli/tests/fixtures/h5ads/example_valid.h5ad index a1b121bd..d037317a 100644 Binary files a/cellxgene_schema_cli/tests/fixtures/h5ads/example_valid.h5ad and b/cellxgene_schema_cli/tests/fixtures/h5ads/example_valid.h5ad differ diff --git a/cellxgene_schema_cli/tests/test_schema_compliance.py b/cellxgene_schema_cli/tests/test_schema_compliance.py index f78ad6da..75aae7b0 100644 --- a/cellxgene_schema_cli/tests/test_schema_compliance.py +++ b/cellxgene_schema_cli/tests/test_schema_compliance.py @@ -2,7 +2,6 @@ Tests for schema compliance of an AnnData object """ -import tempfile import unittest from copy import deepcopy @@ -13,7 +12,7 @@ import pytest import scipy.sparse from cellxgene_schema.schema import get_schema_definition -from cellxgene_schema.utils import getattr_anndata +from cellxgene_schema.utils import getattr_anndata, read_h5ad from cellxgene_schema.validate import ( ASSAY_VISIUM_11M, ERROR_SUFFIX_IS_SINGLE, @@ -28,6 +27,7 @@ Validator, ) from cellxgene_schema.write_labels import AnnDataLabelAppender +from dask.array import from_array from fixtures.examples_validate import visium_library_id schema_def = get_schema_definition() @@ -107,17 +107,6 @@ def label_writer(validator_with_validated_adata) -> AnnDataLabelAppender: return label_writer -def save_and_read_adata(adata: anndata.AnnData) -> anndata.AnnData: - """ - Saves adata to a temporary file and reads it back. Used to test read/write errors. - :param adata: AnnData object - :return: AnnData object - """ - with tempfile.NamedTemporaryFile(suffix=".h5ad") as f: - adata.write_h5ad(f.name) - return anndata.read_h5ad(f.name) - - class TestValidAnndata: """ Tests a valid AnnData object. Most other tests below modify this AnnData object and test for failure cases. @@ -175,7 +164,7 @@ def test_sparsity(self, validator_with_adata): sparse_X = numpy.zeros([validator.adata.obs.shape[0], validator.adata.var.shape[0]], dtype=numpy.float32) sparse_X[0, 1] = 1 sparse_X[1, 1] = 1 - validator.adata.X = sparse_X + validator.adata.X = from_array(sparse_X) validator.validate_adata() assert validator.warnings == [ "WARNING: Sparsity of 'X' is 0.75 which is greater than 0.5, " @@ -215,7 +204,7 @@ def test_raw_values__invalid_spatial(self, validator_with_visium_assay, invalid_ "ERROR: Raw data may be missing: data in 'raw.X' does not meet schema requirements.", ] - @pytest.mark.parametrize("datatype", [int, "float16", "float64"]) + @pytest.mark.parametrize("datatype", [int, "float64"]) def test_raw_values__wrong_datatype(self, validator_with_adata, datatype): """ When both `adata.X` and `adata.raw.X` are present, but `adata.raw.X` values are stored as the wrong datatype @@ -285,8 +274,8 @@ def test_raw_values__contains_all_zero_rows_in_tissue_0(self, validator_with_vis validator = validator_with_visium_assay validator.adata.obs["in_tissue"] = 0 validator.adata.obs["cell_type_ontology_term_id"] = "unknown" - validator.adata.X = numpy.zeros( - [validator.adata.obs.shape[0], validator.adata.var.shape[0]], dtype=numpy.float32 + validator.adata.X = from_array( + numpy.zeros([validator.adata.obs.shape[0], validator.adata.var.shape[0]], dtype=numpy.float32) ) validator.adata.raw = validator.adata.copy() validator.adata.raw.var.drop("feature_is_filtered", axis=1, inplace=True) @@ -384,8 +373,8 @@ def test_raw_values__multiple_invalid_in_tissue_errors( validator.adata.uns["spatial"][visium_library_id]["images"]["hires"] = numpy.zeros( (1, image_size, 3), dtype=numpy.uint8 ) - validator.adata.X = numpy.zeros( - [validator.adata.obs.shape[0], validator.adata.var.shape[0]], dtype=numpy.float32 + validator.adata.X = from_array( + numpy.zeros([validator.adata.obs.shape[0], validator.adata.var.shape[0]], dtype=numpy.float32) ) validator.adata.raw = validator.adata.copy() validator.adata.raw.var.drop("feature_is_filtered", axis=1, inplace=True) @@ -451,7 +440,7 @@ def test_raw_values__matrix_chunks(self, validator_with_adata): """ Test adata is validated correctly when matrix is larger than the chunk size """ - with unittest.mock.patch.object(validator_with_adata._chunk_matrix, "__defaults__", (1,)): + with unittest.mock.patch.object(read_h5ad, "__defaults__", (1,)): validator = validator_with_adata validator.validate_adata() assert validator.errors == [] @@ -1831,7 +1820,7 @@ def test_feature_is_filtered(self, validator_with_adata): for i in range(X.shape[0]): X[i, 0] = 0 X[0, 0] = 1 - + validator.adata.X = X.map_blocks(lambda x: (x.eliminate_zeros() or x), dtype=X.dtype, meta=X._meta) validator.reset(None, 2) validator.validate_adata() assert validator.errors == [ @@ -2643,7 +2632,6 @@ def test_obsm_size_zero(self, validator_with_adata): validator = validator_with_adata adata = validator.adata adata.obsm["badsize"] = numpy.empty((2, 0)) - validator.adata = save_and_read_adata(adata) validator.validate_adata() assert validator.errors == [ "ERROR: The size of the ndarray stored for a 'adata.obsm['badsize']' MUST NOT " "be zero.", @@ -2660,7 +2648,6 @@ def test_obsp_size_zero(self, validator_with_adata): validator = validator_with_adata adata = validator.adata adata.obsp["badsize"] = numpy.empty((2, 2, 0)) - validator.adata = save_and_read_adata(adata) validator.validate_adata() assert validator.errors == [ "ERROR: The size of the ndarray stored for a 'adata.obsp['badsize']' MUST NOT be zero." @@ -2675,7 +2662,6 @@ def test_varm_size_zero(self, validator_with_adata): validator = validator_with_adata adata = validator.adata adata.varm["badsize"] = numpy.empty((4, 0)) - validator.adata = save_and_read_adata(adata) validator.validate_adata() assert validator.errors == [ "ERROR: The size of the ndarray stored for a 'adata.varm['badsize']' MUST NOT be " "zero." @@ -2690,7 +2676,6 @@ def test_varp_size_zero(self, validator_with_adata): validator = validator_with_adata adata = validator.adata adata.varp["badsize"] = numpy.empty((4, 4, 0)) - validator.adata = save_and_read_adata(adata) validator.validate_adata() assert validator.errors == [ "ERROR: The size of the ndarray stored for a 'adata.varp['badsize']' MUST NOT be zero." diff --git a/cellxgene_schema_cli/tests/test_utils.py b/cellxgene_schema_cli/tests/test_utils.py index 69d8808e..4a88a515 100644 --- a/cellxgene_schema_cli/tests/test_utils.py +++ b/cellxgene_schema_cli/tests/test_utils.py @@ -155,4 +155,3 @@ def test_read_h5ad(self): h5ad_path = h5ad_valid adata = read_h5ad(h5ad_path) assert isinstance(adata, AnnData) - assert adata.isbacked diff --git a/cellxgene_schema_cli/tests/test_validate.py b/cellxgene_schema_cli/tests/test_validate.py index cd7652bf..230061a2 100644 --- a/cellxgene_schema_cli/tests/test_validate.py +++ b/cellxgene_schema_cli/tests/test_validate.py @@ -22,6 +22,7 @@ validate, ) from cellxgene_schema.write_labels import AnnDataLabelAppender +from dask.array import from_array from fixtures.examples_validate import adata as adata_valid from fixtures.examples_validate import ( adata_minimal, @@ -370,13 +371,15 @@ def test__validate_spatial_visium_ok(self): validator.validate_adata() assert not validator.errors + @mock.patch("cellxgene_schema.validate.VISIUM_AND_IS_SINGLE_TRUE_MATRIX_SIZE", 2) def test__validate_from_file(self): - """Testing compatibility with SparseDatset types in Anndata""" + """Testing compatibility with SparseDataset types in Anndata""" validator: Validator = Validator() validator._set_schema_def() + validator._visium_and_is_single_true_matrix_size = 2 with tempfile.TemporaryDirectory() as temp_dir: - file_path = os.path.join(temp_dir, "slide_seqv2.h5ad") - adata_slide_seqv2.write_h5ad(file_path) + file_path = os.path.join(temp_dir, "visium.h5ad") + adata_visium.write_h5ad(file_path) # Confirm spatial is valid. validator.validate_adata(file_path) assert not validator.errors @@ -386,7 +389,7 @@ def test__validate_spatial_visium_dense_matrix_ok(self): validator._set_schema_def() validator.adata = adata_visium.copy() validator._visium_and_is_single_true_matrix_size = 2 - validator.adata.X = validator.adata.X.toarray() + validator.adata.X = from_array(validator.adata.X.compute().toarray()) validator.adata.raw = validator.adata.copy() validator.adata.raw.var.drop("feature_is_filtered", axis=1, inplace=True) # Confirm spatial is valid. @@ -1251,15 +1254,15 @@ def create_validator(data: Union[ndarray, spmatrix], matrix_format: str) -> Vali "data, matrix_format, expected_result", [ # Test case with integer values in a dense matrix - (np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32), "dense", True), + (from_array(np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32)), "dense", True), # Test case with float values in a dense matrix - (np.array([[1.1, 2.2, 3.3], [4.4, 5.5, 6.6]]), "dense", False), + (from_array(np.array([[1.1, 2.2, 3.3], [4.4, 5.5, 6.6]])), "dense", False), # Test case with integer values in a sparse matrix (CSR format) - (sparse.csr_matrix([[1, 0, 3], [0, 5, 0]], dtype=np.float32), "csr", True), + (from_array(sparse.csr_matrix([[1, 0, 3], [0, 5, 0]], dtype=np.float32)), "csr", True), # Test case with float values in a sparse matrix (CSC format) - (sparse.csc_matrix([[1.1, 0, 3.3], [0, 5.5, 0]]), "csc", False), + (from_array(sparse.csc_matrix([[1.1, 0, 3.3], [0, 5.5, 0]])), "csc", False), # Test case with mixed integer and float values in a dense matrix - (np.array([[1, 2.2, 3], [4.4, 5, 6.6]]), "dense", False), + (from_array(np.array([[1, 2.2, 3], [4.4, 5, 6.6]])), "dense", False), ], ) def test_has_valid_raw(self, data, matrix_format, expected_result):