diff --git a/cooltools/__init__.py b/cooltools/__init__.py index 690a9c15..816b2f51 100644 --- a/cooltools/__init__.py +++ b/cooltools/__init__.py @@ -32,3 +32,4 @@ from .api.snipping import pileup from .api.directionality import directionality from .api.insulation import insulation +from .api.dotfinder import dots diff --git a/cooltools/api/dotfinder.py b/cooltools/api/dotfinder.py index 718eae6a..f7c399d9 100644 --- a/cooltools/api/dotfinder.py +++ b/cooltools/api/dotfinder.py @@ -1,10 +1,78 @@ """ Collection of functions related to dot-calling +The main user-facing API function is: +dots( + clr, + expected, + expected_value_col="balanced.avg", + clr_weight_name="weight", + view_df=None, + kernels=None, + max_loci_separation=10_000_000, + max_nans_tolerated=1, + n_lambda_bins=40, + lambda_bin_fdr=0.1, + clustering_radius=20_000, + cluster_filtering=None, + tile_size=5_000_000, + nproc=1, +) +This function implements HiCCUPS-style dot calling, but enables user-specified +modifications at multiple steps. The current implementation makes two passes +over the input data, first to create a histogram of pixel enrichment values, +and second to extract significantly enriched pixels. + + * The function starts with compatibility verifications + for the `clr`, `expected` and `view` of interest. + * Recommendation or verification for `kernels` is done next. + Custom kernels must satisfy properties including: square shape, + equal sizes, odd sizes, zeros in the middle, etc. By default, + HiCCUPS-style kernels are recommended based on the binsize. + * Lambda bins are defined for multiple hypothesis + testing separately for different value ranges of the locally adjusted expected. + Currently, log-binned lambda-bins are hardcoded using a pre-defined + BASE of 2^(1/3). `n_lambda_bins` controls the total number of bins. + * Genomic regions in the specified `view`(all chromosomes by default) + are split into smaller tiles of size `tile_size`. + * `scoring_and_histogramming_step()` is performed independently + on the genomic tiles. In this step, locally adjusted expected is + calculated using convolution kernels for each pixel in the tile. + All surveyed pixels are histogrammed according to their adjusted + expected and raw observed counts. Locally adjusted expected is + not stored in memory. + * Chunks of histograms are aggregated together and a modified BH-FDR + procedure is applied to the result in `determine_thresholds()`. + This returns thresholds for statistical significance + in each lambda-bin (for observed counts), along with the adjusted + p-values (q-values). + * Calculated thresholds are used to extract statistically significant + pixels in `scoring_and_extraction_step()`. Because locally adjusted + expected is not stored in memory, it is re-caluclated + during this step, which makes it computationally intensive. + Locally adjusted expected values are required in order to apply + different thresholds of significance depending on the lambda-bin. + * Returned filtered pixels, or 'dots', are significantly enriched + relative to their locally adjusted expecteds and thus have potential + biological interest. Dots are further annotated with their + genomic coordinates and q-values (adjusted p-values) for + all applied kernels. + * All further steps perform optional post-processing on called dots + - enriched pixels that are within `clustering_radius` of each other + are clustered together and the brightest one is selected as the + representative position of a dot. 
+ - cluster-representatives along with "singletons" (enriched pixels + that are not part of any cluster) can be subjected to further + empirical enrichment filtering in `cluster_filtering_hiccups()`. This + both requires clustered dots exceed prescribed enrichment thresholds + relative to their local neighborhoods and that singletons pass an + even more stringent q-value threshold. """ + from functools import partial, reduce import multiprocess as mp import logging +import warnings from scipy.linalg import toeplitz from scipy.ndimage import convolve @@ -16,10 +84,14 @@ import cooler from ..lib.numutils import LazyToeplitz, get_kernel -from ..lib.checks import is_compatible_viewframe, is_cooler_balanced -from ..lib.common import make_cooler_view +from ..lib.common import assign_regions, make_cooler_view +from ..lib.checks import ( + is_cooler_balanced, + is_compatible_viewframe, + is_valid_expected, +) -import bioframe +from bioframe import make_viewframe logging.basicConfig(level=logging.INFO) @@ -30,13 +102,32 @@ nans_inkernel_name = lambda kernel_name: f"la_exp.{kernel_name}.nnans" bin1_id_name = "bin1_id" bin2_id_name = "bin2_id" +bedpe_required_cols = [ + "chrom1", + "start1", + "end1", + "chrom2", + "start2", + "end2", +] + + +# define basepairs to bins for clarity +def bp_to_bins(basepairs, binsize): + return int(basepairs / binsize) -def recommend_kernel_params(binsize): +def recommend_kernels(binsize): """ - Recommned kernel parameters for the - standard convolution kernels, 'donut', etc - same as in Rao et al 2014 + Return a recommended set of convolution kernels for dot-calling + based on the resolution, or binsize, of the input data. + + This function currently recommends the four kernels used in the HiCCUPS method: + donut, horizontal, vertical, lowerleft. Kernels are recommended for resolutions + near 5 kb, 10 kb, and 25 kb. Dots are not typically visible at lower resolutions + (binsize >28kb) and the majority of datasets are too sparse for dot-calling + at very high resolutions (<4kb). Given this, default kernels are not + recommended for resolutions outside this range. Parameters ---------- @@ -45,47 +136,91 @@ def recommend_kernel_params(binsize): Returns ------- - (w,p) : (integer, integer) - tuple of the outer and inner kernel sizes + kernels : {str:ndarray} + dictionary of convolution kernels as ndarrays, with their + names as keys. """ - # kernel parameters depend on the cooler resolution - # TODO: rename w, p to wid, pix probably, or _w, _p to avoid naming conflicts + # define "donut" parameters w and p based on the resolution of the data + # w is the outside half-width, and p is the internal half-width. if binsize > 28000: - # > 30 kb - is probably too much ... raise ValueError( - f"Provided cooler has resolution {binsize} bases," - " which is too coarse for analysis." + f"Provided cooler has resolution of {binsize} bases," + " which is too coarse for automated kernel recommendation." + " Provide custom kernels to proceed." ) elif binsize >= 18000: - # ~ 20-25 kb: w, p = 3, 1 elif binsize >= 8000: - # ~ 10 kb w, p = 5, 2 elif binsize >= 4000: - # ~5 kb w, p = 7, 4 else: - # < 5 kb - is probably too fine ... raise ValueError( f"Provided cooler has resolution {binsize} bases," - " which is too fine for analysis." + " which is too fine for automated kernel recommendation. Provide custom kernels" + " to proceed." 
        )
 
-    # return the results:
-    return w, p
+    logging.info(
+        f"Using recommended donut-based kernels with w={w}, p={p} for binsize={binsize}"
+    )
+    # standard set of 4 kernels used in Rao et al 2014
+    # 'upright' is a symmetrical inversion of "lowleft", not needed.
+    kernel_types = ["donut", "vertical", "horizontal", "lowleft"]
+    # generate standard kernels - consider providing custom ones
+    kernels = {k: get_kernel(w, p, k) for k in kernel_types}
 
-def annotate_pixels_with_qvalues(
-    pixels_df, qvalues, kernels, inplace=False, obs_raw_name=observed_count_name
-):
+    return kernels
+
+
+def is_compatible_kernels(kernels, binsize, max_nans_tolerated):
+    """
+    TODO implement checks for kernels:
+     - matrices are of the same size
+     - they should be square (too restrictive ? maybe pad with 0 as needed)
+     - dimensions are odd, to have a center pixel to refer to
+     - they can be turned into int 1/0 ones (too restrictive ? allow weighted kernels ?)
+     - the central pixel should be zero perhaps (unless weights are allowed 4sure)
+     - maybe introduce an upper limit to the size - to avoid crazy long calculations
+     - check relative to the binsize maybe ? what's the criteria ?
    """
-    Add columns with the qvalues to a DataFrame of pixels
-    ... detailed but unedited notes ...
-    Extract q-values using l-chunks and IntervalIndex.
-    we'll do it in an ugly but workign fashion, by simply
-    iteration over pairs of obs, la_exp and extracting needed qvals
-    one after another
-    ...
+
+    # kernels must be a dict with kernel-names as keys
+    # and kernel ndarrays as values.
+    if not isinstance(kernels, dict):
+        raise ValueError(
+            "'kernels' must be a dictionary with name-keys and ndarrays-values."
+        )
+
+    # deduce kernel_width - overall footprint
+    kernel_widths = [len(k) for kn, k in kernels.items()]
+    # kernels must have the same width for now:
+    if min(kernel_widths) != max(kernel_widths):
+        raise ValueError(f"all 'kernels' must have the same size, now: {kernel_widths}")
+    # now extract their dimensions:
+    kernel_width = max(kernel_widths)
+    kernel_half_width = (kernel_width - 1) / 2  # former w parameter
+    if (kernel_half_width <= 0) or not kernel_half_width.is_integer():
+        raise ValueError(
+            f"Size of the convolution kernels has to be odd and >= 3, currently {kernel_width}"
+        )
+
+    # once kernel parameters are set up, check max_nans_tolerated
+    # to make sure kernel footprints overlapping 1 side with the
+    # NaN-filled row/column are not "allowed"
+    if not max_nans_tolerated <= kernel_width:
+        raise ValueError(
+            f"Too many NaNs allowed max_nans_tolerated={max_nans_tolerated}"
+        )
+    # otherwise this may lead to scoring the same pixel twice, i.e. duplicates.
+
+    # return True if everything passes
+    return True
+
+
+def annotate_pixels_with_qvalues(pixels_df, qvalues, obs_raw_name=observed_count_name):
+    """
+    Add columns with the qvalues to a DataFrame of scored pixels
 
     Parameters
     ----------
@@ -96,46 +231,42 @@
     qvalues : dict of DataFrames
         A dictionary with keys being kernel names and values DataFrames
         storing q-values for each observed count values in each lambda-
-        chunk. Colunms are Intervals defined by 'ledges' boundaries.
+        bin. Columns are Intervals defined by 'ledges' boundaries.
        Rows corresponding to a range of observed count values.
-    kernels : dict
-        A dictionary with keys being kernels names and values being ndarrays
-        representing those kernels.
+    obs_raw_name : str
+        Name of the column/field that carries the number of counts per pixel,
+        i.e. observed raw counts.
Returns ------- pixels_qvalue_df : pandas.DataFrame - DataFrame of pixels with additional columns - storing qvalues corresponding to the observed - count value of a given pixel, given kernel-type, - and a lambda-chunk. - - Notes - ----- - Should be applied to a filtered DF of pixels, otherwise would - be too resource-hungry. + DataFrame of pixels with additional columns la_exp.{k}.qval, + storing q-values (adjusted p-values) corresponding to the count + value of a pixel, its kernel, and a lambda-bin it belongs to. """ - if inplace: - pixels_qvalue_df = pixels_df - else: - # let's do it "safe" - using a copy: - pixels_qvalue_df = pixels_df.copy() - # attempting to extract q-values using l-chunks and IntervalIndex: - # we'll do it in an ugly but workign fashion, by simply - # iteration over pairs of obs, la_exp and extracting needed qvals - # one after another ... - for k in kernels: - pixels_qvalue_df[f"la_exp.{k}.qval"] = [ - qvalues[k].loc[o, e] - for o, e in pixels_df[[obs_raw_name, f"la_exp.{k}.value"]].itertuples( - index=False - ) - ] - # qvalues : dict - # A dictionary with keys being kernel names and values pandas.DataFrame-s - # storing q-values: each column corresponds to a lambda-chunk, - # while rows correspond to observed pixels values. - return pixels_qvalue_df + # do it "safe" - using a copy: + pixels_qvalue_df = pixels_df.copy() + # columns to return + cols = list(pixels_qvalue_df.columns) + # will do it efficiently using "melted" qvalues table: + for k, qval_df in qvalues.items(): + lbins = pd.IntervalIndex(qval_df.columns) + pixels_qvalue_df["lbins"] = pd.cut( + pixels_qvalue_df[f"la_exp.{k}.value"], bins=lbins + ) + pixels_qvalue_df = pixels_qvalue_df.merge( + # melted qval_df columns: [counts, la_exp.k.value, value] + qval_df.melt(ignore_index=False).reset_index(), + left_on=[obs_raw_name, "lbins"], + right_on=[obs_raw_name, f"la_exp.{k}.value"], + suffixes=("", "_"), + ) + qval_col_name = f"la_exp.{k}.qval" + pixels_qvalue_df = pixels_qvalue_df.rename(columns={"value": qval_col_name}) + cols.append(qval_col_name) + + # return q-values annotated pixels + return pixels_qvalue_df.loc[:, cols] def clust_2D_pixels( @@ -145,7 +276,6 @@ def clust_2D_pixels( bin2_id_name="bin2_id", clust_label_name="c_label", clust_size_name="c_size", - verbose=True, ): """ Group significant pixels by proximity using Birch clustering. We use @@ -172,8 +302,6 @@ def clust_2D_pixels( Name of the cluster of pixels label. "c_label" by default. clust_size_name : str Name of the cluster of pixels size. "c_size" by default. - verbose : bool - Print verbose clustering summary report defaults is True. 
Returns ------- @@ -224,14 +352,10 @@ def clust_2D_pixels( # take centroids corresponding to labels (as many as needed): centroids_per_pixel = np.take(clustered_centroids, clustered_labels, axis=0) - if verbose: - # prepare clustering summary report: - msg = ( - "Clustering is completed:\n" - + f"{uniq_counts.size} clusters detected\n" - + f"{uniq_counts.mean():.2f}+/-{uniq_counts.std():.2f} mean size\n" - ) - logging.info(msg) + # clustering message: + logging.info( + f"detected {uniq_counts.size} clusters of {uniq_counts.mean():.2f}+/-{uniq_counts.std():.2f} size" + ) # create output DataFrame centroids_n_labels_df = pd.DataFrame( @@ -252,120 +376,86 @@ def clust_2D_pixels( ################################## -def square_matrix_tiling(start, stop, step, edge, square=False, verbose=False): +def tile_square_matrix(matrix_size, offset, tile_size, pad=0): """ - Generate a stream of tiling coordinates that guarantee to cover an entire - matrix. cis-signal only! + Generate a stream of coordinates of tiles that cover a matrix of a given size. + Matrix has to be square, on-digaonal one: e.g. corresponding to a chromosome + or a chromosomal arm. Parameters ---------- - start : int - Starting position of the matrix slice to be tiled (inclusive, bins, - 0-based). - stop : int - End position of the matrix slice to be tiled (exclusive, bins, 0-based). - step : int - Requested size of the tiles. Boundary tiles may or may not be of - 'step', see 'square'. - edge : int - Small edge around each tile to be included in the yielded coordinates. - square : bool, optional + matrix_size : int + Size of a squared matrix + offset : int + Offset coordinates of generated tiles by 'offset' + tile_size : int + Requested size of the tiles. Tiles near + the right and botoom edges could be rectangular + and smaller then 'tile_size' + pad : int + Small padding around each tile to be included in the yielded coordinates. Yields ------ - Pairs of indices for every chunk use those indices [cstart:cstop) to fetch - chunks from the cooler-object: - >>> clr.matrix()[cstart:cstop, cstart:cstop] + Pairs of indices/coordinates of every tile: (start_i, end_i), (start_j, end_j) Notes ----- - Each slice is characterized by the coordinate - of the top-left corner and size. - - * * * * * * * * * * * * * - * * * * - * * * ... * - * * * * - * * * * * * * * * * - * * * - * * ... * - * * * - * * * * * * - * * - * * - * * - * * * * * * * * * * * * * - - Square parameter determines behavior - of the tiling function, when the - size of the matrix is not an exact - multiple of the 'step': - - square = False - * * * * * * * * * * * - * * * * - * * * * - * * * * - * * * * * * * * * * * - * * * * - * * * * - * * * * - * * * * * * * * * * * - * * * * - * * * * - * * * * * * * * * * * - WARNING: be carefull with extracting - expected in this case, as it is - not trivial at all !!! - - square = True - * * * * * * * * * * * - * * * * * - * * * * * - * * * * * - * * * * * * * * * * * - * * * * * - * * * * * - * * * * * * * * * * * - * * * * * * * * * * * - * * * * * - * * * * * - * * * * * * * * * * * - + Generated tiles coordinates [start_i,end_i) , [start_i,end_i) + can be used to fetch heatmap tiles from cooler: + >>> clr.matrix()[start_i:end_i, start_j:end_j] + + 'offset' is useful when a given matrix is part of a + larger matrix (a given chromosome or arm), and thus + all coordinated needs to be offset to get absolute + coordinates. 
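+
+    For example (an illustrative case), splitting a 10-bin matrix into 4-bin
+    tiles with pad=1 yields 3 x 3 = 9 padded tile spans; the first two are:
+
+    >>> list(tile_square_matrix(10, 0, 4, pad=1))[:2]
+    [((0, 5), (0, 5)), ((0, 5), (3, 9))]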
+ + Tiles are non-overlapping (pad=0), but tiles near + the right and bottom edges could be rectangular: + + * * * * * * * * * + * * * * + * * * * + * * * * * * * * + * * * + * * ... * + * * * * * + * * + * * * * * * * * * """ - size = stop - start - tiles = size // step + bool(size % step) + # number of tiles along each axis + if matrix_size % tile_size: + num_tiles = matrix_size // tile_size + 1 + else: + num_tiles = matrix_size // tile_size - if verbose: + logging.info( + f" matrix {matrix_size}X{matrix_size} to be split into {num_tiles * num_tiles} tiles of {tile_size}X{tile_size}." + ) + if pad: logging.info( - f"matrix of size {size}X{size} to be splitted\n" - + f" into square tiles of size {step}.\n" - + f" A small 'edge' of size w={edge} is added, to allow for\n" - + " meaningfull convolution around boundaries.\n" - + f" Resulting number of tiles is {tiles * tiles}" + f" tiles are padded (width={pad}) to enable convolution near the edges" ) - for tx in range(tiles): - for ty in range(tiles): + # generate 'num_tiles X num_tiles' tiles + for ti in range(num_tiles): + for tj in range(num_tiles): - lwx = max(0, step * tx - edge) - rwx = min(size, step * (tx + 1) + edge) - if square and (rwx >= size): - lwx = size - step - edge + start_i = max(0, tile_size * ti - pad) + start_j = max(0, tile_size * tj - pad) - lwy = max(0, step * ty - edge) - rwy = min(size, step * (ty + 1) + edge) - if square and (rwy >= size): - lwy = size - step - edge + end_i = min(matrix_size, tile_size * (ti + 1) + pad) + end_j = min(matrix_size, tile_size * (tj + 1) + pad) - yield (lwx + start, rwx + start), (lwy + start, rwy + start) + yield (start_i + offset, end_i + offset), (start_j + offset, end_j + offset) -def heatmap_tiles_generator_diag(clr, view_df, pad_size, tile_size, band_to_cover): +def generate_tiles_diag_band(clr, view_df, pad_size, tile_size, band_to_cover): """ - A generator yielding heatmap tiles that are needed to cover the requested - band_to_cover around diagonal. Each tile is "padded" with pad_size edge to - allow proper kernel-convolution of pixels close to boundary. + A generator yielding corrdinates of heatmap tiles that are needed to cover + the requested band_to_cover around diagonal. Each tile is "padded" with + the pad of size 'pad_size' to allow for convolution near the boundary of + a tile. Parameters ---------- @@ -383,83 +473,40 @@ def heatmap_tiles_generator_diag(clr, view_df, pad_size, tile_size, band_to_cove Typically correspond to the max_loci_separation for called dots. Returns ------- - tile : tuple - Generator of tuples of three, which contain - chromosome name, row index of the tile, - column index of the tile (region_name, tilei, tilej). - + tile_coords : tuple + Generator of tile coordinates, i.e. tuples of three: + (region_name, tile_span_i, tile_span_j), where 'tile_span_i/j' + each is a tuple of bin ids (bin_start, bin_end). """ for chrom, start, end, region_name in view_df.itertuples(index=False): - region_begin, region_end = clr.extent((chrom, start, end)) - for tilei, tilej in square_matrix_tiling( - region_begin, region_end, tile_size, pad_size + region_start, region_end = clr.extent((chrom, start, end)) + region_size = region_end - region_start + for tile_span_i, tile_span_j in tile_square_matrix( + matrix_size=region_size, + offset=region_start, + tile_size=tile_size, + pad=pad_size, ): # check if a given tile intersects with # with the diagonal band of interest ... 
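+            # tile_diag_start/end below bound the diagonal offsets (bin2_id - bin1_id)
+            # spanned by the pixels of this tile; the tile is kept only if that range
+            # overlaps the requested band by more than 2*pad_size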
- diag_from = tilej[0] - tilei[1] - diag_to = tilej[1] - tilei[0] - # - band_from = 0 - band_to = band_to_cover + tile_diag_start = tile_span_j[0] - tile_span_i[1] + tile_diag_end = tile_span_j[1] - tile_span_i[0] + # TODO allow more flexible definition of a band to cover + band_start, band_end = 0, band_to_cover # we are using this >2*padding trick to exclude # tiles from the lower triangle from calculations ... - if (min(band_to, diag_to) - max(band_from, diag_from)) > 2 * pad_size: - yield region_name, tilei, tilej - - -################################## -# kernel-convolution related: -################################## -def _convolve_and_count_nans(O_bal, E_bal, E_raw, N_bal, kernel): - """ - Dense versions of a bunch of matrices needed for convolution and - calculation of number of NaNs in a vicinity of each pixel. And a kernel to - be provided of course. - - """ - # a matrix filled with the kernel-weighted sums - # based on a balanced observed matrix: - KO = convolve(O_bal, kernel, mode="constant", cval=0.0, origin=0) - # a matrix filled with the kernel-weighted sums - # based on a balanced expected matrix: - KE = convolve(E_bal, kernel, mode="constant", cval=0.0, origin=0) - # get number of NaNs in a vicinity of every - # pixel (kernel's nonzero footprint) - # based on the NaN-matrix N_bal. - # N_bal is shared NaNs between O_bal E_bal, - # is it redundant ? - NN = convolve( - N_bal.astype(np.int64), - # we have to use kernel's - # nonzero footprint: - (kernel != 0).astype(np.int64), - mode="constant", - # there are only NaNs - # beyond the boundary: - cval=1, - origin=0, - ) - ###################################### - # using cval=0 for actual data and - # cval=1 for NaNs matrix reduces - # "boundary issue" to the "number of - # NaNs"-issue - # #################################### - - # now finally, E_raw*(KO/KE), as the - # locally-adjusted expected with raw counts as values: - Ek_raw = np.multiply(E_raw, np.divide(KO, KE)) - # return locally adjusted expected and number of NaNs - # in the form of dense matrices: - return Ek_raw, NN + if ( + min(band_end, tile_diag_end) - max(band_start, tile_diag_start) + ) > 2 * pad_size: + yield region_name, tile_span_i, tile_span_j ######################################################################## # this is the MAIN function to get locally adjusted expected ######################################################################## def get_adjusted_expected_tile_some_nans( - origin, observed, expected, bal_weights, kernels, balance_factor=None, verbose=False + origin_ij, observed, expected, bal_weights, kernels ): """ Get locally adjusted expected for a collection of local-filters (kernels). @@ -509,7 +556,7 @@ def get_adjusted_expected_tile_some_nans( Parameters ---------- - origin : (int,int) tuple + origin_ij : (int,int) tuple tuple of interegers that specify the location of an observed matrix slice. Measured in bins, not in nucleotides. @@ -523,10 +570,10 @@ def get_adjusted_expected_tile_some_nans( bal_weights : numpy.ndarray or (numpy.ndarray, numpy.ndarray) 1D vector used to turn raw observed into balanced observed for a slice of - a matrix with the origin on the diagonal; + a matrix with the origin_ij on the diagonal; and a tuple/list of a couple of 1D arrays in case it is a slice with an arbitrary - origin. + origin_ij. kernels : dict of (str, numpy.ndarray) dictionary of kernels/masks to perform convolution of the heatmap. Kernels @@ -538,42 +585,34 @@ def get_adjusted_expected_tile_some_nans( to be considered significant. 
        Dictionay keys must contain names for each kernel.
-        Note, scipy.ndimage.convove flips kernel
-        first and only then applies it to matrix.
-    balance_factor: float
-        Multiplicative Balancing factor:
-        balanced matrix multiplied by this factor
-        sums up to the total number of reads (taking
-        symmetry into account) instead of number of
-        bins in matrix. defaults to None and skips
-        a lowleft KernelObserved factor calculation.
-        !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-        We need this to test how dynamic donut size
-        is affecting peak calling results.
-        !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-    verbose: bool
-        Set to True to print some progress
-        messages to stdout.
+        Note, scipy.ndimage.convolve first flips kernel
+        and only then applies it to matrix.
 
    Returns
    -------
    peaks_df : pandas.DataFrame
-        sparsified DataFrame that stores results of
-        locally adjusted calculations for every kernel
-        for a given slice of input matrix. Multiple
-        instences of such 'peaks_df' can be concatena-
-        ted and deduplicated for the downstream analysis.
-        Reported columns:
-        bin1_id - bin1_id index (row), adjusted to origin
-        bin2_id - bin bin2_id index, adjusted to origin
+        DataFrame with the results of locally adjusted calculations
+        for every kernel for a given slice of input matrix.
+
+    Notes
+    -----
+
+    Reported columns:
        bin1_id - bin1_id index (row), adjusted to tile_start_i
        bin2_id - bin bin2_id index, adjusted to tile_start_j
        la_exp - locally adjusted expected (for each kernel)
        la_nan - number of NaNs around (each kernel's footprint)
        exp.raw - global expected, rescaled to raw-counts
-        obs.raw - observed values in raw-counts.
+        obs.raw(counts) - observed values in raw-counts.
+
+    Depending on the initial tiling of the interaction matrix,
+    concatenated `peaks_df` may require "deduplication", as some pixels
+    can be evaluated in several tiles (e.g. near the tile edges).
+    Default tiling in the `dots` function should avoid this problem.
 
    """
-    # extract origin coordinate of this tile:
-    io, jo = origin
+    # extract origin_ij coordinate of this tile:
+    io, jo = origin_ij
    # let's extract full matrices and ice_vector:
    O_raw = observed  # raw observed, no need to copy, no modifications.
    E_bal = np.copy(expected)
@@ -590,12 +629,6 @@
            "a tuple/list of a couple of numpy.ndarray-s"
            "for a slice of matrix with an arbitrary origin."
        )
-    # kernels must be a dict with kernel-names as keys
-    # and kernel ndarrays as values.
-    if not isinstance(kernels, dict):
-        raise ValueError(
-            "'kernels' must be a dictionary" "with name-keys and ndarrays-values."
-        )
 
    # balanced observed, from raw-observed
    # by element-wise multiply:
@@ -607,7 +640,7 @@
    # and also to provide fair locally adjusted expected
    # estimation for pixels very close to diagonal, whose
    # "donuts"(kernels) would be crossing the main diagonal.
-    # The trickiest thing here would be dealing with the origin: io,jo.
+    # The trickiest thing here would be dealing with the origin_ij: io,jo.
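+    # note: the tile's rows start at bin io and its columns at bin jo, so pixels
+    # with (column - row) < (io - jo) lie below the main diagonal of the full
+    # matrix; tril_indices_from with k=(io - jo) - 1 selects exactly those pixels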
O_bal[np.tril_indices_from(O_bal, k=(io - jo) - 1)] = np.nan E_bal[np.tril_indices_from(E_bal, k=(io - jo) - 1)] = np.nan @@ -626,7 +659,6 @@ def get_adjusted_expected_tile_some_nans( # think about usinf copyto and where functions later: # https://stackoverflow.com/questions/6431973/how-to-copy-data-from-a-numpy-array-to-another # - # # we are going to accumulate all the results # into a DataFrame, keeping NaNs, and other # unfiltered results (even the lower triangle for now): @@ -642,18 +674,6 @@ def get_adjusted_expected_tile_some_nans( # kernel paramters such as width etc # are taken into account implicitly ... ######################################## - ##################### - # unroll _convolve_and_count_nans function back - # for us to test the dynamic donut criteria ... - ##################### - # Ek_raw, NN = _convolve_and_count_nans(O_bal, - # E_bal, - # E_raw, - # N_bal, - # kernel) - # Dense versions of a bunch of matrices needed for convolution and - # calculation of number of NaNs in a vicinity of each pixel. And a kernel to - # be provided of course. # a matrix filled with the kernel-weighted sums # based on a balanced observed matrix: KO = convolve(O_bal, kernel, mode="constant", cval=0.0, origin=0) @@ -685,16 +705,9 @@ def get_adjusted_expected_tile_some_nans( # locally-adjusted expected with raw counts as values: Ek_raw = np.multiply(E_raw, np.divide(KO, KE)) - # this is the place where we would need to extract - # some results of convolution and multuplt it by the - # appropriate factor "cooler._load_attrs(‘bins/weight’)[‘scale’]" ... - if balance_factor and (kernel_name == "lowleft"): - peaks_df[f"factor_balance.{kernel_name}.KerObs"] = ( - balance_factor * KO.flatten() - ) - # KO*balance_factor: to be compared with 16 ... - if verbose: - logging.info(f"Convolution with kernel {kernel_name} is complete.") + logging.debug( + f"Convolution with kernel {kernel_name} is done for tile @ {io} {jo}." + ) # # accumulation into single DataFrame: # store locally adjusted expected for each kernel @@ -707,22 +720,8 @@ def get_adjusted_expected_tile_some_nans( # aggregated over all kernels ... ##################################### peaks_df["exp.raw"] = E_raw.flatten() - # obs.raw -> count peaks_df["count"] = O_raw.flatten() - # TO BE REFACTORED/deprecated ... - # compatibility with legacy API is completely BROKEN - # post-processing allows us to restore it, see tests, - # but we pay with the processing speed for it. - mask_ndx = pd.Series(0, index=peaks_df.index, dtype=np.bool) - for kernel_name, kernel in kernels.items(): - # accummulating with a vector full of 'False': - mask_ndx_kernel = ~np.isfinite(peaks_df["la_exp." + kernel_name + ".value"]) - mask_ndx = np.logical_or(mask_ndx_kernel, mask_ndx) - - # returning only pixels from upper triangle of a matrix - # is likely here to stay: - upper_band = peaks_df["bin1_id"] < peaks_df["bin2_id"] # Consider filling lower triangle of the OBSERVED matrix tile # with NaNs, instead of this - we'd need this for a fair # consideration of the pixels that are super close to the @@ -732,44 +731,39 @@ def get_adjusted_expected_tile_some_nans( # close etc, is now shifted to the outside of this function # a way to simplify code. 
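+    # at this point 'peaks_df' holds one row per pixel of the tile, with
+    # "la_exp.{kernel}.value" and "la_exp.{kernel}.nnans" for every kernel,
+    # plus the rescaled global expected "exp.raw" and the raw observed "count"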
- # return good semi-sparsified DF: - return peaks_df[~mask_ndx & upper_band].reset_index(drop=True) + return peaks_df ################################## # step-specific dot-calling functions ################################## - - def score_tile( tile_cij, clr, - cis_exp, - exp_v_name, + expected_indexed, + expected_value_col, clr_weight_name, kernels, - nans_tolerated, + max_nans_tolerated, band_to_cover, - balance_factor, - verbose, ): """ The main working function that given a tile of a heatmap, applies kernels to perform convolution to calculate locally-adjusted expected and then - calculates a p-value for every meaningfull pixel against these l.a. expected - values. + calculates a p-value for every meaningfull pixel against these + locally-adjusted expected (la_exp) values. Parameters ---------- tile_cij : tuple - Tuple of 3: chromosome name, tile span row-wise, tile span column-wise: - (chrom, tile_i, tile_j), where tile_i = (start_i, end_i), and - tile_j = (start_j, end_j). + Tuple of 3: region name, tile span row-wise, tile span column-wise: + (region, tile_span_i, tile_span_j), where tile_span_i = (start_i, end_i), and + tile_span_j = (start_j, end_j). clr : cooler Cooler object to use to extract Hi-C heatmap data. - cis_exp : pandas.DataFrame - DataFrame with cis-expected, indexed with 'name' and 'diag'. - exp_v_name : str + expected_indexed : pandas.DataFrame + DataFrame with cis-expected, indexed with 'region1', 'region2', 'dist'. + expected_value_col : str Name of a value column in expected DataFrame clr_weight_name : str Name of a value column with balancing weights in a cooler.bins() @@ -777,17 +771,11 @@ def score_tile( kernels : dict A dictionary with keys being kernels names and values being ndarrays representing those kernels. - nans_tolerated : int + max_nans_tolerated : int Number of NaNs tolerated in a footprint of every kernel. band_to_cover : int Results would be stored only for pixels connecting loci closer than 'band_to_cover'. - balance_factor : float - Balancing factor to turn sum of balanced matrix back approximately - to the number of pairs (used for dynamic-donut criteria mostly). - use None value to disable dynamic-donut criteria calculation. - verbose : bool - Enable verbose output. 
Returns ------- @@ -799,89 +787,67 @@ def score_tile( """ # unpack tile's coordinates - region_name, tilei, tilej = tile_cij - origin = (tilei[0], tilej[0]) + region_name, tile_span_i, tile_span_j = tile_cij + tile_start_ij = (tile_span_i[0], tile_span_j[0]) # we have to do it for every tile, because # region_name is not known apriori (maybe move outside) # use .loc[region, region] for symmetric cis regions to conform with expected v1.0 - lazy_exp = LazyToeplitz(cis_exp.loc[region_name, region_name][exp_v_name].values) + lazy_exp = LazyToeplitz( + expected_indexed.loc[region_name, region_name][expected_value_col].to_numpy() + ) # RAW observed matrix slice: - observed = clr.matrix(balance=False)[slice(*tilei), slice(*tilej)] + observed = clr.matrix(balance=False)[slice(*tile_span_i), slice(*tile_span_j)] # expected as a rectangular tile : - expected = lazy_exp[slice(*tilei), slice(*tilej)] + expected = lazy_exp[slice(*tile_span_i), slice(*tile_span_j)] # slice of balance_weight for row-span and column-span : - bal_weight_i = clr.bins()[slice(*tilei)][clr_weight_name].values - bal_weight_j = clr.bins()[slice(*tilej)][clr_weight_name].values + bal_weight_i = clr.bins()[slice(*tile_span_i)][clr_weight_name].to_numpy() + bal_weight_j = clr.bins()[slice(*tile_span_j)][clr_weight_name].to_numpy() # do the convolutions result = get_adjusted_expected_tile_some_nans( - origin=origin, + origin_ij=tile_start_ij, observed=observed, expected=expected, bal_weights=(bal_weight_i, bal_weight_j), kernels=kernels, - balance_factor=balance_factor, - verbose=verbose, ) # Post-processing filters + # (0) keep only upper-triangle pixels: + upper_band = result["bin1_id"] < result["bin2_id"] + # (1) exclude pixels that connect loci further than 'band_to_cover' apart: is_inside_band = result["bin1_id"] > (result["bin2_id"] - band_to_cover) # (2) identify pixels that pass number of NaNs compliance test for ALL kernels: does_comply_nans = np.all( - result[[f"la_exp.{k}.nnans" for k in kernels]] < nans_tolerated, axis=1 + result[[f"la_exp.{k}.nnans" for k in kernels]] < max_nans_tolerated, axis=1 ) # so, selecting inside band and nNaNs compliant results: - # ( drop dropping index maybe ??? ) ... - res_df = result[is_inside_band & does_comply_nans].reset_index(drop=True) - # ####################################################################### - # # the following should be rewritten such that we return - # # opnly bare minimum number of columns per chunk - i.e. annotating is too heavy - # # to be performed here ... - # # - # # I'll do it here - switch off annotation, it'll break "extraction" and other - # # downstream stuff - but we'll fix it afterwards ... - # # - # ######################################################################## - # ######################################################################## - # # consider retiring Poisson testing from here, in case we - # # stick with l-chunking or opposite - add histogramming business here(!) - # ######################################################################## - # # do Poisson tests: - # get_pval = lambda la_exp : 1.0 - poisson.cdf(res_df["obs.raw"], la_exp) - # for k in kernels: - # res_df["la_exp."+k+".pval"] = get_pval( res_df["la_exp."+k+".value"] ) - # # annotate and return - # return cooler.annotate(res_df.reset_index(drop=True), clr.bins()[:]) - # - # so return only bin_ids , observed-raw (rename to counts, by Nezar's suggestion) - # and a bunch of locally adjusted expected estimates - 1 per kernel - - # that's the bare minimum ... 
- # - # per Nezar's suggestion - # rename obs.raw -> count - # this would break A LOT of downstream stuff - but let it be ... + res_df = result[upper_band & is_inside_band & does_comply_nans].reset_index( + drop=True + ) # + # so return only bare minimum: bin_ids , observed-raw counts + # and locally adjusted expected estimates for every kernel return res_df[ ["bin1_id", "bin2_id", "count"] + [f"la_exp.{k}.value" for k in kernels] ].astype(dtype={f"la_exp.{k}.value": "float64" for k in kernels}) def histogram_scored_pixels( - scored_df, kernels, ledges, verbose, obs_raw_name=observed_count_name + scored_df, kernels, ledges, obs_raw_name=observed_count_name ): """ - An attempt to implement HiCCUPS-like lambda-chunking - statistical procedure. - This function aims at building up a histogram of locally - adjusted expected scores for groups of characterized - pixels. - Such histograms are later used to compute FDR thresholds + An attempt to implement HiCCUPS-like lambda-binning statistical procedure. + This function aims at building up a histogram of locally adjusted + expected scores for groups of characterized pixels. + + Such histograms are subsequently used to compute FDR thresholds for different "classes" of hypothesis (classified by their - l.a. expected scores). + locally-adjusted expected (la_exp)). Parameters ---------- @@ -891,11 +857,9 @@ def histogram_scored_pixels( A dictionary with keys being kernels names and values being ndarrays representing those kernels. ledges : ndarray - An ndarray with bin lambda-edges for groupping loc. adj. expecteds, - i.e., classifying statistical hypothesis into lambda-classes. + An ndarray with bin lambda-edges for groupping locally adjusted + expecteds, i.e., classifying statistical hypothesis into lambda-bins. Left-most bin (-inf, 1], and right-most one (value,+inf]. - verbose : bool - Enable verbose output. obs_raw_name : str Name of the column/field that carry number of counts per pixel, i.e. observed raw counts. @@ -903,44 +867,17 @@ def histogram_scored_pixels( Returns ------- hists : dict of pandas.DataFrame - A dictionary of pandas.DataFrame with lambda/observed histogram for + A dictionary of pandas.DataFrame with lambda/observed 2D histogram for every kernel-type. Notes ----- - This is just an attempt to implement HiCCUPS-like lambda-chunking. - So we'll be returning histograms corresponding to the chunks of - scored pixels. - Consider modifying/accumulation a globally defined hists object, - or probably making something like a Feature2D class later on - where hists would be a class feature and histogramming_step would be - a method. - - + returning histograms corresponding to the chunks of scored pixels. """ - # lambda-chunking implies different 'pval' calculation - # procedure with a single Poisson expected for all the - # hypothesis in a same "class", i.e. with the l.a. expecteds - # from the same histogram bin. - - ######################## - # implementation ideas: - ######################## - # same observations/hypothesis needs to be classified according - # to different l.a. expecteds (i.e. for different kernel-types), - # which can be done with a pandas groupby, something like that: - # https://stackoverflow.com/questions/21441259 - # - # after that we could iterate over groups and do np.bincout on - # the "observed.raw" column (assuming it's of integer type) ... 
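+    # each resulting histogram is a 2D table: rows are observed raw counts,
+    # columns are lambda-bin intervals of the locally adjusted expected, and
+    # each entry is the number of pixels with that count whose "la_exp.{k}.value"
+    # falls into that lambda-bin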
- hists = {} for k in kernels: - # verbose: - if verbose: - logging.info(f"Building a histogram for kernel-type {k}") # we would need to generate a bunch of these histograms for all of the # kernel types: # needs to be lambda-binned : scored_df["la_exp."+k+".value"] @@ -948,37 +885,33 @@ def histogram_scored_pixels( # # lambda-bin index for kernel-type "k": lbins = pd.cut(scored_df[f"la_exp.{k}.value"], ledges) - # now for each lambda-bin construct a histogramm of "obs.raw": - obs_hist = {} - for lbin, grp_df in scored_df.groupby(lbins): - # check if obs.raw is integer of spome kind (temporary): - # obs.raw -> count - assert np.issubdtype(grp_df[obs_raw_name].dtype, np.integer) - # perform bincounting ... - obs_hist[lbin] = pd.Series(np.bincount(grp_df[obs_raw_name])) - # ACHTUNG! assigning directly to empty DF leads to data loss! - # turn ndarray in Series for ease of handling, i.e. - # assign a bunch of columns of different sizes to DataFrame. - # - # Consider updating global "hists" later on, or implementing a - # special class for that. Mind that Python multiprocessing - # "threads" are processes and thus cannot share/modify a shared - # memory location - deal with it, maybe dask-something ?! - # - # turned out that storing W1xW2 for every "thread" requires a ton - # of memory - that's why different sizes... @nvictus ? - # store W1x(<=W2) hist for every kernel-type: - hists[k] = pd.DataFrame(obs_hist).fillna(0).astype(np.int64) + # group scored_df by counts and lambda-bins to contructs histograms: + hists[k] = ( + scored_df.groupby([obs_raw_name, lbins], dropna=False, observed=False)[ + f"la_exp.{k}.value" + ] + .count() + .unstack() + .fillna(0) + .astype(np.int64) + ) # return a dict of DataFrames with a bunch of histograms: return hists -def determine_thresholds(kernels, ledges, gw_hist, fdr): +def determine_thresholds(gw_hist, fdr): """ given a 'gw_hist' histogram of observed counts - for each lambda-chunk for each kernel-type, and + for each lambda-bin and for each kernel-type, and also given a FDR, calculate q-values for each observed - count value in each lambda-chunk for each kernel-type. + count value in each lambda-bin for each kernel-type. + + Parameters + ---------- + gw_hist_kernels : dict + dictionary {kernel_name : 2D_hist}, where '2D_hist' is a pd.DataFrame + fdr : float + False Discovery Rate level Returns ------- @@ -988,330 +921,318 @@ def determine_thresholds(kernels, ledges, gw_hist, fdr): each chunk ... qvalues : dict A dictionary with keys being kernel names and values pandas.DataFrames - storing q-values: each column corresponds to a lambda-chunk, + storing q-values: each column corresponds to a lambda-bin, while rows correspond to observed pixels values. """ - rcs_hist = {} - rcs_Poisson = {} + + # extracting significantly enriched interactions: + # We have our *null* hypothesis: intensity of a HiC pixel is Poisson-distributed + # with a certain expected. In this case that would be *locally-adjusted expected*. 
+    #
+    # Thus for dot-calling, we could estimate a *p*-value for every pixel based
+    # on its observed intensity and its expected intensity, e.g.:
+    # lambda = la_exp["la_exp."+k+".value"]; pvals = 1.0 - poisson.cdf(la_exp["count"], lambda)
+    # However this is technically challenging (too many pixels - genome wide) and
+    # might not be sensitive enough due to the wide dynamic range of interaction counts.
+    # Instead we use the *lambda*-binning procedure from Rao et al 2014 to tackle
+    # both technical challenges and some issues associated with the wide dynamic range
+    # of the expected for the dot-calling (due to distance decay).
+    #
+    # Some extra points:
+    # 1. simple p-value thresholding should be replaced by a more "productive" FDR procedure, which is more tractable
+    # 2. it is "unfair" to treat all pixels with the same statistical test (multiple hypotheses) - the range of "expected" is too wide
+    # 3. (2) is addressed by splitting the pixels into groups by their locally adjusted expected - lambda-bins
+    # 4. the upper boundary of each lambda-bin is used as the expected for every pixel that belongs to the bin:
+    #    - for technical/efficiency reasons - pixels in a bin are tested all at once
+    # for each lambda-bin q-values are calculated in an efficient way:
+    # in part, efficiency comes from collapsing identical observed values, i.e. histogramming
+    # also the upper boundary of each lambda-bin is used as the expected for every pixel in this lambda-bin
+
    qvalues = {}
    threshold_df = {}
-    for k in kernels:
-        # Reverse cumulative histogram for this kernel.
-        # First row contains total # of pixels in each lambda-chunk.
-        rcs_hist[k] = gw_hist[k].iloc[::-1].cumsum(axis=0).iloc[::-1]
-
-        # Assign a unit Poisson distribution to each lambda-chunk.
-        # The expected value is the upper boundary of the lambda-chunk.
+    for k, _hist in gw_hist.items():
+        # Reverse cumulative histogram for kernel 'k'.
+        rcs_hist = _hist.iloc[::-1].cumsum(axis=0).iloc[::-1]
+        # 1st row of 'rcs_hist' contains total pixels-counts in each lambda-bin.
+        norm = rcs_hist.iloc[0, :]
+
+        # Assign a unit Poisson distribution to each lambda-bin.
+        # The expected value 'lambda' is the upper boundary of each lambda-bin:
        # poisson.sf = 1 - poisson.cdf, but more precise
-        # poisson.sf(-1,mu) == 1.0, i.e. is equivalent to the
-        # poisson.pmf(gw_hist[k].index, mu)[::-1].cumsum()[::-1]
-        rcs_Poisson[k] = pd.DataFrame()
-        for mu, column in zip(ledges[1:-1], gw_hist[k].columns):
-            renorm_factors = rcs_hist[k].loc[0, column]
-            rcs_Poisson[k][column] = renorm_factors * poisson.sf(
-                gw_hist[k].index - 1, mu
-            )
+        # poisson.sf(-1, lambda) == 1.0, i.e. is equivalent to the
+        # poisson.pmf(rcs_hist.index, lambda)[::-1].cumsum()[::-1]
+        # unit Poisson is a collection of 1-CDF distributions, one per lambda-bin,
+        # same dimensions as rcs_hist - matching lambda-bins and observed values:
+        unit_Poisson = pd.DataFrame().reindex_like(rcs_hist)
+        for lbin in rcs_hist.columns:
+            # numbers of occurrences for which we estimate the Poisson survival function
+            _occurrences = rcs_hist.index.to_numpy()
+            unit_Poisson[lbin] = poisson.sf(_occurrences, lbin.right)
+        # normalize the unit-Poisson distribution for the total pixel counts per lambda-bin
+        unit_Poisson = norm * unit_Poisson
 
        # Determine the threshold by checking the value at which 'fdr_diff'
-        # first turns positive. Fill NaNs with an "unreachably" high value.
-        fdr_diff = fdr * rcs_hist[k] - rcs_Poisson[k]
-        very_high_value = len(rcs_hist[k])
+        # first turns positive. Fill NaNs with a high value that is out of reach.
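+        # in other words, the threshold for a lambda-bin is the smallest observed
+        # count at which the expected number of pixels exceeding it under the
+        # per-bin Poisson null drops below 'fdr' times the number of pixels
+        # actually observed at or above it (a Benjamini-Hochberg-style criterion)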
+ _high_value = rcs_hist.index.max() + 1 + fdr_diff = ((fdr * rcs_hist) - unit_Poisson).cummax() + # cummax ensures monotonic increase of differences threshold_df[k] = ( - fdr_diff.where(fdr_diff > 0) - .apply(lambda col: col.first_valid_index()) - .fillna(very_high_value) + fdr_diff.mask(fdr_diff < 0) # mask negative with nans + .idxmin() # index of the first positive difference + .fillna(_high_value) # set high threshold if no pixels pass .astype(np.int64) ) - # q-values - # bear in mind some issues with lots of NaNs and Infs after - # such a brave operation ... - qvalues[k] = rcs_Poisson[k] / rcs_hist[k] + qvalues[k] = (unit_Poisson / rcs_hist).cummin() + # run cumulative min, on the array of adjusted p-values + # to make sure q-values are monotonously decreasing with pvals + qvalues[k] = qvalues[k].mask(qvalues[k] > 1.0, 1.0) return threshold_df, qvalues -def extract_scored_pixels( - scored_df, kernels, thresholds, ledges, verbose, obs_raw_name=observed_count_name -): +def extract_scored_pixels(scored_df, thresholds, obs_raw_name=observed_count_name): """ - An attempt to implement HiCCUPS-like lambda-chunking - statistical procedure. + Implementation of HiCCUPS-like lambda-binning statistical procedure. Use FDR thresholds for different "classes" of hypothesis - (classified by their l.a. expected scores), in order to - extract pixels that stand out. + (classified by their locally-adjusted expected (la_exp) scores), + in order to extract "enriched" pixels. Parameters ---------- scored_df : pd.DataFrame A table with the scoring information for a group of pixels. - kernels : dict - A dictionary with keys being kernel names and values being ndarrays - representing those kernels. thresholds : dict - A dictionary with keys being kernel names and values pandas.Series - indexed with Intervals defined by 'ledges' boundaries and storing FDR - thresholds for observed values. - ledges : ndarray - An ndarray with bin lambda-edges for groupping loc. adj. expecteds, - i.e., classifying statistical hypothesis into lambda-classes. - Left-most bin (-inf, 1], and right-most one (value,+inf]. - verbose : bool - Enable verbose output. + A dictionary {kernel_name : lambda_thresholds}, where 'lambda_thresholds' + are pd.Series with FDR thresholds indexed by lambda-bin intervals obs_raw_name : str - Name of the column/field that carry number of counts per pixel, + Name of the column/field with number of counts per pixel, i.e. observed raw counts. Returns ------- scored_df_slice : pandas.DataFrame - Filtered DataFrame of pixels extracted applying FDR thresholds. - - Notes - ----- - This is just an attempt to implement HiCCUPS-like lambda-chunking. + Filtered DataFrame of pixels that satisfy thresholds. """ - comply_fdr_list = np.ones(len(scored_df), dtype=np.bool) - - for k in kernels: - # using special features of IntervalIndex: - # http://pandas.pydata.org/pandas-docs/stable/advanced.html#intervalindex - # i.e. 
IntervalIndex can be .loc-ed with values that would be - # corresponded with their Intervals (!!!): - # obs.raw -> count - comply_fdr_k = ( - scored_df[obs_raw_name].values - > thresholds[k].loc[scored_df[f"la_exp.{k}.value"]].values + compliant_pixel_masks = [] + for kernel_name, threshold in thresholds.items(): + # locally adjusted expected (lambda) of the scored pixels: + lambda_of_pixels = scored_df[f"la_exp.{kernel_name}.value"] + # extract lambda_threshold for every scored pixel based on its estimated lambda, using + # interval-indexed 'threshold' to query it by the lambda-values of each scored pixel + threshold_of_pixels = threshold.loc[lambda_of_pixels] + compliant_pixel_masks.append( + scored_df[obs_raw_name].to_numpy() >= threshold_of_pixels.to_numpy() ) - # extracting q-values for all of the pixels takes a lot of time - # we'll do it externally for filtered_pixels only, in order to save - # time - # - # accumulate comply_fdr_k into comply_fdr_list - # using np.logical_and: - comply_fdr_list = np.logical_and(comply_fdr_list, comply_fdr_k) - # return a slice of 'scored_df' that complies FDR thresholds: - return scored_df[comply_fdr_list] - - -################################## -# large CLI-helper functions wrapping smaller step-specific ones: -# basically - the dot-calling steps - ONE PASS DOT-CALLING: -################################## - - -def scoring_step( - clr, - expected, - expected_name, - clr_weight_name, - tiles, - kernels, - max_nans_tolerated, - loci_separation_bins, - output_path, - nproc, - verbose, -): - """ - Calculates locally adjusted expected - for each pixel in a designated area of - the heatmap and return it as a big - single pixel-table (pandas.DataFrame) - """ - if verbose: - logging.info(f"Preparing to convolve {len(tiles)} tiles:") - - # check if cooler is balanced - try: - _ = is_cooler_balanced(clr, clr_weight_name, raise_errors=True) - except Exception as e: - raise ValueError( - f"provided cooler is not balanced or {clr_weight_name} is missing" - ) from e - - # add very_verbose to supress output from convolution of every tile - very_verbose = False - job = partial( - score_tile, - clr=clr, - cis_exp=expected, - exp_v_name=expected_name, - clr_weight_name=clr_weight_name, - kernels=kernels, - nans_tolerated=max_nans_tolerated, - band_to_cover=loci_separation_bins, - balance_factor=None, - verbose=very_verbose, - ) - - if nproc > 1: - pool = mp.Pool(nproc) - map_ = pool.imap - map_kwargs = dict(chunksize=int(np.ceil(len(tiles) / nproc))) - if verbose: - logging.info( - f"creating a Pool of {nproc} workers to tackle {len(tiles)} tiles" - ) - else: - map_ = map - if verbose: - logging.info("fallback to serial implementation.") - map_kwargs = {} - try: - # do the work here - chunks = map_(job, tiles, **map_kwargs) - # ########################################### - # # local - in memory copy of the dataframe (danger of RAM overuse) - # ########################################### - # reset index is required, otherwise there will be duplicate - # indices in the output of this function - return pd.concat(chunks).reset_index(drop=True) - finally: - if nproc > 1: - pool.close() + # return pixels from 'scored_df' that satisfy FDR thresholds for all kernels: + return scored_df[np.all(compliant_pixel_masks, axis=0)] def clustering_step( - scores_df, - expected_regions, + scored_df, dots_clustering_radius, - verbose, + assigned_regions_name="region", obs_raw_name=observed_count_name, ): """ - - This is a new "clustering" step updated for the pixels processed by 
lambda- - chunking multiple hypothesis testing. - - This method assumes that 'scores_df' is a DataFrame with all of the pixels - that needs to be clustered, thus there is no additional 'comply_fdr' column - and selection of compliant pixels. - - This step is a clustering-only (using Birch from scikit). + Group together adjacent significant pixels into clusters after + the lambda-binning multiple hypothesis testing by iterating over + assigned regions and calling `clust_2D_pixels`. Parameters ---------- - scores_df : pandas.DataFrame - DataFrame that stores filtered pixels that are ready to be - clustered, no more 'comply_fdr' column dependency. - expected_regions : iterable - An iterable of regions to be clustered. + scored_df : pandas.DataFrame + DataFrame with enriched pixels that are ready to be + clustered and are annotated with their genomic coordinates. dots_clustering_radius : int Birch-clustering threshold. - verbose : bool - Enable verbose output. + assigned_regions_name : str | None + Name of the column in scored_df to use for grouping pixels + before clustering. When None, full chromosome clustering is done. + obs_raw_name : str + name of the column with raw observed pixel counts Returns ------- centroids : pandas.DataFrame - Pixels from 'scores_df' annotated with clustering information. + Pixels from 'scored_df' annotated with clustering information. Notes ----- 'dots_clustering_radius' in Birch clustering algorithm corresponds to a double the clustering radius in the "greedy"-clustering used in HiCCUPS - (to be tested). """ - # Annotate regions, if needed: + # make sure provided pixels are annotated with genomic corrdinates and raw counts column is present: + if not {"chrom1", "chrom2", "start1", "start2", obs_raw_name}.issubset(scored_df): + raise ValueError("Scored pixels provided for clustering are not annotated") - scores_df = scores_df.copy() + scored_df = scored_df.copy() if ( - not "region" in scores_df.columns + not assigned_regions_name in scored_df.columns ): # If input scores are not annotated by regions: - scores_df["region"] = np.where( - scores_df["chrom1"] == scores_df["chrom2"], scores_df["chrom1"], np.nan + logging.warning( + f"No regions assigned to the scored pixels before clustering, using chromosomes" + ) + scored_df[assigned_regions_name] = np.where( + scored_df["chrom1"] == scored_df["chrom2"], scored_df["chrom1"], np.nan ) - # using different bin12_id_names since all - # pixels are annotated at this point. - pixel_clust_list = [] - for region in expected_regions: - # Perform clustering for each region separately. - df = scores_df[((scores_df["region"].astype(str) == str(region)))] - if not len(df): - continue + # cluster within each regions separately and accumulate the result: + pixel_clust_list = [] + scored_pixels_by_region = scored_df.groupby(assigned_regions_name, observed=True) + for region, _df in scored_pixels_by_region: + logging.info(f"clustering enriched pixels in region: {region}") + # Using genomic corrdinated for clustering, not bin_id pixel_clust = clust_2D_pixels( - df, + _df, threshold_cluster=dots_clustering_radius, bin1_id_name="start1", bin2_id_name="start2", - verbose=verbose, ) pixel_clust_list.append(pixel_clust) - if verbose: - logging.info("Clustering is over!") + logging.info("Clustering is complete") + # concatenate clustering results ... # indexing information persists here ... - - if len(pixel_clust_list) == 0: - if verbose: - logging.info("No clusters found! 
Output will be empty") + if not pixel_clust_list: + logging.warning("No clusters found for any regions! Output will be empty") empty_output = pd.DataFrame( [], - columns=list(scores_df.columns) - + ["region1", "region2", "c_label", "c_size", "cstart1", "cstart2"], + columns=list(scored_df.columns) + + [ + assigned_regions_name + "1", + assigned_regions_name + "2", + "c_label", + "c_size", + "cstart1", + "cstart2", + ], ) return empty_output # Empty dataframe with the same columns as anticipated - pixel_clust_df = pd.concat( - pixel_clust_list, ignore_index=False - ) # Concatenate the clustering results for different regions + else: + pixel_clust_df = pd.concat( + pixel_clust_list, ignore_index=False + ) # Concatenate the clustering results for different regions - # now merge pixel_clust_df and scores_df DataFrame ... - # # and merge (index-wise) with the main DataFrame: + # now merge pixel_clust_df and scored_df DataFrame ... + # TODO make a more robust merge here df = pd.merge( - scores_df, pixel_clust_df, how="left", left_index=True, right_index=True + scored_df, pixel_clust_df, how="left", left_index=True, right_index=True ) - # prevents scores_df categorical values (all chroms, including chrM) - df["region1"] = df["region"].astype(str) - df["region2"] = df["region"].astype(str) + # TODO check if next str-cast is neccessary + df[assigned_regions_name + "1"] = df[assigned_regions_name].astype(str) + df[assigned_regions_name + "2"] = df[assigned_regions_name].astype(str) # report only centroids with highest Observed: - chrom_clust_group = df.groupby(["region1", "region2", "c_label"]) + chrom_clust_group = df.groupby( + [assigned_regions_name + "1", assigned_regions_name + "2", "c_label"], + observed=True, + ) centroids = df.loc[ chrom_clust_group[obs_raw_name].idxmax() ] # Select the brightest pixel in the cluster return centroids -# consider switching step names - "extraction" to "FDR-thresholding" -# and "thresholding" to "enrichment" - for final enrichment selection: -def thresholding_step(centroids, obs_raw_name=observed_count_name): - # (2) - # filter by FDR, enrichment etc: - enrichment_factor_1 = 1.5 - enrichment_factor_2 = 1.75 - enrichment_factor_3 = 2.0 - FDR_orphan_threshold = 0.02 - ###################################################################### - # # Temporarily remove orphans filtering, until q-vals are calculated: - ###################################################################### +def cluster_filtering_hiccups( + centroids, + obs_raw_name=observed_count_name, + enrichment_factor_vh=1.5, + enrichment_factor_d_and_ll=1.75, + enrichment_factor_d_or_ll=2.0, + FDR_orphan_threshold=0.02, +): + """ + Centroids of enriched pixels can be filtered to further minimize + the amount of false-positive dot-calls. + + First, centroids are filtered on enrichment relative to the + locally-adjusted expected for the "donut", "lowleft", "vertical", + and "horizontal" kernels. Additionally, singleton pixels + (i.e. pixels that do not belong to a cluster) are filtered based on + a combined q-values for all kernels. This empirical filtering approach + was developed in Rao et al 2014 and results in a conservative dot-calls + with the low rate of false-positive calls. + + Parameters + ---------- + centroids : pd.DataFrame + DataFrame that stores enriched and clustered pixels. + obs_raw_name : str + name of the column with raw observed pixel counts + enrichment_factor_vh : float + minimal enrichment factor for pixels relative to + both "vertical" and "horizontal" kernel. 
+ enrichment_factor_d_and_ll : float + minimal enrichment factor for pixels relative to + both "donut" and "lowleft" kernels. + enrichment_factor_d_or_ll : float + minimal enrichment factor for pixels relative to + either "donut" or" "lowleft" kenels. + FDR_orphan_threshold : float + minimal combined q-value for singleton pixels. + + Returns + ------- + filtered_centroids : pd.DataFrame + filtered dot-calls + """ + # make sure input DataFrame of pixels has been clustered: + if not "c_size" in centroids: + raise ValueError(f"input dataframe of pixels does not seem to be clustered") + + # make sure input DataFrame of pixels has been annotated with genomic coordinates and raw counts column is present: + if not {"chrom1", "chrom2", "start1", "start2", obs_raw_name}.issubset(centroids): + raise ValueError( + "input dataframe of clustered pixels provided for filtering is not annotated" + ) + + # make sureinput DataFrame of pixels was scored using 4-hiccups kernels (donut, lowleft, vertical, horizontal): + _hiccups_kernel_cols_set = { + "la_exp.donut.value", + "la_exp.vertical.value", + "la_exp.horizontal.value", + "la_exp.lowleft.value", + # and q-values as well + "la_exp.donut.qval", + "la_exp.vertical.qval", + "la_exp.horizontal.qval", + "la_exp.lowleft.qval", + } + # make sure input DataFrame of pixels has been annotated with genomic coordinates and raw counts column is present: + if not _hiccups_kernel_cols_set.issubset(centroids): + raise ValueError( + "clustered pixels provided for filtering were not scored with 4 hiccups kernels" + ) + + # ad hoc filtering by enrichment, FDR for singletons etc. + # employed in Rao et al 2014 HiCCUPS enrichment_fdr_comply = ( ( centroids[obs_raw_name] - > enrichment_factor_2 * centroids["la_exp.lowleft.value"] + > enrichment_factor_d_and_ll * centroids["la_exp.lowleft.value"] ) & ( centroids[obs_raw_name] - > enrichment_factor_2 * centroids["la_exp.donut.value"] + > enrichment_factor_d_and_ll * centroids["la_exp.donut.value"] ) & ( centroids[obs_raw_name] - > enrichment_factor_1 * centroids["la_exp.vertical.value"] + > enrichment_factor_vh * centroids["la_exp.vertical.value"] ) & ( centroids[obs_raw_name] - > enrichment_factor_1 * centroids["la_exp.horizontal.value"] + > enrichment_factor_vh * centroids["la_exp.horizontal.value"] ) & ( ( centroids[obs_raw_name] - > enrichment_factor_3 * centroids["la_exp.lowleft.value"] + > enrichment_factor_d_or_ll * centroids["la_exp.lowleft.value"] ) | ( centroids[obs_raw_name] - > enrichment_factor_3 * centroids["la_exp.donut.value"] + > enrichment_factor_d_or_ll * centroids["la_exp.donut.value"] ) ) & ( @@ -1327,63 +1248,19 @@ def thresholding_step(centroids, obs_raw_name=observed_count_name): ) ) ) - # # - # enrichment_fdr_comply = ( - # (centroids[obs_raw_name] > enrichment_factor_2 * centroids["la_exp.lowleft.value"]) & - # (centroids[obs_raw_name] > enrichment_factor_2 * centroids["la_exp.donut.value"]) & - # (centroids[obs_raw_name] > enrichment_factor_1 * centroids["la_exp.vertical.value"]) & - # (centroids[obs_raw_name] > enrichment_factor_1 * centroids["la_exp.horizontal.value"]) & - # ( (centroids[obs_raw_name] > enrichment_factor_3 * centroids["la_exp.lowleft.value"]) - # | (centroids[obs_raw_name] > enrichment_factor_3 * centroids["la_exp.donut.value"]) ) - # ) - # use "enrichment_fdr_comply" to filter out - # non-satisfying pixels: - out = centroids[enrichment_fdr_comply] - - # ... - # to be added to the list of output columns: - # "factor_balance."+"lowleft"+".KerObs" - # ... 
- - # tentaive output columns list: - columns_for_output = [ - "chrom1", - "start1", - "end1", - "chrom2", - "start2", - "end2", - "cstart1", - "cstart2", - "c_label", - "c_size", - obs_raw_name, - # 'exp.raw', - "la_exp.donut.value", - "la_exp.vertical.value", - "la_exp.horizontal.value", - "la_exp.lowleft.value", - # "factor_balance.lowleft.KerObs", - # 'la_exp.upright.value', - # 'la_exp.upright.qval', - "la_exp.donut.qval", - "la_exp.vertical.qval", - "la_exp.horizontal.qval", - "la_exp.lowleft.qval", - ] - return out[columns_for_output] + # use "enrichment_fdr_comply" to filter out non-satisfying pixels: + return centroids[enrichment_fdr_comply].reset_index(drop=True) -################################## -# large CLI-helper functions wrapping smaller step-specific ones: -# basically - the dot-calling steps - ON THE FLY - 2 PASS DOT-CALLING (HiCCUPS-style): -################################## +#################################################################### +# large helper functions wrapping smaller step-specific ones +#################################################################### def scoring_and_histogramming_step( clr, - expected, - expected_name, + expected_indexed, + expected_value_col, clr_weight_name, tiles, kernels, @@ -1391,224 +1268,404 @@ def scoring_and_histogramming_step( max_nans_tolerated, loci_separation_bins, nproc, - verbose, ): """ - This is a derivative of the 'scoring_step' which is supposed to implement - the 1st of the lambda-chunking procedure - histogramming. + This implements the 1st step of the lambda-binning scoring procedure - histogramming. - Basically we are piping scoring operation together with histogramming into a + In short, this pipes a scoring operation together with histogramming into a single pipeline of per-chunk operations/transforms. """ - if verbose: - logging.info(f"Preparing to convolve {len(tiles)} tiles:") - - # add very_verbose to supress output from convolution of every tile - very_verbose = False - - # check if cooler is balanced - try: - _ = is_cooler_balanced(clr, clr_weight_name, raise_errors=True) - except Exception as e: - raise ValueError( - f"provided cooler is not balanced or {clr_weight_name} is missing" - ) from e + logging.info(f"convolving {len(tiles)} tiles to build histograms for lambda-bins") # to score per tile: to_score = partial( score_tile, clr=clr, - cis_exp=expected, - exp_v_name=expected_name, + expected_indexed=expected_indexed, + expected_value_col=expected_value_col, clr_weight_name=clr_weight_name, kernels=kernels, - nans_tolerated=max_nans_tolerated, + max_nans_tolerated=max_nans_tolerated, band_to_cover=loci_separation_bins, - # do not calculate dynamic-donut criteria - # for now. 
- balance_factor=None, - verbose=very_verbose, ) # to hist per scored chunk: - to_hist = partial( - histogram_scored_pixels, kernels=kernels, ledges=ledges, verbose=very_verbose - ) + to_hist = partial(histogram_scored_pixels, kernels=kernels, ledges=ledges) - # composing/piping scoring and histogramming - # together : + # compose scoring and histogramming together : job = lambda tile: to_hist(to_score(tile)) - # copy paste from @nvictus modified 'scoring_step': + # standard multiprocessing implementation if nproc > 1: + logging.info(f"creating a Pool of {nproc} workers to tackle {len(tiles)} tiles") pool = mp.Pool(nproc) map_ = pool.imap map_kwargs = dict(chunksize=int(np.ceil(len(tiles) / nproc))) - if verbose: - logging.info( - f"creating a Pool of {nproc} workers to tackle {len(tiles)} tiles" - ) else: + logging.info("fallback to serial implementation.") map_ = map - if verbose: - logging.info("fallback to serial implementation.") map_kwargs = {} try: # consider using # https://github.com/mirnylab/cooler/blob/9e72ee202b0ac6f9d93fd2444d6f94c524962769/cooler/tools.py#L59 - # here: - hchunks = map_(job, tiles, **map_kwargs) - # hchunks TO BE ACCUMULATED - # hopefully 'hchunks' would stay in memory - # until we would get a chance to accumulate them: + histogram_chunks = map_(job, tiles, **map_kwargs) finally: if nproc > 1: pool.close() - # - # now we need to combine/sum all of the histograms - # for different kernels: - # - # assuming we know "kernels" - # this is very ugly, but ok - # for the draft lambda-chunking - # lambda version of lambda-chunking: + + # now we need to combine/sum all of the histograms for different kernels: def _sum_hists(hx, hy): - # perform a DataFrame summation - # for every value of the dictionary: + # perform a DataFrame summation for every value of the dictionary: hxy = {} for k in kernels: - hxy[k] = hx[k].add(hy[k], fill_value=0).astype(np.int64) + hxy[k] = hx[k].add(hy[k], fill_value=0).fillna(0).astype(np.int64) # returning the sum: return hxy - # ###################################################### - # this approach is tested and at the very least - # number of pixels in a dump list matches - # with the .sum().sum() of the histogram - # both for 10kb and 5kb, - # thus we should consider this as a reference - # implementation, albeit not a very efficient one ... - # ###################################################### - final_hist = reduce(_sum_hists, hchunks) - # we have to make sure there is nothing in the - # top bin, i.e., there are no l.a. 
expecteds > base^(len(ledges)-1) + # TODO consider more efficient implementation to accumulate histograms: + final_hist = reduce(_sum_hists, histogram_chunks) + # we have to make sure there is nothing in the last lambda-bin + # this is a temporary implementation detail, until we implement dynamic lambda-bins for k in kernels: - last_la_exp_bin = final_hist[k].columns[-1] - last_la_exp_vals = final_hist[k].iloc[:, -1] - # checking the top bin: - if last_la_exp_vals.sum() != 0: + last_lambda_bin = final_hist[k].iloc[:, -1] + if last_lambda_bin.sum() != 0: raise ValueError( - f"There are la_exp.{k}.value in {last_la_exp_bin}, please check the histogram" + f"There are la_exp.{k}.value in {last_lambda_bin.name}, please check the histogram" ) - # drop that last column/bin (last_edge, +inf]: - final_hist[k] = final_hist[k].drop(columns=last_la_exp_bin) - # consider dropping all of the columns that have zero .sum() + # drop all lambda-bins that do not have pixels in them: + final_hist[k] = final_hist[k].loc[:, final_hist[k].sum() > 0] + # make sure index (observed pixels counts) is sorted + if not final_hist[k].index.is_monotonic: + raise ValueError(f"Histogram for {k}-kernel is not sorted") # returning filtered histogram return final_hist def scoring_and_extraction_step( clr, - expected, - expected_name, + expected_indexed, + expected_value_col, clr_weight_name, tiles, kernels, ledges, thresholds, max_nans_tolerated, - balance_factor, loci_separation_bins, - output_path, nproc, - verbose, bin1_id_name="bin1_id", bin2_id_name="bin2_id", ): """ - This is a derivative of the 'scoring_step' which is supposed to implement - the 2nd of the lambda-chunking procedure - extracting pixels that are FDR - compliant. + This implements the 2nd step of the lambda-binning scoring procedure, + extracting pixels that are FDR compliant. - Basically we are piping scoring operation together with extraction into a + In short, this combines scoring with with extraction into a single pipeline of per-chunk operations/transforms. 
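+
+    Conceptually, the per-tile operation is the following composition (a simplified
+    sketch; the actual implementation builds `to_score` and `to_extract` with
+    functools.partial and may distribute the work over a multiprocessing pool)::
+
+        job = lambda tile: extract_scored_pixels(score_tile(tile, ...), thresholds)
+        significant_pixels = pd.concat(map(job, tiles), ignore_index=True)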
""" - if verbose: - logging.info(f"Preparing to convolve {len(tiles)} tiles:") - - # add very_verbose to supress output from convolution of every tile - very_verbose = False - - # check if cooler is balanced - try: - _ = is_cooler_balanced(clr, clr_weight_name, raise_errors=True) - except Exception as e: - raise ValueError( - f"provided cooler is not balanced or {clr_weight_name} is missing" - ) from e + logging.info(f"convolving {len(tiles)} tiles to extract enriched pixels") # to score per tile: to_score = partial( score_tile, clr=clr, - cis_exp=expected, - exp_v_name=expected_name, + expected_indexed=expected_indexed, + expected_value_col=expected_value_col, clr_weight_name=clr_weight_name, kernels=kernels, - nans_tolerated=max_nans_tolerated, + max_nans_tolerated=max_nans_tolerated, band_to_cover=loci_separation_bins, - balance_factor=balance_factor, - verbose=very_verbose, ) # to hist per scored chunk: to_extract = partial( extract_scored_pixels, - kernels=kernels, thresholds=thresholds, - ledges=ledges, - verbose=very_verbose, ) - # composing/piping scoring and histogramming - # together : + # compose scoring and histogramming together job = lambda tile: to_extract(to_score(tile)) - # copy paste from @nvictus modified 'scoring_step': + # standard multiprocessing implementation if nproc > 1: + logging.info(f"creating a Pool of {nproc} workers to tackle {len(tiles)} tiles") pool = mp.Pool(nproc) map_ = pool.imap map_kwargs = dict(chunksize=int(np.ceil(len(tiles) / nproc))) - if verbose: - logging.info( - f"creating a Pool of {nproc} workers to tackle {len(tiles)} tiles" - ) else: + logging.info("fallback to serial implementation.") map_ = map - if verbose: - logging.info("fallback to serial implementation.") map_kwargs = {} try: # consider using # https://github.com/mirnylab/cooler/blob/9e72ee202b0ac6f9d93fd2444d6f94c524962769/cooler/tools.py#L59 - # here: filtered_pix_chunks = map_(job, tiles, **map_kwargs) significant_pixels = pd.concat(filtered_pix_chunks, ignore_index=True) - if output_path is not None: - significant_pixels.to_csv( - output_path, sep="\t", header=True, index=False, compression=None - ) finally: if nproc > 1: pool.close() - # there should be no duplicates in the "significant_pixels" DataFrame of pixels: - significant_pixels_dups = significant_pixels.duplicated() - if significant_pixels_dups.any(): + # same pixels should never be scored >1 times with the current tiling of the interactions matrix + if significant_pixels.duplicated().any(): raise ValueError( - f"Duplicated pixels detected during exctraction {significant_pixels[significant_pixels_dups]}" + f"Some pixels were scored more than one time, matrix tiling procedure is not correct" ) # sort the result just in case and drop its index: return significant_pixels.sort_values(by=[bin1_id_name, bin2_id_name]).reset_index( drop=True ) + + +# user-friendly high-level API function +def dots( + clr, + expected, + expected_value_col="balanced.avg", + clr_weight_name="weight", + view_df=None, + kernels=None, + max_loci_separation=10_000_000, + max_nans_tolerated=1, # test if this has desired behavior + n_lambda_bins=40, # update this eventually + lambda_bin_fdr=0.1, + clustering_radius=20_000, + cluster_filtering=None, + tile_size=5_000_000, + nproc=1, +): + """ + Call dots on a cooler {clr}, using {expected} defined in regions specified + in {view_df}. + + All convolution kernels specified in {kernels} will be all applied to the {clr}, + and statistical testing will be performed separately for each kernel. 
A convolutional
+    kernel is a small square matrix (e.g. 7x7) of zeros and ones
+    that defines a "mask" to extract local expected around each pixel. Since the
+    enrichment is calculated relative to the central pixel, kernel width should
+    be an odd number >=3.
+
+    Parameters
+    ----------
+    clr : cooler.Cooler
+        A cooler with balanced Hi-C data.
+    expected : DataFrame in expected format
+        Diagonal summary statistics for each chromosome; the column holding
+        the values of expected to use is specified by expected_value_col.
+    expected_value_col : str
+        Name of the column in expected that holds the values of expected.
+    clr_weight_name : str
+        Name of the column in clr.bins to use as balancing weights.
+        Using raw unbalanced data is not supported for dot-calling.
+    view_df : viewframe
+        Viewframe with genomic regions. At the moment the view has to match the
+        view used for generating expected. If None, it is generated from the cooler.
+    kernels : { str:np.ndarray } | None
+        A dictionary of convolution kernels to be used for calculating locally adjusted
+        expected. If None, the default HiCCUPS kernels are recommended
+        based on the resolution of the cooler.
+    max_loci_separation : int
+        Maximum loci separation for dot-calling, i.e., do not call dots for
+        loci that are further than max_loci_separation basepairs apart. Default 10Mb.
+    max_nans_tolerated : int
+        Maximum number of NaNs tolerated in the footprint of every used kernel.
+        Adjust with caution, as a large max_nans_tolerated might lead to artifacts
+        in pixel scoring.
+    n_lambda_bins : int
+        Number of log-spaced bins in which FDR-testing will be performed independently.
+        TODO: generate lambda-bins on the fly based on the dynamic range of the data (i.e. maximum pixel count)
+    lambda_bin_fdr : float
+        False discovery rate (FDR) for the BH-FDR multiple hypothesis testing procedure, applied per lambda-bin.
+    clustering_radius : None | int
+        Cluster enriched pixels with a given radius. "Brightest" pixels in each group
+        will be reported as the final dot-calls. If None, no clustering is performed.
+    cluster_filtering : bool
+        Whether to apply additional filtering to centroids after clustering, using cluster_filtering_hiccups().
+    tile_size : int
+        Tile size for the Hi-C heatmap tiling. Typically on the order of several megabases,
+        and <= max_loci_separation. Controls the tradeoff between memory consumption and speed of execution.
+    nproc : int
+        Number of processes to use for multiprocessing.
+
+    Returns
+    -------
+    dots : pandas.DataFrame
+        BEDPE-style DataFrame with genomic coordinates of called dots and additional annotations.
+
+    Notes
+    -----
+    'clustering_radius' in the Birch clustering algorithm corresponds to
+    double the clustering radius in the "greedy" clustering used in HiCCUPS
+    (to be tested).
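+
+    A minimal usage sketch (assuming `clr` is a balanced cooler and `expected_df`
+    is a matching cis expected table; parameter values below are illustrative,
+    not recommendations)::
+
+        from cooltools.api.dotfinder import dots
+
+        dot_calls_df = dots(
+            clr,
+            expected_df,
+            view_df=None,             # use all chromosomes from the cooler
+            kernels=None,             # recommend default HiCCUPS-style kernels
+            max_loci_separation=10_000_000,
+            clustering_radius=20_000,
+            nproc=4,
+        )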
+ + TODO describe sequence of processing steps + + """ + + #### Generate viewframes #### + if view_df is None: + view_df = make_cooler_view(clr) + else: + try: + _ = is_compatible_viewframe( + view_df, + clr, + check_sorting=True, + raise_errors=True, + ) + except Exception as e: + raise ValueError("view_df is not a valid viewframe or incompatible") from e + + # check balancing status + if clr_weight_name: + # check if cooler is balanced + try: + _ = is_cooler_balanced(clr, clr_weight_name, raise_errors=True) + except Exception as e: + raise ValueError( + f"provided cooler is not balanced or {clr_weight_name} is missing" + ) from e + else: + raise ValueError("calling dots on raw data is not supported.") + + # add checks to make sure cis-expected is symmetric + # make sure provided expected is compatible + try: + _ = is_valid_expected( + expected, + "cis", + view_df, + verify_cooler=clr, + expected_value_cols=[ + expected_value_col, + ], + raise_errors=True, + ) + except Exception as e: + raise ValueError("provided expected is not compatible") from e + + # Prepare some parameters. + binsize = clr.binsize + loci_separation_bins = bp_to_bins(max_loci_separation, binsize) + tile_size_bins = bp_to_bins(tile_size, binsize) + + # verify provided kernels or recommend them (HiCCUPS)... + if kernels and is_compatible_kernels(kernels, binsize, max_nans_tolerated): + warnings.warn( + "Compatibility checks for 'kernels' are not fully implemented yet, use at your own risk" + ) + else: + # recommend them (default hiccups ones for now) + kernels = recommend_kernels(binsize) + # deduce kernel_width - overall footprint + kernel_width = max(len(k) for k in kernels.values()) # 2*w+1 + kernel_half_width = int((kernel_width - 1) / 2) # former w parameter + + # try to guess required lambda bins using "max" value of pixel counts + # statistical: lambda-binning edges ... + if not 40 <= n_lambda_bins <= 50: + raise ValueError(f"Incompatible n_lambda_bins={n_lambda_bins}") + BASE = 2 ** (1 / 3) # very arbitrary - parameterize ! + ledges = np.concatenate( + ( + [-np.inf], + np.logspace( + 0, + n_lambda_bins - 1, + num=n_lambda_bins, + base=BASE, + dtype=np.float64, + ), + [np.inf], + ) + ) + + # list of tile coordinate ranges + tiles = list( + generate_tiles_diag_band( + clr, view_df, kernel_half_width, tile_size_bins, loci_separation_bins + ) + ) + + # 1. Calculate genome-wide histograms of scores. + gw_hist = scoring_and_histogramming_step( + clr, + expected.set_index(["region1", "region2", "dist"]), + expected_value_col=expected_value_col, + clr_weight_name=clr_weight_name, + tiles=tiles, + kernels=kernels, + ledges=ledges, + max_nans_tolerated=max_nans_tolerated, + loci_separation_bins=loci_separation_bins, + nproc=nproc, + ) + + logging.info("Done building histograms ...") + + # 2. Determine the FDR thresholds. + threshold_df, qvalues = determine_thresholds(gw_hist, lambda_bin_fdr) + + # 3. Filter using FDR thresholds calculated in the histogramming step + filtered_pixels = scoring_and_extraction_step( + clr, + expected.set_index(["region1", "region2", "dist"]), + expected_value_col=expected_value_col, + clr_weight_name=clr_weight_name, + tiles=tiles, + kernels=kernels, + ledges=ledges, + thresholds=threshold_df, + max_nans_tolerated=max_nans_tolerated, + loci_separation_bins=loci_separation_bins, + nproc=nproc, + bin1_id_name="bin1_id", + bin2_id_name="bin2_id", + ) + + # 4. 
Post-processing + logging.info(f"Begin post-processing of {len(filtered_pixels)} filtered pixels") + logging.info("preparing to extract needed q-values ...") + + # annotate enriched pixels + filtered_pixels_qvals = annotate_pixels_with_qvalues(filtered_pixels, qvalues) + filtered_pixels_annotated = cooler.annotate(filtered_pixels_qvals, clr.bins()[:]) + if not clustering_radius: + # TODO: make sure returned DataFrame has the same columns as "postprocessed_calls" + # columns to return before-clustering + output_cols = [] + output_cols += bedpe_required_cols + output_cols += [ + observed_count_name, + ] + output_cols += [f"la_exp.{k}.value" for k in kernels] + output_cols += [f"la_exp.{k}.qval" for k in kernels] + return filtered_pixels_annotated[output_cols] + + # 4a. clustering + # Clustering is done independently for every region, therefore regions must be assigned: + filtered_pixels_annotated = assign_regions(filtered_pixels_annotated, view_df) + centroids = clustering_step( + filtered_pixels_annotated, + clustering_radius, + ).reset_index(drop=True) + + # columns to return post-clustering + output_cols = [] + output_cols += bedpe_required_cols + output_cols += [ + "cstart1", + "cstart2", + "c_label", + "c_size", + observed_count_name, + ] + output_cols += [f"la_exp.{k}.value" for k in kernels] + output_cols += [f"la_exp.{k}.qval" for k in kernels] + + # 4b. filter by enrichment and qval + if cluster_filtering is None: # default - engage HiCCUPS filtering + postprocessed_calls = cluster_filtering_hiccups(centroids) + elif not cluster_filtering: + postprocessed_calls = centroids + + return postprocessed_calls diff --git a/cooltools/cli/dots.py b/cooltools/cli/dots.py index afe07266..ea7e9021 100644 --- a/cooltools/cli/dots.py +++ b/cooltools/cli/dots.py @@ -13,7 +13,6 @@ from ..lib.common import make_cooler_view, assign_regions from ..lib.io import read_viewframe_from_file, read_expected_from_file - from .util import validate_csv logging.basicConfig(level=logging.INFO) @@ -86,21 +85,7 @@ show_default=True, ) @click.option( - "--kernel-width", - help="Outer half-width of the convolution kernel in pixels" - " e.g. outer size (w) of the 'donut' kernel, with the 2*w+1" - " overall footprint of the 'donut'.", - type=int, -) -@click.option( - "--kernel-peak", - help="Inner half-width of the convolution kernel in pixels" - " e.g. inner size (p) of the 'donut' kernel, with the 2*p+1" - " overall footprint of the punch-hole.", - type=int, -) -@click.option( - "--num-lambda-chunks", + "--num-lambda-bins", help="Number of log-spaced bins to divide your adjusted expected" " between. Same as HiCCUPS_W1_MAX_INDX (40) in the original HiCCUPS.", type=int, @@ -116,7 +101,7 @@ show_default=True, ) @click.option( - "--dots-clustering-radius", + "--clustering-radius", help="Radius for clustering dots that have been called too close to each other." 
"Typically on order of 40 kilo-bases, and >= binsize.", type=int, @@ -128,10 +113,8 @@ ) @click.option( "-o", - "--out-prefix", - help="Specify prefix for the output file, to store results of dot-calling:" - " all enriched pixels as prefix + '.enriched.tsv'," - " and post-processed dots (clustered,filtered) as prefix + '.postproc.bedpe'", + "--output", + help="Specify output file name to store called dots in a BEDPE-like format", type=str, required=True, ) @@ -144,13 +127,11 @@ def dots( max_loci_separation, max_nans_tolerated, tile_size, - kernel_width, - kernel_peak, - num_lambda_chunks, + num_lambda_bins, fdr, - dots_clustering_radius, + clustering_radius, verbose, - out_prefix, + output, ): """ Call dots on a Hi-C heatmap that are not larger than max_loci_separation. @@ -176,167 +157,44 @@ def dots( clr = cooler.Cooler(cool_path) expected_path, expected_value_col = expected_path - #### Generate viewframes #### - # 1:cooler_view_df. Generate viewframe from clr.chromsizes: - cooler_view_df = make_cooler_view(clr) - - # 2:view_df. Define global view for calculating calling dots - # use input "view" BED file or all chromosomes : + # Either use view from file or all chromosomes in the provided cooler if view is None: - view_df = cooler_view_df + view_df = make_cooler_view(clr) else: view_df = read_viewframe_from_file(view, clr, check_sorting=True) - #### Read expected: #### - expected_summary_cols = [ - expected_value_col, - ] expected = read_expected_from_file( expected_path, contact_type="cis", - expected_value_cols=expected_summary_cols, + expected_value_cols=[expected_value_col], verify_view=view_df, verify_cooler=clr, ) - # add checks to make sure cis-expected is symmetric - - # Prepare some parameters. - binsize = clr.binsize - loci_separation_bins = int(max_loci_separation / binsize) - tile_size_bins = int(tile_size / binsize) - balance_factor = 1.0 # clr._load_attrs("bins/weight")["scale"] - - # clustering would deal with bases-units for now, so supress this for now - # clustering_radius_bins = int(dots_clustering_radius/binsize) - - # kernels - # 'upright' is a symmetrical inversion of "lowleft", not needed. - ktypes = ["donut", "vertical", "horizontal", "lowleft"] - - if (kernel_width is None) or (kernel_peak is None): - w, p = api.dotfinder.recommend_kernel_params(binsize) - logging.info( - f"Using kernel parameters w={w}, p={p} recommended for binsize {binsize}" - ) - else: - w, p = kernel_width, kernel_peak - # add some sanity check for w,p: - if not w > p: - raise ValueError(f"Wrong inner/outer kernel parameters w={w}, p={p}") - logging.info(f"Using kernel parameters w={w}, p={p} provided by user") - - # once kernel parameters are setup check max_nans_tolerated - # to make sure kernel footprints overlaping 1 side with the - # NaNs filled row/column are not "allowed" - # this requires dynamic adjustment for the "shrinking donut" - if not max_nans_tolerated <= 2 * w: - raise ValueError("Too many NaNs allowed!") - # may lead to scoring the same pixel twice, - i.e. duplicates. - - # generate standard kernels - consider providing custom ones - kernels = {k: api.dotfinder.get_kernel(w, p, k) for k in ktypes} - - # list of tile coordinate ranges - tiles = list( - api.dotfinder.heatmap_tiles_generator_diag( - clr, view_df, w, tile_size_bins, loci_separation_bins - ) - ) - # lambda-chunking edges ... 
- if not 40 <= num_lambda_chunks <= 50: - raise ValueError("Incompatible num_lambda_chunks") - base = 2 ** (1 / 3) - ledges = np.concatenate( - ( - [-np.inf], - np.logspace( - 0, - num_lambda_chunks - 1, - num=num_lambda_chunks, - base=base, - dtype=np.float64, - ), - [np.inf], - ) - ) - - # 1. Calculate genome-wide histograms of scores. - gw_hist = api.dotfinder.scoring_and_histogramming_step( - clr, - expected.set_index(["region1", "region2", "dist"]), - expected_value_col, - clr_weight_name, - tiles, - kernels, - ledges, - max_nans_tolerated, - loci_separation_bins, - nproc, - verbose, - ) - - if verbose: - logging.info("Done building histograms ...") - - # 2. Determine the FDR thresholds. - threshold_df, qvalues = api.dotfinder.determine_thresholds( - kernels, ledges, gw_hist, fdr - ) - - # 3. Filter using FDR thresholds calculated in the histogramming step - filtered_pixels = api.dotfinder.scoring_and_extraction_step( + dot_calls_df = api.dotfinder.dots( clr, - expected.set_index(["region1", "region2", "dist"]), - expected_value_col, - clr_weight_name, - tiles, - kernels, - ledges, - threshold_df, - max_nans_tolerated, - balance_factor, - loci_separation_bins, - op.join(op.dirname(out_prefix), op.basename(out_prefix) + ".enriched.tsv"), - nproc, - verbose, - bin1_id_name="bin1_id", - bin2_id_name="bin2_id", + expected, + expected_value_col=expected_value_col, + clr_weight_name=clr_weight_name, + view_df=view_df, + kernels=None, # engaging default HiCCUPS kernels + max_loci_separation=max_loci_separation, + max_nans_tolerated=max_nans_tolerated, # test if this has desired behavior + n_lambda_bins=num_lambda_bins, # update this eventually + lambda_bin_fdr=fdr, + clustering_radius=clustering_radius, + cluster_filtering=None, + tile_size=tile_size, + nproc=nproc, ) - # 4. Post-processing - if verbose: - logging.info(f"Begin post-processing of {len(filtered_pixels)} filtered pixels") - logging.info("preparing to extract needed q-values ...") - - filtered_pixels_qvals = api.dotfinder.annotate_pixels_with_qvalues( - filtered_pixels, qvalues, kernels - ) - # 4a. clustering - ######################################################################## - # Clustering has to be done using annotated DataFrame of filtered pixels - # why ? - because - clustering has to be done independently for every region! - ######################################################################## - filtered_pixels_annotated = cooler.annotate(filtered_pixels_qvals, clr.bins()[:]) - filtered_pixels_annotated = assign_regions(filtered_pixels_annotated, view_df) - # consider reseting index here - centroids = api.dotfinder.clustering_step( - filtered_pixels_annotated, - view_df["name"], - dots_clustering_radius, - verbose, - ) - - # 4b. 
filter by enrichment and qval - postprocessed_calls = api.dotfinder.thresholding_step(centroids) - - # Final-postprocessed result - if out_prefix is not None: - - postprocessed_fname = op.join( - op.dirname(out_prefix), op.basename(out_prefix) + ".postproc.bedpe" - ) - - postprocessed_calls.to_csv( - postprocessed_fname, sep="\t", header=True, index=False, compression=None + # output results in a file, when specified + if output: + dot_calls_df.to_csv(output, sep="\t", header=True, index=False, na_rep="nan") + # or print into stdout otherwise: + else: + print( + dot_calls_df.to_csv( + output, sep="\t", header=True, index=False, na_rep="nan" + ) ) diff --git a/cooltools/cli/saddle.py b/cooltools/cli/saddle.py index b593b891..c34fddd3 100644 --- a/cooltools/cli/saddle.py +++ b/cooltools/cli/saddle.py @@ -218,7 +218,7 @@ def saddle( track_columns = ["chrom", "start", "end", track_name] # specify dtype as a rudimentary form of validation: track_dtype = { - "chrom": np.str, + "chrom": np.str_, "start": np.int64, "end": np.int64, track_name: np.float64, diff --git a/cooltools/lib/_numutils.pyx b/cooltools/lib/_numutils.pyx index 9597d0f5..5554476e 100644 --- a/cooltools/lib/_numutils.pyx +++ b/cooltools/lib/_numutils.pyx @@ -124,6 +124,15 @@ def iterative_correction_symmetric( tolerance : float If less or equal to zero, will perform max_iter iterations. + Returns + ------- + _x : np.ndarray + Corrected matrix + totalBias : np.ndarray + Vector with corrections biases + report : (bool, int) + A tuple that reports convergence + status and used number of iterations """ cdef int N = len(x) @@ -168,7 +177,7 @@ def iterative_correction_symmetric( corr = totalBias[s0!=0].mean() #mean correction factor x = x * corr * corr #renormalizing everything totalBias /= corr - report = {'converged':converged, 'iternum':iternum} + report = (converged, iternum) return x, totalBias, report diff --git a/cooltools/lib/checks.py b/cooltools/lib/checks.py index 3f5f502f..f95f514c 100644 --- a/cooltools/lib/checks.py +++ b/cooltools/lib/checks.py @@ -39,9 +39,9 @@ def _is_expected( Check if a expected_df looks like an expected DataFrame, i.e.: - has neccessary columns - - there are no Nulls in regions1/2, diag - - every trans region1/2 has a single value - - every cis region1/2 has at least one value + - there are no Nulls in the regions1, regions2, diag columns + - every trans pair region1 region2 has a single value + - every cis pair region1, region2 has at least one value Parameters ---------- @@ -72,7 +72,7 @@ def _is_expected( # that's what we expect as column names: expected_columns = [col for col in expected_dtypes] - # separate "structural" columns: region1/2 and diag if "cis": + # separate "structural" columns: region1, region2, diag if "cis": grouping_columns = expected_columns[:-1] # add columns with values and their dtype (float64): @@ -124,8 +124,8 @@ def _is_expected( if expected_df.duplicated(subset=grouping_columns).any(): raise ValueError(f"Values in {grouping_columns} columns must be unique") - # make sure region1/2 groups have 1 value for trans contacts - # and more than 1 values for cis contacts + # for trans expected, ensure pairs region1, region2 have 1 value + # for cis expected, ensure pairs region1, region2 have >=1 value region1_col, region2_col = grouping_columns[:2] for (r1, r2), df in expected_df.groupby([region1_col, region2_col]): if contact_type == "trans": @@ -174,10 +174,12 @@ def _is_compatible_cis_expected( raise_errors=False, ): """ - Verify expected_df to make sure it is compatible - 
with its view (viewframe) and cooler, i.e.: - - regions1/2 are matching names from view - - number of diagonals per region1/2 matches cooler + Verify expected_df to make sure it is compatible. + + Expected tables can be verified against both + a view and a cooler. This includes ensuring: + - the entries in columns region1, region2 match names from view + - number of diagonals per pair of region1, region2 matches cooler Parameters ---------- @@ -267,8 +269,7 @@ def _is_compatible_trans_expected( """ Verify expected_df to make sure it is compatible with its view (viewframe) and cooler, i.e.: - - regions1/2 are matching names from view - - number of diagonals per region1/2 matches cooler + - entries in region1 and region2 match names from view Parameters ---------- diff --git a/cooltools/lib/numutils.py b/cooltools/lib/numutils.py index 7a4e7763..5d35c066 100644 --- a/cooltools/lib/numutils.py +++ b/cooltools/lib/numutils.py @@ -2,8 +2,7 @@ from scipy.linalg import toeplitz import scipy.sparse.linalg import scipy.interpolate -from scipy.ndimage.interpolation import zoom -import scipy.ndimage.filters +from scipy.ndimage import zoom, gaussian_filter1d import numpy as np import numba import cooler @@ -668,6 +667,15 @@ def iterative_correction_symmetric( tol : float If less or equal to zero, will perform max_iter iterations. + Returns + ------- + _x : np.ndarray + Corrected matrix + totalBias : np.ndarray + Vector with corrections biases + report : (bool, int) + A tuple that reports convergence + status and used number of iterations """ N = len(x) @@ -710,7 +718,7 @@ def iterative_correction_symmetric( corr = totalBias[~mask].mean() # mean correction factor _x = _x * corr * corr # renormalizing everything totalBias /= corr - report = {"converged": converged, "iternum": iternum} + report = (converged, iternum) return _x, totalBias, report @@ -729,6 +737,18 @@ def iterative_correction_asymmetric(x, max_iter=1000, tol=1e-5, verbose=False): The number of diagonals to ignore during iterative correction. tol : float If less or equal to zero, will perform max_iter iterations. 
+ + Returns + ------- + _x : np.ndarray + Corrected matrix + totalBias : np.ndarray + Vector with corrections biases for columns + totalBias2 : np.ndarray + Vector with corrections biases for rows + report : (bool, int) + A tuple that reports convergence + status and used number of iterations """ N2, N = x.shape _x = x.copy() @@ -777,7 +797,7 @@ def iterative_correction_asymmetric(x, max_iter=1000, tol=1e-5, verbose=False): _x = _x * corr * corr2 # renormalizing everything totalBias /= corr totalBias2 /= corr2 - report = {"converged": converged, "iternum": iternum} + report = (converged, iternum) return _x, totalBias, totalBias2, report @@ -1380,7 +1400,7 @@ def _expand(ar, counts=None): def robust_gauss_filter( - ar, sigma=2, functon=scipy.ndimage.filters.gaussian_filter1d, kwargs=None + ar, sigma=2, functon=gaussian_filter1d, kwargs=None ): """ Implements an edge-handling mode for gaussian filter that basically ignores diff --git a/cooltools/sandbox/expected_smoothing.py b/cooltools/sandbox/expected_smoothing.py index 7eaae476..630675cc 100644 --- a/cooltools/sandbox/expected_smoothing.py +++ b/cooltools/sandbox/expected_smoothing.py @@ -1,4 +1,4 @@ -import collections +from collections.abc import Iterable import numpy as np import numba @@ -276,7 +276,7 @@ def agg_smooth_cvd( groupby = [] elif isinstance(groupby, str): groupby = [groupby] - elif isinstance(groupby, collections.Iterable): + elif isinstance(groupby, Iterable): groupby = list(groupby) else: raise ValueError("groupby must be a string, a list of strings, or None") diff --git a/tests/test_call-dots.py b/tests/test_call-dots.py index d3b83f41..e1dec94f 100644 --- a/tests/test_call-dots.py +++ b/tests/test_call-dots.py @@ -2,78 +2,57 @@ from click.testing import CliRunner from cooltools.cli import cli +import cooler +import numpy as np +from cooltools import api +from cooltools.lib.io import read_viewframe_from_file, read_expected_from_file -def test_call_dots_cli(request, tmpdir): - in_cool = op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool") - in_exp = op.join(request.fspath.dirname, "data/CN.mm9.toy_expected.chromnamed.tsv") - out_dots = op.join(tmpdir, "test.dots") - - runner = CliRunner() - result = runner.invoke( - cli, - [ - "dots", - "-p", - 1, - "--kernel-width", - 2, - "--kernel-peak", - 1, - "--tile-size", - 60_000_000, - "--max-loci-separation", - 100_000_000, - "--out-prefix", - out_dots, - in_cool, - in_exp, - ], - ) - # This command should fail because viewframe interpreted from cooler does not correspond to toy_expected: - assert result.exit_code == 1 - - -def test_call_dots_view_cli(request, tmpdir): +# test user-facing API for calling dots +def test_dots(request): # Note that call-dots requires ucsc named expected and view in_cool = op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool") in_exp = op.join(request.fspath.dirname, "data/CN.mm9.toy_expected.tsv") in_regions = op.join(request.fspath.dirname, "data/CN.mm9.toy_regions.bed") - out_dots = op.join(tmpdir, "test.dots") - runner = CliRunner() - cmd = [ - "dots", - "--view", - in_regions, - "-p", - 1, - "--kernel-width", - 2, - "--kernel-peak", - 1, - "--tile-size", - 60_000_000, - "--max-loci-separation", - 100_000_000, - "--out-prefix", - out_dots, - in_cool, + # read data for the test: + clr = cooler.Cooler(in_cool) + view_df = read_viewframe_from_file(in_regions, clr, check_sorting=True) + expected_df = read_expected_from_file( in_exp, - ] - result = runner.invoke(cli, cmd) - assert result.exit_code == 0 - # make sure output is generated: - 
assert op.isfile(f"{out_dots}.enriched.tsv") - assert op.isfile(f"{out_dots}.postproc.bedpe") + expected_value_cols=["balanced.avg"], + verify_view=view_df, + verify_cooler=clr, + ) + # generate dot-calls + dot_calls_df = api.dotfinder.dots( + clr, + expected_df, + view_df=view_df, + kernels={ + "d": np.array([[1, 0, 1], [0, 0, 0], [1, 0, 1]]), + "v": np.array([[0, 1, 0], [0, 0, 0], [0, 1, 0]]), + "h": np.array([[0, 0, 0], [1, 0, 1], [0, 0, 0]]), + }, + max_loci_separation=100_000_000, + max_nans_tolerated=1, + n_lambda_bins=50, + lambda_bin_fdr=0.1, + clustering_radius=False, + cluster_filtering=None, + tile_size=50_000_000, + nproc=1, + ) -# TODO: Remove this test once "regions" are deprecated altogether: -def test_call_dots_regions_deprecated_cli(request, tmpdir): - # Note that call-dots requires ucsc named expected and view + # no comparison with reference results yet + # just checking if it runs without errors + assert not dot_calls_df.empty + + +def test_call_dots_cli(request, tmpdir): in_cool = op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool") - in_exp = op.join(request.fspath.dirname, "data/CN.mm9.toy_expected.tsv") - in_regions = op.join(request.fspath.dirname, "data/CN.mm9.toy_regions.bed") + in_exp = op.join(request.fspath.dirname, "data/CN.mm9.toy_expected.chromnamed.tsv") out_dots = op.join(tmpdir, "test.dots") runner = CliRunner() @@ -81,25 +60,48 @@ def test_call_dots_regions_deprecated_cli(request, tmpdir): cli, [ "dots", - "--regions", - in_regions, "-p", 1, - "--kernel-width", - 2, - "--kernel-peak", - 1, "--tile-size", 60_000_000, "--max-loci-separation", 100_000_000, - "--out-prefix", + "--output", out_dots, in_cool, in_exp, ], ) - assert result.exit_code == 0 - # make sure output is generated: - assert op.isfile(f"{out_dots}.enriched.tsv") - assert op.isfile(f"{out_dots}.postproc.bedpe") + # This command should fail because viewframe interpreted from cooler does not correspond to toy_expected: + assert result.exit_code == 1 + + +# comment this test for now, until we swap out input data and/or allow for custom kernels + +# def test_call_dots_view_cli(request, tmpdir): +# # Note that call-dots requires ucsc named expected and view +# in_cool = op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool") +# in_exp = op.join(request.fspath.dirname, "data/CN.mm9.toy_expected.tsv") +# in_regions = op.join(request.fspath.dirname, "data/CN.mm9.toy_regions.bed") +# out_dots = op.join(tmpdir, "test.dots") + +# runner = CliRunner() +# cmd = [ +# "dots", +# "--view", +# in_regions, +# "-p", +# 1, +# "--tile-size", +# 60_000_000, +# "--max-loci-separation", +# 100_000_000, +# "--output", +# out_dots, +# in_cool, +# in_exp, +# ] +# result = runner.invoke(cli, cmd) +# assert result.exit_code == 0 +# # make sure output is generated: +# assert op.isfile(out_dots) diff --git a/tests/test_dotfinder_chunking.py b/tests/test_dotfinder_chunking.py index d5ca1511..554dfc4c 100644 --- a/tests/test_dotfinder_chunking.py +++ b/tests/test_dotfinder_chunking.py @@ -1,13 +1,15 @@ -# create a test for the chunking versions of 'get_adjusted_expected_tile_some_nans': +# create a test for square_tiling and get_adjusted_expected_tile_some_nans import numpy as np import pandas as pd import os.path as op -from cooltools.api import dotfinder from cooltools.lib.numutils import LazyToeplitz - +from cooltools.api.dotfinder import ( + tile_square_matrix, + get_adjusted_expected_tile_some_nans, +) # adjust the path for data: testdir = op.realpath(op.dirname(__file__)) @@ -16,6 +18,8 @@ mock_input = 
op.join(testdir, "data", "dotfinder_mock_inputs.npz") mock_result = op.join(testdir, "data", "dotfinder_mock_res.csv.gz") +# load mock results: +mock_res = pd.read_csv(mock_result).rename(columns={"row": "bin1_id", "col": "bin2_id"}) # load bunch of array from a numpy npz container: arrays_loaded = np.load(mock_input) @@ -29,11 +33,9 @@ # 1D expected extracted for tiling-tests: mock_exp = LazyToeplitz(mock_E_ice[0, :]) -# we need w-edge for tiling procedures: -w = 3 -# p = 1 -# kernel type: 'donut' -# # just a simple donut kernel for testing: +# we need kernel_half_width-edge for tiling procedures: +kernel_half_width = 3 +# just a simple donut kernel for testing: kernel = np.array( [ [1, 1, 1, 0, 1, 1, 1], @@ -45,20 +47,70 @@ [1, 1, 1, 0, 1, 1, 1], ] ) -# bin size: -# mock data were extracted -# from 20kb matrix: +# mock bin size 20kb b = 20000 # 2MB aroud diagonal to look at: band = 2e6 -# 1MB aroud diagonal to look at: -band_1 = 1e6 # start, stop for tiling procedures: start, stop = 0, len(mock_M_raw) -# load mock results: -mock_res = pd.read_csv(mock_result).rename(columns={"row": "bin1_id", "col": "bin2_id"}) + +################################################################# +# test 'get_adjusted_expected_tile_some_nans' without chunking : +################################################################# +def test_adjusted_expected_tile_some_nans(): + print("Running tile some nans la_exp test") + # first, generate that locally-adjusted expected: + res = get_adjusted_expected_tile_some_nans( + origin_ij=(0, 0), + observed=mock_M_raw, # should be RAW ... + expected=mock_E_ice, + bal_weights=mock_v_ice, + kernels={"donut": kernel, "footprint": np.ones_like(kernel)}, + ) + # mock results are supposedly for + # the contacts closer than the band + # nucleotides to each other... + # we want to retire that functional + # from the 'get_adjusted_expected_tile_some_nans' + # so we should complement that selection + # here: + nnans = 1 # no nans tolerated + band_idx = int(band / b) + is_inside_band = res["bin1_id"] > (res["bin2_id"] - band_idx) + does_comply_nans = res["la_exp." + "footprint" + ".nnans"] < nnans + # so, selecting inside band and nNaNs compliant results: + res = res[is_inside_band & does_comply_nans].reset_index(drop=True) + # + # ACTUAL TESTS: + # integer part of DataFrame must equals exactly: + assert res[["bin1_id", "bin2_id"]].equals(mock_res[["bin1_id", "bin2_id"]]) + # compare floating point part separately: + assert np.allclose( + res["la_exp." + "donut" + ".value"], mock_res["la_expected"], equal_nan=True + ) + # + # try recovering more NaNs ... + # (allowing 1 here per pixel's footprint)... + res = get_adjusted_expected_tile_some_nans( + origin_ij=(0, 0), + observed=mock_M_raw, # should be RAW... + expected=mock_E_ice, + bal_weights=mock_v_ice, + kernels={"donut": kernel, "footprint": np.ones_like(kernel)}, + ) + # post-factum filtering resides + # outside of get_la_exp now: + nnans = 2 # just 1 nan tolerated + band_idx = int(band / b) + is_inside_band = res["bin1_id"] > (res["bin2_id"] - band_idx) + does_comply_nans = res["la_exp." 
+ "footprint" + ".nnans"] < nnans + # so, selecting inside band and comply nNaNs results only: + res = res[is_inside_band & does_comply_nans].reset_index(drop=True) + + # now we can only guess the size: + assert len(res) > len(mock_res) def test_adjusted_expected_tile_some_nans_and_square_tiling(): @@ -66,9 +118,9 @@ def test_adjusted_expected_tile_some_nans_and_square_tiling(): # first, generate that locally-adjusted expected: nnans = 1 band_idx = int(band / b) - res_df = pd.DataFrame([]) - for tilei, tilej in dotfinder.square_matrix_tiling( - start, stop, step=40, edge=w, square=False + res_ij = [] + for tilei, tilej in tile_square_matrix( + stop - start, start, tile_size=40, pad=kernel_half_width ): # define origin: origin = (tilei[0], tilej[0]) @@ -80,26 +132,23 @@ def test_adjusted_expected_tile_some_nans_and_square_tiling(): ice_weight_i = mock_v_ice[slice(*tilei)] ice_weight_j = mock_v_ice[slice(*tilej)] # that's the main working function from dotfinder: - res = dotfinder.get_adjusted_expected_tile_some_nans( - origin=origin, + res = get_adjusted_expected_tile_some_nans( + origin_ij=origin, observed=observed, expected=expected, bal_weights=(ice_weight_i, ice_weight_j), kernels={"donut": kernel, "footprint": np.ones_like(kernel)}, - # nan_threshold=1, - verbose=False, ) is_inside_band = res["bin1_id"] > (res["bin2_id"] - band_idx) # new style, selecting good guys: does_comply_nans = res["la_exp." + "footprint" + ".nnans"] < nnans # so, select inside band and nNaNs compliant results and append: - res_df = res_df.append( - res[is_inside_band & does_comply_nans], ignore_index=True - ) + res_ij.append(res[is_inside_band & does_comply_nans]) # drop dups (from overlaping tiles), sort and reset index: res_df = ( - res_df.drop_duplicates() + pd.concat(res_ij, ignore_index=True) + .drop_duplicates() .sort_values(by=["bin1_id", "bin2_id"]) .reset_index(drop=True) ) @@ -118,20 +167,20 @@ def test_adjusted_expected_tile_some_nans_and_square_tiling(): mock_res_sorted[["bin1_id", "bin2_id"]] ) # compare floating point part separately: - assert np.isclose( + assert np.allclose( res_df["la_exp." + "donut" + ".value"], mock_res_sorted["la_expected"], equal_nan=True, - ).all() + ) def test_adjusted_expected_tile_some_nans_and_square_tiling_diag_band(): print("Running tile some nans la_exp test + square tiling + diag band") # # Essentially, testing this function: - # def chrom_chunk_generator_s(chroms, w, band): + # def chrom_chunk_generator_s(chroms, kernel_half_width, band): # for chrom in chroms: # chr_start, chr_stop = the_c.extent(chrom) - # for tilei, tilej in square_matrix_tiling(chr_start, chr_stop, step, w): + # for tilei, tilej in square_matrix_tiling(chr_start, chr_stop, step, kernel_half_width): # # check if a given tile intersects with # # with the diagonal band of interest ... # diag_from = tilej[0] - tilei[1] @@ -141,14 +190,14 @@ def test_adjusted_expected_tile_some_nans_and_square_tiling_diag_band(): # band_to = band # # we are using this >2w trick to exclude # # tiles from the lower triangle from calculations ... 
- # if (min(band_to,diag_to) - max(band_from,diag_from)) > 2*w: + # if (min(band_to,diag_to) - max(band_from,diag_from)) > 2*kernel_half_width: # yield chrom, tilei, tilej # first, generate that locally-adjusted expected: nnans = 1 band_idx = int(band / b) - res_df = pd.DataFrame([]) - for tilei, tilej in dotfinder.square_matrix_tiling( - start, stop, step=40, edge=w, square=False + res_ij = [] + for tilei, tilej in tile_square_matrix( + stop - start, start, tile_size=40, pad=kernel_half_width ): # check if a given tile intersects with # with the diagonal band of interest ... @@ -159,7 +208,7 @@ def test_adjusted_expected_tile_some_nans_and_square_tiling_diag_band(): band_to = band_idx # we are using this >2w trick to exclude # tiles from the lower triangle from calculations ... - if (min(band_to, diag_to) - max(band_from, diag_from)) > 2 * w: + if (min(band_to, diag_to) - max(band_from, diag_from)) > 2 * kernel_half_width: # define origin: origin = (tilei[0], tilej[0]) # RAW observed matrix slice: @@ -170,26 +219,23 @@ def test_adjusted_expected_tile_some_nans_and_square_tiling_diag_band(): ice_weight_i = mock_v_ice[slice(*tilei)] ice_weight_j = mock_v_ice[slice(*tilej)] # that's the main working function from dotfinder: - res = dotfinder.get_adjusted_expected_tile_some_nans( - origin=origin, + res = get_adjusted_expected_tile_some_nans( + origin_ij=origin, observed=observed, expected=expected, bal_weights=(ice_weight_i, ice_weight_j), kernels={"donut": kernel, "footprint": np.ones_like(kernel)}, - # nan_threshold=1, - verbose=False, ) is_inside_band = res["bin1_id"] > (res["bin2_id"] - band_idx) # new style, selecting good guys: does_comply_nans = res["la_exp." + "footprint" + ".nnans"] < nnans # so, select inside band and nNaNs compliant results and append: - res_df = res_df.append( - res[is_inside_band & does_comply_nans], ignore_index=True - ) + res_ij.append(res[is_inside_band & does_comply_nans]) # sort and reset index, there shouldn't be any duplicates now: res_df = ( - res_df.drop_duplicates() + pd.concat(res_ij, ignore_index=True) + .drop_duplicates() .sort_values(by=["bin1_id", "bin2_id"]) .reset_index(drop=True) ) @@ -208,8 +254,8 @@ def test_adjusted_expected_tile_some_nans_and_square_tiling_diag_band(): mock_res_sorted[["bin1_id", "bin2_id"]] ) # compare floating point part separately: - assert np.isclose( + assert np.allclose( res_df["la_exp." + "donut" + ".value"], mock_res_sorted["la_expected"], equal_nan=True, - ).all() + ) diff --git a/tests/test_dotfinder_la_exp.py b/tests/test_dotfinder_la_exp.py deleted file mode 100644 index 5b7cf553..00000000 --- a/tests/test_dotfinder_la_exp.py +++ /dev/null @@ -1,113 +0,0 @@ -# create a test for get_locally_adjusted_expected using mock dense matrices ... -# testing la_exp functions on a single piece of matrix -# no chunking applied here ... 
- -import numpy as np -import pandas as pd - -import os.path as op - -# try importing stuff from dotfinder: -from cooltools.api.dotfinder import get_adjusted_expected_tile_some_nans - -# adjust the path for data: -testdir = op.realpath(op.dirname(__file__)) - -# mock input data location: -mock_input = op.join(testdir, "data", "dotfinder_mock_inputs.npz") -mock_result = op.join(testdir, "data", "dotfinder_mock_res.csv.gz") - -# load mock results: -mock_res = pd.read_csv(mock_result) -mock_res = mock_res.rename(columns={"row": "bin1_id", "col": "bin2_id"}) - -# load bunch of array from a numpy npz container: -arrays_loaded = np.load(mock_input) -# snippets of M_raw, M_ice, E_ice and v_ice are supposed -# to be there ... -mock_M_raw = arrays_loaded["mock_M_raw"] -mock_M_ice = arrays_loaded["mock_M_ice"] -mock_E_ice = arrays_loaded["mock_E_ice"] -mock_v_ice = arrays_loaded["mock_v_ice"] - - -# we need w-edge for tiling procedures: -# w = 3 -# p = 1 -# kernel type: 'donut' -# # just a simple donut kernel for testing: -kernel = np.array( - [ - [1, 1, 1, 0, 1, 1, 1], - [1, 1, 1, 0, 1, 1, 1], - [1, 1, 0, 0, 0, 1, 1], - [0, 0, 0, 0, 0, 0, 0], - [1, 1, 0, 0, 0, 1, 1], - [1, 1, 1, 0, 1, 1, 1], - [1, 1, 1, 0, 1, 1, 1], - ] -) -# bin size: -# mock data were extracted -# from 20kb matrix: -b = 20000 -# 2MB aroud diagonal to look at: -band = 2e6 - - -################################################################# -# main test function for new 'get_adjusted_expected_tile_some_nans': -################################################################# -def test_adjusted_expected_tile_some_nans(): - print("Running tile some nans la_exp test") - # first, generate that locally-adjusted expected: - # Ed_raw, mask_ndx, Cobs, Cexp, NN = - res = get_adjusted_expected_tile_some_nans( - origin=(0, 0), - observed=mock_M_raw, # should be RAW ... - expected=mock_E_ice, - bal_weights=mock_v_ice, - kernels={"donut": kernel, "footprint": np.ones_like(kernel)}, - ) - # mock results are supposedly for - # the contacts closer than the band - # nucleotides to each other... - # we want to retire that functional - # from the 'get_adjusted_expected_tile_some_nans' - # so we should complement that selection - # here: - nnans = 1 # no nans tolerated - band_idx = int(band / b) - is_inside_band = res["bin1_id"] > (res["bin2_id"] - band_idx) - does_comply_nans = res["la_exp." + "footprint" + ".nnans"] < nnans - # so, selecting inside band and nNaNs compliant results: - res = res[is_inside_band & does_comply_nans].reset_index(drop=True) - # - # ACTUAL TESTS: - # integer part of DataFrame must equals exactly: - assert res[["bin1_id", "bin2_id"]].equals(mock_res[["bin1_id", "bin2_id"]]) - # compare floating point part separately: - assert np.isclose( - res["la_exp." + "donut" + ".value"], mock_res["la_expected"], equal_nan=True - ).all() - # - # try recovering more NaNs ... - # (allowing 1 here per pixel's footprint)... - res = get_adjusted_expected_tile_some_nans( - origin=(0, 0), - observed=mock_M_raw, # should be RAW... - expected=mock_E_ice, - bal_weights=mock_v_ice, - kernels={"donut": kernel, "footprint": np.ones_like(kernel)}, - ) - # post-factum filtering resides - # outside of get_la_exp now: - nnans = 2 # just 1 nan tolerated - band_idx = int(band / b) - is_inside_band = res["bin1_id"] > (res["bin2_id"] - band_idx) - does_comply_nans = res["la_exp." 
+ "footprint" + ".nnans"] < nnans - # so, selecting inside band and comply nNaNs results only: - res = res[is_inside_band & does_comply_nans].reset_index(drop=True) - - # now we can only guess the size: - assert len(res) > len(mock_res) diff --git a/tests/test_dotfinder_stats.py b/tests/test_dotfinder_stats.py new file mode 100644 index 00000000..63cbb1a2 --- /dev/null +++ b/tests/test_dotfinder_stats.py @@ -0,0 +1,276 @@ +import numpy as np +import pandas as pd +from scipy.stats import poisson + +from cooltools.api.dotfinder import ( + histogram_scored_pixels, + determine_thresholds, + annotate_pixels_with_qvalues, + extract_scored_pixels, +) + + +# helper functions for BH-FDR copied from www.statsmodels.org +def _fdrcorrection(pvals, alpha=0.05): + """ + pvalue correction for false discovery rate. + + This covers Benjamini/Hochberg for independent or positively correlated tests. + + Parameters + ---------- + pvals : np.ndarray + Sorted set of p-values of the individual tests. + alpha : float, optional + Family-wise error rate. Defaults to ``0.05``. + + Returns + ------- + rejected : ndarray, bool + True if a hypothesis is rejected, False if not + pvalue-corrected : ndarray + pvalues adjusted for multiple hypothesis testing to limit FDR + + """ + + ntests = len(pvals) + # empirical Cumulative Distribution Function for pvals: + ecdffactor = np.arange(1, ntests + 1) / float(ntests) + + reject = pvals <= ecdffactor * alpha + if reject.any(): + rejectmax = max(np.nonzero(reject)[0]) + reject[:rejectmax] = True + + pvals_corrected_raw = pvals / ecdffactor + pvals_corrected = np.minimum.accumulate(pvals_corrected_raw[::-1])[::-1] + del pvals_corrected_raw + pvals_corrected[pvals_corrected > 1] = 1 + return reject, pvals_corrected + + +def multipletests(pvals, alpha=0.1, is_sorted=False): + """ + Test results and p-value correction for multiple tests + Using FDR Benjamini-Hochberg method (non-negative) + + Parameters + ---------- + pvals : array_like, 1-d + uncorrected p-values. Must be 1-dimensional. + alpha : float + FWER, family-wise error rate, e.g. 0.1 + is_sorted : bool + If False (default), the p_values will be sorted, but the corrected + pvalues are in the original order. If True, then it assumed that the + pvalues are already sorted in ascending order. + + Returns + ------- + reject : ndarray, boolean + true for hypothesis that can be rejected for given alpha + pvals_corrected : ndarray + p-values corrected for multiple tests + + Notes + ----- + the p-value correction is independent of the + alpha specified as argument. In these cases the corrected p-values + can also be compared with a different alpha + + All procedures that are included, control FWER or FDR in the independent + case, and most are robust in the positively correlated case. 
+ """ + pvals = np.asarray(pvals) + + if not is_sorted: + sortind = np.argsort(pvals) + pvals = np.take(pvals, sortind) + + reject, pvals_corrected = _fdrcorrection(pvals, alpha=alpha) + + if is_sorted: + return reject, pvals_corrected + else: + pvals_corrected_ = np.empty_like(pvals_corrected) + pvals_corrected_[sortind] = pvals_corrected + del pvals_corrected + reject_ = np.empty_like(reject) + reject_[sortind] = reject + return reject_, pvals_corrected_ + + +# mock input data to perform some p-value calculation and correction on +num_pixels = 2500 +max_value = 99 +# fake kernels just for the sake of their names 'd' and 'v': +fake_kernels = { + "d": np.random.randint(2, size=9).reshape(3, 3), + "v": np.random.randint(2, size=9).reshape(3, 3), +} +# table with the "scored" pixel (as if they are returned by dotfinder-scoring function) +pixel_dict = {} +# enrich fake counts to all for more significant calls +pixel_dict["count"] = np.random.randint(max_value, size=num_pixels) + 9 +for k in fake_kernels: + pixel_dict[f"la_exp.{k}.value"] = max_value * np.random.random(num_pixels) +scored_df = pd.DataFrame(pixel_dict) + +# design lambda-bins as in dot-calling: +num_lchunks = 6 +ledges = np.r_[[-np.inf], np.linspace(0, max_value, num_lchunks), [np.inf]] + +# set FDR parameter +FDR = 0.1 + + +# helper functions working on a chunk of counts or pvals +# associated with a given lambda-bin: +def get_pvals_chunk(counts_series_lchunk): + """ + Parameters: + ----------- + counts_series_lchunk : pd.Series(int) + Series of raw pixel counts where the name of the Series + is pd.Interval of the lambda-bin where the pixel belong. + I.e. counts_series_lchunk.name.right - is the upper limit of the chunk + and is used as "expected" in Poisson distribution to estimate p-value. + + Returns: + -------- + pvals: ndarray[float] + array of p-values for each pixel + + Notes: + ------ + poisson.sf = 1.0 - poisson.cdf + """ + return poisson.sf(counts_series_lchunk.values, counts_series_lchunk.name.right) + + +def get_qvals_chunk(pvals_series_lchunk): + """ + Parameters: + ----------- + pvals_series_lchunk : pd.Series(float) + Series of p-values calculated for each pixel, where the name + of the Series is pd.Interval of the lambda-bin where the pixel belong. + + Returns: + -------- + qvals: ndarray[float] + array of q-values, i.e. p-values corrected with the multiple hypothesis + testing procedure BH-FDR, for each pixel + + Notes: + ------ + level of False Discore Rate (FDR) is fixed for testing + """ + _, qvals = multipletests(pvals_series_lchunk.values, alpha=FDR, is_sorted=False) + return qvals + + +def get_reject_chunk(pvals_series_lchunk): + """ + Parameters: + ----------- + pvals_series_lchunk : pd.Series(float) + Series of p-values calculated for each pixel, where the name + of the Series is pd.Interval of the lambda-bin where the pixel belong. + + Returns: + -------- + rej: ndarray[bool] + array of rejection statuses, i.e. for every p-values return if corresponding + null hypothesis can be rejected or not, using multiple hypothesis testing + procedure BH-FDR. 
+def get_reject_chunk(pvals_series_lchunk):
+    """
+    Parameters:
+    -----------
+    pvals_series_lchunk : pd.Series(float)
+        Series of p-values calculated for each pixel, where the name
+        of the Series is the pd.Interval of the lambda-bin that the pixels belong to.
+
+    Returns:
+    --------
+    rej: ndarray[bool]
+        array of rejection statuses, i.e. for every p-value, whether the corresponding
+        null hypothesis can be rejected or not, using the multiple hypothesis testing
+        procedure BH-FDR.
+
+    Notes:
+    ------
+    - pixels with rejected status (non-null) are considered significantly enriched
+    - the False Discovery Rate (FDR) level is fixed for testing
+    """
+    rej, _ = multipletests(pvals_series_lchunk.values, alpha=FDR, is_sorted=False)
+    return rej
+
+
+# for the fake scored-pixel table, calculate p-vals, q-vals, the l-chunk each pixel
+# belongs to, and rejection status, using the statsmodels-based helpers introduced above:
+for k in fake_kernels:
+    lbin = pd.cut(scored_df[f"la_exp.{k}.value"], ledges)
+    scored_df[f"{k}.pval"] = scored_df.groupby(lbin)["count"].transform(get_pvals_chunk)
+    scored_df[f"{k}.qval"] = scored_df.groupby(lbin)[f"{k}.pval"].transform(
+        get_qvals_chunk
+    )
+    scored_df[f"{k}.rej"] = scored_df.groupby(lbin)[f"{k}.pval"].transform(
+        get_reject_chunk
+    )
+
+
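
The groupby/transform pattern above evaluates each helper once per lambda-bin and broadcasts the per-bin result back onto the pixels of that bin. A minimal standalone sketch of the same pattern with made-up values and a single fake kernel column:

    import numpy as np
    import pandas as pd
    from scipy.stats import poisson

    toy_df = pd.DataFrame({"count": [12, 3, 40], "la_exp.d.value": [5.0, 30.0, 70.0]})
    # lambda-bin edges built the same way as `ledges` above:
    toy_ledges = np.r_[[-np.inf], np.linspace(0, 99, 6), [np.inf]]
    # assign each pixel to a lambda-bin according to its locally adjusted expected:
    toy_lbin = pd.cut(toy_df["la_exp.d.value"], toy_ledges)
    # each pixel is then tested against the right edge of its own lambda-bin:
    toy_df["d.pval"] = toy_df.groupby(toy_lbin)["count"].transform(
        lambda chunk: poisson.sf(chunk.values, chunk.name.right)
    )
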
scored_df_qvals["la_exp.v.qval"]) + assert np.allclose(scored_df_qvals["d.qval"], scored_df_qvals["la_exp.d.qval"])