diff --git a/cooltools/__init__.py b/cooltools/__init__.py index 690a9c15..816b2f51 100644 --- a/cooltools/__init__.py +++ b/cooltools/__init__.py @@ -32,3 +32,4 @@ from .api.snipping import pileup from .api.directionality import directionality from .api.insulation import insulation +from .api.dotfinder import dots diff --git a/cooltools/api/dotfinder.py b/cooltools/api/dotfinder.py index 718eae6a..f7c399d9 100644 --- a/cooltools/api/dotfinder.py +++ b/cooltools/api/dotfinder.py @@ -1,10 +1,78 @@ """ Collection of functions related to dot-calling +The main user-facing API function is: +dots( + clr, + expected, + expected_value_col="balanced.avg", + clr_weight_name="weight", + view_df=None, + kernels=None, + max_loci_separation=10_000_000, + max_nans_tolerated=1, + n_lambda_bins=40, + lambda_bin_fdr=0.1, + clustering_radius=20_000, + cluster_filtering=None, + tile_size=5_000_000, + nproc=1, +) +This function implements HiCCUPS-style dot calling, but enables user-specified +modifications at multiple steps. The current implementation makes two passes +over the input data, first to create a histogram of pixel enrichment values, +and second to extract significantly enriched pixels. + + * The function starts with compatibility verifications + for the `clr`, `expected` and `view` of interest. + * Recommendation or verification for `kernels` is done next. + Custom kernels must satisfy properties including: square shape, + equal sizes, odd sizes, zeros in the middle, etc. By default, + HiCCUPS-style kernels are recommended based on the binsize. + * Lambda bins are defined for multiple hypothesis + testing separately for different value ranges of the locally adjusted expected. + Currently, log-binned lambda-bins are hardcoded using a pre-defined + BASE of 2^(1/3). `n_lambda_bins` controls the total number of bins. + * Genomic regions in the specified `view`(all chromosomes by default) + are split into smaller tiles of size `tile_size`. + * `scoring_and_histogramming_step()` is performed independently + on the genomic tiles. In this step, locally adjusted expected is + calculated using convolution kernels for each pixel in the tile. + All surveyed pixels are histogrammed according to their adjusted + expected and raw observed counts. Locally adjusted expected is + not stored in memory. + * Chunks of histograms are aggregated together and a modified BH-FDR + procedure is applied to the result in `determine_thresholds()`. + This returns thresholds for statistical significance + in each lambda-bin (for observed counts), along with the adjusted + p-values (q-values). + * Calculated thresholds are used to extract statistically significant + pixels in `scoring_and_extraction_step()`. Because locally adjusted + expected is not stored in memory, it is re-caluclated + during this step, which makes it computationally intensive. + Locally adjusted expected values are required in order to apply + different thresholds of significance depending on the lambda-bin. + * Returned filtered pixels, or 'dots', are significantly enriched + relative to their locally adjusted expecteds and thus have potential + biological interest. Dots are further annotated with their + genomic coordinates and q-values (adjusted p-values) for + all applied kernels. + * All further steps perform optional post-processing on called dots + - enriched pixels that are within `clustering_radius` of each other + are clustered together and the brightest one is selected as the + representative position of a dot. 
+ - cluster-representatives along with "singletons" (enriched pixels + that are not part of any cluster) can be subjected to further + empirical enrichment filtering in `cluster_filtering_hiccups()`. This + both requires clustered dots exceed prescribed enrichment thresholds + relative to their local neighborhoods and that singletons pass an + even more stringent q-value threshold. """ + from functools import partial, reduce import multiprocess as mp import logging +import warnings from scipy.linalg import toeplitz from scipy.ndimage import convolve @@ -16,10 +84,14 @@ import cooler from ..lib.numutils import LazyToeplitz, get_kernel -from ..lib.checks import is_compatible_viewframe, is_cooler_balanced -from ..lib.common import make_cooler_view +from ..lib.common import assign_regions, make_cooler_view +from ..lib.checks import ( + is_cooler_balanced, + is_compatible_viewframe, + is_valid_expected, +) -import bioframe +from bioframe import make_viewframe logging.basicConfig(level=logging.INFO) @@ -30,13 +102,32 @@ nans_inkernel_name = lambda kernel_name: f"la_exp.{kernel_name}.nnans" bin1_id_name = "bin1_id" bin2_id_name = "bin2_id" +bedpe_required_cols = [ + "chrom1", + "start1", + "end1", + "chrom2", + "start2", + "end2", +] + + +# define basepairs to bins for clarity +def bp_to_bins(basepairs, binsize): + return int(basepairs / binsize) -def recommend_kernel_params(binsize): +def recommend_kernels(binsize): """ - Recommned kernel parameters for the - standard convolution kernels, 'donut', etc - same as in Rao et al 2014 + Return a recommended set of convolution kernels for dot-calling + based on the resolution, or binsize, of the input data. + + This function currently recommends the four kernels used in the HiCCUPS method: + donut, horizontal, vertical, lowerleft. Kernels are recommended for resolutions + near 5 kb, 10 kb, and 25 kb. Dots are not typically visible at lower resolutions + (binsize >28kb) and the majority of datasets are too sparse for dot-calling + at very high resolutions (<4kb). Given this, default kernels are not + recommended for resolutions outside this range. Parameters ---------- @@ -45,47 +136,91 @@ def recommend_kernel_params(binsize): Returns ------- - (w,p) : (integer, integer) - tuple of the outer and inner kernel sizes + kernels : {str:ndarray} + dictionary of convolution kernels as ndarrays, with their + names as keys. """ - # kernel parameters depend on the cooler resolution - # TODO: rename w, p to wid, pix probably, or _w, _p to avoid naming conflicts + # define "donut" parameters w and p based on the resolution of the data + # w is the outside half-width, and p is the internal half-width. if binsize > 28000: - # > 30 kb - is probably too much ... raise ValueError( - f"Provided cooler has resolution {binsize} bases," - " which is too coarse for analysis." + f"Provided cooler has resolution of {binsize} bases," + " which is too coarse for automated kernel recommendation." + " Provide custom kernels to proceed." ) elif binsize >= 18000: - # ~ 20-25 kb: w, p = 3, 1 elif binsize >= 8000: - # ~ 10 kb w, p = 5, 2 elif binsize >= 4000: - # ~5 kb w, p = 7, 4 else: - # < 5 kb - is probably too fine ... raise ValueError( f"Provided cooler has resolution {binsize} bases," - " which is too fine for analysis." + " which is too fine for automated kernel recommendation. Provide custom kernels" + " to proceed." 
        )
 
-    # return the results:
-    return w, p
+    logging.info(
+        f"Using recommended donut-based kernels with w={w}, p={p} for binsize={binsize}"
+    )
+    # standard set of 4 kernels used in Rao et al 2014
+    # 'upright' is a symmetrical inversion of "lowleft", not needed.
+    kernel_types = ["donut", "vertical", "horizontal", "lowleft"]
+    # generate standard kernels - consider providing custom ones
+    kernels = {k: get_kernel(w, p, k) for k in kernel_types}
 
-def annotate_pixels_with_qvalues(
-    pixels_df, qvalues, kernels, inplace=False, obs_raw_name=observed_count_name
-):
+    return kernels
+
+
+def is_compatible_kernels(kernels, binsize, max_nans_tolerated):
+    """
+    TODO implement checks for kernels:
+     - matrices are of the same size
+     - they should be square (too restrictive ? maybe pad with 0 as needed)
+     - dimensions are odd, to have a center pixel to refer to
+     - they can be turned into int 1/0 ones (too restrictive ? allow weighted kernels ?)
+     - the central pixel should be zero perhaps (unless weights are allowed 4sure)
+     - maybe introduce an upper limit to the size - to avoid crazy long calculations
+     - check relative to the binsize maybe ? what's the criteria ?
    """
-    Add columns with the qvalues to a DataFrame of pixels
-    ... detailed but unedited notes ...
-    Extract q-values using l-chunks and IntervalIndex.
-    we'll do it in an ugly but workign fashion, by simply
-    iteration over pairs of obs, la_exp and extracting needed qvals
-    one after another
-    ...
+
+    # kernels must be a dict with kernel-names as keys
+    # and kernel ndarrays as values.
+    if not isinstance(kernels, dict):
+        raise ValueError(
+            "'kernels' must be a dictionary with name-keys and ndarrays-values."
+        )
+
+    # deduce kernel_width - overall footprint
+    kernel_widths = [len(k) for kn, k in kernels.items()]
+    # kernels must have the same width for now:
+    if min(kernel_widths) != max(kernel_widths):
+        raise ValueError(f"all 'kernels' must have the same size, now: {kernel_widths}")
+    # now extract their dimensions:
+    kernel_width = max(kernel_widths)
+    kernel_half_width = (kernel_width - 1) / 2  # former w parameter
+    if (kernel_half_width <= 0) or not kernel_half_width.is_integer():
+        raise ValueError(
+            f"Size of the convolution kernels has to be odd and >= 3, currently {kernel_width}"
+        )
+
+    # once kernel parameters are set up, check max_nans_tolerated
+    # to make sure kernel footprints overlapping 1 side with the
+    # NaN-filled row/column are not "allowed"
+    if not max_nans_tolerated <= kernel_width:
+        raise ValueError(
+            f"Too many NaNs allowed max_nans_tolerated={max_nans_tolerated}"
+        )
+    # otherwise this may lead to scoring the same pixel twice, i.e. duplicates.
+
+    # return True if everything passes
+    return True
+
+
+def annotate_pixels_with_qvalues(pixels_df, qvalues, obs_raw_name=observed_count_name):
+    """
+    Add columns with the qvalues to a DataFrame of scored pixels
 
     Parameters
     ----------
@@ -96,46 +231,42 @@
     qvalues : dict of DataFrames
         A dictionary with keys being kernel names and values DataFrames
         storing q-values for each observed count values in each lambda-
-        chunk. Colunms are Intervals defined by 'ledges' boundaries.
+        bin. Columns are Intervals defined by 'ledges' boundaries.
        Rows corresponding to a range of observed count values.
-    kernels : dict
-        A dictionary with keys being kernels names and values being ndarrays
-        representing those kernels.
+    obs_raw_name : str
+        Name of the column/field that carries the number of counts per pixel,
+        i.e. observed raw counts.
Returns ------- pixels_qvalue_df : pandas.DataFrame - DataFrame of pixels with additional columns - storing qvalues corresponding to the observed - count value of a given pixel, given kernel-type, - and a lambda-chunk. - - Notes - ----- - Should be applied to a filtered DF of pixels, otherwise would - be too resource-hungry. + DataFrame of pixels with additional columns la_exp.{k}.qval, + storing q-values (adjusted p-values) corresponding to the count + value of a pixel, its kernel, and a lambda-bin it belongs to. """ - if inplace: - pixels_qvalue_df = pixels_df - else: - # let's do it "safe" - using a copy: - pixels_qvalue_df = pixels_df.copy() - # attempting to extract q-values using l-chunks and IntervalIndex: - # we'll do it in an ugly but workign fashion, by simply - # iteration over pairs of obs, la_exp and extracting needed qvals - # one after another ... - for k in kernels: - pixels_qvalue_df[f"la_exp.{k}.qval"] = [ - qvalues[k].loc[o, e] - for o, e in pixels_df[[obs_raw_name, f"la_exp.{k}.value"]].itertuples( - index=False - ) - ] - # qvalues : dict - # A dictionary with keys being kernel names and values pandas.DataFrame-s - # storing q-values: each column corresponds to a lambda-chunk, - # while rows correspond to observed pixels values. - return pixels_qvalue_df + # do it "safe" - using a copy: + pixels_qvalue_df = pixels_df.copy() + # columns to return + cols = list(pixels_qvalue_df.columns) + # will do it efficiently using "melted" qvalues table: + for k, qval_df in qvalues.items(): + lbins = pd.IntervalIndex(qval_df.columns) + pixels_qvalue_df["lbins"] = pd.cut( + pixels_qvalue_df[f"la_exp.{k}.value"], bins=lbins + ) + pixels_qvalue_df = pixels_qvalue_df.merge( + # melted qval_df columns: [counts, la_exp.k.value, value] + qval_df.melt(ignore_index=False).reset_index(), + left_on=[obs_raw_name, "lbins"], + right_on=[obs_raw_name, f"la_exp.{k}.value"], + suffixes=("", "_"), + ) + qval_col_name = f"la_exp.{k}.qval" + pixels_qvalue_df = pixels_qvalue_df.rename(columns={"value": qval_col_name}) + cols.append(qval_col_name) + + # return q-values annotated pixels + return pixels_qvalue_df.loc[:, cols] def clust_2D_pixels( @@ -145,7 +276,6 @@ def clust_2D_pixels( bin2_id_name="bin2_id", clust_label_name="c_label", clust_size_name="c_size", - verbose=True, ): """ Group significant pixels by proximity using Birch clustering. We use @@ -172,8 +302,6 @@ def clust_2D_pixels( Name of the cluster of pixels label. "c_label" by default. clust_size_name : str Name of the cluster of pixels size. "c_size" by default. - verbose : bool - Print verbose clustering summary report defaults is True. 
Returns ------- @@ -224,14 +352,10 @@ def clust_2D_pixels( # take centroids corresponding to labels (as many as needed): centroids_per_pixel = np.take(clustered_centroids, clustered_labels, axis=0) - if verbose: - # prepare clustering summary report: - msg = ( - "Clustering is completed:\n" - + f"{uniq_counts.size} clusters detected\n" - + f"{uniq_counts.mean():.2f}+/-{uniq_counts.std():.2f} mean size\n" - ) - logging.info(msg) + # clustering message: + logging.info( + f"detected {uniq_counts.size} clusters of {uniq_counts.mean():.2f}+/-{uniq_counts.std():.2f} size" + ) # create output DataFrame centroids_n_labels_df = pd.DataFrame( @@ -252,120 +376,86 @@ def clust_2D_pixels( ################################## -def square_matrix_tiling(start, stop, step, edge, square=False, verbose=False): +def tile_square_matrix(matrix_size, offset, tile_size, pad=0): """ - Generate a stream of tiling coordinates that guarantee to cover an entire - matrix. cis-signal only! + Generate a stream of coordinates of tiles that cover a matrix of a given size. + Matrix has to be square, on-digaonal one: e.g. corresponding to a chromosome + or a chromosomal arm. Parameters ---------- - start : int - Starting position of the matrix slice to be tiled (inclusive, bins, - 0-based). - stop : int - End position of the matrix slice to be tiled (exclusive, bins, 0-based). - step : int - Requested size of the tiles. Boundary tiles may or may not be of - 'step', see 'square'. - edge : int - Small edge around each tile to be included in the yielded coordinates. - square : bool, optional + matrix_size : int + Size of a squared matrix + offset : int + Offset coordinates of generated tiles by 'offset' + tile_size : int + Requested size of the tiles. Tiles near + the right and botoom edges could be rectangular + and smaller then 'tile_size' + pad : int + Small padding around each tile to be included in the yielded coordinates. Yields ------ - Pairs of indices for every chunk use those indices [cstart:cstop) to fetch - chunks from the cooler-object: - >>> clr.matrix()[cstart:cstop, cstart:cstop] + Pairs of indices/coordinates of every tile: (start_i, end_i), (start_j, end_j) Notes ----- - Each slice is characterized by the coordinate - of the top-left corner and size. - - * * * * * * * * * * * * * - * * * * - * * * ... * - * * * * - * * * * * * * * * * - * * * - * * ... * - * * * - * * * * * * - * * - * * - * * - * * * * * * * * * * * * * - - Square parameter determines behavior - of the tiling function, when the - size of the matrix is not an exact - multiple of the 'step': - - square = False - * * * * * * * * * * * - * * * * - * * * * - * * * * - * * * * * * * * * * * - * * * * - * * * * - * * * * - * * * * * * * * * * * - * * * * - * * * * - * * * * * * * * * * * - WARNING: be carefull with extracting - expected in this case, as it is - not trivial at all !!! - - square = True - * * * * * * * * * * * - * * * * * - * * * * * - * * * * * - * * * * * * * * * * * - * * * * * - * * * * * - * * * * * * * * * * * - * * * * * * * * * * * - * * * * * - * * * * * - * * * * * * * * * * * - + Generated tiles coordinates [start_i,end_i) , [start_i,end_i) + can be used to fetch heatmap tiles from cooler: + >>> clr.matrix()[start_i:end_i, start_j:end_j] + + 'offset' is useful when a given matrix is part of a + larger matrix (a given chromosome or arm), and thus + all coordinated needs to be offset to get absolute + coordinates. 
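+
+    For example (an illustrative case), splitting a 10-bin matrix into 4-bin
+    tiles with pad=1 yields 3 x 3 = 9 padded tile spans; the first two are:
+
+    >>> list(tile_square_matrix(10, 0, 4, pad=1))[:2]
+    [((0, 5), (0, 5)), ((0, 5), (3, 9))]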
+ + Tiles are non-overlapping (pad=0), but tiles near + the right and bottom edges could be rectangular: + + * * * * * * * * * + * * * * + * * * * + * * * * * * * * + * * * + * * ... * + * * * * * + * * + * * * * * * * * * """ - size = stop - start - tiles = size // step + bool(size % step) + # number of tiles along each axis + if matrix_size % tile_size: + num_tiles = matrix_size // tile_size + 1 + else: + num_tiles = matrix_size // tile_size - if verbose: + logging.info( + f" matrix {matrix_size}X{matrix_size} to be split into {num_tiles * num_tiles} tiles of {tile_size}X{tile_size}." + ) + if pad: logging.info( - f"matrix of size {size}X{size} to be splitted\n" - + f" into square tiles of size {step}.\n" - + f" A small 'edge' of size w={edge} is added, to allow for\n" - + " meaningfull convolution around boundaries.\n" - + f" Resulting number of tiles is {tiles * tiles}" + f" tiles are padded (width={pad}) to enable convolution near the edges" ) - for tx in range(tiles): - for ty in range(tiles): + # generate 'num_tiles X num_tiles' tiles + for ti in range(num_tiles): + for tj in range(num_tiles): - lwx = max(0, step * tx - edge) - rwx = min(size, step * (tx + 1) + edge) - if square and (rwx >= size): - lwx = size - step - edge + start_i = max(0, tile_size * ti - pad) + start_j = max(0, tile_size * tj - pad) - lwy = max(0, step * ty - edge) - rwy = min(size, step * (ty + 1) + edge) - if square and (rwy >= size): - lwy = size - step - edge + end_i = min(matrix_size, tile_size * (ti + 1) + pad) + end_j = min(matrix_size, tile_size * (tj + 1) + pad) - yield (lwx + start, rwx + start), (lwy + start, rwy + start) + yield (start_i + offset, end_i + offset), (start_j + offset, end_j + offset) -def heatmap_tiles_generator_diag(clr, view_df, pad_size, tile_size, band_to_cover): +def generate_tiles_diag_band(clr, view_df, pad_size, tile_size, band_to_cover): """ - A generator yielding heatmap tiles that are needed to cover the requested - band_to_cover around diagonal. Each tile is "padded" with pad_size edge to - allow proper kernel-convolution of pixels close to boundary. + A generator yielding corrdinates of heatmap tiles that are needed to cover + the requested band_to_cover around diagonal. Each tile is "padded" with + the pad of size 'pad_size' to allow for convolution near the boundary of + a tile. Parameters ---------- @@ -383,83 +473,40 @@ def heatmap_tiles_generator_diag(clr, view_df, pad_size, tile_size, band_to_cove Typically correspond to the max_loci_separation for called dots. Returns ------- - tile : tuple - Generator of tuples of three, which contain - chromosome name, row index of the tile, - column index of the tile (region_name, tilei, tilej). - + tile_coords : tuple + Generator of tile coordinates, i.e. tuples of three: + (region_name, tile_span_i, tile_span_j), where 'tile_span_i/j' + each is a tuple of bin ids (bin_start, bin_end). """ for chrom, start, end, region_name in view_df.itertuples(index=False): - region_begin, region_end = clr.extent((chrom, start, end)) - for tilei, tilej in square_matrix_tiling( - region_begin, region_end, tile_size, pad_size + region_start, region_end = clr.extent((chrom, start, end)) + region_size = region_end - region_start + for tile_span_i, tile_span_j in tile_square_matrix( + matrix_size=region_size, + offset=region_start, + tile_size=tile_size, + pad=pad_size, ): # check if a given tile intersects with # with the diagonal band of interest ... 
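+            # tile_diag_start/end below bound the diagonal offsets (bin2_id - bin1_id)
+            # spanned by the pixels of this tile; the tile is kept only if that range
+            # overlaps the requested band by more than 2*pad_size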
- diag_from = tilej[0] - tilei[1] - diag_to = tilej[1] - tilei[0] - # - band_from = 0 - band_to = band_to_cover + tile_diag_start = tile_span_j[0] - tile_span_i[1] + tile_diag_end = tile_span_j[1] - tile_span_i[0] + # TODO allow more flexible definition of a band to cover + band_start, band_end = 0, band_to_cover # we are using this >2*padding trick to exclude # tiles from the lower triangle from calculations ... - if (min(band_to, diag_to) - max(band_from, diag_from)) > 2 * pad_size: - yield region_name, tilei, tilej - - -################################## -# kernel-convolution related: -################################## -def _convolve_and_count_nans(O_bal, E_bal, E_raw, N_bal, kernel): - """ - Dense versions of a bunch of matrices needed for convolution and - calculation of number of NaNs in a vicinity of each pixel. And a kernel to - be provided of course. - - """ - # a matrix filled with the kernel-weighted sums - # based on a balanced observed matrix: - KO = convolve(O_bal, kernel, mode="constant", cval=0.0, origin=0) - # a matrix filled with the kernel-weighted sums - # based on a balanced expected matrix: - KE = convolve(E_bal, kernel, mode="constant", cval=0.0, origin=0) - # get number of NaNs in a vicinity of every - # pixel (kernel's nonzero footprint) - # based on the NaN-matrix N_bal. - # N_bal is shared NaNs between O_bal E_bal, - # is it redundant ? - NN = convolve( - N_bal.astype(np.int64), - # we have to use kernel's - # nonzero footprint: - (kernel != 0).astype(np.int64), - mode="constant", - # there are only NaNs - # beyond the boundary: - cval=1, - origin=0, - ) - ###################################### - # using cval=0 for actual data and - # cval=1 for NaNs matrix reduces - # "boundary issue" to the "number of - # NaNs"-issue - # #################################### - - # now finally, E_raw*(KO/KE), as the - # locally-adjusted expected with raw counts as values: - Ek_raw = np.multiply(E_raw, np.divide(KO, KE)) - # return locally adjusted expected and number of NaNs - # in the form of dense matrices: - return Ek_raw, NN + if ( + min(band_end, tile_diag_end) - max(band_start, tile_diag_start) + ) > 2 * pad_size: + yield region_name, tile_span_i, tile_span_j ######################################################################## # this is the MAIN function to get locally adjusted expected ######################################################################## def get_adjusted_expected_tile_some_nans( - origin, observed, expected, bal_weights, kernels, balance_factor=None, verbose=False + origin_ij, observed, expected, bal_weights, kernels ): """ Get locally adjusted expected for a collection of local-filters (kernels). @@ -509,7 +556,7 @@ def get_adjusted_expected_tile_some_nans( Parameters ---------- - origin : (int,int) tuple + origin_ij : (int,int) tuple tuple of interegers that specify the location of an observed matrix slice. Measured in bins, not in nucleotides. @@ -523,10 +570,10 @@ def get_adjusted_expected_tile_some_nans( bal_weights : numpy.ndarray or (numpy.ndarray, numpy.ndarray) 1D vector used to turn raw observed into balanced observed for a slice of - a matrix with the origin on the diagonal; + a matrix with the origin_ij on the diagonal; and a tuple/list of a couple of 1D arrays in case it is a slice with an arbitrary - origin. + origin_ij. kernels : dict of (str, numpy.ndarray) dictionary of kernels/masks to perform convolution of the heatmap. Kernels @@ -538,42 +585,34 @@ def get_adjusted_expected_tile_some_nans( to be considered significant. 
        Dictionay keys must contain names for each kernel.
-        Note, scipy.ndimage.convove flips kernel
-        first and only then applies it to matrix.
-    balance_factor: float
-        Multiplicative Balancing factor:
-        balanced matrix multiplied by this factor
-        sums up to the total number of reads (taking
-        symmetry into account) instead of number of
-        bins in matrix. defaults to None and skips
-        a lowleft KernelObserved factor calculation.
-        !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-        We need this to test how dynamic donut size
-        is affecting peak calling results.
-        !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-    verbose: bool
-        Set to True to print some progress
-        messages to stdout.
+        Note, scipy.ndimage.convolve first flips kernel
+        and only then applies it to matrix.
 
    Returns
    -------
    peaks_df : pandas.DataFrame
-        sparsified DataFrame that stores results of
-        locally adjusted calculations for every kernel
-        for a given slice of input matrix. Multiple
-        instences of such 'peaks_df' can be concatena-
-        ted and deduplicated for the downstream analysis.
-        Reported columns:
-        bin1_id - bin1_id index (row), adjusted to origin
-        bin2_id - bin bin2_id index, adjusted to origin
+        DataFrame with the results of locally adjusted calculations
+        for every kernel for a given slice of input matrix.
+
+    Notes
+    -----
+
+    Reported columns:
        bin1_id - bin1_id index (row), adjusted to tile_start_i
        bin2_id - bin bin2_id index, adjusted to tile_start_j
        la_exp - locally adjusted expected (for each kernel)
        la_nan - number of NaNs around (each kernel's footprint)
        exp.raw - global expected, rescaled to raw-counts
-        obs.raw - observed values in raw-counts.
+        obs.raw(counts) - observed values in raw-counts.
+
+    Depending on the initial tiling of the interaction matrix,
+    concatenated `peaks_df` may require "deduplication", as some pixels
+    can be evaluated in several tiles (e.g. near the tile edges).
+    Default tiling in the `dots` function should avoid this problem.
 
    """
-    # extract origin coordinate of this tile:
-    io, jo = origin
+    # extract origin_ij coordinate of this tile:
+    io, jo = origin_ij
    # let's extract full matrices and ice_vector:
    O_raw = observed  # raw observed, no need to copy, no modifications.
    E_bal = np.copy(expected)
@@ -590,12 +629,6 @@
            "a tuple/list of a couple of numpy.ndarray-s"
            "for a slice of matrix with an arbitrary origin."
        )
-    # kernels must be a dict with kernel-names as keys
-    # and kernel ndarrays as values.
-    if not isinstance(kernels, dict):
-        raise ValueError(
-            "'kernels' must be a dictionary" "with name-keys and ndarrays-values."
-        )
 
    # balanced observed, from raw-observed
    # by element-wise multiply:
@@ -607,7 +640,7 @@
    # and also to provide fair locally adjusted expected
    # estimation for pixels very close to diagonal, whose
    # "donuts"(kernels) would be crossing the main diagonal.
-    # The trickiest thing here would be dealing with the origin: io,jo.
+    # The trickiest thing here would be dealing with the origin_ij: io,jo.
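+    # note: the tile's rows start at bin io and its columns at bin jo, so pixels
+    # with (column - row) < (io - jo) lie below the main diagonal of the full
+    # matrix; tril_indices_from with k=(io - jo) - 1 selects exactly those pixels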
O_bal[np.tril_indices_from(O_bal, k=(io - jo) - 1)] = np.nan E_bal[np.tril_indices_from(E_bal, k=(io - jo) - 1)] = np.nan @@ -626,7 +659,6 @@ def get_adjusted_expected_tile_some_nans( # think about usinf copyto and where functions later: # https://stackoverflow.com/questions/6431973/how-to-copy-data-from-a-numpy-array-to-another # - # # we are going to accumulate all the results # into a DataFrame, keeping NaNs, and other # unfiltered results (even the lower triangle for now): @@ -642,18 +674,6 @@ def get_adjusted_expected_tile_some_nans( # kernel paramters such as width etc # are taken into account implicitly ... ######################################## - ##################### - # unroll _convolve_and_count_nans function back - # for us to test the dynamic donut criteria ... - ##################### - # Ek_raw, NN = _convolve_and_count_nans(O_bal, - # E_bal, - # E_raw, - # N_bal, - # kernel) - # Dense versions of a bunch of matrices needed for convolution and - # calculation of number of NaNs in a vicinity of each pixel. And a kernel to - # be provided of course. # a matrix filled with the kernel-weighted sums # based on a balanced observed matrix: KO = convolve(O_bal, kernel, mode="constant", cval=0.0, origin=0) @@ -685,16 +705,9 @@ def get_adjusted_expected_tile_some_nans( # locally-adjusted expected with raw counts as values: Ek_raw = np.multiply(E_raw, np.divide(KO, KE)) - # this is the place where we would need to extract - # some results of convolution and multuplt it by the - # appropriate factor "cooler._load_attrs(‘bins/weight’)[‘scale’]" ... - if balance_factor and (kernel_name == "lowleft"): - peaks_df[f"factor_balance.{kernel_name}.KerObs"] = ( - balance_factor * KO.flatten() - ) - # KO*balance_factor: to be compared with 16 ... - if verbose: - logging.info(f"Convolution with kernel {kernel_name} is complete.") + logging.debug( + f"Convolution with kernel {kernel_name} is done for tile @ {io} {jo}." + ) # # accumulation into single DataFrame: # store locally adjusted expected for each kernel @@ -707,22 +720,8 @@ def get_adjusted_expected_tile_some_nans( # aggregated over all kernels ... ##################################### peaks_df["exp.raw"] = E_raw.flatten() - # obs.raw -> count peaks_df["count"] = O_raw.flatten() - # TO BE REFACTORED/deprecated ... - # compatibility with legacy API is completely BROKEN - # post-processing allows us to restore it, see tests, - # but we pay with the processing speed for it. - mask_ndx = pd.Series(0, index=peaks_df.index, dtype=np.bool) - for kernel_name, kernel in kernels.items(): - # accummulating with a vector full of 'False': - mask_ndx_kernel = ~np.isfinite(peaks_df["la_exp." + kernel_name + ".value"]) - mask_ndx = np.logical_or(mask_ndx_kernel, mask_ndx) - - # returning only pixels from upper triangle of a matrix - # is likely here to stay: - upper_band = peaks_df["bin1_id"] < peaks_df["bin2_id"] # Consider filling lower triangle of the OBSERVED matrix tile # with NaNs, instead of this - we'd need this for a fair # consideration of the pixels that are super close to the @@ -732,44 +731,39 @@ def get_adjusted_expected_tile_some_nans( # close etc, is now shifted to the outside of this function # a way to simplify code. 
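+    # at this point 'peaks_df' holds one row per pixel of the tile, with
+    # "la_exp.{kernel}.value" and "la_exp.{kernel}.nnans" for every kernel,
+    # plus the rescaled global expected "exp.raw" and the raw observed "count"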
- # return good semi-sparsified DF: - return peaks_df[~mask_ndx & upper_band].reset_index(drop=True) + return peaks_df ################################## # step-specific dot-calling functions ################################## - - def score_tile( tile_cij, clr, - cis_exp, - exp_v_name, + expected_indexed, + expected_value_col, clr_weight_name, kernels, - nans_tolerated, + max_nans_tolerated, band_to_cover, - balance_factor, - verbose, ): """ The main working function that given a tile of a heatmap, applies kernels to perform convolution to calculate locally-adjusted expected and then - calculates a p-value for every meaningfull pixel against these l.a. expected - values. + calculates a p-value for every meaningfull pixel against these + locally-adjusted expected (la_exp) values. Parameters ---------- tile_cij : tuple - Tuple of 3: chromosome name, tile span row-wise, tile span column-wise: - (chrom, tile_i, tile_j), where tile_i = (start_i, end_i), and - tile_j = (start_j, end_j). + Tuple of 3: region name, tile span row-wise, tile span column-wise: + (region, tile_span_i, tile_span_j), where tile_span_i = (start_i, end_i), and + tile_span_j = (start_j, end_j). clr : cooler Cooler object to use to extract Hi-C heatmap data. - cis_exp : pandas.DataFrame - DataFrame with cis-expected, indexed with 'name' and 'diag'. - exp_v_name : str + expected_indexed : pandas.DataFrame + DataFrame with cis-expected, indexed with 'region1', 'region2', 'dist'. + expected_value_col : str Name of a value column in expected DataFrame clr_weight_name : str Name of a value column with balancing weights in a cooler.bins() @@ -777,17 +771,11 @@ def score_tile( kernels : dict A dictionary with keys being kernels names and values being ndarrays representing those kernels. - nans_tolerated : int + max_nans_tolerated : int Number of NaNs tolerated in a footprint of every kernel. band_to_cover : int Results would be stored only for pixels connecting loci closer than 'band_to_cover'. - balance_factor : float - Balancing factor to turn sum of balanced matrix back approximately - to the number of pairs (used for dynamic-donut criteria mostly). - use None value to disable dynamic-donut criteria calculation. - verbose : bool - Enable verbose output. 
Returns ------- @@ -799,89 +787,67 @@ def score_tile( """ # unpack tile's coordinates - region_name, tilei, tilej = tile_cij - origin = (tilei[0], tilej[0]) + region_name, tile_span_i, tile_span_j = tile_cij + tile_start_ij = (tile_span_i[0], tile_span_j[0]) # we have to do it for every tile, because # region_name is not known apriori (maybe move outside) # use .loc[region, region] for symmetric cis regions to conform with expected v1.0 - lazy_exp = LazyToeplitz(cis_exp.loc[region_name, region_name][exp_v_name].values) + lazy_exp = LazyToeplitz( + expected_indexed.loc[region_name, region_name][expected_value_col].to_numpy() + ) # RAW observed matrix slice: - observed = clr.matrix(balance=False)[slice(*tilei), slice(*tilej)] + observed = clr.matrix(balance=False)[slice(*tile_span_i), slice(*tile_span_j)] # expected as a rectangular tile : - expected = lazy_exp[slice(*tilei), slice(*tilej)] + expected = lazy_exp[slice(*tile_span_i), slice(*tile_span_j)] # slice of balance_weight for row-span and column-span : - bal_weight_i = clr.bins()[slice(*tilei)][clr_weight_name].values - bal_weight_j = clr.bins()[slice(*tilej)][clr_weight_name].values + bal_weight_i = clr.bins()[slice(*tile_span_i)][clr_weight_name].to_numpy() + bal_weight_j = clr.bins()[slice(*tile_span_j)][clr_weight_name].to_numpy() # do the convolutions result = get_adjusted_expected_tile_some_nans( - origin=origin, + origin_ij=tile_start_ij, observed=observed, expected=expected, bal_weights=(bal_weight_i, bal_weight_j), kernels=kernels, - balance_factor=balance_factor, - verbose=verbose, ) # Post-processing filters + # (0) keep only upper-triangle pixels: + upper_band = result["bin1_id"] < result["bin2_id"] + # (1) exclude pixels that connect loci further than 'band_to_cover' apart: is_inside_band = result["bin1_id"] > (result["bin2_id"] - band_to_cover) # (2) identify pixels that pass number of NaNs compliance test for ALL kernels: does_comply_nans = np.all( - result[[f"la_exp.{k}.nnans" for k in kernels]] < nans_tolerated, axis=1 + result[[f"la_exp.{k}.nnans" for k in kernels]] < max_nans_tolerated, axis=1 ) # so, selecting inside band and nNaNs compliant results: - # ( drop dropping index maybe ??? ) ... - res_df = result[is_inside_band & does_comply_nans].reset_index(drop=True) - # ####################################################################### - # # the following should be rewritten such that we return - # # opnly bare minimum number of columns per chunk - i.e. annotating is too heavy - # # to be performed here ... - # # - # # I'll do it here - switch off annotation, it'll break "extraction" and other - # # downstream stuff - but we'll fix it afterwards ... - # # - # ######################################################################## - # ######################################################################## - # # consider retiring Poisson testing from here, in case we - # # stick with l-chunking or opposite - add histogramming business here(!) - # ######################################################################## - # # do Poisson tests: - # get_pval = lambda la_exp : 1.0 - poisson.cdf(res_df["obs.raw"], la_exp) - # for k in kernels: - # res_df["la_exp."+k+".pval"] = get_pval( res_df["la_exp."+k+".value"] ) - # # annotate and return - # return cooler.annotate(res_df.reset_index(drop=True), clr.bins()[:]) - # - # so return only bin_ids , observed-raw (rename to counts, by Nezar's suggestion) - # and a bunch of locally adjusted expected estimates - 1 per kernel - - # that's the bare minimum ... 
- # - # per Nezar's suggestion - # rename obs.raw -> count - # this would break A LOT of downstream stuff - but let it be ... + res_df = result[upper_band & is_inside_band & does_comply_nans].reset_index( + drop=True + ) # + # so return only bare minimum: bin_ids , observed-raw counts + # and locally adjusted expected estimates for every kernel return res_df[ ["bin1_id", "bin2_id", "count"] + [f"la_exp.{k}.value" for k in kernels] ].astype(dtype={f"la_exp.{k}.value": "float64" for k in kernels}) def histogram_scored_pixels( - scored_df, kernels, ledges, verbose, obs_raw_name=observed_count_name + scored_df, kernels, ledges, obs_raw_name=observed_count_name ): """ - An attempt to implement HiCCUPS-like lambda-chunking - statistical procedure. - This function aims at building up a histogram of locally - adjusted expected scores for groups of characterized - pixels. - Such histograms are later used to compute FDR thresholds + An attempt to implement HiCCUPS-like lambda-binning statistical procedure. + This function aims at building up a histogram of locally adjusted + expected scores for groups of characterized pixels. + + Such histograms are subsequently used to compute FDR thresholds for different "classes" of hypothesis (classified by their - l.a. expected scores). + locally-adjusted expected (la_exp)). Parameters ---------- @@ -891,11 +857,9 @@ def histogram_scored_pixels( A dictionary with keys being kernels names and values being ndarrays representing those kernels. ledges : ndarray - An ndarray with bin lambda-edges for groupping loc. adj. expecteds, - i.e., classifying statistical hypothesis into lambda-classes. + An ndarray with bin lambda-edges for groupping locally adjusted + expecteds, i.e., classifying statistical hypothesis into lambda-bins. Left-most bin (-inf, 1], and right-most one (value,+inf]. - verbose : bool - Enable verbose output. obs_raw_name : str Name of the column/field that carry number of counts per pixel, i.e. observed raw counts. @@ -903,44 +867,17 @@ def histogram_scored_pixels( Returns ------- hists : dict of pandas.DataFrame - A dictionary of pandas.DataFrame with lambda/observed histogram for + A dictionary of pandas.DataFrame with lambda/observed 2D histogram for every kernel-type. Notes ----- - This is just an attempt to implement HiCCUPS-like lambda-chunking. - So we'll be returning histograms corresponding to the chunks of - scored pixels. - Consider modifying/accumulation a globally defined hists object, - or probably making something like a Feature2D class later on - where hists would be a class feature and histogramming_step would be - a method. - - + returning histograms corresponding to the chunks of scored pixels. """ - # lambda-chunking implies different 'pval' calculation - # procedure with a single Poisson expected for all the - # hypothesis in a same "class", i.e. with the l.a. expecteds - # from the same histogram bin. - - ######################## - # implementation ideas: - ######################## - # same observations/hypothesis needs to be classified according - # to different l.a. expecteds (i.e. for different kernel-types), - # which can be done with a pandas groupby, something like that: - # https://stackoverflow.com/questions/21441259 - # - # after that we could iterate over groups and do np.bincout on - # the "observed.raw" column (assuming it's of integer type) ... 
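+    # each resulting histogram is a 2D table: rows are observed raw counts,
+    # columns are lambda-bin intervals of the locally adjusted expected, and
+    # each entry is the number of pixels with that count whose "la_exp.{k}.value"
+    # falls into that lambda-bin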
- hists = {} for k in kernels: - # verbose: - if verbose: - logging.info(f"Building a histogram for kernel-type {k}") # we would need to generate a bunch of these histograms for all of the # kernel types: # needs to be lambda-binned : scored_df["la_exp."+k+".value"] @@ -948,37 +885,33 @@ def histogram_scored_pixels( # # lambda-bin index for kernel-type "k": lbins = pd.cut(scored_df[f"la_exp.{k}.value"], ledges) - # now for each lambda-bin construct a histogramm of "obs.raw": - obs_hist = {} - for lbin, grp_df in scored_df.groupby(lbins): - # check if obs.raw is integer of spome kind (temporary): - # obs.raw -> count - assert np.issubdtype(grp_df[obs_raw_name].dtype, np.integer) - # perform bincounting ... - obs_hist[lbin] = pd.Series(np.bincount(grp_df[obs_raw_name])) - # ACHTUNG! assigning directly to empty DF leads to data loss! - # turn ndarray in Series for ease of handling, i.e. - # assign a bunch of columns of different sizes to DataFrame. - # - # Consider updating global "hists" later on, or implementing a - # special class for that. Mind that Python multiprocessing - # "threads" are processes and thus cannot share/modify a shared - # memory location - deal with it, maybe dask-something ?! - # - # turned out that storing W1xW2 for every "thread" requires a ton - # of memory - that's why different sizes... @nvictus ? - # store W1x(<=W2) hist for every kernel-type: - hists[k] = pd.DataFrame(obs_hist).fillna(0).astype(np.int64) + # group scored_df by counts and lambda-bins to contructs histograms: + hists[k] = ( + scored_df.groupby([obs_raw_name, lbins], dropna=False, observed=False)[ + f"la_exp.{k}.value" + ] + .count() + .unstack() + .fillna(0) + .astype(np.int64) + ) # return a dict of DataFrames with a bunch of histograms: return hists -def determine_thresholds(kernels, ledges, gw_hist, fdr): +def determine_thresholds(gw_hist, fdr): """ given a 'gw_hist' histogram of observed counts - for each lambda-chunk for each kernel-type, and + for each lambda-bin and for each kernel-type, and also given a FDR, calculate q-values for each observed - count value in each lambda-chunk for each kernel-type. + count value in each lambda-bin for each kernel-type. + + Parameters + ---------- + gw_hist_kernels : dict + dictionary {kernel_name : 2D_hist}, where '2D_hist' is a pd.DataFrame + fdr : float + False Discovery Rate level Returns ------- @@ -988,330 +921,318 @@ def determine_thresholds(kernels, ledges, gw_hist, fdr): each chunk ... qvalues : dict A dictionary with keys being kernel names and values pandas.DataFrames - storing q-values: each column corresponds to a lambda-chunk, + storing q-values: each column corresponds to a lambda-bin, while rows correspond to observed pixels values. """ - rcs_hist = {} - rcs_Poisson = {} + + # extracting significantly enriched interactions: + # We have our *null* hypothesis: intensity of a HiC pixel is Poisson-distributed + # with a certain expected. In this case that would be *locally-adjusted expected*. 
+    #
+    # Thus for dot-calling, we could estimate a *p*-value for every pixel based
+    # on its observed intensity and its expected intensity, e.g.:
+    # lambda = la_exp["la_exp."+k+".value"]; pvals = 1.0 - poisson.cdf(la_exp["count"], lambda)
+    # However this is technically challenging (too many pixels - genome wide) and
+    # might not be sensitive enough due to the wide dynamic range of interaction counts.
+    # Instead we use the *lambda*-binning procedure from Rao et al 2014 to tackle
+    # both technical challenges and some issues associated with the wide dynamic range
+    # of the expected for the dot-calling (due to distance decay).
+    #
+    # Some extra points:
+    # 1. simple p-value thresholding should be replaced by a more "productive" FDR procedure, which is more tractable
+    # 2. it is "unfair" to treat all pixels with the same statistical test (multiple hypotheses) - the range of "expected" is too wide
+    # 3. (2) is addressed by splitting the pixels into groups by their locally adjusted expected - lambda-bins
+    # 4. the upper boundary of each lambda-bin is used as the expected for every pixel that belongs to the bin:
+    #    - for technical/efficiency reasons - pixels in a bin are tested all at once
+    # for each lambda-bin q-values are calculated in an efficient way:
+    # in part, efficiency comes from collapsing identical observed values, i.e. histogramming
+    # also the upper boundary of each lambda-bin is used as the expected for every pixel in this lambda-bin
+
    qvalues = {}
    threshold_df = {}
-    for k in kernels:
-        # Reverse cumulative histogram for this kernel.
-        # First row contains total # of pixels in each lambda-chunk.
-        rcs_hist[k] = gw_hist[k].iloc[::-1].cumsum(axis=0).iloc[::-1]
-
-        # Assign a unit Poisson distribution to each lambda-chunk.
-        # The expected value is the upper boundary of the lambda-chunk.
+    for k, _hist in gw_hist.items():
+        # Reverse cumulative histogram for kernel 'k'.
+        rcs_hist = _hist.iloc[::-1].cumsum(axis=0).iloc[::-1]
+        # 1st row of 'rcs_hist' contains total pixels-counts in each lambda-bin.
+        norm = rcs_hist.iloc[0, :]
+
+        # Assign a unit Poisson distribution to each lambda-bin.
+        # The expected value 'lambda' is the upper boundary of each lambda-bin:
        # poisson.sf = 1 - poisson.cdf, but more precise
-        # poisson.sf(-1,mu) == 1.0, i.e. is equivalent to the
-        # poisson.pmf(gw_hist[k].index, mu)[::-1].cumsum()[::-1]
-        rcs_Poisson[k] = pd.DataFrame()
-        for mu, column in zip(ledges[1:-1], gw_hist[k].columns):
-            renorm_factors = rcs_hist[k].loc[0, column]
-            rcs_Poisson[k][column] = renorm_factors * poisson.sf(
-                gw_hist[k].index - 1, mu
-            )
+        # poisson.sf(-1, lambda) == 1.0, i.e. is equivalent to the
+        # poisson.pmf(rcs_hist.index, lambda)[::-1].cumsum()[::-1]
+        # unit Poisson is a collection of 1-CDF distributions, one per lambda-bin,
+        # same dimensions as rcs_hist - matching lambda-bins and observed values:
+        unit_Poisson = pd.DataFrame().reindex_like(rcs_hist)
+        for lbin in rcs_hist.columns:
+            # numbers of occurrences for which we estimate the Poisson survival function
+            _occurrences = rcs_hist.index.to_numpy()
+            unit_Poisson[lbin] = poisson.sf(_occurrences, lbin.right)
+        # normalize the unit-Poisson distribution for the total pixel counts per lambda-bin
+        unit_Poisson = norm * unit_Poisson
 
        # Determine the threshold by checking the value at which 'fdr_diff'
-        # first turns positive. Fill NaNs with an "unreachably" high value.
-        fdr_diff = fdr * rcs_hist[k] - rcs_Poisson[k]
-        very_high_value = len(rcs_hist[k])
+        # first turns positive. Fill NaNs with a high value that is out of reach.
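+        # in other words, the threshold for a lambda-bin is the smallest observed
+        # count at which the expected number of pixels exceeding it under the
+        # per-bin Poisson null drops below 'fdr' times the number of pixels
+        # actually observed at or above it (a Benjamini-Hochberg-style criterion)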
+ _high_value = rcs_hist.index.max() + 1 + fdr_diff = ((fdr * rcs_hist) - unit_Poisson).cummax() + # cummax ensures monotonic increase of differences threshold_df[k] = ( - fdr_diff.where(fdr_diff > 0) - .apply(lambda col: col.first_valid_index()) - .fillna(very_high_value) + fdr_diff.mask(fdr_diff < 0) # mask negative with nans + .idxmin() # index of the first positive difference + .fillna(_high_value) # set high threshold if no pixels pass .astype(np.int64) ) - # q-values - # bear in mind some issues with lots of NaNs and Infs after - # such a brave operation ... - qvalues[k] = rcs_Poisson[k] / rcs_hist[k] + qvalues[k] = (unit_Poisson / rcs_hist).cummin() + # run cumulative min, on the array of adjusted p-values + # to make sure q-values are monotonously decreasing with pvals + qvalues[k] = qvalues[k].mask(qvalues[k] > 1.0, 1.0) return threshold_df, qvalues -def extract_scored_pixels( - scored_df, kernels, thresholds, ledges, verbose, obs_raw_name=observed_count_name -): +def extract_scored_pixels(scored_df, thresholds, obs_raw_name=observed_count_name): """ - An attempt to implement HiCCUPS-like lambda-chunking - statistical procedure. + Implementation of HiCCUPS-like lambda-binning statistical procedure. Use FDR thresholds for different "classes" of hypothesis - (classified by their l.a. expected scores), in order to - extract pixels that stand out. + (classified by their locally-adjusted expected (la_exp) scores), + in order to extract "enriched" pixels. Parameters ---------- scored_df : pd.DataFrame A table with the scoring information for a group of pixels. - kernels : dict - A dictionary with keys being kernel names and values being ndarrays - representing those kernels. thresholds : dict - A dictionary with keys being kernel names and values pandas.Series - indexed with Intervals defined by 'ledges' boundaries and storing FDR - thresholds for observed values. - ledges : ndarray - An ndarray with bin lambda-edges for groupping loc. adj. expecteds, - i.e., classifying statistical hypothesis into lambda-classes. - Left-most bin (-inf, 1], and right-most one (value,+inf]. - verbose : bool - Enable verbose output. + A dictionary {kernel_name : lambda_thresholds}, where 'lambda_thresholds' + are pd.Series with FDR thresholds indexed by lambda-bin intervals obs_raw_name : str - Name of the column/field that carry number of counts per pixel, + Name of the column/field with number of counts per pixel, i.e. observed raw counts. Returns ------- scored_df_slice : pandas.DataFrame - Filtered DataFrame of pixels extracted applying FDR thresholds. - - Notes - ----- - This is just an attempt to implement HiCCUPS-like lambda-chunking. + Filtered DataFrame of pixels that satisfy thresholds. """ - comply_fdr_list = np.ones(len(scored_df), dtype=np.bool) - - for k in kernels: - # using special features of IntervalIndex: - # http://pandas.pydata.org/pandas-docs/stable/advanced.html#intervalindex - # i.e. 
IntervalIndex can be .loc-ed with values that would be - # corresponded with their Intervals (!!!): - # obs.raw -> count - comply_fdr_k = ( - scored_df[obs_raw_name].values - > thresholds[k].loc[scored_df[f"la_exp.{k}.value"]].values + compliant_pixel_masks = [] + for kernel_name, threshold in thresholds.items(): + # locally adjusted expected (lambda) of the scored pixels: + lambda_of_pixels = scored_df[f"la_exp.{kernel_name}.value"] + # extract lambda_threshold for every scored pixel based on its estimated lambda, using + # interval-indexed 'threshold' to query it by the lambda-values of each scored pixel + threshold_of_pixels = threshold.loc[lambda_of_pixels] + compliant_pixel_masks.append( + scored_df[obs_raw_name].to_numpy() >= threshold_of_pixels.to_numpy() ) - # extracting q-values for all of the pixels takes a lot of time - # we'll do it externally for filtered_pixels only, in order to save - # time - # - # accumulate comply_fdr_k into comply_fdr_list - # using np.logical_and: - comply_fdr_list = np.logical_and(comply_fdr_list, comply_fdr_k) - # return a slice of 'scored_df' that complies FDR thresholds: - return scored_df[comply_fdr_list] - - -################################## -# large CLI-helper functions wrapping smaller step-specific ones: -# basically - the dot-calling steps - ONE PASS DOT-CALLING: -################################## - - -def scoring_step( - clr, - expected, - expected_name, - clr_weight_name, - tiles, - kernels, - max_nans_tolerated, - loci_separation_bins, - output_path, - nproc, - verbose, -): - """ - Calculates locally adjusted expected - for each pixel in a designated area of - the heatmap and return it as a big - single pixel-table (pandas.DataFrame) - """ - if verbose: - logging.info(f"Preparing to convolve {len(tiles)} tiles:") - - # check if cooler is balanced - try: - _ = is_cooler_balanced(clr, clr_weight_name, raise_errors=True) - except Exception as e: - raise ValueError( - f"provided cooler is not balanced or {clr_weight_name} is missing" - ) from e - - # add very_verbose to supress output from convolution of every tile - very_verbose = False - job = partial( - score_tile, - clr=clr, - cis_exp=expected, - exp_v_name=expected_name, - clr_weight_name=clr_weight_name, - kernels=kernels, - nans_tolerated=max_nans_tolerated, - band_to_cover=loci_separation_bins, - balance_factor=None, - verbose=very_verbose, - ) - - if nproc > 1: - pool = mp.Pool(nproc) - map_ = pool.imap - map_kwargs = dict(chunksize=int(np.ceil(len(tiles) / nproc))) - if verbose: - logging.info( - f"creating a Pool of {nproc} workers to tackle {len(tiles)} tiles" - ) - else: - map_ = map - if verbose: - logging.info("fallback to serial implementation.") - map_kwargs = {} - try: - # do the work here - chunks = map_(job, tiles, **map_kwargs) - # ########################################### - # # local - in memory copy of the dataframe (danger of RAM overuse) - # ########################################### - # reset index is required, otherwise there will be duplicate - # indices in the output of this function - return pd.concat(chunks).reset_index(drop=True) - finally: - if nproc > 1: - pool.close() + # return pixels from 'scored_df' that satisfy FDR thresholds for all kernels: + return scored_df[np.all(compliant_pixel_masks, axis=0)] def clustering_step( - scores_df, - expected_regions, + scored_df, dots_clustering_radius, - verbose, + assigned_regions_name="region", obs_raw_name=observed_count_name, ): """ - - This is a new "clustering" step updated for the pixels processed by 
lambda- - chunking multiple hypothesis testing. - - This method assumes that 'scores_df' is a DataFrame with all of the pixels - that needs to be clustered, thus there is no additional 'comply_fdr' column - and selection of compliant pixels. - - This step is a clustering-only (using Birch from scikit). + Group together adjacent significant pixels into clusters after + the lambda-binning multiple hypothesis testing by iterating over + assigned regions and calling `clust_2D_pixels`. Parameters ---------- - scores_df : pandas.DataFrame - DataFrame that stores filtered pixels that are ready to be - clustered, no more 'comply_fdr' column dependency. - expected_regions : iterable - An iterable of regions to be clustered. + scored_df : pandas.DataFrame + DataFrame with enriched pixels that are ready to be + clustered and are annotated with their genomic coordinates. dots_clustering_radius : int Birch-clustering threshold. - verbose : bool - Enable verbose output. + assigned_regions_name : str | None + Name of the column in scored_df to use for grouping pixels + before clustering. When None, full chromosome clustering is done. + obs_raw_name : str + name of the column with raw observed pixel counts Returns ------- centroids : pandas.DataFrame - Pixels from 'scores_df' annotated with clustering information. + Pixels from 'scored_df' annotated with clustering information. Notes ----- 'dots_clustering_radius' in Birch clustering algorithm corresponds to a double the clustering radius in the "greedy"-clustering used in HiCCUPS - (to be tested). """ - # Annotate regions, if needed: + # make sure provided pixels are annotated with genomic corrdinates and raw counts column is present: + if not {"chrom1", "chrom2", "start1", "start2", obs_raw_name}.issubset(scored_df): + raise ValueError("Scored pixels provided for clustering are not annotated") - scores_df = scores_df.copy() + scored_df = scored_df.copy() if ( - not "region" in scores_df.columns + not assigned_regions_name in scored_df.columns ): # If input scores are not annotated by regions: - scores_df["region"] = np.where( - scores_df["chrom1"] == scores_df["chrom2"], scores_df["chrom1"], np.nan + logging.warning( + f"No regions assigned to the scored pixels before clustering, using chromosomes" + ) + scored_df[assigned_regions_name] = np.where( + scored_df["chrom1"] == scored_df["chrom2"], scored_df["chrom1"], np.nan ) - # using different bin12_id_names since all - # pixels are annotated at this point. - pixel_clust_list = [] - for region in expected_regions: - # Perform clustering for each region separately. - df = scores_df[((scores_df["region"].astype(str) == str(region)))] - if not len(df): - continue + # cluster within each regions separately and accumulate the result: + pixel_clust_list = [] + scored_pixels_by_region = scored_df.groupby(assigned_regions_name, observed=True) + for region, _df in scored_pixels_by_region: + logging.info(f"clustering enriched pixels in region: {region}") + # Using genomic corrdinated for clustering, not bin_id pixel_clust = clust_2D_pixels( - df, + _df, threshold_cluster=dots_clustering_radius, bin1_id_name="start1", bin2_id_name="start2", - verbose=verbose, ) pixel_clust_list.append(pixel_clust) - if verbose: - logging.info("Clustering is over!") + logging.info("Clustering is complete") + # concatenate clustering results ... # indexing information persists here ... - - if len(pixel_clust_list) == 0: - if verbose: - logging.info("No clusters found! 
Output will be empty") + if not pixel_clust_list: + logging.warning("No clusters found for any regions! Output will be empty") empty_output = pd.DataFrame( [], - columns=list(scores_df.columns) - + ["region1", "region2", "c_label", "c_size", "cstart1", "cstart2"], + columns=list(scored_df.columns) + + [ + assigned_regions_name + "1", + assigned_regions_name + "2", + "c_label", + "c_size", + "cstart1", + "cstart2", + ], ) return empty_output # Empty dataframe with the same columns as anticipated - pixel_clust_df = pd.concat( - pixel_clust_list, ignore_index=False - ) # Concatenate the clustering results for different regions + else: + pixel_clust_df = pd.concat( + pixel_clust_list, ignore_index=False + ) # Concatenate the clustering results for different regions - # now merge pixel_clust_df and scores_df DataFrame ... - # # and merge (index-wise) with the main DataFrame: + # now merge pixel_clust_df and scored_df DataFrame ... + # TODO make a more robust merge here df = pd.merge( - scores_df, pixel_clust_df, how="left", left_index=True, right_index=True + scored_df, pixel_clust_df, how="left", left_index=True, right_index=True ) - # prevents scores_df categorical values (all chroms, including chrM) - df["region1"] = df["region"].astype(str) - df["region2"] = df["region"].astype(str) + # TODO check if next str-cast is neccessary + df[assigned_regions_name + "1"] = df[assigned_regions_name].astype(str) + df[assigned_regions_name + "2"] = df[assigned_regions_name].astype(str) # report only centroids with highest Observed: - chrom_clust_group = df.groupby(["region1", "region2", "c_label"]) + chrom_clust_group = df.groupby( + [assigned_regions_name + "1", assigned_regions_name + "2", "c_label"], + observed=True, + ) centroids = df.loc[ chrom_clust_group[obs_raw_name].idxmax() ] # Select the brightest pixel in the cluster return centroids -# consider switching step names - "extraction" to "FDR-thresholding" -# and "thresholding" to "enrichment" - for final enrichment selection: -def thresholding_step(centroids, obs_raw_name=observed_count_name): - # (2) - # filter by FDR, enrichment etc: - enrichment_factor_1 = 1.5 - enrichment_factor_2 = 1.75 - enrichment_factor_3 = 2.0 - FDR_orphan_threshold = 0.02 - ###################################################################### - # # Temporarily remove orphans filtering, until q-vals are calculated: - ###################################################################### +def cluster_filtering_hiccups( + centroids, + obs_raw_name=observed_count_name, + enrichment_factor_vh=1.5, + enrichment_factor_d_and_ll=1.75, + enrichment_factor_d_or_ll=2.0, + FDR_orphan_threshold=0.02, +): + """ + Centroids of enriched pixels can be filtered to further minimize + the amount of false-positive dot-calls. + + First, centroids are filtered on enrichment relative to the + locally-adjusted expected for the "donut", "lowleft", "vertical", + and "horizontal" kernels. Additionally, singleton pixels + (i.e. pixels that do not belong to a cluster) are filtered based on + a combined q-values for all kernels. This empirical filtering approach + was developed in Rao et al 2014 and results in a conservative dot-calls + with the low rate of false-positive calls. + + Parameters + ---------- + centroids : pd.DataFrame + DataFrame that stores enriched and clustered pixels. + obs_raw_name : str + name of the column with raw observed pixel counts + enrichment_factor_vh : float + minimal enrichment factor for pixels relative to + both "vertical" and "horizontal" kernel. 
+ enrichment_factor_d_and_ll : float + minimal enrichment factor for pixels relative to + both "donut" and "lowleft" kernels. + enrichment_factor_d_or_ll : float + minimal enrichment factor for pixels relative to + either "donut" or" "lowleft" kenels. + FDR_orphan_threshold : float + minimal combined q-value for singleton pixels. + + Returns + ------- + filtered_centroids : pd.DataFrame + filtered dot-calls + """ + # make sure input DataFrame of pixels has been clustered: + if not "c_size" in centroids: + raise ValueError(f"input dataframe of pixels does not seem to be clustered") + + # make sure input DataFrame of pixels has been annotated with genomic coordinates and raw counts column is present: + if not {"chrom1", "chrom2", "start1", "start2", obs_raw_name}.issubset(centroids): + raise ValueError( + "input dataframe of clustered pixels provided for filtering is not annotated" + ) + + # make sureinput DataFrame of pixels was scored using 4-hiccups kernels (donut, lowleft, vertical, horizontal): + _hiccups_kernel_cols_set = { + "la_exp.donut.value", + "la_exp.vertical.value", + "la_exp.horizontal.value", + "la_exp.lowleft.value", + # and q-values as well + "la_exp.donut.qval", + "la_exp.vertical.qval", + "la_exp.horizontal.qval", + "la_exp.lowleft.qval", + } + # make sure input DataFrame of pixels has been annotated with genomic coordinates and raw counts column is present: + if not _hiccups_kernel_cols_set.issubset(centroids): + raise ValueError( + "clustered pixels provided for filtering were not scored with 4 hiccups kernels" + ) + + # ad hoc filtering by enrichment, FDR for singletons etc. + # employed in Rao et al 2014 HiCCUPS enrichment_fdr_comply = ( ( centroids[obs_raw_name] - > enrichment_factor_2 * centroids["la_exp.lowleft.value"] + > enrichment_factor_d_and_ll * centroids["la_exp.lowleft.value"] ) & ( centroids[obs_raw_name] - > enrichment_factor_2 * centroids["la_exp.donut.value"] + > enrichment_factor_d_and_ll * centroids["la_exp.donut.value"] ) & ( centroids[obs_raw_name] - > enrichment_factor_1 * centroids["la_exp.vertical.value"] + > enrichment_factor_vh * centroids["la_exp.vertical.value"] ) & ( centroids[obs_raw_name] - > enrichment_factor_1 * centroids["la_exp.horizontal.value"] + > enrichment_factor_vh * centroids["la_exp.horizontal.value"] ) & ( ( centroids[obs_raw_name] - > enrichment_factor_3 * centroids["la_exp.lowleft.value"] + > enrichment_factor_d_or_ll * centroids["la_exp.lowleft.value"] ) | ( centroids[obs_raw_name] - > enrichment_factor_3 * centroids["la_exp.donut.value"] + > enrichment_factor_d_or_ll * centroids["la_exp.donut.value"] ) ) & ( @@ -1327,63 +1248,19 @@ def thresholding_step(centroids, obs_raw_name=observed_count_name): ) ) ) - # # - # enrichment_fdr_comply = ( - # (centroids[obs_raw_name] > enrichment_factor_2 * centroids["la_exp.lowleft.value"]) & - # (centroids[obs_raw_name] > enrichment_factor_2 * centroids["la_exp.donut.value"]) & - # (centroids[obs_raw_name] > enrichment_factor_1 * centroids["la_exp.vertical.value"]) & - # (centroids[obs_raw_name] > enrichment_factor_1 * centroids["la_exp.horizontal.value"]) & - # ( (centroids[obs_raw_name] > enrichment_factor_3 * centroids["la_exp.lowleft.value"]) - # | (centroids[obs_raw_name] > enrichment_factor_3 * centroids["la_exp.donut.value"]) ) - # ) - # use "enrichment_fdr_comply" to filter out - # non-satisfying pixels: - out = centroids[enrichment_fdr_comply] - - # ... - # to be added to the list of output columns: - # "factor_balance."+"lowleft"+".KerObs" - # ... 
- - # tentaive output columns list: - columns_for_output = [ - "chrom1", - "start1", - "end1", - "chrom2", - "start2", - "end2", - "cstart1", - "cstart2", - "c_label", - "c_size", - obs_raw_name, - # 'exp.raw', - "la_exp.donut.value", - "la_exp.vertical.value", - "la_exp.horizontal.value", - "la_exp.lowleft.value", - # "factor_balance.lowleft.KerObs", - # 'la_exp.upright.value', - # 'la_exp.upright.qval', - "la_exp.donut.qval", - "la_exp.vertical.qval", - "la_exp.horizontal.qval", - "la_exp.lowleft.qval", - ] - return out[columns_for_output] + # use "enrichment_fdr_comply" to filter out non-satisfying pixels: + return centroids[enrichment_fdr_comply].reset_index(drop=True) -################################## -# large CLI-helper functions wrapping smaller step-specific ones: -# basically - the dot-calling steps - ON THE FLY - 2 PASS DOT-CALLING (HiCCUPS-style): -################################## +#################################################################### +# large helper functions wrapping smaller step-specific ones +#################################################################### def scoring_and_histogramming_step( clr, - expected, - expected_name, + expected_indexed, + expected_value_col, clr_weight_name, tiles, kernels, @@ -1391,224 +1268,404 @@ def scoring_and_histogramming_step( max_nans_tolerated, loci_separation_bins, nproc, - verbose, ): """ - This is a derivative of the 'scoring_step' which is supposed to implement - the 1st of the lambda-chunking procedure - histogramming. + This implements the 1st step of the lambda-binning scoring procedure - histogramming. - Basically we are piping scoring operation together with histogramming into a + In short, this pipes a scoring operation together with histogramming into a single pipeline of per-chunk operations/transforms. """ - if verbose: - logging.info(f"Preparing to convolve {len(tiles)} tiles:") - - # add very_verbose to supress output from convolution of every tile - very_verbose = False - - # check if cooler is balanced - try: - _ = is_cooler_balanced(clr, clr_weight_name, raise_errors=True) - except Exception as e: - raise ValueError( - f"provided cooler is not balanced or {clr_weight_name} is missing" - ) from e + logging.info(f"convolving {len(tiles)} tiles to build histograms for lambda-bins") # to score per tile: to_score = partial( score_tile, clr=clr, - cis_exp=expected, - exp_v_name=expected_name, + expected_indexed=expected_indexed, + expected_value_col=expected_value_col, clr_weight_name=clr_weight_name, kernels=kernels, - nans_tolerated=max_nans_tolerated, + max_nans_tolerated=max_nans_tolerated, band_to_cover=loci_separation_bins, - # do not calculate dynamic-donut criteria - # for now. 
- balance_factor=None, - verbose=very_verbose, ) # to hist per scored chunk: - to_hist = partial( - histogram_scored_pixels, kernels=kernels, ledges=ledges, verbose=very_verbose - ) + to_hist = partial(histogram_scored_pixels, kernels=kernels, ledges=ledges) - # composing/piping scoring and histogramming - # together : + # compose scoring and histogramming together : job = lambda tile: to_hist(to_score(tile)) - # copy paste from @nvictus modified 'scoring_step': + # standard multiprocessing implementation if nproc > 1: + logging.info(f"creating a Pool of {nproc} workers to tackle {len(tiles)} tiles") pool = mp.Pool(nproc) map_ = pool.imap map_kwargs = dict(chunksize=int(np.ceil(len(tiles) / nproc))) - if verbose: - logging.info( - f"creating a Pool of {nproc} workers to tackle {len(tiles)} tiles" - ) else: + logging.info("fallback to serial implementation.") map_ = map - if verbose: - logging.info("fallback to serial implementation.") map_kwargs = {} try: # consider using # https://github.com/mirnylab/cooler/blob/9e72ee202b0ac6f9d93fd2444d6f94c524962769/cooler/tools.py#L59 - # here: - hchunks = map_(job, tiles, **map_kwargs) - # hchunks TO BE ACCUMULATED - # hopefully 'hchunks' would stay in memory - # until we would get a chance to accumulate them: + histogram_chunks = map_(job, tiles, **map_kwargs) finally: if nproc > 1: pool.close() - # - # now we need to combine/sum all of the histograms - # for different kernels: - # - # assuming we know "kernels" - # this is very ugly, but ok - # for the draft lambda-chunking - # lambda version of lambda-chunking: + + # now we need to combine/sum all of the histograms for different kernels: def _sum_hists(hx, hy): - # perform a DataFrame summation - # for every value of the dictionary: + # perform a DataFrame summation for every value of the dictionary: hxy = {} for k in kernels: - hxy[k] = hx[k].add(hy[k], fill_value=0).astype(np.int64) + hxy[k] = hx[k].add(hy[k], fill_value=0).fillna(0).astype(np.int64) # returning the sum: return hxy - # ###################################################### - # this approach is tested and at the very least - # number of pixels in a dump list matches - # with the .sum().sum() of the histogram - # both for 10kb and 5kb, - # thus we should consider this as a reference - # implementation, albeit not a very efficient one ... - # ###################################################### - final_hist = reduce(_sum_hists, hchunks) - # we have to make sure there is nothing in the - # top bin, i.e., there are no l.a. 
expecteds > base^(len(ledges)-1) + # TODO consider more efficient implementation to accumulate histograms: + final_hist = reduce(_sum_hists, histogram_chunks) + # we have to make sure there is nothing in the last lambda-bin + # this is a temporary implementation detail, until we implement dynamic lambda-bins for k in kernels: - last_la_exp_bin = final_hist[k].columns[-1] - last_la_exp_vals = final_hist[k].iloc[:, -1] - # checking the top bin: - if last_la_exp_vals.sum() != 0: + last_lambda_bin = final_hist[k].iloc[:, -1] + if last_lambda_bin.sum() != 0: raise ValueError( - f"There are la_exp.{k}.value in {last_la_exp_bin}, please check the histogram" + f"There are la_exp.{k}.value in {last_lambda_bin.name}, please check the histogram" ) - # drop that last column/bin (last_edge, +inf]: - final_hist[k] = final_hist[k].drop(columns=last_la_exp_bin) - # consider dropping all of the columns that have zero .sum() + # drop all lambda-bins that do not have pixels in them: + final_hist[k] = final_hist[k].loc[:, final_hist[k].sum() > 0] + # make sure index (observed pixels counts) is sorted + if not final_hist[k].index.is_monotonic: + raise ValueError(f"Histogram for {k}-kernel is not sorted") # returning filtered histogram return final_hist def scoring_and_extraction_step( clr, - expected, - expected_name, + expected_indexed, + expected_value_col, clr_weight_name, tiles, kernels, ledges, thresholds, max_nans_tolerated, - balance_factor, loci_separation_bins, - output_path, nproc, - verbose, bin1_id_name="bin1_id", bin2_id_name="bin2_id", ): """ - This is a derivative of the 'scoring_step' which is supposed to implement - the 2nd of the lambda-chunking procedure - extracting pixels that are FDR - compliant. + This implements the 2nd step of the lambda-binning scoring procedure, + extracting pixels that are FDR compliant. - Basically we are piping scoring operation together with extraction into a + In short, this combines scoring with with extraction into a single pipeline of per-chunk operations/transforms. 
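+
+    Conceptually, the per-tile operation is the following composition (a simplified
+    sketch; the actual implementation builds `to_score` and `to_extract` with
+    functools.partial and may distribute the work over a multiprocessing pool)::
+
+        job = lambda tile: extract_scored_pixels(score_tile(tile, ...), thresholds)
+        significant_pixels = pd.concat(map(job, tiles), ignore_index=True)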
""" - if verbose: - logging.info(f"Preparing to convolve {len(tiles)} tiles:") - - # add very_verbose to supress output from convolution of every tile - very_verbose = False - - # check if cooler is balanced - try: - _ = is_cooler_balanced(clr, clr_weight_name, raise_errors=True) - except Exception as e: - raise ValueError( - f"provided cooler is not balanced or {clr_weight_name} is missing" - ) from e + logging.info(f"convolving {len(tiles)} tiles to extract enriched pixels") # to score per tile: to_score = partial( score_tile, clr=clr, - cis_exp=expected, - exp_v_name=expected_name, + expected_indexed=expected_indexed, + expected_value_col=expected_value_col, clr_weight_name=clr_weight_name, kernels=kernels, - nans_tolerated=max_nans_tolerated, + max_nans_tolerated=max_nans_tolerated, band_to_cover=loci_separation_bins, - balance_factor=balance_factor, - verbose=very_verbose, ) # to hist per scored chunk: to_extract = partial( extract_scored_pixels, - kernels=kernels, thresholds=thresholds, - ledges=ledges, - verbose=very_verbose, ) - # composing/piping scoring and histogramming - # together : + # compose scoring and histogramming together job = lambda tile: to_extract(to_score(tile)) - # copy paste from @nvictus modified 'scoring_step': + # standard multiprocessing implementation if nproc > 1: + logging.info(f"creating a Pool of {nproc} workers to tackle {len(tiles)} tiles") pool = mp.Pool(nproc) map_ = pool.imap map_kwargs = dict(chunksize=int(np.ceil(len(tiles) / nproc))) - if verbose: - logging.info( - f"creating a Pool of {nproc} workers to tackle {len(tiles)} tiles" - ) else: + logging.info("fallback to serial implementation.") map_ = map - if verbose: - logging.info("fallback to serial implementation.") map_kwargs = {} try: # consider using # https://github.com/mirnylab/cooler/blob/9e72ee202b0ac6f9d93fd2444d6f94c524962769/cooler/tools.py#L59 - # here: filtered_pix_chunks = map_(job, tiles, **map_kwargs) significant_pixels = pd.concat(filtered_pix_chunks, ignore_index=True) - if output_path is not None: - significant_pixels.to_csv( - output_path, sep="\t", header=True, index=False, compression=None - ) finally: if nproc > 1: pool.close() - # there should be no duplicates in the "significant_pixels" DataFrame of pixels: - significant_pixels_dups = significant_pixels.duplicated() - if significant_pixels_dups.any(): + # same pixels should never be scored >1 times with the current tiling of the interactions matrix + if significant_pixels.duplicated().any(): raise ValueError( - f"Duplicated pixels detected during exctraction {significant_pixels[significant_pixels_dups]}" + f"Some pixels were scored more than one time, matrix tiling procedure is not correct" ) # sort the result just in case and drop its index: return significant_pixels.sort_values(by=[bin1_id_name, bin2_id_name]).reset_index( drop=True ) + + +# user-friendly high-level API function +def dots( + clr, + expected, + expected_value_col="balanced.avg", + clr_weight_name="weight", + view_df=None, + kernels=None, + max_loci_separation=10_000_000, + max_nans_tolerated=1, # test if this has desired behavior + n_lambda_bins=40, # update this eventually + lambda_bin_fdr=0.1, + clustering_radius=20_000, + cluster_filtering=None, + tile_size=5_000_000, + nproc=1, +): + """ + Call dots on a cooler {clr}, using {expected} defined in regions specified + in {view_df}. + + All convolution kernels specified in {kernels} will be all applied to the {clr}, + and statistical testing will be performed separately for each kernel. 
A convolutional
+    kernel is a small square matrix (e.g. 7x7) of zeros and ones
+    that defines a "mask" to extract local expected around each pixel. Since the
+    enrichment is calculated relative to the central pixel, kernel width should
+    be an odd number >=3.
+
+    Parameters
+    ----------
+    clr : cooler.Cooler
+        A cooler with balanced Hi-C data.
+    expected : DataFrame in expected format
+        Diagonal summary statistics for each chromosome; the column holding
+        the values of expected to use is specified by expected_value_col.
+    expected_value_col : str
+        Name of the column in expected that holds the values of expected.
+    clr_weight_name : str
+        Name of the column in clr.bins to use as balancing weights.
+        Using raw unbalanced data is not supported for dot-calling.
+    view_df : viewframe
+        Viewframe with genomic regions. At the moment the view has to match the
+        view used for generating expected. If None, it is generated from the cooler.
+    kernels : { str:np.ndarray } | None
+        A dictionary of convolution kernels to be used for calculating locally adjusted
+        expected. If None, the default HiCCUPS kernels are recommended
+        based on the resolution of the cooler.
+    max_loci_separation : int
+        Maximum loci separation for dot-calling, i.e., do not call dots for
+        loci that are further than max_loci_separation basepairs apart. Default 10Mb.
+    max_nans_tolerated : int
+        Maximum number of NaNs tolerated in the footprint of every used kernel.
+        Adjust with caution, as a large max_nans_tolerated might lead to artifacts
+        in pixel scoring.
+    n_lambda_bins : int
+        Number of log-spaced bins in which FDR-testing will be performed independently.
+        TODO: generate lambda-bins on the fly based on the dynamic range of the data (i.e. maximum pixel count)
+    lambda_bin_fdr : float
+        False discovery rate (FDR) for the BH-FDR multiple hypothesis testing procedure, applied per lambda-bin.
+    clustering_radius : None | int
+        Cluster enriched pixels with a given radius. "Brightest" pixels in each group
+        will be reported as the final dot-calls. If None, no clustering is performed.
+    cluster_filtering : bool
+        Whether to apply additional filtering to centroids after clustering, using cluster_filtering_hiccups().
+    tile_size : int
+        Tile size for the Hi-C heatmap tiling. Typically on the order of several megabases,
+        and <= max_loci_separation. Controls the tradeoff between memory consumption and speed of execution.
+    nproc : int
+        Number of processes to use for multiprocessing.
+
+    Returns
+    -------
+    dots : pandas.DataFrame
+        BEDPE-style DataFrame with genomic coordinates of called dots and additional annotations.
+
+    Notes
+    -----
+    'clustering_radius' in the Birch clustering algorithm corresponds to
+    double the clustering radius in the "greedy" clustering used in HiCCUPS
+    (to be tested).
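+
+    A minimal usage sketch (assuming `clr` is a balanced cooler and `expected_df`
+    is a matching cis expected table; parameter values below are illustrative,
+    not recommendations)::
+
+        from cooltools.api.dotfinder import dots
+
+        dot_calls_df = dots(
+            clr,
+            expected_df,
+            view_df=None,             # use all chromosomes from the cooler
+            kernels=None,             # recommend default HiCCUPS-style kernels
+            max_loci_separation=10_000_000,
+            clustering_radius=20_000,
+            nproc=4,
+        )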
+ + TODO describe sequence of processing steps + + """ + + #### Generate viewframes #### + if view_df is None: + view_df = make_cooler_view(clr) + else: + try: + _ = is_compatible_viewframe( + view_df, + clr, + check_sorting=True, + raise_errors=True, + ) + except Exception as e: + raise ValueError("view_df is not a valid viewframe or incompatible") from e + + # check balancing status + if clr_weight_name: + # check if cooler is balanced + try: + _ = is_cooler_balanced(clr, clr_weight_name, raise_errors=True) + except Exception as e: + raise ValueError( + f"provided cooler is not balanced or {clr_weight_name} is missing" + ) from e + else: + raise ValueError("calling dots on raw data is not supported.") + + # add checks to make sure cis-expected is symmetric + # make sure provided expected is compatible + try: + _ = is_valid_expected( + expected, + "cis", + view_df, + verify_cooler=clr, + expected_value_cols=[ + expected_value_col, + ], + raise_errors=True, + ) + except Exception as e: + raise ValueError("provided expected is not compatible") from e + + # Prepare some parameters. + binsize = clr.binsize + loci_separation_bins = bp_to_bins(max_loci_separation, binsize) + tile_size_bins = bp_to_bins(tile_size, binsize) + + # verify provided kernels or recommend them (HiCCUPS)... + if kernels and is_compatible_kernels(kernels, binsize, max_nans_tolerated): + warnings.warn( + "Compatibility checks for 'kernels' are not fully implemented yet, use at your own risk" + ) + else: + # recommend them (default hiccups ones for now) + kernels = recommend_kernels(binsize) + # deduce kernel_width - overall footprint + kernel_width = max(len(k) for k in kernels.values()) # 2*w+1 + kernel_half_width = int((kernel_width - 1) / 2) # former w parameter + + # try to guess required lambda bins using "max" value of pixel counts + # statistical: lambda-binning edges ... + if not 40 <= n_lambda_bins <= 50: + raise ValueError(f"Incompatible n_lambda_bins={n_lambda_bins}") + BASE = 2 ** (1 / 3) # very arbitrary - parameterize ! + ledges = np.concatenate( + ( + [-np.inf], + np.logspace( + 0, + n_lambda_bins - 1, + num=n_lambda_bins, + base=BASE, + dtype=np.float64, + ), + [np.inf], + ) + ) + + # list of tile coordinate ranges + tiles = list( + generate_tiles_diag_band( + clr, view_df, kernel_half_width, tile_size_bins, loci_separation_bins + ) + ) + + # 1. Calculate genome-wide histograms of scores. + gw_hist = scoring_and_histogramming_step( + clr, + expected.set_index(["region1", "region2", "dist"]), + expected_value_col=expected_value_col, + clr_weight_name=clr_weight_name, + tiles=tiles, + kernels=kernels, + ledges=ledges, + max_nans_tolerated=max_nans_tolerated, + loci_separation_bins=loci_separation_bins, + nproc=nproc, + ) + + logging.info("Done building histograms ...") + + # 2. Determine the FDR thresholds. + threshold_df, qvalues = determine_thresholds(gw_hist, lambda_bin_fdr) + + # 3. Filter using FDR thresholds calculated in the histogramming step + filtered_pixels = scoring_and_extraction_step( + clr, + expected.set_index(["region1", "region2", "dist"]), + expected_value_col=expected_value_col, + clr_weight_name=clr_weight_name, + tiles=tiles, + kernels=kernels, + ledges=ledges, + thresholds=threshold_df, + max_nans_tolerated=max_nans_tolerated, + loci_separation_bins=loci_separation_bins, + nproc=nproc, + bin1_id_name="bin1_id", + bin2_id_name="bin2_id", + ) + + # 4. 
Post-processing + logging.info(f"Begin post-processing of {len(filtered_pixels)} filtered pixels") + logging.info("preparing to extract needed q-values ...") + + # annotate enriched pixels + filtered_pixels_qvals = annotate_pixels_with_qvalues(filtered_pixels, qvalues) + filtered_pixels_annotated = cooler.annotate(filtered_pixels_qvals, clr.bins()[:]) + if not clustering_radius: + # TODO: make sure returned DataFrame has the same columns as "postprocessed_calls" + # columns to return before-clustering + output_cols = [] + output_cols += bedpe_required_cols + output_cols += [ + observed_count_name, + ] + output_cols += [f"la_exp.{k}.value" for k in kernels] + output_cols += [f"la_exp.{k}.qval" for k in kernels] + return filtered_pixels_annotated[output_cols] + + # 4a. clustering + # Clustering is done independently for every region, therefore regions must be assigned: + filtered_pixels_annotated = assign_regions(filtered_pixels_annotated, view_df) + centroids = clustering_step( + filtered_pixels_annotated, + clustering_radius, + ).reset_index(drop=True) + + # columns to return post-clustering + output_cols = [] + output_cols += bedpe_required_cols + output_cols += [ + "cstart1", + "cstart2", + "c_label", + "c_size", + observed_count_name, + ] + output_cols += [f"la_exp.{k}.value" for k in kernels] + output_cols += [f"la_exp.{k}.qval" for k in kernels] + + # 4b. filter by enrichment and qval + if cluster_filtering is None: # default - engage HiCCUPS filtering + postprocessed_calls = cluster_filtering_hiccups(centroids) + elif not cluster_filtering: + postprocessed_calls = centroids + + return postprocessed_calls diff --git a/cooltools/cli/dots.py b/cooltools/cli/dots.py index afe07266..ea7e9021 100644 --- a/cooltools/cli/dots.py +++ b/cooltools/cli/dots.py @@ -13,7 +13,6 @@ from ..lib.common import make_cooler_view, assign_regions from ..lib.io import read_viewframe_from_file, read_expected_from_file - from .util import validate_csv logging.basicConfig(level=logging.INFO) @@ -86,21 +85,7 @@ show_default=True, ) @click.option( - "--kernel-width", - help="Outer half-width of the convolution kernel in pixels" - " e.g. outer size (w) of the 'donut' kernel, with the 2*w+1" - " overall footprint of the 'donut'.", - type=int, -) -@click.option( - "--kernel-peak", - help="Inner half-width of the convolution kernel in pixels" - " e.g. inner size (p) of the 'donut' kernel, with the 2*p+1" - " overall footprint of the punch-hole.", - type=int, -) -@click.option( - "--num-lambda-chunks", + "--num-lambda-bins", help="Number of log-spaced bins to divide your adjusted expected" " between. Same as HiCCUPS_W1_MAX_INDX (40) in the original HiCCUPS.", type=int, @@ -116,7 +101,7 @@ show_default=True, ) @click.option( - "--dots-clustering-radius", + "--clustering-radius", help="Radius for clustering dots that have been called too close to each other." 
"Typically on order of 40 kilo-bases, and >= binsize.", type=int, @@ -128,10 +113,8 @@ ) @click.option( "-o", - "--out-prefix", - help="Specify prefix for the output file, to store results of dot-calling:" - " all enriched pixels as prefix + '.enriched.tsv'," - " and post-processed dots (clustered,filtered) as prefix + '.postproc.bedpe'", + "--output", + help="Specify output file name to store called dots in a BEDPE-like format", type=str, required=True, ) @@ -144,13 +127,11 @@ def dots( max_loci_separation, max_nans_tolerated, tile_size, - kernel_width, - kernel_peak, - num_lambda_chunks, + num_lambda_bins, fdr, - dots_clustering_radius, + clustering_radius, verbose, - out_prefix, + output, ): """ Call dots on a Hi-C heatmap that are not larger than max_loci_separation. @@ -176,167 +157,44 @@ def dots( clr = cooler.Cooler(cool_path) expected_path, expected_value_col = expected_path - #### Generate viewframes #### - # 1:cooler_view_df. Generate viewframe from clr.chromsizes: - cooler_view_df = make_cooler_view(clr) - - # 2:view_df. Define global view for calculating calling dots - # use input "view" BED file or all chromosomes : + # Either use view from file or all chromosomes in the provided cooler if view is None: - view_df = cooler_view_df + view_df = make_cooler_view(clr) else: view_df = read_viewframe_from_file(view, clr, check_sorting=True) - #### Read expected: #### - expected_summary_cols = [ - expected_value_col, - ] expected = read_expected_from_file( expected_path, contact_type="cis", - expected_value_cols=expected_summary_cols, + expected_value_cols=[expected_value_col], verify_view=view_df, verify_cooler=clr, ) - # add checks to make sure cis-expected is symmetric - - # Prepare some parameters. - binsize = clr.binsize - loci_separation_bins = int(max_loci_separation / binsize) - tile_size_bins = int(tile_size / binsize) - balance_factor = 1.0 # clr._load_attrs("bins/weight")["scale"] - - # clustering would deal with bases-units for now, so supress this for now - # clustering_radius_bins = int(dots_clustering_radius/binsize) - - # kernels - # 'upright' is a symmetrical inversion of "lowleft", not needed. - ktypes = ["donut", "vertical", "horizontal", "lowleft"] - - if (kernel_width is None) or (kernel_peak is None): - w, p = api.dotfinder.recommend_kernel_params(binsize) - logging.info( - f"Using kernel parameters w={w}, p={p} recommended for binsize {binsize}" - ) - else: - w, p = kernel_width, kernel_peak - # add some sanity check for w,p: - if not w > p: - raise ValueError(f"Wrong inner/outer kernel parameters w={w}, p={p}") - logging.info(f"Using kernel parameters w={w}, p={p} provided by user") - - # once kernel parameters are setup check max_nans_tolerated - # to make sure kernel footprints overlaping 1 side with the - # NaNs filled row/column are not "allowed" - # this requires dynamic adjustment for the "shrinking donut" - if not max_nans_tolerated <= 2 * w: - raise ValueError("Too many NaNs allowed!") - # may lead to scoring the same pixel twice, - i.e. duplicates. - - # generate standard kernels - consider providing custom ones - kernels = {k: api.dotfinder.get_kernel(w, p, k) for k in ktypes} - - # list of tile coordinate ranges - tiles = list( - api.dotfinder.heatmap_tiles_generator_diag( - clr, view_df, w, tile_size_bins, loci_separation_bins - ) - ) - # lambda-chunking edges ... 
- if not 40 <= num_lambda_chunks <= 50: - raise ValueError("Incompatible num_lambda_chunks") - base = 2 ** (1 / 3) - ledges = np.concatenate( - ( - [-np.inf], - np.logspace( - 0, - num_lambda_chunks - 1, - num=num_lambda_chunks, - base=base, - dtype=np.float64, - ), - [np.inf], - ) - ) - - # 1. Calculate genome-wide histograms of scores. - gw_hist = api.dotfinder.scoring_and_histogramming_step( - clr, - expected.set_index(["region1", "region2", "dist"]), - expected_value_col, - clr_weight_name, - tiles, - kernels, - ledges, - max_nans_tolerated, - loci_separation_bins, - nproc, - verbose, - ) - - if verbose: - logging.info("Done building histograms ...") - - # 2. Determine the FDR thresholds. - threshold_df, qvalues = api.dotfinder.determine_thresholds( - kernels, ledges, gw_hist, fdr - ) - - # 3. Filter using FDR thresholds calculated in the histogramming step - filtered_pixels = api.dotfinder.scoring_and_extraction_step( + dot_calls_df = api.dotfinder.dots( clr, - expected.set_index(["region1", "region2", "dist"]), - expected_value_col, - clr_weight_name, - tiles, - kernels, - ledges, - threshold_df, - max_nans_tolerated, - balance_factor, - loci_separation_bins, - op.join(op.dirname(out_prefix), op.basename(out_prefix) + ".enriched.tsv"), - nproc, - verbose, - bin1_id_name="bin1_id", - bin2_id_name="bin2_id", + expected, + expected_value_col=expected_value_col, + clr_weight_name=clr_weight_name, + view_df=view_df, + kernels=None, # engaging default HiCCUPS kernels + max_loci_separation=max_loci_separation, + max_nans_tolerated=max_nans_tolerated, # test if this has desired behavior + n_lambda_bins=num_lambda_bins, # update this eventually + lambda_bin_fdr=fdr, + clustering_radius=clustering_radius, + cluster_filtering=None, + tile_size=tile_size, + nproc=nproc, ) - # 4. Post-processing - if verbose: - logging.info(f"Begin post-processing of {len(filtered_pixels)} filtered pixels") - logging.info("preparing to extract needed q-values ...") - - filtered_pixels_qvals = api.dotfinder.annotate_pixels_with_qvalues( - filtered_pixels, qvalues, kernels - ) - # 4a. clustering - ######################################################################## - # Clustering has to be done using annotated DataFrame of filtered pixels - # why ? - because - clustering has to be done independently for every region! - ######################################################################## - filtered_pixels_annotated = cooler.annotate(filtered_pixels_qvals, clr.bins()[:]) - filtered_pixels_annotated = assign_regions(filtered_pixels_annotated, view_df) - # consider reseting index here - centroids = api.dotfinder.clustering_step( - filtered_pixels_annotated, - view_df["name"], - dots_clustering_radius, - verbose, - ) - - # 4b. 
filter by enrichment and qval - postprocessed_calls = api.dotfinder.thresholding_step(centroids) - - # Final-postprocessed result - if out_prefix is not None: - - postprocessed_fname = op.join( - op.dirname(out_prefix), op.basename(out_prefix) + ".postproc.bedpe" - ) - - postprocessed_calls.to_csv( - postprocessed_fname, sep="\t", header=True, index=False, compression=None + # output results in a file, when specified + if output: + dot_calls_df.to_csv(output, sep="\t", header=True, index=False, na_rep="nan") + # or print into stdout otherwise: + else: + print( + dot_calls_df.to_csv( + output, sep="\t", header=True, index=False, na_rep="nan" + ) ) diff --git a/cooltools/cli/saddle.py b/cooltools/cli/saddle.py index b593b891..c34fddd3 100644 --- a/cooltools/cli/saddle.py +++ b/cooltools/cli/saddle.py @@ -218,7 +218,7 @@ def saddle( track_columns = ["chrom", "start", "end", track_name] # specify dtype as a rudimentary form of validation: track_dtype = { - "chrom": np.str, + "chrom": np.str_, "start": np.int64, "end": np.int64, track_name: np.float64, diff --git a/cooltools/lib/_numutils.pyx b/cooltools/lib/_numutils.pyx index 9597d0f5..5554476e 100644 --- a/cooltools/lib/_numutils.pyx +++ b/cooltools/lib/_numutils.pyx @@ -124,6 +124,15 @@ def iterative_correction_symmetric( tolerance : float If less or equal to zero, will perform max_iter iterations. + Returns + ------- + _x : np.ndarray + Corrected matrix + totalBias : np.ndarray + Vector with corrections biases + report : (bool, int) + A tuple that reports convergence + status and used number of iterations """ cdef int N = len(x) @@ -168,7 +177,7 @@ def iterative_correction_symmetric( corr = totalBias[s0!=0].mean() #mean correction factor x = x * corr * corr #renormalizing everything totalBias /= corr - report = {'converged':converged, 'iternum':iternum} + report = (converged, iternum) return x, totalBias, report diff --git a/cooltools/lib/checks.py b/cooltools/lib/checks.py index 3f5f502f..f95f514c 100644 --- a/cooltools/lib/checks.py +++ b/cooltools/lib/checks.py @@ -39,9 +39,9 @@ def _is_expected( Check if a expected_df looks like an expected DataFrame, i.e.: - has neccessary columns - - there are no Nulls in regions1/2, diag - - every trans region1/2 has a single value - - every cis region1/2 has at least one value + - there are no Nulls in the regions1, regions2, diag columns + - every trans pair region1 region2 has a single value + - every cis pair region1, region2 has at least one value Parameters ---------- @@ -72,7 +72,7 @@ def _is_expected( # that's what we expect as column names: expected_columns = [col for col in expected_dtypes] - # separate "structural" columns: region1/2 and diag if "cis": + # separate "structural" columns: region1, region2, diag if "cis": grouping_columns = expected_columns[:-1] # add columns with values and their dtype (float64): @@ -124,8 +124,8 @@ def _is_expected( if expected_df.duplicated(subset=grouping_columns).any(): raise ValueError(f"Values in {grouping_columns} columns must be unique") - # make sure region1/2 groups have 1 value for trans contacts - # and more than 1 values for cis contacts + # for trans expected, ensure pairs region1, region2 have 1 value + # for cis expected, ensure pairs region1, region2 have >=1 value region1_col, region2_col = grouping_columns[:2] for (r1, r2), df in expected_df.groupby([region1_col, region2_col]): if contact_type == "trans": @@ -174,10 +174,12 @@ def _is_compatible_cis_expected( raise_errors=False, ): """ - Verify expected_df to make sure it is compatible - 
with its view (viewframe) and cooler, i.e.: - - regions1/2 are matching names from view - - number of diagonals per region1/2 matches cooler + Verify expected_df to make sure it is compatible. + + Expected tables can be verified against both + a view and a cooler. This includes ensuring: + - the entries in columns region1, region2 match names from view + - number of diagonals per pair of region1, region2 matches cooler Parameters ---------- @@ -267,8 +269,7 @@ def _is_compatible_trans_expected( """ Verify expected_df to make sure it is compatible with its view (viewframe) and cooler, i.e.: - - regions1/2 are matching names from view - - number of diagonals per region1/2 matches cooler + - entries in region1 and region2 match names from view Parameters ---------- diff --git a/cooltools/lib/numutils.py b/cooltools/lib/numutils.py index 7a4e7763..5d35c066 100644 --- a/cooltools/lib/numutils.py +++ b/cooltools/lib/numutils.py @@ -2,8 +2,7 @@ from scipy.linalg import toeplitz import scipy.sparse.linalg import scipy.interpolate -from scipy.ndimage.interpolation import zoom -import scipy.ndimage.filters +from scipy.ndimage import zoom, gaussian_filter1d import numpy as np import numba import cooler @@ -668,6 +667,15 @@ def iterative_correction_symmetric( tol : float If less or equal to zero, will perform max_iter iterations. + Returns + ------- + _x : np.ndarray + Corrected matrix + totalBias : np.ndarray + Vector with corrections biases + report : (bool, int) + A tuple that reports convergence + status and used number of iterations """ N = len(x) @@ -710,7 +718,7 @@ def iterative_correction_symmetric( corr = totalBias[~mask].mean() # mean correction factor _x = _x * corr * corr # renormalizing everything totalBias /= corr - report = {"converged": converged, "iternum": iternum} + report = (converged, iternum) return _x, totalBias, report @@ -729,6 +737,18 @@ def iterative_correction_asymmetric(x, max_iter=1000, tol=1e-5, verbose=False): The number of diagonals to ignore during iterative correction. tol : float If less or equal to zero, will perform max_iter iterations. 
+ + Returns + ------- + _x : np.ndarray + Corrected matrix + totalBias : np.ndarray + Vector with corrections biases for columns + totalBias2 : np.ndarray + Vector with corrections biases for rows + report : (bool, int) + A tuple that reports convergence + status and used number of iterations """ N2, N = x.shape _x = x.copy() @@ -777,7 +797,7 @@ def iterative_correction_asymmetric(x, max_iter=1000, tol=1e-5, verbose=False): _x = _x * corr * corr2 # renormalizing everything totalBias /= corr totalBias2 /= corr2 - report = {"converged": converged, "iternum": iternum} + report = (converged, iternum) return _x, totalBias, totalBias2, report @@ -1380,7 +1400,7 @@ def _expand(ar, counts=None): def robust_gauss_filter( - ar, sigma=2, functon=scipy.ndimage.filters.gaussian_filter1d, kwargs=None + ar, sigma=2, functon=gaussian_filter1d, kwargs=None ): """ Implements an edge-handling mode for gaussian filter that basically ignores diff --git a/cooltools/sandbox/expected_smoothing.py b/cooltools/sandbox/expected_smoothing.py index 7eaae476..630675cc 100644 --- a/cooltools/sandbox/expected_smoothing.py +++ b/cooltools/sandbox/expected_smoothing.py @@ -1,4 +1,4 @@ -import collections +from collections.abc import Iterable import numpy as np import numba @@ -276,7 +276,7 @@ def agg_smooth_cvd( groupby = [] elif isinstance(groupby, str): groupby = [groupby] - elif isinstance(groupby, collections.Iterable): + elif isinstance(groupby, Iterable): groupby = list(groupby) else: raise ValueError("groupby must be a string, a list of strings, or None") diff --git a/tests/test_call-dots.py b/tests/test_call-dots.py index d3b83f41..e1dec94f 100644 --- a/tests/test_call-dots.py +++ b/tests/test_call-dots.py @@ -2,78 +2,57 @@ from click.testing import CliRunner from cooltools.cli import cli +import cooler +import numpy as np +from cooltools import api +from cooltools.lib.io import read_viewframe_from_file, read_expected_from_file -def test_call_dots_cli(request, tmpdir): - in_cool = op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool") - in_exp = op.join(request.fspath.dirname, "data/CN.mm9.toy_expected.chromnamed.tsv") - out_dots = op.join(tmpdir, "test.dots") - - runner = CliRunner() - result = runner.invoke( - cli, - [ - "dots", - "-p", - 1, - "--kernel-width", - 2, - "--kernel-peak", - 1, - "--tile-size", - 60_000_000, - "--max-loci-separation", - 100_000_000, - "--out-prefix", - out_dots, - in_cool, - in_exp, - ], - ) - # This command should fail because viewframe interpreted from cooler does not correspond to toy_expected: - assert result.exit_code == 1 - - -def test_call_dots_view_cli(request, tmpdir): +# test user-facing API for calling dots +def test_dots(request): # Note that call-dots requires ucsc named expected and view in_cool = op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool") in_exp = op.join(request.fspath.dirname, "data/CN.mm9.toy_expected.tsv") in_regions = op.join(request.fspath.dirname, "data/CN.mm9.toy_regions.bed") - out_dots = op.join(tmpdir, "test.dots") - runner = CliRunner() - cmd = [ - "dots", - "--view", - in_regions, - "-p", - 1, - "--kernel-width", - 2, - "--kernel-peak", - 1, - "--tile-size", - 60_000_000, - "--max-loci-separation", - 100_000_000, - "--out-prefix", - out_dots, - in_cool, + # read data for the test: + clr = cooler.Cooler(in_cool) + view_df = read_viewframe_from_file(in_regions, clr, check_sorting=True) + expected_df = read_expected_from_file( in_exp, - ] - result = runner.invoke(cli, cmd) - assert result.exit_code == 0 - # make sure output is generated: - 
assert op.isfile(f"{out_dots}.enriched.tsv") - assert op.isfile(f"{out_dots}.postproc.bedpe") + expected_value_cols=["balanced.avg"], + verify_view=view_df, + verify_cooler=clr, + ) + # generate dot-calls + dot_calls_df = api.dotfinder.dots( + clr, + expected_df, + view_df=view_df, + kernels={ + "d": np.array([[1, 0, 1], [0, 0, 0], [1, 0, 1]]), + "v": np.array([[0, 1, 0], [0, 0, 0], [0, 1, 0]]), + "h": np.array([[0, 0, 0], [1, 0, 1], [0, 0, 0]]), + }, + max_loci_separation=100_000_000, + max_nans_tolerated=1, + n_lambda_bins=50, + lambda_bin_fdr=0.1, + clustering_radius=False, + cluster_filtering=None, + tile_size=50_000_000, + nproc=1, + ) -# TODO: Remove this test once "regions" are deprecated altogether: -def test_call_dots_regions_deprecated_cli(request, tmpdir): - # Note that call-dots requires ucsc named expected and view + # no comparison with reference results yet + # just checking if it runs without errors + assert not dot_calls_df.empty + + +def test_call_dots_cli(request, tmpdir): in_cool = op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool") - in_exp = op.join(request.fspath.dirname, "data/CN.mm9.toy_expected.tsv") - in_regions = op.join(request.fspath.dirname, "data/CN.mm9.toy_regions.bed") + in_exp = op.join(request.fspath.dirname, "data/CN.mm9.toy_expected.chromnamed.tsv") out_dots = op.join(tmpdir, "test.dots") runner = CliRunner() @@ -81,25 +60,48 @@ def test_call_dots_regions_deprecated_cli(request, tmpdir): cli, [ "dots", - "--regions", - in_regions, "-p", 1, - "--kernel-width", - 2, - "--kernel-peak", - 1, "--tile-size", 60_000_000, "--max-loci-separation", 100_000_000, - "--out-prefix", + "--output", out_dots, in_cool, in_exp, ], ) - assert result.exit_code == 0 - # make sure output is generated: - assert op.isfile(f"{out_dots}.enriched.tsv") - assert op.isfile(f"{out_dots}.postproc.bedpe") + # This command should fail because viewframe interpreted from cooler does not correspond to toy_expected: + assert result.exit_code == 1 + + +# comment this test for now, until we swap out input data and/or allow for custom kernels + +# def test_call_dots_view_cli(request, tmpdir): +# # Note that call-dots requires ucsc named expected and view +# in_cool = op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool") +# in_exp = op.join(request.fspath.dirname, "data/CN.mm9.toy_expected.tsv") +# in_regions = op.join(request.fspath.dirname, "data/CN.mm9.toy_regions.bed") +# out_dots = op.join(tmpdir, "test.dots") + +# runner = CliRunner() +# cmd = [ +# "dots", +# "--view", +# in_regions, +# "-p", +# 1, +# "--tile-size", +# 60_000_000, +# "--max-loci-separation", +# 100_000_000, +# "--output", +# out_dots, +# in_cool, +# in_exp, +# ] +# result = runner.invoke(cli, cmd) +# assert result.exit_code == 0 +# # make sure output is generated: +# assert op.isfile(out_dots) diff --git a/tests/test_dotfinder_chunking.py b/tests/test_dotfinder_chunking.py index d5ca1511..554dfc4c 100644 --- a/tests/test_dotfinder_chunking.py +++ b/tests/test_dotfinder_chunking.py @@ -1,13 +1,15 @@ -# create a test for the chunking versions of 'get_adjusted_expected_tile_some_nans': +# create a test for square_tiling and get_adjusted_expected_tile_some_nans import numpy as np import pandas as pd import os.path as op -from cooltools.api import dotfinder from cooltools.lib.numutils import LazyToeplitz - +from cooltools.api.dotfinder import ( + tile_square_matrix, + get_adjusted_expected_tile_some_nans, +) # adjust the path for data: testdir = op.realpath(op.dirname(__file__)) @@ -16,6 +18,8 @@ mock_input = 
op.join(testdir, "data", "dotfinder_mock_inputs.npz") mock_result = op.join(testdir, "data", "dotfinder_mock_res.csv.gz") +# load mock results: +mock_res = pd.read_csv(mock_result).rename(columns={"row": "bin1_id", "col": "bin2_id"}) # load bunch of array from a numpy npz container: arrays_loaded = np.load(mock_input) @@ -29,11 +33,9 @@ # 1D expected extracted for tiling-tests: mock_exp = LazyToeplitz(mock_E_ice[0, :]) -# we need w-edge for tiling procedures: -w = 3 -# p = 1 -# kernel type: 'donut' -# # just a simple donut kernel for testing: +# we need kernel_half_width-edge for tiling procedures: +kernel_half_width = 3 +# just a simple donut kernel for testing: kernel = np.array( [ [1, 1, 1, 0, 1, 1, 1], @@ -45,20 +47,70 @@ [1, 1, 1, 0, 1, 1, 1], ] ) -# bin size: -# mock data were extracted -# from 20kb matrix: +# mock bin size 20kb b = 20000 # 2MB aroud diagonal to look at: band = 2e6 -# 1MB aroud diagonal to look at: -band_1 = 1e6 # start, stop for tiling procedures: start, stop = 0, len(mock_M_raw) -# load mock results: -mock_res = pd.read_csv(mock_result).rename(columns={"row": "bin1_id", "col": "bin2_id"}) + +################################################################# +# test 'get_adjusted_expected_tile_some_nans' without chunking : +################################################################# +def test_adjusted_expected_tile_some_nans(): + print("Running tile some nans la_exp test") + # first, generate that locally-adjusted expected: + res = get_adjusted_expected_tile_some_nans( + origin_ij=(0, 0), + observed=mock_M_raw, # should be RAW ... + expected=mock_E_ice, + bal_weights=mock_v_ice, + kernels={"donut": kernel, "footprint": np.ones_like(kernel)}, + ) + # mock results are supposedly for + # the contacts closer than the band + # nucleotides to each other... + # we want to retire that functional + # from the 'get_adjusted_expected_tile_some_nans' + # so we should complement that selection + # here: + nnans = 1 # no nans tolerated + band_idx = int(band / b) + is_inside_band = res["bin1_id"] > (res["bin2_id"] - band_idx) + does_comply_nans = res["la_exp." + "footprint" + ".nnans"] < nnans + # so, selecting inside band and nNaNs compliant results: + res = res[is_inside_band & does_comply_nans].reset_index(drop=True) + # + # ACTUAL TESTS: + # integer part of DataFrame must equals exactly: + assert res[["bin1_id", "bin2_id"]].equals(mock_res[["bin1_id", "bin2_id"]]) + # compare floating point part separately: + assert np.allclose( + res["la_exp." + "donut" + ".value"], mock_res["la_expected"], equal_nan=True + ) + # + # try recovering more NaNs ... + # (allowing 1 here per pixel's footprint)... + res = get_adjusted_expected_tile_some_nans( + origin_ij=(0, 0), + observed=mock_M_raw, # should be RAW... + expected=mock_E_ice, + bal_weights=mock_v_ice, + kernels={"donut": kernel, "footprint": np.ones_like(kernel)}, + ) + # post-factum filtering resides + # outside of get_la_exp now: + nnans = 2 # just 1 nan tolerated + band_idx = int(band / b) + is_inside_band = res["bin1_id"] > (res["bin2_id"] - band_idx) + does_comply_nans = res["la_exp." 
+ "footprint" + ".nnans"] < nnans + # so, selecting inside band and comply nNaNs results only: + res = res[is_inside_band & does_comply_nans].reset_index(drop=True) + + # now we can only guess the size: + assert len(res) > len(mock_res) def test_adjusted_expected_tile_some_nans_and_square_tiling(): @@ -66,9 +118,9 @@ def test_adjusted_expected_tile_some_nans_and_square_tiling(): # first, generate that locally-adjusted expected: nnans = 1 band_idx = int(band / b) - res_df = pd.DataFrame([]) - for tilei, tilej in dotfinder.square_matrix_tiling( - start, stop, step=40, edge=w, square=False + res_ij = [] + for tilei, tilej in tile_square_matrix( + stop - start, start, tile_size=40, pad=kernel_half_width ): # define origin: origin = (tilei[0], tilej[0]) @@ -80,26 +132,23 @@ def test_adjusted_expected_tile_some_nans_and_square_tiling(): ice_weight_i = mock_v_ice[slice(*tilei)] ice_weight_j = mock_v_ice[slice(*tilej)] # that's the main working function from dotfinder: - res = dotfinder.get_adjusted_expected_tile_some_nans( - origin=origin, + res = get_adjusted_expected_tile_some_nans( + origin_ij=origin, observed=observed, expected=expected, bal_weights=(ice_weight_i, ice_weight_j), kernels={"donut": kernel, "footprint": np.ones_like(kernel)}, - # nan_threshold=1, - verbose=False, ) is_inside_band = res["bin1_id"] > (res["bin2_id"] - band_idx) # new style, selecting good guys: does_comply_nans = res["la_exp." + "footprint" + ".nnans"] < nnans # so, select inside band and nNaNs compliant results and append: - res_df = res_df.append( - res[is_inside_band & does_comply_nans], ignore_index=True - ) + res_ij.append(res[is_inside_band & does_comply_nans]) # drop dups (from overlaping tiles), sort and reset index: res_df = ( - res_df.drop_duplicates() + pd.concat(res_ij, ignore_index=True) + .drop_duplicates() .sort_values(by=["bin1_id", "bin2_id"]) .reset_index(drop=True) ) @@ -118,20 +167,20 @@ def test_adjusted_expected_tile_some_nans_and_square_tiling(): mock_res_sorted[["bin1_id", "bin2_id"]] ) # compare floating point part separately: - assert np.isclose( + assert np.allclose( res_df["la_exp." + "donut" + ".value"], mock_res_sorted["la_expected"], equal_nan=True, - ).all() + ) def test_adjusted_expected_tile_some_nans_and_square_tiling_diag_band(): print("Running tile some nans la_exp test + square tiling + diag band") # # Essentially, testing this function: - # def chrom_chunk_generator_s(chroms, w, band): + # def chrom_chunk_generator_s(chroms, kernel_half_width, band): # for chrom in chroms: # chr_start, chr_stop = the_c.extent(chrom) - # for tilei, tilej in square_matrix_tiling(chr_start, chr_stop, step, w): + # for tilei, tilej in square_matrix_tiling(chr_start, chr_stop, step, kernel_half_width): # # check if a given tile intersects with # # with the diagonal band of interest ... # diag_from = tilej[0] - tilei[1] @@ -141,14 +190,14 @@ def test_adjusted_expected_tile_some_nans_and_square_tiling_diag_band(): # band_to = band # # we are using this >2w trick to exclude # # tiles from the lower triangle from calculations ... 
- # if (min(band_to,diag_to) - max(band_from,diag_from)) > 2*w: + # if (min(band_to,diag_to) - max(band_from,diag_from)) > 2*kernel_half_width: # yield chrom, tilei, tilej # first, generate that locally-adjusted expected: nnans = 1 band_idx = int(band / b) - res_df = pd.DataFrame([]) - for tilei, tilej in dotfinder.square_matrix_tiling( - start, stop, step=40, edge=w, square=False + res_ij = [] + for tilei, tilej in tile_square_matrix( + stop - start, start, tile_size=40, pad=kernel_half_width ): # check if a given tile intersects with # with the diagonal band of interest ... @@ -159,7 +208,7 @@ def test_adjusted_expected_tile_some_nans_and_square_tiling_diag_band(): band_to = band_idx # we are using this >2w trick to exclude # tiles from the lower triangle from calculations ... - if (min(band_to, diag_to) - max(band_from, diag_from)) > 2 * w: + if (min(band_to, diag_to) - max(band_from, diag_from)) > 2 * kernel_half_width: # define origin: origin = (tilei[0], tilej[0]) # RAW observed matrix slice: @@ -170,26 +219,23 @@ def test_adjusted_expected_tile_some_nans_and_square_tiling_diag_band(): ice_weight_i = mock_v_ice[slice(*tilei)] ice_weight_j = mock_v_ice[slice(*tilej)] # that's the main working function from dotfinder: - res = dotfinder.get_adjusted_expected_tile_some_nans( - origin=origin, + res = get_adjusted_expected_tile_some_nans( + origin_ij=origin, observed=observed, expected=expected, bal_weights=(ice_weight_i, ice_weight_j), kernels={"donut": kernel, "footprint": np.ones_like(kernel)}, - # nan_threshold=1, - verbose=False, ) is_inside_band = res["bin1_id"] > (res["bin2_id"] - band_idx) # new style, selecting good guys: does_comply_nans = res["la_exp." + "footprint" + ".nnans"] < nnans # so, select inside band and nNaNs compliant results and append: - res_df = res_df.append( - res[is_inside_band & does_comply_nans], ignore_index=True - ) + res_ij.append(res[is_inside_band & does_comply_nans]) # sort and reset index, there shouldn't be any duplicates now: res_df = ( - res_df.drop_duplicates() + pd.concat(res_ij, ignore_index=True) + .drop_duplicates() .sort_values(by=["bin1_id", "bin2_id"]) .reset_index(drop=True) ) @@ -208,8 +254,8 @@ def test_adjusted_expected_tile_some_nans_and_square_tiling_diag_band(): mock_res_sorted[["bin1_id", "bin2_id"]] ) # compare floating point part separately: - assert np.isclose( + assert np.allclose( res_df["la_exp." + "donut" + ".value"], mock_res_sorted["la_expected"], equal_nan=True, - ).all() + ) diff --git a/tests/test_dotfinder_la_exp.py b/tests/test_dotfinder_la_exp.py deleted file mode 100644 index 5b7cf553..00000000 --- a/tests/test_dotfinder_la_exp.py +++ /dev/null @@ -1,113 +0,0 @@ -# create a test for get_locally_adjusted_expected using mock dense matrices ... -# testing la_exp functions on a single piece of matrix -# no chunking applied here ... 
- -import numpy as np -import pandas as pd - -import os.path as op - -# try importing stuff from dotfinder: -from cooltools.api.dotfinder import get_adjusted_expected_tile_some_nans - -# adjust the path for data: -testdir = op.realpath(op.dirname(__file__)) - -# mock input data location: -mock_input = op.join(testdir, "data", "dotfinder_mock_inputs.npz") -mock_result = op.join(testdir, "data", "dotfinder_mock_res.csv.gz") - -# load mock results: -mock_res = pd.read_csv(mock_result) -mock_res = mock_res.rename(columns={"row": "bin1_id", "col": "bin2_id"}) - -# load bunch of array from a numpy npz container: -arrays_loaded = np.load(mock_input) -# snippets of M_raw, M_ice, E_ice and v_ice are supposed -# to be there ... -mock_M_raw = arrays_loaded["mock_M_raw"] -mock_M_ice = arrays_loaded["mock_M_ice"] -mock_E_ice = arrays_loaded["mock_E_ice"] -mock_v_ice = arrays_loaded["mock_v_ice"] - - -# we need w-edge for tiling procedures: -# w = 3 -# p = 1 -# kernel type: 'donut' -# # just a simple donut kernel for testing: -kernel = np.array( - [ - [1, 1, 1, 0, 1, 1, 1], - [1, 1, 1, 0, 1, 1, 1], - [1, 1, 0, 0, 0, 1, 1], - [0, 0, 0, 0, 0, 0, 0], - [1, 1, 0, 0, 0, 1, 1], - [1, 1, 1, 0, 1, 1, 1], - [1, 1, 1, 0, 1, 1, 1], - ] -) -# bin size: -# mock data were extracted -# from 20kb matrix: -b = 20000 -# 2MB aroud diagonal to look at: -band = 2e6 - - -################################################################# -# main test function for new 'get_adjusted_expected_tile_some_nans': -################################################################# -def test_adjusted_expected_tile_some_nans(): - print("Running tile some nans la_exp test") - # first, generate that locally-adjusted expected: - # Ed_raw, mask_ndx, Cobs, Cexp, NN = - res = get_adjusted_expected_tile_some_nans( - origin=(0, 0), - observed=mock_M_raw, # should be RAW ... - expected=mock_E_ice, - bal_weights=mock_v_ice, - kernels={"donut": kernel, "footprint": np.ones_like(kernel)}, - ) - # mock results are supposedly for - # the contacts closer than the band - # nucleotides to each other... - # we want to retire that functional - # from the 'get_adjusted_expected_tile_some_nans' - # so we should complement that selection - # here: - nnans = 1 # no nans tolerated - band_idx = int(band / b) - is_inside_band = res["bin1_id"] > (res["bin2_id"] - band_idx) - does_comply_nans = res["la_exp." + "footprint" + ".nnans"] < nnans - # so, selecting inside band and nNaNs compliant results: - res = res[is_inside_band & does_comply_nans].reset_index(drop=True) - # - # ACTUAL TESTS: - # integer part of DataFrame must equals exactly: - assert res[["bin1_id", "bin2_id"]].equals(mock_res[["bin1_id", "bin2_id"]]) - # compare floating point part separately: - assert np.isclose( - res["la_exp." + "donut" + ".value"], mock_res["la_expected"], equal_nan=True - ).all() - # - # try recovering more NaNs ... - # (allowing 1 here per pixel's footprint)... - res = get_adjusted_expected_tile_some_nans( - origin=(0, 0), - observed=mock_M_raw, # should be RAW... - expected=mock_E_ice, - bal_weights=mock_v_ice, - kernels={"donut": kernel, "footprint": np.ones_like(kernel)}, - ) - # post-factum filtering resides - # outside of get_la_exp now: - nnans = 2 # just 1 nan tolerated - band_idx = int(band / b) - is_inside_band = res["bin1_id"] > (res["bin2_id"] - band_idx) - does_comply_nans = res["la_exp." 
+ "footprint" + ".nnans"] < nnans - # so, selecting inside band and comply nNaNs results only: - res = res[is_inside_band & does_comply_nans].reset_index(drop=True) - - # now we can only guess the size: - assert len(res) > len(mock_res) diff --git a/tests/test_dotfinder_stats.py b/tests/test_dotfinder_stats.py new file mode 100644 index 00000000..63cbb1a2 --- /dev/null +++ b/tests/test_dotfinder_stats.py @@ -0,0 +1,276 @@ +import numpy as np +import pandas as pd +from scipy.stats import poisson + +from cooltools.api.dotfinder import ( + histogram_scored_pixels, + determine_thresholds, + annotate_pixels_with_qvalues, + extract_scored_pixels, +) + + +# helper functions for BH-FDR copied from www.statsmodels.org +def _fdrcorrection(pvals, alpha=0.05): + """ + pvalue correction for false discovery rate. + + This covers Benjamini/Hochberg for independent or positively correlated tests. + + Parameters + ---------- + pvals : np.ndarray + Sorted set of p-values of the individual tests. + alpha : float, optional + Family-wise error rate. Defaults to ``0.05``. + + Returns + ------- + rejected : ndarray, bool + True if a hypothesis is rejected, False if not + pvalue-corrected : ndarray + pvalues adjusted for multiple hypothesis testing to limit FDR + + """ + + ntests = len(pvals) + # empirical Cumulative Distribution Function for pvals: + ecdffactor = np.arange(1, ntests + 1) / float(ntests) + + reject = pvals <= ecdffactor * alpha + if reject.any(): + rejectmax = max(np.nonzero(reject)[0]) + reject[:rejectmax] = True + + pvals_corrected_raw = pvals / ecdffactor + pvals_corrected = np.minimum.accumulate(pvals_corrected_raw[::-1])[::-1] + del pvals_corrected_raw + pvals_corrected[pvals_corrected > 1] = 1 + return reject, pvals_corrected + + +def multipletests(pvals, alpha=0.1, is_sorted=False): + """ + Test results and p-value correction for multiple tests + Using FDR Benjamini-Hochberg method (non-negative) + + Parameters + ---------- + pvals : array_like, 1-d + uncorrected p-values. Must be 1-dimensional. + alpha : float + FWER, family-wise error rate, e.g. 0.1 + is_sorted : bool + If False (default), the p_values will be sorted, but the corrected + pvalues are in the original order. If True, then it assumed that the + pvalues are already sorted in ascending order. + + Returns + ------- + reject : ndarray, boolean + true for hypothesis that can be rejected for given alpha + pvals_corrected : ndarray + p-values corrected for multiple tests + + Notes + ----- + the p-value correction is independent of the + alpha specified as argument. In these cases the corrected p-values + can also be compared with a different alpha + + All procedures that are included, control FWER or FDR in the independent + case, and most are robust in the positively correlated case. 
+ """ + pvals = np.asarray(pvals) + + if not is_sorted: + sortind = np.argsort(pvals) + pvals = np.take(pvals, sortind) + + reject, pvals_corrected = _fdrcorrection(pvals, alpha=alpha) + + if is_sorted: + return reject, pvals_corrected + else: + pvals_corrected_ = np.empty_like(pvals_corrected) + pvals_corrected_[sortind] = pvals_corrected + del pvals_corrected + reject_ = np.empty_like(reject) + reject_[sortind] = reject + return reject_, pvals_corrected_ + + +# mock input data to perform some p-value calculation and correction on +num_pixels = 2500 +max_value = 99 +# fake kernels just for the sake of their names 'd' and 'v': +fake_kernels = { + "d": np.random.randint(2, size=9).reshape(3, 3), + "v": np.random.randint(2, size=9).reshape(3, 3), +} +# table with the "scored" pixel (as if they are returned by dotfinder-scoring function) +pixel_dict = {} +# enrich fake counts to all for more significant calls +pixel_dict["count"] = np.random.randint(max_value, size=num_pixels) + 9 +for k in fake_kernels: + pixel_dict[f"la_exp.{k}.value"] = max_value * np.random.random(num_pixels) +scored_df = pd.DataFrame(pixel_dict) + +# design lambda-bins as in dot-calling: +num_lchunks = 6 +ledges = np.r_[[-np.inf], np.linspace(0, max_value, num_lchunks), [np.inf]] + +# set FDR parameter +FDR = 0.1 + + +# helper functions working on a chunk of counts or pvals +# associated with a given lambda-bin: +def get_pvals_chunk(counts_series_lchunk): + """ + Parameters: + ----------- + counts_series_lchunk : pd.Series(int) + Series of raw pixel counts where the name of the Series + is pd.Interval of the lambda-bin where the pixel belong. + I.e. counts_series_lchunk.name.right - is the upper limit of the chunk + and is used as "expected" in Poisson distribution to estimate p-value. + + Returns: + -------- + pvals: ndarray[float] + array of p-values for each pixel + + Notes: + ------ + poisson.sf = 1.0 - poisson.cdf + """ + return poisson.sf(counts_series_lchunk.values, counts_series_lchunk.name.right) + + +def get_qvals_chunk(pvals_series_lchunk): + """ + Parameters: + ----------- + pvals_series_lchunk : pd.Series(float) + Series of p-values calculated for each pixel, where the name + of the Series is pd.Interval of the lambda-bin where the pixel belong. + + Returns: + -------- + qvals: ndarray[float] + array of q-values, i.e. p-values corrected with the multiple hypothesis + testing procedure BH-FDR, for each pixel + + Notes: + ------ + level of False Discore Rate (FDR) is fixed for testing + """ + _, qvals = multipletests(pvals_series_lchunk.values, alpha=FDR, is_sorted=False) + return qvals + + +def get_reject_chunk(pvals_series_lchunk): + """ + Parameters: + ----------- + pvals_series_lchunk : pd.Series(float) + Series of p-values calculated for each pixel, where the name + of the Series is pd.Interval of the lambda-bin where the pixel belong. + + Returns: + -------- + rej: ndarray[bool] + array of rejection statuses, i.e. for every p-values return if corresponding + null hypothesis can be rejected or not, using multiple hypothesis testing + procedure BH-FDR. 
+def get_reject_chunk(pvals_series_lchunk):
+    """
+    Parameters:
+    -----------
+    pvals_series_lchunk : pd.Series(float)
+        Series of p-values calculated for each pixel, where the name
+        of the Series is the pd.Interval of the lambda-bin that the pixels belong to.
+
+    Returns:
+    --------
+    rej: ndarray[bool]
+        array of rejection statuses, i.e. for every p-value, whether the corresponding
+        null hypothesis can be rejected or not, using the multiple hypothesis testing
+        procedure BH-FDR.
+
+    Notes:
+    ------
+    - pixels with rejected status (non-null) are considered significantly enriched
+    - the False Discovery Rate (FDR) level is fixed for testing
+    """
+    rej, _ = multipletests(pvals_series_lchunk.values, alpha=FDR, is_sorted=False)
+    return rej
+
+
+# for the fake scored-pixel table, calculate p-vals, q-vals, the l-chunk each pixel
+# belongs to, and rejection status, using the statsmodels-based helpers introduced above:
+for k in fake_kernels:
+    lbin = pd.cut(scored_df[f"la_exp.{k}.value"], ledges)
+    scored_df[f"{k}.pval"] = scored_df.groupby(lbin)["count"].transform(get_pvals_chunk)
+    scored_df[f"{k}.qval"] = scored_df.groupby(lbin)[f"{k}.pval"].transform(
+        get_qvals_chunk
+    )
+    scored_df[f"{k}.rej"] = scored_df.groupby(lbin)[f"{k}.pval"].transform(
+        get_reject_chunk
+    )
+
+
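
The groupby/transform pattern above evaluates each helper once per lambda-bin and broadcasts the per-bin result back onto the pixels of that bin. A minimal standalone sketch of the same pattern with made-up values and a single fake kernel column:

    import numpy as np
    import pandas as pd
    from scipy.stats import poisson

    toy_df = pd.DataFrame({"count": [12, 3, 40], "la_exp.d.value": [5.0, 30.0, 70.0]})
    # lambda-bin edges built the same way as `ledges` above:
    toy_ledges = np.r_[[-np.inf], np.linspace(0, 99, 6), [np.inf]]
    # assign each pixel to a lambda-bin according to its locally adjusted expected:
    toy_lbin = pd.cut(toy_df["la_exp.d.value"], toy_ledges)
    # each pixel is then tested against the right edge of its own lambda-bin:
    toy_df["d.pval"] = toy_df.groupby(toy_lbin)["count"].transform(
        lambda chunk: poisson.sf(chunk.values, chunk.name.right)
    )
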
scored_df_qvals["la_exp.v.qval"]) + assert np.allclose(scored_df_qvals["d.qval"], scored_df_qvals["la_exp.d.qval"])