diff --git a/.github/workflows/build-conan-deps.yml b/.github/workflows/build-conan-deps.yml index 1de9e2d..a231126 100644 --- a/.github/workflows/build-conan-deps.yml +++ b/.github/workflows/build-conan-deps.yml @@ -45,7 +45,7 @@ jobs: env: CONAN_HOME: "${{ github.workspace }}/.conan2" - IMAGE: quay.io/pypa/manylinux_2_28_${{ inputs.arch }} + IMAGE: quay.io/pypa/manylinux_2_28_${{ inputs.arch }}:2024.11.09-1 outputs: conan-key: ${{ steps.generate-cache-key.outputs.conan-key }} @@ -107,6 +107,30 @@ jobs: path: ${{ env.CONAN_HOME }}/p lookup-only: true + - name: Configure Conan + if: steps.cache-conan.outputs.cache-hit != 'true' + run: | + cat << 'EOF' | tee script.sh > /dev/null + #!usr/bin/env bash + set -u + set -e + + conan_version="$1" + + PATH="/opt/python/cp312-cp312/bin:$PATH" + pip install "conan==$conan_version" + + conan remote update conancenter --url https://center2.conan.io + EOF + + chmod 755 script.sh + + docker run \ + -e "CONAN_HOME=$CONAN_HOME" \ + -v "$PWD/script.sh:/tmp/script.sh:ro" \ + -v "$CONAN_HOME:$CONAN_HOME" \ + "$IMAGE" /tmp/script.sh '${{ inputs.conan-version }}' + - name: Clean Conan cache (pre-build) if: steps.cache-conan.outputs.cache-hit != 'true' run: | @@ -261,7 +285,7 @@ jobs: - uses: actions/setup-python@v5 if: steps.cache-conan.outputs.cache-hit != 'true' with: - python-version: "3.12" + python-version: "3.13" - name: Update build deps if: steps.cache-conan.outputs.cache-hit != 'true' @@ -269,7 +293,9 @@ jobs: - name: Configure Conan if: steps.cache-conan.outputs.cache-hit != 'true' - run: conan profile detect --force + run: | + conan profile detect --force + conan remote update conancenter --url https://center2.conan.io - name: Clean Conan cache (pre-build) if: steps.cache-conan.outputs.cache-hit != 'true' @@ -366,7 +392,7 @@ jobs: - uses: actions/setup-python@v5 if: steps.cache-conan.outputs.cache-hit != 'true' with: - python-version: "3.12" + python-version: "3.13" - name: Update build deps if: steps.cache-conan.outputs.cache-hit != 'true' @@ -380,6 +406,8 @@ jobs: sed -i 's/compiler\.cppstd=.*/compiler.cppstd=${{ inputs.cppstd }}/' "$conan_profile" + conan remote update conancenter --url https://center2.conan.io + - name: Clean Conan cache (pre-build) if: steps.cache-conan.outputs.cache-hit != 'true' run: | diff --git a/.github/workflows/fuzzy-testing.yml b/.github/workflows/fuzzy-testing.yml index 78a341e..5f12118 100644 --- a/.github/workflows/fuzzy-testing.yml +++ b/.github/workflows/fuzzy-testing.yml @@ -11,8 +11,6 @@ on: - "cmake/**" - "src/**" - "test/scripts/fuzzer.py" - - "utils/devel/stubgen.py" - - "utils/devel/symlink_pyarrow_libs.py" - "CMakeLists.txt" - "conanfile.py" - "pyproject.toml" @@ -22,8 +20,6 @@ on: - "cmake/**" - "src/**" - "test/scripts/fuzzer.py" - - "utils/devel/stubgen.py" - - "utils/devel/symlink_pyarrow_libs.py" - "CMakeLists.txt" - "conanfile.py" - "pyproject.toml" @@ -147,6 +143,10 @@ jobs: key: conan-${{ steps.cache-key.outputs.key }} path: ${{ env.CONAN_HOME }}/p + - name: Configure Conan + if: steps.cache-conan.outputs.cache-hit != 'true' + run: conan remote update conancenter --url https://center2.conan.io + - name: Clean Conan cache (pre-build) if: steps.cache-conan.outputs.cache-hit != 'true' run: | diff --git a/.github/workflows/lint-cff.yml b/.github/workflows/lint-cff.yml new file mode 100644 index 0000000..2aba1a9 --- /dev/null +++ b/.github/workflows/lint-cff.yml @@ -0,0 +1,62 @@ +# Copyright (C) 2024 Roberto Rossini +# SPDX-License-Identifier: MIT + +name: Lint CITATION.cff + +on: + push: + branches: [main] + paths: + - ".github/workflows/lint-cff.yml" + - "CITATION.cff" + + pull_request: + paths: + - ".github/workflows/lint-cff.yml" + - "CITATION.cff" + +# https://stackoverflow.com/a/72408109 +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +defaults: + run: + shell: bash + +jobs: + lint-cff: + runs-on: ubuntu-latest + name: Lint CITATION.cff + + steps: + - uses: actions/checkout@v4 + with: + sparse-checkout: CITATION.cff + sparse-checkout-cone-mode: false + + - name: Generate DESCRIPTION file + run: | + cat << EOF > DESCRIPTION + Package: hictkpy + Title: What the Package Does (One Line, Title Case) + Version: 0.0.0.9000 + Authors@R: + person("First", "Last", , "first.last@example.com", role = c("aut", "cre")) + Description: What the package does (one paragraph). + License: MIT + Encoding: UTF-8 + Roxygen: list(markdown = TRUE) + RoxygenNote: 7.3.2 + Imports: + cffr + EOF + + - name: Setup R + uses: r-lib/actions/setup-r@v2 + + - name: Add requirements + uses: r-lib/actions/setup-r-dependencies@v2 + + - name: Lint CITATION.cff + run: Rscript -e 'cffr::cff_validate("CITATION.cff")' diff --git a/.github/workflows/pip.yml b/.github/workflows/pip.yml index c0817f9..045e44b 100644 --- a/.github/workflows/pip.yml +++ b/.github/workflows/pip.yml @@ -13,7 +13,6 @@ on: - "src/**" - "test/**" - "utils/devel/stubgen.py" - - "utils/devel/symlink_pyarrow_libs.py" - "CMakeLists.txt" - "conanfile.py" - "pyproject.toml" @@ -25,7 +24,6 @@ on: - "src/**" - "test/**" - "utils/devel/stubgen.py" - - "utils/devel/symlink_pyarrow_libs.py" - "CMakeLists.txt" - "conanfile.py" - "pyproject.toml" @@ -49,7 +47,7 @@ jobs: fail-fast: false matrix: os: [windows-2022, macos-latest, ubuntu-latest] - python-version: ["3.9", "3.12"] + python-version: ["3.9", "3.13"] runs-on: ${{ matrix.os }} @@ -62,7 +60,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Add requirements - run: python -m pip install --upgrade wheel setuptools + run: python -m pip install --upgrade conan wheel setuptools - name: Generate cache key id: cache-key @@ -71,6 +69,9 @@ jobs: echo "conan-key=pip-${{ matrix.os }}-$hash" >> $GITHUB_OUTPUT + - name: Configure Conan + run: conan remote update conancenter --url https://center2.conan.io + - name: Restore Conan cache id: cache-conan uses: actions/cache/restore@v4 diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 9735ed5..8625761 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -14,12 +14,9 @@ on: - "src/**" - "test/**" - "utils/devel/stubgen.py" - - "utils/devel/symlink_pyarrow_libs.py" - "CMakeLists.txt" - "conanfile.py" - "pyproject.toml" - tags: - - "v*.*.*" pull_request: paths: @@ -29,7 +26,6 @@ on: - "src/**" - "test/**" - "utils/devel/stubgen.py" - - "utils/devel/symlink_pyarrow_libs.py" - "CMakeLists.txt" - "conanfile.py" - "pyproject.toml" @@ -56,11 +52,13 @@ jobs: steps: - name: Checkout repo uses: actions/checkout@v4 + with: + fetch-depth: 0 - name: Setup Python uses: actions/setup-python@v5 with: - python-version: "3.12" + python-version: "3.13" - name: Install cibuildwheel run: pip install 'cibuildwheel>=2.21' @@ -177,6 +175,8 @@ jobs: steps: - uses: actions/checkout@v4 + with: + fetch-depth: 0 - name: Set up QEMU if: startsWith(matrix.os, 'ubuntu') @@ -228,10 +228,11 @@ jobs: fail-on-cache-miss: true - name: Build wheels - uses: pypa/cibuildwheel@v2.21 + uses: pypa/cibuildwheel@v2.22 with: only: ${{ matrix.wheel-config }} env: + CIBW_BUILD_VERBOSITY: 1 CIBW_ENVIRONMENT_LINUX: > PIP_VERBOSE=1 HICTK_CI=1 @@ -249,8 +250,6 @@ jobs: CIBW_ENVIRONMENT_PASS_LINUX: > CONAN_HOME HICTKPY_CONAN_INSTALL_ARGS - CIBW_MANYLINUX_X86_64_IMAGE: manylinux_2_28 - CIBW_MANYLINUX_AARCH64_IMAGE: manylinux_2_28 - name: Verify clean directory run: git diff --exit-code @@ -290,7 +289,37 @@ jobs: name: dist path: dist.tar if-no-files-found: error - retention-days: 30 + retention-days: 7 + + pypi-publish: + name: Upload release to PyPI + if: github.event_name == 'release' && github.event.action == 'published' + needs: [package-artifacts] + environment: + name: PyPI + url: https://pypi.org/p/hictkpy + permissions: + id-token: write + runs-on: ubuntu-latest + + steps: + - name: Download artifacts + uses: actions/download-artifact@v4 + with: + name: dist + merge-multiple: true + + - name: Extract dist.tar + run: | + tar -xf dist.tar + rm dist.tar + + - name: Upload wheels to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + attestations: true + print-hash: true + verbose: true build-wheels-status-check: name: Status Check (Build wheels) @@ -300,11 +329,16 @@ jobs: - build-sdist - build-wheels - package-artifacts + - pypi-publish steps: - name: Collect job results if: | - needs.build-sdist.result != 'success' || - needs.build-wheels.result != 'success' || - needs.package-artifacts.result != 'success' + needs.build-sdist.result != 'success' || + needs.build-wheels.result != 'success' || + needs.package-artifacts.result != 'success' || + ( + needs.pypi-publish.result != 'success' && + needs.pypi-publish.result != 'skipped' + ) run: exit 1 diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 7202e8b..eb9e788 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -5,18 +5,20 @@ version: 2 build: - os: ubuntu-22.04 - apt_packages: - - librsvg2-bin + os: ubuntu-24.04 tools: - python: "3.11" + python: "3.12" -sphinx: - configuration: docs/conf.py - -python: - install: - - requirements: docs/requirements.txt + commands: + - pip install -r docs/requirements.txt + - pip install . -v + - docs/update_index_links.py --root-dir "$PWD" --inplace + - make -C docs linkcheck + - make -C docs html + - make -C docs latexpdf + - mkdir -p "$READTHEDOCS_OUTPUT/pdf" + - cp -r docs/_build/html "$READTHEDOCS_OUTPUT/" + - cp docs/_build/latex/hictkpy.pdf "$READTHEDOCS_OUTPUT/pdf/" formats: - pdf diff --git a/CITATION.cff b/CITATION.cff index 1d5652f..257f5b7 100755 --- a/CITATION.cff +++ b/CITATION.cff @@ -11,12 +11,20 @@ authors: email: roberros@uio.no affiliation: 'Department of Biosciences, University of Oslo' title: hictkpy -abstract: 'Python bindings for hictk.' +abstract: 'Python bindings for hictk: read and write .cool and .hic files directly from Python.' doi: '10.5281/zenodo.8220299' url: 'https://github.com/paulsengroup/hictkpy' repository-code: 'https://github.com/paulsengroup/hictkpy' type: software license: MIT +keywords: + - bindings + - bioinformatics + - conversion + - cooler + - hic + - hictk + - python preferred-citation: type: article authors: @@ -30,10 +38,22 @@ preferred-citation: orcid: 'https://orcid.org/0000-0002-7918-5495' email: jonas.paulsen@ibv.uio.no affiliation: 'Department of Biosciences, University of Oslo' - doi: '10.1101/2023.11.26.568707' - url: 'https://doi.org/10.1101/2023.11.26.568707' - journal: 'Cold Spring Harbor Laboratory' - year: 2023 - month: 11 + doi: '10.1093/bioinformatics/btae408' + url: 'https://academic.oup.com/bioinformatics/article/40/7/btae408/7698028' + journal: 'Bioinformatics' + year: 2024 + month: 06 title: 'hictk: blazing fast toolkit to work with .hic and .cool files' - abstract: 'We developed hictk, a toolkit that can transparently operate on .hic and .cool files with excellent performance. The toolkit is written in C++ and consists of a C++ library with Python bindings as well as CLI tools to perform common operations directly from the shell, including converting between .hic and .mcool formats. We benchmark the performance of hictk and compare it with other popular tools and libraries. We conclude that hictk significantly outperforms existing tools while providing the flexibility of natively working with both file formats without code duplication.' + abstract: > + Hi-C is gaining prominence as a method for mapping genome organization. + With declining sequencing costs and a growing demand for higher-resolution data, efficient tools for processing Hi-C datasets at different resolutions are crucial. + Over the past decade, the .hic and Cooler file formats have become the de-facto standard to store interaction matrices produced by Hi-C experiments in binary format. + Interoperability issues make it unnecessarily difficult to convert between the two formats and to develop applications that can process each format natively. + + We developed hictk, a toolkit that can transparently operate on .hic and .cool files with excellent performance. + The toolkit is written in C++ and consists of a C++ library with Python and R bindings as well as CLI tools to perform common operations directly from the shell, including converting between .hic and .mcool formats. We benchmark the performance of hictk and compare it with other popular tools and libraries. + We conclude that hictk significantly outperforms existing tools while providing the flexibility of natively working with both file formats without code duplication. + + The hictk library, Python bindings and CLI tools are released under the MIT license as a multi-platform application available at github.com/paulsengroup/hictk. + Pre-built binaries for Linux and macOS are available on bioconda. + Python bindings for hictk are available on GitHub at github.com/paulsengroup/hictkpy, while R bindings are available on GitHub at github.com/paulsengroup/hictkR. diff --git a/CMakeLists.txt b/CMakeLists.txt index 9c144ec..d6a9b09 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -35,8 +35,8 @@ FetchContent_Declare( FetchContent_Declare( hictk URL - "${CMAKE_CURRENT_SOURCE_DIR}/external/hictk-312d83ec.tar.xz" - URL_HASH "SHA256=a47b1088de6650179715db17337e09c44729f8e86b4457848e306f028d0b5878" + "${CMAKE_CURRENT_SOURCE_DIR}/external/hictk-v2.0.1.tar.xz" + URL_HASH "SHA256=9282f7f4e978c71970f07627b360c3a4cf2bce4670ce3bd12e040c51c691ec3c" EXCLUDE_FROM_ALL OVERRIDE_FIND_PACKAGE SYSTEM diff --git a/README.md b/README.md index 825d0a9..e6e26de 100644 --- a/README.md +++ b/README.md @@ -18,9 +18,9 @@ Python bindings for hictk, a blazing fast toolkit to work with .hic and .cool fi ## Installing hictkpy -hictkpy can be installed in various ways. The simples method is using pip: `pip install hictkpy`. +hictkpy can be installed in various ways. The simplest method is using pip: `pip install hictkpy[all]`. -Refer to [Installation](https://hictkpy.readthedocs.io/en/latest/installation.html) for alternative methods. +Refer to [Installation](https://hictkpy.readthedocs.io/en/stable/installation.html) for alternative methods. ## Using hictkpy @@ -37,13 +37,13 @@ m1 = sel.to_numpy() # Get interactions as a numpy matrix m2 = sel.to_coo() # Get interactions as a scipy.sparse.coo_matrix ``` -For more detailed examples refer to [Quickstart](https://hictkpy.readthedocs.io/en/latest/quickstart.html). +For more detailed examples refer to [Quickstart](https://hictkpy.readthedocs.io/en/stable/quickstart.html). -The complete documentation for hictkpy API is available [here](https://hictkpy.readthedocs.io/en/latest/hictkpy.html). +The complete documentation for hictkpy API is available [here](https://hictkpy.readthedocs.io/en/stable/api/index.html). ## Citing -If you use hictkpy in you reaserch, please cite the following publication: +If you use hictkpy in you research, please cite the following publication: Roberto Rossini, Jonas Paulsen, hictk: blazing fast toolkit to work with .hic and .cool files _Bioinformatics_, Volume 40, Issue 7, July 2024, btae408, [https://doi.org/10.1093/bioinformatics/btae408](https://doi.org/10.1093/bioinformatics/btae408) diff --git a/cmake/modules/FindPyarrow.cmake b/cmake/modules/FindPyarrow.cmake deleted file mode 100644 index 3b000f1..0000000 --- a/cmake/modules/FindPyarrow.cmake +++ /dev/null @@ -1,161 +0,0 @@ -# Copyright (C) 2024 Roberto Rossini -# -# SPDX-License-Identifier: MIT - -#[=======================================================================[.rst: -FindPyarrow ---------- - -Finds pyarrow library that is shipped as part of the pyarrow wheels. - -Imported Targets -^^^^^^^^^^^^^^^^ - -This module provides the following imported targets, if found: - -``Arrow::python - The pyarrow library - - -Result Variables -^^^^^^^^^^^^^^^^ - -This will define the following variables: - -``Pyarrow_FOUND`` - True if the system has the Pyarrow library. -``Pyarrow_VERSION`` - The version of the Pyarrow library which was found. -``Pyarrow_INCLUDE_DIRS`` - Include directories needed to use Pyarrow. - -Cache Variables -^^^^^^^^^^^^^^^ - -The following cache variables may also be set: - -``Pyarrow_LIBRARY`` - The path to the Arrow::python library. - -#]=======================================================================] - -find_package( - Python - 3.9 - COMPONENTS - Interpreter - NumPy - REQUIRED -) - -if(TARGET Arrow::python) - message(DEBUG "Arrow::python has already been defined") - return() -endif() - -# Try to import pyarrow -execute_process( - COMMAND - "${Python_EXECUTABLE}" -c "import pyarrow" - RESULT_VARIABLE STATUS -) -if(NOT STATUS EQUAL 0) - message(WARNING "Failed to import pyarrow") - set(Pyarrow_FOUND FALSE) - return() -endif() - -# Get pyarrow version -execute_process( - COMMAND - "${Python_EXECUTABLE}" -c "from importlib.metadata import version; print(version('pyarrow'), end='')" - RESULT_VARIABLE STATUS - OUTPUT_VARIABLE Pyarrow_VERSION -) -if(NOT STATUS EQUAL 0) - message(WARNING "Unable to detect pyarrow version") - set(Pyarrow_FOUND FALSE) - unset(Pyarrow_VERSION) - return() -endif() - -if(Pyarrow_VERSION VERSION_LESS 15.0.0) - message(WARNING "pyarrow version ${Pyarrow_VERSION} is too old. Minimum version required: 15.0.0") - set(Pyarrow_FOUND FALSE) - unset(Pyarrow_VERSION) - return() -endif() - -set(Pyarrow_VERSION_STRING "${Pyarrow_VERSION}") - -# Get include dirs -execute_process( - COMMAND - "${Python_EXECUTABLE}" -c "import pyarrow; print(pyarrow.get_include(), end='')" - RESULT_VARIABLE STATUS - OUTPUT_VARIABLE Pyarrow_INCLUDE_DIRS -) -if(NOT STATUS EQUAL 0 OR Pyarrow_INCLUDE_DIRS STREQUAL "") - message(WARNING "Unable to detect pyarrow include directories") - set(Pyarrow_FOUND FALSE) - unset(Pyarrow_VERSION) - unset(Pyarrow_INCLUDE_DIRS) - return() -endif() - -# Get library dirs -execute_process( - COMMAND - "${Python_EXECUTABLE}" -c "import pyarrow; print(' '.join(pyarrow.get_library_dirs()), end='')" - RESULT_VARIABLE STATUS - OUTPUT_VARIABLE Pyarrow_LIBRARY_DIRS -) -if(NOT STATUS EQUAL 0 OR Pyarrow_LIBRARY_DIRS STREQUAL "") - message(WARNING "Unable to detect pyarrow library directories") - set(Pyarrow_FOUND FALSE) - unset(Pyarrow_VERSION) - unset(Pyarrow_INCLUDE_DIRS) - unset(Pyarrow_LIBRARY_DIRS) - return() -endif() - -include(FindPackageHandleStandardArgs) -find_package_handle_standard_args( - Pyarrow - FOUND_VAR Pyarrow_FOUND - REQUIRED_VARS - Pyarrow_INCLUDE_DIRS - Pyarrow_LIBRARY_DIRS - VERSION_VAR Pyarrow_VERSION -) - -find_library(Pyarrow_LIBRARY arrow_python REQUIRED PATHS ${Pyarrow_LIBRARY_DIRS} NO_DEFAULT_PATH) - -if(Pyarrow_FOUND AND NOT TARGET Arrow::python) - if(WIN32) - add_library(Arrow::python UNKNOWN IMPORTED) - else() - add_library(Arrow::python SHARED IMPORTED) - endif() - set_target_properties( - Arrow::python - PROPERTIES - IMPORTED_LOCATION - "${Pyarrow_LIBRARY}" - IMPORTED_CONFIGURATION - Release - ) - target_include_directories( - Arrow::python - INTERFACE - ${Pyarrow_INCLUDE_DIRS} - ${Python_NumPy_INCLUDE_DIR} - ) - target_link_directories(Arrow::python INTERFACE ${Pyarrow_LIBRARY_DIRS}) -endif() - -mark_as_advanced( - Pyarrow_VERSION - Pyarrow_INCLUDE_DIRS - Pyarrow_LIBRARY_DIRS -) diff --git a/docs/api/cooler.rst b/docs/api/cooler.rst index 132df46..05c2a7e 100644 --- a/docs/api/cooler.rst +++ b/docs/api/cooler.rst @@ -11,6 +11,7 @@ Cooler API .. autoclass:: SingleCellFile .. automethod:: __init__ + .. automethod:: __getitem__ .. automethod:: attributes .. automethod:: bins .. automethod:: cells @@ -22,6 +23,7 @@ Cooler API .. automethod:: __init__ .. automethod:: add_pixels + .. automethod:: bins .. automethod:: chromosomes .. automethod:: finalize .. automethod:: path diff --git a/docs/api/generic.rst b/docs/api/generic.rst index 93bdff5..a19910a 100644 --- a/docs/api/generic.rst +++ b/docs/api/generic.rst @@ -18,7 +18,11 @@ Generic API .. autoclass:: MultiResFile .. automethod:: __init__ + .. automethod:: __getitem__ + .. automethod:: attributes .. automethod:: chromosomes + .. automethod:: is_hic + .. automethod:: is_mcool .. automethod:: path .. automethod:: resolutions @@ -46,6 +50,81 @@ Generic API .. automethod:: coord2 .. automethod:: nnz .. automethod:: sum + .. automethod:: to_arrow .. automethod:: to_coo + .. automethod:: to_csr .. automethod:: to_df .. automethod:: to_numpy + .. automethod:: to_pandas + + .. automethod:: __iter__ + + .. code-block:: ipythonconsole + + In [1]: import hictkpy as htk + + In [2]: f = htk.File("file.cool") + + In [3]: sel = f.fetch("chr2L:10,000,000-20,000,000") + + In [4]: for i, pixel in enumerate(sel): + ...: print(pixel.bin1_id, pixel.bin2_id, pixel.count) + ...: if i > 10: + ...: break + ...: + 1000 1000 6759 + 1000 1001 3241 + 1000 1002 760 + 1000 1003 454 + 1000 1004 289 + 1000 1005 674 + 1000 1006 354 + 1000 1007 124 + 1000 1008 130 + 1000 1009 105 + 1000 1010 99 + 1000 1011 120 + + It is also possible to iterate over pixels together with their genomic coordinates by specifying ``join=True`` when calling :py:meth:`hictkpy.File.fetch()`: + + .. code-block:: ipythonconsole + + In [5]: sel = f.fetch("chr2L:10,000,000-20,000,000", join=True) + + In [6]: for i, pixel in enumerate(sel): + ...: print( + ...: pixel.chrom1, pixel.start1, pixel.end1, + ...: pixel.chrom2, pixel.start2, pixel.end2, + ...: pixel.count + ...: ) + ...: if i > 10: + ...: break + ...: + chr2L 10000000 10010000 chr2L 10000000 10010000 6759 + chr2L 10000000 10010000 chr2L 10010000 10020000 3241 + chr2L 10000000 10010000 chr2L 10020000 10030000 760 + chr2L 10000000 10010000 chr2L 10030000 10040000 454 + chr2L 10000000 10010000 chr2L 10040000 10050000 289 + chr2L 10000000 10010000 chr2L 10050000 10060000 674 + chr2L 10000000 10010000 chr2L 10060000 10070000 354 + chr2L 10000000 10010000 chr2L 10070000 10080000 124 + chr2L 10000000 10010000 chr2L 10080000 10090000 130 + chr2L 10000000 10010000 chr2L 10090000 10100000 105 + chr2L 10000000 10010000 chr2L 10100000 10110000 99 + chr2L 10000000 10010000 chr2L 10110000 10120000 120 + +.. autoclass:: Bin + +.. autoclass:: BinTable + + .. automethod:: __init__ + .. automethod:: chromosomes + .. automethod:: get + .. automethod:: get_id + .. automethod:: get_ids + .. automethod:: merge + .. automethod:: resolution + .. automethod:: to_df + .. automethod:: type + + .. automethod:: __iter__ diff --git a/docs/api/hic.rst b/docs/api/hic.rst index 488c2a1..a22e277 100644 --- a/docs/api/hic.rst +++ b/docs/api/hic.rst @@ -12,6 +12,7 @@ Hi-C API .. automethod:: __init__ .. automethod:: add_pixels + .. automethod:: bins .. automethod:: chromosomes .. automethod:: finalize .. automethod:: path diff --git a/docs/assets/heatmap_001.pdf b/docs/assets/heatmap_001.pdf new file mode 100644 index 0000000..1e37d12 Binary files /dev/null and b/docs/assets/heatmap_001.pdf differ diff --git a/docs/conf.py b/docs/conf.py index 0a5a17f..62e2ad5 100755 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,9 +1,19 @@ -#!/usr/bin/env python3 - # Copyright (C) 2023 Roberto Rossini # # SPDX-License-Identifier: MIT + +import os + +# Define the canonical URL if you are using a custom domain on Read the Docs +html_baseurl = os.environ.get("READTHEDOCS_CANONICAL_URL", "") + +# Tell Jinja2 templates the build is running on Read the Docs +if os.environ.get("READTHEDOCS", "") == "True": + if "html_context" not in globals(): + html_context = {} + html_context["READTHEDOCS"] = True + # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. @@ -14,8 +24,6 @@ # ones. extensions = [ "sphinx_copybutton", - "sphinxcontrib.rsvgconverter", - "sphinxcontrib.moderncmakedomain", "sphinx.ext.autodoc", "sphinx.ext.intersphinx", "sphinx.ext.autosummary", @@ -27,8 +35,14 @@ intersphinx_mapping = { "python": ("https://docs.python.org/3", None), + "numpy": ("https://numpy.org/doc/stable/", None), + "pandas": ("https://pandas.pydata.org/docs/", None), + "pyarrow": ("https://arrow.apache.org/docs/", None), + "scipy": ("https://docs.scipy.org/doc/scipy/", None), } +intersphinx_timeout = 30 + # Add any paths that contain templates here, relative to this directory. templates_path = [".templates"] @@ -53,7 +67,7 @@ # built documents. # Read the listed version -version = "0.0.5" +version = "1.0.0" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. @@ -205,6 +219,7 @@ copybutton_selector = "div:not(.no-copybutton) > div.highlight > pre" copybutton_exclude = ".linenos, .gp, .go" copybutton_copy_empty_lines = False +copybutton_prompt_text = "user@dev:/tmp$" # -- Options for LaTeX output --------------------------------------------- @@ -214,13 +229,6 @@ "papersize": "a4paper", "pointsize": "10pt", "classoptions": ",openany,oneside", - "preamble": r""" -\usepackage{MnSymbol} -\DeclareUnicodeCharacter{25CB}{\ensuremath{\circ}} -\DeclareUnicodeCharacter{25CF}{\ensuremath{\bullet}} -\DeclareUnicodeCharacter{21B5}{\ensuremath{\rhookswarrow}} -\DeclareUnicodeCharacter{2194}{\ensuremath{\leftrightarrow}} -""", } # Grouping the document tree into LaTeX files. List of tuples @@ -249,3 +257,11 @@ # If false, no module index is generated. # latex_domain_indices = True + +linkcheck_ignore = [ + r"https://hictk.*\.readthedocs\.build.*", + r"https://hictk.*readthedocs.*/_/downloads/en/.*/pdf/", +] + +primary_domain = "py" +highlight_language = "py" diff --git a/docs/creating_cool_hic_files.rst b/docs/creating_cool_hic_files.rst index 68ea366..a1f0763 100644 --- a/docs/creating_cool_hic_files.rst +++ b/docs/creating_cool_hic_files.rst @@ -7,13 +7,13 @@ Creating .cool and .hic files hictkpy supports creating .cool and .hic files from pre-binned interactions in COO or BedGraph2 format. -The example use file `4DNFIOTPSS3L.hic `_, which can be downloaded from `here `_. +The example in this section use file `4DNFIOTPSS3L.hic `_, which can be downloaded from `here `_. Preparation ----------- The first step consists of converting interactions from ``4DNFIOTPSS3L.hic`` to bedGraph2 format. -This can be achieved using ``hictk dump`` +This can be achieved using ``hictk dump`` (or alternatively with :py:meth:hictkpy.File.fetch()`. .. code-block:: console @@ -33,7 +33,7 @@ This can be achieved using ``hictk dump`` 2L 0 50000 2L 450000 500000 756 -Next, we also generate the list of chromosomes. +Next, we also generate the list of chromosomes to use as reference. .. code-block:: console diff --git a/docs/index.rst b/docs/index.rst index cc442bf..e9ff9d5 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -5,7 +5,7 @@ Introduction ============ -hictkpy provides Python bindings to `hictk `_, a blazing fast toolkit to work with .hic and .cool files. +hictkpy provides Python bindings to `hictk `_, a blazing fast toolkit to work with .hic and .cool files. .. only:: not latex @@ -13,7 +13,7 @@ hictkpy provides Python bindings to `hictk `_ --------------------- You are reading the HTML version of the documentation. An alternative `PDF - version `__ is + version `_ is also available. Installation @@ -21,9 +21,16 @@ hictkpy provides Python bindings to `hictk `_ .. only:: latex + Documentation formats + --------------------- + + You are reading the PDF version of the documentation. + + The live HTML version of the documentation is available at ``_. + .. rubric:: Installation -Python bindings for hictk can be installed using pip or conda. See :doc:`here <./installation>` for more details. +Python bindings for hictk can be installed using pip or conda. Refer to :doc:`Installation <./installation>` for more details. .. only:: not latex diff --git a/docs/installation.rst b/docs/installation.rst index 9ad6d86..d0b496b 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -12,9 +12,41 @@ PIP .. code-block:: bash - pip install hictkpy + pip install 'hictkpy[all]' +This will install hictkpy together with all its third-party dependencies. +It is also possible to install hictkpy with a minimal set of dependencies with one of the following commands: + +.. code-block:: bash + + pip install hictkpy # this target has no runtime dependencies! + pip install 'hictkpy[numpy]' + pip install 'hictkpy[pandas]' + pip install 'hictkpy[pyarrow]' + pip install 'hictkpy[scipy]' + + +In general, pandas and pyarrow are required when hictkpy is returning data using pandas.DataFrame or arrow.Table, such as when calling :py:meth:`hictkpy.PixelSelector.to_pandas()` or :py:meth:`hictkpy.BinTable.to_df()`. + +NumPy is required when calling methods returning data as np.array, such as :py:meth:`hictkpy.PixelSelector.to_numpy()` or certain overloads of :py:meth:`hictkpy.BinTable.get_ids()`. + +SciPy is required when fetching interactions as sparse matrix with, such as :py:meth:`hictkpy.PixelSelector.to_coo()` and :py:meth:`hictkpy.PixelSelector.to_csr()` + +In case applications call methods depending on missing third-party dependencies, hictkpy will raise an exception like the following: + +.. code-block:: ipythonconsole + + In [3] f.fetch().to_numpy() + + ModuleNotFoundError: No module named 'numpy' + + The above exception was the direct cause of the following exception: + + Traceback (most recent call last): + File "", line 1, in + ModuleNotFoundError: To enable numpy support, please install numpy with: pip install 'hictkpy[numpy]' + Alternatively, you can install hictkpy with all its dependencies by running: pip install 'hictkpy[all]' Conda (bioconda) ---------------- @@ -26,8 +58,70 @@ Conda (bioconda) From source ----------- +Building hictkpy from source should not be necessary for regular users, as we publish pre-built wheels for Linux, MacOS, and Windows for all Python versions we support (currently these include all CPython versions from 3.9 up until 3.13). For a complete and up-to-date list of available wheels refer to the `Download files `_ page on hictkpy's `homepage `_ on PyPI. + +Building hictkpy's wheels from source requires a compiler toolchain supporting C++17, such as: + +* GCC 8+ +* Clang 8+ +* Apple-Clang 10.0+ +* MSVC 19.12+ + +Furthermore, the following tools are required: + +* CMake 3.25+ +* git 2.7+ +* make or ninja + + +Installing the latest version from the main branch +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + .. code-block:: bash - pip install 'git+https://github.com/paulsengroup/hictkpy.git@main' + pip install 'hictkpy[all] @ git+https://github.com/paulsengroup/hictkpy.git@main' + +Installing version corresponding to a git tag +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: bash + + pip install 'hictkpy[all] @ git+https://github.com/paulsengroup/hictkpy.git@v1.0.0' + +Installing from a release archive +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: bash + + pip install 'hictkpy[all] @ https://pypi.python.org/packages/source/h/hictkpy/hictkpy-1.0.0.tar.gz' + +Running the automated tests +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +When building hictkpy from source we highly recommend running the automated test suite before using hictkpy in production. + +This can be achieved in several ways. Here is an example: + +.. code-block:: bash + + git clone https://github.com/paulsengroup/hictkpy.git + + # make sure to run tests for the same version/tag/commit used to build hictkpy + git checkout v1.0.0 + + python -m venv venv + + # On Windows use venv\Scripts\pip.exe instead + venv/bin/pip install pytest + + venv/bin/pytest test/ + +**All tests are expected to pass. Do not ignore test failures!** + +However, it is expected that some test cases are skipped (especially if not all optional dependencies where installed). + +Notes +^^^^^ -Note that this will install hictk's build dependencies under ``~/.conan2``, if you don't need Conan for other purposes feel free to delete this ``~/.conan2`` after installing hictkpy from git. +Building hictkpy requires several dependencies that are not needed after the build process. +Some of these dependencies are installed using Conan, which creates several files under ``~/.conan2``. if you don't need Conan for other purposes feel free to delete the ``~/.conan2`` once the build process completes successfully. diff --git a/docs/quickstart.rst b/docs/quickstart.rst index 200a2d8..f8a9f2a 100644 --- a/docs/quickstart.rst +++ b/docs/quickstart.rst @@ -5,11 +5,11 @@ Quickstart ########## -hictkpy provides Python bindings for hictk through pybind11. +hictkpy provides Python bindings for hictk through `nanobind `_. -``hictk.File()`` can open .cool and .hic files and allows retrieval of interactions as well as file metadata. +``hictk.File()`` can open .cool and .hic files and can be used to fetch interactions as well as file metadata. -The example use file `4DNFIOTPSS3L.hic `_, which can be downloaded from `here `_. +The examples in this section use file `4DNFIOTPSS3L.hic `_, which can be downloaded from `here `_. Opening files ------------- @@ -30,7 +30,7 @@ Reading file metadata .. code-block:: ipythonconsole - In [4]: f.bin_size() + In [4]: f.resolution() Out[4]: 10000 In [5]: f.chromosomes() @@ -51,15 +51,15 @@ Reading file metadata 'assembly': '/var/lib/cwl/stgb25a903a-ebb6-4a56-bf3f-90bd84a40bf4/4DNFIBEEN92C.chrom.sizes', 'format-url': 'https://github.com/aidenlab/hic-format', 'nbins': 13758, - 'nchroms': 8} + 'nchroms': 7} Fetch interactions ------------------ -Interactions can be fetched by calling the :py:meth:`hictkpy.File.fetch` method on :py:meth:`hictkpy.File` objects. +Interactions can be fetched by calling the :py:meth:`hictkpy.File.fetch()` method on :py:meth:`hictkpy.File()` objects. -:py:meth:`hictkpy.File.fetch` returns :py:meth:`hictkpy.PixelSelector` objects, which are very cheap to create. +:py:meth:`hictkpy.File.fetch()` returns :py:meth:`hictkpy.PixelSelector` objects, which are very cheap to create. .. code-block:: ipythonconsole @@ -107,24 +107,24 @@ Fetching interactions as pandas DataFrames [339041 rows x 7 columns] -Fetching interactions as scipy.sparse.coo_matrix +Fetching interactions as scipy.sparse.csr_matrix ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code-block:: ipythonconsole - In [15]: sel = f.fetch("2L:10,000,000-20,000,000", join=True) + In [15]: sel = f.fetch("2L:10,000,000-20,000,000") - In [16]: sel.to_coo() + In [16]: sel.to_csr() Out[16]: - <1000x1000 sparse matrix of type '' - with 339041 stored elements in COOrdinate format> + -Fetching interactions as numpy NDarray +Fetching interactions as numpy NDArray ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code-block:: ipythonconsole - In [17]: sel = f.fetch("2L:10,000,000-20,000,000", join=True) + In [17]: sel = f.fetch("2L:10,000,000-20,000,000") In [18]: m = sel.to_numpy() @@ -136,4 +136,63 @@ Fetching interactions as numpy NDarray In [22]: plt.show() -.. image:: assets/heatmap_001.avif + +.. only:: not latex + + .. image:: assets/heatmap_001.avif + +.. only:: latex + + .. image:: assets/heatmap_001.pdf + + +Fetching other types of data +---------------------------- + +Fetching the table of bins as pandas.DataFrame: + +.. code-block:: ipythonconsole + + In [23]: f.bins() + Out[23]: + chrom start end + 0 2L 0 10000 + 1 2L 10000 20000 + 2 2L 20000 30000 + 3 2L 30000 40000 + 4 2L 40000 50000 + ... ... ... ... + 13753 Y 3620000 3630000 + 13754 Y 3630000 3640000 + 13755 Y 3640000 3650000 + 13756 Y 3650000 3660000 + 13757 Y 3660000 3667352 + + [13758 rows x 3 columns] + +Fetching balancing weights: + +.. code-block:: ipythonconsole + + In [24]: import pandas as pd + + In [25]: weights = {} + ...: for norm in f.avail_normalizations(): + ...: weights[norm] = f.weights(norm) + ...: weights = pd.DataFrame(weights) + ...: weights + Out[25]: + KR VC VC_SQRT + 0 0.582102 0.666016 0.759389 + 1 1.300415 1.496604 1.138349 + 2 1.180977 1.470464 1.128364 + 3 1.007625 1.266340 1.047122 + 4 1.175642 1.492664 1.136850 + ... ... ... ... + 13753 NaN 0.000000 0.000000 + 13754 NaN 0.000000 0.000000 + 13755 NaN 0.000000 0.000000 + 13756 1.155544 2.234906 0.631055 + 13757 NaN 0.069841 0.111556 + + [13758 rows x 3 columns] diff --git a/docs/requirements.txt b/docs/requirements.txt index aa7acdb..87da608 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,7 +1,7 @@ +# Copyright (C) 2024 Roberto Rossini +# SPDX-License-Identifier: MIT + furo==2024.8.6 -ipython==8.26.0 -sphinx==8.0.2 +ipython==8.28.0 +sphinx==8.1.3 sphinx-copybutton==0.5.2 -sphinxcontrib-moderncmakedomain==3.29.0 -sphinxcontrib-svg2pdfconverter==1.2.2 -hictkpy@git+https://github.com/paulsengroup/hictkpy@main diff --git a/docs/update_index_links.py b/docs/update_index_links.py new file mode 100755 index 0000000..9f69d26 --- /dev/null +++ b/docs/update_index_links.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2024 Roberto Rossini +# +# SPDX-License-Identifier: MIT + +import argparse +import logging +import os +import pathlib +import re +import subprocess as sp + + +def make_cli() -> argparse.ArgumentParser: + cli = argparse.ArgumentParser() + + cli.add_argument( + "--root-dir", + type=pathlib.Path, + required=False, + help="Path to the root of the doc folder.", + ) + cli.add_argument( + "--inplace", + action="store_true", + default=False, + help="Do the replacement in-place.", + ) + + return cli + + +def infer_root_dir(cwd: pathlib.Path | None = None) -> pathlib.Path: + if cwd is None: + cwd = pathlib.Path(__file__).parent.resolve() + + res = sp.check_output(["git", "rev-parse", "--show-toplevel"], cwd=cwd).decode("utf-8").split("\n")[0] + + root_dir = pathlib.Path(res) + if root_dir.is_dir(): + return root_dir + + if cwd == pathlib.Path(__file__).parent.resolve(): + raise RuntimeError("Unable to infer the root of hictkpy's repository.") + + return infer_root_dir(pathlib.Path(__file__).parent.resolve()) + + +def patch_index_file(path: pathlib.Path, inplace: bool): + url = os.getenv("READTHEDOCS_CANONICAL_URL") + if url is None: + raise RuntimeError("Unable to read url from the READTHEDOCS_CANONICAL_URL env variable") + + logging.info(f'READTHEDOCS_CANONICAL_URL="{url}"') + + toks = url.removeprefix("https://").rstrip("/").split("/") + if len(toks) < 2: + raise RuntimeError("Failed to parse READTHEDOCS_CANONICAL_URL variable") + + tgt_domain = toks[0] + tgt_branch = toks[-1] + + logging.info(f'new_domain="{tgt_domain}"') + logging.info(f'new_branch="{tgt_branch}"') + + pdf_pattern = re.compile(r"https://hictkpy\.readthedocs\.io/_/downloads/en/[\w\-_]+/pdf/") + html_pattern = re.compile(r"https://hictkpy\.readthedocs\.io/en/[\w\-_]+/") + + payload = path.read_text() + payload = pdf_pattern.sub(f"https://{tgt_domain}/_/downloads/en/{tgt_branch}/pdf/", payload) + payload = html_pattern.sub(f"https://{tgt_domain}/en/{tgt_branch}/", payload) + + if inplace: + logging.info(f'Updating file "{path}" inplace...') + path.write_text(payload) + else: + print(payload, end="") + + +def main(): + if "READTHEDOCS" not in os.environ: + logging.info("Script is not being run by ReadTheDocs. Returning immediately!") + return + + args = vars(make_cli().parse_args()) + + root_dir = args["root_dir"] + if root_dir is None: + root_dir = infer_root_dir() + + index_file = root_dir / "docs" / "index.rst" + if not index_file.exists(): + raise RuntimeError(f'Unable to find file "docs/index.rst" under {root_dir}') + + patch_index_file(index_file, args["inplace"]) + + +if __name__ == "__main__": + fmt = "[%(asctime)s] %(levelname)s: %(message)s" + logging.basicConfig(format=fmt) + logging.getLogger().setLevel(logging.INFO) + main() diff --git a/external/hictk-312d83ec.tar.xz b/external/hictk-312d83ec.tar.xz deleted file mode 100644 index 32c05fd..0000000 Binary files a/external/hictk-312d83ec.tar.xz and /dev/null differ diff --git a/external/hictk-v2.0.1.tar.xz b/external/hictk-v2.0.1.tar.xz new file mode 100644 index 0000000..c0459c9 Binary files /dev/null and b/external/hictk-v2.0.1.tar.xz differ diff --git a/pyproject.toml b/pyproject.toml index 8985bb6..8adf47b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,11 +6,8 @@ requires = [ "conan>=2.0.5", "nanobind>=2", # This is required in order to run stubgen - "numpy", - "pandas>=2.1,!=2.2.0", - "pyarrow==17.0", + "numpy>=2", "scikit-build-core>=0.10", - "scipy", "typing_extensions", ] @@ -27,29 +24,38 @@ authors = [ requires-python = ">=3.9" classifiers = [ "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "Topic :: Scientific/Engineering :: Bio-Informatics", "License :: OSI Approved :: MIT License" ] -dependencies = [ +dependencies = [] + +optional-dependencies.numpy = [ "numpy", - "pyarrow==17.0", ] optional-dependencies.pandas = [ + "hictkpy[pyarrow]", "pandas>=2.1.0,!=2.2.0", ] +optional-dependencies.pyarrow = [ + "pyarrow>=16", +] + optional-dependencies.scipy = [ "scipy", ] optional-dependencies.all = [ - "hictkpy[pandas,scipy]", + "hictkpy[numpy,pandas,pyarrow,scipy]", ] optional-dependencies.test = [ - "hictkpy[all]", "pytest>=8.0", ] @@ -62,6 +68,8 @@ optional-dependencies.dev = [ ] [tool.scikit-build] +minimum-version = "0.10" +build-dir = "build/{wheel_tag}" metadata.version.provider = "scikit_build_core.metadata.setuptools_scm" sdist.include = ["src/_version.py"] wheel.expand-macos-universal-tags = true @@ -101,18 +109,33 @@ testpaths = [ ] [tool.cibuildwheel] -skip = ["cp313*", "*musllinux*", "pp*"] +build-frontend = "build" +manylinux-x86_64-image = "quay.io/pypa/manylinux_2_28_x86_64:2024.11.09-1" +manylinux-i686-image = "quay.io/pypa/manylinux_2_28_i686:2024.11.09-1" +manylinux-aarch64-image = "quay.io/pypa/manylinux_2_28_aarch64:2024.11.09-1" +manylinux-ppc64le-image = "quay.io/pypa/manylinux_2_28_ppc64le:2024.11.09-1" +manylinux-s390x-image = "quay.io/pypa/manylinux_2_28_s390x:2024.11.09-1" +manylinux-pypy_x86_64-image = "quay.io/pypa/manylinux_2_28_x86_64:2024.11.09-1" +manylinux-pypy_i686-image = "quay.io/pypa/manylinux_2_28_i686:2024.11.09-1" +manylinux-pypy_aarch64-image = "quay.io/pypa/manylinux_2_28_aarch64:2024.11.09-1" + +skip = ["cp314*", "*musllinux*", "pp*"] test-command = "python -m pytest {project}/test" -test-extras = ["test"] -test-skip = ["cp313*", "*universal2", "pp*"] +test-extras = ["test", "numpy", "pandas", "pyarrow", "scipy"] +test-skip = ["cp314*", "*universal2", "pp*"] # Setuptools bug causes collision between pypy and cpython artifacts before-build = [ "rm -rf '{project}/build'", ] -environment = { PIP_VERBOSE=1 } -# We are using static linking, thus repairing wheels is not necessary. +# We are using static linking, thus repairing wheels should not be necessary. +# Exception being when building portable Linux wheels. +# See https://github.com/scikit-build/scikit-build-core/pull/698#issuecomment-2082991257 +[tool.cibuildwheel.macos] +repair-wheel-command = "" + +[tool.cibuildwheel.windows] repair-wheel-command = "" [tool.ruff] diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index b41d9c1..b31d20f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -2,10 +2,19 @@ # # SPDX-License-Identifier: MIT +find_package( + Python + 3.9 + COMPONENTS + Interpreter + Development.Module + NumPy + REQUIRED +) + find_package(Arrow REQUIRED) find_package(FMT REQUIRED) find_package(nanobind REQUIRED) -find_package(Pyarrow REQUIRED) find_package(spdlog REQUIRED) find_package(Filesystem REQUIRED) @@ -15,6 +24,7 @@ nanobind_add_module( NB_DOMAIN hictkpy LTO MODULE + "${CMAKE_CURRENT_SOURCE_DIR}/bin_table.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/common.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/cooler_file_writer.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/file.cpp" @@ -25,16 +35,15 @@ nanobind_add_module( "${CMAKE_CURRENT_SOURCE_DIR}/pixel_selector.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/reference.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/singlecell_file.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/to_pyarrow.cpp" ) target_include_directories( _hictkpy PRIVATE include - "${PYARROW_INCLUDE_DIR}" - "${NUMPY_INCLUDE_DIR}" + "${Python_NumPy_INCLUDE_DIR}" ) -target_link_directories(_hictkpy PRIVATE "${PYARROW_LIB_DIRS}") target_link_libraries( _hictkpy @@ -45,7 +54,7 @@ target_link_libraries( hictk::file hictk::hic hictk::transformers - Arrow::python + Arrow::arrow_$,shared,static> fmt::fmt-header-only spdlog::spdlog_header_only std::filesystem diff --git a/src/bin_table.cpp b/src/bin_table.cpp new file mode 100644 index 0000000..89bb78e --- /dev/null +++ b/src/bin_table.cpp @@ -0,0 +1,535 @@ +// Copyright (C) 2024 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#include "hictkpy/bin_table.hpp" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hictkpy/reference.hpp" +#include "hictkpy/to_pyarrow.hpp" + +namespace nb = nanobind; + +namespace hictkpy { + +BinTable::BinTable(std::shared_ptr bins) noexcept : _bins(std::move(bins)) { + if (!_bins) { + _bins = std::make_shared(); + } +} + +BinTable::BinTable(hictk::BinTable bins) + : BinTable(std::make_shared(std::move(bins))) {} + +BinTable::BinTable(const ChromosomeDict& chromosomes, std::uint32_t resolution) + : BinTable(hictk::BinTable{chromosome_dict_to_reference(chromosomes), resolution}) {} + +[[noreturn]] static void throw_except_failed_to_parse_bins_df(const std::exception& e) { + throw std::runtime_error(fmt::format( + FMT_STRING( + "Unable to fetch bins from the given object. Please make sure the given object is a " + "pandas.DataFrame with columns [\"chrom\", \"start\", \"end\"]. Underlying error: {}"), + e.what())); +} + +[[noreturn]] static void throw_except_failed_to_parse_bins_df() { + throw_except_failed_to_parse_bins_df(std::runtime_error{"unknown error"}); +} + +[[nodiscard]] static hictk::Reference get_reference_from_bins_df(const nb::object& df) { + try { + return chromosome_dict_to_reference( + nb::cast(df.attr("groupby")("chrom", nb::arg("observed") = true) + .attr("__getitem__")("end") + .attr("max")() + .attr("to_dict")())); + } catch (const std::exception& e) { + throw_except_failed_to_parse_bins_df(e); + } catch (...) { + throw_except_failed_to_parse_bins_df(); + } +} + +template +[[nodiscard]] static std::vector get_std_vect_from_bins_df(const nb::object& df, + std::string_view col_name) { + static_assert(std::is_arithmetic_v); + try { + return nb::cast>(df.attr("__getitem__")(col_name)); + } catch (const std::exception& e) { + throw_except_failed_to_parse_bins_df(e); + } catch (...) { + throw_except_failed_to_parse_bins_df(); + } +} + +BinTable::BinTable(const nanobind::object& df) + : BinTable(get_reference_from_bins_df(df), + get_std_vect_from_bins_df(df, "start"), + get_std_vect_from_bins_df(df, "end")) {} + +BinTable::BinTable(hictk::Reference chroms, const std::vector& start_pos, + const std::vector& end_pos) + : BinTable(hictk::BinTable{std::move(chroms), start_pos, end_pos}) {} + +const hictk::Reference& BinTable::chromosomes() const noexcept { return _bins->chromosomes(); } + +std::uint32_t BinTable::resolution() const noexcept { return _bins->resolution(); } + +std::string_view BinTable::type() const noexcept { + return _bins->type() == hictk::BinTable::Type::fixed ? "fixed" : "variable"; +} + +std::size_t BinTable::size() const noexcept { return _bins->size(); } + +std::string BinTable::repr() const { + return fmt::format(FMT_STRING("BinTable(num_chroms={}; bin_size={};)"), + _bins->chromosomes().size(), + _bins->resolution() == 0 ? "variable" : fmt::to_string(_bins->resolution())); +} + +[[nodiscard]] static std::shared_ptr chrom_dict() { + return dictionary(arrow::int32(), arrow::utf8()); +} + +[[nodiscard]] static std::shared_ptr make_bin_table_schema() { + arrow::FieldVector fields{}; + fields.emplace_back(arrow::field("chrom", chrom_dict(), false)); + fields.emplace_back(arrow::field("start", arrow::uint32(), false)); + fields.emplace_back(arrow::field("end", arrow::uint32(), false)); + + return arrow::schema(fields); +} + +[[nodiscard]] static std::shared_ptr chrom_names_to_arrow( + const std::vector& chrom_names) { + arrow::StringBuilder chrom_name_builder{}; + auto status = chrom_name_builder.AppendValues(chrom_names); + if (!status.ok()) { + throw std::runtime_error( + fmt::format(FMT_STRING("failed to construct a table of pixels: {}"), status.message())); + } + + auto result = chrom_name_builder.Finish(); + if (!result.ok()) { + throw std::runtime_error(fmt::format(FMT_STRING("failed to construct a table of pixels: {}"), + result.status().message())); + } + + return result.MoveValueUnsafe(); +} + +[[nodiscard]] static nb::object make_bin_table_df(const std::vector& chrom_names, + std::vector chrom_ids, + std::vector start_pos, + std::vector end_pos, + std::vector bin_ids = {}) { + const auto num_bins = static_cast(chrom_ids.size()); + + auto schema = make_bin_table_schema(); + + auto chrom_data = std::make_shared( + chrom_dict(), + std::make_shared(num_bins, arrow::Buffer::FromVector(std::move(chrom_ids)), + nullptr, 0, 0), + chrom_names_to_arrow(chrom_names)); + + auto start_data = std::make_shared( + num_bins, arrow::Buffer::FromVector(std::move(start_pos)), nullptr, 0, 0); + auto end_data = std::make_shared( + num_bins, arrow::Buffer::FromVector(std::move(end_pos)), nullptr, 0, 0); + + auto df = export_pyarrow_table( + arrow::Table::Make(std::move(schema), {chrom_data, start_data, end_data})) + .attr("to_pandas")(nb::arg("self_destruct") = true); + + if (bin_ids.empty()) { + return df; + } + + auto bin_ids_data = std::make_shared( + num_bins, arrow::Buffer::FromVector(std::move(bin_ids)), nullptr, 0, 0); + + schema = + std::make_shared(arrow::FieldVector{arrow::field("bin_id", arrow::uint64())}); + + auto bin_ids_df = + export_pyarrow_table(arrow::Table::Make(std::move(schema), {std::move(bin_ids_data)})) + .attr("to_pandas")(nb::arg("self_destruct") = true); + + df.attr("index") = bin_ids_df.attr("__getitem__")("bin_id"); + return df; +} + +nb::object BinTable::bin_ids_to_coords(std::vector bin_ids) const { + std::vector chrom_ids(bin_ids.size()); + std::vector start_pos(bin_ids.size()); + std::vector end_pos(bin_ids.size()); + + std::visit( + [&](const auto& bins) { + for (std::size_t i = 0; i < bin_ids.size(); ++i) { + const auto bin = bins.at(bin_ids[i]); + chrom_ids[i] = static_cast(bin.chrom().id()); + start_pos[i] = bin.start(); + end_pos[i] = bin.end(); + } + }, + _bins->get()); + + return make_bin_table_df(chrom_names(), std::move(chrom_ids), std::move(start_pos), + std::move(end_pos), std::move(bin_ids)); +} + +hictk::Bin BinTable::bin_id_to_coord(std::uint64_t bin_id) const { return _bins->at(bin_id); } + +hictk::Bin BinTable::coord_to_bin(std::string_view chrom, std::uint32_t pos) const { + return _bins->at(chrom, pos); +} + +nanobind::object BinTable::coords_to_bins(const std::vector& chroms, + const std::vector& positions) const { + if (chroms.size() != positions.size()) { + throw std::runtime_error("chroms and positions should have the same size"); + } + std::vector bin_ids(chroms.size()); + std::vector chrom_ids(chroms.size()); + std::vector start_pos(chroms.size()); + std::vector end_pos(chroms.size()); + + std::visit( + [&](const auto& bins) { + for (std::size_t i = 0; i < chroms.size(); ++i) { + const auto bin = bins.at(chroms[i], positions[i]); + bin_ids[i] = bin.id(); + chrom_ids[i] = static_cast(bin.chrom().id()); + start_pos[i] = bin.start(); + end_pos[i] = bin.end(); + } + }, + _bins->get()); + + return make_bin_table_df(chrom_names(), std::move(chrom_ids), std::move(start_pos), + std::move(end_pos), std::move(bin_ids)); +} + +std::int64_t BinTable::coord_to_bin_id(std::string_view chrom, std::uint32_t pos) const { + return static_cast(_bins->at(chrom, pos).id()); +} + +auto BinTable::coords_to_bin_ids(const std::vector& chroms, + const std::vector& positions) const -> BinIDsVec { + auto np = import_module_checked("numpy"); + + if (chroms.size() != positions.size()) { + throw std::runtime_error("chroms and positions should have the same size"); + } + + auto np_array = + np.attr("empty")(static_cast(chroms.size()), nb::arg("dtype") = "int"); + auto buffer = nb::cast(np_array); + auto bin_ids = buffer.view(); + + std::visit( + [&](const auto& bins) { + for (std::size_t i = 0; i < chroms.size(); ++i) { + bin_ids(static_cast(i)) = + static_cast(bins.at(chroms[i], positions[i]).id()); + } + }, + _bins->get()); + + return buffer; +} + +[[nodiscard]] static nb::object make_bg2_pixels_df(const std::vector& chrom_names, + std::vector chrom1_ids, + std::vector start1_pos, + std::vector end1_pos, + std::vector chrom2_ids, + std::vector start2_pos, + std::vector end2_pos) { + auto pd = import_module_checked("pandas"); + + nb::list dfs{}; + dfs.append(make_bin_table_df(chrom_names, std::move(chrom1_ids), std::move(start1_pos), + std::move(end1_pos))); + dfs.append(make_bin_table_df(chrom_names, std::move(chrom2_ids), std::move(start2_pos), + std::move(end2_pos))); + + auto df = pd.attr("concat")(dfs, nb::arg("axis") = "columns", nb::arg("ignore_index") = true, + nb::arg("copy") = false); + + static const std::vector col_names{"chrom1", "start1", "end1", + "chrom2", "start2", "end2"}; + df.attr("columns") = col_names; + return df; +} + +nb::object BinTable::merge_coords(nb::object df) const { + import_pyarrow_checked(); + auto pd = import_module_checked("pandas"); + + // TODO avoid potentially copying data here + using Buffer64T = nb::ndarray, std::int64_t>; + auto bin1_ids = nb::cast(df.attr("__getitem__")("bin1_id").attr("to_numpy")()); + auto bin2_ids = nb::cast(df.attr("__getitem__")("bin2_id").attr("to_numpy")()); + + const auto n = bin1_ids.size(); + + std::vector chrom1_ids(n); + std::vector starts1(n); + std::vector ends1(n); + std::vector chrom2_ids(n); + std::vector starts2(n); + std::vector ends2(n); + + std::visit( + [&](const auto& bins) { + for (std::size_t i = 0; i < n; ++i) { + const auto bin1 = + bins.at(static_cast(bin1_ids(static_cast(i)))); + chrom1_ids[i] = static_cast(bin1.chrom().id()); + starts1[i] = bin1.start(); + ends1[i] = bin1.end(); + + const auto bin2 = + bins.at(static_cast(bin2_ids(static_cast(i)))); + chrom2_ids[i] = static_cast(bin2.chrom().id()); + starts2[i] = bin2.start(); + ends2[i] = bin2.end(); + } + }, + _bins->get()); + + auto coord_df = + make_bg2_pixels_df(chrom_names(), std::move(chrom1_ids), std::move(starts1), std::move(ends1), + std::move(chrom2_ids), std::move(starts2), std::move(ends2)); + + std::vector index_names{}; + try { + auto name = nb::cast>(df.attr("index").attr("name")); + index_names.emplace_back(name.value_or(nb::cast(nb::str{"index"}))); + } catch (const std::bad_cast&) { + index_names = nb::cast>(df.attr("index").attr("name")); + } + + auto col_names = index_names; + for (auto&& name : nb::cast>(df.attr("columns").attr("tolist")())) { + col_names.emplace_back(name); + } + for (auto&& name : nb::cast>(coord_df.attr("columns").attr("tolist")())) { + col_names.emplace_back(name); + } + + nb::list dfs{}; + dfs.append(df.attr("reset_index")()); + dfs.append(std::move(coord_df)); + + df = pd.attr("concat")(dfs, nb::arg("axis") = "columns", nb::arg("ignore_index") = true, + nb::arg("copy") = false); + df.attr("columns") = col_names; + df.attr("set_index")(index_names, nb::arg("inplace") = true); + return df; +} + +nb::iterator BinTable::make_iterable() const { + return std::visit( + [](const auto& bins) { + return nb::make_iterator(nb::type(), "BinTableIterator", bins.begin(), + bins.end()); + }, + _bins->get()); +} + +static auto compute_num_bins(const hictk::BinTable& bins, const hictk::GenomicInterval& query) { + if (!query) { + return bins.size(); + } + return static_cast(std::visit( + [&](const auto& bins_) { + const auto [first_bin, last_bin] = bins_.find_overlap(query); + return std::distance(first_bin, last_bin); + }, + bins.get())); +} + +nb::object BinTable::to_df(std::optional range, + std::string_view query_type) const { + auto pd = import_module_checked("pandas"); + + const auto qt = + query_type == "UCSC" ? hictk::GenomicInterval::Type::UCSC : hictk::GenomicInterval::Type::BED; + const auto query = !range.has_value() ? hictk::GenomicInterval{} + : hictk::GenomicInterval::parse( + _bins->chromosomes(), std::string{range.value()}, qt); + + const auto n = compute_num_bins(*_bins, query); + + std::vector bin_ids(n); + std::vector chrom_ids(n); + std::vector starts(n); + std::vector ends(n); + + const auto chrom_id_offset = static_cast(_bins->chromosomes().at(0).is_all()); + + std::visit( + [&](const auto& bins) { + const auto [first_bin, last_bin] = !range.has_value() + ? std::make_pair(bins.begin(), bins.end()) + : bins.find_overlap(query); + std::size_t i = 0; + std::for_each(first_bin, last_bin, [&](const auto& bin) { + bin_ids[i] = bin.id(); + chrom_ids[i] = static_cast(bin.chrom().id() - chrom_id_offset); + starts[i] = bin.start(); + ends[i] = bin.end(); + ++i; + }); + }, + _bins->get()); + + return make_bin_table_df(chrom_names(false), std::move(chrom_ids), std::move(starts), + std::move(ends), std::move(bin_ids)); +} + +std::shared_ptr BinTable::get() const noexcept { return _bins; } + +std::vector BinTable::chrom_names(bool include_ALL) const { + std::vector chrom_names; + chrom_names.reserve(_bins->chromosomes().size()); + for (const auto& chrom : _bins->chromosomes()) { + if (!include_ALL && chrom.is_all()) { + continue; + } + + chrom_names.emplace_back(chrom.name()); + } + + return chrom_names; +} + +static void declare_bin_class(nb::module_& m) { + nb::class_(m, "Bin", "Genomic Bin.") + .def_prop_ro("id", [](const hictk::Bin& b) { return b.id(); }) + .def_prop_ro("rel_id", [](const hictk::Bin& b) { return b.rel_id(); }) + .def_prop_ro("chrom", [](const hictk::Bin& b) { return b.chrom().name(); }) + .def_prop_ro("start", [](const hictk::Bin& b) { return b.start(); }) + .def_prop_ro("end", [](const hictk::Bin& b) { return b.end(); }) + .def("__repr__", + [](const hictk::Bin& b) { + return fmt::format(FMT_COMPILE("id={}; rel_id={}; chrom={}; start={}; end={}"), b.id(), + b.rel_id(), b.chrom().name(), b.start(), b.end()); + }) + .def("__str__", [](const hictk::Bin& b) { + return fmt::format(FMT_COMPILE("{}\t{}\t{}"), b.chrom().name(), b.start(), b.end()); + }); +} + +void BinTable::bind(nb::module_& m) { + declare_bin_class(m); + + auto bt = nb::class_(m, "BinTable", "Class representing a table of genomic bins."); + + bt.def(nb::init(), nb::arg("chroms"), nb::arg("resolution"), + "Construct a table of bins given a dictionary mapping chromosomes to their sizes and a " + "resolution."); + + bt.def(nb::init(), nb::arg("bins"), + "Construct a table of bins from a pandas.DataFrame with columns [\"chrom\", \"start\", " + "\"end\"].", + nb::sig("def __init__(self, bins: pandas.DataFrame) -> None")); + + bt.def("__repr__", &BinTable::repr, nb::rv_policy::move); + + bt.def("chromosomes", &get_chromosomes_from_object, + nb::arg("include_ALL") = false, + "Get chromosomes sizes as a dictionary mapping names to sizes.", + nb::rv_policy::take_ownership); + + bt.def("resolution", &BinTable::resolution, + "Get the bin size for the bin table. " + "Return 0 in case the bin table has a variable bin size."); + + bt.def("type", &BinTable::type, + "Get the type of table underlying the BinTable object (i.e. fixed or variable).", + nb::rv_policy::move); + + bt.def("__len__", &BinTable::size, "Get the number of bins in the bin table."); + + bt.def("__iter__", &BinTable::make_iterable, nb::keep_alive<0, 1>(), + nb::sig("def __iter__(self) -> hictkpy.BinTableIterator"), + "Return an iterator over the bins in the table.", nb::rv_policy::take_ownership); + + bt.def("get", &BinTable::bin_id_to_coord, nb::arg("bin_id"), + "Get the genomic coordinate given a bin ID.", + nb::sig("def get(self, bin_id: int) -> hictkpy.Bin"), nb::rv_policy::move); + bt.def("get", &BinTable::bin_ids_to_coords, nb::arg("bin_ids"), + "Get the genomic coordinates given a sequence of bin IDs. " + "Genomic coordinates are returned as a pandas.DataFrame with columns [\"chrom\", " + "\"start\", \"end\"].", + nb::sig("def get(self, bin_ids: collections.abc.Sequence[int]) -> pandas.DataFrame"), + nb::rv_policy::take_ownership); + + bt.def("get", &BinTable::coord_to_bin, nb::arg("chrom"), nb::arg("pos"), + "Get the bin overlapping the given genomic coordinate.", + nb::sig("def get(self, chrom: str, pos: int) -> hictkpy.Bin"), nb::rv_policy::move); + bt.def("get", &BinTable::coords_to_bins, nb::arg("chroms"), nb::arg("pos"), + "Get the bins overlapping the given genomic coordinates. " + "Bins are returned as a pandas.DataFrame with columns [\"chrom\", " + "\"start\", \"end\"].", + nb::sig("def get(self, chroms: collections.abc.Sequence[str], pos: " + "collections.abc.Sequence[int]) -> pandas.DataFrame"), + nb::rv_policy::take_ownership); + + bt.def("get_id", &BinTable::coord_to_bin_id, nb::arg("chrom"), nb::arg("pos"), + "Get the ID of the bin overlapping the given genomic coordinate."); + bt.def("get_ids", &BinTable::coords_to_bin_ids, nb::arg("chroms"), nb::arg("pos"), + "Get the IDs of the bins overlapping the given genomic coordinates.", + nb::sig("def get_ids(self, chroms: collections.abc.Sequence[str], pos: " + "collections.abc.Sequence[int]) -> numpy.ndarray[dtype=int64]"), + nb::rv_policy::take_ownership); + + bt.def("merge", &BinTable::merge_coords, nb::arg("df"), + "Merge genomic coordinates corresponding to the given bin identifiers. " + "Bin identifiers should be provided as a pandas.DataFrame with columns \"bin1_id\" and " + "\"bin2_id\". " + "Genomic coordinates are returned as a pandas.DataFrame containing the same data as the " + "DataFrame given as input, plus columns [\"chrom1\", \"start1\", \"end1\", \"chrom2\", " + "\"start2\", \"end2\"].", + nb::sig("def merge(self, df: pandas.DataFrame) -> pandas.DataFrame"), + nb::rv_policy::take_ownership); + + bt.def("to_df", &BinTable::to_df, nb::arg("range") = nb::none(), nb::arg("query_type") = "UCSC", + "Return the bins in the BinTable as a pandas.DataFrame. The optional \"range\" parameter " + "can be used to only fetch a subset of the bins in the BinTable.", + nb::sig("def to_df(self, range: str | None = None, query_type: str = 'UCSC') -> " + "pandas.DataFrame"), + nb::rv_policy::take_ownership); +} + +} // namespace hictkpy diff --git a/src/cooler_file_writer.cpp b/src/cooler_file_writer.cpp index f7b72b5..2229975 100644 --- a/src/cooler_file_writer.cpp +++ b/src/cooler_file_writer.cpp @@ -12,15 +12,18 @@ #include #include #include +#include #include #include #include +#include #include #include #include #include #include +#include "hictkpy/bin_table.hpp" #include "hictkpy/common.hpp" #include "hictkpy/nanobind.hpp" #include "hictkpy/pixel.hpp" @@ -30,14 +33,12 @@ namespace nb = nanobind; namespace hictkpy { -CoolerFileWriter::CoolerFileWriter(std::filesystem::path path_, const ChromosomeDict &chromosomes_, - std::uint32_t resolution_, std::string_view assembly, - const std::filesystem::path &tmpdir, +CoolerFileWriter::CoolerFileWriter(std::filesystem::path path_, const hictkpy::BinTable &bins_, + std::string_view assembly, const std::filesystem::path &tmpdir, std::uint32_t compression_lvl) : _path(std::move(path_)), _tmpdir(tmpdir, true), - _w(create_file(_path.string(), chromosome_dict_to_reference(chromosomes_), resolution_, - assembly, _tmpdir())), + _w(create_file(_path.string(), *bins_.get(), assembly, _tmpdir())), _compression_lvl(compression_lvl) { if (std::filesystem::exists(_path)) { throw std::runtime_error( @@ -47,6 +48,13 @@ CoolerFileWriter::CoolerFileWriter(std::filesystem::path path_, const Chromosome SPDLOG_INFO(FMT_STRING("using \"{}\" folder to store temporary file(s)"), _tmpdir()); } +CoolerFileWriter::CoolerFileWriter(std::filesystem::path path_, const ChromosomeDict &chromosomes_, + std::uint32_t resolution_, std::string_view assembly, + const std::filesystem::path &tmpdir, + std::uint32_t compression_lvl) + : CoolerFileWriter(std::move(path_), BinTable{chromosomes_, resolution_}, assembly, tmpdir, + compression_lvl) {} + const std::filesystem::path &CoolerFileWriter::path() const noexcept { return _path; } std::uint32_t CoolerFileWriter::resolution() const noexcept { @@ -65,18 +73,27 @@ const hictk::Reference &CoolerFileWriter::chromosomes() const { return ref; } +std::shared_ptr CoolerFileWriter::bins_ptr() const noexcept { + if (!_w) { + return {}; + } + + return _w->bins_ptr(); +} + void CoolerFileWriter::add_pixels(const nb::object &df) { if (!_w.has_value()) { throw std::runtime_error( "caught attempt to add_pixels to a .cool file that has already been finalized!"); } - const auto coo_format = nb::cast(df.attr("columns").attr("__contains__")("bin1_id")); - const auto cell_id = fmt::to_string(_w->cells().size()); auto attrs = hictk::cooler::Attributes::init(_w->resolution()); attrs.assembly = _w->attributes().assembly; + auto lck = std::make_optional(); + const auto coo_format = nb::cast(df.attr("columns").attr("__contains__")("bin1_id")); + const auto dtype = df.attr("__getitem__")("count").attr("dtype"); const auto dtype_str = nb::cast(dtype.attr("__str__")()); const auto var = map_dtype_to_type(dtype_str); @@ -86,6 +103,7 @@ void CoolerFileWriter::add_pixels(const nb::object &df) { using N = hictk::remove_cvref_t; const auto pixels = coo_format ? coo_df_to_thin_pixels(df, true) : bg2_df_to_thin_pixels(_w->bins(), df, true); + lck.reset(); auto clr = _w->create_cell(cell_id, std::move(attrs), hictk::cooler::DEFAULT_HDF5_CACHE_SIZE * 4, 1); @@ -99,8 +117,8 @@ void CoolerFileWriter::add_pixels(const nb::object &df) { var); } -void CoolerFileWriter::finalize([[maybe_unused]] std::string_view log_lvl_str, - std::size_t chunk_size, std::size_t update_freq) { +hictk::File CoolerFileWriter::finalize(std::string_view log_lvl_str, std::size_t chunk_size, + std::size_t update_freq) { if (_finalized) { throw std::runtime_error( fmt::format(FMT_STRING("finalize() was already called on file \"{}\""), _path)); @@ -137,17 +155,18 @@ void CoolerFileWriter::finalize([[maybe_unused]] std::string_view log_lvl_str, _w.reset(); std::filesystem::remove(sclr_path); // NOLINT // NOLINTEND(*-unchecked-optional-access) + + return hictk::File{_path.string()}; } hictk::cooler::SingleCellFile CoolerFileWriter::create_file(std::string_view path, - const hictk::Reference &chromosomes, - std::uint32_t resolution, + const hictk::BinTable &bins, std::string_view assembly, const std::filesystem::path &tmpdir) { - auto attrs = hictk::cooler::SingleCellAttributes::init(resolution); + auto attrs = hictk::cooler::SingleCellAttributes::init(bins.resolution()); attrs.assembly = assembly; return hictk::cooler::SingleCellFile::create(tmpdir / std::filesystem::path{path}.filename(), - chromosomes, resolution, false, std::move(attrs)); + bins, false, std::move(attrs)); } std::string CoolerFileWriter::repr() const { @@ -169,26 +188,39 @@ void CoolerFileWriter::bind(nb::module_ &m) { nb::arg("path"), nb::arg("chromosomes"), nb::arg("resolution"), nb::arg("assembly") = "unknown", nb::arg("tmpdir") = hictk::internal::TmpDir::default_temp_directory_path(), - nb::arg("compression_lvl") = 6, "Open a .cool file for writing."); + nb::arg("compression_lvl") = 6, + "Open a .cool file for writing given a list of chromosomes with their sizes and a " + "resolution."); + writer.def(nb::init(), + nb::arg("path"), nb::arg("bins"), nb::arg("assembly") = "unknown", + nb::arg("tmpdir") = hictk::internal::TmpDir::default_temp_directory_path(), + nb::arg("compression_lvl") = 6, + "Open a .cool file for writing given a table of bins."); // NOLINTEND(*-avoid-magic-numbers) - writer.def("__repr__", &hictkpy::CoolerFileWriter::repr); + writer.def("__repr__", &hictkpy::CoolerFileWriter::repr, nb::rv_policy::move); - writer.def("path", &hictkpy::CoolerFileWriter::path, "Get the file path."); - writer.def("resolutions", &hictkpy::CoolerFileWriter::resolution, "Get the resolution in bp."); - writer.def("chromosomes", &get_chromosomes_from_file, - nb::arg("include_all") = false, - "Get chromosomes sizes as a dictionary mapping names to sizes."); + writer.def("path", &hictkpy::CoolerFileWriter::path, "Get the file path.", nb::rv_policy::copy); + writer.def("resolution", &hictkpy::CoolerFileWriter::resolution, "Get the resolution in bp."); + writer.def("chromosomes", &get_chromosomes_from_object, + nb::arg("include_ALL") = false, + "Get chromosomes sizes as a dictionary mapping names to sizes.", + nb::rv_policy::take_ownership); + writer.def("bins", &get_bins_from_object, "Get table of bins.", + nb::sig("def bins(self) -> hictkpy.BinTable"), nb::rv_policy::move); writer.def("add_pixels", &hictkpy::CoolerFileWriter::add_pixels, + nb::call_guard(), nb::sig("def add_pixels(self, pixels: pandas.DataFrame)"), nb::arg("pixels"), "Add pixels from a pandas DataFrame containing pixels in COO or BG2 format (i.e. " "either with columns=[bin1_id, bin2_id, count] or with columns=[chrom1, start1, end1, " "chrom2, start2, end2, count]."); // NOLINTBEGIN(*-avoid-magic-numbers) - writer.def("finalize", &hictkpy::CoolerFileWriter::finalize, nb::arg("log_lvl") = "WARN", + writer.def("finalize", &hictkpy::CoolerFileWriter::finalize, + nb::call_guard(), nb::arg("log_lvl") = "WARN", nb::arg("chunk_size") = 500'000, nb::arg("update_frequency") = 10'000'000, - "Write interactions to file."); + "Write interactions to file.", nb::rv_policy::move); // NOLINTEND(*-avoid-magic-numbers) } } // namespace hictkpy diff --git a/src/file.cpp b/src/file.cpp index 52a8c28..29ba846 100644 --- a/src/file.cpp +++ b/src/file.cpp @@ -36,6 +36,7 @@ #include "hictkpy/nanobind.hpp" #include "hictkpy/pixel_selector.hpp" #include "hictkpy/reference.hpp" +#include "hictkpy/to_pyarrow.hpp" namespace nb = nanobind; @@ -43,9 +44,34 @@ namespace hictkpy::file { static void ctor(hictk::File *fp, const std::filesystem::path &path, std::optional resolution, std::string_view matrix_type, std::string_view matrix_unit) { - new (fp) hictk::File{path.string(), static_cast(resolution.value_or(0)), - hictk::hic::ParseMatrixTypeStr(std::string{matrix_type}), - hictk::hic::ParseUnitStr(std::string{matrix_unit})}; + const auto resolution_ = static_cast(resolution.value_or(0)); + try { + new (fp) hictk::File{path.string(), resolution_, + hictk::hic::ParseMatrixTypeStr(std::string{matrix_type}), + hictk::hic::ParseUnitStr(std::string{matrix_unit})}; + // TODO all the exceptions should ideally be handled on the hictk side + // but this will have to do until the next release of hictk + } catch (const HighFive::Exception &e) { + std::string_view msg{e.what()}; + if (msg.find("Unable to open the group \"/resolutions/0\"") != std::string_view::npos) { + throw std::runtime_error( + "resolution is required and cannot be None when opening .mcool files"); + } + throw; + } catch (const std::runtime_error &e) { + std::string_view msg{e.what()}; + if (msg.find("resolution cannot be 0 when opening .hic files") != std::string_view::npos) { + throw std::runtime_error("resolution is required and cannot be None when opening .hic files"); + } + throw; + } + + if (resolution.has_value() && fp->resolution() != resolution_) { + // TODO this should also be handled by hictk + throw std::runtime_error( + fmt::format(FMT_STRING("resolution mismatch for file \"{}\": expected {}, found {}"), + fp->uri(), resolution_, fp->resolution())); + } } static std::string repr(const hictk::File &f) { @@ -58,8 +84,9 @@ bool is_cooler(const std::filesystem::path &uri) { bool is_hic(const std::filesystem::path &uri) { return hictk::hic::utils::is_hic_file(uri); } -static hictkpy::PixelSelector fetch(const hictk::File &f, std::string_view range1, - std::string_view range2, std::string_view normalization, +static hictkpy::PixelSelector fetch(const hictk::File &f, std::optional range1, + std::optional range2, + std::optional normalization, std::string_view count_type, bool join, std::string_view query_type) { if (count_type != "float" && count_type != "int") { @@ -70,15 +97,17 @@ static hictkpy::PixelSelector fetch(const hictk::File &f, std::string_view range throw std::runtime_error("query_type should be either UCSC or BED"); } - if (normalization != "NONE") { + const hictk::balancing::Method normalization_method{normalization.value_or("NONE")}; + + if (normalization_method != hictk::balancing::Method::NONE()) { count_type = "float"; } - if (range1.empty()) { - assert(range2.empty()); + if (!range1.has_value() || range1->empty()) { + assert(!range2.has_value() || range2->empty()); return std::visit( [&](const auto &ff) { - auto sel = ff.fetch(hictk::balancing::Method{normalization}); + auto sel = ff.fetch(normalization_method); using SelT = decltype(sel); return hictkpy::PixelSelector(std::make_shared(std::move(sel)), count_type, join); @@ -86,20 +115,21 @@ static hictkpy::PixelSelector fetch(const hictk::File &f, std::string_view range f.get()); } - if (range2.empty()) { + if (!range2.has_value() || range2->empty()) { range2 = range1; } const auto query_type_ = query_type == "UCSC" ? hictk::GenomicInterval::Type::UCSC : hictk::GenomicInterval::Type::BED; - const auto gi1 = hictk::GenomicInterval::parse(f.chromosomes(), std::string{range1}, query_type_); - const auto gi2 = hictk::GenomicInterval::parse(f.chromosomes(), std::string{range2}, query_type_); + const auto gi1 = + hictk::GenomicInterval::parse(f.chromosomes(), std::string{*range1}, query_type_); + const auto gi2 = + hictk::GenomicInterval::parse(f.chromosomes(), std::string{*range2}, query_type_); return std::visit( [&](const auto &ff) { - // Workaround bug fixed in https://github.com/paulsengroup/hictk/pull/158 - auto sel = ff.fetch(fmt::format(FMT_STRING("{}"), gi1), fmt::format(FMT_STRING("{}"), gi2), - hictk::balancing::Method(normalization)); + auto sel = ff.fetch(gi1.chrom().name(), gi1.start(), gi1.end(), gi2.chrom().name(), + gi2.start(), gi2.end(), normalization_method); using SelT = decltype(sel); return hictkpy::PixelSelector(std::make_shared(std::move(sel)), count_type, @@ -164,7 +194,7 @@ static nb::dict get_hic_attrs(const hictk::hic::File &hf) { py_attrs["bin_size"] = hf.resolution(); py_attrs["format"] = "HIC"; - py_attrs["format_version"] = hf.version(); + py_attrs["format-version"] = hf.version(); py_attrs["assembly"] = hf.assembly(); py_attrs["format-url"] = "https://github.com/aidenlab/hic-format"; py_attrs["nbins"] = hf.bins().size(); @@ -189,11 +219,62 @@ static std::vector avail_normalizations(const hictk::File &f) { return norms; } -static std::vector weights(const hictk::File &f, std::string_view normalization, - bool divisive) { +static auto weights(const hictk::File &f, std::string_view normalization, bool divisive) { + using WeightVector = nb::ndarray, nb::c_contig, double>; + + if (normalization == "NONE") { + return WeightVector{}; + } + + const auto type = divisive ? hictk::balancing::Weights::Type::DIVISIVE + : hictk::balancing::Weights::Type::MULTIPLICATIVE; + + // NOLINTNEXTLINE + auto *weights_ptr = new std::vector(f.normalization(normalization).to_vector(type)); + + auto capsule = nb::capsule(weights_ptr, [](void *vect_ptr) noexcept { + delete reinterpret_cast *>(vect_ptr); // NOLINT + }); + + return WeightVector{weights_ptr->data(), {weights_ptr->size()}, capsule}; +} + +static nb::object weights_df(const hictk::File &f, const std::vector &normalizations, + bool divisive) { + phmap::flat_hash_set names(normalizations.size()); + arrow::FieldVector fields(normalizations.size()); + std::vector> columns(normalizations.size()); + + fields.clear(); + columns.clear(); + const auto type = divisive ? hictk::balancing::Weights::Type::DIVISIVE : hictk::balancing::Weights::Type::MULTIPLICATIVE; - return f.normalization(normalization).to_vector(type); + + for (const auto &normalization : normalizations) { + if (normalization == "NONE") { + fields.resize(fields.size() - 1); + columns.resize(columns.size() - 1); + continue; + } + + if (names.contains(normalization)) { + throw std::runtime_error(fmt::format( + FMT_STRING("found duplicated value \"{}\" in the provided normalization name list"), + normalization)); + } + + names.emplace(normalization); + fields.emplace_back(arrow::field(normalization, arrow::float64(), false)); + columns.emplace_back(std::make_shared( + f.nbins(), arrow::Buffer::FromVector(f.normalization(normalization).to_vector(type)), + nullptr, 0, 0)); + } + + auto schema = std::make_shared(std::move(fields)); + + return export_pyarrow_table(arrow::Table::Make(std::move(schema), columns)) + .attr("to_pandas")(nb::arg("self_destruct") = true); } static std::filesystem::path get_path(const hictk::File &f) { return f.path(); } @@ -208,36 +289,47 @@ void declare_file_class(nb::module_ &m) { "resolution.\n" "Resolution is ignored when opening single-resolution Cooler files."); - file.def("__repr__", &file::repr); + file.def("__repr__", &file::repr, nb::rv_policy::move); - file.def("uri", &hictk::File::uri, "Return the file URI."); - file.def("path", &file::get_path, "Return the file path."); + file.def("uri", &hictk::File::uri, "Return the file URI.", nb::rv_policy::move); + file.def("path", &file::get_path, "Return the file path.", nb::rv_policy::move); file.def("is_hic", &hictk::File::is_hic, "Test whether file is in .hic format."); file.def("is_cooler", &hictk::File::is_cooler, "Test whether file is in .cool format."); - file.def("chromosomes", &get_chromosomes_from_file, nb::arg("include_all") = false, - "Get chromosomes sizes as a dictionary mapping names to sizes."); - file.def("bins", &get_bins_from_file, nb::sig("def bins(self) -> pandas.DataFrame"), - "Get bins as a pandas DataFrame."); + file.def("chromosomes", &get_chromosomes_from_object, nb::arg("include_ALL") = false, + "Get chromosomes sizes as a dictionary mapping names to sizes.", + nb::rv_policy::take_ownership); + file.def("bins", &get_bins_from_object, "Get table of bins.", + nb::sig("def bins(self) -> hictkpy.BinTable"), nb::rv_policy::move); file.def("resolution", &hictk::File::resolution, "Get the bin size in bp."); file.def("nbins", &hictk::File::nbins, "Get the total number of bins."); - file.def("nchroms", &hictk::File::nchroms, "Get the total number of chromosomes."); + file.def("nchroms", &hictk::File::nchroms, nb::arg("include_ALL") = false, + "Get the total number of chromosomes."); - file.def("attributes", &file::attributes, "Get file attributes as a dictionary."); + file.def("attributes", &file::attributes, "Get file attributes as a dictionary.", + nb::rv_policy::take_ownership); - file.def("fetch", &file::fetch, nb::keep_alive<0, 1>(), nb::arg("range1") = "", - nb::arg("range2") = "", nb::arg("normalization") = "NONE", nb::arg("count_type") = "int", - nb::arg("join") = false, nb::arg("query_type") = "UCSC", - "Fetch interactions overlapping a region of interest."); + file.def("fetch", &file::fetch, nb::keep_alive<0, 1>(), nb::arg("range1") = nb::none(), + nb::arg("range2") = nb::none(), nb::arg("normalization") = nb::none(), + nb::arg("count_type") = "int", nb::arg("join") = false, nb::arg("query_type") = "UCSC", + "Fetch interactions overlapping a region of interest.", nb::rv_policy::move); file.def("avail_normalizations", &file::avail_normalizations, - "Get the list of available normalizations."); + "Get the list of available normalizations.", nb::rv_policy::move); file.def("has_normalization", &hictk::File::has_normalization, nb::arg("normalization"), "Check whether a given normalization is available."); file.def("weights", &file::weights, nb::arg("name"), nb::arg("divisive") = true, - "Fetch the balancing weights for the given normalization method."); + "Fetch the balancing weights for the given normalization method.", + nb::rv_policy::take_ownership); + file.def( + "weights", &file::weights_df, nb::arg("names"), nb::arg("divisive") = true, + "Fetch the balancing weights for the given normalization methods." + "Weights are returned as a pandas.DataFrame.", + nb::sig("def weights(self, names: collections.abc.Sequence[str], divisive: bool = True) -> " + "pandas.DataFrame"), + nb::rv_policy::take_ownership); } } // namespace hictkpy::file diff --git a/src/hic_file_writer.cpp b/src/hic_file_writer.cpp index f09a7d9..edad8a9 100644 --- a/src/hic_file_writer.cpp +++ b/src/hic_file_writer.cpp @@ -10,14 +10,17 @@ #include #include #include +#include #include #include +#include #include #include #include #include #include +#include "hictkpy/bin_table.hpp" #include "hictkpy/common.hpp" #include "hictkpy/nanobind.hpp" #include "hictkpy/pixel.hpp" @@ -27,6 +30,24 @@ namespace nb = nanobind; namespace hictkpy { +[[nodiscard]] static ChromosomeDict get_chromosomes_checked(const hictk::BinTable &bins) { + if (bins.type() != hictk::BinTable::Type::fixed) { + throw std::runtime_error( + "constructing .hic files is only supported when the BinTable has a uniform bin size"); + } + + ChromosomeDict chroms{}; + for (const auto &chrom : bins.chromosomes()) { + if (chrom.is_all()) { + continue; + } + const auto *chrom_name = chrom.name().data(); + chroms[chrom_name] = chrom.size(); + } + + return chroms; +} + HiCFileWriter::HiCFileWriter(const std::filesystem::path &path_, const ChromosomeDict &chromosomes, const std::vector &resolutions_, std::string_view assembly, std::size_t n_threads, @@ -46,7 +67,15 @@ HiCFileWriter::HiCFileWriter(const std::filesystem::path &path_, const Chromosom : HiCFileWriter(path_, chromosomes, std::vector{resolution}, assembly, n_threads, chunk_size, tmpdir, compression_lvl, skip_all_vs_all_matrix) {} -void HiCFileWriter::finalize([[maybe_unused]] std::string_view log_lvl_str) { +HiCFileWriter::HiCFileWriter(const std::filesystem::path &path_, const hictkpy::BinTable &bins_, + std::string_view assembly, std::size_t n_threads, + std::size_t chunk_size, const std::filesystem::path &tmpdir, + std::uint32_t compression_lvl, bool skip_all_vs_all_matrix) + : HiCFileWriter(path_, get_chromosomes_checked(*bins_.get()), bins_.get()->resolution(), + assembly, n_threads, chunk_size, tmpdir, compression_lvl, + skip_all_vs_all_matrix) {} + +hictk::File HiCFileWriter::finalize([[maybe_unused]] std::string_view log_lvl_str) { if (_finalized) { throw std::runtime_error( fmt::format(FMT_STRING("finalize() was already called on file \"{}\""), _w.path())); @@ -66,28 +95,45 @@ void HiCFileWriter::finalize([[maybe_unused]] std::string_view log_lvl_str) { } SPDLOG_INFO(FMT_STRING("successfully finalized \"{}\"!"), _w.path()); spdlog::default_logger()->set_level(previous_lvl); + + return hictk::File{std::string{_w.path()}, _w.resolutions().front()}; } std::filesystem::path HiCFileWriter::path() const noexcept { return std::filesystem::path{_w.path()}; } -const std::vector &HiCFileWriter::resolutions() const noexcept { - return _w.resolutions(); +auto HiCFileWriter::resolutions() const { + using WeightVector = nb::ndarray, nb::c_contig, std::uint32_t>; + + // NOLINTNEXTLINE + auto *resolutions_ptr = new std::vector(_w.resolutions()); + + auto capsule = nb::capsule(resolutions_ptr, [](void *vect_ptr) noexcept { + delete reinterpret_cast *>(vect_ptr); // NOLINT + }); + + return WeightVector{resolutions_ptr->data(), {resolutions_ptr->size()}, capsule}; } const hictk::Reference &HiCFileWriter::chromosomes() const { return _w.chromosomes(); } +hictkpy::BinTable HiCFileWriter::bins(std::uint32_t resolution) const { + return hictkpy::BinTable{_w.bins(resolution)}; +} + void HiCFileWriter::add_pixels(const nb::object &df) { if (_finalized) { throw std::runtime_error( "caught attempt to add_pixels to a .hic file that has already been finalized!"); } + auto lck = std::make_optional(); const auto coo_format = nb::cast(df.attr("columns").attr("__contains__")("bin1_id")); const auto pixels = coo_format ? coo_df_to_thin_pixels(df, false) : bg2_df_to_thin_pixels(_w.bins(_w.resolutions().front()), df, false); + lck.reset(); SPDLOG_INFO(FMT_STRING("adding {} pixels to file \"{}\"..."), pixels.size(), _w.path()); _w.add_pixels(_w.resolutions().front(), pixels.begin(), pixels.end()); } @@ -111,35 +157,52 @@ void HiCFileWriter::bind(nb::module_ &m) { nb::arg("chunk_size") = 10'000'000, nb::arg("tmpdir") = hictk::internal::TmpDir::default_temp_directory_path(), nb::arg("compression_lvl") = 10, nb::arg("skip_all_vs_all_matrix") = false, - "Open a .hic file for writing."); - - writer.def(nb::init &, std::string_view, std::size_t, - std::size_t, const std::filesystem::path &, std::uint32_t, bool>(), - nb::arg("path"), nb::arg("chromosomes"), nb::arg("resolutions"), - nb::arg("assembly") = "unknown", nb::arg("n_threads") = 1, - nb::arg("chunk_size") = 10'000'000, - nb::arg("tmpdir") = hictk::internal::TmpDir::default_temp_directory_path(), - nb::arg("compression_lvl") = 10, nb::arg("skip_all_vs_all_matrix") = false, - "Open a .hic file for writing."); + "Open a .hic file for writing given a list of chromosomes with their sizes and one " + "resolution."); + + writer.def( + nb::init &, std::string_view, std::size_t, std::size_t, + const std::filesystem::path &, std::uint32_t, bool>(), + nb::arg("path"), nb::arg("chromosomes"), nb::arg("resolutions"), + nb::arg("assembly") = "unknown", nb::arg("n_threads") = 1, nb::arg("chunk_size") = 10'000'000, + nb::arg("tmpdir") = hictk::internal::TmpDir::default_temp_directory_path(), + nb::arg("compression_lvl") = 10, nb::arg("skip_all_vs_all_matrix") = false, + "Open a .hic file for writing given a list of chromosomes with their sizes and one or more " + "resolutions."); + + writer.def( + nb::init(), + nb::arg("path"), nb::arg("bins"), nb::arg("assembly") = "unknown", nb::arg("n_threads") = 1, + nb::arg("chunk_size") = 10'000'000, + nb::arg("tmpdir") = hictk::internal::TmpDir::default_temp_directory_path(), + nb::arg("compression_lvl") = 10, nb::arg("skip_all_vs_all_matrix") = false, + "Open a .hic file for writing given a BinTable. Only BinTable with a fixed bin size are " + "supported."); // NOLINTEND(*-avoid-magic-numbers) - writer.def("__repr__", &hictkpy::HiCFileWriter::repr); + writer.def("__repr__", &hictkpy::HiCFileWriter::repr, nb::rv_policy::move); - writer.def("path", &hictkpy::HiCFileWriter::path, "Get the file path."); + writer.def("path", &hictkpy::HiCFileWriter::path, "Get the file path.", nb::rv_policy::move); writer.def("resolutions", &hictkpy::HiCFileWriter::resolutions, - "Get the list of resolutions in bp."); - writer.def("chromosomes", &get_chromosomes_from_file, - nb::arg("include_all") = false, - "Get chromosomes sizes as a dictionary mapping names to sizes."); + "Get the list of resolutions in bp.", nb::rv_policy::take_ownership); + writer.def("chromosomes", &get_chromosomes_from_object, + nb::arg("include_ALL") = false, + "Get chromosomes sizes as a dictionary mapping names to sizes.", + nb::rv_policy::take_ownership); + writer.def("bins", &hictkpy::HiCFileWriter::bins, "Get table of bins for the given resolution.", + nb::sig("def bins(self, resolution: int) -> hictkpy.BinTable"), nb::rv_policy::move); writer.def("add_pixels", &hictkpy::HiCFileWriter::add_pixels, + nb::call_guard(), nb::sig("def add_pixels(self, pixels: pd.DataFrame) -> None"), nb::arg("pixels"), "Add pixels from a pandas DataFrame containing pixels in COO or BG2 format (i.e. " "either with columns=[bin1_id, bin2_id, count] or with columns=[chrom1, start1, end1, " "chrom2, start2, end2, count]."); - writer.def("finalize", &hictkpy::HiCFileWriter::finalize, nb::arg("log_lvl") = "WARN", - "Write interactions to file."); + writer.def("finalize", &hictkpy::HiCFileWriter::finalize, + nb::call_guard(), nb::arg("log_lvl") = "WARN", + "Write interactions to file.", nb::rv_policy::move); } } // namespace hictkpy diff --git a/src/hictkpy.cpp b/src/hictkpy.cpp index dbf6e27..ebb57b7 100644 --- a/src/hictkpy.cpp +++ b/src/hictkpy.cpp @@ -2,14 +2,12 @@ // // SPDX-License-Identifier: MIT -#include -#include #include #include #include -#include +#include "hictkpy/bin_table.hpp" #include "hictkpy/cooler_file_writer.hpp" #include "hictkpy/file.hpp" #include "hictkpy/hic_file_writer.hpp" @@ -34,12 +32,6 @@ namespace hictkpy { NB_MODULE(_hictkpy, m) { [[maybe_unused]] const auto logger = init_logger(); - if (arrow::py::import_pyarrow() == -1) { - throw std::runtime_error("failed to initialize pyarrow runtime"); - } - - m.attr("__hictkpy_arrow_version__") = - std::make_tuple(ARROW_VERSION_MAJOR, ARROW_VERSION_MINOR, ARROW_VERSION_PATCH); m.attr("__hictk_version__") = hictk::config::version::str(); m.doc() = "Blazing fast toolkit to work with .hic and .cool files."; @@ -57,6 +49,8 @@ NB_MODULE(_hictkpy, m) { declare_pixel_class(m, "Int"); declare_pixel_class(m, "FP"); + BinTable::bind(m); + PixelSelector::bind(m); file::declare_file_class(m); diff --git a/src/hictkpy/__init__.py b/src/hictkpy/__init__.py index e68a928..6cb32fc 100644 --- a/src/hictkpy/__init__.py +++ b/src/hictkpy/__init__.py @@ -3,29 +3,15 @@ # SPDX-License-Identifier: MIT -def _load_pyarrow_and_check_abi_compat(): - import pyarrow as pa +def _get_hictkpy_version() -> str: + from importlib.metadata import version - from ._hictkpy import __hictkpy_arrow_version__ + return version("hictkpy") - major, minor, patch = __hictkpy_arrow_version__ - - if not pa.__version__.startswith(f"{major}.{minor}"): - raise ImportError( - "Detected Arrow ABI version mismatch!\n" - f"hictkpy was compiled with Arrow v{major}.{minor}.{patch}, which is not ABI compatible with the currently " - f"installed version of pyarrow (v{pa.__version__}).\n" - 'Please install a compatible version of pyarrow with e.g. "pip install ' - f'pyarrow=={major}.{minor}".' - ) - - -_load_pyarrow_and_check_abi_compat() - - -from importlib.metadata import version from ._hictkpy import ( + Bin, + BinTable, File, MultiResFile, PixelSelector, @@ -39,9 +25,11 @@ def _load_pyarrow_and_check_abi_compat(): is_scool_file, ) -__version__ = version("hictkpy") +__version__ = _get_hictkpy_version() __all__ = [ "__doc__", + "Bin", + "BinTable", "File", "MultiResFile", "PixelSelector", diff --git a/src/include/hictkpy/bin_table.hpp b/src/include/hictkpy/bin_table.hpp index d53e292..59c0ff4 100644 --- a/src/include/hictkpy/bin_table.hpp +++ b/src/include/hictkpy/bin_table.hpp @@ -4,46 +4,73 @@ #pragma once -#include +#include #include -#include -#include +#include +#include +#include +#include +#include #include +#include #include -#include "hictkpy/dynamic_1d_array.hpp" #include "hictkpy/nanobind.hpp" +#include "hictkpy/reference.hpp" namespace hictkpy { -template -inline nanobind::object get_bins_from_file(const File &f) { - auto pd = import_module_checked("pandas"); - - Dynamic1DA chrom_ids{}; - Dynamic1DA starts{}; - Dynamic1DA ends{}; - for (const auto &bin : f.bins()) { - chrom_ids.push_back(static_cast(bin.chrom().id())); - starts.push_back(static_cast(bin.start())); - ends.push_back(static_cast(bin.end())); - } - - std::vector chrom_names{}; - std::transform(f.chromosomes().begin(), f.chromosomes().end(), std::back_inserter(chrom_names), - [&](const hictk::Chromosome &chrom) { return std::string{chrom.name()}; }); - - nanobind::dict py_bins_dict{}; // NOLINT - - py_bins_dict["chrom"] = - pd.attr("Categorical") - .attr("from_codes")(chrom_ids(), nanobind::arg("categories") = chrom_names, - nanobind::arg("validate") = false); - py_bins_dict["start"] = pd.attr("Series")(starts(), nanobind::arg("copy") = false); - py_bins_dict["end"] = pd.attr("Series")(ends(), nanobind::arg("copy") = false); - - auto df = pd.attr("DataFrame")(py_bins_dict, nanobind::arg("copy") = false); - return df; +class BinTable { + std::shared_ptr _bins = std::make_shared(); + + public: + using BinIDsVec = nanobind::ndarray, std::int64_t>; + + BinTable() = default; + explicit BinTable(std::shared_ptr bins) noexcept; + explicit BinTable(hictk::BinTable bins); + BinTable(const ChromosomeDict& chromosomes, std::uint32_t resolution); + explicit BinTable(const nanobind::object& df); + BinTable(hictk::Reference chroms, const std::vector& start_pos, + const std::vector& end_pos); + + [[nodiscard]] const hictk::Reference& chromosomes() const noexcept; + [[nodiscard]] std::uint32_t resolution() const noexcept; + [[nodiscard]] std::string_view type() const noexcept; + [[nodiscard]] std::size_t size() const noexcept; + + [[nodiscard]] std::string repr() const; + + [[nodiscard]] hictk::Bin bin_id_to_coord(std::uint64_t bin_id) const; + [[nodiscard]] nanobind::object bin_ids_to_coords(std::vector bin_ids) const; + + [[nodiscard]] hictk::Bin coord_to_bin(std::string_view chrom, std::uint32_t pos) const; + [[nodiscard]] nanobind::object coords_to_bins(const std::vector& chroms, + const std::vector& positions) const; + + [[nodiscard]] std::int64_t coord_to_bin_id(std::string_view chrom, std::uint32_t pos) const; + [[nodiscard]] auto coords_to_bin_ids(const std::vector& chroms, + const std::vector& positions) const + -> BinIDsVec; + + [[nodiscard]] nanobind::object merge_coords(nanobind::object df) const; + + [[nodiscard]] nanobind::iterator make_iterable() const; + + [[nodiscard]] nanobind::object to_df(std::optional range, + std::string_view query_type) const; + + [[nodiscard]] std::shared_ptr get() const noexcept; + + static void bind(nanobind::module_& m); + + private: + [[nodiscard]] std::vector chrom_names(bool include_ALL = false) const; +}; + +template +inline hictkpy::BinTable get_bins_from_object(const T& obj) { + return hictkpy::BinTable(obj.bins_ptr()); } } // namespace hictkpy diff --git a/src/include/hictkpy/cooler_file_writer.hpp b/src/include/hictkpy/cooler_file_writer.hpp index cfd1c50..ec90b4b 100644 --- a/src/include/hictkpy/cooler_file_writer.hpp +++ b/src/include/hictkpy/cooler_file_writer.hpp @@ -7,12 +7,14 @@ #include #include #include +#include #include #include #include #include #include +#include "hictkpy/bin_table.hpp" #include "hictkpy/nanobind.hpp" #include "hictkpy/reference.hpp" @@ -30,23 +32,28 @@ class CoolerFileWriter { CoolerFileWriter(std::filesystem::path path_, const ChromosomeDict& chromosomes_, std::uint32_t resolution_, std::string_view assembly, const std::filesystem::path& tmpdir, std::uint32_t compression_lvl); + CoolerFileWriter(std::filesystem::path path_, const hictkpy::BinTable& bins_, + std::string_view assembly, const std::filesystem::path& tmpdir, + std::uint32_t compression_lvl); [[nodiscard]] const std::filesystem::path& path() const noexcept; [[nodiscard]] std::uint32_t resolution() const noexcept; [[nodiscard]] const hictk::Reference& chromosomes() const; + [[nodiscard]] std::shared_ptr bins_ptr() const noexcept; void add_pixels(const nanobind::object& df); - void finalize(std::string_view log_lvl_str, std::size_t chunk_size, std::size_t update_frequency); + [[nodiscard]] hictk::File finalize(std::string_view log_lvl_str, std::size_t chunk_size, + std::size_t update_frequency); [[nodiscard]] std::string repr() const; static void bind(nanobind::module_& m); private: [[nodiscard]] static hictk::cooler::SingleCellFile create_file( - std::string_view path, const hictk::Reference& chromosomes, std::uint32_t resolution, - std::string_view assembly, const std::filesystem::path& tmpdir); + std::string_view path, const hictk::BinTable& bins, std::string_view assembly, + const std::filesystem::path& tmpdir); }; } // namespace hictkpy diff --git a/src/include/hictkpy/dynamic_1d_array.hpp b/src/include/hictkpy/dynamic_1d_array.hpp deleted file mode 100644 index e592e89..0000000 --- a/src/include/hictkpy/dynamic_1d_array.hpp +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright (C) 2024 Roberto Rossini -// -// SPDX-License-Identifier: MIT - -#pragma once - -#include -#include -#include - -#include "hictkpy/nanobind.hpp" - -namespace hictkpy { - -template -struct Dynamic1DA { - private: - using BufferT = nanobind::ndarray, T>; - using VectorT = decltype(std::declval().view()); - nanobind::object _dtype{}; - nanobind::object _np_array{}; - BufferT _buff{}; - VectorT _vector{}; - - std::int64_t _size{}; - std::int64_t _capacity{}; - - public: - explicit Dynamic1DA(std::size_t size_ = 1000); - void push_back(T x); - void emplace_back(T &&x); - void resize(std::int64_t new_size); - void grow(); - void shrink_to_fit(); - [[nodiscard]] auto operator()() -> BufferT; - - private: - [[nodiscard]] static nanobind::object np(); -}; - -} // namespace hictkpy - -#include "./impl/dynamic_1d_array_impl.hpp" diff --git a/src/include/hictkpy/file.hpp b/src/include/hictkpy/file.hpp index 186df5c..23cbf71 100644 --- a/src/include/hictkpy/file.hpp +++ b/src/include/hictkpy/file.hpp @@ -4,38 +4,15 @@ #pragma once -#include #include -#include -#include -#include -#include -#include #include "hictkpy/nanobind.hpp" -#include "hictkpy/pixel_selector.hpp" namespace hictkpy::file { -void ctor(hictk::File *fp, const std::filesystem::path &path, - std::optional resolution, std::string_view matrix_type, - std::string_view matrix_unit); - -[[nodiscard]] std::string repr(const hictk::File &f); [[nodiscard]] bool is_cooler(const std::filesystem::path &uri); [[nodiscard]] bool is_hic(const std::filesystem::path &uri); -[[nodiscard]] hictkpy::PixelSelector fetch(const hictk::File &f, std::string_view range1, - std::string_view range2, std::string_view normalization, - std::string_view count_type, bool join, - std::string_view query_type); - -[[nodiscard]] nanobind::dict attributes(const hictk::File &f); - -[[nodiscard]] std::vector avail_normalizations(const hictk::File &f); -[[nodiscard]] std::vector weights(const hictk::File &f, std::string_view normalization, - bool divisive = true); - void declare_file_class(nanobind::module_ &m); } // namespace hictkpy::file diff --git a/src/include/hictkpy/hic_file_writer.hpp b/src/include/hictkpy/hic_file_writer.hpp index 72a25c3..1041b72 100644 --- a/src/include/hictkpy/hic_file_writer.hpp +++ b/src/include/hictkpy/hic_file_writer.hpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -14,6 +15,7 @@ #include #include +#include "hictkpy/bin_table.hpp" #include "hictkpy/nanobind.hpp" #include "hictkpy/reference.hpp" @@ -33,15 +35,20 @@ class HiCFileWriter { std::uint32_t resolution, std::string_view assembly, std::size_t n_threads, std::size_t chunk_size, const std::filesystem::path& tmpdir, std::uint32_t compression_lvl, bool skip_all_vs_all_matrix); + HiCFileWriter(const std::filesystem::path& path_, const hictkpy::BinTable& bins_, + std::string_view assembly, std::size_t n_threads, std::size_t chunk_size, + const std::filesystem::path& tmpdir, std::uint32_t compression_lvl, + bool skip_all_vs_all_matrix); [[nodiscard]] std::filesystem::path path() const noexcept; - [[nodiscard]] const std::vector& resolutions() const noexcept; + [[nodiscard]] auto resolutions() const; [[nodiscard]] const hictk::Reference& chromosomes() const; + [[nodiscard]] hictkpy::BinTable bins(std::uint32_t resolution) const; void add_pixels(const nanobind::object& df); - void finalize(std::string_view log_lvl_str); + [[nodiscard]] hictk::File finalize(std::string_view log_lvl_str); [[nodiscard]] std::string repr() const; diff --git a/src/include/hictkpy/impl/dynamic_1d_array_impl.hpp b/src/include/hictkpy/impl/dynamic_1d_array_impl.hpp deleted file mode 100644 index ea83905..0000000 --- a/src/include/hictkpy/impl/dynamic_1d_array_impl.hpp +++ /dev/null @@ -1,81 +0,0 @@ -// Copyright (C) 2024 Roberto Rossini -// -// SPDX-License-Identifier: MIT - -#pragma once - -#include -#include -#include -#include - -#include "hictkpy/common.hpp" -#include "hictkpy/nanobind.hpp" - -namespace hictkpy { - -template -inline Dynamic1DA::Dynamic1DA(std::size_t size_) - : _dtype(np().attr("dtype")(map_type_to_dtype())), - _np_array( - np().attr("empty")(static_cast(size_), nanobind::arg("dtype") = _dtype)), - _buff(nanobind::cast(_np_array)), - _vector(_buff.view()), - _capacity(static_cast(size_)) {} - -template -inline void Dynamic1DA::push_back(T x) { - if (_capacity == _size) { - grow(); - } - _vector(_size++) = x; -} - -template -inline void Dynamic1DA::emplace_back(T &&x) { - if (_capacity == _size) { - grow(); - } - _vector(_size++) = std::move(x); -} - -template -inline void Dynamic1DA::resize(std::int64_t new_size) { - if (_capacity == new_size) { - return; - } - auto new_array = np().attr("empty")(new_size, nanobind::arg("dtype") = _dtype); - auto new_buff = nanobind::cast(new_array); - auto new_vector = new_buff.view(); - - _capacity = new_size; - _size = std::min(_capacity, _size); - std::copy(_vector.data(), _vector.data() + static_cast(_size), new_vector.data()); - - std::swap(new_array, _np_array); - std::swap(new_buff, _buff); - std::swap(new_vector, _vector); -} - -template -inline void Dynamic1DA::grow() { - resize(static_cast(_buff.size() * 2)); -} - -template -inline void Dynamic1DA::shrink_to_fit() { - resize(_size); -} - -template -[[nodiscard]] auto Dynamic1DA::operator()() -> BufferT { - shrink_to_fit(); - return _buff; -} - -template -[[nodiscard]] nanobind::object Dynamic1DA::np() { - return nanobind::module_::import_("numpy"); -} - -} // namespace hictkpy diff --git a/src/include/hictkpy/logger.hpp b/src/include/hictkpy/logger.hpp index 619cc02..11143d7 100644 --- a/src/include/hictkpy/logger.hpp +++ b/src/include/hictkpy/logger.hpp @@ -9,24 +9,20 @@ #include #include -#include "hictkpy/nanobind.hpp" - namespace hictkpy { class Logger { - nanobind::object _py_logger{}; std::shared_ptr _logger{}; public: - explicit Logger(spdlog::level::level_enum level_ = spdlog::level::warn); - explicit Logger(std::string_view level_); + explicit Logger(spdlog::level::level_enum level = spdlog::level::warn); + explicit Logger(std::string_view level); [[nodiscard]] std::shared_ptr get_logger(); private: - [[nodiscard]] static nanobind::object init_py_logger(); [[nodiscard]] static std::shared_ptr init_cpp_logger( - spdlog::level::level_enum level_, nanobind::object py_logger); + spdlog::level::level_enum level); }; } // namespace hictkpy diff --git a/src/include/hictkpy/nanobind.hpp b/src/include/hictkpy/nanobind.hpp index 3300554..d5676e6 100644 --- a/src/include/hictkpy/nanobind.hpp +++ b/src/include/hictkpy/nanobind.hpp @@ -28,15 +28,94 @@ HICTKPY_DISABLE_WARNING_USELESS_CAST HICTKPY_DISABLE_WARNING_POP // clang-format on +#include +#include #include inline nanobind::module_ import_module_checked(const std::string& module_name) { try { return nanobind::module_::import_(module_name.c_str()); } catch (nanobind::python_error& e) { - // NOLINTNEXTLINE(*-pro-type-vararg) + // NOLINTNEXTLINE(*-vararg) nanobind::raise_from(e, PyExc_ModuleNotFoundError, - "To enable %s support, please install %s with: pip install 'hictkpy[%s]'", + "To enable %s support, please install %s with: pip install 'hictkpy[%s]'\n" + "Alternatively, you can install hictkpy with all its dependencies by " + "running: pip install 'hictkpy[all]'", module_name.c_str(), module_name.c_str(), module_name.c_str()); } } + +// NOLINTNEXTLINE(*-avoid-magic-numbers) +inline nanobind::module_ import_pyarrow_checked(int min_version_major = 16, + int min_version_minor = 0, + int min_version_patch = 0) { + assert(min_version_major >= 0); + assert(min_version_minor >= 0); + assert(min_version_patch >= 0); + + static bool version_ok{false}; + static std::mutex mtx{}; + + auto pa = import_module_checked("pyarrow"); + + [[maybe_unused]] const auto lck = std::scoped_lock(mtx); + if (version_ok) { + return pa; + } + + static std::string error_msg{}; + error_msg.clear(); + try { + auto metadata = nanobind::module_::import_("importlib.metadata"); + + const auto version = nanobind::cast>( + metadata.attr("version")("pyarrow").attr("split")(".")); + if (version.size() < 3) { + throw nanobind::import_error( + "unable to detect pyarrow version: assuming pyarrow's version is not compatible: please " + "install a compatible version of pyarrow with: pip install 'hictkpy[pyarrow]'"); + } + + const auto major_version_found = std::stoi(version[0]); + const auto minor_version_found = std::stoi(version[1]); + const auto patch_version_found = std::stoi(version[2]); + + version_ok = major_version_found >= min_version_major; + version_ok |= + major_version_found == min_version_major && minor_version_found >= min_version_minor; + version_ok |= major_version_found == min_version_major && + minor_version_found == min_version_minor && + patch_version_found >= min_version_patch; + + if (!version_ok) { + // Poor man's formatting + error_msg = "pyarrow "; + for (const auto& tok : version) { + error_msg += tok + "."; + } + if (version.size() > 1) { + error_msg.pop_back(); + } + error_msg += + " is too old to be used with hictkpy: please " + "install a compatible version with: pip install 'hictkpy[pyarrow]'"; + throw nanobind::import_error(error_msg.c_str()); + } + } catch (const nanobind::builtin_exception&) { + throw; + } catch (const std::exception& e) { + error_msg = "unable to parse pyarrow version: "; + error_msg += e.what(); + error_msg += + ". Assuming pyarrow's version is not compatible: please " + "install a compatible version of pyarrow with: pip install 'hictkpy[pyarrow]'"; + throw nanobind::import_error(error_msg.c_str()); + } catch (...) { + throw nanobind::import_error( + "unable to parse pyarrow version: Assuming pyarrow's version is not compatible: please " + "install a compatible version of pyarrow with: pip install 'hictkpy[pyarrow]'"); + } + + assert(version_ok); + return pa; +} diff --git a/src/include/hictkpy/pixel.hpp b/src/include/hictkpy/pixel.hpp index 197bf33..c26d98c 100644 --- a/src/include/hictkpy/pixel.hpp +++ b/src/include/hictkpy/pixel.hpp @@ -113,13 +113,22 @@ inline std::vector> bg2_df_to_thin_pixels(const hictk::BinTa const auto &reference = bin_table.chromosomes(); std::vector> buffer(start1_np.size()); for (std::size_t i = 0; i < start1_np.size(); ++i) { + if (end1(i) < start1(i) || end2(i) < start2(i)) { + throw std::runtime_error(fmt::format( + FMT_STRING("Found an invalid pixel {} {} {} {} {} {} {}: bin end position cannot be " + "smaller than its start"), + nanobind::cast(chrom1[i]).c_str(), start1(i), end1(i), + nanobind::cast(chrom2[i]).c_str(), start2(i), end2(i), counts(i))); + } + auto bin1 = bin_table.at(reference.at(nanobind::cast(chrom1[i]).c_str()), start1(i)); auto bin2 = bin_table.at(reference.at(nanobind::cast(chrom2[i]).c_str()), start2(i)); - if (end1(i) - start1(i) > bin_table.resolution() || - end2(i) - start2(i) > bin_table.resolution()) { + if (bin_table.type() == hictk::BinTable::Type::fixed && + (end1(i) - start1(i) > bin_table.resolution() || + end2(i) - start2(i) > bin_table.resolution())) { throw std::runtime_error(fmt::format( FMT_STRING("Found an invalid pixel {} {} {} {} {} {} {}: pixel spans a " "distance greater than the bin size"), diff --git a/src/include/hictkpy/pixel_selector.hpp b/src/include/hictkpy/pixel_selector.hpp index 890a3b2..54d697d 100644 --- a/src/include/hictkpy/pixel_selector.hpp +++ b/src/include/hictkpy/pixel_selector.hpp @@ -48,11 +48,10 @@ struct PixelSelector { [[nodiscard]] std::string repr() const; - using PixelCoordTuple = - std::tuple; + using GenomicCoordTuple = std::tuple; - [[nodiscard]] auto get_coord1() const -> PixelCoordTuple; - [[nodiscard]] auto get_coord2() const -> PixelCoordTuple; + [[nodiscard]] auto get_coord1() const -> GenomicCoordTuple; + [[nodiscard]] auto get_coord2() const -> GenomicCoordTuple; [[nodiscard]] nanobind::iterator make_iterable() const; [[nodiscard]] nanobind::object to_arrow(std::string_view span = "upper_triangle") const; diff --git a/src/include/hictkpy/reference.hpp b/src/include/hictkpy/reference.hpp index 0e1dd53..675509f 100644 --- a/src/include/hictkpy/reference.hpp +++ b/src/include/hictkpy/reference.hpp @@ -16,11 +16,11 @@ using ChromosomeDict = nanobind::typed -inline nanobind::typed get_chromosomes_from_file( - const File &f, bool include_all = false) { +template +inline nanobind::typed get_chromosomes_from_object( + const T &obj, bool include_all = false) { nanobind::typed py_chroms{}; // NOLINT - for (const auto &chrom : f.chromosomes()) { + for (const auto &chrom : obj.chromosomes()) { if (!include_all && chrom.is_all()) { continue; } diff --git a/src/include/hictkpy/to_pyarrow.hpp b/src/include/hictkpy/to_pyarrow.hpp new file mode 100644 index 0000000..092c8dc --- /dev/null +++ b/src/include/hictkpy/to_pyarrow.hpp @@ -0,0 +1,20 @@ +// Copyright (C) 2024 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +#include + +#include "hictkpy/nanobind.hpp" + +namespace arrow { +class Table; + +} // namespace arrow + +namespace hictkpy { + +[[nodiscard]] nanobind::object export_pyarrow_table(std::shared_ptr arrow_table); + +} // namespace hictkpy diff --git a/src/logger.cpp b/src/logger.cpp index 6829c53..e663074 100644 --- a/src/logger.cpp +++ b/src/logger.cpp @@ -7,6 +7,11 @@ #include #include +#include +#include +#include +#include + #include "hictkpy/common.hpp" #include "hictkpy/nanobind.hpp" @@ -38,12 +43,11 @@ namespace hictkpy { // NOLINTEND(*-avoid-magic-numbers) } -Logger::Logger(spdlog::level::level_enum level_) - : _py_logger(init_py_logger()), _logger(init_cpp_logger(level_, _py_logger)) {} +Logger::Logger(spdlog::level::level_enum level) : _logger(init_cpp_logger(level)) {} -Logger::Logger(std::string_view level_) : Logger(spdlog::level::from_str(std::string{level_})) {} +Logger::Logger(std::string_view level) : Logger(spdlog::level::from_str(std::string{level})) {} -nb::object Logger::init_py_logger() { +[[nodiscard]] static nb::object get_py_logger() { const auto logging = nb::module_::import_("logging"); return logging.attr("getLogger")("hictkpy"); } @@ -51,13 +55,13 @@ nb::object Logger::init_py_logger() { std::shared_ptr Logger::get_logger() { return _logger; } std::shared_ptr Logger::init_cpp_logger( - [[maybe_unused]] spdlog::level::level_enum level_, [[maybe_unused]] nb::object py_logger) { + [[maybe_unused]] spdlog::level::level_enum level_) { #ifndef _WIN32 auto sink = std::make_shared( - // NOLINTNEXTLINE(*-unnecessary-value-param) - [logger = py_logger](const spdlog::details::log_msg& msg) { - logger.attr("log")(to_py_lvl(msg.level), - std::string_view{msg.payload.data(), msg.payload.size()}); + [logger = get_py_logger()](const spdlog::details::log_msg& msg) mutable { + [[maybe_unused]] const nb::gil_scoped_acquire gil{}; + auto msg_py = nb::str(msg.payload.data(), msg.payload.size()); + logger.attr("log")(to_py_lvl(msg.level), msg_py); }); sink->set_pattern("%v"); diff --git a/src/multires_file.cpp b/src/multires_file.cpp index a101ecb..f8b7121 100644 --- a/src/multires_file.cpp +++ b/src/multires_file.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include "hictkpy/nanobind.hpp" #include "hictkpy/reference.hpp" @@ -27,6 +28,56 @@ static std::string repr(const hictk::MultiResFile& mrf) { static std::filesystem::path get_path(const hictk::MultiResFile& mrf) { return mrf.path(); } +[[nodiscard]] static auto get_resolutions(const hictk::MultiResFile& f) { + using WeightVector = nb::ndarray, nb::c_contig, std::uint32_t>; + + // NOLINTNEXTLINE + auto* resolutions_ptr = new std::vector(f.resolutions()); + + auto capsule = nb::capsule(resolutions_ptr, [](void* vect_ptr) noexcept { + delete reinterpret_cast*>(vect_ptr); // NOLINT + }); + + return WeightVector{resolutions_ptr->data(), {resolutions_ptr->size()}, capsule}; +} + +static nb::dict get_attrs(const hictk::hic::File& hf) { + nb::dict py_attrs; + + py_attrs["format"] = "HIC"; + py_attrs["format-version"] = hf.version(); + py_attrs["assembly"] = hf.assembly(); + py_attrs["format-url"] = "https://github.com/aidenlab/hic-format"; + py_attrs["nchroms"] = hf.nchroms(); + + for (const auto& [k, v] : hf.attributes()) { + py_attrs[nb::cast(k)] = v; + } + + return py_attrs; +} + +static nb::dict get_attrs(const hictk::cooler::MultiResFile& mclr) { + nb::dict py_attrs; + + py_attrs["format"] = mclr.attributes().format; + py_attrs["format-version"] = mclr.attributes().format_version; + py_attrs["format-url"] = "https://github.com/open2c/cooler"; + py_attrs["assembly"] = + mclr.open(mclr.resolutions().front()).attributes().assembly.value_or("unknown"); + py_attrs["nchroms"] = mclr.chromosomes().size(); + + return py_attrs; +} + +static nb::dict attributes(const hictk::MultiResFile& f) { + auto attrs = f.is_hic() ? get_attrs(f.open(f.resolutions().front()).get()) + : get_attrs(hictk::cooler::MultiResFile{f.path()}); + attrs["resolutions"] = get_resolutions(f); + + return attrs; +} + bool is_mcool_file(const std::filesystem::path& path) { return bool(hictk::cooler::utils::is_multires_file(path.string())); } @@ -37,16 +88,23 @@ void declare_multires_file_class(nb::module_& m) { mres_file.def("__init__", &multires_file::ctor, nb::arg("path"), "Open a multi-resolution Cooler file (.mcool) or .hic file."); - mres_file.def("__repr__", &multires_file::repr); + mres_file.def("__repr__", &multires_file::repr, nb::rv_policy::move); - mres_file.def("path", &multires_file::get_path, "Get the file path."); - mres_file.def("chromosomes", &get_chromosomes_from_file, - nb::arg("include_all") = false, - "Get chromosomes sizes as a dictionary mapping names to sizes."); - mres_file.def("resolutions", &hictk::MultiResFile::resolutions, - "Get the list of available resolutions."); + mres_file.def("path", &multires_file::get_path, "Get the file path.", nb::rv_policy::move); + mres_file.def("is_mcool", &hictk::MultiResFile::is_mcool, + "Test whether the file is in .mcool format."); + mres_file.def("is_hic", &hictk::MultiResFile::is_hic, "Test whether the file is in .hic format."); + mres_file.def("chromosomes", &get_chromosomes_from_object, + nb::arg("include_ALL") = false, + "Get chromosomes sizes as a dictionary mapping names to sizes.", + nb::rv_policy::take_ownership); + mres_file.def("resolutions", &get_resolutions, "Get the list of available resolutions.", + nb::rv_policy::take_ownership); + mres_file.def("attributes", &multires_file::attributes, "Get file attributes as a dictionary.", + nb::rv_policy::take_ownership); mres_file.def("__getitem__", &hictk::MultiResFile::open, - "Open the Cooler or .hic file corresponding to the resolution given as input."); + "Open the Cooler or .hic file corresponding to the resolution given as input.", + nb::rv_policy::move); } } // namespace hictkpy::multires_file diff --git a/src/pixel_selector.cpp b/src/pixel_selector.cpp index 4a0969e..50c650a 100644 --- a/src/pixel_selector.cpp +++ b/src/pixel_selector.cpp @@ -7,7 +7,6 @@ #include #endif -#include #include #include @@ -36,6 +35,7 @@ #include "hictkpy/nanobind.hpp" #include "hictkpy/pixel_selector.hpp" +#include "hictkpy/to_pyarrow.hpp" namespace nb = nanobind; @@ -66,7 +66,9 @@ std::string PixelSelector::repr() const { count_type_to_str(pixel_count)); } - return fmt::format(FMT_STRING("PixelSelector({}, {}; {}; {})"), coord1(), coord2(), + return fmt::format(FMT_STRING("PixelSelector({}:{}-{}; {}:{}-{}; {}; {})"), + coord1().bin1.chrom().name(), coord1().bin1.start(), coord1().bin2.end(), + coord2().bin1.chrom().name(), coord2().bin1.start(), coord2().bin2.end(), pixel_format == PixelFormat::COO ? "COO" : "BG2", count_type_to_str(pixel_count)); } @@ -105,16 +107,24 @@ const hictk::BinTable& PixelSelector::bins() const noexcept { return std::visit([](const auto& s) -> const hictk::BinTable& { return s->bins(); }, selector); } -auto PixelSelector::get_coord1() const -> PixelCoordTuple { - const auto c = coord1(); - return PixelCoordTuple{std::make_tuple(c.bin1.chrom().name(), c.bin1.start(), c.bin1.end(), - c.bin2.chrom().name(), c.bin2.start(), c.bin2.end())}; +[[nodiscard]] static PixelSelector::GenomicCoordTuple coords_to_tuple( + const hictk::PixelCoordinates& coords, const hictk::BinTable& bins) { + if (!coords) { + return {"ALL", 0, static_cast(bins.size())}; + } + + assert(coords.bin1.chrom() == coords.bin2.chrom()); + + return {std::string{coords.bin1.chrom().name()}, static_cast(coords.bin1.start()), + static_cast(coords.bin2.end())}; } -auto PixelSelector::get_coord2() const -> PixelCoordTuple { - const auto c = coord2(); - return PixelCoordTuple{std::make_tuple(c.bin1.chrom().name(), c.bin1.start(), c.bin1.end(), - c.bin2.chrom().name(), c.bin2.start(), c.bin2.end())}; +auto PixelSelector::get_coord1() const -> GenomicCoordTuple { + return coords_to_tuple(coord1(), bins()); +} + +auto PixelSelector::get_coord2() const -> GenomicCoordTuple { + return coords_to_tuple(coord2(), bins()); } template @@ -185,8 +195,10 @@ template } nb::object PixelSelector::to_arrow(std::string_view span) const { + std::ignore = import_pyarrow_checked(); + const auto query_span = parse_span(span); - const auto table = std::visit( + auto table = std::visit( [&](const auto& sel_ptr) -> std::shared_ptr { assert(!!sel_ptr); return std::visit( @@ -202,12 +214,12 @@ nb::object PixelSelector::to_arrow(std::string_view span) const { }, selector); - return nb::steal(arrow::py::wrap_table(table)); + return export_pyarrow_table(std::move(table)); } nb::object PixelSelector::to_pandas(std::string_view span) const { import_module_checked("pandas"); - return to_arrow(span).attr("to_pandas")(); + return to_arrow(span).attr("to_pandas")(nb::arg("self_destruct") = true); } nb::object PixelSelector::to_df(std::string_view span) const { return to_pandas(span); } @@ -254,6 +266,8 @@ template } nb::object PixelSelector::to_numpy(std::string_view span) const { + std::ignore = import_module_checked("numpy"); + const auto query_span = parse_span(span); return std::visit( @@ -379,35 +393,37 @@ void PixelSelector::bind(nb::module_& m) { sel.def(nb::init, std::string_view, bool>(), nb::arg("selector"), nb::arg("type"), nb::arg("join")); - sel.def("__repr__", &PixelSelector::repr); + sel.def("__repr__", &PixelSelector::repr, nb::rv_policy::move); - sel.def("coord1", &PixelSelector::get_coord1, "Get query coordinates for the first dimension."); - sel.def("coord2", &PixelSelector::get_coord2, "Get query coordinates for the second dimension."); + sel.def("coord1", &PixelSelector::get_coord1, "Get query coordinates for the first dimension.", + nb::rv_policy::move); + sel.def("coord2", &PixelSelector::get_coord2, "Get query coordinates for the second dimension.", + nb::rv_policy::move); sel.def("__iter__", &PixelSelector::make_iterable, nb::keep_alive<0, 1>(), - nb::sig("def __iter__(self) -> PixelIterator"), - "Return an iterator over the selected pixels."); + nb::sig("def __iter__(self) -> hictkpy.PixelIterator"), + "Return an iterator over the selected pixels.", nb::rv_policy::take_ownership); sel.def("to_arrow", &PixelSelector::to_arrow, nb::arg("query_span") = "upper_triangle", nb::sig("def to_arrow(self, query_span: str = \"upper_triangle\") -> pyarrow.Table"), - "Retrieve interactions as a pandas DataFrame."); + "Retrieve interactions as a pyarrow.Table.", nb::rv_policy::take_ownership); sel.def("to_pandas", &PixelSelector::to_pandas, nb::arg("query_span") = "upper_triangle", nb::sig("def to_pandas(self, query_span: str = \"upper_triangle\") -> pandas.DataFrame"), - "Retrieve interactions as a pandas DataFrame."); + "Retrieve interactions as a pandas DataFrame.", nb::rv_policy::take_ownership); sel.def("to_df", &PixelSelector::to_df, nb::arg("query_span") = "upper_triangle", nb::sig("def to_df(self, query_span: str = \"upper_triangle\") -> pandas.DataFrame"), - "Alias to to_pandas()."); + "Alias to to_pandas().", nb::rv_policy::take_ownership); sel.def("to_numpy", &PixelSelector::to_numpy, nb::arg("query_span") = "full", nb::sig("def to_numpy(self, query_span: str = \"full\") -> numpy.ndarray"), - "Retrieve interactions as a numpy 2D matrix."); + "Retrieve interactions as a numpy 2D matrix.", nb::rv_policy::move); sel.def( "to_coo", &PixelSelector::to_coo, nb::arg("query_span") = "upper_triangle", nb::sig("def to_coo(self, query_span: str = \"upper_triangle\") -> scipy.sparse.coo_matrix"), - "Retrieve interactions as a SciPy COO matrix."); + "Retrieve interactions as a SciPy COO matrix.", nb::rv_policy::take_ownership); sel.def( "to_csr", &PixelSelector::to_csr, nb::arg("query_span") = "upper_triangle", nb::sig("def to_csr(self, query_span: str = \"upper_triangle\") -> scipy.sparse.csr_matrix"), - "Retrieve interactions as a SciPy CSR matrix."); + "Retrieve interactions as a SciPy CSR matrix.", nb::rv_policy::move); sel.def("nnz", &PixelSelector::nnz, "Get the number of non-zero entries for the current pixel selection."); diff --git a/src/singlecell_file.cpp b/src/singlecell_file.cpp index eaa2fcb..0179037 100644 --- a/src/singlecell_file.cpp +++ b/src/singlecell_file.cpp @@ -106,20 +106,25 @@ void declare_singlecell_file_class(nb::module_& m) { scell_file.def("__init__", &singlecell_file::ctor, nb::arg("path"), "Open a single-cell Cooler file (.scool)."); - scell_file.def("__repr__", &singlecell_file::repr); + scell_file.def("__repr__", &singlecell_file::repr, nb::rv_policy::move); - scell_file.def("path", &singlecell_file::get_path, "Get the file path."); + scell_file.def("path", &singlecell_file::get_path, "Get the file path.", nb::rv_policy::move); scell_file.def("resolution", &hictk::cooler::SingleCellFile::resolution, "Get the bin size in bp."); - scell_file.def("chromosomes", &get_chromosomes_from_file, - nb::arg("include_all") = false, - "Get chromosomes sizes as a dictionary mapping names to sizes."); - scell_file.def("bins", &get_bins_from_file, - nb::sig("def bins(self) -> pandas.DataFrame"), "Get bins as a pandas DataFrame."); - scell_file.def("attributes", &singlecell_file::get_attrs, "Get file attributes as a dictionary."); - scell_file.def("cells", &singlecell_file::get_cells, "Get the list of available cells."); + scell_file.def("chromosomes", &get_chromosomes_from_object, + nb::arg("include_ALL") = false, + "Get chromosomes sizes as a dictionary mapping names to sizes.", + nb::rv_policy::take_ownership); + scell_file.def("bins", &get_bins_from_object, + nb::sig("def bins(self) -> hictkpy.BinTable"), "Get table of bins.", + nb::rv_policy::move); + scell_file.def("attributes", &singlecell_file::get_attrs, "Get file attributes as a dictionary.", + nb::rv_policy::take_ownership); + scell_file.def("cells", &singlecell_file::get_cells, "Get the list of available cells.", + nb::rv_policy::move); scell_file.def("__getitem__", &singlecell_file::getitem, - "Open the Cooler file corresponding to the cell ID given as input."); + "Open the Cooler file corresponding to the cell ID given as input.", + nb::rv_policy::move); } } // namespace hictkpy::singlecell_file diff --git a/src/to_pyarrow.cpp b/src/to_pyarrow.cpp new file mode 100644 index 0000000..f2555f3 --- /dev/null +++ b/src/to_pyarrow.cpp @@ -0,0 +1,136 @@ +// Copyright (C) 2024 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#include "hictkpy/to_pyarrow.hpp" + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hictkpy/nanobind.hpp" + +namespace nb = nanobind; + +namespace hictkpy { + +// https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#pycapsule-standard +// https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html + +// NOLINTBEGIN(*-owning-memory,*-no-malloc) +static void release_arrow_schemaPyCapsule(PyObject* capsule) { + auto* schema = static_cast(PyCapsule_GetPointer(capsule, "arrow_schema")); + if (schema->release) { + schema->release(schema); + } + free(schema); +} + +static void release_arrow_array_streamPyCapsule(PyObject* capsule) { + auto* stream = + static_cast(PyCapsule_GetPointer(capsule, "arrow_array_stream")); + if (stream->release) { + stream->release(stream); + } + free(stream); +} + +[[nodiscard]] static nb::object export_arrow_schema(const arrow::Schema& schema_in) { + // https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowschema-export + auto* schema = static_cast(malloc(sizeof(ArrowSchema))); + if (!schema) { + throw std::bad_alloc(); + } + + const auto status = arrow::ExportSchema(schema_in, schema); + if (!status.ok()) { + free(schema); + throw std::runtime_error(fmt::format( + FMT_STRING("Failed to export arrow::Schema as ArrowSchema: {}"), status.message())); + } + + auto* capsule = + PyCapsule_New(static_cast(schema), "arrow_schema", release_arrow_schemaPyCapsule); + if (!capsule) { + if (schema->release) { + schema->release(schema); + } + free(schema); + throw std::runtime_error("Failed to create PyCapsule for arrow_schema"); + } + + auto obj = nb::module_::import_("types").attr("SimpleNamespace")(); + obj.attr("__setattr__")("__arrow_c_schema__", + nb::cpp_function([ptr = capsule]() { return nb::handle{ptr}; })); + return obj; +} + +[[nodiscard]] static nb::object export_arrow_array_stream( + std::shared_ptr column) { + // https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowstream-export + auto* array_stream = static_cast(malloc(sizeof(ArrowArrayStream))); + if (!array_stream) { + throw std::bad_alloc(); + } + + const auto status = arrow::ExportChunkedArray(std::move(column), array_stream); + if (!status.ok()) { + free(array_stream); + throw std::runtime_error( + fmt::format(FMT_STRING("Failed to export arrow::ChunkedArray as ArrowArrayStream: {}"), + status.message())); + } + + auto* capsule = PyCapsule_New(static_cast(array_stream), "arrow_array_stream", + release_arrow_array_streamPyCapsule); + if (!capsule) { + if (array_stream->release) { + array_stream->release(array_stream); + } + free(array_stream); + throw std::runtime_error("Failed to create PyCapsule for arrow_array_stream"); + } + + auto obj = nb::module_::import_("types").attr("SimpleNamespace")(); + obj.attr("__setattr__")( + "__arrow_c_stream__", + nb::cpp_function( + [ptr = capsule]([[maybe_unused]] const nb::any& _) { return nb::handle{ptr}; }, + nb::arg("requested_schema") = nb::none())); + return obj; +} + +nb::object export_pyarrow_table(std::shared_ptr arrow_table) { + assert(arrow_table); + + const auto pa = import_pyarrow_checked(); + + std::vector columns_py(static_cast(arrow_table->num_columns())); + std::vector> columns(columns_py.size()); + std::copy(arrow_table->columns().begin(), arrow_table->columns().end(), columns.begin()); + + auto schema = pa.attr("schema")(export_arrow_schema(*arrow_table->schema())); + arrow_table.reset(); + + for (std::size_t i = 0; i < columns.size(); ++i) { + // NOLINTNEXTLINE(*-pro-bounds-pointer-arithmetic) + columns_py[i] = pa.attr("chunked_array")(export_arrow_array_stream(std::move(columns[i]))); + } + + return nb::cast(pa.attr("Table").attr("from_arrays")(columns_py, nb::arg("schema") = schema), + nb::rv_policy::take_ownership); +} +// NOLINTEND(*-owning-memory,*-no-malloc) + +} // namespace hictkpy diff --git a/test/__init__.py b/test/__init__.py new file mode 100644 index 0000000..6ddfe97 --- /dev/null +++ b/test/__init__.py @@ -0,0 +1,3 @@ +# Copyright (C) 2024 Roberto Rossini +# +# SPDX-License-Identifier: MIT diff --git a/test/data/cooler_variable_bins_test_file.cool b/test/data/cooler_variable_bins_test_file.cool new file mode 100644 index 0000000..17cc01d Binary files /dev/null and b/test/data/cooler_variable_bins_test_file.cool differ diff --git a/test/helpers.py b/test/helpers.py new file mode 100644 index 0000000..332dad5 --- /dev/null +++ b/test/helpers.py @@ -0,0 +1,39 @@ +# Copyright (C) 2024 Roberto Rossini +# +# SPDX-License-Identifier: MIT + + +def numpy_avail() -> bool: + try: + import numpy + except ModuleNotFoundError: + return False + + return True + + +def pandas_avail() -> bool: + try: + import pandas + except ModuleNotFoundError: + return False + + return True + + +def pyarrow_avail() -> bool: + try: + import pyarrow + except ModuleNotFoundError: + return False + + return True + + +def scipy_avail() -> bool: + try: + import scipy + except ModuleNotFoundError: + return False + + return True diff --git a/test/test_bin_table.py b/test/test_bin_table.py new file mode 100644 index 0000000..3c7d3b9 --- /dev/null +++ b/test/test_bin_table.py @@ -0,0 +1,128 @@ +# Copyright (C) 2024 Roberto Rossini +# +# SPDX-License-Identifier: MIT + + +import itertools + +import pytest + +import hictkpy + +from .helpers import numpy_avail, pandas_avail, pyarrow_avail + + +class TestClass: + def test_ctor_fixed_bins(self): + chroms = {"chr1": 1000, "chr2": 500} + bins = hictkpy.BinTable(chroms, 100) + assert bins.type() == "fixed" + assert len(bins) == 15 + + @pytest.mark.skipif(not pandas_avail(), reason="pandas is not available") + def test_ctor_variable_bins(self): + import pandas as pd + + data = pd.DataFrame( + { + "chrom": ["chr1", "chr1", "chr2", "chr2", "chr3"], + "start": [0, 100, 0, 15, 0], + "end": [100, 127, 15, 75, 12], + } + ) + + bins = hictkpy.BinTable(data) + assert bins.type() == "variable" + assert len(bins) == len(data) + + def test_accessors(self): + chroms = {"chr1": 1000, "chr2": 500} + bins = hictkpy.BinTable(chroms, 100) + + assert len(bins.chromosomes()) == 2 + assert bins.resolution() == 100 + assert bins.type() == "fixed" + assert len(bins) == 15 + + assert str(bins).startswith("BinTable(") + + def test_getters(self): + chroms = {"chr1": 1000, "chr2": 500} + bins = hictkpy.BinTable(chroms, 100) + + assert bins.get(1).chrom == "chr1" + assert bins.get(1).start == 100 + assert bins.get(1).end == 200 + with pytest.raises(Exception): + bins.get(9999) + + assert bins.get_id("chr1", 153) == 1 + with pytest.raises(Exception): + bins.get_id("abc", 100) + + @pytest.mark.skipif( + not pandas_avail() or not pyarrow_avail(), + reason="pandas or pyarrow are not available", + ) + def test_vectorized_getters(self): + + chroms = {"chr1": 1000, "chr2": 500} + bins = hictkpy.BinTable(chroms, 100) + + assert len(bins.get([1, 1])) == 2 + assert len(bins.get_ids(["chr1", "chr1"], [1, 1])) == 2 + + @pytest.mark.skipif(not pandas_avail() or not pyarrow_avail(), reason="pandas is not available") + def test_merge(self): + import pandas as pd + + chroms = {"chr1": 1000, "chr2": 500} + bins = hictkpy.BinTable(chroms, 100) + + expected = pd.DataFrame( + { + "chrom1": pd.Categorical(["chr1", "chr2"], categories=["chr1", "chr2"]), + "start1": [0, 0], + "end1": [100, 100], + "chrom2": pd.Categorical(["chr1", "chr2"], categories=["chr1", "chr2"]), + "start2": [0, 0], + "end2": [100, 100], + } + ) + + bin_ids = pd.DataFrame({"bin1_id": [0, 10], "bin2_id": [0, 10]}) + + assert (expected == bins.merge(bin_ids).drop(columns=["bin1_id", "bin2_id"])).all().all() + + @pytest.mark.skipif(not pandas_avail() or not pyarrow_avail(), reason="pandas is not available") + def test_to_df(self): + chroms = {"chr1": 1000, "chr2": 500} + bins = hictkpy.BinTable(chroms, 100) + + assert len(bins.to_df()) == len(bins) + assert len(bins.to_df("chr1")) == 10 + assert len(bins.to_df("chr2:0-200")) == 2 + assert len(bins.to_df("chr2\t0\t200", "BED")) == 2 + with pytest.raises(RuntimeError): + bins.to_df("chr0") + + def test_iters(self): + chroms = {"chr1": 1000, "chr2": 500} + bins = hictkpy.BinTable(chroms, 100) + + expected_chroms = [] + expected_starts = [] + expected_ends = [] + + for chrom, size in chroms.items(): + num_bins = (size + bins.resolution() - 1) // bins.resolution() + expected_chroms.extend([chrom] * num_bins) + starts = list(range(0, size, bins.resolution())) + ends = [min(pos + bins.resolution(), size) for pos in starts] + expected_starts.extend(starts) + expected_ends.extend(ends) + + for chrom, start, end, bin in itertools.zip_longest(expected_chroms, expected_starts, expected_ends, bins): + assert bin.chrom == chrom + assert bin.start == start + assert bin.end == end diff --git a/test/test_fetch_dense.py b/test/test_fetch_dense.py index fe88b26..55f3fff 100644 --- a/test/test_fetch_dense.py +++ b/test/test_fetch_dense.py @@ -4,12 +4,12 @@ import pathlib -import numpy as np -import numpy.typing as npt import pytest import hictkpy +from .helpers import numpy_avail + testdir = pathlib.Path(__file__).resolve().parent pytestmark = pytest.mark.parametrize( @@ -21,7 +21,9 @@ ) -def issymmetric(m: npt.NDArray) -> bool: +def issymmetric(m) -> bool: + import numpy as np + assert m.ndim == 2 if m.size == 0: return True @@ -31,6 +33,7 @@ def issymmetric(m: npt.NDArray) -> bool: return np.allclose(m, m.T, atol=0.0, rtol=0.0) +@pytest.mark.skipif(not numpy_avail(), reason="numpy is not available") class TestClass: def test_genome_wide(self, file, resolution): f = hictkpy.File(file, resolution) @@ -40,6 +43,8 @@ def test_genome_wide(self, file, resolution): assert issymmetric(m) def test_cis(self, file, resolution): + import numpy as np + f = hictkpy.File(file, resolution) m = f.fetch("chr2R:10,000,000-15,000,000").to_numpy() assert m.shape == (50, 50) @@ -68,6 +73,8 @@ def test_cis(self, file, resolution): assert m.sum() == 12_607_205 def test_trans(self, file, resolution): + import numpy as np + f = hictkpy.File(file, resolution) m = f.fetch("chr2R:10,000,000-15,000,000", "chrX:0-10,000,000").to_numpy() assert m.shape == (50, 100) diff --git a/test/test_fetch_df.py b/test/test_fetch_df.py index ed7edca..da6563c 100644 --- a/test/test_fetch_df.py +++ b/test/test_fetch_df.py @@ -2,13 +2,15 @@ # # SPDX-License-Identifier: MIT +import math import pathlib -import numpy as np import pytest import hictkpy +from .helpers import numpy_avail, pandas_avail, pyarrow_avail + testdir = pathlib.Path(__file__).resolve().parent pytestmark = pytest.mark.parametrize( @@ -20,16 +22,7 @@ ) -def pandas_avail() -> bool: - try: - import pandas - except ModuleNotFoundError: - return False - - return True - - -@pytest.mark.skipif(not pandas_avail(), reason="pandas is not available") +@pytest.mark.skipif(not pandas_avail() or not pyarrow_avail(), reason="either pandas or pyarrow are not available") class TestClass: def test_genome_wide(self, file, resolution): f = hictkpy.File(file, resolution) @@ -51,12 +44,17 @@ def test_cis(self, file, resolution): df = f.fetch("chr2R:10,000,000-15,000,000", join=True).to_df() assert df["count"].sum() == 4_519_080 assert len(df.columns) == 7 + assert df["chrom1"].dtype.name == "category" + assert df["chrom2"].dtype.name == "category" + + if numpy_avail(): + import numpy as np - df = f.fetch("chr2R:10,000,000-15,000,000", count_type="int").to_df() - assert df["count"].dtype == np.int32 + df = f.fetch("chr2R:10,000,000-15,000,000", count_type="int").to_df() + assert df["count"].dtype == np.int32 - df = f.fetch("chr2R:10,000,000-15,000,000", count_type="float").to_df() - assert df["count"].dtype == np.float64 + df = f.fetch("chr2R:10,000,000-15,000,000", count_type="float").to_df() + assert df["count"].dtype == np.float64 df = f.fetch("chr2R\t10000000\t15000000", query_type="BED").to_df() assert len(df) == 1275 @@ -82,12 +80,17 @@ def test_trans(self, file, resolution): df = f.fetch("chr2R:10,000,000-15,000,000", "chrX:0-10,000,000", join=True).to_df() assert df["count"].sum() == 83_604 assert len(df.columns) == 7 + assert df["chrom1"].dtype.name == "category" + assert df["chrom2"].dtype.name == "category" + + if numpy_avail(): + import numpy as np - df = f.fetch("chr2R:10,000,000-15,000,000", "chrX:0-10,000,000", count_type="int").to_df() - assert df["count"].dtype == np.int32 + df = f.fetch("chr2R:10,000,000-15,000,000", "chrX:0-10,000,000", count_type="int").to_df() + assert df["count"].dtype == np.int32 - df = f.fetch("chr2R:10,000,000-15,000,000", "chrX:0-10,000,000", count_type="float").to_df() - assert df["count"].dtype == np.float64 + df = f.fetch("chr2R:10,000,000-15,000,000", "chrX:0-10,000,000", count_type="float").to_df() + assert df["count"].dtype == np.float64 df = f.fetch("chr2R\t10000000\t15000000", "chrX\t0\t10000000", query_type="BED").to_df() assert len(df) == 4995 @@ -108,4 +111,4 @@ def test_balanced(self, file, resolution): else: df = f.fetch("chr2R:10,000,000-15,000,000", normalization="ICE").to_df() - assert np.isclose(59.349524704033215, df["count"].sum()) + assert math.isclose(59.349524704033215, df["count"].sum(), rel_tol=1.0e-5, abs_tol=1.0e-8) diff --git a/test/test_fetch_iters.py b/test/test_fetch_iters.py index e3cb757..3680ff0 100644 --- a/test/test_fetch_iters.py +++ b/test/test_fetch_iters.py @@ -2,9 +2,9 @@ # # SPDX-License-Identifier: MIT +import math import pathlib -import numpy as np import pytest import hictkpy @@ -87,4 +87,4 @@ def test_balanced(self, file, resolution): else: sel = f.fetch("chr2R:10,000,000-15,000,000", normalization="ICE") - assert np.isclose(59.349524704033215, compute_sum(sel)) + assert math.isclose(59.349524704033215, compute_sum(sel), rel_tol=1.0e-5, abs_tol=1.0e-8) diff --git a/test/test_fetch_sparse.py b/test/test_fetch_sparse.py index 919ead0..a9f7b4f 100644 --- a/test/test_fetch_sparse.py +++ b/test/test_fetch_sparse.py @@ -2,13 +2,15 @@ # # SPDX-License-Identifier: MIT +import math import pathlib -import numpy as np import pytest import hictkpy +from .helpers import numpy_avail, scipy_avail + testdir = pathlib.Path(__file__).resolve().parent pytestmark = pytest.mark.parametrize( @@ -20,15 +22,6 @@ ) -def scipy_avail() -> bool: - try: - import scipy - except ModuleNotFoundError: - return False - - return True - - @pytest.mark.skipif(not scipy_avail(), reason="scipy is not available") class TestClass: def test_genome_wide(self, file, resolution): @@ -45,11 +38,14 @@ def test_cis(self, file, resolution): assert m.shape == (50, 50) assert m.sum() == 4_519_080 - m = f.fetch("chr2R:10,000,000-15,000,000", count_type="int").to_coo() - assert m.dtype == np.int32 + if numpy_avail(): + import numpy as np + + m = f.fetch("chr2R:10,000,000-15,000,000", count_type="int").to_coo() + assert m.dtype == np.int32 - m = f.fetch("chr2R:10,000,000-15,000,000", count_type="float").to_coo() - assert m.dtype == np.float64 + m = f.fetch("chr2R:10,000,000-15,000,000", count_type="float").to_coo() + assert m.dtype == np.float64 m = f.fetch("chr2R\t10000000\t15000000", query_type="BED").to_coo() assert m.shape == (50, 50) @@ -69,11 +65,14 @@ def test_trans(self, file, resolution): assert m.shape == (50, 100) assert m.sum() == 83_604 - m = f.fetch("chr2R:10,000,000-15,000,000", "chrX:0-10,000,000", count_type="int").to_coo() - assert m.dtype == np.int32 + if numpy_avail(): + import numpy as np + + m = f.fetch("chr2R:10,000,000-15,000,000", "chrX:0-10,000,000", count_type="int").to_coo() + assert m.dtype == np.int32 - m = f.fetch("chr2R:10,000,000-15,000,000", "chrX:0-10,000,000", count_type="float").to_coo() - assert m.dtype == np.float64 + m = f.fetch("chr2R:10,000,000-15,000,000", "chrX:0-10,000,000", count_type="float").to_coo() + assert m.dtype == np.float64 m = f.fetch("chr2R\t10000000\t15000000", "chrX\t0\t10000000", query_type="BED").to_coo() assert m.shape == (50, 100) @@ -86,4 +85,4 @@ def test_balanced(self, file, resolution): else: m = f.fetch("chr2R:10,000,000-15,000,000", normalization="ICE").to_coo() - assert np.isclose(59.349524704033215, m.sum()) + assert math.isclose(59.349524704033215, m.sum(), rel_tol=1.0e-5, abs_tol=1.0e-8) diff --git a/test/test_file_accessors.py b/test/test_file_accessors.py index 75de68f..ee621ce 100644 --- a/test/test_file_accessors.py +++ b/test/test_file_accessors.py @@ -8,6 +8,8 @@ import hictkpy +from .helpers import pandas_avail + testdir = pathlib.Path(__file__).resolve().parent pytestmark = pytest.mark.parametrize( @@ -20,21 +22,12 @@ ) -def pandas_avail() -> bool: - try: - import pandas - except ModuleNotFoundError: - return False - - return True - - class TestClass: @pytest.mark.skipif(not pandas_avail(), reason="pandas is not available") def test_attributes(self, file, resolution): f = hictkpy.File(file, resolution) assert f.resolution() == 100_000 - # assert f.nchroms() == 8 # TODO enable after merging https://github.com/paulsengroup/hictk/pull/294 + assert f.nchroms() == 8 assert f.nbins() == 1380 assert "chr2L" in f.chromosomes() @@ -50,13 +43,24 @@ def test_attributes(self, file, resolution): def test_normalizations(self, file, resolution): f = hictkpy.File(file, resolution) + cooler_weights = ["KR", "SCALE", "VC", "VC_SQRT", "weight"] + hic_weights = ["ICE"] + if f.is_cooler(): - assert f.avail_normalizations() == ["KR", "SCALE", "VC", "VC_SQRT", "weight"] + assert f.avail_normalizations() == cooler_weights else: - assert f.avail_normalizations() == ["ICE"] + assert f.avail_normalizations() == hic_weights assert not f.has_normalization("foo") name = "weight" if f.is_cooler() else "ICE" weights = f.weights(name) assert len(weights) == f.nbins() + + if f.is_cooler(): + df = f.weights(cooler_weights) + assert len(df.columns) == len(cooler_weights) + else: + df = f.weights(hic_weights) + assert len(df.columns) == len(hic_weights) + assert len(df) == f.nbins() diff --git a/test/test_file_creation_cool.py b/test/test_file_creation_cool.py index 1120f4c..1685326 100644 --- a/test/test_file_creation_cool.py +++ b/test/test_file_creation_cool.py @@ -10,6 +10,8 @@ import hictkpy +from .helpers import pandas_avail, pyarrow_avail + testdir = pathlib.Path(__file__).resolve().parent @@ -17,34 +19,42 @@ "file,resolution", [ (testdir / "data" / "cooler_test_file.mcool", 100_000), + (testdir / "data" / "cooler_variable_bins_test_file.cool", None), ], ) -def pandas_avail() -> bool: - try: - import pandas - except ModuleNotFoundError: - return False - - return True - - -@pytest.mark.skipif(not pandas_avail(), reason="pandas is not available") +@pytest.mark.skipif(not pandas_avail() or not pyarrow_avail(), reason="either pandas or pyarrow are not available") class TestClass: @staticmethod def setup_method(): logging.basicConfig(level="INFO", force=True) logging.getLogger().setLevel("INFO") - def test_file_creation_thin_pixel(self, file, resolution, tmpdir): + def test_accessors(self, file, resolution, tmpdir): + bins = hictkpy.File(file, resolution).bins() + + path = tmpdir / "test.cool" + w = hictkpy.cooler.FileWriter(path, bins) + assert str(w).startswith("CoolFileWriter(") + assert w.path() == path + if resolution is None: + assert w.resolution() == 0 + else: + assert w.resolution() == resolution + assert w.chromosomes() == bins.chromosomes() + assert len(w.bins().to_df().compare(bins.to_df())) == 0 + + def test_file_creation_thin_pixel(self, file, resolution, tmpdir): f = hictkpy.File(file, resolution) + if f.bins().type() != "fixed": + pytest.skip(f'BinTable of file "{file}" does not have fixed bins.') df = f.fetch(join=False).to_df() expected_sum = df["count"].sum() - path = tmpdir / "test1.cool" + path = tmpdir / "test.cool" w = hictkpy.cooler.FileWriter(path, f.chromosomes(), f.resolution()) chunk_size = 1000 @@ -52,7 +62,7 @@ def test_file_creation_thin_pixel(self, file, resolution, tmpdir): end = start + chunk_size w.add_pixels(df[start:end]) - w.finalize("info", 100_000, 100_000) + f = w.finalize("info", 100_000, 100_000) with pytest.raises(Exception): w.add_pixels(df) with pytest.raises(Exception): @@ -61,16 +71,17 @@ def test_file_creation_thin_pixel(self, file, resolution, tmpdir): del w gc.collect() - f = hictkpy.File(path, resolution) assert f.fetch().sum() == expected_sum def test_file_creation(self, file, resolution, tmpdir): f = hictkpy.File(file, resolution) + if f.bins().type() != "fixed": + pytest.skip(f'BinTable of file "{file}" does not have fixed bins.') df = f.fetch(join=True).to_df() expected_sum = df["count"].sum() - path = tmpdir / "test2.cool" + path = tmpdir / "test.cool" w = hictkpy.cooler.FileWriter(path, f.chromosomes(), f.resolution()) chunk_size = 1000 @@ -78,7 +89,32 @@ def test_file_creation(self, file, resolution, tmpdir): end = start + chunk_size w.add_pixels(df[start:end]) - w.finalize("info", 100_000, 100_000) + f = w.finalize("info", 100_000, 100_000) + with pytest.raises(Exception): + w.add_pixels(df) + with pytest.raises(Exception): + w.finalize() + + del w + gc.collect() + + assert f.fetch().sum() == expected_sum + + def test_file_creation_bin_table(self, file, resolution, tmpdir): + f = hictkpy.File(file, resolution) + + df = f.fetch(join=True).to_df() + expected_sum = df["count"].sum() + + path = tmpdir / "test.cool" + w = hictkpy.cooler.FileWriter(path, f.bins()) + + chunk_size = 1000 + for start in range(0, len(df), chunk_size): + end = start + chunk_size + w.add_pixels(df[start:end]) + + f = w.finalize("info", 100_000, 100_000) with pytest.raises(Exception): w.add_pixels(df) with pytest.raises(Exception): @@ -87,17 +123,18 @@ def test_file_creation(self, file, resolution, tmpdir): del w gc.collect() - f = hictkpy.File(path, resolution) assert f.fetch().sum() == expected_sum def test_file_creation_float_counts(self, file, resolution, tmpdir): f = hictkpy.File(file, resolution) + if f.bins().type() != "fixed": + pytest.skip(f'BinTable of file "{file}" does not have fixed bins.') df = f.fetch(join=True, count_type="float").to_df() df["count"] += 0.12345 expected_sum = df["count"].sum() - path = tmpdir / "test3.cool" + path = tmpdir / "test.cool" w = hictkpy.cooler.FileWriter(path, f.chromosomes(), f.resolution()) chunk_size = 1000 @@ -105,7 +142,7 @@ def test_file_creation_float_counts(self, file, resolution, tmpdir): end = start + chunk_size w.add_pixels(df[start:end]) - w.finalize("info", 100_000, 100_000) + f = w.finalize("info", 100_000, 100_000) with pytest.raises(Exception): w.add_pixels(df) with pytest.raises(Exception): @@ -114,5 +151,4 @@ def test_file_creation_float_counts(self, file, resolution, tmpdir): del w gc.collect() - f = hictkpy.File(path, resolution) assert pytest.approx(f.fetch(count_type="float").sum()) == expected_sum diff --git a/test/test_file_creation_hic.py b/test/test_file_creation_hic.py index 1ebdff1..347ef20 100644 --- a/test/test_file_creation_hic.py +++ b/test/test_file_creation_hic.py @@ -10,40 +10,50 @@ import hictkpy +from .helpers import pandas_avail, pyarrow_avail + testdir = pathlib.Path(__file__).resolve().parent pytestmark = pytest.mark.parametrize( "file,resolution", [ - (testdir / "data" / "hic_test_file.hic", 100_000), + (testdir / "data" / "cooler_test_file.mcool", 100_000), + (testdir / "data" / "cooler_variable_bins_test_file.cool", None), ], ) -def pandas_avail() -> bool: - try: - import pandas - except ModuleNotFoundError: - return False - - return True - - -@pytest.mark.skipif(not pandas_avail(), reason="pandas is not available") +@pytest.mark.skipif(not pandas_avail() or not pyarrow_avail(), reason="either pandas or pyarrow are not available") class TestClass: @staticmethod def setup_method(): logging.basicConfig(level="INFO", force=True) logging.getLogger().setLevel("INFO") + def test_accessors(self, file, resolution, tmpdir): + bins = hictkpy.File(file, resolution).bins() + if bins.type() != "fixed": + pytest.skip(f'BinTable of file "{file}" does not have fixed bins.') + + path = tmpdir / "test.hic" + w = hictkpy.hic.FileWriter(path, bins) + + assert str(w).startswith("HiCFileWriter(") + assert w.path() == path + assert w.resolutions() == [resolution] + assert w.chromosomes() == bins.chromosomes() + assert len(w.bins(resolution).to_df().compare(bins.to_df())) == 0 + def test_file_creation_thin_pixel(self, file, resolution, tmpdir): f = hictkpy.File(file, resolution) + if f.bins().type() != "fixed": + pytest.skip(f'BinTable of file "{file}" does not have fixed bins.') df = f.fetch(join=False).to_df() expected_sum = df["count"].sum() - path = tmpdir / "test1.hic" + path = tmpdir / "test.hic" w = hictkpy.hic.FileWriter(path, f.chromosomes(), f.resolution()) chunk_size = 1000 @@ -51,7 +61,7 @@ def test_file_creation_thin_pixel(self, file, resolution, tmpdir): end = start + chunk_size w.add_pixels(df[start:end]) - w.finalize() + f = w.finalize() with pytest.raises(Exception): w.add_pixels(df) with pytest.raises(Exception): @@ -60,16 +70,17 @@ def test_file_creation_thin_pixel(self, file, resolution, tmpdir): del w gc.collect() - f = hictkpy.File(path, resolution) assert f.fetch().sum() == expected_sum def test_file_creation(self, file, resolution, tmpdir): f = hictkpy.File(file, resolution) + if f.bins().type() != "fixed": + pytest.skip(f'BinTable of file "{file}" does not have fixed bins.') df = f.fetch(join=True).to_df() expected_sum = df["count"].sum() - path = tmpdir / "test2.hic" + path = tmpdir / "test.hic" w = hictkpy.hic.FileWriter(path, f.chromosomes(), f.resolution()) chunk_size = 1000 @@ -77,7 +88,37 @@ def test_file_creation(self, file, resolution, tmpdir): end = start + chunk_size w.add_pixels(df[start:end]) - w.finalize() + f = w.finalize() + with pytest.raises(Exception): + w.add_pixels(df) + with pytest.raises(Exception): + w.finalize() + + del w + gc.collect() + + assert f.fetch().sum() == expected_sum + + def test_file_creation_bin_table(self, file, resolution, tmpdir): + f = hictkpy.File(file, resolution) + + df = f.fetch(join=True).to_df() + expected_sum = df["count"].sum() + + path = tmpdir / "test.hic" + if f.bins().type() != "fixed": + with pytest.raises(Exception): + hictkpy.hic.FileWriter(path, f.bins()) + return + + w = hictkpy.hic.FileWriter(path, f.bins()) + + chunk_size = 1000 + for start in range(0, len(df), chunk_size): + end = start + chunk_size + w.add_pixels(df[start:end]) + + f = w.finalize() with pytest.raises(Exception): w.add_pixels(df) with pytest.raises(Exception): @@ -86,5 +127,4 @@ def test_file_creation(self, file, resolution, tmpdir): del w gc.collect() - f = hictkpy.File(path, resolution) assert f.fetch().sum() == expected_sum diff --git a/test/test_file_ctors.py b/test/test_file_ctors.py new file mode 100644 index 0000000..bc6d0c4 --- /dev/null +++ b/test/test_file_ctors.py @@ -0,0 +1,42 @@ +# Copyright (C) 2024 Roberto Rossini +# +# SPDX-License-Identifier: MIT + +import pathlib + +import pytest + +import hictkpy + +testdir = pathlib.Path(__file__).resolve().parent + +pytestmark = pytest.mark.parametrize( + "file,resolution", + [ + (testdir / "data" / "cooler_test_file.mcool", 100_000), + (testdir / "data" / "hic_test_file.hic", 100_000), + ], +) + + +class TestClass: + def test_resolution_mismatch(self, file, resolution): + f = hictkpy.MultiResFile(file) + + if not f[resolution].is_cooler(): + pytest.skip(f'File "{file}" is not in .mcool format') + + assert 1_000_000 in f.resolutions() + with pytest.raises(RuntimeError, match="resolution mismatch"): + hictkpy.File(f"{file}::/resolutions/{resolution}", 1_000_000) + + def test_missing_resolution_param(self, file, resolution): + f = hictkpy.MultiResFile(file) + + if f[resolution].is_cooler(): + ext = ".mcool" + else: + ext = ".hic" + + with pytest.raises(RuntimeError, match=f"resolution is required and cannot be None when opening {ext} files"): + hictkpy.File(file) diff --git a/test/test_file_validators.py b/test/test_file_validators.py index a1ad54e..aa6a091 100644 --- a/test/test_file_validators.py +++ b/test/test_file_validators.py @@ -4,8 +4,6 @@ import pathlib -import pytest - import hictkpy testdir = pathlib.Path(__file__).resolve().parent @@ -17,7 +15,7 @@ class TestClass: - def test_validators(self): + def test_valid_formats(self): assert hictkpy.is_cooler(cool_file) assert not hictkpy.is_cooler(hic_file) @@ -26,3 +24,31 @@ def test_validators(self): assert hictkpy.is_scool_file(scool_file) assert not hictkpy.is_scool_file(cool_file) + + assert hictkpy.is_hic(hic_file) + assert not hictkpy.is_hic(cool_file) + + def test_invalid_formats(self): + path = pathlib.Path(__file__).resolve() + + assert not hictkpy.is_cooler(path) + assert not hictkpy.is_mcool_file(path) + assert not hictkpy.is_scool_file(path) + assert not hictkpy.is_hic(path) + + def test_invalid_files(self): + non_existing_file = testdir / "foobar.123" + assert not non_existing_file.exists() + + assert not hictkpy.is_cooler(non_existing_file) + assert not hictkpy.is_mcool_file(non_existing_file) + assert not hictkpy.is_scool_file(non_existing_file) + assert not hictkpy.is_hic(non_existing_file) + + folder = testdir + assert folder.is_dir() + + assert not hictkpy.is_cooler(folder) + assert not hictkpy.is_mcool_file(folder) + assert not hictkpy.is_scool_file(folder) + assert not hictkpy.is_hic(folder) diff --git a/test/test_multires_file_accessors.py b/test/test_multires_file_accessors.py index 707fe7f..4e1e1c5 100644 --- a/test/test_multires_file_accessors.py +++ b/test/test_multires_file_accessors.py @@ -11,19 +11,39 @@ testdir = pathlib.Path(__file__).resolve().parent pytestmark = pytest.mark.parametrize( - "file", + "file,format", [ - testdir / "data" / "cooler_test_file.mcool", + (testdir / "data" / "cooler_test_file.mcool", "mcool"), + (testdir / "data" / "hic_test_file.hic", "hic"), ], ) class TestClass: - def test_attributes(self, file): + def test_accessors(self, file, format): f = hictkpy.MultiResFile(file) + assert str(f).startswith("MultiResFile(") + assert f.path() == file - assert f.resolutions() == [100_000, 1_000_000] + assert f.is_mcool() == (format == "mcool") + assert f.is_hic() == (format == "hic") assert len(f.chromosomes()) == 8 + if f.is_hic(): + resolutions = [100_000] + assert (f.resolutions() == resolutions).all() + assert f.attributes()["format"] == "HIC" + assert f.attributes()["format-version"] == 9 + assert (f.attributes()["resolutions"] == resolutions).all() + else: + resolutions = [100_000, 1_000_000] + assert (f.resolutions() == resolutions).all() + assert f.attributes()["format"] == "HDF5::MCOOL" + assert f.attributes()["format-version"] == 2 + assert (f.attributes()["resolutions"] == resolutions).all() + assert f[100_000].resolution() == 100_000 + + with pytest.raises(Exception): + f[1234] # noqa diff --git a/test/test_pixel_selector_accessors.py b/test/test_pixel_selector_accessors.py new file mode 100644 index 0000000..ada56bb --- /dev/null +++ b/test/test_pixel_selector_accessors.py @@ -0,0 +1,54 @@ +# Copyright (C) 2023 Roberto Rossini +# +# SPDX-License-Identifier: MIT + +import pathlib + +import pytest + +import hictkpy + +from .helpers import numpy_avail + +testdir = pathlib.Path(__file__).resolve().parent + +pytestmark = pytest.mark.parametrize( + "file,resolution", + [ + (testdir / "data" / "cooler_test_file.mcool", 100_000), + (testdir / "data" / "hic_test_file.hic", 100_000), + ], +) + + +@pytest.mark.skipif(not numpy_avail(), reason="numpy is not available") +class TestClass: + def test_repr(self, file, resolution): + f = hictkpy.File(file, resolution) + + sel = f.fetch() + assert str(sel) == "PixelSelector(ALL; COO; int32)" + + sel = f.fetch(join=True) + assert str(sel) == "PixelSelector(ALL; BG2; int32)" + + sel = f.fetch(count_type="float") + assert str(sel) == "PixelSelector(ALL; COO; float64)" + + sel = f.fetch("chr2L:0-10,000,000", "chr2L:5,000,000-20,000,000") + assert str(sel) == "PixelSelector(chr2L:0-10000000; chr2L:5000000-20000000; COO; int32)" + + def test_coords(self, file, resolution): + f = hictkpy.File(file, resolution) + + sel = f.fetch() + assert sel.coord1() == ("ALL", 0, len(f.bins())) + assert sel.coord1() == sel.coord2() + + sel = f.fetch("chr2L:0-10,000,000") + assert sel.coord1() == ("chr2L", 0, 10_000_000) + assert sel.coord1() == sel.coord2() + + sel = f.fetch("chr2L:0-10,000,000", "chr2L:5,000,000-20,000,000") + assert sel.coord1() == ("chr2L", 0, 10_000_000) + assert sel.coord2() == ("chr2L", 5_000_000, 20_000_000) diff --git a/test/test_singlecell_file_accessors.py b/test/test_singlecell_file_accessors.py index f48f1c4..aa7a1d9 100644 --- a/test/test_singlecell_file_accessors.py +++ b/test/test_singlecell_file_accessors.py @@ -19,13 +19,19 @@ class TestClass: - def test_attributes(self, file): + def test_accessors(self, file): f = hictkpy.cooler.SingleCellFile(file) + assert str(f).startswith("SingleCellFile(") + assert f.path() == file assert f.resolution() == 100_000 assert len(f.chromosomes()) == 20 + assert len(f.bins()) == 26398 assert len(f.cells()) == 5 assert f.attributes()["format"] == "HDF5::SCOOL" assert f["GSM2687248_41669_ACAGTG-R1-DpnII.100000.cool"].resolution() == 100_000 + + with pytest.raises(Exception): + f["ABC"] # noqa diff --git a/utils/devel/stubgen.py b/utils/devel/stubgen.py index a3d93a4..def380f 100755 --- a/utils/devel/stubgen.py +++ b/utils/devel/stubgen.py @@ -51,9 +51,6 @@ def main(): if "" not in sys.path and "." not in sys.path: sys.path.insert(0, "") - pyarrow = importlib.import_module("pyarrow") - pyarrow.create_library_symlinks() - touch_file(os.path.join(args["output-dir"], "py.typed"), args["force"]) process_module("hictkpy._hictkpy", os.path.join(args["output-dir"], "__init__.pyi"), args["force"])