Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Python bindings to cuFileDriverOpen() and cuFileDriverClose() #514

Merged
merged 18 commits into from
Nov 1, 2024
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion ci/run_pytests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,9 @@ set -euo pipefail
# Support invoking run_pytests.sh outside the script directory
cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/kvikio

pytest --cache-clear --verbose "$@" tests
# If running CUDA 11.8 on arm64, we skip tests marked "cufile" since
# cuFile didn't support arm until 12.4
[[ "${CUDA_VERSION}" == "11.8.0" && "${RUNNER_ARCH}" == "ARM64" ]] \
&& PYTEST_MARK=( -m 'not cufile' ) || PYTEST_MARK=()

pytest --cache-clear --verbose "${PYTEST_MARK[@]}" "$@" tests
7 changes: 6 additions & 1 deletion ci/test_wheel.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,9 @@ RAPIDS_PY_WHEEL_NAME="kvikio_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-fr

python -m pip install "$(echo ${WHEELHOUSE}/kvikio_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]"

python -m pytest ./python/kvikio/tests
# If running CUDA 11.8 on arm64, we skip tests marked "cufile" since
# cuFile didn't support arm until 12.4
[[ "${CUDA_VERSION}" == "11.8.0" && "${RUNNER_ARCH}" == "ARM64" ]] \
&& PYTEST_MARK=( -m 'not cufile' ) || PYTEST_MARK=()

python -m pytest --cache-clear --verbose "${PYTEST_MARK[@]}" ./python/kvikio/tests
2 changes: 1 addition & 1 deletion cpp/examples/basic_io.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@

#include <kvikio/batch.hpp>
#include <kvikio/buffer.hpp>
#include <kvikio/cufile/driver.hpp>
#include <kvikio/defaults.hpp>
#include <kvikio/driver.hpp>
#include <kvikio/error.hpp>
#include <kvikio/file_handle.hpp>

Expand Down
2 changes: 1 addition & 1 deletion cpp/examples/basic_no_cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@

#include <kvikio/batch.hpp>
#include <kvikio/buffer.hpp>
#include <kvikio/cufile/driver.hpp>
#include <kvikio/defaults.hpp>
#include <kvikio/driver.hpp>
#include <kvikio/error.hpp>
#include <kvikio/file_handle.hpp>

Expand Down
18 changes: 17 additions & 1 deletion cpp/examples/downstream/downstream_example.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,23 @@
/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <iostream>

#include <kvikio/cufile/driver.hpp>
#include <kvikio/defaults.hpp>
#include <kvikio/driver.hpp>

using namespace std;

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2023, NVIDIA CORPORATION.
* Copyright (c) 2021-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -45,7 +45,7 @@ inline void set_driver_flag(unsigned int& prop, unsigned int flag, bool val) noe
class DriverInitializer {
// Optional, if not used cuFiles opens the driver automatically
public:
DriverInitializer() { CUFILE_TRY(cuFileAPI::instance().DriverOpen()); }
DriverInitializer() { cuFileAPI::instance().driver_open(); }

DriverInitializer(DriverInitializer const&) = delete;
DriverInitializer& operator=(DriverInitializer const&) = delete;
Expand All @@ -55,7 +55,7 @@ class DriverInitializer {
~DriverInitializer()
{
try {
CUFILE_TRY(cuFileAPI::instance().DriverClose());
cuFileAPI::instance().driver_close();
} catch (const CUfileException& e) {
std::cerr << "Unable to close GDS file driver: ";
std::cerr << e.what();
Expand Down
5 changes: 1 addition & 4 deletions cpp/include/kvikio/file_handle.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,12 @@

#include <cstddef>
#include <cstdlib>
#include <iostream>
#include <numeric>
#include <optional>
#include <stdexcept>
#include <system_error>
#include <utility>

#include <kvikio/buffer.hpp>
#include <kvikio/cufile_config.hpp>
#include <kvikio/cufile/config.hpp>
#include <kvikio/defaults.hpp>
#include <kvikio/error.hpp>
#include <kvikio/parallel_operation.hpp>
Expand Down
65 changes: 48 additions & 17 deletions cpp/include/kvikio/shim/cufile.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@
#pragma once

#include <stdexcept>
#include <string>

#include <iostream>
#include <kvikio/shim/cufile_h_wrapper.hpp>
#include <kvikio/shim/utils.hpp>

Expand All @@ -38,8 +38,6 @@ class cuFileAPI {
decltype(cuFileWrite)* Write{nullptr};
decltype(cuFileBufRegister)* BufRegister{nullptr};
decltype(cuFileBufDeregister)* BufDeregister{nullptr};
decltype(cuFileDriverOpen)* DriverOpen{nullptr};
decltype(cuFileDriverClose)* DriverClose{nullptr};
decltype(cuFileDriverGetProperties)* DriverGetProperties{nullptr};
decltype(cuFileDriverSetPollMode)* DriverSetPollMode{nullptr};
decltype(cuFileDriverSetMaxCacheSize)* DriverSetMaxCacheSize{nullptr};
Expand All @@ -54,6 +52,12 @@ class cuFileAPI {
decltype(cuFileStreamRegister)* StreamRegister{nullptr};
decltype(cuFileStreamDeregister)* StreamDeregister{nullptr};

private:
// Don't call driver open and close directly, use `.driver_open()` and `.driver_close()`.
decltype(cuFileDriverOpen)* DriverOpen{nullptr};
decltype(cuFileDriverClose)* DriverClose{nullptr};

public:
bool stream_available = false;

private:
Expand Down Expand Up @@ -105,25 +109,25 @@ class cuFileAPI {
}
#endif

// cuFile is supposed to open and close the driver automatically but because of a bug in
// CUDA 11.8, it sometimes segfault. See <https://github.com/rapidsai/kvikio/issues/159>.
CUfileError_t const error = DriverOpen();
if (error.err != CU_FILE_SUCCESS) {
throw std::runtime_error(std::string{"cuFile error at: "} + __FILE__ + ":" +
KVIKIO_STRINGIFY(__LINE__) + ": " +
cufileop_status_error(error.err));
}
// cuFile is supposed to open and close the driver automatically but
// because of a bug in cuFile v1.4 (CUDA v11.8), it sometimes segfault:
madsbk marked this conversation as resolved.
Show resolved Hide resolved
// <https://github.com/rapidsai/kvikio/issues/159>.
// We use the stream API as an version indicator of cuFile, it was introduced
// in cuFile v1.7 (CUDA v12.2).
madsbk marked this conversation as resolved.
Show resolved Hide resolved
if (!stream_available) { driver_open(); }
}

// Notice, we have to close the driver at program exit (if we opened it) even though we are
// not allowed to call CUDA after main[1]. This is because, cuFile will segfault if the
// driver isn't closed on program exit i.e. we are doomed if we do, doomed if we don't, but
// this seems to be the lesser of two evils.
// [1] <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization>
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we explicitly register a std::atexit handler? Assuming that the CUDA runtime is doing the same, atexit has ordering guarantees so we'll be pushing our exit handler onto the stack to run before CUDA does its cleanup.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No. This is explicitly UB

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we know how CUDA does its teardown, if not via atexit handlers that have predictable ordering?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think std::atexit has ordering guarantees between multiple shared libraries :/

~cuFileAPI()
{
CUfileError_t const error = DriverClose();
if (error.err != CU_FILE_SUCCESS) {
std::cerr << "Unable to close GDS file driver: " << cufileop_status_error(error.err)
<< std::endl;
}
if (!stream_available) { driver_close(); }
madsbk marked this conversation as resolved.
Show resolved Hide resolved
}
#else
cuFileAPI() { throw std::runtime_error(CUFILE_ERRSTR(0)); }
cuFileAPI() { throw std::runtime_error("KvikIO not compiled with cuFile.h"); }
#endif

public:
Expand All @@ -137,6 +141,33 @@ class cuFileAPI {
static cuFileAPI _instance;
return _instance;
}

/**
* @brief Open the cuFile driver
*
* cuFile allows multiple calls to `cufileDriverOpen()`, only the first call opens
* the driver, but every call should have a matching call to `cufileDriverClose()`.
*/
void driver_open()
madsbk marked this conversation as resolved.
Show resolved Hide resolved
{
CUfileError_t const error = DriverOpen();
if (error.err != CU_FILE_SUCCESS) {
throw std::runtime_error(std::string{"Unable to open GDS file driver: "} +
cufileop_status_error(error.err));
}
}

/**
* @brief Close the cuFile driver
*/
void driver_close()
{
CUfileError_t const error = DriverClose();
if (error.err != CU_FILE_SUCCESS) {
throw std::runtime_error(std::string{"Unable to close GDS file driver: "} +
cufileop_status_error(error.err));
}
}
};

/**
Expand Down
2 changes: 1 addition & 1 deletion cpp/include/kvikio/stream.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@
#pragma once

#include <sys/types.h>
#include <algorithm>
#include <cstdlib>
#include <iostream>
#include <kvikio/error.hpp>
#include <kvikio/shim/cuda.hpp>
#include <kvikio/shim/cufile.hpp>
Expand Down
5 changes: 0 additions & 5 deletions python/kvikio/kvikio/__init__.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,10 @@
# Copyright (c) 2021-2024, NVIDIA CORPORATION. All rights reserved.
# See file LICENSE for terms.

from kvikio._lib import driver_properties # type: ignore
from kvikio._version import __git_commit__, __version__
from kvikio.cufile import CuFile
from kvikio.remote_file import RemoteFile, is_remote_file_available

# TODO: Wrap nicely, maybe as a dataclass?
DriverProperties = driver_properties.DriverProperties


__all__ = [
"__git_commit__",
"__version__",
Expand Down
2 changes: 1 addition & 1 deletion python/kvikio/kvikio/_lib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
# =============================================================================

# Set the list of Cython files to build, one .so per file
set(cython_modules arr.pyx buffer.pyx defaults.pyx driver_properties.pyx file_handle.pyx future.pyx
set(cython_modules arr.pyx buffer.pyx defaults.pyx cufile_driver.pyx file_handle.pyx future.pyx
libnvcomp.pyx libnvcomp_ll.pyx
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,20 @@
from libcpp cimport bool


cdef extern from "<kvikio/driver.hpp>" nogil:
cdef extern from "<kvikio/shim/cufile.hpp>" nogil:
cdef void cpp_driver_open "kvikio::cuFileAPI::instance().driver_open"() except +
cdef void cpp_driver_close "kvikio::cuFileAPI::instance().driver_close"() except +


def driver_open():
cpp_driver_open()


def driver_close():
cpp_driver_close()


cdef extern from "<kvikio/cufile/driver.hpp>" nogil:
cdef cppclass cpp_DriverProperties "kvikio::DriverProperties":
cpp_DriverProperties() except +
bool is_gds_available() except +
Expand Down
3 changes: 2 additions & 1 deletion python/kvikio/kvikio/benchmarks/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from dask.utils import format_bytes

import kvikio
import kvikio.cufile_driver
import kvikio.defaults


Expand All @@ -26,7 +27,7 @@ def drop_vm_cache() -> None:
def pprint_sys_info() -> None:
"""Pretty print system information"""

props = kvikio.DriverProperties()
props = kvikio.cufile_driver.DriverProperties()
try:
import pynvml

Expand Down
62 changes: 62 additions & 0 deletions python/kvikio/kvikio/cufile_driver.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
# See file LICENSE for terms.

import atexit

from kvikio._lib import cufile_driver # type: ignore

# TODO: Wrap nicely, maybe as a dataclass?
# <https://github.com/rapidsai/kvikio/issues/526>
DriverProperties = cufile_driver.DriverProperties
madsbk marked this conversation as resolved.
Show resolved Hide resolved


def driver_open() -> None:
"""Open the cuFile driver

cuFile accept multiple calls to `driver_open()`, only the first call
madsbk marked this conversation as resolved.
Show resolved Hide resolved
opens the driver, but every call must have a matching call to
`driver_close()`.

Normally, it is not required to open and close the cuFile driver since
it is done automatically.

Raises
------
RuntimeError
If cuFile isn't available.
"""
return cufile_driver.driver_open()


def driver_close() -> None:
"""Close the cuFile driver

cuFile accept multiple calls to `driver_open()`, only the first call
madsbk marked this conversation as resolved.
Show resolved Hide resolved
opens the driver, but every call must have a matching call to
`driver_close()`.

Raises
------
RuntimeError
If cuFile isn't available.
"""
return cufile_driver.driver_close()


def initialize() -> None:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

question: Whose job is it to call initialize?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The user for now. If we find a Python example that segfaults because of cuFile's termination issues, we should consider calling it in __init__.py.

"""Open the cuFile driver and close it again at module exit

Normally, it is not required to open and close the cuFile driver since
it is done automatically.

Notes
-----
Registers an atexit handler that calls :func:`driver_close`.

Raises
madsbk marked this conversation as resolved.
Show resolved Hide resolved
------
RuntimeError
If cuFile isn't available.
"""
driver_open()
atexit.register(driver_close)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm well if this can error it seems to invalidate my suggestion to use an atexit handler in the C++...

3 changes: 3 additions & 0 deletions python/kvikio/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -144,3 +144,6 @@ filterwarnings = [
"ignore:Jitify is performing a one-time only warm-up to populate the persistent cache",
"ignore::DeprecationWarning:botocore.*",
]
markers = [
"cufile: tests to skip if cuFile isn't available e.g. run with `pytest -m 'not cufile'`"
]
12 changes: 12 additions & 0 deletions python/kvikio/tests/test_cufile_driver.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
# See file LICENSE for terms.

import pytest

import kvikio.cufile_driver


@pytest.mark.cufile
def test_open_and_close():
kvikio.cufile_driver.driver_open()
kvikio.cufile_driver.driver_close()