Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FAISS with cuVS enabled in cuvs-bench #561

Draft
wants to merge 7 commits into
base: branch-25.02
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/bench/ann/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ function(ConfigureAnnBench)
target_link_libraries(
${BENCH_NAME}
PRIVATE ${ConfigureAnnBench_LINKS}
raft::raft
nlohmann_json::nlohmann_json
Threads::Threads
$<TARGET_NAME_IF_EXISTS:raft::raft_logger>
Expand Down
8 changes: 8 additions & 0 deletions cpp/bench/ann/src/common/benchmark.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -483,6 +483,14 @@ void register_search(std::shared_ptr<const dataset<T>> dataset,
->MeasureProcessCPUTime()
->UseRealTime();

if (metric_objective == Mode::kThroughput) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's not put algorithm-specific stuff in the common files. I agree with @achirkin about this one. If needed, you could propagate this down to the specific algorithms, but the common code should be agnostic of algorithm-specific details.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, @cjnolet for spotting this. @tarang-jain , could you please move the warning into the faiss-related header (perhaps into the set_search_params)? There, you should be able to query the cuvs::bench::benchmark_n_threads > 1 from common/util.hpp.

if (index.algo.find("faiss_gpu") != std::string::npos) {
log_warn(
"FAISS GPU does not work in throughput mode because the underlying "
"StandardGpuResources object is not thread-safe. This might give unexpected results");
}
b->ThreadRange(threads[0], threads[1]);
}
if (metric_objective == Mode::kThroughput) { b->ThreadRange(threads[0], threads[1]); }
}
}
Expand Down
30 changes: 29 additions & 1 deletion cpp/bench/ann/src/faiss/faiss_gpu_benchmark.cu
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,11 @@ void parse_build_param(const nlohmann::json& conf,
typename cuvs::bench::faiss_gpu_ivf_flat<T>::build_param& param)
{
parse_base_build_param<T>(conf, param);
if (conf.contains("use_cuvs")) {
param.use_cuvs = conf.at("use_cuvs");
} else {
param.use_cuvs = false;
}
}

template <typename T>
Expand All @@ -60,6 +65,16 @@ void parse_build_param(const nlohmann::json& conf,
} else {
param.use_float16 = false;
}
if (conf.contains("use_cuvs")) {
param.use_cuvs = conf.at("use_cuvs");
} else {
param.use_cuvs = false;
}
if (conf.contains("bitsPerCode")) {
param.bitsPerCode = conf.at("bitsPerCode");
} else {
param.bitsPerCode = 8;
}
}

template <typename T>
Expand Down Expand Up @@ -138,5 +153,18 @@ REGISTER_ALGO_INSTANCE(std::uint8_t);

#ifdef ANN_BENCH_BUILD_MAIN
#include "../common/benchmark.hpp"
int main(int argc, char** argv) { return cuvs::bench::run_main(argc, argv); }
// Entry point for the FAISS GPU benchmark binary.
// Installs an RMM pool memory resource as the current device resource for the
// lifetime of the benchmark run, then restores the previous resource before
// returning run_main's exit code.
// NOTE(review): the previous resource is restored by a plain call, not an RAII
// guard — if run_main propagates an exception, the restore is skipped. Confirm
// whether run_main can throw.
int main(int argc, char** argv)
{
rmm::mr::cuda_memory_resource cuda_mr;
// Construct a resource that uses a coalescing best-fit pool allocator
// and is initially sized to half of free device memory.
rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> pool_mr{
&cuda_mr, rmm::percent_of_free_device_memory(50)};
// Updates the current device resource pointer to `pool_mr`; the returned
// pointer is the previously-installed resource, saved so it can be put back.
auto old_mr = rmm::mr::set_current_device_resource(&pool_mr);
auto ret = cuvs::bench::run_main(argc, argv);
// Restores the current device resource pointer to its previous value
rmm::mr::set_current_device_resource(old_mr);
return ret;
}
Comment on lines +156 to +169
Copy link
Contributor

@achirkin achirkin Jan 10, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We've run into many issues in ann-bench in the past when doing a program-wide setting of the memory resource. This also changes the behavior depending on whether the ANN_BENCH_BUILD_MAIN flag is enabled. Therefore, it's best if these lines stay unmodified and the same for all algorithms.
Please move this resource setting to the place where the index wrapper is constructed/destructed in the algo header file.
Please move this resource setting to the place where the index wrapper is constructed/destructed in the algo header file.

#endif
99 changes: 82 additions & 17 deletions cpp/bench/ann/src/faiss/faiss_gpu_wrapper.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@

#include "../common/ann_types.hpp"
#include "../common/util.hpp"
#include <cuvs/neighbors/refine.hpp>
#include "../cuvs/cuvs_ann_bench_utils.h"

#include <faiss/IndexFlat.h>
#include <faiss/IndexIVFFlat.h>
Expand All @@ -32,6 +34,10 @@
#include <faiss/index_io.h>
#include <omp.h>

#include <raft/core/device_mdarray.hpp>
#include <raft/core/device_resources.hpp>
#include <raft/core/host_mdarray.hpp>

#include <cassert>
#include <iostream>
#include <memory>
Expand All @@ -41,7 +47,7 @@

namespace {

auto parse_metric_type(cuvs::bench::Metric metric) -> faiss::MetricType
auto parse_metric_faiss(cuvs::bench::Metric metric) -> faiss::MetricType
{
if (metric == cuvs::bench::Metric::kInnerProduct) {
return faiss::METRIC_INNER_PRODUCT;
Expand Down Expand Up @@ -93,7 +99,7 @@ class faiss_gpu : public algo<T>, public algo_gpu {
faiss_gpu(Metric metric, int dim, const build_param& param)
: algo<T>(metric, dim),
gpu_resource_{std::make_shared<faiss::gpu::StandardGpuResources>()},
metric_type_(parse_metric_type(metric)),
metric_type_(parse_metric_faiss(metric)),
nlist_{param.nlist},
training_sample_fraction_{1.0 / double(param.ratio)}
{
Expand Down Expand Up @@ -160,6 +166,7 @@ class faiss_gpu : public algo<T>, public algo_gpu {
int device_;
double training_sample_fraction_;
std::shared_ptr<faiss::SearchParameters> search_params_;
std::shared_ptr<faiss::IndexRefineSearchParameters> refine_search_params_{nullptr};
const T* dataset_;
float refine_ratio_ = 1.0;
};
Expand Down Expand Up @@ -199,19 +206,65 @@ template <typename T>
void faiss_gpu<T>::search(
const T* queries, int batch_size, int k, algo_base::index_type* neighbors, float* distances) const
{
// ASSERT(Mode::kLatency, "l2Knn: rowMajorIndex and rowMajorQuery should have same layout");
using IdxT = faiss::idx_t;
static_assert(sizeof(size_t) == sizeof(faiss::idx_t),
"sizes of size_t and faiss::idx_t are different");

if (this->refine_ratio_ > 1.0) {
// TODO(snanditale): FAISS changed their search APIs to accept the search parameters as a struct
// object but their refine API doesn't allow the struct to be passed in. Once this is fixed, we
// need to re-enable refinement below
// index_refine_->search(batch_size, queries, k, distances,
// reinterpret_cast<faiss::idx_t*>(neighbors), this->search_params_.get()); Related FAISS issue:
// https://github.com/facebookresearch/faiss/issues/3118
throw std::runtime_error(
"FAISS doesn't support refinement in their new APIs so this feature is disabled in the "
"benchmarks for the time being.");
if (refine_ratio_ > 1.0) {
if (raft::get_device_for_address(queries) >= 0) {
uint32_t k0 = static_cast<uint32_t>(refine_ratio_ * k);
auto distances_tmp = raft::make_device_matrix<float, IdxT>(
gpu_resource_->getRaftHandle(device_), batch_size, k0);
auto candidates =
raft::make_device_matrix<IdxT, IdxT>(gpu_resource_->getRaftHandle(device_), batch_size, k0);
index_->search(batch_size,
queries,
k0,
distances_tmp.data_handle(),
candidates.data_handle(),
this->search_params_.get());

auto queries_host = raft::make_host_matrix<T, IdxT>(batch_size, index_->d);
auto candidates_host = raft::make_host_matrix<IdxT, IdxT>(batch_size, k0);
auto neighbors_host = raft::make_host_matrix<IdxT, IdxT>(batch_size, k);
auto distances_host = raft::make_host_matrix<float, IdxT>(batch_size, k);
auto dataset_v = raft::make_host_matrix_view<const T, faiss::idx_t>(
this->dataset_, index_->ntotal, index_->d);

raft::device_resources handle_ = gpu_resource_->getRaftHandle(device_);

raft::copy(queries_host.data_handle(), queries, queries_host.size(), handle_.get_stream());
raft::copy(candidates_host.data_handle(),
candidates.data_handle(),
candidates_host.size(),
handle_.get_stream());

// wait for the queries to copy to host in 'stream`
handle_.sync_stream();

cuvs::neighbors::refine(handle_,
dataset_v,
queries_host.view(),
candidates_host.view(),
neighbors_host.view(),
distances_host.view(),
parse_metric_type(this->metric_));

raft::copy(neighbors,
neighbors_host.data_handle(),
neighbors_host.size(),
handle_.get_stream());
raft::copy(
distances, distances_host.data_handle(), distances_host.size(), handle_.get_stream());
} else {
index_refine_->search(batch_size,
queries,
k,
distances,
reinterpret_cast<faiss::idx_t*>(neighbors),
this->refine_search_params_.get());
}
} else {
index_->search(batch_size,
queries,
Expand Down Expand Up @@ -253,15 +306,18 @@ void faiss_gpu<T>::load_(const std::string& file)
template <typename T>
class faiss_gpu_ivf_flat : public faiss_gpu<T> {
public:
using typename faiss_gpu<T>::build_param;
struct build_param : public faiss_gpu<T>::build_param {
bool use_cuvs;
};
using typename faiss_gpu<T>::search_param_base;

faiss_gpu_ivf_flat(Metric metric, int dim, const build_param& param)
: faiss_gpu<T>(metric, dim, param)
{
faiss::gpu::GpuIndexIVFFlatConfig config;
config.device = this->device_;
this->index_ = std::make_shared<faiss::gpu::GpuIndexIVFFlat>(
config.device = this->device_;
config.use_cuvs = param.use_cuvs;
this->index_ = std::make_shared<faiss::gpu::GpuIndexIVFFlat>(
this->gpu_resource_.get(), dim, param.nlist, this->metric_type_, config);
}

Expand Down Expand Up @@ -298,6 +354,8 @@ class faiss_gpu_ivfpq : public faiss_gpu<T> {
int m;
bool use_float16;
bool use_precomputed;
bool use_cuvs;
int bitsPerCode;
};
using typename faiss_gpu<T>::search_param_base;

Expand All @@ -307,14 +365,16 @@ class faiss_gpu_ivfpq : public faiss_gpu<T> {
faiss::gpu::GpuIndexIVFPQConfig config;
config.useFloat16LookupTables = param.use_float16;
config.usePrecomputedTables = param.use_precomputed;
config.device = this->device_;
config.use_cuvs = param.use_cuvs;
if (param.use_cuvs) { config.interleavedLayout = param.use_cuvs; }
config.device = this->device_;

this->index_ =
std::make_shared<faiss::gpu::GpuIndexIVFPQ>(this->gpu_resource_.get(),
dim,
param.nlist,
param.m,
8, // FAISS only supports bitsPerCode=8
param.bitsPerCode,
this->metric_type_,
config);
}
Expand All @@ -334,6 +394,11 @@ class faiss_gpu_ivfpq : public faiss_gpu<T> {
this->index_refine_ =
std::make_shared<faiss::IndexRefineFlat>(this->index_.get(), this->dataset_);
this->index_refine_.get()->k_factor = sp.refine_ratio;
faiss::IndexRefineSearchParameters faiss_refine_search_params;
faiss_refine_search_params.k_factor = this->index_refine_.get()->k_factor;
faiss_refine_search_params.base_index_params = this->search_params_.get();
this->refine_search_params_ =
std::make_unique<faiss::IndexRefineSearchParameters>(faiss_refine_search_params);
}
}

Expand Down
1 change: 1 addition & 0 deletions cpp/cmake/thirdparty/get_faiss.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ function(find_and_configure_faiss)
EXCLUDE_FROM_ALL ${exclude}
OPTIONS
"FAISS_ENABLE_GPU ${PKG_ENABLE_GPU}"
"FAISS_ENABLE_CUVS ${PKG_ENABLE_GPU}"
"FAISS_ENABLE_PYTHON OFF"
"FAISS_OPT_LEVEL ${CUVS_FAISS_OPT_LEVEL}"
"FAISS_USE_CUDA_TOOLKIT_STATIC ${CUDA_STATIC_RUNTIME}"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,9 +72,7 @@ def faiss_gpu_ivf_pq_build(params, dims):
ret = params["M"] <= dims and dims % params["M"] == 0
if "use_cuvs" in params and params["use_cuvs"]:
return ret
pq_bits = 8
if "bitsPerCode" in params:
pq_bits = params["bitsPerCode"]
pq_bits = params.get("bitsPerCode", 8)
lookup_table_size = 4
if "useFloat16" in params and params["useFloat16"]:
lookup_table_size = 2
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,17 @@ groups:
nlist: [2048]
ratio: [10]
useFloat16: [False, True]
use_raft: [False]
use_cuvs: [False]
search:
nprobe: [1, 5, 10, 50, 100, 200]
refine_ratio: [1]
groups:
baseraft:
basecuvs:
build:
nlist: [2048]
ratio: [10]
useFloat16: [False, True]
use_raft: [True]
use_cuvs: [True]
search:
nprobe: [1, 5, 10, 50, 100, 200]
refine_ratio: [1]
57 changes: 28 additions & 29 deletions python/cuvs_bench/cuvs_bench/config/algos/faiss_gpu_ivf_pq.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,71 +6,70 @@ groups:
base:
build:
nlist: [1024, 2048, 4096, 8192]
M: [64, 32, 16]
ratio: [10]
M: [64, 96]
ratio: [4]
usePrecomputed: [False, True]
useFloat16: [False, True]
use_raft: [False]
use_cuvs: [False]
bitsPerCode: [8]
search:
nprobe: [1, 5, 10, 50, 100, 200]
nprobe: [10, 50, 100, 200]
refine_ratio: [1, 2, 4]
baseraft:
basecuvs:
build:
nlist: [1024, 2048, 4096, 8192]
M: [64, 32, 16]
ratio: [10]
M: [96, 192, 384]
ratio: [4]
usePrecomputed: [False]
useFloat16: [False, True]
use_raft: [True]
bitsPerCode: [8, 6, 5, 4]
use_cuvs: [True]
bitsPerCode: [8]
search:
nprobe: [1, 5, 10, 50, 100, 200]
nprobe: [10, 50, 100, 200]
refine_ratio: [1, 2, 4]
large:
build:
nlist: [8192, 16384, 32768, 65536]
M: [48, 32, 16]
nlist: [16384, 32768, 65536]
M: [64, 96]
ratio: [4]
usePrecomputed: [False, True]
useFloat16: [False, True]
use_raft: [False]
useFloat16: [True]
use_cuvs: [False]
bitsPerCode: [8]
search:
nprobe: [20, 30, 40, 50, 100, 200, 500, 1000]
refine_ratio: [1, 2, 4]
largeraft:
largecuvs:
build:
nlist: [8192, 16384, 32768, 65536]
M: [48, 32, 16]
nlist: [16384, 32768, 65536]
M: [96, 192, 384]
ratio: [4]
usePrecomputed: [False]
useFloat16: [False, True]
use_raft: [True]
use_cuvs: [True]
bitsPerCode: [8, 6, 5, 4]
search:
nprobe: [20, 30, 40, 50, 100, 200, 500, 1000]
refine_ratio: [1, 2, 4]
100M:
build:
nlist: [50000]
M: [48]
ratio: [10]
nlist: [100000]
M: [64, 96]
ratio: [4]
usePrecomputed: [False, True]
useFloat16: [False, True]
use_raft: [False]
useFloat16: [True]
use_cuvs: [False]
bitsPerCode: [8]
search:
nprobe: [20, 30, 40, 50, 100, 200, 500, 1000]
refine_ratio: [1]
100Mraft:
100Mcuvs:
build:
nlist: [50000]
M: [48]
ratio: [10]
usePrecomputed: [False, True]
nlist: [100000]
M: [192, 384]
ratio: [4]
useFloat16: [False, True]
use_raft: [True]
use_cuvs: [True]
bitsPerCode: [8, 6, 5, 4]
search:
nprobe: [20, 30, 40, 50, 100, 200, 500, 1000]
Expand Down
Loading
Loading