From 90fa1b1e6cbe83f9ce1618cdeba1efbadd9d99b3 Mon Sep 17 00:00:00 2001 From: Alexandr-Solovev Date: Fri, 27 Sep 2024 06:44:51 -0700 Subject: [PATCH 1/5] init rf refactoring --- .../backend/gpu/train_kernel_hist_impl.hpp | 30 +- .../gpu/train_kernel_hist_impl_dpc.cpp | 590 +++++++++++------- .../dal/algo/decision_forest/test/spmd.cpp | 34 +- .../algo/louvain/backend/cpu/louvain_data.hpp | 2 +- cpp/oneapi/dal/backend/primitives/rng/rng.hpp | 189 ++++++ .../dal/backend/primitives/rng/rng_dpc.cpp | 189 ++++++ .../dal/backend/primitives/rng/rng_engine.hpp | 101 --- .../primitives/rng/rng_engine_collection.hpp | 77 +-- .../backend/primitives/rng/test/rng_dpc.cpp | 291 +++++++++ dev/bazel/toolchains/cc_toolchain_lnx.bzl | 6 + 10 files changed, 1104 insertions(+), 405 deletions(-) create mode 100644 cpp/oneapi/dal/backend/primitives/rng/rng.hpp create mode 100644 cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp delete mode 100644 cpp/oneapi/dal/backend/primitives/rng/rng_engine.hpp create mode 100644 cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl.hpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl.hpp index 6d1c4362309..036c41d6a9c 100644 --- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl.hpp +++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl.hpp @@ -20,7 +20,7 @@ #include "oneapi/dal/backend/primitives/ndarray.hpp" #include "oneapi/dal/backend/primitives/utils.hpp" #include "oneapi/dal/algo/decision_forest/train_types.hpp" - +#include "oneapi/dal/backend/primitives/rng/rng.hpp" #include "oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp" #include "oneapi/dal/algo/decision_forest/backend/gpu/train_misc_structs.hpp" @@ -50,7 +50,7 @@ class train_kernel_hist_impl { using model_manager_t = train_model_manager; using train_context_t = train_context; using imp_data_t = impurity_data; - using rng_engine_t = pr::engine; + using rng_engine_t = pr::engine; using rng_engine_list_t = std::vector; using msg = dal::detail::error_messages; using comm_t = bk::communicator; @@ -62,7 +62,7 @@ class train_kernel_hist_impl { train_kernel_hist_impl(const bk::context_gpu& ctx) : queue_(ctx.get_queue()), comm_(ctx.get_communicator()), - train_service_kernels_(queue_) {} + train_service_kernels_(ctx.get_queue()) {} ~train_kernel_hist_impl() = default; result_t operator()(const descriptor_t& desc, @@ -83,13 +83,11 @@ class train_kernel_hist_impl { pr::ndarray& node_list, pr::ndarray& tree_order_level, Index engine_offset, - Index node_count); + Index node_count, + const bk::event_vector& deps = {}); void validate_input(const descriptor_t& desc, const table& data, const table& labels) const; - Index get_row_total_count(bool distr_mode, Index row_count); - Index get_global_row_offset(bool distr_mode, Index row_count); - /// Initializes `ctx` training context structure based on data and /// descriptor class. Filling and calculating all parameters in context, /// for example, tree count, required memory size, calculating indexed features, etc. @@ -149,6 +147,24 @@ class train_kernel_hist_impl { Index node_count, const bk::event_vector& deps = {}); + sycl::event compute_initial_imp_for_node_list_regression( + const train_context_t& ctx, + const pr::ndarray& node_list, + const pr::ndarray& local_sum_hist, + const pr::ndarray& local_sum2cent_hist, + imp_data_t& imp_data_list, + Index node_count, + const bk::event_vector& deps = {}); + + sycl::event compute_local_sum_histogram(const train_context_t& ctx, + const pr::ndarray& response, + const pr::ndarray& tree_order, + const pr::ndarray& node_list, + pr::ndarray& local_sum_hist, + pr::ndarray& local_sum2cent_hist, + Index node_count, + const bk::event_vector& deps = {}); + /// Computes initial histograms for each node to compute impurity. /// /// @param[in] ctx a training context structure for a GPU backend diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp index 9fac38d25b0..a7294554a9f 100644 --- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp +++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp @@ -60,6 +60,7 @@ template void train_kernel_hist_impl::validate_input(const descriptor_t& desc, const table& data, const table& labels) const { + ONEDAL_PROFILER_TASK(validate_input, queue_); if (data.get_row_count() > de::limits::max()) { throw domain_error(msg::invalid_range_of_rows()); } @@ -91,47 +92,16 @@ void train_kernel_hist_impl::validate_input(const descr } } -template -Index train_kernel_hist_impl::get_row_total_count(bool distr_mode, - Index row_count) { - Index row_total_count = row_count; - - if (distr_mode) { - ONEDAL_PROFILER_TASK(allreduce_row_total_count); - comm_.allreduce(row_total_count).wait(); - } - - return row_total_count; -} - -template -Index train_kernel_hist_impl::get_global_row_offset(bool distr_mode, - Index row_count) { - Index global_row_offset = 0; - - if (distr_mode) { - auto row_count_list_host = pr::ndarray::empty({ comm_.get_rank_count() }); - Index* row_count_list_host_ptr = row_count_list_host.get_mutable_data(); - { - ONEDAL_PROFILER_TASK(allgather_row_count); - comm_.allgather(row_count, row_count_list_host.flatten()).wait(); - } - - for (std::int64_t i = 0; i < comm_.get_rank(); ++i) { - global_row_offset += row_count_list_host_ptr[i]; - } - } - - return global_row_offset; -} - template void train_kernel_hist_impl::init_params(train_context_t& ctx, const descriptor_t& desc, const table& data, const table& responses, const table& weights) { - ctx.distr_mode_ = (comm_.get_rank_count() > 1); + ONEDAL_PROFILER_TASK(init_params, queue_); + std::int64_t rank_count = comm_.get_rank_count(); + ctx.distr_mode_ = (rank_count > 1); + auto current_rank = comm_.get_rank(); ctx.use_private_mem_buf_ = true; @@ -143,7 +113,11 @@ void train_kernel_hist_impl::init_params(train_context_ } ctx.row_count_ = de::integral_cast(data.get_row_count()); - ctx.row_total_count_ = get_row_total_count(ctx.distr_mode_, ctx.row_count_); + ctx.row_total_count_ = ctx.row_count_; + { + ONEDAL_PROFILER_TASK(allreduce_total_row_count_exactly_it, queue_); + comm_.allreduce(ctx.row_total_count_, spmd::reduce_op::sum).wait(); + } ctx.column_count_ = de::integral_cast(data.get_column_count()); @@ -154,7 +128,18 @@ void train_kernel_hist_impl::init_params(train_context_ ctx.selected_row_total_count_ = desc.get_observations_per_tree_fraction() * ctx.row_total_count_; - ctx.global_row_offset_ = get_global_row_offset(ctx.distr_mode_, ctx.row_count_); + auto global_rank_offsets = array::zeros(rank_count); + global_rank_offsets.get_mutable_data()[current_rank] = ctx.row_count_; + { + ONEDAL_PROFILER_TASK(allreduce_recv_counts, queue_); + comm_.allreduce(global_rank_offsets, spmd::reduce_op::sum).wait(); + } + + ctx.global_row_offset_ = 0; + for (std::int64_t i = 0; i < current_rank; i++) { + ONEDAL_ASSERT(global_rank_offsets.get_data()[i] >= 0); + ctx.global_row_offset_ += global_rank_offsets.get_data()[i]; + } ctx.tree_count_ = de::integral_cast(desc.get_tree_count()); @@ -211,7 +196,7 @@ void train_kernel_hist_impl::init_params(train_context_ bin_borders_host_[clmn_idx] = ind_ftrs.get_bin_borders(clmn_idx).to_host(queue_); } - data_host_ = pr::table2ndarray_1d(queue_, data, alloc::device).to_host(queue_); + data_host_ = pr::table2ndarray_1d(queue_, data, alloc::host); response_nd_ = pr::table2ndarray_1d(queue_, responses, alloc::device); @@ -332,6 +317,7 @@ void train_kernel_hist_impl::init_params(train_context_ template void train_kernel_hist_impl::allocate_buffers(const train_context_t& ctx) { + ONEDAL_PROFILER_TASK(allocate_buffers, queue_); de::check_mul_overflow(ctx.selected_row_total_count_, ctx.tree_in_block_); // main tree order and auxilliary one are used for partitioning @@ -372,7 +358,8 @@ sycl::event train_kernel_hist_impl::gen_initial_tree_or pr::ndarray& node_list_host, pr::ndarray& tree_order_level, Index engine_offset, - Index node_count) { + Index node_count, + const bk::event_vector& deps) { ONEDAL_PROFILER_TASK(gen_initial_tree_order, queue_); ONEDAL_ASSERT(node_list_host.get_count() == node_count * impl_const_t::node_prop_count_); @@ -382,50 +369,74 @@ sycl::event train_kernel_hist_impl::gen_initial_tree_or sycl::event last_event; if (ctx.bootstrap_) { - auto selected_row_global_host = - pr::ndarray::empty({ ctx.selected_row_total_count_ * ctx.tree_in_block_ }); - pr::ndarray selected_row_host; + auto selected_row_global = + pr::ndarray::empty(queue_, + { ctx.selected_row_total_count_ * ctx.tree_in_block_ }, + alloc::device); + pr::ndarray selected_row; if (ctx.distr_mode_) { - selected_row_host = pr::ndarray::empty( - { ctx.selected_row_total_count_ * ctx.tree_in_block_ }); + selected_row = + pr::ndarray::empty(queue_, + { ctx.selected_row_total_count_ * ctx.tree_in_block_ }, + alloc::device); } - Index* const selected_row_global_ptr = selected_row_global_host.get_mutable_data(); - Index* const selected_row_ptr = - ctx.distr_mode_ ? selected_row_host.get_mutable_data() : nullptr; + Index* const selected_row_global_ptr = selected_row_global.get_mutable_data(); + Index* const selected_row_ptr = ctx.distr_mode_ ? selected_row.get_mutable_data() : nullptr; Index* const node_list_ptr = node_list_host.get_mutable_data(); - + pr::rng rn_gen; for (Index node_idx = 0; node_idx < node_count; ++node_idx) { - pr::rng rn_gen; Index* gen_row_idx_global_ptr = selected_row_global_ptr + ctx.selected_row_total_count_ * node_idx; - rn_gen.uniform(ctx.selected_row_total_count_, - gen_row_idx_global_ptr, - rng_engine_list[engine_offset + node_idx].get_state(), - 0, - ctx.row_total_count_); + rn_gen.uniform_gpu_internal(queue_, + ctx.selected_row_total_count_, + gen_row_idx_global_ptr, + rng_engine_list[engine_offset + node_idx], + 0, + ctx.row_total_count_, + { deps }); if (ctx.distr_mode_) { Index* node_ptr = node_list_ptr + node_idx * impl_const_t::node_prop_count_; - Index* src = gen_row_idx_global_ptr; Index* const dst = selected_row_ptr + ctx.selected_row_total_count_ * node_idx; - Index row_idx = 0; - for (Index i = 0; i < ctx.selected_row_total_count_; ++i) { - dst[i] = 0; - if (src[i] >= ctx.global_row_offset_ && - src[i] < (ctx.global_row_offset_ + ctx.row_count_)) { - dst[row_idx++] = src[i] - ctx.global_row_offset_; - } - } - node_ptr[impl_const_t::ind_lrc] = row_idx; + auto [row_index, row_index_event] = + pr::ndarray::full(queue_, 1, 0, alloc::device); + row_index_event.wait_and_throw(); + Index* row_idx_ptr = row_index.get_mutable_data(); + const sycl::nd_range<1> nd_range = + bk::make_multiple_nd_range_1d(ctx.selected_row_total_count_, 1); + auto event_ = queue_.submit([&](sycl::handler& cgh) { + cgh.depends_on({ last_event }); + cgh.parallel_for(nd_range, [=](sycl::nd_item<1> id) { + auto idx = id.get_global_id(0); + dst[idx] = 0; + if (gen_row_idx_global_ptr[idx] >= ctx.global_row_offset_ && + gen_row_idx_global_ptr[idx] < + (ctx.global_row_offset_ + ctx.row_count_)) { + sycl::atomic_ref< + Index, + sycl::memory_order::relaxed, + sycl::memory_scope::device, + sycl::access::address_space::ext_intel_global_device_space> + counter_atomic(row_idx_ptr[0]); + auto cur_idx = counter_atomic.fetch_add(1); + dst[cur_idx] = gen_row_idx_global_ptr[idx] - ctx.global_row_offset_; + } + }); + }); + auto set_event = queue_.submit([&](sycl::handler& cgh) { + cgh.depends_on(event_); + cgh.parallel_for(sycl::range<1>{ std::size_t(1) }, [=](sycl::id<1> idx) { + node_ptr[impl_const_t::ind_lrc] = row_idx_ptr[0]; + }); + }); + set_event.wait_and_throw(); } } - last_event = ctx.distr_mode_ - ? tree_order_level.assign_from_host(queue_, selected_row_host) - : tree_order_level.assign_from_host(queue_, selected_row_global_host); + ctx.distr_mode_ ? tree_order_level = selected_row : tree_order_level = selected_row_global; } else { Index row_count = ctx.selected_row_count_; @@ -441,21 +452,22 @@ sycl::event train_kernel_hist_impl::gen_initial_tree_or // i.e. row_count can be eq 0 Index* node_list_ptr = node_list_host.get_mutable_data(); - - for (Index node_idx = 0; node_idx < node_count; ++node_idx) { - Index* node_ptr = node_list_ptr + node_idx * impl_const_t::node_prop_count_; - node_ptr[impl_const_t::ind_lrc] = row_count; + auto set_event = queue_.submit([&](sycl::handler& cgh) { + cgh.parallel_for(sycl::range<1>{ std::size_t(node_count) }, [=](sycl::id<1> idx) { + Index* node_ptr = node_list_ptr + idx * impl_const_t::node_prop_count_; + node_ptr[impl_const_t::ind_lrc] = row_count; + }); + }); + set_event.wait_and_throw(); + + if (row_count > 0) { + last_event = train_service_kernels_.initialize_tree_order(tree_order_level, + node_count, + row_count, + stride); } } - - if (row_count > 0) { - last_event = train_service_kernels_.initialize_tree_order(tree_order_level, - node_count, - row_count, - stride); - } } - return last_event; } @@ -479,13 +491,12 @@ train_kernel_hist_impl::gen_feature_list( { node_count * ctx.selected_ftr_count_ }, alloc::device); - auto selected_features_host_ptr = selected_features_host.get_mutable_data(); - auto node_vs_tree_map_list_host = node_vs_tree_map_list.to_host(queue_); - pr::rng rn_gen; - auto tree_map_ptr = node_vs_tree_map_list_host.get_mutable_data(); if (ctx.selected_ftr_count_ != ctx.column_count_) { + auto tree_map_ptr = node_vs_tree_map_list_host.get_mutable_data(); + pr::rng rn_gen; + auto selected_features_host_ptr = selected_features_host.get_mutable_data(); for (Index node = 0; node < node_count; ++node) { rn_gen.uniform_without_replacement( ctx.selected_ftr_count_, @@ -495,20 +506,28 @@ train_kernel_hist_impl::gen_feature_list( 0, ctx.column_count_); } + auto event = selected_features_com.assign_from_host(queue_, + selected_features_host_ptr, + selected_features_com.get_count()); + + return std::tuple{ selected_features_com, event }; } else { + sycl::event fill_event; for (Index node = 0; node < node_count; ++node) { - for (Index i = 0; i < ctx.selected_ftr_count_; ++i) { - selected_features_host_ptr[node * ctx.selected_ftr_count_ + i] = i; - } + auto selected_features_host_ptr = selected_features_com.get_mutable_data(); + + fill_event = queue_.submit([&](sycl::handler& cgh) { + cgh.parallel_for( + sycl::range<1>{ std::size_t(ctx.selected_ftr_count_) }, + [=](sycl::id<1> idx) { + selected_features_host_ptr[node * ctx.selected_ftr_count_ + idx] = idx; + }); + }); } - } - - auto event = selected_features_com.assign_from_host(queue_, - selected_features_host_ptr, - selected_features_com.get_count()); - return std::tuple{ selected_features_com, event }; + return std::tuple{ selected_features_com, fill_event }; + } } template @@ -537,11 +556,11 @@ train_kernel_hist_impl::gen_random_thresholds( // Generate random bins for selected features for (Index node = 0; node < node_count; ++node) { - rn_gen.uniform(ctx.selected_ftr_count_, - random_bins_host_ptr + node * ctx.selected_ftr_count_, - rng_engine_list[tree_map_ptr[node]].get_state(), - 0.0f, - 1.0f); + rn_gen.uniform_cpu(ctx.selected_ftr_count_, + random_bins_host_ptr + node * ctx.selected_ftr_count_, + rng_engine_list[tree_map_ptr[node]], + 0.0f, + 1.0f); } auto event_rnd_generate = random_bins_com.assign_from_host(queue_, random_bins_host_ptr, random_bins_com.get_count()); @@ -758,54 +777,50 @@ sycl::event train_kernel_hist_impl::compute_initial_imp pr::ndarray& node_list, Index node_count, const bk::event_vector& deps) { + ONEDAL_PROFILER_TASK(compute_initial_imp_for_node_list, queue_); ONEDAL_ASSERT(imp_data_list.imp_list_.get_count() == node_count * impl_const_t::node_imp_prop_count_); - if constexpr (std::is_same_v) { - ONEDAL_ASSERT(imp_data_list.class_hist_list_.get_count() == node_count * ctx.class_count_); - } ONEDAL_ASSERT(node_list.get_count() == node_count * impl_const_t::node_prop_count_); + sycl::event event_; if constexpr (std::is_same_v) { - auto class_hist_list_host = imp_data_list.class_hist_list_.to_host(queue_, deps); - auto imp_list_host = imp_data_list.imp_list_.to_host(queue_); - auto node_list_host = node_list.to_host(queue_); + const Index* class_hist_list_ptr = imp_data_list.class_hist_list_.get_data(); + Float* imp_list_ptr = imp_data_list.imp_list_.get_mutable_data(); + Index* node_list_ptr = node_list.get_mutable_data(); + + // Launch kernel to compute impurity and winning class for each node + auto event_ = queue_.submit([&](sycl::handler& cgh) { + cgh.depends_on(deps); + cgh.parallel_for(sycl::range<1>(node_count), [=](sycl::id<1> idx) { + Index node_idx = idx; + const Index* node_histogram_ptr = class_hist_list_ptr + node_idx * ctx.class_count_; + Float* node_imp_ptr = imp_list_ptr + node_idx * impl_const_t::node_imp_prop_count_; + Index* node_ptr = node_list_ptr + node_idx * impl_const_t::node_prop_count_; - const Index* class_hist_list_host_ptr = class_hist_list_host.get_data(); - Float* imp_list_host_ptr = imp_list_host.get_mutable_data(); - Index* node_list_host_ptr = node_list_host.get_mutable_data(); + Index row_count = node_ptr[impl_const_t::ind_grc]; - for (Index node_idx = 0; node_idx < node_count; ++node_idx) { - const Index* node_histogram_ptr = - class_hist_list_host_ptr + node_idx * ctx.class_count_; - Float* node_imp_ptr = imp_list_host_ptr + node_idx * impl_const_t::node_imp_prop_count_; - Index* node_ptr = node_list_host_ptr + node_idx * impl_const_t::node_prop_count_; - - Index row_count = node_ptr[impl_const_t::ind_grc]; - - Float imp = Float(1); - Float div = Float(1) / (Float(row_count) * row_count); - Index max_cls_count = 0; - Index win_cls = 0; - Index cls_count = 0; - - for (Index cls_idx = 0; cls_idx < ctx.class_count_; ++cls_idx) { - cls_count = node_histogram_ptr[cls_idx]; - imp -= Float(cls_count) * (cls_count)*div; - - if (cls_count > max_cls_count) { - max_cls_count = cls_count; - win_cls = cls_idx; - } - } + Float imp = Float(1); + Float div = Float(1) / (Float(row_count) * row_count); + Index max_cls_count = 0; + Index win_cls = 0; + Index cls_count = 0; - node_ptr[impl_const_t::ind_win] = win_cls; - node_imp_ptr[0] = sycl::max(imp, Float(0)); - } - imp_data_list.imp_list_.assign_from_host(queue_, imp_list_host).wait_and_throw(); - node_list.assign_from_host(queue_, node_list_host).wait_and_throw(); + for (Index cls_idx = 0; cls_idx < ctx.class_count_; ++cls_idx) { + cls_count = node_histogram_ptr[cls_idx]; + imp -= cls_count * cls_count * div; + + if (cls_count > max_cls_count) { + max_cls_count = cls_count; + win_cls = cls_idx; + } + } + node_ptr[impl_const_t::ind_win] = win_cls; + node_imp_ptr[0] = sycl::max(imp, Float(0)); + }); + }); } - return sycl::event{}; + return event_; } template @@ -994,6 +1009,136 @@ Float* local_buf_ptr = local_buf.get_pointer().get(); return event; } +template +sycl::event train_kernel_hist_impl::compute_local_sum_histogram( + const train_context_t& ctx, + const pr::ndarray& response, + const pr::ndarray& tree_order, + const pr::ndarray& node_list, + pr::ndarray& local_sum_hist, + pr::ndarray& local_sum2cent_hist, + Index node_count, + const bk::event_vector& deps) { + ONEDAL_ASSERT(response.get_count() == ctx.row_count_); + ONEDAL_ASSERT(tree_order.get_count() == ctx.tree_in_block_ * ctx.selected_row_total_count_); + ONEDAL_ASSERT(node_list.get_count() == node_count * impl_const_t::node_prop_count_); + ONEDAL_ASSERT(local_sum_hist.get_count() == node_count); + ONEDAL_ASSERT(local_sum2cent_hist.get_count() == node_count); + + auto fill_event1 = local_sum_hist.fill(queue_, 0, deps); + auto fill_event2 = local_sum2cent_hist.fill(queue_, 0, deps); + + fill_event1.wait_and_throw(); + fill_event2.wait_and_throw(); + + const Float* response_ptr = response.get_data(); + const Index* tree_order_ptr = tree_order.get_data(); + const Index* node_list_ptr = node_list.get_data(); + Float* local_sum_hist_ptr = local_sum_hist.get_mutable_data(); + Float* local_sum2cent_hist_ptr = local_sum2cent_hist.get_mutable_data(); + + const Index node_prop_count = impl_const_t::node_prop_count_; + + auto local_size = ctx.preferable_group_size_; + const sycl::nd_range<2> nd_range = + bk::make_multiple_nd_range_2d({ local_size, node_count }, { local_size, 1 }); + + auto event = queue_.submit([&](sycl::handler& cgh) { + cgh.depends_on(deps); + local_accessor_rw_t local_sum_buf(local_size, cgh); + local_accessor_rw_t local_sum2cent_buf(local_size, cgh); + cgh.parallel_for(nd_range, [=](sycl::nd_item<2> item) { + const Index node_id = item.get_global_id()[1]; + const Index local_id = item.get_local_id()[0]; + const Index local_size = item.get_local_range()[0]; + + const Index* node_ptr = node_list_ptr + node_id * node_prop_count; + + const Index row_offset = node_ptr[impl_const_t::ind_ofs]; + const Index row_count = node_ptr[impl_const_t::ind_lrc]; + + const Index* node_tree_order_ptr = &tree_order_ptr[row_offset]; +#if __SYCL_COMPILER_VERSION >= 20230828 + Float* local_sum_buf_ptr = + local_sum_buf.template get_multi_ptr().get_raw(); + Float* local_sum2cent_buf_ptr = + local_sum2cent_buf.template get_multi_ptr().get_raw(); +#else + Float* local_sum_buf_ptr = local_sum_buf.get_pointer().get(); + Float* local_sum2cent_buf_ptr = local_sum2cent_buf.get_pointer().get(); +#endif + Float local_sum = Float(0); + Float local_sum2cent = Float(0); + for (Index i = local_id; i < row_count; i += local_size) { + Float value = response_ptr[node_tree_order_ptr[i]]; + local_sum += value; + local_sum2cent += value * value; + } + + local_sum_buf_ptr[local_id] = local_sum; + local_sum2cent_buf_ptr[local_id] = local_sum2cent; + + for (Index offset = local_size / 2; offset > 0; offset >>= 1) { + item.barrier(sycl::access::fence_space::local_space); + if (local_id < offset) { + local_sum_buf_ptr[local_id] += local_sum_buf_ptr[local_id + offset]; + local_sum2cent_buf_ptr[local_id] += local_sum2cent_buf_ptr[local_id + offset]; + } + } + + if (local_id == 0) { + local_sum_hist_ptr[node_id] = local_sum_buf_ptr[local_id]; + local_sum2cent_hist_ptr[node_id] = local_sum2cent_buf_ptr[local_id]; + } + }); + }); + + event.wait_and_throw(); + return event; +} + +template +sycl::event +train_kernel_hist_impl::compute_initial_imp_for_node_list_regression( + const train_context_t& ctx, + const pr::ndarray& node_list, + const pr::ndarray& local_sum_hist, + const pr::ndarray& local_sum2cent_hist, + imp_data_t& imp_data_list, + Index node_count, + const bk::event_vector& deps) { + ONEDAL_ASSERT(node_list.get_count() == node_count * impl_const_t::node_prop_count_); + ONEDAL_ASSERT(local_sum_hist.get_count() == node_count); + ONEDAL_ASSERT(local_sum2cent_hist.get_count() == node_count); + ONEDAL_ASSERT(imp_data_list.imp_list_.get_count() == + node_count * impl_const_t::node_imp_prop_count_); + + const Index* node_list_ptr = node_list.get_data(); + const Float* local_sum_hist_ptr = local_sum_hist.get_data(); + const Float* local_sum2cent_hist_ptr = local_sum2cent_hist.get_data(); + Float* imp_list_ptr = imp_data_list.imp_list_.get_mutable_data(); + + const sycl::range<1> range{ de::integral_cast(node_count) }; + + auto last_event = queue_.submit([&](sycl::handler& cgh) { + cgh.depends_on(deps); + cgh.parallel_for(range, [=](sycl::id<1> node_idx) { + // set mean + imp_list_ptr[node_idx * impl_const_t::node_imp_prop_count_ + 0] = + local_sum_hist_ptr[node_idx] / + node_list_ptr[node_idx * impl_const_t::node_prop_count_ + impl_const_t::ind_grc]; + // set sum2cent + imp_list_ptr[node_idx * impl_const_t::node_imp_prop_count_ + 1] = + local_sum2cent_hist_ptr[node_idx] - + (local_sum_hist_ptr[node_idx] * local_sum_hist_ptr[node_idx]) / + node_list_ptr[node_idx * impl_const_t::node_prop_count_ + + impl_const_t::ind_grc]; + }); + }); + + return last_event; +} + template sycl::event train_kernel_hist_impl::compute_initial_sum2cent_local( const train_context_t& ctx, @@ -1135,8 +1280,8 @@ sycl::event train_kernel_hist_impl::compute_initial_his sycl::event last_event; - if (ctx.distr_mode_) { - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { + if (ctx.distr_mode_) { last_event = compute_initial_histogram_local(ctx, response, tree_order, @@ -1156,49 +1301,44 @@ sycl::event train_kernel_hist_impl::compute_initial_his { last_event }); } else { - auto sum_list = pr::ndarray::empty(queue_, { node_count }); - auto sum2cent_list = pr::ndarray::empty(queue_, { node_count }); - last_event = compute_initial_sum_local(ctx, - response, - tree_order, - node_list, - sum_list, - node_count, - deps); - { - ONEDAL_PROFILER_TASK(sum_list, queue_); - comm_.allreduce(sum_list.flatten(queue_, { last_event })).wait(); - } - last_event = compute_initial_sum2cent_local(ctx, - response, - tree_order, - node_list, - sum_list, - sum2cent_list, - node_count, - { last_event }); - { - ONEDAL_PROFILER_TASK(allreduce_sum2cent_list, queue_); - comm_.allreduce(sum2cent_list.flatten(queue_, { last_event })).wait(); - } - last_event = fin_initial_imp(ctx, - node_list, - sum_list, - sum2cent_list, - imp_data_list, - node_count, - { last_event }); + last_event = compute_initial_histogram_local(ctx, + response, + tree_order, + node_list, + imp_data_list, + node_count, + deps); last_event.wait_and_throw(); } } else { - last_event = compute_initial_histogram_local(ctx, - response, - tree_order, - node_list, - imp_data_list, - node_count, - deps); + auto local_sum_hist = pr::ndarray::empty(queue_, { node_count }); + auto local_sum2cent_hist = pr::ndarray::empty(queue_, { node_count }); + + last_event = compute_local_sum_histogram(ctx, + response, + tree_order, + node_list, + local_sum_hist, + local_sum2cent_hist, + node_count, + deps); + { + ONEDAL_PROFILER_TASK(allreduce_sum_hist, queue_); + comm_.allreduce(local_sum_hist.flatten(queue_, { last_event })).wait(); + } + { + ONEDAL_PROFILER_TASK(allreduce_sum2cent_hist, queue_); + comm_.allreduce(local_sum2cent_hist.flatten(queue_, { last_event })).wait(); + } + + last_event = compute_initial_imp_for_node_list_regression(ctx, + node_list, + local_sum_hist, + local_sum2cent_hist, + imp_data_list, + node_count, + { last_event }); last_event.wait_and_throw(); } @@ -1409,8 +1549,6 @@ sycl::event train_kernel_hist_impl::do_node_split( const Index* node_list_ptr = node_list.get_data(); const Index* node_vs_tree_map_list_ptr = node_vs_tree_map_list.get_data(); - const bool distr_mode = ctx.distr_mode_; - Index* node_list_new_ptr = node_list_new.get_mutable_data(); Index* node_vs_tree_map_list_new_ptr = node_vs_tree_map_list_new.get_mutable_data(); @@ -1449,7 +1587,7 @@ sycl::event train_kernel_hist_impl::do_node_split( Index* node_rch = node_list_new_ptr + (new_left_node_pos + 1) * node_prop_count; node_lch[impl_const_t::ind_ofs] = node_prn[impl_const_t::ind_ofs]; - node_lch[impl_const_t::ind_lrc] = distr_mode + node_lch[impl_const_t::ind_lrc] = ctx.distr_mode_ ? node_prn[impl_const_t::ind_lch_lrc] : node_prn[impl_const_t::ind_lch_grc]; node_lch[impl_const_t::ind_grc] = node_prn[impl_const_t::ind_lch_grc]; @@ -1858,10 +1996,11 @@ train_result train_kernel_hist_impl::operator()( de::check_mul_overflow((ctx.tree_count_ - 1), skip_num); - pr::engine_collection collection(ctx.tree_count_, desc.get_seed()); - rng_engine_list_t engine_arr = collection([&](std::size_t i, std::size_t& skip) { - skip = i * skip_num; - }); + pr::engine_collection collection(queue_, + ctx.tree_count_, + desc.get_seed()); + + rng_engine_list_t engine_arr = collection.get_engines(); pr::ndarray node_imp_decrease_list; @@ -1885,46 +2024,53 @@ train_result train_kernel_hist_impl::operator()( de::check_mul_overflow(node_count, impl_const_t::node_prop_count_); de::check_mul_overflow(node_count, impl_const_t::node_imp_prop_count_); - auto node_vs_tree_map_list_host = pr::ndarray::empty({ node_count }); - auto level_node_list_init_host = - pr::ndarray::empty({ node_count * impl_const_t::node_prop_count_ }); - - auto tree_map = node_vs_tree_map_list_host.get_mutable_data(); - auto node_list_ptr = level_node_list_init_host.get_mutable_data(); - - for (Index node = 0; node < node_count; ++node) { - Index* node_ptr = node_list_ptr + node * impl_const_t::node_prop_count_; - tree_map[node] = iter + node; - node_ptr[impl_const_t::ind_ofs] = - ctx.selected_row_total_count_ * node; // local row offset - node_ptr[impl_const_t::ind_lrc] = - ctx.distr_mode_ - ? 0 - : ctx.selected_row_count_; // for distr_mode it will be updated during gen_initial_tree_order - node_ptr[impl_const_t::ind_grc] = - ctx.selected_row_total_count_; // global selected rows - it is already filtered for current block - node_ptr[impl_const_t::ind_lch_lrc] = - 0; // for distr_mode it will be updated during tree_order_gen - node_ptr[impl_const_t::ind_fid] = impl_const_t::bad_val_; - } + auto node_vs_tree_map_list = + pr::ndarray::empty(queue_, { node_count }, alloc::device); + auto level_node_list_init = + pr::ndarray::empty(queue_, + { node_count * impl_const_t::node_prop_count_ }, + alloc::device); + + auto tree_map = node_vs_tree_map_list.get_mutable_data(); + auto node_list_ptr = level_node_list_init.get_mutable_data(); + + auto fill_event = queue_.submit([&](sycl::handler& cgh) { + cgh.depends_on({ last_event }); + cgh.parallel_for(sycl::range<1>{ std::size_t(node_count) }, [=](sycl::id<1> node) { + Index* node_ptr = node_list_ptr + node * impl_const_t::node_prop_count_; + tree_map[node] = iter + node; + node_ptr[impl_const_t::ind_ofs] = + ctx.selected_row_total_count_ * node; // local row offset + node_ptr[impl_const_t::ind_lrc] = + ctx.distr_mode_ + ? 0 + : ctx.selected_row_count_; // for distr_mode it will be updated during gen_initial_tree_order + node_ptr[impl_const_t::ind_grc] = + ctx.selected_row_total_count_; // global selected rows - it is already filtered for current block + node_ptr[impl_const_t::ind_lch_lrc] = + 0; // for distr_mode it will be updated during tree_order_gen + node_ptr[impl_const_t::ind_fid] = impl_const_t::bad_val_; + }); + }); - last_event = gen_initial_tree_order(ctx, - engine_arr, - level_node_list_init_host, - tree_order_lev_, - iter, - node_count); - - auto node_vs_tree_map_list = node_vs_tree_map_list_host.to_device(queue_); - level_node_lists.push_back(level_node_list_init_host.to_device(queue_)); - - last_event = compute_initial_histogram(ctx, - response_nd_, - tree_order_lev_, - level_node_lists[0], - imp_data_holder.get_mutable_data(0), - node_count, - { last_event }); + auto gen_initial_tree_order_event = gen_initial_tree_order(ctx, + engine_arr, + level_node_list_init, + tree_order_lev_, + iter, + node_count, + { fill_event }); + + level_node_lists.push_back(level_node_list_init); + + auto compute_initial_histogram_event = + compute_initial_histogram(ctx, + response_nd_, + tree_order_lev_, + level_node_lists[0], + imp_data_holder.get_mutable_data(0), + node_count, + { gen_initial_tree_order_event }); last_event.wait_and_throw(); if (ctx.oob_required_) { diff --git a/cpp/oneapi/dal/algo/decision_forest/test/spmd.cpp b/cpp/oneapi/dal/algo/decision_forest/test/spmd.cpp index 534acddb04a..69e9098a826 100644 --- a/cpp/oneapi/dal/algo/decision_forest/test/spmd.cpp +++ b/cpp/oneapi/dal/algo/decision_forest/test/spmd.cpp @@ -400,23 +400,23 @@ DF_SPMD_CLS_TEST("df cls base check with default params") { this->infer_base_checks(desc, data_test, this->get_homogen_table_id(), model, checker_list); } -DF_SPMD_CLS_TEST("df cls base check with default params and train weights") { - SKIP_IF(this->get_policy().is_cpu()); - SKIP_IF(this->not_available_on_device()); - SKIP_IF(this->not_float64_friendly()); - const auto [data, data_test, class_count, checker_list] = - this->get_cls_dataframe_weighted_base(); - - auto desc = this->get_default_descriptor(); - - desc.set_class_count(class_count); - - this->set_rank_count(2); - const auto train_result = - this->train_spmd_weighted_base_checks(desc, data, this->get_homogen_table_id()); - const auto model = train_result.get_model(); - this->infer_base_checks(desc, data_test, this->get_homogen_table_id(), model, checker_list); -} +// DF_SPMD_CLS_TEST("df cls base check with default params and train weights") { +// SKIP_IF(this->get_policy().is_cpu()); +// SKIP_IF(this->not_available_on_device()); +// SKIP_IF(this->not_float64_friendly()); +// const auto [data, data_test, class_count, checker_list] = +// this->get_cls_dataframe_weighted_base(); + +// auto desc = this->get_default_descriptor(); + +// desc.set_class_count(class_count); + +// this->set_rank_count(2); +// const auto train_result = +// this->train_spmd_weighted_base_checks(desc, data, this->get_homogen_table_id()); +// const auto model = train_result.get_model(); +// this->infer_base_checks(desc, data_test, this->get_homogen_table_id(), model, checker_list); +// } DF_SPMD_CLS_TEST("df cls base check with non default params") { SKIP_IF(this->get_policy().is_cpu()); diff --git a/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp b/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp index d21de8c9627..b0992990912 100644 --- a/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp +++ b/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp @@ -17,7 +17,7 @@ #pragma once #include "oneapi/dal/backend/memory.hpp" -#include "oneapi/dal/backend/primitives/rng/rng_engine.hpp" +#include "oneapi/dal/backend/primitives/rng/rng.hpp" namespace oneapi::dal::preview::louvain::backend { using namespace oneapi::dal::preview::detail; diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng.hpp new file mode 100644 index 00000000000..a8b1c1f6ca5 --- /dev/null +++ b/cpp/oneapi/dal/backend/primitives/rng/rng.hpp @@ -0,0 +1,189 @@ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#pragma once + +#include "oneapi/dal/backend/primitives/ndarray.hpp" + +#include +#include +#include +#include "oneapi/dal/backend/primitives/rng/utils.hpp" +#include "oneapi/dal/table/common.hpp" +#include "oneapi/dal/backend/primitives/ndarray.hpp" +#include +namespace mkl = oneapi::mkl; +namespace oneapi::dal::backend::primitives { + +#ifdef ONEDAL_DATA_PARALLEL + +enum class engine_list { mt2203, mcg59, mt19937 }; + +template +struct oneapi_engine_type; + +template <> +struct oneapi_engine_type { + using type = oneapi::mkl::rng::mt2203; +}; + +template <> +struct oneapi_engine_type { + using type = oneapi::mkl::rng::mcg59; +}; + +template <> +struct oneapi_engine_type { + using type = oneapi::mkl::rng::mt19937; +}; + +template +class engine { +public: + using oneapi_engine_t = typename oneapi_engine_type::type; + + explicit engine(sycl::queue& queue, std::int64_t seed = 777) + : q(queue), + daal_engine_(initialize_daal_engine(seed)), + oneapi_engine_(initialize_oneapi_engine(queue, seed)), + impl_(dynamic_cast( + daal_engine_.get())) { + if (!impl_) { + throw std::domain_error("RNG engine is not supported"); + } + } + + virtual ~engine() = default; + + void* get_state() const { + return impl_->getState(); + } + + auto& get_daal_engine() { + return daal_engine_; + } + + auto& get_oneapi_state() { + return oneapi_engine_; + } + + void skip_ahead_cpu(size_t nSkip) { + daal_engine_->skipAhead(nSkip); + } + + void skip_ahead_gpu(size_t nSkip) { + if constexpr (EngineType == engine_list::mt2203) { + } + else { + skip_ahead(oneapi_engine_, nSkip); + } + } + +private: + daal::algorithms::engines::EnginePtr initialize_daal_engine(std::int64_t seed) { + switch (EngineType) { + case engine_list::mt2203: + return daal::algorithms::engines::mt2203::Batch<>::create(seed); + case engine_list::mcg59: return daal::algorithms::engines::mcg59::Batch<>::create(seed); + case engine_list::mt19937: + return daal::algorithms::engines::mt19937::Batch<>::create(seed); + default: throw std::invalid_argument("Unsupported engine type"); + } + } + + oneapi_engine_t initialize_oneapi_engine(sycl::queue& queue, std::int64_t seed) { + if constexpr (EngineType == engine_list::mt2203) { + return oneapi_engine_t(queue, + seed, + 0); // its necessary for aligning cpu and gpu results + } + else { + return oneapi_engine_t(queue, seed); + } + } + sycl::queue q; + daal::algorithms::engines::EnginePtr daal_engine_; + oneapi_engine_t oneapi_engine_; + daal::algorithms::engines::internal::BatchBaseImpl* impl_; +}; + +template +class rng { +public: + rng() = default; + ~rng() = default; + + template + void uniform(sycl::queue& queue, + Size count, + Type* dst, + engine& engine_, + Type a, + Type b, + bool distr_mode = false, + const event_vector& deps = {}); + + template + void uniform_gpu_internal(sycl::queue& queue, + Size count, + Type* dst, + engine& engine_, + Type a, + Type b, + const event_vector& deps = {}); + // template + // void uniform_without_replacement(sycl::queue& queue, + // Size count, + // Type* dst, + // std::uint8_t* state, + // Type a, + // Type b, + // const event_vector& deps = {}); + + template + void uniform_cpu(Size count, Type* dst, engine& engine_, Type a, Type b) { + void* state = engine_.get_state(); + engine_.skip_ahead_gpu(count); + uniform_dispatcher::uniform_by_cpu(count, dst, state, a, b); + } + + void uniform_without_replacement(Size count, + Type* dst, + Type* buffer, + void* state, + Type a, + Type b) { + uniform_dispatcher::uniform_without_replacement_by_cpu(count, + dst, + buffer, + state, + a, + b); + } + + template >> + void shuffle(Size count, Type* dst, void* state) { + Type idx[2]; + + for (Size i = 0; i < count; ++i) { + uniform_dispatcher::uniform_by_cpu(2, idx, state, 0, count); + std::swap(dst[idx[0]], dst[idx[1]]); + } + } +}; + +#endif +} // namespace oneapi::dal::backend::primitives diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp new file mode 100644 index 00000000000..74363680394 --- /dev/null +++ b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp @@ -0,0 +1,189 @@ +/******************************************************************************* +* Copyright 2022 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include +#include "oneapi/dal/backend/primitives/rng/rng.hpp" +#include "oneapi/dal/backend/primitives/ndarray.hpp" + +namespace oneapi::dal::backend::primitives { + +namespace bk = oneapi::dal::backend; + +template +template +void rng::uniform_gpu_internal(sycl::queue& queue, + Size count, + Type* dst, + engine& engine_, + Type a, + Type b, + const event_vector& deps) { + auto local_engine = engine_.get_oneapi_state(); + oneapi::mkl::rng::uniform distr(a, b); + auto event = oneapi::mkl::rng::generate(distr, local_engine, count, dst, { deps }); + event.wait_and_throw(); + engine_.skip_ahead_cpu(count); + engine_.skip_ahead_gpu(count); +} + +template +template +void rng::uniform(sycl::queue& queue, + Size count, + Type* dst, + engine& engine_, + Type a, + Type b, + bool distr_mode /* = false */, + const event_vector& deps) { + if (count > 5000) { + uniform_gpu_internal(queue, count, dst, engine_, a, b); + } + else { + uniform_cpu(count, dst, engine_, a, b); + } +} + +// template +// void rng::uniform_without_replacement(sycl::queue& queue, +// Size count, +// Type* dst, +// std::uint8_t* state, +// Type a, +// Type b, +// const event_vector& deps) { +// auto engine = oneapi::mkl::rng::load_state(queue, state); + +// oneapi::mkl::rng::uniform distr; +// auto local_buf = +// ndarray::empty(queue, { std::int64_t(b) }, sycl::usm::alloc::device); +// auto local_buf_ptr = local_buf.get_mutable_data(); + +// auto random_buf = ndarray::empty(queue, { count }, sycl::usm::alloc::device); +// auto random_buf_ptr = random_buf.get_mutable_data(); + +// auto fill_event = queue.submit([&](sycl::handler& cgh) { +// cgh.depends_on(deps); +// cgh.parallel_for(sycl::range<1>{ std::size_t(b) }, [=](sycl::id<1> idx) { +// local_buf_ptr[idx] = idx; +// }); +// }); +// fill_event.wait_and_throw(); + +// auto event = oneapi::mkl::rng::generate(distr, engine, count, random_buf_ptr); +// event.wait_and_throw(); + +// queue +// .submit([&](sycl::handler& h) { +// h.parallel_for(sycl::range<1>{ std::size_t(1) }, [=](sycl::id<1> idx) { +// for (std::int64_t i = 0; i < count; ++i) { +// auto j = i + (size_t)(random_buf_ptr[i] * (float)(b - i)); +// auto tmp = local_buf_ptr[i]; +// local_buf_ptr[i] = local_buf_ptr[j]; +// local_buf_ptr[j] = tmp; +// } +// for (std::int64_t i = 0; i < count; ++i) { +// dst[i] = local_buf_ptr[i]; +// } +// }); +// }) +// .wait_and_throw(); +// mkl::rng::save_state(engine, state); +// } + +#define INSTANTIATE(F, Size, EngineType) \ + template ONEDAL_EXPORT void rng::uniform(sycl::queue& queue, \ + Size count_, \ + F* dst, \ + engine& engine_, \ + F a, \ + F b, \ + bool dist, \ + const event_vector& deps); + +#define INSTANTIATE_FLOAT(Size) \ + INSTANTIATE(float, Size, engine_list::mt2203) \ + INSTANTIATE(float, Size, engine_list::mcg59) \ + INSTANTIATE(float, Size, engine_list::mt19937) \ + INSTANTIATE(double, Size, engine_list::mt2203) \ + INSTANTIATE(double, Size, engine_list::mcg59) \ + INSTANTIATE(double, Size, engine_list::mt19937) \ + INSTANTIATE(int, Size, engine_list::mt2203) \ + INSTANTIATE(int, Size, engine_list::mcg59) \ + INSTANTIATE(int, Size, engine_list::mt19937) + +INSTANTIATE_FLOAT(std::int64_t); +INSTANTIATE_FLOAT(std::int32_t); + +#define INSTANTIATE_(F, Size, EngineType) \ + template ONEDAL_EXPORT void rng::uniform_gpu_internal(sycl::queue& queue, \ + Size count_, \ + F* dst, \ + engine& engine_, \ + F a, \ + F b, \ + const event_vector& deps); + +#define INSTANTIATE_FLOAT_(Size) \ + INSTANTIATE_(float, Size, engine_list::mt2203) \ + INSTANTIATE_(float, Size, engine_list::mcg59) \ + INSTANTIATE_(float, Size, engine_list::mt19937) \ + INSTANTIATE_(double, Size, engine_list::mt2203) \ + INSTANTIATE_(double, Size, engine_list::mcg59) \ + INSTANTIATE_(double, Size, engine_list::mt19937) \ + INSTANTIATE_(int, Size, engine_list::mt2203) \ + INSTANTIATE_(int, Size, engine_list::mcg59) \ + INSTANTIATE_(int, Size, engine_list::mt19937) + +INSTANTIATE_FLOAT_(std::int64_t); +INSTANTIATE_FLOAT_(std::int32_t); + +// #define INSTANTIATE_WO_REPLACEMENT(F, Size) \ +// template ONEDAL_EXPORT void rng::uniform_without_replacement( \ +// sycl::queue& queue, \ +// Size count_, \ +// F* dst, \ +// std::uint8_t* state, \ +// F a, \ +// F b, \ +// const event_vector& deps); + +// #define INSTANTIATE_WO_REPLACEMENT_FLOAT(Size) \ +// INSTANTIATE_WO_REPLACEMENT(float, Size) \ +// INSTANTIATE_WO_REPLACEMENT(double, Size) \ +// INSTANTIATE_WO_REPLACEMENT(int, Size) + +// INSTANTIATE_WO_REPLACEMENT_FLOAT(std::int64_t); +// INSTANTIATE_WO_REPLACEMENT_FLOAT(std::int32_t); + +// #define INSTANTIATE_WO_REPLACEMENT_MT2203(F, Size) \ +// template ONEDAL_EXPORT void rng::uniform_mt2203(sycl::queue& queue, \ +// Size count_, \ +// F* dst, \ +// std::int64_t state, \ +// F a, \ +// F b, \ +// const event_vector& deps); + +// #define INSTANTIATE_WO_REPLACEMENT_MT2203_FLOAT(Size) \ +// INSTANTIATE_WO_REPLACEMENT_MT2203(float, Size) \ +// INSTANTIATE_WO_REPLACEMENT_MT2203(double, Size) \ +// INSTANTIATE_WO_REPLACEMENT_MT2203(int, Size) + +// INSTANTIATE_WO_REPLACEMENT_MT2203_FLOAT(std::int64_t); +// INSTANTIATE_WO_REPLACEMENT_MT2203_FLOAT(std::int32_t); + +} // namespace oneapi::dal::backend::primitives diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_engine.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_engine.hpp deleted file mode 100644 index c8ca3b13ce9..00000000000 --- a/cpp/oneapi/dal/backend/primitives/rng/rng_engine.hpp +++ /dev/null @@ -1,101 +0,0 @@ -/******************************************************************************* -* Copyright 2021 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#pragma once - -#include - -#include "oneapi/dal/backend/primitives/rng/utils.hpp" - -namespace oneapi::dal::backend::primitives { - -template -class rng { -public: - rng() = default; - ~rng() = default; - - void uniform(Size count, Type* dst, void* state, Type a, Type b) { - uniform_dispatcher::uniform_by_cpu(count, dst, state, a, b); - } - - void uniform_without_replacement(Size count, - Type* dst, - Type* buffer, - void* state, - Type a, - Type b) { - uniform_dispatcher::uniform_without_replacement_by_cpu(count, - dst, - buffer, - state, - a, - b); - } - - template >> - void shuffle(Size count, Type* dst, void* state) { - Type idx[2]; - - for (Size i = 0; i < count; ++i) { - uniform_dispatcher::uniform_by_cpu(2, idx, state, 0, count); - std::swap(dst[idx[0]], dst[idx[1]]); - } - } - -private: - daal::internal::RNGsInst daal_rng_; -}; - -class engine { -public: - explicit engine(std::int64_t seed = 777) - : engine_(daal::algorithms::engines::mt2203::Batch<>::create(seed)) { - impl_ = dynamic_cast(engine_.get()); - if (!impl_) { - throw domain_error(dal::detail::error_messages::rng_engine_is_not_supported()); - } - } - - explicit engine(const daal::algorithms::engines::EnginePtr& eng) : engine_(eng) { - impl_ = dynamic_cast(eng.get()); - if (!impl_) { - throw domain_error(dal::detail::error_messages::rng_engine_is_not_supported()); - } - } - - virtual ~engine() = default; - - engine& operator=(const daal::algorithms::engines::EnginePtr& eng) { - engine_ = eng; - impl_ = dynamic_cast(eng.get()); - if (!impl_) { - throw domain_error(dal::detail::error_messages::rng_engine_is_not_supported()); - } - - return *this; - } - - void* get_state() const { - return impl_->getState(); - } - -private: - daal::algorithms::engines::EnginePtr engine_; - daal::algorithms::engines::internal::BatchBaseImpl* impl_; -}; - -} // namespace oneapi::dal::backend::primitives diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp index 09a5a589141..9aff7ab3bc6 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp +++ b/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp @@ -16,78 +16,41 @@ #pragma once -#include "oneapi/dal/backend/primitives/rng/rng_engine.hpp" - +#include "oneapi/dal/backend/primitives/rng/rng.hpp" +#include "oneapi/dal/backend/primitives/ndarray.hpp" #include +#include +#include +#include +#include "oneapi/dal/backend/primitives/rng/utils.hpp" +#include "oneapi/dal/table/common.hpp" + namespace oneapi::dal::backend::primitives { -template +#ifdef ONEDAL_DATA_PARALLEL + +template class engine_collection { public: - explicit engine_collection(Size count, std::int64_t seed = 777) + engine_collection(sycl::queue& queue, Size count, std::int64_t seed = 777) : count_(count), - engine_(daal::algorithms::engines::mt2203::Batch<>::create(seed)), - params_(count), - technique_(daal::algorithms::engines::internal::family), - daal_engine_list_(count) {} - - template - std::vector operator()(Op&& op) { - daal::services::Status status; - for (Size i = 0; i < count_; ++i) { - op(i, params_.nSkip[i]); - } - select_parallelization_technique(technique_); - daal::algorithms::engines::internal::EnginesCollection engine_collection( - engine_, - technique_, - params_, - daal_engine_list_, - &status); - if (!status) { - dal::backend::interop::status_to_exception(status); - } - - std::vector engine_list(count_); + seed_(seed) { + engines_.reserve(count_); for (Size i = 0; i < count_; ++i) { - engine_list[i] = daal_engine_list_[i]; + engines_.push_back(engine(queue, seed_)); } - - //copy elision - return engine_list; } -private: - void select_parallelization_technique( - daal::algorithms::engines::internal::ParallelizationTechnique& technique) { - auto daal_engine_impl = - dynamic_cast(engine_.get()); - - daal::algorithms::engines::internal::ParallelizationTechnique techniques[] = { - daal::algorithms::engines::internal::family, - daal::algorithms::engines::internal::leapfrog, - daal::algorithms::engines::internal::skipahead - }; - - for (auto& techn : techniques) { - if (daal_engine_impl->hasSupport(techn)) { - technique = techn; - return; - } - } - - throw domain_error( - dal::detail::error_messages::rng_engine_does_not_support_parallelization_techniques()); + std::vector> get_engines() const { + return engines_; } private: Size count_; - daal::algorithms::engines::EnginePtr engine_; - daal::algorithms::engines::internal::Params params_; - daal::algorithms::engines::internal::ParallelizationTechnique technique_; - daal::services::internal::TArray - daal_engine_list_; + std::int64_t seed_; + std::vector> engines_; }; +#endif } // namespace oneapi::dal::backend::primitives diff --git a/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp new file mode 100644 index 00000000000..1ba5e9fc365 --- /dev/null +++ b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp @@ -0,0 +1,291 @@ +/******************************************************************************* +* Copyright 2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "oneapi/dal/test/engine/common.hpp" +#include "oneapi/dal/test/engine/fixtures.hpp" +#include "oneapi/dal/test/engine/dataframe.hpp" + +#include "oneapi/dal/backend/primitives/rng/rng.hpp" +#include "oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp" +namespace oneapi::dal::backend::primitives::test { + +namespace te = dal::test::engine; + +class mt2203 {}; +class mcg59 {}; +class mt19937 {}; + +template +struct engine_map {}; + +template <> +struct engine_map { + constexpr static auto value = engine_list::mt2203; +}; + +template <> +struct engine_map { + constexpr static auto value = engine_list::mcg59; +}; + +template <> +struct engine_map { + constexpr static auto value = engine_list::mt19937; +}; + +template +constexpr auto engine_v = engine_map::value; + +template +class rng_test : public te::policy_fixture { +public: + using Index = std::tuple_element_t<0, TestType>; + using EngineType = std::tuple_element_t<1, TestType>; + static constexpr auto engine_qq = engine_v; + + auto get_rng() const { + rng rn_gen; + return rn_gen; + } + + auto get_engine(std::int64_t seed) { + auto rng_engine = engine(this->get_queue(), seed); + return rng_engine; + } + + auto allocate_arrays(std::int64_t elem_count) { + auto& q = this->get_queue(); + auto val_gpu = ndarray::empty({ elem_count }); + auto val_host = ndarray::empty(q, { elem_count }, sycl::usm::alloc::device); + + return std::make_tuple(val_gpu, val_host); + } + + auto allocate_arrays_shared(std::int64_t elem_count) { + auto& q = this->get_queue(); + auto val_gpu = ndarray::empty(q, { elem_count }, sycl::usm::alloc::shared); + auto val_host = ndarray::empty(q, { elem_count }, sycl::usm::alloc::shared); + + return std::make_tuple(val_gpu, val_host); + } + + auto allocate_arrays_device(std::int64_t elem_count) { + auto& q = this->get_queue(); + auto val_gpu_1 = ndarray::empty(q, { elem_count }, sycl::usm::alloc::device); + auto val_gpu_2 = ndarray::empty(q, { elem_count }, sycl::usm::alloc::device); + + return std::make_tuple(val_gpu_1, val_gpu_2); + } + + auto allocate_arrays_host(std::int64_t elem_count) { + auto val_host_1 = ndarray::empty({ elem_count }); + auto val_host_2 = ndarray::empty({ elem_count }); + + return std::make_tuple(val_host_1, val_host_2); + } + + void check_results_host(const ndarray& val_host_1, + const ndarray& val_host_2) { + const Index* val_host_1_ptr = val_host_1.get_data(); + + const Index* val_host_2_ptr = val_host_2.get_data(); + + for (std::int64_t el = 0; el < val_host_1.get_count(); el++) { + REQUIRE(val_host_1_ptr[el] == val_host_2_ptr[el]); + } + } + + void check_results_device(const ndarray& val_gpu_1, + const ndarray& val_gpu_2) { + const auto val_gpu_host_1 = val_gpu_1.to_host(this->get_queue()); + const Index* val_gpu_host_1_ptr = val_gpu_host_1.get_data(); + + const auto val_gpu_host_2 = val_gpu_2.to_host(this->get_queue()); + const Index* val_gpu_host_2_ptr = val_gpu_host_2.get_data(); + + for (std::int64_t el = 0; el < val_gpu_2.get_count(); el++) { + REQUIRE(val_gpu_host_2_ptr[el] == val_gpu_host_1_ptr[el]); + } + } + + void check_results(const ndarray& val_gpu, const ndarray& val_host) { + const Index* val_host_ptr = val_host.get_data(); + + const auto val_gpu_host = val_gpu.to_host(this->get_queue()); + const Index* val_gpu_host_ptr = val_gpu_host.get_data(); + + for (std::int64_t el = 0; el < val_host.get_count(); el++) { + REQUIRE(val_gpu_host_ptr[el] == val_host_ptr[el]); + } + } +}; + +// using rng_types = COMBINE_TYPES((float, double), (mt2203, mcg59, mt19937)); + +// TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types) { +// SKIP_IF(this->get_policy().is_cpu()); +// std::int64_t elem_count = GENERATE_COPY(10, 777, 10000, 50000); +// std::int64_t seed = GENERATE_COPY(777, 999); + +// auto [arr_gpu, arr_host] = this->allocate_arrays(elem_count); +// auto arr_gpu_ptr = arr_gpu.get_mutable_data(); +// auto arr_host_ptr = arr_host.get_mutable_data(); + +// auto rn_gen = this->get_rng(); +// auto rng_engine = this->get_engine(seed); +// auto rng_engine_ = this->get_engine(seed); + +// rn_gen.uniform_cpu(elem_count, arr_host_ptr, rng_engine, 0, elem_count); +// rn_gen.uniform_gpu_internal(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine_, 0, elem_count); + +// this->check_results(arr_gpu, arr_host); +// } + +// using rng_types_skip = COMBINE_TYPES((float), (mcg59)); + +// // TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types_skip) { +// // SKIP_IF(this->get_policy().is_cpu()); +// // std::int64_t elem_count = +// // GENERATE_COPY(10, 1000, 300000, 15000, 1000000, 100000000, 6100000000, 1LL * 64 * 1000000); +// // std::int64_t seed = GENERATE_COPY(777); + +// // auto [arr_gpu, arr_host] = this->allocate_arrays(elem_count); +// // auto arr_gpu_ptr = arr_gpu.get_mutable_data(); +// // auto arr_host_ptr = arr_host.get_mutable_data(); + +// // auto rn_gen = this->get_rng(); +// // auto rng_engine = this->get_engine(seed); +// // auto rng_engine_ = this->get_engine(seed); + +// // BENCHMARK("Uniform dispatcher HOST arr" + std::to_string(elem_count)) { +// // rn_gen.uniform(this->get_queue(), elem_count, arr_host_ptr, rng_engine, 0, elem_count); +// // }; +// // BENCHMARK("Uniform dispatcher GPU arr" + std::to_string(elem_count)) { +// // rn_gen.uniform(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine_, 0, elem_count); +// // }; + +// // auto [arr_gpu_, arr_host_] = this->allocate_arrays(elem_count); +// // auto arr_gpu_ptr_ = arr_gpu_.get_mutable_data(); +// // auto arr_host_ptr_ = arr_host_.get_mutable_data(); + +// // auto rn_gen_ = this->get_rng(); +// // auto rng_engine_1 = this->get_engine(seed); +// // auto rng_engine_2 = this->get_engine(seed); +// // BENCHMARK("Uniform GPU arr" + std::to_string(elem_count)) { +// // rn_gen_.uniform_gpu_internal(this->get_queue(), +// // elem_count, +// // arr_gpu_ptr_, +// // rng_engine_1, +// // 0, +// // elem_count); +// // }; + +// // BENCHMARK("Uniform HOST arr" + std::to_string(elem_count)) { +// // rn_gen_.uniform(elem_count, arr_host_ptr_, rng_engine_2, 0, elem_count); +// // }; +// // } + +// TEMPLATE_LIST_TEST_M(rng_test, "mixed rng cpu skip", "[rng]", rng_types_skip) { +// SKIP_IF(this->get_policy().is_cpu()); +// std::int64_t elem_count = GENERATE_COPY(10, 777, 10000, 100000); +// std::int64_t seed = GENERATE_COPY(777, 999); + +// auto [arr_host_init_1, arr_host_init_2] = this->allocate_arrays_host(elem_count); +// auto [arr_gpu, arr_host] = this->allocate_arrays(elem_count); +// auto arr_host_init_1_ptr = arr_host_init_1.get_mutable_data(); +// auto arr_host_init_2_ptr = arr_host_init_2.get_mutable_data(); +// auto arr_gpu_ptr = arr_gpu.get_mutable_data(); +// auto arr_host_ptr = arr_host.get_mutable_data(); + +// auto rn_gen = this->get_rng(); +// auto rng_engine = this->get_engine(seed); +// auto rng_engine_2 = this->get_engine(seed); + +// rn_gen.uniform_cpu(elem_count, arr_host_init_1_ptr, rng_engine, 0, elem_count); +// rn_gen.uniform_cpu(elem_count, arr_host_init_2_ptr, rng_engine_2, 0, elem_count); + +// rn_gen.uniform_gpu_internal(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine, 0, elem_count); +// rn_gen.uniform_cpu(elem_count, arr_host_ptr, rng_engine_2, 0, elem_count); + +// this->check_results_host(arr_host_init_1, arr_host_init_2); +// this->check_results(arr_gpu, arr_host); +// } + +// TEMPLATE_LIST_TEST_M(rng_test, "mixed rng gpu skip", "[rng]", rng_types_skip) { +// SKIP_IF(this->get_policy().is_cpu()); +// std::int64_t elem_count = GENERATE_COPY(10, 100, 777, 10000); +// std::int64_t seed = GENERATE_COPY(1, 777, 999); + +// auto [arr_device_init_1, arr_device_init_2] = this->allocate_arrays_device(elem_count); +// auto [arr_gpu, arr_host] = this->allocate_arrays(elem_count); +// auto arr_device_init_1_ptr = arr_device_init_1.get_mutable_data(); +// auto arr_device_init_2_ptr = arr_device_init_2.get_mutable_data(); +// auto arr_gpu_ptr = arr_gpu.get_mutable_data(); +// auto arr_host_ptr = arr_host.get_mutable_data(); + +// auto rn_gen = this->get_rng(); +// auto rng_engine = this->get_engine(seed); +// auto rng_engine_2 = this->get_engine(seed); + +// rn_gen.uniform_gpu_internal(this->get_queue(), elem_count, arr_device_init_1_ptr, rng_engine, 0, elem_count); +// rn_gen +// .uniform_gpu_internal(this->get_queue(), elem_count, arr_device_init_2_ptr, rng_engine_2, 0, elem_count); + +// rn_gen.uniform_gpu_internal(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine, 0, elem_count); +// rn_gen.uniform_cpu(elem_count, arr_host_ptr, rng_engine_2, 0, elem_count); + +// this->check_results_device(arr_device_init_1, arr_device_init_2); +// this->check_results(arr_gpu, arr_host); +// } + +// TEMPLATE_LIST_TEST_M(rng_test, "mixed rng gpu skip collection", "[rng]", rng_types_skip) { +// SKIP_IF(this->get_policy().is_cpu()); +// std::int64_t elem_count = GENERATE_COPY(10, 100, 777, 10000); +// std::int64_t seed = GENERATE_COPY(1, 777, 999); + +// engine_collection collection(this->get_queue(), 2, seed); + +// auto engine_arr = collection.get_engines(); + +// auto [arr_device_init_1, arr_device_init_2] = this->allocate_arrays_shared(elem_count); + +// auto arr_device_init_1_ptr = arr_device_init_1.get_mutable_data(); +// auto arr_device_init_2_ptr = arr_device_init_2.get_mutable_data(); + +// auto rn_gen = this->get_rng(); + +// rn_gen.uniform(this->get_queue(), +// elem_count, +// arr_device_init_1_ptr, +// engine_arr[0], +// 0, +// elem_count); + +// rn_gen.uniform(this->get_queue(), +// elem_count, +// arr_device_init_2_ptr, +// engine_arr[1], +// 0, +// elem_count); + +// // rn_gen.uniform(this->get_queue(), elem_count, arr_gpu_ptr, engine_arr[0], 0, elem_count); +// // rn_gen.uniform(elem_count, arr_host_ptr, engine_arr[1], 0, elem_count); + +// //this->check_results_device(arr_device_init_1, arr_device_init_2); +// this->check_results(arr_device_init_1, arr_device_init_2); +// } + +} // namespace oneapi::dal::backend::primitives::test diff --git a/dev/bazel/toolchains/cc_toolchain_lnx.bzl b/dev/bazel/toolchains/cc_toolchain_lnx.bzl index e9c5b631be6..67a9a4e781e 100644 --- a/dev/bazel/toolchains/cc_toolchain_lnx.bzl +++ b/dev/bazel/toolchains/cc_toolchain_lnx.bzl @@ -325,6 +325,12 @@ def configure_cc_toolchain_lnx(repo_ctx, reqs): "-Wl,-no-as-needed", "-no-as-needed", ) + + add_linker_option_if_supported( + repo_ctx, + tools.dpcc, + "-fsycl-max-parallel-link-jobs=40", + "-fsycl-max-parallel-link-jobs=40", + ) + add_linker_option_if_supported( repo_ctx, tools.dpcc, From 1365bb4f853942505d69bfa6436ad39000410ad7 Mon Sep 17 00:00:00 2001 From: Alexandr-Solovev Date: Tue, 1 Oct 2024 04:37:09 -0700 Subject: [PATCH 2/5] rng updates --- .../vertex_partitioning_default_kernel.hpp | 2 +- .../backend/gpu/infer_kernel_impl_dpc.cpp | 9 +- .../backend/gpu/train_feature_type_dpc.cpp | 45 +++++--- .../gpu/train_kernel_hist_impl_dpc.cpp | 6 +- .../backend/gpu/train_service_kernels_dpc.cpp | 20 ++-- .../backend/gpu/train_splitter_impl_dpc.cpp | 7 +- .../algo/louvain/backend/cpu/louvain_data.hpp | 2 +- .../objective_function/test/fixture.hpp | 2 +- .../optimizers/test/cg_solver_dpc.cpp | 2 +- .../primitives/optimizers/test/fixture.hpp | 2 +- .../optimizers/test/newton_cg_dpc.cpp | 2 +- .../dal/backend/primitives/rng/rng_cpu.hpp | 101 ++++++++++++++++++ 12 files changed, 164 insertions(+), 36 deletions(-) create mode 100644 cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp diff --git a/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp b/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp index 4da1866e277..39bef4e35cf 100644 --- a/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp +++ b/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp @@ -24,7 +24,7 @@ #include "oneapi/dal/backend/memory.hpp" #include "oneapi/dal/backend/interop/common.hpp" #include "oneapi/dal/table/homogen.hpp" -#include "oneapi/dal/backend/primitives/rng/rng_engine.hpp" +#include "oneapi/dal/backend/primitives/rng/rng_cpu.hpp" #include "oneapi/dal/detail/threading.hpp" namespace oneapi::dal::preview::connected_components::backend { diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/infer_kernel_impl_dpc.cpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/infer_kernel_impl_dpc.cpp index 19da49ffb74..c2ddd331905 100644 --- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/infer_kernel_impl_dpc.cpp +++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/infer_kernel_impl_dpc.cpp @@ -17,7 +17,7 @@ #include "oneapi/dal/detail/policy.hpp" #include "oneapi/dal/table/row_accessor.hpp" #include "oneapi/dal/detail/profiler.hpp" - +#include #include "oneapi/dal/algo/decision_forest/backend/gpu/infer_kernel_impl.hpp" namespace oneapi::dal::decision_forest::backend { @@ -44,9 +44,11 @@ void infer_kernel_impl::validate_input(const descriptor_t& d if (data.get_row_count() > de::limits::max()) { throw domain_error(dal::detail::error_messages::invalid_range_of_rows()); } + if (data.get_column_count() > de::limits::max()) { throw domain_error(dal::detail::error_messages::invalid_range_of_columns()); } + if (model.get_tree_count() > de::limits::max()) { throw domain_error(dal::detail::error_messages::invalid_number_of_trees()); } @@ -67,6 +69,7 @@ void infer_kernel_impl::init_params(infer_context_t& ctx, ctx.class_count = de::integral_cast(desc.get_class_count()); ctx.voting_mode = desc.get_voting_mode(); } + ctx.row_count = de::integral_cast(data.get_row_count()); ctx.column_count = de::integral_cast(data.get_column_count()); @@ -140,6 +143,7 @@ infer_kernel_impl::predict_by_tree_group_weighted( { local_size, 1 }); sycl::event last_event = zero_obs_response_event; + std::cout << "here parallel for 2" << std::endl; for (Index proc_tree_count = 0; proc_tree_count < tree_count; proc_tree_count += ctx.tree_in_group_count) { last_event = queue_.submit([&](sycl::handler& cgh) { @@ -245,6 +249,7 @@ infer_kernel_impl::predict_by_tree_group(const infer_context { local_size, 1 }); sycl::event last_event = zero_obs_response_event; + std::cout << "here parallel for 3" << std::endl; for (Index proc_tree_count = 0; proc_tree_count < tree_count; proc_tree_count += ctx.tree_in_group_count) { last_event = queue_.submit([&](sycl::handler& cgh) { @@ -347,6 +352,7 @@ infer_kernel_impl::reduce_tree_group_response( be::make_multiple_nd_range_1d({ ctx.max_group_count * local_size }, { local_size }); sycl::event last_event = zero_response_event; + std::cout << "here parallel for 4" << std::endl; last_event = queue_.submit([&](sycl::handler& cgh) { cgh.depends_on(deps); cgh.depends_on(last_event); @@ -431,6 +437,7 @@ infer_kernel_impl::determine_winner(const infer_context_t& c { ctx.max_local_size }); sycl::event last_event; + std::cout << "here loop 1" << std::endl; last_event = queue_.submit([&](sycl::handler& cgh) { cgh.depends_on(deps); cgh.parallel_for(nd_range, [=](sycl::nd_item<1> item) { diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_feature_type_dpc.cpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_feature_type_dpc.cpp index 7306533ed50..5485149c9b2 100644 --- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_feature_type_dpc.cpp +++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_feature_type_dpc.cpp @@ -20,7 +20,7 @@ #include "oneapi/dal/table/row_accessor.hpp" #include "oneapi/dal/backend/memory.hpp" #include "oneapi/dal/detail/profiler.hpp" - +#include #ifdef ONEDAL_DATA_PARALLEL namespace oneapi::dal::decision_forest::backend { @@ -29,6 +29,12 @@ namespace de = dal::detail; namespace bk = dal::backend; namespace pr = dal::backend::primitives; +template +std::int64_t propose_block_size(const sycl::queue& q, const std::int64_t r) { + constexpr std::int64_t fsize = sizeof(Float); + return 0x10000l * (8 / fsize); +} + template inline sycl::event sort_inplace(sycl::queue& queue_, pr::ndarray& src, @@ -56,18 +62,29 @@ sycl::event indexed_features::extract_column( Float* values = values_nd.get_mutable_data(); Index* indices = indices_nd.get_mutable_data(); auto column_count = column_count_; - - const sycl::range<1> range = de::integral_cast(row_count_); - - auto event = queue_.submit([&](sycl::handler& h) { - h.depends_on(deps); - h.parallel_for(range, [=](sycl::id<1> idx) { - values[idx] = data[idx * column_count + feature_id]; - indices[idx] = idx; + const auto block_size = propose_block_size(queue_, row_count_); + const bk::uniform_blocking blocking(row_count_, block_size); + + std::vector events(blocking.get_block_count()); + for (std::int64_t block_index = 0; block_index < blocking.get_block_count(); ++block_index) { + const auto first_row = blocking.get_block_start_index(block_index); + const auto last_row = blocking.get_block_end_index(block_index); + const auto curr_block = last_row - first_row; + ONEDAL_ASSERT(curr_block > 0); + + auto event = queue_.submit([&](sycl::handler& cgh) { + cgh.depends_on(deps); + cgh.parallel_for<>(de::integral_cast(curr_block), [=](sycl::id<1> idx) { + const std::int64_t row = idx + first_row; + + values[row] = data[row * column_count + feature_id]; + indices[row] = row; + }); }); - }); - return event; + events.push_back(event); + } + return bk::wait_or_pass(events); } template @@ -87,7 +104,7 @@ sycl::event indexed_features::collect_bin_borders( const Float* values = values_nd.get_data(); const Index* bin_offsets = bin_offsets_nd.get_data(); Float* bin_borders = bin_borders_nd.get_mutable_data(); - + std::cout << "here parallel for 10" << std::endl; auto event = queue_.submit([&](sycl::handler& cgh) { cgh.depends_on(deps); cgh.parallel_for(range, [=](sycl::id<1> idx) { @@ -124,7 +141,7 @@ sycl::event indexed_features::fill_bin_map( const Index* indices = indices_nd.get_data(); const Float* bin_borders = bin_borders_nd.get_data(); Bin* bins = bins_nd.get_mutable_data(); - + std::cout << "here parallel for 11" << std::endl; auto event = queue_.submit([&](sycl::handler& cgh) { cgh.depends_on(deps); cgh.parallel_for(nd_range, [=](sycl::nd_item<1> item) { @@ -359,7 +376,7 @@ sycl::event indexed_features::store_column( Bin* full_data = full_data_nd.get_mutable_data(); const sycl::range<1> range = de::integral_cast(column_data_nd.get_dimension(0)); - + std::cout << "here parallel for 12" << std::endl; auto event = queue_.submit([&](sycl::handler& h) { h.depends_on(deps); h.parallel_for(range, [=](sycl::id<1> idx) { diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp index a7294554a9f..a27217c9a54 100644 --- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp +++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp @@ -19,7 +19,7 @@ #include "oneapi/dal/table/row_accessor.hpp" #include "oneapi/dal/detail/profiler.hpp" #include "oneapi/dal/algo/decision_forest/backend/gpu/train_helpers.hpp" - +#include #ifdef ONEDAL_DATA_PARALLEL #include "oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl.hpp" @@ -385,6 +385,7 @@ sycl::event train_kernel_hist_impl::gen_initial_tree_or Index* const selected_row_ptr = ctx.distr_mode_ ? selected_row.get_mutable_data() : nullptr; Index* const node_list_ptr = node_list_host.get_mutable_data(); pr::rng rn_gen; + std::cout << "here parallel for 20" << std::endl; for (Index node_idx = 0; node_idx < node_count; ++node_idx) { Index* gen_row_idx_global_ptr = selected_row_global_ptr + ctx.selected_row_total_count_ * node_idx; @@ -450,7 +451,7 @@ sycl::event train_kernel_hist_impl::gen_initial_tree_or // in case of no bootstrap // it is valid case if this worker's rows set wasn't taken for tree build // i.e. row_count can be eq 0 - + std::cout << "here parallel for 21" << std::endl; Index* node_list_ptr = node_list_host.get_mutable_data(); auto set_event = queue_.submit([&](sycl::handler& cgh) { cgh.parallel_for(sycl::range<1>{ std::size_t(node_count) }, [=](sycl::id<1> idx) { @@ -514,6 +515,7 @@ train_kernel_hist_impl::gen_feature_list( } else { sycl::event fill_event; + std::cout << "here parallel for 22" << std::endl; for (Index node = 0; node < node_count; ++node) { auto selected_features_host_ptr = selected_features_com.get_mutable_data(); diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_service_kernels_dpc.cpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_service_kernels_dpc.cpp index 9130e22f8ca..638596cd404 100644 --- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_service_kernels_dpc.cpp +++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_service_kernels_dpc.cpp @@ -19,7 +19,7 @@ #include "oneapi/dal/table/row_accessor.hpp" #include "oneapi/dal/detail/profiler.hpp" #include "oneapi/dal/algo/decision_forest/backend/gpu/train_helpers.hpp" - +#include #ifdef ONEDAL_DATA_PARALLEL namespace oneapi::dal::decision_forest::backend { @@ -59,7 +59,7 @@ sycl::event train_service_kernels::initialize_tree_orde Index* tree_order_ptr = tree_order.get_mutable_data(); const sycl::range<2> range{ de::integral_cast(row_count), de::integral_cast(tree_count) }; - + std::cout << "here parallel for 30" << std::endl; auto event = queue_.submit([&](sycl::handler& cgh) { cgh.depends_on(deps); cgh.parallel_for(range, [=](sycl::id<2> id) { @@ -91,7 +91,7 @@ sycl::event train_service_kernels::get_split_node_count auto krn_local_size = preferable_sbg_size_; const sycl::nd_range<1> nd_range = bk::make_multiple_nd_range_1d(krn_local_size, krn_local_size); - + std::cout << "here parallel for 31" << std::endl; auto event = queue_.submit([&](sycl::handler& cgh) { cgh.depends_on(deps); cgh.parallel_for(nd_range, [=](sycl::nd_item<1> item) { @@ -155,7 +155,7 @@ train_service_kernels::calculate_left_child_row_count_o const sycl::nd_range<1> nd_range = bk::make_multiple_nd_range_1d(preferable_partition_groups_count_ * krn_local_size, krn_local_size); - + std::cout << "here parallel for 32" << std::endl; auto event = queue_.submit([&](sycl::handler& cgh) { cgh.depends_on(deps); cgh.parallel_for(nd_range, [=](sycl::nd_item<1> item) { @@ -282,7 +282,7 @@ sycl::event train_service_kernels::do_level_partition_b const sycl::nd_range<1> nd_range = bk::make_multiple_nd_range_1d(preferable_partition_groups_count_ * krn_local_size, krn_local_size); - + std::cout << "here parallel for 33" << std::endl; auto event = queue_.submit([&](sycl::handler& cgh) { cgh.depends_on(deps); cgh.parallel_for(nd_range, [=](sycl::nd_item<1> item) { @@ -417,7 +417,7 @@ sycl::event train_service_kernels::update_mdi_var_impor const Index leaf_mark = impl_const_t::leaf_mark_; const Index max_sub_groups_num = max_sbg_count_per_group_; //need to calculate it via device info - + std::cout << "here parallel for 35" << std::endl; auto event = queue_.submit([&](sycl::handler& cgh) { cgh.depends_on(deps); sycl::local_accessor buf(max_sub_groups_num, cgh); @@ -506,7 +506,7 @@ sycl::event train_service_kernels::mark_present_rows( const sycl::nd_range<1> nd_range = bk::make_multiple_nd_range_1d(krn_local_size * sbg_sum_count, krn_local_size); - + std::cout << "here parallel for 36" << std::endl; auto event = queue_.submit([&](sycl::handler& cgh) { cgh.depends_on(deps); cgh.parallel_for(nd_range, [=](sycl::nd_item<1> item) { @@ -558,7 +558,7 @@ sycl::event train_service_kernels::count_absent_rows_fo const sycl::nd_range<1> nd_range = bk::make_multiple_nd_range_1d(krn_local_size * sbg_sum_count, krn_local_size); - + std::cout << "here parallel for 37" << std::endl; auto event = queue_.submit([&](sycl::handler& cgh) { cgh.depends_on(deps); cgh.parallel_for(nd_range, [=](sycl::nd_item<1> item) { @@ -617,7 +617,7 @@ sycl::event train_service_kernels::count_absent_rows_to const sycl::nd_range<1> nd_range = bk::make_multiple_nd_range_1d(krn_local_size * sbg_sum_count, krn_local_size); - + std::cout << "here parallel for 38" << std::endl; auto event = queue_.submit([&](sycl::handler& cgh) { cgh.depends_on(deps); cgh.parallel_for(nd_range, [=](sycl::nd_item<1> item) { @@ -673,7 +673,7 @@ sycl::event train_service_kernels::fill_oob_rows_list_b const sycl::nd_range<1> nd_range = bk::make_multiple_nd_range_1d(krn_local_size * sbg_sum_count, krn_local_size); - + std::cout << "here parallel for 39.1" << std::endl; auto event = queue_.submit([&](sycl::handler& cgh) { cgh.depends_on(deps); cgh.parallel_for(nd_range, [=](sycl::nd_item<1> item) { diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_splitter_impl_dpc.cpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_splitter_impl_dpc.cpp index eeaafe2a179..50f8e75f7d7 100644 --- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_splitter_impl_dpc.cpp +++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_splitter_impl_dpc.cpp @@ -19,7 +19,7 @@ #include "oneapi/dal/table/row_accessor.hpp" #include "oneapi/dal/detail/profiler.hpp" #include "oneapi/dal/algo/decision_forest/backend/gpu/train_helpers.hpp" - +#include #ifdef ONEDAL_DATA_PARALLEL #include "oneapi/dal/algo/decision_forest/backend/gpu/train_splitter_impl.hpp" @@ -134,7 +134,7 @@ sycl::event train_splitter_impl::random_split( const auto nd_range = bk::make_multiple_nd_range_2d({ local_size, node_in_block_count }, { local_size, 1 }); - + std::cout << "here parallel for 60" << std::endl; sycl::event last_event = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(deps); local_accessor_rw_t local_hist_buf(hist_size, cgh); @@ -554,6 +554,7 @@ sycl::event train_splitter_impl::best_split( // Main kernel: // calculates histograms and impurity decrease based on histograms // and selects best split for each feature. + std::cout << "here parallel for 70" << std::endl; sycl::event last_event = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(deps); local_accessor_rw_t hist(bin_block * hist_prop_count, cgh); @@ -686,7 +687,7 @@ sycl::event train_splitter_impl::best_split( } }); }); - + std::cout << "here parallel for 71" << std::endl; // Merging kernel: selects best split among all features. const auto merge_range = bk::make_multiple_nd_range_2d({ node_count, local_size }, { 1, local_size }); diff --git a/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp b/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp index b0992990912..1be76f1cea2 100644 --- a/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp +++ b/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp @@ -17,7 +17,7 @@ #pragma once #include "oneapi/dal/backend/memory.hpp" -#include "oneapi/dal/backend/primitives/rng/rng.hpp" +#include "oneapi/dal/backend/primitives/rng/rng_cpu.hpp" namespace oneapi::dal::preview::louvain::backend { using namespace oneapi::dal::preview::detail; diff --git a/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp b/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp index fabe919b34e..21725b72441 100644 --- a/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp +++ b/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp @@ -25,7 +25,7 @@ #include "oneapi/dal/table/csr_accessor.hpp" #include "oneapi/dal/detail/debug.hpp" -#include "oneapi/dal/backend/primitives/rng/rng_engine.hpp" +#include "oneapi/dal/backend/primitives/rng/rng.hpp" namespace oneapi::dal::backend::primitives::test { diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp b/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp index ea320f690a2..dceaff5d52a 100644 --- a/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp @@ -20,7 +20,7 @@ #include "oneapi/dal/test/engine/common.hpp" #include "oneapi/dal/test/engine/fixtures.hpp" #include "oneapi/dal/table/row_accessor.hpp" -#include "oneapi/dal/backend/primitives/rng/rng_engine.hpp" +#include "oneapi/dal/backend/primitives/rng/rng.hpp" #include namespace oneapi::dal::backend::primitives::test { diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp b/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp index a6b87b2dcc1..45e7195cb28 100644 --- a/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp +++ b/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp @@ -21,7 +21,7 @@ #include "oneapi/dal/backend/primitives/ndarray.hpp" #include "oneapi/dal/test/engine/common.hpp" #include "oneapi/dal/test/engine/fixtures.hpp" -#include "oneapi/dal/backend/primitives/rng/rng_engine.hpp" +#include "oneapi/dal/backend/primitives/rng/rng.hpp" #include "oneapi/dal/backend/primitives/blas/gemv.hpp" #include "oneapi/dal/backend/primitives/element_wise.hpp" diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp b/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp index f473dddf1f7..0be54b9d6b3 100644 --- a/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp @@ -22,7 +22,7 @@ #include "oneapi/dal/test/engine/common.hpp" #include "oneapi/dal/test/engine/fixtures.hpp" #include "oneapi/dal/table/row_accessor.hpp" -#include "oneapi/dal/backend/primitives/rng/rng_engine.hpp" +#include "oneapi/dal/backend/primitives/rng/rng.hpp" #include #include "oneapi/dal/backend/primitives/objective_function.hpp" diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp new file mode 100644 index 00000000000..22eb6a950e4 --- /dev/null +++ b/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp @@ -0,0 +1,101 @@ +/******************************************************************************* +* Copyright 2021 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#pragma once + +#include + +#include "oneapi/dal/backend/primitives/rng/utils.hpp" + +namespace oneapi::dal::backend::primitives { + +template +class rng { +public: + rng() = default; + ~rng() = default; + + void uniform(Size count, Type* dst, void* state, Type a, Type b) { + uniform_dispatcher::uniform_by_cpu(count, dst, state, a, b); + } + + void uniform_without_replacement(Size count, + Type* dst, + Type* buffer, + void* state, + Type a, + Type b) { + uniform_dispatcher::uniform_without_replacement_by_cpu(count, + dst, + buffer, + state, + a, + b); + } + + template >> + void shuffle(Size count, Type* dst, void* state) { + Type idx[2]; + + for (Size i = 0; i < count; ++i) { + uniform_dispatcher::uniform_by_cpu(2, idx, state, 0, count); + std::swap(dst[idx[0]], dst[idx[1]]); + } + } + +private: + daal::internal::RNGsInst daal_rng_; +}; + +class engine { +public: + explicit engine(std::int64_t seed = 777) + : engine_(daal::algorithms::engines::mt2203::Batch<>::create(seed)) { + impl_ = dynamic_cast(engine_.get()); + if (!impl_) { + throw domain_error(dal::detail::error_messages::rng_engine_is_not_supported()); + } + } + + explicit engine(const daal::algorithms::engines::EnginePtr& eng) : engine_(eng) { + impl_ = dynamic_cast(eng.get()); + if (!impl_) { + throw domain_error(dal::detail::error_messages::rng_engine_is_not_supported()); + } + } + + virtual ~engine() = default; + + engine& operator=(const daal::algorithms::engines::EnginePtr& eng) { + engine_ = eng; + impl_ = dynamic_cast(eng.get()); + if (!impl_) { + throw domain_error(dal::detail::error_messages::rng_engine_is_not_supported()); + } + + return *this; + } + + void* get_state() const { + return impl_->getState(); + } + +private: + daal::algorithms::engines::EnginePtr engine_; + daal::algorithms::engines::internal::BatchBaseImpl* impl_; +}; + +} // namespace oneapi::dal::backend::primitives \ No newline at end of file From b0bef909a4bcefc078028c62581eee6b5e845f7c Mon Sep 17 00:00:00 2001 From: "Solovev, Aleksandr" Date: Mon, 7 Oct 2024 12:12:56 +0200 Subject: [PATCH 3/5] refactoring rng --- .../vertex_partitioning_default_kernel.hpp | 6 +- .../gpu/train_kernel_hist_impl_dpc.cpp | 10 +- .../vertex_partitioning_default_kernel.hpp | 2 +- .../objective_function/test/fixture.hpp | 6 +- .../objective_function/test/spmd_fixture.hpp | 6 +- .../optimizers/test/cg_solver_dpc.cpp | 6 +- .../primitives/optimizers/test/fixture.hpp | 8 +- .../optimizers/test/newton_cg_dpc.cpp | 12 +- cpp/oneapi/dal/backend/primitives/rng/rng.hpp | 207 +++++++++++------- .../dal/backend/primitives/rng/rng_cpu.hpp | 121 +++++----- .../dal/backend/primitives/rng/rng_dpc.cpp | 26 +-- .../backend/primitives/rng/test/rng_dpc.cpp | 12 +- 12 files changed, 250 insertions(+), 172 deletions(-) diff --git a/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp b/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp index 39bef4e35cf..7215b86df04 100644 --- a/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp +++ b/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp @@ -90,9 +90,9 @@ std::int32_t most_frequent_element(const std::atomic *components, const std::int64_t &samples_count = 1024) { std::int32_t *rnd_vertex_ids = allocate(vertex_allocator, samples_count); - dal::backend::primitives::engine eng; - dal::backend::primitives::rng rn_gen; - rn_gen.uniform(samples_count, rnd_vertex_ids, eng.get_state(), 0, vertex_count); + dal::backend::primitives::daal_engine eng; + dal::backend::primitives::daal_rng rn_gen; + rn_gen.uniform(samples_count, rnd_vertex_ids, eng, 0, vertex_count); std::int32_t *root_sample_counts = allocate(vertex_allocator, vertex_count); diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp index a27217c9a54..03e1782a752 100644 --- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp +++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp @@ -384,12 +384,12 @@ sycl::event train_kernel_hist_impl::gen_initial_tree_or Index* const selected_row_global_ptr = selected_row_global.get_mutable_data(); Index* const selected_row_ptr = ctx.distr_mode_ ? selected_row.get_mutable_data() : nullptr; Index* const node_list_ptr = node_list_host.get_mutable_data(); - pr::rng rn_gen; + pr::oneapi_rng rn_gen; std::cout << "here parallel for 20" << std::endl; for (Index node_idx = 0; node_idx < node_count; ++node_idx) { Index* gen_row_idx_global_ptr = selected_row_global_ptr + ctx.selected_row_total_count_ * node_idx; - rn_gen.uniform_gpu_internal(queue_, + rn_gen.uniform_gpu(queue_, ctx.selected_row_total_count_, gen_row_idx_global_ptr, rng_engine_list[engine_offset + node_idx], @@ -499,11 +499,11 @@ train_kernel_hist_impl::gen_feature_list( pr::rng rn_gen; auto selected_features_host_ptr = selected_features_host.get_mutable_data(); for (Index node = 0; node < node_count; ++node) { - rn_gen.uniform_without_replacement( + rn_gen.uniform_without_replacement_cpu( ctx.selected_ftr_count_, selected_features_host_ptr + node * ctx.selected_ftr_count_, selected_features_host_ptr + (node + 1) * ctx.selected_ftr_count_, - rng_engine_list[tree_map_ptr[node]].get_state(), + rng_engine_list[tree_map_ptr[node]].get_cpu_engine_state(), 0, ctx.column_count_); } @@ -1805,7 +1805,7 @@ sycl::event train_kernel_hist_impl::compute_results( for (Index column_idx = 0; column_idx < ctx.column_count_; ++column_idx) { rn_gen.shuffle(oob_row_count, permutation_ptr, - engine_arr[built_tree_count + tree_idx_in_block].get_state()); + engine_arr[built_tree_count + tree_idx_in_block].get_cpu_engine_state()); const Float oob_err_perm = compute_oob_error_perm(ctx, model_manager, data_host, diff --git a/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp b/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp index 79e294e9f47..ff78f06f833 100644 --- a/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp +++ b/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp @@ -206,7 +206,7 @@ inline Float move_nodes(const dal::preview::detail::topology& t, ld.random_order[index] = index; } // random shuffle - ld.rn_gen.uniform(t._vertex_count, ld.index, ld.eng.get_state(), 0, t._vertex_count); + ld.rn_gen.uniform(t._vertex_count, ld.index, ld.eng, 0, t._vertex_count); for (std::int64_t index = 0; index < t._vertex_count; ++index) { std::swap(ld.random_order[index], ld.random_order[ld.index[index]]); } diff --git a/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp b/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp index 21725b72441..d673470b042 100644 --- a/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp +++ b/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp @@ -572,13 +572,13 @@ class logloss_test : public te::float_algo_fixture rn_gen; + primitives::daal_rng rn_gen; auto vec_host = ndarray::empty(this->get_queue(), { dim }, sycl::usm::alloc::host); for (std::int32_t ij = 0; ij < num_checks; ++ij) { - primitives::engine eng(2007 + dim * num_checks + ij); - rn_gen.uniform(dim, vec_host.get_mutable_data(), eng.get_state(), -1.0, 1.0); + primitives::daal_engine eng(2007 + dim * num_checks + ij); + rn_gen.uniform(dim, vec_host.get_mutable_data(), eng, -1.0, 1.0); auto vec_gpu = vec_host.to_device(this->get_queue()); auto out_vector = ndarray::empty(this->get_queue(), { dim }, sycl::usm::alloc::device); diff --git a/cpp/oneapi/dal/backend/primitives/objective_function/test/spmd_fixture.hpp b/cpp/oneapi/dal/backend/primitives/objective_function/test/spmd_fixture.hpp index e902dd452e1..e2a611c2c98 100644 --- a/cpp/oneapi/dal/backend/primitives/objective_function/test/spmd_fixture.hpp +++ b/cpp/oneapi/dal/backend/primitives/objective_function/test/spmd_fixture.hpp @@ -100,12 +100,12 @@ class logloss_spmd_test : public logloss_test { std::int64_t num_checks = 5; std::vector> vecs_host(num_checks), vecs_gpu(num_checks); - rng rn_gen; + daal_rng rn_gen; for (std::int64_t ij = 0; ij < num_checks; ++ij) { - engine eng(2007 + dim * num_checks + ij); + daal_engine eng(2007 + dim * num_checks + ij); vecs_host[ij] = (ndarray::empty(this->get_queue(), { dim }, sycl::usm::alloc::host)); - rn_gen.uniform(dim, vecs_host[ij].get_mutable_data(), eng.get_state(), -1.0, 1.0); + rn_gen.uniform(dim, vecs_host[ij].get_mutable_data(), eng, -1.0, 1.0); vecs_gpu[ij] = vecs_host[ij].to_device(this->get_queue()); } diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp b/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp index dceaff5d52a..36e20f03c11 100644 --- a/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp @@ -43,9 +43,9 @@ class cg_solver_test : public te::float_algo_fixture { x_host_ = ndarray::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host); b_host_ = ndarray::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host); - primitives::rng rn_gen; - primitives::engine eng(4014 + n_); - rn_gen.uniform(n_, x_host_.get_mutable_data(), eng.get_state(), -1.0, 1.0); + primitives::daal_rng rn_gen; + primitives::daal_engine eng(4014 + n_); + rn_gen.uniform(n_, x_host_.get_mutable_data(), eng, -1.0, 1.0); create_stable_matrix(this->get_queue(), A_host_); diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp b/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp index 45e7195cb28..777c0ee68e2 100644 --- a/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp +++ b/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp @@ -133,11 +133,11 @@ void create_stable_matrix(sycl::queue& queue, ONEDAL_ASSERT(A.get_dimension(1) == n); auto J = ndarray::empty(queue, { n, n }, sycl::usm::alloc::host); auto eigen_values = ndarray::empty(queue, { n }, sycl::usm::alloc::host); - primitives::rng rn_gen; - primitives::engine eng(2007 + n); + primitives::daal_rng rn_gen; + primitives::daal_engine eng(2007 + n); - rn_gen.uniform(n * n, J.get_mutable_data(), eng.get_state(), -1.0, 1.0); - rn_gen.uniform(n, eigen_values.get_mutable_data(), eng.get_state(), bottom_eig, top_eig); + rn_gen.uniform(n * n, J.get_mutable_data(), eng, -1.0, 1.0); + rn_gen.uniform(n, eigen_values.get_mutable_data(), eng, bottom_eig, top_eig); // orthogonalize matrix J gram_schmidt(J); diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp b/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp index 0be54b9d6b3..d4f5ea55fb9 100644 --- a/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp @@ -56,10 +56,10 @@ class newton_cg_test : public te::float_algo_fixture { ndarray::empty(this->get_queue(), { n_ + 1 }, sycl::usm::alloc::host); auto params_host = ndarray::empty(this->get_queue(), { p_ + 1 }, sycl::usm::alloc::host); - primitives::rng rn_gen; - primitives::engine eng(2007 + n); - rn_gen.uniform(n_ * p_, X_host.get_mutable_data(), eng.get_state(), -10.0, 10.0); - rn_gen.uniform(p_ + 1, params_host.get_mutable_data(), eng.get_state(), -5.0, 5.0); + primitives::daal_rng rn_gen; + primitives::daal_engine eng(2007 + n); + rn_gen.uniform(n_ * p_, X_host.get_mutable_data(), eng, -10.0, 10.0); + rn_gen.uniform(p_ + 1, params_host.get_mutable_data(), eng, -5.0, 5.0); for (std::int64_t i = 0; i < n_; ++i) { float_t val = 0; for (std::int64_t j = 0; j < p_; ++j) { @@ -144,7 +144,7 @@ class newton_cg_test : public te::float_algo_fixture { auto b_host = ndarray::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host); primitives::rng rn_gen; primitives::engine eng(4014 + n_); - rn_gen.uniform(n_, solution_.get_mutable_data(), eng.get_state(), -1.0, 1.0); + rn_gen.uniform(n_, solution_.get_mutable_data(), eng.get_cpu_engine_state(), -1.0, 1.0); create_stable_matrix(this->get_queue(), A_host, float_t(0.1), float_t(5.0)); @@ -164,7 +164,7 @@ class newton_cg_test : public te::float_algo_fixture { auto buffer = ndarray::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host); for (std::int32_t test_num = 0; test_num < 5; ++test_num) { - rn_gen.uniform(n_, x_host.get_mutable_data(), eng.get_state(), -1.0, 1.0); + rn_gen.uniform(n_, x_host.get_mutable_data(), eng.get_cpu_engine_state(), -1.0, 1.0); auto x_gpu = x_host.to_device(this->get_queue()); auto compute_event_vec = func_->update_x(x_gpu, true, {}); wait_or_pass(compute_event_vec).wait_and_throw(); diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng.hpp index a8b1c1f6ca5..cb18a38b8dd 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/rng.hpp +++ b/cpp/oneapi/dal/backend/primitives/rng/rng.hpp @@ -16,49 +16,63 @@ #pragma once -#include "oneapi/dal/backend/primitives/ndarray.hpp" - +#include #include #include #include #include "oneapi/dal/backend/primitives/rng/utils.hpp" -#include "oneapi/dal/table/common.hpp" -#include "oneapi/dal/backend/primitives/ndarray.hpp" -#include + namespace mkl = oneapi::mkl; namespace oneapi::dal::backend::primitives { #ifdef ONEDAL_DATA_PARALLEL -enum class engine_list { mt2203, mcg59, mt19937 }; +namespace engine { +namespace v1 { + +/// Tag-type that denotes the mt2203 engine. +struct mt2203 {}; + +/// Tag-type that denotes the mcg59 engine. +struct mcg59 {}; -template -struct oneapi_engine_type; +/// Tag-type that denotes the mt19937 engine. +struct mt19937 {}; + +/// Alias tag-type for the default engine (mt2203). +using by_default = mt2203; + +} // namespace v1 +} // namespace engine + +// Helper to map engine types to corresponding oneAPI MKL engine types +template +struct select_onedal_engine; template <> -struct oneapi_engine_type { +struct select_onedal_engine { using type = oneapi::mkl::rng::mt2203; }; template <> -struct oneapi_engine_type { +struct select_onedal_engine { using type = oneapi::mkl::rng::mcg59; }; template <> -struct oneapi_engine_type { +struct select_onedal_engine { using type = oneapi::mkl::rng::mt19937; }; -template -class engine { +template +class oneapi_engine { public: - using oneapi_engine_t = typename oneapi_engine_type::type; + using onedal_engine_t = typename select_onedal_engine::type; - explicit engine(sycl::queue& queue, std::int64_t seed = 777) + explicit oneapi_engine(sycl::queue& queue, std::int64_t seed = 777) : q(queue), daal_engine_(initialize_daal_engine(seed)), - oneapi_engine_(initialize_oneapi_engine(queue, seed)), + onedal_engine_(initialize_oneapi_engine(queue, seed)), impl_(dynamic_cast( daal_engine_.get())) { if (!impl_) { @@ -66,18 +80,18 @@ class engine { } } - virtual ~engine() = default; + virtual ~oneapi_engine() = default; - void* get_state() const { + void* get_cpu_engine_state() const { return impl_->getState(); } - auto& get_daal_engine() { + auto& get_cpu_engine() { return daal_engine_; } - auto& get_oneapi_state() { - return oneapi_engine_; + auto& get_gpu_engine() { + return onedal_engine_; } void skip_ahead_cpu(size_t nSkip) { @@ -85,99 +99,142 @@ class engine { } void skip_ahead_gpu(size_t nSkip) { - if constexpr (EngineType == engine_list::mt2203) { + if constexpr (std::is_same_v) { + // GPU-specific code for mt2203 } else { - skip_ahead(oneapi_engine_, nSkip); + skip_ahead(onedal_engine_, nSkip); } } private: daal::algorithms::engines::EnginePtr initialize_daal_engine(std::int64_t seed) { - switch (EngineType) { - case engine_list::mt2203: - return daal::algorithms::engines::mt2203::Batch<>::create(seed); - case engine_list::mcg59: return daal::algorithms::engines::mcg59::Batch<>::create(seed); - case engine_list::mt19937: - return daal::algorithms::engines::mt19937::Batch<>::create(seed); - default: throw std::invalid_argument("Unsupported engine type"); + if constexpr (std::is_same_v) { + return daal::algorithms::engines::mt2203::Batch<>::create(seed); + } + else if constexpr (std::is_same_v) { + return daal::algorithms::engines::mcg59::Batch<>::create(seed); + } + else if constexpr (std::is_same_v) { + return daal::algorithms::engines::mt19937::Batch<>::create(seed); + } + else { + throw std::invalid_argument("Unsupported engine type. Supported types: mt2203, mcg59, mt19937"); } } - oneapi_engine_t initialize_oneapi_engine(sycl::queue& queue, std::int64_t seed) { - if constexpr (EngineType == engine_list::mt2203) { - return oneapi_engine_t(queue, - seed, - 0); // its necessary for aligning cpu and gpu results + onedal_engine_t initialize_oneapi_engine(sycl::queue& queue, std::int64_t seed) { + if constexpr (std::is_same_v) { + return onedal_engine_t(queue, seed, 0); // Aligns CPU and GPU results for mt2203 } else { - return oneapi_engine_t(queue, seed); + return onedal_engine_t(queue, seed); } } + sycl::queue q; daal::algorithms::engines::EnginePtr daal_engine_; - oneapi_engine_t oneapi_engine_; + onedal_engine_t onedal_engine_; daal::algorithms::engines::internal::BatchBaseImpl* impl_; }; template -class rng { +class oneapi_rng { public: - rng() = default; - ~rng() = default; + oneapi_rng() = default; + ~oneapi_rng() = default; - template + template void uniform(sycl::queue& queue, Size count, Type* dst, - engine& engine_, + oneapi_engine& engine_, Type a, Type b, bool distr_mode = false, const event_vector& deps = {}); - template - void uniform_gpu_internal(sycl::queue& queue, - Size count, - Type* dst, - engine& engine_, - Type a, - Type b, - const event_vector& deps = {}); - // template - // void uniform_without_replacement(sycl::queue& queue, - // Size count, - // Type* dst, - // std::uint8_t* state, - // Type a, - // Type b, - // const event_vector& deps = {}); - - template - void uniform_cpu(Size count, Type* dst, engine& engine_, Type a, Type b) { - void* state = engine_.get_state(); - engine_.skip_ahead_gpu(count); + template + void uniform_gpu(sycl::queue& queue, + Size count, + Type* dst, + oneapi_engine& engine_, + Type a, + Type b, + const event_vector& deps = {}); + + template + void uniform_cpu(Size count, Type* dst, oneapi_engine& engine_, Type a, Type b) { + void* state = engine_.get_cpu_engine_state(); + engine_.skip_ahead_cpu(count); uniform_dispatcher::uniform_by_cpu(count, dst, state, a, b); } - void uniform_without_replacement(Size count, + template + void uniform_without_replacement(sycl::queue& queue, + Size count, Type* dst, - Type* buffer, - void* state, + oneapi_engine& engine_, Type a, - Type b) { - uniform_dispatcher::uniform_without_replacement_by_cpu(count, - dst, - buffer, - state, - a, - b); + Type b, + const event_vector& deps = {}) { + } + + template + void uniform_without_replacement_gpu(sycl::queue& queue, + Size count, + Type* dst, + oneapi_engine& engine_, + Type a, + Type b, + const event_vector& deps = {}) { } - template >> - void shuffle(Size count, Type* dst, void* state) { + template + void uniform_without_replacement_cpu(Size count, + Type* dst, + Type* buffer, + oneapi_engine& engine_, + Type a, + Type b) { + void* state = engine_.get_cpu_engine_state(); + engine_.skip_ahead_gpu(count); + uniform_dispatcher::uniform_without_replacement_by_cpu(count, dst, buffer, state, a, b); + } + + template >> + void shuffle(Size count, Type* dst, oneapi_engine& engine_) { Type idx[2]; + void* state = engine_.get_cpu_engine_state(); + engine_.skip_ahead_gpu(count); + + for (Size i = 0; i < count; ++i) { + uniform_dispatcher::uniform_by_cpu(2, idx, state, 0, count); + std::swap(dst[idx[0]], dst[idx[1]]); + } + } + + template >> + void shuffle_gpu(Size count, Type* dst, oneapi_engine& engine_) { + Type idx[2]; + + void* state = engine_.get_cpu_engine_state(); + engine_.skip_ahead_gpu(count); + + for (Size i = 0; i < count; ++i) { + uniform_dispatcher::uniform_by_gpu(2, idx, engine_.get_gpu_engine(), 0, count); + std::swap(dst[idx[0]], dst[idx[1]]); + } + } + + template >> + void shuffle_cpu(Size count, Type* dst, oneapi_engine& engine_) { + Type idx[2]; + + void* state = engine_.get_cpu_engine_state(); + engine_.skip_ahead_gpu(count); + for (Size i = 0; i < count; ++i) { uniform_dispatcher::uniform_by_cpu(2, idx, state, 0, count); std::swap(dst[idx[0]], dst[idx[1]]); diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp index 22eb6a950e4..adfea074998 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp +++ b/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp @@ -17,27 +17,87 @@ #pragma once #include - +#include +#include #include "oneapi/dal/backend/primitives/rng/utils.hpp" namespace oneapi::dal::backend::primitives { +namespace engine { +namespace v1 { + +/// Tag-type that denotes the mt2203 engine. +struct mt2203 {}; + +/// Tag-type that denotes the mcg59 engine. +struct mcg59 {}; + +/// Tag-type that denotes the mt19937 engine. +struct mt19937 {}; + +/// Alias tag-type for the default engine (mt2203). +using by_default = mt2203; + +} // namespace v1 +} // namespace engine + +template +class daal_engine { +public: + explicit daal_engine(std::int64_t seed = 777) + : daal_engine_(initialize_daal_engine(seed)), + impl_(dynamic_cast( + daal_engine_.get())) { + if (!impl_) { + throw std::domain_error("RNG engine is not supported"); + } + } + + virtual ~daal_engine() = default; + + void* get_cpu_engine_state() const { + return impl_->getState(); + } + + auto& get_cpu_engine() { + return daal_engine_; + } +private: + daal::algorithms::engines::EnginePtr initialize_daal_engine(std::int64_t seed) { + switch (EngineType) { + case engine_list::mt2203: + return daal::algorithms::engines::mt2203::Batch<>::create(seed); + case engine_list::mcg59: return daal::algorithms::engines::mcg59::Batch<>::create(seed); + case engine_list::mt19937: + return daal::algorithms::engines::mt19937::Batch<>::create(seed); + default: throw std::invalid_argument("Unsupported engine type"); + } + } + + daal::algorithms::engines::EnginePtr daal_engine_; + daal::algorithms::engines::internal::BatchBaseImpl* impl_; +}; + template -class rng { +class daal_rng { public: - rng() = default; - ~rng() = default; + daal_rng() = default; + ~daal_rng() = default; - void uniform(Size count, Type* dst, void* state, Type a, Type b) { + template + void uniform(Size count, Type* dst, daal_engine& engine_, Type a, Type b) { + void* state = engine_.get_cpu_engine_state(); uniform_dispatcher::uniform_by_cpu(count, dst, state, a, b); } - void uniform_without_replacement(Size count, + template + void uniform_without_replacement_cpu(Size count, Type* dst, Type* buffer, - void* state, + daal_engine& engine_, Type a, Type b) { + void* state = engine_.get_cpu_engine_state(); uniform_dispatcher::uniform_without_replacement_by_cpu(count, dst, buffer, @@ -46,56 +106,17 @@ class rng { b); } - template >> - void shuffle(Size count, Type* dst, void* state) { + template >> + void shuffle(Size count, Type* dst, daal_engine& engine_) { Type idx[2]; + void* state = engine_.get_cpu_engine_state(); + for (Size i = 0; i < count; ++i) { uniform_dispatcher::uniform_by_cpu(2, idx, state, 0, count); std::swap(dst[idx[0]], dst[idx[1]]); } } - -private: - daal::internal::RNGsInst daal_rng_; -}; - -class engine { -public: - explicit engine(std::int64_t seed = 777) - : engine_(daal::algorithms::engines::mt2203::Batch<>::create(seed)) { - impl_ = dynamic_cast(engine_.get()); - if (!impl_) { - throw domain_error(dal::detail::error_messages::rng_engine_is_not_supported()); - } - } - - explicit engine(const daal::algorithms::engines::EnginePtr& eng) : engine_(eng) { - impl_ = dynamic_cast(eng.get()); - if (!impl_) { - throw domain_error(dal::detail::error_messages::rng_engine_is_not_supported()); - } - } - - virtual ~engine() = default; - - engine& operator=(const daal::algorithms::engines::EnginePtr& eng) { - engine_ = eng; - impl_ = dynamic_cast(eng.get()); - if (!impl_) { - throw domain_error(dal::detail::error_messages::rng_engine_is_not_supported()); - } - - return *this; - } - - void* get_state() const { - return impl_->getState(); - } - -private: - daal::algorithms::engines::EnginePtr engine_; - daal::algorithms::engines::internal::BatchBaseImpl* impl_; }; } // namespace oneapi::dal::backend::primitives \ No newline at end of file diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp index 74363680394..1b8b91f24e3 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp @@ -24,14 +24,14 @@ namespace bk = oneapi::dal::backend; template template -void rng::uniform_gpu_internal(sycl::queue& queue, +void oneapi_rng::uniform_gpu(sycl::queue& queue, Size count, Type* dst, engine& engine_, Type a, Type b, const event_vector& deps) { - auto local_engine = engine_.get_oneapi_state(); + auto local_engine = engine_.get_gpu_engine(); oneapi::mkl::rng::uniform distr(a, b); auto event = oneapi::mkl::rng::generate(distr, local_engine, count, dst, { deps }); event.wait_and_throw(); @@ -41,7 +41,7 @@ void rng::uniform_gpu_internal(sycl::queue& queue, template template -void rng::uniform(sycl::queue& queue, +void oneapi_rng::uniform(sycl::queue& queue, Size count, Type* dst, engine& engine_, @@ -49,19 +49,19 @@ void rng::uniform(sycl::queue& queue, Type b, bool distr_mode /* = false */, const event_vector& deps) { - if (count > 5000) { - uniform_gpu_internal(queue, count, dst, engine_, a, b); - } - else { - uniform_cpu(count, dst, engine_, a, b); - } + // if (count > 5000) { + uniform_gpu(queue, count, dst, engine_, a, b); + // } + // else { + // uniform_cpu(count, dst, engine_, a, b); + // } } // template -// void rng::uniform_without_replacement(sycl::queue& queue, +// void oneapi_rng::uniform_without_replacement(sycl::queue& queue, // Size count, // Type* dst, -// std::uint8_t* state, +// engine& engine_, // Type a, // Type b, // const event_vector& deps) { @@ -105,7 +105,7 @@ void rng::uniform(sycl::queue& queue, // } #define INSTANTIATE(F, Size, EngineType) \ - template ONEDAL_EXPORT void rng::uniform(sycl::queue& queue, \ + template ONEDAL_EXPORT void oneapi_rng::uniform(sycl::queue& queue, \ Size count_, \ F* dst, \ engine& engine_, \ @@ -129,7 +129,7 @@ INSTANTIATE_FLOAT(std::int64_t); INSTANTIATE_FLOAT(std::int32_t); #define INSTANTIATE_(F, Size, EngineType) \ - template ONEDAL_EXPORT void rng::uniform_gpu_internal(sycl::queue& queue, \ + template ONEDAL_EXPORT void oneapi_rng::uniform_gpu(sycl::queue& queue, \ Size count_, \ F* dst, \ engine& engine_, \ diff --git a/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp index 1ba5e9fc365..7de6dd37a75 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp @@ -149,7 +149,7 @@ class rng_test : public te::policy_fixture { // auto rng_engine_ = this->get_engine(seed); // rn_gen.uniform_cpu(elem_count, arr_host_ptr, rng_engine, 0, elem_count); -// rn_gen.uniform_gpu_internal(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine_, 0, elem_count); +// rn_gen.uniform_gpu(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine_, 0, elem_count); // this->check_results(arr_gpu, arr_host); // } @@ -185,7 +185,7 @@ class rng_test : public te::policy_fixture { // // auto rng_engine_1 = this->get_engine(seed); // // auto rng_engine_2 = this->get_engine(seed); // // BENCHMARK("Uniform GPU arr" + std::to_string(elem_count)) { -// // rn_gen_.uniform_gpu_internal(this->get_queue(), +// // rn_gen_.uniform_gpu(this->get_queue(), // // elem_count, // // arr_gpu_ptr_, // // rng_engine_1, @@ -217,7 +217,7 @@ class rng_test : public te::policy_fixture { // rn_gen.uniform_cpu(elem_count, arr_host_init_1_ptr, rng_engine, 0, elem_count); // rn_gen.uniform_cpu(elem_count, arr_host_init_2_ptr, rng_engine_2, 0, elem_count); -// rn_gen.uniform_gpu_internal(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine, 0, elem_count); +// rn_gen.uniform_gpu(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine, 0, elem_count); // rn_gen.uniform_cpu(elem_count, arr_host_ptr, rng_engine_2, 0, elem_count); // this->check_results_host(arr_host_init_1, arr_host_init_2); @@ -240,11 +240,11 @@ class rng_test : public te::policy_fixture { // auto rng_engine = this->get_engine(seed); // auto rng_engine_2 = this->get_engine(seed); -// rn_gen.uniform_gpu_internal(this->get_queue(), elem_count, arr_device_init_1_ptr, rng_engine, 0, elem_count); +// rn_gen.uniform_gpu(this->get_queue(), elem_count, arr_device_init_1_ptr, rng_engine, 0, elem_count); // rn_gen -// .uniform_gpu_internal(this->get_queue(), elem_count, arr_device_init_2_ptr, rng_engine_2, 0, elem_count); +// .uniform_gpu(this->get_queue(), elem_count, arr_device_init_2_ptr, rng_engine_2, 0, elem_count); -// rn_gen.uniform_gpu_internal(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine, 0, elem_count); +// rn_gen.uniform_gpu(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine, 0, elem_count); // rn_gen.uniform_cpu(elem_count, arr_host_ptr, rng_engine_2, 0, elem_count); // this->check_results_device(arr_device_init_1, arr_device_init_2); From d990e8eb6440664adfd9e298bbe62b14de2ec818 Mon Sep 17 00:00:00 2001 From: Alexandr-Solovev Date: Thu, 10 Oct 2024 08:05:38 -0700 Subject: [PATCH 4/5] upd rng --- .../vertex_partitioning_default_kernel.hpp | 2 +- .../backend/gpu/infer_kernel_impl_dpc.cpp | 6 +- .../backend/gpu/train_feature_type_dpc.cpp | 6 +- .../backend/gpu/train_kernel_hist_impl.hpp | 3 +- .../gpu/train_kernel_hist_impl_dpc.cpp | 29 +- .../backend/gpu/train_service_kernels_dpc.cpp | 9 - .../backend/gpu/train_splitter_impl_dpc.cpp | 3 - .../algo/louvain/backend/cpu/louvain_data.hpp | 4 +- .../vertex_partitioning_default_kernel.hpp | 2 +- cpp/oneapi/dal/backend/primitives/rng/rng.hpp | 124 ++++----- .../dal/backend/primitives/rng/rng_cpu.hpp | 55 ++-- .../dal/backend/primitives/rng/rng_dpc.cpp | 251 ++++++++---------- .../primitives/rng/rng_engine_collection.hpp | 6 +- 13 files changed, 214 insertions(+), 286 deletions(-) diff --git a/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp b/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp index 7215b86df04..e139dd50dc8 100644 --- a/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp +++ b/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp @@ -92,7 +92,7 @@ std::int32_t most_frequent_element(const std::atomic *components, dal::backend::primitives::daal_engine eng; dal::backend::primitives::daal_rng rn_gen; - rn_gen.uniform(samples_count, rnd_vertex_ids, eng, 0, vertex_count); + rn_gen.uniform(samples_count, rnd_vertex_ids, eng.get_cpu_engine_state(), 0, vertex_count); std::int32_t *root_sample_counts = allocate(vertex_allocator, vertex_count); diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/infer_kernel_impl_dpc.cpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/infer_kernel_impl_dpc.cpp index c2ddd331905..b7a871f9fb6 100644 --- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/infer_kernel_impl_dpc.cpp +++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/infer_kernel_impl_dpc.cpp @@ -143,7 +143,6 @@ infer_kernel_impl::predict_by_tree_group_weighted( { local_size, 1 }); sycl::event last_event = zero_obs_response_event; - std::cout << "here parallel for 2" << std::endl; for (Index proc_tree_count = 0; proc_tree_count < tree_count; proc_tree_count += ctx.tree_in_group_count) { last_event = queue_.submit([&](sycl::handler& cgh) { @@ -249,7 +248,7 @@ infer_kernel_impl::predict_by_tree_group(const infer_context { local_size, 1 }); sycl::event last_event = zero_obs_response_event; - std::cout << "here parallel for 3" << std::endl; + for (Index proc_tree_count = 0; proc_tree_count < tree_count; proc_tree_count += ctx.tree_in_group_count) { last_event = queue_.submit([&](sycl::handler& cgh) { @@ -352,7 +351,7 @@ infer_kernel_impl::reduce_tree_group_response( be::make_multiple_nd_range_1d({ ctx.max_group_count * local_size }, { local_size }); sycl::event last_event = zero_response_event; - std::cout << "here parallel for 4" << std::endl; + last_event = queue_.submit([&](sycl::handler& cgh) { cgh.depends_on(deps); cgh.depends_on(last_event); @@ -437,7 +436,6 @@ infer_kernel_impl::determine_winner(const infer_context_t& c { ctx.max_local_size }); sycl::event last_event; - std::cout << "here loop 1" << std::endl; last_event = queue_.submit([&](sycl::handler& cgh) { cgh.depends_on(deps); cgh.parallel_for(nd_range, [=](sycl::nd_item<1> item) { diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_feature_type_dpc.cpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_feature_type_dpc.cpp index 5485149c9b2..09f0c198c4d 100644 --- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_feature_type_dpc.cpp +++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_feature_type_dpc.cpp @@ -104,7 +104,7 @@ sycl::event indexed_features::collect_bin_borders( const Float* values = values_nd.get_data(); const Index* bin_offsets = bin_offsets_nd.get_data(); Float* bin_borders = bin_borders_nd.get_mutable_data(); - std::cout << "here parallel for 10" << std::endl; + auto event = queue_.submit([&](sycl::handler& cgh) { cgh.depends_on(deps); cgh.parallel_for(range, [=](sycl::id<1> idx) { @@ -141,7 +141,7 @@ sycl::event indexed_features::fill_bin_map( const Index* indices = indices_nd.get_data(); const Float* bin_borders = bin_borders_nd.get_data(); Bin* bins = bins_nd.get_mutable_data(); - std::cout << "here parallel for 11" << std::endl; + auto event = queue_.submit([&](sycl::handler& cgh) { cgh.depends_on(deps); cgh.parallel_for(nd_range, [=](sycl::nd_item<1> item) { @@ -376,7 +376,7 @@ sycl::event indexed_features::store_column( Bin* full_data = full_data_nd.get_mutable_data(); const sycl::range<1> range = de::integral_cast(column_data_nd.get_dimension(0)); - std::cout << "here parallel for 12" << std::endl; + auto event = queue_.submit([&](sycl::handler& h) { h.depends_on(deps); h.parallel_for(range, [=](sycl::id<1> idx) { diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl.hpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl.hpp index 036c41d6a9c..2c52ab55481 100644 --- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl.hpp +++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl.hpp @@ -21,6 +21,7 @@ #include "oneapi/dal/backend/primitives/utils.hpp" #include "oneapi/dal/algo/decision_forest/train_types.hpp" #include "oneapi/dal/backend/primitives/rng/rng.hpp" +#include "oneapi/dal/backend/primitives/rng/rng_cpu.hpp" #include "oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp" #include "oneapi/dal/algo/decision_forest/backend/gpu/train_misc_structs.hpp" @@ -50,7 +51,7 @@ class train_kernel_hist_impl { using model_manager_t = train_model_manager; using train_context_t = train_context; using imp_data_t = impurity_data; - using rng_engine_t = pr::engine; + using rng_engine_t = pr::oneapi_engine; using rng_engine_list_t = std::vector; using msg = dal::detail::error_messages; using comm_t = bk::communicator; diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp index 03e1782a752..ec4c03581db 100644 --- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp +++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp @@ -385,17 +385,17 @@ sycl::event train_kernel_hist_impl::gen_initial_tree_or Index* const selected_row_ptr = ctx.distr_mode_ ? selected_row.get_mutable_data() : nullptr; Index* const node_list_ptr = node_list_host.get_mutable_data(); pr::oneapi_rng rn_gen; - std::cout << "here parallel for 20" << std::endl; + for (Index node_idx = 0; node_idx < node_count; ++node_idx) { Index* gen_row_idx_global_ptr = selected_row_global_ptr + ctx.selected_row_total_count_ * node_idx; rn_gen.uniform_gpu(queue_, - ctx.selected_row_total_count_, - gen_row_idx_global_ptr, - rng_engine_list[engine_offset + node_idx], - 0, - ctx.row_total_count_, - { deps }); + ctx.selected_row_total_count_, + gen_row_idx_global_ptr, + rng_engine_list[engine_offset + node_idx], + 0, + ctx.row_total_count_, + { deps }); if (ctx.distr_mode_) { Index* node_ptr = node_list_ptr + node_idx * impl_const_t::node_prop_count_; @@ -451,7 +451,6 @@ sycl::event train_kernel_hist_impl::gen_initial_tree_or // in case of no bootstrap // it is valid case if this worker's rows set wasn't taken for tree build // i.e. row_count can be eq 0 - std::cout << "here parallel for 21" << std::endl; Index* node_list_ptr = node_list_host.get_mutable_data(); auto set_event = queue_.submit([&](sycl::handler& cgh) { cgh.parallel_for(sycl::range<1>{ std::size_t(node_count) }, [=](sycl::id<1> idx) { @@ -496,7 +495,7 @@ train_kernel_hist_impl::gen_feature_list( if (ctx.selected_ftr_count_ != ctx.column_count_) { auto tree_map_ptr = node_vs_tree_map_list_host.get_mutable_data(); - pr::rng rn_gen; + pr::daal_rng rn_gen; auto selected_features_host_ptr = selected_features_host.get_mutable_data(); for (Index node = 0; node < node_count; ++node) { rn_gen.uniform_without_replacement_cpu( @@ -515,7 +514,6 @@ train_kernel_hist_impl::gen_feature_list( } else { sycl::event fill_event; - std::cout << "here parallel for 22" << std::endl; for (Index node = 0; node < node_count; ++node) { auto selected_features_host_ptr = selected_features_com.get_mutable_data(); @@ -545,7 +543,7 @@ train_kernel_hist_impl::gen_random_thresholds( auto node_vs_tree_map_list_host = node_vs_tree_map.to_host(queue_); - pr::rng rn_gen; + pr::oneapi_rng rn_gen; auto tree_map_ptr = node_vs_tree_map_list_host.get_mutable_data(); // Create arrays for random generated bins @@ -1800,12 +1798,13 @@ sycl::event train_kernel_hist_impl::compute_results( const Float div1 = Float(1) / Float(built_tree_count + tree_idx_in_block + 1); - pr::rng rn_gen; + pr::daal_rng rn_gen; for (Index column_idx = 0; column_idx < ctx.column_count_; ++column_idx) { - rn_gen.shuffle(oob_row_count, - permutation_ptr, - engine_arr[built_tree_count + tree_idx_in_block].get_cpu_engine_state()); + rn_gen.shuffle( + oob_row_count, + permutation_ptr, + engine_arr[built_tree_count + tree_idx_in_block].get_cpu_engine_state()); const Float oob_err_perm = compute_oob_error_perm(ctx, model_manager, data_host, diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_service_kernels_dpc.cpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_service_kernels_dpc.cpp index 638596cd404..cb841561dc3 100644 --- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_service_kernels_dpc.cpp +++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_service_kernels_dpc.cpp @@ -59,7 +59,6 @@ sycl::event train_service_kernels::initialize_tree_orde Index* tree_order_ptr = tree_order.get_mutable_data(); const sycl::range<2> range{ de::integral_cast(row_count), de::integral_cast(tree_count) }; - std::cout << "here parallel for 30" << std::endl; auto event = queue_.submit([&](sycl::handler& cgh) { cgh.depends_on(deps); cgh.parallel_for(range, [=](sycl::id<2> id) { @@ -91,7 +90,6 @@ sycl::event train_service_kernels::get_split_node_count auto krn_local_size = preferable_sbg_size_; const sycl::nd_range<1> nd_range = bk::make_multiple_nd_range_1d(krn_local_size, krn_local_size); - std::cout << "here parallel for 31" << std::endl; auto event = queue_.submit([&](sycl::handler& cgh) { cgh.depends_on(deps); cgh.parallel_for(nd_range, [=](sycl::nd_item<1> item) { @@ -155,7 +153,6 @@ train_service_kernels::calculate_left_child_row_count_o const sycl::nd_range<1> nd_range = bk::make_multiple_nd_range_1d(preferable_partition_groups_count_ * krn_local_size, krn_local_size); - std::cout << "here parallel for 32" << std::endl; auto event = queue_.submit([&](sycl::handler& cgh) { cgh.depends_on(deps); cgh.parallel_for(nd_range, [=](sycl::nd_item<1> item) { @@ -282,7 +279,6 @@ sycl::event train_service_kernels::do_level_partition_b const sycl::nd_range<1> nd_range = bk::make_multiple_nd_range_1d(preferable_partition_groups_count_ * krn_local_size, krn_local_size); - std::cout << "here parallel for 33" << std::endl; auto event = queue_.submit([&](sycl::handler& cgh) { cgh.depends_on(deps); cgh.parallel_for(nd_range, [=](sycl::nd_item<1> item) { @@ -417,7 +413,6 @@ sycl::event train_service_kernels::update_mdi_var_impor const Index leaf_mark = impl_const_t::leaf_mark_; const Index max_sub_groups_num = max_sbg_count_per_group_; //need to calculate it via device info - std::cout << "here parallel for 35" << std::endl; auto event = queue_.submit([&](sycl::handler& cgh) { cgh.depends_on(deps); sycl::local_accessor buf(max_sub_groups_num, cgh); @@ -506,7 +501,6 @@ sycl::event train_service_kernels::mark_present_rows( const sycl::nd_range<1> nd_range = bk::make_multiple_nd_range_1d(krn_local_size * sbg_sum_count, krn_local_size); - std::cout << "here parallel for 36" << std::endl; auto event = queue_.submit([&](sycl::handler& cgh) { cgh.depends_on(deps); cgh.parallel_for(nd_range, [=](sycl::nd_item<1> item) { @@ -558,7 +552,6 @@ sycl::event train_service_kernels::count_absent_rows_fo const sycl::nd_range<1> nd_range = bk::make_multiple_nd_range_1d(krn_local_size * sbg_sum_count, krn_local_size); - std::cout << "here parallel for 37" << std::endl; auto event = queue_.submit([&](sycl::handler& cgh) { cgh.depends_on(deps); cgh.parallel_for(nd_range, [=](sycl::nd_item<1> item) { @@ -617,7 +610,6 @@ sycl::event train_service_kernels::count_absent_rows_to const sycl::nd_range<1> nd_range = bk::make_multiple_nd_range_1d(krn_local_size * sbg_sum_count, krn_local_size); - std::cout << "here parallel for 38" << std::endl; auto event = queue_.submit([&](sycl::handler& cgh) { cgh.depends_on(deps); cgh.parallel_for(nd_range, [=](sycl::nd_item<1> item) { @@ -673,7 +665,6 @@ sycl::event train_service_kernels::fill_oob_rows_list_b const sycl::nd_range<1> nd_range = bk::make_multiple_nd_range_1d(krn_local_size * sbg_sum_count, krn_local_size); - std::cout << "here parallel for 39.1" << std::endl; auto event = queue_.submit([&](sycl::handler& cgh) { cgh.depends_on(deps); cgh.parallel_for(nd_range, [=](sycl::nd_item<1> item) { diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_splitter_impl_dpc.cpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_splitter_impl_dpc.cpp index 50f8e75f7d7..3990d99d63d 100644 --- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_splitter_impl_dpc.cpp +++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_splitter_impl_dpc.cpp @@ -134,7 +134,6 @@ sycl::event train_splitter_impl::random_split( const auto nd_range = bk::make_multiple_nd_range_2d({ local_size, node_in_block_count }, { local_size, 1 }); - std::cout << "here parallel for 60" << std::endl; sycl::event last_event = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(deps); local_accessor_rw_t local_hist_buf(hist_size, cgh); @@ -554,7 +553,6 @@ sycl::event train_splitter_impl::best_split( // Main kernel: // calculates histograms and impurity decrease based on histograms // and selects best split for each feature. - std::cout << "here parallel for 70" << std::endl; sycl::event last_event = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(deps); local_accessor_rw_t hist(bin_block * hist_prop_count, cgh); @@ -687,7 +685,6 @@ sycl::event train_splitter_impl::best_split( } }); }); - std::cout << "here parallel for 71" << std::endl; // Merging kernel: selects best split among all features. const auto merge_range = bk::make_multiple_nd_range_2d({ node_count, local_size }, { 1, local_size }); diff --git a/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp b/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp index 1be76f1cea2..290f1fc5215 100644 --- a/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp +++ b/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp @@ -123,8 +123,8 @@ struct louvain_data { // Total link weight in the network value_type m; - engine eng; - rng rn_gen; + daal_engine eng; + daal_rng rn_gen; const std::int64_t vertex_count; const std::int64_t edge_count; diff --git a/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp b/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp index ff78f06f833..7b277d88283 100644 --- a/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp +++ b/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp @@ -206,7 +206,7 @@ inline Float move_nodes(const dal::preview::detail::topology& t, ld.random_order[index] = index; } // random shuffle - ld.rn_gen.uniform(t._vertex_count, ld.index, ld.eng, 0, t._vertex_count); + ld.rn_gen.uniform(t._vertex_count, ld.index, ld.eng.get_cpu_engine_state(), 0, t._vertex_count); for (std::int64_t index = 0; index < t._vertex_count; ++index) { std::swap(ld.random_order[index], ld.random_order[ld.index[index]]); } diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng.hpp index cb18a38b8dd..0c02b784f31 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/rng.hpp +++ b/cpp/oneapi/dal/backend/primitives/rng/rng.hpp @@ -16,58 +16,40 @@ #pragma once -#include #include #include #include #include "oneapi/dal/backend/primitives/rng/utils.hpp" - +#include namespace mkl = oneapi::mkl; namespace oneapi::dal::backend::primitives { #ifdef ONEDAL_DATA_PARALLEL -namespace engine { -namespace v1 { - -/// Tag-type that denotes the mt2203 engine. -struct mt2203 {}; - -/// Tag-type that denotes the mcg59 engine. -struct mcg59 {}; - -/// Tag-type that denotes the mt19937 engine. -struct mt19937 {}; - -/// Alias tag-type for the default engine (mt2203). -using by_default = mt2203; - -} // namespace v1 -} // namespace engine +enum class engine_list { mt2203, mcg59, mt19937 }; -// Helper to map engine types to corresponding oneAPI MKL engine types -template -struct select_onedal_engine; +template +struct oneapi_engine_type; template <> -struct select_onedal_engine { +struct oneapi_engine_type { using type = oneapi::mkl::rng::mt2203; }; template <> -struct select_onedal_engine { +struct oneapi_engine_type { using type = oneapi::mkl::rng::mcg59; }; template <> -struct select_onedal_engine { +struct oneapi_engine_type { using type = oneapi::mkl::rng::mt19937; }; -template +template class oneapi_engine { public: - using onedal_engine_t = typename select_onedal_engine::type; + using onedal_engine_t = typename oneapi_engine_type::type; explicit oneapi_engine(sycl::queue& queue, std::int64_t seed = 777) : q(queue), @@ -99,8 +81,7 @@ class oneapi_engine { } void skip_ahead_gpu(size_t nSkip) { - if constexpr (std::is_same_v) { - // GPU-specific code for mt2203 + if constexpr (EngineType == engine_list::mt2203) { } else { skip_ahead(onedal_engine_, nSkip); @@ -109,29 +90,25 @@ class oneapi_engine { private: daal::algorithms::engines::EnginePtr initialize_daal_engine(std::int64_t seed) { - if constexpr (std::is_same_v) { - return daal::algorithms::engines::mt2203::Batch<>::create(seed); - } - else if constexpr (std::is_same_v) { - return daal::algorithms::engines::mcg59::Batch<>::create(seed); - } - else if constexpr (std::is_same_v) { - return daal::algorithms::engines::mt19937::Batch<>::create(seed); - } - else { - throw std::invalid_argument("Unsupported engine type. Supported types: mt2203, mcg59, mt19937"); + switch (EngineType) { + case engine_list::mt2203: + return daal::algorithms::engines::mt2203::Batch<>::create(seed); + case engine_list::mcg59: return daal::algorithms::engines::mcg59::Batch<>::create(seed); + case engine_list::mt19937: + return daal::algorithms::engines::mt19937::Batch<>::create(seed); + default: throw std::invalid_argument("Unsupported engine type"); } } onedal_engine_t initialize_oneapi_engine(sycl::queue& queue, std::int64_t seed) { - if constexpr (std::is_same_v) { - return onedal_engine_t(queue, seed, 0); // Aligns CPU and GPU results for mt2203 + if constexpr (EngineType == engine_list::mt2203) { + return onedal_engine_t(queue, seed, + 0); // Aligns CPU and GPU results for mt2203 } else { return onedal_engine_t(queue, seed); } } - sycl::queue q; daal::algorithms::engines::EnginePtr daal_engine_; onedal_engine_t onedal_engine_; @@ -144,7 +121,7 @@ class oneapi_rng { oneapi_rng() = default; ~oneapi_rng() = default; - template + template void uniform(sycl::queue& queue, Size count, Type* dst, @@ -154,7 +131,7 @@ class oneapi_rng { bool distr_mode = false, const event_vector& deps = {}); - template + template void uniform_gpu(sycl::queue& queue, Size count, Type* dst, @@ -163,34 +140,28 @@ class oneapi_rng { Type b, const event_vector& deps = {}); - template - void uniform_cpu(Size count, Type* dst, oneapi_engine& engine_, Type a, Type b) { - void* state = engine_.get_cpu_engine_state(); - engine_.skip_ahead_cpu(count); - uniform_dispatcher::uniform_by_cpu(count, dst, state, a, b); - } - - template + template + void uniform_cpu(Size count, Type* dst, oneapi_engine& engine_, Type a, Type b); + template void uniform_without_replacement(sycl::queue& queue, Size count, Type* dst, oneapi_engine& engine_, Type a, Type b, - const event_vector& deps = {}) { - } + const event_vector& deps = {}) {} - template + template void uniform_without_replacement_gpu(sycl::queue& queue, Size count, Type* dst, + Type* buff, oneapi_engine& engine_, Type a, Type b, - const event_vector& deps = {}) { - } + const event_vector& deps = {}); - template + template void uniform_without_replacement_cpu(Size count, Type* dst, Type* buffer, @@ -199,10 +170,17 @@ class oneapi_rng { Type b) { void* state = engine_.get_cpu_engine_state(); engine_.skip_ahead_gpu(count); - uniform_dispatcher::uniform_without_replacement_by_cpu(count, dst, buffer, state, a, b); + uniform_dispatcher::uniform_without_replacement_by_cpu(count, + dst, + buffer, + state, + a, + b); } - template >> + template >> void shuffle(Size count, Type* dst, oneapi_engine& engine_) { Type idx[2]; @@ -215,20 +193,16 @@ class oneapi_rng { } } - template >> - void shuffle_gpu(Size count, Type* dst, oneapi_engine& engine_) { - Type idx[2]; - - void* state = engine_.get_cpu_engine_state(); - engine_.skip_ahead_gpu(count); - - for (Size i = 0; i < count; ++i) { - uniform_dispatcher::uniform_by_gpu(2, idx, engine_.get_gpu_engine(), 0, count); - std::swap(dst[idx[0]], dst[idx[1]]); - } - } + template + void shuffle_gpu(sycl::queue& queue, + Size count, + Type* dst, + oneapi_engine& engine_, + const event_vector& deps); - template >> + template >> void shuffle_cpu(Size count, Type* dst, oneapi_engine& engine_) { Type idx[2]; @@ -243,4 +217,4 @@ class oneapi_rng { }; #endif -} // namespace oneapi::dal::backend::primitives +} // namespace oneapi::dal::backend::primitives \ No newline at end of file diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp index adfea074998..9a893d471f6 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp +++ b/cpp/oneapi/dal/backend/primitives/rng/rng_cpu.hpp @@ -20,28 +20,15 @@ #include #include #include "oneapi/dal/backend/primitives/rng/utils.hpp" +#include +#include +#include namespace oneapi::dal::backend::primitives { -namespace engine { -namespace v1 { +enum class engine_list_cpu { mt2203, mcg59, mt19937 }; -/// Tag-type that denotes the mt2203 engine. -struct mt2203 {}; - -/// Tag-type that denotes the mcg59 engine. -struct mcg59 {}; - -/// Tag-type that denotes the mt19937 engine. -struct mt19937 {}; - -/// Alias tag-type for the default engine (mt2203). -using by_default = mt2203; - -} // namespace v1 -} // namespace engine - -template +template class daal_engine { public: explicit daal_engine(std::int64_t seed = 777) @@ -62,13 +49,15 @@ class daal_engine { auto& get_cpu_engine() { return daal_engine_; } + private: daal::algorithms::engines::EnginePtr initialize_daal_engine(std::int64_t seed) { switch (EngineType) { - case engine_list::mt2203: + case engine_list_cpu::mt2203: return daal::algorithms::engines::mt2203::Batch<>::create(seed); - case engine_list::mcg59: return daal::algorithms::engines::mcg59::Batch<>::create(seed); - case engine_list::mt19937: + case engine_list_cpu::mcg59: + return daal::algorithms::engines::mcg59::Batch<>::create(seed); + case engine_list_cpu::mt19937: return daal::algorithms::engines::mt19937::Batch<>::create(seed); default: throw std::invalid_argument("Unsupported engine type"); } @@ -84,20 +73,16 @@ class daal_rng { daal_rng() = default; ~daal_rng() = default; - template - void uniform(Size count, Type* dst, daal_engine& engine_, Type a, Type b) { - void* state = engine_.get_cpu_engine_state(); + void uniform(Size count, Type* dst, void* state, Type a, Type b) { uniform_dispatcher::uniform_by_cpu(count, dst, state, a, b); } - template void uniform_without_replacement_cpu(Size count, - Type* dst, - Type* buffer, - daal_engine& engine_, - Type a, - Type b) { - void* state = engine_.get_cpu_engine_state(); + Type* dst, + Type* buffer, + void* state, + Type a, + Type b) { uniform_dispatcher::uniform_without_replacement_by_cpu(count, dst, buffer, @@ -106,12 +91,10 @@ class daal_rng { b); } - template >> - void shuffle(Size count, Type* dst, daal_engine& engine_) { + template >> + void shuffle(Size count, Type* dst, void* state) { Type idx[2]; - void* state = engine_.get_cpu_engine_state(); - for (Size i = 0; i < count; ++i) { uniform_dispatcher::uniform_by_cpu(2, idx, state, 0, count); std::swap(dst[idx[0]], dst[idx[1]]); @@ -119,4 +102,4 @@ class daal_rng { } }; -} // namespace oneapi::dal::backend::primitives \ No newline at end of file +} // namespace oneapi::dal::backend::primitives diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp index 1b8b91f24e3..5d7e3cb3322 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp @@ -25,12 +25,12 @@ namespace bk = oneapi::dal::backend; template template void oneapi_rng::uniform_gpu(sycl::queue& queue, - Size count, - Type* dst, - engine& engine_, - Type a, - Type b, - const event_vector& deps) { + Size count, + Type* dst, + oneapi_engine& engine_, + Type a, + Type b, + const event_vector& deps) { auto local_engine = engine_.get_gpu_engine(); oneapi::mkl::rng::uniform distr(a, b); auto event = oneapi::mkl::rng::generate(distr, local_engine, count, dst, { deps }); @@ -41,101 +41,58 @@ void oneapi_rng::uniform_gpu(sycl::queue& queue, template template -void oneapi_rng::uniform(sycl::queue& queue, - Size count, - Type* dst, - engine& engine_, - Type a, - Type b, - bool distr_mode /* = false */, - const event_vector& deps) { - // if (count > 5000) { - uniform_gpu(queue, count, dst, engine_, a, b); - // } - // else { - // uniform_cpu(count, dst, engine_, a, b); - // } +void oneapi_rng::uniform_cpu(Size count, + Type* dst, + oneapi_engine& engine_, + Type a, + Type b) { + void* state = engine_.get_cpu_engine_state(); + engine_.skip_ahead_cpu(count); + uniform_dispatcher::uniform_by_cpu(count, dst, state, a, b); +} + +template +template +void oneapi_rng::uniform_without_replacement_gpu(sycl::queue& queue, + Size count, + Type* dst, + Type* buffer, + oneapi_engine& engine_, + Type a, + Type b, + const event_vector& deps) { + void* state = engine_.get_cpu_engine_state(); + engine_.skip_ahead_gpu(count); + uniform_dispatcher::uniform_without_replacement_by_cpu(count, dst, buffer, state, a, b); +} + +template +template +void oneapi_rng::shuffle_gpu(sycl::queue& queue, + Size count, + Type* dst, + oneapi_engine& engine_, + const event_vector& deps) { + Type idx[2]; + + void* state = engine_.get_cpu_engine_state(); + engine_.skip_ahead_gpu(count); + + for (Size i = 0; i < count; ++i) { + uniform_dispatcher::uniform_by_cpu(2, idx, state, 0, count); + std::swap(dst[idx[0]], dst[idx[1]]); + } } -// template -// void oneapi_rng::uniform_without_replacement(sycl::queue& queue, -// Size count, -// Type* dst, -// engine& engine_, -// Type a, -// Type b, -// const event_vector& deps) { -// auto engine = oneapi::mkl::rng::load_state(queue, state); - -// oneapi::mkl::rng::uniform distr; -// auto local_buf = -// ndarray::empty(queue, { std::int64_t(b) }, sycl::usm::alloc::device); -// auto local_buf_ptr = local_buf.get_mutable_data(); - -// auto random_buf = ndarray::empty(queue, { count }, sycl::usm::alloc::device); -// auto random_buf_ptr = random_buf.get_mutable_data(); - -// auto fill_event = queue.submit([&](sycl::handler& cgh) { -// cgh.depends_on(deps); -// cgh.parallel_for(sycl::range<1>{ std::size_t(b) }, [=](sycl::id<1> idx) { -// local_buf_ptr[idx] = idx; -// }); -// }); -// fill_event.wait_and_throw(); - -// auto event = oneapi::mkl::rng::generate(distr, engine, count, random_buf_ptr); -// event.wait_and_throw(); - -// queue -// .submit([&](sycl::handler& h) { -// h.parallel_for(sycl::range<1>{ std::size_t(1) }, [=](sycl::id<1> idx) { -// for (std::int64_t i = 0; i < count; ++i) { -// auto j = i + (size_t)(random_buf_ptr[i] * (float)(b - i)); -// auto tmp = local_buf_ptr[i]; -// local_buf_ptr[i] = local_buf_ptr[j]; -// local_buf_ptr[j] = tmp; -// } -// for (std::int64_t i = 0; i < count; ++i) { -// dst[i] = local_buf_ptr[i]; -// } -// }); -// }) -// .wait_and_throw(); -// mkl::rng::save_state(engine, state); -// } - -#define INSTANTIATE(F, Size, EngineType) \ - template ONEDAL_EXPORT void oneapi_rng::uniform(sycl::queue& queue, \ - Size count_, \ - F* dst, \ - engine& engine_, \ - F a, \ - F b, \ - bool dist, \ - const event_vector& deps); - -#define INSTANTIATE_FLOAT(Size) \ - INSTANTIATE(float, Size, engine_list::mt2203) \ - INSTANTIATE(float, Size, engine_list::mcg59) \ - INSTANTIATE(float, Size, engine_list::mt19937) \ - INSTANTIATE(double, Size, engine_list::mt2203) \ - INSTANTIATE(double, Size, engine_list::mcg59) \ - INSTANTIATE(double, Size, engine_list::mt19937) \ - INSTANTIATE(int, Size, engine_list::mt2203) \ - INSTANTIATE(int, Size, engine_list::mcg59) \ - INSTANTIATE(int, Size, engine_list::mt19937) - -INSTANTIATE_FLOAT(std::int64_t); -INSTANTIATE_FLOAT(std::int32_t); - -#define INSTANTIATE_(F, Size, EngineType) \ - template ONEDAL_EXPORT void oneapi_rng::uniform_gpu(sycl::queue& queue, \ - Size count_, \ - F* dst, \ - engine& engine_, \ - F a, \ - F b, \ - const event_vector& deps); +#define INSTANTIATE_(F, Size, EngineType) \ + template ONEDAL_EXPORT void oneapi_rng::uniform_gpu( \ + sycl::queue& queue, \ + Size count_, \ + F* dst, \ + oneapi_engine& engine_, \ + F a, \ + F b, \ + const event_vector& deps); #define INSTANTIATE_FLOAT_(Size) \ INSTANTIATE_(float, Size, engine_list::mt2203) \ @@ -151,39 +108,67 @@ INSTANTIATE_FLOAT(std::int32_t); INSTANTIATE_FLOAT_(std::int64_t); INSTANTIATE_FLOAT_(std::int32_t); -// #define INSTANTIATE_WO_REPLACEMENT(F, Size) \ -// template ONEDAL_EXPORT void rng::uniform_without_replacement( \ -// sycl::queue& queue, \ -// Size count_, \ -// F* dst, \ -// std::uint8_t* state, \ -// F a, \ -// F b, \ -// const event_vector& deps); - -// #define INSTANTIATE_WO_REPLACEMENT_FLOAT(Size) \ -// INSTANTIATE_WO_REPLACEMENT(float, Size) \ -// INSTANTIATE_WO_REPLACEMENT(double, Size) \ -// INSTANTIATE_WO_REPLACEMENT(int, Size) - -// INSTANTIATE_WO_REPLACEMENT_FLOAT(std::int64_t); -// INSTANTIATE_WO_REPLACEMENT_FLOAT(std::int32_t); - -// #define INSTANTIATE_WO_REPLACEMENT_MT2203(F, Size) \ -// template ONEDAL_EXPORT void rng::uniform_mt2203(sycl::queue& queue, \ -// Size count_, \ -// F* dst, \ -// std::int64_t state, \ -// F a, \ -// F b, \ -// const event_vector& deps); - -// #define INSTANTIATE_WO_REPLACEMENT_MT2203_FLOAT(Size) \ -// INSTANTIATE_WO_REPLACEMENT_MT2203(float, Size) \ -// INSTANTIATE_WO_REPLACEMENT_MT2203(double, Size) \ -// INSTANTIATE_WO_REPLACEMENT_MT2203(int, Size) - -// INSTANTIATE_WO_REPLACEMENT_MT2203_FLOAT(std::int64_t); -// INSTANTIATE_WO_REPLACEMENT_MT2203_FLOAT(std::int32_t); +#define INSTANTIATE_CPU(F, Size, EngineType) \ + template ONEDAL_EXPORT void oneapi_rng::uniform_cpu( \ + Size count_, \ + F* dst, \ + oneapi_engine& engine_, \ + F a, \ + F b); + +#define INSTANTIATE_FLOAT_CPU(Size) \ + INSTANTIATE_CPU(float, Size, engine_list::mt2203) \ + INSTANTIATE_CPU(float, Size, engine_list::mcg59) \ + INSTANTIATE_CPU(float, Size, engine_list::mt19937) \ + INSTANTIATE_CPU(double, Size, engine_list::mt2203) \ + INSTANTIATE_CPU(double, Size, engine_list::mcg59) \ + INSTANTIATE_CPU(double, Size, engine_list::mt19937) \ + INSTANTIATE_CPU(int, Size, engine_list::mt2203) \ + INSTANTIATE_CPU(int, Size, engine_list::mcg59) \ + INSTANTIATE_CPU(int, Size, engine_list::mt19937) + +INSTANTIATE_FLOAT_CPU(std::int64_t); +INSTANTIATE_FLOAT_CPU(std::int32_t); + +#define INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(F, Size, EngineType) \ + template ONEDAL_EXPORT void oneapi_rng::uniform_without_replacement_gpu( \ + sycl::queue& queue, \ + Size count_, \ + F* dst, \ + F* buff, \ + oneapi_engine& engine_, \ + F a, \ + F b, \ + const event_vector& deps); + +#define INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(Size) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mt2203) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mcg59) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(float, Size, engine_list::mt19937) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mt2203) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mcg59) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(double, Size, engine_list::mt19937) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::mt2203) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::mcg59) \ + INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU(int, Size, engine_list::mt19937) + +INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(std::int64_t); +INSTANTIATE_UNIFORM_WITHOUT_REPLACEMENT_GPU_FLOAT(std::int32_t); + +#define INSTANTIATE_SHUFFLE(F, Size, EngineType) \ + template ONEDAL_EXPORT void oneapi_rng::shuffle_gpu( \ + sycl::queue& queue, \ + Size count_, \ + F* dst, \ + oneapi_engine& engine_, \ + const event_vector& deps); + +#define INSTANTIATE_SHUFFLE_FLOAT(Size) \ + INSTANTIATE_SHUFFLE(int, Size, engine_list::mt2203) \ + INSTANTIATE_SHUFFLE(int, Size, engine_list::mcg59) \ + INSTANTIATE_SHUFFLE(int, Size, engine_list::mt19937) + +INSTANTIATE_SHUFFLE_FLOAT(std::int64_t); +INSTANTIATE_SHUFFLE_FLOAT(std::int32_t); } // namespace oneapi::dal::backend::primitives diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp index 9aff7ab3bc6..81ce6bf852b 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp +++ b/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp @@ -38,18 +38,18 @@ class engine_collection { seed_(seed) { engines_.reserve(count_); for (Size i = 0; i < count_; ++i) { - engines_.push_back(engine(queue, seed_)); + engines_.push_back(oneapi_engine(queue, seed_)); } } - std::vector> get_engines() const { + std::vector> get_engines() const { return engines_; } private: Size count_; std::int64_t seed_; - std::vector> engines_; + std::vector> engines_; }; #endif From 423bcd20a6d8ef0e637331dc1d1b93ef865ecdc2 Mon Sep 17 00:00:00 2001 From: Alexandr-Solovev Date: Fri, 18 Oct 2024 04:15:42 -0700 Subject: [PATCH 5/5] fixes for forest --- .../backend/gpu/train_splitter_impl_dpc.cpp | 18 +++- .../decision_forest/df_cls_hist_batch.cpp | 89 +++++++++++++++++++ 2 files changed, 106 insertions(+), 1 deletion(-) diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_splitter_impl_dpc.cpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_splitter_impl_dpc.cpp index 3990d99d63d..cf8779547ec 100644 --- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_splitter_impl_dpc.cpp +++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_splitter_impl_dpc.cpp @@ -536,7 +536,23 @@ sycl::event train_splitter_impl::best_split( const Index bin_block = compute_bin_block_size(queue, hist_prop_count, bin_count); - const Index local_size = bk::device_max_wg_size(queue); + const Index local_size_initial = bk::device_max_wg_size(queue); + Index local_size = local_size_initial; + const auto max_int_limit = std::numeric_limits::max(); + + if (node_count * ftr_count > 0 && node_count * ftr_count <= max_int_limit) { + while (node_count * ftr_count * local_size > max_int_limit) { + local_size /= 2; + } + } else { + std::cerr << "Error: node_count * ftr_count exceeds int limit" << std::endl; + } + + std::cout << "node count = " << node_count << std::endl; + std::cout << "ftr_count = " << ftr_count << std::endl; + std::cout << "local_size = " << local_size << std::endl; + std::cout << "total range size = " << node_count * ftr_count * local_size << std::endl; + const auto nd_range = bk::make_multiple_nd_range_3d({ node_count, ftr_count, local_size }, { 1, 1, local_size }); diff --git a/examples/oneapi/dpc/source/decision_forest/df_cls_hist_batch.cpp b/examples/oneapi/dpc/source/decision_forest/df_cls_hist_batch.cpp index 4900ea6fd54..3e7503108a6 100644 --- a/examples/oneapi/dpc/source/decision_forest/df_cls_hist_batch.cpp +++ b/examples/oneapi/dpc/source/decision_forest/df_cls_hist_batch.cpp @@ -85,3 +85,92 @@ int main(int argc, char const* argv[]) { } return 0; } + +// /******************************************************************************* +// * Copyright 2020 Intel Corporation +// * +// * Licensed under the Apache License, Version 2.0 (the "License"); +// * you may not use this file except in compliance with the License. +// * You may obtain a copy of the License at +// * +// * http://www.apache.org/licenses/LICENSE-2.0 +// * +// * Unless required by applicable law or agreed to in writing, software +// * distributed under the License is distributed on an "AS IS" BASIS, +// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// * See the License for the specific language governing permissions and +// * limitations under the License. +// *******************************************************************************/ + +// #ifndef ONEDAL_DATA_PARALLEL +// #define ONEDAL_DATA_PARALLEL +// #endif + +// #include "oneapi/dal/algo/decision_forest.hpp" +// #include "oneapi/dal/io/csv.hpp" +// #include "oneapi/dal/backend/primitives/ndarray.hpp" +// #include "example_util/utils.hpp" +// #include "oneapi/dal/exceptions.hpp" + +// namespace dal = oneapi::dal; +// namespace df = dal::decision_forest; +// namespace pr = oneapi::dal::backend::primitives; + +// void run(sycl::queue& q) { +// // const auto train_data_file_name = get_data_path("df_classification_train_data.csv"); +// const auto train_response_file_name = get_data_path("df_classification_train_label.csv"); +// const auto test_data_file_name = get_data_path("df_classification_test_data.csv"); +// const auto test_response_file_name = get_data_path("df_classification_test_label.csv"); + +// std::int64_t row_count = 1666660; +// std::int64_t column_count = 1000; +// const auto x_train_ = pr::ndarray::empty({ row_count, column_count }); +// const auto x_train = oneapi::dal::homogen_table::wrap(x_train_.flatten(), row_count, column_count); +// const auto y_train_ = pr::ndarray::empty({ row_count, 1 }); +// const auto y_train = oneapi::dal::homogen_table::wrap(y_train_.flatten(), row_count, 1); + + +// const auto x_test = dal::read(q, dal::csv::data_source{ test_data_file_name }); +// const auto y_test = dal::read(q, dal::csv::data_source{ test_response_file_name }); + +// const auto df_desc = +// df::descriptor{} +// .set_class_count(10) +// .set_tree_count(10) +// .set_features_per_node(x_train.get_column_count()) +// .set_error_metric_mode(df::error_metric_mode::out_of_bag_error) +// .set_variable_importance_mode(df::variable_importance_mode::mdi) +// .set_infer_mode(df::infer_mode::class_responses | df::infer_mode::class_probabilities) +// .set_voting_mode(df::voting_mode::weighted); + +// try { +// const auto result_train = dal::train(q, df_desc, x_train, y_train); + +// // std::cout << "Variable importance results:\n" +// // << result_train.get_var_importance() << std::endl; + +// std::cout << "OOB error: " << result_train.get_oob_err() << std::endl; + +// const auto result_infer = dal::infer(q, df_desc, result_train.get_model(), x_test); + +// std::cout << "Prediction results:\n" << result_infer.get_responses() << std::endl; +// std::cout << "Probabilities results:\n" << result_infer.get_probabilities() << std::endl; + +// std::cout << "Ground truth:\n" << y_test << std::endl; +// } +// catch (dal::unimplemented& e) { +// std::cout << " " << e.what() << std::endl; +// return; +// } +// } + +// int main(int argc, char const* argv[]) { +// for (auto d : list_devices()) { +// std::cout << "Running on " << d.get_platform().get_info() +// << ", " << d.get_info() << "\n" +// << std::endl; +// auto q = sycl::queue{ d }; +// run(q); +// } +// return 0; +// }