From 031efef6d4eb6695b270d243a764d1b14f4e6fe3 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Mon, 9 Dec 2024 10:18:46 +0100
Subject: [PATCH] Optimize tuning compile times (#3074)

---
 cub/benchmarks/bench/partition/flagged.cu    |  8 +++++++-
 cub/benchmarks/bench/partition/if.cu         |  8 +++++++-
 cub/benchmarks/bench/scan/exclusive/base.cuh |  6 +++++-
 cub/benchmarks/bench/select/flagged.cu       |  9 ++++++++-
 cub/benchmarks/bench/select/if.cu            |  9 ++++++++-
 cub/benchmarks/bench/select/unique.cu        |  9 ++++++++-
 cub/benchmarks/bench/transform/heavy.cu      | 14 +++++++++++---
 7 files changed, 54 insertions(+), 9 deletions(-)

diff --git a/cub/benchmarks/bench/partition/flagged.cu b/cub/benchmarks/bench/partition/flagged.cu
index ab2fd83dca7..fcd81e660f6 100644
--- a/cub/benchmarks/bench/partition/flagged.cu
+++ b/cub/benchmarks/bench/partition/flagged.cu
@@ -182,7 +182,13 @@ void flagged(nvbench::state& state, nvbench::type_list<T, OffsetT, UseDistinctPa
   });
 }
 
-using distinct_partitions = nvbench::type_list<::cuda::std::false_type, ::cuda::std::true_type>;
+using ::cuda::std::false_type; // brought into scope so the unqualified names below resolve
+using ::cuda::std::true_type;
+#ifdef TUNE_DistinctPartitions
+using distinct_partitions = nvbench::type_list<TUNE_DistinctPartitions>; // expands to "false_type" or "true_type"
+#else // !defined(TUNE_DistinctPartitions)
+using distinct_partitions = nvbench::type_list<false_type, true_type>; // no override: keep both values
+#endif // TUNE_DistinctPartitions
 
 NVBENCH_BENCH_TYPES(flagged, NVBENCH_TYPE_AXES(fundamental_types, offset_types, distinct_partitions))
   .set_name("base")
diff --git a/cub/benchmarks/bench/partition/if.cu b/cub/benchmarks/bench/partition/if.cu
index 5fc4f82f6d9..d456e65fc1c 100644
--- a/cub/benchmarks/bench/partition/if.cu
+++ b/cub/benchmarks/bench/partition/if.cu
@@ -208,7 +208,13 @@ void partition(nvbench::state& state, nvbench::type_list<T, OffsetT, UseDistinct
   });
 }
 
-using distinct_partitions = nvbench::type_list<::cuda::std::false_type, ::cuda::std::true_type>;
+using ::cuda::std::false_type; // brought into scope so the unqualified names below resolve
+using ::cuda::std::true_type;
+#ifdef TUNE_DistinctPartitions
+using distinct_partitions = nvbench::type_list<TUNE_DistinctPartitions>; // expands to "false_type" or "true_type"
+#else // !defined(TUNE_DistinctPartitions)
+using distinct_partitions = nvbench::type_list<false_type, true_type>; // no override: keep both values
+#endif // TUNE_DistinctPartitions
 
 NVBENCH_BENCH_TYPES(partition, NVBENCH_TYPE_AXES(fundamental_types, offset_types, distinct_partitions))
   .set_name("base")
diff --git a/cub/benchmarks/bench/scan/exclusive/base.cuh b/cub/benchmarks/bench/scan/exclusive/base.cuh
index e3cd7a7be8e..b5c793ca612 100644
--- a/cub/benchmarks/bench/scan/exclusive/base.cuh
+++ b/cub/benchmarks/bench/scan/exclusive/base.cuh
@@ -131,7 +131,11 @@ static void basic(nvbench::state& state, nvbench::type_list<T, OffsetT>)
   });
 }
 
-using some_offset_types = nvbench::type_list<nvbench::uint32_t, nvbench::uint64_t>;
+#ifdef TUNE_OffsetT
+using some_offset_types = nvbench::type_list<TUNE_OffsetT>; // presumably one of the types below; TODO confirm
+#else // !defined(TUNE_OffsetT)
+using some_offset_types = nvbench::type_list<uint32_t, uint64_t>; // no override: keep both offset types
+#endif // TUNE_OffsetT
 
 NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(all_types, some_offset_types))
   .set_name("base")
diff --git a/cub/benchmarks/bench/select/flagged.cu b/cub/benchmarks/bench/select/flagged.cu
index 51e3f04d028..3a180a65adc 100644
--- a/cub/benchmarks/bench/select/flagged.cu
+++ b/cub/benchmarks/bench/select/flagged.cu
@@ -163,7 +163,14 @@ void select(nvbench::state& state, nvbench::type_list<T, OffsetT, MayAlias>)
   });
 }
 
-using may_alias = nvbench::type_list<::cuda::std::false_type, ::cuda::std::true_type>;
+using ::cuda::std::false_type; // brought into scope so the unqualified names below resolve
+using ::cuda::std::true_type;
+#ifdef TUNE_MayAlias
+using may_alias = nvbench::type_list<TUNE_MayAlias>; // expands to "false_type" or "true_type"
+#else // !defined(TUNE_MayAlias)
+using may_alias = nvbench::type_list<false_type, true_type>; // no override: keep both values
+#endif // TUNE_MayAlias
+
 // The implementation of DeviceSelect for 64-bit offset types uses a streaming approach, where it runs multiple passes
 // using a 32-bit offset type, so we only need to test one (to save time for tuning and the benchmark CI).
 using select_offset_types = nvbench::type_list<int64_t>;
diff --git a/cub/benchmarks/bench/select/if.cu b/cub/benchmarks/bench/select/if.cu
index 00b2763fe49..03d6d57a1ad 100644
--- a/cub/benchmarks/bench/select/if.cu
+++ b/cub/benchmarks/bench/select/if.cu
@@ -189,7 +189,14 @@ void select(nvbench::state& state, nvbench::type_list<T, OffsetT, MayAlias>)
   });
 }
 
-using may_alias = nvbench::type_list<::cuda::std::false_type, ::cuda::std::true_type>;
+using ::cuda::std::false_type; // brought into scope so the unqualified names below resolve
+using ::cuda::std::true_type;
+#ifdef TUNE_MayAlias
+using may_alias = nvbench::type_list<TUNE_MayAlias>; // expands to "false_type" or "true_type"
+#else // !defined(TUNE_MayAlias)
+using may_alias = nvbench::type_list<false_type, true_type>; // no override: keep both values
+#endif // TUNE_MayAlias
+
 // The implementation of DeviceSelect for 64-bit offset types uses a streaming approach, where it runs multiple passes
 // using a 32-bit offset type, so we only need to test one (to save time for tuning and the benchmark CI).
 using select_offset_types = nvbench::type_list<int64_t>;
diff --git a/cub/benchmarks/bench/select/unique.cu b/cub/benchmarks/bench/select/unique.cu
index 276db2782ae..110fe4d8fd2 100644
--- a/cub/benchmarks/bench/select/unique.cu
+++ b/cub/benchmarks/bench/select/unique.cu
@@ -141,7 +141,14 @@ static void unique(nvbench::state& state, nvbench::type_list<T, OffsetT, MayAlia
   });
 }
 
-using may_alias = nvbench::type_list<::cuda::std::false_type, ::cuda::std::true_type>;
+using ::cuda::std::false_type; // brought into scope so the unqualified names below resolve
+using ::cuda::std::true_type;
+#ifdef TUNE_MayAlias
+using may_alias = nvbench::type_list<TUNE_MayAlias>; // expands to "false_type" or "true_type"
+#else // !defined(TUNE_MayAlias)
+using may_alias = nvbench::type_list<false_type, true_type>; // no override: keep both values
+#endif // TUNE_MayAlias
+
 // The implementation of DeviceSelect for 64-bit offset types uses a streaming approach, where it runs multiple passes
 // using a 32-bit offset type, so we only need to test one (to save time for tuning and the benchmark CI).
 using select_offset_types = nvbench::type_list<int64_t>;
diff --git a/cub/benchmarks/bench/transform/heavy.cu b/cub/benchmarks/bench/transform/heavy.cu
index be17a04fd8c..00729c03ae1 100644
--- a/cub/benchmarks/bench/transform/heavy.cu
+++ b/cub/benchmarks/bench/transform/heavy.cu
@@ -53,10 +53,18 @@ static void heavy(nvbench::state& state, nvbench::type_list<Heaviness>)
   bench_transform(state, ::cuda::std::tuple{in.begin()}, out.begin(), n, heavy_functor<Heaviness::value>{});
 }
 
-template <int I>
-using ic = ::cuda::std::integral_constant<int, I>;
+using ::cuda::std::integral_constant; // brought into scope so the unqualified name below resolves
+#ifdef TUNE_Heaviness
+using heaviness = nvbench::type_list<TUNE_Heaviness>; // expands to "integral_constant<int, ...>"
+#else // !defined(TUNE_Heaviness)
+using heaviness =
+  nvbench::type_list<integral_constant<int, 32>,
+                     integral_constant<int, 64>,
+                     integral_constant<int, 128>,
+                     integral_constant<int, 256>>;
+#endif // TUNE_Heaviness
 
-NVBENCH_BENCH_TYPES(heavy, NVBENCH_TYPE_AXES(nvbench::type_list<ic<32>, ic<64>, ic<128>, ic<256>>))
+NVBENCH_BENCH_TYPES(heavy, NVBENCH_TYPE_AXES(heaviness)) // "heaviness" is the only type axis passed here
   .set_name("heavy")
   .set_type_axes_names({"Heaviness{ct}"})
   .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4));