Optimize tuning compile times (#3074)

NVIDIA · Dec 9, 2024 · 031efef
1 parent d6c4d8d
commit 031efef
Show file tree

Hide file tree

Showing 7 changed files with 54 additions and 9 deletions.
diff --git a/cub/benchmarks/bench/partition/flagged.cu b/cub/benchmarks/bench/partition/flagged.cu
@@ -182,7 +182,13 @@ void flagged(nvbench::state& state, nvbench::type_list<T, OffsetT, UseDistinctPa
   });
 }
 
-using distinct_partitions = nvbench::type_list<::cuda::std::false_type, ::cuda::std::true_type>;
+using ::cuda::std::false_type;
+using ::cuda::std::true_type;
+#ifdef TUNE_DistinctPartitions
+using distinct_partitions = nvbench::type_list<TUNE_DistinctPartitions>; // expands to "false_type" or "true_type"
+#else // !defined(TUNE_DistinctPartitions)
+using distinct_partitions = nvbench::type_list<false_type, true_type>;
+#endif // TUNE_DistinctPartitions
 
 NVBENCH_BENCH_TYPES(flagged, NVBENCH_TYPE_AXES(fundamental_types, offset_types, distinct_partitions))
   .set_name("base")

diff --git a/cub/benchmarks/bench/partition/if.cu b/cub/benchmarks/bench/partition/if.cu
@@ -208,7 +208,13 @@ void partition(nvbench::state& state, nvbench::type_list<T, OffsetT, UseDistinct
   });
 }
 
-using distinct_partitions = nvbench::type_list<::cuda::std::false_type, ::cuda::std::true_type>;
+using ::cuda::std::false_type;
+using ::cuda::std::true_type;
+#ifdef TUNE_DistinctPartitions
+using distinct_partitions = nvbench::type_list<TUNE_DistinctPartitions>; // expands to "false_type" or "true_type"
+#else // !defined(TUNE_DistinctPartitions)
+using distinct_partitions = nvbench::type_list<false_type, true_type>;
+#endif // TUNE_DistinctPartitions
 
 NVBENCH_BENCH_TYPES(partition, NVBENCH_TYPE_AXES(fundamental_types, offset_types, distinct_partitions))
   .set_name("base")

diff --git a/cub/benchmarks/bench/scan/exclusive/base.cuh b/cub/benchmarks/bench/scan/exclusive/base.cuh
@@ -131,7 +131,11 @@ static void basic(nvbench::state& state, nvbench::type_list<T, OffsetT>)
   });
 }
 
-using some_offset_types = nvbench::type_list<nvbench::uint32_t, nvbench::uint64_t>;
+#ifdef TUNE_OffsetT
+using some_offset_types = nvbench::type_list<TUNE_OffsetT>;
+#else
+using some_offset_types = nvbench::type_list<uint32_t, uint64_t>;
+#endif
 
 NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(all_types, some_offset_types))
   .set_name("base")

diff --git a/cub/benchmarks/bench/select/flagged.cu b/cub/benchmarks/bench/select/flagged.cu
@@ -163,7 +163,14 @@ void select(nvbench::state& state, nvbench::type_list<T, OffsetT, MayAlias>)
   });
 }
 
-using may_alias = nvbench::type_list<::cuda::std::false_type, ::cuda::std::true_type>;
+using ::cuda::std::false_type;
+using ::cuda::std::true_type;
+#ifdef TUNE_MayAlias
+using may_alias = nvbench::type_list<TUNE_MayAlias>; // expands to "false_type" or "true_type"
+#else // !defined(TUNE_MayAlias)
+using may_alias = nvbench::type_list<false_type, true_type>;
+#endif // TUNE_MayAlias
+
 // The implementation of DeviceSelect for 64-bit offset types uses a streaming approach, where it runs multiple passes
 // using a 32-bit offset type, so we only need to test one (to save time for tuning and the benchmark CI).
 using select_offset_types = nvbench::type_list<int64_t>;

diff --git a/cub/benchmarks/bench/select/if.cu b/cub/benchmarks/bench/select/if.cu
@@ -189,7 +189,14 @@ void select(nvbench::state& state, nvbench::type_list<T, OffsetT, MayAlias>)
   });
 }
 
-using may_alias = nvbench::type_list<::cuda::std::false_type, ::cuda::std::true_type>;
+using ::cuda::std::false_type;
+using ::cuda::std::true_type;
+#ifdef TUNE_MayAlias
+using may_alias = nvbench::type_list<TUNE_MayAlias>; // expands to "false_type" or "true_type"
+#else // !defined(TUNE_MayAlias)
+using may_alias = nvbench::type_list<false_type, true_type>;
+#endif // TUNE_MayAlias
+
 // The implementation of DeviceSelect for 64-bit offset types uses a streaming approach, where it runs multiple passes
 // using a 32-bit offset type, so we only need to test one (to save time for tuning and the benchmark CI).
 using select_offset_types = nvbench::type_list<int64_t>;

diff --git a/cub/benchmarks/bench/select/unique.cu b/cub/benchmarks/bench/select/unique.cu
@@ -141,7 +141,14 @@ static void unique(nvbench::state& state, nvbench::type_list<T, OffsetT, MayAlia
   });
 }
 
-using may_alias = nvbench::type_list<::cuda::std::false_type, ::cuda::std::true_type>;
+using ::cuda::std::false_type;
+using ::cuda::std::true_type;
+#ifdef TUNE_MayAlias
+using may_alias = nvbench::type_list<TUNE_MayAlias>; // expands to "false_type" or "true_type"
+#else // !defined(TUNE_MayAlias)
+using may_alias = nvbench::type_list<false_type, true_type>;
+#endif // TUNE_MayAlias
+
 // The implementation of DeviceSelect for 64-bit offset types uses a streaming approach, where it runs multiple passes
 // using a 32-bit offset type, so we only need to test one (to save time for tuning and the benchmark CI).
 using select_offset_types = nvbench::type_list<int64_t>;

diff --git a/cub/benchmarks/bench/transform/heavy.cu b/cub/benchmarks/bench/transform/heavy.cu
@@ -53,10 +53,18 @@ static void heavy(nvbench::state& state, nvbench::type_list<Heaviness>)
   bench_transform(state, ::cuda::std::tuple{in.begin()}, out.begin(), n, heavy_functor<Heaviness::value>{});
 }
 
-template <int I>
-using ic = ::cuda::std::integral_constant<int, I>;
+using ::cuda::std::integral_constant;
+#ifdef TUNE_Heaviness
+using heaviness = nvbench::type_list<TUNE_Heaviness>; // expands to "integral_constant<int, ...>"
+#else
+using heaviness =
+  nvbench::type_list<integral_constant<int, 32>,
+                     integral_constant<int, 64>,
+                     integral_constant<int, 128>,
+                     integral_constant<int, 256>>;
+#endif
 
-NVBENCH_BENCH_TYPES(heavy, NVBENCH_TYPE_AXES(nvbench::type_list<ic<32>, ic<64>, ic<128>, ic<256>>))
+NVBENCH_BENCH_TYPES(heavy, NVBENCH_TYPE_AXES(heaviness))
   .set_name("heavy")
   .set_type_axes_names({"Heaviness{ct}"})
   .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4));