From 89e38c8bde2e977f1bd29f3fd18c761dc80ef306 Mon Sep 17 00:00:00 2001 From: Max Yang Date: Fri, 15 Mar 2024 23:37:35 -0700 Subject: [PATCH 01/14] Add ordered memory atomics from desul --- include/RAJA/policy/desul/atomic.hpp | 204 +++++++++++++++++---------- include/RAJA/policy/desul/policy.hpp | 62 ++++++++ 2 files changed, 191 insertions(+), 75 deletions(-) create mode 100644 include/RAJA/policy/desul/policy.hpp diff --git a/include/RAJA/policy/desul/atomic.hpp b/include/RAJA/policy/desul/atomic.hpp index dbcb5e06eb..9fdec5ccc9 100644 --- a/include/RAJA/policy/desul/atomic.hpp +++ b/include/RAJA/policy/desul/atomic.hpp @@ -12,10 +12,9 @@ #if defined(RAJA_ENABLE_DESUL_ATOMICS) -#include "RAJA/util/macros.hpp" - #include "RAJA/policy/atomic_builtin.hpp" - +#include "RAJA/policy/desul/policy.hpp" +#include "RAJA/util/macros.hpp" #include "desul/atomics.hpp" // Default desul options for RAJA @@ -26,153 +25,208 @@ using raja_default_desul_scope = desul::MemoryScopeDevice; namespace RAJA { +namespace detail +{ +template +struct DesulAtomicPolicy { + using memory_order = raja_default_desul_order; + using memory_scope = raja_default_desul_scope; +}; + +template +struct DesulAtomicPolicy> { + using memory_order = OrderingPolicy; + using memory_scope = ScopePolicy; +}; + +} // namespace detail + RAJA_SUPPRESS_HD_WARN template -RAJA_HOST_DEVICE -RAJA_INLINE T -atomicAdd(AtomicPolicy, T volatile *acc, T value) { - return desul::atomic_fetch_add(const_cast(acc), +RAJA_HOST_DEVICE RAJA_INLINE T atomicAdd(AtomicPolicy, T volatile *acc, T value) +{ + using desul_order = + typename detail::DesulAtomicPolicy::memory_order; + using desul_scope = + typename detail::DesulAtomicPolicy::memory_scope; + return desul::atomic_fetch_add(const_cast(acc), value, - raja_default_desul_order{}, - raja_default_desul_scope{}); + desul_order{}, + desul_scope{}); } RAJA_SUPPRESS_HD_WARN template -RAJA_HOST_DEVICE -RAJA_INLINE T -atomicSub(AtomicPolicy, T volatile *acc, T value) { - return desul::atomic_fetch_sub(const_cast(acc), +RAJA_HOST_DEVICE RAJA_INLINE T atomicSub(AtomicPolicy, T volatile *acc, T value) +{ + using desul_order = + typename detail::DesulAtomicPolicy::memory_order; + using desul_scope = + typename detail::DesulAtomicPolicy::memory_scope; + return desul::atomic_fetch_sub(const_cast(acc), value, - raja_default_desul_order{}, - raja_default_desul_scope{}); + desul_order{}, + desul_scope{}); } RAJA_SUPPRESS_HD_WARN template -RAJA_HOST_DEVICE -RAJA_INLINE T atomicMin(AtomicPolicy, T volatile *acc, T value) +RAJA_HOST_DEVICE RAJA_INLINE T atomicMin(AtomicPolicy, T volatile *acc, T value) { - return desul::atomic_fetch_min(const_cast(acc), + using desul_order = + typename detail::DesulAtomicPolicy::memory_order; + using desul_scope = + typename detail::DesulAtomicPolicy::memory_scope; + return desul::atomic_fetch_min(const_cast(acc), value, - raja_default_desul_order{}, - raja_default_desul_scope{}); + desul_order{}, + desul_scope{}); } RAJA_SUPPRESS_HD_WARN template -RAJA_HOST_DEVICE -RAJA_INLINE T atomicMax(AtomicPolicy, T volatile *acc, T value) +RAJA_HOST_DEVICE RAJA_INLINE T atomicMax(AtomicPolicy, T volatile *acc, T value) { - return desul::atomic_fetch_max(const_cast(acc), + using desul_order = + typename detail::DesulAtomicPolicy::memory_order; + using desul_scope = + typename detail::DesulAtomicPolicy::memory_scope; + return desul::atomic_fetch_max(const_cast(acc), value, - raja_default_desul_order{}, - raja_default_desul_scope{}); + desul_order{}, + desul_scope{}); } RAJA_SUPPRESS_HD_WARN template 
-RAJA_HOST_DEVICE -RAJA_INLINE T atomicInc(AtomicPolicy, T volatile *acc) +RAJA_HOST_DEVICE RAJA_INLINE T atomicInc(AtomicPolicy, T volatile *acc) { - return desul::atomic_fetch_inc(const_cast(acc), - raja_default_desul_order{}, - raja_default_desul_scope{}); + using desul_order = + typename detail::DesulAtomicPolicy::memory_order; + using desul_scope = + typename detail::DesulAtomicPolicy::memory_scope; + return desul::atomic_fetch_inc(const_cast(acc), + desul_order{}, + desul_scope{}); } RAJA_SUPPRESS_HD_WARN template -RAJA_HOST_DEVICE -RAJA_INLINE T atomicInc(AtomicPolicy, T volatile *acc, T val) +RAJA_HOST_DEVICE RAJA_INLINE T atomicInc(AtomicPolicy, T volatile *acc, T val) { + using desul_order = + typename detail::DesulAtomicPolicy::memory_order; + using desul_scope = + typename detail::DesulAtomicPolicy::memory_scope; // See: // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicinc - return desul::atomic_fetch_inc_mod(const_cast(acc), - val, - raja_default_desul_order{}, - raja_default_desul_scope{}); + return desul::atomic_fetch_inc_mod(const_cast(acc), + val, + desul_order{}, + desul_scope{}); } RAJA_SUPPRESS_HD_WARN template -RAJA_HOST_DEVICE -RAJA_INLINE T atomicDec(AtomicPolicy, T volatile *acc) +RAJA_HOST_DEVICE RAJA_INLINE T atomicDec(AtomicPolicy, T volatile *acc) { - return desul::atomic_fetch_dec(const_cast(acc), - raja_default_desul_order{}, - raja_default_desul_scope{}); + using desul_order = + typename detail::DesulAtomicPolicy::memory_order; + using desul_scope = + typename detail::DesulAtomicPolicy::memory_scope; + return desul::atomic_fetch_dec(const_cast(acc), + desul_order{}, + desul_scope{}); } RAJA_SUPPRESS_HD_WARN template -RAJA_HOST_DEVICE -RAJA_INLINE T atomicDec(AtomicPolicy, T volatile *acc, T val) +RAJA_HOST_DEVICE RAJA_INLINE T atomicDec(AtomicPolicy, T volatile *acc, T val) { + using desul_order = + typename detail::DesulAtomicPolicy::memory_order; + using desul_scope = + typename detail::DesulAtomicPolicy::memory_scope; // See: // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicdec - return desul::atomic_fetch_dec_mod(const_cast(acc), - val, - raja_default_desul_order{}, - raja_default_desul_scope{}); + return desul::atomic_fetch_dec_mod(const_cast(acc), + val, + desul_order{}, + desul_scope{}); } RAJA_SUPPRESS_HD_WARN template -RAJA_HOST_DEVICE -RAJA_INLINE T atomicAnd(AtomicPolicy, T volatile *acc, T value) +RAJA_HOST_DEVICE RAJA_INLINE T atomicAnd(AtomicPolicy, T volatile *acc, T value) { - return desul::atomic_fetch_and(const_cast(acc), + using desul_order = + typename detail::DesulAtomicPolicy::memory_order; + using desul_scope = + typename detail::DesulAtomicPolicy::memory_scope; + return desul::atomic_fetch_and(const_cast(acc), value, - raja_default_desul_order{}, - raja_default_desul_scope{}); + desul_order{}, + desul_scope{}); } RAJA_SUPPRESS_HD_WARN template -RAJA_HOST_DEVICE -RAJA_INLINE T atomicOr(AtomicPolicy, T volatile *acc, T value) +RAJA_HOST_DEVICE RAJA_INLINE T atomicOr(AtomicPolicy, T volatile *acc, T value) { - return desul::atomic_fetch_or(const_cast(acc), + using desul_order = + typename detail::DesulAtomicPolicy::memory_order; + using desul_scope = + typename detail::DesulAtomicPolicy::memory_scope; + return desul::atomic_fetch_or(const_cast(acc), value, - raja_default_desul_order{}, - raja_default_desul_scope{}); + desul_order{}, + desul_scope{}); } RAJA_SUPPRESS_HD_WARN template -RAJA_HOST_DEVICE -RAJA_INLINE T atomicXor(AtomicPolicy, T volatile *acc, T value) +RAJA_HOST_DEVICE 
RAJA_INLINE T atomicXor(AtomicPolicy, T volatile *acc, T value) { - return desul::atomic_fetch_xor(const_cast(acc), + using desul_order = + typename detail::DesulAtomicPolicy::memory_order; + using desul_scope = + typename detail::DesulAtomicPolicy::memory_scope; + return desul::atomic_fetch_xor(const_cast(acc), value, - raja_default_desul_order{}, - raja_default_desul_scope{}); + desul_order{}, + desul_scope{}); } RAJA_SUPPRESS_HD_WARN template -RAJA_HOST_DEVICE -RAJA_INLINE T atomicExchange(AtomicPolicy, T volatile *acc, T value) +RAJA_HOST_DEVICE RAJA_INLINE T atomicExchange(AtomicPolicy, + T volatile *acc, + T value) { - return desul::atomic_exchange(const_cast(acc), + using desul_order = + typename detail::DesulAtomicPolicy::memory_order; + using desul_scope = + typename detail::DesulAtomicPolicy::memory_scope; + return desul::atomic_exchange(const_cast(acc), value, - raja_default_desul_order{}, - raja_default_desul_scope{}); + desul_order{}, + desul_scope{}); } RAJA_SUPPRESS_HD_WARN template -RAJA_HOST_DEVICE -RAJA_INLINE T atomicCAS(AtomicPolicy, T volatile *acc, T compare, T value) +RAJA_HOST_DEVICE RAJA_INLINE T +atomicCAS(AtomicPolicy, T volatile *acc, T compare, T value) { - return desul::atomic_compare_exchange(const_cast(acc), - compare, - value, - raja_default_desul_order{}, - raja_default_desul_scope{}); + using desul_order = + typename detail::DesulAtomicPolicy::memory_order; + using desul_scope = + typename detail::DesulAtomicPolicy::memory_scope; + return desul::atomic_compare_exchange( + const_cast(acc), compare, value, desul_order{}, desul_scope{}); } } // namespace RAJA #endif // RAJA_ENABLE_DESUL_ATOMICS -#endif // guard +#endif // guard diff --git a/include/RAJA/policy/desul/policy.hpp b/include/RAJA/policy/desul/policy.hpp new file mode 100644 index 0000000000..b743760592 --- /dev/null +++ b/include/RAJA/policy/desul/policy.hpp @@ -0,0 +1,62 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_policy_desul_HPP +#define RAJA_policy_desul_HPP + +#include "RAJA/config.hpp" + +#if defined(RAJA_ENABLE_DESUL_ATOMICS) + +#include "desul/atomics.hpp" + +namespace RAJA +{ + +// Policy to perform an atomic operation with a given memory ordering. 
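// Illustrative usage sketch (not part of this header; assumes a desul-enabled
// build and an atomics-capable type T). The aliases defined below can be
// supplied to the RAJA atomic free functions in place of the default policy,
// e.g.:
//
//   payload[i] = value;                                         // plain store
//   RAJA::atomicAdd<RAJA::atomic_release>(&flag[i], T{1});      // publish
//
//   T seen = RAJA::atomicAdd<RAJA::atomic_acquire>(&flag[i], T{0});  // observe
//   if (seen == T{1}) { /* payload[i] written before the release is visible */ }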
+template +struct detail_atomic_t { +}; + +using atomic_seq_cst = + detail_atomic_t; +using atomic_acq_rel = + detail_atomic_t; +using atomic_acquire = + detail_atomic_t; +using atomic_release = + detail_atomic_t; +using atomic_relaxed = + detail_atomic_t; + +using atomic_seq_cst_block = + detail_atomic_t; +using atomic_acq_rel_block = + detail_atomic_t; +using atomic_acquire_block = + detail_atomic_t; +using atomic_release_block = + detail_atomic_t; +using atomic_relaxed_block = + detail_atomic_t; + +using atomic_seq_cst_sys = + detail_atomic_t; +using atomic_acq_rel_sys = + detail_atomic_t; +using atomic_acquire_sys = + detail_atomic_t; +using atomic_release_sys = + detail_atomic_t; +using atomic_relaxed_sys = + detail_atomic_t; + +} // namespace RAJA + +#endif // RAJA_ENABLE_DESUL_ATOMICS + +#endif From b71c42bc74eecbe10982afa00f7162fd30f6fadc Mon Sep 17 00:00:00 2001 From: Max Yang Date: Sat, 16 Mar 2024 00:13:30 -0700 Subject: [PATCH 02/14] Add a test driver for litmus tests, and a message passing test --- .../forall/atomic-basic/CMakeLists.txt | 23 ++ .../test-forall-atomic-litmus.cpp | 319 ++++++++++++++++++ .../tests/test-forall-atomic-litmus-mp.hpp | 168 +++++++++ 3 files changed, 510 insertions(+) create mode 100644 test/functional/forall/atomic-basic/test-forall-atomic-litmus.cpp create mode 100644 test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-mp.hpp diff --git a/test/functional/forall/atomic-basic/CMakeLists.txt b/test/functional/forall/atomic-basic/CMakeLists.txt index 9c2c12d76f..b5e72dc8fc 100644 --- a/test/functional/forall/atomic-basic/CMakeLists.txt +++ b/test/functional/forall/atomic-basic/CMakeLists.txt @@ -28,3 +28,26 @@ foreach( ATOMIC_BACKEND ${FORALL_ATOMIC_BACKENDS} ) target_include_directories(test-forall-atomic-basic-unsigned-${ATOMIC_BACKEND}.exe PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) endforeach() + + +set(ENABLE_LITMUS_TESTS OFF) + +if(RAJA_ENABLE_DESUL_ATOMICS AND RAJA_ENABLE_CUDA) + set(LITMUS_BACKEND "Cuda") + set(ENABLE_LITMUS_TESTS ON) +endif() + +if(RAJA_ENABLE_DESUL_ATOMICS AND RAJA_ENABLE_HIP) + set(LITMUS_BACKEND "Hip") + set(ENABLE_LITMUS_TESTS ON) +endif() + +if (ENABLE_LITMUS_TESTS) + raja_add_test( NAME test-forall-atomic-litmus-${LITMUS_BACKEND} + SOURCES test-forall-atomic-litmus.cpp) + target_include_directories(test-forall-atomic-litmus-${LITMUS_BACKEND}.exe + PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) +endif() + +unset(ENABLE_LITMUS_TESTS) +unset(LITMUS_BACKEND) diff --git a/test/functional/forall/atomic-basic/test-forall-atomic-litmus.cpp b/test/functional/forall/atomic-basic/test-forall-atomic-litmus.cpp new file mode 100644 index 0000000000..a55340c148 --- /dev/null +++ b/test/functional/forall/atomic-basic/test-forall-atomic-litmus.cpp @@ -0,0 +1,319 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// +// test/include headers +// +#include "RAJA_test-atomic-types.hpp" +#include "RAJA_test-atomicpol.hpp" +#include "RAJA_test-base.hpp" +#include "RAJA_test-camp.hpp" +#include "RAJA_test-forall-data.hpp" +#include "RAJA_test-forall-execpol.hpp" +#include "RAJA_test-index-types.hpp" + +// +// Header for tests in ./tests directory +// +// Note: CMake adds ./tests as an include dir for these tests. 
+// +#include +#include + +#include "test-forall-atomic-litmus-mp.hpp" + +using IdxType = size_t; +constexpr int NUM_ITERS = 100; +constexpr int MAX_SHUFFLE_LEN = 1024; +constexpr int STRIDE = 8; +constexpr int DATA_STRESS_SIZE = 2048 * STRIDE; + +constexpr int PERMUTE_PRIME_BLOCK = 11; +constexpr int PERMUTE_PRIME_GRID = 31; + +template +__global__ void dummy_kernel(IdxType index, Func func) +{ + func(index); +} + +// Generic test driver for memory litmus tests on the GPU. +template +struct LitmusTestDriver { +public: + struct TestData { + int block_size; + int grid_size; + + // Number of blocks to run the message-passing litmus test on. + int testing_blocks; + + // Array to shuffle block indices in a test kernel. + IdxType* shuffle_block; + + // Pattern to use for memory stressing. + IdxType* stress_pattern; + // Barrier integers to synchronize testing threads. + IdxType* barriers; + + IdxType* data_stress; + + void allocate(camp::resources::Resource work_res, + int grid_size, + int block_size, + int num_testing_blocks) + { + this->grid_size = grid_size; + this->block_size = block_size; + + testing_blocks = num_testing_blocks; + + shuffle_block = work_res.allocate(grid_size); + stress_pattern = work_res.allocate(MAX_SHUFFLE_LEN); + barriers = work_res.allocate(grid_size / 2); + data_stress = work_res.allocate(DATA_STRESS_SIZE); + } + + void pre_run(camp::resources::Resource work_res) + { + std::random_device rand_device; + // Create a random permutation for the range [0, grid_size) + std::vector shuffle_block_host(grid_size); + { + std::iota(shuffle_block_host.begin(), shuffle_block_host.end(), 0); + std::shuffle(shuffle_block_host.begin(), + shuffle_block_host.end(), + std::mt19937{rand_device()}); + } + work_res.memcpy(shuffle_block, + shuffle_block_host.data(), + sizeof(IdxType) * grid_size); + + // Create a random sequence of 1's and 0's, equally balanced between the + // two + std::vector stress_host(MAX_SHUFFLE_LEN); + { + std::random_device rand_device; + std::shuffle(stress_host.begin(), + stress_host.end(), + std::mt19937{rand_device()}); + } + work_res.memcpy(stress_pattern, + stress_host.data(), + sizeof(IdxType) * MAX_SHUFFLE_LEN); + + work_res.memset(barriers, 0, sizeof(IdxType) * grid_size / 2); + work_res.memset(data_stress, 0, sizeof(IdxType) * DATA_STRESS_SIZE); + +#if defined(RAJA_ENABLE_CUDA) + cudaErrchk(cudaDeviceSynchronize()); +#endif + +#if defined(RAJA_ENABLE_HIP) + hipErrchk(hipDeviceSynchronize()); +#endif + } + + void deallocate(camp::resources::Resource work_res) + { + work_res.deallocate(shuffle_block); + work_res.deallocate(stress_pattern); + work_res.deallocate(barriers); + work_res.deallocate(data_stress); + } + }; + + RAJA_HOST_DEVICE LitmusTestDriver() {} + + // Run + static void run() + { + constexpr IdxType BLOCK_SIZE = 256; + int num_blocks = 0; + { + LitmusPolicy dummy_policy{}; + TestData dummy_test_data{}; + auto lambda = [=] RAJA_HOST_DEVICE(IdxType index) { + LitmusTestDriver test_inst{}; + test_inst.test_main(index, dummy_test_data, dummy_policy); + }; + RAJA_UNUSED_VAR(lambda); +#ifdef RAJA_ENABLE_CUDA + cudaErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &num_blocks, dummy_kernel, BLOCK_SIZE, 0)); + num_blocks *= RAJA::cuda::device_prop().multiProcessorCount; +#endif +#ifdef RAJA_ENABLE_HIP + hipErrchk(hipOccupancyMaxActiveBlocksPerMultiprocessor( + &num_blocks, dummy_kernel, BLOCK_SIZE, 0)); + num_blocks *= RAJA::hip::device_prop().multiProcessorCount; +#endif + } + std::cout << "Got num_blocks = " << num_blocks + << ", block_size = 
" << BLOCK_SIZE << "\n" + << std::flush; + if (num_blocks == 0) { + FAIL() << "Grid size wasn't set to a valid value.\n"; + } + +#ifdef RAJA_ENABLE_CUDA + using ResourcePolicy = camp::resources::Cuda; +#endif +#ifdef RAJA_ENABLE_HIP + using ResourcePolicy = camp::resources::Hip; +#endif + camp::resources::Resource work_res{ResourcePolicy()}; + + TestData test_data; + test_data.allocate(work_res, num_blocks, BLOCK_SIZE, num_blocks); + + LitmusPolicy litmus_test; + litmus_test.allocate(work_res, num_blocks * BLOCK_SIZE * STRIDE); + +#ifdef RAJA_ENABLE_HIP + using GPUExec = RAJA::hip_exec; +#endif + +#ifdef RAJA_ENABLE_CUDA + using GPUExec = RAJA::cuda_exec; +#endif + + for (int iter = 0; iter < NUM_ITERS; iter++) { + test_data.pre_run(work_res); + litmus_test.pre_run(work_res); + + RAJA::forall( + RAJA::TypedRangeSegment(0, num_blocks * BLOCK_SIZE), + [=] RAJA_HOST_DEVICE(IdxType index) { + LitmusTestDriver test_inst{}; + test_inst.test_main(index, test_data, litmus_test); + }); + + litmus_test.count_results(work_res); + } + + litmus_test.verify(); + + litmus_test.deallocate(work_res); + test_data.deallocate(work_res); + } + +private: + using NormalAtomic = RAJA::atomic_relaxed; + + RAJA_HOST_DEVICE void test_main(IdxType index, + TestData param, + LitmusPolicy test) + { + IdxType block_idx = index / param.block_size; + IdxType thread_idx = index % param.block_size; + + // Permute the thread index, to promote scattering of memory accesses + // within a block. + IdxType permute_thread_idx = + (thread_idx * PERMUTE_PRIME_BLOCK) % param.block_size; + + // Shuffle the block ID randomly according to a permutation array. + block_idx = param.shuffle_block[block_idx]; + + IdxType data_idx = block_idx * param.block_size + permute_thread_idx; + data_idx *= STRIDE; + + if (block_idx < (IdxType)param.testing_blocks) { + // Block is a testing block. + // Each block acts as a "sender" to a unique "partner" block. This is done + // by permuting the block IDs with a function p(i) = i * k mod n, where n + // is the number of blocks being tested, and k and n are coprime. + int partner_idx = (block_idx * PERMUTE_PRIME_GRID) % param.testing_blocks; + + // Pre-stress pattern - stressing memory accesses before the test may + // increase the rate of weak memory behaviors + // this->stress(param.data_stress, block_idx, param.grid_size, 128); + + // Synchronize all blocks before testing, to increase the chance of + // interleaved requests. + this->sync(param.testing_blocks, thread_idx, param.barriers[0]); + + for (int i = 0; i < STRIDE; i++) { + // Run specified test, matching threads between the two paired blocks. + int other_data_idx = + partner_idx * param.block_size + permute_thread_idx; + other_data_idx *= STRIDE; + test.run(data_idx + i, other_data_idx + i); + } + } else { + // Blocks which aren't testing should just stress memory accesses. + // this->stress(param.data_stress, block_idx, param.grid_size, 1024); + } + }; + + RAJA_HOST_DEVICE void sync(int num_blocks, int thread_idx, IdxType& barrier) + { + if (thread_idx == 0) { + IdxType result = RAJA::atomicAdd(&barrier, IdxType{1}); + // Busy-wait until all blocks perform the above add. 
+ while (result != num_blocks) + result = RAJA::atomicAdd(&barrier, IdxType{0}); + } + +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_CODE__) + __syncthreads(); +#endif + } + + RAJA_HOST_DEVICE uint64_t get_rand(uint64_t& pcg_state) + { + uint64_t oldstate = pcg_state; + // Advance internal state + pcg_state = oldstate * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL; + // Calculate output function (XSH RR), uses old state for max ILP + uint32_t xorshifted = ((oldstate >> 18u) ^ oldstate) >> 27u; + uint32_t rot = oldstate >> 59u; + return (xorshifted >> rot) | (xorshifted << ((-rot) & 31)); + } + + RAJA_HOST_DEVICE void stress(IdxType* stress_data, + IdxType block_idx, + int grid_size, + int num_iters) + { + uint64_t pcg_state = block_idx; + for (int i = 0; i < num_iters; i++) { + // Pseudo-randomly target a given stress data location. + auto rand = get_rand(pcg_state); + auto target_line = rand % DATA_STRESS_SIZE; + + RAJA::atomicAdd(&(stress_data[target_line]), rand); + } + } +}; + +TYPED_TEST_SUITE_P(ForallAtomicLitmusTest); + +template +class ForallAtomicLitmusTest : public ::testing::Test +{ +}; + +TYPED_TEST_P(ForallAtomicLitmusTest, MessagePassingTest) +{ + using Type = typename camp::at>::type; + using SendRecvPol = typename camp::at>::type; + using SendPol = typename camp::at>::type; + using RecvPol = typename camp::at>::type; + + using MPTest = MessagePassingLitmus; + LitmusTestDriver::run(); +} + +REGISTER_TYPED_TEST_SUITE_P(ForallAtomicLitmusTest, MessagePassingTest); + +using MessagePassingTestTypes = Test::Types; + +INSTANTIATE_TYPED_TEST_SUITE_P(Hip, + ForallAtomicLitmusTest, + MessagePassingTestTypes); diff --git a/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-mp.hpp b/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-mp.hpp new file mode 100644 index 0000000000..993de18ee4 --- /dev/null +++ b/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-mp.hpp @@ -0,0 +1,168 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// +// test/include headers +// +#include "RAJA_test-atomic-types.hpp" +#include "RAJA_test-base.hpp" +#include "RAJA_test-camp.hpp" +#include "RAJA_test-forall-data.hpp" +#include "RAJA_test-forall-execpol.hpp" +#include "RAJA_test-index-types.hpp" + +// "Message Passing" litmus test for DESUL ordered atomic +// ------------------------------------------------------ +// Initial state: x = 0 && y = 0 +// +// Thread 1: Thread 2: +// ----------- ----------- +// store(x, 1) a = load(flag) +// store(flag, 1) b = load(x) +// +// Allowed results: +// ---------------- +// Strong behaviors: +// - a = 1, b = 1 +// - a = 0, b = 0 +// - a = 0, b = 1 +// Weak behavior: +// - a = 1, b = 0 +// +// On weak architectures (POWER/ARM/GPUs), the store to "x" can be reordered +// after the store to "flag". Store-release and load-acquire on the "flag" +// variable should prevent observing the weak behavior. 
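// For reference, the same pattern expressed with ISO C++ std::atomic
// (illustrative sketch only, not part of the test):
//
//   std::atomic<int> x{0}, flag{0};
//
//   // Thread 1 (send):
//   x.store(1, std::memory_order_relaxed);
//   flag.store(1, std::memory_order_release);
//
//   // Thread 2 (recv):
//   int a = flag.load(std::memory_order_acquire);
//   int b = x.load(std::memory_order_relaxed);
//
// If a == 1, the acquire load synchronizes with the release store, so the
// earlier store to x must also be visible and b == 1; the weak outcome
// (a = 1, b = 0) is therefore forbidden.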
+ +// Send policy: Relaxed (Weak), Acquire, AcqRel, SeqCst +// Recv policy: Relaxed (Weak), Release, AcqRel, SeqCst +template +struct MessagePassingLitmus { + using RelaxedPolicy = RAJA::atomic_relaxed; + size_t m_size; + T *x; + T *flag; + T *a; + T *b; + + int strong_behavior_0{0}; + int strong_behavior_1{0}; + int interleaved_behavior{0}; + int weak_behavior{0}; + + void allocate(camp::resources::Resource work_res, size_t size) + { + m_size = size; + x = work_res.allocate(size); + flag = work_res.allocate(size); + a = work_res.allocate(size); + b = work_res.allocate(size); + } + + void deallocate(camp::resources::Resource work_res) + { + work_res.deallocate(x); + work_res.deallocate(flag); + work_res.deallocate(a); + work_res.deallocate(b); + } + + void pre_run(camp::resources::Resource work_res) + { + work_res.memset(x, 0, sizeof(T) * m_size); + work_res.memset(flag, 0, sizeof(T) * m_size); + work_res.memset(a, 0, sizeof(T) * m_size); + work_res.memset(b, 0, sizeof(T) * m_size); + +#if defined(RAJA_ENABLE_CUDA) + cudaErrchk(cudaDeviceSynchronize()); +#endif + +#if defined(RAJA_ENABLE_HIP) + hipErrchk(hipDeviceSynchronize()); +#endif + } + + RAJA_HOST_DEVICE void run(int this_thread, int other_thread) + { + // Send action + x[other_thread] = T{1}; + RAJA::atomicExchange(&(flag[other_thread]), T{1}); + // Recv action + a[this_thread] = RAJA::atomicAdd(&(flag[this_thread]), T{0}); + b[this_thread] = x[this_thread]; + } + + void count_results(camp::resources::Resource work_res) + { + camp::resources::Resource host_res{camp::resources::Host()}; + + T *a_host = host_res.allocate(m_size); + T *b_host = host_res.allocate(m_size); + + work_res.memcpy(a_host, a, m_size * sizeof(T)); + work_res.memcpy(b_host, b, m_size * sizeof(T)); + +#if defined(RAJA_ENABLE_CUDA) + cudaErrchk(cudaDeviceSynchronize()); +#endif + +#if defined(RAJA_ENABLE_HIP) + hipErrchk(hipDeviceSynchronize()); +#endif + for (size_t i = 0; i < m_size; i++) { + if (a_host[i] == 0 && b_host[i] == 0) { + // Strong behavior: neither store from test_send is observable + strong_behavior_0++; + } else if (a_host[i] == 1 && b_host[i] == 1) { + // Strong behavior: both stores from test_send are observable + strong_behavior_1++; + } else if (a_host[i] == 0 && b_host[i] == 1) { + // Strong behavior: stores interleaved with receives + interleaved_behavior++; + } else if (a_host[i] == 1 && b_host[i] == 0) { + // Weak behavior: second store observed before first store + weak_behavior++; + } else { + FAIL() << "Unexpected result for index " << i; + } + } + + host_res.deallocate(a_host); + host_res.deallocate(b_host); + } + + void verify() + { + std::cout << " - Strong behavior (a = 0, b = 0) = " << strong_behavior_0 + << "\n"; + std::cout << " - Strong behavior (a = 1, b = 1) = " << strong_behavior_1 + << "\n"; + std::cout << " - Strong behavior (a = 0, b = 1) = " << interleaved_behavior + << "\n"; + std::cout << " - Weak behaviors = " << weak_behavior << "\n"; + + if (std::is_same::value && + std::is_same::value) { + // In the relaxed case, we should observe some weak behaviors. + ASSERT_GT(weak_behavior, 0); + } else { + // We should not expect any weak behaviors if using a strong ordering. 
+ EXPECT_EQ(weak_behavior, 0); + } + } +}; + +using MPLitmusTestOrderPols = + camp::list, + camp::list, + camp::list, + camp::list >; + +using MPLitmusTestRecvPols = camp::list; + +using MPLitmusTestPols = + camp::cartesian_product; From 4be5c8cb62fe89d063afa6463b481a31aeaf50c3 Mon Sep 17 00:00:00 2001 From: Max Yang Date: Sat, 16 Mar 2024 12:10:45 -0700 Subject: [PATCH 03/14] Remove stress_pattern array --- .../test-forall-atomic-litmus.cpp | 21 ++----------------- 1 file changed, 2 insertions(+), 19 deletions(-) diff --git a/test/functional/forall/atomic-basic/test-forall-atomic-litmus.cpp b/test/functional/forall/atomic-basic/test-forall-atomic-litmus.cpp index a55340c148..41e76c87dd 100644 --- a/test/functional/forall/atomic-basic/test-forall-atomic-litmus.cpp +++ b/test/functional/forall/atomic-basic/test-forall-atomic-litmus.cpp @@ -55,8 +55,6 @@ struct LitmusTestDriver { // Array to shuffle block indices in a test kernel. IdxType* shuffle_block; - // Pattern to use for memory stressing. - IdxType* stress_pattern; // Barrier integers to synchronize testing threads. IdxType* barriers; @@ -73,8 +71,7 @@ struct LitmusTestDriver { testing_blocks = num_testing_blocks; shuffle_block = work_res.allocate(grid_size); - stress_pattern = work_res.allocate(MAX_SHUFFLE_LEN); - barriers = work_res.allocate(grid_size / 2); + barriers = work_res.allocate(STRIDE); data_stress = work_res.allocate(DATA_STRESS_SIZE); } @@ -93,20 +90,7 @@ struct LitmusTestDriver { shuffle_block_host.data(), sizeof(IdxType) * grid_size); - // Create a random sequence of 1's and 0's, equally balanced between the - // two - std::vector stress_host(MAX_SHUFFLE_LEN); - { - std::random_device rand_device; - std::shuffle(stress_host.begin(), - stress_host.end(), - std::mt19937{rand_device()}); - } - work_res.memcpy(stress_pattern, - stress_host.data(), - sizeof(IdxType) * MAX_SHUFFLE_LEN); - - work_res.memset(barriers, 0, sizeof(IdxType) * grid_size / 2); + work_res.memset(barriers, 0, sizeof(IdxType) * STRIDE); work_res.memset(data_stress, 0, sizeof(IdxType) * DATA_STRESS_SIZE); #if defined(RAJA_ENABLE_CUDA) @@ -121,7 +105,6 @@ struct LitmusTestDriver { void deallocate(camp::resources::Resource work_res) { work_res.deallocate(shuffle_block); - work_res.deallocate(stress_pattern); work_res.deallocate(barriers); work_res.deallocate(data_stress); } From e73d52ae5ef61171217341319eb263f318e1b1e6 Mon Sep 17 00:00:00 2001 From: Max Yang Date: Sat, 16 Mar 2024 12:16:32 -0700 Subject: [PATCH 04/14] Increase observability of weak memory behaviors Fiddling around with some parameters for the litmus test driver: - It seems that having only a subset of the running blocks participate in the Message Passing litmus test increases the rate at which weak memory behaviors are observed. - Pre-stressing memory doesn't seem to help on NVIDIA V100s. 
--- .../test-forall-atomic-litmus.cpp | 55 ++++++++++++------- .../tests/test-forall-atomic-litmus-mp.hpp | 2 +- 2 files changed, 36 insertions(+), 21 deletions(-) diff --git a/test/functional/forall/atomic-basic/test-forall-atomic-litmus.cpp b/test/functional/forall/atomic-basic/test-forall-atomic-litmus.cpp index 41e76c87dd..d30f3221b8 100644 --- a/test/functional/forall/atomic-basic/test-forall-atomic-litmus.cpp +++ b/test/functional/forall/atomic-basic/test-forall-atomic-litmus.cpp @@ -27,13 +27,20 @@ #include "test-forall-atomic-litmus-mp.hpp" using IdxType = size_t; -constexpr int NUM_ITERS = 100; -constexpr int MAX_SHUFFLE_LEN = 1024; -constexpr int STRIDE = 8; +constexpr int NUM_ITERS = 20; +#ifdef RAJA_ENABLE_CUDA +constexpr int STRIDE = 4; +constexpr bool STRESS_BEFORE_TEST = false; +constexpr bool NONTESTING_BLOCKS = true; +#elif defined(RAJA_ENABLE_HIP) +constexpr int STRIDE = 16; +constexpr bool STRESS_BEFORE_TEST = true; +constexpr bool NONTESTING_BLOCKS = true; +#endif constexpr int DATA_STRESS_SIZE = 2048 * STRIDE; -constexpr int PERMUTE_PRIME_BLOCK = 11; -constexpr int PERMUTE_PRIME_GRID = 31; +constexpr int PERMUTE_PRIME_BLOCK = 17; +constexpr int PERMUTE_PRIME_GRID = 47; template __global__ void dummy_kernel(IdxType index, Func func) @@ -115,7 +122,7 @@ struct LitmusTestDriver { // Run static void run() { - constexpr IdxType BLOCK_SIZE = 256; + constexpr IdxType BLOCK_SIZE = 128; int num_blocks = 0; { LitmusPolicy dummy_policy{}; @@ -151,11 +158,16 @@ struct LitmusTestDriver { #endif camp::resources::Resource work_res{ResourcePolicy()}; + int num_testing_blocks = num_blocks; + if (NONTESTING_BLOCKS) { + num_testing_blocks = num_blocks / 4; + } + TestData test_data; - test_data.allocate(work_res, num_blocks, BLOCK_SIZE, num_blocks); + test_data.allocate(work_res, num_blocks, BLOCK_SIZE, num_testing_blocks); LitmusPolicy litmus_test; - litmus_test.allocate(work_res, num_blocks * BLOCK_SIZE * STRIDE); + litmus_test.allocate(work_res, num_testing_blocks * BLOCK_SIZE * STRIDE); #ifdef RAJA_ENABLE_HIP using GPUExec = RAJA::hip_exec; @@ -213,24 +225,26 @@ struct LitmusTestDriver { // is the number of blocks being tested, and k and n are coprime. int partner_idx = (block_idx * PERMUTE_PRIME_GRID) % param.testing_blocks; - // Pre-stress pattern - stressing memory accesses before the test may - // increase the rate of weak memory behaviors - // this->stress(param.data_stress, block_idx, param.grid_size, 128); - - // Synchronize all blocks before testing, to increase the chance of - // interleaved requests. - this->sync(param.testing_blocks, thread_idx, param.barriers[0]); - for (int i = 0; i < STRIDE; i++) { // Run specified test, matching threads between the two paired blocks. int other_data_idx = partner_idx * param.block_size + permute_thread_idx; other_data_idx *= STRIDE; + + // Pre-stress pattern - stressing memory accesses before the test may + // increase the rate of weak memory behaviors + // Helps on AMD, doesn't seem to help on NVIDIA + if (STRESS_BEFORE_TEST) { + this->stress( + param.data_stress, block_idx, thread_idx, param.grid_size, 128); + } + + // Synchronize all blocks before testing, to increase the chance of + // interleaved requests. + this->sync(param.testing_blocks, thread_idx, param.barriers[i]); + test.run(data_idx + i, other_data_idx + i); } - } else { - // Blocks which aren't testing should just stress memory accesses. 
- // this->stress(param.data_stress, block_idx, param.grid_size, 1024); } }; @@ -261,6 +275,7 @@ struct LitmusTestDriver { RAJA_HOST_DEVICE void stress(IdxType* stress_data, IdxType block_idx, + IdxType thread_idx, int grid_size, int num_iters) { @@ -268,7 +283,7 @@ struct LitmusTestDriver { for (int i = 0; i < num_iters; i++) { // Pseudo-randomly target a given stress data location. auto rand = get_rand(pcg_state); - auto target_line = rand % DATA_STRESS_SIZE; + auto target_line = (rand + thread_idx) % DATA_STRESS_SIZE; RAJA::atomicAdd(&(stress_data[target_line]), rand); } diff --git a/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-mp.hpp b/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-mp.hpp index 993de18ee4..8a4bb1e495 100644 --- a/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-mp.hpp +++ b/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-mp.hpp @@ -90,7 +90,7 @@ struct MessagePassingLitmus { { // Send action x[other_thread] = T{1}; - RAJA::atomicExchange(&(flag[other_thread]), T{1}); + RAJA::atomicAdd(&(flag[other_thread]), T{1}); // Recv action a[this_thread] = RAJA::atomicAdd(&(flag[this_thread]), T{0}); b[this_thread] = x[this_thread]; From e0dea11fa9d8b5f4e1ed20ab45084e59a2180167 Mon Sep 17 00:00:00 2001 From: Max Yang Date: Sat, 16 Mar 2024 12:21:33 -0700 Subject: [PATCH 05/14] Add a store buffer litmus test Store buffering is an observable behavior where a store may be reordered after a load. This exercises MemoryOrderSeqCst. --- .../test-forall-atomic-litmus.cpp | 35 +++- .../tests/test-forall-atomic-litmus-sb.hpp | 164 ++++++++++++++++++ 2 files changed, 194 insertions(+), 5 deletions(-) create mode 100644 test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-sb.hpp diff --git a/test/functional/forall/atomic-basic/test-forall-atomic-litmus.cpp b/test/functional/forall/atomic-basic/test-forall-atomic-litmus.cpp index d30f3221b8..dbf5201fc2 100644 --- a/test/functional/forall/atomic-basic/test-forall-atomic-litmus.cpp +++ b/test/functional/forall/atomic-basic/test-forall-atomic-litmus.cpp @@ -25,6 +25,7 @@ #include #include "test-forall-atomic-litmus-mp.hpp" +#include "test-forall-atomic-litmus-sb.hpp" using IdxType = size_t; constexpr int NUM_ITERS = 20; @@ -290,14 +291,14 @@ struct LitmusTestDriver { } }; -TYPED_TEST_SUITE_P(ForallAtomicLitmusTest); +TYPED_TEST_SUITE_P(ForallAtomicLitmusTestMP); template -class ForallAtomicLitmusTest : public ::testing::Test +class ForallAtomicLitmusTestMP : public ::testing::Test { }; -TYPED_TEST_P(ForallAtomicLitmusTest, MessagePassingTest) +TYPED_TEST_P(ForallAtomicLitmusTestMP, MessagePassingTest) { using Type = typename camp::at>::type; using SendRecvPol = typename camp::at>::type; @@ -308,10 +309,34 @@ TYPED_TEST_P(ForallAtomicLitmusTest, MessagePassingTest) LitmusTestDriver::run(); } -REGISTER_TYPED_TEST_SUITE_P(ForallAtomicLitmusTest, MessagePassingTest); +REGISTER_TYPED_TEST_SUITE_P(ForallAtomicLitmusTestMP, MessagePassingTest); using MessagePassingTestTypes = Test::Types; INSTANTIATE_TYPED_TEST_SUITE_P(Hip, - ForallAtomicLitmusTest, + ForallAtomicLitmusTestMP, MessagePassingTestTypes); + +TYPED_TEST_SUITE_P(ForallAtomicLitmusTestSB); + +template +class ForallAtomicLitmusTestSB : public ::testing::Test +{ +}; + +TYPED_TEST_P(ForallAtomicLitmusTestSB, StoreBufferTest) +{ + using Type = typename camp::at>::type; + using AtomicPol = typename camp::at>::type; + + using SBTest = StoreBufferLitmus; + LitmusTestDriver::run(); +} + 
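// The store-buffer pattern exercised above, expressed with ISO C++
// std::atomic (illustrative sketch only, not part of the test):
//
//   std::atomic<int> x{0}, y{0};
//
//   // Thread 1:
//   x.store(1, std::memory_order_seq_cst);
//   int a = y.load(std::memory_order_seq_cst);
//
//   // Thread 2:
//   y.store(1, std::memory_order_seq_cst);
//   int b = x.load(std::memory_order_seq_cst);
//
// seq_cst places all four operations in a single total order that respects
// each thread's program order, so at least one load must observe the other
// thread's store; the outcome a = 0, b = 0 is forbidden. Acquire/release
// alone does not rule it out.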
+REGISTER_TYPED_TEST_SUITE_P(ForallAtomicLitmusTestSB, StoreBufferTest); + +using StoreBufferTestTypes = Test::Types; + +INSTANTIATE_TYPED_TEST_SUITE_P(Hip, + ForallAtomicLitmusTestSB, + StoreBufferTestTypes); diff --git a/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-sb.hpp b/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-sb.hpp new file mode 100644 index 0000000000..3498afa2a4 --- /dev/null +++ b/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-sb.hpp @@ -0,0 +1,164 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// +// test/include headers +// +#include "RAJA_test-atomic-types.hpp" +#include "RAJA_test-base.hpp" +#include "RAJA_test-camp.hpp" +#include "RAJA_test-forall-data.hpp" +#include "RAJA_test-forall-execpol.hpp" +#include "RAJA_test-index-types.hpp" + +// "Store buffer" litmus test for DESUL ordered atomic +// --------------------------------------------------- +// Initial state: x = 0 && y = 0 +// +// Thread 1: Thread 2: +// ----------- ----------- +// store(x, 1) store(y, 1) +// a = load(y) b = load(x) +// +// Allowed results: +// ---------------- +// Strong behaviors: +// - a = 1, b = 1 +// - a = 0, b = 1 +// - a = 1, b = 0 +// Weak behavior: +// - a = 0, b = 0 +// +// Acquire-release semantics are not enough to disallow the stores to be +// reordered after the load -- full sequential consistency is required in order +// to impose a "single total order" of the stores. + +// Send policy: Relaxed, SeqCst (Strong) +// Recv policy: Relaxed, SeqCst (Strong) +template +struct StoreBufferLitmus { + using RelaxedPolicy = RAJA::atomic_relaxed; + size_t m_size; + T *x; + T *y; + T *a; + T *b; + + int strong_behavior_0{0}; + int strong_behavior_1{0}; + int interleaved_behavior{0}; + int weak_behavior{0}; + + void allocate(camp::resources::Resource work_res, size_t size) + { + m_size = size; + x = work_res.allocate(size); + y = work_res.allocate(size); + a = work_res.allocate(size); + b = work_res.allocate(size); + } + + void deallocate(camp::resources::Resource work_res) + { + work_res.deallocate(x); + work_res.deallocate(y); + work_res.deallocate(a); + work_res.deallocate(b); + } + + void pre_run(camp::resources::Resource work_res) + { + work_res.memset(x, 0, sizeof(T) * m_size); + work_res.memset(y, 0, sizeof(T) * m_size); + work_res.memset(a, 0, sizeof(T) * m_size); + work_res.memset(b, 0, sizeof(T) * m_size); + +#if defined(RAJA_ENABLE_CUDA) + cudaErrchk(cudaDeviceSynchronize()); +#endif + +#if defined(RAJA_ENABLE_HIP) + hipErrchk(hipDeviceSynchronize()); +#endif + } + + RAJA_HOST_DEVICE void run(int this_thread, int other_thread) + { + // Store-buffer 1 + RAJA::atomicAdd(&(x[other_thread]), T{1}); + // a[other_thread] = RAJA::atomicAdd(&(y[other_thread]), + // T{0}); + a[other_thread] = y[other_thread]; + // Store-buffer 2 + RAJA::atomicAdd(&(y[this_thread]), T{1}); + b[this_thread] = x[this_thread]; + } + + void count_results(camp::resources::Resource work_res) + { + camp::resources::Resource host_res{camp::resources::Host()}; + + T *a_host = host_res.allocate(m_size); + T *b_host = host_res.allocate(m_size); + + work_res.memcpy(a_host, a, m_size * sizeof(T)); + work_res.memcpy(b_host, b, m_size * sizeof(T)); + 
+#if defined(RAJA_ENABLE_CUDA) + cudaErrchk(cudaDeviceSynchronize()); +#endif + +#if defined(RAJA_ENABLE_HIP) + hipErrchk(hipDeviceSynchronize()); +#endif + for (size_t i = 0; i < m_size; i++) { + if (a_host[i] == 1 && b_host[i] == 0) { + // Strong behavior: thread 1 happened before thread 2 + strong_behavior_0++; + } else if (a_host[i] == 0 && b_host[i] == 1) { + // Strong behavior: thread 2 happened before thread 1 + strong_behavior_1++; + } else if (a_host[i] == 1 && b_host[i] == 1) { + // Strong behavior: stores interleaved with receives + interleaved_behavior++; + } else if (a_host[i] == 0 && b_host[i] == 0) { + // Weak behavior: stores reordered after receives + weak_behavior++; + } else { + FAIL() << "Unexpected result for index " << i; + } + } + + host_res.deallocate(a_host); + host_res.deallocate(b_host); + } + + void verify() + { + std::cout << " - Strong behavior (a = 1, b = 0) = " << strong_behavior_0 + << "\n"; + std::cout << " - Strong behavior (a = 0, b = 1) = " << strong_behavior_1 + << "\n"; + std::cout << " - Strong behavior (a = 1, b = 1) = " << interleaved_behavior + << "\n"; + std::cout << " - Weak behaviors = " << weak_behavior << "\n"; + + if (std::is_same::value) { + // In the relaxed case, we should observe some weak behaviors. + ASSERT_GT(weak_behavior, 0); + } else { + // We should not expect any weak behaviors if using a strong ordering. + EXPECT_EQ(weak_behavior, 0); + } + } +}; + +using SBLitmusTestOrderPols = + camp::list; + +using SBLitmusTestPols = + camp::cartesian_product; From 35a7a11f56f9578f0a755bcdd1fee25813643bb1 Mon Sep 17 00:00:00 2001 From: Max Yang Date: Mon, 18 Mar 2024 13:54:51 -0700 Subject: [PATCH 06/14] Do strided index calculations within policy --- .../test-forall-atomic-litmus.cpp | 6 +-- .../tests/test-forall-atomic-litmus-mp.hpp | 43 ++++++++++-------- .../tests/test-forall-atomic-litmus-sb.hpp | 44 ++++++++++--------- 3 files changed, 49 insertions(+), 44 deletions(-) diff --git a/test/functional/forall/atomic-basic/test-forall-atomic-litmus.cpp b/test/functional/forall/atomic-basic/test-forall-atomic-litmus.cpp index dbf5201fc2..ce02567cb5 100644 --- a/test/functional/forall/atomic-basic/test-forall-atomic-litmus.cpp +++ b/test/functional/forall/atomic-basic/test-forall-atomic-litmus.cpp @@ -168,7 +168,7 @@ struct LitmusTestDriver { test_data.allocate(work_res, num_blocks, BLOCK_SIZE, num_testing_blocks); LitmusPolicy litmus_test; - litmus_test.allocate(work_res, num_testing_blocks * BLOCK_SIZE * STRIDE); + litmus_test.allocate(work_res, num_testing_blocks * BLOCK_SIZE, STRIDE); #ifdef RAJA_ENABLE_HIP using GPUExec = RAJA::hip_exec; @@ -217,7 +217,6 @@ struct LitmusTestDriver { block_idx = param.shuffle_block[block_idx]; IdxType data_idx = block_idx * param.block_size + permute_thread_idx; - data_idx *= STRIDE; if (block_idx < (IdxType)param.testing_blocks) { // Block is a testing block. @@ -230,7 +229,6 @@ struct LitmusTestDriver { // Run specified test, matching threads between the two paired blocks. int other_data_idx = partner_idx * param.block_size + permute_thread_idx; - other_data_idx *= STRIDE; // Pre-stress pattern - stressing memory accesses before the test may // increase the rate of weak memory behaviors @@ -244,7 +242,7 @@ struct LitmusTestDriver { // interleaved requests. 
this->sync(param.testing_blocks, thread_idx, param.barriers[i]); - test.run(data_idx + i, other_data_idx + i); + test.run(data_idx, other_data_idx, i); } } }; diff --git a/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-mp.hpp b/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-mp.hpp index 8a4bb1e495..b8f25ca9df 100644 --- a/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-mp.hpp +++ b/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-mp.hpp @@ -43,6 +43,7 @@ template struct MessagePassingLitmus { using RelaxedPolicy = RAJA::atomic_relaxed; size_t m_size; + int m_stride; T *x; T *flag; T *a; @@ -53,13 +54,14 @@ struct MessagePassingLitmus { int interleaved_behavior{0}; int weak_behavior{0}; - void allocate(camp::resources::Resource work_res, size_t size) + void allocate(camp::resources::Resource work_res, size_t size, int stride) { m_size = size; - x = work_res.allocate(size); - flag = work_res.allocate(size); - a = work_res.allocate(size); - b = work_res.allocate(size); + m_stride = stride; + x = work_res.allocate(size * stride); + flag = work_res.allocate(size * stride); + a = work_res.allocate(size * stride); + b = work_res.allocate(size * stride); } void deallocate(camp::resources::Resource work_res) @@ -72,10 +74,10 @@ struct MessagePassingLitmus { void pre_run(camp::resources::Resource work_res) { - work_res.memset(x, 0, sizeof(T) * m_size); - work_res.memset(flag, 0, sizeof(T) * m_size); - work_res.memset(a, 0, sizeof(T) * m_size); - work_res.memset(b, 0, sizeof(T) * m_size); + work_res.memset(x, 0, sizeof(T) * m_size * m_stride); + work_res.memset(flag, 0, sizeof(T) * m_size * m_stride); + work_res.memset(a, 0, sizeof(T) * m_size * m_stride); + work_res.memset(b, 0, sizeof(T) * m_size * m_stride); #if defined(RAJA_ENABLE_CUDA) cudaErrchk(cudaDeviceSynchronize()); @@ -86,25 +88,28 @@ struct MessagePassingLitmus { #endif } - RAJA_HOST_DEVICE void run(int this_thread, int other_thread) + RAJA_HOST_DEVICE void run(int this_thread, int other_thread, int iter) { + int this_thread_idx = this_thread * m_stride + iter; + int other_thread_idx = other_thread * m_stride + iter; // Send action - x[other_thread] = T{1}; - RAJA::atomicAdd(&(flag[other_thread]), T{1}); + x[other_thread_idx] = T{1}; + RAJA::atomicAdd(&(flag[other_thread_idx]), T{1}); // Recv action - a[this_thread] = RAJA::atomicAdd(&(flag[this_thread]), T{0}); - b[this_thread] = x[this_thread]; + a[this_thread_idx] = + RAJA::atomicAdd(&(flag[this_thread_idx]), T{0}); + b[this_thread_idx] = x[this_thread_idx]; } void count_results(camp::resources::Resource work_res) { camp::resources::Resource host_res{camp::resources::Host()}; - T *a_host = host_res.allocate(m_size); - T *b_host = host_res.allocate(m_size); + T *a_host = host_res.allocate(m_size * m_stride); + T *b_host = host_res.allocate(m_size * m_stride); - work_res.memcpy(a_host, a, m_size * sizeof(T)); - work_res.memcpy(b_host, b, m_size * sizeof(T)); + work_res.memcpy(a_host, a, m_size * m_stride * sizeof(T)); + work_res.memcpy(b_host, b, m_size * m_stride * sizeof(T)); #if defined(RAJA_ENABLE_CUDA) cudaErrchk(cudaDeviceSynchronize()); @@ -113,7 +118,7 @@ struct MessagePassingLitmus { #if defined(RAJA_ENABLE_HIP) hipErrchk(hipDeviceSynchronize()); #endif - for (size_t i = 0; i < m_size; i++) { + for (size_t i = 0; i < m_size * m_stride; i++) { if (a_host[i] == 0 && b_host[i] == 0) { // Strong behavior: neither store from test_send is observable strong_behavior_0++; diff --git 
a/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-sb.hpp b/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-sb.hpp index 3498afa2a4..efbd11740d 100644 --- a/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-sb.hpp +++ b/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-sb.hpp @@ -43,6 +43,7 @@ template struct StoreBufferLitmus { using RelaxedPolicy = RAJA::atomic_relaxed; size_t m_size; + int m_stride; T *x; T *y; T *a; @@ -53,13 +54,14 @@ struct StoreBufferLitmus { int interleaved_behavior{0}; int weak_behavior{0}; - void allocate(camp::resources::Resource work_res, size_t size) + void allocate(camp::resources::Resource work_res, size_t size, int stride) { m_size = size; - x = work_res.allocate(size); - y = work_res.allocate(size); - a = work_res.allocate(size); - b = work_res.allocate(size); + m_stride = stride; + x = work_res.allocate(size * stride); + y = work_res.allocate(size * stride); + a = work_res.allocate(size * stride); + b = work_res.allocate(size * stride); } void deallocate(camp::resources::Resource work_res) @@ -72,10 +74,10 @@ struct StoreBufferLitmus { void pre_run(camp::resources::Resource work_res) { - work_res.memset(x, 0, sizeof(T) * m_size); - work_res.memset(y, 0, sizeof(T) * m_size); - work_res.memset(a, 0, sizeof(T) * m_size); - work_res.memset(b, 0, sizeof(T) * m_size); + work_res.memset(x, 0, sizeof(T) * m_size * m_stride); + work_res.memset(y, 0, sizeof(T) * m_size * m_stride); + work_res.memset(a, 0, sizeof(T) * m_size * m_stride); + work_res.memset(b, 0, sizeof(T) * m_size * m_stride); #if defined(RAJA_ENABLE_CUDA) cudaErrchk(cudaDeviceSynchronize()); @@ -86,27 +88,27 @@ struct StoreBufferLitmus { #endif } - RAJA_HOST_DEVICE void run(int this_thread, int other_thread) + RAJA_HOST_DEVICE void run(int this_thread, int other_thread, int iter) { + int this_thread_idx = this_thread * m_stride + iter; + int other_thread_idx = other_thread * m_stride + iter; // Store-buffer 1 - RAJA::atomicAdd(&(x[other_thread]), T{1}); - // a[other_thread] = RAJA::atomicAdd(&(y[other_thread]), - // T{0}); - a[other_thread] = y[other_thread]; + RAJA::atomicAdd(&(x[other_thread_idx]), T{1}); + a[other_thread_idx] = y[other_thread_idx]; // Store-buffer 2 - RAJA::atomicAdd(&(y[this_thread]), T{1}); - b[this_thread] = x[this_thread]; + RAJA::atomicAdd(&(y[this_thread_idx]), T{1}); + b[this_thread_idx] = x[this_thread_idx]; } void count_results(camp::resources::Resource work_res) { camp::resources::Resource host_res{camp::resources::Host()}; - T *a_host = host_res.allocate(m_size); - T *b_host = host_res.allocate(m_size); + T *a_host = host_res.allocate(m_size * m_stride); + T *b_host = host_res.allocate(m_size * m_stride); - work_res.memcpy(a_host, a, m_size * sizeof(T)); - work_res.memcpy(b_host, b, m_size * sizeof(T)); + work_res.memcpy(a_host, a, m_size * m_stride * sizeof(T)); + work_res.memcpy(b_host, b, m_size * m_stride * sizeof(T)); #if defined(RAJA_ENABLE_CUDA) cudaErrchk(cudaDeviceSynchronize()); @@ -115,7 +117,7 @@ struct StoreBufferLitmus { #if defined(RAJA_ENABLE_HIP) hipErrchk(hipDeviceSynchronize()); #endif - for (size_t i = 0; i < m_size; i++) { + for (size_t i = 0; i < m_size * m_stride; i++) { if (a_host[i] == 1 && b_host[i] == 0) { // Strong behavior: thread 1 happened before thread 2 strong_behavior_0++; From d32bf9895974d4e332676b57bb18cc95179b3fa8 Mon Sep 17 00:00:00 2001 From: Max Yang Date: Tue, 19 Mar 2024 12:44:03 -0700 Subject: [PATCH 07/14] Litmus tests: various modifications - Use a 
forall device kernel to check results - Interleave order of operations between testing threads - Only warn on a lack of observed relaxed behaviors --- .../tests/test-forall-atomic-litmus-mp.hpp | 123 ++++++++++++------ .../tests/test-forall-atomic-litmus-sb.hpp | 108 ++++++++++----- 2 files changed, 155 insertions(+), 76 deletions(-) diff --git a/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-mp.hpp b/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-mp.hpp index b8f25ca9df..64deb1beb3 100644 --- a/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-mp.hpp +++ b/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-mp.hpp @@ -41,7 +41,9 @@ // Recv policy: Relaxed (Weak), Release, AcqRel, SeqCst template struct MessagePassingLitmus { + using DataType = T; using RelaxedPolicy = RAJA::atomic_relaxed; + constexpr static int PERMUTE_THREAD_FLAG = 97; size_t m_size; int m_stride; T *x; @@ -49,10 +51,10 @@ struct MessagePassingLitmus { T *a; T *b; - int strong_behavior_0{0}; - int strong_behavior_1{0}; - int interleaved_behavior{0}; - int weak_behavior{0}; + size_t strong_behavior_0{0}; + size_t strong_behavior_1{0}; + size_t interleaved_behavior{0}; + size_t weak_behavior{0}; void allocate(camp::resources::Resource work_res, size_t size, int stride) { @@ -90,70 +92,111 @@ struct MessagePassingLitmus { RAJA_HOST_DEVICE void run(int this_thread, int other_thread, int iter) { - int this_thread_idx = this_thread * m_stride + iter; - int other_thread_idx = other_thread * m_stride + iter; + bool send_first = (this_thread % 2 == 0); // Send action - x[other_thread_idx] = T{1}; - RAJA::atomicAdd(&(flag[other_thread_idx]), T{1}); - // Recv action - a[this_thread_idx] = - RAJA::atomicAdd(&(flag[this_thread_idx]), T{0}); - b[this_thread_idx] = x[this_thread_idx]; + if (send_first) { + this->run_send(other_thread, iter); + this->run_recv(this_thread, iter); + } else { + this->run_recv(this_thread, iter); + this->run_send(other_thread, iter); + } } - void count_results(camp::resources::Resource work_res) + RAJA_HOST_DEVICE void run_send(int other_thread, int iter) { - camp::resources::Resource host_res{camp::resources::Host()}; + int other_thread_idx = other_thread * m_stride + iter; + int permute_other_thread = (other_thread * PERMUTE_THREAD_FLAG) % m_size; + int permute_idx = permute_other_thread * m_stride + iter; + RAJA::atomicAdd(&(x[other_thread_idx]), T{1}); + RAJA::atomicAdd(&(flag[permute_idx]), T{1}); + } - T *a_host = host_res.allocate(m_size * m_stride); - T *b_host = host_res.allocate(m_size * m_stride); + RAJA_HOST_DEVICE void run_recv(int this_thread, int iter) + { + int this_thread_idx = this_thread * m_stride + iter; + int permute_this_thread = (this_thread * PERMUTE_THREAD_FLAG) % m_size; + int permute_idx = permute_this_thread * m_stride + iter; + a[this_thread_idx] = + RAJA::atomicAdd(&(flag[permute_idx]), T{0}); + b[this_thread_idx] = + RAJA::atomicAdd(&(x[this_thread_idx]), T{0}); + } - work_res.memcpy(a_host, a, m_size * m_stride * sizeof(T)); - work_res.memcpy(b_host, b, m_size * m_stride * sizeof(T)); + void count_results(camp::resources::Resource work_res) + { -#if defined(RAJA_ENABLE_CUDA) - cudaErrchk(cudaDeviceSynchronize()); +#ifdef RAJA_ENABLE_HIP + using GPUExec = RAJA::hip_exec<256>; + using ReducePolicy = RAJA::hip_reduce; #endif -#if defined(RAJA_ENABLE_HIP) - hipErrchk(hipDeviceSynchronize()); +#ifdef RAJA_ENABLE_CUDA + using GPUExec = RAJA::cuda_exec<256>; + using ReducePolicy = RAJA::cuda_reduce; #endif 
- for (size_t i = 0; i < m_size * m_stride; i++) { - if (a_host[i] == 0 && b_host[i] == 0) { + RAJA::ReduceSum strong_cnt_0(0); + RAJA::ReduceSum strong_cnt_1(0); + RAJA::ReduceSum interleaved_cnt(0); + RAJA::ReduceSum weak_cnt(0); + RAJA::ReduceSum unexpected_cnt(0); + + T *a_local = a; + T *b_local = b; + + auto forall_len = RAJA::TypedRangeSegment(0, m_size * m_stride); + + RAJA::forall(forall_len, [=] RAJA_HOST_DEVICE(int i) { + if (a_local[i] == 0 && b_local[i] == 0) { // Strong behavior: neither store from test_send is observable - strong_behavior_0++; - } else if (a_host[i] == 1 && b_host[i] == 1) { + strong_cnt_0 += 1; + } else if (a_local[i] == 1 && b_local[i] == 1) { // Strong behavior: both stores from test_send are observable - strong_behavior_1++; - } else if (a_host[i] == 0 && b_host[i] == 1) { + strong_cnt_1 += 1; + } else if (a_local[i] == 0 && b_local[i] == 1) { // Strong behavior: stores interleaved with receives - interleaved_behavior++; - } else if (a_host[i] == 1 && b_host[i] == 0) { + interleaved_cnt += 1; + } else if (a_local[i] == 1 && b_local[i] == 0) { // Weak behavior: second store observed before first store - weak_behavior++; + weak_cnt += 1; } else { - FAIL() << "Unexpected result for index " << i; + unexpected_cnt += 1; } - } + }); - host_res.deallocate(a_host); - host_res.deallocate(b_host); + EXPECT_EQ(unexpected_cnt.get(), 0); + + strong_behavior_0 += strong_cnt_0.get(); + strong_behavior_1 += strong_cnt_1.get(); + interleaved_behavior += interleaved_cnt.get(); + weak_behavior += weak_cnt.get(); } void verify() { - std::cout << " - Strong behavior (a = 0, b = 0) = " << strong_behavior_0 + std::cerr << " - Strong behavior (a = 0, b = 0) = " << strong_behavior_0 << "\n"; - std::cout << " - Strong behavior (a = 1, b = 1) = " << strong_behavior_1 + std::cerr << " - Strong behavior (a = 1, b = 1) = " << strong_behavior_1 << "\n"; - std::cout << " - Strong behavior (a = 0, b = 1) = " << interleaved_behavior + std::cerr << " - Strong behavior (a = 0, b = 1) = " << interleaved_behavior << "\n"; - std::cout << " - Weak behaviors = " << weak_behavior << "\n"; + std::cerr << " - Weak behaviors = " << weak_behavior << "\n"; if (std::is_same::value && std::is_same::value) { // In the relaxed case, we should observe some weak behaviors. - ASSERT_GT(weak_behavior, 0); + // Don't fail the test, but do print out a message. + if (weak_behavior == 0) { + std::cerr << "Warning - no weak behaviors detected in the control case." + << "\nThis litmus test may be insufficient to exercise " + "ordered memory atomics.\n"; + } else { + double overall_behavior_counts = strong_behavior_0 + strong_behavior_1 + + interleaved_behavior + weak_behavior; + std::cerr << "\n Weak behaviors detected in " + << 100 * (weak_behavior / overall_behavior_counts) + << "% of cases.\n"; + } } else { // We should not expect any weak behaviors if using a strong ordering. 
EXPECT_EQ(weak_behavior, 0); diff --git a/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-sb.hpp b/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-sb.hpp index efbd11740d..c3cde1e760 100644 --- a/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-sb.hpp +++ b/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-sb.hpp @@ -41,6 +41,7 @@ // Recv policy: Relaxed, SeqCst (Strong) template struct StoreBufferLitmus { + using DataType = T; using RelaxedPolicy = RAJA::atomic_relaxed; size_t m_size; int m_stride; @@ -90,68 +91,103 @@ struct StoreBufferLitmus { RAJA_HOST_DEVICE void run(int this_thread, int other_thread, int iter) { - int this_thread_idx = this_thread * m_stride + iter; - int other_thread_idx = other_thread * m_stride + iter; - // Store-buffer 1 - RAJA::atomicAdd(&(x[other_thread_idx]), T{1}); - a[other_thread_idx] = y[other_thread_idx]; - // Store-buffer 2 - RAJA::atomicAdd(&(y[this_thread_idx]), T{1}); - b[this_thread_idx] = x[this_thread_idx]; + bool swap = this_thread % 2 == 0; + if (swap) { + store_buffer_1(other_thread, iter); + store_buffer_2(this_thread, iter); + } else { + store_buffer_2(this_thread, iter); + store_buffer_1(other_thread, iter); + } } - void count_results(camp::resources::Resource work_res) + RAJA_HOST_DEVICE void store_buffer_1(int thread, int iter) { - camp::resources::Resource host_res{camp::resources::Host()}; + int thread_idx = thread * m_stride + iter; + RAJA::atomicAdd(&(x[thread_idx]), T{1}); + a[thread_idx] = RAJA::atomicAdd(&(y[thread_idx]), T{0}); + } - T *a_host = host_res.allocate(m_size * m_stride); - T *b_host = host_res.allocate(m_size * m_stride); + RAJA_HOST_DEVICE void store_buffer_2(int thread, int iter) + { + int thread_idx = thread * m_stride + iter; + RAJA::atomicAdd(&(y[thread_idx]), T{1}); + b[thread_idx] = RAJA::atomicAdd(&(x[thread_idx]), T{0}); + } - work_res.memcpy(a_host, a, m_size * m_stride * sizeof(T)); - work_res.memcpy(b_host, b, m_size * m_stride * sizeof(T)); + void count_results(camp::resources::Resource work_res) + { -#if defined(RAJA_ENABLE_CUDA) - cudaErrchk(cudaDeviceSynchronize()); +#ifdef RAJA_ENABLE_HIP + using GPUExec = RAJA::hip_exec<256>; + using ReducePolicy = RAJA::hip_reduce; #endif -#if defined(RAJA_ENABLE_HIP) - hipErrchk(hipDeviceSynchronize()); +#ifdef RAJA_ENABLE_CUDA + using GPUExec = RAJA::cuda_exec<256>; + using ReducePolicy = RAJA::cuda_reduce; #endif - for (size_t i = 0; i < m_size * m_stride; i++) { - if (a_host[i] == 1 && b_host[i] == 0) { + RAJA::ReduceSum strong_cnt_0(0); + RAJA::ReduceSum strong_cnt_1(0); + RAJA::ReduceSum interleaved_cnt(0); + RAJA::ReduceSum weak_cnt(0); + RAJA::ReduceSum unexpected_cnt(0); + + T *a_local = a; + T *b_local = b; + + auto forall_len = RAJA::TypedRangeSegment(0, m_size * m_stride); + + RAJA::forall(forall_len, [=] RAJA_HOST_DEVICE(int i) { + if (a_local[i] == 1 && b_local[i] == 0) { // Strong behavior: thread 1 happened before thread 2 - strong_behavior_0++; - } else if (a_host[i] == 0 && b_host[i] == 1) { + strong_cnt_0 += 1; + } else if (a_local[i] == 0 && b_local[i] == 1) { // Strong behavior: thread 2 happened before thread 1 - strong_behavior_1++; - } else if (a_host[i] == 1 && b_host[i] == 1) { + strong_cnt_1 += 1; + } else if (a_local[i] == 1 && b_local[i] == 1) { // Strong behavior: stores interleaved with receives - interleaved_behavior++; - } else if (a_host[i] == 0 && b_host[i] == 0) { + interleaved_cnt += 1; + } else if (a_local[i] == 0 && b_local[i] == 0) { // Weak behavior: 
stores reordered after receives - weak_behavior++; + weak_cnt += 1; } else { - FAIL() << "Unexpected result for index " << i; + unexpected_cnt += 1; } - } + }); + + EXPECT_EQ(unexpected_cnt.get(), 0); - host_res.deallocate(a_host); - host_res.deallocate(b_host); + strong_behavior_0 += strong_cnt_0.get(); + strong_behavior_1 += strong_cnt_1.get(); + interleaved_behavior += interleaved_cnt.get(); + weak_behavior += weak_cnt.get(); } void verify() { - std::cout << " - Strong behavior (a = 1, b = 0) = " << strong_behavior_0 + std::cerr << " - Strong behavior (a = 1, b = 0) = " << strong_behavior_0 << "\n"; - std::cout << " - Strong behavior (a = 0, b = 1) = " << strong_behavior_1 + std::cerr << " - Strong behavior (a = 0, b = 1) = " << strong_behavior_1 << "\n"; - std::cout << " - Strong behavior (a = 1, b = 1) = " << interleaved_behavior + std::cerr << " - Strong behavior (a = 1, b = 1) = " << interleaved_behavior << "\n"; - std::cout << " - Weak behaviors = " << weak_behavior << "\n"; + std::cerr << " - Weak behaviors = " << weak_behavior << "\n"; if (std::is_same::value) { // In the relaxed case, we should observe some weak behaviors. - ASSERT_GT(weak_behavior, 0); + // Don't fail the test, but do print out a message. + if (weak_behavior == 0) { + std::cerr << "Warning - no weak behaviors detected in the control case." + << "\nThis litmus test may be insufficient to exercise " + "ordered memory atomics.\n"; + } else { + double overall_behavior_counts = strong_behavior_0 + strong_behavior_1 + + interleaved_behavior + weak_behavior; + std::cerr << "\n Weak behaviors detected in " + << 100 * (weak_behavior / overall_behavior_counts) + << "% of cases.\n"; + } } else { // We should not expect any weak behaviors if using a strong ordering. EXPECT_EQ(weak_behavior, 0); From 822d8ddb4ec11b998978a00aa117989561a8c8a3 Mon Sep 17 00:00:00 2001 From: Max Yang Date: Tue, 19 Mar 2024 13:44:06 -0700 Subject: [PATCH 08/14] Litmus tests: changes to increase relaxed observation rates Correctly use the stress testing formulation from the paper, "Foundations of Empirical Memory Consistency Testing" (OOPSLA 2020). Instead of having all stressing blocks scatter their accesses across the "stressing" array, select a small-ish subset of 64-word lines and stripe them across the stressing blocks. This increases the stress on the contention hardware in a GPU. Synchronize testing blocks and stressing blocks together on each iteration. 
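
As a rough illustration of the striping scheme described above, here is a host-side sketch only; the constant names kStressStride, kNumStressLines, kNumSelected, and kPermutePrime are invented for this example and are not the patch's identifiers. Each stressing block maps its block index onto a small pre-selected pool of 64-word lines through a coprime-multiplier permutation, so many blocks contend on the same few lines:

#include <cstdio>
#include <random>
#include <vector>

// All names below are illustrative; they are not the identifiers used in the patch.
constexpr int kStressStride   = 64;   // words per stress "line"
constexpr int kNumStressLines = 2048; // lines available in the stress array
constexpr int kNumSelected    = 64;   // small pool of lines actually contended
constexpr int kPermutePrime   = 977;  // coprime with kNumSelected

int main()
{
  // Host side: pre-select a random subset of line offsets once per run.
  std::mt19937 gen{std::random_device{}()};
  std::uniform_int_distribution<int> pick_line(0, kNumStressLines - 1);
  std::vector<int> stress_index(kNumSelected);
  for (int &offset : stress_index) {
    offset = pick_line(gen) * kStressStride;
  }

  // Device side (emulated here): each stressing block derives its target line
  // from its block index via p(i) = i * k mod n, with k coprime to n, so the
  // small pool is shared by many blocks and contention stays high.
  for (int block = 0; block < 8; ++block) {
    const int select = (block * kPermutePrime) % kNumSelected;
    std::printf("block %d hammers word offset %d\n", block, stress_index[select]);
  }
  return 0;
}

Because the multiplier is coprime with the pool size, consecutive block indices land on well-separated pool entries while every entry remains reachable.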
--- .../test-forall-atomic-litmus.cpp | 130 +++++++++++------- 1 file changed, 84 insertions(+), 46 deletions(-) diff --git a/test/functional/forall/atomic-basic/test-forall-atomic-litmus.cpp b/test/functional/forall/atomic-basic/test-forall-atomic-litmus.cpp index ce02567cb5..f987f5e764 100644 --- a/test/functional/forall/atomic-basic/test-forall-atomic-litmus.cpp +++ b/test/functional/forall/atomic-basic/test-forall-atomic-litmus.cpp @@ -28,20 +28,28 @@ #include "test-forall-atomic-litmus-sb.hpp" using IdxType = size_t; -constexpr int NUM_ITERS = 20; +constexpr int NUM_ITERS = 100; #ifdef RAJA_ENABLE_CUDA -constexpr int STRIDE = 4; -constexpr bool STRESS_BEFORE_TEST = false; +constexpr int STRIDE = 32; +constexpr bool STRESS_BEFORE_TEST = true; constexpr bool NONTESTING_BLOCKS = true; +constexpr int WARP_SIZE = 32; +constexpr int BLOCK_SIZE = 128; #elif defined(RAJA_ENABLE_HIP) -constexpr int STRIDE = 16; +constexpr int STRIDE = 32; constexpr bool STRESS_BEFORE_TEST = true; constexpr bool NONTESTING_BLOCKS = true; +constexpr int WARP_SIZE = 64; +constexpr int BLOCK_SIZE = 128; #endif -constexpr int DATA_STRESS_SIZE = 2048 * STRIDE; + +constexpr int STRESS_STRIDE = 64; +constexpr int NUM_STRESS_LINES = 2048; +constexpr int DATA_STRESS_SIZE = NUM_STRESS_LINES * STRESS_STRIDE; constexpr int PERMUTE_PRIME_BLOCK = 17; constexpr int PERMUTE_PRIME_GRID = 47; +constexpr int PERMUTE_STRESS = 977; template __global__ void dummy_kernel(IdxType index, Func func) @@ -53,10 +61,16 @@ __global__ void dummy_kernel(IdxType index, Func func) template struct LitmusTestDriver { public: + using T = typename LitmusPolicy::DataType; struct TestData { int block_size; int grid_size; + int stride = STRIDE; + + int pre_stress_iterations = 4; + int stress_iterations = 12; + // Number of blocks to run the message-passing litmus test on. int testing_blocks; @@ -66,7 +80,10 @@ struct LitmusTestDriver { // Barrier integers to synchronize testing threads. 
IdxType* barriers; - IdxType* data_stress; + int* data_stress; + + int num_stress_index; + IdxType* stress_index; void allocate(camp::resources::Resource work_res, int grid_size, @@ -80,7 +97,10 @@ struct LitmusTestDriver { shuffle_block = work_res.allocate(grid_size); barriers = work_res.allocate(STRIDE); - data_stress = work_res.allocate(DATA_STRESS_SIZE); + data_stress = work_res.allocate(DATA_STRESS_SIZE); + + num_stress_index = 64; + stress_index = work_res.allocate(num_stress_index); } void pre_run(camp::resources::Resource work_res) @@ -98,8 +118,26 @@ struct LitmusTestDriver { shuffle_block_host.data(), sizeof(IdxType) * grid_size); + std::vector stress_index_host(num_stress_index); + { + std::mt19937 gen{rand_device()}; + std::uniform_int_distribution rnd_stress_offset(0, + STRESS_STRIDE); + std::uniform_int_distribution rnd_stress_dist( + 0, NUM_STRESS_LINES - 1); + std::generate(stress_index_host.begin(), + stress_index_host.end(), + [&]() -> IdxType { + return rnd_stress_dist(gen) * STRESS_STRIDE; + }); + } + work_res.memcpy(stress_index, + stress_index_host.data(), + sizeof(IdxType) * num_stress_index); + work_res.memset(barriers, 0, sizeof(IdxType) * STRIDE); - work_res.memset(data_stress, 0, sizeof(IdxType) * DATA_STRESS_SIZE); + work_res.memset(data_stress, 0, sizeof(int) * DATA_STRESS_SIZE); + #if defined(RAJA_ENABLE_CUDA) cudaErrchk(cudaDeviceSynchronize()); @@ -123,7 +161,6 @@ struct LitmusTestDriver { // Run static void run() { - constexpr IdxType BLOCK_SIZE = 128; int num_blocks = 0; { LitmusPolicy dummy_policy{}; @@ -216,16 +253,25 @@ struct LitmusTestDriver { // Shuffle the block ID randomly according to a permutation array. block_idx = param.shuffle_block[block_idx]; - IdxType data_idx = block_idx * param.block_size + permute_thread_idx; + IdxType data_idx = block_idx * param.block_size + thread_idx; - if (block_idx < (IdxType)param.testing_blocks) { - // Block is a testing block. - // Each block acts as a "sender" to a unique "partner" block. This is done - // by permuting the block IDs with a function p(i) = i * k mod n, where n - // is the number of blocks being tested, and k and n are coprime. - int partner_idx = (block_idx * PERMUTE_PRIME_GRID) % param.testing_blocks; + for (int i = 0; i < param.stride; i++) { + + // Synchronize all blocks before testing, to increase the chance of + // interleaved requests. + this->sync(param.grid_size, thread_idx, param.barriers[i]); + + if (block_idx < (IdxType)param.testing_blocks) { + + // Block is a testing block. + // + // Each block acts as a "sender" to a unique "partner" block. This is + // done by permuting the block IDs with a function p(i) = i * k mod n, + // where n is the number of blocks being tested, and k and n are + // coprime. + int partner_idx = + (block_idx * PERMUTE_PRIME_GRID + i) % param.testing_blocks; - for (int i = 0; i < STRIDE; i++) { // Run specified test, matching threads between the two paired blocks. int other_data_idx = partner_idx * param.block_size + permute_thread_idx; @@ -234,15 +280,12 @@ struct LitmusTestDriver { // increase the rate of weak memory behaviors // Helps on AMD, doesn't seem to help on NVIDIA if (STRESS_BEFORE_TEST) { - this->stress( - param.data_stress, block_idx, thread_idx, param.grid_size, 128); + this->stress(block_idx, thread_idx, param, true); } - // Synchronize all blocks before testing, to increase the chance of - // interleaved requests. 
- this->sync(param.testing_blocks, thread_idx, param.barriers[i]); - test.run(data_idx, other_data_idx, i); + } else { + this->stress(block_idx, thread_idx, param); } } }; @@ -252,8 +295,9 @@ struct LitmusTestDriver { if (thread_idx == 0) { IdxType result = RAJA::atomicAdd(&barrier, IdxType{1}); // Busy-wait until all blocks perform the above add. - while (result != num_blocks) + while (result != num_blocks) { result = RAJA::atomicAdd(&barrier, IdxType{0}); + } } #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_CODE__) @@ -261,30 +305,24 @@ struct LitmusTestDriver { #endif } - RAJA_HOST_DEVICE uint64_t get_rand(uint64_t& pcg_state) - { - uint64_t oldstate = pcg_state; - // Advance internal state - pcg_state = oldstate * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL; - // Calculate output function (XSH RR), uses old state for max ILP - uint32_t xorshifted = ((oldstate >> 18u) ^ oldstate) >> 27u; - uint32_t rot = oldstate >> 59u; - return (xorshifted >> rot) | (xorshifted << ((-rot) & 31)); - } - - RAJA_HOST_DEVICE void stress(IdxType* stress_data, - IdxType block_idx, + RAJA_HOST_DEVICE void stress(IdxType block_idx, IdxType thread_idx, - int grid_size, - int num_iters) + const TestData& param, + bool pre_stress = false) { - uint64_t pcg_state = block_idx; - for (int i = 0; i < num_iters; i++) { - // Pseudo-randomly target a given stress data location. - auto rand = get_rand(pcg_state); - auto target_line = (rand + thread_idx) % DATA_STRESS_SIZE; - - RAJA::atomicAdd(&(stress_data[target_line]), rand); + int num_iters = + (pre_stress ? param.pre_stress_iterations : param.stress_iterations); + int volatile* stress_data = param.data_stress; + if (thread_idx % WARP_SIZE == 0) { + int warp_idx = thread_idx / WARP_SIZE; + // int select_idx = block_idx * NUM_WARPS + warp_idx; + int select_idx = block_idx; + select_idx = (select_idx * PERMUTE_STRESS) % param.num_stress_index; + int stress_line = param.stress_index[select_idx]; + for (int i = 0; i < num_iters; i++) { + int data = stress_data[stress_line]; + stress_data[stress_line] = i + 1; + } } } }; From 2784605d412043ab60a4d7f2d6e8be75dc81cd32 Mon Sep 17 00:00:00 2001 From: Max Yang Date: Tue, 19 Mar 2024 13:52:01 -0700 Subject: [PATCH 09/14] RENAME --- ...rall-atomic-litmus-mp.hpp => test-forall-atomic-litmus-mp.cpp} | 0 ...rall-atomic-litmus-sb.hpp => test-forall-atomic-litmus-sb.cpp} | 0 .../test-forall-atomic-litmus-driver.hpp} | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename test/functional/forall/atomic-basic/{tests/test-forall-atomic-litmus-mp.hpp => test-forall-atomic-litmus-mp.cpp} (100%) rename test/functional/forall/atomic-basic/{tests/test-forall-atomic-litmus-sb.hpp => test-forall-atomic-litmus-sb.cpp} (100%) rename test/functional/forall/atomic-basic/{test-forall-atomic-litmus.cpp => tests/test-forall-atomic-litmus-driver.hpp} (100%) diff --git a/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-mp.hpp b/test/functional/forall/atomic-basic/test-forall-atomic-litmus-mp.cpp similarity index 100% rename from test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-mp.hpp rename to test/functional/forall/atomic-basic/test-forall-atomic-litmus-mp.cpp diff --git a/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-sb.hpp b/test/functional/forall/atomic-basic/test-forall-atomic-litmus-sb.cpp similarity index 100% rename from test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-sb.hpp rename to test/functional/forall/atomic-basic/test-forall-atomic-litmus-sb.cpp diff 
--git a/test/functional/forall/atomic-basic/test-forall-atomic-litmus.cpp b/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-driver.hpp similarity index 100% rename from test/functional/forall/atomic-basic/test-forall-atomic-litmus.cpp rename to test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-driver.hpp From 1fea7cf08fd193d76b12fb411ebfc42b53eaef22 Mon Sep 17 00:00:00 2001 From: Max Yang Date: Tue, 19 Mar 2024 13:58:52 -0700 Subject: [PATCH 10/14] Post-rename changes --- .../forall/atomic-basic/CMakeLists.txt | 16 ++++-- .../test-forall-atomic-litmus-mp.cpp | 30 ++++++++++- .../test-forall-atomic-litmus-sb.cpp | 26 +++++++++ .../test-forall-atomic-litmus-driver.hpp | 53 ------------------- 4 files changed, 66 insertions(+), 59 deletions(-) diff --git a/test/functional/forall/atomic-basic/CMakeLists.txt b/test/functional/forall/atomic-basic/CMakeLists.txt index b5e72dc8fc..6e2c184b09 100644 --- a/test/functional/forall/atomic-basic/CMakeLists.txt +++ b/test/functional/forall/atomic-basic/CMakeLists.txt @@ -42,12 +42,20 @@ if(RAJA_ENABLE_DESUL_ATOMICS AND RAJA_ENABLE_HIP) set(ENABLE_LITMUS_TESTS ON) endif() +set(FORALL_LITMUS_TESTS + mp # Message Passing + sb # Store Buffer +) + if (ENABLE_LITMUS_TESTS) - raja_add_test( NAME test-forall-atomic-litmus-${LITMUS_BACKEND} - SOURCES test-forall-atomic-litmus.cpp) - target_include_directories(test-forall-atomic-litmus-${LITMUS_BACKEND}.exe - PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) + foreach ( LITMUS_TEST ${FORALL_LITMUS_TESTS} ) + raja_add_test( NAME test-forall-atomic-litmus-${LITMUS_BACKEND}-${LITMUS_TEST} + SOURCES test-forall-atomic-litmus-${LITMUS_TEST}.cpp) + target_include_directories(test-forall-atomic-litmus-${LITMUS_BACKEND}-${LITMUS_TEST}.exe + PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) + endforeach() endif() +unset(FORALL_LITMUS_TESTS) unset(ENABLE_LITMUS_TESTS) unset(LITMUS_BACKEND) diff --git a/test/functional/forall/atomic-basic/test-forall-atomic-litmus-mp.cpp b/test/functional/forall/atomic-basic/test-forall-atomic-litmus-mp.cpp index 64deb1beb3..362686d5c0 100644 --- a/test/functional/forall/atomic-basic/test-forall-atomic-litmus-mp.cpp +++ b/test/functional/forall/atomic-basic/test-forall-atomic-litmus-mp.cpp @@ -15,6 +15,8 @@ #include "RAJA_test-forall-execpol.hpp" #include "RAJA_test-index-types.hpp" +#include "test-forall-atomic-litmus-driver.hpp" + // "Message Passing" litmus test for DESUL ordered atomic // ------------------------------------------------------ // Initial state: x = 0 && y = 0 @@ -210,7 +212,31 @@ using MPLitmusTestOrderPols = camp::list, camp::list >; -using MPLitmusTestRecvPols = camp::list; - using MPLitmusTestPols = camp::cartesian_product; + +TYPED_TEST_SUITE_P(ForallAtomicLitmusTest); + +template +class ForallAtomicLitmusTest : public ::testing::Test +{ +}; + +TYPED_TEST_P(ForallAtomicLitmusTest, MessagePassingTest) +{ + using Type = typename camp::at>::type; + using SendRecvPol = typename camp::at>::type; + using SendPol = typename camp::at>::type; + using RecvPol = typename camp::at>::type; + + using MPTest = MessagePassingLitmus; + LitmusTestDriver::run(); +} + +REGISTER_TYPED_TEST_SUITE_P(ForallAtomicLitmusTest, MessagePassingTest); + +using MessagePassingTestTypes = Test::Types; + +INSTANTIATE_TYPED_TEST_SUITE_P(Hip, + ForallAtomicLitmusTest, + MessagePassingTestTypes); diff --git a/test/functional/forall/atomic-basic/test-forall-atomic-litmus-sb.cpp b/test/functional/forall/atomic-basic/test-forall-atomic-litmus-sb.cpp index c3cde1e760..666c1470ea 100644 
--- a/test/functional/forall/atomic-basic/test-forall-atomic-litmus-sb.cpp +++ b/test/functional/forall/atomic-basic/test-forall-atomic-litmus-sb.cpp @@ -15,6 +15,8 @@ #include "RAJA_test-forall-execpol.hpp" #include "RAJA_test-index-types.hpp" +#include "test-forall-atomic-litmus-driver.hpp" + // "Store buffer" litmus test for DESUL ordered atomic // --------------------------------------------------- // Initial state: x = 0 && y = 0 @@ -200,3 +202,27 @@ using SBLitmusTestOrderPols = using SBLitmusTestPols = camp::cartesian_product; + +TYPED_TEST_SUITE_P(ForallAtomicLitmusTest); + +template +class ForallAtomicLitmusTest : public ::testing::Test +{ +}; + +TYPED_TEST_P(ForallAtomicLitmusTest, StoreBufferTest) +{ + using Type = typename camp::at>::type; + using AtomicPol = typename camp::at>::type; + + using SBTest = StoreBufferLitmus; + LitmusTestDriver::run(); +} + +REGISTER_TYPED_TEST_SUITE_P(ForallAtomicLitmusTest, StoreBufferTest); + +using StoreBufferTestTypes = Test::Types; + +INSTANTIATE_TYPED_TEST_SUITE_P(Hip, + ForallAtomicLitmusTest, + StoreBufferTestTypes); diff --git a/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-driver.hpp b/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-driver.hpp index f987f5e764..3e6b9520e8 100644 --- a/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-driver.hpp +++ b/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-driver.hpp @@ -24,9 +24,6 @@ #include #include -#include "test-forall-atomic-litmus-mp.hpp" -#include "test-forall-atomic-litmus-sb.hpp" - using IdxType = size_t; constexpr int NUM_ITERS = 100; #ifdef RAJA_ENABLE_CUDA @@ -326,53 +323,3 @@ struct LitmusTestDriver { } } }; - -TYPED_TEST_SUITE_P(ForallAtomicLitmusTestMP); - -template -class ForallAtomicLitmusTestMP : public ::testing::Test -{ -}; - -TYPED_TEST_P(ForallAtomicLitmusTestMP, MessagePassingTest) -{ - using Type = typename camp::at>::type; - using SendRecvPol = typename camp::at>::type; - using SendPol = typename camp::at>::type; - using RecvPol = typename camp::at>::type; - - using MPTest = MessagePassingLitmus; - LitmusTestDriver::run(); -} - -REGISTER_TYPED_TEST_SUITE_P(ForallAtomicLitmusTestMP, MessagePassingTest); - -using MessagePassingTestTypes = Test::Types; - -INSTANTIATE_TYPED_TEST_SUITE_P(Hip, - ForallAtomicLitmusTestMP, - MessagePassingTestTypes); - -TYPED_TEST_SUITE_P(ForallAtomicLitmusTestSB); - -template -class ForallAtomicLitmusTestSB : public ::testing::Test -{ -}; - -TYPED_TEST_P(ForallAtomicLitmusTestSB, StoreBufferTest) -{ - using Type = typename camp::at>::type; - using AtomicPol = typename camp::at>::type; - - using SBTest = StoreBufferLitmus; - LitmusTestDriver::run(); -} - -REGISTER_TYPED_TEST_SUITE_P(ForallAtomicLitmusTestSB, StoreBufferTest); - -using StoreBufferTestTypes = Test::Types; - -INSTANTIATE_TYPED_TEST_SUITE_P(Hip, - ForallAtomicLitmusTestSB, - StoreBufferTestTypes); From 94d45b5a7ea3d1025937fd4a536c0b1ef468f7e1 Mon Sep 17 00:00:00 2001 From: Max Yang Date: Tue, 19 Mar 2024 14:21:38 -0700 Subject: [PATCH 11/14] Add a load buffer litmus test --- .../forall/atomic-basic/CMakeLists.txt | 1 + .../test-forall-atomic-litmus-lb.cpp | 235 ++++++++++++++++++ 2 files changed, 236 insertions(+) create mode 100644 test/functional/forall/atomic-basic/test-forall-atomic-litmus-lb.cpp diff --git a/test/functional/forall/atomic-basic/CMakeLists.txt b/test/functional/forall/atomic-basic/CMakeLists.txt index 6e2c184b09..4d4f17fac9 100644 --- 
a/test/functional/forall/atomic-basic/CMakeLists.txt +++ b/test/functional/forall/atomic-basic/CMakeLists.txt @@ -45,6 +45,7 @@ endif() set(FORALL_LITMUS_TESTS mp # Message Passing sb # Store Buffer + lb # Load Buffer ) if (ENABLE_LITMUS_TESTS) diff --git a/test/functional/forall/atomic-basic/test-forall-atomic-litmus-lb.cpp b/test/functional/forall/atomic-basic/test-forall-atomic-litmus-lb.cpp new file mode 100644 index 0000000000..67a49eb91d --- /dev/null +++ b/test/functional/forall/atomic-basic/test-forall-atomic-litmus-lb.cpp @@ -0,0 +1,235 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// +// test/include headers +// +#include "RAJA_test-atomic-types.hpp" +#include "RAJA_test-base.hpp" +#include "RAJA_test-camp.hpp" +#include "RAJA_test-forall-data.hpp" +#include "RAJA_test-forall-execpol.hpp" +#include "RAJA_test-index-types.hpp" + +#include "test-forall-atomic-litmus-driver.hpp" + +// "Load buffer" litmus test for DESUL ordered atomic +// -------------------------------------------------- +// Initial state: x = 0 && y = 0 +// +// Thread 1: Thread 2: +// ----------- ----------- +// a = load(y) b = load(x) +// store(x, 1) store(y, 1) +// +// Allowed results: +// ---------------- +// Strong behaviors: +// - a = 0, b = 1 +// - a = 1, b = 0 +// - a = 0, b = 0 +// Weak behavior: +// - a = 1, b = 1 + +// Send policy: Relaxed (Weak), Release, AcqRel, SeqCst +// Recv policy: Relaxed (Weak), Acquire, AcqRel, SeqCst +template +struct LoadBufferLitmus { + using DataType = T; + using RelaxedPolicy = RAJA::atomic_relaxed; + constexpr static int PERMUTE_THREAD_FLAG = 97; + size_t m_size; + int m_stride; + T *x; + T *y; + T *a; + T *b; + + int strong_behavior_0{0}; + int strong_behavior_1{0}; + int interleaved_behavior{0}; + int weak_behavior{0}; + + void allocate(camp::resources::Resource work_res, size_t size, int stride) + { + m_size = size; + m_stride = stride; + x = work_res.allocate(size * stride); + y = work_res.allocate(size * stride); + a = work_res.allocate(size * stride); + b = work_res.allocate(size * stride); + } + + void deallocate(camp::resources::Resource work_res) + { + work_res.deallocate(x); + work_res.deallocate(y); + work_res.deallocate(a); + work_res.deallocate(b); + } + + void pre_run(camp::resources::Resource work_res) + { + work_res.memset(x, 0, sizeof(T) * m_size * m_stride); + work_res.memset(y, 0, sizeof(T) * m_size * m_stride); + work_res.memset(a, 0, sizeof(T) * m_size * m_stride); + work_res.memset(b, 0, sizeof(T) * m_size * m_stride); + +#if defined(RAJA_ENABLE_CUDA) + cudaErrchk(cudaDeviceSynchronize()); +#endif + +#if defined(RAJA_ENABLE_HIP) + hipErrchk(hipDeviceSynchronize()); +#endif + } + + RAJA_HOST_DEVICE void run(int this_thread, int other_thread, int iter) + { + bool swap = this_thread % 2 == 0; + if (swap) { + load_buffer_1(other_thread, iter); + load_buffer_2(this_thread, iter); + } else { + load_buffer_2(this_thread, iter); + load_buffer_1(other_thread, iter); + } + } + + RAJA_HOST_DEVICE void load_buffer_1(int thread, int iter) + { + int thread_idx = thread * m_stride + iter; + int permute_thread = (thread * PERMUTE_THREAD_FLAG) % m_size; + int permute_idx = permute_thread * m_stride + iter; + a[thread_idx] = RAJA::atomicAdd(&(y[thread_idx]), 
T{0}); + RAJA::atomicAdd(&(x[permute_idx]), T{1}); + } + + RAJA_HOST_DEVICE void load_buffer_2(int thread, int iter) + { + int thread_idx = thread * m_stride + iter; + int permute_thread = (thread * PERMUTE_THREAD_FLAG) % m_size; + int permute_idx = permute_thread * m_stride + iter; + b[thread_idx] = RAJA::atomicAdd(&(x[permute_idx]), T{0}); + RAJA::atomicAdd(&(y[thread_idx]), T{1}); + } + + void count_results(camp::resources::Resource work_res) + { + +#ifdef RAJA_ENABLE_HIP + using GPUExec = RAJA::hip_exec<256>; + using ReducePolicy = RAJA::hip_reduce; +#endif + +#ifdef RAJA_ENABLE_CUDA + using GPUExec = RAJA::cuda_exec<256>; + using ReducePolicy = RAJA::cuda_reduce; +#endif + RAJA::ReduceSum strong_cnt_0(0); + RAJA::ReduceSum strong_cnt_1(0); + RAJA::ReduceSum interleaved_cnt(0); + RAJA::ReduceSum weak_cnt(0); + RAJA::ReduceSum unexpected_cnt(0); + + T *a_local = a; + T *b_local = b; + + auto forall_len = RAJA::TypedRangeSegment(0, m_size * m_stride); + + RAJA::forall(forall_len, [=] RAJA_HOST_DEVICE(int i) { + if (a_local[i] == 1 && b_local[i] == 0) { + // Strong behavior: thread 2 happened before thread 1 + strong_cnt_0 += 1; + } else if (a_local[i] == 0 && b_local[i] == 1) { + // Strong behavior: thread 1 happened before thread 2 + strong_cnt_1 += 1; + } else if (a_local[i] == 0 && b_local[i] == 0) { + // Strong behavior: stores interleaved with receives + interleaved_cnt += 1; + } else if (a_local[i] == 1 && b_local[i] == 1) { + // Weak behavior: stores reordered after receives + weak_cnt += 1; + } else { + unexpected_cnt += 1; + } + }); + + EXPECT_EQ(unexpected_cnt.get(), 0); + + strong_behavior_0 += strong_cnt_0.get(); + strong_behavior_1 += strong_cnt_1.get(); + interleaved_behavior += interleaved_cnt.get(); + weak_behavior += weak_cnt.get(); + } + + void verify() + { + std::cerr << " - Strong behavior (a = 1, b = 0) = " << strong_behavior_0 + << "\n"; + std::cerr << " - Strong behavior (a = 0, b = 1) = " << strong_behavior_1 + << "\n"; + std::cerr << " - Strong behavior (a = 1, b = 1) = " << interleaved_behavior + << "\n"; + std::cerr << " - Weak behaviors = " << weak_behavior << "\n"; + + if (std::is_same::value && + std::is_same::value) { + // In the relaxed case, we should observe some weak behaviors. + // Don't fail the test, but do print out a message. + if (weak_behavior == 0) { + std::cerr << "Warning - no weak behaviors detected in the control case." + << "\nThis litmus test may be insufficient to exercise " + "ordered memory atomics.\n"; + } else { + double overall_behavior_counts = strong_behavior_0 + strong_behavior_1 + + interleaved_behavior + weak_behavior; + std::cerr << "\n Weak behaviors detected in " + << 100 * (weak_behavior / overall_behavior_counts) + << "% of cases.\n"; + } + } else { + // We should not expect any weak behaviors if using a strong ordering. 
+ EXPECT_EQ(weak_behavior, 0); + } + } +}; + +using LBLitmusTestOrderPols = + camp::list, + camp::list, + camp::list, + camp::list >; + +using LBLitmusTestPols = + camp::cartesian_product; + +TYPED_TEST_SUITE_P(ForallAtomicLitmusTest); + +template +class ForallAtomicLitmusTest : public ::testing::Test +{ +}; + +TYPED_TEST_P(ForallAtomicLitmusTest, LoadBufferTest) +{ + using Type = typename camp::at>::type; + using SendRecvPol = typename camp::at>::type; + using SendPol = typename camp::at>::type; + using RecvPol = typename camp::at>::type; + + using LBTest = LoadBufferLitmus; + LitmusTestDriver::run(); +} + +REGISTER_TYPED_TEST_SUITE_P(ForallAtomicLitmusTest, LoadBufferTest); + +using LoadBufferTestTypes = Test::Types; + +INSTANTIATE_TYPED_TEST_SUITE_P(Hip, + ForallAtomicLitmusTest, + LoadBufferTestTypes); From a90f93a7520bc8a2b1f8e09fbb12ab297650cd57 Mon Sep 17 00:00:00 2001 From: Max Yang Date: Tue, 19 Mar 2024 14:32:16 -0700 Subject: [PATCH 12/14] Add a store litmus test --- .../forall/atomic-basic/CMakeLists.txt | 1 + .../test-forall-atomic-litmus-store.cpp | 226 ++++++++++++++++++ 2 files changed, 227 insertions(+) create mode 100644 test/functional/forall/atomic-basic/test-forall-atomic-litmus-store.cpp diff --git a/test/functional/forall/atomic-basic/CMakeLists.txt b/test/functional/forall/atomic-basic/CMakeLists.txt index 4d4f17fac9..fa0fbc7569 100644 --- a/test/functional/forall/atomic-basic/CMakeLists.txt +++ b/test/functional/forall/atomic-basic/CMakeLists.txt @@ -46,6 +46,7 @@ set(FORALL_LITMUS_TESTS mp # Message Passing sb # Store Buffer lb # Load Buffer + store # Store ) if (ENABLE_LITMUS_TESTS) diff --git a/test/functional/forall/atomic-basic/test-forall-atomic-litmus-store.cpp b/test/functional/forall/atomic-basic/test-forall-atomic-litmus-store.cpp new file mode 100644 index 0000000000..10fd9edeb8 --- /dev/null +++ b/test/functional/forall/atomic-basic/test-forall-atomic-litmus-store.cpp @@ -0,0 +1,226 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// +// test/include headers +// +#include "RAJA_test-atomic-types.hpp" +#include "RAJA_test-base.hpp" +#include "RAJA_test-camp.hpp" +#include "RAJA_test-forall-data.hpp" +#include "RAJA_test-forall-execpol.hpp" +#include "RAJA_test-index-types.hpp" + +#include "test-forall-atomic-litmus-driver.hpp" + +// "Store" litmus test for DESUL ordered atomic +// --------------------------------------------------- +// Initial state: x = 0 && y = 0 +// +// Thread 1: Thread 2: +// ----------- ----------- +// store(x, 2) a = load(y) +// store(y, 1) store(x, 1) +// +// Allowed results: +// ---------------- +// Strong behaviors: +// - a = 1, x = 1 +// - a = 0, x = 2 +// - a = 0, x = 1 +// Weak behavior: +// - a = 1, x = 2 + +// Send policy: Relaxed (Weak), Acquire, AcqRel, SeqCst +// Recv policy: Relaxed (Weak), Release, AcqRel, SeqCst +template +struct StoreLitmus { + using DataType = T; + using RelaxedPolicy = RAJA::atomic_relaxed; + size_t m_size; + int m_stride; + T *x; + T *y; + T *a; + + int strong_behavior_0{0}; + int strong_behavior_1{0}; + int interleaved_behavior{0}; + int weak_behavior{0}; + + void allocate(camp::resources::Resource work_res, size_t size, int stride) + { + m_size = size; + m_stride = stride; + x = work_res.allocate(size * stride); + y = work_res.allocate(size * stride); + a = work_res.allocate(size * stride); + } + + void deallocate(camp::resources::Resource work_res) + { + work_res.deallocate(x); + work_res.deallocate(y); + work_res.deallocate(a); + } + + void pre_run(camp::resources::Resource work_res) + { + work_res.memset(x, 0, sizeof(T) * m_size * m_stride); + work_res.memset(y, 0, sizeof(T) * m_size * m_stride); + work_res.memset(a, 0, sizeof(T) * m_size * m_stride); + +#if defined(RAJA_ENABLE_CUDA) + cudaErrchk(cudaDeviceSynchronize()); +#endif + +#if defined(RAJA_ENABLE_HIP) + hipErrchk(hipDeviceSynchronize()); +#endif + } + + RAJA_HOST_DEVICE void run(int this_thread, int other_thread, int iter) + { + bool swap = this_thread % 2 == 0; + if (swap) { + store_1(other_thread, iter); + store_2(this_thread, iter); + } else { + store_2(this_thread, iter); + store_1(other_thread, iter); + } + } + + RAJA_HOST_DEVICE void store_1(int thread, int iter) + { + int thread_idx = thread * m_stride + iter; + RAJA::atomicExchange(&(x[thread_idx]), T{2}); + RAJA::atomicAdd(&(y[thread_idx]), T{1}); + } + + RAJA_HOST_DEVICE void store_2(int thread, int iter) + { + int thread_idx = thread * m_stride + iter; + a[thread_idx] = RAJA::atomicAdd(&(y[thread_idx]), T{0}); + RAJA::atomicExchange(&(x[thread_idx]), T{1}); + } + + void count_results(camp::resources::Resource work_res) + { + +#ifdef RAJA_ENABLE_HIP + using GPUExec = RAJA::hip_exec<256>; + using ReducePolicy = RAJA::hip_reduce; +#endif + +#ifdef RAJA_ENABLE_CUDA + using GPUExec = RAJA::cuda_exec<256>; + using ReducePolicy = RAJA::cuda_reduce; +#endif + RAJA::ReduceSum strong_cnt_0(0); + RAJA::ReduceSum strong_cnt_1(0); + RAJA::ReduceSum interleaved_cnt(0); + RAJA::ReduceSum weak_cnt(0); + RAJA::ReduceSum unexpected_cnt(0); + + T *a_local = a; + T *x_local = x; + + auto forall_len = RAJA::TypedRangeSegment(0, m_size * m_stride); + + RAJA::forall(forall_len, [=] RAJA_HOST_DEVICE(int i) { + if (a_local[i] == 1 && x_local[i] == 1) { + // Strong behavior: thread 1 happened before thread 2 + strong_cnt_0 += 1; + } else if (a_local[i] == 0 && x_local[i] == 2) { + // Strong behavior: thread 2 happened before 
thread 1 + strong_cnt_1 += 1; + } else if (a_local[i] == 0 && x_local[i] == 1) { + // Strong behavior: stores interleaved with receives + interleaved_cnt += 1; + } else if (a_local[i] == 1 && x_local[i] == 2) { + // Weak behavior: receives reordered after stores + weak_cnt += 1; + } else { + unexpected_cnt += 1; + } + }); + + EXPECT_EQ(unexpected_cnt.get(), 0); + + strong_behavior_0 += strong_cnt_0.get(); + strong_behavior_1 += strong_cnt_1.get(); + interleaved_behavior += interleaved_cnt.get(); + weak_behavior += weak_cnt.get(); + } + + void verify() + { + std::cerr << " - Strong behavior (a = 1, b = 0) = " << strong_behavior_0 + << "\n"; + std::cerr << " - Strong behavior (a = 0, b = 1) = " << strong_behavior_1 + << "\n"; + std::cerr << " - Strong behavior (a = 1, b = 1) = " << interleaved_behavior + << "\n"; + std::cerr << " - Weak behaviors = " << weak_behavior << "\n"; + + if (std::is_same::value && + std::is_same::value) { + // In the relaxed case, we should observe some weak behaviors. + // Don't fail the test, but do print out a message. + if (weak_behavior == 0) { + std::cerr << "Warning - no weak behaviors detected in the control case." + << "\nThis litmus test may be insufficient to exercise " + "ordered memory atomics.\n"; + } else { + double overall_behavior_counts = strong_behavior_0 + strong_behavior_1 + + interleaved_behavior + weak_behavior; + std::cerr << "\n Weak behaviors detected in " + << 100 * (weak_behavior / overall_behavior_counts) + << "% of cases.\n"; + } + } else { + // We should not expect any weak behaviors if using a strong ordering. + EXPECT_EQ(weak_behavior, 0); + } + } +}; + +using StoreLitmusTestOrderPols = + camp::list, + camp::list, + camp::list, + camp::list >; + +using StoreLitmusTestPols = + camp::cartesian_product; + +TYPED_TEST_SUITE_P(ForallAtomicLitmusTest); + +template +class ForallAtomicLitmusTest : public ::testing::Test +{ +}; + +TYPED_TEST_P(ForallAtomicLitmusTest, StoreTest) +{ + using Type = typename camp::at>::type; + using SendRecvPol = typename camp::at>::type; + using SendPol = typename camp::at>::type; + using RecvPol = typename camp::at>::type; + + using StoreTest = StoreLitmus; + LitmusTestDriver::run(); +} + +REGISTER_TYPED_TEST_SUITE_P(ForallAtomicLitmusTest, StoreTest); + +using StoreTestTypes = Test::Types; + +INSTANTIATE_TYPED_TEST_SUITE_P(Hip, + ForallAtomicLitmusTest, + StoreTestTypes); From a86a1765d5b6e432936ab8f3a651a7c09d402d36 Mon Sep 17 00:00:00 2001 From: Max Yang Date: Tue, 19 Mar 2024 14:58:15 -0700 Subject: [PATCH 13/14] Add a read litmus test --- .../forall/atomic-basic/CMakeLists.txt | 1 + .../test-forall-atomic-litmus-read.cpp | 219 ++++++++++++++++++ 2 files changed, 220 insertions(+) create mode 100644 test/functional/forall/atomic-basic/test-forall-atomic-litmus-read.cpp diff --git a/test/functional/forall/atomic-basic/CMakeLists.txt b/test/functional/forall/atomic-basic/CMakeLists.txt index fa0fbc7569..968090ec06 100644 --- a/test/functional/forall/atomic-basic/CMakeLists.txt +++ b/test/functional/forall/atomic-basic/CMakeLists.txt @@ -47,6 +47,7 @@ set(FORALL_LITMUS_TESTS sb # Store Buffer lb # Load Buffer store # Store + read # Read ) if (ENABLE_LITMUS_TESTS) diff --git a/test/functional/forall/atomic-basic/test-forall-atomic-litmus-read.cpp b/test/functional/forall/atomic-basic/test-forall-atomic-litmus-read.cpp new file mode 100644 index 0000000000..16c870e4b8 --- /dev/null +++ b/test/functional/forall/atomic-basic/test-forall-atomic-litmus-read.cpp @@ -0,0 +1,219 @@ 
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// +// test/include headers +// +#include "RAJA_test-atomic-types.hpp" +#include "RAJA_test-base.hpp" +#include "RAJA_test-camp.hpp" +#include "RAJA_test-forall-data.hpp" +#include "RAJA_test-forall-execpol.hpp" +#include "RAJA_test-index-types.hpp" + +#include "test-forall-atomic-litmus-driver.hpp" + +// "Read" litmus test for DESUL ordered atomic +// --------------------------------------------------- +// Initial state: x = 0 && y = 0 +// +// Thread 1: Thread 2: +// ----------- ----------- +// store(x, 1) store(y, 2) +// store(y, 1) a = load(x) +// +// Allowed results: +// ---------------- +// Strong behaviors: +// - a = 1, x = 2 +// - a = 1, x = 1 +// - a = 0, x = 1 +// Weak behavior: +// - a = 0, x = 2 + +// Atomic policy: Relaxed (Weak), SeqCst +template +struct ReadLitmus { + using DataType = T; + using RelaxedPolicy = RAJA::atomic_relaxed; + size_t m_size; + int m_stride; + T *x; + T *y; + T *a; + + int strong_behavior_0{0}; + int strong_behavior_1{0}; + int interleaved_behavior{0}; + int weak_behavior{0}; + + void allocate(camp::resources::Resource work_res, size_t size, int stride) + { + m_size = size; + m_stride = stride; + x = work_res.allocate(size * stride); + y = work_res.allocate(size * stride); + a = work_res.allocate(size * stride); + } + + void deallocate(camp::resources::Resource work_res) + { + work_res.deallocate(x); + work_res.deallocate(y); + work_res.deallocate(a); + } + + void pre_run(camp::resources::Resource work_res) + { + work_res.memset(x, 0, sizeof(T) * m_size * m_stride); + work_res.memset(y, 0, sizeof(T) * m_size * m_stride); + work_res.memset(a, 0, sizeof(T) * m_size * m_stride); + +#if defined(RAJA_ENABLE_CUDA) + cudaErrchk(cudaDeviceSynchronize()); +#endif + +#if defined(RAJA_ENABLE_HIP) + hipErrchk(hipDeviceSynchronize()); +#endif + } + + RAJA_HOST_DEVICE void run(int this_thread, int other_thread, int iter) + { + bool swap = this_thread % 2 == 0; + if (swap) { + read_1(other_thread, iter); + read_2(this_thread, iter); + } else { + read_2(this_thread, iter); + read_1(other_thread, iter); + } + } + + RAJA_HOST_DEVICE void read_1(int thread, int iter) + { + int thread_idx = thread * m_stride + iter; + RAJA::atomicExchange(&(x[thread_idx]), T{1}); + RAJA::atomicExchange(&(y[thread_idx]), T{1}); + } + + RAJA_HOST_DEVICE void read_2(int thread, int iter) + { + int thread_idx = thread * m_stride + iter; + RAJA::atomicExchange(&(y[thread_idx]), T{2}); + a[thread_idx] = RAJA::atomicAdd(&(x[thread_idx]), T{0}); + } + + void count_results(camp::resources::Resource work_res) + { + +#ifdef RAJA_ENABLE_HIP + using GPUExec = RAJA::hip_exec<256>; + using ReducePolicy = RAJA::hip_reduce; +#endif + +#ifdef RAJA_ENABLE_CUDA + using GPUExec = RAJA::cuda_exec<256>; + using ReducePolicy = RAJA::cuda_reduce; +#endif + RAJA::ReduceSum strong_cnt_0(0); + RAJA::ReduceSum strong_cnt_1(0); + RAJA::ReduceSum interleaved_cnt(0); + RAJA::ReduceSum weak_cnt(0); + RAJA::ReduceSum unexpected_cnt(0); + + T *a_local = a; + T *y_local = y; + + auto forall_len = RAJA::TypedRangeSegment(0, m_size * m_stride); + + RAJA::forall(forall_len, [=] RAJA_HOST_DEVICE(int i) { + if (a_local[i] == 1 && y_local[i] == 2) { + // Strong behavior: thread 1 
happened before thread 2 + strong_cnt_0 += 1; + } else if (a_local[i] == 0 && y_local[i] == 1) { + // Strong behavior: thread 2 happened before thread 1 + strong_cnt_1 += 1; + } else if (a_local[i] == 1 && y_local[i] == 1) { + // Strong behavior: stores interleaved with receives + interleaved_cnt += 1; + } else if (a_local[i] == 0 && y_local[i] == 2) { + // Weak behavior: receives reordered after stores + weak_cnt += 1; + } else { + unexpected_cnt += 1; + } + }); + + EXPECT_EQ(unexpected_cnt.get(), 0); + + strong_behavior_0 += strong_cnt_0.get(); + strong_behavior_1 += strong_cnt_1.get(); + interleaved_behavior += interleaved_cnt.get(); + weak_behavior += weak_cnt.get(); + } + + void verify() + { + std::cerr << " - Strong behavior (a = 1, b = 0) = " << strong_behavior_0 + << "\n"; + std::cerr << " - Strong behavior (a = 0, b = 1) = " << strong_behavior_1 + << "\n"; + std::cerr << " - Strong behavior (a = 1, b = 1) = " << interleaved_behavior + << "\n"; + std::cerr << " - Weak behaviors = " << weak_behavior << "\n"; + + if (std::is_same::value) { + // In the relaxed case, we should observe some weak behaviors. + // Don't fail the test, but do print out a message. + if (weak_behavior == 0) { + std::cerr << "Warning - no weak behaviors detected in the control case." + << "\nThis litmus test may be insufficient to exercise " + "ordered memory atomics.\n"; + } else { + double overall_behavior_counts = strong_behavior_0 + strong_behavior_1 + + interleaved_behavior + weak_behavior; + std::cerr << "\n Weak behaviors detected in " + << 100 * (weak_behavior / overall_behavior_counts) + << "% of cases.\n"; + } + } else { + // We should not expect any weak behaviors if using a strong ordering. + EXPECT_EQ(weak_behavior, 0); + } + } +}; + +using ReadLitmusTestOrderPols = + camp::list; + +using ReadLitmusTestPols = + camp::cartesian_product; + +TYPED_TEST_SUITE_P(ForallAtomicLitmusTest); + +template +class ForallAtomicLitmusTest : public ::testing::Test +{ +}; + +TYPED_TEST_P(ForallAtomicLitmusTest, ReadTest) +{ + using Type = typename camp::at>::type; + using AtomicPol = typename camp::at>::type; + + using ReadTest = ReadLitmus; + LitmusTestDriver::run(); +} + +REGISTER_TYPED_TEST_SUITE_P(ForallAtomicLitmusTest, ReadTest); + +using ReadTestTypes = Test::Types; + +INSTANTIATE_TYPED_TEST_SUITE_P(Hip, + ForallAtomicLitmusTest, + ReadTestTypes); From 3f4dc307b197d186ef3cc627ced1a13bef0c7ea7 Mon Sep 17 00:00:00 2001 From: Max Yang Date: Tue, 19 Mar 2024 15:20:15 -0700 Subject: [PATCH 14/14] Add a 2+2 write litmus test --- .../forall/atomic-basic/CMakeLists.txt | 1 + .../test-forall-atomic-litmus-write2x2.cpp | 222 ++++++++++++++++++ 2 files changed, 223 insertions(+) create mode 100644 test/functional/forall/atomic-basic/test-forall-atomic-litmus-write2x2.cpp diff --git a/test/functional/forall/atomic-basic/CMakeLists.txt b/test/functional/forall/atomic-basic/CMakeLists.txt index 968090ec06..707eabdc78 100644 --- a/test/functional/forall/atomic-basic/CMakeLists.txt +++ b/test/functional/forall/atomic-basic/CMakeLists.txt @@ -48,6 +48,7 @@ set(FORALL_LITMUS_TESTS lb # Load Buffer store # Store read # Read + write2x2 # 2+2 write ) if (ENABLE_LITMUS_TESTS) diff --git a/test/functional/forall/atomic-basic/test-forall-atomic-litmus-write2x2.cpp b/test/functional/forall/atomic-basic/test-forall-atomic-litmus-write2x2.cpp new file mode 100644 index 0000000000..b5dc28ac5f --- /dev/null +++ b/test/functional/forall/atomic-basic/test-forall-atomic-litmus-write2x2.cpp @@ -0,0 +1,222 @@ 
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// +// test/include headers +// +#include "RAJA_test-atomic-types.hpp" +#include "RAJA_test-base.hpp" +#include "RAJA_test-camp.hpp" +#include "RAJA_test-forall-data.hpp" +#include "RAJA_test-forall-execpol.hpp" +#include "RAJA_test-index-types.hpp" + +#include "test-forall-atomic-litmus-driver.hpp" + +// "2+2 Write" litmus test for DESUL ordered atomic +// --------------------------------------------------- +// Initial state: x = 0 && y = 0 +// +// Thread 1: Thread 2: +// ----------- ----------- +// store(x, 1) store(y, 1) +// store(y, 2) store(x, 2) +// +// Allowed results: +// ---------------- +// Strong behaviors: +// - a = 1, x = 2 +// - a = 2, x = 1 +// - a = 2, x = 2 +// Weak behavior: +// - a = 1, x = 1 + +// Send policy: Relaxed (Weak), AcqRel, SeqCst +// Recv policy: Relaxed (Weak), AcqRel, SeqCst +template +struct Write2x2Litmus { + using DataType = T; + using RelaxedPolicy = RAJA::atomic_relaxed; + size_t m_size; + int m_stride; + T *x; + T *y; + T *a; + + int strong_behavior_0{0}; + int strong_behavior_1{0}; + int interleaved_behavior{0}; + int weak_behavior{0}; + + void allocate(camp::resources::Resource work_res, size_t size, int stride) + { + m_size = size; + m_stride = stride; + x = work_res.allocate(size * stride); + y = work_res.allocate(size * stride); + a = work_res.allocate(size * stride); + } + + void deallocate(camp::resources::Resource work_res) + { + work_res.deallocate(x); + work_res.deallocate(y); + work_res.deallocate(a); + } + + void pre_run(camp::resources::Resource work_res) + { + work_res.memset(x, 0, sizeof(T) * m_size * m_stride); + work_res.memset(y, 0, sizeof(T) * m_size * m_stride); + work_res.memset(a, 0, sizeof(T) * m_size * m_stride); + +#if defined(RAJA_ENABLE_CUDA) + cudaErrchk(cudaDeviceSynchronize()); +#endif + +#if defined(RAJA_ENABLE_HIP) + hipErrchk(hipDeviceSynchronize()); +#endif + } + + RAJA_HOST_DEVICE void run(int this_thread, int other_thread, int iter) + { + bool swap = this_thread % 2 == 0; + if (swap) { + store_1(other_thread, iter); + store_2(this_thread, iter); + } else { + store_2(this_thread, iter); + store_1(other_thread, iter); + } + } + + RAJA_HOST_DEVICE void store_1(int thread, int iter) + { + int thread_idx = thread * m_stride + iter; + RAJA::atomicExchange(&(x[thread_idx]), T{1}); + RAJA::atomicExchange(&(y[thread_idx]), T{2}); + } + + RAJA_HOST_DEVICE void store_2(int thread, int iter) + { + int thread_idx = thread * m_stride + iter; + RAJA::atomicExchange(&(y[thread_idx]), T{1}); + RAJA::atomicExchange(&(x[thread_idx]), T{2}); + } + + void count_results(camp::resources::Resource work_res) + { + +#ifdef RAJA_ENABLE_HIP + using GPUExec = RAJA::hip_exec<256>; + using ReducePolicy = RAJA::hip_reduce; +#endif + +#ifdef RAJA_ENABLE_CUDA + using GPUExec = RAJA::cuda_exec<256>; + using ReducePolicy = RAJA::cuda_reduce; +#endif + RAJA::ReduceSum strong_cnt_0(0); + RAJA::ReduceSum strong_cnt_1(0); + RAJA::ReduceSum interleaved_cnt(0); + RAJA::ReduceSum weak_cnt(0); + RAJA::ReduceSum unexpected_cnt(0); + + T *x_local = x; + T *y_local = y; + + auto forall_len = RAJA::TypedRangeSegment(0, m_size * m_stride); + + RAJA::forall(forall_len, [=] RAJA_HOST_DEVICE(int i) { + if (x_local[i] 
== 1 && y_local[i] == 2) { + // Strong behavior: thread 1 happened before thread 2 + strong_cnt_0 += 1; + } else if (x_local[i] == 2 && y_local[i] == 1) { + // Strong behavior: thread 2 happened before thread 1 + strong_cnt_1 += 1; + } else if (x_local[i] == 2 && y_local[i] == 2) { + // Strong behavior: interleaved stores in-order + interleaved_cnt += 1; + } else if (x_local[i] == 1 && y_local[i] == 1) { + // Weak behavior: stores on each thread were reordered + weak_cnt += 1; + } else { + unexpected_cnt += 1; + } + }); + + EXPECT_EQ(unexpected_cnt.get(), 0); + + strong_behavior_0 += strong_cnt_0.get(); + strong_behavior_1 += strong_cnt_1.get(); + interleaved_behavior += interleaved_cnt.get(); + weak_behavior += weak_cnt.get(); + } + + void verify() + { + std::cerr << " - Strong behavior (a = 1, b = 0) = " << strong_behavior_0 + << "\n"; + std::cerr << " - Strong behavior (a = 0, b = 1) = " << strong_behavior_1 + << "\n"; + std::cerr << " - Strong behavior (a = 1, b = 1) = " << interleaved_behavior + << "\n"; + std::cerr << " - Weak behaviors = " << weak_behavior << "\n"; + + if (std::is_same::value) { + // In the relaxed case, we should observe some weak behaviors. + // Don't fail the test, but do print out a message. + if (weak_behavior == 0) { + std::cerr << "Warning - no weak behaviors detected in the control case." + << "\nThis litmus test may be insufficient to exercise " + "ordered memory atomics.\n"; + } else { + double overall_behavior_counts = strong_behavior_0 + strong_behavior_1 + + interleaved_behavior + weak_behavior; + std::cerr << "\n Weak behaviors detected in " + << 100 * (weak_behavior / overall_behavior_counts) + << "% of cases.\n"; + } + } else { + // We should not expect any weak behaviors if using a strong ordering. + EXPECT_EQ(weak_behavior, 0); + } + } +}; + +using Write2x2LitmusTestOrderPols = + camp::list; + +using Write2x2LitmusTestPols = + camp::cartesian_product; + +TYPED_TEST_SUITE_P(ForallAtomicLitmusTest); + +template +class ForallAtomicLitmusTest : public ::testing::Test +{ +}; + +TYPED_TEST_P(ForallAtomicLitmusTest, Write2x2Test) +{ + using Type = typename camp::at>::type; + using AtomicPol = typename camp::at>::type; + + using Write2x2Test = Write2x2Litmus; + LitmusTestDriver::run(); +} + +REGISTER_TYPED_TEST_SUITE_P(ForallAtomicLitmusTest, Write2x2Test); + +using Write2x2TestTypes = Test::Types; + +INSTANTIATE_TYPED_TEST_SUITE_P(Hip, + ForallAtomicLitmusTest, + Write2x2TestTypes);
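
As a closing illustration (not part of the patch series), the same 2+2-write outcome classification can be sketched on the host with plain std::atomic and std::thread. Everything below is an assumption-level example rather than the RAJA test code; on most strongly-ordered host CPUs the relaxed variant will rarely, if ever, produce the weak x == 1 && y == 1 outcome, which is precisely why the GPU tests above add stress and synchronization to provoke it:

#include <atomic>
#include <cstdio>
#include <thread>

int main()
{
  int strong_0 = 0, strong_1 = 0, interleaved = 0, weak = 0;

  for (int iter = 0; iter < 10000; ++iter) {
    std::atomic<int> x{0};
    std::atomic<int> y{0};

    // Thread 1: store(x, 1); store(y, 2)
    std::thread t1([&] {
      x.store(1, std::memory_order_relaxed);
      y.store(2, std::memory_order_relaxed);
    });
    // Thread 2: store(y, 1); store(x, 2)
    std::thread t2([&] {
      y.store(1, std::memory_order_relaxed);
      x.store(2, std::memory_order_relaxed);
    });
    t1.join();
    t2.join();

    const int xf = x.load();
    const int yf = y.load();
    if (xf == 1 && yf == 2) {
      ++strong_0;       // thread 1's stores landed last
    } else if (xf == 2 && yf == 1) {
      ++strong_1;       // thread 2's stores landed last
    } else if (xf == 2 && yf == 2) {
      ++interleaved;    // stores interleaved in program order
    } else {
      ++weak;           // x == 1 && y == 1: each thread's stores appear reordered
    }
  }

  std::printf("strong: %d + %d, interleaved: %d, weak: %d\n",
              strong_0, strong_1, interleaved, weak);
  return 0;
}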