From 89e38c8bde2e977f1bd29f3fd18c761dc80ef306 Mon Sep 17 00:00:00 2001 From: Max Yang Date: Fri, 15 Mar 2024 23:37:35 -0700 Subject: [PATCH 01/14] Add ordered memory atomics from desul --- include/RAJA/policy/desul/atomic.hpp | 204 +++++++++++++++++---------- include/RAJA/policy/desul/policy.hpp | 62 ++++++++ 2 files changed, 191 insertions(+), 75 deletions(-) create mode 100644 include/RAJA/policy/desul/policy.hpp diff --git a/include/RAJA/policy/desul/atomic.hpp b/include/RAJA/policy/desul/atomic.hpp index dbcb5e06eb..9fdec5ccc9 100644 --- a/include/RAJA/policy/desul/atomic.hpp +++ b/include/RAJA/policy/desul/atomic.hpp @@ -12,10 +12,9 @@ #if defined(RAJA_ENABLE_DESUL_ATOMICS) -#include "RAJA/util/macros.hpp" - #include "RAJA/policy/atomic_builtin.hpp" - +#include "RAJA/policy/desul/policy.hpp" +#include "RAJA/util/macros.hpp" #include "desul/atomics.hpp" // Default desul options for RAJA @@ -26,153 +25,208 @@ using raja_default_desul_scope = desul::MemoryScopeDevice; namespace RAJA { +namespace detail +{ +template +struct DesulAtomicPolicy { + using memory_order = raja_default_desul_order; + using memory_scope = raja_default_desul_scope; +}; + +template +struct DesulAtomicPolicy> { + using memory_order = OrderingPolicy; + using memory_scope = ScopePolicy; +}; + +} // namespace detail + RAJA_SUPPRESS_HD_WARN template -RAJA_HOST_DEVICE -RAJA_INLINE T -atomicAdd(AtomicPolicy, T volatile *acc, T value) { - return desul::atomic_fetch_add(const_cast(acc), +RAJA_HOST_DEVICE RAJA_INLINE T atomicAdd(AtomicPolicy, T volatile *acc, T value) +{ + using desul_order = + typename detail::DesulAtomicPolicy::memory_order; + using desul_scope = + typename detail::DesulAtomicPolicy::memory_scope; + return desul::atomic_fetch_add(const_cast(acc), value, - raja_default_desul_order{}, - raja_default_desul_scope{}); + desul_order{}, + desul_scope{}); } RAJA_SUPPRESS_HD_WARN template -RAJA_HOST_DEVICE -RAJA_INLINE T -atomicSub(AtomicPolicy, T volatile *acc, T value) { - return desul::atomic_fetch_sub(const_cast(acc), +RAJA_HOST_DEVICE RAJA_INLINE T atomicSub(AtomicPolicy, T volatile *acc, T value) +{ + using desul_order = + typename detail::DesulAtomicPolicy::memory_order; + using desul_scope = + typename detail::DesulAtomicPolicy::memory_scope; + return desul::atomic_fetch_sub(const_cast(acc), value, - raja_default_desul_order{}, - raja_default_desul_scope{}); + desul_order{}, + desul_scope{}); } RAJA_SUPPRESS_HD_WARN template -RAJA_HOST_DEVICE -RAJA_INLINE T atomicMin(AtomicPolicy, T volatile *acc, T value) +RAJA_HOST_DEVICE RAJA_INLINE T atomicMin(AtomicPolicy, T volatile *acc, T value) { - return desul::atomic_fetch_min(const_cast(acc), + using desul_order = + typename detail::DesulAtomicPolicy::memory_order; + using desul_scope = + typename detail::DesulAtomicPolicy::memory_scope; + return desul::atomic_fetch_min(const_cast(acc), value, - raja_default_desul_order{}, - raja_default_desul_scope{}); + desul_order{}, + desul_scope{}); } RAJA_SUPPRESS_HD_WARN template -RAJA_HOST_DEVICE -RAJA_INLINE T atomicMax(AtomicPolicy, T volatile *acc, T value) +RAJA_HOST_DEVICE RAJA_INLINE T atomicMax(AtomicPolicy, T volatile *acc, T value) { - return desul::atomic_fetch_max(const_cast(acc), + using desul_order = + typename detail::DesulAtomicPolicy::memory_order; + using desul_scope = + typename detail::DesulAtomicPolicy::memory_scope; + return desul::atomic_fetch_max(const_cast(acc), value, - raja_default_desul_order{}, - raja_default_desul_scope{}); + desul_order{}, + desul_scope{}); } RAJA_SUPPRESS_HD_WARN template 
-RAJA_HOST_DEVICE -RAJA_INLINE T atomicInc(AtomicPolicy, T volatile *acc) +RAJA_HOST_DEVICE RAJA_INLINE T atomicInc(AtomicPolicy, T volatile *acc) { - return desul::atomic_fetch_inc(const_cast(acc), - raja_default_desul_order{}, - raja_default_desul_scope{}); + using desul_order = + typename detail::DesulAtomicPolicy::memory_order; + using desul_scope = + typename detail::DesulAtomicPolicy::memory_scope; + return desul::atomic_fetch_inc(const_cast(acc), + desul_order{}, + desul_scope{}); } RAJA_SUPPRESS_HD_WARN template -RAJA_HOST_DEVICE -RAJA_INLINE T atomicInc(AtomicPolicy, T volatile *acc, T val) +RAJA_HOST_DEVICE RAJA_INLINE T atomicInc(AtomicPolicy, T volatile *acc, T val) { + using desul_order = + typename detail::DesulAtomicPolicy::memory_order; + using desul_scope = + typename detail::DesulAtomicPolicy::memory_scope; // See: // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicinc - return desul::atomic_fetch_inc_mod(const_cast(acc), - val, - raja_default_desul_order{}, - raja_default_desul_scope{}); + return desul::atomic_fetch_inc_mod(const_cast(acc), + val, + desul_order{}, + desul_scope{}); } RAJA_SUPPRESS_HD_WARN template -RAJA_HOST_DEVICE -RAJA_INLINE T atomicDec(AtomicPolicy, T volatile *acc) +RAJA_HOST_DEVICE RAJA_INLINE T atomicDec(AtomicPolicy, T volatile *acc) { - return desul::atomic_fetch_dec(const_cast(acc), - raja_default_desul_order{}, - raja_default_desul_scope{}); + using desul_order = + typename detail::DesulAtomicPolicy::memory_order; + using desul_scope = + typename detail::DesulAtomicPolicy::memory_scope; + return desul::atomic_fetch_dec(const_cast(acc), + desul_order{}, + desul_scope{}); } RAJA_SUPPRESS_HD_WARN template -RAJA_HOST_DEVICE -RAJA_INLINE T atomicDec(AtomicPolicy, T volatile *acc, T val) +RAJA_HOST_DEVICE RAJA_INLINE T atomicDec(AtomicPolicy, T volatile *acc, T val) { + using desul_order = + typename detail::DesulAtomicPolicy::memory_order; + using desul_scope = + typename detail::DesulAtomicPolicy::memory_scope; // See: // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicdec - return desul::atomic_fetch_dec_mod(const_cast(acc), - val, - raja_default_desul_order{}, - raja_default_desul_scope{}); + return desul::atomic_fetch_dec_mod(const_cast(acc), + val, + desul_order{}, + desul_scope{}); } RAJA_SUPPRESS_HD_WARN template -RAJA_HOST_DEVICE -RAJA_INLINE T atomicAnd(AtomicPolicy, T volatile *acc, T value) +RAJA_HOST_DEVICE RAJA_INLINE T atomicAnd(AtomicPolicy, T volatile *acc, T value) { - return desul::atomic_fetch_and(const_cast(acc), + using desul_order = + typename detail::DesulAtomicPolicy::memory_order; + using desul_scope = + typename detail::DesulAtomicPolicy::memory_scope; + return desul::atomic_fetch_and(const_cast(acc), value, - raja_default_desul_order{}, - raja_default_desul_scope{}); + desul_order{}, + desul_scope{}); } RAJA_SUPPRESS_HD_WARN template -RAJA_HOST_DEVICE -RAJA_INLINE T atomicOr(AtomicPolicy, T volatile *acc, T value) +RAJA_HOST_DEVICE RAJA_INLINE T atomicOr(AtomicPolicy, T volatile *acc, T value) { - return desul::atomic_fetch_or(const_cast(acc), + using desul_order = + typename detail::DesulAtomicPolicy::memory_order; + using desul_scope = + typename detail::DesulAtomicPolicy::memory_scope; + return desul::atomic_fetch_or(const_cast(acc), value, - raja_default_desul_order{}, - raja_default_desul_scope{}); + desul_order{}, + desul_scope{}); } RAJA_SUPPRESS_HD_WARN template -RAJA_HOST_DEVICE -RAJA_INLINE T atomicXor(AtomicPolicy, T volatile *acc, T value) +RAJA_HOST_DEVICE 
RAJA_INLINE T atomicXor(AtomicPolicy, T volatile *acc, T value) { - return desul::atomic_fetch_xor(const_cast(acc), + using desul_order = + typename detail::DesulAtomicPolicy::memory_order; + using desul_scope = + typename detail::DesulAtomicPolicy::memory_scope; + return desul::atomic_fetch_xor(const_cast(acc), value, - raja_default_desul_order{}, - raja_default_desul_scope{}); + desul_order{}, + desul_scope{}); } RAJA_SUPPRESS_HD_WARN template -RAJA_HOST_DEVICE -RAJA_INLINE T atomicExchange(AtomicPolicy, T volatile *acc, T value) +RAJA_HOST_DEVICE RAJA_INLINE T atomicExchange(AtomicPolicy, + T volatile *acc, + T value) { - return desul::atomic_exchange(const_cast(acc), + using desul_order = + typename detail::DesulAtomicPolicy::memory_order; + using desul_scope = + typename detail::DesulAtomicPolicy::memory_scope; + return desul::atomic_exchange(const_cast(acc), value, - raja_default_desul_order{}, - raja_default_desul_scope{}); + desul_order{}, + desul_scope{}); } RAJA_SUPPRESS_HD_WARN template -RAJA_HOST_DEVICE -RAJA_INLINE T atomicCAS(AtomicPolicy, T volatile *acc, T compare, T value) +RAJA_HOST_DEVICE RAJA_INLINE T +atomicCAS(AtomicPolicy, T volatile *acc, T compare, T value) { - return desul::atomic_compare_exchange(const_cast(acc), - compare, - value, - raja_default_desul_order{}, - raja_default_desul_scope{}); + using desul_order = + typename detail::DesulAtomicPolicy::memory_order; + using desul_scope = + typename detail::DesulAtomicPolicy::memory_scope; + return desul::atomic_compare_exchange( + const_cast(acc), compare, value, desul_order{}, desul_scope{}); } } // namespace RAJA #endif // RAJA_ENABLE_DESUL_ATOMICS -#endif // guard +#endif // guard diff --git a/include/RAJA/policy/desul/policy.hpp b/include/RAJA/policy/desul/policy.hpp new file mode 100644 index 0000000000..b743760592 --- /dev/null +++ b/include/RAJA/policy/desul/policy.hpp @@ -0,0 +1,62 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_policy_desul_HPP +#define RAJA_policy_desul_HPP + +#include "RAJA/config.hpp" + +#if defined(RAJA_ENABLE_DESUL_ATOMICS) + +#include "desul/atomics.hpp" + +namespace RAJA +{ + +// Policy to perform an atomic operation with a given memory ordering. 
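// Illustrative usage sketch (not part of this header; assumes a desul-enabled
// build and an atomics-capable type T). The aliases defined below can be
// supplied to the RAJA atomic free functions in place of the default policy,
// e.g.:
//
//   payload[i] = value;                                         // plain store
//   RAJA::atomicAdd<RAJA::atomic_release>(&flag[i], T{1});      // publish
//
//   T seen = RAJA::atomicAdd<RAJA::atomic_acquire>(&flag[i], T{0});  // observe
//   if (seen == T{1}) { /* payload[i] written before the release is visible */ }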
+template +struct detail_atomic_t { +}; + +using atomic_seq_cst = + detail_atomic_t; +using atomic_acq_rel = + detail_atomic_t; +using atomic_acquire = + detail_atomic_t; +using atomic_release = + detail_atomic_t; +using atomic_relaxed = + detail_atomic_t; + +using atomic_seq_cst_block = + detail_atomic_t; +using atomic_acq_rel_block = + detail_atomic_t; +using atomic_acquire_block = + detail_atomic_t; +using atomic_release_block = + detail_atomic_t; +using atomic_relaxed_block = + detail_atomic_t; + +using atomic_seq_cst_sys = + detail_atomic_t; +using atomic_acq_rel_sys = + detail_atomic_t; +using atomic_acquire_sys = + detail_atomic_t; +using atomic_release_sys = + detail_atomic_t; +using atomic_relaxed_sys = + detail_atomic_t; + +} // namespace RAJA + +#endif // RAJA_ENABLE_DESUL_ATOMICS + +#endif From b71c42bc74eecbe10982afa00f7162fd30f6fadc Mon Sep 17 00:00:00 2001 From: Max Yang Date: Sat, 16 Mar 2024 00:13:30 -0700 Subject: [PATCH 02/14] Add a test driver for litmus tests, and a message passing test --- .../forall/atomic-basic/CMakeLists.txt | 23 ++ .../test-forall-atomic-litmus.cpp | 319 ++++++++++++++++++ .../tests/test-forall-atomic-litmus-mp.hpp | 168 +++++++++ 3 files changed, 510 insertions(+) create mode 100644 test/functional/forall/atomic-basic/test-forall-atomic-litmus.cpp create mode 100644 test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-mp.hpp diff --git a/test/functional/forall/atomic-basic/CMakeLists.txt b/test/functional/forall/atomic-basic/CMakeLists.txt index 9c2c12d76f..b5e72dc8fc 100644 --- a/test/functional/forall/atomic-basic/CMakeLists.txt +++ b/test/functional/forall/atomic-basic/CMakeLists.txt @@ -28,3 +28,26 @@ foreach( ATOMIC_BACKEND ${FORALL_ATOMIC_BACKENDS} ) target_include_directories(test-forall-atomic-basic-unsigned-${ATOMIC_BACKEND}.exe PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) endforeach() + + +set(ENABLE_LITMUS_TESTS OFF) + +if(RAJA_ENABLE_DESUL_ATOMICS AND RAJA_ENABLE_CUDA) + set(LITMUS_BACKEND "Cuda") + set(ENABLE_LITMUS_TESTS ON) +endif() + +if(RAJA_ENABLE_DESUL_ATOMICS AND RAJA_ENABLE_HIP) + set(LITMUS_BACKEND "Hip") + set(ENABLE_LITMUS_TESTS ON) +endif() + +if (ENABLE_LITMUS_TESTS) + raja_add_test( NAME test-forall-atomic-litmus-${LITMUS_BACKEND} + SOURCES test-forall-atomic-litmus.cpp) + target_include_directories(test-forall-atomic-litmus-${LITMUS_BACKEND}.exe + PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) +endif() + +unset(ENABLE_LITMUS_TESTS) +unset(LITMUS_BACKEND) diff --git a/test/functional/forall/atomic-basic/test-forall-atomic-litmus.cpp b/test/functional/forall/atomic-basic/test-forall-atomic-litmus.cpp new file mode 100644 index 0000000000..a55340c148 --- /dev/null +++ b/test/functional/forall/atomic-basic/test-forall-atomic-litmus.cpp @@ -0,0 +1,319 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// +// test/include headers +// +#include "RAJA_test-atomic-types.hpp" +#include "RAJA_test-atomicpol.hpp" +#include "RAJA_test-base.hpp" +#include "RAJA_test-camp.hpp" +#include "RAJA_test-forall-data.hpp" +#include "RAJA_test-forall-execpol.hpp" +#include "RAJA_test-index-types.hpp" + +// +// Header for tests in ./tests directory +// +// Note: CMake adds ./tests as an include dir for these tests. 
+// +#include +#include + +#include "test-forall-atomic-litmus-mp.hpp" + +using IdxType = size_t; +constexpr int NUM_ITERS = 100; +constexpr int MAX_SHUFFLE_LEN = 1024; +constexpr int STRIDE = 8; +constexpr int DATA_STRESS_SIZE = 2048 * STRIDE; + +constexpr int PERMUTE_PRIME_BLOCK = 11; +constexpr int PERMUTE_PRIME_GRID = 31; + +template +__global__ void dummy_kernel(IdxType index, Func func) +{ + func(index); +} + +// Generic test driver for memory litmus tests on the GPU. +template +struct LitmusTestDriver { +public: + struct TestData { + int block_size; + int grid_size; + + // Number of blocks to run the message-passing litmus test on. + int testing_blocks; + + // Array to shuffle block indices in a test kernel. + IdxType* shuffle_block; + + // Pattern to use for memory stressing. + IdxType* stress_pattern; + // Barrier integers to synchronize testing threads. + IdxType* barriers; + + IdxType* data_stress; + + void allocate(camp::resources::Resource work_res, + int grid_size, + int block_size, + int num_testing_blocks) + { + this->grid_size = grid_size; + this->block_size = block_size; + + testing_blocks = num_testing_blocks; + + shuffle_block = work_res.allocate(grid_size); + stress_pattern = work_res.allocate(MAX_SHUFFLE_LEN); + barriers = work_res.allocate(grid_size / 2); + data_stress = work_res.allocate(DATA_STRESS_SIZE); + } + + void pre_run(camp::resources::Resource work_res) + { + std::random_device rand_device; + // Create a random permutation for the range [0, grid_size) + std::vector shuffle_block_host(grid_size); + { + std::iota(shuffle_block_host.begin(), shuffle_block_host.end(), 0); + std::shuffle(shuffle_block_host.begin(), + shuffle_block_host.end(), + std::mt19937{rand_device()}); + } + work_res.memcpy(shuffle_block, + shuffle_block_host.data(), + sizeof(IdxType) * grid_size); + + // Create a random sequence of 1's and 0's, equally balanced between the + // two + std::vector stress_host(MAX_SHUFFLE_LEN); + { + std::random_device rand_device; + std::shuffle(stress_host.begin(), + stress_host.end(), + std::mt19937{rand_device()}); + } + work_res.memcpy(stress_pattern, + stress_host.data(), + sizeof(IdxType) * MAX_SHUFFLE_LEN); + + work_res.memset(barriers, 0, sizeof(IdxType) * grid_size / 2); + work_res.memset(data_stress, 0, sizeof(IdxType) * DATA_STRESS_SIZE); + +#if defined(RAJA_ENABLE_CUDA) + cudaErrchk(cudaDeviceSynchronize()); +#endif + +#if defined(RAJA_ENABLE_HIP) + hipErrchk(hipDeviceSynchronize()); +#endif + } + + void deallocate(camp::resources::Resource work_res) + { + work_res.deallocate(shuffle_block); + work_res.deallocate(stress_pattern); + work_res.deallocate(barriers); + work_res.deallocate(data_stress); + } + }; + + RAJA_HOST_DEVICE LitmusTestDriver() {} + + // Run + static void run() + { + constexpr IdxType BLOCK_SIZE = 256; + int num_blocks = 0; + { + LitmusPolicy dummy_policy{}; + TestData dummy_test_data{}; + auto lambda = [=] RAJA_HOST_DEVICE(IdxType index) { + LitmusTestDriver test_inst{}; + test_inst.test_main(index, dummy_test_data, dummy_policy); + }; + RAJA_UNUSED_VAR(lambda); +#ifdef RAJA_ENABLE_CUDA + cudaErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &num_blocks, dummy_kernel, BLOCK_SIZE, 0)); + num_blocks *= RAJA::cuda::device_prop().multiProcessorCount; +#endif +#ifdef RAJA_ENABLE_HIP + hipErrchk(hipOccupancyMaxActiveBlocksPerMultiprocessor( + &num_blocks, dummy_kernel, BLOCK_SIZE, 0)); + num_blocks *= RAJA::hip::device_prop().multiProcessorCount; +#endif + } + std::cout << "Got num_blocks = " << num_blocks + << ", block_size = 
" << BLOCK_SIZE << "\n" + << std::flush; + if (num_blocks == 0) { + FAIL() << "Grid size wasn't set to a valid value.\n"; + } + +#ifdef RAJA_ENABLE_CUDA + using ResourcePolicy = camp::resources::Cuda; +#endif +#ifdef RAJA_ENABLE_HIP + using ResourcePolicy = camp::resources::Hip; +#endif + camp::resources::Resource work_res{ResourcePolicy()}; + + TestData test_data; + test_data.allocate(work_res, num_blocks, BLOCK_SIZE, num_blocks); + + LitmusPolicy litmus_test; + litmus_test.allocate(work_res, num_blocks * BLOCK_SIZE * STRIDE); + +#ifdef RAJA_ENABLE_HIP + using GPUExec = RAJA::hip_exec; +#endif + +#ifdef RAJA_ENABLE_CUDA + using GPUExec = RAJA::cuda_exec; +#endif + + for (int iter = 0; iter < NUM_ITERS; iter++) { + test_data.pre_run(work_res); + litmus_test.pre_run(work_res); + + RAJA::forall( + RAJA::TypedRangeSegment(0, num_blocks * BLOCK_SIZE), + [=] RAJA_HOST_DEVICE(IdxType index) { + LitmusTestDriver test_inst{}; + test_inst.test_main(index, test_data, litmus_test); + }); + + litmus_test.count_results(work_res); + } + + litmus_test.verify(); + + litmus_test.deallocate(work_res); + test_data.deallocate(work_res); + } + +private: + using NormalAtomic = RAJA::atomic_relaxed; + + RAJA_HOST_DEVICE void test_main(IdxType index, + TestData param, + LitmusPolicy test) + { + IdxType block_idx = index / param.block_size; + IdxType thread_idx = index % param.block_size; + + // Permute the thread index, to promote scattering of memory accesses + // within a block. + IdxType permute_thread_idx = + (thread_idx * PERMUTE_PRIME_BLOCK) % param.block_size; + + // Shuffle the block ID randomly according to a permutation array. + block_idx = param.shuffle_block[block_idx]; + + IdxType data_idx = block_idx * param.block_size + permute_thread_idx; + data_idx *= STRIDE; + + if (block_idx < (IdxType)param.testing_blocks) { + // Block is a testing block. + // Each block acts as a "sender" to a unique "partner" block. This is done + // by permuting the block IDs with a function p(i) = i * k mod n, where n + // is the number of blocks being tested, and k and n are coprime. + int partner_idx = (block_idx * PERMUTE_PRIME_GRID) % param.testing_blocks; + + // Pre-stress pattern - stressing memory accesses before the test may + // increase the rate of weak memory behaviors + // this->stress(param.data_stress, block_idx, param.grid_size, 128); + + // Synchronize all blocks before testing, to increase the chance of + // interleaved requests. + this->sync(param.testing_blocks, thread_idx, param.barriers[0]); + + for (int i = 0; i < STRIDE; i++) { + // Run specified test, matching threads between the two paired blocks. + int other_data_idx = + partner_idx * param.block_size + permute_thread_idx; + other_data_idx *= STRIDE; + test.run(data_idx + i, other_data_idx + i); + } + } else { + // Blocks which aren't testing should just stress memory accesses. + // this->stress(param.data_stress, block_idx, param.grid_size, 1024); + } + }; + + RAJA_HOST_DEVICE void sync(int num_blocks, int thread_idx, IdxType& barrier) + { + if (thread_idx == 0) { + IdxType result = RAJA::atomicAdd(&barrier, IdxType{1}); + // Busy-wait until all blocks perform the above add. 
+ while (result != num_blocks) + result = RAJA::atomicAdd(&barrier, IdxType{0}); + } + +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_CODE__) + __syncthreads(); +#endif + } + + RAJA_HOST_DEVICE uint64_t get_rand(uint64_t& pcg_state) + { + uint64_t oldstate = pcg_state; + // Advance internal state + pcg_state = oldstate * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL; + // Calculate output function (XSH RR), uses old state for max ILP + uint32_t xorshifted = ((oldstate >> 18u) ^ oldstate) >> 27u; + uint32_t rot = oldstate >> 59u; + return (xorshifted >> rot) | (xorshifted << ((-rot) & 31)); + } + + RAJA_HOST_DEVICE void stress(IdxType* stress_data, + IdxType block_idx, + int grid_size, + int num_iters) + { + uint64_t pcg_state = block_idx; + for (int i = 0; i < num_iters; i++) { + // Pseudo-randomly target a given stress data location. + auto rand = get_rand(pcg_state); + auto target_line = rand % DATA_STRESS_SIZE; + + RAJA::atomicAdd(&(stress_data[target_line]), rand); + } + } +}; + +TYPED_TEST_SUITE_P(ForallAtomicLitmusTest); + +template +class ForallAtomicLitmusTest : public ::testing::Test +{ +}; + +TYPED_TEST_P(ForallAtomicLitmusTest, MessagePassingTest) +{ + using Type = typename camp::at>::type; + using SendRecvPol = typename camp::at>::type; + using SendPol = typename camp::at>::type; + using RecvPol = typename camp::at>::type; + + using MPTest = MessagePassingLitmus; + LitmusTestDriver::run(); +} + +REGISTER_TYPED_TEST_SUITE_P(ForallAtomicLitmusTest, MessagePassingTest); + +using MessagePassingTestTypes = Test::Types; + +INSTANTIATE_TYPED_TEST_SUITE_P(Hip, + ForallAtomicLitmusTest, + MessagePassingTestTypes); diff --git a/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-mp.hpp b/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-mp.hpp new file mode 100644 index 0000000000..993de18ee4 --- /dev/null +++ b/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-mp.hpp @@ -0,0 +1,168 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// +// test/include headers +// +#include "RAJA_test-atomic-types.hpp" +#include "RAJA_test-base.hpp" +#include "RAJA_test-camp.hpp" +#include "RAJA_test-forall-data.hpp" +#include "RAJA_test-forall-execpol.hpp" +#include "RAJA_test-index-types.hpp" + +// "Message Passing" litmus test for DESUL ordered atomic +// ------------------------------------------------------ +// Initial state: x = 0 && y = 0 +// +// Thread 1: Thread 2: +// ----------- ----------- +// store(x, 1) a = load(flag) +// store(flag, 1) b = load(x) +// +// Allowed results: +// ---------------- +// Strong behaviors: +// - a = 1, b = 1 +// - a = 0, b = 0 +// - a = 0, b = 1 +// Weak behavior: +// - a = 1, b = 0 +// +// On weak architectures (POWER/ARM/GPUs), the store to "x" can be reordered +// after the store to "flag". Store-release and load-acquire on the "flag" +// variable should prevent observing the weak behavior. 
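// For reference, the same pattern expressed with ISO C++ std::atomic
// (illustrative sketch only, not part of the test):
//
//   std::atomic<int> x{0}, flag{0};
//
//   // Thread 1 (send):
//   x.store(1, std::memory_order_relaxed);
//   flag.store(1, std::memory_order_release);
//
//   // Thread 2 (recv):
//   int a = flag.load(std::memory_order_acquire);
//   int b = x.load(std::memory_order_relaxed);
//
// If a == 1, the acquire load synchronizes with the release store, so the
// earlier store to x must also be visible and b == 1; the weak outcome
// (a = 1, b = 0) is therefore forbidden.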
+ +// Send policy: Relaxed (Weak), Acquire, AcqRel, SeqCst +// Recv policy: Relaxed (Weak), Release, AcqRel, SeqCst +template +struct MessagePassingLitmus { + using RelaxedPolicy = RAJA::atomic_relaxed; + size_t m_size; + T *x; + T *flag; + T *a; + T *b; + + int strong_behavior_0{0}; + int strong_behavior_1{0}; + int interleaved_behavior{0}; + int weak_behavior{0}; + + void allocate(camp::resources::Resource work_res, size_t size) + { + m_size = size; + x = work_res.allocate(size); + flag = work_res.allocate(size); + a = work_res.allocate(size); + b = work_res.allocate(size); + } + + void deallocate(camp::resources::Resource work_res) + { + work_res.deallocate(x); + work_res.deallocate(flag); + work_res.deallocate(a); + work_res.deallocate(b); + } + + void pre_run(camp::resources::Resource work_res) + { + work_res.memset(x, 0, sizeof(T) * m_size); + work_res.memset(flag, 0, sizeof(T) * m_size); + work_res.memset(a, 0, sizeof(T) * m_size); + work_res.memset(b, 0, sizeof(T) * m_size); + +#if defined(RAJA_ENABLE_CUDA) + cudaErrchk(cudaDeviceSynchronize()); +#endif + +#if defined(RAJA_ENABLE_HIP) + hipErrchk(hipDeviceSynchronize()); +#endif + } + + RAJA_HOST_DEVICE void run(int this_thread, int other_thread) + { + // Send action + x[other_thread] = T{1}; + RAJA::atomicExchange(&(flag[other_thread]), T{1}); + // Recv action + a[this_thread] = RAJA::atomicAdd(&(flag[this_thread]), T{0}); + b[this_thread] = x[this_thread]; + } + + void count_results(camp::resources::Resource work_res) + { + camp::resources::Resource host_res{camp::resources::Host()}; + + T *a_host = host_res.allocate(m_size); + T *b_host = host_res.allocate(m_size); + + work_res.memcpy(a_host, a, m_size * sizeof(T)); + work_res.memcpy(b_host, b, m_size * sizeof(T)); + +#if defined(RAJA_ENABLE_CUDA) + cudaErrchk(cudaDeviceSynchronize()); +#endif + +#if defined(RAJA_ENABLE_HIP) + hipErrchk(hipDeviceSynchronize()); +#endif + for (size_t i = 0; i < m_size; i++) { + if (a_host[i] == 0 && b_host[i] == 0) { + // Strong behavior: neither store from test_send is observable + strong_behavior_0++; + } else if (a_host[i] == 1 && b_host[i] == 1) { + // Strong behavior: both stores from test_send are observable + strong_behavior_1++; + } else if (a_host[i] == 0 && b_host[i] == 1) { + // Strong behavior: stores interleaved with receives + interleaved_behavior++; + } else if (a_host[i] == 1 && b_host[i] == 0) { + // Weak behavior: second store observed before first store + weak_behavior++; + } else { + FAIL() << "Unexpected result for index " << i; + } + } + + host_res.deallocate(a_host); + host_res.deallocate(b_host); + } + + void verify() + { + std::cout << " - Strong behavior (a = 0, b = 0) = " << strong_behavior_0 + << "\n"; + std::cout << " - Strong behavior (a = 1, b = 1) = " << strong_behavior_1 + << "\n"; + std::cout << " - Strong behavior (a = 0, b = 1) = " << interleaved_behavior + << "\n"; + std::cout << " - Weak behaviors = " << weak_behavior << "\n"; + + if (std::is_same::value && + std::is_same::value) { + // In the relaxed case, we should observe some weak behaviors. + ASSERT_GT(weak_behavior, 0); + } else { + // We should not expect any weak behaviors if using a strong ordering. 
+ EXPECT_EQ(weak_behavior, 0); + } + } +}; + +using MPLitmusTestOrderPols = + camp::list, + camp::list, + camp::list, + camp::list >; + +using MPLitmusTestRecvPols = camp::list; + +using MPLitmusTestPols = + camp::cartesian_product; From 4be5c8cb62fe89d063afa6463b481a31aeaf50c3 Mon Sep 17 00:00:00 2001 From: Max Yang Date: Sat, 16 Mar 2024 12:10:45 -0700 Subject: [PATCH 03/14] Remove stress_pattern array --- .../test-forall-atomic-litmus.cpp | 21 ++----------------- 1 file changed, 2 insertions(+), 19 deletions(-) diff --git a/test/functional/forall/atomic-basic/test-forall-atomic-litmus.cpp b/test/functional/forall/atomic-basic/test-forall-atomic-litmus.cpp index a55340c148..41e76c87dd 100644 --- a/test/functional/forall/atomic-basic/test-forall-atomic-litmus.cpp +++ b/test/functional/forall/atomic-basic/test-forall-atomic-litmus.cpp @@ -55,8 +55,6 @@ struct LitmusTestDriver { // Array to shuffle block indices in a test kernel. IdxType* shuffle_block; - // Pattern to use for memory stressing. - IdxType* stress_pattern; // Barrier integers to synchronize testing threads. IdxType* barriers; @@ -73,8 +71,7 @@ struct LitmusTestDriver { testing_blocks = num_testing_blocks; shuffle_block = work_res.allocate(grid_size); - stress_pattern = work_res.allocate(MAX_SHUFFLE_LEN); - barriers = work_res.allocate(grid_size / 2); + barriers = work_res.allocate(STRIDE); data_stress = work_res.allocate(DATA_STRESS_SIZE); } @@ -93,20 +90,7 @@ struct LitmusTestDriver { shuffle_block_host.data(), sizeof(IdxType) * grid_size); - // Create a random sequence of 1's and 0's, equally balanced between the - // two - std::vector stress_host(MAX_SHUFFLE_LEN); - { - std::random_device rand_device; - std::shuffle(stress_host.begin(), - stress_host.end(), - std::mt19937{rand_device()}); - } - work_res.memcpy(stress_pattern, - stress_host.data(), - sizeof(IdxType) * MAX_SHUFFLE_LEN); - - work_res.memset(barriers, 0, sizeof(IdxType) * grid_size / 2); + work_res.memset(barriers, 0, sizeof(IdxType) * STRIDE); work_res.memset(data_stress, 0, sizeof(IdxType) * DATA_STRESS_SIZE); #if defined(RAJA_ENABLE_CUDA) @@ -121,7 +105,6 @@ struct LitmusTestDriver { void deallocate(camp::resources::Resource work_res) { work_res.deallocate(shuffle_block); - work_res.deallocate(stress_pattern); work_res.deallocate(barriers); work_res.deallocate(data_stress); } From e73d52ae5ef61171217341319eb263f318e1b1e6 Mon Sep 17 00:00:00 2001 From: Max Yang Date: Sat, 16 Mar 2024 12:16:32 -0700 Subject: [PATCH 04/14] Increase observability of weak memory behaviors Fiddling around with some parameters for the litmus test driver: - It seems that having only a subset of the running blocks participate in the Message Passing litmus test increases the rate at which weak memory behaviors are observed. - Pre-stressing memory doesn't seem to help on NVIDIA V100s. 
--- .../test-forall-atomic-litmus.cpp | 55 ++++++++++++------- .../tests/test-forall-atomic-litmus-mp.hpp | 2 +- 2 files changed, 36 insertions(+), 21 deletions(-) diff --git a/test/functional/forall/atomic-basic/test-forall-atomic-litmus.cpp b/test/functional/forall/atomic-basic/test-forall-atomic-litmus.cpp index 41e76c87dd..d30f3221b8 100644 --- a/test/functional/forall/atomic-basic/test-forall-atomic-litmus.cpp +++ b/test/functional/forall/atomic-basic/test-forall-atomic-litmus.cpp @@ -27,13 +27,20 @@ #include "test-forall-atomic-litmus-mp.hpp" using IdxType = size_t; -constexpr int NUM_ITERS = 100; -constexpr int MAX_SHUFFLE_LEN = 1024; -constexpr int STRIDE = 8; +constexpr int NUM_ITERS = 20; +#ifdef RAJA_ENABLE_CUDA +constexpr int STRIDE = 4; +constexpr bool STRESS_BEFORE_TEST = false; +constexpr bool NONTESTING_BLOCKS = true; +#elif defined(RAJA_ENABLE_HIP) +constexpr int STRIDE = 16; +constexpr bool STRESS_BEFORE_TEST = true; +constexpr bool NONTESTING_BLOCKS = true; +#endif constexpr int DATA_STRESS_SIZE = 2048 * STRIDE; -constexpr int PERMUTE_PRIME_BLOCK = 11; -constexpr int PERMUTE_PRIME_GRID = 31; +constexpr int PERMUTE_PRIME_BLOCK = 17; +constexpr int PERMUTE_PRIME_GRID = 47; template __global__ void dummy_kernel(IdxType index, Func func) @@ -115,7 +122,7 @@ struct LitmusTestDriver { // Run static void run() { - constexpr IdxType BLOCK_SIZE = 256; + constexpr IdxType BLOCK_SIZE = 128; int num_blocks = 0; { LitmusPolicy dummy_policy{}; @@ -151,11 +158,16 @@ struct LitmusTestDriver { #endif camp::resources::Resource work_res{ResourcePolicy()}; + int num_testing_blocks = num_blocks; + if (NONTESTING_BLOCKS) { + num_testing_blocks = num_blocks / 4; + } + TestData test_data; - test_data.allocate(work_res, num_blocks, BLOCK_SIZE, num_blocks); + test_data.allocate(work_res, num_blocks, BLOCK_SIZE, num_testing_blocks); LitmusPolicy litmus_test; - litmus_test.allocate(work_res, num_blocks * BLOCK_SIZE * STRIDE); + litmus_test.allocate(work_res, num_testing_blocks * BLOCK_SIZE * STRIDE); #ifdef RAJA_ENABLE_HIP using GPUExec = RAJA::hip_exec; @@ -213,24 +225,26 @@ struct LitmusTestDriver { // is the number of blocks being tested, and k and n are coprime. int partner_idx = (block_idx * PERMUTE_PRIME_GRID) % param.testing_blocks; - // Pre-stress pattern - stressing memory accesses before the test may - // increase the rate of weak memory behaviors - // this->stress(param.data_stress, block_idx, param.grid_size, 128); - - // Synchronize all blocks before testing, to increase the chance of - // interleaved requests. - this->sync(param.testing_blocks, thread_idx, param.barriers[0]); - for (int i = 0; i < STRIDE; i++) { // Run specified test, matching threads between the two paired blocks. int other_data_idx = partner_idx * param.block_size + permute_thread_idx; other_data_idx *= STRIDE; + + // Pre-stress pattern - stressing memory accesses before the test may + // increase the rate of weak memory behaviors + // Helps on AMD, doesn't seem to help on NVIDIA + if (STRESS_BEFORE_TEST) { + this->stress( + param.data_stress, block_idx, thread_idx, param.grid_size, 128); + } + + // Synchronize all blocks before testing, to increase the chance of + // interleaved requests. + this->sync(param.testing_blocks, thread_idx, param.barriers[i]); + test.run(data_idx + i, other_data_idx + i); } - } else { - // Blocks which aren't testing should just stress memory accesses. 
- // this->stress(param.data_stress, block_idx, param.grid_size, 1024); } }; @@ -261,6 +275,7 @@ struct LitmusTestDriver { RAJA_HOST_DEVICE void stress(IdxType* stress_data, IdxType block_idx, + IdxType thread_idx, int grid_size, int num_iters) { @@ -268,7 +283,7 @@ struct LitmusTestDriver { for (int i = 0; i < num_iters; i++) { // Pseudo-randomly target a given stress data location. auto rand = get_rand(pcg_state); - auto target_line = rand % DATA_STRESS_SIZE; + auto target_line = (rand + thread_idx) % DATA_STRESS_SIZE; RAJA::atomicAdd(&(stress_data[target_line]), rand); } diff --git a/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-mp.hpp b/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-mp.hpp index 993de18ee4..8a4bb1e495 100644 --- a/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-mp.hpp +++ b/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-mp.hpp @@ -90,7 +90,7 @@ struct MessagePassingLitmus { { // Send action x[other_thread] = T{1}; - RAJA::atomicExchange(&(flag[other_thread]), T{1}); + RAJA::atomicAdd(&(flag[other_thread]), T{1}); // Recv action a[this_thread] = RAJA::atomicAdd(&(flag[this_thread]), T{0}); b[this_thread] = x[this_thread]; From e0dea11fa9d8b5f4e1ed20ab45084e59a2180167 Mon Sep 17 00:00:00 2001 From: Max Yang Date: Sat, 16 Mar 2024 12:21:33 -0700 Subject: [PATCH 05/14] Add a store buffer litmus test Store buffering is an observable behavior where a store may be reordered after a load. This exercises MemoryOrderSeqCst. --- .../test-forall-atomic-litmus.cpp | 35 +++- .../tests/test-forall-atomic-litmus-sb.hpp | 164 ++++++++++++++++++ 2 files changed, 194 insertions(+), 5 deletions(-) create mode 100644 test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-sb.hpp diff --git a/test/functional/forall/atomic-basic/test-forall-atomic-litmus.cpp b/test/functional/forall/atomic-basic/test-forall-atomic-litmus.cpp index d30f3221b8..dbf5201fc2 100644 --- a/test/functional/forall/atomic-basic/test-forall-atomic-litmus.cpp +++ b/test/functional/forall/atomic-basic/test-forall-atomic-litmus.cpp @@ -25,6 +25,7 @@ #include #include "test-forall-atomic-litmus-mp.hpp" +#include "test-forall-atomic-litmus-sb.hpp" using IdxType = size_t; constexpr int NUM_ITERS = 20; @@ -290,14 +291,14 @@ struct LitmusTestDriver { } }; -TYPED_TEST_SUITE_P(ForallAtomicLitmusTest); +TYPED_TEST_SUITE_P(ForallAtomicLitmusTestMP); template -class ForallAtomicLitmusTest : public ::testing::Test +class ForallAtomicLitmusTestMP : public ::testing::Test { }; -TYPED_TEST_P(ForallAtomicLitmusTest, MessagePassingTest) +TYPED_TEST_P(ForallAtomicLitmusTestMP, MessagePassingTest) { using Type = typename camp::at>::type; using SendRecvPol = typename camp::at>::type; @@ -308,10 +309,34 @@ TYPED_TEST_P(ForallAtomicLitmusTest, MessagePassingTest) LitmusTestDriver::run(); } -REGISTER_TYPED_TEST_SUITE_P(ForallAtomicLitmusTest, MessagePassingTest); +REGISTER_TYPED_TEST_SUITE_P(ForallAtomicLitmusTestMP, MessagePassingTest); using MessagePassingTestTypes = Test::Types; INSTANTIATE_TYPED_TEST_SUITE_P(Hip, - ForallAtomicLitmusTest, + ForallAtomicLitmusTestMP, MessagePassingTestTypes); + +TYPED_TEST_SUITE_P(ForallAtomicLitmusTestSB); + +template +class ForallAtomicLitmusTestSB : public ::testing::Test +{ +}; + +TYPED_TEST_P(ForallAtomicLitmusTestSB, StoreBufferTest) +{ + using Type = typename camp::at>::type; + using AtomicPol = typename camp::at>::type; + + using SBTest = StoreBufferLitmus; + LitmusTestDriver::run(); +} + 
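// The store-buffer pattern exercised above, expressed with ISO C++
// std::atomic (illustrative sketch only, not part of the test):
//
//   std::atomic<int> x{0}, y{0};
//
//   // Thread 1:
//   x.store(1, std::memory_order_seq_cst);
//   int a = y.load(std::memory_order_seq_cst);
//
//   // Thread 2:
//   y.store(1, std::memory_order_seq_cst);
//   int b = x.load(std::memory_order_seq_cst);
//
// seq_cst places all four operations in a single total order that respects
// each thread's program order, so at least one load must observe the other
// thread's store; the outcome a = 0, b = 0 is forbidden. Acquire/release
// alone does not rule it out.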
+REGISTER_TYPED_TEST_SUITE_P(ForallAtomicLitmusTestSB, StoreBufferTest); + +using StoreBufferTestTypes = Test::Types; + +INSTANTIATE_TYPED_TEST_SUITE_P(Hip, + ForallAtomicLitmusTestSB, + StoreBufferTestTypes); diff --git a/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-sb.hpp b/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-sb.hpp new file mode 100644 index 0000000000..3498afa2a4 --- /dev/null +++ b/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-sb.hpp @@ -0,0 +1,164 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// +// test/include headers +// +#include "RAJA_test-atomic-types.hpp" +#include "RAJA_test-base.hpp" +#include "RAJA_test-camp.hpp" +#include "RAJA_test-forall-data.hpp" +#include "RAJA_test-forall-execpol.hpp" +#include "RAJA_test-index-types.hpp" + +// "Store buffer" litmus test for DESUL ordered atomic +// --------------------------------------------------- +// Initial state: x = 0 && y = 0 +// +// Thread 1: Thread 2: +// ----------- ----------- +// store(x, 1) store(y, 1) +// a = load(y) b = load(x) +// +// Allowed results: +// ---------------- +// Strong behaviors: +// - a = 1, b = 1 +// - a = 0, b = 1 +// - a = 1, b = 0 +// Weak behavior: +// - a = 0, b = 0 +// +// Acquire-release semantics are not enough to disallow the stores to be +// reordered after the load -- full sequential consistency is required in order +// to impose a "single total order" of the stores. + +// Send policy: Relaxed, SeqCst (Strong) +// Recv policy: Relaxed, SeqCst (Strong) +template +struct StoreBufferLitmus { + using RelaxedPolicy = RAJA::atomic_relaxed; + size_t m_size; + T *x; + T *y; + T *a; + T *b; + + int strong_behavior_0{0}; + int strong_behavior_1{0}; + int interleaved_behavior{0}; + int weak_behavior{0}; + + void allocate(camp::resources::Resource work_res, size_t size) + { + m_size = size; + x = work_res.allocate(size); + y = work_res.allocate(size); + a = work_res.allocate(size); + b = work_res.allocate(size); + } + + void deallocate(camp::resources::Resource work_res) + { + work_res.deallocate(x); + work_res.deallocate(y); + work_res.deallocate(a); + work_res.deallocate(b); + } + + void pre_run(camp::resources::Resource work_res) + { + work_res.memset(x, 0, sizeof(T) * m_size); + work_res.memset(y, 0, sizeof(T) * m_size); + work_res.memset(a, 0, sizeof(T) * m_size); + work_res.memset(b, 0, sizeof(T) * m_size); + +#if defined(RAJA_ENABLE_CUDA) + cudaErrchk(cudaDeviceSynchronize()); +#endif + +#if defined(RAJA_ENABLE_HIP) + hipErrchk(hipDeviceSynchronize()); +#endif + } + + RAJA_HOST_DEVICE void run(int this_thread, int other_thread) + { + // Store-buffer 1 + RAJA::atomicAdd(&(x[other_thread]), T{1}); + // a[other_thread] = RAJA::atomicAdd(&(y[other_thread]), + // T{0}); + a[other_thread] = y[other_thread]; + // Store-buffer 2 + RAJA::atomicAdd(&(y[this_thread]), T{1}); + b[this_thread] = x[this_thread]; + } + + void count_results(camp::resources::Resource work_res) + { + camp::resources::Resource host_res{camp::resources::Host()}; + + T *a_host = host_res.allocate(m_size); + T *b_host = host_res.allocate(m_size); + + work_res.memcpy(a_host, a, m_size * sizeof(T)); + work_res.memcpy(b_host, b, m_size * sizeof(T)); + 
+#if defined(RAJA_ENABLE_CUDA) + cudaErrchk(cudaDeviceSynchronize()); +#endif + +#if defined(RAJA_ENABLE_HIP) + hipErrchk(hipDeviceSynchronize()); +#endif + for (size_t i = 0; i < m_size; i++) { + if (a_host[i] == 1 && b_host[i] == 0) { + // Strong behavior: thread 1 happened before thread 2 + strong_behavior_0++; + } else if (a_host[i] == 0 && b_host[i] == 1) { + // Strong behavior: thread 2 happened before thread 1 + strong_behavior_1++; + } else if (a_host[i] == 1 && b_host[i] == 1) { + // Strong behavior: stores interleaved with receives + interleaved_behavior++; + } else if (a_host[i] == 0 && b_host[i] == 0) { + // Weak behavior: stores reordered after receives + weak_behavior++; + } else { + FAIL() << "Unexpected result for index " << i; + } + } + + host_res.deallocate(a_host); + host_res.deallocate(b_host); + } + + void verify() + { + std::cout << " - Strong behavior (a = 1, b = 0) = " << strong_behavior_0 + << "\n"; + std::cout << " - Strong behavior (a = 0, b = 1) = " << strong_behavior_1 + << "\n"; + std::cout << " - Strong behavior (a = 1, b = 1) = " << interleaved_behavior + << "\n"; + std::cout << " - Weak behaviors = " << weak_behavior << "\n"; + + if (std::is_same::value) { + // In the relaxed case, we should observe some weak behaviors. + ASSERT_GT(weak_behavior, 0); + } else { + // We should not expect any weak behaviors if using a strong ordering. + EXPECT_EQ(weak_behavior, 0); + } + } +}; + +using SBLitmusTestOrderPols = + camp::list; + +using SBLitmusTestPols = + camp::cartesian_product; From 35a7a11f56f9578f0a755bcdd1fee25813643bb1 Mon Sep 17 00:00:00 2001 From: Max Yang Date: Mon, 18 Mar 2024 13:54:51 -0700 Subject: [PATCH 06/14] Do strided index calculations within policy --- .../test-forall-atomic-litmus.cpp | 6 +-- .../tests/test-forall-atomic-litmus-mp.hpp | 43 ++++++++++-------- .../tests/test-forall-atomic-litmus-sb.hpp | 44 ++++++++++--------- 3 files changed, 49 insertions(+), 44 deletions(-) diff --git a/test/functional/forall/atomic-basic/test-forall-atomic-litmus.cpp b/test/functional/forall/atomic-basic/test-forall-atomic-litmus.cpp index dbf5201fc2..ce02567cb5 100644 --- a/test/functional/forall/atomic-basic/test-forall-atomic-litmus.cpp +++ b/test/functional/forall/atomic-basic/test-forall-atomic-litmus.cpp @@ -168,7 +168,7 @@ struct LitmusTestDriver { test_data.allocate(work_res, num_blocks, BLOCK_SIZE, num_testing_blocks); LitmusPolicy litmus_test; - litmus_test.allocate(work_res, num_testing_blocks * BLOCK_SIZE * STRIDE); + litmus_test.allocate(work_res, num_testing_blocks * BLOCK_SIZE, STRIDE); #ifdef RAJA_ENABLE_HIP using GPUExec = RAJA::hip_exec; @@ -217,7 +217,6 @@ struct LitmusTestDriver { block_idx = param.shuffle_block[block_idx]; IdxType data_idx = block_idx * param.block_size + permute_thread_idx; - data_idx *= STRIDE; if (block_idx < (IdxType)param.testing_blocks) { // Block is a testing block. @@ -230,7 +229,6 @@ struct LitmusTestDriver { // Run specified test, matching threads between the two paired blocks. int other_data_idx = partner_idx * param.block_size + permute_thread_idx; - other_data_idx *= STRIDE; // Pre-stress pattern - stressing memory accesses before the test may // increase the rate of weak memory behaviors @@ -244,7 +242,7 @@ struct LitmusTestDriver { // interleaved requests. 
this->sync(param.testing_blocks, thread_idx, param.barriers[i]); - test.run(data_idx + i, other_data_idx + i); + test.run(data_idx, other_data_idx, i); } } }; diff --git a/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-mp.hpp b/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-mp.hpp index 8a4bb1e495..b8f25ca9df 100644 --- a/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-mp.hpp +++ b/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-mp.hpp @@ -43,6 +43,7 @@ template struct MessagePassingLitmus { using RelaxedPolicy = RAJA::atomic_relaxed; size_t m_size; + int m_stride; T *x; T *flag; T *a; @@ -53,13 +54,14 @@ struct MessagePassingLitmus { int interleaved_behavior{0}; int weak_behavior{0}; - void allocate(camp::resources::Resource work_res, size_t size) + void allocate(camp::resources::Resource work_res, size_t size, int stride) { m_size = size; - x = work_res.allocate(size); - flag = work_res.allocate(size); - a = work_res.allocate(size); - b = work_res.allocate(size); + m_stride = stride; + x = work_res.allocate(size * stride); + flag = work_res.allocate(size * stride); + a = work_res.allocate(size * stride); + b = work_res.allocate(size * stride); } void deallocate(camp::resources::Resource work_res) @@ -72,10 +74,10 @@ struct MessagePassingLitmus { void pre_run(camp::resources::Resource work_res) { - work_res.memset(x, 0, sizeof(T) * m_size); - work_res.memset(flag, 0, sizeof(T) * m_size); - work_res.memset(a, 0, sizeof(T) * m_size); - work_res.memset(b, 0, sizeof(T) * m_size); + work_res.memset(x, 0, sizeof(T) * m_size * m_stride); + work_res.memset(flag, 0, sizeof(T) * m_size * m_stride); + work_res.memset(a, 0, sizeof(T) * m_size * m_stride); + work_res.memset(b, 0, sizeof(T) * m_size * m_stride); #if defined(RAJA_ENABLE_CUDA) cudaErrchk(cudaDeviceSynchronize()); @@ -86,25 +88,28 @@ struct MessagePassingLitmus { #endif } - RAJA_HOST_DEVICE void run(int this_thread, int other_thread) + RAJA_HOST_DEVICE void run(int this_thread, int other_thread, int iter) { + int this_thread_idx = this_thread * m_stride + iter; + int other_thread_idx = other_thread * m_stride + iter; // Send action - x[other_thread] = T{1}; - RAJA::atomicAdd(&(flag[other_thread]), T{1}); + x[other_thread_idx] = T{1}; + RAJA::atomicAdd(&(flag[other_thread_idx]), T{1}); // Recv action - a[this_thread] = RAJA::atomicAdd(&(flag[this_thread]), T{0}); - b[this_thread] = x[this_thread]; + a[this_thread_idx] = + RAJA::atomicAdd(&(flag[this_thread_idx]), T{0}); + b[this_thread_idx] = x[this_thread_idx]; } void count_results(camp::resources::Resource work_res) { camp::resources::Resource host_res{camp::resources::Host()}; - T *a_host = host_res.allocate(m_size); - T *b_host = host_res.allocate(m_size); + T *a_host = host_res.allocate(m_size * m_stride); + T *b_host = host_res.allocate(m_size * m_stride); - work_res.memcpy(a_host, a, m_size * sizeof(T)); - work_res.memcpy(b_host, b, m_size * sizeof(T)); + work_res.memcpy(a_host, a, m_size * m_stride * sizeof(T)); + work_res.memcpy(b_host, b, m_size * m_stride * sizeof(T)); #if defined(RAJA_ENABLE_CUDA) cudaErrchk(cudaDeviceSynchronize()); @@ -113,7 +118,7 @@ struct MessagePassingLitmus { #if defined(RAJA_ENABLE_HIP) hipErrchk(hipDeviceSynchronize()); #endif - for (size_t i = 0; i < m_size; i++) { + for (size_t i = 0; i < m_size * m_stride; i++) { if (a_host[i] == 0 && b_host[i] == 0) { // Strong behavior: neither store from test_send is observable strong_behavior_0++; diff --git 
a/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-sb.hpp b/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-sb.hpp index 3498afa2a4..efbd11740d 100644 --- a/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-sb.hpp +++ b/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-sb.hpp @@ -43,6 +43,7 @@ template struct StoreBufferLitmus { using RelaxedPolicy = RAJA::atomic_relaxed; size_t m_size; + int m_stride; T *x; T *y; T *a; @@ -53,13 +54,14 @@ struct StoreBufferLitmus { int interleaved_behavior{0}; int weak_behavior{0}; - void allocate(camp::resources::Resource work_res, size_t size) + void allocate(camp::resources::Resource work_res, size_t size, int stride) { m_size = size; - x = work_res.allocate(size); - y = work_res.allocate(size); - a = work_res.allocate(size); - b = work_res.allocate(size); + m_stride = stride; + x = work_res.allocate(size * stride); + y = work_res.allocate(size * stride); + a = work_res.allocate(size * stride); + b = work_res.allocate(size * stride); } void deallocate(camp::resources::Resource work_res) @@ -72,10 +74,10 @@ struct StoreBufferLitmus { void pre_run(camp::resources::Resource work_res) { - work_res.memset(x, 0, sizeof(T) * m_size); - work_res.memset(y, 0, sizeof(T) * m_size); - work_res.memset(a, 0, sizeof(T) * m_size); - work_res.memset(b, 0, sizeof(T) * m_size); + work_res.memset(x, 0, sizeof(T) * m_size * m_stride); + work_res.memset(y, 0, sizeof(T) * m_size * m_stride); + work_res.memset(a, 0, sizeof(T) * m_size * m_stride); + work_res.memset(b, 0, sizeof(T) * m_size * m_stride); #if defined(RAJA_ENABLE_CUDA) cudaErrchk(cudaDeviceSynchronize()); @@ -86,27 +88,27 @@ struct StoreBufferLitmus { #endif } - RAJA_HOST_DEVICE void run(int this_thread, int other_thread) + RAJA_HOST_DEVICE void run(int this_thread, int other_thread, int iter) { + int this_thread_idx = this_thread * m_stride + iter; + int other_thread_idx = other_thread * m_stride + iter; // Store-buffer 1 - RAJA::atomicAdd(&(x[other_thread]), T{1}); - // a[other_thread] = RAJA::atomicAdd(&(y[other_thread]), - // T{0}); - a[other_thread] = y[other_thread]; + RAJA::atomicAdd(&(x[other_thread_idx]), T{1}); + a[other_thread_idx] = y[other_thread_idx]; // Store-buffer 2 - RAJA::atomicAdd(&(y[this_thread]), T{1}); - b[this_thread] = x[this_thread]; + RAJA::atomicAdd(&(y[this_thread_idx]), T{1}); + b[this_thread_idx] = x[this_thread_idx]; } void count_results(camp::resources::Resource work_res) { camp::resources::Resource host_res{camp::resources::Host()}; - T *a_host = host_res.allocate(m_size); - T *b_host = host_res.allocate(m_size); + T *a_host = host_res.allocate(m_size * m_stride); + T *b_host = host_res.allocate(m_size * m_stride); - work_res.memcpy(a_host, a, m_size * sizeof(T)); - work_res.memcpy(b_host, b, m_size * sizeof(T)); + work_res.memcpy(a_host, a, m_size * m_stride * sizeof(T)); + work_res.memcpy(b_host, b, m_size * m_stride * sizeof(T)); #if defined(RAJA_ENABLE_CUDA) cudaErrchk(cudaDeviceSynchronize()); @@ -115,7 +117,7 @@ struct StoreBufferLitmus { #if defined(RAJA_ENABLE_HIP) hipErrchk(hipDeviceSynchronize()); #endif - for (size_t i = 0; i < m_size; i++) { + for (size_t i = 0; i < m_size * m_stride; i++) { if (a_host[i] == 1 && b_host[i] == 0) { // Strong behavior: thread 1 happened before thread 2 strong_behavior_0++; From d32bf9895974d4e332676b57bb18cc95179b3fa8 Mon Sep 17 00:00:00 2001 From: Max Yang Date: Tue, 19 Mar 2024 12:44:03 -0700 Subject: [PATCH 07/14] Litmus tests: various modifications - Use a 
forall device kernel to check results - Interleave order of operations between testing threads - Only warn on a lack of observed relaxed behaviors --- .../tests/test-forall-atomic-litmus-mp.hpp | 123 ++++++++++++------ .../tests/test-forall-atomic-litmus-sb.hpp | 108 ++++++++++----- 2 files changed, 155 insertions(+), 76 deletions(-) diff --git a/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-mp.hpp b/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-mp.hpp index b8f25ca9df..64deb1beb3 100644 --- a/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-mp.hpp +++ b/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-mp.hpp @@ -41,7 +41,9 @@ // Recv policy: Relaxed (Weak), Release, AcqRel, SeqCst template struct MessagePassingLitmus { + using DataType = T; using RelaxedPolicy = RAJA::atomic_relaxed; + constexpr static int PERMUTE_THREAD_FLAG = 97; size_t m_size; int m_stride; T *x; @@ -49,10 +51,10 @@ struct MessagePassingLitmus { T *a; T *b; - int strong_behavior_0{0}; - int strong_behavior_1{0}; - int interleaved_behavior{0}; - int weak_behavior{0}; + size_t strong_behavior_0{0}; + size_t strong_behavior_1{0}; + size_t interleaved_behavior{0}; + size_t weak_behavior{0}; void allocate(camp::resources::Resource work_res, size_t size, int stride) { @@ -90,70 +92,111 @@ struct MessagePassingLitmus { RAJA_HOST_DEVICE void run(int this_thread, int other_thread, int iter) { - int this_thread_idx = this_thread * m_stride + iter; - int other_thread_idx = other_thread * m_stride + iter; + bool send_first = (this_thread % 2 == 0); // Send action - x[other_thread_idx] = T{1}; - RAJA::atomicAdd(&(flag[other_thread_idx]), T{1}); - // Recv action - a[this_thread_idx] = - RAJA::atomicAdd(&(flag[this_thread_idx]), T{0}); - b[this_thread_idx] = x[this_thread_idx]; + if (send_first) { + this->run_send(other_thread, iter); + this->run_recv(this_thread, iter); + } else { + this->run_recv(this_thread, iter); + this->run_send(other_thread, iter); + } } - void count_results(camp::resources::Resource work_res) + RAJA_HOST_DEVICE void run_send(int other_thread, int iter) { - camp::resources::Resource host_res{camp::resources::Host()}; + int other_thread_idx = other_thread * m_stride + iter; + int permute_other_thread = (other_thread * PERMUTE_THREAD_FLAG) % m_size; + int permute_idx = permute_other_thread * m_stride + iter; + RAJA::atomicAdd(&(x[other_thread_idx]), T{1}); + RAJA::atomicAdd(&(flag[permute_idx]), T{1}); + } - T *a_host = host_res.allocate(m_size * m_stride); - T *b_host = host_res.allocate(m_size * m_stride); + RAJA_HOST_DEVICE void run_recv(int this_thread, int iter) + { + int this_thread_idx = this_thread * m_stride + iter; + int permute_this_thread = (this_thread * PERMUTE_THREAD_FLAG) % m_size; + int permute_idx = permute_this_thread * m_stride + iter; + a[this_thread_idx] = + RAJA::atomicAdd(&(flag[permute_idx]), T{0}); + b[this_thread_idx] = + RAJA::atomicAdd(&(x[this_thread_idx]), T{0}); + } - work_res.memcpy(a_host, a, m_size * m_stride * sizeof(T)); - work_res.memcpy(b_host, b, m_size * m_stride * sizeof(T)); + void count_results(camp::resources::Resource work_res) + { -#if defined(RAJA_ENABLE_CUDA) - cudaErrchk(cudaDeviceSynchronize()); +#ifdef RAJA_ENABLE_HIP + using GPUExec = RAJA::hip_exec<256>; + using ReducePolicy = RAJA::hip_reduce; #endif -#if defined(RAJA_ENABLE_HIP) - hipErrchk(hipDeviceSynchronize()); +#ifdef RAJA_ENABLE_CUDA + using GPUExec = RAJA::cuda_exec<256>; + using ReducePolicy = RAJA::cuda_reduce; #endif 
- for (size_t i = 0; i < m_size * m_stride; i++) { - if (a_host[i] == 0 && b_host[i] == 0) { + RAJA::ReduceSum strong_cnt_0(0); + RAJA::ReduceSum strong_cnt_1(0); + RAJA::ReduceSum interleaved_cnt(0); + RAJA::ReduceSum weak_cnt(0); + RAJA::ReduceSum unexpected_cnt(0); + + T *a_local = a; + T *b_local = b; + + auto forall_len = RAJA::TypedRangeSegment(0, m_size * m_stride); + + RAJA::forall(forall_len, [=] RAJA_HOST_DEVICE(int i) { + if (a_local[i] == 0 && b_local[i] == 0) { // Strong behavior: neither store from test_send is observable - strong_behavior_0++; - } else if (a_host[i] == 1 && b_host[i] == 1) { + strong_cnt_0 += 1; + } else if (a_local[i] == 1 && b_local[i] == 1) { // Strong behavior: both stores from test_send are observable - strong_behavior_1++; - } else if (a_host[i] == 0 && b_host[i] == 1) { + strong_cnt_1 += 1; + } else if (a_local[i] == 0 && b_local[i] == 1) { // Strong behavior: stores interleaved with receives - interleaved_behavior++; - } else if (a_host[i] == 1 && b_host[i] == 0) { + interleaved_cnt += 1; + } else if (a_local[i] == 1 && b_local[i] == 0) { // Weak behavior: second store observed before first store - weak_behavior++; + weak_cnt += 1; } else { - FAIL() << "Unexpected result for index " << i; + unexpected_cnt += 1; } - } + }); - host_res.deallocate(a_host); - host_res.deallocate(b_host); + EXPECT_EQ(unexpected_cnt.get(), 0); + + strong_behavior_0 += strong_cnt_0.get(); + strong_behavior_1 += strong_cnt_1.get(); + interleaved_behavior += interleaved_cnt.get(); + weak_behavior += weak_cnt.get(); } void verify() { - std::cout << " - Strong behavior (a = 0, b = 0) = " << strong_behavior_0 + std::cerr << " - Strong behavior (a = 0, b = 0) = " << strong_behavior_0 << "\n"; - std::cout << " - Strong behavior (a = 1, b = 1) = " << strong_behavior_1 + std::cerr << " - Strong behavior (a = 1, b = 1) = " << strong_behavior_1 << "\n"; - std::cout << " - Strong behavior (a = 0, b = 1) = " << interleaved_behavior + std::cerr << " - Strong behavior (a = 0, b = 1) = " << interleaved_behavior << "\n"; - std::cout << " - Weak behaviors = " << weak_behavior << "\n"; + std::cerr << " - Weak behaviors = " << weak_behavior << "\n"; if (std::is_same::value && std::is_same::value) { // In the relaxed case, we should observe some weak behaviors. - ASSERT_GT(weak_behavior, 0); + // Don't fail the test, but do print out a message. + if (weak_behavior == 0) { + std::cerr << "Warning - no weak behaviors detected in the control case." + << "\nThis litmus test may be insufficient to exercise " + "ordered memory atomics.\n"; + } else { + double overall_behavior_counts = strong_behavior_0 + strong_behavior_1 + + interleaved_behavior + weak_behavior; + std::cerr << "\n Weak behaviors detected in " + << 100 * (weak_behavior / overall_behavior_counts) + << "% of cases.\n"; + } } else { // We should not expect any weak behaviors if using a strong ordering. 
EXPECT_EQ(weak_behavior, 0); diff --git a/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-sb.hpp b/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-sb.hpp index efbd11740d..c3cde1e760 100644 --- a/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-sb.hpp +++ b/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-sb.hpp @@ -41,6 +41,7 @@ // Recv policy: Relaxed, SeqCst (Strong) template struct StoreBufferLitmus { + using DataType = T; using RelaxedPolicy = RAJA::atomic_relaxed; size_t m_size; int m_stride; @@ -90,68 +91,103 @@ struct StoreBufferLitmus { RAJA_HOST_DEVICE void run(int this_thread, int other_thread, int iter) { - int this_thread_idx = this_thread * m_stride + iter; - int other_thread_idx = other_thread * m_stride + iter; - // Store-buffer 1 - RAJA::atomicAdd(&(x[other_thread_idx]), T{1}); - a[other_thread_idx] = y[other_thread_idx]; - // Store-buffer 2 - RAJA::atomicAdd(&(y[this_thread_idx]), T{1}); - b[this_thread_idx] = x[this_thread_idx]; + bool swap = this_thread % 2 == 0; + if (swap) { + store_buffer_1(other_thread, iter); + store_buffer_2(this_thread, iter); + } else { + store_buffer_2(this_thread, iter); + store_buffer_1(other_thread, iter); + } } - void count_results(camp::resources::Resource work_res) + RAJA_HOST_DEVICE void store_buffer_1(int thread, int iter) { - camp::resources::Resource host_res{camp::resources::Host()}; + int thread_idx = thread * m_stride + iter; + RAJA::atomicAdd(&(x[thread_idx]), T{1}); + a[thread_idx] = RAJA::atomicAdd(&(y[thread_idx]), T{0}); + } - T *a_host = host_res.allocate(m_size * m_stride); - T *b_host = host_res.allocate(m_size * m_stride); + RAJA_HOST_DEVICE void store_buffer_2(int thread, int iter) + { + int thread_idx = thread * m_stride + iter; + RAJA::atomicAdd(&(y[thread_idx]), T{1}); + b[thread_idx] = RAJA::atomicAdd(&(x[thread_idx]), T{0}); + } - work_res.memcpy(a_host, a, m_size * m_stride * sizeof(T)); - work_res.memcpy(b_host, b, m_size * m_stride * sizeof(T)); + void count_results(camp::resources::Resource work_res) + { -#if defined(RAJA_ENABLE_CUDA) - cudaErrchk(cudaDeviceSynchronize()); +#ifdef RAJA_ENABLE_HIP + using GPUExec = RAJA::hip_exec<256>; + using ReducePolicy = RAJA::hip_reduce; #endif -#if defined(RAJA_ENABLE_HIP) - hipErrchk(hipDeviceSynchronize()); +#ifdef RAJA_ENABLE_CUDA + using GPUExec = RAJA::cuda_exec<256>; + using ReducePolicy = RAJA::cuda_reduce; #endif - for (size_t i = 0; i < m_size * m_stride; i++) { - if (a_host[i] == 1 && b_host[i] == 0) { + RAJA::ReduceSum strong_cnt_0(0); + RAJA::ReduceSum strong_cnt_1(0); + RAJA::ReduceSum interleaved_cnt(0); + RAJA::ReduceSum weak_cnt(0); + RAJA::ReduceSum unexpected_cnt(0); + + T *a_local = a; + T *b_local = b; + + auto forall_len = RAJA::TypedRangeSegment(0, m_size * m_stride); + + RAJA::forall(forall_len, [=] RAJA_HOST_DEVICE(int i) { + if (a_local[i] == 1 && b_local[i] == 0) { // Strong behavior: thread 1 happened before thread 2 - strong_behavior_0++; - } else if (a_host[i] == 0 && b_host[i] == 1) { + strong_cnt_0 += 1; + } else if (a_local[i] == 0 && b_local[i] == 1) { // Strong behavior: thread 2 happened before thread 1 - strong_behavior_1++; - } else if (a_host[i] == 1 && b_host[i] == 1) { + strong_cnt_1 += 1; + } else if (a_local[i] == 1 && b_local[i] == 1) { // Strong behavior: stores interleaved with receives - interleaved_behavior++; - } else if (a_host[i] == 0 && b_host[i] == 0) { + interleaved_cnt += 1; + } else if (a_local[i] == 0 && b_local[i] == 0) { // Weak behavior: 
stores reordered after receives - weak_behavior++; + weak_cnt += 1; } else { - FAIL() << "Unexpected result for index " << i; + unexpected_cnt += 1; } - } + }); + + EXPECT_EQ(unexpected_cnt.get(), 0); - host_res.deallocate(a_host); - host_res.deallocate(b_host); + strong_behavior_0 += strong_cnt_0.get(); + strong_behavior_1 += strong_cnt_1.get(); + interleaved_behavior += interleaved_cnt.get(); + weak_behavior += weak_cnt.get(); } void verify() { - std::cout << " - Strong behavior (a = 1, b = 0) = " << strong_behavior_0 + std::cerr << " - Strong behavior (a = 1, b = 0) = " << strong_behavior_0 << "\n"; - std::cout << " - Strong behavior (a = 0, b = 1) = " << strong_behavior_1 + std::cerr << " - Strong behavior (a = 0, b = 1) = " << strong_behavior_1 << "\n"; - std::cout << " - Strong behavior (a = 1, b = 1) = " << interleaved_behavior + std::cerr << " - Strong behavior (a = 1, b = 1) = " << interleaved_behavior << "\n"; - std::cout << " - Weak behaviors = " << weak_behavior << "\n"; + std::cerr << " - Weak behaviors = " << weak_behavior << "\n"; if (std::is_same::value) { // In the relaxed case, we should observe some weak behaviors. - ASSERT_GT(weak_behavior, 0); + // Don't fail the test, but do print out a message. + if (weak_behavior == 0) { + std::cerr << "Warning - no weak behaviors detected in the control case." + << "\nThis litmus test may be insufficient to exercise " + "ordered memory atomics.\n"; + } else { + double overall_behavior_counts = strong_behavior_0 + strong_behavior_1 + + interleaved_behavior + weak_behavior; + std::cerr << "\n Weak behaviors detected in " + << 100 * (weak_behavior / overall_behavior_counts) + << "% of cases.\n"; + } } else { // We should not expect any weak behaviors if using a strong ordering. EXPECT_EQ(weak_behavior, 0); From 822d8ddb4ec11b998978a00aa117989561a8c8a3 Mon Sep 17 00:00:00 2001 From: Max Yang Date: Tue, 19 Mar 2024 13:44:06 -0700 Subject: [PATCH 08/14] Litmus tests: changes to increase relaxed observation rates Correctly use the stress testing formulation from the paper, "Foundations of Empirical Memory Consistency Testing" (OOPSLA 2020). Instead of having all stressing blocks scatter their accesses across the "stressing" array, select a small-ish subset of 64-word lines and stripe them across the stressing blocks. This increases the stress on the contention hardware in a GPU. Synchronize testing blocks and stressing blocks together on each iteration. 
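
As a rough illustration of the striping scheme described above, here is a host-side sketch only; the constant names kStressStride, kNumStressLines, kNumSelected, and kPermutePrime are invented for this example and are not the patch's identifiers. Each stressing block maps its block index onto a small pre-selected pool of 64-word lines through a coprime-multiplier permutation, so many blocks contend on the same few lines:

#include <cstdio>
#include <random>
#include <vector>

// All names below are illustrative; they are not the identifiers used in the patch.
constexpr int kStressStride   = 64;   // words per stress "line"
constexpr int kNumStressLines = 2048; // lines available in the stress array
constexpr int kNumSelected    = 64;   // small pool of lines actually contended
constexpr int kPermutePrime   = 977;  // coprime with kNumSelected

int main()
{
  // Host side: pre-select a random subset of line offsets once per run.
  std::mt19937 gen{std::random_device{}()};
  std::uniform_int_distribution<int> pick_line(0, kNumStressLines - 1);
  std::vector<int> stress_index(kNumSelected);
  for (int &offset : stress_index) {
    offset = pick_line(gen) * kStressStride;
  }

  // Device side (emulated here): each stressing block derives its target line
  // from its block index via p(i) = i * k mod n, with k coprime to n, so the
  // small pool is shared by many blocks and contention stays high.
  for (int block = 0; block < 8; ++block) {
    const int select = (block * kPermutePrime) % kNumSelected;
    std::printf("block %d hammers word offset %d\n", block, stress_index[select]);
  }
  return 0;
}

Because the multiplier is coprime with the pool size, consecutive block indices land on well-separated pool entries while every entry remains reachable.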
--- .../test-forall-atomic-litmus.cpp | 130 +++++++++++------- 1 file changed, 84 insertions(+), 46 deletions(-) diff --git a/test/functional/forall/atomic-basic/test-forall-atomic-litmus.cpp b/test/functional/forall/atomic-basic/test-forall-atomic-litmus.cpp index ce02567cb5..f987f5e764 100644 --- a/test/functional/forall/atomic-basic/test-forall-atomic-litmus.cpp +++ b/test/functional/forall/atomic-basic/test-forall-atomic-litmus.cpp @@ -28,20 +28,28 @@ #include "test-forall-atomic-litmus-sb.hpp" using IdxType = size_t; -constexpr int NUM_ITERS = 20; +constexpr int NUM_ITERS = 100; #ifdef RAJA_ENABLE_CUDA -constexpr int STRIDE = 4; -constexpr bool STRESS_BEFORE_TEST = false; +constexpr int STRIDE = 32; +constexpr bool STRESS_BEFORE_TEST = true; constexpr bool NONTESTING_BLOCKS = true; +constexpr int WARP_SIZE = 32; +constexpr int BLOCK_SIZE = 128; #elif defined(RAJA_ENABLE_HIP) -constexpr int STRIDE = 16; +constexpr int STRIDE = 32; constexpr bool STRESS_BEFORE_TEST = true; constexpr bool NONTESTING_BLOCKS = true; +constexpr int WARP_SIZE = 64; +constexpr int BLOCK_SIZE = 128; #endif -constexpr int DATA_STRESS_SIZE = 2048 * STRIDE; + +constexpr int STRESS_STRIDE = 64; +constexpr int NUM_STRESS_LINES = 2048; +constexpr int DATA_STRESS_SIZE = NUM_STRESS_LINES * STRESS_STRIDE; constexpr int PERMUTE_PRIME_BLOCK = 17; constexpr int PERMUTE_PRIME_GRID = 47; +constexpr int PERMUTE_STRESS = 977; template __global__ void dummy_kernel(IdxType index, Func func) @@ -53,10 +61,16 @@ __global__ void dummy_kernel(IdxType index, Func func) template struct LitmusTestDriver { public: + using T = typename LitmusPolicy::DataType; struct TestData { int block_size; int grid_size; + int stride = STRIDE; + + int pre_stress_iterations = 4; + int stress_iterations = 12; + // Number of blocks to run the message-passing litmus test on. int testing_blocks; @@ -66,7 +80,10 @@ struct LitmusTestDriver { // Barrier integers to synchronize testing threads. 
IdxType* barriers; - IdxType* data_stress; + int* data_stress; + + int num_stress_index; + IdxType* stress_index; void allocate(camp::resources::Resource work_res, int grid_size, @@ -80,7 +97,10 @@ struct LitmusTestDriver { shuffle_block = work_res.allocate(grid_size); barriers = work_res.allocate(STRIDE); - data_stress = work_res.allocate(DATA_STRESS_SIZE); + data_stress = work_res.allocate(DATA_STRESS_SIZE); + + num_stress_index = 64; + stress_index = work_res.allocate(num_stress_index); } void pre_run(camp::resources::Resource work_res) @@ -98,8 +118,26 @@ struct LitmusTestDriver { shuffle_block_host.data(), sizeof(IdxType) * grid_size); + std::vector stress_index_host(num_stress_index); + { + std::mt19937 gen{rand_device()}; + std::uniform_int_distribution rnd_stress_offset(0, + STRESS_STRIDE); + std::uniform_int_distribution rnd_stress_dist( + 0, NUM_STRESS_LINES - 1); + std::generate(stress_index_host.begin(), + stress_index_host.end(), + [&]() -> IdxType { + return rnd_stress_dist(gen) * STRESS_STRIDE; + }); + } + work_res.memcpy(stress_index, + stress_index_host.data(), + sizeof(IdxType) * num_stress_index); + work_res.memset(barriers, 0, sizeof(IdxType) * STRIDE); - work_res.memset(data_stress, 0, sizeof(IdxType) * DATA_STRESS_SIZE); + work_res.memset(data_stress, 0, sizeof(int) * DATA_STRESS_SIZE); + #if defined(RAJA_ENABLE_CUDA) cudaErrchk(cudaDeviceSynchronize()); @@ -123,7 +161,6 @@ struct LitmusTestDriver { // Run static void run() { - constexpr IdxType BLOCK_SIZE = 128; int num_blocks = 0; { LitmusPolicy dummy_policy{}; @@ -216,16 +253,25 @@ struct LitmusTestDriver { // Shuffle the block ID randomly according to a permutation array. block_idx = param.shuffle_block[block_idx]; - IdxType data_idx = block_idx * param.block_size + permute_thread_idx; + IdxType data_idx = block_idx * param.block_size + thread_idx; - if (block_idx < (IdxType)param.testing_blocks) { - // Block is a testing block. - // Each block acts as a "sender" to a unique "partner" block. This is done - // by permuting the block IDs with a function p(i) = i * k mod n, where n - // is the number of blocks being tested, and k and n are coprime. - int partner_idx = (block_idx * PERMUTE_PRIME_GRID) % param.testing_blocks; + for (int i = 0; i < param.stride; i++) { + + // Synchronize all blocks before testing, to increase the chance of + // interleaved requests. + this->sync(param.grid_size, thread_idx, param.barriers[i]); + + if (block_idx < (IdxType)param.testing_blocks) { + + // Block is a testing block. + // + // Each block acts as a "sender" to a unique "partner" block. This is + // done by permuting the block IDs with a function p(i) = i * k mod n, + // where n is the number of blocks being tested, and k and n are + // coprime. + int partner_idx = + (block_idx * PERMUTE_PRIME_GRID + i) % param.testing_blocks; - for (int i = 0; i < STRIDE; i++) { // Run specified test, matching threads between the two paired blocks. int other_data_idx = partner_idx * param.block_size + permute_thread_idx; @@ -234,15 +280,12 @@ struct LitmusTestDriver { // increase the rate of weak memory behaviors // Helps on AMD, doesn't seem to help on NVIDIA if (STRESS_BEFORE_TEST) { - this->stress( - param.data_stress, block_idx, thread_idx, param.grid_size, 128); + this->stress(block_idx, thread_idx, param, true); } - // Synchronize all blocks before testing, to increase the chance of - // interleaved requests. 
- this->sync(param.testing_blocks, thread_idx, param.barriers[i]); - test.run(data_idx, other_data_idx, i); + } else { + this->stress(block_idx, thread_idx, param); } } }; @@ -252,8 +295,9 @@ struct LitmusTestDriver { if (thread_idx == 0) { IdxType result = RAJA::atomicAdd(&barrier, IdxType{1}); // Busy-wait until all blocks perform the above add. - while (result != num_blocks) + while (result != num_blocks) { result = RAJA::atomicAdd(&barrier, IdxType{0}); + } } #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_CODE__) @@ -261,30 +305,24 @@ struct LitmusTestDriver { #endif } - RAJA_HOST_DEVICE uint64_t get_rand(uint64_t& pcg_state) - { - uint64_t oldstate = pcg_state; - // Advance internal state - pcg_state = oldstate * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL; - // Calculate output function (XSH RR), uses old state for max ILP - uint32_t xorshifted = ((oldstate >> 18u) ^ oldstate) >> 27u; - uint32_t rot = oldstate >> 59u; - return (xorshifted >> rot) | (xorshifted << ((-rot) & 31)); - } - - RAJA_HOST_DEVICE void stress(IdxType* stress_data, - IdxType block_idx, + RAJA_HOST_DEVICE void stress(IdxType block_idx, IdxType thread_idx, - int grid_size, - int num_iters) + const TestData& param, + bool pre_stress = false) { - uint64_t pcg_state = block_idx; - for (int i = 0; i < num_iters; i++) { - // Pseudo-randomly target a given stress data location. - auto rand = get_rand(pcg_state); - auto target_line = (rand + thread_idx) % DATA_STRESS_SIZE; - - RAJA::atomicAdd(&(stress_data[target_line]), rand); + int num_iters = + (pre_stress ? param.pre_stress_iterations : param.stress_iterations); + int volatile* stress_data = param.data_stress; + if (thread_idx % WARP_SIZE == 0) { + int warp_idx = thread_idx / WARP_SIZE; + // int select_idx = block_idx * NUM_WARPS + warp_idx; + int select_idx = block_idx; + select_idx = (select_idx * PERMUTE_STRESS) % param.num_stress_index; + int stress_line = param.stress_index[select_idx]; + for (int i = 0; i < num_iters; i++) { + int data = stress_data[stress_line]; + stress_data[stress_line] = i + 1; + } } } }; From 2784605d412043ab60a4d7f2d6e8be75dc81cd32 Mon Sep 17 00:00:00 2001 From: Max Yang Date: Tue, 19 Mar 2024 13:52:01 -0700 Subject: [PATCH 09/14] RENAME --- ...rall-atomic-litmus-mp.hpp => test-forall-atomic-litmus-mp.cpp} | 0 ...rall-atomic-litmus-sb.hpp => test-forall-atomic-litmus-sb.cpp} | 0 .../test-forall-atomic-litmus-driver.hpp} | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename test/functional/forall/atomic-basic/{tests/test-forall-atomic-litmus-mp.hpp => test-forall-atomic-litmus-mp.cpp} (100%) rename test/functional/forall/atomic-basic/{tests/test-forall-atomic-litmus-sb.hpp => test-forall-atomic-litmus-sb.cpp} (100%) rename test/functional/forall/atomic-basic/{test-forall-atomic-litmus.cpp => tests/test-forall-atomic-litmus-driver.hpp} (100%) diff --git a/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-mp.hpp b/test/functional/forall/atomic-basic/test-forall-atomic-litmus-mp.cpp similarity index 100% rename from test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-mp.hpp rename to test/functional/forall/atomic-basic/test-forall-atomic-litmus-mp.cpp diff --git a/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-sb.hpp b/test/functional/forall/atomic-basic/test-forall-atomic-litmus-sb.cpp similarity index 100% rename from test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-sb.hpp rename to test/functional/forall/atomic-basic/test-forall-atomic-litmus-sb.cpp diff 
--git a/test/functional/forall/atomic-basic/test-forall-atomic-litmus.cpp b/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-driver.hpp similarity index 100% rename from test/functional/forall/atomic-basic/test-forall-atomic-litmus.cpp rename to test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-driver.hpp From 1fea7cf08fd193d76b12fb411ebfc42b53eaef22 Mon Sep 17 00:00:00 2001 From: Max Yang Date: Tue, 19 Mar 2024 13:58:52 -0700 Subject: [PATCH 10/14] Post-rename changes --- .../forall/atomic-basic/CMakeLists.txt | 16 ++++-- .../test-forall-atomic-litmus-mp.cpp | 30 ++++++++++- .../test-forall-atomic-litmus-sb.cpp | 26 +++++++++ .../test-forall-atomic-litmus-driver.hpp | 53 ------------------- 4 files changed, 66 insertions(+), 59 deletions(-) diff --git a/test/functional/forall/atomic-basic/CMakeLists.txt b/test/functional/forall/atomic-basic/CMakeLists.txt index b5e72dc8fc..6e2c184b09 100644 --- a/test/functional/forall/atomic-basic/CMakeLists.txt +++ b/test/functional/forall/atomic-basic/CMakeLists.txt @@ -42,12 +42,20 @@ if(RAJA_ENABLE_DESUL_ATOMICS AND RAJA_ENABLE_HIP) set(ENABLE_LITMUS_TESTS ON) endif() +set(FORALL_LITMUS_TESTS + mp # Message Passing + sb # Store Buffer +) + if (ENABLE_LITMUS_TESTS) - raja_add_test( NAME test-forall-atomic-litmus-${LITMUS_BACKEND} - SOURCES test-forall-atomic-litmus.cpp) - target_include_directories(test-forall-atomic-litmus-${LITMUS_BACKEND}.exe - PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) + foreach ( LITMUS_TEST ${FORALL_LITMUS_TESTS} ) + raja_add_test( NAME test-forall-atomic-litmus-${LITMUS_BACKEND}-${LITMUS_TEST} + SOURCES test-forall-atomic-litmus-${LITMUS_TEST}.cpp) + target_include_directories(test-forall-atomic-litmus-${LITMUS_BACKEND}-${LITMUS_TEST}.exe + PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) + endforeach() endif() +unset(FORALL_LITMUS_TESTS) unset(ENABLE_LITMUS_TESTS) unset(LITMUS_BACKEND) diff --git a/test/functional/forall/atomic-basic/test-forall-atomic-litmus-mp.cpp b/test/functional/forall/atomic-basic/test-forall-atomic-litmus-mp.cpp index 64deb1beb3..362686d5c0 100644 --- a/test/functional/forall/atomic-basic/test-forall-atomic-litmus-mp.cpp +++ b/test/functional/forall/atomic-basic/test-forall-atomic-litmus-mp.cpp @@ -15,6 +15,8 @@ #include "RAJA_test-forall-execpol.hpp" #include "RAJA_test-index-types.hpp" +#include "test-forall-atomic-litmus-driver.hpp" + // "Message Passing" litmus test for DESUL ordered atomic // ------------------------------------------------------ // Initial state: x = 0 && y = 0 @@ -210,7 +212,31 @@ using MPLitmusTestOrderPols = camp::list, camp::list >; -using MPLitmusTestRecvPols = camp::list; - using MPLitmusTestPols = camp::cartesian_product; + +TYPED_TEST_SUITE_P(ForallAtomicLitmusTest); + +template +class ForallAtomicLitmusTest : public ::testing::Test +{ +}; + +TYPED_TEST_P(ForallAtomicLitmusTest, MessagePassingTest) +{ + using Type = typename camp::at>::type; + using SendRecvPol = typename camp::at>::type; + using SendPol = typename camp::at>::type; + using RecvPol = typename camp::at>::type; + + using MPTest = MessagePassingLitmus; + LitmusTestDriver::run(); +} + +REGISTER_TYPED_TEST_SUITE_P(ForallAtomicLitmusTest, MessagePassingTest); + +using MessagePassingTestTypes = Test::Types; + +INSTANTIATE_TYPED_TEST_SUITE_P(Hip, + ForallAtomicLitmusTest, + MessagePassingTestTypes); diff --git a/test/functional/forall/atomic-basic/test-forall-atomic-litmus-sb.cpp b/test/functional/forall/atomic-basic/test-forall-atomic-litmus-sb.cpp index c3cde1e760..666c1470ea 100644 
--- a/test/functional/forall/atomic-basic/test-forall-atomic-litmus-sb.cpp +++ b/test/functional/forall/atomic-basic/test-forall-atomic-litmus-sb.cpp @@ -15,6 +15,8 @@ #include "RAJA_test-forall-execpol.hpp" #include "RAJA_test-index-types.hpp" +#include "test-forall-atomic-litmus-driver.hpp" + // "Store buffer" litmus test for DESUL ordered atomic // --------------------------------------------------- // Initial state: x = 0 && y = 0 @@ -200,3 +202,27 @@ using SBLitmusTestOrderPols = using SBLitmusTestPols = camp::cartesian_product; + +TYPED_TEST_SUITE_P(ForallAtomicLitmusTest); + +template +class ForallAtomicLitmusTest : public ::testing::Test +{ +}; + +TYPED_TEST_P(ForallAtomicLitmusTest, StoreBufferTest) +{ + using Type = typename camp::at>::type; + using AtomicPol = typename camp::at>::type; + + using SBTest = StoreBufferLitmus; + LitmusTestDriver::run(); +} + +REGISTER_TYPED_TEST_SUITE_P(ForallAtomicLitmusTest, StoreBufferTest); + +using StoreBufferTestTypes = Test::Types; + +INSTANTIATE_TYPED_TEST_SUITE_P(Hip, + ForallAtomicLitmusTest, + StoreBufferTestTypes); diff --git a/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-driver.hpp b/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-driver.hpp index f987f5e764..3e6b9520e8 100644 --- a/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-driver.hpp +++ b/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-driver.hpp @@ -24,9 +24,6 @@ #include #include -#include "test-forall-atomic-litmus-mp.hpp" -#include "test-forall-atomic-litmus-sb.hpp" - using IdxType = size_t; constexpr int NUM_ITERS = 100; #ifdef RAJA_ENABLE_CUDA @@ -326,53 +323,3 @@ struct LitmusTestDriver { } } }; - -TYPED_TEST_SUITE_P(ForallAtomicLitmusTestMP); - -template -class ForallAtomicLitmusTestMP : public ::testing::Test -{ -}; - -TYPED_TEST_P(ForallAtomicLitmusTestMP, MessagePassingTest) -{ - using Type = typename camp::at>::type; - using SendRecvPol = typename camp::at>::type; - using SendPol = typename camp::at>::type; - using RecvPol = typename camp::at>::type; - - using MPTest = MessagePassingLitmus; - LitmusTestDriver::run(); -} - -REGISTER_TYPED_TEST_SUITE_P(ForallAtomicLitmusTestMP, MessagePassingTest); - -using MessagePassingTestTypes = Test::Types; - -INSTANTIATE_TYPED_TEST_SUITE_P(Hip, - ForallAtomicLitmusTestMP, - MessagePassingTestTypes); - -TYPED_TEST_SUITE_P(ForallAtomicLitmusTestSB); - -template -class ForallAtomicLitmusTestSB : public ::testing::Test -{ -}; - -TYPED_TEST_P(ForallAtomicLitmusTestSB, StoreBufferTest) -{ - using Type = typename camp::at>::type; - using AtomicPol = typename camp::at>::type; - - using SBTest = StoreBufferLitmus; - LitmusTestDriver::run(); -} - -REGISTER_TYPED_TEST_SUITE_P(ForallAtomicLitmusTestSB, StoreBufferTest); - -using StoreBufferTestTypes = Test::Types; - -INSTANTIATE_TYPED_TEST_SUITE_P(Hip, - ForallAtomicLitmusTestSB, - StoreBufferTestTypes); From 94d45b5a7ea3d1025937fd4a536c0b1ef468f7e1 Mon Sep 17 00:00:00 2001 From: Max Yang Date: Tue, 19 Mar 2024 14:21:38 -0700 Subject: [PATCH 11/14] Add a load buffer litmus test --- .../forall/atomic-basic/CMakeLists.txt | 1 + .../test-forall-atomic-litmus-lb.cpp | 235 ++++++++++++++++++ 2 files changed, 236 insertions(+) create mode 100644 test/functional/forall/atomic-basic/test-forall-atomic-litmus-lb.cpp diff --git a/test/functional/forall/atomic-basic/CMakeLists.txt b/test/functional/forall/atomic-basic/CMakeLists.txt index 6e2c184b09..4d4f17fac9 100644 --- 
a/test/functional/forall/atomic-basic/CMakeLists.txt +++ b/test/functional/forall/atomic-basic/CMakeLists.txt @@ -45,6 +45,7 @@ endif() set(FORALL_LITMUS_TESTS mp # Message Passing sb # Store Buffer + lb # Load Buffer ) if (ENABLE_LITMUS_TESTS) diff --git a/test/functional/forall/atomic-basic/test-forall-atomic-litmus-lb.cpp b/test/functional/forall/atomic-basic/test-forall-atomic-litmus-lb.cpp new file mode 100644 index 0000000000..67a49eb91d --- /dev/null +++ b/test/functional/forall/atomic-basic/test-forall-atomic-litmus-lb.cpp @@ -0,0 +1,235 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// +// test/include headers +// +#include "RAJA_test-atomic-types.hpp" +#include "RAJA_test-base.hpp" +#include "RAJA_test-camp.hpp" +#include "RAJA_test-forall-data.hpp" +#include "RAJA_test-forall-execpol.hpp" +#include "RAJA_test-index-types.hpp" + +#include "test-forall-atomic-litmus-driver.hpp" + +// "Load buffer" litmus test for DESUL ordered atomic +// -------------------------------------------------- +// Initial state: x = 0 && y = 0 +// +// Thread 1: Thread 2: +// ----------- ----------- +// a = load(y) b = load(x) +// store(x, 1) store(y, 1) +// +// Allowed results: +// ---------------- +// Strong behaviors: +// - a = 0, b = 1 +// - a = 1, b = 0 +// - a = 0, b = 0 +// Weak behavior: +// - a = 1, b = 1 + +// Send policy: Relaxed (Weak), Release, AcqRel, SeqCst +// Recv policy: Relaxed (Weak), Acquire, AcqRel, SeqCst +template +struct LoadBufferLitmus { + using DataType = T; + using RelaxedPolicy = RAJA::atomic_relaxed; + constexpr static int PERMUTE_THREAD_FLAG = 97; + size_t m_size; + int m_stride; + T *x; + T *y; + T *a; + T *b; + + int strong_behavior_0{0}; + int strong_behavior_1{0}; + int interleaved_behavior{0}; + int weak_behavior{0}; + + void allocate(camp::resources::Resource work_res, size_t size, int stride) + { + m_size = size; + m_stride = stride; + x = work_res.allocate(size * stride); + y = work_res.allocate(size * stride); + a = work_res.allocate(size * stride); + b = work_res.allocate(size * stride); + } + + void deallocate(camp::resources::Resource work_res) + { + work_res.deallocate(x); + work_res.deallocate(y); + work_res.deallocate(a); + work_res.deallocate(b); + } + + void pre_run(camp::resources::Resource work_res) + { + work_res.memset(x, 0, sizeof(T) * m_size * m_stride); + work_res.memset(y, 0, sizeof(T) * m_size * m_stride); + work_res.memset(a, 0, sizeof(T) * m_size * m_stride); + work_res.memset(b, 0, sizeof(T) * m_size * m_stride); + +#if defined(RAJA_ENABLE_CUDA) + cudaErrchk(cudaDeviceSynchronize()); +#endif + +#if defined(RAJA_ENABLE_HIP) + hipErrchk(hipDeviceSynchronize()); +#endif + } + + RAJA_HOST_DEVICE void run(int this_thread, int other_thread, int iter) + { + bool swap = this_thread % 2 == 0; + if (swap) { + load_buffer_1(other_thread, iter); + load_buffer_2(this_thread, iter); + } else { + load_buffer_2(this_thread, iter); + load_buffer_1(other_thread, iter); + } + } + + RAJA_HOST_DEVICE void load_buffer_1(int thread, int iter) + { + int thread_idx = thread * m_stride + iter; + int permute_thread = (thread * PERMUTE_THREAD_FLAG) % m_size; + int permute_idx = permute_thread * m_stride + iter; + a[thread_idx] = RAJA::atomicAdd(&(y[thread_idx]), 
T{0}); + RAJA::atomicAdd(&(x[permute_idx]), T{1}); + } + + RAJA_HOST_DEVICE void load_buffer_2(int thread, int iter) + { + int thread_idx = thread * m_stride + iter; + int permute_thread = (thread * PERMUTE_THREAD_FLAG) % m_size; + int permute_idx = permute_thread * m_stride + iter; + b[thread_idx] = RAJA::atomicAdd(&(x[permute_idx]), T{0}); + RAJA::atomicAdd(&(y[thread_idx]), T{1}); + } + + void count_results(camp::resources::Resource work_res) + { + +#ifdef RAJA_ENABLE_HIP + using GPUExec = RAJA::hip_exec<256>; + using ReducePolicy = RAJA::hip_reduce; +#endif + +#ifdef RAJA_ENABLE_CUDA + using GPUExec = RAJA::cuda_exec<256>; + using ReducePolicy = RAJA::cuda_reduce; +#endif + RAJA::ReduceSum strong_cnt_0(0); + RAJA::ReduceSum strong_cnt_1(0); + RAJA::ReduceSum interleaved_cnt(0); + RAJA::ReduceSum weak_cnt(0); + RAJA::ReduceSum unexpected_cnt(0); + + T *a_local = a; + T *b_local = b; + + auto forall_len = RAJA::TypedRangeSegment(0, m_size * m_stride); + + RAJA::forall(forall_len, [=] RAJA_HOST_DEVICE(int i) { + if (a_local[i] == 1 && b_local[i] == 0) { + // Strong behavior: thread 2 happened before thread 1 + strong_cnt_0 += 1; + } else if (a_local[i] == 0 && b_local[i] == 1) { + // Strong behavior: thread 1 happened before thread 2 + strong_cnt_1 += 1; + } else if (a_local[i] == 0 && b_local[i] == 0) { + // Strong behavior: stores interleaved with receives + interleaved_cnt += 1; + } else if (a_local[i] == 1 && b_local[i] == 1) { + // Weak behavior: stores reordered after receives + weak_cnt += 1; + } else { + unexpected_cnt += 1; + } + }); + + EXPECT_EQ(unexpected_cnt.get(), 0); + + strong_behavior_0 += strong_cnt_0.get(); + strong_behavior_1 += strong_cnt_1.get(); + interleaved_behavior += interleaved_cnt.get(); + weak_behavior += weak_cnt.get(); + } + + void verify() + { + std::cerr << " - Strong behavior (a = 1, b = 0) = " << strong_behavior_0 + << "\n"; + std::cerr << " - Strong behavior (a = 0, b = 1) = " << strong_behavior_1 + << "\n"; + std::cerr << " - Strong behavior (a = 1, b = 1) = " << interleaved_behavior + << "\n"; + std::cerr << " - Weak behaviors = " << weak_behavior << "\n"; + + if (std::is_same::value && + std::is_same::value) { + // In the relaxed case, we should observe some weak behaviors. + // Don't fail the test, but do print out a message. + if (weak_behavior == 0) { + std::cerr << "Warning - no weak behaviors detected in the control case." + << "\nThis litmus test may be insufficient to exercise " + "ordered memory atomics.\n"; + } else { + double overall_behavior_counts = strong_behavior_0 + strong_behavior_1 + + interleaved_behavior + weak_behavior; + std::cerr << "\n Weak behaviors detected in " + << 100 * (weak_behavior / overall_behavior_counts) + << "% of cases.\n"; + } + } else { + // We should not expect any weak behaviors if using a strong ordering. 
+ EXPECT_EQ(weak_behavior, 0); + } + } +}; + +using LBLitmusTestOrderPols = + camp::list, + camp::list, + camp::list, + camp::list >; + +using LBLitmusTestPols = + camp::cartesian_product; + +TYPED_TEST_SUITE_P(ForallAtomicLitmusTest); + +template +class ForallAtomicLitmusTest : public ::testing::Test +{ +}; + +TYPED_TEST_P(ForallAtomicLitmusTest, LoadBufferTest) +{ + using Type = typename camp::at>::type; + using SendRecvPol = typename camp::at>::type; + using SendPol = typename camp::at>::type; + using RecvPol = typename camp::at>::type; + + using LBTest = LoadBufferLitmus; + LitmusTestDriver::run(); +} + +REGISTER_TYPED_TEST_SUITE_P(ForallAtomicLitmusTest, LoadBufferTest); + +using LoadBufferTestTypes = Test::Types; + +INSTANTIATE_TYPED_TEST_SUITE_P(Hip, + ForallAtomicLitmusTest, + LoadBufferTestTypes); From a90f93a7520bc8a2b1f8e09fbb12ab297650cd57 Mon Sep 17 00:00:00 2001 From: Max Yang Date: Tue, 19 Mar 2024 14:32:16 -0700 Subject: [PATCH 12/14] Add a store litmus test --- .../forall/atomic-basic/CMakeLists.txt | 1 + .../test-forall-atomic-litmus-store.cpp | 226 ++++++++++++++++++ 2 files changed, 227 insertions(+) create mode 100644 test/functional/forall/atomic-basic/test-forall-atomic-litmus-store.cpp diff --git a/test/functional/forall/atomic-basic/CMakeLists.txt b/test/functional/forall/atomic-basic/CMakeLists.txt index 4d4f17fac9..fa0fbc7569 100644 --- a/test/functional/forall/atomic-basic/CMakeLists.txt +++ b/test/functional/forall/atomic-basic/CMakeLists.txt @@ -46,6 +46,7 @@ set(FORALL_LITMUS_TESTS mp # Message Passing sb # Store Buffer lb # Load Buffer + store # Store ) if (ENABLE_LITMUS_TESTS) diff --git a/test/functional/forall/atomic-basic/test-forall-atomic-litmus-store.cpp b/test/functional/forall/atomic-basic/test-forall-atomic-litmus-store.cpp new file mode 100644 index 0000000000..10fd9edeb8 --- /dev/null +++ b/test/functional/forall/atomic-basic/test-forall-atomic-litmus-store.cpp @@ -0,0 +1,226 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// +// test/include headers +// +#include "RAJA_test-atomic-types.hpp" +#include "RAJA_test-base.hpp" +#include "RAJA_test-camp.hpp" +#include "RAJA_test-forall-data.hpp" +#include "RAJA_test-forall-execpol.hpp" +#include "RAJA_test-index-types.hpp" + +#include "test-forall-atomic-litmus-driver.hpp" + +// "Store" litmus test for DESUL ordered atomic +// --------------------------------------------------- +// Initial state: x = 0 && y = 0 +// +// Thread 1: Thread 2: +// ----------- ----------- +// store(x, 2) a = load(y) +// store(y, 1) store(x, 1) +// +// Allowed results: +// ---------------- +// Strong behaviors: +// - a = 1, x = 1 +// - a = 0, x = 2 +// - a = 0, x = 1 +// Weak behavior: +// - a = 1, x = 2 + +// Send policy: Relaxed (Weak), Acquire, AcqRel, SeqCst +// Recv policy: Relaxed (Weak), Release, AcqRel, SeqCst +template +struct StoreLitmus { + using DataType = T; + using RelaxedPolicy = RAJA::atomic_relaxed; + size_t m_size; + int m_stride; + T *x; + T *y; + T *a; + + int strong_behavior_0{0}; + int strong_behavior_1{0}; + int interleaved_behavior{0}; + int weak_behavior{0}; + + void allocate(camp::resources::Resource work_res, size_t size, int stride) + { + m_size = size; + m_stride = stride; + x = work_res.allocate(size * stride); + y = work_res.allocate(size * stride); + a = work_res.allocate(size * stride); + } + + void deallocate(camp::resources::Resource work_res) + { + work_res.deallocate(x); + work_res.deallocate(y); + work_res.deallocate(a); + } + + void pre_run(camp::resources::Resource work_res) + { + work_res.memset(x, 0, sizeof(T) * m_size * m_stride); + work_res.memset(y, 0, sizeof(T) * m_size * m_stride); + work_res.memset(a, 0, sizeof(T) * m_size * m_stride); + +#if defined(RAJA_ENABLE_CUDA) + cudaErrchk(cudaDeviceSynchronize()); +#endif + +#if defined(RAJA_ENABLE_HIP) + hipErrchk(hipDeviceSynchronize()); +#endif + } + + RAJA_HOST_DEVICE void run(int this_thread, int other_thread, int iter) + { + bool swap = this_thread % 2 == 0; + if (swap) { + store_1(other_thread, iter); + store_2(this_thread, iter); + } else { + store_2(this_thread, iter); + store_1(other_thread, iter); + } + } + + RAJA_HOST_DEVICE void store_1(int thread, int iter) + { + int thread_idx = thread * m_stride + iter; + RAJA::atomicExchange(&(x[thread_idx]), T{2}); + RAJA::atomicAdd(&(y[thread_idx]), T{1}); + } + + RAJA_HOST_DEVICE void store_2(int thread, int iter) + { + int thread_idx = thread * m_stride + iter; + a[thread_idx] = RAJA::atomicAdd(&(y[thread_idx]), T{0}); + RAJA::atomicExchange(&(x[thread_idx]), T{1}); + } + + void count_results(camp::resources::Resource work_res) + { + +#ifdef RAJA_ENABLE_HIP + using GPUExec = RAJA::hip_exec<256>; + using ReducePolicy = RAJA::hip_reduce; +#endif + +#ifdef RAJA_ENABLE_CUDA + using GPUExec = RAJA::cuda_exec<256>; + using ReducePolicy = RAJA::cuda_reduce; +#endif + RAJA::ReduceSum strong_cnt_0(0); + RAJA::ReduceSum strong_cnt_1(0); + RAJA::ReduceSum interleaved_cnt(0); + RAJA::ReduceSum weak_cnt(0); + RAJA::ReduceSum unexpected_cnt(0); + + T *a_local = a; + T *x_local = x; + + auto forall_len = RAJA::TypedRangeSegment(0, m_size * m_stride); + + RAJA::forall(forall_len, [=] RAJA_HOST_DEVICE(int i) { + if (a_local[i] == 1 && x_local[i] == 1) { + // Strong behavior: thread 1 happened before thread 2 + strong_cnt_0 += 1; + } else if (a_local[i] == 0 && x_local[i] == 2) { + // Strong behavior: thread 2 happened before 
thread 1 + strong_cnt_1 += 1; + } else if (a_local[i] == 0 && x_local[i] == 1) { + // Strong behavior: stores interleaved with receives + interleaved_cnt += 1; + } else if (a_local[i] == 1 && x_local[i] == 2) { + // Weak behavior: receives reordered after stores + weak_cnt += 1; + } else { + unexpected_cnt += 1; + } + }); + + EXPECT_EQ(unexpected_cnt.get(), 0); + + strong_behavior_0 += strong_cnt_0.get(); + strong_behavior_1 += strong_cnt_1.get(); + interleaved_behavior += interleaved_cnt.get(); + weak_behavior += weak_cnt.get(); + } + + void verify() + { + std::cerr << " - Strong behavior (a = 1, b = 0) = " << strong_behavior_0 + << "\n"; + std::cerr << " - Strong behavior (a = 0, b = 1) = " << strong_behavior_1 + << "\n"; + std::cerr << " - Strong behavior (a = 1, b = 1) = " << interleaved_behavior + << "\n"; + std::cerr << " - Weak behaviors = " << weak_behavior << "\n"; + + if (std::is_same::value && + std::is_same::value) { + // In the relaxed case, we should observe some weak behaviors. + // Don't fail the test, but do print out a message. + if (weak_behavior == 0) { + std::cerr << "Warning - no weak behaviors detected in the control case." + << "\nThis litmus test may be insufficient to exercise " + "ordered memory atomics.\n"; + } else { + double overall_behavior_counts = strong_behavior_0 + strong_behavior_1 + + interleaved_behavior + weak_behavior; + std::cerr << "\n Weak behaviors detected in " + << 100 * (weak_behavior / overall_behavior_counts) + << "% of cases.\n"; + } + } else { + // We should not expect any weak behaviors if using a strong ordering. + EXPECT_EQ(weak_behavior, 0); + } + } +}; + +using StoreLitmusTestOrderPols = + camp::list, + camp::list, + camp::list, + camp::list >; + +using StoreLitmusTestPols = + camp::cartesian_product; + +TYPED_TEST_SUITE_P(ForallAtomicLitmusTest); + +template +class ForallAtomicLitmusTest : public ::testing::Test +{ +}; + +TYPED_TEST_P(ForallAtomicLitmusTest, StoreTest) +{ + using Type = typename camp::at>::type; + using SendRecvPol = typename camp::at>::type; + using SendPol = typename camp::at>::type; + using RecvPol = typename camp::at>::type; + + using StoreTest = StoreLitmus; + LitmusTestDriver::run(); +} + +REGISTER_TYPED_TEST_SUITE_P(ForallAtomicLitmusTest, StoreTest); + +using StoreTestTypes = Test::Types; + +INSTANTIATE_TYPED_TEST_SUITE_P(Hip, + ForallAtomicLitmusTest, + StoreTestTypes); From a86a1765d5b6e432936ab8f3a651a7c09d402d36 Mon Sep 17 00:00:00 2001 From: Max Yang Date: Tue, 19 Mar 2024 14:58:15 -0700 Subject: [PATCH 13/14] Add a read litmus test --- .../forall/atomic-basic/CMakeLists.txt | 1 + .../test-forall-atomic-litmus-read.cpp | 219 ++++++++++++++++++ 2 files changed, 220 insertions(+) create mode 100644 test/functional/forall/atomic-basic/test-forall-atomic-litmus-read.cpp diff --git a/test/functional/forall/atomic-basic/CMakeLists.txt b/test/functional/forall/atomic-basic/CMakeLists.txt index fa0fbc7569..968090ec06 100644 --- a/test/functional/forall/atomic-basic/CMakeLists.txt +++ b/test/functional/forall/atomic-basic/CMakeLists.txt @@ -47,6 +47,7 @@ set(FORALL_LITMUS_TESTS sb # Store Buffer lb # Load Buffer store # Store + read # Read ) if (ENABLE_LITMUS_TESTS) diff --git a/test/functional/forall/atomic-basic/test-forall-atomic-litmus-read.cpp b/test/functional/forall/atomic-basic/test-forall-atomic-litmus-read.cpp new file mode 100644 index 0000000000..16c870e4b8 --- /dev/null +++ b/test/functional/forall/atomic-basic/test-forall-atomic-litmus-read.cpp @@ -0,0 +1,219 @@ 
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// +// test/include headers +// +#include "RAJA_test-atomic-types.hpp" +#include "RAJA_test-base.hpp" +#include "RAJA_test-camp.hpp" +#include "RAJA_test-forall-data.hpp" +#include "RAJA_test-forall-execpol.hpp" +#include "RAJA_test-index-types.hpp" + +#include "test-forall-atomic-litmus-driver.hpp" + +// "Read" litmus test for DESUL ordered atomic +// --------------------------------------------------- +// Initial state: x = 0 && y = 0 +// +// Thread 1: Thread 2: +// ----------- ----------- +// store(x, 1) store(y, 2) +// store(y, 1) a = load(x) +// +// Allowed results: +// ---------------- +// Strong behaviors: +// - a = 1, x = 2 +// - a = 1, x = 1 +// - a = 0, x = 1 +// Weak behavior: +// - a = 0, x = 2 + +// Atomic policy: Relaxed (Weak), SeqCst +template +struct ReadLitmus { + using DataType = T; + using RelaxedPolicy = RAJA::atomic_relaxed; + size_t m_size; + int m_stride; + T *x; + T *y; + T *a; + + int strong_behavior_0{0}; + int strong_behavior_1{0}; + int interleaved_behavior{0}; + int weak_behavior{0}; + + void allocate(camp::resources::Resource work_res, size_t size, int stride) + { + m_size = size; + m_stride = stride; + x = work_res.allocate(size * stride); + y = work_res.allocate(size * stride); + a = work_res.allocate(size * stride); + } + + void deallocate(camp::resources::Resource work_res) + { + work_res.deallocate(x); + work_res.deallocate(y); + work_res.deallocate(a); + } + + void pre_run(camp::resources::Resource work_res) + { + work_res.memset(x, 0, sizeof(T) * m_size * m_stride); + work_res.memset(y, 0, sizeof(T) * m_size * m_stride); + work_res.memset(a, 0, sizeof(T) * m_size * m_stride); + +#if defined(RAJA_ENABLE_CUDA) + cudaErrchk(cudaDeviceSynchronize()); +#endif + +#if defined(RAJA_ENABLE_HIP) + hipErrchk(hipDeviceSynchronize()); +#endif + } + + RAJA_HOST_DEVICE void run(int this_thread, int other_thread, int iter) + { + bool swap = this_thread % 2 == 0; + if (swap) { + read_1(other_thread, iter); + read_2(this_thread, iter); + } else { + read_2(this_thread, iter); + read_1(other_thread, iter); + } + } + + RAJA_HOST_DEVICE void read_1(int thread, int iter) + { + int thread_idx = thread * m_stride + iter; + RAJA::atomicExchange(&(x[thread_idx]), T{1}); + RAJA::atomicExchange(&(y[thread_idx]), T{1}); + } + + RAJA_HOST_DEVICE void read_2(int thread, int iter) + { + int thread_idx = thread * m_stride + iter; + RAJA::atomicExchange(&(y[thread_idx]), T{2}); + a[thread_idx] = RAJA::atomicAdd(&(x[thread_idx]), T{0}); + } + + void count_results(camp::resources::Resource work_res) + { + +#ifdef RAJA_ENABLE_HIP + using GPUExec = RAJA::hip_exec<256>; + using ReducePolicy = RAJA::hip_reduce; +#endif + +#ifdef RAJA_ENABLE_CUDA + using GPUExec = RAJA::cuda_exec<256>; + using ReducePolicy = RAJA::cuda_reduce; +#endif + RAJA::ReduceSum strong_cnt_0(0); + RAJA::ReduceSum strong_cnt_1(0); + RAJA::ReduceSum interleaved_cnt(0); + RAJA::ReduceSum weak_cnt(0); + RAJA::ReduceSum unexpected_cnt(0); + + T *a_local = a; + T *y_local = y; + + auto forall_len = RAJA::TypedRangeSegment(0, m_size * m_stride); + + RAJA::forall(forall_len, [=] RAJA_HOST_DEVICE(int i) { + if (a_local[i] == 1 && y_local[i] == 2) { + // Strong behavior: thread 1 
happened before thread 2 + strong_cnt_0 += 1; + } else if (a_local[i] == 0 && y_local[i] == 1) { + // Strong behavior: thread 2 happened before thread 1 + strong_cnt_1 += 1; + } else if (a_local[i] == 1 && y_local[i] == 1) { + // Strong behavior: stores interleaved with receives + interleaved_cnt += 1; + } else if (a_local[i] == 0 && y_local[i] == 2) { + // Weak behavior: receives reordered after stores + weak_cnt += 1; + } else { + unexpected_cnt += 1; + } + }); + + EXPECT_EQ(unexpected_cnt.get(), 0); + + strong_behavior_0 += strong_cnt_0.get(); + strong_behavior_1 += strong_cnt_1.get(); + interleaved_behavior += interleaved_cnt.get(); + weak_behavior += weak_cnt.get(); + } + + void verify() + { + std::cerr << " - Strong behavior (a = 1, b = 0) = " << strong_behavior_0 + << "\n"; + std::cerr << " - Strong behavior (a = 0, b = 1) = " << strong_behavior_1 + << "\n"; + std::cerr << " - Strong behavior (a = 1, b = 1) = " << interleaved_behavior + << "\n"; + std::cerr << " - Weak behaviors = " << weak_behavior << "\n"; + + if (std::is_same::value) { + // In the relaxed case, we should observe some weak behaviors. + // Don't fail the test, but do print out a message. + if (weak_behavior == 0) { + std::cerr << "Warning - no weak behaviors detected in the control case." + << "\nThis litmus test may be insufficient to exercise " + "ordered memory atomics.\n"; + } else { + double overall_behavior_counts = strong_behavior_0 + strong_behavior_1 + + interleaved_behavior + weak_behavior; + std::cerr << "\n Weak behaviors detected in " + << 100 * (weak_behavior / overall_behavior_counts) + << "% of cases.\n"; + } + } else { + // We should not expect any weak behaviors if using a strong ordering. + EXPECT_EQ(weak_behavior, 0); + } + } +}; + +using ReadLitmusTestOrderPols = + camp::list; + +using ReadLitmusTestPols = + camp::cartesian_product; + +TYPED_TEST_SUITE_P(ForallAtomicLitmusTest); + +template +class ForallAtomicLitmusTest : public ::testing::Test +{ +}; + +TYPED_TEST_P(ForallAtomicLitmusTest, ReadTest) +{ + using Type = typename camp::at>::type; + using AtomicPol = typename camp::at>::type; + + using ReadTest = ReadLitmus; + LitmusTestDriver::run(); +} + +REGISTER_TYPED_TEST_SUITE_P(ForallAtomicLitmusTest, ReadTest); + +using ReadTestTypes = Test::Types; + +INSTANTIATE_TYPED_TEST_SUITE_P(Hip, + ForallAtomicLitmusTest, + ReadTestTypes); From 3f4dc307b197d186ef3cc627ced1a13bef0c7ea7 Mon Sep 17 00:00:00 2001 From: Max Yang Date: Tue, 19 Mar 2024 15:20:15 -0700 Subject: [PATCH 14/14] Add a 2+2 write litmus test --- .../forall/atomic-basic/CMakeLists.txt | 1 + .../test-forall-atomic-litmus-write2x2.cpp | 222 ++++++++++++++++++ 2 files changed, 223 insertions(+) create mode 100644 test/functional/forall/atomic-basic/test-forall-atomic-litmus-write2x2.cpp diff --git a/test/functional/forall/atomic-basic/CMakeLists.txt b/test/functional/forall/atomic-basic/CMakeLists.txt index 968090ec06..707eabdc78 100644 --- a/test/functional/forall/atomic-basic/CMakeLists.txt +++ b/test/functional/forall/atomic-basic/CMakeLists.txt @@ -48,6 +48,7 @@ set(FORALL_LITMUS_TESTS lb # Load Buffer store # Store read # Read + write2x2 # 2+2 write ) if (ENABLE_LITMUS_TESTS) diff --git a/test/functional/forall/atomic-basic/test-forall-atomic-litmus-write2x2.cpp b/test/functional/forall/atomic-basic/test-forall-atomic-litmus-write2x2.cpp new file mode 100644 index 0000000000..b5dc28ac5f --- /dev/null +++ b/test/functional/forall/atomic-basic/test-forall-atomic-litmus-write2x2.cpp @@ -0,0 +1,222 @@ 
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// +// test/include headers +// +#include "RAJA_test-atomic-types.hpp" +#include "RAJA_test-base.hpp" +#include "RAJA_test-camp.hpp" +#include "RAJA_test-forall-data.hpp" +#include "RAJA_test-forall-execpol.hpp" +#include "RAJA_test-index-types.hpp" + +#include "test-forall-atomic-litmus-driver.hpp" + +// "2+2 Write" litmus test for DESUL ordered atomic +// --------------------------------------------------- +// Initial state: x = 0 && y = 0 +// +// Thread 1: Thread 2: +// ----------- ----------- +// store(x, 1) store(y, 1) +// store(y, 2) store(x, 2) +// +// Allowed results: +// ---------------- +// Strong behaviors: +// - a = 1, x = 2 +// - a = 2, x = 1 +// - a = 2, x = 2 +// Weak behavior: +// - a = 1, x = 1 + +// Send policy: Relaxed (Weak), AcqRel, SeqCst +// Recv policy: Relaxed (Weak), AcqRel, SeqCst +template +struct Write2x2Litmus { + using DataType = T; + using RelaxedPolicy = RAJA::atomic_relaxed; + size_t m_size; + int m_stride; + T *x; + T *y; + T *a; + + int strong_behavior_0{0}; + int strong_behavior_1{0}; + int interleaved_behavior{0}; + int weak_behavior{0}; + + void allocate(camp::resources::Resource work_res, size_t size, int stride) + { + m_size = size; + m_stride = stride; + x = work_res.allocate(size * stride); + y = work_res.allocate(size * stride); + a = work_res.allocate(size * stride); + } + + void deallocate(camp::resources::Resource work_res) + { + work_res.deallocate(x); + work_res.deallocate(y); + work_res.deallocate(a); + } + + void pre_run(camp::resources::Resource work_res) + { + work_res.memset(x, 0, sizeof(T) * m_size * m_stride); + work_res.memset(y, 0, sizeof(T) * m_size * m_stride); + work_res.memset(a, 0, sizeof(T) * m_size * m_stride); + +#if defined(RAJA_ENABLE_CUDA) + cudaErrchk(cudaDeviceSynchronize()); +#endif + +#if defined(RAJA_ENABLE_HIP) + hipErrchk(hipDeviceSynchronize()); +#endif + } + + RAJA_HOST_DEVICE void run(int this_thread, int other_thread, int iter) + { + bool swap = this_thread % 2 == 0; + if (swap) { + store_1(other_thread, iter); + store_2(this_thread, iter); + } else { + store_2(this_thread, iter); + store_1(other_thread, iter); + } + } + + RAJA_HOST_DEVICE void store_1(int thread, int iter) + { + int thread_idx = thread * m_stride + iter; + RAJA::atomicExchange(&(x[thread_idx]), T{1}); + RAJA::atomicExchange(&(y[thread_idx]), T{2}); + } + + RAJA_HOST_DEVICE void store_2(int thread, int iter) + { + int thread_idx = thread * m_stride + iter; + RAJA::atomicExchange(&(y[thread_idx]), T{1}); + RAJA::atomicExchange(&(x[thread_idx]), T{2}); + } + + void count_results(camp::resources::Resource work_res) + { + +#ifdef RAJA_ENABLE_HIP + using GPUExec = RAJA::hip_exec<256>; + using ReducePolicy = RAJA::hip_reduce; +#endif + +#ifdef RAJA_ENABLE_CUDA + using GPUExec = RAJA::cuda_exec<256>; + using ReducePolicy = RAJA::cuda_reduce; +#endif + RAJA::ReduceSum strong_cnt_0(0); + RAJA::ReduceSum strong_cnt_1(0); + RAJA::ReduceSum interleaved_cnt(0); + RAJA::ReduceSum weak_cnt(0); + RAJA::ReduceSum unexpected_cnt(0); + + T *x_local = x; + T *y_local = y; + + auto forall_len = RAJA::TypedRangeSegment(0, m_size * m_stride); + + RAJA::forall(forall_len, [=] RAJA_HOST_DEVICE(int i) { + if (x_local[i] 
== 1 && y_local[i] == 2) { + // Strong behavior: thread 1 happened before thread 2 + strong_cnt_0 += 1; + } else if (x_local[i] == 2 && y_local[i] == 1) { + // Strong behavior: thread 2 happened before thread 1 + strong_cnt_1 += 1; + } else if (x_local[i] == 2 && y_local[i] == 2) { + // Strong behavior: interleaved stores in-order + interleaved_cnt += 1; + } else if (x_local[i] == 1 && y_local[i] == 1) { + // Weak behavior: stores on each thread were reordered + weak_cnt += 1; + } else { + unexpected_cnt += 1; + } + }); + + EXPECT_EQ(unexpected_cnt.get(), 0); + + strong_behavior_0 += strong_cnt_0.get(); + strong_behavior_1 += strong_cnt_1.get(); + interleaved_behavior += interleaved_cnt.get(); + weak_behavior += weak_cnt.get(); + } + + void verify() + { + std::cerr << " - Strong behavior (a = 1, b = 0) = " << strong_behavior_0 + << "\n"; + std::cerr << " - Strong behavior (a = 0, b = 1) = " << strong_behavior_1 + << "\n"; + std::cerr << " - Strong behavior (a = 1, b = 1) = " << interleaved_behavior + << "\n"; + std::cerr << " - Weak behaviors = " << weak_behavior << "\n"; + + if (std::is_same::value) { + // In the relaxed case, we should observe some weak behaviors. + // Don't fail the test, but do print out a message. + if (weak_behavior == 0) { + std::cerr << "Warning - no weak behaviors detected in the control case." + << "\nThis litmus test may be insufficient to exercise " + "ordered memory atomics.\n"; + } else { + double overall_behavior_counts = strong_behavior_0 + strong_behavior_1 + + interleaved_behavior + weak_behavior; + std::cerr << "\n Weak behaviors detected in " + << 100 * (weak_behavior / overall_behavior_counts) + << "% of cases.\n"; + } + } else { + // We should not expect any weak behaviors if using a strong ordering. + EXPECT_EQ(weak_behavior, 0); + } + } +}; + +using Write2x2LitmusTestOrderPols = + camp::list; + +using Write2x2LitmusTestPols = + camp::cartesian_product; + +TYPED_TEST_SUITE_P(ForallAtomicLitmusTest); + +template +class ForallAtomicLitmusTest : public ::testing::Test +{ +}; + +TYPED_TEST_P(ForallAtomicLitmusTest, Write2x2Test) +{ + using Type = typename camp::at>::type; + using AtomicPol = typename camp::at>::type; + + using Write2x2Test = Write2x2Litmus; + LitmusTestDriver::run(); +} + +REGISTER_TYPED_TEST_SUITE_P(ForallAtomicLitmusTest, Write2x2Test); + +using Write2x2TestTypes = Test::Types; + +INSTANTIATE_TYPED_TEST_SUITE_P(Hip, + ForallAtomicLitmusTest, + Write2x2TestTypes);
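
As a closing illustration (not part of the patch series), the same 2+2-write outcome classification can be sketched on the host with plain std::atomic and std::thread. Everything below is an assumption-level example rather than the RAJA test code; on most strongly-ordered host CPUs the relaxed variant will rarely, if ever, produce the weak x == 1 && y == 1 outcome, which is precisely why the GPU tests above add stress and synchronization to provoke it:

#include <atomic>
#include <cstdio>
#include <thread>

int main()
{
  int strong_0 = 0, strong_1 = 0, interleaved = 0, weak = 0;

  for (int iter = 0; iter < 10000; ++iter) {
    std::atomic<int> x{0};
    std::atomic<int> y{0};

    // Thread 1: store(x, 1); store(y, 2)
    std::thread t1([&] {
      x.store(1, std::memory_order_relaxed);
      y.store(2, std::memory_order_relaxed);
    });
    // Thread 2: store(y, 1); store(x, 2)
    std::thread t2([&] {
      y.store(1, std::memory_order_relaxed);
      x.store(2, std::memory_order_relaxed);
    });
    t1.join();
    t2.join();

    const int xf = x.load();
    const int yf = y.load();
    if (xf == 1 && yf == 2) {
      ++strong_0;       // thread 1's stores landed last
    } else if (xf == 2 && yf == 1) {
      ++strong_1;       // thread 2's stores landed last
    } else if (xf == 2 && yf == 2) {
      ++interleaved;    // stores interleaved in program order
    } else {
      ++weak;           // x == 1 && y == 1: each thread's stores appear reordered
    }
  }

  std::printf("strong: %d + %d, interleaved: %d, weak: %d\n",
              strong_0, strong_1, interleaved, weak);
  return 0;
}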