diff --git a/Android.bp b/Android.bp index fd59ef524e..d6516fec72 100644 --- a/Android.bp +++ b/Android.bp @@ -202,12 +202,9 @@ cc_library_static { "src/core/AccessWindowAutoPadding.cpp", "src/core/AccessWindowStatic.cpp", "src/core/AccessWindowTranspose.cpp", - "src/core/CL/CLCommandBuffer.cpp", - "src/core/CL/CLCompatCommandBuffer.cpp", "src/core/CL/CLCompileContext.cpp", "src/core/CL/CLHelpers.cpp", "src/core/CL/CLKernelLibrary.cpp", - "src/core/CL/CLMutableCommandBuffer.cpp", "src/core/CL/CLUtils.cpp", "src/core/CL/DefaultLWSHeuristics.cpp", "src/core/CL/ICLKernel.cpp", @@ -466,6 +463,7 @@ cc_library_static { "src/cpu/kernels/activation/generic/neon/qasymm8.cpp", "src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp", "src/cpu/kernels/activation/generic/neon/qsymm16.cpp", + "src/cpu/kernels/activation/heuristics/CpuActivationKernelHeuristics.cpp", "src/cpu/kernels/add/generic/neon/fp16.cpp", "src/cpu/kernels/add/generic/neon/fp32.cpp", "src/cpu/kernels/add/generic/neon/impl.cpp", @@ -1032,6 +1030,7 @@ cc_library_static { "src/runtime/experimental/operators/CpuGemmConv2d.cpp", "src/runtime/experimental/operators/CpuGemmDirectConv2d.cpp", "src/runtime/experimental/operators/CpuMul.cpp", + "src/runtime/experimental/operators/CpuSoftmax.cpp", "src/runtime/experimental/operators/CpuSub.cpp", "src/runtime/experimental/operators/CpuTranspose.cpp", "src/runtime/experimental/operators/CpuWinogradConv2d.cpp", diff --git a/CMakeLists.txt b/CMakeLists.txt index cb99dee99e..321a83bfbb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,7 +28,7 @@ cmake_minimum_required(VERSION 3.13 FATAL_ERROR) list(APPEND CMAKE_MESSAGE_CONTEXT ArmCompute) project( ArmCompute - VERSION 41.0.0 + VERSION 42.0.0 DESCRIPTION "The Arm Compute Library is a collection of low-level machine learning functions optimized for Arm® Cortex®-A CPU and Arm® Mali™ GPU architectures" LANGUAGES C CXX ASM) @@ -138,11 +138,12 @@ if(ARM_COMPUTE_OPENMP) endif() # 
--------------------------------------------------------------------- -# SVE Library +# SVE Object Library -add_library(arm_compute_sve "") +add_library(arm_compute_sve OBJECT "") target_compile_options(arm_compute_sve - PRIVATE "-march=armv8.2-a+sve+fp16+dotprod") + PRIVATE "-march=armv8.2-a+sve+fp16+dotprod" + PRIVATE "-fPIC") target_compile_definitions(arm_compute_sve PRIVATE ARM_COMPUTE_ENABLE_BF16) target_compile_definitions(arm_compute_sve PRIVATE ENABLE_SVE) target_compile_definitions(arm_compute_sve PRIVATE ARM_COMPUTE_ENABLE_SVE) @@ -160,11 +161,12 @@ target_include_directories( src/core/NEON/kernels/arm_gemm/merges) # --------------------------------------------------------------------- -# SVE2 Library +# SVE2 Object Library -add_library(arm_compute_sve2 "") +add_library(arm_compute_sve2 OBJECT "") target_compile_options(arm_compute_sve2 - PRIVATE "-march=armv8.6-a+sve2+fp16+dotprod") + PRIVATE "-march=armv8.6-a+sve2+fp16+dotprod" + PRIVATE "-fPIC") target_compile_definitions(arm_compute_sve2 PRIVATE ARM_COMPUTE_ENABLE_SVE2) target_compile_definitions(arm_compute_sve2 PRIVATE ARM_COMPUTE_ENABLE_BF16) target_compile_definitions(arm_compute_sve2 PRIVATE ENABLE_SVE) @@ -205,8 +207,11 @@ target_include_directories( target_compile_options(arm_compute PUBLIC ${COMMON_CXX_FLAGS}) add_library(ArmCompute::Core ALIAS arm_compute) + +# arm_compute_sve and arm_compute_sve2 obj files will not be public in the arm_compute.so target target_link_libraries( - arm_compute PUBLIC arm_compute_sve arm_compute_sve2) + arm_compute PRIVATE $<TARGET_OBJECTS:arm_compute_sve> + PRIVATE $<TARGET_OBJECTS:arm_compute_sve2>) # --------------------------------------------------------------------- # Graph Library diff --git a/README.md b/README.md index 64474b9890..97ffe318c4 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@

-# Compute Library ![](https://img.shields.io/badge/latest_release-24.08.1-green) +# Compute Library ![](https://img.shields.io/badge/latest_release-24.09-green) The Compute Library is a collection of low-level machine learning functions optimized for Arm® Cortex®-A, Arm® Neoverse® and Arm® Mali™ GPUs architectures.
@@ -37,7 +37,7 @@ Key Features:
## Documentation -[![Documentation](https://img.shields.io/badge/documentation-24.08.1-green)](https://artificial-intelligence.sites.arm.com/computelibrary/v24.08.1/index.xhtml) +[![Documentation](https://img.shields.io/badge/documentation-24.09-green)](https://artificial-intelligence.sites.arm.com/computelibrary/v24.09/index.xhtml) > Note: The documentation includes the reference API, changelogs, build guide, contribution guide, errata, etc. @@ -50,22 +50,22 @@ All the binaries can be downloaded from [here](https://github.com/ARM-software/C | Platform | Operating System | Release archive (Download) | | -------------- | ---------------- | -------------------------- | -| Raspberry Pi 4 | Linux® 32bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08.1/arm_compute-v24.08.1-linux-armv7a-cpu-bin.tar.gz) | -| Raspberry Pi 4 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08.1/arm_compute-v24.08.1-linux-aarch64-cpu-bin.tar.gz) | -| Odroid N2 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08.1/arm_compute-v24.08.1-linux-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08.1/arm_compute-v24.08.1-linux-aarch64-cpu-gpu-bin.tar.gz) | -| HiKey960 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08.1/arm_compute-v24.08.1-linux-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08.1/arm_compute-v24.08.1-linux-aarch64-cpu-gpu-bin.tar.gz) | +| Raspberry Pi 4 | Linux® 32bit | 
[![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-linux-armv7a-cpu-bin.tar.gz) | +| Raspberry Pi 4 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-linux-aarch64-cpu-bin.tar.gz) | +| Odroid N2 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-linux-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-linux-aarch64-cpu-gpu-bin.tar.gz) | +| HiKey960 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-linux-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-linux-aarch64-cpu-gpu-bin.tar.gz) |
| Architecture | Operating System | Release archive (Download) | | ------------ | ---------------- | -------------------------- | -| armv7 | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08.1/arm_compute-v24.08.1-linux-armv7a-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08.1/arm_compute-v24.08.1-linux-armv7a-cpu-gpu-bin.tar.gz) | -| arm64-v8a | Android™ | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08.1/arm_compute-v24.08.1-android-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08.1/arm_compute-v24.08.1-android-aarch64-cpu-gpu-bin.tar.gz) | -| arm64-v8a | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08.1/arm_compute-v24.08.1-linux-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08.1/arm_compute-v24.08.1-linux-aarch64-cpu-gpu-bin.tar.gz) | +| armv7 | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-linux-armv7a-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-linux-armv7a-cpu-gpu-bin.tar.gz) | +| arm64-v8a | Android™ | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-android-aarch64-cpu-bin.tar.gz) 
[![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-android-aarch64-cpu-gpu-bin.tar.gz) | +| arm64-v8a | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-linux-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-linux-aarch64-cpu-gpu-bin.tar.gz) |
-Please refer to the following link for more pre-built binaries: [![](https://img.shields.io/badge/v24.08.1-bins-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/tag/v24.08.1) +Please refer to the following link for more pre-built binaries: [![](https://img.shields.io/badge/v24.09-bins-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/tag/v24.09) Pre-build binaries are generated with the following security / good coding practices related flags: > -Wall, -Wextra, -Wformat=2, -Winit-self, -Wstrict-overflow=2, -Wswitch-default, -Woverloaded-virtual, -Wformat-security, -Wctor-dtor-privacy, -Wsign-promo, -Weffc++, -pedantic, -fstack-protector-strong @@ -107,13 +107,13 @@ Pre-build binaries are generated with the following security / good coding pract ## Experimental builds -**⚠ Important** Bazel and CMake builds are experimental CPU only builds, please see the [documentation](https://artificial-intelligence.sites.arm.com/computelibrary/v24.08.1/how_to_build.xhtml) for more details. +**⚠ Important** Bazel and CMake builds are experimental CPU only builds, please see the [documentation](https://artificial-intelligence.sites.arm.com/computelibrary/v24.09/how_to_build.xhtml) for more details.
## How to contribute -Contributions to the Compute Library are more than welcome. If you are interested on contributing, please have a look at our [how to contribute guidelines](https://artificial-intelligence.sites.arm.com/computelibrary/v24.08.1/contribution_guidelines.xhtml). +Contributions to the Compute Library are more than welcome. If you are interested in contributing, please have a look at our [how to contribute guidelines](https://artificial-intelligence.sites.arm.com/computelibrary/v24.09/contribution_guidelines.xhtml). ### Developer Certificate of Origin (DCO) Before the Compute Library accepts your contribution, you need to certify its origin and give us your permission. To manage this process we use the Developer Certificate of Origin (DCO) V1.1 (https://developercertificate.org/) diff --git a/SConscript b/SConscript index bd8f034c9c..2aff67d8ca 100644 --- a/SConscript +++ b/SConscript @@ -33,8 +33,8 @@ import codecs import platform import SCons -VERSION = "v24.08.1" -LIBRARY_VERSION_MAJOR = 41 +VERSION = "v24.09" +LIBRARY_VERSION_MAJOR = 42 LIBRARY_VERSION_MINOR = 0 LIBRARY_VERSION_PATCH = 0 SONAME_VERSION = str(LIBRARY_VERSION_MAJOR) + "." + str(LIBRARY_VERSION_MINOR) + "." 
+ str(LIBRARY_VERSION_PATCH) diff --git a/SConstruct b/SConstruct index 941f173d3d..c4bfef826d 100644 --- a/SConstruct +++ b/SConstruct @@ -281,8 +281,12 @@ if env['cppthreads']: if env['openmp']: env.Append(CPPDEFINES = [('ARM_COMPUTE_OPENMP_SCHEDULER', 1)]) - env.Append(CXXFLAGS = ['-fopenmp']) - env.Append(LINKFLAGS = ['-fopenmp']) + if not 'windows' in env['os']: + env.Append(CXXFLAGS = ['-fopenmp']) + env.Append(LINKFLAGS = ['-fopenmp']) + else: + env.Append(CXXFLAGS = ['-openmp']) + env.Append(LINKFLAGS = ['libomp.lib']) # Validate and define state if env['estate'] == 'auto': diff --git a/arm_compute/core/utils/DataTypeUtils.h b/arm_compute/core/utils/DataTypeUtils.h index 6fabb19b64..b19a3dd1e7 100644 --- a/arm_compute/core/utils/DataTypeUtils.h +++ b/arm_compute/core/utils/DataTypeUtils.h @@ -97,9 +97,12 @@ inline size_t element_size_from_data_type(DataType dt) case DataType::S32: case DataType::F32: return 4; + case DataType::F64: case DataType::U64: case DataType::S64: return 8; + case DataType::SIZET: + return sizeof(size_t); // portable default: ARM_COMPUTE_ERROR("Undefined element size for given data type"); return 0; diff --git a/arm_compute/runtime/CL/CLScheduler.h b/arm_compute/runtime/CL/CLScheduler.h index b74fcb74ef..3b99cb40ac 100644 --- a/arm_compute/runtime/CL/CLScheduler.h +++ b/arm_compute/runtime/CL/CLScheduler.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2022 Arm Limited. + * Copyright (c) 2016-2022, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef ARM_COMPUTE_CLSCHEDULER_H -#define ARM_COMPUTE_CLSCHEDULER_H +#ifndef ACL_ARM_COMPUTE_RUNTIME_CL_CLSCHEDULER_H +#define ACL_ARM_COMPUTE_RUNTIME_CL_CLSCHEDULER_H #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLTypes.h" @@ -211,6 +211,8 @@ class CLScheduler final bool _job_chaining_enabled; int _job_chaining_size; int _job_chaining_count; + unsigned int _enqueue_count; + unsigned int _flush_count; }; } // namespace arm_compute -#endif /* ARM_COMPUTE_CLSCHEDULER_H */ +#endif // ACL_ARM_COMPUTE_RUNTIME_CL_CLSCHEDULER_H diff --git a/arm_compute/runtime/CL/CLTensorAllocator.h b/arm_compute/runtime/CL/CLTensorAllocator.h index fde8e9c43a..763a1e4b13 100644 --- a/arm_compute/runtime/CL/CLTensorAllocator.h +++ b/arm_compute/runtime/CL/CLTensorAllocator.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2021 Arm Limited. + * Copyright (c) 2016-2021, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_CLTENSORALLOCATOR_H -#define ARM_COMPUTE_CLTENSORALLOCATOR_H +#ifndef ACL_ARM_COMPUTE_RUNTIME_CL_CLTENSORALLOCATOR_H +#define ACL_ARM_COMPUTE_RUNTIME_CL_CLTENSORALLOCATOR_H #include "arm_compute/core/CL/CLTypes.h" #include "arm_compute/core/CL/OpenCL.h" @@ -106,6 +106,9 @@ class CLTensorAllocator : public ITensorAllocator * */ void free() override; + + bool is_allocated() const override; + /** Import an existing memory as a tensor's backing memory * * @warning memory should have been created under the same context that Compute Library uses. 
@@ -156,4 +159,4 @@ class CLTensorAllocator : public ITensorAllocator CLInt32Array _offset; /**< Offsets array in case of quantized per channel data type */ }; } // namespace arm_compute -#endif /* ARM_COMPUTE_CLTENSORALLOCATOR_H */ +#endif // ACL_ARM_COMPUTE_RUNTIME_CL_CLTENSORALLOCATOR_H diff --git a/arm_compute/runtime/ITensorAllocator.h b/arm_compute/runtime/ITensorAllocator.h index e2d3536169..2f77cd491c 100644 --- a/arm_compute/runtime/ITensorAllocator.h +++ b/arm_compute/runtime/ITensorAllocator.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2021 Arm Limited. + * Copyright (c) 2016-2021, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_ITENSORALLOCATOR_H -#define ARM_COMPUTE_ITENSORALLOCATOR_H +#ifndef ACL_ARM_COMPUTE_RUNTIME_ITENSORALLOCATOR_H +#define ACL_ARM_COMPUTE_RUNTIME_ITENSORALLOCATOR_H #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" @@ -91,6 +91,12 @@ class ITensorAllocator */ virtual void free() = 0; + /** Return whether the tensor is currently allocated. + * + * @return true if the tensor is allocated, false otherwise. + */ + virtual bool is_allocated() const = 0; + protected: /** Interface to be implemented by the child class to lock the memory allocation for the CPU to access. 
* @@ -106,4 +112,4 @@ class ITensorAllocator size_t _alignment{}; /**< Tensor's alignment in bytes */ }; } // namespace arm_compute -#endif /*ARM_COMPUTE_ITENSORALLOCATOR_H */ +#endif // ACL_ARM_COMPUTE_RUNTIME_ITENSORALLOCATOR_H diff --git a/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h b/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h index 3268781c65..46c83eb827 100644 --- a/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h +++ b/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h @@ -67,7 +67,6 @@ class NEArithmeticSubtraction : public IFunction * |QASYMM8 |QASYMM8 |QASYMM8 | * |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED | * |QSYMM16 |QSYMM16 |QASYMM16 | - * |QSYMM16 |QSYMM16 |S32 | * |U8 |U8 |U8 | * |S16 |S16 |S16 | * |S32 |S32 |S32 | diff --git a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h index bfb4bc83b5..f1f983b282 100644 --- a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h +++ b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h @@ -69,8 +69,9 @@ class NEPixelWiseMultiplication : public IFunction * |U8 |S16 |S16 | * |S16 |U8 |S16 | * |S16 |S16 |S16 | + * |S32 |S32 |S32 | * |F16 |F16 |F16 | - * |F32 |S32 |F32 | + * |F32 |F32 |F32 | * * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported. * For all other scale values only round to zero (implemented as round towards minus infinity) is supported. 
diff --git a/arm_compute/runtime/NEON/functions/NEReverse.h b/arm_compute/runtime/NEON/functions/NEReverse.h index 4ac397a980..aa44e39a0b 100644 --- a/arm_compute/runtime/NEON/functions/NEReverse.h +++ b/arm_compute/runtime/NEON/functions/NEReverse.h @@ -42,11 +42,11 @@ class NEReverse : public INESimpleFunctionNoBorder * - All * * Valid data type configurations: - * |src0 |src1 |dst | - * |:--------------|:--------------|:--------------| - * |All |U32, S32 |All | + * |src0 |src1 |dst | + * |:---------------------------|:--------------|:---------------------------| + * |All except SIZET <= 32-bits |U32, S32 |All except SIZET <= 32-bits | * - * @param[in] input Input tensor. Data types supported: All + * @param[in] input Input tensor. Data types supported: All except SIZET <= 32-bit data types * @param[out] output Output tensor. Data type supported: Same as @p input * @param[in] axis Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32/S32 * @param[in] use_inverted_axis Reverse ACL axis indices convention, if true, (inverted)axis = (tensor_rank - 1) - axis @@ -60,12 +60,7 @@ class NEReverse : public INESimpleFunctionNoBorder void configure(const ITensor *input, ITensor *output, const ITensor *axis, const bool use_inverted_axis = false); /** Static function to check if given info will lead to a valid configuration of NEReverseKernel * - * @param[in] input Input tensor info. Data types supported: All - * @param[in] output Output tensor info. Data type supported: Same as @p input - * @param[in] axis Axis tensor info. Contains the indices of the dimensions to reverse. 
Data type supported: U32/S32 - * @param[in] use_inverted_axis Reverse ACL axis indices convention, if true, (inverted)axis = (tensor_rank - 1) - axis - * - * @return a status + * Similar to @ref NEReverse::configure() */ static Status validate(const ITensorInfo *input, const ITensorInfo *output, diff --git a/arm_compute/runtime/TensorAllocator.h b/arm_compute/runtime/TensorAllocator.h index d819931415..f25108d747 100644 --- a/arm_compute/runtime/TensorAllocator.h +++ b/arm_compute/runtime/TensorAllocator.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2019, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_TENSORALLOCATOR_H -#define ARM_COMPUTE_TENSORALLOCATOR_H +#ifndef ACL_ARM_COMPUTE_RUNTIME_TENSORALLOCATOR_H +#define ACL_ARM_COMPUTE_RUNTIME_TENSORALLOCATOR_H #include "arm_compute/runtime/ITensorAllocator.h" #include "arm_compute/runtime/Memory.h" #include "arm_compute/runtime/MemoryGroup.h" @@ -84,6 +84,8 @@ class TensorAllocator : public ITensorAllocator */ void allocate() override; + bool is_allocated() const override; + /** Free allocated CPU memory. * * @note The tensor must have been allocated when calling this function. @@ -126,4 +128,4 @@ class TensorAllocator : public ITensorAllocator Memory _memory; /**< CPU memory */ }; } // namespace arm_compute -#endif /* ARM_COMPUTE_TENSORALLOCATOR_H */ +#endif // ACL_ARM_COMPUTE_RUNTIME_TENSORALLOCATOR_H diff --git a/arm_compute/runtime/experimental/operators/CpuMul.h b/arm_compute/runtime/experimental/operators/CpuMul.h index d5ef33d08b..10a9c40a46 100644 --- a/arm_compute/runtime/experimental/operators/CpuMul.h +++ b/arm_compute/runtime/experimental/operators/CpuMul.h @@ -56,27 +56,7 @@ class CpuMul : public INEOperator ~CpuMul() override; /** Initialise the kernel's inputs, dst and convertion policy. 
* - * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported. - * For all other scale values only round to zero (implemented as round towards minus infinity) is supported. - * - * @param[in, out] src1 First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32 - * This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. - * @param[in, out] src2 Second input tensor info. Data types supported: U8, QASYMM8 (only if @p src1 is QASYMM8), QASYMM8_SIGNED (only if @p src1 is QASYMM8_SIGNED), S16, S32, QSYMM16 (only if @p src1 is QSYMM16), F16 (only if @p src1 is F16), F32 (only if @p src1 is F32). - * This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. - * @param[out] dst dst tensor info. Data types supported: - * - U8, only if both inputs are U8. - * - QASYMM8, only if both inputs are QASYMM8. - * - QASYMM8_SIGNED, only if @p src1 is QASYMM8_SIGNED. - * - S16. - * - QSYMM16, only if both inputs are QSYMM16. - * - S32, only if both inputs are S32 or both are QSYMM16. - * - F16, only if @p src1 is F16. - * - F32, only if both inputs are F32. - * @param[in] scale Scale to apply after multiplication. - * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. - * If both @p src1, @p src2 and @p dst are of datatype S32, scale cannot be 1/255 - * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if any of the inputs is of quantized datatype - * @param[in] rounding_policy Rounding policy. @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. 
+ * Similar to @ref NEPixelWiseMultiplication::configure() */ void configure(ITensorInfo *src1, ITensorInfo *src2, @@ -87,7 +67,7 @@ class CpuMul : public INEOperator const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * - * Similar to @ref CpuMul::configure() + * Similar to @ref NEPixelWiseMultiplication::validate() * * @return a status */ diff --git a/arm_compute/runtime/experimental/operators/CpuSoftmax.h b/arm_compute/runtime/experimental/operators/CpuSoftmax.h new file mode 100644 index 0000000000..1ac94288fd --- /dev/null +++ b/arm_compute/runtime/experimental/operators/CpuSoftmax.h @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2021-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef ACL_ARM_COMPUTE_RUNTIME_EXPERIMENTAL_OPERATORS_CPUSOFTMAX_H +#define ACL_ARM_COMPUTE_RUNTIME_EXPERIMENTAL_OPERATORS_CPUSOFTMAX_H + +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/runtime/IOperator.h" + +#include <memory> + +namespace arm_compute +{ +namespace experimental +{ +namespace op +{ +class CpuSoftmaxKernel; + +/* + * A shallow wrapper for arm_compute::cpu::CpuSoftmaxGeneric. + * Any new features should be added to arm_compute::cpu::CpuSoftmaxGeneric + * and arm_compute::experimental::op::CpuSoftmax should remain a shallow wrapper. + */ +class CpuSoftmax : public IOperator +{ +public: + /** Constructor **/ + CpuSoftmax(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CpuSoftmax(const CpuSoftmax &) = delete; + /** Prevent copy assignment */ + CpuSoftmax &operator=(const CpuSoftmax &) = delete; + /** Default move constructor */ + CpuSoftmax(CpuSoftmax &&) = default; + /** Default move assignment */ + CpuSoftmax &operator=(CpuSoftmax &&) = default; + /** Default destructor */ + ~CpuSoftmax() override; + /** Set the input and output tensors. + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |QASYMM8 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED | + * |F16 |F16 | + * |F32 |F32 | + * + * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * last value of each row to the nearest multiple. + * @param[out] dst Destination tensor info. Data types supported: same as @p src. + * @param[in] beta (Optional) A scaling factor for the exponent. + * @param[in] axis (Optional) The dimension in which to apply the function. E.g. for input of shape 4x5x6 and + * axis=1, softmax will be applied to 4x6=24 vectors of size 5. 
Defaults to 0 + * @param[in] is_log True if the operation is log-softmax + */ + void configure(const ITensorInfo *src, ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0, bool is_log = false); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuSoftmax::configure() + * + * @return a status + */ + static Status + validate(const ITensorInfo *src, const ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0, bool is_log = false); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + experimental::MemoryRequirements workspace() const override; + + // Unused + void prepare(ITensorPack &constants) override; + +private: + struct Impl; + std::unique_ptr<Impl> impl_; +}; + +} // namespace op +} // namespace experimental +} // namespace arm_compute +#endif // ACL_ARM_COMPUTE_RUNTIME_EXPERIMENTAL_OPERATORS_CPUSOFTMAX_H diff --git a/docs/Doxyfile b/docs/Doxyfile index 57f15d0a78..d92a65f340 100644 --- a/docs/Doxyfile +++ b/docs/Doxyfile @@ -60,7 +60,7 @@ PROJECT_NAME = "Compute Library" # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = 24.08.1 +PROJECT_NUMBER = 24.09 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a diff --git a/docs/user_guide/how_to_build_and_run_examples.dox b/docs/user_guide/how_to_build_and_run_examples.dox index 88ccc3d5c8..39b3a2ed6b 100644 --- a/docs/user_guide/how_to_build_and_run_examples.dox +++ b/docs/user_guide/how_to_build_and_run_examples.dox @@ -322,11 +322,9 @@ In this case the first argument of LeNet (like all the graph examples) is the ta @section S1_4_macos Building for macOS -The library was successfully natively built for Apple Silicon under macOS 11.1 using clang v12.0.0. 
- To natively compile the library with accelerated CPU support: - scons Werror=1 -j8 neon=1 opencl=0 os=macos arch=armv8a build=native + scons Werror=1 -j8 neon=1 opencl=0 os=macos arch=armv8.2-a build=native @note Initial support disables feature discovery through HWCAPS and thread scheduling affinity controls diff --git a/docs/user_guide/operator_list.dox b/docs/user_guide/operator_list.dox index 8e828e88a4..f423260fb5 100644 --- a/docs/user_guide/operator_list.dox +++ b/docs/user_guide/operator_list.dox @@ -208,7 +208,6 @@ where N = batches, C = channels, H = height, W = width, D = depth QASYMM8QASYMM8QASYMM8 QASYMM8_SIGNEDQASYMM8_SIGNEDQASYMM8_SIGNED QSYMM16QSYMM16QASYMM16 - QSYMM16QSYMM16S32 U8U8U8 S16S16S16 S32S32S32 @@ -2319,8 +2318,9 @@ where N = batches, C = channels, H = height, W = width, D = depth U8S16S16 S16U8S16 S16S16S16 + S32S32S32 F16F16F16 - F32S32F32 + F32F32F32 CLPixelWiseMultiplication @@ -2752,7 +2752,7 @@ where N = batches, C = channels, H = height, W = width, D = depth
src0src1dst -
AllU32, S32All +
All except SIZET <= 32-bitsU32, S32All except SIZET <= 32-bits
CLReverse diff --git a/filelist.json b/filelist.json index e1de9e0511..5b49a68692 100644 --- a/filelist.json +++ b/filelist.json @@ -120,10 +120,7 @@ ], "gpu": { "common": [ - "src/core/CL/CLCommandBuffer.cpp", - "src/core/CL/CLCompatCommandBuffer.cpp", "src/core/CL/CLCompileContext.cpp", - "src/core/CL/CLMutableCommandBuffer.cpp", "src/core/CL/DefaultLWSHeuristics.cpp", "src/core/CL/CLHelpers.cpp", "src/core/CL/CLKernelLibrary.cpp", @@ -899,6 +896,7 @@ "common": [ "src/cpu/operators/CpuActivation.cpp", "src/cpu/kernels/CpuActivationKernel.cpp", + "src/cpu/kernels/activation/heuristics/CpuActivationKernelHeuristics.cpp", "src/runtime/NEON/functions/NEActivationLayer.cpp" ], "neon": { @@ -958,8 +956,10 @@ "fp16":["src/cpu/kernels/add/generic/sve/fp16.cpp"] }, "sve2": { + "common": ["src/cpu/kernels/add/generic/sme2/impl.cpp"], "qasymm8": [ "src/cpu/kernels/add/generic/sve2/qasymm8.cpp" ], - "qasymm8_signed": [ "src/cpu/kernels/add/generic/sve2/qasymm8_signed.cpp" ], + "qasymm8_signed": [ "src/cpu/kernels/add/generic/sve2/qasymm8_signed.cpp", + "src/cpu/kernels/add/generic/sme2/qasymm8_signed.cpp" ], "qsymm16": [ "src/cpu/kernels/add/generic/sve2/qsymm16.cpp" ] } } @@ -1586,7 +1586,7 @@ } }, "Gemm": { - "deps": [ "Quantize", "Add", "Sub"], + "deps": [ "Quantize", "Add", "Sub", "Softmax"], "files": { "common": [ "src/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp", @@ -1617,6 +1617,7 @@ "src/runtime/experimental/operators/CpuGemmConv2d.cpp", "src/runtime/experimental/operators/CpuGemmDirectConv2d.cpp", "src/runtime/experimental/operators/CpuMul.cpp", + "src/runtime/experimental/operators/CpuSoftmax.cpp", "src/runtime/experimental/operators/CpuSub.cpp", "src/runtime/experimental/operators/CpuTranspose.cpp", "src/runtime/experimental/operators/CpuWinogradConv2d.cpp" diff --git a/src/BUILD.bazel b/src/BUILD.bazel index bbfb463d54..4aa157efd5 100644 --- a/src/BUILD.bazel +++ b/src/BUILD.bazel @@ -110,6 +110,8 @@ filegroup( 
"cpu/kernels/activation/generic/sve2/qasymm8.cpp", "cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp", "cpu/kernels/activation/generic/sve2/qsymm16.cpp", + "cpu/kernels/add/generic/sme2/impl.cpp", + "cpu/kernels/add/generic/sme2/qasymm8_signed.cpp", "cpu/kernels/add/generic/sve2/qasymm8.cpp", "cpu/kernels/add/generic/sve2/qasymm8_signed.cpp", "cpu/kernels/add/generic/sve2/qsymm16.cpp", @@ -743,6 +745,7 @@ filegroup( "cpu/kernels/activation/generic/neon/qasymm8.cpp", "cpu/kernels/activation/generic/neon/qasymm8_signed.cpp", "cpu/kernels/activation/generic/neon/qsymm16.cpp", + "cpu/kernels/activation/heuristics/CpuActivationKernelHeuristics.cpp", "cpu/kernels/add/generic/neon/fp16.cpp", "cpu/kernels/add/generic/neon/fp32.cpp", "cpu/kernels/add/generic/neon/impl.cpp", @@ -1044,6 +1047,7 @@ filegroup( "runtime/experimental/operators/CpuGemmConv2d.cpp", "runtime/experimental/operators/CpuGemmDirectConv2d.cpp", "runtime/experimental/operators/CpuMul.cpp", + "runtime/experimental/operators/CpuSoftmax.cpp", "runtime/experimental/operators/CpuSub.cpp", "runtime/experimental/operators/CpuTranspose.cpp", "runtime/experimental/operators/CpuWinogradConv2d.cpp"] + diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 22198050e4..58eca30847 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -337,6 +337,8 @@ target_sources( cpu/kernels/activation/generic/sve2/qasymm8.cpp cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp cpu/kernels/activation/generic/sve2/qsymm16.cpp + cpu/kernels/add/generic/sme2/impl.cpp + cpu/kernels/add/generic/sme2/qasymm8_signed.cpp cpu/kernels/add/generic/sve2/qasymm8.cpp cpu/kernels/add/generic/sve2/qasymm8_signed.cpp cpu/kernels/add/generic/sve2/qsymm16.cpp @@ -734,6 +736,7 @@ target_sources( cpu/kernels/activation/generic/neon/qasymm8.cpp cpu/kernels/activation/generic/neon/qasymm8_signed.cpp cpu/kernels/activation/generic/neon/qsymm16.cpp + cpu/kernels/activation/heuristics/CpuActivationKernelHeuristics.cpp 
cpu/kernels/add/generic/neon/fp16.cpp cpu/kernels/add/generic/neon/fp32.cpp cpu/kernels/add/generic/neon/impl.cpp @@ -1035,6 +1038,7 @@ target_sources( runtime/experimental/operators/CpuGemmConv2d.cpp runtime/experimental/operators/CpuGemmDirectConv2d.cpp runtime/experimental/operators/CpuMul.cpp + runtime/experimental/operators/CpuSoftmax.cpp runtime/experimental/operators/CpuSub.cpp runtime/experimental/operators/CpuTranspose.cpp runtime/experimental/operators/CpuWinogradConv2d.cpp diff --git a/src/common/cpuinfo/CpuInfo.cpp b/src/common/cpuinfo/CpuInfo.cpp index d46d8d7773..2352e27a17 100644 --- a/src/common/cpuinfo/CpuInfo.cpp +++ b/src/common/cpuinfo/CpuInfo.cpp @@ -39,6 +39,12 @@ #if !defined(_WIN64) #include /* C++ std::regex takes up a lot of space in the standalone builds */ #include +#else /* !defined(_WIN64) */ +// clang-format off +#include +#include +#include +// clang-format on #endif /* !defined(_WIN64) */ #include @@ -411,7 +417,15 @@ CpuInfo CpuInfo::build() #elif defined(__aarch64__) && defined(_WIN64) /* #elif defined(__aarch64__) && defined(__APPLE__) */ CpuIsaInfo isainfo; isainfo.neon = true; - CpuInfo info(isainfo, {CpuModel::GENERIC}); + if (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)) + { + isainfo.dot = true; + } + SYSTEM_INFO sysinfo; + GetSystemInfo(&sysinfo); + const int ncpus = sysinfo.dwNumberOfProcessors; + std::vector cpus_model(ncpus); + CpuInfo info(isainfo, cpus_model); return info; #else /* #elif defined(__aarch64__) && defined(_WIN64) */ CpuInfo info(CpuIsaInfo(), {CpuModel::GENERIC}); diff --git a/src/core/CL/CLCommandBuffer.h b/src/core/CL/CLCommandBuffer.h deleted file mode 100644 index 90e434161e..0000000000 --- a/src/core/CL/CLCommandBuffer.h +++ /dev/null @@ -1,163 +0,0 @@ -/* - * Copyright (c) 2023 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef ACL_SRC_CORE_CL_CLCOMMANDBUFFER_H -#define ACL_SRC_CORE_CL_CLCOMMANDBUFFER_H - -#include "arm_compute/core/CL/OpenCL.h" - -#include -#include -#include - -namespace arm_compute -{ - -/** Command buffer contains a list of commands that is constructed once and later enqueued multiple times. - * - * To prepare a command buffer: - * - Construct a new command buffer targeting a command queue using @ref CLCommandBuffer::create. - * - Add kernel enqueue command to the buffer using @ref CLCommandBuffer::add_kernel. - * The kernel must be ready to be enqueued with all the arguments set. - * - Specify which kernel argument is mutable after the command buffer has been finalized. - * - When all the kernel enqueue commands have been added, call @ref CLCommandBuffer::finalize. - * After this point the command buffer is ready to be executed. 
- * - * To execute the command buffer: - * - Make any changes in the value which the mutable arguments are pointing to. - * - Call @ref CLCommandBuffer::update to apply the argument value changes. - * - Call @ref CLCommandBuffer::enqueue to enqueue the command buffer to execute. - */ -class CLCommandBuffer -{ -public: - /** Create a new command buffer targeting the specified command queue. - * - * @param[in] queue The command queue to execute the command buffer. - * - * @return A unique pointer to the newly created command buffer. - */ - static std::unique_ptr create(cl_command_queue queue); - - /** Constructor. */ - CLCommandBuffer(); - - /** Destructor. */ - virtual ~CLCommandBuffer(); - - /** Disallow copy constructor. */ - CLCommandBuffer(const CLCommandBuffer &) = delete; - - /** Disallow copy assignment. */ - CLCommandBuffer &operator=(const CLCommandBuffer &) = delete; - - /** Disallow move constructor. */ - CLCommandBuffer(CLCommandBuffer &&other) = delete; - - /** Disallow move assignment. */ - CLCommandBuffer &operator=(CLCommandBuffer &&other) = delete; - - /** Add a kernel enqueue command to the command queue. - * - * This function must be called before the command buffer has been finalized. - * - * @param[in] kernel The CL kernel. - * @param[in] offset The global work offset. - * @param[in] global The global work size. - * @param[in] local The local work size. - */ - virtual void - add_kernel(cl_kernel kernel, const cl::NDRange &offset, const cl::NDRange &global, const cl::NDRange &local) = 0; - - /** Add the mutable argument to the current kernel enqueue command. - * - * This function must be called after @ref CLCommandBuffer::add_kernel but before the command buffer - * has been finalized. - * - * The pointer must be valid and it must point to the correct value at the time - * @ref CLCommandBuffer::update is called so that the value of the argument - * can be applied successfully to the kernel enqueue command. 
- * - * @param[in] arg_idx The index of the argument in the current kernel program. - * @param[in] value The pointer to the value of the argument. - */ - template ::value || std::is_pointer::value>> - void add_mutable_argument(cl_uint arg_idx, const T *value) - { - add_mutable_argument_generic(arg_idx, value, sizeof(T)); - } - - /** Finalize the command buffer. */ - virtual void finalize() = 0; - - /** Update the command buffer with new kernel argument values. - * - * This function must be called after the command buffer has been finalized. - * - * All the value pointed by the mutable argument will be applied to the command buffer. - */ - virtual void update() = 0; - - /** Enqueue the command buffer. - * - * This function must be called after the command buffer has been finalized. - */ - virtual void enqueue() = 0; - - /** Check if the command buffer has been finalized. - * - * @return true if the command buffer has been finalized. - */ - virtual bool is_finalized() const = 0; - -protected: - /** Add the mutable argument to the current kernel enqueue command. - * - * @see CLCommandBuffer::add_mutable_argument for more information. - */ - virtual void add_mutable_argument_generic(cl_uint arg_idx, const void *value, size_t size) = 0; - - /** The state of the command buffer. */ - enum class State : int32_t - { - /** The command buffer has been created and is being specified. */ - Created, - - /** The command buffer has been finalized and is ready to be executed. */ - Finalized, - }; - - /** Get the state of the command buffer. */ - State state() const; - - /** Set the state of the command buffer. 
*/ - CLCommandBuffer &state(State state); - -private: - State _state{State::Created}; -}; - -} // namespace arm_compute - -#endif // ACL_SRC_CORE_CL_CLCOMMANDBUFFER_H diff --git a/src/core/CL/CLCompatCommandBuffer.cpp b/src/core/CL/CLCompatCommandBuffer.cpp deleted file mode 100644 index 242fd7719c..0000000000 --- a/src/core/CL/CLCompatCommandBuffer.cpp +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright (c) 2023 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "src/core/CL/CLCompatCommandBuffer.h" - -#include "arm_compute/core/Error.h" - -#include "src/core/CL/CLUtils.h" - -namespace arm_compute -{ - -CLCompatCommandBuffer::CLCompatCommandBuffer(cl_command_queue queue) : _queue(queue) -{ -} - -CLCompatCommandBuffer::~CLCompatCommandBuffer() -{ -} - -void CLCompatCommandBuffer::add_kernel(cl_kernel kernel, - const cl::NDRange &offset, - const cl::NDRange &global, - const cl::NDRange &local) -{ - ARM_COMPUTE_ERROR_ON(state() != State::Created); - - _kernel_cmds.push_back(KernelCommand{kernel, offset, global, local, {}}); -} - -void CLCompatCommandBuffer::add_mutable_argument_generic(cl_uint arg_idx, const void *value, size_t size) -{ - ARM_COMPUTE_ERROR_ON(state() != State::Created); - ARM_COMPUTE_ERROR_ON(_kernel_cmds.empty()); - - _kernel_cmds.back().mutable_args.push_back(cl_mutable_dispatch_arg_khr{arg_idx, size, value}); -} - -void CLCompatCommandBuffer::finalize() -{ - ARM_COMPUTE_ERROR_ON(state() != State::Created); - - _kernel_cmds.shrink_to_fit(); - - for (auto &cmd : _kernel_cmds) - { - cmd.mutable_args.shrink_to_fit(); - } - - state(State::Finalized); -} - -void CLCompatCommandBuffer::update() -{ - ARM_COMPUTE_ERROR_ON(state() != State::Finalized); - - // Nothing to do here - The kernel arguments will be updated when each command is enqueued. -} - -void CLCompatCommandBuffer::enqueue() -{ - ARM_COMPUTE_ERROR_ON(state() != State::Finalized); - - for (const auto &cmd : _kernel_cmds) - { - for (const auto &arg : cmd.mutable_args) - { - const auto error = clSetKernelArg(cmd.kernel, arg.arg_index, arg.arg_size, arg.arg_value); - - handle_cl_error("clSetKernelArg", error); - } - - const auto error = - clEnqueueNDRangeKernel(_queue, cmd.kernel, static_cast(cmd.global.dimensions()), - cmd.offset.dimensions() != 0 ? cmd.offset.get() : nullptr, cmd.global.get(), - cmd.local.dimensions() != 0 ? 
cmd.local.get() : nullptr, 0, nullptr, nullptr); - - handle_cl_error("clEnqueueNDRangeKernel", error); - } -} - -bool CLCompatCommandBuffer::is_finalized() const -{ - return state() == State::Finalized; -} - -} // namespace arm_compute diff --git a/src/core/CL/CLCompatCommandBuffer.h b/src/core/CL/CLCompatCommandBuffer.h deleted file mode 100644 index d5df106425..0000000000 --- a/src/core/CL/CLCompatCommandBuffer.h +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright (c) 2023 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef ACL_SRC_CORE_CL_CLCOMPATCOMMANDBUFFER_H -#define ACL_SRC_CORE_CL_CLCOMPATCOMMANDBUFFER_H - -#include "src/core/CL/CLCommandBuffer.h" - -#include - -namespace arm_compute -{ - -/** Command buffer implementation for platform without mutable dispatch command buffer extension. 
*/ -class CLCompatCommandBuffer final : public CLCommandBuffer -{ -public: - /** Create a new command buffer targeting the specified command queue. - * - * @param[in] queue The command queue to execute the command buffer. - */ - CLCompatCommandBuffer(cl_command_queue queue); - - /** Destructor. */ - virtual ~CLCompatCommandBuffer(); - - /** Disallow copy constructor. */ - CLCompatCommandBuffer(const CLCompatCommandBuffer &) = delete; - - /** Disallow copy assignment. */ - CLCompatCommandBuffer &operator=(const CLCompatCommandBuffer &) = delete; - - /** Disallow move constructor. */ - CLCompatCommandBuffer(CLCompatCommandBuffer &&) = delete; - - /** Disallow move assignment. */ - CLCompatCommandBuffer &operator=(CLCompatCommandBuffer &&) = delete; - - void add_kernel(cl_kernel kernel, - const cl::NDRange &offset, - const cl::NDRange &global, - const cl::NDRange &local) override; - - void finalize() override; - - void update() override; - - void enqueue() override; - - bool is_finalized() const override; - -protected: - void add_mutable_argument_generic(cl_uint arg_idx, const void *value, size_t size) override; - -private: - struct KernelCommand - { - cl_kernel kernel; - cl::NDRange offset; - cl::NDRange global; - cl::NDRange local; - - std::vector mutable_args; - }; - -private: - cl_command_queue _queue{}; - std::vector _kernel_cmds{}; -}; - -} // namespace arm_compute - -#endif // ACL_SRC_CORE_CL_CLCOMPATCOMMANDBUFFER_H diff --git a/src/core/CL/CLMutableCommandBuffer.cpp b/src/core/CL/CLMutableCommandBuffer.cpp deleted file mode 100644 index 0e078d8416..0000000000 --- a/src/core/CL/CLMutableCommandBuffer.cpp +++ /dev/null @@ -1,151 +0,0 @@ -/* - * Copyright (c) 2023 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "src/core/CL/CLMutableCommandBuffer.h" - -#include "arm_compute/core/Error.h" - -#include "src/common/utils/Log.h" -#include "src/core/CL/CLUtils.h" - -namespace arm_compute -{ - -CLMutableCommandBuffer::CLMutableCommandBuffer(cl_command_queue queue) : CLCommandBuffer() -{ - cl_int status = CL_SUCCESS; - - cl_command_buffer_properties_khr properties[] = { - CL_COMMAND_BUFFER_FLAGS_KHR, - CL_COMMAND_BUFFER_MUTABLE_KHR, - 0, - }; - - _cb = clCreateCommandBufferKHR(1, &queue, properties, &status); - handle_cl_error("clCreateCommandBufferKHR", status); -} - -CLMutableCommandBuffer::~CLMutableCommandBuffer() -{ - const auto status = clReleaseCommandBufferKHR(_cb); - if (status != CL_SUCCESS) - { - const std::string error_message = "clReleaseCommandBufferKHR - Error code: " + std::to_string(status); - ARM_COMPUTE_LOG_ERROR_ACL(error_message); - } -} - -void CLMutableCommandBuffer::add_kernel(cl_kernel kernel, - const cl::NDRange &offset, - const cl::NDRange &global, - const cl::NDRange &local) -{ - ARM_COMPUTE_ERROR_ON(state() != State::Created); - - cl_mutable_command_khr mutable_handle = nullptr; - - cl_ndrange_kernel_command_properties_khr properties[] = { - CL_MUTABLE_DISPATCH_UPDATABLE_FIELDS_KHR, - CL_MUTABLE_DISPATCH_ARGUMENTS_KHR, - 0, - }; - - const auto error = clCommandNDRangeKernelKHR( - _cb, nullptr, properties, kernel, global.dimensions(), offset.dimensions() != 0 ? offset.get() : nullptr, - global.get(), local.dimensions() != 0 ? 
local.get() : nullptr, 0, nullptr, nullptr, &mutable_handle); - - handle_cl_error("clCommandNDRangeKernelKHR", error); - - cl_mutable_dispatch_config_khr mut_dispatch_cfg{}; - mut_dispatch_cfg.type = CL_STRUCTURE_TYPE_MUTABLE_DISPATCH_CONFIG_KHR; - mut_dispatch_cfg.command = mutable_handle; - - _mut_dispatch_cfgs.emplace_back(mut_dispatch_cfg); -} - -void CLMutableCommandBuffer::add_mutable_argument_generic(cl_uint arg_idx, const void *value, size_t size) -{ - ARM_COMPUTE_ERROR_ON(state() != State::Created); - - cl_mutable_dispatch_arg_khr cfg{}; - cfg.arg_index = arg_idx; - cfg.arg_size = size; - cfg.arg_value = value; - - _mut_arg_cfgs.emplace_back(cfg); - ++_mut_dispatch_cfgs.back().num_args; -} - -void CLMutableCommandBuffer::finalize() -{ - ARM_COMPUTE_ERROR_ON(state() != State::Created); - - const auto error = clFinalizeCommandBufferKHR(_cb); - handle_cl_error("clFinalizeCommandBufferKHR", error); - - state(State::Finalized); - - _mut_dispatch_cfgs.shrink_to_fit(); - _mut_arg_cfgs.shrink_to_fit(); - - size_t arg_no = 0; - - for (auto &mut_dispatch_cfg : _mut_dispatch_cfgs) - { - ARM_COMPUTE_ERROR_ON(arg_no >= _mut_arg_cfgs.size()); - mut_dispatch_cfg.arg_list = &_mut_arg_cfgs[arg_no]; - - arg_no += mut_dispatch_cfg.num_args; - } - - _mut_cfg.type = CL_STRUCTURE_TYPE_MUTABLE_BASE_CONFIG_KHR; - _mut_cfg.next = nullptr; - _mut_cfg.num_mutable_dispatch = _mut_dispatch_cfgs.size(); - _mut_cfg.mutable_dispatch_list = &_mut_dispatch_cfgs[0]; -} - -void CLMutableCommandBuffer::update() -{ - ARM_COMPUTE_ERROR_ON(state() != State::Finalized); - - const auto error = clUpdateMutableCommandsKHR(_cb, &_mut_cfg); - - handle_cl_error("clUpdateMutableCommandsKHR", error); -} - -void CLMutableCommandBuffer::enqueue() -{ - ARM_COMPUTE_ERROR_ON(state() != State::Finalized); - - const auto error = clEnqueueCommandBufferKHR(0, nullptr, _cb, 0, nullptr, nullptr); - - handle_cl_error("clEnqueueCommandBufferKHR", error); -} - -bool CLMutableCommandBuffer::is_finalized() const -{ - 
return state() == State::Finalized; -} - -} // namespace arm_compute diff --git a/src/core/CL/CLMutableCommandBuffer.h b/src/core/CL/CLMutableCommandBuffer.h deleted file mode 100644 index 8997d7d1fd..0000000000 --- a/src/core/CL/CLMutableCommandBuffer.h +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright (c) 2023 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef ACL_SRC_CORE_CL_CLMUTABLECOMMANDBUFFER_H -#define ACL_SRC_CORE_CL_CLMUTABLECOMMANDBUFFER_H - -#include "src/core/CL/CLCommandBuffer.h" - -#include - -namespace arm_compute -{ - -/** Command buffer implementaton based on CL mutable dispatch command buffer extension. */ -class CLMutableCommandBuffer : public CLCommandBuffer -{ -public: - /** Create a new mutable dispatch command buffer targeting the specified command queue. - * - * @param[in] queue The command queue to execute the command buffer. 
- */ - CLMutableCommandBuffer(cl_command_queue queue); - - /** Destructor. */ - virtual ~CLMutableCommandBuffer(); - - /** Disallow copy constructor. */ - CLMutableCommandBuffer(const CLMutableCommandBuffer &) = delete; - - /** Disallow copy assignment. */ - CLMutableCommandBuffer &operator=(const CLMutableCommandBuffer &) = delete; - - /** Disallow move constructor. */ - CLMutableCommandBuffer(CLMutableCommandBuffer &&) = delete; - - /** Disallow move assignment. */ - CLMutableCommandBuffer &operator=(CLMutableCommandBuffer &&) = delete; - - void add_kernel(cl_kernel kernel, - const cl::NDRange &offset, - const cl::NDRange &global, - const cl::NDRange &local) override; - - void finalize() override; - - void update() override; - - void enqueue() override; - - bool is_finalized() const override; - -protected: - void add_mutable_argument_generic(cl_uint arg_idx, const void *value, size_t size) override; - -private: - cl_command_buffer_khr _cb{}; - cl_mutable_base_config_khr _mut_cfg{}; - std::vector _mut_dispatch_cfgs{}; - std::vector _mut_arg_cfgs{}; -}; - -} // namespace arm_compute - -#endif // ACL_SRC_CORE_CL_CLMUTABLECOMMANDBUFFER_H diff --git a/src/core/CL/cl_kernels/nchw/pooling_layer.cl b/src/core/CL/cl_kernels/nchw/pooling_layer.cl index 15ad116289..bd59e61ef8 100644 --- a/src/core/CL/cl_kernels/nchw/pooling_layer.cl +++ b/src/core/CL/cl_kernels/nchw/pooling_layer.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -39,7 +39,9 @@ #define POW2_OP(x, vec_size) (x) #endif /* defined(POOL_L2) */ +// Compatible with Cpu backend: Round to nearest ties to even #define DIV_OP(x, y) (x * (1.f / y)) +#define DIV_INT(x, y) convert_int_rte(DIV_OP(x, y)) #define SQRT_OP(x) sqrt((x)) #if defined(FP_MIXED_PRECISION) || defined(QUANTIZED) @@ -132,7 +134,7 @@ __kernel void pooling_layer_MxN_nchw( src_x = clamp(src_x, 0, SRC_WIDTH - 1); VEC_DATA_TYPE(ACC_DATA_TYPE, 8) data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)(src_addr + src_x.s0 * sizeof(DATA_TYPE) + y * src_stride_y)); -#endif // defined(POOL_AVG) || defined(POOL_L2 +#endif // defined(POOL_AVG) || defined(POOL_L2) #if defined(POOL_L2) // Raise to power of 2 for L2 Pooling @@ -176,7 +178,12 @@ __kernel void pooling_layer_MxN_nchw( #if defined(POOL_AVG) || defined(POOL_L2) // Divide by pool region in case of average pooling - res = DIV_OP(res, calculate_avg_scale(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y)); + const ACC_DATA_TYPE avg_scale = calculate_avg_scale(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); +#if defined(QUANTIZED) + res = DIV_INT(res, avg_scale); +#else // defined(QUANTIZED) + res = DIV_OP(res, avg_scale); +#endif // defined(QUANTIZED) #endif /* defined(POOL_AVG) || defined(POOL_L2) */ #if defined(QUANTIZED) @@ -282,4 +289,4 @@ __kernel void pooling_layer_2_nchw_indices( *(__global uint *)(indices_ptr + indices_offset_first_element_in_bytes + id0 * sizeof(uint) + id1 * indices_stride_y + id2 * indices_stride_z) = index; #endif // defined(SRC_BATCH) -} \ No newline at end of file +} diff --git a/src/core/CL/cl_kernels/nhwc/pooling_3d_layer_quantized.cl b/src/core/CL/cl_kernels/nhwc/pooling_3d_layer_quantized.cl index abf0db9d07..a7b1ffd08e 100644 --- a/src/core/CL/cl_kernels/nhwc/pooling_3d_layer_quantized.cl +++ b/src/core/CL/cl_kernels/nhwc/pooling_3d_layer_quantized.cl @@ -1,5 +1,5 @@ 
/* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -32,14 +32,16 @@ #define SQRT_OP(x) sqrt((x)) +#define VEC_INT(VEC_SIZE) VEC_DATA_TYPE(int, VEC_SIZE) +#define CONVERT_RTE_STR(x, type) (convert_##type##_rte((x))) +#define CONVERT_RTE(x, type) CONVERT_RTE_STR(x, type) + #if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(SRC_DEPTH) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_DEPTH) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE) #if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y) && defined(POOL_SIZE_Z) #if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) #define VEC_FLOAT(VEC_SIZE) VEC_DATA_TYPE(float, VEC_SIZE) -#define VEC_INT(VEC_SIZE) VEC_DATA_TYPE(int, VEC_SIZE) -#define CONVERT_RTE(x, type) (convert_##type##_rte((x))) #define CONVERT_DOWN(x, type) CONVERT_RTE(x, type) #define REQUANTIZE(VEC_SIZE, input, in_offset, out_offset, in_scale, out_scale, res) \ { \ @@ -169,7 +171,8 @@ __kernel void pooling_3d_layer_MxN_ndhwc_quantized( } #if defined(POOL_AVG) - res0 = (res0 + (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))(filter_size >> 1)) / filter_size; + // Compatible with Cpu backend: Round to nearest ties to even + res0 = CONVERT_RTE(CONVERT(res0, VEC_DATA_TYPE(float, VEC_SIZE)) / filter_size, VEC_INT(VEC_SIZE)); #endif // defined(POOL_AVG) VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) diff --git a/src/core/CL/cl_kernels/nhwc/pooling_layer_quantized.cl b/src/core/CL/cl_kernels/nhwc/pooling_layer_quantized.cl index 46268a4a88..42899e1e50 100644 --- a/src/core/CL/cl_kernels/nhwc/pooling_layer_quantized.cl +++ b/src/core/CL/cl_kernels/nhwc/pooling_layer_quantized.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -26,10 +26,12 @@ #if defined(DATA_TYPE) && defined(INITIAL_VALUE) #define VEC_TYPE(VEC_SIZE) VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) +#define VEC_INT(VEC_SIZE) VEC_DATA_TYPE(int, VEC_SIZE) +#define CONVERT_RTE_STR(x, type) (convert_##type##_rte((x))) +#define CONVERT_RTE(x, type) CONVERT_RTE_STR(x, type) + #if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) #define VEC_FLOAT(VEC_SIZE) VEC_DATA_TYPE(float, VEC_SIZE) -#define VEC_INT(VEC_SIZE) VEC_DATA_TYPE(int, VEC_SIZE) -#define CONVERT_RTE(x, type) (convert_##type##_rte((x))) #define CONVERT_DOWN(x, type) CONVERT_RTE(x, type) #define REQUANTIZE(VEC_SIZE, input, in_offset, out_offset, in_scale, out_scale, res) \ { \ @@ -148,7 +150,8 @@ __kernel void pooling_layer_MxN_quantized_nhwc( } #if defined(POOL_AVG) - res0 = (res0 + (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))(filter_size >> 1)) / filter_size; + // Compatible with Cpu backend: Round to nearest ties to even + res0 = CONVERT_RTE(CONVERT(res0, VEC_DATA_TYPE(float, VEC_SIZE)) / filter_size, VEC_INT(VEC_SIZE)); #endif // defined(POOL_AVG) VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) @@ -161,4 +164,4 @@ __kernel void pooling_layer_MxN_quantized_nhwc( STORE_VECTOR_SELECT(out_q, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, ((VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0)); } #endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE) -#endif // defined(DATA_TYPE) && defined(INITIAL_VALUE) \ No newline at end of file +#endif // defined(DATA_TYPE) && defined(INITIAL_VALUE) diff --git a/src/core/CL/kernels/CLReverseKernel.cpp b/src/core/CL/kernels/CLReverseKernel.cpp index 00241b161b..9722441bdb 100644 --- a/src/core/CL/kernels/CLReverseKernel.cpp +++ b/src/core/CL/kernels/CLReverseKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021, 2023 Arm Limited. 
+ * Copyright (c) 2018-2021, 2023-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -45,7 +45,12 @@ validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const IT { ARM_COMPUTE_UNUSED(use_inverted_axis); ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, axis); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); + +#ifndef __aarch64__ + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->element_size() > 4, + "Only 32-bit and lower data types are supported in 32-bit builds"); +#endif // __aarch64__ + ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(axis, 1, DataType::U32, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis->num_dimensions() > 1, "Axis must be a 1D tensor"); diff --git a/src/core/NEON/NEMath.h b/src/core/NEON/NEMath.h index 9e81c38ad8..e03d6d537d 100644 --- a/src/core/NEON/NEMath.h +++ b/src/core/NEON/NEMath.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2022 Arm Limited. + * Copyright (c) 2016-2022, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,10 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef ARM_COMPUTE_NEMATH_H -#define ARM_COMPUTE_NEMATH_H +#ifndef ACL_SRC_CORE_NEON_NEMATH_H +#define ACL_SRC_CORE_NEON_NEMATH_H + +#include "arm_compute/core/Rounding.h" #include #include @@ -204,6 +206,7 @@ void convert_float32x4x3_to_uint8x8x3(const float32x4x3_t &in1, const float32x4x * @param[in] in Vector of float to be converted * @param[out] out Converted vector of uint8 to store the result */ +template void convert_float32x4x4_to_uint8x16(const float32x4x4_t &in, uint8x16_t &out); /** Converts from float32x4x4_t to just one int8x16_t @@ -211,9 +214,13 @@ void convert_float32x4x4_to_uint8x16(const float32x4x4_t &in, uint8x16_t &out); * @param[in] in Vector of float to be converted * @param[out] out Converted vector of uint8 to store the result */ +template void convert_float32x4x4_to_int8x16(const float32x4x4_t &in, int8x16_t &out); /** Converts from float vector to integer vector + * + * @note: Default rounding mode is "Round to Nearest with Ties to Even" + * if __aarch64__ is defined else "Round towards Zero" * * @param[in] in Float vector to converted * @@ -353,4 +360,4 @@ float16_t vreduce(const float16x8_t &v); #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ } // namespace arm_compute #include "src/core/NEON/NEMath.inl" -#endif /* ARM_COMPUTE_NEMATH_H */ +#endif // ACL_SRC_CORE_NEON_NEMATH_H diff --git a/src/core/NEON/NEMath.inl b/src/core/NEON/NEMath.inl index a5aba0bf23..d995b6e2fc 100644 --- a/src/core/NEON/NEMath.inl +++ b/src/core/NEON/NEMath.inl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2023 Arm Limited. + * Copyright (c) 2016-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -22,6 +22,11 @@ * SOFTWARE. 
*/ +#ifndef ACL_SRC_CORE_NEON_NEMATH_INL +#define ACL_SRC_CORE_NEON_NEMATH_INL + +#include "arm_compute/core/Error.h" + #include "src/core/utils/Math.h" #include "support/ToolchainSupport.h" @@ -492,25 +497,71 @@ inline void convert_float32x4x3_to_uint8x8x3(const float32x4x3_t &in1, const flo out.val[2] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[2])), vqmovn_u32(vcvtq_u32_f32(in2.val[2])))); } +template +inline uint32x4_t vconvert_to_uint(float32x4_t in) +{ + switch (policy) + { + case RoundingPolicy::TO_ZERO: + return vcvtq_u32_f32(in); +#ifdef __aarch64__ + case RoundingPolicy::TO_NEAREST_EVEN: + return vcvtnq_u32_f32(in); + case RoundingPolicy::TO_NEAREST_UP: + return vcvtaq_u32_f32(in); +#endif // __aarch64__ + default: + ARM_COMPUTE_ERROR("Unsupported Rounding Policy"); + } +} + +template +inline int32x4_t vconvert_to_int(float32x4_t in) +{ + switch (policy) + { + case RoundingPolicy::TO_ZERO: + return vcvtq_s32_f32(in); +#ifdef __aarch64__ + case RoundingPolicy::TO_NEAREST_EVEN: + return vcvtnq_s32_f32(in); + case RoundingPolicy::TO_NEAREST_UP: + return vcvtaq_s32_f32(in); +#endif // __aarch64__ + default: + ARM_COMPUTE_ERROR("Unsupported Rounding Policy"); + } +} + +template inline void convert_float32x4x4_to_uint8x16(const float32x4x4_t &in, uint8x16_t &out) { - const auto low = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[0])), vqmovn_u32(vcvtq_u32_f32(in.val[1]))); - const auto high = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[2])), vqmovn_u32(vcvtq_u32_f32(in.val[3]))); - out = vcombine_u8(vqmovn_u16(low), vqmovn_u16(high)); + const auto low = + vcombine_u16(vqmovn_u32(vconvert_to_uint(in.val[0])), vqmovn_u32(vconvert_to_uint(in.val[1]))); + const auto high = + vcombine_u16(vqmovn_u32(vconvert_to_uint(in.val[2])), vqmovn_u32(vconvert_to_uint(in.val[3]))); + out = vcombine_u8(vqmovn_u16(low), vqmovn_u16(high)); } +template inline void convert_float32x4x4_to_int8x16(const float32x4x4_t &in, int8x16_t &out) { - const auto low = 
vcombine_s16(vqmovn_s32(vcvtq_s32_f32(in.val[0])), vqmovn_s32(vcvtq_s32_f32(in.val[1]))); - const auto high = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(in.val[2])), vqmovn_s32(vcvtq_s32_f32(in.val[3]))); - out = vcombine_s8(vqmovn_s16(low), vqmovn_s16(high)); + const auto low = + vcombine_s16(vqmovn_s32(vconvert_to_int(in.val[0])), vqmovn_s32(vconvert_to_int(in.val[1]))); + const auto high = + vcombine_s16(vqmovn_s32(vconvert_to_int(in.val[2])), vqmovn_s32(vconvert_to_int(in.val[3]))); + out = vcombine_s8(vqmovn_s16(low), vqmovn_s16(high)); } template <> inline uint8x16_t convert_float_to_int(const float32x4x4_t &in) { uint8x16_t out; - convert_float32x4x4_to_uint8x16(in, out); +#ifdef __aarch64__ + convert_float32x4x4_to_uint8x16(in, out); +#else // __aarch64__ + convert_float32x4x4_to_uint8x16(in, out); +#endif // __aarch64__ return out; } @@ -524,7 +575,11 @@ template <> inline int8x16_t convert_float_to_int(const float32x4x4_t &in) { int8x16_t out; - convert_float32x4x4_to_int8x16(in, out); +#ifdef __aarch64__ + convert_float32x4x4_to_int8x16(in, out); +#else // __aarch64__ + convert_float32x4x4_to_int8x16(in, out); +#endif // __aarch64__ return out; } @@ -730,3 +785,5 @@ inline float16_t vreduce(const float16x8_t &v) #endif /* DOXYGEN_SKIP_THIS */ #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ } // namespace arm_compute + +#endif // ACL_SRC_CORE_NEON_NEMATH_INL diff --git a/src/core/NEON/kernels/NEReverseKernel.cpp b/src/core/NEON/kernels/NEReverseKernel.cpp index b3710555df..88a9fa5a90 100644 --- a/src/core/NEON/kernels/NEReverseKernel.cpp +++ b/src/core/NEON/kernels/NEReverseKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021, 2023 Arm Limited. + * Copyright (c) 2018-2021, 2023-2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -40,8 +40,13 @@ validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const IT { ARM_COMPUTE_UNUSED(use_inverted_axis); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, axis); - //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions. - ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); + + // No need to check for fp16 or bf16 support in the cpu as this kernel will only use unsigned integer data types + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->element_size() > 4, "Only 32-bit and lower data types are supported"); + + // size_t is not a portable type + ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN || input->data_type() == DataType::SIZET); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(axis, 1, DataType::U32, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis->num_dimensions() > 1, "Axis must be a 1D tensor"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 4, diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp index 532d08de92..5dfb5c0306 100644 --- a/src/core/Utils.cpp +++ b/src/core/Utils.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2023 Arm Limited. + * Copyright (c) 2016-2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -423,7 +423,7 @@ QuantizationInfo get_softmax_output_quantization_info(DataType input_type, bool // Note: Output quantization info for softmax should always have // * Softmax with QASYMM8: scale = 1/256, offset = 0 // * Softmax with QASYMM8_SIGNED: scale = 1/256, offset = -128 - // * LogSoftmax with QASYMM8: scale = 1/256, offset = 0 + // * LogSoftmax with QASYMM8: scale = 16/256, offset = 255 // * LogSoftmax with QASYMM8_SIGNED: scale = 16/256, offset = 127 if (is_data_type_quantized_asymmetric_signed(input_type)) { @@ -436,7 +436,7 @@ QuantizationInfo get_softmax_output_quantization_info(DataType input_type, bool return QuantizationInfo(1.f / 256, -128); } } - return QuantizationInfo(1.f / 256, 0); + return is_log ? QuantizationInfo(16.f / 256, 255) : QuantizationInfo(1.f / 256, 0); } std::pair get_quantized_activation_min_max(const ActivationLayerInfo &act_info, diff --git a/src/core/helpers/LUTManager.cpp b/src/core/helpers/LUTManager.cpp index 2effffbe92..62ad2bab6d 100644 --- a/src/core/helpers/LUTManager.cpp +++ b/src/core/helpers/LUTManager.cpp @@ -24,13 +24,22 @@ #include "src/core/helpers/LUTManager.h" +#include "src/common/utils/Validate.h" +#include "support/Bfloat16.h" + namespace arm_compute { #ifdef __aarch64__ namespace { -float16_t activation(float16_t x, const LUTInfo &info) +union Element +{ + uint16_t i = 0; + float16_t fp; +}; + +inline float16_t activation(float16_t x, const LUTInfo &info) { float16_t out = 0.f; switch (info.act) @@ -50,26 +59,51 @@ float16_t activation(float16_t x, const LUTInfo &info) return out; } -void init_lut_fp16(ActivationLayerInfo::LookupTable65536 *lut, const LUTInfo &info) +// Read bf16 value as u16, convert to fp32. 
+// Calculate exp in fp32, return as bf16 +inline uint16_t exponential(uint16_t x, const LUTInfo &info) { - union Element - { - uint16_t i = 0; - float16_t fp; - } item; + float fp = bf16_to_float(x); + fp = std::exp(fp * info.beta * -1); + return float_to_bf16(fp); +} - // Fill lut by iterating over all 16 bit values using the union. +void init_lut_16bit(LookupTable65536 *lut, const LUTInfo &info) +{ + // assert lut is valid config. + ARM_COMPUTE_ASSERT((info.type == LUTType::Activation && info.dt == DataType::F16) || + (info.type == LUTType::Exponential && info.dt == DataType::BFLOAT16)); + + Element item = {0}; // Fill lut by iterating over all 16 bit values using the union. + Element bf16 = {0}; // Temporary object used to store bf16 values as fp16 in lut while (true) { - (*lut)[item.i] = activation(item.fp, info); + switch (info.type) + { + case LUTType::Activation: + { + (*lut)[item.i] = activation(item.fp, info); + break; + } + case LUTType::Exponential: + { + bf16.i = exponential(item.i, info); + (*lut)[item.i] = bf16.fp; + break; + } + default: + ARM_COMPUTE_ERROR("Unsupported Activation for 16-bit LUT table"); + break; + } if (item.i == 65535) break; item.i++; } } + } // namespace -std::shared_ptr LUTManager::get_lut_table(LUTInfo info) +std::shared_ptr LUTManager::get_lut_table(LUTInfo info) { const auto itr = map_fp16.find(info); auto s_ptr = (itr != map_fp16.end()) ? itr->second.lock() : nullptr; // nullptr if invalid or not found. 
@@ -82,8 +116,8 @@ std::shared_ptr LUTManager::get_lut_table { // Not found, or pointer not valid // We do not use make_shared to prevent the weak_ptr keeping the control block alive - std::shared_ptr ptr(new ActivationLayerInfo::LookupTable65536); - init_lut_fp16(ptr.get(), info); + std::shared_ptr ptr(new LookupTable65536); + init_lut_16bit(ptr.get(), info); map_fp16[info] = ptr; return ptr; } diff --git a/src/core/helpers/LUTManager.h b/src/core/helpers/LUTManager.h index f3f4bf2832..226f44f360 100644 --- a/src/core/helpers/LUTManager.h +++ b/src/core/helpers/LUTManager.h @@ -34,28 +34,51 @@ namespace arm_compute { +#ifdef __aarch64__ +using LookupTable256 = std::array; +using LookupTable65536 = std::array; +#endif // __aarch64__ + +enum class LUTType +{ + Activation, // Determined by activation type + Exponential, // e^x +}; struct LUTInfo { - ActivationLayerInfo::ActivationFunction act; - float alpha; - float beta; - DataType dt; - UniformQuantizationInfo qinfo; + // For exponential lookup + LUTInfo(LUTType lut, float b, DataType type, UniformQuantizationInfo info) + : act(), alpha(1.0f), beta(b), dt(type), qinfo(info), type(lut) + { + } + + // For activation functions + LUTInfo(ActivationFunction func, float a, float b, DataType type, UniformQuantizationInfo info) + : act(func), alpha(a), beta(b), dt(type), qinfo(info), type(LUTType::Activation) + { + } // Operators enable use of map with Lutinfo as key friend bool operator<(const LUTInfo &l, const LUTInfo &r) { - const auto l_tup = std::make_tuple(l.act, l.alpha, l.beta, l.dt, l.qinfo.scale, l.qinfo.offset); - const auto r_tup = std::make_tuple(r.act, r.alpha, r.beta, r.dt, r.qinfo.scale, r.qinfo.offset); + const auto l_tup = std::make_tuple(l.type, l.act, l.alpha, l.beta, l.dt, l.qinfo.scale, l.qinfo.offset); + const auto r_tup = std::make_tuple(r.type, r.act, r.alpha, r.beta, r.dt, r.qinfo.scale, r.qinfo.offset); return l_tup < r_tup; } bool operator==(const LUTInfo &l) const { - return this->act == l.act 
&& this->alpha == l.alpha && this->beta == l.beta && this->dt == l.dt && - this->qinfo == l.qinfo; + return this->type == l.type && this->act == l.act && this->alpha == l.alpha && this->beta == l.beta && + this->dt == l.dt && this->qinfo == l.qinfo; } + + ActivationLayerInfo::ActivationFunction act; + float alpha; + float beta; + DataType dt; + UniformQuantizationInfo qinfo; + LUTType type; // Default is Activation. }; /* Class to handle getting look up table */ @@ -66,10 +89,10 @@ class LUTManager static LUTManager &get_instance(); #ifdef __aarch64__ - std::shared_ptr get_lut_table(LUTInfo info); + std::shared_ptr get_lut_table(LUTInfo info); private: - std::map> map_fp16{}; + std::map> map_fp16{}; #endif // __aarch64__ }; diff --git a/src/core/helpers/MemoryHelpers.h b/src/core/helpers/MemoryHelpers.h index dd094b414c..2182ec925b 100644 --- a/src/core/helpers/MemoryHelpers.h +++ b/src/core/helpers/MemoryHelpers.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef SRC_COMMON_MEMORY_HELPERS_H -#define SRC_COMMON_MEMORY_HELPERS_H +#ifndef ACL_SRC_CORE_HELPERS_MEMORYHELPERS_H +#define ACL_SRC_CORE_HELPERS_MEMORYHELPERS_H #include "arm_compute/core/experimental/Types.h" #include "arm_compute/core/ITensorPack.h" @@ -63,7 +63,8 @@ template WorkspaceData manage_workspace(const experimental::MemoryRequirements &mem_reqs, MemoryGroup &mgroup, ITensorPack &run_pack, - ITensorPack &prep_pack) + ITensorPack &prep_pack, + bool allocate_now = true) { WorkspaceData workspace_memory; for (const auto &req : mem_reqs) @@ -94,8 +95,11 @@ WorkspaceData manage_workspace(const experimental::MemoryRequirement for (auto &mem : workspace_memory) { - auto tensor = mem.tensor.get(); - tensor->allocator()->allocate(); + if (allocate_now || mem.lifetime == experimental::MemoryLifetime::Temporary) + { + auto tensor = mem.tensor.get(); + tensor->allocator()->allocate(); + } } return workspace_memory; @@ -117,6 +121,28 @@ void release_prepare_tensors(WorkspaceData &workspace, ITensorPack & workspace.end()); } +/** Allocate all tensors with Persistent or Prepare lifetime if not already allocated */ +template +void allocate_tensors(const experimental::MemoryRequirements &mem_reqs, WorkspaceData &workspace) +{ + for (auto &ws : workspace) + { + const int slot = ws.slot; + for (auto &m : mem_reqs) + { + if (m.slot == slot && m.lifetime != experimental::MemoryLifetime::Temporary) + { + auto tensor = ws.tensor.get(); + if (!tensor->allocator()->is_allocated()) + { + tensor->allocator()->allocate(); + } + break; + } + } + } +} + /** Utility function to release tensors with lifetime marked as Prepare */ template void release_temporaries(const experimental::MemoryRequirements &mem_reqs, WorkspaceData &workspace) @@ -136,4 +162,4 @@ void release_temporaries(const experimental::MemoryRequirements &mem_reqs, Works } } } // namespace arm_compute -#endif /* SRC_COMMON_MEMORY_HELPERS_H */ +#endif // ACL_SRC_CORE_HELPERS_MEMORYHELPERS_H diff --git 
a/src/core/helpers/PoolingHelpers.h b/src/core/helpers/PoolingHelpers.h index 9ef045f472..1bd570e4af 100644 --- a/src/core/helpers/PoolingHelpers.h +++ b/src/core/helpers/PoolingHelpers.h @@ -1,5 +1,5 @@ /* -* Copyright (c) 2022 Arm Limited. +* Copyright (c) 2022, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef SRC_CORE_HELPERS_POOLINGHELPERS_H -#define SRC_CORE_HELPERS_POOLINGHELPERS_H +#ifndef ACL_SRC_CORE_HELPERS_POOLINGHELPERS_H +#define ACL_SRC_CORE_HELPERS_POOLINGHELPERS_H #include "src/core/NEON/NEAsymm.h" @@ -122,6 +122,25 @@ inline int32x4_t vcvtq_q32_f32(float32x4_t values) return vcvtq_s32_f32(values); } +#ifdef __aarch64__ + +template +inline T vcvtnq_q32_f32(float32x4_t values); + +template <> +inline uint32x4_t vcvtnq_q32_f32(float32x4_t values) +{ + return vcvtnq_u32_f32(values); +} + +template <> +inline int32x4_t vcvtnq_q32_f32(float32x4_t values) +{ + return vcvtnq_s32_f32(values); +} + +#endif // __aarch64__ + template inline float32x4_t vcvtq_f32_q32(T values); @@ -216,4 +235,4 @@ inline int8x8_t vrequantize_pooling(int8x8_t &vec, const UniformQuantizationInfo } // namespace } // namespace cpu } // namespace arm_compute -#endif /* SRC_CORE_HELPERS_POOLINGHELPERS_H */ +#endif // ACL_SRC_CORE_HELPERS_POOLINGHELPERS_H diff --git a/src/cpu/kernels/CpuActivationKernel.cpp b/src/cpu/kernels/CpuActivationKernel.cpp index 555705bd45..c02691d5db 100644 --- a/src/cpu/kernels/CpuActivationKernel.cpp +++ b/src/cpu/kernels/CpuActivationKernel.cpp @@ -31,6 +31,7 @@ #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/cpu/kernels/activation/heuristics/CpuActivationKernelHeuristics.h" #include "src/cpu/kernels/activation/list.h" #include "src/cpu/kernels/logistic/list.h" @@ -45,87 +46,6 @@ namespace kernels namespace { -bool 
is_fp16_lut_supported(ActivationLayerInfo::ActivationFunction func) -{ - return func == ActivationLayerInfo::ActivationFunction::LOGISTIC || - func == ActivationLayerInfo::ActivationFunction::TANH; -} - -static const std::vector available_kernels = { -#ifdef ARM_COMPUTE_ENABLE_SVE - {"sve2_q8_activation_lut", - [](const ActivationDataTypeISASelectorData &data) - { - return (data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED) && - data.cpumodel == CPUModel::A510 && data.isa.sve2 && - data.f != ActivationLayerInfo::ActivationFunction::RELU; - }, - REGISTER_QASYMM8_SVE2(arm_compute::cpu::sve2_q8_activation_lut)}, -#endif // ARM_COMPUTE_ENABLE_SVE -#ifdef __aarch64__ - {// Neon LUT implementantion takes precedence - "neon_q8_activation_lut", - [](const ActivationDataTypeISASelectorData &data) - { - return (data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED) && - data.f != ActivationLayerInfo::ActivationFunction::RELU; - }, - REGISTER_Q8_NEON(arm_compute::cpu::neon_q8_activation_lut)}, -#endif // __aarch64__ - {"sme2_fp32_logistic", - [](const ActivationDataTypeISASelectorData &data) { - return data.dt == DataType::F32 && data.f == ActivationLayerInfo::ActivationFunction::LOGISTIC && - data.isa.sme2; - }, - REGISTER_FP32_SME2(arm_compute::cpu::sme2_fp32_logistic)}, - {"sve2_qu8_activation", - [](const ActivationDataTypeISASelectorData &data) { - return data.dt == DataType::QASYMM8 && data.isa.sve2 && - data.f != ActivationLayerInfo::ActivationFunction::GELU; - }, - REGISTER_QASYMM8_SVE2(arm_compute::cpu::sve2_qasymm8_activation)}, - {"sve2_qs8_activation", - [](const ActivationDataTypeISASelectorData &data) - { - return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve2 && - data.f != ActivationLayerInfo::ActivationFunction::GELU; - }, - REGISTER_QASYMM8_SIGNED_SVE2(arm_compute::cpu::sve2_qasymm8_signed_activation)}, - {"sve2_qs16_activation", - [](const ActivationDataTypeISASelectorData &data) { - return data.dt == DataType::QSYMM16 
&& data.isa.sve2 && - data.f != ActivationLayerInfo::ActivationFunction::GELU; - }, - REGISTER_QSYMM16_SVE2(arm_compute::cpu::sve2_qsymm16_activation)}, - {"sve_fp16_activation_lut", - [](const ActivationDataTypeISASelectorData &data) - { return data.dt == DataType::F16 && data.isa.fp16 && data.isa.sve && is_fp16_lut_supported(data.f); }, - REGISTER_FP16_SVE(arm_compute::cpu::sve_fp16_activation_lut)}, - {"sve_fp16_activation", - [](const ActivationDataTypeISASelectorData &data) - { - return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 && - data.f != ActivationLayerInfo::ActivationFunction::GELU; - }, - REGISTER_FP16_SVE(arm_compute::cpu::sve_fp16_activation)}, - {"sve_fp32_activation", - [](const ActivationDataTypeISASelectorData &data) - { return data.dt == DataType::F32 && data.isa.sve && data.f != ActivationLayerInfo::ActivationFunction::GELU; }, - REGISTER_FP32_SVE(arm_compute::cpu::sve_fp32_activation)}, - {"neon_fp16_activation", - [](const ActivationDataTypeISASelectorData &data) { return data.dt == DataType::F16 && data.isa.fp16; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_activation)}, - {"neon_fp32_activation", [](const ActivationDataTypeISASelectorData &data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_activation)}, - {"neon_qu8_activation", [](const ActivationDataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8; }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_activation)}, - {"neon_qs8_activation", - [](const ActivationDataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8_SIGNED; }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_activation)}, - {"neon_qs16_activation", [](const ActivationDataTypeISASelectorData &data) { return data.dt == DataType::QSYMM16; }, - REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qsymm16_activation)}, -}; - /* Supported activation in the 8-bit integer domain */ static const std::array 
qasymm8_activations = { ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, @@ -144,8 +64,8 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::QSYMM16, DataType::F16, DataType::F32); - const auto *uk = CpuActivationKernel::get_implementation(ActivationDataTypeISASelectorData{ - src->data_type(), CPUInfo::get().get_cpu_model(), CPUInfo::get().get_isa(), activation_info.activation()}); + heuristics::CpuActivationKernelHeuristics heuristics(src, dst, activation_info); + const auto *uk = heuristics.kernel(); ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); const DataType data_type = src->data_type(); @@ -193,19 +113,6 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const return Status{}; } -std::pair validate_and_configure_window(const ITensorInfo *src, ITensorInfo *dst) -{ - // Configure kernel window - Window win = calculate_max_window(*src, Steps()); - - if (dst != nullptr) - { - // dst auto inizialitation if not yet initialized - auto_init_if_empty(*dst, *src->clone()); - } - - return std::make_pair(Status{}, win); -} #ifdef __aarch64__ void init_lut(ActivationLayerInfo::ActivationFunction act_func, DataType data_type, @@ -281,20 +188,21 @@ void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, Ac { ARM_COMPUTE_UNUSED(dst); ARM_COMPUTE_ERROR_ON_NULLPTR(src); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, activation_info)); + ARM_COMPUTE_ERROR_THROW_ON(CpuActivationKernel::validate(src, dst, activation_info)); + + heuristics::CpuActivationKernelHeuristics heuristics(src, dst, activation_info); + _heuristics = std::move(heuristics); - const auto uk = CpuActivationKernel::get_implementation(ActivationDataTypeISASelectorData{ - src->data_type(), CPUInfo::get().get_cpu_model(), 
CPUInfo::get().get_isa(), activation_info.activation()}); if (dst != nullptr) { // dst auto inizialitation if not yet initialized auto_init_if_empty(*dst, *src->clone()); } + const auto *uk = heuristics.kernel(); ARM_COMPUTE_ERROR_ON_NULLPTR(uk); - _run_method = uk->ukernel; - _name = std::string("CpuActivationKernel").append("/").append(uk->name); + _name = std::string("CpuActivationKernel").append("/").append(uk->name); #ifdef __aarch64__ // Initialise lut_manager @@ -312,6 +220,7 @@ void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, Ac if (std::string(uk->name) == "sve_fp16_activation_lut") { + // Create info using init list. const LUTInfo info = {activation_info.activation(), activation_info.a(), activation_info.b(), src->data_type(), src->quantization_info().uniform()}; activation_info.setLookupTable65536((lut_manager.get_lut_table(info))); @@ -319,16 +228,7 @@ void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, Ac #endif // __aarch64__ _act_info = activation_info; - Window win; - - // Use squashed window - std::tie(win, _split_dimension) = calculate_squashed_or_max_window(*src); - // Collapse window with SME kernels in Y-Dim - if (std::string(uk->name) == "sme2_fp32_logistic") - { - win = win.collapse(win, Window::DimY); - } - ICPPKernel::configure(win); + ICPPKernel::configure(heuristics.window()); } Status @@ -336,8 +236,6 @@ CpuActivationKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, co { ARM_COMPUTE_UNUSED(act_info); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, act_info)); - ARM_COMPUTE_RETURN_ON_ERROR( - validate_and_configure_window(src->clone().get(), (dst != nullptr) ? 
dst->clone().get() : nullptr).first); return Status{}; } @@ -347,13 +245,7 @@ size_t CpuActivationKernel::get_mws(const CPUInfo &platform, size_t thread_count ARM_COMPUTE_UNUSED(thread_count); ARM_COMPUTE_UNUSED(platform); - if (_split_dimension == Window::DimX) - { - // Don't split the work load too small if the tensor has been reinterpreted as 1D. - // This number is loosely chosen as threading overhead in each platform varies wildly. - return 1536; - } - return default_mws; + return _heuristics.mws(); } void CpuActivationKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) @@ -369,23 +261,20 @@ void CpuActivationKernel::run_op(ITensorPack &tensors, const Window &window, con ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); ARM_COMPUTE_ERROR_ON(tensors.empty()); - ARM_COMPUTE_ERROR_ON(_run_method == nullptr); + + ActivationKernelPtr run_method = _heuristics.kernel()->ukernel; + ARM_COMPUTE_ERROR_ON(run_method == nullptr); const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC); ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); - _run_method(src, dst, _act_info, window); + run_method(src, dst, _act_info, window); } const char *CpuActivationKernel::name() const { return _name.c_str(); } - -const std::vector &CpuActivationKernel::get_available_kernels() -{ - return available_kernels; -} } // namespace kernels } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/CpuActivationKernel.h b/src/cpu/kernels/CpuActivationKernel.h index c1487499d6..946d539b17 100644 --- a/src/cpu/kernels/CpuActivationKernel.h +++ b/src/cpu/kernels/CpuActivationKernel.h @@ -29,6 +29,7 @@ #include "src/core/common/Macros.h" #include "src/core/helpers/LUTManager.h" #include "src/cpu/ICpuKernel.h" +#include "src/cpu/kernels/activation/heuristics/CpuActivationKernelHeuristics.h" namespace arm_compute { @@ -37,11 +38,10 @@ namespace cpu namespace kernels { /** Interface for the activation kernel */ -class 
CpuActivationKernel : public ICpuKernel +class CpuActivationKernel : public ICPPKernel { private: - using ActivationKernelPtr = - std::add_pointer::type; + using ActivationKernelPtr = heuristics::CpuActivationKernelHeuristics::KernelPtr; public: CpuActivationKernel() = default; @@ -83,23 +83,13 @@ class CpuActivationKernel : public ICpuKernel */ size_t get_split_dimension_hint() const { - return _split_dimension; + return _heuristics.scheduler_hint().split_dimension(); } - struct ActivationKernel - { - const char *name; - const ActivationDataTypeISASelectorDataPtr is_selected; - ActivationKernelPtr ukernel; - }; - - static const std::vector &get_available_kernels(); - private: - ActivationLayerInfo _act_info{}; - ActivationKernelPtr _run_method{nullptr}; - size_t _split_dimension{Window::DimY}; - std::string _name{}; + ActivationLayerInfo _act_info{}; + std::string _name{}; + heuristics::CpuActivationKernelHeuristics _heuristics{}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuAddKernel.cpp b/src/cpu/kernels/CpuAddKernel.cpp index a990aa4715..d86504054f 100644 --- a/src/cpu/kernels/CpuAddKernel.cpp +++ b/src/cpu/kernels/CpuAddKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2022, 2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -52,6 +52,12 @@ namespace kernels namespace { static const std::vector available_kernels = { + {"sme2_qs8_add_fixedpoint", + [](const CpuAddKernelDataTypeISASelectorData &data) { + return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sme2 && data.can_use_fixedpoint && + data.can_use_sme2_impl; + }, + REGISTER_QASYMM8_SIGNED_SME2(arm_compute::cpu::add_qasymm8_signed_sme2)}, {"neon_qu8_add_fixedpoint", [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8) && data.can_use_fixedpoint; }, @@ -134,8 +140,15 @@ validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITens } const auto can_use_fixedpoint = add_q8_neon_fixedpoint_possible(&src0, &src1, &dst); - const auto uk = CpuAddKernel::get_implementation( - CpuAddKernelDataTypeISASelectorData{src0.data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint}); +#ifdef ARM_COMPUTE_ENABLE_SME2 + const auto can_use_sme2_impl = add_q8_sme2_fixedpoint_possible(&src0, &src1, &dst); +#else /* ARM_COMPUTE_ENABLE_SME2 */ + const auto can_use_sme2_impl = false; +#endif /* ARM_COMPUTE_ENABLE_SME2 */ + const auto uk = + CpuAddKernel::get_implementation(CpuAddKernelDataTypeISASelectorData{ + src0.data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint, can_use_sme2_impl}); + ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); return Status{}; @@ -148,8 +161,14 @@ void CpuAddKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, I ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst, policy)); const auto can_use_fixedpoint = add_q8_neon_fixedpoint_possible(src0, src1, dst); - const auto uk = CpuAddKernel::get_implementation( - CpuAddKernelDataTypeISASelectorData{src0->data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint}); +#ifdef ARM_COMPUTE_ENABLE_SME2 + const auto can_use_sme2_impl = add_q8_sme2_fixedpoint_possible(src0, src1, dst); +#else /* ARM_COMPUTE_ENABLE_SME2 */ + const auto 
can_use_sme2_impl = false; +#endif /* ARM_COMPUTE_ENABLE_SME2 */ + const auto uk = + CpuAddKernel::get_implementation(CpuAddKernelDataTypeISASelectorData{ + src0->data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint, can_use_sme2_impl}); ARM_COMPUTE_ERROR_ON_NULLPTR(uk); @@ -191,7 +210,6 @@ void CpuAddKernel::run_op(ITensorPack &tensors, const Window &window, const Thre const ITensor *src0 = tensors.get_const_tensor(TensorType::ACL_SRC_0); const ITensor *src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1); ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); - _run_method(src0, src1, dst, _policy, window); } diff --git a/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp b/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp index 82e3a5ce00..9acdd9b6c2 100644 --- a/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp +++ b/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023 Arm Limited. + * Copyright (c) 2019-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -92,6 +92,7 @@ Status validate_arguments(const ITensorInfo *src, { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->quantization_info().scale().size()); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); } else { diff --git a/src/cpu/kernels/CpuKernelSelectionTypes.h b/src/cpu/kernels/CpuKernelSelectionTypes.h index 96ddad9d19..7e3a6fcc4b 100644 --- a/src/cpu/kernels/CpuKernelSelectionTypes.h +++ b/src/cpu/kernels/CpuKernelSelectionTypes.h @@ -90,6 +90,7 @@ struct CpuAddKernelDataTypeISASelectorData DataType dt; cpuinfo::CpuIsaInfo isa; bool can_use_fixedpoint; + bool can_use_sme2_impl; }; struct ScaleKernelDataTypeISASelectorData diff --git a/src/cpu/kernels/CpuMulKernel.cpp b/src/cpu/kernels/CpuMulKernel.cpp index d7a3a77d51..bba5cc105f 100644 --- a/src/cpu/kernels/CpuMulKernel.cpp +++ 
b/src/cpu/kernels/CpuMulKernel.cpp @@ -128,6 +128,16 @@ inline Status validate_arguments(const ITensorInfo *src1, "Scale value not supported (Should be 1/(2^n) or 1/255"); } + // Certain data types do not support x-dimension broadcasting + const bool broadcast_x = src1->tensor_shape().x() != src2->tensor_shape().x(); + if (broadcast_x) + { + const DataType dtype1 = src1->data_type(); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dtype1 == DataType::QSYMM16 || dtype1 == DataType::U8 || + dtype1 == DataType::S16, + "X-broadcasting is not supported in certain data type configurations."); + } + return Status{}; } diff --git a/src/cpu/kernels/CpuSoftmaxKernel.h b/src/cpu/kernels/CpuSoftmaxKernel.h index 676e79782b..becaa42835 100644 --- a/src/cpu/kernels/CpuSoftmaxKernel.h +++ b/src/cpu/kernels/CpuSoftmaxKernel.h @@ -37,8 +37,8 @@ namespace kernels class CpuSoftmaxKernel : public ICpuKernel { private: - using SoftmaxKernelPtr = std::add_pointer::type; + using SoftmaxKernelPtr = + std::add_pointer::type; public: CpuSoftmaxKernel() = default; diff --git a/src/cpu/kernels/CpuSubKernel.cpp b/src/cpu/kernels/CpuSubKernel.cpp index c8706ff651..0604691b81 100644 --- a/src/cpu/kernels/CpuSubKernel.cpp +++ b/src/cpu/kernels/CpuSubKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023 Arm Limited. + * Copyright (c) 2021-2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -92,9 +92,12 @@ validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITens DataType::S32, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &src1); - const auto can_use_fixedpoint = sub_q8_neon_fixedpoint_possible(&src0, &src1, &dst); - const auto uk = CpuSubKernel::get_implementation( - CpuSubKernelDataTypeISASelectorData{src0.data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint}); + const auto can_use_fixedpoint = sub_q8_neon_fixedpoint_possible(&src0, &src1, &dst); + const auto can_use_sme2_add_impl = false; + + const auto uk = + CpuSubKernel::get_implementation(CpuSubKernelDataTypeISASelectorData{ + src0.data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint, can_use_sme2_add_impl}); ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); @@ -126,9 +129,11 @@ void CpuSubKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, I set_shape_if_empty(*dst, out_shape); set_data_type_if_unknown(*dst, src0->data_type()); - const auto can_use_fixedpoint = sub_q8_neon_fixedpoint_possible(src0, src1, dst); - const auto uk = CpuSubKernel::get_implementation( - CpuSubKernelDataTypeISASelectorData{src0->data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint}); + const auto can_use_fixedpoint = sub_q8_neon_fixedpoint_possible(src0, src1, dst); + const auto can_use_sme2_add_impl = false; + const auto uk = + CpuSubKernel::get_implementation(CpuSubKernelDataTypeISASelectorData{ + src0->data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint, can_use_sme2_add_impl}); ARM_COMPUTE_ERROR_ON_NULLPTR(uk); diff --git a/src/cpu/kernels/activation/heuristics/CpuActivationKernelHeuristics.cpp b/src/cpu/kernels/activation/heuristics/CpuActivationKernelHeuristics.cpp new file mode 100644 index 0000000000..76aa759dd1 --- /dev/null +++ b/src/cpu/kernels/activation/heuristics/CpuActivationKernelHeuristics.cpp @@ -0,0 +1,251 @@ +/* + * Copyright (c) 2017-2024 
Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/cpu/kernels/activation/heuristics/CpuActivationKernelHeuristics.h" + +#include "src/core/common/Registrars.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/cpu/kernels/activation/list.h" +#include "src/cpu/kernels/logistic/list.h" + +#include <map> +#include <vector> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace heuristics +{ +namespace +{ + +bool is_fp16_lut_supported(ActivationLayerInfo::ActivationFunction func) +{ + return func == ActivationLayerInfo::ActivationFunction::LOGISTIC || + func == ActivationLayerInfo::ActivationFunction::TANH; +} + +using KernelList = std::vector<CpuActivationKernelHeuristics::ActivationKernel>; +using KernelMap = std::map<DataType, KernelList>; + +static const KernelList fp32_kernels = { + {"sme2_fp32_logistic", + [](const ActivationDataTypeISASelectorData &data) + { return data.f == ActivationLayerInfo::ActivationFunction::LOGISTIC && data.isa.sme2; }, + REGISTER_FP32_SME2(arm_compute::cpu::sme2_fp32_logistic)}, + {"sve_fp32_activation", + [](const ActivationDataTypeISASelectorData &data) + { return data.isa.sve && data.f != ActivationLayerInfo::ActivationFunction::GELU; }, + REGISTER_FP32_SVE(arm_compute::cpu::sve_fp32_activation)}, + {"neon_fp32_activation", + [](const ActivationDataTypeISASelectorData &data) + { + ARM_COMPUTE_UNUSED(data); + return true; + }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_activation)}, +}; + +static const KernelList fp16_kernels = { + {"sve_fp16_activation_lut", + [](const ActivationDataTypeISASelectorData &data) + { return data.isa.fp16 && data.isa.sve && is_fp16_lut_supported(data.f); }, + REGISTER_FP16_SVE(arm_compute::cpu::sve_fp16_activation_lut)}, + {"sve_fp16_activation", + [](const ActivationDataTypeISASelectorData &data) + { return data.isa.sve && data.isa.fp16 && data.f != ActivationLayerInfo::ActivationFunction::GELU; }, + REGISTER_FP16_SVE(arm_compute::cpu::sve_fp16_activation)}, + {"neon_fp16_activation", [](const ActivationDataTypeISASelectorData &data) { return data.isa.fp16; }, + 
REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_activation)}, +}; + +static const KernelList qasymm8_kernels = { + {"sve2_q8_activation_lut", + [](const ActivationDataTypeISASelectorData &data) { + return data.cpumodel == CPUModel::A510 && data.isa.sve2 && + data.f != ActivationLayerInfo::ActivationFunction::RELU; + }, + REGISTER_QASYMM8_SVE2(arm_compute::cpu::sve2_q8_activation_lut)}, +#ifdef __aarch64__ + {// Neon LUT implementation takes precedence + "neon_q8_activation_lut", + [](const ActivationDataTypeISASelectorData &data) + { return data.f != ActivationLayerInfo::ActivationFunction::RELU; }, + REGISTER_Q8_NEON(arm_compute::cpu::neon_q8_activation_lut)}, +#endif // __aarch64__ + {"sve2_qu8_activation", + [](const ActivationDataTypeISASelectorData &data) + { return data.isa.sve2 && data.f != ActivationLayerInfo::ActivationFunction::GELU; }, + REGISTER_QASYMM8_SVE2(arm_compute::cpu::sve2_qasymm8_activation)}, + {"neon_qu8_activation", + [](const ActivationDataTypeISASelectorData &data) + { + ARM_COMPUTE_UNUSED(data); + return true; + }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_activation)}, +}; + +static const KernelList qasymm8_signed_kernels = { + {"sve2_q8_activation_lut", + [](const ActivationDataTypeISASelectorData &data) { + return data.cpumodel == CPUModel::A510 && data.isa.sve2 && + data.f != ActivationLayerInfo::ActivationFunction::RELU; + }, + REGISTER_QASYMM8_SIGNED_SVE2(arm_compute::cpu::sve2_q8_activation_lut)}, +#ifdef __aarch64__ + {// Neon LUT implementation takes precedence + "neon_q8_activation_lut", + [](const ActivationDataTypeISASelectorData &data) + { return data.f != ActivationLayerInfo::ActivationFunction::RELU; }, + REGISTER_Q8_NEON(arm_compute::cpu::neon_q8_activation_lut)}, +#endif // __aarch64__ + {"sve2_qs8_activation", + [](const ActivationDataTypeISASelectorData &data) + { return data.isa.sve2 && data.f != ActivationLayerInfo::ActivationFunction::GELU; }, + 
REGISTER_QASYMM8_SIGNED_SVE2(arm_compute::cpu::sve2_qasymm8_signed_activation)}, + {"neon_qs8_activation", + [](const ActivationDataTypeISASelectorData &data) + { + ARM_COMPUTE_UNUSED(data); + return true; + }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_activation)}, +}; + +static const KernelList qsymm16_kernels = { + {"sve2_qs16_activation", + [](const ActivationDataTypeISASelectorData &data) + { return data.isa.sve2 && data.f != ActivationLayerInfo::ActivationFunction::GELU; }, + REGISTER_QSYMM16_SVE2(arm_compute::cpu::sve2_qsymm16_activation)}, + {"neon_qs16_activation", + [](const ActivationDataTypeISASelectorData &data) + { + ARM_COMPUTE_UNUSED(data); + return true; + }, + REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qsymm16_activation)}, +}; + +static const KernelMap kernels = {{DataType::F32, fp32_kernels}, + {DataType::F16, fp16_kernels}, + {DataType::QASYMM8, qasymm8_kernels}, + {DataType::QASYMM8_SIGNED, qasymm8_signed_kernels}, + {DataType::QSYMM16, qsymm16_kernels}}; + +} // namespace + +void CpuActivationKernelHeuristics::choose_kernel(ActivationDataTypeISASelectorData &selector) +{ + const auto &klist = kernels.find(selector.dt); + if (klist == kernels.end()) + { + return; + } + + for (const auto &uk : klist->second) + { + if (uk.is_selected(selector) && uk.ukernel != nullptr) + { + _kernel = &uk; + return; + } + } +} + +CpuActivationKernelHeuristics::CpuActivationKernelHeuristics(const ITensorInfo *src, + const ITensorInfo *dst, + const ActivationLayerInfo &activation_info) +{ + ARM_COMPUTE_UNUSED(dst); + + // Set kernel + const DataType dtype = src->data_type(); + ActivationDataTypeISASelectorData selector{dtype, CPUInfo::get().get_cpu_model(), CPUInfo::get().get_isa(), + activation_info.activation()}; + choose_kernel(selector); + + // Set window and scheduling hint + int split_dim; + std::tie(_window, split_dim) = calculate_squashed_or_max_window(*src); + + // Collapse window with SME kernels in Y-Dim + if 
(std::string(_kernel->name) == "sme2_fp32_logistic") + { + _window = _window.collapse(_window, Window::DimY); + } + + _hint = IScheduler::Hints(split_dim); + + // Set minimum workload size + if (split_dim == Window::DimX) + { + // Don't split the work load too small if the tensor has been reinterpreted as 1D. + // This number is loosely chosen as threading overhead in each platform varies wildly. + _mws = 1536; + } +} + +/** Return minimum workload size + * + * @return Minimum workload size for requested configuration. + */ +size_t CpuActivationKernelHeuristics::mws() const +{ + return _mws; +} + +/** Return kernel's execution window + * + * @return The execution window + */ +const Window &CpuActivationKernelHeuristics::window() const +{ + return _window; +} + +/** Return the kernel to run + * + * @return The function pointer to the chosen kernel + */ +const CpuActivationKernelHeuristics::ActivationKernel *CpuActivationKernelHeuristics::kernel() +{ + return _kernel; +} + +/** Return the scheduling hint e.g. dimension(s) to split + * + * @return an instance of @ref IScheduler::Hints to describe the scheduling hints + */ +const IScheduler::Hints &CpuActivationKernelHeuristics::scheduler_hint() const +{ + return _hint; +} +} // namespace heuristics +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/activation/heuristics/CpuActivationKernelHeuristics.h b/src/cpu/kernels/activation/heuristics/CpuActivationKernelHeuristics.h new file mode 100644 index 0000000000..1e08680ee7 --- /dev/null +++ b/src/cpu/kernels/activation/heuristics/CpuActivationKernelHeuristics.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2017-2024 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef ACL_SRC_CPU_KERNELS_ACTIVATION_HEURISTICS_CPUACTIVATIONKERNELHEURISTICS_H +#define ACL_SRC_CPU_KERNELS_ACTIVATION_HEURISTICS_CPUACTIVATIONKERNELHEURISTICS_H + +#include "arm_compute/core/CPP/ICPPKernel.h" +#include "arm_compute/core/ITensorInfo.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/function_info/ActivationLayerInfo.h" +#include "arm_compute/runtime/IScheduler.h" + +#include "src/core/common/Macros.h" +#include "src/cpu/kernels/CpuKernelSelectionTypes.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace heuristics +{ + +class CpuActivationKernelHeuristics +{ +public: + using KernelPtr = + std::add_pointer::type; + + struct ActivationKernel + { + const char *name{nullptr}; + const ActivationDataTypeISASelectorDataPtr is_selected{nullptr}; + KernelPtr ukernel{nullptr}; + }; + + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuActivationKernelHeuristics); + + // Default constructor and destructor + CpuActivationKernelHeuristics() noexcept {}; + ~CpuActivationKernelHeuristics() = default; + + /** Similar to @ref CpuActivationKernel::configure() */ + CpuActivationKernelHeuristics(const ITensorInfo *src, + const ITensorInfo *dst, + const ActivationLayerInfo &activation_info); + + /** Return minimum workload size + * + * @return Minimum workload size for requested configuration in size_t + */ + size_t mws() const; + + /** Return kernel's execution window + * + * @return a reference to the kernel execution window of type @ref Window + */ + const Window &window() const; + + /** Return the kernel to run + * + * @return The function pointer to the chosen kernel + */ + const ActivationKernel *kernel(); + + /** Return the scheduling hint e.g. 
dimension(s) to split + * + * @return an instance of @ref IScheduler::Hints to describe the scheduling hints + */ + const IScheduler::Hints &scheduler_hint() const; + +private: + /** Chooses a kernel to run and saves it into _kernel data member + * + * @param[in] selector Selector object based on input and device configuration + */ + void choose_kernel(ActivationDataTypeISASelectorData &selector); + +private: + size_t _mws{ICPPKernel::default_mws}; + Window _window{}; + const ActivationKernel *_kernel{nullptr}; + IScheduler::Hints _hint{Window::DimY}; +}; + +} // namespace heuristics +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_KERNELS_ACTIVATION_HEURISTICS_CPUACTIVATIONKERNELHEURISTICS_H diff --git a/src/core/CL/CLCommandBuffer.cpp b/src/cpu/kernels/add/generic/sme2/impl.cpp similarity index 55% rename from src/core/CL/CLCommandBuffer.cpp rename to src/cpu/kernels/add/generic/sme2/impl.cpp index d094dcdaea..acc00e490c 100644 --- a/src/core/CL/CLCommandBuffer.cpp +++ b/src/cpu/kernels/add/generic/sme2/impl.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023 Arm Limited. + * Copyright (c) 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -22,45 +22,33 @@ * SOFTWARE. 
*/ -#include "src/core/CL/CLCommandBuffer.h" +#include "src/cpu/kernels/add/generic/sme2/impl.h" -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" - -#include "src/core/CL/CLCompatCommandBuffer.h" -#include "src/core/CL/CLMutableCommandBuffer.h" +#include "arm_compute/core/Helpers.h" namespace arm_compute { - -std::unique_ptr CLCommandBuffer::create(cl_command_queue queue) +namespace cpu { - const auto &cl_device = CLKernelLibrary::get().get_device(); - const auto has_mutable_dispatch = command_buffer_mutable_dispatch_supported(cl_device); - - if (has_mutable_dispatch) - { - return std::make_unique(queue); - } - else - { - return std::make_unique(queue); - } -} - -CLCommandBuffer::CLCommandBuffer() = default; -CLCommandBuffer::~CLCommandBuffer() = default; -CLCommandBuffer::State CLCommandBuffer::state() const +bool add_q8_sme2_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) { - return _state; + return add_sub_q8_sme2_fixedpoint_possible(src0, src1, dst); } -CLCommandBuffer &CLCommandBuffer::state(CLCommandBuffer::State state) +bool add_sub_q8_sme2_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) { - _state = state; - - return *this; + const auto &in0_shape = src0->tensor_shape(); + const auto &in1_shape = src1->tensor_shape(); + const unsigned int dst_dims = dst->num_dimensions(); + // Does not support broadcasting on x + // Does not support dims > 4D output, unless input shapes are identical (therefore collapsible) + if (in0_shape.x() == in1_shape.x() && (in0_shape == in1_shape || dst_dims <= 4)) + { + return true; + } + return false; } +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/add/generic/sme2/impl.h b/src/cpu/kernels/add/generic/sme2/impl.h new file mode 100644 index 0000000000..906b4f360c --- /dev/null +++ b/src/cpu/kernels/add/generic/sme2/impl.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2024 Arm 
Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef ACL_SRC_CPU_KERNELS_ADD_GENERIC_SME2_IMPL_H +#define ACL_SRC_CPU_KERNELS_ADD_GENERIC_SME2_IMPL_H + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" // Needed for ConvertPolicy +#include "arm_compute/core/Window.h" + +namespace arm_compute +{ +namespace cpu +{ + +void add_qasymm8_signed_sme2( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); + +bool add_q8_sme2_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); + +bool add_sub_q8_sme2_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); + +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_KERNELS_ADD_GENERIC_SME2_IMPL_H diff --git a/src/cpu/kernels/add/generic/sme2/qasymm8_signed.cpp b/src/cpu/kernels/add/generic/sme2/qasymm8_signed.cpp new file mode 100644 index 0000000000..4cf369b688 --- /dev/null +++ b/src/cpu/kernels/add/generic/sme2/qasymm8_signed.cpp @@ -0,0 +1,403 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifdef ARM_COMPUTE_ENABLE_SME2 + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Window.h" + +namespace arm_compute +{ +namespace cpu +{ + +// Add SME kernel +void sme2_q8_signed_add_kernel( // + const int8_t *src0, + const int8_t *src1, + int8_t *dst, + const float scale_0, + const float scale_1, + const float offset, + const uintptr_t win_shape[4], + const uintptr_t src_strides[4], + const uintptr_t wei_strides[4], + const uintptr_t dst_strides[4]) +{ + struct Args + { + uintptr_t shape1; + uintptr_t shape2; + uintptr_t shape3; + const int8_t *src_0; + const int8_t *src_1; + int8_t *dst; + int32_t scale_0_5p11; + int32_t scale_1_5p11; + int32_t offset_21p11; + } args; + + // Constant used to express values in the 21p11 and 5p11 fixed point format + constexpr float _2pow11 = 2048; + + args.shape1 = win_shape[1]; + args.shape2 = win_shape[2]; + args.shape3 = win_shape[3]; + args.src_0 = src0; + args.src_1 = src1; + args.dst = dst; + args.scale_0_5p11 = static_cast(static_cast(support::cpp11::lround(scale_0 * _2pow11))); + args.scale_1_5p11 = static_cast(static_cast(support::cpp11::lround(scale_1 * _2pow11))); + args.offset_21p11 = static_cast(support::cpp11::lround(offset * _2pow11)); + + // Precondition: + assert(src_strides[0] == sizeof(int8_t)); + assert(wei_strides[0] == sizeof(int8_t)); + assert(dst_strides[0] == sizeof(int8_t)); + __asm__ volatile( + R"( + .inst 0xd503477f // smstart + .inst 0x25207811 // ptrue pn9.b + ptrue p0.b + + // ================================================== + // 3D loop opening + // ================================================== + + // ---------------------------------------------------------------- x8: body_length = (length / 
vl) * vl + cntb x8, ALL, MUL #2 // x8 is vl (of 8 bit values) + udiv x9, %x[length], x8 // length/vl + mul x8, x8, x9 // x8 = vl * result + + ldr x10, [%[args_ptr], %[offset_shape_3]] + ldr x11, [%[args_ptr], %[offset_src_ptr]] + ldr x12, [%[args_ptr], %[offset_wei_ptr]] + ldr x13, [%[args_ptr], %[offset_dst_ptr]] + + // Could potentially be replaced with explicit loads. + ld1rw {z1.s}, p0/z, [%[args_ptr], %[scale_0_offset]] + ld1rw {z2.s}, p0/z, [%[args_ptr], %[scale_1_offset]] + ld1rw {z3.s}, p0/z, [%[args_ptr], %[offset_offset]] + +loop_3_start%=: + // for index_3 in shape_3 downto 1 + cmp x10, #0 + b.eq loop_3_end%= + sub x10, x10, #1 + + ldr x14, [%[args_ptr], %[offset_shape_2]] + mov x15, x11 + mov x16, x12 + mov x17, x13 + +loop_2_start%=: + // for index_2 in shape_2 downto 1 + cmp x14, #0 + b.eq loop_2_end%= + sub x14, x14, #1 + + ldr x7, [%[args_ptr], %[offset_shape_1]] + mov x20, x15 + mov x21, x16 + mov x22, x17 + +loop_1_start%=: + // for index_1 in shape_1 downto 1 + cmp x7, #0 + b.eq loop_1_end%= + sub x7, x7, #1 + + mov x9, #0 // x9: index/count + +inner_loop_body_start%=: + cmp x9, x8 + b.eq inner_loop_body_end%= + + /* + Two - instead of the maximal four - registers of each input are processed per loop iteration + due to the need for at least 32 registers just for the data processing which leaves no space + for the registers that contain the pre-loop loaded constants. + Once the would be 4 registers are expanded into 16 as the data goes from 8 to 32-bit, the + same number of registers (another 16) is needed to accumulate onto the offset constant for + each of those 16 lanes. One advantage of only processing two registers per loop is that more + of the elements to be processed will be in this vectorised loop instead of the left-over one. 
+ */ + + // Load src0 + .inst 0xa0090684 // ld1b {z4.b-z5.b}, pn9/z, [x20, x9] + + // Widen src0 to 16 bits + .inst 0xc175e08c // sunpk {z12.h-z15.h}, {z4.b-z5.b} + + // Widen src0 to 32-bits + .inst 0xc1b5e184 // sunpk {z4.s-z7.s}, {z12.h-z13.h} + .inst 0xc1b5e1c8 // sunpk {z8.s-z11.s}, {z14.h-z15.h} + + // Duplicate the offset value into registers for all the values to be processed + mov z16.d, z3.d + mov z17.d, z3.d + mov z18.d, z3.d + mov z19.d, z3.d + mov z20.d, z3.d + mov z21.d, z3.d + mov z22.d, z3.d + mov z23.d, z3.d + + // MLA Fixed Point multiplication and accumulation integer + // Multiply src0 by scale_0 (z1) and add offset + mla z16.s, p0/m, z4.s, z1.s + mla z17.s, p0/m, z5.s, z1.s + mla z18.s, p0/m, z6.s, z1.s + mla z19.s, p0/m, z7.s, z1.s + mla z20.s, p0/m, z8.s, z1.s + mla z21.s, p0/m, z9.s, z1.s + mla z22.s, p0/m, z10.s, z1.s + mla z23.s, p0/m, z11.s, z1.s + + //Load src1 into the same registers that were used for src0 since they are no longer needed + .inst 0xa00906a4 // ld1b {z4.b-z5.b}, pn9/z, [x21, x9] + + // Widen src1 to 16 bits + .inst 0xc175e08c // sunpk {z12.h-z15.h}, {z4.b-z5.b} + + // Widen src1 32-bits + .inst 0xc1b5e184 // sunpk {z4.s-z7.s}, {z12.h-z13.h} + .inst 0xc1b5e1c8 // sunpk {z8.s-z11.s}, {z14.h-z15.h} + + // MLA Fixed Point multiplication and accumulation integer + // Multiply src1 by scale_1 (z2) and accumulate into registers containing src0*scale_0 + offset + mla z16.s, p0/m, z4.s, z2.s + mla z17.s, p0/m, z5.s, z2.s + mla z18.s, p0/m, z6.s, z2.s + mla z19.s, p0/m, z7.s, z2.s + mla z20.s, p0/m, z8.s, z2.s + mla z21.s, p0/m, z9.s, z2.s + mla z22.s, p0/m, z10.s, z2.s + mla z23.s, p0/m, z11.s, z2.s + + // Int32 to Int8 saturate + .inst 0xc175da85 // sqrshr z5.b, {z20.s-z23.s}, #11 + .inst 0xc175da04 // sqrshr z4.b, {z16.s-z19.s}, #11 + // Store + .inst 0xa02906c4 // st1b {z4.b-z5.b}, pn9, [x22, x9] + + incb x9, ALL, MUL #2 + b inner_loop_body_start%= +inner_loop_body_end%=: + +inner_loop_leftover_start%=: + whilelo p1.b, x9, 
%x[length] // While x9info(); + const auto *src1_info = src1->info(); + const auto *dst_info = dst->info(); + + const UniformQuantizationInfo src0_q_info = src0_info->quantization_info().uniform(); + const UniformQuantizationInfo src1_q_info = src1_info->quantization_info().uniform(); + const UniformQuantizationInfo dst_q_info = dst_info->quantization_info().uniform(); + + const auto &src0_strides_bytes = src0_info->strides_in_bytes(); + const auto &src1_strides_bytes = src1_info->strides_in_bytes(); + const auto &dst_strides_bytes = dst_info->strides_in_bytes(); + + // NOTE: This kernel does not support shapes above 4D (Unless execution window has been collapsed) + assert(window.num_iterations(4) == 1 && window.num_iterations(5) == 1); + + // Note : The window is expected to handle broadcasting in higher axis than x by setting relevant strides to 0. + const uintptr_t shape[] = { + window.num_iterations(0), + window.num_iterations(1), + window.num_iterations(2), + window.num_iterations(3), + }; + + Window input0_win = window.broadcast_if_dimension_le_one(src0_info->tensor_shape()); + Window input1_win = window.broadcast_if_dimension_le_one(src1_info->tensor_shape()); + + // First dim is always datasize. If broadcasting in other dims, set stride to 0. + uintptr_t src0_strides[] = {src0_strides_bytes[0], (input0_win.is_broadcasted(1)) ? 0 : src0_strides_bytes[1], + (input0_win.is_broadcasted(2)) ? 0 : src0_strides_bytes[2], + (input0_win.is_broadcasted(3)) ? 0 : src0_strides_bytes[3]}; + uintptr_t src1_strides[] = {src1_strides_bytes[0], (input1_win.is_broadcasted(1)) ? 0 : src1_strides_bytes[1], + (input1_win.is_broadcasted(2)) ? 0 : src1_strides_bytes[2], + (input1_win.is_broadcasted(3)) ? 
0 : src1_strides_bytes[3]}; + + const uintptr_t dst_strides[] = { + dst_strides_bytes[0], + dst_strides_bytes[1], + dst_strides_bytes[2], + dst_strides_bytes[3], + }; + + const uintptr_t src0_offset = window[0].start() * src0_strides[0] + window[1].start() * src0_strides[1] + + window[2].start() * src0_strides[2] + window[3].start() * src0_strides[3] + + src0->info()->offset_first_element_in_bytes(); + const uintptr_t src1_offset = window[0].start() * src1_strides[0] + window[1].start() * src1_strides[1] + + window[2].start() * src1_strides[2] + window[3].start() * src1_strides[3] + + src1->info()->offset_first_element_in_bytes(); + const uintptr_t dst_offset = window[0].start() * dst_strides[0] + window[1].start() * dst_strides[1] + + window[2].start() * dst_strides[2] + window[3].start() * dst_strides[3] + + dst->info()->offset_first_element_in_bytes(); + + const auto *src0_ptr = reinterpret_cast(src0->buffer() + src0_offset); + const auto *src1_ptr = reinterpret_cast(src1->buffer() + src1_offset); + auto *dst_ptr = reinterpret_cast(dst->buffer() + dst_offset); + + // Calculate or retrieve necessary offsets/scale values. + const int32_t offset_a = src0_q_info.offset; + const int32_t offset_b = src1_q_info.offset; + const float scale0 = src0_q_info.scale / dst_q_info.scale; + const float scale1 = src1_q_info.scale / dst_q_info.scale; + const float offset = static_cast(dst_q_info.offset) - static_cast(offset_a) * scale0 - + static_cast(offset_b) * scale1; + + sme2_q8_signed_add_kernel(src0_ptr, src1_ptr, dst_ptr, scale0, scale1, offset, shape, src0_strides, src1_strides, + dst_strides); +} + +} // namespace cpu +} // namespace arm_compute + +#endif // ARM_COMPUTE_ENABLE_SME2 diff --git a/src/cpu/kernels/add/list.h b/src/cpu/kernels/add/list.h index 1040c39a41..7a5dc5a176 100644 --- a/src/cpu/kernels/add/list.h +++ b/src/cpu/kernels/add/list.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022 Arm Limited. + * Copyright (c) 2020-2022, 2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -21,10 +21,11 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef SRC_CORE_KERNELS_ADD_LIST_H -#define SRC_CORE_KERNELS_ADD_LIST_H +#ifndef ACL_SRC_CPU_KERNELS_ADD_LIST_H +#define ACL_SRC_CPU_KERNELS_ADD_LIST_H #include "src/cpu/kernels/add/generic/neon/impl.h" +#include "src/cpu/kernels/add/generic/sme2/impl.h" #include "src/cpu/kernels/add/generic/sve/impl.h" namespace arm_compute @@ -51,9 +52,10 @@ DECLARE_ADD_KERNEL(add_s32_sve); DECLARE_ADD_KERNEL(add_qasymm8_sve2); DECLARE_ADD_KERNEL(add_qasymm8_signed_sve2); DECLARE_ADD_KERNEL(add_qsymm16_sve2); +DECLARE_ADD_KERNEL(add_qasymm8_signed_sme2); #undef DECLARE_ADD_KERNEL } // namespace cpu } // namespace arm_compute -#endif // SRC_CORE_KERNELS_ADD_LIST_H +#endif // ACL_SRC_CPU_KERNELS_ADD_LIST_H diff --git a/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp index 296fe88791..e94c92b5ce 100644 --- a/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp +++ b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023 Arm Limited. + * Copyright (c) 2021-2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -281,6 +281,7 @@ Status CpuDepthwiseConv2dAssemblyWrapperKernel::validate(const ITensorInfo * if (is_data_type_quantized_per_channel(weights->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->quantization_info().scale().size()); } else diff --git a/src/cpu/kernels/pool2d/neon/quantized.h b/src/cpu/kernels/pool2d/neon/quantized.h index 38f1b2f1f9..8dd43ad4b1 100644 --- a/src/cpu/kernels/pool2d/neon/quantized.h +++ b/src/cpu/kernels/pool2d/neon/quantized.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2022, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef SRC_CORE_NEON_KERNELS_QUANTIZED_H -#define SRC_CORE_NEON_KERNELS_QUANTIZED_H +#ifndef ACL_SRC_CPU_KERNELS_POOL2D_NEON_QUANTIZED_H +#define ACL_SRC_CPU_KERNELS_POOL2D_NEON_QUANTIZED_H #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/Traits.h" @@ -80,13 +80,10 @@ void poolingMxN_q8_neon_nhwc(const ITensor *src, const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right); const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 
0 : pool_pad_bottom); - const float32x4_t half_scale_v = vdupq_n_f32(0.5f); - const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo dst_qinfo = dst0->info()->quantization_info().uniform(); + const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo dst_qinfo = dst0->info()->quantization_info().uniform(); - const float quant_rescale = dst_qinfo.scale / src_qinfo.scale; - // "new_offset" doesn't have to consider the "half_scale_v" in its computation - // With a requantization performed in a single step there won't be uncertainties introduced + const float quant_rescale = dst_qinfo.scale / src_qinfo.scale; const int32_t new_offset = dst_qinfo.offset - static_cast(static_cast(src_qinfo.offset) / quant_rescale); @@ -163,11 +160,18 @@ void poolingMxN_q8_neon_nhwc(const ITensor *src, else { const float32x4_t scale_v = vdupq_n_f32(scale); - // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero - vres1 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v)); - vres2 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v)); - vres3 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v)); - vres4 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v)); + +#ifdef __aarch64__ + vres1 = vcvtnq_q32_f32(wrapper::vmul(vcvtq_f32_q32(vres1), scale_v)); + vres2 = vcvtnq_q32_f32(wrapper::vmul(vcvtq_f32_q32(vres2), scale_v)); + vres3 = vcvtnq_q32_f32(wrapper::vmul(vcvtq_f32_q32(vres3), scale_v)); + vres4 = vcvtnq_q32_f32(wrapper::vmul(vcvtq_f32_q32(vres4), scale_v)); +#else // __aarch64__ + vres1 = vcvtq_q32_f32(wrapper::vmul(vcvtq_f32_q32(vres1), scale_v)); + vres2 = vcvtq_q32_f32(wrapper::vmul(vcvtq_f32_q32(vres2), scale_v)); + vres3 = vcvtq_q32_f32(wrapper::vmul(vcvtq_f32_q32(vres3), scale_v)); + vres4 = vcvtq_q32_f32(wrapper::vmul(vcvtq_f32_q32(vres4), 
scale_v)); +#endif // __aarch64__ const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2))); @@ -268,8 +272,11 @@ void poolingMxN_q8_neon_nhwc(const ITensor *src, } else { - // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero - res = static_cast(0.5f + static_cast(res) * scale); +#ifdef __aarch64__ + res = arm_compute::round(static_cast(res) * scale, RoundingPolicy::TO_NEAREST_EVEN); +#else // __aarch64__ + res = arm_compute::round(static_cast(res) * scale, RoundingPolicy::TO_ZERO); +#endif // __aarch64__ // Store result *(reinterpret_cast(out.ptr()) + x_off) = res; @@ -829,4 +836,4 @@ void poolingMxN_quantized_neon_nchw(const ITensor *src, } // namespace cpu } // namespace arm_compute -#endif // SRC_CORE_NEON_KERNELS_QUANTIZED_H +#endif // ACL_SRC_CPU_KERNELS_POOL2D_NEON_QUANTIZED_H diff --git a/src/cpu/kernels/pool3d/neon/quantized.h b/src/cpu/kernels/pool3d/neon/quantized.h index 8819907901..fe7e1bc965 100644 --- a/src/cpu/kernels/pool3d/neon/quantized.h +++ b/src/cpu/kernels/pool3d/neon/quantized.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef SRC_CORE_NEON_KERNELS_POOL3D_QUANTIZED_H -#define SRC_CORE_NEON_KERNELS_POOL3D_QUANTIZED_H +#ifndef ACL_SRC_CPU_KERNELS_POOL3D_NEON_QUANTIZED_H +#define ACL_SRC_CPU_KERNELS_POOL3D_NEON_QUANTIZED_H #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" @@ -83,13 +83,10 @@ void avg_poolingMxNxD_q8_neon_ndhwc( Iterator out(dst0, window_out); - const float32x4_t half_scale_v = vdupq_n_f32(0.5f); - const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo dst_qinfo = dst0->info()->quantization_info().uniform(); + const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo dst_qinfo = dst0->info()->quantization_info().uniform(); - const float quant_rescale = dst_qinfo.scale / src_qinfo.scale; - // "new_offset" doesn't have to consider the "half_scale_v" in its computation - // With a requantization performed in a single step there won't be uncertainties introduced + const float quant_rescale = dst_qinfo.scale / src_qinfo.scale; const int32_t new_offset = dst_qinfo.offset - static_cast(static_cast(src_qinfo.offset) / quant_rescale); @@ -171,12 +168,18 @@ void avg_poolingMxNxD_q8_neon_ndhwc( else { const float32x4_t scale_v = vdupq_n_f32(scale); - // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero - vres1 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v)); - vres2 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v)); - vres3 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v)); - vres4 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v)); +#ifdef __aarch64__ + vres1 = vcvtnq_q32_f32(wrapper::vmul(vcvtq_f32_q32(vres1), scale_v)); + vres2 = vcvtnq_q32_f32(wrapper::vmul(vcvtq_f32_q32(vres2), scale_v)); + vres3 = vcvtnq_q32_f32(wrapper::vmul(vcvtq_f32_q32(vres3), scale_v)); + vres4 = 
vcvtnq_q32_f32(wrapper::vmul(vcvtq_f32_q32(vres4), scale_v)); +#else // __aarch64__ + vres1 = vcvtq_q32_f32(wrapper::vmul(vcvtq_f32_q32(vres1), scale_v)); + vres2 = vcvtq_q32_f32(wrapper::vmul(vcvtq_f32_q32(vres2), scale_v)); + vres3 = vcvtq_q32_f32(wrapper::vmul(vcvtq_f32_q32(vres3), scale_v)); + vres4 = vcvtq_q32_f32(wrapper::vmul(vcvtq_f32_q32(vres4), scale_v)); +#endif // __aarch64__ const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2))); const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4))); // Store result @@ -217,8 +220,11 @@ void avg_poolingMxNxD_q8_neon_ndhwc( } else { - // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero - res = static_cast(0.5f + static_cast(res) * scale); +#ifdef __aarch64__ + res = arm_compute::round(static_cast(res) * scale, RoundingPolicy::TO_NEAREST_EVEN); +#else // __aarch64__ + res = arm_compute::round(static_cast(res) * scale, RoundingPolicy::TO_ZERO); +#endif // __aarch64__ // Store result *(reinterpret_cast(out.ptr()) + x_off) = res; @@ -396,4 +402,4 @@ void max_poolingMxNxD_q8_neon_ndhwc( } // namespace cpu } // namespace arm_compute -#endif // SRC_CORE_NEON_KERNELS_POOL3D_QUANTIZED_H +#endif // ACL_SRC_CPU_KERNELS_POOL3D_NEON_QUANTIZED_H diff --git a/src/cpu/kernels/softmax/generic/neon/fp16.cpp b/src/cpu/kernels/softmax/generic/neon/fp16.cpp index 425fcf7ac6..a364c1331e 100644 --- a/src/cpu/kernels/softmax/generic/neon/fp16.cpp +++ b/src/cpu/kernels/softmax/generic/neon/fp16.cpp @@ -39,7 +39,7 @@ void neon_fp16_softmax(const ITensor *in, const float beta, int axis, const Window &window, - const float *lut_ptr) + const void *lut_ptr) { ARM_COMPUTE_UNUSED(lut_ptr); if (axis == 0) @@ -58,14 +58,14 @@ template void neon_fp16_softmax(const ITensor *in, const float beta, int axis, const Window &window, - const float *lut_ptr); + const void *lut_ptr); template void neon_fp16_softmax(const ITensor *in, void 
*const tmp, ITensor *out, const float beta, int axis, const Window &window, - const float *lut_ptr); + const void *lut_ptr); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/neon/fp32.cpp b/src/cpu/kernels/softmax/generic/neon/fp32.cpp index a64946eb74..a4ded572fe 100644 --- a/src/cpu/kernels/softmax/generic/neon/fp32.cpp +++ b/src/cpu/kernels/softmax/generic/neon/fp32.cpp @@ -37,7 +37,7 @@ void neon_fp32_softmax(const ITensor *in, const float beta, int axis, const Window &window, - const float *lut_ptr) + const void *lut_ptr) { ARM_COMPUTE_UNUSED(lut_ptr); if (axis == 0) @@ -56,14 +56,14 @@ template void neon_fp32_softmax(const ITensor *in, const float beta, int axis, const Window &window, - const float *lut_ptr); + const void *lut_ptr); template void neon_fp32_softmax(const ITensor *in, void *const tmp, ITensor *out, const float beta, int axis, const Window &window, - const float *lut_ptr); + const void *lut_ptr); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/neon/impl.cpp b/src/cpu/kernels/softmax/generic/neon/impl.cpp index 31baf8a9df..8448fb8088 100644 --- a/src/cpu/kernels/softmax/generic/neon/impl.cpp +++ b/src/cpu/kernels/softmax/generic/neon/impl.cpp @@ -40,8 +40,9 @@ void neon_softmax_x_quantized( const int input_width = in->info()->valid_region().shape.x(); - const float scale_beta = -beta * in->info()->quantization_info().uniform().scale; - const float32x4_t scale_beta_vec = vdupq_n_f32(scale_beta); + const float scale_beta = -beta * in->info()->quantization_info().uniform().scale; + const float32x4_t scale_beta_vec = vdupq_n_f32(scale_beta); + const UniformQuantizationInfo out_qinfo = out->info()->quantization_info().uniform(); Iterator in_it(in, window); Iterator out_it(out, window); @@ -198,18 +199,22 @@ void neon_softmax_x_quantized( int x = 0; for (; x <= (input_width - vec_size); x += vec_size) { - using int_vec_type = wrapper::traits::neon_vector_t; - 
float32x4x4_t vec_in = vld4q_f32(tmp_ptr + x); - int_vec_type normalized_value{}; + using int_vec_type = wrapper::traits::neon_vector_t; + const float32x4x4_t vec_in = vld4q_f32(tmp_ptr + x); + int_vec_type normalized_value{}; if (IS_LOG) { - const float32x4x4_t sub = { - vsubq_f32(vec_in.val[0], sum_vec), - vsubq_f32(vec_in.val[1], sum_vec), - vsubq_f32(vec_in.val[2], sum_vec), - vsubq_f32(vec_in.val[3], sum_vec), + const float32x4_t out_offset = vdupq_n_f32(static_cast(out_qinfo.offset)); + const float32x4_t out_inv_scale = vdupq_n_f32(1.f / out_qinfo.scale); + + const float32x4x4_t normalized_value_f = { + vmlaq_f32(out_offset, vsubq_f32(vec_in.val[0], sum_vec), out_inv_scale), + vmlaq_f32(out_offset, vsubq_f32(vec_in.val[1], sum_vec), out_inv_scale), + vmlaq_f32(out_offset, vsubq_f32(vec_in.val[2], sum_vec), out_inv_scale), + vmlaq_f32(out_offset, vsubq_f32(vec_in.val[3], sum_vec), out_inv_scale), }; - normalized_value = convert_float_to_int(sub); + + normalized_value = convert_float_to_int(normalized_value_f); } else { @@ -238,7 +243,13 @@ void neon_softmax_x_quantized( { if (IS_LOG) { - out_ptr[x] = utils::cast::saturate_cast(tmp_ptr[x] - sum_transformed); + const float diff = tmp_ptr[x] - sum_transformed; +#ifdef __aarch64__ + constexpr auto policy = RoundingPolicy::TO_NEAREST_EVEN; +#else // __aarch64__ + constexpr auto policy = RoundingPolicy::TO_ZERO; +#endif // __aarch64__ + out_ptr[x] = Qasymm8QuantizationHelper::quantize(diff, out_qinfo, policy); } else { @@ -277,6 +288,8 @@ void neon_softmax_non_x_quantized( const int axis_width = in_info->dimension(axis); const int end_actual = std::min(window[0].end(), x_width); + const UniformQuantizationInfo out_qinfo = out->info()->quantization_info().uniform(); + execute_window_loop( window, [&](const Coordinates &winCoords) @@ -488,13 +501,21 @@ void neon_softmax_non_x_quantized( if (IS_LOG) { - const float32x4x4_t sub = { - vsubq_f32(vec_in.val[0], vec_sum_transformed.val[0]), - vsubq_f32(vec_in.val[1], 
vec_sum_transformed.val[1]), - vsubq_f32(vec_in.val[2], vec_sum_transformed.val[2]), - vsubq_f32(vec_in.val[3], vec_sum_transformed.val[3]), + const float32x4_t out_offset = vdupq_n_f32(static_cast(out_qinfo.offset)); + const float32x4_t out_inv_scale = vdupq_n_f32(1.f / out_qinfo.scale); + + const float32x4x4_t normalized_value_f = { + vmlaq_f32(out_offset, vsubq_f32(vec_in.val[0], vec_sum_transformed.val[0]), + out_inv_scale), + vmlaq_f32(out_offset, vsubq_f32(vec_in.val[1], vec_sum_transformed.val[1]), + out_inv_scale), + vmlaq_f32(out_offset, vsubq_f32(vec_in.val[2], vec_sum_transformed.val[2]), + out_inv_scale), + vmlaq_f32(out_offset, vsubq_f32(vec_in.val[3], vec_sum_transformed.val[3]), + out_inv_scale), }; - normalized_value = convert_float_to_int(sub); + + normalized_value = convert_float_to_int(normalized_value_f); } else { @@ -528,19 +549,28 @@ void neon_softmax_non_x_quantized( float *const base_ptr_tmp = (i * tmp_axis_stride) + reinterpret_cast(tmp_ptr); if (IS_LOG) { +#ifdef __aarch64__ + constexpr auto policy = RoundingPolicy::TO_NEAREST_EVEN; +#else // __aarch64__ + constexpr auto policy = RoundingPolicy::TO_ZERO; +#endif // __aarch64__ + for (int k = 0; k < num_remaining_full; ++k) { for (int j = 0; j < 4; ++j) { - *(base_ptr_out + (4 * k + j)) = utils::cast::saturate_cast( - (*(base_ptr_tmp + (4 * k + j)) - vec_sum_transformed.val[k][j])); + const float diff = *(base_ptr_tmp + (4 * k + j)) - vec_sum_transformed.val[k][j]; + + *(base_ptr_out + (4 * k + j)) = + Qasymm8QuantizationHelper::quantize(diff, out_qinfo, policy); } } for (int j = 0; j < num_remaining_partial; ++j) { + const float diff = *(base_ptr_tmp + (4 * num_remaining_full + j)) - + vec_sum_transformed.val[num_remaining_full][j]; *(base_ptr_out + (4 * num_remaining_full + j)) = - utils::cast::saturate_cast(*(base_ptr_tmp + (4 * num_remaining_full + j)) - - vec_sum_transformed.val[num_remaining_full][j]); + Qasymm8QuantizationHelper::quantize(diff, out_qinfo, policy); } } else diff --git 
a/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp b/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp index 369f9bb005..e16eff3ac6 100644 --- a/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp +++ b/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp @@ -36,7 +36,7 @@ void neon_qasymm8_softmax(const ITensor *in, const float beta, int axis, const Window &window, - const float *lut_ptr) + const void *lut_ptr) { ARM_COMPUTE_UNUSED(lut_ptr); if (axis == 0) @@ -55,14 +55,14 @@ template void neon_qasymm8_softmax(const ITensor *in, const float beta, int axis, const Window &window, - const float *lut_ptr); + const void *lut_ptr); template void neon_qasymm8_softmax(const ITensor *in, void *const tmp, ITensor *out, const float beta, int axis, const Window &window, - const float *lut_ptr); + const void *lut_ptr); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp index 594ceb7654..a2832dcca2 100644 --- a/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp +++ b/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp @@ -36,7 +36,7 @@ void neon_qasymm8_signed_softmax(const ITensor *in, const float beta, int axis, const Window &window, - const float *lut_ptr) + const void *lut_ptr) { ARM_COMPUTE_UNUSED(lut_ptr); if (axis == 0) @@ -55,14 +55,14 @@ template void neon_qasymm8_signed_softmax(const ITensor *in, const float beta, int axis, const Window &window, - const float *lut_ptr); + const void *lut_ptr); template void neon_qasymm8_signed_softmax(const ITensor *in, void *const tmp, ITensor *out, const float beta, int axis, const Window &window, - const float *lut_ptr); + const void *lut_ptr); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sme2/fp16.cpp b/src/cpu/kernels/softmax/generic/sme2/fp16.cpp index e70c9f4793..95550548bf 100644 --- a/src/cpu/kernels/softmax/generic/sme2/fp16.cpp +++ 
b/src/cpu/kernels/softmax/generic/sme2/fp16.cpp @@ -720,13 +720,8 @@ loop_3_end%=: ); } -void sme2_fp16_softmax(const ITensor *in, - void *const, - ITensor *out, - const float beta, - int axis, - const Window &window, - const float *lut_ptr) +void sme2_fp16_softmax( + const ITensor *in, void *const, ITensor *out, const float beta, int axis, const Window &window, const void *lut_ptr) { ARM_COMPUTE_UNUSED(lut_ptr); ARM_COMPUTE_UNUSED(axis); diff --git a/src/cpu/kernels/softmax/generic/sme2/fp32.cpp b/src/cpu/kernels/softmax/generic/sme2/fp32.cpp index 5e29d51746..d08bed7ad9 100644 --- a/src/cpu/kernels/softmax/generic/sme2/fp32.cpp +++ b/src/cpu/kernels/softmax/generic/sme2/fp32.cpp @@ -524,13 +524,8 @@ loop_3_end%=: ); } -void sme2_fp32_softmax(const ITensor *in, - void *const, - ITensor *out, - const float beta, - int axis, - const Window &window, - const float *lut_ptr) +void sme2_fp32_softmax( + const ITensor *in, void *const, ITensor *out, const float beta, int axis, const Window &window, const void *lut_ptr) { ARM_COMPUTE_UNUSED(lut_ptr); ARM_COMPUTE_UNUSED(axis); diff --git a/src/cpu/kernels/softmax/generic/sme2/qasymm8.cpp b/src/cpu/kernels/softmax/generic/sme2/qasymm8.cpp index 8bde7dc907..f3d443f9aa 100644 --- a/src/cpu/kernels/softmax/generic/sme2/qasymm8.cpp +++ b/src/cpu/kernels/softmax/generic/sme2/qasymm8.cpp @@ -566,10 +566,12 @@ void sme2_qasymm8_softmax_lut_512VL(const ITensor *in, const float beta, int axis, const Window &window, - const float *lut_ptr) + const void *lut_ptr) { ARM_COMPUTE_UNUSED(axis); + auto lut_fp32_ptr = reinterpret_cast(lut_ptr); + const auto *src_info = in->info(); const auto *dst_info = out->info(); @@ -624,7 +626,7 @@ void sme2_qasymm8_softmax_lut_512VL(const ITensor *in, auto *k_tmp = reinterpret_cast(tmp_float_ptr + k_tmp_offset); auto *k_dst = reinterpret_cast(out->buffer() + k_dst_offset); - sme2_qasymm8_softmax_kernel_512VL(k_src, k_dst, beta, k_shape, k_src_strides, k_dst_strides, lut_ptr, k_tmp); + 
sme2_qasymm8_softmax_kernel_512VL(k_src, k_dst, beta, k_shape, k_src_strides, k_dst_strides, lut_fp32_ptr, k_tmp); } } // namespace cpu diff --git a/src/cpu/kernels/softmax/generic/sme2/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/sme2/qasymm8_signed.cpp index f3667d4ad8..4a71914006 100644 --- a/src/cpu/kernels/softmax/generic/sme2/qasymm8_signed.cpp +++ b/src/cpu/kernels/softmax/generic/sme2/qasymm8_signed.cpp @@ -587,10 +587,12 @@ void sme2_qasymm8_signed_softmax_lut_512VL(const ITensor *in, const float beta, int axis, const Window &window, - const float *lut_ptr) + const void *lut_ptr) { ARM_COMPUTE_UNUSED(axis); + auto lut_fp32_ptr = reinterpret_cast(lut_ptr); + const auto *src_info = in->info(); const auto *dst_info = out->info(); @@ -645,7 +647,8 @@ void sme2_qasymm8_signed_softmax_lut_512VL(const ITensor *in, auto *k_tmp = reinterpret_cast(tmp_float_ptr + k_tmp_offset); auto *k_dst = reinterpret_cast(out->buffer() + k_dst_offset); - sme2_qasymm8_signed_softmax_kernel_512VL(k_src, k_dst, beta, k_shape, k_src_strides, k_dst_strides, lut_ptr, k_tmp); + sme2_qasymm8_signed_softmax_kernel_512VL(k_src, k_dst, beta, k_shape, k_src_strides, k_dst_strides, lut_fp32_ptr, + k_tmp); } } // namespace cpu diff --git a/src/cpu/kernels/softmax/list.h b/src/cpu/kernels/softmax/list.h index 7bbb265022..9b11f1eaed 100644 --- a/src/cpu/kernels/softmax/list.h +++ b/src/cpu/kernels/softmax/list.h @@ -31,7 +31,7 @@ namespace cpu #define DECLARE_SOFTMAX_KERNEL(func_name) \ template \ void func_name(const ITensor *in, void *const tmp, ITensor *out, const float beta, int axis, const Window &window, \ - const float *lut_ptr) + const void *lut_ptr) DECLARE_SOFTMAX_KERNEL(neon_fp32_softmax); DECLARE_SOFTMAX_KERNEL(neon_fp16_softmax); @@ -46,7 +46,7 @@ void sme2_fp32_softmax(const ITensor *in, const float beta, int axis, const Window &window, - const float *lut_ptr); + const void *lut_ptr); void sme2_fp16_softmax(const ITensor *in, void *const tmp, @@ -54,7 +54,7 @@ void 
sme2_fp16_softmax(const ITensor *in, const float beta, int axis, const Window &window, - const float *lut_ptr); + const void *lut_ptr); void sme2_qasymm8_softmax_lut_512VL(const ITensor *in, void *const tmp, @@ -62,7 +62,7 @@ void sme2_qasymm8_softmax_lut_512VL(const ITensor *in, const float beta, int axis, const Window &window, - const float *lut_ptr); + const void *lut_ptr); void sme2_qasymm8_signed_softmax_lut_512VL(const ITensor *in, void *const tmp, @@ -70,7 +70,7 @@ void sme2_qasymm8_signed_softmax_lut_512VL(const ITensor *in, const float beta, int axis, const Window &window, - const float *lut_ptr); + const void *lut_ptr); #endif // ARM_COMPUTE_ENABLE_SME2 diff --git a/src/cpu/kernels/sub/neon/impl.h b/src/cpu/kernels/sub/neon/impl.h index 6123f7e25a..d641ec4ec9 100644 --- a/src/cpu/kernels/sub/neon/impl.h +++ b/src/cpu/kernels/sub/neon/impl.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023 Arm Limited. + * Copyright (c) 2021-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -91,30 +91,45 @@ void sub_same_neon( // Compute S elements per iteration int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) + if (is_broadcast_input_2) { - const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); - auto res = is_sat ? wrapper::vqsub(broadcast_value_vec, non_broadcast_v) - : wrapper::vsub(broadcast_value_vec, non_broadcast_v); - if (is_broadcast_input_2) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - res = wrapper::vmul(res, wrapper::vdup_n(static_cast(-1), ExactTagType{})); + const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); + auto res = is_sat ? 
wrapper::vqsub(non_broadcast_v, broadcast_value_vec) + : wrapper::vsub(non_broadcast_v, broadcast_value_vec); + wrapper::vstore(output_ptr + x, res); } - wrapper::vstore(output_ptr + x, res); - } - // Compute left-over elements - for (; x < window_end_x; ++x) + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto non_broadcast_v = *(non_broadcast_input_ptr + x); + auto res = is_sat ? wrapper::sub_sat(non_broadcast_v, broadcast_value) + : non_broadcast_v - broadcast_value; + + *(output_ptr + x) = res; + } + } + else { - const auto non_broadcast_v = *(non_broadcast_input_ptr + x); - auto res = - is_sat ? wrapper::sub_sat(broadcast_value, non_broadcast_v) : broadcast_value - non_broadcast_v; - if (is_broadcast_input_2) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - res = static_cast(-1) * res; + const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); + auto res = is_sat ? wrapper::vqsub(broadcast_value_vec, non_broadcast_v) + : wrapper::vsub(broadcast_value_vec, non_broadcast_v); + wrapper::vstore(output_ptr + x, res); } - *(output_ptr + x) = res; + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto non_broadcast_v = *(non_broadcast_input_ptr + x); + auto res = is_sat ? wrapper::sub_sat(broadcast_value, non_broadcast_v) + : broadcast_value - non_broadcast_v; + + *(output_ptr + x) = res; + } } }, broadcast_input, non_broadcast_input, output); diff --git a/src/cpu/kernels/sub/neon/qsymm16.cpp b/src/cpu/kernels/sub/neon/qsymm16.cpp index 23e4b03843..d6b9a73727 100644 --- a/src/cpu/kernels/sub/neon/qsymm16.cpp +++ b/src/cpu/kernels/sub/neon/qsymm16.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021, 2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -55,8 +55,6 @@ void sub_qsymm16_neon( const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform(); const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); - const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale); - const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale); const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale); if (is_broadcast_across_x) @@ -69,6 +67,9 @@ void sub_qsymm16_neon( const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); + const float32x4_t vbroadcast_scale = vdupq_n_f32(broadcast_qinfo.scale); + const float32x4_t vnon_broadcast_scale = vdupq_n_f32(non_broadcast_qinfo.scale); + // Clear X Dimension on execution window as we handle manually non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); @@ -87,8 +88,8 @@ void sub_qsymm16_neon( const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value); const float32x4x2_t bf = {{ - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale2), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale2), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vbroadcast_scale), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vbroadcast_scale), }}; const float bfs = static_cast(broadcast_value) * broadcast_qinfo.scale; @@ -98,24 +99,24 @@ void sub_qsymm16_neon( { const int16x8_t a = vld1q_s16(non_broadcast_input_ptr + x); const float32x4x2_t af = {{ - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vnon_broadcast_scale), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vnon_broadcast_scale), }}; const 
int32x4x4_t rf = {{ #ifdef __aarch64__ - vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) - : vsubq_f32(af.val[0], bf.val[0]), + vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(af.val[0], bf.val[0]) + : vsubq_f32(bf.val[0], af.val[0]), invvscaleo)), - vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) - : vsubq_f32(af.val[1], bf.val[1]), + vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(af.val[1], bf.val[1]) + : vsubq_f32(bf.val[1], af.val[1]), invvscaleo)), #else //__aarch64__ - vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) - : vsubq_f32(af.val[0], bf.val[0]), + vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(af.val[0], bf.val[0]) + : vsubq_f32(bf.val[0], af.val[0]), invvscaleo)), - vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) - : vsubq_f32(af.val[1], bf.val[1]), + vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(af.val[1], bf.val[1]) + : vsubq_f32(bf.val[1], af.val[1]), invvscaleo)), #endif //__aarch64__ }}; @@ -128,13 +129,16 @@ void sub_qsymm16_neon( for (; x < window_end_x; ++x) { const float afs = static_cast(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.scale; - *(output_ptr + x) = quantize_qsymm16(is_broadcast_input_2 ? (bfs - afs) : (afs - bfs), oq_info); + *(output_ptr + x) = quantize_qsymm16(is_broadcast_input_2 ? 
(afs - bfs) : (bfs - afs), oq_info); } }, broadcast_input, non_broadcast_input, output); } else { + const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale); + const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale); + // Clear X Dimension on execution window as we handle manually input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); diff --git a/src/cpu/operators/CpuConv2d.cpp b/src/cpu/operators/CpuConv2d.cpp index 26ca2ee783..23f51cda24 100644 --- a/src/cpu/operators/CpuConv2d.cpp +++ b/src/cpu/operators/CpuConv2d.cpp @@ -233,10 +233,9 @@ ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo *i return ConvolutionMethod::GEMM; } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - // This heuristics only applies to F16 data type on A55r1 - if (NEScheduler::get().cpu_info().get_cpu_model() == CPUModel::A55r1 && enable_fast_math && - input->data_type() == DataType::F16) +#if ARM_COMPUTE_ENABLE_FP16 + // This heuristics only applies to F16 + if (CPUInfo::get().has_fp16() && enable_fast_math && input->data_type() == DataType::F16) { // Exclude known bad winograd configs (and defaults to GEMM) const std::vector known_bad_winograd_f16_with_fastmath_configs = { @@ -270,7 +269,7 @@ ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo *i return ConvolutionMethod::GEMM; } } -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#endif // ARM_COMPUTE_ENABLE_FP16 // For 1x1 convolutions run the default GEMM if (weights->dimension(idx_w) == 1 && weights->dimension(idx_h) == 1) diff --git a/src/cpu/operators/CpuDepthwiseConv2d.cpp b/src/cpu/operators/CpuDepthwiseConv2d.cpp index 54075f2afa..5fe91aa5d8 100644 --- a/src/cpu/operators/CpuDepthwiseConv2d.cpp +++ b/src/cpu/operators/CpuDepthwiseConv2d.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023 Arm Limited. + * Copyright (c) 2021-2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -45,12 +45,6 @@ Status validate_arguments_optimized(const ITensorInfo *src, const ConvolutionInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, - DataType::F16, DataType::F32); - if (!is_data_type_quantized_per_channel(weights->data_type())) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); - } ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON(info.dilation.x() < 1 || info.dilation.y() < 1); const size_t idx_w = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH); diff --git a/src/runtime/CL/CLScheduler.cpp b/src/runtime/CL/CLScheduler.cpp index 4d84c1b257..ae74ca8d64 100644 --- a/src/runtime/CL/CLScheduler.cpp +++ b/src/runtime/CL/CLScheduler.cpp @@ -105,7 +105,9 @@ CLScheduler::CLScheduler() _backend_type(CLBackendType::Native), _job_chaining_enabled(true), _job_chaining_size(1), - _job_chaining_count(0) + _job_chaining_count(0), + _enqueue_count(0), + _flush_count(0) { } @@ -201,14 +203,11 @@ void CLScheduler::enqueue_common(ICLKernel &kernel, ITensorPack &tensors, bool f void CLScheduler::flush_queue(bool flush) { - if (flush) - { - _queue.flush(); - _job_chaining_count = 0; - return; - } + _enqueue_count++; + _flush_count += flush; + const float flush_ratio = _flush_count / (float)_enqueue_count; - if (_job_chaining_enabled) + if (_enqueue_count > 100 && flush_ratio > 0.5f && _job_chaining_enabled) { ++_job_chaining_count; @@ -228,6 +227,11 @@ void CLScheduler::flush_queue(bool flush) _queue.flush(); } } + else if (flush) + { + _job_chaining_count = 0; + _queue.flush(); + } } void CLScheduler::enqueue(ICLKernel &kernel, bool flush) diff --git a/src/runtime/CL/CLTensorAllocator.cpp b/src/runtime/CL/CLTensorAllocator.cpp index e6457218c7..b109288bfb 100644 --- a/src/runtime/CL/CLTensorAllocator.cpp +++ 
b/src/runtime/CL/CLTensorAllocator.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2021 Arm Limited. + * Copyright (c) 2016-2021, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -168,6 +168,11 @@ void CLTensorAllocator::free() info().set_is_resizable(true); } +bool CLTensorAllocator::is_allocated() const +{ + return _memory.region() != nullptr; +} + Status CLTensorAllocator::import_memory(cl::Buffer buffer) { ARM_COMPUTE_RETURN_ERROR_ON(buffer.get() == nullptr); diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp index 7767b45a01..bb3edb2323 100644 --- a/src/runtime/CL/functions/CLConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2023 Arm Limited. + * Copyright (c) 2017-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -49,6 +49,7 @@ struct CLConvolutionLayer::Impl WorkspaceData workspace{}; experimental::MemoryRequirements aux_mem_req{}; std::unique_ptr func{nullptr}; + bool is_prepared{false}; }; CLConvolutionLayer::CLConvolutionLayer(std::shared_ptr memory_manager) : _impl(std::make_unique()) @@ -126,9 +127,10 @@ void CLConvolutionLayer::configure(const CLCompileContext &compile_context, _impl->aux_mem_req = _impl->op->workspace(); _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}}; _impl->prep_pack = {{ACL_SRC_1, weights}, {ACL_SRC_2, biases}}; - _impl->workspace = - manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); + _impl->workspace = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, + _impl->prep_pack, /* allocate_now */ false); } + _impl->is_prepared = false; } Status CLConvolutionLayer::validate(const ITensorInfo *input, @@ -208,16 +210,22 @@ void CLConvolutionLayer::run() void CLConvolutionLayer::prepare() { - if (_impl->func) + if (!_impl->is_prepared) { - _impl->func->prepare(); - } - else - { - 
_impl->op->prepare(_impl->prep_pack); + if (_impl->func) + { + _impl->func->prepare(); + } + else + { + allocate_tensors(_impl->aux_mem_req, _impl->workspace); + _impl->op->prepare(_impl->prep_pack); + + // Release temporary tensors that are only used in prepare stage + release_temporaries(_impl->aux_mem_req, _impl->workspace); + } - // Release temporary tensors that are only used in prepare stage - release_temporaries(_impl->aux_mem_req, _impl->workspace); + _impl->is_prepared = true; } } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp index b30f9e701f..6296b8e054 100644 --- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp +++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021, 2023 Arm Limited. + * Copyright (c) 2017-2021, 2023-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -97,8 +97,8 @@ void CLFullyConnectedLayer::configure(const CLCompileContext &compile_context, { _impl->aux_mem_req = _impl->op->workspace(); _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}}; - _impl->workspace = - manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack); + _impl->workspace = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, + _impl->run_pack, /* allocate_now */ false); } else { @@ -121,10 +121,7 @@ Status CLFullyConnectedLayer::validate(const ITensorInfo *input, void CLFullyConnectedLayer::run() { - if (!_impl->dynamic_weights) - { - prepare(); - } + prepare(); MemoryGroupResourceScope scope_mg(_impl->memory_group); _impl->op->run(_impl->run_pack); @@ -134,26 +131,31 @@ void CLFullyConnectedLayer::prepare() { if (!_impl->is_prepared) { - _impl->op->prepare(_impl->run_pack); + allocate_tensors(_impl->aux_mem_req, _impl->workspace); + if (!_impl->dynamic_weights) + { + _impl->op->prepare(_impl->run_pack); - // 
Release temporary tensors that are only used in prepare stage - release_temporaries(_impl->aux_mem_req, _impl->workspace); - _impl->is_prepared = true; + // Release temporary tensors that are only used in prepare stage + release_temporaries(_impl->aux_mem_req, _impl->workspace); - // Handle weights managed infrastructure - if (_impl->weights_manager != nullptr && _impl->weights_manager->are_weights_managed(_impl->original_weights)) - { - // Ensure that b gets marked as unused (memory released) only after the last function which uses b also finishes its prepare - // This is for cases where multiple functions share the same b (weights) - // Therefore when a function marks original b as unused, we pre-mark it in weights manager, and mark it back to used so that it doesn't get released before its last reference - const ITensor *original_b = _impl->original_weights; - if (!original_b->is_used()) + // Handle weights managed infrastructure + if (_impl->weights_manager != nullptr && + _impl->weights_manager->are_weights_managed(_impl->original_weights)) { - _impl->weights_manager->pre_mark_as_unused(original_b); + // Ensure that b gets marked as unused (memory released) only after the last function which uses b also finishes its prepare + // This is for cases where multiple functions share the same b (weights) + // Therefore when a function marks original b as unused, we pre-mark it in weights manager, and mark it back to used so that it doesn't get released before its last reference + const ITensor *original_b = _impl->original_weights; + if (!original_b->is_used()) + { + _impl->weights_manager->pre_mark_as_unused(original_b); + } + _impl->original_weights->mark_as_used(); + _impl->weights_manager->release(_impl->original_weights); } - _impl->original_weights->mark_as_used(); - _impl->weights_manager->release(_impl->original_weights); } + _impl->is_prepared = true; } } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLGEMM.cpp 
b/src/runtime/CL/functions/CLGEMM.cpp index 871a1d6e27..bc66205af4 100644 --- a/src/runtime/CL/functions/CLGEMM.cpp +++ b/src/runtime/CL/functions/CLGEMM.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2022 Arm Limited. + * Copyright (c) 2017-2022, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -102,8 +102,8 @@ void CLGEMM::configure(const CLCompileContext &compile_context, _impl->run_pack = {{ACL_SRC_0, a}, {ACL_SRC_2, c}, {ACL_DST, output}}; _impl->prep_pack = {{ACL_SRC_1, _impl->b}}; - _impl->workspace_tensors = - manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack); + _impl->workspace_tensors = manage_workspace( + _impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack, /* allocate_now */ false); } } @@ -131,6 +131,7 @@ void CLGEMM::prepare() { if (!_impl->is_prepared) { + allocate_tensors(_impl->aux_mem_req, _impl->workspace_tensors); _impl->op->prepare(_impl->prep_pack); auto has_reshape = diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp index aef7cddd7a..5439129ab0 100644 --- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021, 2023 Arm Limited. + * Copyright (c) 2017-2021, 2023-2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -109,9 +109,9 @@ void CLGEMMConvolutionLayer::configure(const CLCompileContext &compile_contex {TensorType::ACL_SRC_1, weights}, {TensorType::ACL_SRC_2, biases}, }; - _impl->aux_mem_req = _impl->op->workspace(); - _impl->workspace_tensors = - manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); + _impl->aux_mem_req = _impl->op->workspace(); + _impl->workspace_tensors = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, + _impl->prep_pack, /* allocate_now */ false); } Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, @@ -139,6 +139,7 @@ void CLGEMMConvolutionLayer::prepare() { if (!_impl->is_prepared) { + allocate_tensors(_impl->aux_mem_req, _impl->workspace_tensors); _impl->op->prepare(_impl->prep_pack); auto has_reshape = std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(), diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp index 8bad198658..d3a2e7dc09 100644 --- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp +++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -94,9 +94,9 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con } else { - _impl->run_pack = {{ACL_SRC_0, a}, {ACL_SRC_1, _impl->b}, {ACL_SRC_2, c}, {ACL_DST, output}}; - _impl->workspace_tensors = - manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->run_pack); + _impl->run_pack = {{ACL_SRC_0, a}, {ACL_SRC_1, _impl->b}, {ACL_SRC_2, c}, {ACL_DST, output}}; + _impl->workspace_tensors = manage_workspace( + _impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->run_pack, /* allocate_now */ false); } } @@ -122,6 +122,7 @@ void CLGEMMLowpMatrixMultiplyCore::prepare() { if (!_impl->is_prepared) { + allocate_tensors(_impl->aux_mem_req, _impl->workspace_tensors); _impl->op->prepare(_impl->run_pack); // Release temporary tensors that are only used in prepare stage diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp index 645f817030..bd7e23b980 100644 --- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2021, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -86,12 +86,12 @@ void CLWinogradConvolutionLayer::configure(const CLCompileContext &compile_co (biases != nullptr ? 
biases->info() : nullptr), output->info(), conv_info, act_info, enable_fast_math); - _impl->run_pack = {{TensorType::ACL_SRC_0, _impl->src}, - {TensorType::ACL_SRC_1, _impl->weights}, - {TensorType::ACL_SRC_2, _impl->biases}, - {TensorType::ACL_DST, _impl->dst}}; - _impl->workspace_tensors = - manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->run_pack); + _impl->run_pack = {{TensorType::ACL_SRC_0, _impl->src}, + {TensorType::ACL_SRC_1, _impl->weights}, + {TensorType::ACL_SRC_2, _impl->biases}, + {TensorType::ACL_DST, _impl->dst}}; + _impl->workspace_tensors = manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, + _impl->run_pack, /* allocate_now */ false); } Status CLWinogradConvolutionLayer::validate(const ITensorInfo *input, @@ -116,6 +116,7 @@ void CLWinogradConvolutionLayer::prepare() { if (!_impl->is_prepared) { + allocate_tensors(_impl->op->workspace(), _impl->workspace_tensors); _impl->op->prepare(_impl->run_pack); // Release Preparation tensors diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp index 8efebbbb1a..7107a6be7a 100644 --- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021, 2023 Arm Limited. + * Copyright (c) 2017-2021, 2023-2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -51,6 +51,7 @@ struct NEConvolutionLayer::Impl WorkspaceData workspace{}; experimental::MemoryRequirements aux_mem_req{}; std::unique_ptr func{nullptr}; + bool is_prepared{false}; }; NEConvolutionLayer::NEConvolutionLayer(std::shared_ptr memory_manager) : _impl(std::make_unique()) @@ -113,9 +114,10 @@ void NEConvolutionLayer::configure(ITensor *input, _impl->aux_mem_req = _impl->op->workspace(); _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}}; _impl->prep_pack = {{ACL_SRC_1, weights}, {ACL_SRC_2, biases}}; - _impl->workspace = - manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); + _impl->workspace = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, + _impl->prep_pack, /* allocate_now */ false); } + _impl->is_prepared = false; } Status NEConvolutionLayer::validate(const ITensorInfo *input, @@ -193,16 +195,22 @@ void NEConvolutionLayer::run() void NEConvolutionLayer::prepare() { - if (_impl->func) + if (!_impl->is_prepared) { - _impl->func->prepare(); - } - else - { - _impl->op->prepare(_impl->prep_pack); + if (_impl->func) + { + _impl->func->prepare(); + } + else + { + allocate_tensors(_impl->aux_mem_req, _impl->workspace); + _impl->op->prepare(_impl->prep_pack); + + // Release temporary tensors that are only used in prepare stage + release_temporaries(_impl->aux_mem_req, _impl->workspace); + } - // Release temporary tensors that are only used in prepare stage - release_temporaries(_impl->aux_mem_req, _impl->workspace); + _impl->is_prepared = true; } } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp index 6c085645db..de291355ac 100644 --- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 
2017-2021, 2023 Arm Limited. + * Copyright (c) 2017-2021, 2023-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -244,6 +244,9 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure( _impl->is_nchw = input->info()->data_layout() == DataLayout::NCHW; _impl->is_prepared = !_impl->is_nchw; + _impl->permuted_input = {}; + _impl->permuted_weights = {}; + ITensor *input_to_use = input; const ITensor *weights_to_use = weights; ITensor *output_to_use = output; diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp index 2656d0fa0f..be451bcdeb 100644 --- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp +++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2023 Arm Limited. + * Copyright (c) 2017-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -91,8 +91,8 @@ void NEFullyConnectedLayer::configure(const ITensor *input, _impl->aux_mem_req = _impl->op->workspace(); _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}}; - _impl->workspace = - manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack); + _impl->workspace = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, + _impl->run_pack, /* allocate_now */ false); _impl->dynamic_weights = !weights->info()->are_values_constant() && fc_info.transpose_weights && !fc_info.are_weights_reshaped && !fc_info.retain_internal_weights; @@ -135,6 +135,7 @@ void NEFullyConnectedLayer::prepare() { if (!_impl->is_prepared) { + allocate_tensors(_impl->aux_mem_req, _impl->workspace); _impl->op->prepare(_impl->run_pack); // Release temporary tensors that are only used in prepare stage diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp index 934a8250cc..d26b819864 100644 --- a/src/runtime/NEON/functions/NEGEMM.cpp +++ 
b/src/runtime/NEON/functions/NEGEMM.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2023 Arm Limited. + * Copyright (c) 2017-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -92,8 +92,8 @@ void NEGEMM::configure(const ITensor *a, _impl->aux_mem_req = _impl->op->workspace(); _impl->run_pack = {{ACL_SRC_0, a}, {ACL_SRC_1, b}, {ACL_SRC_2, c}, {ACL_DST, d}}; _impl->prep_pack = {{ACL_SRC_1, b}, {ACL_SRC_2, c}}; - _impl->workspace = - manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); + _impl->workspace = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, + _impl->prep_pack, /* allocate_now */ false); } Status NEGEMM::validate(const ITensorInfo *a, @@ -139,6 +139,7 @@ void NEGEMM::prepare() { if (!_impl->is_prepared) { + allocate_tensors(_impl->aux_mem_req, _impl->workspace); _impl->op->prepare(_impl->prep_pack); auto has_reshape = diff --git a/src/runtime/NEON/functions/NEGEMMConv2d.cpp b/src/runtime/NEON/functions/NEGEMMConv2d.cpp index 6cca02eea9..b5cdd864ba 100644 --- a/src/runtime/NEON/functions/NEGEMMConv2d.cpp +++ b/src/runtime/NEON/functions/NEGEMMConv2d.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021 Arm Limited. + * Copyright (c) 2020-2021, 2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -68,8 +68,8 @@ void NEGEMMConv2d::configure( _impl->aux_mem_req = _impl->op->workspace(); _impl->run_pack = {{TensorType::ACL_SRC_0, input}, {TensorType::ACL_SRC_2, biases}, {TensorType::ACL_DST, output}}; _impl->prep_pack = {{TensorType::ACL_SRC_1, weights}, {TensorType::ACL_SRC_2, biases}}; - _impl->workspace = - manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack); + _impl->workspace = manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, + _impl->prep_pack, /* allocate_now */ false); } Status NEGEMMConv2d::validate(const ITensorInfo *input, @@ -93,6 +93,7 @@ void NEGEMMConv2d::prepare() { if (!_impl->is_prepared) { + allocate_tensors(_impl->aux_mem_req, _impl->workspace); _impl->op->prepare(_impl->prep_pack); auto has_reshape = diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp index be10121a56..03df5115f0 100644 --- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp @@ -69,18 +69,19 @@ void NEGEMMConvolutionLayer::configure(const ITensor *input, { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - _impl->weights = weights; - _impl->op = std::make_unique(); + _impl->is_prepared = false; + _impl->weights = weights; + _impl->op = std::make_unique(); _impl->op->configure(input->info(), weights->info(), (biases != nullptr ? 
biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups); - _impl->run_pack = {{TensorType::ACL_SRC_0, input}, - {TensorType::ACL_SRC_1, weights}, - {TensorType::ACL_SRC_2, biases}, - {TensorType::ACL_DST, output}}; - _impl->aux_mem_req = _impl->op->workspace(); - _impl->workspace_tensors = - manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack); + _impl->run_pack = {{TensorType::ACL_SRC_0, input}, + {TensorType::ACL_SRC_1, weights}, + {TensorType::ACL_SRC_2, biases}, + {TensorType::ACL_DST, output}}; + _impl->aux_mem_req = _impl->op->workspace(); + _impl->workspace_tensors = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, + _impl->run_pack, /* allocate_now */ false); } Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, @@ -129,6 +130,7 @@ void NEGEMMConvolutionLayer::prepare() { if (!_impl->is_prepared) { + allocate_tensors(_impl->aux_mem_req, _impl->workspace_tensors); _impl->op->prepare(_impl->run_pack); // Release temporary tensors that are only used in prepare stage diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp index b9cff8540d..6d172cef27 100644 --- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp +++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp @@ -73,18 +73,19 @@ void NEGEMMLowpMatrixMultiplyCore::configure( b_info_to_use->set_are_values_constant(false); } - _impl->b = b; - _impl->op = std::make_unique(); + _impl->is_prepared = false; + _impl->b = b; + _impl->op = std::make_unique(); _impl->op->configure(a->info(), b_info_to_use.get(), (c != nullptr ? 
c->info() : nullptr), output->info(), gemm_info); - _impl->run_pack = {{TensorType::ACL_SRC_0, a}, - {TensorType::ACL_SRC_1, b}, - {TensorType::ACL_SRC_2, c}, - {TensorType::ACL_DST, output}}; - _impl->prep_pack = {{TensorType::ACL_SRC_1, b}, {TensorType::ACL_SRC_2, c}}; - _impl->aux_mem_req = _impl->op->workspace(); - _impl->workspace_tensors = - manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); + _impl->run_pack = {{TensorType::ACL_SRC_0, a}, + {TensorType::ACL_SRC_1, b}, + {TensorType::ACL_SRC_2, c}, + {TensorType::ACL_DST, output}}; + _impl->prep_pack = {{TensorType::ACL_SRC_1, b}, {TensorType::ACL_SRC_2, c}}; + _impl->aux_mem_req = _impl->op->workspace(); + _impl->workspace_tensors = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, + _impl->prep_pack, /* allocate_now */ false); } Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, @@ -142,6 +143,7 @@ void NEGEMMLowpMatrixMultiplyCore::prepare() { if (!_impl->is_prepared) { + allocate_tensors(_impl->aux_mem_req, _impl->workspace_tensors); _impl->op->prepare(_impl->prep_pack); auto has_reshape = diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp index 7334be8456..b72aff577a 100644 --- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp @@ -69,6 +69,7 @@ void NEWinogradConvolutionLayer::configure(const ITensor *input, const ActivationLayerInfo &act_info, bool enable_fast_math) { + _impl->is_prepared = false; _impl->original_weights = weights; _impl->op = std::make_unique(); _impl->op->configure(input->info(), weights->info(), biases != nullptr ? 
biases->info() : nullptr, output->info(), @@ -77,8 +78,8 @@ void NEWinogradConvolutionLayer::configure(const ITensor *input, _impl->aux_mem_req = _impl->op->workspace(); _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}}; _impl->prep_pack = {{ACL_SRC_1, weights}, {ACL_SRC_2, biases}}; - _impl->workspace = - manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); + _impl->workspace = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, + _impl->prep_pack, /* allocate_now */ false); } void NEWinogradConvolutionLayer::run() @@ -104,6 +105,7 @@ void NEWinogradConvolutionLayer::prepare() { if (!_impl->is_prepared) { + allocate_tensors(_impl->aux_mem_req, _impl->workspace); _impl->op->prepare(_impl->prep_pack); _impl->original_weights->mark_as_unused(); diff --git a/src/runtime/OMP/OMPScheduler.cpp b/src/runtime/OMP/OMPScheduler.cpp index fe4dfdd474..55b25f9098 100644 --- a/src/runtime/OMP/OMPScheduler.cpp +++ b/src/runtime/OMP/OMPScheduler.cpp @@ -81,7 +81,13 @@ void OMPScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints, const Win const Window &max_window = window; const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension()); - const unsigned int num_threads = std::min(num_iterations, _num_threads); + const unsigned int mws = kernel->get_mws(CPUInfo::get(), _num_threads); + + // Ensure each thread has mws amount of work to do (i.e. 
ceil(num_iterations / mws) threads) + const unsigned int candidate_num_threads = (num_iterations + mws - 1) / mws; + + // Cap the number of threads to be spawn with the size of the thread pool + const unsigned int num_threads = std::min(candidate_num_threads, _num_threads); if (!kernel->is_parallelisable() || num_threads == 1) { diff --git a/src/runtime/TensorAllocator.cpp b/src/runtime/TensorAllocator.cpp index 372852bfea..b803f77522 100644 --- a/src/runtime/TensorAllocator.cpp +++ b/src/runtime/TensorAllocator.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 Arm Limited. + * Copyright (c) 2016-2020, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -151,6 +151,11 @@ void TensorAllocator::free() info().set_is_resizable(true); } +bool TensorAllocator::is_allocated() const +{ + return _memory.region() != nullptr; +} + Status TensorAllocator::import_memory(void *memory) { ARM_COMPUTE_RETURN_ERROR_ON(memory == nullptr); diff --git a/src/runtime/experimental/operators/CpuSoftmax.cpp b/src/runtime/experimental/operators/CpuSoftmax.cpp new file mode 100644 index 0000000000..8386fd36ef --- /dev/null +++ b/src/runtime/experimental/operators/CpuSoftmax.cpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2021, 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/runtime/experimental/operators/CpuSoftmax.h" + +#include "src/cpu/operators/CpuSoftmax.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace op +{ + +struct CpuSoftmax::Impl +{ + std::unique_ptr op{nullptr}; +}; + +CpuSoftmax::CpuSoftmax() : impl_(std::make_unique()) +{ + impl_->op = std::make_unique(); +} + +CpuSoftmax::~CpuSoftmax() = default; + +void CpuSoftmax::configure(const ITensorInfo *src, ITensorInfo *dst, float beta, int32_t axis, bool is_log) +{ + impl_->op->configure(src, dst, beta, axis, is_log); +} + +Status CpuSoftmax::validate(const ITensorInfo *src, const ITensorInfo *dst, float beta, int32_t axis, bool is_log) +{ + return cpu::CpuSoftmaxGeneric::validate(src, dst, beta, axis, is_log); +} + +void CpuSoftmax::run(ITensorPack &tensor) +{ + impl_->op->run(tensor); +} + +experimental::MemoryRequirements CpuSoftmax::workspace() const +{ + return impl_->op->workspace(); +} + +void CpuSoftmax::prepare(ITensorPack &constants) +{ + ARM_COMPUTE_UNUSED(constants); +} + +} // namespace op +} // namespace experimental +} // namespace arm_compute diff --git a/support/Bfloat16.h b/support/Bfloat16.h index 02772898a8..7c5ef78848 100644 --- a/support/Bfloat16.h +++ b/support/Bfloat16.h @@ -31,6 +31,26 @@ namespace arm_compute { namespace { +/** Convert float to bfloat16 in a portable way that works on older hardware + * + * @param[in] v Floating-point value to convert to bfloat + * + * @return Converted value + */ +inline uint16_t 
portable_float_to_bf16(const float v) +{ + const uint32_t *fromptr = reinterpret_cast(&v); + uint16_t res = (*fromptr >> 16); + const uint16_t error = (*fromptr & 0x0000ffff); + uint16_t bf_l = res & 0x0001; + + if ((error > 0x8000) || ((error == 0x8000) && (bf_l != 0))) + { + res += 1; + } + return res; +} + /** Convert float to bfloat16 * * @param[in] v Floating-point value to convert to bfloat @@ -39,9 +59,9 @@ namespace */ inline uint16_t float_to_bf16(const float v) { - const uint32_t *fromptr = reinterpret_cast(&v); #if defined(ARM_COMPUTE_ENABLE_BF16) - uint16_t res; + const uint32_t *fromptr = reinterpret_cast(&v); + uint16_t res; __asm __volatile("ldr s0, [%[fromptr]]\n" ".inst 0x1e634000\n" // BFCVT h0, s0 @@ -49,16 +69,10 @@ inline uint16_t float_to_bf16(const float v) : : [fromptr] "r"(fromptr), [toptr] "r"(&res) : "v0", "memory"); + return res; #else /* defined(ARM_COMPUTE_ENABLE_BF16) */ - uint16_t res = (*fromptr >> 16); - const uint16_t error = (*fromptr & 0x0000ffff); - uint16_t bf_l = res & 0x0001; - if ((error > 0x8000) || ((error == 0x8000) && (bf_l != 0))) - { - res += 1; - } + return portable_float_to_bf16(v); #endif /* defined(ARM_COMPUTE_ENABLE_BF16) */ - return res; } /** Convert bfloat16 to float @@ -91,6 +105,15 @@ class bfloat16 final bfloat16(float v) : value(float_to_bf16(v)) { } + /** Constructor + * + * @param[in] v Floating-point value + * @param[in] portable bool to indicate the conversion is to be done in a backward compatible way + */ + bfloat16(float v, bool portable) : value(0) + { + value = portable ? portable_float_to_bf16(v) : float_to_bf16(v); + } /** Assignment operator * * @param[in] v Floating point value to assign diff --git a/tests/AssetsLibrary.cpp b/tests/AssetsLibrary.cpp index 571b55125b..5eb8179704 100644 --- a/tests/AssetsLibrary.cpp +++ b/tests/AssetsLibrary.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020, 2023 Arm Limited. + * Copyright (c) 2017-2020, 2023-2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -267,6 +267,11 @@ std::random_device::result_type AssetsLibrary::seed() const return _seed; } +void AssetsLibrary::set_seed(std::random_device::result_type seed) +{ + _seed = seed; +} + void AssetsLibrary::fill(RawTensor &raw, const std::string &name, Format format) const { //FIXME: Should be done by swapping cached buffers diff --git a/tests/AssetsLibrary.h b/tests/AssetsLibrary.h index bd97cb7bd4..dedad5227f 100644 --- a/tests/AssetsLibrary.h +++ b/tests/AssetsLibrary.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2022 Arm Limited. + * Copyright (c) 2017-2022, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_TEST_TENSOR_LIBRARY_H -#define ARM_COMPUTE_TEST_TENSOR_LIBRARY_H +#ifndef ACL_TESTS_ASSETSLIBRARY_H +#define ACL_TESTS_ASSETSLIBRARY_H #include "arm_compute/core/Coordinates.h" #include "arm_compute/core/Error.h" @@ -76,6 +76,12 @@ class AssetsLibrary final */ std::string path() const; + /** Set the seed that is used to fill tensors with random values. + * + * @param[in] the initial random seed to set. + */ + void set_seed(std::random_device::result_type); + /** Seed that is used to fill tensors with random values. * * @return the initial random seed. 
@@ -588,7 +594,6 @@ void AssetsLibrary::fill_with_generator(T &&tensor, const GeneratorFunctionType< { const bool is_nhwc = tensor.data_layout() == DataLayout::NHWC; TensorShape shape(tensor.shape()); - if(is_nhwc) { // Ensure that the equivalent tensors will be filled for both data layouts @@ -739,6 +744,7 @@ void AssetsLibrary::fill_tensor_uniform(T &&tensor, std::random_device::result_t break; } case DataType::U16: + case DataType::QASYMM16: { std::uniform_int_distribution distribution_u16(std::numeric_limits::lowest(), std::numeric_limits::max()); fill(tensor, distribution_u16, seed_offset); @@ -778,7 +784,7 @@ void AssetsLibrary::fill_tensor_uniform(T &&tensor, std::random_device::result_t case DataType::BFLOAT16: { // It doesn't make sense to check [-inf, inf], so hard code it to a big number - arm_compute::utils::uniform_real_distribution_16bit distribution_bf16{ -1000.f, 1000.f }; + arm_compute::utils::uniform_real_distribution_16bit distribution_bf16{ -1000.f, 1000.f, true /* portable */ }; fill(tensor, distribution_bf16, seed_offset); break; } @@ -1057,4 +1063,4 @@ void AssetsLibrary::fill_tensor_value(T &&tensor, D value) const } } // namespace test } // namespace arm_compute -#endif /* ARM_COMPUTE_TEST_TENSOR_LIBRARY_H */ +#endif // ACL_TESTS_ASSETSLIBRARY_H diff --git a/tests/datasets/DatatypeDataset.h b/tests/datasets/DatatypeDataset.h index 4cce7bb375..b1928b5e1d 100644 --- a/tests/datasets/DatatypeDataset.h +++ b/tests/datasets/DatatypeDataset.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2020, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,10 +21,10 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef ARM_COMPUTE_TEST_DATATYPE_DATASET_H -#define ARM_COMPUTE_TEST_DATATYPE_DATASET_H +#ifndef ACL_TESTS_DATASETS_DATATYPEDATASET_H +#define ACL_TESTS_DATASETS_DATATYPEDATASET_H -#include "arm_compute/core/Types.h" +#include "arm_compute/core/CoreTypes.h" #include "tests/framework/datasets/ContainerDataset.h" #include @@ -35,6 +35,54 @@ namespace test { namespace datasets { +class AllDataTypes final : public framework::dataset::ContainerDataset> +{ +public: + AllDataTypes(const std::string &name) + : ContainerDataset(name, + { + DataType::QSYMM8, + DataType::QASYMM8, + DataType::QASYMM8_SIGNED, + DataType::QSYMM16, + DataType::U8, /**< unsigned 8-bit number */ + DataType::S8, /**< signed 8-bit number */ + DataType::QSYMM8_PER_CHANNEL, /**< quantized, symmetric per channel fixed-point 8-bit number */ + DataType::U16, /**< unsigned 16-bit number */ + DataType::S16, /**< signed 16-bit number */ + DataType::QSYMM16, /**< quantized, symmetric fixed-point 16-bit number */ + DataType::QASYMM16, /**< quantized, asymmetric fixed-point 16-bit number */ + DataType::U32, /**< unsigned 32-bit number */ + DataType::S32, /**< signed 32-bit number */ + DataType::U64, /**< unsigned 64-bit number */ + DataType::S64, /**< signed 64-bit number */ + DataType::BFLOAT16, /**< 16-bit brain floating-point number */ + DataType::F16, /**< 16-bit floating-point number */ + DataType::F32, /**< 32-bit floating-point number */ + DataType::F64, /**< 64-bit floating-point number */ + DataType::SIZET /**< size_t */ + }) + { + } +}; + +class CommonDataTypes final : public framework::dataset::ContainerDataset> +{ +public: + CommonDataTypes(const std::string &name) + : ContainerDataset(name, + { + DataType::QASYMM8, + DataType::QASYMM8_SIGNED, + DataType::QSYMM8_PER_CHANNEL, /**< quantized, symmetric per channel fixed-point 8-bit number */ + DataType::S32, /**< signed 32-bit number */ + DataType::BFLOAT16, /**< 16-bit brain floating-point number */ + DataType::F16, /**< 16-bit floating-point 
number */ + DataType::F32, /**< 32-bit floating-point number */ + }) + { + } +}; class QuantizedTypes final : public framework::dataset::ContainerDataset> { public: @@ -63,4 +111,4 @@ class QuantizedPerChannelTypes final : public framework::dataset::ContainerDatas } // namespace datasets } // namespace test } // namespace arm_compute -#endif /* ARM_COMPUTE_TEST_DATATYPE_DATASET_H */ +#endif // ACL_TESTS_DATASETS_DATATYPEDATASET_H diff --git a/tests/datasets/ShapeDatasets.h b/tests/datasets/ShapeDatasets.h index d987f4f60b..4d419bbaf2 100644 --- a/tests/datasets/ShapeDatasets.h +++ b/tests/datasets/ShapeDatasets.h @@ -339,6 +339,30 @@ class SmallShapesBroadcast final : public framework::dataset::ZipDataset +{ +public: + SmallShapesNonXBroadcast() + : ZipDataset( + ShapeDataset("Shape0", + { + TensorShape{ 9U, 9U }, + TensorShape{ 128U, 1U, 5U, 3U }, + TensorShape{ 9U, 9U, 3U, 4U }, + TensorShape{ 1U, 16U, 10U, 2U, 128U } + }), + ShapeDataset("Shape1", + { + TensorShape{ 9U, 1U, 2U }, + TensorShape{ 128U, 64U, 1U, 3U }, + TensorShape{ 9U, 1U, 3U }, + TensorShape{ 1U, 1U, 1U, 1U, 128U } + })) + { + } +}; + class TemporaryLimitedSmallShapesBroadcast final : public framework::dataset::ZipDataset { public: diff --git a/tests/framework/Framework.cpp b/tests/framework/Framework.cpp index bfb955c525..755398b53d 100644 --- a/tests/framework/Framework.cpp +++ b/tests/framework/Framework.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021, 2023 Arm Limited. + * Copyright (c) 2017-2021, 2023-2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -343,6 +343,10 @@ TestResult::Status Framework::run_test(const TestInfo &info, TestCaseFactory &te { profiler.start(); } + if (_prepare_function != nullptr) + { + _prepare_function(); + } test_case->do_run(); test_case->do_sync(); if(_num_iterations == 1 || i != 0) @@ -726,6 +730,16 @@ void Framework::set_new_fixture_call(bool val) { _new_fixture_call = val; } + +void Framework::set_prepare_function(const Framework::PrepareFunc &foo) +{ + _prepare_function = foo; +} + +void Framework::set_seed(unsigned int seed) +{ + _seed = seed; +} } // namespace framework } // namespace test } // namespace arm_compute diff --git a/tests/framework/Framework.h b/tests/framework/Framework.h index 2dded30038..1c5c9a89a1 100644 --- a/tests/framework/Framework.h +++ b/tests/framework/Framework.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021, 2023 Arm Limited. + * Copyright (c) 2017-2021, 2023-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_TEST_FRAMEWORK -#define ARM_COMPUTE_TEST_FRAMEWORK +#ifndef ACL_TESTS_FRAMEWORK_FRAMEWORK_H +#define ACL_TESTS_FRAMEWORK_FRAMEWORK_H #include "DatasetModes.h" #include "Exceptions.h" @@ -323,6 +323,31 @@ class Framework final */ void set_new_fixture_call(bool val); + /** Prepare functions is called before each test run. + * + * The difference between the prepare_function and on_setup() callback + * from TestCase is that the prepare_function is global for the framework, + * but the on_setup() is individual for each TestCase. + */ + using PrepareFunc = std::function; + + /** Set prepare function. + * + * The prepare_function is called before calling on_run() for each test case. 
+ * The difference between the prepare function and on_setup() callback from + * TestCase is that the prepare function is global for the framework, but + * the on_setup() callback is individual for each TestCase. + * + * @param[in] prepare The prepare function. + */ + void set_prepare_function(const PrepareFunc &prepare); + + /** Set random seed reported by the framework. + * + * @param[in] seed Random seed reported by the framework. + */ + void set_seed(unsigned int seed); + private: Framework(); ~Framework() = default; @@ -360,6 +385,7 @@ class Framework final bool _new_fixture_call{ false }; bool _print_rerun_cmd{ false }; unsigned int _seed{ 0 }; + PrepareFunc _prepare_function{}; using create_function = std::unique_ptr(); std::map _available_instruments{}; @@ -391,4 +417,4 @@ inline void Framework::add_data_test_case(std::string test_name, DatasetMode mod } // namespace framework } // namespace test } // namespace arm_compute -#endif /* ARM_COMPUTE_TEST_FRAMEWORK */ +#endif // ACL_TESTS_FRAMEWORK_FRAMEWORK_H diff --git a/tests/framework/Macros.h b/tests/framework/Macros.h index 09e01b0b0c..97389597f0 100644 --- a/tests/framework/Macros.h +++ b/tests/framework/Macros.h @@ -215,17 +215,31 @@ #define DISABLED_DATA_TEST_CASE(TEST_NAME, MODE, DATASET, ...) 
\ DATA_TEST_CASE_IMPL(TEST_NAME, MODE, arm_compute::test::framework::TestCaseFactory::Status::DISABLED, DATASET, __VA_ARGS__) +#define VALIDATION_FIXTURE_RUN() \ + void do_run() override \ + { \ + if (_iteration != 0) \ + { \ + do_setup(); \ + } \ + do_validate(); \ + ++_iteration; \ + } + #define FIXTURE_TEST_CASE_IMPL(TEST_NAME, FIXTURE, MODE, STATUS) \ class TEST_NAME : public arm_compute::test::framework::TestCase, public FIXTURE \ { \ public: \ TEST_CASE_CONSTRUCTOR(TEST_NAME) \ FIXTURE_SETUP(FIXTURE) \ - void do_run() override; \ + VALIDATION_FIXTURE_RUN() \ + void do_validate(); \ FIXTURE_TEARDOWN(FIXTURE) \ + private: \ + unsigned int _iteration {0}; \ }; \ TEST_REGISTRAR(TEST_NAME, MODE, STATUS); \ - void TEST_NAME::do_run() + void TEST_NAME::do_validate() #define FIXTURE_TEST_CASE(TEST_NAME, FIXTURE, MODE) \ FIXTURE_TEST_CASE_IMPL(TEST_NAME, FIXTURE, MODE, arm_compute::test::framework::TestCaseFactory::Status::ACTIVE) @@ -248,12 +262,15 @@ public: \ DATA_TEST_CASE_CONSTRUCTOR(TEST_NAME, DATASET) \ FIXTURE_DATA_SETUP(FIXTURE) \ - void do_run() override; \ + VALIDATION_FIXTURE_RUN() \ + void do_validate(); \ FIXTURE_TEARDOWN(FIXTURE) \ + private: \ + unsigned int _iteration {0}; \ }; \ DATA_TEST_REGISTRAR(TEST_NAME, MODE, STATUS, DATASET); \ template \ - void TEST_NAME>::do_run() + void TEST_NAME>::do_validate() #define FIXTURE_DATA_TEST_CASE(TEST_NAME, FIXTURE, MODE, DATASET) \ FIXTURE_DATA_TEST_CASE_IMPL(TEST_NAME, FIXTURE, MODE, arm_compute::test::framework::TestCaseFactory::Status::ACTIVE, DATASET) @@ -271,12 +288,15 @@ public: \ DATA_TEST_CASE_CONSTRUCTOR(TEST_NAME, DATASET) \ FIXTURE_DATA_SETUP_NEW(FIXTURE) \ - void do_run() override; \ + VALIDATION_FIXTURE_RUN() \ + void do_validate(); \ FIXTURE_TEARDOWN(FIXTURE) \ + private: \ + unsigned int _iteration {0}; \ }; \ DATA_TEST_REGISTRAR(TEST_NAME, MODE, STATUS, DATASET); \ template \ - void TEST_NAME>::do_run() + void TEST_NAME>::do_validate() #define FIXTURE_DATA_TEST_CASE_NEW(TEST_NAME, FIXTURE, 
MODE, DATASET) \ FIXTURE_DATA_TEST_CASE_NEW_IMPL(TEST_NAME, FIXTURE, MODE, arm_compute::test::framework::TestCaseFactory::Status::ACTIVE, DATASET) diff --git a/tests/main.cpp b/tests/main.cpp index e862c7627e..52301bf276 100644 --- a/tests/main.cpp +++ b/tests/main.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2023 Arm Limited. + * Copyright (c) 2017-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -118,8 +118,8 @@ int main(int argc, char **argv) filter_id->set_help("List of test ids. ... can be used to define a range."); auto stop_on_error = parser.add_option("stop-on-error"); stop_on_error->set_help("Stop execution after the first failed test (useful for debugging)"); - auto seed = parser.add_option>("seed", std::random_device()()); - seed->set_help("Global seed for random number generation"); + auto seed = parser.add_option>("seed"); + seed->set_help("Global seed for random number generation. When not set, each test iteration will use different random seed"); auto list_tests = parser.add_option("list-tests", false); list_tests->set_help("List all test names"); auto test_instruments = parser.add_option("test-instruments", false); @@ -220,13 +220,17 @@ int main(int argc, char **argv) } } + const std::random_device::result_type seed_value = (seed->is_set()) ? seed->value(): std::random_device()(); + const bool randomize_seeds = !seed->is_set() && (options.iterations->value() > 1); + if(options.log_level->value() >= framework::LogLevel::CONFIG) { for(auto &p : printers) { p->print_entry("Version", build_information()); p->print_entry("CommandLine", command_line(argc, argv)); - p->print_entry("Seed", support::cpp11::to_string(seed->value())); + auto seed_str = randomize_seeds ? 
"Dynamic" : support::cpp11::to_string(seed_value); + p->print_entry("Seed", seed_str); #ifdef ARM_COMPUTE_CL if(opencl_is_available()) { @@ -282,7 +286,7 @@ int main(int argc, char **argv) fconfig.cooldown_sec = cooldown_sec->value(); fconfig.configure_only = configure_only->value(); fconfig.print_rerun_cmd = print_rerun_command->value(); - fconfig.seed = seed->value(); + fconfig.seed = seed_value; framework.init(fconfig); for(auto &p : printers) @@ -292,6 +296,14 @@ int main(int argc, char **argv) framework.set_throw_errors(options.throw_errors->value()); framework.set_stop_on_error(stop_on_error->value()); framework.set_error_on_missing_assets(error_on_missing_assets->value()); + if (randomize_seeds) + { + framework.set_prepare_function([&] (){ + std::random_device::result_type seed = std::random_device()(); + library->set_seed(seed); + framework.set_seed(seed); + }); + } bool success = true; @@ -319,7 +331,7 @@ int main(int argc, char **argv) return 0; } - library = std::make_unique(assets->value(), seed->value()); + library = std::make_unique(assets->value(), seed_value); fixed_library = std::make_unique(assets->value(), fixed_seed); if(!parser.validate()) diff --git a/tests/validation/CL/Pooling3dLayer.cpp b/tests/validation/CL/Pooling3dLayer.cpp index 84d630e6cf..5f0c68c17b 100644 --- a/tests/validation/CL/Pooling3dLayer.cpp +++ b/tests/validation/CL/Pooling3dLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022, 2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -77,6 +77,48 @@ constexpr AbsoluteTolerance tolerance_qasymm8(1); /**< Tolerance TEST_SUITE(CL) TEST_SUITE(Pooling3dLayer) +TEST_CASE(RoundToNearestInteger, framework::DatasetMode::ALL) +{ + const auto pool_info = Pooling3dLayerInfo(PoolingType::AVG, + Size3D(3,1,1), Size3D(1,1,1), Padding3D(), true /* exclude padding */); + + const auto shape = TensorShape(1U,3U,1U,1U); + const auto output_shape = TensorShape(1U,1U,1U,1U); + + const auto dtype = DataType::QASYMM8_SIGNED; + const auto layout = DataLayout::NDHWC; + const auto qinfo = QuantizationInfo(1.f, 0); + + CLTensor input = create_tensor(shape, dtype, 1, qinfo, layout); + CLTensor output = create_tensor(output_shape, dtype, 1, qinfo, layout); + + CLPooling3dLayer pool; + pool.configure(&input, &output, pool_info); + + input.allocator()->allocate(); + output.allocator()->allocate(); + + std::vector values = {-10, -10, -9}; + std::vector refs = {-10}; + + ARM_COMPUTE_EXPECT(values.size() == shape.total_size(), framework::LogLevel::ERRORS); + + library->fill_static_values(CLAccessor(input), values); + + pool.run(); + + output.map(true); + for(unsigned int i = 0; i < refs.size(); ++i) + { + const int8_t ref = refs[i]; + const int8_t target = reinterpret_cast(output.buffer())[i]; + + ARM_COMPUTE_EXPECT(ref == target, framework::LogLevel::ERRORS); + } + + output.unmap(); +} + // *INDENT-OFF* // clang-format off DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( @@ -106,7 +148,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( TensorInfo(TensorShape(5U, 1U, 1U, 1U, 4U), 1, DataType::F32, DataLayout::NDHWC), TensorInfo(TensorShape(1U, 15U, 1U, 2U, 4U), 1, DataType::F32, DataLayout::NDHWC), // Output width larger than input TensorInfo(TensorShape(5U, 6U, 6U, 2U, 3U), 1, DataType::F32, DataLayout::NDHWC), - TensorInfo(TensorShape(5U, 6U, 6U, 2U, 2U), 1, DataType::F32, DataLayout::NDHWC), + TensorInfo(TensorShape(5U, 6U, 6U, 2U, 2U), 1, 
DataType::F32, DataLayout::NDHWC), TensorInfo(TensorShape(5U, 6U, 6U, 2U, 3U), 1, DataType::F32, DataLayout::NDHWC), TensorInfo(TensorShape(5U, 6U, 6U, 2U, 3U), 1, DataType::F32, DataLayout::NDHWC), })), diff --git a/tests/validation/CL/PoolingLayer.cpp b/tests/validation/CL/PoolingLayer.cpp index 9fe28c7acf..8aca7dd08b 100644 --- a/tests/validation/CL/PoolingLayer.cpp +++ b/tests/validation/CL/PoolingLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021, 2023 Arm Limited. + * Copyright (c) 2017-2021, 2023-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -85,11 +85,68 @@ const auto pool_data_layout_dataset = framework::datas const auto pool_fp_mixed_precision_dataset = framework::dataset::make("FpMixedPrecision", { true, false }); +void RoundToNearestIntegerPoolTestBody(const DataLayout layout, const TensorShape &shape, + const TensorShape &output_shape) +{ + const auto pool_info = PoolingLayerInfo(PoolingType::AVG, + Size2D(3,1), layout, PadStrideInfo(), true /* exclude padding */); + + const auto dtype = DataType::QASYMM8_SIGNED; + const auto qinfo = QuantizationInfo(1.f, 0); + + CLTensor input = create_tensor(shape, dtype, 1, qinfo, layout); + CLTensor output = create_tensor(output_shape, dtype, 1, qinfo, layout); + + CLPoolingLayer pool; + pool.configure(&input, &output, pool_info); + + input.allocator()->allocate(); + output.allocator()->allocate(); + + std::vector values = {-10, -10, -9}; + std::vector refs = {-10}; + + ARM_COMPUTE_EXPECT(values.size() == shape.total_size(), framework::LogLevel::ERRORS); + + library->fill_static_values(CLAccessor(input), values); + + pool.run(); + + output.map(true); + for(unsigned int i = 0; i < refs.size(); ++i) + { + const int8_t ref = refs[i]; + const int8_t target = reinterpret_cast(output.buffer())[i]; + + ARM_COMPUTE_EXPECT(ref == target, framework::LogLevel::ERRORS); + } + + output.unmap(); +} + } // namespace TEST_SUITE(CL) TEST_SUITE(PoolingLayer) +TEST_CASE(RoundToNearestIntegerNHWC, 
framework::DatasetMode::ALL) +{ + const auto layout = DataLayout::NHWC; + const auto shape = TensorShape(1U,3U,1U); + const auto output_shape = TensorShape(1U,1U,1U); + + RoundToNearestIntegerPoolTestBody(layout, shape, output_shape); +} + +TEST_CASE(RoundToNearestIntegerNCHW, framework::DatasetMode::ALL) +{ + const auto layout = DataLayout::NCHW; + const auto shape = TensorShape(3U,1U,1U); + const auto output_shape = TensorShape(1U,1U,1U); + + RoundToNearestIntegerPoolTestBody(layout, shape, output_shape); +} + // *INDENT-OFF* // clang-format off DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( diff --git a/tests/validation/CL/Reverse.cpp b/tests/validation/CL/Reverse.cpp index 82effc2136..671eb94090 100644 --- a/tests/validation/CL/Reverse.cpp +++ b/tests/validation/CL/Reverse.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, 2023 Arm Limited. + * Copyright (c) 2018-2020, 2023-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -29,6 +29,7 @@ #include "tests/CL/CLAccessor.h" #include "tests/PaddingCalculator.h" #include "tests/datasets/ShapeDatasets.h" +#include "tests/datasets/DatatypeDataset.h" #include "tests/framework/Asserts.h" #include "tests/framework/Macros.h" #include "tests/framework/datasets/Datasets.h" @@ -47,10 +48,68 @@ namespace auto run_small_dataset = combine(datasets::Small3DShapes(), datasets::Tiny1DShapes()); auto run_large_dataset = combine(datasets::LargeShapes(), datasets::Tiny1DShapes()); +void validate_data_types(DataType input_dtype, DataType output_dtype, DataType axis_dtype) +{ + const auto input = TensorInfo(TensorShape(16U, 16U, 5U), 1, input_dtype); + const auto axis = TensorInfo(TensorShape(1U), 1, axis_dtype); + auto output = TensorInfo(TensorShape(16U, 16U, 5U), 1, output_dtype); + + const Status status = (CLReverse::validate(&input, &output, &axis, false /* use_inverted_axis */)); + const bool is_valid = static_cast(status); + + static const auto supported_dtypes = { + DataType::QSYMM8, + 
DataType::QASYMM8, + DataType::QASYMM8_SIGNED, + DataType::QSYMM16, + DataType::U8, + DataType::S8, + DataType::QSYMM8_PER_CHANNEL, + DataType::U16, + DataType::S16, + DataType::QSYMM16, + DataType::QASYMM16, + DataType::U32, + DataType::S32, + DataType::SIZET, + DataType::BFLOAT16, + DataType::F16, + DataType::F32, +#ifdef __aarch64__ + DataType::U64, + DataType::S64, + DataType::F64, +#endif // __aarch64__ + }; + + std::vector> supports = {}; + for(DataType dtype : supported_dtypes) + { + supports.push_back(std::make_tuple(dtype, dtype, DataType::S32)); + supports.push_back(std::make_tuple(dtype, dtype, DataType::U32)); + } + + const auto config = std::make_tuple(input_dtype, output_dtype, axis_dtype); + const bool expected = (std::find(supports.begin(), supports.end(), config) != supports.end()); + + ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS); +} + } // namespace TEST_SUITE(CL) TEST_SUITE(Reverse) +/// @note: Do not modify. Validating all data types is pretty fast. 
+DATA_TEST_CASE(ValidateAllDataTypes, framework::DatasetMode::ALL, + combine( + datasets::AllDataTypes("InputDataType"), + datasets::AllDataTypes("OutputDataType"), + datasets::AllDataTypes("AxisDataType")), + input_dtype, output_dtype, axis_dtype) +{ + validate_data_types(input_dtype, output_dtype, axis_dtype); +} + // *INDENT-OFF* // clang-format off DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( @@ -90,6 +149,8 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( template using CLReverseFixture = ReverseValidationFixture; +/// @note: see tests/validation/NEON/Reverse.cpp for the Test Strategy + TEST_SUITE(Float) TEST_SUITE(F16) FIXTURE_DATA_TEST_CASE(RunSmall, @@ -119,7 +180,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, } TEST_SUITE_END() // F16 -TEST_SUITE(FP32) +TEST_SUITE(F32) FIXTURE_DATA_TEST_CASE(RunSmall, CLReverseFixture, framework::DatasetMode::PRECOMMIT, @@ -146,16 +207,150 @@ FIXTURE_DATA_TEST_CASE(RunLarge, validate(CLAccessor(_target), _reference); } TEST_SUITE_END() // F32 + +#ifdef __aarch64__ +TEST_SUITE(F64) +FIXTURE_DATA_TEST_CASE(RunSmall, + CLReverseFixture, + framework::DatasetMode::PRECOMMIT, + combine( + run_small_dataset, + make("DataType", DataType::F64), + make("use_negative_axis", { false }), + make("use_inverted_axis", { false }))) +{ + // Validate output + validate(CLAccessor(_target), _reference); +} +TEST_SUITE_END() // F64 +#endif // __aarch64__ + TEST_SUITE_END() // Float -TEST_SUITE(Quantized) -TEST_SUITE(QASYMM8) +TEST_SUITE(Integer) +TEST_SUITE(Int32) +FIXTURE_DATA_TEST_CASE(RunSmall, + CLReverseFixture, + framework::DatasetMode::PRECOMMIT, + combine( + make("InOutShape", TensorShape(18U, 5U, 5U)), + make("AxisShape", TensorShape(2U)), + make("DataType", {DataType::S32}), + make("use_negative_axis", { false }), + make("use_inverted_axis", { false }))) +{ + // Validate output + validate(CLAccessor(_target), _reference); +} +TEST_SUITE_END() // Int32 + +#ifndef __x86_64__ +TEST_SUITE(SizeT) 
+FIXTURE_DATA_TEST_CASE(RunSmall, + CLReverseFixture, + framework::DatasetMode::PRECOMMIT, + combine( + make("InOutShape", TensorShape(18U, 5U, 5U)), + make("AxisShape", TensorShape(2U)), + make("DataType", {DataType::SIZET}), + make("use_negative_axis", { false }), + make("use_inverted_axis", { false }))) +{ + // Validate output + validate(CLAccessor(_target), _reference); +} +TEST_SUITE_END() // SizeT +#endif // __x86_64__ + +TEST_SUITE(UInt32) +FIXTURE_DATA_TEST_CASE(RunSmall, + CLReverseFixture, + framework::DatasetMode::PRECOMMIT, + combine( + make("InOutShape", TensorShape(18U, 5U, 5U)), + make("AxisShape", TensorShape(2U)), + make("DataType", {DataType::U32}), + make("use_negative_axis", { false }), + make("use_inverted_axis", { false }))) +{ + // Validate output + validate(CLAccessor(_target), _reference); +} +TEST_SUITE_END() // UInt32 + +#ifdef __aarch64__ +TEST_SUITE(Int64) +FIXTURE_DATA_TEST_CASE(RunSmall, + CLReverseFixture, + framework::DatasetMode::PRECOMMIT, + combine( + make("InOutShape", TensorShape(18U, 5U, 5U)), + make("AxisShape", TensorShape(2U)), + make("DataType", {DataType::S64}), + make("use_negative_axis", { false }), + make("use_inverted_axis", { false }))) +{ + // Validate output + validate(CLAccessor(_target), _reference); +} +TEST_SUITE_END() // Int64 + +TEST_SUITE(UInt64) +FIXTURE_DATA_TEST_CASE(RunSmall, + CLReverseFixture, + framework::DatasetMode::PRECOMMIT, + combine( + make("InOutShape", TensorShape(18U, 5U, 5U)), + make("AxisShape", TensorShape(2U)), + make("DataType", {DataType::U64}), + make("use_negative_axis", { false }), + make("use_inverted_axis", { false }))) +{ + // Validate output + validate(CLAccessor(_target), _reference); +} +TEST_SUITE_END() // UInt64 +#endif // __aarch64__ + +TEST_SUITE(Int16) +FIXTURE_DATA_TEST_CASE(RunSmall, + CLReverseFixture, + framework::DatasetMode::PRECOMMIT, + combine( + make("InOutShape", TensorShape(18U, 5U, 5U)), + make("AxisShape", TensorShape(2U)), + make("DataType", {DataType::S16, 
DataType::QSYMM16}), + make("use_negative_axis", { false }), + make("use_inverted_axis", { false }))) +{ + // Validate output + validate(CLAccessor(_target), _reference); +} +TEST_SUITE_END() // Int16 + +TEST_SUITE(UInt16) +FIXTURE_DATA_TEST_CASE(RunSmall, + CLReverseFixture, + framework::DatasetMode::PRECOMMIT, + combine( + make("InOutShape", TensorShape(18U, 5U, 5U)), + make("AxisShape", TensorShape(2U)), + make("DataType", {DataType::U16, DataType::QASYMM16}), + make("use_negative_axis", { false }), + make("use_inverted_axis", { false }))) +{ + // Validate output + validate(CLAccessor(_target), _reference); +} +TEST_SUITE_END() // UInt16 + +TEST_SUITE(UInt8) FIXTURE_DATA_TEST_CASE(RunSmall, CLReverseFixture, framework::DatasetMode::PRECOMMIT, combine( run_small_dataset, - make("DataType", DataType::QASYMM8), + make("DataType", {DataType::QASYMM8, DataType::U8}), make("use_negative_axis", { true, false }), make("use_inverted_axis", { true, false }))) { @@ -175,8 +370,25 @@ FIXTURE_DATA_TEST_CASE(RunLarge, // Validate output validate(CLAccessor(_target), _reference); } -TEST_SUITE_END() // QASYMM8 -TEST_SUITE_END() // Quantized +TEST_SUITE_END() // UInt8 + +TEST_SUITE(Int8) +FIXTURE_DATA_TEST_CASE(RunSmall, + CLReverseFixture, + framework::DatasetMode::PRECOMMIT, + combine( + make("InOutShape", TensorShape(18U, 5U, 5U)), + make("AxisShape", TensorShape(2U)), + make("DataType", {DataType::QASYMM8_SIGNED, DataType::S8, + DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL}), + make("use_negative_axis", { false }), + make("use_inverted_axis", { false }))) +{ + // Validate output + validate(CLAccessor(_target), _reference); +} +TEST_SUITE_END() // Int8 +TEST_SUITE_END() // Integer TEST_SUITE_END() // Reverse TEST_SUITE_END() // CL diff --git a/tests/validation/CMakeLists.txt b/tests/validation/CMakeLists.txt index c2b526817e..59cd4b0a88 100644 --- a/tests/validation/CMakeLists.txt +++ b/tests/validation/CMakeLists.txt @@ -152,7 +152,9 @@ if(ENABLE_NEON) 
runtime/experimental/operators/CpuGemmConv2d.cpp runtime/experimental/operators/CpuGemmDirectConv2d.cpp runtime/experimental/operators/CpuMul.cpp + runtime/experimental/operators/CpuSoftmax.cpp runtime/experimental/operators/CpuSub.cpp runtime/experimental/operators/CpuTranspose.cpp - runtime/experimental/operators/CpuWinogradConv2d.cpp) + runtime/experimental/operators/CpuWinogradConv2d.cpp + ) endif() diff --git a/tests/validation/CPP/LUT.cpp b/tests/validation/CPP/LUT.cpp new file mode 100644 index 0000000000..1874823d8d --- /dev/null +++ b/tests/validation/CPP/LUT.cpp @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "tests/framework/Asserts.h" +#include "tests/framework/Macros.h" +#include "tests/validation/Validation.h" +#include "src/core/helpers/LUTManager.h" +#include "include/half/half.hpp" + +namespace arm_compute +{ +namespace test +{ +namespace validation +{ +namespace +{ +#ifdef ARM_COMPUTE_ENABLE_FP16 + // Take fp16 value and output as uint16_t without changing bits. + inline uint16_t read_as_bf16(const float16_t tmp) + { + uint16_t out = 0; + memcpy(&out, &tmp, sizeof(tmp)); + return out; + } +#endif // ARM_COMPUTE_ENABLE_FP16 + + // Check if difference in values is within tolerance range + template + bool equal_values_relative(const U target, const U reference, const float tolerance) + { + if(are_equal_infs(target, reference)) + { + return true; + } + else if(target == reference) + { + return true; + } + else if(half_float::detail::builtin_isnan(target) && half_float::detail::builtin_isnan(reference)) // determine if nan values using existing function + { + return true; + } + + const U epsilon = (std::is_same::type>::value || (reference == 0)) ? static_cast(0.01) : static_cast(1e-05); + if(std::abs(static_cast(reference) - static_cast(target)) <= epsilon) + { + return true; + } + else + { + if(static_cast(reference) == 0.0f) + { + return false; // We have checked whether _reference and _target is close. 
If _reference is 0 but not close to _target, it should return false + } + const double relative_change = std::abs((static_cast(target) - static_cast(reference)) / reference); + return relative_change <= static_cast(tolerance); + } + } +} // namespace + +TEST_SUITE(LUTManager) +#ifdef ARM_COMPUTE_ENABLE_FP16 +TEST_SUITE(BF16) +TEST_CASE(LUTValueTest, framework::DatasetMode::ALL) +{ + // Define values for test + constexpr float beta = 1.0f; + constexpr float rel_tolerance = 0.01f; + constexpr int num_elements = 65536; + unsigned int num_mismatches = 0; + + // Create lutinfo, use to get lut + LUTInfo info = {LUTType::Exponential, beta, DataType::BFLOAT16, UniformQuantizationInfo()}; + LUTManager lman = LUTManager::get_instance(); + + if(CPUInfo::get().has_fp16()) + { + // Retrieve lut, Assert lut exists and is retrieved successfully. + std::shared_ptr lut = lman.get_lut_table(info); + ARM_COMPUTE_EXPECT(lut != nullptr, framework::LogLevel::ALL); + + // Check each value in lut + for(int i=0; i < num_elements; i++) + { + // Calculate reference in fp32. Convert lut value to fp32. + const float fref = std::exp(bf16_to_float(i) * beta * -1); + const uint16_t target_bf16 = read_as_bf16((*lut)[i]); + const float target = bf16_to_float(target_bf16); + + // Compare and increment mismatch count if needed. 
+ if(!equal_values_relative(target, fref, rel_tolerance)) + { + ARM_COMPUTE_TEST_INFO("id = " << i); + ARM_COMPUTE_TEST_INFO("target = " << std::setprecision(5) << framework::make_printable(target)); + ARM_COMPUTE_TEST_INFO("reference = " << std::setprecision(5) << framework::make_printable(fref)); + ARM_COMPUTE_TEST_INFO("relative tolerance = " << std::setprecision(5) << framework::make_printable(rel_tolerance)); + framework::ARM_COMPUTE_PRINT_INFO(); + ++num_mismatches; + } + } + + if(num_mismatches != 0) + { + const float percent_mismatches = static_cast(num_mismatches) / num_elements * 100.f; + ARM_COMPUTE_TEST_INFO(num_mismatches << " values (" << std::fixed << std::setprecision(2) << percent_mismatches << "%) mismatched "); + } + + // Check if passed tests + ARM_COMPUTE_EXPECT(num_mismatches == 0, framework::LogLevel::ERRORS); + } +} + +TEST_CASE(CheckLutReuse, framework::DatasetMode::ALL) +{ + LUTInfo info = {LUTType::Exponential, 1.0f, DataType::BFLOAT16, UniformQuantizationInfo()}; + LUTManager lman = LUTManager::get_instance(); + auto first = lman.get_lut_table(info); + auto second = lman.get_lut_table(info); + ARM_COMPUTE_EXPECT(first == second, framework::LogLevel::ERRORS); +} + + +TEST_SUITE_END() // BF16 +#endif // ARM_COMPUTE_ENABLE_FP16 + +TEST_SUITE_END() // LUTManager + +} // namespace validation +} // namespace test +} // namespace arm_compute diff --git a/tests/validation/Helpers.cpp b/tests/validation/Helpers.cpp index 560460fd33..d9c0418f35 100644 --- a/tests/validation/Helpers.cpp +++ b/tests/validation/Helpers.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2023 Arm Limited. + * Copyright (c) 2017-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,6 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ +#include "arm_compute/core/CPP/CPPTypes.h" + #include "tests/validation/Helpers.h" #include "tests/framework/Asserts.h" @@ -571,6 +573,40 @@ QuantizationHint suggest_mac_dst_q_info_and_bias( return { c_q_info, min_bias, max_bias }; } +template +bool config_has_dtype(const std::initializer_list &types) +{ + bool dtype_exists = false; + for(DataType type : types) + { + dtype_exists |= (type == data_type); + } + return dtype_exists; +} + +bool cpu_supports_dtypes(const std::initializer_list &types) +{ + const bool cpu_has_bf16 = CPUInfo::get().has_bf16(); + const bool cpu_has_fp16 = CPUInfo::get().has_fp16(); + const bool config_has_fp16 = config_has_dtype(types); + const bool config_has_bf16 = config_has_dtype(types); + +#ifndef ARM_COMPUTE_ENABLE_FP16 + const bool fp16_enabled = false; +#else // ARM_COMPUTE_ENABLE_FP16 + const bool fp16_enabled = true; +#endif // ARM_COMPUTE_ENABLE_FP16 + +#ifndef ARM_COMPUTE_ENABLE_BF16 + const bool bf16_enabled = false; +#else // ARM_COMPUTE_ENABLE_BF16 + const bool bf16_enabled = true; +#endif // ARM_COMPUTE_ENABLE_BF16 + + return !(config_has_fp16 && (!cpu_has_fp16 || !fp16_enabled)) && + !(config_has_bf16 && (!cpu_has_bf16 || !bf16_enabled)); +} + template void get_tile(const SimpleTensor &in, SimpleTensor &roi, const Coordinates &coord); template void get_tile(const SimpleTensor &in, SimpleTensor &roi, const Coordinates &coord); template void get_tile(const SimpleTensor &in, SimpleTensor &roi, const Coordinates &coord); diff --git a/tests/validation/Helpers.h b/tests/validation/Helpers.h index e044620556..7bdbf5a855 100644 --- a/tests/validation/Helpers.h +++ b/tests/validation/Helpers.h @@ -34,6 +34,7 @@ #include #include +#include #include #include #include @@ -308,6 +309,14 @@ QuantizationHint suggest_mac_dst_q_info_and_bias(const QuantizationInfo &lhs_q_i DataType data_type, float bias_fraction, int num_sd = 2); + +/** Check if Cpu supports the vectoral operations for the data types in the parameters + * + * @param[in] 
types an initializer list that contains data types + * + * @return true if the current cpu supports the vectoral operations for the data types + */ +bool cpu_supports_dtypes(const std::initializer_list &types); } // namespace validation } // namespace test } // namespace arm_compute diff --git a/tests/validation/NEON/ActivationLayer.cpp b/tests/validation/NEON/ActivationLayer.cpp index 51a2cecb78..119d51808a 100644 --- a/tests/validation/NEON/ActivationLayer.cpp +++ b/tests/validation/NEON/ActivationLayer.cpp @@ -285,48 +285,6 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS); } -DATA_TEST_CASE(KernelSelection, framework::DatasetMode::ALL, concat(concat( - combine(framework::dataset::make("CpuExt", std::string("NEON")), - framework::dataset::make("DataType", { DataType::F32, - DataType::F16, - DataType::QASYMM8, - DataType::QASYMM8_SIGNED, - DataType::QSYMM16 - })), - combine(framework::dataset::make("CpuExt", std::string("SVE")), - framework::dataset::make("DataType", { DataType::F32, - DataType::F16, - }))), - combine(framework::dataset::make("CpuExt", std::string("SVE2")), - framework::dataset::make("DataType", { DataType::QASYMM8, - DataType::QASYMM8_SIGNED, - DataType::QSYMM16 - }))), - cpu_ext, data_type) -{ - using namespace cpu::kernels; - - cpuinfo::CpuIsaInfo cpu_isa{}; - cpu_isa.neon = (cpu_ext == "NEON"); - cpu_isa.sve = (cpu_ext == "SVE"); - cpu_isa.sve2 = (cpu_ext == "SVE2"); - cpu_isa.fp16 = (data_type == DataType::F16); - - const auto *selected_impl = CpuActivationKernel::get_implementation(ActivationDataTypeISASelectorData{data_type, CPUModel::GENERIC, cpu_isa,ActivationLayerInfo::ActivationFunction::BOUNDED_RELU}, cpu::KernelSelectionType::Preferred); - - ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl); - std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_activation"; - if( data_type == DataType::QASYMM8 || data_type == 
DataType::QASYMM8_SIGNED) - { -#ifdef __aarch64__ - expected = "neon_q8_activation_lut"; -#else // __aarch64__ - expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_activation"; -#endif // __aarch64__ - } - std::string actual = selected_impl->name; - ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS); -} // clang-format on // *INDENT-ON* diff --git a/tests/validation/NEON/ArithmeticAddition.cpp b/tests/validation/NEON/ArithmeticAddition.cpp index 7a7aa52041..c0033daab0 100644 --- a/tests/validation/NEON/ArithmeticAddition.cpp +++ b/tests/validation/NEON/ArithmeticAddition.cpp @@ -44,12 +44,14 @@ namespace test { namespace validation { + +using framework::dataset::make; namespace { #if !defined(__aarch64__) || defined(ENABLE_SVE) constexpr AbsoluteTolerance tolerance_quant(1); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */ #else // !defined(__aarch64__) || defined(ENABLE_SVE) -constexpr AbsoluteTolerance tolerance_quant(0); +constexpr AbsoluteTolerance tolerance_quant(1); #endif // !defined(__aarch64__) || defined(ENABLE_SVE) const auto InPlaceDataSet = framework::dataset::make("InPlace", { false, true }); const auto OutOfPlaceDataSet = framework::dataset::make("InPlace", { false }); @@ -125,7 +127,7 @@ DATA_TEST_CASE(KernelSelection, framework::DatasetMode::ALL, concat(concat( cpu_isa.sve2 = (cpu_ext == "SVE2"); cpu_isa.fp16 = (data_type == DataType::F16); - const auto *selected_impl = CpuAddKernel::get_implementation(CpuAddKernelDataTypeISASelectorData{data_type, cpu_isa, can_use_fixedpoint}, cpu::KernelSelectionType::Preferred); + const auto *selected_impl = CpuAddKernel::get_implementation(CpuAddKernelDataTypeISASelectorData{data_type, cpu_isa, can_use_fixedpoint, false /* can_use_sme2_impl */ }, cpu::KernelSelectionType::Preferred); ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl); @@ -298,12 +300,43 @@ TEST_SUITE(QASYMM8_SIGNED) FIXTURE_DATA_TEST_CASE(RunSmall, 
NEArithmeticAdditionQuantizedFixture, framework::DatasetMode::ALL, - combine(combine(combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), - framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })), - framework::dataset::make("Src0QInfo", { QuantizationInfo(0.5f, 20) })), - framework::dataset::make("Src1QInfo", { QuantizationInfo(0.5f, 10) })), - framework::dataset::make("OutQInfo", { QuantizationInfo(0.5f, 5) })), - OutOfPlaceDataSet)) + combine(datasets::SmallShapes(), + make("DataType", DataType::QASYMM8_SIGNED), + make("ConvertPolicy", { ConvertPolicy::SATURATE }), + make("Src0QInfo", { QuantizationInfo(0.45f, 20) }), + make("Src1QInfo", { QuantizationInfo(0.55f, 10) }), + make("OutQInfo", { QuantizationInfo(0.5f, 5) }), + OutOfPlaceDataSet)) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_quant); +} + +FIXTURE_DATA_TEST_CASE(RunSmall5d, + NEArithmeticAdditionQuantizedFixture, + framework::DatasetMode::ALL, + combine(datasets::Tiny5dShapes(), + make("DataType", DataType::QASYMM8_SIGNED), + make("ConvertPolicy", { ConvertPolicy::SATURATE }), + make("Src0QInfo", { QuantizationInfo(0.45f, 20) }), + make("Src1QInfo", { QuantizationInfo(0.55f, 10) }), + make("OutQInfo", { QuantizationInfo(0.5f, 5) }), + OutOfPlaceDataSet)) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_quant); +} + +FIXTURE_DATA_TEST_CASE(RunLarge, + NEArithmeticAdditionQuantizedFixture, + framework::DatasetMode::NIGHTLY, + combine(datasets::LargeShapes(), + make("DataType", DataType::QASYMM8_SIGNED), + make("ConvertPolicy", { ConvertPolicy::SATURATE }), + make("Src0QInfo", { QuantizationInfo(0.45f, 20) }), + make("Src1QInfo", { QuantizationInfo(0.55f, 10) }), + make("OutQInfo", { QuantizationInfo(0.5f, 5) }), + OutOfPlaceDataSet)) { // Validate output validate(Accessor(_target), _reference, tolerance_quant); @@ -312,8 +345,8 @@ FIXTURE_DATA_TEST_CASE(RunSmall, 
FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEArithmeticAdditionQuantizedBroadcastFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine( datasets::SmallShapesBroadcast(), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })), - framework::dataset::make("Src0QInfo", { QuantizationInfo(0.5f, 20) })), - framework::dataset::make("Src1QInfo", { QuantizationInfo(0.5f, 10) })), + framework::dataset::make("Src0QInfo", { QuantizationInfo(0.45f, 20) })), + framework::dataset::make("Src1QInfo", { QuantizationInfo(0.55f, 10) })), framework::dataset::make("OutQInfo", { QuantizationInfo(0.5f, 5) })), OutOfPlaceDataSet)) { diff --git a/tests/validation/NEON/ArithmeticSubtraction.cpp b/tests/validation/NEON/ArithmeticSubtraction.cpp index 9a6032cd9e..cbc99ba78c 100644 --- a/tests/validation/NEON/ArithmeticSubtraction.cpp +++ b/tests/validation/NEON/ArithmeticSubtraction.cpp @@ -25,6 +25,7 @@ #include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h" #include "arm_compute/runtime/Tensor.h" #include "arm_compute/runtime/TensorAllocator.h" + #include "tests/NEON/Accessor.h" #include "tests/PaddingCalculator.h" #include "tests/datasets/ConvertPolicyDataset.h" @@ -32,8 +33,10 @@ #include "tests/framework/Asserts.h" #include "tests/framework/Macros.h" #include "tests/framework/datasets/Datasets.h" +#include "tests/validation/Helpers.h" #include "tests/validation/Validation.h" #include "tests/validation/fixtures/ArithmeticOperationsFixture.h" +#include "tests/datasets/DatatypeDataset.h" namespace arm_compute { @@ -41,6 +44,9 @@ namespace test { namespace validation { + +using framework::dataset::make; + namespace { #ifdef __aarch64__ @@ -60,11 +66,47 @@ const auto ArithmeticSubtractionQuantizationInfoSignedDataset = combine(combine( const auto ArithmeticSubtractionQuantizationInfoSignedInPlaceDataset = 
combine(combine(framework::dataset::make("QuantizationInfoIn1", { QuantizationInfo(0.8f, 10) }), framework::dataset::make("QuantizationInfoIn2", { QuantizationInfo(0.8f, 10) })), framework::dataset::make("QuantizationInfoOut", { QuantizationInfo(0.8f, 10) })); -const auto ArithmeticSubtractionQuantizationInfoSymmetric = combine(combine(framework::dataset::make("QuantizationInfoIn1", { QuantizationInfo(0.3f, 0) }), - framework::dataset::make("QuantizationInfoIn2", { QuantizationInfo(0.7f, 0) })), - framework::dataset::make("QuantizationInfoOut", { QuantizationInfo(0.2f, 0) })); +const auto ArithmeticSubtractionQuantizationInfo16bitSymmetric = + combine( + make("QuantizationInfoIn1", { QuantizationInfo(0.003f, 0) }), + make("QuantizationInfoIn2", { QuantizationInfo(0.007f, 0) }), + make("QuantizationInfoOut", { QuantizationInfo(0.2f, 0), + QuantizationInfo(0.002f, 0) /* for more saturation */ }) + ); + const auto InPlaceDataSet = framework::dataset::make("InPlace", { false, true }); const auto OutOfPlaceDataSet = framework::dataset::make("InPlace", { false }); + +void validate_data_types(DataType input1_dtype, DataType input2_dtype, DataType output_dtype) +{ + const auto input1 = TensorInfo(TensorShape(27U, 13U, 2U), 1, input1_dtype); + const auto input2 = TensorInfo(TensorShape(27U, 13U, 2U), 1, input2_dtype); + auto output = TensorInfo(TensorShape(27U, 13U, 2U), 1, output_dtype); + + const bool is_valid = static_cast(NEArithmeticSubtraction::validate(&input1, &input2, &output, + ConvertPolicy::SATURATE)); + + const auto supports = { + std::make_tuple(DataType::F32,DataType::F32,DataType::F32), + std::make_tuple(DataType::F16,DataType::F16,DataType::F16), + std::make_tuple(DataType::U8,DataType::U8,DataType::U8), + std::make_tuple(DataType::S16,DataType::S16,DataType::S16), + std::make_tuple(DataType::S32,DataType::S32,DataType::S32), + std::make_tuple(DataType::QSYMM16,DataType::QSYMM16,DataType::QSYMM16), + 
std::make_tuple(DataType::QASYMM8,DataType::QASYMM8,DataType::QASYMM8), + std::make_tuple(DataType::QASYMM8_SIGNED,DataType::QASYMM8_SIGNED,DataType::QASYMM8_SIGNED) + }; + const auto config = std::make_tuple(input1_dtype, input2_dtype, output_dtype); + const std::initializer_list dtypes_list = {input1_dtype, input2_dtype, output_dtype}; + + bool expected = false; + if(cpu_supports_dtypes(dtypes_list)) + { + expected = (std::find(supports.begin(), supports.end(), config) != supports.end()); + } + ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS); +} + } // namespace TEST_SUITE(NEON) @@ -72,6 +114,8 @@ TEST_SUITE(ArithmeticSubtraction) template using NEArithmeticSubtractionFixture = ArithmeticSubtractionValidationFixture; +template +using NEArithmeticSubtractionBroadcastFixture = ArithmeticSubtractionBroadcastValidationFixture; // *INDENT-OFF* // clang-format off @@ -160,6 +204,18 @@ TEST_CASE(InvalidBroadcastBoth, framework::DatasetMode::ALL) } TEST_SUITE_END() // InPlaceValidate +/// @note: Do not modify. Validating all data types is pretty fast. 
+DATA_TEST_CASE(ValidateAllDataTypes, framework::DatasetMode::ALL, + combine( + datasets::AllDataTypes("Input1DataType"), + datasets::AllDataTypes("Input2DataType"), + datasets::AllDataTypes("OutputDataType")), + input1_dtype, input2_dtype, output_dtype) +{ + validate_data_types(input1_dtype, input2_dtype, output_dtype); +} + + TEST_SUITE(U8) FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::U8)), @@ -169,13 +225,25 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture, framew // Validate output validate(Accessor(_target), _reference); } +FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEArithmeticSubtractionBroadcastFixture, framework::DatasetMode::PRECOMMIT, + combine( + datasets::SmallShapesBroadcast(), + make("DataType", DataType::U8), + make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP }), + OutOfPlaceDataSet)) +{ + // Validate output + validate(Accessor(_target), _reference); +} TEST_SUITE_END() // U8 using NEArithmeticSubtractionQASYMM8Fixture = ArithmeticSubtractionValidationQuantizedFixture; using NEArithmeticSubtractionQASYMM8SignedFixture = ArithmeticSubtractionValidationQuantizedFixture; -using NEArithmeticSubtractionQASYMM8SignedBroadcastFixture = ArithmeticSubtractionValidationQuantizedBroadcastFixture; using NEArithmeticSubtractionQSYMM16Fixture = ArithmeticSubtractionValidationQuantizedFixture; +template +using NEArithmeticSubtractionQuantizedBroadcastFixture = ArithmeticSubtractionValidationQuantizedBroadcastFixture; + TEST_SUITE(Quantized) TEST_SUITE(QASYMM8) FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionQASYMM8Fixture, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", @@ -187,6 +255,17 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionQASYMM8Fixture, framewor // Validate output 
validate(Accessor(_target), _reference, tolerance_qasymm8); } +FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEArithmeticSubtractionQuantizedBroadcastFixture, framework::DatasetMode::ALL, + combine( + datasets::SmallShapesBroadcast(), + make("DataType", DataType::QASYMM8), + make("ConvertPolicy", { ConvertPolicy::SATURATE }), + ArithmeticSubtractionQuantizationInfoDataset, + OutOfPlaceDataSet)) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_qasymm8); +} TEST_SUITE_END() // QASYMM8 TEST_SUITE(QASYMM8_SIGNED) @@ -199,7 +278,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionQASYMM8SignedFixture, fr // Validate output validate(Accessor(_target), _reference, tolerance_qasymm8); } -FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEArithmeticSubtractionQASYMM8SignedBroadcastFixture, framework::DatasetMode::ALL, combine(combine(combine(combine( +FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEArithmeticSubtractionQuantizedBroadcastFixture, framework::DatasetMode::ALL, combine(combine(combine(combine( datasets::SmallShapesBroadcast(), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })), @@ -209,7 +288,7 @@ FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEArithmeticSubtractionQASYMM8SignedBr // Validate output validate(Accessor(_target), _reference, tolerance_qasymm8); } -FIXTURE_DATA_TEST_CASE(RunTinyBroadcastInPlace, NEArithmeticSubtractionQASYMM8SignedBroadcastFixture, framework::DatasetMode::ALL, combine(combine(combine(combine( +FIXTURE_DATA_TEST_CASE(RunTinyBroadcastInPlace, NEArithmeticSubtractionQuantizedBroadcastFixture, framework::DatasetMode::ALL, combine(combine(combine(combine( datasets::TinyShapesBroadcastInplace(), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })), @@ -222,12 +301,24 @@ FIXTURE_DATA_TEST_CASE(RunTinyBroadcastInPlace, NEArithmeticSubtractionQASYMM8Si 
TEST_SUITE_END() // QASYMM8_SIGNED TEST_SUITE(QSYMM16) -FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionQSYMM16Fixture, framework::DatasetMode::ALL, combine(combine(combine(combine( +FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionQSYMM16Fixture, framework::DatasetMode::ALL, + combine( datasets::SmallShapes(), - framework::dataset::make("DataType", DataType::QSYMM16)), - framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })), - ArithmeticSubtractionQuantizationInfoSymmetric), - OutOfPlaceDataSet)) + make("DataType", DataType::QSYMM16), + make("ConvertPolicy", { ConvertPolicy::SATURATE }), + ArithmeticSubtractionQuantizationInfo16bitSymmetric, + OutOfPlaceDataSet)) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_qsymm16); +} +FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEArithmeticSubtractionQuantizedBroadcastFixture, framework::DatasetMode::ALL, + combine( + datasets::SmallShapesBroadcast(), + make("DataType", DataType::QSYMM16), + make("ConvertPolicy", { ConvertPolicy::SATURATE }), + ArithmeticSubtractionQuantizationInfo16bitSymmetric, + OutOfPlaceDataSet)) { // Validate output validate(Accessor(_target), _reference, tolerance_qsymm16); @@ -245,6 +336,17 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture, framew validate(Accessor(_target), _reference); } +FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEArithmeticSubtractionBroadcastFixture, framework::DatasetMode::PRECOMMIT, + combine(datasets::SmallShapesBroadcast(), + make("DataType", + DataType::S16), + make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP }), + OutOfPlaceDataSet)) +{ + // Validate output + validate(Accessor(_target), _reference); +} + FIXTURE_DATA_TEST_CASE(RunLarge, NEArithmeticSubtractionFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::S16)), framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, 
ConvertPolicy::WRAP })), @@ -265,6 +367,17 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture, framew validate(Accessor(_target), _reference); } +FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEArithmeticSubtractionBroadcastFixture, framework::DatasetMode::PRECOMMIT, + combine( + datasets::SmallShapesBroadcast(), + make("DataType", DataType::S32), + make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP }), + OutOfPlaceDataSet)) +{ + // Validate output + validate(Accessor(_target), _reference); +} + FIXTURE_DATA_TEST_CASE(RunLarge, NEArithmeticSubtractionFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::S32)), framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })), @@ -316,9 +429,6 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEArithmeticSubtractionFixture, framewor validate(Accessor(_target), _reference); } -template -using NEArithmeticSubtractionBroadcastFixture = ArithmeticSubtractionBroadcastValidationFixture; - FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEArithmeticSubtractionBroadcastFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapesBroadcast(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })), diff --git a/tests/validation/NEON/DepthwiseConvolutionLayer.cpp b/tests/validation/NEON/DepthwiseConvolutionLayer.cpp index 2d948f3e32..4f4e0e5a6c 100644 --- a/tests/validation/NEON/DepthwiseConvolutionLayer.cpp +++ b/tests/validation/NEON/DepthwiseConvolutionLayer.cpp @@ -35,6 +35,10 @@ #include "tests/framework/datasets/Datasets.h" #include "tests/validation/Validation.h" #include "tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h" +#include "tests/datasets/DatatypeDataset.h" + +#include +#include namespace arm_compute { @@ -206,6 +210,67 @@ DATA_TEST_CASE(Validate3x3, 
framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS); } +void validate_data_types(DataType input_dtype, DataType weight_dtype, DataType bias_dtype, DataType output_dtype) +{ + const int depth_multiplier = 1; + const auto dilation = Size2D(1U, 1U); + const auto conv_info = PadStrideInfo(1, 1, 0, 0); + + const auto input = TensorInfo(TensorShape(27U, 13U, 2U), 1, input_dtype); + std::vector scales(input.tensor_shape().z() * depth_multiplier); + + const auto weights = TensorInfo(TensorShape(3U, 3U, 2U), 1, weight_dtype, QuantizationInfo(scales)); + const auto bias = TensorInfo(TensorShape(2U), 1, bias_dtype); + auto output = TensorInfo(TensorShape(25U, 11U, 2U), 1, output_dtype); + + + bool is_valid = bool(NEDepthwiseConvolutionLayer::validate(&input.clone()->set_is_resizable(false), &weights.clone()->set_is_resizable(false), &bias.clone()->set_is_resizable(false), &output.clone()->set_is_resizable(false), + conv_info, depth_multiplier, ActivationLayerInfo(), dilation)); + + const auto supports = { + std::make_tuple(DataType::F32,DataType::F32,DataType::F32,DataType::F32), + std::make_tuple(DataType::F16,DataType::F16,DataType::F16,DataType::F16), + std::make_tuple(DataType::QASYMM8,DataType::QASYMM8,DataType::S32,DataType::QASYMM8), + std::make_tuple(DataType::QASYMM8,DataType::QSYMM8_PER_CHANNEL,DataType::S32,DataType::QASYMM8), + std::make_tuple(DataType::QASYMM8_SIGNED,DataType::QASYMM8_SIGNED,DataType::S32,DataType::QASYMM8_SIGNED), + std::make_tuple(DataType::QASYMM8_SIGNED,DataType::QSYMM8_PER_CHANNEL,DataType::S32,DataType::QASYMM8_SIGNED), + }; + + const auto config = std::make_tuple(input_dtype, weight_dtype, bias_dtype, output_dtype); + const std::initializer_list dtypes_list = {input_dtype, weight_dtype, bias_dtype, output_dtype}; + + bool expected = false; + if(cpu_supports_dtypes(dtypes_list)) + { + expected = (std::find(supports.begin(), supports.end(), config) != supports.end()); + } 
+ + ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS); +} + +/// @note: Do not modify. Validating all data types is pretty fast. +DATA_TEST_CASE(ValidateAllDataTypes, framework::DatasetMode::NIGHTLY, + combine( + datasets::AllDataTypes("InputDataType"), + datasets::AllDataTypes("WeightDataType"), + datasets::AllDataTypes("BiasDataType"), + datasets::AllDataTypes("OutputDataType")), + input_dtype, weight_dtype, bias_dtype, output_dtype) +{ + validate_data_types(input_dtype, weight_dtype, bias_dtype, output_dtype); +} + +DATA_TEST_CASE(ValidateCommonDataTypes, framework::DatasetMode::PRECOMMIT, + combine( + datasets::CommonDataTypes("InputDataType"), + datasets::CommonDataTypes("WeightDataType"), + datasets::CommonDataTypes("BiasDataType"), + datasets::CommonDataTypes("OutputDataType")), + input_dtype, weight_dtype, bias_dtype, output_dtype) +{ + validate_data_types(input_dtype, weight_dtype, bias_dtype, output_dtype); +} + DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip( make("InputInfo", { TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Mismatching data type input/weights TensorInfo(TensorShape(27U, 13U, 3U), 1, DataType::F32), // Mismatching input feature maps diff --git a/tests/validation/NEON/LogSoftmaxLayer.cpp b/tests/validation/NEON/LogSoftmaxLayer.cpp index 6718597c6b..4d983d5763 100644 --- a/tests/validation/NEON/LogSoftmaxLayer.cpp +++ b/tests/validation/NEON/LogSoftmaxLayer.cpp @@ -40,6 +40,9 @@ namespace test { namespace validation { + +using framework::dataset::make; + namespace { /** Tolerance for float operations */ @@ -48,6 +51,7 @@ RelativeTolerance tolerance_f16(half(0.2)); /** Tolerance for quantized operations */ constexpr AbsoluteTolerance tolerance_qasymm8(1); +constexpr AbsoluteTolerance tolerance_qasymm8_signed(1); /** CNN data types */ const auto CNNDataTypes = framework::dataset::make("DataType", @@ -180,6 +184,39 @@ FIXTURE_DATA_TEST_CASE(RunLarge, 
NELogSoftmaxLayerQuantizedFixture, fra validate(Accessor(_target), _reference, tolerance_qasymm8); } TEST_SUITE_END() //QASYMM8 + +TEST_SUITE(QASYMM8_SIGNED) +FIXTURE_DATA_TEST_CASE(RunSmall2D, NELogSoftmaxLayerQuantizedFixture, framework::DatasetMode::ALL, + combine(datasets::SoftmaxLayerSmallShapes(), + make("DataType", DataType::QASYMM8_SIGNED), + combine(make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), + make("Beta", { 1.0f, 2.f })), + make("Axis", { 0, 1 }))) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_qasymm8_signed); +} +FIXTURE_DATA_TEST_CASE(RunSmall4D, NELogSoftmaxLayerQuantizedFixture, framework::DatasetMode::ALL, + combine(datasets::Small4DShapes(), + make("DataType", DataType::QASYMM8_SIGNED), + combine(make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), + make("Beta", { 1.0f, 2.f })), + make("Axis", { 0, -1, 1 }))) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_qasymm8_signed); +} +FIXTURE_DATA_TEST_CASE(RunLarge, NELogSoftmaxLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, + combine(datasets::SoftmaxLayerLargeShapes(), + make("DataType", DataType::QASYMM8_SIGNED), + combine(make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), + make("Beta", { 1.0f, 2.f })), + make("Axis", { 0 }))) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_qasymm8_signed); +} +TEST_SUITE_END() // QASYMM8_SIGNED TEST_SUITE_END() //Quantized TEST_SUITE_END() //LogSoftmaxLayer diff --git a/tests/validation/NEON/PixelWiseMultiplication.cpp b/tests/validation/NEON/PixelWiseMultiplication.cpp index f93bafcff6..f14100fbb5 100644 --- a/tests/validation/NEON/PixelWiseMultiplication.cpp +++ b/tests/validation/NEON/PixelWiseMultiplication.cpp @@ -30,6 +30,11 @@ #include "tests/framework/Macros.h" #include "tests/validation/Validation.h" #include "tests/validation/fixtures/PixelWiseMultiplicationFixture.h" +#include "tests/datasets/DatatypeDataset.h" +#include 
"tests/validation/Helpers.h" + +#include +#include namespace arm_compute { @@ -37,6 +42,9 @@ namespace test { namespace validation { + +using framework::dataset::make; + namespace { const float scale_unity = 1.f; @@ -87,7 +95,6 @@ const auto OutOfPlaceDataSet = framework::dataset::make("InPlace", { false }); #define VALIDATE(TYPE, TOLERANCE) validate(Accessor(_target), _reference, AbsoluteTolerance(TOLERANCE), 0.f); #define WRAP_VALIDATE(TYPE, TOLERANCE) validate_wrap(Accessor(_target), _reference, AbsoluteTolerance(TOLERANCE), 0.f); -// *INDENT-OFF* // clang-format off #define PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(TEST_NAME, FIXTURE, MODE, SHAPES, DT1, DT2, DT3, SCALE, RP, INPLACE_DATASET, VALIDATE) \ FIXTURE_DATA_TEST_CASE(TEST_NAME, NEPixelWiseMultiplication##FIXTURE, framework::DatasetMode::MODE, \ @@ -114,8 +121,43 @@ const auto OutOfPlaceDataSet = framework::dataset::make("InPlace", { false }); } \ } -// *INDENT-ON* // clang-format on + +void validate_data_types(DataType input1_dtype, DataType input2_dtype, DataType output_dtype) +{ + const auto input1 = TensorInfo(TensorShape(27U, 13U, 2U), 1, input1_dtype); + const auto input2 = TensorInfo(TensorShape(27U, 13U, 2U), 1, input2_dtype); + auto output = TensorInfo(TensorShape(27U, 13U, 2U), 1, output_dtype); + + bool is_valid = static_cast(NEPixelWiseMultiplication::validate(&input1, &input2, &output, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + + const auto supports = { + std::make_tuple(DataType::F32,DataType::F32,DataType::F32), + std::make_tuple(DataType::F16,DataType::F16,DataType::F16), + std::make_tuple(DataType::U8,DataType::U8,DataType::U8), + std::make_tuple(DataType::U8,DataType::U8,DataType::S16), + std::make_tuple(DataType::U8,DataType::S16,DataType::S16), + std::make_tuple(DataType::S16,DataType::U8,DataType::S16), + std::make_tuple(DataType::S16,DataType::S16,DataType::S16), + std::make_tuple(DataType::S32,DataType::S32,DataType::S32), + 
std::make_tuple(DataType::QSYMM16,DataType::QSYMM16,DataType::QSYMM16), + std::make_tuple(DataType::QSYMM16,DataType::QSYMM16,DataType::S32), + std::make_tuple(DataType::QASYMM8,DataType::QASYMM8,DataType::QASYMM8), + std::make_tuple(DataType::QASYMM8_SIGNED,DataType::QASYMM8_SIGNED,DataType::QASYMM8_SIGNED) + }; + + const auto config = std::make_tuple(input1_dtype, input2_dtype, output_dtype); + const std::initializer_list dtypes_list = {input1_dtype, input2_dtype, output_dtype}; + + bool expected = false; + if(cpu_supports_dtypes(dtypes_list)) + { + expected = (std::find(supports.begin(), supports.end(), config) != supports.end()); + } + + ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS); +} + } // namespace using NEPixelWiseMultiplicationQASYMM8Fixture = PixelWiseMultiplicationValidationQuantizedFixture; @@ -137,11 +179,14 @@ template using NEPixelWiseMultiplicationBroadcastFixture = PixelWiseMultiplicationBroadcastValidationFixture; using NEPixelWiseMultiplicationBroadcastQASYMM8Fixture = PixelWiseMultiplicationBroadcastValidationQuantizedFixture; using NEPixelWiseMultiplicationBroadcastQASYMM8SignedFixture = PixelWiseMultiplicationBroadcastValidationQuantizedFixture; +using NEPixelWiseMultiplicationBroadcastQSYMM16Fixture = PixelWiseMultiplicationBroadcastValidationQuantizedFixture; +using NEPixelWiseMultiplicationBroadcastQSYMM16ToS32Fixture = PixelWiseMultiplicationBroadcastValidationQuantizedFixture; +using NEPixelWiseMultiplicationBroadcastU8U8ToS16Fixture = PixelWiseMultiplicationBroadcastValidationFixture; +using NEPixelWiseMultiplicationBroadcastToS16Fixture = PixelWiseMultiplicationBroadcastValidationFixture; TEST_SUITE(NEON) TEST_SUITE(PixelWiseMultiplication) -// *INDENT-OFF* // clang-format off DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip( framework::dataset::make("Input1Info", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), //1 Ok @@ -227,7 +272,17 @@ DATA_TEST_CASE(Validate, 
framework::DatasetMode::ALL, zip(zip(zip(zip(zip( ARM_COMPUTE_EXPECT(has_error == expected, framework::LogLevel::ERRORS); } // clang-format on -// *INDENT-ON* + +/// @note: Do not modify. Validating all data types is pretty fast. +DATA_TEST_CASE(ValidateAllDataTypes, framework::DatasetMode::ALL, + combine( + datasets::AllDataTypes("Input1DataType"), + datasets::AllDataTypes("Input2DataType"), + datasets::AllDataTypes("OutputDataType")), + input1_dtype, input2_dtype, output_dtype) +{ + validate_data_types(input1_dtype, input2_dtype, output_dtype); +} TEST_SUITE(InPlaceValidate) TEST_CASE(SingleTensor, framework::DatasetMode::ALL) @@ -455,7 +510,24 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQSYMM16Fixture, framew // Validate output validate(Accessor(_target), _reference, tolerance_qsymm16); } + TEST_SUITE_END() // ScaleOther +TEST_SUITE(NonXBroadcast) +FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationBroadcastQSYMM16Fixture, + framework::DatasetMode::ALL, + combine(datasets::SmallShapesNonXBroadcast(), + make("DataTypeIn1", DataType::QSYMM16), + make("DataTypeIn2", DataType::QSYMM16), + make("DataTypeOut", DataType::QSYMM16), + make("Scale", { scale_unity }), + PixelWiseMultiplicationPolicySTZDataset, + PixelWiseMultiplicationQSYMM16QuantDataset, + OutOfPlaceDataSet)) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_qsymm16); +} +TEST_SUITE_END() // NonXBroadcast TEST_SUITE_END() // QSYMM16 TEST_SUITE(QSYMM16toS32) FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQSYMM16ToS32Fixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapes(), @@ -470,6 +542,21 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQSYMM16ToS32Fixture, f // Validate output validate(Accessor(_target), _reference); } +TEST_SUITE(NonXBroadcast) +FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationBroadcastQSYMM16ToS32Fixture, framework::DatasetMode::ALL, + 
combine(datasets::SmallShapesNonXBroadcast(), + make("DataTypeIn1", DataType::QSYMM16), + make("DataTypeIn2", DataType::QSYMM16), + make("DataTypeOut", DataType::S32), + make("Scale", { scale_unity }), + PixelWiseMultiplicationPolicySTZDataset, + PixelWiseMultiplicationQSYMM16QuantDataset, + OutOfPlaceDataSet)) +{ + // Validate output + validate(Accessor(_target), _reference); +} +TEST_SUITE_END() // NonXBroadcast TEST_SUITE_END() // QSYMM16toS32 TEST_SUITE_END() // Quantized @@ -488,6 +575,22 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationU8U8ToS16Fixture, fram validate_wrap(Accessor(_target), _reference, AbsoluteTolerance(1), 0.f); } +TEST_SUITE(NonXBroadcast) +FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationBroadcastU8U8ToS16Fixture, framework::DatasetMode::PRECOMMIT, + combine(datasets::SmallShapesNonXBroadcast(), + framework::dataset::make("DataTypeIn1", DataType::U8), + framework::dataset::make("DataTypeIn2", DataType::U8), + framework::dataset::make("DataTypeOut", DataType::S16), + framework::dataset::make("Scale", { scale_255 }), + datasets::ConvertPolicies(), + framework::dataset::make("RoundingPolicy", RoundingPolicy::TO_NEAREST_UP), + OutOfPlaceDataSet)) +{ + // Validate output + validate_wrap(Accessor(_target), _reference, AbsoluteTolerance(1), 0.f); +} +TEST_SUITE_END() // NonXBroadcast + FIXTURE_DATA_TEST_CASE(RunSmall1, NEPixelWiseMultiplicationU8U8ToS16Fixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataTypeIn1", DataType::U8)), framework::dataset::make("DataTypeIn2", DataType::U8)), @@ -511,6 +614,9 @@ TEST_SUITE_END() // Scale255 TEST_SUITE(ScaleUnity) PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToU8Fixture, ALL, SmallShapes(), U8, U8, U8, scale_unity, TO_ZERO, InPlaceDataSet, DEFAULT_VALIDATE) +TEST_SUITE(NonXBroadcast) +PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, BroadcastFixture, ALL, 
SmallShapesNonXBroadcast(), U8, U8, U8, scale_unity, TO_ZERO, OutOfPlaceDataSet, DEFAULT_VALIDATE) +TEST_SUITE_END() // NonXBroadcast TEST_SUITE_END() // ScaleUnity TEST_SUITE(ScaleOther) @@ -529,6 +635,11 @@ TEST_SUITE_END() // Scale255 TEST_SUITE(ScaleUnity) PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS16Fixture, ALL, SmallShapes(), U8, S16, S16, scale_unity, TO_ZERO, OutOfPlaceDataSet, DEFAULT_VALIDATE) + +TEST_SUITE(NonXBroadcast) +PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, BroadcastToS16Fixture, ALL, SmallShapesNonXBroadcast(), U8, S16, S16, scale_unity, TO_ZERO, OutOfPlaceDataSet, + DEFAULT_VALIDATE) +TEST_SUITE_END() // NonXBroadcast TEST_SUITE_END() // ScaleUnity TEST_SUITE(ScaleOther) @@ -546,6 +657,10 @@ TEST_SUITE_END() // Scale255 TEST_SUITE(ScaleUnity) PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS16Fixture, ALL, SmallShapes(), S16, S16, S16, scale_unity, TO_ZERO, InPlaceDataSet, DEFAULT_VALIDATE) +TEST_SUITE(NonXBroadcast) +PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, BroadcastFixture, ALL, SmallShapesNonXBroadcast(), S16, S16, S16, scale_unity, TO_ZERO, OutOfPlaceDataSet, DEFAULT_VALIDATE) +TEST_SUITE_END() // NonXBroadcast + TEST_SUITE_END() // ScaleUnity TEST_SUITE(ScaleOther) diff --git a/tests/validation/NEON/Pooling3dLayer.cpp b/tests/validation/NEON/Pooling3dLayer.cpp index 1b30023ca5..461f82da25 100644 --- a/tests/validation/NEON/Pooling3dLayer.cpp +++ b/tests/validation/NEON/Pooling3dLayer.cpp @@ -97,6 +97,40 @@ const auto qasymm8_signed_out_qinfo_dataset = framework::dataset::make("OutputQu TEST_SUITE(NEON) TEST_SUITE(Pooling3dLayer) +TEST_CASE(SimpleIntegerAvgPooling, framework::DatasetMode::ALL) +{ + const auto pool_info = Pooling3dLayerInfo(PoolingType::AVG, + Size3D(1,1,1), Size3D(1,1,1), Padding3D(), true /* exclude padding */); + const auto shape = TensorShape(18U,1U,1U,1U); // > 16 for channel dim. 
to stress vector and leftover loops + const auto dtype = DataType::QASYMM8_SIGNED; + const auto layout = DataLayout::NDHWC; + const auto qinfo = QuantizationInfo(1.f, 0); + + Tensor input = create_tensor(shape, dtype, 1, qinfo, layout); + Tensor output = create_tensor(shape, dtype, 1, qinfo, layout); + + NEPooling3dLayer pool; + pool.configure(&input, &output, pool_info); + + input.allocator()->allocate(); + output.allocator()->allocate(); + + std::vector values = {-9, -8, -7, -6, -5, -4, -3, -2, -1, + 0, 1, 2, 3, 4, 5, 6, 7, 8}; + + ARM_COMPUTE_EXPECT(values.size() == shape.x(), framework::LogLevel::ERRORS); + + library->fill_static_values(Accessor(input), values); + + pool.run(); + for(unsigned int i = 0; i < values.size(); ++i) + { + const int8_t ref = values[i]; + const int8_t target = reinterpret_cast(output.buffer())[i]; + ARM_COMPUTE_EXPECT(ref == target, framework::LogLevel::ERRORS); + } +} + // *INDENT-OFF* // clang-format off DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( diff --git a/tests/validation/NEON/PoolingLayer.cpp b/tests/validation/NEON/PoolingLayer.cpp index f635a63bbe..50118e8831 100644 --- a/tests/validation/NEON/PoolingLayer.cpp +++ b/tests/validation/NEON/PoolingLayer.cpp @@ -160,6 +160,40 @@ const auto PoolingLayerIndicesDatasetFPSmall = combine(combine(combine(framework const auto PoolingLayerKernelIndicesDatasetFPSmall = combine(combine(combine(framework::dataset::make("PoolType", { PoolingType::MAX }), framework::dataset::make("PoolingSize", { Size2D(2, 2), Size2D(3, 3), Size2D(7, 7) })), framework::dataset::make("PadStride", { PadStrideInfo(1, 1, 0, 0), PadStrideInfo(2, 1, 0, 0), PadStrideInfo(1, 1, 1, 1) })), framework::dataset::make("ExcludePadding", { false })); + +TEST_CASE(SimpleIntegerAvgPooling, framework::DatasetMode::ALL) +{ + const auto pool_info = PoolingLayerInfo(PoolingType::AVG, Size2D(1,1), DataLayout::NHWC); + const auto shape = TensorShape(18U,1U,1U); // > 16 for channel dim. 
to stress vector and leftover loops + const auto dtype = DataType::QASYMM8_SIGNED; + const auto layout = DataLayout::NHWC; + const auto qinfo = QuantizationInfo(1.f, 0); + + Tensor input = create_tensor(shape, dtype, 1, qinfo, layout); + Tensor output = create_tensor(shape, dtype, 1, qinfo, layout); + + NEPoolingLayer pool; + pool.configure(&input, &output, pool_info); + + input.allocator()->allocate(); + output.allocator()->allocate(); + + std::vector values = {-9, -8, -7, -6, -5, -4, -3, -2, -1, + 0, 1, 2, 3, 4, 5, 6, 7, 8}; + + ARM_COMPUTE_EXPECT(values.size() == shape.x(), framework::LogLevel::ERRORS); + + library->fill_static_values(Accessor(input), values); + pool.run(); + + for(unsigned int i = 0; i < values.size(); ++i) + { + const int8_t ref = values[i]; + const int8_t target = reinterpret_cast(output.buffer())[i]; + ARM_COMPUTE_EXPECT(ref == target, framework::LogLevel::ERRORS); + } +} + TEST_SUITE(Float) TEST_SUITE(FP32) FIXTURE_DATA_TEST_CASE(RunIndices, NEPoolingLayerIndicesFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallNoneUnitShapes(), diff --git a/tests/validation/NEON/Reverse.cpp b/tests/validation/NEON/Reverse.cpp index 7d99bd614d..d390ed0e23 100644 --- a/tests/validation/NEON/Reverse.cpp +++ b/tests/validation/NEON/Reverse.cpp @@ -25,15 +25,15 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/NEON/functions/NEReverse.h" #include "arm_compute/runtime/Tensor.h" -#include "arm_compute/runtime/TensorAllocator.h" #include "tests/NEON/Accessor.h" -#include "tests/PaddingCalculator.h" #include "tests/datasets/ShapeDatasets.h" +#include "tests/datasets/DatatypeDataset.h" #include "tests/framework/Asserts.h" #include "tests/framework/Macros.h" #include "tests/framework/datasets/Datasets.h" #include "tests/validation/Validation.h" #include "tests/validation/fixtures/ReverseFixture.h" +#include "tests/validation/Helpers.h" namespace arm_compute { @@ -44,13 +44,66 @@ namespace validation namespace { 
using framework::dataset::make; + auto run_small_dataset = combine(datasets::Small3DShapes(), datasets::Tiny1DShapes()); auto run_large_dataset = combine(datasets::LargeShapes(), datasets::Tiny1DShapes()); +void validate_data_types(DataType input_dtype, DataType output_dtype, DataType axis_dtype) +{ + const auto input = TensorInfo(TensorShape(16U, 16U, 5U), 1, input_dtype); + const auto axis = TensorInfo(TensorShape(1U), 1, axis_dtype); + auto output = TensorInfo(TensorShape(16U, 16U, 5U), 1, output_dtype); + + const Status status = (NEReverse::validate(&input, &output, &axis, false /* use_inverted_axis */)); + const bool is_valid = static_cast(status); + + static const auto supported_dtypes = { + DataType::QSYMM8, + DataType::QASYMM8, + DataType::QASYMM8_SIGNED, + DataType::QSYMM16, + DataType::U8, + DataType::S8, + DataType::QSYMM8_PER_CHANNEL, + DataType::U16, + DataType::S16, + DataType::QSYMM16, + DataType::QASYMM16, + DataType::U32, + DataType::S32, + DataType::BFLOAT16, + DataType::F16, + DataType::F32 + }; + + static std::vector> supports = {}; + for(DataType dtype : supported_dtypes) + { + supports.push_back(std::make_tuple(dtype, dtype, DataType::S32)); + supports.push_back(std::make_tuple(dtype, dtype, DataType::U32)); + } + + const auto config = std::make_tuple(input_dtype, output_dtype, axis_dtype); + const bool expected = (std::find(supports.begin(), supports.end(), config) != supports.end()); + + ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS); +} + } // namespace TEST_SUITE(NEON) TEST_SUITE(Reverse) +/// @note: Do not modify. Validating all data types is pretty fast. 
+DATA_TEST_CASE(ValidateAllDataTypes, framework::DatasetMode::ALL, + combine( + datasets::AllDataTypes("InputDataType"), + datasets::AllDataTypes("OutputDataType"), + datasets::AllDataTypes("AxisDataType")), + input_dtype, output_dtype, axis_dtype) +{ + validate_data_types(input_dtype, output_dtype, axis_dtype); +} + // *INDENT-OFF* // clang-format off DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( @@ -92,29 +145,26 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( template using NEReverseFixture = ReverseValidationFixture; -TEST_SUITE(Float) +/// @note: Test Strategy -- +/// The operator uses uint8_t, uint16_t and uint32_t under the hood depending +/// on the size of the input data type. Therefore, we do not extensively test +/// all the data types here. fp32/16 and qasymm8 has been thoroughly tested with +/// multiple shapes and configuration. Other data types are just smoke tested +/// with a very limited set of configurations, just to make sure they function +/// correctly. -#ifdef ARM_COMPUTE_ENABLE_FP16 +TEST_SUITE(Float) TEST_SUITE(F16) FIXTURE_DATA_TEST_CASE(RunSmall, NEReverseFixture, framework::DatasetMode::PRECOMMIT, combine( run_small_dataset, - make("DataType", DataType::F16), + make("DataType", {DataType::F16, DataType::BFLOAT16}), make("use_negative_axis", { true, false }), make("use_inverted_axis", { true, false }))) { - if(CPUInfo::get().has_fp16()) - { - // Validate output - validate(Accessor(_target), _reference); - } - else - { - ARM_COMPUTE_TEST_INFO("Device does not support fp16 vector operations. 
Test SKIPPED."); - framework::ARM_COMPUTE_PRINT_INFO(); - } + validate(Accessor(_target), _reference); } FIXTURE_DATA_TEST_CASE(RunLarge, @@ -126,21 +176,11 @@ FIXTURE_DATA_TEST_CASE(RunLarge, make("use_negative_axis", { true, false }), make("use_inverted_axis", { true, false }))) { - if(CPUInfo::get().has_fp16()) - { - // Validate output - validate(Accessor(_target), _reference); - } - else - { - ARM_COMPUTE_TEST_INFO("Device does not support fp16 vector operations. Test SKIPPED."); - framework::ARM_COMPUTE_PRINT_INFO(); - } + validate(Accessor(_target), _reference); } TEST_SUITE_END() // F16 -#endif /* ARM_COMPUTE_ENABLE_FP16 */ -TEST_SUITE(FP32) +TEST_SUITE(F32) FIXTURE_DATA_TEST_CASE(RunSmall, NEReverseFixture, framework::DatasetMode::PRECOMMIT, @@ -169,14 +209,78 @@ FIXTURE_DATA_TEST_CASE(RunLarge, TEST_SUITE_END() // F32 TEST_SUITE_END() // Float -TEST_SUITE(Quantized) -TEST_SUITE(QASYMM8) +TEST_SUITE(Integer) +TEST_SUITE(Int32) +FIXTURE_DATA_TEST_CASE(RunSmall, + NEReverseFixture, + framework::DatasetMode::PRECOMMIT, + combine( + make("InOutShape", TensorShape(18U, 5U, 5U)), + make("AxisShape", TensorShape(2U)), + make("DataType", {DataType::S32}), + make("use_negative_axis", { false }), + make("use_inverted_axis", { false }))) +{ + // Validate output + validate(Accessor(_target), _reference); +} +TEST_SUITE_END() // Int32 + +TEST_SUITE(UInt32) +FIXTURE_DATA_TEST_CASE(RunSmall, + NEReverseFixture, + framework::DatasetMode::PRECOMMIT, + combine( + make("InOutShape", TensorShape(18U, 5U, 5U)), + make("AxisShape", TensorShape(2U)), + make("DataType", {DataType::U32}), + make("use_negative_axis", { false }), + make("use_inverted_axis", { false }))) +{ + // Validate output + validate(Accessor(_target), _reference); +} +TEST_SUITE_END() // UInt32 + +TEST_SUITE(Int16) +FIXTURE_DATA_TEST_CASE(RunSmall, + NEReverseFixture, + framework::DatasetMode::PRECOMMIT, + combine( + make("InOutShape", TensorShape(18U, 5U, 5U)), + make("AxisShape", TensorShape(2U)), + 
make("DataType", {DataType::S16, DataType::QSYMM16}), + make("use_negative_axis", { false }), + make("use_inverted_axis", { false }))) +{ + // Validate output + validate(Accessor(_target), _reference); +} +TEST_SUITE_END() // Int16 + +TEST_SUITE(UInt16) +FIXTURE_DATA_TEST_CASE(RunSmall, + NEReverseFixture, + framework::DatasetMode::PRECOMMIT, + combine( + make("InOutShape", TensorShape(18U, 5U, 5U)), + make("AxisShape", TensorShape(2U)), + make("DataType", {DataType::U16, DataType::QASYMM16}), + make("use_negative_axis", { false }), + make("use_inverted_axis", { false }))) +{ + // Validate output + validate(Accessor(_target), _reference); +} +TEST_SUITE_END() // UInt16 + +TEST_SUITE(UInt8) FIXTURE_DATA_TEST_CASE(RunSmall, NEReverseFixture, framework::DatasetMode::PRECOMMIT, combine( run_small_dataset, - make("DataType", DataType::QASYMM8), + make("DataType", {DataType::QASYMM8, DataType::U8}), make("use_negative_axis", { true, false }), make("use_inverted_axis", { true, false }))) { @@ -196,8 +300,25 @@ FIXTURE_DATA_TEST_CASE(RunLarge, // Validate output validate(Accessor(_target), _reference); } -TEST_SUITE_END() // QASYMM8 -TEST_SUITE_END() // Quantized +TEST_SUITE_END() // UInt8 + +TEST_SUITE(Int8) +FIXTURE_DATA_TEST_CASE(RunSmall, + NEReverseFixture, + framework::DatasetMode::PRECOMMIT, + combine( + make("InOutShape", TensorShape(18U, 5U, 5U)), + make("AxisShape", TensorShape(2U)), + make("DataType", {DataType::QASYMM8_SIGNED, DataType::S8, + DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL}), + make("use_negative_axis", { false }), + make("use_inverted_axis", { false }))) +{ + // Validate output + validate(Accessor(_target), _reference); +} +TEST_SUITE_END() // Int8 +TEST_SUITE_END() // Integer TEST_SUITE_END() // Reverse TEST_SUITE_END() // Neon diff --git a/tests/validation/fixtures/CpuSoftmaxFixture.h b/tests/validation/fixtures/CpuSoftmaxFixture.h new file mode 100644 index 0000000000..82938405b7 --- /dev/null +++ 
b/tests/validation/fixtures/CpuSoftmaxFixture.h @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2017-2021, 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ACL_TESTS_VALIDATION_FIXTURES_CPUSOFTMAXFIXTURE_H +#define ACL_TESTS_VALIDATION_FIXTURES_CPUSOFTMAXFIXTURE_H + +#include "arm_compute/core/TensorShape.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/Tensor.h" +#include "tests/AssetsLibrary.h" +#include "tests/Globals.h" +#include "tests/IAccessor.h" +#include "tests/framework/Asserts.h" +#include "tests/framework/Fixture.h" +#include "tests/validation/reference/SoftmaxLayer.h" + +#include + +namespace arm_compute +{ +namespace test +{ +namespace validation +{ +template +class CpuSoftmaxValidationFixture : public framework::Fixture +{ +public: + void setup(TensorShape shape, DataType data_type, float beta, size_t axis) + { + if(std::is_same::value && // Cpu + data_type == DataType::F16 && !CPUInfo::get().has_fp16()) + { + return; + } + + quantization_info_ = QuantizationInfo(); + + reference_ = compute_reference(shape, data_type, quantization_info_, beta, axis); + target_ = compute_target(shape, data_type, quantization_info_, beta, axis); + } + +protected: + template + void fill(U &&tensor) + { + if(tensor.data_type() == DataType::F32) + { + std::uniform_real_distribution distribution(-10.0f, 10.0f); + library->fill(tensor, distribution, 0); + } + else if(tensor.data_type() == DataType::F16) + { + arm_compute::utils::uniform_real_distribution_16bit distribution{ -10.0f, 10.0f }; + library->fill(tensor, distribution, 0); + } + else if(!is_data_type_quantized(tensor.data_type())) + { + std::uniform_int_distribution<> distribution(0, 100); + library->fill(tensor, distribution, 0); + } + else + { + library->fill_tensor_uniform(tensor, 0); + } + } + + TensorType compute_target(const TensorShape &shape, DataType data_type, + QuantizationInfo quantization_info, float beta, int32_t axis) + { + // Create tensors + TensorType src = create_tensor(shape, data_type, 1, quantization_info); + TensorType dst = create_tensor(shape, data_type, 1, get_softmax_output_quantization_info(data_type, 
IS_LOG)); + + // Create and configure function + FunctionType softmax; + softmax.configure(src.info(), dst.info(), beta, axis); + + ARM_COMPUTE_ASSERT(src.info()->is_resizable()); + ARM_COMPUTE_ASSERT(dst.info()->is_resizable()); + + // Allocate tensors + src.allocator()->allocate(); + dst.allocator()->allocate(); + + ARM_COMPUTE_ASSERT(!src.info()->is_resizable()); + ARM_COMPUTE_ASSERT(!dst.info()->is_resizable()); + + // Fill tensors + fill(AccessorType(src)); + + ITensorPack run_pack{ { arm_compute::TensorType::ACL_SRC_0, &src }}; + run_pack.add_tensor(arm_compute::TensorType::ACL_DST, &dst); + auto mg = MemoryGroup{}; + auto ws = manage_workspace(softmax.workspace(), mg, run_pack); + + // Compute function + softmax.run(run_pack); + + return dst; + } + + SimpleTensor compute_reference(const TensorShape &shape, DataType data_type, + QuantizationInfo quantization_info, float beta, int32_t axis) + { + // Create reference + SimpleTensor src{ shape, data_type, 1, quantization_info }; + + // Fill reference + fill(src); + + return reference::softmax_layer(src, beta, axis, IS_LOG); + } + + TensorType target_{}; + SimpleTensor reference_{}; + QuantizationInfo quantization_info_{}; +}; + +} // namespace validation +} // namespace test +} // namespace arm_compute +#endif // ACL_TESTS_VALIDATION_FIXTURES_CPUSOFTMAXFIXTURE_H diff --git a/tests/validation/fixtures/ReverseFixture.h b/tests/validation/fixtures/ReverseFixture.h index 5bb8f876d2..58a108d637 100644 --- a/tests/validation/fixtures/ReverseFixture.h +++ b/tests/validation/fixtures/ReverseFixture.h @@ -47,12 +47,6 @@ class ReverseValidationFixture : public framework::Fixture public: void setup(TensorShape shape, TensorShape axis_shape, DataType data_type, bool use_negative_axis = false, bool use_inverted_axis = false) { - if(std::is_same::value && // Cpu - data_type == DataType::F16 && !CPUInfo::get().has_fp16()) - { - return; - } - _num_dims = shape.num_dimensions(); _target = compute_target(shape, axis_shape, 
data_type, use_negative_axis, use_inverted_axis); _reference = compute_reference(shape, axis_shape, data_type, use_negative_axis, use_inverted_axis); @@ -85,7 +79,17 @@ class ReverseValidationFixture : public framework::Fixture TensorType compute_target(const TensorShape &shape, const TensorShape &axis_shape, DataType data_type, bool use_negative_axis, bool use_inverted_axis = false) { // Create tensors - TensorType src = create_tensor(shape, data_type, 1); + QuantizationInfo qinfo = QuantizationInfo(); + if(data_type == DataType::QSYMM8_PER_CHANNEL) + { + // We need dummy scale and offset values for tensor buffer allocation + const std::vector scales(1); + const std::vector offsets(1); + + qinfo = QuantizationInfo(scales, offsets); + } + + TensorType src = create_tensor(shape, data_type, 1, qinfo); TensorType axis = create_tensor(axis_shape, DataType::U32, 1); TensorType dst; diff --git a/tests/validation/reference/Reverse.cpp b/tests/validation/reference/Reverse.cpp index 7924f900d1..1d7ed74c85 100644 --- a/tests/validation/reference/Reverse.cpp +++ b/tests/validation/reference/Reverse.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, 2023 Arm Limited. + * Copyright (c) 2018-2020, 2023-2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -97,8 +97,20 @@ SimpleTensor reverse(const SimpleTensor &src, const SimpleTensor } template SimpleTensor reverse(const SimpleTensor &src, const SimpleTensor &axis, bool use_inverted_axis); +template SimpleTensor reverse(const SimpleTensor &src, const SimpleTensor &axis, bool use_inverted_axis); +template SimpleTensor reverse(const SimpleTensor &src, const SimpleTensor &axis, bool use_inverted_axis); +template SimpleTensor reverse(const SimpleTensor &src, const SimpleTensor &axis, bool use_inverted_axis); +template SimpleTensor reverse(const SimpleTensor &src, const SimpleTensor &axis, bool use_inverted_axis); +template SimpleTensor reverse(const SimpleTensor &src, const SimpleTensor &axis, bool use_inverted_axis); template SimpleTensor reverse(const SimpleTensor &src, const SimpleTensor &axis, bool use_inverted_axis); template SimpleTensor reverse(const SimpleTensor &src, const SimpleTensor &axis, bool use_inverted_axis); + +#ifdef __aarch64__ +template SimpleTensor reverse(const SimpleTensor &src, const SimpleTensor &axis, bool use_inverted_axis); +template SimpleTensor reverse(const SimpleTensor &src, const SimpleTensor &axis, bool use_inverted_axis); +template SimpleTensor reverse(const SimpleTensor &src, const SimpleTensor &axis, bool use_inverted_axis); +#endif // __aarch64__ + } // namespace reference } // namespace validation } // namespace test diff --git a/tests/validation/runtime/experimental/operators/CpuAdd.cpp b/tests/validation/runtime/experimental/operators/CpuAdd.cpp index 97eaa9ce9e..5a3ec353d1 100644 --- a/tests/validation/runtime/experimental/operators/CpuAdd.cpp +++ b/tests/validation/runtime/experimental/operators/CpuAdd.cpp @@ -52,6 +52,7 @@ const auto OutOfPlaceDataSet = framework::dataset::make("InPlace", {false}); } // namespace TEST_SUITE(NEON) +TEST_SUITE(OPERATORS) TEST_SUITE(CpuAdd) using CpuAddFixture = CpuArithmeticAdditionValidationFixture; @@ -71,7 +72,8 @@ FIXTURE_DATA_TEST_CASE( 
TEST_SUITE_END() // U8 TEST_SUITE_END() // CpuAdd -TEST_SUITE_END() // Neon +TEST_SUITE_END() // OPERATORS +TEST_SUITE_END() // NEON } // namespace validation } // namespace test } // namespace arm_compute diff --git a/tests/validation/runtime/experimental/operators/CpuElementwise.cpp b/tests/validation/runtime/experimental/operators/CpuElementwise.cpp index b2007ea22a..a41f8e6b07 100644 --- a/tests/validation/runtime/experimental/operators/CpuElementwise.cpp +++ b/tests/validation/runtime/experimental/operators/CpuElementwise.cpp @@ -54,6 +54,7 @@ const auto OutOfPlaceDataSet = framework::dataset::make("InPlace", {false}); } // namespace TEST_SUITE(NEON) +TEST_SUITE(OPERATORS) TEST_SUITE(CpuElementwiseDivision) template @@ -72,7 +73,7 @@ FIXTURE_DATA_TEST_CASE(SmokeTest, } TEST_SUITE_END() // F32 TEST_SUITE_END() // Float -TEST_SUITE_END() // CpuElementwiseMin +TEST_SUITE_END() // CpuElementwiseDivision TEST_SUITE(CpuElementwiseMax) template @@ -91,7 +92,7 @@ FIXTURE_DATA_TEST_CASE(SmokeTest, } TEST_SUITE_END() // F32 TEST_SUITE_END() // Float -TEST_SUITE_END() // CpuElementwiseMin +TEST_SUITE_END() // CpuElementwiseMax TEST_SUITE(CpuElementwiseMin) @@ -113,7 +114,8 @@ TEST_SUITE_END() // F32 TEST_SUITE_END() // Float TEST_SUITE_END() // CpuElementwiseMin -TEST_SUITE_END() // Neon +TEST_SUITE_END() // OPERATORS +TEST_SUITE_END() // NEON } // namespace validation } // namespace test } // namespace arm_compute diff --git a/tests/validation/runtime/experimental/operators/CpuGemm.cpp b/tests/validation/runtime/experimental/operators/CpuGemm.cpp index 9d85f90712..75ad22a448 100644 --- a/tests/validation/runtime/experimental/operators/CpuGemm.cpp +++ b/tests/validation/runtime/experimental/operators/CpuGemm.cpp @@ -22,10 +22,11 @@ * SOFTWARE. 
*/ #include "arm_compute/runtime/experimental/operators/CpuGemm.h" + #include "src/core/helpers/MemoryHelpers.h" -#include "tests/NEON/Accessor.h" -#include "tests/framework/Macros.h" #include "tests/framework/datasets/Datasets.h" +#include "tests/framework/Macros.h" +#include "tests/NEON/Accessor.h" #include "tests/validation/fixtures/GEMMFixture.h" /* @@ -42,19 +43,10 @@ namespace validation { using framework::dataset::make; -namespace -{ -/** CNN data types */ -const auto CNNDataTypes = make("DataType", -{ - DataType::F32, -}); -} // namespace - TEST_SUITE(NEON) TEST_SUITE(OPERATORS) -TEST_SUITE(CPUGEMM) +TEST_SUITE(CpuGemm) /** Test case for memory injection in @ref arm_compute::experimental::op::CpuGemm. * * Configure the operator once and inject memory at run-time in multiple executions. @@ -80,8 +72,8 @@ TEST_CASE(OpCpuGemmMemoryInjection, framework::DatasetMode::ALL) rhs.allocator()->allocate(); c.allocator()->allocate(); - ITensorPack run_pack{ { TensorType::ACL_SRC_0, &lhs }, { TensorType::ACL_SRC_1, &rhs }, { TensorType::ACL_SRC_2, &c } }; - ITensorPack prep_pack{ { TensorType::ACL_SRC_1, &rhs }, { TensorType::ACL_SRC_2, &c } }; + ITensorPack run_pack{{TensorType::ACL_SRC_0, &lhs}, {TensorType::ACL_SRC_1, &rhs}, {TensorType::ACL_SRC_2, &c}}; + ITensorPack prep_pack{{TensorType::ACL_SRC_1, &rhs}, {TensorType::ACL_SRC_2, &c}}; auto mg = MemoryGroup{}; auto ws = manage_workspace(gemm->workspace(), mg, run_pack, prep_pack); @@ -102,22 +94,29 @@ TEST_CASE(OpCpuGemmMemoryInjection, framework::DatasetMode::ALL) }; auto result_0 = run_conv(); auto result_1 = run_conv(); - for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i) + for (size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i) { - ARM_COMPUTE_EXPECT(((float *)result_0.buffer())[i] == ((float *)result_1.buffer())[i], framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(((float *)result_0.buffer())[i] == ((float *)result_1.buffer())[i], + framework::LogLevel::ERRORS); } } 
-DATA_TEST_CASE(OpCpuGemmValidateAccumulate, framework::DatasetMode::ALL, combine( - zip(make("In0",{ TensorShape(21U, 13U) }), - make("In1", { TensorShape(33U, 21U) }), - make("Dst", { TensorShape(33U, 13U) })), - zip( - make("alpha", { 1.0, 100.0, 1.0, 1.0 }), - make("beta", { 0.0, 0.0, 1.0, 1.0 }), - make("is_c_null", { false, false, false, true }), - make("Expected", { true, false, false, true }))), - shape_a, shape_b, shape_dst, alpha, beta, is_c_null, expected) +DATA_TEST_CASE(OpCpuGemmValidateAccumulate, + framework::DatasetMode::ALL, + combine(zip(make("In0", {TensorShape(21U, 13U)}), + make("In1", {TensorShape(33U, 21U)}), + make("Dst", {TensorShape(33U, 13U)})), + zip(make("alpha", {1.0, 100.0, 1.0, 1.0}), + make("beta", {0.0, 0.0, 1.0, 1.0}), + make("is_c_null", {false, false, false, true}), + make("Expected", {true, false, false, true}))), + shape_a, + shape_b, + shape_dst, + alpha, + beta, + is_c_null, + expected) { /* Accumulation test for GEMM kernels */ // Create tensors @@ -132,10 +131,10 @@ DATA_TEST_CASE(OpCpuGemmValidateAccumulate, framework::DatasetMode::ALL, combine // Validate accumulation arm_compute::experimental::op::CpuGemm gemm; Status status = gemm.validate(&in_a, &in_b, (is_c_null ? 
nullptr : &in_c), &dst, alpha, beta, gemm_info); - ARM_COMPUTE_EXPECT((expected == bool(status)), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT((expected == bool(status)), framework::LogLevel::ERRORS); } -TEST_SUITE_END() // CPUGEMM +TEST_SUITE_END() // CpuGemm TEST_SUITE_END() // OPERATORS TEST_SUITE_END() // NEON } // namespace validation diff --git a/tests/validation/runtime/experimental/operators/CpuMul.cpp b/tests/validation/runtime/experimental/operators/CpuMul.cpp index 8cad6210a1..3bff2e3b5a 100644 --- a/tests/validation/runtime/experimental/operators/CpuMul.cpp +++ b/tests/validation/runtime/experimental/operators/CpuMul.cpp @@ -62,6 +62,7 @@ using CpuMulU8U8toS16Fixture = CpuMulValidationFixture; TEST_SUITE(NEON) +TEST_SUITE(OPERATORS) TEST_SUITE(CpuMul) TEST_SUITE(U8U8toS16) @@ -101,6 +102,7 @@ FIXTURE_DATA_TEST_CASE( TEST_SUITE_END() // U8U8toS16 TEST_SUITE_END() // CpuMul +TEST_SUITE_END() // OPERATORS TEST_SUITE_END() // NEON } // namespace validation } // namespace test diff --git a/tests/validation/runtime/experimental/operators/CpuSoftmax.cpp b/tests/validation/runtime/experimental/operators/CpuSoftmax.cpp new file mode 100644 index 0000000000..30eb1c31bb --- /dev/null +++ b/tests/validation/runtime/experimental/operators/CpuSoftmax.cpp @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2017-2020, 2022-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/runtime/experimental/operators/CpuSoftmax.h" +#include "arm_compute/core/Types.h" + +#include "arm_compute/runtime/Tensor.h" +#include "arm_compute/runtime/TensorAllocator.h" +#include "src/common/cpuinfo/CpuIsaInfo.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "tests/NEON/Accessor.h" +#include "tests/datasets/ShapeDatasets.h" +#include "tests/framework/Asserts.h" +#include "tests/framework/Macros.h" +#include "tests/framework/datasets/Datasets.h" +#include "tests/validation/Validation.h" +#include "tests/validation/fixtures/CpuSoftmaxFixture.h" + +namespace arm_compute +{ +namespace test +{ +namespace validation +{ +namespace +{ +using framework::dataset::make; + +/** Tolerance for float operations */ +constexpr AbsoluteTolerance tolerance_f32(0.000001f); +} // namespace +TEST_SUITE(NEON) +TEST_SUITE(OPERATORS) + +TEST_SUITE(CpuSoftmax) + +// clang-format off +DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip( + make("InputInfo", { TensorInfo(TensorShape(27U, 13U), 1, DataType::F32), // Mismatching data types + TensorInfo(TensorShape(27U, 13U), 1, DataType::F32), // Mismatching shapes + TensorInfo(TensorShape(27U, 13U), 1, DataType::QASYMM8, // Invalid output quantization info + QuantizationInfo(1.f/256, 12)), + TensorInfo(TensorShape(32U, 13U), 1, DataType::F32), + TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, + QuantizationInfo(1.f/256, 12)), + TensorInfo(TensorShape(32U, 13U), 1, DataType::F32), + 
TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, //Invalid axis high + QuantizationInfo(1.f/256, 12)), + TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, //Invalid axis low + QuantizationInfo(1.f/256, 12)), + }), + make("OutputInfo",{ TensorInfo(TensorShape(27U, 13U), 1, DataType::F16), + TensorInfo(TensorShape(27U, 11U), 1, DataType::F32), + TensorInfo(TensorShape(27U, 13U), 1, DataType::QASYMM8, + QuantizationInfo(1.f/256, 12)), + TensorInfo(TensorShape(32U, 13U), 1, DataType::F32), + TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, + QuantizationInfo(1.f/256, 0)), + TensorInfo(TensorShape(32U, 13U), 1, DataType::F32), + TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, + QuantizationInfo(1.f/256, 0)), + TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, + QuantizationInfo(1.f/256, 0)), + }), + make("beta", { 1.0, + 2.0, + 1.0, + 2.0, + 1.0, + 1.0, + 2.0, + 1.0, + }), + make("axis", { 0, + 0, + 0, + 1, + 0, + -1, + 2, + -3, + }), + make("Expected", { false, false, false, true, true, true, false, false })), + input_info, output_info, beta, axis, expected) +{ + ARM_COMPUTE_EXPECT(bool(arm_compute::experimental::op::CpuSoftmax::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), beta, axis)) == expected, framework::LogLevel::ERRORS); +} + +TEST_CASE(OpCpuSoftmaxMemoryInjection, framework::DatasetMode::ALL) +{ + auto softmax = std::make_unique(); + const auto src_info = TensorInfo(TensorShape{ 1U, 9U }, 1, DataType::F32); + auto dst_info = TensorInfo(TensorShape{ 1U, 9U }, 1, DataType::F32); + + const float beta = (1.0F); + const int32_t axis = 0; + const bool is_log = false; + + softmax->configure(&src_info, &dst_info, beta, axis, is_log); + + // the lhs are newly created every call of this lambda function + auto src = create_tensor(src_info); + auto dst = create_tensor(dst_info); + src.allocator()->allocate(); + + ITensorPack run_pack{ { TensorType::ACL_SRC_0, &src }}; + auto mg = 
MemoryGroup{}; + auto ws = manage_workspace(softmax->workspace(), mg, run_pack); + + auto run_softmax = [&]() -> Tensor + { + auto dst = create_tensor(dst_info); + dst.allocator()->allocate(); + run_pack.add_tensor(TensorType::ACL_DST, &dst); + + library->fill_tensor_value(Accessor(src), 1.f); + // This operator is configured once and captured by this lambda. + softmax->run(run_pack); + return dst; + }; + auto result_0 = run_softmax(); + auto result_1 = run_softmax(); + for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i) + { + ARM_COMPUTE_EXPECT((reinterpret_cast(result_0.buffer()))[i] == (reinterpret_cast(result_1.buffer()))[i], framework::LogLevel::ERRORS); + } +} + +template +using CpuOpSoftmaxFixture = CpuSoftmaxValidationFixture; + +TEST_SUITE(FP32) +FIXTURE_DATA_TEST_CASE(SmokeTest, CpuOpSoftmaxFixture, framework::DatasetMode::PRECOMMIT, + combine( + datasets::SoftmaxLayerSmallShapes(), + make("DataType", DataType::F32), + make("Beta", { 1.0f, 2.0f }), + make("Axis", { 0, -1 }))) +{ + // Validate output + validate(Accessor(target_), reference_, tolerance_f32); +} + +TEST_SUITE_END() //FP32 +TEST_SUITE_END() //CpuSoftmax +TEST_SUITE_END() //OPERATORS +TEST_SUITE_END() //NEON + +} // namespace validation +} // namespace test +} // namespace arm_compute diff --git a/tests/validation/runtime/experimental/operators/CpuSub.cpp b/tests/validation/runtime/experimental/operators/CpuSub.cpp index 22f5ae8d7b..4736aafb2e 100644 --- a/tests/validation/runtime/experimental/operators/CpuSub.cpp +++ b/tests/validation/runtime/experimental/operators/CpuSub.cpp @@ -52,6 +52,7 @@ const auto OutOfPlaceDataSet = framework::dataset::make("InPlace", {false}); } // namespace TEST_SUITE(NEON) +TEST_SUITE(OPERATORS) TEST_SUITE(CpuSub) using CpuSubFixture = CpuArithmeticSubtractionValidationFixture; @@ -71,7 +72,8 @@ FIXTURE_DATA_TEST_CASE( TEST_SUITE_END() // U8 TEST_SUITE_END() // CpuSub -TEST_SUITE_END() // Neon +TEST_SUITE_END() // OPERATORS +TEST_SUITE_END() 
// NEON } // namespace validation } // namespace test } // namespace arm_compute diff --git a/utils/Utils.h b/utils/Utils.h index 626cbcf07f..93dc2fa106 100644 --- a/utils/Utils.h +++ b/utils/Utils.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2023 Arm Limited. + * Copyright (c) 2016-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,9 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef __UTILS_UTILS_H__ -#define __UTILS_UTILS_H__ + +#ifndef ACL_UTILS_UTILS_H +#define ACL_UTILS_UTILS_H /** @dir . * brief Boiler plate code used by examples. Various utilities to print types, load / store assets, etc. @@ -261,10 +262,12 @@ class uniform_real_distribution_16bit using result_type = T; /** Constructor * - * @param[in] min Minimum value of the distribution - * @param[in] max Maximum value of the distribution + * @param[in] min Minimum value of the distribution + * @param[in] max Maximum value of the distribution + * @param[in] portable Boolean to indicate portable conversion in-between 16-bit and other data types */ - explicit uniform_real_distribution_16bit(float min = 0.f, float max = 1.0) : dist(min, max) + explicit uniform_real_distribution_16bit(float min = 0.f, float max = 1.0, bool portable = false) + : _dist(min, max), _portable(portable) { } @@ -274,11 +277,24 @@ class uniform_real_distribution_16bit */ T operator()(std::mt19937 &gen) { - return T(dist(gen)); + return convert(_dist(gen)); } private: - std::uniform_real_distribution dist; + template + inline typename std::enable_if::value, bfloat16>::type convert(float x) + { + return bfloat16(x, _portable); + } + + template + inline typename std::enable_if::value, T>::type convert(float x) + { + return T(x); + } + + std::uniform_real_distribution _dist; + bool _portable; }; /** Numpy data loader */ @@ -857,4 +873,5 @@ int compare_tensor(ITensor &tensor1, ITensor &tensor2, T tolerance) } } // namespace utils } // namespace arm_compute -#endif /* 
__UTILS_UTILS_H__*/ + +#endif // ACL_UTILS_UTILS_H