Skip to content

Commit

Permalink
Use the new pthreadpool_parallelize_[23]d_tile_2d_dynamic strategies in the GEMM-based ops.
Browse files Browse the repository at this point in the history

PiperOrigin-RevId: 713162910
  • Loading branch information
gonnet authored and xnnpack-bot committed Jan 8, 2025
1 parent 260e0ec commit 4922366
Show file tree
Hide file tree
Showing 201 changed files with 1,456 additions and 1,288 deletions.
108 changes: 70 additions & 38 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1266,18 +1266,31 @@ IF(XNNPACK_BUILD_TESTS)
ENDIF()

# ---[ Launch heavy tests first.
# Tests added to this list will be automatically removed from other lists.
SET(SHARDED_TESTS
fully-connected-nc
avgpool-minmax
maxpool-minmax
f32-vclamp
f32-vlrelu
f32-rdsum
f32-velu
f32-argmaxpool
s8-vclamp
u8-vclamp
)
maxpool-minmax)
IF(XNNPACK_TARGET_PROCESSOR MATCHES "^riscv")
LIST(APPEND SHARDED_TESTS
f16-qs8-vcvt
f16-qu8-vcvt
f32-argmaxpool
f32-qs8-vcvt
f32-qu8-vcvt
f32-rdsum
f32-vclamp
f32-velu
f32-vlrelu
qs8-f32-vcvt
qs8-vcvt
qs8-vlrelu
qu8-f32-vcvt
qu8-vcvt
qu8-vlrelu
s8-vclamp
u8-vclamp)
ENDIF()
FOREACH(TEST ${SHARDED_TESTS})
ADD_EXECUTABLE(${TEST}-test test/${TEST}.cc)
TARGET_INCLUDE_DIRECTORIES(${TEST}-test PRIVATE include src test)
Expand All @@ -1297,6 +1310,7 @@ IF(XNNPACK_BUILD_TESTS)

IF(XNNPACK_BUILD_LIBRARY)
# ---[ Launch heavy tests first.
# Tests added to this list will be automatically removed from other lists.
SET(LIBRARY_SHARDED_TESTS
batch-matrix-multiply-nc
batch-matrix-multiply
Expand Down Expand Up @@ -1342,6 +1356,7 @@ IF(XNNPACK_BUILD_TESTS)
runtime
subgraph-nchw
workspace)
LIST(REMOVE_ITEM LIBRARY_SUBGRAPH_OPTIMIZATION_TESTS ${LIBRARY_SHARDED_TESTS})
FOREACH(TEST ${LIBRARY_SUBGRAPH_OPTIMIZATION_TESTS})
ADD_EXECUTABLE(${TEST}-test test/${TEST}.cc)
TARGET_INCLUDE_DIRECTORIES(${TEST}-test PRIVATE src test)
Expand Down Expand Up @@ -1386,6 +1401,7 @@ IF(XNNPACK_BUILD_TESTS)
transpose-reshape
unary
unpooling-2d)
LIST(REMOVE_ITEM LIBRARY_SUBGRAPH_UNIT_TESTS ${LIBRARY_SHARDED_TESTS})
FOREACH(TEST ${LIBRARY_SUBGRAPH_UNIT_TESTS})
ADD_EXECUTABLE(${TEST}-test test/${TEST}.cc)
TARGET_INCLUDE_DIRECTORIES(${TEST}-test PRIVATE src test)
Expand All @@ -1407,6 +1423,7 @@ IF(XNNPACK_BUILD_TESTS)
convolution-2d
deconvolution-2d
depthwise-convolution-2d)
LIST(REMOVE_ITEM LIBRARY_SUBGRAPH_CONVOLUTION_UNIT_TESTS ${LIBRARY_SHARDED_TESTS})
FOREACH(TEST ${LIBRARY_SUBGRAPH_CONVOLUTION_UNIT_TESTS})
ADD_EXECUTABLE(${TEST}-test test/${TEST}.cc)
TARGET_INCLUDE_DIRECTORIES(${TEST}-test PRIVATE src test)
Expand Down Expand Up @@ -1440,21 +1457,23 @@ IF(XNNPACK_BUILD_TESTS)
f16-conv-hwc2chw
f16-f32acc-rdsum
f16-f32acc-rsum
f16-ibilinear-chw
f16-ibilinear
f16-ibilinear-chw
f16-raddstoreexpminusmax
f16-rmax
f16-rsum
f16-spmm-minmax
f16-vcmul
f16-vmulcaddc-minmax
f32-argmaxpool
f32-conv-hwc
f32-conv-hwc2chw
f32-ibilinear-chw
f32-ibilinear
f32-ibilinear-chw
f32-raddexpminusmax
f32-raddextexp
f32-raddstoreexpminusmax
f32-rdsum
f32-rmax
f32-rmin
f32-rminmax
Expand All @@ -1466,11 +1485,12 @@ IF(XNNPACK_BUILD_TESTS)
f32-vscaleextexp
indirection
packing
qs8-packw
qs8-qc4w-packw
qs8-rdsum-minmax-fp32
qu8-rdsum
qs8-rsum
qu8-rdsum
qu8-rsum
qs8-vlrelu
qu8-vlrelu
s8-ibilinear
u8-ibilinear
Expand All @@ -1483,11 +1503,10 @@ IF(XNNPACK_BUILD_TESTS)
x32-unpool
x8-lut
x8-packw
qs8-packw
qs8-qc4w-packw
xN-transpose
xx-fill
xx-pad)
LIST(REMOVE_ITEM MICROKERNEL_UNIT_TESTS ${SHARDED_TESTS})
FOREACH(TEST ${MICROKERNEL_UNIT_TESTS})
ADD_EXECUTABLE(${TEST}-test test/${TEST}.cc)
TARGET_INCLUDE_DIRECTORIES(${TEST}-test PRIVATE include src test)
Expand All @@ -1514,21 +1533,22 @@ IF(XNNPACK_BUILD_TESTS)
f16-dwconv-minmax-multipass
f16-dwconv-minmax-unipass
f16-dwconv2d-chw
f32-dwconv-multipass
f32-dwconv-minmax-multipass
f32-dwconv-unipass
f32-dwconv-minmax-unipass
f32-dwconv-multipass
f32-dwconv-unipass
f32-dwconv2d-chw
qs8-qc8w-dwconv-minmax-multipass-fp32
qs8-qc8w-dwconv-minmax-unipass-fp32
qs8-dwconv-minmax-multipass-fp32
qs8-dwconv-minmax-multipass-rndnu
qs8-dwconv-minmax-unipass-fp32
qs8-dwconv-minmax-unipass-rndnu
qs8-qc8w-dwconv-minmax-multipass-fp32
qs8-qc8w-dwconv-minmax-unipass-fp32
qu8-dwconv-minmax-multipass-fp32
qu8-dwconv-minmax-multipass-rndnu
qu8-dwconv-minmax-unipass-fp32
qu8-dwconv-minmax-unipass-rndnu)
LIST(REMOVE_ITEM MICROKERNEL_DWCONV_UNIT_TESTS ${SHARDED_TESTS})
FOREACH(TEST ${MICROKERNEL_DWCONV_UNIT_TESTS})
ADD_EXECUTABLE(${TEST}-test test/${TEST}.cc)
TARGET_INCLUDE_DIRECTORIES(${TEST}-test PRIVATE include src test)
Expand All @@ -1550,38 +1570,39 @@ IF(XNNPACK_BUILD_TESTS)
SET(MICROKERNEL_GEMM_UNIT_TESTS
bf16-gemm-minmax
f16-f32acc-gemm-minmax
f16-gemm-minmax
f16-f32acc-igemm-minmax
f16-gemm-minmax
f16-igemm-minmax
qd8-f16-qc8w-gemm-minmax
f32-gemm
f32-gemm-relu
f32-gemm-minmax
f32-gemm-goi-minmax
f32-qc8w-gemm
f32-qc8w-gemm-relu
f32-qc8w-gemm-minmax
f32-qc4w-gemm-minmax
f32-gemm-minmax
f32-gemm-relu
f32-gemminc-minmax
f32-igemm
f32-igemm-relu
f32-igemm-minmax
f32-igemm-relu
f32-ppmm-minmax
qd8-f32-qc8w-gemm-minmax
f32-qc4w-gemm-minmax
f32-qc8w-gemm
f32-qc8w-gemm-minmax
f32-qc8w-gemm-relu
qd8-f16-qb4w-gemm-minmax
qd8-f16-qc4w-gemm-minmax
qd8-f16-qc8w-gemm-minmax
qd8-f32-qb4w-gemm-minmax
qd8-f32-qc4w-gemm-minmax
qd8-f32-qc8w-gemm-minmax
qd8-f32-qc8w-igemm-minmax
qp8-f32-qb4w-gemm-minmax
qp8-f32-qc4w-gemm-minmax
qp8-f32-qc8w-gemm-minmax
qp8-f32-qb4w-gemm-minmax
qs8-qc8w-gemm-minmax-fp32
qs8-qc8w-igemm-minmax-fp32
qu8-gemm-minmax-fp32
qu8-gemm-minmax-rndnu
qu8-igemm-minmax-fp32
qu8-igemm-minmax-rndnu)
LIST(REMOVE_ITEM MICROKERNEL_GEMM_UNIT_TESTS ${SHARDED_TESTS})
FOREACH(TEST ${MICROKERNEL_GEMM_UNIT_TESTS})
FILE(GLOB TEST_SOURCES "test/${TEST}*.cc")
IF(TEST_SOURCES)
Expand All @@ -1605,6 +1626,7 @@ IF(XNNPACK_BUILD_TESTS)

SET(MICROKERNEL_PACKQ_UNIT_TESTS
x8-packq)
LIST(REMOVE_ITEM MICROKERNEL_PACKQ_UNIT_TESTS ${SHARDED_TESTS})
FOREACH(TEST ${MICROKERNEL_PACKQ_UNIT_TESTS})
ADD_EXECUTABLE(${TEST}-test test/${TEST}.cc)
TARGET_INCLUDE_DIRECTORIES(${TEST}-test PRIVATE include src test)
Expand Down Expand Up @@ -1675,6 +1697,7 @@ IF(XNNPACK_BUILD_TESTS)
qu8-vmul-minmax-rndnu
qu8-vmulc-minmax-fp32
qu8-vmulc-minmax-rndnu)
LIST(REMOVE_ITEM MICROKERNEL_VBINARY_UNIT_TESTS ${SHARDED_TESTS})
FOREACH(TEST ${MICROKERNEL_VBINARY_UNIT_TESTS})
ADD_EXECUTABLE(${TEST}-test test/${TEST}.cc)
TARGET_INCLUDE_DIRECTORIES(${TEST}-test PRIVATE include src test)
Expand All @@ -1696,6 +1719,7 @@ IF(XNNPACK_BUILD_TESTS)
SET(MICROKERNEL_VCVT_TESTS
f16-f32-vcvt
f16-qs8-vcvt
f16-qu8-vcvt
f32-f16-vcvt
f32-qs8-vcvt
f32-qu8-vcvt
Expand All @@ -1704,6 +1728,7 @@ IF(XNNPACK_BUILD_TESTS)
qs8-vcvt
qu8-f32-vcvt
qu8-vcvt)
LIST(REMOVE_ITEM MICROKERNEL_VCVT_TESTS ${SHARDED_TESTS})
FOREACH(TEST ${MICROKERNEL_VCVT_TESTS})
ADD_EXECUTABLE(${TEST}-test test/${TEST}.cc)
TARGET_INCLUDE_DIRECTORIES(${TEST}-test PRIVATE include src test)
Expand All @@ -1728,31 +1753,38 @@ IF(XNNPACK_BUILD_TESTS)
f16-vhswish
f16-vlrelu
f16-vneg
f16-vrndd
f16-vrndne
f16-vrndz
f16-vrndu
f16-vrndd
f16-vrndz
f16-vrsqrt
f16-vsigmoid
f16-vsqr
f16-vsqrt
f16-vtanh
f32-vabs
f32-vhswish
f32-vgelu
f32-vclamp
f32-velu
f32-vexp
f32-vgelu
f32-vhswish
f32-vlog
f32-vlrelu
f32-vneg
f32-vrelu
f32-vrndd
f32-vrndne
f32-vrndz
f32-vrndu
f32-vrndd
f32-vrndz
f32-vrsqrt
f32-vsigmoid
f32-vsqr
f32-vsqrt
f32-vrsqrt
f32-vtanh)
f32-vtanh
qs8-vlrelu
s8-vclamp
u8-vclamp)
LIST(REMOVE_ITEM MICROKERNEL_VUNARY_TESTS ${SHARDED_TESTS})
FOREACH(TEST ${MICROKERNEL_VUNARY_TESTS})
ADD_EXECUTABLE(${TEST}-test test/${TEST}.cc)
TARGET_INCLUDE_DIRECTORIES(${TEST}-test PRIVATE include src test)
Expand Down
2 changes: 1 addition & 1 deletion bench/average-pooling.cc
Original file line number Diff line number Diff line change
Expand Up @@ -434,5 +434,5 @@ BENCHMARK_CAPTURE(xnnpack_average_pooling_qu8, shufflenet_v1_g4, "ShuffleNet v1
BENCHMARK_CAPTURE(xnnpack_average_pooling_qu8, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8)->UseRealTime();

#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
XNN_BENCHMARK_MAIN();
#endif
24 changes: 13 additions & 11 deletions bench/batch-matrix-multiply.cc
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,6 @@
#include "tensorflow/lite/version.h"
#endif // BENCHMARK_TENSORFLOW_LITE

namespace {
static const size_t kMinIterations = 10;
} // namespace

void xnnpack_batch_matrix_multiply_f32(benchmark::State& state,
const char* net) {
const size_t batch_size = state.range(0);
Expand Down Expand Up @@ -99,8 +95,9 @@ void xnnpack_batch_matrix_multiply_f32(benchmark::State& state,
return;
}

while (state.KeepRunningBatch(kMinIterations)) {
for (int iter = 0; iter < kMinIterations; iter++) {
int num_iters = FLAGS_benchmark_min_iters;
while (state.KeepRunningBatch(num_iters)) {
for (int iter = 0; iter < num_iters; iter++) {
benchmark::utils::WipePthreadpoolL2Caches(state, threadpool);

status = xnn_run_operator(op, threadpool);
Expand All @@ -109,6 +106,7 @@ void xnnpack_batch_matrix_multiply_f32(benchmark::State& state,
return;
}
}
num_iters = 1;
}

status = xnn_delete_operator(op);
Expand Down Expand Up @@ -207,8 +205,9 @@ void xnnpack_batch_matrix_multiply_qd8_f32_qc8w(benchmark::State& state,
return;
}

while (state.KeepRunningBatch(kMinIterations)) {
for (int iter = 0; iter < kMinIterations; iter++) {
int num_iters = FLAGS_benchmark_min_iters;
while (state.KeepRunningBatch(num_iters)) {
for (int iter = 0; iter < num_iters; iter++) {
benchmark::utils::WipePthreadpoolL2Caches(state, threadpool);

status = xnn_run_operator(op, threadpool);
Expand All @@ -218,6 +217,7 @@ void xnnpack_batch_matrix_multiply_qd8_f32_qc8w(benchmark::State& state,
return;
}
}
num_iters = 1;
}

status = xnn_delete_operator(op);
Expand Down Expand Up @@ -353,13 +353,15 @@ void tflite_batch_matrix_multiply_f32(benchmark::State& state,
interpreter->typed_tensor<float>(1) + batch_size * k * n,
std::ref(f32rng));

while (state.KeepRunningBatch(kMinIterations)) {
for (int iter = 0; iter < kMinIterations; iter++) {
int num_iters = FLAGS_benchmark_min_iters;
while (state.KeepRunningBatch(num_iters)) {
for (int iter = 0; iter < num_iters; iter++) {
if (interpreter->Invoke() != kTfLiteOk) {
state.SkipWithError("failed to invoke TFLite interpreter");
return;
}
}
num_iters = 1;
}

const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
Expand All @@ -376,5 +378,5 @@ void tflite_batch_matrix_multiply_f32(benchmark::State& state,
#endif // BENCHMARK_TENSORFLOW_LITE

#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
XNN_BENCHMARK_MAIN();
#endif
2 changes: 1 addition & 1 deletion bench/bf16-gemm.cc
Original file line number Diff line number Diff line change
Expand Up @@ -241,5 +241,5 @@ static void bf16_gemm(benchmark::State& state,
#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64

#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
XNN_BENCHMARK_MAIN();
#endif
Loading

0 comments on commit 4922366

Please sign in to comment.