From e38647ffbcc6507c172e14cd4210b373ee950b78 Mon Sep 17 00:00:00 2001 From: shimmybalsam <38699071+shimmybalsam@users.noreply.github.com> Date: Mon, 4 Apr 2022 20:54:06 +0300 Subject: [PATCH] TL/UCP: reduce_kn avg pre op fix (#449) (#454) * TL/UCP: reduce_kn avg pre op fix * TL/UCP: code review fixes Co-authored-by: valentin petrov Co-authored-by: valentin petrov --- src/components/tl/ucp/reduce/reduce.c | 18 ++++-- src/components/tl/ucp/reduce/reduce_knomial.c | 58 +++++++++++++++---- 2 files changed, 61 insertions(+), 15 deletions(-) diff --git a/src/components/tl/ucp/reduce/reduce.c b/src/components/tl/ucp/reduce/reduce.c index 85ba57d085..9003b01a8b 100644 --- a/src/components/tl/ucp/reduce/reduce.c +++ b/src/components/tl/ucp/reduce/reduce.c @@ -20,16 +20,21 @@ ucc_status_t ucc_tl_ucp_reduce_init(ucc_tl_ucp_task_t *task) ucc_rank_t vrank = (myrank - root + team_size) % team_size; ucc_status_t status = UCC_OK; ucc_memory_type_t mtype; - size_t data_size; + ucc_datatype_t dt; + size_t count, data_size; int isleaf; + int self_avg; if (root == myrank) { - data_size = args->dst.info.count * ucc_dt_size(args->dst.info.datatype); + count = args->dst.info.count; + dt = args->dst.info.datatype; mtype = args->dst.info.mem_type; } else { - data_size = args->src.info.count * ucc_dt_size(args->src.info.datatype); + count = args->src.info.count; + dt = args->src.info.datatype; mtype = args->src.info.mem_type; } + data_size = count * ucc_dt_size(dt); task->super.post = ucc_tl_ucp_reduce_knomial_start; task->super.progress = ucc_tl_ucp_reduce_knomial_progress; task->super.finalize = ucc_tl_ucp_reduce_knomial_finalize; @@ -37,10 +42,12 @@ ucc_status_t ucc_tl_ucp_reduce_init(ucc_tl_ucp_task_t *task) ucc_min(UCC_TL_UCP_TEAM_LIB(team)->cfg.reduce_kn_radix, team_size); CALC_KN_TREE_DIST(team_size, task->reduce_kn.radix, task->reduce_kn.max_dist); - isleaf = (vrank % task->reduce_kn.radix != 0 || vrank == team_size - 1); + isleaf = (vrank % task->reduce_kn.radix != 0 || vrank == team_size - 1); + self_avg = (vrank % task->reduce_kn.radix == 0 && args->op == UCC_OP_AVG && + UCC_TL_UCP_TEAM_LIB(team)->cfg.reduce_avg_pre_op); task->reduce_kn.scratch_mc_header = NULL; - if (!isleaf) { + if (!isleaf || self_avg) { /* scratch of size radix to fit up to radix - 1 recieved vectors from its children at each step, and an additional 1 for previous step reduce multi result */ @@ -49,5 +56,6 @@ ucc_status_t ucc_tl_ucp_reduce_init(ucc_tl_ucp_task_t *task) task->reduce_kn.scratch = task->reduce_kn.scratch_mc_header->addr; } + return status; } diff --git a/src/components/tl/ucp/reduce/reduce_knomial.c b/src/components/tl/ucp/reduce/reduce_knomial.c index dbf21ae6ef..9ba75eb626 100644 --- a/src/components/tl/ucp/reduce/reduce_knomial.c +++ b/src/components/tl/ucp/reduce/reduce_knomial.c @@ -124,15 +124,37 @@ void ucc_tl_ucp_reduce_knomial_progress(ucc_coll_task_t *coll_task) ucc_status_t ucc_tl_ucp_reduce_knomial_start(ucc_coll_task_t *coll_task) { - ucc_tl_ucp_task_t *task = ucc_derived_of(coll_task, ucc_tl_ucp_task_t); - ucc_coll_args_t *args = &TASK_ARGS(task); - ucc_tl_ucp_team_t *team = TASK_TEAM(task); - uint32_t radix = task->reduce_kn.radix; - ucc_rank_t root = (ucc_rank_t)args->root; - ucc_rank_t rank = UCC_TL_TEAM_RANK(team); - ucc_rank_t size = UCC_TL_TEAM_SIZE(team); - ucc_rank_t vrank = (rank - root + size) % size; - int isleaf = ((vrank % radix != 0) || (vrank == size - 1)); + ucc_tl_ucp_task_t *task = + ucc_derived_of(coll_task, ucc_tl_ucp_task_t); + ucc_coll_args_t *args = &TASK_ARGS(task); + ucc_tl_ucp_team_t *team = TASK_TEAM(task); + uint32_t radix = task->reduce_kn.radix; + ucc_rank_t root = (ucc_rank_t)args->root; + ucc_rank_t rank = UCC_TL_TEAM_RANK(team); + ucc_rank_t size = UCC_TL_TEAM_SIZE(team); + ucc_rank_t vrank = (rank - root + size) % size; + int isleaf = + (vrank % radix != 0 || vrank == size - 1); + int avg_pre_op = + UCC_TL_UCP_TEAM_LIB(team)->cfg.reduce_avg_pre_op; + int self_avg = (args->op == UCC_OP_AVG && + avg_pre_op && vrank % radix == 0); + size_t data_size, count; + ucc_memory_type_t mtype; + ucc_datatype_t dt; + ucc_status_t status; + + if (root == rank) { + count = args->dst.info.count; + dt = args->dst.info.datatype; + mtype = args->dst.info.mem_type; + } else { + count = args->src.info.count; + dt = args->src.info.datatype; + mtype = args->src.info.mem_type; + } + data_size = count * ucc_dt_size(dt); + UCC_TL_UCP_PROFILE_REQUEST_EVENT(coll_task, "ucp_reduce_kn_start", 0); ucc_tl_ucp_task_reset(task, UCC_INPROGRESS); @@ -141,10 +163,26 @@ ucc_status_t ucc_tl_ucp_reduce_knomial_start(ucc_coll_task_t *coll_task) args->src.info.buffer = args->dst.info.buffer; } - if (isleaf) { + if (isleaf && !self_avg) { task->reduce_kn.scratch = args->src.info.buffer; } + if (isleaf && self_avg) { + /* In case of avg_pre_op, single leaf process which does not take part + in first iteration reduction must divide itself by team_size */ + status = ucc_dt_reduce_multi_alpha(args->src.info.buffer, + args->src.info.buffer, task->reduce_kn.scratch, 1, count, + data_size, dt, UCC_OP_PROD, + (double)1 / (double)(UCC_TL_TEAM_SIZE(TASK_TEAM(task)) * 2), + mtype, args); + if (ucc_unlikely(UCC_OK != status)) { + tl_error(UCC_TASK_LIB(task), + "failed to perform dt reduction"); + task->super.super.status = status; + return status; + } + } + task->reduce_kn.dist = 1; task->reduce_kn.phase = UCC_REDUCE_KN_PHASE_INIT;