TL/UCP: Allow self copy in allgather using network loopback #1021

Open. Wants to merge 3 commits into base: master.
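Summary of the diff: it adds a loopback_self_copy() helper that performs a rank's self copy in allgather by posting a nonblocking UCP send/recv to itself (network loopback) instead of ucc_mc_memcpy or an executor copy task, gated by an allgather_use_loopback lib config flag, and wires it into the bruck, knomial, and neighbor allgather algorithms.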
19 changes: 19 additions & 0 deletions src/components/tl/ucp/allgather/allgather.c
@@ -58,3 +58,22 @@ char *ucc_tl_ucp_allgather_score_str_get(ucc_tl_ucp_team_t *team)
UCC_TL_UCP_ALLGATHER_DEFAULT_ALG_SELECT_STR, algo_num);
return str;
}

ucc_status_t loopback_self_copy(void *rbuf, void *sbuf, size_t data_size,
                                ucc_memory_type_t rmem, ucc_memory_type_t smem,
                                ucc_rank_t rank, ucc_tl_ucp_team_t *team,
                                ucc_tl_ucp_task_t *task)
{
    ucc_status_t status;
    status = ucc_tl_ucp_send_nb(sbuf, data_size, smem, rank, team, task);
    if (UCC_OK != status) {
        task->super.status = status;
        return task->super.status;
    }
    status = ucc_tl_ucp_recv_nb(rbuf, data_size, rmem, rank, team, task);
    if (UCC_OK != status) {
        task->super.status = status;
        return task->super.status;
    }
    return UCC_OK;
}
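The helper only posts the two nonblocking operations and does not wait for them; completion is driven by the normal task test/progress path. A minimal usage sketch (illustrative only, not part of this diff), using only calls that already appear in this change:

    ucc_status_t status;

    status = loopback_self_copy(rbuf, sbuf, data_size, rmem, smem, rank, team, task);
    if (ucc_unlikely(UCC_OK != status)) {
        return status;
    }
    /* ... later, in the task's progress function ... */
    if (UCC_INPROGRESS == ucc_tl_ucp_test(task)) {
        return; /* re-entered by the progress engine until the self send/recv complete */
    }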
6 changes: 6 additions & 0 deletions src/components/tl/ucp/allgather/allgather.h
@@ -7,6 +7,7 @@
#define ALLGATHER_H_
#include "../tl_ucp.h"
#include "../tl_ucp_coll.h"
#include "tl_ucp_sendrecv.h"

enum {
UCC_TL_UCP_ALLGATHER_ALG_KNOMIAL,
@@ -38,6 +39,11 @@ static inline int ucc_tl_ucp_allgather_alg_from_str(const char *str)

ucc_status_t ucc_tl_ucp_allgather_init(ucc_tl_ucp_task_t *task);

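/* Copy data_size bytes from sbuf to rbuf on the calling rank by posting a
nonblocking send and receive to itself (network loopback) instead of a
memory-type copy; completion is driven through the task's p2p test path. */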
ucc_status_t loopback_self_copy(void *rbuf, void *sbuf, size_t data_size,
ucc_memory_type_t rmem, ucc_memory_type_t smem,
ucc_rank_t rank, ucc_tl_ucp_team_t *team,
ucc_tl_ucp_task_t *task);

/* Ring */
ucc_status_t ucc_tl_ucp_allgather_ring_init(ucc_base_coll_args_t *coll_args,
ucc_base_team_t *team,
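Note: the cfg.allgather_use_loopback flag read throughout the .c changes below is not declared anywhere in this diff; presumably it is registered in the TL/UCP lib config table in a separate commit. A sketch of what such an entry typically looks like in UCC (the option name, default, and description here are assumptions, not taken from this PR):

    {"ALLGATHER_USE_LOOPBACK", "n",
     "Use network loopback send/recv instead of a local memory copy for the self copy in allgather",
     ucc_offsetof(ucc_tl_ucp_lib_config_t, allgather_use_loopback),
     UCC_CONFIG_TYPE_BOOL},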
107 changes: 77 additions & 30 deletions src/components/tl/ucp/allgather/allgather_bruck.c
@@ -107,6 +108,8 @@ void ucc_tl_ucp_allgather_bruck_progress(ucc_coll_task_t *coll_task)
task->allgather_bruck.scratch_header;
size_t scratch_size = task->allgather_bruck.scratch_size;
size_t data_size = (count / tsize) * ucc_dt_size(dt);
int use_loopback = UCC_TL_UCP_TEAM_LIB(team)->cfg.allgather_use_loopback;
ucc_rank_t rank = ucc_ep_map_eval(task->subset.map, trank);
ucc_rank_t recvfrom, sendto;
ucc_status_t status;
size_t blockcount, distance;
@@ -115,9 +117,13 @@ void ucc_tl_ucp_allgather_bruck_progress(ucc_coll_task_t *coll_task)
if (UCC_INPROGRESS == ucc_tl_ucp_test(task)) {
return;
}


/* On each step doubles distance */
distance = 1 << task->tagged.recv_posted;
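/* With loopback enabled the initial self copy has already been counted as one
posted recv, so the starting distance is shifted back by one step (apparent
intent of this change). */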
distance = use_loopback ? 1 << (task->tagged.recv_posted - 1) : 1 << task->tagged.recv_posted;
tmpsend = rbuf;
while (distance < tsize) {

@@ -153,24 +159,35 @@ void ucc_tl_ucp_allgather_bruck_progress(ucc_coll_task_t *coll_task)
}

/* post processing step */
if (trank != 0) {
if ((trank != 0) && (task->allgather_bruck.post_processing_step == 1)) {
task->allgather_bruck.post_processing_step = 0;
if (UCC_MEMORY_TYPE_HOST == rmem) {
// copy blocks [0 .. (size - rank - 1)] from rbuf to shift buffer
status = ucc_mc_memcpy(scratch_header->addr, rbuf, scratch_size,
UCC_MEMORY_TYPE_HOST, rmem);
if (use_loopback) {
status = loopback_self_copy(scratch_header->addr, rbuf, scratch_size,
UCC_MEMORY_TYPE_HOST, rmem, rank, team, task);
} else {
status = ucc_mc_memcpy(scratch_header->addr, rbuf, scratch_size,
UCC_MEMORY_TYPE_HOST, rmem);
}
if (ucc_unlikely(status != UCC_OK)) {
tl_error(UCC_TASK_LIB(task),
"failed to copy data to scratch buffer");
"failed to copy data to scratch buffer");
task->super.status = status;
return;
}
}
// move blocks [(size - rank) .. size] from rbuf to beginning of rbuf
// TODO: rewrite to cycle to get rid of overlap
memmove(rbuf, PTR_OFFSET(rbuf, scratch_size), trank * data_size);
// copy blocks from shift buffer starting at block [rank] in rbuf.
status = ucc_mc_memcpy(PTR_OFFSET(rbuf, trank * data_size),
if (use_loopback) {
status = loopback_self_copy(PTR_OFFSET(rbuf, trank * data_size), scratch_header->addr, scratch_size,
rmem, UCC_MEMORY_TYPE_HOST, rank, team, task);
} else {
status = ucc_mc_memcpy(PTR_OFFSET(rbuf, trank * data_size),
scratch_header->addr, scratch_size, rmem,
UCC_MEMORY_TYPE_HOST);
}
if (ucc_unlikely(status != UCC_OK)) {
tl_error(UCC_TASK_LIB(task),
"failed to copy data from scratch to rbuff buffer");
@@ -180,28 +197,43 @@ void ucc_tl_ucp_allgather_bruck_progress(ucc_coll_task_t *coll_task)
} else {
/* In case of non-host memory we perform two copies to the host buffer and then one back to the device, 3 memcpys in total */
/* TODO: replace with generic kernel to do the bruck post step in a single launch on device */
status = ucc_mc_memcpy(
PTR_OFFSET(scratch_header->addr, trank * data_size), rbuf,
(tsize - trank) * data_size, UCC_MEMORY_TYPE_HOST, rmem);
if (use_loopback) {
status = loopback_self_copy(PTR_OFFSET(scratch_header->addr, trank * data_size), rbuf, (tsize - trank) * data_size,
UCC_MEMORY_TYPE_HOST, rmem, rank, team, task);
} else {
status = ucc_mc_memcpy(
PTR_OFFSET(scratch_header->addr, trank * data_size), rbuf,
(tsize - trank) * data_size, UCC_MEMORY_TYPE_HOST, rmem);
}
if (ucc_unlikely(status != UCC_OK)) {
tl_error(UCC_TASK_LIB(task),
"failed to copy first data part to scratch buffer");
task->super.status = status;
return;
}
status =
ucc_mc_memcpy(scratch_header->addr,
PTR_OFFSET(rbuf, (tsize - trank) * data_size),
trank * data_size, UCC_MEMORY_TYPE_HOST, rmem);
if (use_loopback) {
status = loopback_self_copy(scratch_header->addr, PTR_OFFSET(rbuf, (tsize - trank) * data_size), trank * data_size,
UCC_MEMORY_TYPE_HOST, rmem, rank, team, task);
} else {
status =
ucc_mc_memcpy(scratch_header->addr,
PTR_OFFSET(rbuf, (tsize - trank) * data_size),
trank * data_size, UCC_MEMORY_TYPE_HOST, rmem);
}
if (ucc_unlikely(status != UCC_OK)) {
tl_error(UCC_TASK_LIB(task),
"failed to copy second data part to scratch buffer");
task->super.status = status;
return;
}
status =
ucc_mc_memcpy(rbuf, scratch_header->addr, tsize * data_size,
rmem, UCC_MEMORY_TYPE_HOST);
if (use_loopback) {
status = loopback_self_copy(rbuf, scratch_header->addr, tsize * data_size,
rmem, UCC_MEMORY_TYPE_HOST, rank, team, task);
} else {
status =
ucc_mc_memcpy(rbuf, scratch_header->addr, tsize * data_size,
rmem, UCC_MEMORY_TYPE_HOST);
}
if (ucc_unlikely(status != UCC_OK)) {
tl_error(UCC_TASK_LIB(task),
"failed to copy from scratch buffer to dst");
@@ -211,6 +243,10 @@ void ucc_tl_ucp_allgather_bruck_progress(ucc_coll_task_t *coll_task)
}
}

/* The post-processing loopback copies post extra p2p operations; wait for them to complete. */
if (UCC_INPROGRESS == ucc_tl_ucp_test(task)) {
return;
}
ucc_assert(UCC_TL_UCP_TASK_P2P_COMPLETE(task));

task->super.status = UCC_OK;
@@ -221,33 +257,44 @@ void ucc_tl_ucp_allgather_bruck_progress(ucc_coll_task_t *coll_task)

ucc_status_t ucc_tl_ucp_allgather_bruck_start(ucc_coll_task_t *coll_task)
{
ucc_tl_ucp_task_t *task = ucc_derived_of(coll_task, ucc_tl_ucp_task_t);
ucc_tl_ucp_team_t *team = TASK_TEAM(task);
size_t count = TASK_ARGS(task).dst.info.count;
void *sbuf = TASK_ARGS(task).src.info.buffer;
void *rbuf = TASK_ARGS(task).dst.info.buffer;
ucc_memory_type_t smem = TASK_ARGS(task).src.info.mem_type;
ucc_memory_type_t rmem = TASK_ARGS(task).dst.info.mem_type;
ucc_datatype_t dt = TASK_ARGS(task).dst.info.datatype;
ucc_rank_t trank = UCC_TL_TEAM_RANK(team);
ucc_rank_t tsize = UCC_TL_TEAM_SIZE(team);
size_t data_size = (count / tsize) * ucc_dt_size(dt);
ucc_tl_ucp_task_t *task = ucc_derived_of(coll_task, ucc_tl_ucp_task_t);
ucc_tl_ucp_team_t *team = TASK_TEAM(task);
size_t count = TASK_ARGS(task).dst.info.count;
void *sbuf = TASK_ARGS(task).src.info.buffer;
void *rbuf = TASK_ARGS(task).dst.info.buffer;
ucc_memory_type_t smem = TASK_ARGS(task).src.info.mem_type;
ucc_memory_type_t rmem = TASK_ARGS(task).dst.info.mem_type;
ucc_datatype_t dt = TASK_ARGS(task).dst.info.datatype;
ucc_rank_t trank = UCC_TL_TEAM_RANK(team);
ucc_rank_t tsize = UCC_TL_TEAM_SIZE(team);
size_t data_size = (count / tsize) * ucc_dt_size(dt);
int use_loopback = UCC_TL_UCP_TEAM_LIB(team)->cfg.allgather_use_loopback;
ucc_rank_t rank = ucc_ep_map_eval(task->subset.map, trank);
ucc_status_t status;

UCC_TL_UCP_PROFILE_REQUEST_EVENT(coll_task, "ucp_allgather_bruck_start", 0);
ucc_tl_ucp_task_reset(task, UCC_INPROGRESS);
task->allgather_bruck.post_processing_step = 1;

/* initial step: copy data on non root ranks to the beginning of buffer */
if (!UCC_IS_INPLACE(TASK_ARGS(task))) {
// not inplace: copy chunk from source buff to beginning of receive
status = ucc_mc_memcpy(rbuf, sbuf, data_size, rmem, smem);
if (use_loopback) {
status = loopback_self_copy(rbuf, sbuf, data_size, rmem, smem, rank, team, task);
} else {
status = ucc_mc_memcpy(rbuf, sbuf, data_size, rmem, smem);
}
if (ucc_unlikely(UCC_OK != status)) {
return status;
}
} else if (trank != 0) {
// inplace: copy chunk to the begin
status = ucc_mc_memcpy(rbuf, PTR_OFFSET(rbuf, data_size * trank),
if (use_loopback) {
status = loopback_self_copy(rbuf, PTR_OFFSET(rbuf, data_size * trank), data_size, rmem, rmem, rank, team, task);
} else {
status = ucc_mc_memcpy(rbuf, PTR_OFFSET(rbuf, data_size * trank),
data_size, rmem, rmem);
}
if (ucc_unlikely(UCC_OK != status)) {
return status;
}
74 changes: 47 additions & 27 deletions src/components/tl/ucp/allgather/allgather_knomial.c
@@ -13,6 +13,7 @@
#include "coll_patterns/sra_knomial.h"
#include "utils/ucc_math.h"
#include "utils/ucc_coll_utils.h"
#include "allgather.h"

#define SAVE_STATE(_phase) \
do { \
@@ -54,22 +55,21 @@

void ucc_tl_ucp_allgather_knomial_progress(ucc_coll_task_t *coll_task)
{
ucc_tl_ucp_task_t *task = ucc_derived_of(coll_task,
ucc_tl_ucp_task_t);
ucc_coll_args_t *args = &TASK_ARGS(task);
ucc_tl_ucp_team_t *team = TASK_TEAM(task);
ucc_kn_radix_t radix = task->allgather_kn.p.radix;
ucc_tl_ucp_task_t * task = ucc_derived_of(coll_task, ucc_tl_ucp_task_t);
Review comment (Collaborator): Suggestion is not to reformat existing code (the "*" placement), because clang-format formats differently depending on the version, so it is better to leave it as before. @samnordmann @Sergei-Lebedev what do you think?

Review comment (Collaborator): Indeed, clang-format often does this undesirable reformatting... We should have a long-term fix -- in the meantime I suggest manually placing the "*" where it should be.
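For illustration, the two styles in question, both taken from this hunk (the first is the pre-existing style the reviewers want to keep, the second is what clang-format produced):

    ucc_tl_ucp_task_t *task = ucc_derived_of(coll_task, ucc_tl_ucp_task_t);   /* before */
    ucc_tl_ucp_task_t * task = ucc_derived_of(coll_task, ucc_tl_ucp_task_t);  /* after reformatting */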

ucc_coll_args_t * args = &TASK_ARGS(task);
ucc_tl_ucp_team_t * team = TASK_TEAM(task);
ucc_kn_radix_t radix = task->allgather_kn.p.radix;
uint8_t node_type = task->allgather_kn.p.node_type;
ucc_knomial_pattern_t *p = &task->allgather_kn.p;
void *rbuf = GET_DST(args);
void * rbuf = GET_DST(args);
ucc_memory_type_t mem_type = GET_MT(args);
size_t dt_size = ucc_dt_size(GET_DT(args));
ucc_rank_t size = task->subset.map.ep_num;
size_t data_size = GET_TOTAL_COUNT(args, size);
ucc_rank_t broot = args->coll_type == UCC_COLL_TYPE_BCAST ?
args->root : 0;
ucc_rank_t rank = VRANK(task->subset.myrank, broot, size);
size_t local = GET_LOCAL_COUNT(args, size, rank);
ucc_rank_t broot = args->coll_type == UCC_COLL_TYPE_BCAST ? args->root : 0;
ucc_rank_t rank = VRANK(task->subset.myrank, broot, size);
size_t local = GET_LOCAL_COUNT(args, size, rank);
int use_loopback = UCC_TL_UCP_TEAM_LIB(team)->cfg.allgather_use_loopback;
void *sbuf;
ptrdiff_t peer_seg_offset, local_seg_offset;
ucc_rank_t peer, peer_dist;
@@ -78,8 +78,14 @@ void ucc_tl_ucp_allgather_knomial_progress(ucc_coll_task_t *coll_task)
ucc_status_t status;
size_t extra_count;

EXEC_TASK_TEST(UCC_KN_PHASE_INIT, "failed during ee task test",
task->allgather_kn.etask);
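/* With loopback enabled the initial self copy is posted as a UCP send/recv pair
rather than an executor copy task, so completion is checked with ucc_tl_ucp_test
instead of the executor task test. */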
if (use_loopback) {
if (UCC_INPROGRESS == ucc_tl_ucp_test(task)) {
return;
}
} else {
EXEC_TASK_TEST(UCC_KN_PHASE_INIT, "failed during ee task test",
task->allgather_kn.etask);
}
task->allgather_kn.etask = NULL;
UCC_KN_GOTO_PHASE(task->allgather_kn.phase);
if (KN_NODE_EXTRA == node_type) {
@@ -209,6 +215,7 @@ ucc_status_t ucc_tl_ucp_allgather_knomial_start(ucc_coll_task_t *coll_task)
ct == UCC_COLL_TYPE_BCAST ?
args->root : 0, size);
ucc_ee_executor_task_args_t eargs = {0};
int use_loopback = UCC_TL_UCP_TEAM_LIB(team)->cfg.allgather_use_loopback;
ucc_status_t status;
ptrdiff_t offset;
ucc_ee_executor_t *exec;
@@ -225,21 +232,34 @@ ucc_status_t ucc_tl_ucp_allgather_knomial_start(ucc_coll_task_t *coll_task)
ucc_dt_size(args->dst.info.datatype);
rbuf = args->dst.info.buffer;
if (!UCC_IS_INPLACE(*args)) {
status = ucc_coll_task_get_executor(&task->super, &exec);
if (ucc_unlikely(status != UCC_OK)) {
task->super.status = status;
return status;
}
eargs.task_type = UCC_EE_EXECUTOR_TASK_COPY;
eargs.copy.dst = PTR_OFFSET(args->dst.info.buffer, offset);
eargs.copy.src = args->src.info.buffer;
eargs.copy.len = args->src.info.count *
ucc_dt_size(args->src.info.datatype);
status = ucc_ee_executor_task_post(exec, &eargs,
&task->allgather_kn.etask);
if (ucc_unlikely(status != UCC_OK)) {
task->super.status = status;
return status;
if (use_loopback) {
status = loopback_self_copy(
PTR_OFFSET(args->dst.info.buffer, offset),
args->src.info.buffer,
args->src.info.count * ucc_dt_size(args->src.info.datatype),
args->dst.info.mem_type, args->src.info.mem_type, rank,
team, task);
if (ucc_unlikely(status != UCC_OK)) {
return status;
}
} else {
/* Executor copy */
status = ucc_coll_task_get_executor(&task->super, &exec);
if (ucc_unlikely(status != UCC_OK)) {
task->super.status = status;
return status;
}
eargs.task_type = UCC_EE_EXECUTOR_TASK_COPY;
eargs.copy.dst = PTR_OFFSET(args->dst.info.buffer, offset);
eargs.copy.src = args->src.info.buffer;
eargs.copy.len =
args->src.info.count * ucc_dt_size(args->src.info.datatype);
status = ucc_ee_executor_task_post(exec, &eargs,
&task->allgather_kn.etask);
if (ucc_unlikely(status != UCC_OK)) {
task->super.status = status;
return status;
}
}
}
} else if (ct == UCC_COLL_TYPE_ALLGATHERV) {
16 changes: 12 additions & 4 deletions src/components/tl/ucp/allgather/allgather_neighbor.c
@@ -133,14 +133,15 @@ ucc_status_t ucc_tl_ucp_allgather_neighbor_start(ucc_coll_task_t *coll_task)
ucc_tl_ucp_task_t *task = ucc_derived_of(coll_task, ucc_tl_ucp_task_t);
ucc_tl_ucp_team_t *team = TASK_TEAM(task);
size_t count = TASK_ARGS(task).dst.info.count;
void *sbuf = TASK_ARGS(task).src.info.buffer;
void *rbuf = TASK_ARGS(task).dst.info.buffer;
void * sbuf = TASK_ARGS(task).src.info.buffer;
void * rbuf = TASK_ARGS(task).dst.info.buffer;
ucc_memory_type_t smem = TASK_ARGS(task).src.info.mem_type;
ucc_memory_type_t rmem = TASK_ARGS(task).dst.info.mem_type;
ucc_datatype_t dt = TASK_ARGS(task).dst.info.datatype;
ucc_rank_t trank = UCC_TL_TEAM_RANK(team);
ucc_rank_t tsize = UCC_TL_TEAM_SIZE(team);
size_t data_size = (count / tsize) * ucc_dt_size(dt);
int use_loopback = UCC_TL_UCP_TEAM_LIB(team)->cfg.allgather_use_loopback;
ucc_status_t status;
ucc_rank_t neighbor;
void *tmprecv, *tmpsend;
@@ -150,8 +151,15 @@ ucc_status_t ucc_tl_ucp_allgather_neighbor_start(ucc_coll_task_t *coll_task)
ucc_tl_ucp_task_reset(task, UCC_INPROGRESS);

if (!UCC_IS_INPLACE(TASK_ARGS(task))) {
status = ucc_mc_memcpy(PTR_OFFSET(rbuf, data_size * trank), sbuf,
data_size, rmem, smem);
if (use_loopback) {
status =
loopback_self_copy(PTR_OFFSET(rbuf, data_size * trank), sbuf,
data_size, rmem, smem, trank, team, task);
} else {
/* Use memory component copy */
status = ucc_mc_memcpy(PTR_OFFSET(rbuf, data_size * trank), sbuf,
data_size, rmem, smem);
}
if (ucc_unlikely(UCC_OK != status)) {
return status;
}