Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Check that a PR driver is still running before trying to kill it #2799

Merged
merged 8 commits into from
Aug 7, 2024
8 changes: 4 additions & 4 deletions ci/scripts/check_ci.sh
Original file line number Diff line number Diff line change
Expand Up @@ -50,14 +50,14 @@ fi
export GH

rocotostat=$(command -v rocotostat)
if [[ -z ${rocotostat+x} ]]; then
if [[ -z ${rocotostat} ]]; then
echo "rocotostat not found on system"
exit 1
else
echo "rocotostat being used from ${rocotostat}"
fi
rocotocheck=$(command -v rocotocheck)
if [[ -z ${rocotocheck+x} ]]; then
if [[ -z ${rocotocheck} ]]; then
echo "rocotocheck not found on system"
exit 1
else
Expand All @@ -70,7 +70,7 @@ pr_list=""
if [[ -f "${pr_list_dbfile}" ]]; then
pr_list=$("${HOMEgfs}/ci/scripts/utils/pr_list_database.py" --dbfile "${pr_list_dbfile}" --list Open Running) || true
fi
if [[ -z "${pr_list+x}" ]]; then
if [[ -z "${pr_list}" ]]; then
echo "no PRs open and ready to run cases on .. exiting"
exit 0
fi
Expand Down Expand Up @@ -124,7 +124,7 @@ for pr in ${pr_list}; do

for pslot_dir in "${pr_dir}/RUNTESTS/EXPDIR/"*; do
pslot=$(basename "${pslot_dir}") || true
if [[ -z "${pslot+x}" ]]; then
if [[ -z "${pslot}" ]]; then
echo "No experiments found in ${pslot_dir} .. exiting"
exit 0
fi
Expand Down
11 changes: 7 additions & 4 deletions ci/scripts/driver.sh
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,9 @@ pr_list=$(${GH} pr list --repo "${REPO_URL}" --label "CI-${MACHINE_ID^}-Ready" -

for pr in ${pr_list}; do
pr_dir="${GFS_CI_ROOT}/PR/${pr}"
[[ ! -d ${pr_dir} ]] && mkdir -p "${pr_dir}"
db_list=$("${ROOT_DIR}/ci/scripts/utils/pr_list_database.py" --add_pr "${pr}" --dbfile "${pr_list_dbfile}")
output_ci_single="${GFS_CI_ROOT}/PR/${pr}/output_single.log"
output_ci_single="${pr_dir}/output_single.log"
#############################################################
# Check if a Ready labeled PR has changed back from once set
# and in that case completely kill the previous driver.sh cron
Expand Down Expand Up @@ -107,7 +108,9 @@ for pr in ${pr_list}; do
echo -e "${pstree_out}" | grep -Pow "(?<=\()[0-9]+(?=\))" | xargs kill
fi
else
ssh "${driver_HOST}" 'pstree -A -p "${driver_PID}" | grep -Eow "[0-9]+" | xargs kill'
# Check if the driver is still running on the head node; if so, kill it and all child processes
#shellcheck disable=SC2029
ssh "${driver_HOST}" "pstree -A -p \"${driver_PID}\" | grep -Eow \"[0-9]+\" | xargs kill || echo \"Failed to kill process with PID: ${driver_PID}, it may not be valid.\""
fi
{
echo "Driver PID: Requested termination of ${driver_PID} and children on ${driver_HOST}"
Expand Down Expand Up @@ -141,7 +144,7 @@ pr_list=""
if [[ -f "${pr_list_dbfile}" ]]; then
pr_list=$("${ROOT_DIR}/ci/scripts/utils/pr_list_database.py" --dbfile "${pr_list_dbfile}" --list Open Ready) || true
fi
if [[ -z "${pr_list+x}" ]]; then
if [[ -z "${pr_list}" ]]; then
echo "no PRs open and ready for checkout/build .. exiting"
exit 0
fi
Expand All @@ -155,7 +158,7 @@ fi
for pr in ${pr_list}; do
# Skip PRs that are currently Building, in case overlapping driver scripts are being called from within cron
pr_building=$("${ROOT_DIR}/ci/scripts/utils/pr_list_database.py" --display "${pr}" --dbfile "${pr_list_dbfile}" | grep Building) || true
if [[ -z "${pr_building+x}" ]]; then
if [[ -n "${pr_building}" ]]; then
continue
fi
id=$("${GH}" pr view "${pr}" --repo "${REPO_URL}" --json id --jq '.id')
Expand Down
Loading