Skip to content

Commit

Permalink
Merge pull request #3165 from branfosj/20240207121631_new_pr_tensorflow
Browse files: browse the repository at this point in the history
replace `run_cmd` with `run_shell_cmd` in custom easyblock for TensorFlow (`tensorflow.py`)
  • Loading branch information
boegel authored Feb 7, 2024
2 parents 55c6558 + 4dc075e commit e565dc5
Showing 1 changed file with 23 additions and 22 deletions.
45 changes: 23 additions & 22 deletions easybuild/easyblocks/t/tensorflow.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -48,7 +48,7 @@
from easybuild.tools.filetools import adjust_permissions, apply_regex_substitutions, copy_file, mkdir, resolve_path
from easybuild.tools.filetools import is_readable, read_file, symlink, which, write_file, remove_file
from easybuild.tools.modules import get_software_root, get_software_version, get_software_libdir
from easybuild.tools.run import run_cmd
from easybuild.tools.run import run_shell_cmd
from easybuild.tools.systemtools import AARCH64, X86_64, get_cpu_architecture, get_os_name, get_os_version
from easybuild.tools.toolchain.toolchain import RPATH_WRAPPERS_SUBDIR

Expand Down Expand Up @@ -273,9 +273,9 @@ def __init__(self, *args, **kwargs):
def python_pkg_exists(self, name):
"""Check if the given python package exists/can be imported"""
cmd = self.python_cmd + " -c 'import %s'" % name
out, ec = run_cmd(cmd, log_ok=False)
self.log.debug('Existence check for %s returned %s with output: %s', name, ec, out)
return ec == 0
res = run_shell_cmd(cmd, fail_on_error=False)
self.log.debug('Existence check for %s returned %s with output: %s', name, res.exit_code, res.output)
return res.exit_code == 0

def handle_jemalloc(self):
"""Figure out whether jemalloc support should be enabled or not."""
Expand Down Expand Up @@ -714,7 +714,7 @@ def configure_step(self):
apply_regex_substitutions('configure.py', regex_subs)

cmd = self.cfg['preconfigopts'] + './configure ' + self.cfg['configopts']
run_cmd(cmd, log_all=True, simple=True)
run_shell_cmd(cmd)

# when building on Arm 64-bit we can't just use --copt=-mcpu=native (or likewise for any -mcpu=...),
# because it breaks the build of XNNPACK;
Expand Down Expand Up @@ -938,11 +938,11 @@ def build_step(self):
+ ['//tensorflow/tools/pip_package:build_pip_package']
)

run_cmd(' '.join(cmd), log_all=True, simple=True, log_ok=True)
run_shell_cmd(' '.join(cmd))

# run generated 'build_pip_package' script to build the .whl
cmd = "bazel-bin/tensorflow/tools/pip_package/build_pip_package %s" % self.builddir
run_cmd(cmd, log_all=True, simple=True, log_ok=True)
run_shell_cmd(cmd)

def test_step(self):
"""Run TensorFlow unit tests"""
Expand Down Expand Up @@ -971,14 +971,15 @@ def test_step(self):
num_gpus_to_use = 0
else:
# determine number of available GPUs via nvidia-smi command, fall back to just 1 GPU
# Note: Disable logging to also disable the error handling in run_cmd and do it explicitly below
(out, ec) = run_cmd("nvidia-smi --list-gpus", log_ok=False, log_all=False, regexp=False)
# Note: Disable checking exit code in run_shell_cmd, and do it explicitly below
res = run_shell_cmd("nvidia-smi --list-gpus", fail_on_error=False)
try:
if ec != 0:
raise RuntimeError("nvidia-smi returned exit code %s with output:\n%s" % (ec, out))
if res.exit_code != 0:
raise RuntimeError("nvidia-smi returned exit code %s with output:\n%s" % (res.exit_code,
res.output))
else:
self.log.info('nvidia-smi succeeded with output:\n%s' % out)
gpu_ct = sum(line.startswith('GPU ') for line in out.strip().split('\n'))
self.log.info('nvidia-smi succeeded with output:\n%s' % res.output)
gpu_ct = sum(line.startswith('GPU ') for line in res.output.strip().split('\n'))
except (RuntimeError, ValueError) as err:
self.log.warning("Failed to get the number of GPUs on this system: %s", err)
gpu_ct = 0
Expand Down Expand Up @@ -1065,16 +1066,16 @@ def test_step(self):
+ test_targets
)

stdouterr, ec = run_cmd(cmd, log_ok=False, simple=False)
if ec:
res = run_shell_cmd(cmd, fail_on_error=False)
if res.exit_code:
fail_msg = 'Tests on %s (cmd: %s) failed with exit code %s and output:\n%s' % (
device, cmd, ec, stdouterr)
device, cmd, res.exit_code, res.output)
self.log.warning(fail_msg)
# Try to enhance error message
failed_tests = []
failed_test_logs = dict()
# Bazel outputs failed tests like "//tensorflow/c:kernels_test FAILED in[...]"
for match in re.finditer(r'^(//[a-zA-Z_/:]+)\s+FAILED', stdouterr, re.MULTILINE):
for match in re.finditer(r'^(//[a-zA-Z_/:]+)\s+FAILED', res.output, re.MULTILINE):
test_name = match.group(1)
failed_tests.append(test_name)
# Logs are in a folder named after the test, e.g. tensorflow/c/kernels_test
Expand All @@ -1083,7 +1084,7 @@ def test_step(self):
# <prefix>/k8-opt/testlogs/tensorflow/c/kernels_test/test.log
# <prefix>/k8-opt/testlogs/tensorflow/c/kernels_test/shard_1_of_4/test_attempts/attempt_1.log
test_log_re = re.compile(r'.*\n(.*\n)?\s*(/.*/testlogs/%s/(/[^/]*)?test.log)' % test_folder)
log_match = test_log_re.match(stdouterr, match.end())
log_match = test_log_re.match(res.output, match.end())
if log_match:
failed_test_logs[test_name] = log_match.group(2)
# When TF logs are found enhance the below error by additionally logging the details about failed tests
Expand All @@ -1097,7 +1098,7 @@ def test_step(self):
len(failed_tests), device, ', '.join(failed_tests))
self.report_test_failure(fail_msg)
else:
self.log.info('Tests on %s succeeded with output:\n%s', device, stdouterr)
self.log.info('Tests on %s succeeded with output:\n%s', device, res.output)

def install_step(self):
"""Custom install procedure for TensorFlow."""
Expand All @@ -1119,7 +1120,7 @@ def install_step(self):
if self.cfg['exts_list']:
cmd += ' --no-deps'

run_cmd(cmd, log_all=True, simple=True, log_ok=True)
run_shell_cmd(cmd)
else:
raise EasyBuildError("Failed to isolate built .whl in %s: %s", whl_paths, self.builddir)

Expand Down Expand Up @@ -1186,7 +1187,7 @@ def sanity_check_step(self):
logdir = tempfile.mkdtemp(suffix='-tf-%s-logs' % os.path.splitext(mnist_py)[0])
mnist_py = os.path.join(self.start_dir, 'tensorflow', 'examples', 'tutorials', 'mnist', mnist_py)
cmd = "%s %s --data_dir %s --log_dir %s" % (self.python_cmd, mnist_py, datadir, logdir)
run_cmd(cmd, log_all=True, simple=True, log_ok=True)
run_shell_cmd(cmd)

# run test script (if any)
if self.test_script:
Expand All @@ -1195,6 +1196,6 @@ def sanity_check_step(self):
test_script = os.path.join(self.builddir, os.path.basename(self.test_script))
copy_file(self.test_script, test_script)

run_cmd("python %s" % test_script, log_all=True, simple=True, log_ok=True)
run_shell_cmd("python %s" % test_script)

return res

0 comments on commit e565dc5

Please sign in to comment.