diff --git a/loopy/__init__.py b/loopy/__init__.py index 57e214833..633e6595f 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -43,7 +43,8 @@ AddressSpace, TemporaryVariable, SubstitutionRule, - CallMangleInfo) + CallMangleInfo, + OpenMPSIMDTag, VectorizeTag) from loopy.kernel.function_interface import ( CallableKernel, ScalarCallable) from loopy.translation_unit import ( @@ -153,6 +154,8 @@ from loopy.target.c import (CFamilyTarget, CTarget, ExecutableCTarget, generate_header, CWithGNULibcTarget, ExecutableCWithGNULibcTarget) +from loopy.target.c_vector_extensions import (CVectorExtensionsTarget, + ExecutableCVectorExtensionsTarget) from loopy.target.cuda import CudaTarget from loopy.target.opencl import OpenCLTarget from loopy.target.pyopencl import PyOpenCLTarget @@ -190,7 +193,7 @@ "AddressSpace", "TemporaryVariable", "SubstitutionRule", - "CallMangleInfo", + "CallMangleInfo", "OpenMPSIMDTag", "VectorizeTag", "make_kernel", "UniqueName", "make_function", @@ -300,6 +303,7 @@ "TargetBase", "CFamilyTarget", "CTarget", "ExecutableCTarget", "generate_header", "CWithGNULibcTarget", "ExecutableCWithGNULibcTarget", + "CVectorExtensionsTarget", "ExecutableCVectorExtensionsTarget", "CudaTarget", "OpenCLTarget", "PyOpenCLTarget", "ISPCTarget", "NumbaTarget", "NumbaCudaTarget", diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 98a5494b7..24d6c0ee0 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -395,21 +395,48 @@ def try_vectorized(self, what, func): return self.unvectorize(func) def unvectorize(self, func): + from loopy.kernel.data import VectorizeTag, UnrollTag, OpenMPSIMDTag + from loopy.codegen.result import (merge_codegen_results, + CodeGenerationResult) vinf = self.vectorization_info + vec_tag, = self.kernel.inames[vinf.iname].tags_of_type(VectorizeTag) result = [] - novec_self = self.copy(vectorization_info=False) - for i in range(vinf.length): - idx_aff = isl.Aff.zero_on_domain(vinf.space.params()) + i - new_codegen_state = novec_self.fix(vinf.iname, idx_aff) - generated = func(new_codegen_state) - - if isinstance(generated, list): - result.extend(generated) + if isinstance(vec_tag.fallback_impl_tag, UnrollTag): + novec_self = self.copy(vectorization_info=False) + + for i in range(vinf.length): + idx_aff = isl.Aff.zero_on_domain(vinf.space.params()) + i + new_codegen_state = novec_self.fix(vinf.iname, idx_aff) + generated = func(new_codegen_state) + + if isinstance(generated, list): + result.extend(generated) + else: + result.append(generated) + elif isinstance(vec_tag.fallback_impl_tag, OpenMPSIMDTag): + novec_self = self.copy(vectorization_info=False) + astb = self.ast_builder + inner = func(novec_self) + if isinstance(inner, list): + inner = merge_codegen_results(novec_self, inner) + assert isinstance(inner, CodeGenerationResult) + if isinstance(inner.current_ast(novec_self), + astb.ast_comment_class): + # loop body is a comment => do not emit the loop + loop_cgr = inner else: - result.append(generated) + result.append(astb.emit_pragma("omp simd")) + loop_cgr = inner.with_new_ast( + novec_self, + astb.emit_sequential_loop( + novec_self, vinf.iname, self.kernel.index_dtype, + 0, vinf.length-1, inner.current_ast(novec_self))) + result.append(loop_cgr) + elif vec_tag.fallback_impl_tag is None: + raise RuntimeError("Could not vectorize all statements" + f" in name {vinf.iname}") - from loopy.codegen.result import merge_codegen_results return merge_codegen_results(self, result) @property diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index 8d98196f2..ee2b1e091 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -108,6 +108,7 @@ def generate_code_for_sched_index(codegen_state, sched_index): elif isinstance(sched_item, EnterLoop): from loopy.kernel.data import (UnrolledIlpTag, UnrollTag, ForceSequentialTag, LoopedIlpTag, VectorizeTag, + OpenMPSIMDTag, InameImplementationTag, InOrderSequentialSequentialTag, filter_iname_tags_by_type) @@ -117,12 +118,15 @@ def generate_code_for_sched_index(codegen_state, sched_index): from loopy.codegen.loop import ( generate_unroll_loop, generate_vectorize_loop, - generate_sequential_loop_dim_code) + generate_sequential_loop_dim_code, + generate_openmp_simd_loop) if filter_iname_tags_by_type(tags, (UnrollTag, UnrolledIlpTag)): func = generate_unroll_loop elif filter_iname_tags_by_type(tags, VectorizeTag): func = generate_vectorize_loop + elif filter_iname_tags_by_type(tags, OpenMPSIMDTag): + func = generate_openmp_simd_loop elif not tags or filter_iname_tags_by_type(tags, (LoopedIlpTag, ForceSequentialTag, InOrderSequentialSequentialTag)): func = generate_sequential_loop_dim_code diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py index 713254075..383988863 100644 --- a/loopy/codegen/instruction.py +++ b/loopy/codegen/instruction.py @@ -127,6 +127,13 @@ def generate_assignment_instruction_code(codegen_state, insn): raise UnvectorizableError( "LHS is scalar, RHS is vector, cannot assign") + if (lhs_is_vector + and (not rhs_is_vector) + and (not + kernel.target.broadcasts_scalar_assignment_to_vec_types)): + raise UnvectorizableError( + "LHS is vector, RHS is not vector, cannot assign") + is_vector = lhs_is_vector del lhs_is_vector diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index a0d22330f..48f67af26 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -160,10 +160,27 @@ def generate_unroll_loop(codegen_state, sched_index): # {{{ vectorized loops +def raise_for_unvectorizable_loop(codegen_state, sched_index): + kernel = codegen_state.kernel + raise RuntimeError(f"Cannot vectorize {kernel.schedule[sched_index]}") + + def generate_vectorize_loop(codegen_state, sched_index): + from loopy.kernel.data import VectorizeTag, UnrollTag, OpenMPSIMDTag kernel = codegen_state.kernel iname = kernel.linearization[sched_index].iname + vec_tag, = kernel.inames[iname].tags_of_type(VectorizeTag) + fallback_impl_tag = vec_tag.fallback_impl_tag + + if isinstance(fallback_impl_tag, UnrollTag): + fallback_codegen_routine = generate_unroll_loop + elif isinstance(fallback_impl_tag, OpenMPSIMDTag): + fallback_codegen_routine = generate_openmp_simd_loop + elif fallback_impl_tag is None: + fallback_codegen_routine = raise_for_unvectorizable_loop + else: + raise NotImplementedError(fallback_impl_tag) bounds = kernel.get_iname_bounds(iname, constants_only=True) @@ -177,7 +194,7 @@ def generate_vectorize_loop(codegen_state, sched_index): warn(kernel, "vec_upper_not_const", "upper bound for vectorized loop '%s' is not a constant, " "cannot vectorize--unrolling instead") - return generate_unroll_loop(codegen_state, sched_index) + return fallback_codegen_routine(codegen_state, sched_index) length = int(pw_aff_to_expr(length_aff)) @@ -192,7 +209,7 @@ def generate_vectorize_loop(codegen_state, sched_index): warn(kernel, "vec_lower_not_0", "lower bound for vectorized loop '%s' is not zero, " "cannot vectorize--unrolling instead") - return generate_unroll_loop(codegen_state, sched_index) + return fallback_codegen_routine(codegen_state, sched_index) # {{{ 'implement' vectorization bounds @@ -484,4 +501,17 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): # }}} + +# {{{ omp simd loop + +def generate_openmp_simd_loop(codegen_state, sched_index): + return merge_codegen_results( + codegen_state, + [codegen_state.ast_builder.emit_pragma("omp simd"), + generate_sequential_loop_dim_code(codegen_state, + sched_index)]) + +# }}} + + # vim: foldmethod=marker diff --git a/loopy/expression.py b/loopy/expression.py index fda3a1499..2230b9aed 100644 --- a/loopy/expression.py +++ b/loopy/expression.py @@ -76,7 +76,10 @@ def combine(vectorizabilities): return reduce(and_, vectorizabilities) def map_sum(self, expr): - return any(self.rec(child) for child in expr.children) + possible = False + for child in expr.children: + possible = self.rec(child) if True else possible + return possible map_product = map_sum @@ -176,6 +179,11 @@ def map_reduction(self, expr): # FIXME: Do this more carefully raise UnvectorizableError() + def map_if(self, expr): + raise UnvectorizableError("Emitting vector instructions with masks not" + " (yet) supported.") + + # }}} # vim: fdm=marker diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py index 0c7c81441..29ca4ba27 100644 --- a/loopy/kernel/array.py +++ b/loopy/kernel/array.py @@ -1397,7 +1397,9 @@ def eval_expr_assert_integer_constant(i, expr): # We'll do absolutely nothing here, which will result # in the vector being returned. pass - + elif (vectorization_info is None + and target.allows_non_constant_indexing_for_vec_types): + vector_index = idx else: idx = eval_expr_assert_integer_constant(i, idx) diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index 5b536dfd4..9cfd394a8 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -203,7 +203,29 @@ def __str__(self): # }}} +class _NotProvided: + pass + + class VectorizeTag(UniqueInameTag, HardwareConcurrentTag): + """ + .. attribute:: fallback_impl_tag + + If the loop contains instructions that are not vectorizable, the code + generator will implement the loop as directed by `fallback_impl_tag`. + If *None*, then a :class:`RuntimeError` would be raised while + generating code for an unvectorizable instruction within the loop. + """ + def __init__(self, fallback_impl_tag=_NotProvided): + if fallback_impl_tag is _NotProvided: + from warnings import warn + warn("`fallback_impl_tag` not provided to VectorizeTag." + " This will be an error from 2023. To keep the current" + " behavior, instantiate as `VectorizeTag(UnrollTag())`", + DeprecationWarning, stacklevel=2) + fallback_impl_tag = UnrollTag() + super().__init__(fallback_impl_tag=fallback_impl_tag) + def __str__(self): return "vec" @@ -223,6 +245,15 @@ def __str__(self): return "ord" +class OpenMPSIMDTag(InameImplementationTag): + """ + Directs the code generator to emit code with ``#pragma omp simd`` + directive atop the loop. + """ + def __str__(self): + return "omp.simd" + + def parse_tag(tag): from pytools.tag import Tag as TagBase if tag is None: @@ -241,7 +272,7 @@ def parse_tag(tag): elif tag in ["unr"]: return UnrollTag() elif tag in ["vec"]: - return VectorizeTag() + return VectorizeTag(UnrollTag()) elif tag in ["ilp", "ilp.unr"]: return UnrolledIlpTag() elif tag == "ilp.seq": diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 3457b8e7e..1ac078cc7 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -1007,6 +1007,11 @@ def ast_block_class(self): from cgen import Block return Block + @property + def ast_comment_class(self): + from cgen import Comment + return Comment + @property def ast_block_scope_class(self): return ScopingBlock @@ -1250,6 +1255,10 @@ def emit_comment(self, s): from cgen import Comment return Comment(s) + def emit_pragma(self, s): + from cgen import Pragma + return Pragma(s) + @property def can_implement_conditionals(self): return True diff --git a/loopy/target/c_vector_extensions.py b/loopy/target/c_vector_extensions.py new file mode 100644 index 000000000..962dfe4ee --- /dev/null +++ b/loopy/target/c_vector_extensions.py @@ -0,0 +1,142 @@ +import numpy as np +from pytools import memoize_method +from loopy.target.c import CTarget, CWithGNULibcASTBuilder, ExecutableCTarget +from loopy.types import NumpyType + + +# {{{ vector types + +class vec: # noqa + pass + + +def _create_vector_types(): + field_names = ["x", "y", "z", "w"] + + vec.types = {} + vec.names_and_dtypes = [] + vec.type_to_scalar_and_count = {} + + counts = [2, 3, 4, 8, 16] + + for base_name, base_type in [ + ("char", np.int8), + ("unsigned char", np.uint8), + ("short", np.int16), + ("unsigned short", np.uint16), + ("int", np.int32), + ("unsigned int", np.uint32), + ("long", np.int64), + ("unsigned long", np.uint64), + ("float", np.float32), + ("double", np.float64), + ]: + for count in counts: + byte_count = count*np.dtype(base_type).itemsize + name = "%s __attribute__((vector_size(%d)))" % (base_name, + byte_count) + + titles = field_names[:count] + + names = [f"s{i}" for i in range(count)] + + if len(titles) < len(names): + titles.extend((len(names)-len(titles))*[None]) + + try: + dtype = np.dtype(dict( + names=names, + formats=[base_type]*count, + titles=titles)) + except NotImplementedError: + try: + dtype = np.dtype([((n, title), base_type) + for (n, title) in zip(names, titles)]) + except TypeError: + dtype = np.dtype([(n, base_type) for (n, title) + in zip(names, titles)]) + + setattr(vec, name, dtype) + + vec.names_and_dtypes.append((name, dtype)) + + vec.types[np.dtype(base_type), count] = dtype + vec.type_to_scalar_and_count[dtype] = np.dtype(base_type), count + + +_create_vector_types() + + +def _register_vector_types(dtype_registry): + for name, dtype in vec.names_and_dtypes: + dtype_registry.get_or_register_dtype(name, dtype) + +# }}} + + +# {{{ target + +class CVectorExtensionsTarget(CTarget): + """A specialized C-target that represents vectorization through GCC/Clang + language extensions. + """ + + def get_device_ast_builder(self): + return CVectorExtensionsASTBuilder(self) + + @memoize_method + def get_dtype_registry(self): + from loopy.target.c.compyte.dtypes import ( + DTypeRegistry, fill_registry_with_c99_stdint_types, + fill_registry_with_c99_complex_types) + from loopy.target.c import DTypeRegistryWrapper + + result = DTypeRegistry() + fill_registry_with_c99_stdint_types(result) + fill_registry_with_c99_complex_types(result) + + _register_vector_types(result) + return DTypeRegistryWrapper(result) + + def is_vector_dtype(self, dtype): + return (isinstance(dtype, NumpyType) + and dtype.numpy_dtype in list(vec.types.values())) + + def vector_dtype(self, base, count): + return NumpyType( + vec.types[base.numpy_dtype, count], + target=self) + + @property + def allows_non_constant_indexing_for_vec_types(self): + return True + + @property + def broadcasts_scalar_assignment_to_vec_types(self): + return False + + +class ExecutableCVectorExtensionsTarget(CVectorExtensionsTarget, + ExecutableCTarget): + def __init__(self, compiler=None, fortran_abi=False): + ExecutableCTarget.__init__(self, compiler=compiler, fortran_abi=fortran_abi) + + def get_kernel_executor_cache_key(self, *args, **kwargs): + return ExecutableCTarget.get_kernel_executor_cache_key(self, *args, **kwargs) + + def get_kernel_executor(self, t_unit, *args, **kwargs): + return ExecutableCTarget.get_kernel_executor(self, t_unit, *args, **kwargs) + + def get_host_ast_builder(self): + return ExecutableCTarget.get_host_ast_builder(self) + +# }}} + + +# {{{ AST builder + +class CVectorExtensionsASTBuilder(CWithGNULibcASTBuilder): + def add_vector_access(self, access_expr, index): + return access_expr[index] + +# }}} diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 08413b615..a39a9fb0f 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -265,6 +265,14 @@ def vector_dtype(self, base, count): # }}} + @property + def allows_non_constant_indexing_for_vec_types(self): + return False + + @property + def broadcasts_scalar_assignment_to_vec_types(self): + return True + # }}} diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 89710c023..ac5ec41c2 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -569,6 +569,14 @@ def vector_dtype(self, base, count): vec.types[base.numpy_dtype, count], target=self) + @property + def allows_non_constant_indexing_for_vec_types(self): + return False + + @property + def broadcasts_scalar_assignment_to_vec_types(self): + return True + # }}} diff --git a/loopy/target/python.py b/loopy/target/python.py index 15ddc4679..b67750cb4 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -218,6 +218,11 @@ def ast_block_scope_class(self): # and delete the implementation above. return Collection + @property + def ast_comment_class(self): + from genpy import Comment + return Comment + def emit_sequential_loop(self, codegen_state, iname, iname_dtype, lbound, ubound, inner): ecm = codegen_state.expression_to_code_mapper diff --git a/test/test_target.py b/test/test_target.py index 126310e5e..070dee1db 100644 --- a/test/test_target.py +++ b/test/test_target.py @@ -663,6 +663,95 @@ def test_glibc_bessel_functions(dtype): rtol=1e-6, atol=1e-6) +def test_c_vector_extensions(): + knl = lp.make_kernel( + "{[i, j1, j2, j3]: 0<=i<10 and 0<=j1,j2,j3<4}", + """ + <> temp1[j1] = x[i, j1] + <> temp2[j2] = 2*temp1[j2] + 1 {inames=i:j2} + y[i, j3] = temp2[j3] + """, + [lp.GlobalArg("x, y", shape=lp.auto, dtype=float)], + seq_dependencies=True, + target=lp.CVectorExtensionsTarget()) + + knl = lp.tag_inames(knl, "j2:vec, j1:ilp, j3:ilp") + knl = lp.tag_array_axes(knl, "temp1,temp2", "vec") + + print(lp.generate_code_v2(knl).device_code()) + + +def test_omp_simd_tag(): + knl = lp.make_kernel( + "{[i]: 0<=i<16}", + """ + y[i] = 2 * x[i] + """) + + knl = lp.add_dtypes(knl, {"x": "float64"}) + knl = lp.split_iname(knl, "i", 4) + knl = lp.tag_inames(knl, {"i_inner": lp.OpenMPSIMDTag()}) + + code_str = lp.generate_code_v2(knl).device_code() + + assert any(line.strip() == "#pragma omp simd" + for line in code_str.split("\n")) + + +def test_vec_tag_with_omp_simd_fallback(): + knl = lp.make_kernel( + "{[i, j1, j2, j3]: 0<=i<10 and 0<=j1,j2,j3<4}", + """ + <> temp1[j1] = x[i, j1] + <> temp2[j2] = 2*temp1[j2] + 1 {inames=i:j2} + y[i, j3] = temp2[j3] + """, + [lp.GlobalArg("x, y", shape=lp.auto, dtype=float)], + seq_dependencies=True, + target=lp.ExecutableCVectorExtensionsTarget()) + + knl = lp.tag_inames(knl, {"j1": lp.VectorizeTag(lp.OpenMPSIMDTag()), + "j2": lp.VectorizeTag(lp.OpenMPSIMDTag()), + "j3": lp.VectorizeTag(lp.OpenMPSIMDTag())}) + knl = lp.tag_array_axes(knl, "temp1,temp2", "vec") + + code_str = lp.generate_code_v2(knl).device_code() + + assert len([line + for line in code_str.split("\n") + if line.strip() == "#pragma omp simd"]) == 2 + + x = np.random.rand(10, 4) + _, (out,) = knl(x=x) + np.testing.assert_allclose(out, 2*x+1) + + +def test_vec_extensions_with_multiple_loopy_body_insns(): + knl = lp.make_kernel( + "{[n]: 0<=n tmp = 2.0 + dat0[n, 0] = tmp {id=expr_insn} + ... nop {id=statement0} + end + """, + seq_dependencies=True, + target=lp.ExecutableCVectorExtensionsTarget()) + + knl = lp.add_dtypes(knl, {"dat0": "float64"}) + knl = lp.split_iname(knl, "n", 4, slabs=(1, 1), + inner_iname="n_batch") + knl = lp.privatize_temporaries_with_inames(knl, "n_batch") + knl = lp.tag_array_axes(knl, "tmp", "vec") + knl = lp.tag_inames(knl, { + "n_batch": lp.VectorizeTag(lp.OpenMPSIMDTag())}) + + _, (out,) = knl(N=100) + np.testing.assert_allclose(out, 2*np.ones((100, 1))) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1])