diff --git a/examples/python/call-external.py b/examples/python/call-external.py index 49b25d6e0..ee17d2388 100644 --- a/examples/python/call-external.py +++ b/examples/python/call-external.py @@ -3,6 +3,8 @@ from loopy.diagnostic import LoopyError from loopy.target.c import CTarget from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa: F401 +from loopy.target.c.c_execution import CCompiler +from codepy.toolchain import GCCToolchain # {{{ blas callable @@ -22,7 +24,7 @@ def with_types(self, arg_id_to_dtype, callables_table): if vec_dtype.numpy_dtype == np.float32: name_in_target = "cblas_sgemv" - elif vec_dtype. numpy_dtype == np.float64: + elif vec_dtype.numpy_dtype == np.float64: name_in_target = "cblas_dgemv" else: raise LoopyError("GEMV is only supported for float32 and float64 " @@ -47,30 +49,37 @@ def with_descrs(self, arg_id_to_descr, callables_table): assert mat_descr.shape[0] == res_descr.shape[0] assert len(vec_descr.shape) == len(res_descr.shape) == 1 # handling only the easy case when stride == 1 - assert vec_descr.dim_tags[0].stride == 1 assert mat_descr.dim_tags[1].stride == 1 - assert res_descr.dim_tags[0].stride == 1 return self.copy(arg_id_to_descr=arg_id_to_descr), callables_table def emit_call_insn(self, insn, target, expression_to_code_mapper): from pymbolic import var + from loopy.codegen import UnvectorizableError mat_descr = self.arg_id_to_descr[0] + vec_descr = self.arg_id_to_descr[1] + res_descr = self.arg_id_to_descr[-1] m, n = mat_descr.shape ecm = expression_to_code_mapper + + if ecm.codegen_state.vectorization_info is not None: + raise UnvectorizableError("cannot vectorize BLAS-gemv.") + mat, vec = insn.expression.parameters result, = insn.assignees c_parameters = [var("CblasRowMajor"), var("CblasNoTrans"), m, n, - 1, + 1, # alpha ecm(mat).expr, - 1, + mat_descr.dim_tags[0].stride, # LDA ecm(vec).expr, - 1, + vec_descr.dim_tags[0].stride, # INCX + 0, # beta ecm(result).expr, - 1] + res_descr.dim_tags[0].stride # INCY + ] return (var(self.name_in_target)(*c_parameters), False # cblas_gemv does not return anything ) @@ -83,17 +92,95 @@ def generate_preambles(self, target): # }}} -n = 10 - -knl = lp.make_kernel( - "{:}", +def transform_1(knl): + return knl + + +def transform_2(knl): + # A similar transformation is applied to kernels containing + # SLATE + # callables. + knl = lp.split_iname(knl, "e", 4, inner_iname="e_inner", slabs=(0, 1)) + knl = lp.privatize_temporaries_with_inames(knl, "e_inner") + knl = lp.tag_inames(knl, {"e_inner": "vec"}) + if 0: + # Easy codegen exercise, but misses vectorizing certain instructions. + knl = lp.tag_array_axes(knl, "tmp3", "c,vec") + else: + knl = lp.tag_array_axes(knl, "tmp3,tmp2", "c,vec") + return knl + + +def main(): + + compiler = CCompiler(toolchain=GCCToolchain( + cc="gcc", + cflags="-std=c99 -O3 -fPIC".split(), + ldflags="-shared".split(), + libraries=["blas"], + library_dirs=[], + defines=[], + undefines=[], + source_suffix="c", + so_ext=".so", + o_ext=".o", + include_dirs=[])) + + knl = lp.make_kernel( + "{[e,i1,i2]: 0<=e tmp3[i2] = 2 * tmp2[i2] + out[e, i2] = tmp3[i2] + end + end + """, + kernel_data=[ + lp.TemporaryVariable("tmp1", + shape=(4, ), + dtype=None), + lp.TemporaryVariable("tmp2", + shape=(4, ), + dtype=None), + lp.GlobalArg("A", + shape=(4, 4), + dtype="float64"), + lp.GlobalArg("x", + shape=lp.auto, + dtype="float64"), + ...], + target=lp.ExecutableCVectorExtensionsTarget(compiler=compiler), + lang_version=(2018, 2)) + + knl = lp.register_callable(knl, "matvec", CBLASGEMV("matvec")) + + for transform_func in [transform_1, transform_2]: + knl = transform_func(knl) + print("Generated code from '{transform_func.__name__} -----'") + print(lp.generate_code_v2(knl).device_code()) + print(75 * "-") + + # {{ verify the result is correct. + + from numpy.random import default_rng + + rng = default_rng(seed=0) + a = rng.random((4, 4)) + x = rng.random((100, 4)) + + _, (out,) = knl(A=a, x=x) + + np.testing.assert_allclose(6*np.einsum("ij,ej->ei", + a, x), + out) + + # }}} + + +if __name__ == "__main__": + main() diff --git a/test/test_target.py b/test/test_target.py index b05a18ad4..719c3f800 100644 --- a/test/test_target.py +++ b/test/test_target.py @@ -737,23 +737,6 @@ def test_c_vector_extensions(): print(lp.generate_code_v2(knl).device_code()) -def test_omp_simd_tag(): - knl = lp.make_kernel( - "{[i]: 0<=i<16}", - """ - y[i] = 2 * x[i] - """) - - knl = lp.add_dtypes(knl, {"x": "float64"}) - knl = lp.split_iname(knl, "i", 4) - knl = lp.tag_inames(knl, {"i_inner": lp.OpenMPSIMDTag()}) - - code_str = lp.generate_code_v2(knl).device_code() - - assert any(line.strip() == "#pragma omp simd" - for line in code_str.split("\n")) - - def test_vec_tag_with_omp_simd_fallback(): knl = lp.make_kernel( "{[i, j1, j2, j3]: 0<=i<10 and 0<=j1,j2,j3<4}", @@ -764,11 +747,13 @@ def test_vec_tag_with_omp_simd_fallback(): """, [lp.GlobalArg("x, y", shape=lp.auto, dtype=float)], seq_dependencies=True, - target=lp.ExecutableCVectorExtensionsTarget()) + target=lp.ExecutableCVectorExtensionsTarget( + lp.VectorizationFallback.OMP_SIMD) + ) - knl = lp.tag_inames(knl, {"j1": lp.VectorizeTag(lp.OpenMPSIMDTag()), - "j2": lp.VectorizeTag(lp.OpenMPSIMDTag()), - "j3": lp.VectorizeTag(lp.OpenMPSIMDTag())}) + knl = lp.tag_inames(knl, {"j1": "vec", + "j2": "vec", + "j3": "vec"}) knl = lp.tag_array_axes(knl, "temp1,temp2", "vec") code_str = lp.generate_code_v2(knl).device_code() @@ -794,15 +779,16 @@ def test_vec_extensions_with_multiple_loopy_body_insns(): end """, seq_dependencies=True, - target=lp.ExecutableCVectorExtensionsTarget()) + target=lp.ExecutableCVectorExtensionsTarget( + lp.VectorizationFallback.OMP_SIMD) + ) knl = lp.add_dtypes(knl, {"dat0": "float64"}) knl = lp.split_iname(knl, "n", 4, slabs=(1, 1), inner_iname="n_batch") knl = lp.privatize_temporaries_with_inames(knl, "n_batch") knl = lp.tag_array_axes(knl, "tmp", "vec") - knl = lp.tag_inames(knl, { - "n_batch": lp.VectorizeTag(lp.OpenMPSIMDTag())}) + knl = lp.tag_inames(knl, {"n_batch": "vec"}) _, (out,) = knl(N=100) np.testing.assert_allclose(out, 2*np.ones((100, 1)))