diff --git a/.env b/.env index 65bb46c838bee..f379ca14cd205 100644 --- a/.env +++ b/.env @@ -100,7 +100,8 @@ VCPKG="a42af01b72c28a8e1d7b48107b33e4f286a55ef6" # 2023.11.20 Release # use pulled built images in dev/tasks/python-wheels/github.windows.yml. PYTHON_WHEEL_WINDOWS_IMAGE_REVISION=2024-04-09 -# Use conanio/${CONAN} for "docker-compose run --rm conan". See -# https://github.com/conan-io/conan-docker-tools#readme for available -# images. -CONAN=gcc10 +# Use conanio/${CONAN_BASE}:${CONAN_VERSION} for "docker-compose run --rm conan". +# See https://github.com/conan-io/conan-docker-tools#readme and +# https://hub.docker.com/u/conanio for available images. +CONAN_BASE=gcc10 +CONAN_VERSION=1.62.0 diff --git a/ci/conan/all/conan_cmake_project_include.cmake b/ci/conan/all/conan_cmake_project_include.cmake new file mode 100644 index 0000000000000..a6dee0c43461c --- /dev/null +++ b/ci/conan/all/conan_cmake_project_include.cmake @@ -0,0 +1,35 @@ +# MIT License +# +# Copyright (c) 2019 Conan.io +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +if(ARROW_S3) + find_package(AWSSDK REQUIRED) + # Fix issue where scripts expect a variable called "AWSSDK_LINK_LIBRARIES" + # which is not defined by the generated AWSSDKConfig.cmake + if(NOT DEFINED AWSSDK_LINK_LIBRARIES) + set(AWSSDK_LINK_LIBRARIES "${AWSSDK_LIBRARIES}") + endif() + + # Causes logic used for generated .pc file to not run + # avoiding introspection of target `aws-cpp-sdk::aws-cpp-sdk` + # This is fine because the generated .pc file is not of use + set(AWSSDK_SOURCE "conan") +endif() diff --git a/ci/conan/all/conandata.yml b/ci/conan/all/conandata.yml index 7402272a4b366..fb75f3995c62e 100644 --- a/ci/conan/all/conandata.yml +++ b/ci/conan/all/conandata.yml @@ -21,6 +21,30 @@ # SOFTWARE. sources: + "15.0.0": + url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-15.0.0/apache-arrow-15.0.0.tar.gz?action=download" + sha256: "01dd3f70e85d9b5b933ec92c0db8a4ef504a5105f78d2d8622e84279fb45c25d" + "14.0.2": + url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-14.0.2/apache-arrow-14.0.2.tar.gz?action=download" + sha256: "1304dedb41896008b89fe0738c71a95d9b81752efc77fa70f264cb1da15d9bc2" + "14.0.1": + url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-14.0.1/apache-arrow-14.0.1.tar.gz?action=download" + sha256: "5c70eafb1011f9d124bafb328afe54f62cc5b9280b7080e1e3d668f78c0e407e" + "14.0.0": + url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-14.0.0/apache-arrow-14.0.0.tar.gz?action=download" + sha256: "4eb0da50ec071baf15fc163cb48058931e006f1c862c8def0e180fd07d531021" + "13.0.0": + url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-13.0.0/apache-arrow-13.0.0.tar.gz?action=download" + sha256: "35dfda191262a756be934eef8afee8d09762cad25021daa626eb249e251ac9e6" + "12.0.1": 
+ url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-12.0.1/apache-arrow-12.0.1.tar.gz?action=download" + sha256: "3481c411393aa15c75e88d93cf8315faf7f43e180fe0790128d3840d417de858" + "12.0.0": + url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-12.0.0/apache-arrow-12.0.0.tar.gz?action=download" + sha256: "ddd8347882775e53af7d0965a1902b7d8fcd0a030fd14f783d4f85e821352d52" + "11.0.0": + url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-11.0.0/apache-arrow-11.0.0.tar.gz?action=download" + sha256: "2dd8f0ea0848a58785628ee3a57675548d509e17213a2f5d72b0d900b43f5430" "10.0.1": url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-10.0.1/apache-arrow-10.0.1.tar.gz?action=download" sha256: "c814e0670112a22c1a6ec03ab420a52ae236a9a42e9e438c3cbd37f37e658fb3" @@ -36,12 +60,6 @@ sources: "7.0.0": url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-7.0.0/apache-arrow-7.0.0.tar.gz?action=download" sha256: "e8f49b149a15ecef4e40fcfab1b87c113c6b1ee186005c169e5cdf95d31a99de" - "2.0.0": - url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-2.0.0/apache-arrow-2.0.0.tar.gz?action=download" - sha256: "be0342cc847bb340d86aeaef43596a0b6c1dbf1ede9c789a503d939e01c71fbe" - "1.0.0": - url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-1.0.0/apache-arrow-1.0.0.tar.gz?action=download" - sha256: "86ddb9feb48203a5aaf9cc4f2827525e20a2ca4d7239e492af17e74532ccf243" patches: "8.0.1": - patch_file: "patches/8.0.0-0005-install-utils.patch" @@ -64,23 +82,3 @@ patches: - patch_file: "patches/7.0.0-0007-fix-cmake.patch" patch_description: "use cci package" patch_type: "conan" - "2.0.0": - - patch_file: "patches/2.0.0-0003-fix-shared-msvc.patch" - patch_description: "make shared enabled in msvc" - patch_type: "official" - - patch_file: "patches/2.0.0-0005-gandiva-engine.patch" - patch_description: "fix grandiva compilation error" - patch_type: "official" - - patch_file: "patches/2.0.0-0008-fix-cmake.patch" - patch_description: "use cci package" - patch_type: "conan" - "1.0.0": - 
- patch_file: "patches/1.0.0-0003-fix-shared-msvc.patch" - patch_description: "make shared enabled in msvc" - patch_type: "official" - - patch_file: "patches/1.0.0-0005-fix-make12-namespace.patch" - patch_description: "fix ambiguous `make12` function between std and date" - patch_type: "official" - - patch_file: "patches/1.0.0-0006-fix-cmake.patch" - patch_description: "use cci package" - patch_type: "conan" diff --git a/ci/conan/all/conanfile.py b/ci/conan/all/conanfile.py index 7e87f82e7e018..178cd03da1555 100644 --- a/ci/conan/all/conanfile.py +++ b/ci/conan/all/conanfile.py @@ -21,12 +21,12 @@ # SOFTWARE. from conan import ConanFile -from conan.errors import ConanInvalidConfiguration -from conan.tools.microsoft import is_msvc_static_runtime, is_msvc, check_min_vs -from conan.tools.files import export_conandata_patches, apply_conandata_patches, get, copy, rmdir +from conan.errors import ConanInvalidConfiguration, ConanException from conan.tools.build import check_min_cppstd, cross_building -from conan.tools.scm import Version from conan.tools.cmake import CMake, CMakeDeps, CMakeToolchain, cmake_layout +from conan.tools.files import apply_conandata_patches, copy, export_conandata_patches, get, rmdir +from conan.tools.microsoft import is_msvc, is_msvc_static_runtime +from conan.tools.scm import Version import os import glob @@ -39,7 +39,8 @@ class ArrowConan(ConanFile): license = ("Apache-2.0",) url = "https://github.com/conan-io/conan-center-index" homepage = "https://arrow.apache.org/" - topics = ("memory", "gandiva", "parquet", "skyhook", "plasma", "hdfs", "csv", "cuda", "gcs", "json", "hive", "s3", "grpc") + topics = ("memory", "gandiva", "parquet", "skyhook", "acero", "hdfs", "csv", "cuda", "gcs", "json", "hive", "s3", "grpc") + package_type = "library" settings = "os", "arch", "compiler", "build_type" options = { "shared": [True, False], @@ -48,15 +49,15 @@ class ArrowConan(ConanFile): "parquet": ["auto", True, False], "substrait": [True, False], "skyhook": 
[True, False], - "plasma": [True, False], + "acero": [True, False], "cli": [True, False], "compute": ["auto", True, False], - "acero": ["auto", True, False], "dataset_modules": ["auto", True, False], "deprecated": [True, False], "encryption": [True, False], "filesystem_layer": [True, False], "hdfs_bridgs": [True, False], + "plasma": [True, False, "deprecated"], "simd_level": [None, "default", "sse4_2", "avx2", "avx512", "neon", ], "runtime_simd_level": [None, "sse4_2", "avx2", "avx512", "max"], "with_backtrace": [True, False], @@ -70,8 +71,9 @@ class ArrowConan(ConanFile): "with_glog": ["auto", True, False], "with_grpc": ["auto", True, False], "with_jemalloc": ["auto", True, False], - "with_mimalloc": ["auto", True, False], + "with_mimalloc": [True, False], "with_json": [True, False], + "with_thrift": ["auto", True, False], "with_llvm": ["auto", True, False], "with_openssl": ["auto", True, False], "with_opentelemetry": [True, False], @@ -91,43 +93,44 @@ class ArrowConan(ConanFile): "shared": False, "fPIC": True, "gandiva": False, - "parquet": "auto", + "parquet": False, "skyhook": False, "substrait": False, - "plasma": False, + "acero": False, "cli": False, - "compute": "auto", - "acero": "auto", - "dataset_modules": "auto", + "compute": False, + "dataset_modules": False, "deprecated": True, "encryption": False, "filesystem_layer": False, "hdfs_bridgs": False, + "plasma": "deprecated", "simd_level": "default", "runtime_simd_level": "max", "with_backtrace": False, - "with_boost": "auto", + "with_boost": False, "with_brotli": False, "with_bz2": False, "with_csv": False, "with_cuda": False, - "with_flight_rpc": "auto", + "with_flight_rpc": False, "with_flight_sql": False, "with_gcs": False, - "with_gflags": "auto", - "with_jemalloc": "auto", + "with_gflags": False, + "with_jemalloc": False, "with_mimalloc": False, - "with_glog": "auto", - "with_grpc": "auto", + "with_glog": False, + "with_grpc": False, "with_json": False, - "with_llvm": "auto", - "with_openssl": 
"auto", + "with_thrift": False, + "with_llvm": False, + "with_openssl": False, "with_opentelemetry": False, "with_orc": False, - "with_protobuf": "auto", - "with_re2": "auto", + "with_protobuf": False, + "with_re2": False, "with_s3": False, - "with_utf8proc": "auto", + "with_utf8proc": False, "with_lz4": False, "with_snappy": False, "with_zlib": False, @@ -136,283 +139,147 @@ class ArrowConan(ConanFile): short_paths = True @property - def _minimum_cpp_standard(self): + def _min_cppstd(self): # arrow >= 10.0.0 requires C++17. # https://github.com/apache/arrow/pull/13991 - return 11 if Version(self.version) < "10.0.0" else 17 + return "11" if Version(self.version) < "10.0.0" else "17" @property def _compilers_minimum_version(self): return { - "gcc": "8", - "clang": "7", - "apple-clang": "10", - } + "11": { + "clang": "3.9", + }, + "17": { + "gcc": "8", + "clang": "7", + "apple-clang": "10", + "Visual Studio": "15", + "msvc": "191", + }, + }.get(self._min_cppstd, {}) def export_sources(self): export_conandata_patches(self) + copy(self, "conan_cmake_project_include.cmake", self.recipe_folder, os.path.join(self.export_sources_folder, "src")) def config_options(self): if self.settings.os == "Windows": del self.options.fPIC - if Version(self.version) < "2.0.0": - del self.options.simd_level - del self.options.runtime_simd_level - elif Version(self.version) < "6.0.0": - self.options.simd_level = "sse4_2" - if Version(self.version) < "6.0.0": - del self.options.with_gcs - if Version(self.version) < "7.0.0": - del self.options.skyhook - del self.options.with_flight_sql - del self.options.with_opentelemetry if Version(self.version) < "8.0.0": del self.options.substrait + if is_msvc(self): + self.options.with_boost = True def configure(self): if self.options.shared: self.options.rm_safe("fPIC") - def validate(self): - if self.info.settings.compiler.cppstd: - check_min_cppstd(self, self._minimum_cpp_standard) - - if self._minimum_cpp_standard == 11: - if 
self.info.settings.compiler == "clang" and self.info.settings.compiler.version <= Version("3.9"): - raise ConanInvalidConfiguration("This recipe does not support this compiler version") - else: - check_min_vs(self, 191) - if not is_msvc(self): - minimum_version = self._compilers_minimum_version.get(str(self.info.settings.compiler), False) - if minimum_version and Version(self.info.settings.compiler.version) < minimum_version: - raise ConanInvalidConfiguration( - f"{self.ref} requires C++{self._minimum_cpp_standard}, which your compiler does not support." - ) - - if self.options.shared: - del self.options.fPIC - if self.options.compute == False and not self._compute(True): - raise ConanInvalidConfiguration("compute options is required (or choose auto)") - if self.options.acero == False and not self._acero(True): - raise ConanInvalidConfiguration("acero options is required (or choose auto)") - if self.options.parquet == False and self._parquet(True): - raise ConanInvalidConfiguration("parquet options is required (or choose auto)") - if self.options.dataset_modules == False and self._dataset_modules(True): - raise ConanInvalidConfiguration("dataset_modules options is required (or choose auto)") - if self.options.get_safe("skyhook", False): - raise ConanInvalidConfiguration("CCI has no librados recipe (yet)") - if self.options.with_jemalloc == False and self._with_jemalloc(True): - raise ConanInvalidConfiguration("with_jemalloc option is required (or choose auto)") - if self.options.with_re2 == False and self._with_re2(True): - raise ConanInvalidConfiguration("with_re2 option is required (or choose auto)") - if self.options.with_protobuf == False and self._with_protobuf(True): - raise ConanInvalidConfiguration("with_protobuf option is required (or choose auto)") - if self.options.with_gflags == False and self._with_gflags(True): - raise ConanInvalidConfiguration("with_gflags options is required (or choose auto)") - if self.options.with_flight_rpc == False and 
self._with_flight_rpc(True): - raise ConanInvalidConfiguration("with_flight_rpc options is required (or choose auto)") - if self.options.with_grpc == False and self._with_grpc(True): - raise ConanInvalidConfiguration("with_grpc options is required (or choose auto)") - if self.options.with_boost == False and self._with_boost(True): - raise ConanInvalidConfiguration("with_boost options is required (or choose auto)") - if self.options.with_openssl == False and self._with_openssl(True): - raise ConanInvalidConfiguration("with_openssl options is required (or choose auto)") - if self.options.with_llvm == False and self._with_llvm(True): - raise ConanInvalidConfiguration("with_llvm options is required (or choose auto)") - if self.options.with_cuda: - raise ConanInvalidConfiguration("CCI has no cuda recipe (yet)") - if self.options.with_orc: - raise ConanInvalidConfiguration("CCI has no orc recipe (yet)") - if self.options.with_s3 and not self.options["aws-sdk-cpp"].config: - raise ConanInvalidConfiguration("arrow:with_s3 requires aws-sdk-cpp:config is True.") - - if self.options.shared and self._with_jemalloc(): - if self.options["jemalloc"].enable_cxx: - raise ConanInvalidConfiguration("jemmalloc.enable_cxx of a static jemalloc must be disabled") - - if Version(self.version) < "6.0.0" and self.options.get_safe("simd_level") == "default": - raise ConanInvalidConfiguration(f"In {self.ref}, simd_level options is not supported `default` value.") - def layout(self): cmake_layout(self, src_folder="src") - def _compute(self, required=False): - if required or self.options.compute == "auto": - return bool(self._parquet()) or bool(self._acero()) - else: - return bool(self.options.compute) - - def _acero(self, required=False): - if required or self.options.acero == "auto": - return bool(self._dataset_modules()) - else: - return bool(self.options.acero) - - def _parquet(self, required=False): - if required or self.options.parquet == "auto": - return 
bool(self.options.get_safe("substrait", False)) - else: - return bool(self.options.parquet) - - def _plasma(self, required=False): - if Version(self.version) >= "12.0.0": - return False - else: - return required or self.options.plasma - - def _dataset_modules(self, required=False): - if required or self.options.dataset_modules == "auto": - return bool(self.options.get_safe("substrait", False)) - else: - return bool(self.options.dataset_modules) - - def _with_jemalloc(self, required=False): - if required or self.options.with_jemalloc == "auto": - return bool("BSD" in str(self.settings.os)) - else: - return bool(self.options.with_jemalloc) - - def _with_re2(self, required=False): - if required or self.options.with_re2 == "auto": - if self.options.gandiva or self.options.parquet: - return True - if Version(self) >= "7.0.0" and (self._compute() or self._dataset_modules()): - return True - return False - else: - return bool(self.options.with_re2) - - def _with_protobuf(self, required=False): - if required or self.options.with_protobuf == "auto": - return bool(self.options.gandiva or self._with_flight_rpc() or self.options.with_orc or self.options.get_safe("substrait", False)) - else: - return bool(self.options.with_protobuf) - - def _with_flight_rpc(self, required=False): - if required or self.options.with_flight_rpc == "auto": - return bool(self.options.get_safe("with_flight_sql", False)) - else: - return bool(self.options.with_flight_rpc) - - def _with_gflags(self, required=False): - if required or self.options.with_gflags == "auto": - return bool(self._plasma() or self._with_glog() or self._with_grpc()) - else: - return bool(self.options.with_gflags) - - def _with_glog(self, required=False): - if required or self.options.with_glog == "auto": - return False - else: - return bool(self.options.with_glog) - - def _with_grpc(self, required=False): - if required or self.options.with_grpc == "auto": - return self._with_flight_rpc() - else: - return 
bool(self.options.with_grpc) - - def _with_boost(self, required=False): - if required or self.options.with_boost == "auto": - if self.options.gandiva: - return True - version = Version(self.version) - if version.major == "1": - if self._parquet() and self.settings.compiler == "gcc" and self.settings.compiler.version < Version("4.9"): - return True - elif version.major >= "2": - if is_msvc(self): - return True - return False - else: - return bool(self.options.with_boost) - - def _with_thrift(self, required=False): - # No self.options.with_thrift exists - return bool(required or self._parquet()) - - def _with_utf8proc(self, required=False): - if required or self.options.with_utf8proc == "auto": - return bool(self._compute() or self.options.gandiva) - else: - return bool(self.options.with_utf8proc) - - def _with_llvm(self, required=False): - if required or self.options.with_llvm == "auto": - return bool(self.options.gandiva) - else: - return bool(self.options.with_llvm) - - def _with_openssl(self, required=False): - if required or self.options.with_openssl == "auto": - return bool(self.options.encryption or self._with_flight_rpc() or self.options.with_s3) - else: - return bool(self.options.with_openssl) - - def _with_rapidjson(self): - if self.options.with_json: - return True - if Version(self.version) >= "7.0.0" and self.options.encryption: - return True - return False + def _requires_rapidjson(self): + return self.options.with_json or self.options.encryption def requirements(self): - if self._with_thrift(): - self.requires("zlib/1.2.13") + if self.options.with_thrift: self.requires("thrift/0.17.0") - if self._with_protobuf(): - self.requires("protobuf/3.21.4") - if self._with_jemalloc(): + if self.options.with_protobuf: + self.requires("protobuf/3.21.9") + if self.options.with_jemalloc: self.requires("jemalloc/5.3.0") if self.options.with_mimalloc: self.requires("mimalloc/1.7.6") - if self._with_boost(): - self.requires("boost/1.80.0") - if self._with_gflags(): + if 
self.options.with_boost: + self.requires("boost/1.84.0") + if self.options.with_gflags: self.requires("gflags/2.2.2") - if self._with_glog(): + if self.options.with_glog: self.requires("glog/0.6.0") if self.options.get_safe("with_gcs"): self.requires("google-cloud-cpp/1.40.1") - if self._with_grpc(): + if self.options.with_grpc: self.requires("grpc/1.50.0") - if self._with_rapidjson(): + if self._requires_rapidjson(): self.requires("rapidjson/1.1.0") - if self._with_llvm(): + if self.options.with_llvm: self.requires("llvm-core/13.0.0") - if self._with_openssl(): + if self.options.with_openssl: # aws-sdk-cpp requires openssl/1.1.1. it uses deprecated functions in openssl/3.0.0 if self.options.with_s3: - self.requires("openssl/1.1.1s") + self.requires("openssl/1.1.1w") else: - self.requires("openssl/1.1.1s") + self.requires("openssl/[>=1.1 <4]") if self.options.get_safe("with_opentelemetry"): self.requires("opentelemetry-cpp/1.7.0") if self.options.with_s3: self.requires("aws-sdk-cpp/1.9.234") if self.options.with_brotli: - self.requires("brotli/1.0.9") + self.requires("brotli/1.1.0") if self.options.with_bz2: self.requires("bzip2/1.0.8") if self.options.with_lz4: self.requires("lz4/1.9.4") if self.options.with_snappy: self.requires("snappy/1.1.9") - if Version(self.version) >= "6.0.0" and \ - self.options.get_safe("simd_level") != None or \ + if self.options.get_safe("simd_level") != None or \ self.options.get_safe("runtime_simd_level") != None: self.requires("xsimd/9.0.1") if self.options.with_zlib: - self.requires("zlib/1.2.13") + self.requires("zlib/[>=1.2.11 <2]") if self.options.with_zstd: - self.requires("zstd/1.5.2") - if self._with_re2(): - self.requires("re2/20220601") - if self._with_utf8proc(): + self.requires("zstd/1.5.5") + if self.options.with_re2: + self.requires("re2/20230301") + if self.options.with_utf8proc: self.requires("utf8proc/2.8.0") if self.options.with_backtrace: self.requires("libbacktrace/cci.20210118") + def validate(self): + # Do not 
allow options with 'auto' value + # TODO: Remove "auto" from the possible values for these options + auto_options = [option for option, value in self.options.items() if value == "auto"] + if auto_options: + raise ConanException("Options with value 'auto' are deprecated. Please set them true/false or use its default value." + f" Please change the following options: {auto_options}") + + # From https://github.com/conan-io/conan-center-index/pull/23163#issuecomment-2039808851 + if self.options.gandiva: + if not self.options.with_re2: + raise ConanException("'with_re2' option should be True when'gandiva=True'") + if not self.options.with_boost: + raise ConanException("'with_boost' option should be True when'gandiva=True'") + if not self.options.with_utf8proc: + raise ConanException("'with_utf8proc' option should be True when'gandiva=True'") + + if self.settings.compiler.get_safe("cppstd"): + check_min_cppstd(self, self._min_cppstd) + + minimum_version = self._compilers_minimum_version.get(str(self.settings.compiler), False) + if minimum_version and Version(self.settings.compiler.version) < minimum_version: + raise ConanInvalidConfiguration( + f"{self.ref} requires C++{self._min_cppstd}, which your compiler does not support." 
+ ) + + if self.options.get_safe("skyhook", False): + raise ConanInvalidConfiguration("CCI has no librados recipe (yet)") + if self.options.with_cuda: + raise ConanInvalidConfiguration("CCI has no cuda recipe (yet)") + if self.options.with_orc: + raise ConanInvalidConfiguration("CCI has no orc recipe (yet)") + if self.options.with_s3 and not self.dependencies["aws-sdk-cpp"].options.config: + raise ConanInvalidConfiguration("arrow:with_s3 requires aws-sdk-cpp:config is True.") + + if self.options.shared and self.options.with_jemalloc: + if self.dependencies["jemalloc"].options.enable_cxx: + raise ConanInvalidConfiguration("jemmalloc.enable_cxx of a static jemalloc must be disabled") + + + def build_requirements(self): + if Version(self.version) >= "13.0.0": + self.tool_requires("cmake/[>=3.16 <4]") + def source(self): # START # This block should be removed when we update upstream: @@ -435,17 +302,15 @@ def source(self): return # END get(self, **self.conan_data["sources"][self.version], - filename=f"apache-arrow-{self.version}.tar.gz", destination=self.source_folder, strip_root=True) + filename=f"apache-arrow-{self.version}.tar.gz", strip_root=True) def generate(self): - # BUILD_SHARED_LIBS and POSITION_INDEPENDENT_CODE are automatically parsed when self.options.shared or self.options.fPIC exist tc = CMakeToolchain(self) if cross_building(self): cmake_system_processor = { "armv8": "aarch64", "armv8.3": "aarch64", }.get(str(self.settings.arch), str(self.settings.arch)) - tc.variables["CMAKE_SYSTEM_PROCESSOR"] = cmake_system_processor if cmake_system_processor == "aarch64": tc.variables["ARROW_CPU_FLAG"] = "armv8" if is_msvc(self): @@ -453,12 +318,10 @@ def generate(self): tc.variables["ARROW_DEPENDENCY_SOURCE"] = "SYSTEM" tc.variables["ARROW_PACKAGE_KIND"] = "conan" # See https://github.com/conan-io/conan-center-index/pull/14903/files#r1057938314 for details tc.variables["ARROW_GANDIVA"] = bool(self.options.gandiva) - tc.variables["ARROW_PARQUET"] = self._parquet() + 
tc.variables["ARROW_PARQUET"] = self.options.parquet tc.variables["ARROW_SUBSTRAIT"] = bool(self.options.get_safe("substrait", False)) - if Version(self.version) < "12.0.0": - tc.variables["ARROW_PLASMA"] = bool(self._plasma()) - tc.variables["ARROW_ACERO"] = self._acero() - tc.variables["ARROW_DATASET"] = self._dataset_modules() + tc.variables["ARROW_ACERO"] = bool(self.options.acero) + tc.variables["ARROW_DATASET"] = self.options.dataset_modules tc.variables["ARROW_FILESYSTEM"] = bool(self.options.filesystem_layer) tc.variables["PARQUET_REQUIRE_ENCRYPTION"] = bool(self.options.encryption) tc.variables["ARROW_HDFS"] = bool(self.options.hdfs_bridgs) @@ -466,12 +329,12 @@ def generate(self): tc.variables["ARROW_BUILD_SHARED"] = bool(self.options.shared) tc.variables["ARROW_BUILD_STATIC"] = not bool(self.options.shared) tc.variables["ARROW_NO_DEPRECATED_API"] = not bool(self.options.deprecated) - tc.variables["ARROW_FLIGHT"] = self._with_flight_rpc() + tc.variables["ARROW_FLIGHT"] = self.options.with_flight_rpc tc.variables["ARROW_FLIGHT_SQL"] = bool(self.options.get_safe("with_flight_sql", False)) - tc.variables["ARROW_COMPUTE"] = self._compute() + tc.variables["ARROW_COMPUTE"] = bool(self.options.compute) tc.variables["ARROW_CSV"] = bool(self.options.with_csv) tc.variables["ARROW_CUDA"] = bool(self.options.with_cuda) - tc.variables["ARROW_JEMALLOC"] = self._with_jemalloc() + tc.variables["ARROW_JEMALLOC"] = self.options.with_jemalloc tc.variables["jemalloc_SOURCE"] = "SYSTEM" tc.variables["ARROW_MIMALLOC"] = bool(self.options.with_mimalloc) tc.variables["ARROW_JSON"] = bool(self.options.with_json) @@ -479,61 +342,58 @@ def generate(self): tc.variables["ARROW_GCS"] = bool(self.options.get_safe("with_gcs", False)) tc.variables["BOOST_SOURCE"] = "SYSTEM" tc.variables["Protobuf_SOURCE"] = "SYSTEM" - if self._with_protobuf(): - tc.variables["ARROW_PROTOBUF_USE_SHARED"] = bool(self.options["protobuf"].shared) + if self.options.with_protobuf: + 
tc.variables["ARROW_PROTOBUF_USE_SHARED"] = bool(self.dependencies["protobuf"].options.shared) tc.variables["gRPC_SOURCE"] = "SYSTEM" - if self._with_grpc(): - tc.variables["ARROW_GRPC_USE_SHARED"] = bool(self.options["grpc"].shared) + if self.options.with_grpc: + tc.variables["ARROW_GRPC_USE_SHARED"] = bool(self.dependencies["grpc"].options.shared) - tc.variables["ARROW_USE_GLOG"] = self._with_glog() + tc.variables["ARROW_USE_GLOG"] = self.options.with_glog tc.variables["GLOG_SOURCE"] = "SYSTEM" tc.variables["ARROW_WITH_BACKTRACE"] = bool(self.options.with_backtrace) tc.variables["ARROW_WITH_BROTLI"] = bool(self.options.with_brotli) tc.variables["brotli_SOURCE"] = "SYSTEM" if self.options.with_brotli: - tc.variables["ARROW_BROTLI_USE_SHARED"] = bool(self.options["brotli"].shared) + tc.variables["ARROW_BROTLI_USE_SHARED"] = bool(self.dependencies["brotli"].options.shared) tc.variables["gflags_SOURCE"] = "SYSTEM" - if self._with_gflags(): - tc.variables["ARROW_GFLAGS_USE_SHARED"] = bool(self.options["gflags"].shared) + if self.options.with_gflags: + tc.variables["ARROW_GFLAGS_USE_SHARED"] = bool(self.dependencies["gflags"].options.shared) tc.variables["ARROW_WITH_BZ2"] = bool(self.options.with_bz2) tc.variables["BZip2_SOURCE"] = "SYSTEM" if self.options.with_bz2: - tc.variables["ARROW_BZ2_USE_SHARED"] = bool(self.options["bzip2"].shared) + tc.variables["ARROW_BZ2_USE_SHARED"] = bool(self.dependencies["bzip2"].options.shared) tc.variables["ARROW_WITH_LZ4"] = bool(self.options.with_lz4) tc.variables["lz4_SOURCE"] = "SYSTEM" if self.options.with_lz4: - tc.variables["ARROW_LZ4_USE_SHARED"] = bool(self.options["lz4"].shared) + tc.variables["ARROW_LZ4_USE_SHARED"] = bool(self.dependencies["lz4"].options.shared) tc.variables["ARROW_WITH_SNAPPY"] = bool(self.options.with_snappy) tc.variables["RapidJSON_SOURCE"] = "SYSTEM" tc.variables["Snappy_SOURCE"] = "SYSTEM" if self.options.with_snappy: - tc.variables["ARROW_SNAPPY_USE_SHARED"] = bool(self.options["snappy"].shared) + 
tc.variables["ARROW_SNAPPY_USE_SHARED"] = bool(self.dependencies["snappy"].options.shared) tc.variables["ARROW_WITH_ZLIB"] = bool(self.options.with_zlib) tc.variables["re2_SOURCE"] = "SYSTEM" tc.variables["ZLIB_SOURCE"] = "SYSTEM" tc.variables["xsimd_SOURCE"] = "SYSTEM" tc.variables["ARROW_WITH_ZSTD"] = bool(self.options.with_zstd) - if Version(self.version) >= "2.0": - tc.variables["zstd_SOURCE"] = "SYSTEM" - tc.variables["ARROW_SIMD_LEVEL"] = str(self.options.simd_level).upper() - tc.variables["ARROW_RUNTIME_SIMD_LEVEL"] = str(self.options.runtime_simd_level).upper() - else: - tc.variables["ZSTD_SOURCE"] = "SYSTEM" + tc.variables["zstd_SOURCE"] = "SYSTEM" + tc.variables["ARROW_SIMD_LEVEL"] = str(self.options.simd_level).upper() + tc.variables["ARROW_RUNTIME_SIMD_LEVEL"] = str(self.options.runtime_simd_level).upper() if self.options.with_zstd: - tc.variables["ARROW_ZSTD_USE_SHARED"] = bool(self.options["zstd"].shared) + tc.variables["ARROW_ZSTD_USE_SHARED"] = bool(self.dependencies["zstd"].options.shared) tc.variables["ORC_SOURCE"] = "SYSTEM" - tc.variables["ARROW_WITH_THRIFT"] = self._with_thrift() + tc.variables["ARROW_WITH_THRIFT"] = bool(self.options.with_thrift) tc.variables["Thrift_SOURCE"] = "SYSTEM" - if self._with_thrift(): - tc.variables["THRIFT_VERSION"] = bool(self.deps_cpp_info["thrift"].version) # a recent thrift does not require boost - tc.variables["ARROW_THRIFT_USE_SHARED"] = bool(self.options["thrift"].shared) - tc.variables["ARROW_USE_OPENSSL"] = self._with_openssl() - if self._with_openssl(): - tc.variables["OPENSSL_ROOT_DIR"] = self.deps_cpp_info["openssl"].rootpath.replace("\\", "/") - tc.variables["ARROW_OPENSSL_USE_SHARED"] = bool(self.options["openssl"].shared) - if self._with_boost(): + if self.options.with_thrift: + tc.variables["THRIFT_VERSION"] = bool(self.dependencies["thrift"].ref.version) # a recent thrift does not require boost + tc.variables["ARROW_THRIFT_USE_SHARED"] = bool(self.dependencies["thrift"].options.shared) + 
tc.variables["ARROW_USE_OPENSSL"] = self.options.with_openssl + if self.options.with_openssl: + tc.variables["OPENSSL_ROOT_DIR"] = self.dependencies["openssl"].package_folder.replace("\\", "/") + tc.variables["ARROW_OPENSSL_USE_SHARED"] = bool(self.dependencies["openssl"].options.shared) + if self.options.with_boost: tc.variables["ARROW_USE_BOOST"] = True - tc.variables["ARROW_BOOST_USE_SHARED"] = bool(self.options["boost"].shared) + tc.variables["ARROW_BOOST_USE_SHARED"] = bool(self.dependencies["boost"].options.shared) tc.variables["ARROW_S3"] = bool(self.options.with_s3) tc.variables["AWSSDK_SOURCE"] = "SYSTEM" tc.variables["ARROW_BUILD_UTILITIES"] = bool(self.options.cli) @@ -544,16 +404,18 @@ def generate(self): tc.variables["ARROW_ENABLE_TIMING_TESTS"] = False tc.variables["ARROW_BUILD_BENCHMARKS"] = False tc.variables["LLVM_SOURCE"] = "SYSTEM" - tc.variables["ARROW_WITH_UTF8PROC"] = self._with_utf8proc() - tc.variables["ARROW_BOOST_REQUIRED"] = self._with_boost() + tc.variables["ARROW_WITH_UTF8PROC"] = self.options.with_utf8proc + tc.variables["ARROW_BOOST_REQUIRED"] = self.options.with_boost tc.variables["utf8proc_SOURCE"] = "SYSTEM" - if self._with_utf8proc(): - tc.variables["ARROW_UTF8PROC_USE_SHARED"] = bool(self.options["utf8proc"].shared) + if self.options.with_utf8proc: + tc.variables["ARROW_UTF8PROC_USE_SHARED"] = bool(self.dependencies["utf8proc"].options.shared) tc.variables["BUILD_WARNING_LEVEL"] = "PRODUCTION" if is_msvc(self): - tc.variables["ARROW_USE_STATIC_CRT"] = "MT" in str(self.settings.compiler.runtime) - if self._with_llvm(): - tc.variables["LLVM_DIR"] = self.deps_cpp_info["llvm-core"].rootpath.replace("\\", "/") + tc.variables["ARROW_USE_STATIC_CRT"] = is_msvc_static_runtime(self) + if self.options.with_llvm: + tc.variables["LLVM_DIR"] = self.dependencies["llvm-core"].package_folder.replace("\\", "/") + + tc.cache_variables["CMAKE_PROJECT_arrow_INCLUDE"] = os.path.join(self.source_folder, "conan_cmake_project_include.cmake") 
tc.generate() deps = CMakeDeps(self) @@ -561,10 +423,11 @@ def generate(self): def _patch_sources(self): apply_conandata_patches(self) - if "7.0.0" <= Version(self.version) < "10.0.0": + if Version(self.version) < "10.0.0": for filename in glob.glob(os.path.join(self.source_folder, "cpp", "cmake_modules", "Find*.cmake")): if os.path.basename(filename) not in [ "FindArrow.cmake", + "FindArrowAcero.cmake", "FindArrowCUDA.cmake", "FindArrowDataset.cmake", "FindArrowFlight.cmake", @@ -576,7 +439,6 @@ def _patch_sources(self): "FindArrowTesting.cmake", "FindGandiva.cmake", "FindParquet.cmake", - "FindPlasma.cmake", ]: os.remove(filename) @@ -596,129 +458,106 @@ def package(self): rmdir(self, os.path.join(self.package_folder, "lib", "pkgconfig")) rmdir(self, os.path.join(self.package_folder, "share")) - def _lib_name(self, name): - if is_msvc(self) and not self.options.shared: - return "{}_static".format(name) - else: - return "{}".format(name) - - def package_id(self): - self.info.options.with_gflags = self._with_gflags() - self.info.options.with_protobuf = self._with_protobuf() - self.info.options.with_re2 = self._with_re2() - self.info.options.with_jemalloc = self._with_jemalloc() - self.info.options.with_openssl = self._with_openssl() - self.info.options.with_boost = self._with_boost() - self.info.options.with_glog = self._with_glog() - self.info.options.with_grpc = self._with_grpc() - def package_info(self): - self.cpp_info.filenames["cmake_find_package"] = "Arrow" - self.cpp_info.filenames["cmake_find_package_multi"] = "Arrow" - self.cpp_info.components["libarrow"].libs = [self._lib_name("arrow")] - self.cpp_info.components["libarrow"].names["cmake_find_package"] = "arrow" - self.cpp_info.components["libarrow"].names["cmake_find_package_multi"] = "arrow" - self.cpp_info.components["libarrow"].names["pkg_config"] = "arrow" + # FIXME: fix CMake targets of components + + self.cpp_info.set_property("cmake_file_name", "Arrow") + + suffix = "_static" if is_msvc(self) and 
not self.options.shared else "" + + self.cpp_info.components["libarrow"].set_property("pkg_config_name", "arrow") + self.cpp_info.components["libarrow"].libs = [f"arrow{suffix}"] if not self.options.shared: self.cpp_info.components["libarrow"].defines = ["ARROW_STATIC"] if self.settings.os in ["Linux", "FreeBSD"]: self.cpp_info.components["libarrow"].system_libs = ["pthread", "m", "dl", "rt"] - if self._parquet(): - self.cpp_info.components["libparquet"].libs = [self._lib_name("parquet")] - self.cpp_info.components["libparquet"].names["cmake_find_package"] = "parquet" - self.cpp_info.components["libparquet"].names["cmake_find_package_multi"] = "parquet" - self.cpp_info.components["libparquet"].names["pkg_config"] = "parquet" + if self.options.parquet: + self.cpp_info.components["libparquet"].set_property("pkg_config_name", "parquet") + self.cpp_info.components["libparquet"].libs = [f"parquet{suffix}"] self.cpp_info.components["libparquet"].requires = ["libarrow"] if not self.options.shared: self.cpp_info.components["libparquet"].defines = ["PARQUET_STATIC"] - if self.options.get_safe("substrait", False): - self.cpp_info.components["libarrow_substrait"].libs = [self._lib_name("arrow_substrait")] - self.cpp_info.components["libarrow_substrait"].names["cmake_find_package"] = "arrow_substrait" - self.cpp_info.components["libarrow_substrait"].names["cmake_find_package_multi"] = "arrow_substrait" - self.cpp_info.components["libarrow_substrait"].names["pkg_config"] = "arrow_substrait" - self.cpp_info.components["libarrow_substrait"].requires = ["libparquet", "dataset", "acero"] + if self.options.get_safe("substrait"): + self.cpp_info.components["libarrow_substrait"].set_property("pkg_config_name", "arrow_substrait") + self.cpp_info.components["libarrow_substrait"].libs = [f"arrow_substrait{suffix}"] + self.cpp_info.components["libarrow_substrait"].requires = ["libparquet", "dataset"] + + # Plasma was deprecated in Arrow 12.0.0 + del self.options.plasma - if 
self._plasma(): - self.cpp_info.components["libplasma"].libs = [self._lib_name("plasma")] - self.cpp_info.components["libplasma"].names["cmake_find_package"] = "plasma" - self.cpp_info.components["libplasma"].names["cmake_find_package_multi"] = "plasma" - self.cpp_info.components["libplasma"].names["pkg_config"] = "plasma" - self.cpp_info.components["libplasma"].requires = ["libarrow"] + if self.options.acero: + self.cpp_info.components["libacero"].libs = [f"arrow_acero{suffix}"] + self.cpp_info.components["libacero"].names["cmake_find_package"] = "acero" + self.cpp_info.components["libacero"].names["cmake_find_package_multi"] = "acero" + self.cpp_info.components["libacero"].names["pkg_config"] = "acero" + self.cpp_info.components["libacero"].requires = ["libarrow"] if self.options.gandiva: - self.cpp_info.components["libgandiva"].libs = [self._lib_name("gandiva")] - self.cpp_info.components["libgandiva"].names["cmake_find_package"] = "gandiva" - self.cpp_info.components["libgandiva"].names["cmake_find_package_multi"] = "gandiva" - self.cpp_info.components["libgandiva"].names["pkg_config"] = "gandiva" + self.cpp_info.components["libgandiva"].set_property("pkg_config_name", "gandiva") + self.cpp_info.components["libgandiva"].libs = [f"gandiva{suffix}"] self.cpp_info.components["libgandiva"].requires = ["libarrow"] if not self.options.shared: self.cpp_info.components["libgandiva"].defines = ["GANDIVA_STATIC"] - if self._with_flight_rpc(): - self.cpp_info.components["libarrow_flight"].libs = [self._lib_name("arrow_flight")] - self.cpp_info.components["libarrow_flight"].names["cmake_find_package"] = "flight_rpc" - self.cpp_info.components["libarrow_flight"].names["cmake_find_package_multi"] = "flight_rpc" - self.cpp_info.components["libarrow_flight"].names["pkg_config"] = "flight_rpc" + if self.options.with_flight_rpc: + self.cpp_info.components["libarrow_flight"].set_property("pkg_config_name", "flight_rpc") + self.cpp_info.components["libarrow_flight"].libs = 
[f"arrow_flight{suffix}"] self.cpp_info.components["libarrow_flight"].requires = ["libarrow"] if self.options.get_safe("with_flight_sql"): - self.cpp_info.components["libarrow_flight_sql"].libs = [self._lib_name("arrow_flight_sql")] - self.cpp_info.components["libarrow_flight_sql"].names["cmake_find_package"] = "flight_sql" - self.cpp_info.components["libarrow_flight_sql"].names["cmake_find_package_multi"] = "flight_sql" - self.cpp_info.components["libarrow_flight_sql"].names["pkg_config"] = "flight_sql" + self.cpp_info.components["libarrow_flight_sql"].set_property("pkg_config_name", "flight_sql") + self.cpp_info.components["libarrow_flight_sql"].libs = [f"arrow_flight_sql{suffix}"] self.cpp_info.components["libarrow_flight_sql"].requires = ["libarrow", "libarrow_flight"] - if self._acero(): - self.cpp_info.components["acero"].libs = ["arrow_acero"] - - if self._dataset_modules(): + if self.options.dataset_modules: self.cpp_info.components["dataset"].libs = ["arrow_dataset"] + if self.options.parquet: + self.cpp_info.components["dataset"].requires = ["libparquet"] - if (self.options.cli and (self.options.with_cuda or self._with_flight_rpc() or self._parquet())) or self._plasma(): + if self.options.cli and (self.options.with_cuda or self.options.with_flight_rpc or self.options.parquet): binpath = os.path.join(self.package_folder, "bin") self.output.info(f"Appending PATH env var: {binpath}") self.env_info.PATH.append(binpath) - if self._with_boost(): + if self.options.with_boost: if self.options.gandiva: # FIXME: only filesystem component is used self.cpp_info.components["libgandiva"].requires.append("boost::boost") - if self._parquet() and self.settings.compiler == "gcc" and self.settings.compiler.version < Version("4.9"): + if self.options.parquet and self.settings.compiler == "gcc" and self.settings.compiler.version < Version("4.9"): self.cpp_info.components["libparquet"].requires.append("boost::boost") - if Version(self.version) >= "2.0": - # FIXME: only headers 
components is used - self.cpp_info.components["libarrow"].requires.append("boost::boost") - if self._with_openssl(): + # FIXME: only headers components is used + self.cpp_info.components["libarrow"].requires.append("boost::boost") + if self.options.with_openssl: self.cpp_info.components["libarrow"].requires.append("openssl::openssl") - if self._with_gflags(): + if self.options.with_gflags: self.cpp_info.components["libarrow"].requires.append("gflags::gflags") - if self._with_glog(): + if self.options.with_glog: self.cpp_info.components["libarrow"].requires.append("glog::glog") - if self._with_jemalloc(): + if self.options.with_jemalloc: self.cpp_info.components["libarrow"].requires.append("jemalloc::jemalloc") if self.options.with_mimalloc: self.cpp_info.components["libarrow"].requires.append("mimalloc::mimalloc") - if self._with_re2(): + if self.options.with_re2: if self.options.gandiva: self.cpp_info.components["libgandiva"].requires.append("re2::re2") - if self._parquet(): + if self.options.parquet: self.cpp_info.components["libparquet"].requires.append("re2::re2") self.cpp_info.components["libarrow"].requires.append("re2::re2") - if self._with_llvm(): + if self.options.with_llvm: self.cpp_info.components["libgandiva"].requires.append("llvm-core::llvm-core") - if self._with_protobuf(): + if self.options.with_protobuf: self.cpp_info.components["libarrow"].requires.append("protobuf::protobuf") - if self._with_utf8proc(): + if self.options.with_utf8proc: self.cpp_info.components["libarrow"].requires.append("utf8proc::utf8proc") - if self._with_thrift(): + if self.options.with_thrift: self.cpp_info.components["libarrow"].requires.append("thrift::thrift") if self.options.with_backtrace: self.cpp_info.components["libarrow"].requires.append("libbacktrace::libbacktrace") if self.options.with_cuda: self.cpp_info.components["libarrow"].requires.append("cuda::cuda") - if self._with_rapidjson(): + if self._requires_rapidjson(): 
self.cpp_info.components["libarrow"].requires.append("rapidjson::rapidjson") if self.options.with_s3: self.cpp_info.components["libarrow"].requires.append("aws-sdk-cpp::s3") @@ -742,9 +581,32 @@ def package_info(self): self.cpp_info.components["libarrow"].requires.append("zlib::zlib") if self.options.with_zstd: self.cpp_info.components["libarrow"].requires.append("zstd::zstd") - if self._with_boost(): + if self.options.with_boost: self.cpp_info.components["libarrow"].requires.append("boost::boost") - if self._with_grpc(): + if self.options.with_grpc: self.cpp_info.components["libarrow"].requires.append("grpc::grpc") - if self._with_flight_rpc(): + if self.options.with_flight_rpc: self.cpp_info.components["libarrow_flight"].requires.append("protobuf::protobuf") + + # TODO: to remove in conan v2 + self.cpp_info.filenames["cmake_find_package"] = "Arrow" + self.cpp_info.filenames["cmake_find_package_multi"] = "Arrow" + self.cpp_info.components["libarrow"].names["cmake_find_package"] = "arrow" + self.cpp_info.components["libarrow"].names["cmake_find_package_multi"] = "arrow" + if self.options.parquet: + self.cpp_info.components["libparquet"].names["cmake_find_package"] = "parquet" + self.cpp_info.components["libparquet"].names["cmake_find_package_multi"] = "parquet" + if self.options.get_safe("substrait"): + self.cpp_info.components["libarrow_substrait"].names["cmake_find_package"] = "arrow_substrait" + self.cpp_info.components["libarrow_substrait"].names["cmake_find_package_multi"] = "arrow_substrait" + if self.options.gandiva: + self.cpp_info.components["libgandiva"].names["cmake_find_package"] = "gandiva" + self.cpp_info.components["libgandiva"].names["cmake_find_package_multi"] = "gandiva" + if self.options.with_flight_rpc: + self.cpp_info.components["libarrow_flight"].names["cmake_find_package"] = "flight_rpc" + self.cpp_info.components["libarrow_flight"].names["cmake_find_package_multi"] = "flight_rpc" + if self.options.get_safe("with_flight_sql"): + 
self.cpp_info.components["libarrow_flight_sql"].names["cmake_find_package"] = "flight_sql" + self.cpp_info.components["libarrow_flight_sql"].names["cmake_find_package_multi"] = "flight_sql" + if self.options.cli and (self.options.with_cuda or self.options.with_flight_rpc or self.options.parquet): + self.env_info.PATH.append(os.path.join(self.package_folder, "bin")) diff --git a/ci/conan/config.yml b/ci/conan/config.yml index be333447f348c..3fa90be6f669a 100644 --- a/ci/conan/config.yml +++ b/ci/conan/config.yml @@ -21,6 +21,22 @@ # SOFTWARE. versions: + "15.0.0": + folder: all + "14.0.2": + folder: all + "14.0.1": + folder: all + "14.0.0": + folder: all + "13.0.0": + folder: all + "12.0.1": + folder: all + "12.0.0": + folder: all + "11.0.0": + folder: all "10.0.1": folder: all "10.0.0": @@ -31,7 +47,3 @@ versions: folder: all "7.0.0": folder: all - "2.0.0": - folder: all - "1.0.0": - folder: all diff --git a/ci/docker/linux-apt-r.dockerfile b/ci/docker/linux-apt-r.dockerfile index c59766c4a665c..d93732abb0032 100644 --- a/ci/docker/linux-apt-r.dockerfile +++ b/ci/docker/linux-apt-r.dockerfile @@ -27,6 +27,11 @@ ENV R_PRUNE_DEPS=${r_prune_deps} ARG r_duckdb_dev=FALSE ENV R_DUCKDB_DEV=${r_duckdb_dev} +# This is needed to avoid errors with utf8 characters in some +# R package's DESCRIPTION files +# https://github.com/statnmap/HatchedPolygons/issues/4 +ENV LANG=C.UTF-8 + # Build R # [1] https://www.digitalocean.com/community/tutorials/how-to-install-r-on-ubuntu-18-04 # [2] https://linuxize.com/post/how-to-install-r-on-ubuntu-18-04/#installing-r-packages-from-cran diff --git a/ci/docker/ubuntu-24.04-cpp.dockerfile b/ci/docker/ubuntu-24.04-cpp.dockerfile index 629d532a3dc76..4a37818f94396 100644 --- a/ci/docker/ubuntu-24.04-cpp.dockerfile +++ b/ci/docker/ubuntu-24.04-cpp.dockerfile @@ -29,7 +29,7 @@ RUN echo "debconf debconf/frontend select Noninteractive" | \ # while debugging package list with docker build. 
ARG clang_tools ARG llvm -RUN latest_system_llvm=14 && \ +RUN latest_system_llvm=18 && \ if [ ${llvm} -gt ${latest_system_llvm} -o \ ${clang_tools} -gt ${latest_system_llvm} ]; then \ apt-get update -y -q && \ @@ -127,7 +127,7 @@ RUN if [ "${gcc_version}" = "" ]; then \ g++ \ gcc; \ else \ - if [ "${gcc_version}" -gt "12" ]; then \ + if [ "${gcc_version}" -gt "14" ]; then \ apt-get update -y -q && \ apt-get install -y -q --no-install-recommends software-properties-common && \ add-apt-repository ppa:ubuntu-toolchain-r/volatile; \ diff --git a/ci/scripts/conan_build.sh b/ci/scripts/conan_build.sh index b1ee0a8fc2afd..0ea3fc29192dd 100755 --- a/ci/scripts/conan_build.sh +++ b/ci/scripts/conan_build.sh @@ -30,34 +30,39 @@ export CONAN_HOOK_ERROR_LEVEL=40 conan_args=() conan_args+=(--build=missing) if [ -n "${ARROW_CONAN_PARQUET:-}" ]; then - conan_args+=(--options arrow:parquet=${ARROW_CONAN_PARQUET}) + conan_args+=(--options arrow/*:parquet=${ARROW_CONAN_PARQUET}) + conan_args+=(--options arrow/*:with_thrift=${ARROW_CONAN_PARQUET}) + conan_args+=(--options arrow/*:with_boost=${ARROW_CONAN_PARQUET}) fi if [ -n "${ARROW_CONAN_WITH_BROTLI:-}" ]; then - conan_args+=(--options arrow:with_brotli=${ARROW_CONAN_WITH_BROTLI}) + conan_args+=(--options arrow/*:with_brotli=${ARROW_CONAN_WITH_BROTLI}) fi if [ -n "${ARROW_CONAN_WITH_BZ2:-}" ]; then - conan_args+=(--options arrow:with_bz2=${ARROW_CONAN_WITH_BZ2}) + conan_args+=(--options arrow/*:with_bz2=${ARROW_CONAN_WITH_BZ2}) fi if [ -n "${ARROW_CONAN_WITH_FLIGHT_RPC:-}" ]; then - conan_args+=(--options arrow:with_flight_rpc=${ARROW_CONAN_WITH_FLIGHT_RPC}) + conan_args+=(--options arrow/*:with_flight_rpc=${ARROW_CONAN_WITH_FLIGHT_RPC}) + conan_args+=(--options arrow/*:with_grpc=${ARROW_CONAN_WITH_FLIGHT_RPC}) + conan_args+=(--options arrow/*:with_protobuf=${ARROW_CONAN_WITH_FLIGHT_RPC}) + conan_args+=(--options arrow/*:with_re2=${ARROW_CONAN_WITH_FLIGHT_RPC}) fi if [ -n "${ARROW_CONAN_WITH_GLOG:-}" ]; then - 
conan_args+=(--options arrow:with_glog=${ARROW_CONAN_WITH_GLOG}) + conan_args+=(--options arrow/*:with_glog=${ARROW_CONAN_WITH_GLOG}) fi if [ -n "${ARROW_CONAN_WITH_JEMALLOC:-}" ]; then - conan_args+=(--options arrow:with_jemalloc=${ARROW_CONAN_WITH_JEMALLOC}) + conan_args+=(--options arrow/*:with_jemalloc=${ARROW_CONAN_WITH_JEMALLOC}) fi if [ -n "${ARROW_CONAN_WITH_JSON:-}" ]; then - conan_args+=(--options arrow:with_json=${ARROW_CONAN_WITH_JSON}) + conan_args+=(--options arrow/*:with_json=${ARROW_CONAN_WITH_JSON}) fi if [ -n "${ARROW_CONAN_WITH_LZ4:-}" ]; then - conan_args+=(--options arrow:with_lz4=${ARROW_CONAN_WITH_LZ4}) + conan_args+=(--options arrow/*:with_lz4=${ARROW_CONAN_WITH_LZ4}) fi if [ -n "${ARROW_CONAN_WITH_SNAPPY:-}" ]; then - conan_args+=(--options arrow:with_snappy=${ARROW_CONAN_WITH_SNAPPY}) + conan_args+=(--options arrow/*:with_snappy=${ARROW_CONAN_WITH_SNAPPY}) fi if [ -n "${ARROW_CONAN_WITH_ZSTD:-}" ]; then - conan_args+=(--options arrow:with_zstd=${ARROW_CONAN_WITH_ZSTD}) + conan_args+=(--options arrow/*:with_zstd=${ARROW_CONAN_WITH_ZSTD}) fi version=$(grep '^set(ARROW_VERSION ' ${ARROW_HOME}/cpp/CMakeLists.txt | \ diff --git a/ci/scripts/install_cmake.sh b/ci/scripts/install_cmake.sh index 2f5e5d52051ed..7fdb06d90f02c 100755 --- a/ci/scripts/install_cmake.sh +++ b/ci/scripts/install_cmake.sh @@ -21,7 +21,10 @@ set -e declare -A archs archs=([amd64]=x86_64 - [arm64v8]=aarch64) + [arch64]=aarch64 + [arm64]=aarch64 + [arm64v8]=aarch64 + [x86_64]=x86_64) declare -A platforms platforms=([linux]=linux @@ -38,5 +41,25 @@ platform=${platforms[$2]} version=$3 prefix=$4 -url="https://github.com/Kitware/CMake/releases/download/v${version}/cmake-${version}-${platform}-${arch}.tar.gz" -wget -q ${url} -O - | tar -xzf - --directory ${prefix} --strip-components=1 +mkdir -p ${prefix} +url="https://github.com/Kitware/CMake/releases/download/v${version}/cmake-${version}-${platform}-" +case ${platform} in + macos) + url+="universal.tar.gz" + curl -L ${url} | 
tar -xzf - --directory ${prefix} --strip-components=1 + ln -s CMake.app/Contents/bin ${prefix}/bin + ;; + windows) + url+="${arch}.zip" + archive_name=$(basename ${url}) + curl -L -o ${archive_name} ${url} + unzip ${archive_name} + base_name=$(basename ${archive_name} .zip) + mv ${base_name}/* ${prefix} + rm -rf ${base_name} ${archive_name} + ;; + *) + url+="${arch}.tar.gz" + curl -L ${url} | tar -xzf - --directory ${prefix} --strip-components=1 + ;; +esac diff --git a/ci/scripts/install_sccache.sh b/ci/scripts/install_sccache.sh index 0346c0cc9ce7d..136f39b3ae2ab 100755 --- a/ci/scripts/install_sccache.sh +++ b/ci/scripts/install_sccache.sh @@ -59,7 +59,7 @@ fi # Extract only the sccache binary into $PREFIX and ignore README and LICENSE. # --wildcards doesn't work on busybox. tar -xzvf $SCCACHE_ARCHIVE --strip-component=1 --directory $PREFIX --exclude="sccache*/*E*E*" -chmod u+x $PREFIX/sccache +chmod a+x $PREFIX/sccache if [ -n "${GITHUB_PATH}" ]; then echo "$PREFIX" >> $GITHUB_PATH diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 7d54ccccf7c19..ddea1c399cbba 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -1190,6 +1190,12 @@ if(MSVC AND ARROW_USE_STATIC_CRT) set(Boost_USE_STATIC_RUNTIME ON) endif() set(Boost_ADDITIONAL_VERSIONS + "1.84.0" + "1.84" + "1.83.0" + "1.83" + "1.82.0" + "1.82" "1.81.0" "1.81" "1.80.0" diff --git a/cpp/src/arrow/acero/hash_join_benchmark.cc b/cpp/src/arrow/acero/hash_join_benchmark.cc index 993c0b9a705b4..ad1bd67cc8ec7 100644 --- a/cpp/src/arrow/acero/hash_join_benchmark.cc +++ b/cpp/src/arrow/acero/hash_join_benchmark.cc @@ -83,8 +83,8 @@ class JoinBenchmark { build_metadata["null_probability"] = std::to_string(settings.null_percentage); build_metadata["min"] = std::to_string(min_build_value); build_metadata["max"] = std::to_string(max_build_value); - build_metadata["min_length"] = settings.var_length_min; - 
build_metadata["max_length"] = settings.var_length_max; + build_metadata["min_length"] = std::to_string(settings.var_length_min); + build_metadata["max_length"] = std::to_string(settings.var_length_max); std::unordered_map probe_metadata; probe_metadata["null_probability"] = std::to_string(settings.null_percentage); diff --git a/cpp/src/arrow/csv/writer.cc b/cpp/src/arrow/csv/writer.cc index 0b198759de1e6..5b9c51cda5576 100644 --- a/cpp/src/arrow/csv/writer.cc +++ b/cpp/src/arrow/csv/writer.cc @@ -128,10 +128,20 @@ class ColumnPopulator { // Populators are intented to be applied to reasonably small data. In most cases // threading overhead would not be justified. ctx.set_use_threads(false); - ASSIGN_OR_RAISE( - std::shared_ptr casted, - compute::Cast(data, /*to_type=*/utf8(), compute::CastOptions(), &ctx)); - casted_array_ = checked_pointer_cast(casted); + if (data.type() && is_large_binary_like(data.type()->id())) { + ASSIGN_OR_RAISE(array_, compute::Cast(data, /*to_type=*/large_utf8(), + compute::CastOptions(), &ctx)); + } else { + auto casted = compute::Cast(data, /*to_type=*/utf8(), compute::CastOptions(), &ctx); + if (casted.ok()) { + array_ = std::move(casted).ValueOrDie(); + } else if (casted.status().IsCapacityError()) { + ASSIGN_OR_RAISE(array_, compute::Cast(data, /*to_type=*/large_utf8(), + compute::CastOptions(), &ctx)); + } else { + return casted.status(); + } + } return UpdateRowLengths(row_lengths); } @@ -146,7 +156,8 @@ class ColumnPopulator { protected: virtual Status UpdateRowLengths(int64_t* row_lengths) = 0; - std::shared_ptr casted_array_; + // It must be a `StringArray` or `LargeStringArray`. 
+ std::shared_ptr array_; const std::string end_chars_; std::shared_ptr null_string_; @@ -181,15 +192,28 @@ class UnquotedColumnPopulator : public ColumnPopulator { reject_values_with_quotes_(reject_values_with_quotes) {} Status UpdateRowLengths(int64_t* row_lengths) override { + if (ARROW_PREDICT_TRUE(array_->type_id() == Type::STRING)) { + return UpdateRowLengths(row_lengths); + } else if (ARROW_PREDICT_TRUE(array_->type_id() == Type::LARGE_STRING)) { + return UpdateRowLengths(row_lengths); + } else { + return Status::TypeError("The array must be StringArray or LargeStringArray."); + } + } + + template + Status UpdateRowLengths(int64_t* row_lengths) { + auto casted_array = checked_pointer_cast(array_); if (reject_values_with_quotes_) { // When working on values that, after casting, could produce quotes, // we need to return an error in accord with RFC4180. - RETURN_NOT_OK(CheckStringArrayHasNoStructuralChars(*casted_array_, delimiter_)); + RETURN_NOT_OK(CheckStringArrayHasNoStructuralChars(*casted_array, + delimiter_)); } int64_t row_number = 0; - VisitArraySpanInline( - *casted_array_->data(), + VisitArraySpanInline( + *casted_array->data(), [&](std::string_view s) { row_lengths[row_number] += static_cast(s.length()); row_number++; @@ -202,6 +226,17 @@ class UnquotedColumnPopulator : public ColumnPopulator { } Status PopulateRows(char* output, int64_t* offsets) const override { + if (ARROW_PREDICT_TRUE(array_->type_id() == Type::STRING)) { + return PopulateRows(output, offsets); + } else if (ARROW_PREDICT_TRUE(array_->type_id() == Type::LARGE_STRING)) { + return PopulateRows(output, offsets); + } else { + return Status::TypeError("The array must be StringArray or LargeStringArray."); + } + } + + template + Status PopulateRows(char* output, int64_t* offsets) const { // Function applied to valid values cast to string. 
auto valid_function = [&](std::string_view s) { memcpy(output + *offsets, s.data(), s.length()); @@ -222,13 +257,14 @@ class UnquotedColumnPopulator : public ColumnPopulator { return Status::OK(); }; - return VisitArraySpanInline(*casted_array_->data(), valid_function, - null_function); + return VisitArraySpanInline( + *array_->data(), valid_function, null_function); } private: // Returns an error status if string array has any structural characters. - static Status CheckStringArrayHasNoStructuralChars(const StringArray& array, + template + static Status CheckStringArrayHasNoStructuralChars(const ArrayType& array, const char delimiter) { // scan the underlying string array buffer as a single big string const uint8_t* const data = array.raw_data() + array.value_offset(0); @@ -282,14 +318,26 @@ class QuotedColumnPopulator : public ColumnPopulator { : ColumnPopulator(pool, std::move(end_chars), std::move(null_string)) {} Status UpdateRowLengths(int64_t* row_lengths) override { - const StringArray& input = *casted_array_; + if (ARROW_PREDICT_TRUE(array_->type_id() == Type::STRING)) { + return UpdateRowLengths(row_lengths); + } else if (ARROW_PREDICT_TRUE(array_->type_id() == Type::LARGE_STRING)) { + return UpdateRowLengths(row_lengths); + } else { + return Status::TypeError("The array must be StringArray or LargeStringArray."); + } + } + + template + Status UpdateRowLengths(int64_t* row_lengths) { + auto casted_array = checked_pointer_cast(array_); + const StringArrayType& input = *casted_array; - row_needs_escaping_.resize(casted_array_->length(), false); + row_needs_escaping_.resize(casted_array->length(), false); if (NoQuoteInArray(input)) { // fast path if no quote int row_number = 0; - VisitArraySpanInline( + VisitArraySpanInline( *input.data(), [&](std::string_view s) { row_lengths[row_number] += static_cast(s.length()) + kQuoteCount; @@ -301,7 +349,7 @@ class QuotedColumnPopulator : public ColumnPopulator { }); } else { int row_number = 0; - 
VisitArraySpanInline( + VisitArraySpanInline( *input.data(), [&](std::string_view s) { // Each quote in the value string needs to be escaped. @@ -320,9 +368,20 @@ class QuotedColumnPopulator : public ColumnPopulator { } Status PopulateRows(char* output, int64_t* offsets) const override { + if (ARROW_PREDICT_TRUE(array_->type_id() == Type::STRING)) { + return PopulateRows(output, offsets); + } else if (ARROW_PREDICT_TRUE(array_->type_id() == Type::LARGE_STRING)) { + return PopulateRows(output, offsets); + } else { + return Status::TypeError("The array must be StringArray or LargeStringArray."); + } + } + + template + Status PopulateRows(char* output, int64_t* offsets) const { auto needs_escaping = row_needs_escaping_.begin(); - VisitArraySpanInline( - *(casted_array_->data()), + VisitArraySpanInline( + *array_->data(), [&](std::string_view s) { // still needs string content length to be added char* row = output + *offsets; @@ -355,7 +414,8 @@ class QuotedColumnPopulator : public ColumnPopulator { private: // Returns true if there's no quote in the string array - static bool NoQuoteInArray(const StringArray& array) { + template + static bool NoQuoteInArray(const StringArrayType& array) { const uint8_t* data = array.raw_data() + array.value_offset(0); const int64_t buffer_size = array.total_values_length(); return std::memchr(data, '"', buffer_size) == nullptr; diff --git a/cpp/src/arrow/csv/writer.h b/cpp/src/arrow/csv/writer.h index 4323337212472..d9d79e1660867 100644 --- a/cpp/src/arrow/csv/writer.h +++ b/cpp/src/arrow/csv/writer.h @@ -29,7 +29,8 @@ namespace arrow { namespace csv { // Functionality for converting Arrow data to Comma separated value text. -// This library supports all primitive types that can be cast to a StringArrays. +// This library supports all primitive types that can be cast to a StringArray or +// a LargeStringArray. // It applies to following formatting rules: // - For non-binary types no quotes surround values. 
Nulls are represented as the empty // string. diff --git a/cpp/src/arrow/csv/writer_test.cc b/cpp/src/arrow/csv/writer_test.cc index c1f5622289edb..703179da94093 100644 --- a/cpp/src/arrow/csv/writer_test.cc +++ b/cpp/src/arrow/csv/writer_test.cc @@ -73,17 +73,17 @@ WriteOptions DefaultTestOptions(bool include_header = false, } std::string UtilGetExpectedWithEOL(const std::string& eol) { - return std::string("1,,-1,,,,") + eol + // line 1 - R"(1,"abc""efg",2324,,,,)" + eol + // line 2 - R"(,"abcd",5467,,,,)" + eol + // line 3 - R"(,,,,,,)" + eol + // line 4 - R"(546,"",517,,,,)" + eol + // line 5 - R"(124,"a""""b""",,,,,)" + eol + // line 6 - R"(,,,1970-01-01,,,)" + eol + // line 7 - R"(,,,,1970-01-02,,)" + eol + // line 8 - R"(,,,,,2004-02-29 01:02:03,)" + eol + // line 9 - R"(,,,,,,3600)" + eol + // line 10 - R"(,"NA",,,,,)" + eol; // line 11 + return std::string("1,,-1,,,,,") + eol + // line 1 + R"(1,"abc""efg",2324,,,,,)" + eol + // line 2 + R"(,"abcd",5467,,,,,"efghi")" + eol + // line 3 + R"(,,,,,,,)" + eol + // line 4 + R"(546,"",517,,,,,)" + eol + // line 5 + R"(124,"a""""b""",,,,,,)" + eol + // line 6 + R"(,,,1970-01-01,,,,"jklm")" + eol + // line 7 + R"(,,,,1970-01-02,,,)" + eol + // line 8 + R"(,,,,,2004-02-29 01:02:03,,)" + eol + // line 9 + R"(,,,,,,3600,)" + eol + // line 10 + R"(,"NA",,,,,,)" + eol; // line 11 } std::vector GenerateTestCases() { @@ -100,20 +100,22 @@ std::vector GenerateTestCases() { field("e", date64()), field("f", timestamp(TimeUnit::SECOND)), field("g", duration(TimeUnit::SECOND)), + field("h", large_utf8()), }); auto populated_batch = R"([{"a": 1, "c ": -1}, { "a": 1, "b\"": "abc\"efg", "c ": 2324}, - { "b\"": "abcd", "c ": 5467}, + { "b\"": "abcd", "c ": 5467, "h": "efghi"}, { }, { "a": 546, "b\"": "", "c ": 517 }, { "a": 124, "b\"": "a\"\"b\"" }, - { "d": 0 }, + { "d": 0, "h": "jklm" }, { "e": 86400000 }, { "f": 1078016523 }, { "g": 3600 }, { "b\"": "NA" }])"; - std::string expected_header = std::string(R"("a","b""","c 
","d","e","f","g")") + "\n"; + std::string expected_header = + std::string(R"("a","b""","c ","d","e","f","g","h")") + "\n"; // Expected output without header when using default QuotingStyle::Needed. std::string expected_without_header = UtilGetExpectedWithEOL("\n"); @@ -122,42 +124,42 @@ std::vector GenerateTestCases() { // Expected output without header when using QuotingStyle::AllValid. std::string expected_quoting_style_all_valid = - std::string(R"("1",,"-1",,,,)") + "\n" + // line 1 - R"("1","abc""efg","2324",,,,)" + "\n" + // line 2 - R"(,"abcd","5467",,,,)" + "\n" + // line 3 - R"(,,,,,,)" + "\n" + // line 4 - R"("546","","517",,,,)" + "\n" + // line 5 - R"("124","a""""b""",,,,,)" + "\n" + // line 6 - R"(,,,"1970-01-01",,,)" + "\n" + // line 7 - R"(,,,,"1970-01-02",,)" + "\n" + // line 8 - R"(,,,,,"2004-02-29 01:02:03",)" + "\n" + // line 9 - R"(,,,,,,"3600")" + "\n" + // line 10 - R"(,"NA",,,,,)" + "\n"; // line 11 + std::string(R"("1",,"-1",,,,,)") + "\n" + // line 1 + R"("1","abc""efg","2324",,,,,)" + "\n" + // line 2 + R"(,"abcd","5467",,,,,"efghi")" + "\n" + // line 3 + R"(,,,,,,,)" + "\n" + // line 4 + R"("546","","517",,,,,)" + "\n" + // line 5 + R"("124","a""""b""",,,,,,)" + "\n" + // line 6 + R"(,,,"1970-01-01",,,,"jklm")" + "\n" + // line 7 + R"(,,,,"1970-01-02",,,)" + "\n" + // line 8 + R"(,,,,,"2004-02-29 01:02:03",,)" + "\n" + // line 9 + R"(,,,,,,"3600",)" + "\n" + // line 10 + R"(,"NA",,,,,,)" + "\n"; // line 11 // Batch when testing QuotingStyle::None. The values may not contain any quotes for this // style according to RFC4180. auto populated_batch_quoting_style_none = R"([{"a": 1, "c ": -1}, { "a": 1, "b\"": "abcefg", "c ": 2324}, - { "b\"": "abcd", "c ": 5467}, + { "b\"": "abcd", "c ": 5467, "h": "efghi"}, { }, { "a": 546, "b\"": "", "c ": 517 }, { "a": 124, "b\"": "ab" }, - { "d": 0 }, + { "d": 0, "h": "jklm" }, { "e": 86400000 }, { "f": 1078016523 }, { "g": 3600 }])"; // Expected output for QuotingStyle::None. 
- std::string expected_quoting_style_none = std::string("1,,-1,,,,") + "\n" + // line 1 - R"(1,abcefg,2324,,,,)" + "\n" + // line 2 - R"(,abcd,5467,,,,)" + "\n" + // line 3 - R"(,,,,,,)" + "\n" + // line 4 - R"(546,,517,,,,)" + "\n" + // line 5 - R"(124,ab,,,,,)" + "\n" + // line 6 - R"(,,,1970-01-01,,,)" + "\n" + // line 7 - R"(,,,,1970-01-02,,)" + "\n" + // line 8 - R"(,,,,,2004-02-29 01:02:03,)" + - "\n" + // line 9 - R"(,,,,,,3600)" + "\n"; // line 10 + std::string expected_quoting_style_none = std::string("1,,-1,,,,,") + "\n" + // line 1 + R"(1,abcefg,2324,,,,,)" + "\n" + // line 2 + R"(,abcd,5467,,,,,efghi)" + "\n" + // line 3 + R"(,,,,,,,)" + "\n" + // line 4 + R"(546,,517,,,,,)" + "\n" + // line 5 + R"(124,ab,,,,,,)" + "\n" + // line 6 + R"(,,,1970-01-01,,,,jklm)" + "\n" + // line 7 + R"(,,,,1970-01-02,,,)" + "\n" + // line 8 + R"(,,,,,2004-02-29 01:02:03,,)" + + "\n" + // line 9 + R"(,,,,,,3600,)" + "\n"; // line 10 // Schema and data to test custom null value string. auto schema_custom_na = schema({field("g", uint64()), field("h", utf8())}); diff --git a/cpp/src/arrow/filesystem/test_util.cc b/cpp/src/arrow/filesystem/test_util.cc index 19226ce01ae2f..8eb00b8ae44f3 100644 --- a/cpp/src/arrow/filesystem/test_util.cc +++ b/cpp/src/arrow/filesystem/test_util.cc @@ -752,7 +752,7 @@ void GenericFileSystemTest::TestGetFileInfoSelector(FileSystem* fs) { } void GenericFileSystemTest::TestGetFileInfoGenerator(FileSystem* fs) { -#ifdef ADDRESS_SANITIZER +#if defined(ADDRESS_SANITIZER) || defined(ARROW_VALGRIND) if (have_false_positive_memory_leak_with_generator()) { GTEST_SKIP() << "Filesystem have false positive memory leak with generator"; } diff --git a/cpp/src/arrow/util/value_parsing.cc b/cpp/src/arrow/util/value_parsing.cc index e84aac995e35f..8cecc6365a3b9 100644 --- a/cpp/src/arrow/util/value_parsing.cc +++ b/cpp/src/arrow/util/value_parsing.cc @@ -53,8 +53,11 @@ bool StringToFloat(const char* s, size_t length, char decimal_point, uint16_t* o float 
temp_out; const auto res = ::arrow_vendored::fast_float::from_chars_advanced(s, s + length, temp_out, options); - *out = Float16::FromFloat(temp_out).bits(); - return res.ec == std::errc() && res.ptr == s + length; + const bool ok = res.ec == std::errc() && res.ptr == s + length; + if (ok) { + *out = Float16::FromFloat(temp_out).bits(); + } + return ok; } // ---------------------------------------------------------------------- diff --git a/cpp/src/gandiva/regex_functions_holder.cc b/cpp/src/gandiva/regex_functions_holder.cc index 03a4af90d8991..ef07a9ef0bc9b 100644 --- a/cpp/src/gandiva/regex_functions_holder.cc +++ b/cpp/src/gandiva/regex_functions_holder.cc @@ -99,13 +99,14 @@ Result> LikeHolder::Make(const FunctionNode& node) { "'like' function requires a string literal as the second parameter")); RE2::Options regex_op; + regex_op.set_dot_nl(true); // set dotall mode for the regex. if (node.descriptor()->name() == "ilike") { regex_op.set_case_sensitive(false); // set case-insensitive for ilike function. return Make(std::get(literal->holder()), regex_op); } if (node.children().size() == 2) { - return Make(std::get(literal->holder())); + return Make(std::get(literal->holder()), regex_op); } else { auto escape_char = dynamic_cast(node.children().at(2).get()); ARROW_RETURN_IF( @@ -118,7 +119,7 @@ Result> LikeHolder::Make(const FunctionNode& node) { Status::Invalid( "'like' function requires a string literal as the third parameter")); return Make(std::get(literal->holder()), - std::get(escape_char->holder())); + std::get(escape_char->holder()), regex_op); } } @@ -126,7 +127,9 @@ Result> LikeHolder::Make(const std::string& sql_patt std::string pcre_pattern; ARROW_RETURN_NOT_OK(RegexUtil::SqlLikePatternToPcre(sql_pattern, pcre_pattern)); - auto lholder = std::shared_ptr(new LikeHolder(pcre_pattern)); + RE2::Options regex_op; + regex_op.set_dot_nl(true); // set dotall mode for the regex. 
+ auto lholder = std::shared_ptr(new LikeHolder(pcre_pattern, regex_op)); ARROW_RETURN_IF(!lholder->regex_.ok(), Status::Invalid("Building RE2 pattern '", pcre_pattern, "' failed with: ", lholder->regex_.error())); @@ -135,7 +138,8 @@ Result> LikeHolder::Make(const std::string& sql_patt } Result> LikeHolder::Make(const std::string& sql_pattern, - const std::string& escape_char) { + const std::string& escape_char, + RE2::Options regex_op) { ARROW_RETURN_IF(escape_char.length() > 1, Status::Invalid("The length of escape char ", escape_char, " in 'like' function is greater than 1")); @@ -147,7 +151,7 @@ Result> LikeHolder::Make(const std::string& sql_patt ARROW_RETURN_NOT_OK(RegexUtil::SqlLikePatternToPcre(sql_pattern, pcre_pattern)); } - auto lholder = std::shared_ptr(new LikeHolder(pcre_pattern)); + auto lholder = std::shared_ptr(new LikeHolder(pcre_pattern, regex_op)); ARROW_RETURN_IF(!lholder->regex_.ok(), Status::Invalid("Building RE2 pattern '", pcre_pattern, "' failed with: ", lholder->regex_.error())); diff --git a/cpp/src/gandiva/regex_functions_holder.h b/cpp/src/gandiva/regex_functions_holder.h index 36d942510bb5b..354c2b53d95e1 100644 --- a/cpp/src/gandiva/regex_functions_holder.h +++ b/cpp/src/gandiva/regex_functions_holder.h @@ -40,7 +40,8 @@ class GANDIVA_EXPORT LikeHolder : public FunctionHolder { static Result> Make(const std::string& sql_pattern); static Result> Make(const std::string& sql_pattern, - const std::string& escape_char); + const std::string& escape_char, + RE2::Options regex_op); static Result> Make(const std::string& sql_pattern, RE2::Options regex_op); diff --git a/cpp/src/gandiva/regex_functions_holder_test.cc b/cpp/src/gandiva/regex_functions_holder_test.cc index 534be5987a233..64657e88c6473 100644 --- a/cpp/src/gandiva/regex_functions_holder_test.cc +++ b/cpp/src/gandiva/regex_functions_holder_test.cc @@ -28,6 +28,8 @@ namespace gandiva { class TestLikeHolder : public ::testing::Test { public: RE2::Options regex_op; + void SetUp() { 
regex_op.set_dot_nl(true); } + FunctionNode BuildLike(std::string pattern) { auto field = std::make_shared(arrow::field("in", arrow::utf8())); auto pattern_node = @@ -77,6 +79,14 @@ TEST_F(TestLikeHolder, TestPcreSpecial) { EXPECT_FALSE(like("xxabc")); } +TEST_F(TestLikeHolder, TestPcreSpecialWithNewLine) { + EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("%Space1.%", regex_op)); + + auto& like = *like_holder; + EXPECT_TRUE( + like("[name: \"Space1.protect\"\nargs: \"count\"\ncolumn_name: \"pass_count\"]")); +} + TEST_F(TestLikeHolder, TestRegexEscape) { std::string res; ARROW_EXPECT_OK(RegexUtil::SqlLikePatternToPcre("#%hello#_abc_def##", '#', res)); @@ -91,14 +101,22 @@ TEST_F(TestLikeHolder, TestDot) { EXPECT_FALSE(like("abcd")); } +TEST_F(TestLikeHolder, TestMatchWithNewLine) { + EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("%abc%", regex_op)); + + auto& like = *like_holder; + EXPECT_TRUE(like("abc\nd")); +} + TEST_F(TestLikeHolder, TestMatchSubString) { - EXPECT_OK_AND_ASSIGN(auto like_holder, LikeHolder::Make("%abc%", "\\")); + EXPECT_OK_AND_ASSIGN(auto like_holder, LikeHolder::Make("%abc%", "\\", regex_op)); auto& like = *like_holder; EXPECT_TRUE(like("abc")); EXPECT_FALSE(like("xxabdc")); - EXPECT_OK_AND_ASSIGN(like_holder, LikeHolder::Make("%ab-.^$*+?()[]{}|—/c\\%%", "\\")); + EXPECT_OK_AND_ASSIGN(like_holder, + LikeHolder::Make("%ab-.^$*+?()[]{}|—/c\\%%", "\\", regex_op)); auto& like_reserved_char = *like_holder; EXPECT_TRUE(like_reserved_char("XXab-.^$*+?()[]{}|—/c%d")); @@ -173,7 +191,7 @@ TEST_F(TestLikeHolder, TestOptimise) { } TEST_F(TestLikeHolder, TestMatchOneEscape) { - EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("ab\\_", "\\")); + EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("ab\\_", "\\", regex_op)); auto& like = *like_holder; @@ -187,7 +205,7 @@ TEST_F(TestLikeHolder, TestMatchOneEscape) { } TEST_F(TestLikeHolder, TestMatchManyEscape) { - EXPECT_OK_AND_ASSIGN(auto const 
like_holder, LikeHolder::Make("ab\\%", "\\")); + EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("ab\\%", "\\", regex_op)); auto& like = *like_holder; @@ -201,7 +219,8 @@ TEST_F(TestLikeHolder, TestMatchManyEscape) { } TEST_F(TestLikeHolder, TestMatchEscape) { - EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("ab\\\\", "\\")); + EXPECT_OK_AND_ASSIGN(auto const like_holder, + LikeHolder::Make("ab\\\\", "\\", regex_op)); auto& like = *like_holder; @@ -211,7 +230,7 @@ TEST_F(TestLikeHolder, TestMatchEscape) { } TEST_F(TestLikeHolder, TestEmptyEscapeChar) { - EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("ab\\_", "")); + EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("ab\\_", "", regex_op)); auto& like = *like_holder; @@ -223,7 +242,7 @@ TEST_F(TestLikeHolder, TestEmptyEscapeChar) { } TEST_F(TestLikeHolder, TestMultipleEscapeChar) { - ASSERT_RAISES(Invalid, LikeHolder::Make("ab\\_", "\\\\").status()); + ASSERT_RAISES(Invalid, LikeHolder::Make("ab\\_", "\\\\", regex_op).status()); } class TestILikeHolder : public ::testing::Test { diff --git a/cpp/src/parquet/encoding_benchmark.cc b/cpp/src/parquet/encoding_benchmark.cc index a858c53e931d8..e74a9f55b124f 100644 --- a/cpp/src/parquet/encoding_benchmark.cc +++ b/cpp/src/parquet/encoding_benchmark.cc @@ -396,7 +396,7 @@ struct ByteStreamSplitDummyValue> { using Array = std::array; static constexpr Array value() { - Array array; + Array array{}; array.fill(ByteStreamSplitDummyValue::value()); return array; } diff --git a/csharp/src/Apache.Arrow/Arrays/Array.cs b/csharp/src/Apache.Arrow/Arrays/Array.cs index 0838134b19c6d..4abe63e05ad83 100644 --- a/csharp/src/Apache.Arrow/Arrays/Array.cs +++ b/csharp/src/Apache.Arrow/Arrays/Array.cs @@ -31,7 +31,7 @@ protected Array(ArrayData data) public int Offset => Data.Offset; - public int NullCount => Data.NullCount; + public int NullCount => Data.GetNullCount(); public ArrowBuffer NullBitmapBuffer => Data.Buffers[0]; diff 
--git a/csharp/src/Apache.Arrow/Arrays/ArrayData.cs b/csharp/src/Apache.Arrow/Arrays/ArrayData.cs index 55d77f598c4e4..cdb6ed6b39418 100644 --- a/csharp/src/Apache.Arrow/Arrays/ArrayData.cs +++ b/csharp/src/Apache.Arrow/Arrays/ArrayData.cs @@ -15,7 +15,6 @@ using Apache.Arrow.Memory; using Apache.Arrow.Types; -using Google.FlatBuffers; using System; using System.Collections.Generic; using System.Linq; @@ -28,12 +27,30 @@ public sealed class ArrayData : IDisposable public readonly IArrowType DataType; public readonly int Length; - public readonly int NullCount; + + /// + /// The number of null values in the Array. May be -1 if the null count has not been computed. + /// + public int NullCount; + public readonly int Offset; public readonly ArrowBuffer[] Buffers; public readonly ArrayData[] Children; public readonly ArrayData Dictionary; // Only used for dictionary type + /// + /// Get the number of null values in the Array, computing the count if required. + /// + public int GetNullCount() + { + if (NullCount == RecalculateNullCount) + { + NullCount = ComputeNullCount(); + } + + return NullCount; + } + // This is left for compatibility with lower version binaries // before the dictionary type was supported. 
public ArrayData( @@ -111,7 +128,25 @@ public ArrayData Slice(int offset, int length) length = Math.Min(Length - offset, length); offset += Offset; - return new ArrayData(DataType, length, RecalculateNullCount, offset, Buffers, Children, Dictionary); + int nullCount; + if (NullCount == 0) + { + nullCount = 0; + } + else if (NullCount == Length) + { + nullCount = length; + } + else if (offset == Offset && length == Length) + { + nullCount = NullCount; + } + else + { + nullCount = RecalculateNullCount; + } + + return new ArrayData(DataType, length, nullCount, offset, Buffers, Children, Dictionary); } public ArrayData Clone(MemoryAllocator allocator = default) @@ -125,5 +160,24 @@ public ArrayData Clone(MemoryAllocator allocator = default) Children?.Select(b => b.Clone(allocator))?.ToArray(), Dictionary?.Clone(allocator)); } + + private int ComputeNullCount() + { + if (DataType.TypeId == ArrowTypeId.Union) + { + return UnionArray.ComputeNullCount(this); + } + + if (Buffers == null || Buffers.Length == 0 || Buffers[0].IsEmpty) + { + return 0; + } + + // Note: Dictionary arrays may be logically null if there is a null in the dictionary values, + // but this isn't accounted for by the IArrowArray.IsNull implementation, + // so we maintain consistency with that behaviour here. 
+ + return Length - BitUtility.CountBits(Buffers[0].Span, Offset, Length); + } } } diff --git a/csharp/src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs b/csharp/src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs index 698d74e4bac84..84658a5fab812 100644 --- a/csharp/src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs +++ b/csharp/src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs @@ -71,7 +71,7 @@ public ArrayDataConcatenationVisitor(IReadOnlyList arrayDataList, Mem foreach (ArrayData arrayData in _arrayDataList) { _totalLength += arrayData.Length; - _totalNullCount += arrayData.NullCount; + _totalNullCount += arrayData.GetNullCount(); } } diff --git a/csharp/src/Apache.Arrow/Arrays/DenseUnionArray.cs b/csharp/src/Apache.Arrow/Arrays/DenseUnionArray.cs index b6b61c560e482..79880c894b13d 100644 --- a/csharp/src/Apache.Arrow/Arrays/DenseUnionArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/DenseUnionArray.cs @@ -24,7 +24,7 @@ public class DenseUnionArray : UnionArray { public ArrowBuffer ValueOffsetBuffer => Data.Buffers[1]; - public ReadOnlySpan ValueOffsets => ValueOffsetBuffer.Span.CastTo(); + public ReadOnlySpan ValueOffsets => ValueOffsetBuffer.Span.CastTo().Slice(Offset, Length); public DenseUnionArray( IArrowType dataType, @@ -38,7 +38,6 @@ public DenseUnionArray( dataType, length, nullCount, offset, new[] { typeIds, valuesOffsetBuffer }, children.Select(child => child.Data))) { - _fields = children.ToArray(); ValidateMode(UnionMode.Dense, Type.Mode); } @@ -53,5 +52,28 @@ protected override bool FieldIsValid(IArrowArray fieldArray, int index) { return fieldArray.IsValid(ValueOffsets[index]); } + + internal new static int ComputeNullCount(ArrayData data) + { + var offset = data.Offset; + var length = data.Length; + var typeIds = data.Buffers[0].Span.Slice(offset, length); + var valueOffsets = data.Buffers[1].Span.CastTo().Slice(offset, length); + var childArrays = new IArrowArray[data.Children.Length]; + for (var childIdx = 0; childIdx < data.Children.Length; ++childIdx) 
+ { + childArrays[childIdx] = ArrowArrayFactory.BuildArray(data.Children[childIdx]); + } + + var nullCount = 0; + for (var i = 0; i < length; ++i) + { + var typeId = typeIds[i]; + var valueOffset = valueOffsets[i]; + nullCount += childArrays[typeId].IsNull(valueOffset) ? 1 : 0; + } + + return nullCount; + } } } diff --git a/csharp/src/Apache.Arrow/Arrays/NullArray.cs b/csharp/src/Apache.Arrow/Arrays/NullArray.cs index 762540065c929..7f3e183829243 100644 --- a/csharp/src/Apache.Arrow/Arrays/NullArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/NullArray.cs @@ -95,7 +95,7 @@ public NullArray(int length) public int Offset => Data.Offset; - public int NullCount => Data.NullCount; + public int NullCount => Data.GetNullCount(); public void Dispose() { } public bool IsNull(int index) => true; diff --git a/csharp/src/Apache.Arrow/Arrays/SparseUnionArray.cs b/csharp/src/Apache.Arrow/Arrays/SparseUnionArray.cs index 07d36e25cfc23..5b29489ebb1f0 100644 --- a/csharp/src/Apache.Arrow/Arrays/SparseUnionArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/SparseUnionArray.cs @@ -32,7 +32,6 @@ public SparseUnionArray( dataType, length, nullCount, offset, new[] { typeIds }, children.Select(child => child.Data))) { - _fields = children.ToArray(); ValidateMode(UnionMode.Sparse, Type.Mode); } @@ -47,5 +46,26 @@ protected override bool FieldIsValid(IArrowArray fieldArray, int index) { return fieldArray.IsValid(index); } + + internal new static int ComputeNullCount(ArrayData data) + { + var offset = data.Offset; + var length = data.Length; + var typeIds = data.Buffers[0].Span.Slice(offset, length); + var childArrays = new IArrowArray[data.Children.Length]; + for (var childIdx = 0; childIdx < data.Children.Length; ++childIdx) + { + childArrays[childIdx] = ArrowArrayFactory.BuildArray(data.Children[childIdx]); + } + + var nullCount = 0; + for (var i = 0; i < data.Length; ++i) + { + var typeId = typeIds[i]; + nullCount += childArrays[typeId].IsNull(offset + i) ? 
1 : 0; + } + + return nullCount; + } } } diff --git a/csharp/src/Apache.Arrow/Arrays/UnionArray.cs b/csharp/src/Apache.Arrow/Arrays/UnionArray.cs index 5fcb276655162..c1deb9b651a89 100644 --- a/csharp/src/Apache.Arrow/Arrays/UnionArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/UnionArray.cs @@ -25,7 +25,7 @@ public abstract class UnionArray : IArrowArray protected IReadOnlyList _fields; public IReadOnlyList Fields => - LazyInitializer.EnsureInitialized(ref _fields, () => InitializeFields()); + LazyInitializer.EnsureInitialized(ref _fields, InitializeFields); public ArrayData Data { get; } @@ -35,13 +35,13 @@ public abstract class UnionArray : IArrowArray public ArrowBuffer TypeBuffer => Data.Buffers[0]; - public ReadOnlySpan TypeIds => TypeBuffer.Span; + public ReadOnlySpan TypeIds => TypeBuffer.Span.Slice(Offset, Length); public int Length => Data.Length; public int Offset => Data.Offset; - public int NullCount => Data.NullCount; + public int NullCount => Data.GetNullCount(); public bool IsValid(int index) => NullCount == 0 || FieldIsValid(Fields[TypeIds[index]], index); @@ -91,12 +91,29 @@ protected static void ValidateMode(UnionMode expected, UnionMode actual) } } + internal static int ComputeNullCount(ArrayData data) + { + return ((UnionType)data.DataType).Mode switch + { + UnionMode.Sparse => SparseUnionArray.ComputeNullCount(data), + UnionMode.Dense => DenseUnionArray.ComputeNullCount(data), + _ => throw new InvalidOperationException("unknown union mode in null count computation") + }; + } + private IReadOnlyList InitializeFields() { IArrowArray[] result = new IArrowArray[Data.Children.Length]; for (int i = 0; i < Data.Children.Length; i++) { - result[i] = ArrowArrayFactory.BuildArray(Data.Children[i]); + var childData = Data.Children[i]; + if (Mode == UnionMode.Sparse && (Data.Offset != 0 || childData.Length != Data.Length)) + { + // We only slice the child data for sparse mode, + // so that the sliced value offsets remain valid in dense mode + childData = 
childData.Slice(Data.Offset, Data.Length); + } + result[i] = ArrowArrayFactory.BuildArray(childData); } return result; } diff --git a/csharp/src/Apache.Arrow/C/CArrowArrayExporter.cs b/csharp/src/Apache.Arrow/C/CArrowArrayExporter.cs index 03059eaf5d4df..b241fdfea3bda 100644 --- a/csharp/src/Apache.Arrow/C/CArrowArrayExporter.cs +++ b/csharp/src/Apache.Arrow/C/CArrowArrayExporter.cs @@ -115,7 +115,7 @@ private unsafe static void ConvertArray(ExportedAllocationOwner sharedOwner, Arr { cArray->length = array.Length; cArray->offset = array.Offset; - cArray->null_count = array.NullCount; + cArray->null_count = array.NullCount; // The C Data interface allows the null count to be -1 cArray->release = ReleaseArrayPtr; cArray->private_data = MakePrivateData(sharedOwner); diff --git a/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs b/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs index b002f8c8b1578..7b319b03d790c 100644 --- a/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs +++ b/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs @@ -376,7 +376,7 @@ private void CreateSelfAndChildrenFieldNodes(ArrayData data) CreateSelfAndChildrenFieldNodes(data.Children[i]); } } - Flatbuf.FieldNode.CreateFieldNode(Builder, data.Length, data.NullCount); + Flatbuf.FieldNode.CreateFieldNode(Builder, data.Length, data.GetNullCount()); } private static int CountAllNodes(IReadOnlyList fields) diff --git a/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs b/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs index a0e90cbbc7c61..682ebec323dc0 100644 --- a/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs +++ b/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs @@ -185,6 +185,7 @@ public void SlicePrimitiveArrayWithNulls() TestSlice(x => x.Append(new DateTime(2019, 1, 1)).Append(new DateTime(2019, 1, 2)).AppendNull().Append(new DateTime(2019, 1, 3))); TestSlice(x => x.Append(10).Append(20).AppendNull().Append(30)); TestSlice(x => x.Append(10).Append(20).AppendNull().Append(30)); + TestSlice(x => 
x.AppendNull().AppendNull().AppendNull()); // All nulls static void TestNumberSlice() where T : struct, INumber @@ -314,6 +315,8 @@ private void ValidateArrays(PrimitiveArray slicedArray) .SequenceEqual(slicedArray.Values)); Assert.Equal(baseArray.GetValue(slicedArray.Offset), slicedArray.GetValue(0)); + + ValidateNullCount(slicedArray); } private void ValidateArrays(BooleanArray slicedArray) @@ -333,6 +336,8 @@ private void ValidateArrays(BooleanArray slicedArray) #pragma warning disable CS0618 Assert.Equal(baseArray.GetBoolean(slicedArray.Offset), slicedArray.GetBoolean(0)); #pragma warning restore CS0618 + + ValidateNullCount(slicedArray); } private void ValidateArrays(BinaryArray slicedArray) @@ -347,6 +352,16 @@ private void ValidateArrays(BinaryArray slicedArray) .SequenceEqual(slicedArray.ValueOffsets)); Assert.True(baseArray.GetBytes(slicedArray.Offset).SequenceEqual(slicedArray.GetBytes(0))); + + ValidateNullCount(slicedArray); + } + + private static void ValidateNullCount(IArrowArray slicedArray) + { + var expectedNullCount = Enumerable.Range(0, slicedArray.Length) + .Select(i => slicedArray.IsNull(i) ? 1 : 0) + .Sum(); + Assert.Equal(expectedNullCount, slicedArray.NullCount); } } } diff --git a/csharp/test/Apache.Arrow.Tests/UnionArrayTests.cs b/csharp/test/Apache.Arrow.Tests/UnionArrayTests.cs index 1fb5cf2415c68..712a87a252b6c 100644 --- a/csharp/test/Apache.Arrow.Tests/UnionArrayTests.cs +++ b/csharp/test/Apache.Arrow.Tests/UnionArrayTests.cs @@ -13,6 +13,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+using System; using System.Linq; using Apache.Arrow.Types; using Xunit; @@ -24,17 +25,116 @@ public class UnionArrayTests [Theory] [InlineData(UnionMode.Sparse)] [InlineData(UnionMode.Dense)] - public void UnionArray_IsNull(UnionMode mode) + public void UnionArrayIsNull(UnionMode mode) + { + var (array, expectedNull) = BuildUnionArray(mode, 100); + + for (var i = 0; i < array.Length; ++i) + { + Assert.Equal(expectedNull[i], array.IsNull(i)); + Assert.Equal(!expectedNull[i], array.IsValid(i)); + } + } + + [Theory] + [InlineData(UnionMode.Sparse)] + [InlineData(UnionMode.Dense)] + public void UnionArraySlice(UnionMode mode) + { + var (array, expectedNull) = BuildUnionArray(mode, 10); + + for (var offset = 0; offset < array.Length; ++offset) + { + for (var length = 0; length < array.Length - offset; ++length) + { + var slicedArray = (UnionArray)ArrowArrayFactory.Slice(array, offset, length); + + var nullCount = 0; + for (var i = 0; i < slicedArray.Length; ++i) + { + Assert.Equal(expectedNull[offset + i], slicedArray.IsNull(i)); + Assert.Equal(!expectedNull[offset + i], slicedArray.IsValid(i)); + nullCount += expectedNull[offset + i] ? 1 : 0; + + CompareValue(array, offset + i, slicedArray, i); + } + + Assert.Equal(nullCount, slicedArray.NullCount); + } + } + } + + [Theory] + [InlineData(UnionMode.Sparse)] + [InlineData(UnionMode.Dense)] + public void UnionArrayConstructedWithOffset(UnionMode mode) + { + const int length = 10; + var (array, expectedNull) = BuildUnionArray(mode, length); + + for (var offset = 0; offset < array.Length; ++offset) + { + var (slicedArray, _) = BuildUnionArray(mode, length, offset); + + var nullCount = 0; + for (var i = 0; i < slicedArray.Length; ++i) + { + Assert.Equal(expectedNull[offset + i], slicedArray.IsNull(i)); + Assert.Equal(!expectedNull[offset + i], slicedArray.IsValid(i)); + nullCount += expectedNull[offset + i] ? 
1 : 0; + + CompareValue(array, offset + i, slicedArray, i); + } + + Assert.Equal(nullCount, slicedArray.NullCount); + } + } + + private static void CompareValue(UnionArray originalArray, int originalIndex, UnionArray slicedArray, int sliceIndex) + { + var typeId = originalArray.TypeIds[originalIndex]; + var sliceTypeId = slicedArray.TypeIds[sliceIndex]; + Assert.Equal(typeId, sliceTypeId); + + switch (typeId) + { + case 0: + CompareFieldValue(typeId, originalArray, originalIndex, slicedArray, sliceIndex); + break; + case 1: + CompareFieldValue(typeId, originalArray, originalIndex, slicedArray, sliceIndex); + break; + default: + throw new Exception($"Unexpected type id {typeId}"); + } + } + + private static void CompareFieldValue(byte typeId, UnionArray originalArray, int originalIndex, UnionArray slicedArray, int sliceIndex) + where T: struct + where TArray : PrimitiveArray + { + if (originalArray is DenseUnionArray denseOriginalArray) + { + Assert.IsType(slicedArray); + + originalIndex = denseOriginalArray.ValueOffsets[originalIndex]; + sliceIndex = ((DenseUnionArray)slicedArray).ValueOffsets[sliceIndex]; + } + var originalValue = ((TArray)originalArray.Fields[typeId]).GetValue(originalIndex); + var sliceValue = ((TArray)slicedArray.Fields[typeId]).GetValue(sliceIndex); + Assert.Equal(originalValue, sliceValue); + } + + private static (UnionArray array, bool[] isNull) BuildUnionArray(UnionMode mode, int length, int offset=0) { var fields = new Field[] { new Field("field0", new Int32Type(), true), new Field("field1", new FloatType(), true), }; - var typeIds = fields.Select(f => (int) f.DataType.TypeId).ToArray(); + var typeIds = new[] { 0, 1 }; var type = new UnionType(fields, typeIds, mode); - const int length = 100; var nullCount = 0; var field0Builder = new Int32Array.Builder(); var field1Builder = new FloatArray.Builder(); @@ -44,9 +144,9 @@ public void UnionArray_IsNull(UnionMode mode) for (var i = 0; i < length; ++i) { - var isNull = i % 5 == 0; + var isNull 
= i % 3 == 0; expectedNull[i] = isNull; - nullCount += isNull ? 1 : 0; + nullCount += (isNull && i >= offset) ? 1 : 0; if (i % 2 == 0) { @@ -101,13 +201,9 @@ public void UnionArray_IsNull(UnionMode mode) }; UnionArray array = mode == UnionMode.Dense - ? new DenseUnionArray(type, length, children, typeIdsBuffer, valuesOffsetBuffer, nullCount) - : new SparseUnionArray(type, length, children, typeIdsBuffer, nullCount); + ? new DenseUnionArray(type, length - offset, children, typeIdsBuffer, valuesOffsetBuffer, nullCount, offset) + : new SparseUnionArray(type, length - offset, children, typeIdsBuffer, nullCount, offset); - for (var i = 0; i < length; ++i) - { - Assert.Equal(expectedNull[i], array.IsNull(i)); - Assert.Equal(!expectedNull[i], array.IsValid(i)); - } + return (array, expectedNull); } } diff --git a/dev/release/verify-release-candidate.bat b/dev/release/verify-release-candidate.bat index 8226eb5db360a..06d3016c72af9 100644 --- a/dev/release/verify-release-candidate.bat +++ b/dev/release/verify-release-candidate.bat @@ -122,7 +122,10 @@ cmake --build . 
--target INSTALL --config Release || exit /B 1 @rem Needed so python-test.exe works set PYTHONPATH_ORIGINAL=%PYTHONPATH% set PYTHONPATH=%CONDA_PREFIX%\Lib;%CONDA_PREFIX%\Lib\site-packages;%CONDA_PREFIX%\DLLs;%CONDA_PREFIX%;%PYTHONPATH% -ctest -j%NUMBER_OF_PROCESSORS% --output-on-failure || exit /B 1 +ctest ^ + --build-config Release ^ + --output-on-failure ^ + --parallel %NUMBER_OF_PROCESSORS% || exit /B 1 set PYTHONPATH=%PYTHONPATH_ORIGINAL% popd diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index f18b18aaa997c..cf49751e6e2a9 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -831,7 +831,9 @@ test_glib() { show_header "Build and test C GLib libraries" # Build and test C GLib - maybe_setup_conda glib gobject-introspection meson ninja ruby + # We can unpin gobject-introspection after + # https://github.com/conda-forge/glib-feedstock/pull/174 is merged. + maybe_setup_conda glib gobject-introspection=1.78.1 meson ninja ruby maybe_setup_virtualenv meson # Install bundler if doesn't exist diff --git a/dev/tasks/docker-tests/azure.linux.yml b/dev/tasks/docker-tests/azure.linux.yml deleted file mode 100644 index b66bfbdfe940a..0000000000000 --- a/dev/tasks/docker-tests/azure.linux.yml +++ /dev/null @@ -1,67 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -{% import 'macros.jinja' as macros with context %} - -jobs: -- job: linux - pool: - vmImage: ubuntu-latest - timeoutInMinutes: 360 - {% if env is defined %} - variables: - {% for key, value in env.items() %} - {{ key }}: {{ value }} - {% endfor %} - {% endif %} - - steps: - - task: UsePythonVersion@0 - inputs: - versionSpec: '3.8' - - - script: | - git clone --no-checkout {{ arrow.remote }} arrow - git -C arrow fetch -t {{ arrow.remote }} {{ arrow.branch }} - git -C arrow checkout FETCH_HEAD - git -C arrow submodule update --init --recursive - git -C arrow remote add upstream https://github.com/apache/arrow.git - displayName: Clone arrow - - - script: pip install -e arrow/dev/archery[docker] - displayName: Setup Archery - - - script: | - archery --debug docker --using-docker-cli run \ - -e ARROW_DOCS_VERSION="{{ arrow.no_rc_version }}" \ - -e SETUPTOOLS_SCM_PRETEND_VERSION="{{ arrow.no_rc_version }}" \ - {{ flags|default("") }} \ - {{ image }} \ - {{ command|default("") }} - displayName: Execute Docker Build - env: - {{ macros.azure_set_sccache_envvars()|indent(4) }} - - {% if post_script is defined %} - - script: | - {{ post_script|indent(6) }} - displayName: Post Script - {% endif %} - - {% if artifacts is defined %} - {{ macros.azure_upload_releases(artifacts) }} - {% endif %} diff --git a/dev/tasks/docker-tests/circle.linux.yml b/dev/tasks/docker-tests/circle.linux.yml deleted file mode 100644 index faad449c8ef88..0000000000000 --- a/dev/tasks/docker-tests/circle.linux.yml +++ /dev/null @@ -1,51 +0,0 @@ -# Licensed to the Apache Software Foundation 
(ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -version: 2 -jobs: - build: - machine: - image: ubuntu-1604:202004-01 - {%- if env is defined %} - environment: - {%- for key, value in env.items() %} - {{ key }}: {{ value }} - {%- endfor %} - {%- endif %} - steps: - - run: | - docker -v - docker-compose -v - - run: | - git clone --no-checkout {{ arrow.remote }} arrow - git -C arrow fetch -t {{ arrow.remote }} {{ arrow.branch }} - git -C arrow checkout FETCH_HEAD - git -C arrow submodule update --init --recursive - - run: - name: Execute Docker Build - command: | - pyenv versions - pyenv global 3.8.12 - pip install -e arrow/dev/archery[docker] - archery docker run -e SETUPTOOLS_SCM_PRETEND_VERSION="{{ arrow.no_rc_version }}" {{ run }} - no_output_timeout: "1h" - -workflows: - version: 2 - build: - jobs: - - build diff --git a/dev/tasks/macros.jinja b/dev/tasks/macros.jinja index f55a7f9481e56..1e819d3cf4556 100644 --- a/dev/tasks/macros.jinja +++ b/dev/tasks/macros.jinja @@ -59,7 +59,8 @@ env: {%- macro github_install_archery() -%} - name: Set up Python by actions/setup-python - if: runner.arch == 'X64' + if: | + !(runner.os == 'Linux' && runner.arch != 'X64') uses: actions/setup-python@v4 with: cache: 'pip' @@ -86,7 +87,8 @@ env: {%- macro github_upload_releases(pattern) -%} - 
name: Set up Python by actions/setup-python - if: runner.arch == 'X64' + if: | + !(runner.os == 'Linux' && runner.arch != 'X64') uses: actions/setup-python@v4 with: python-version: 3.12 diff --git a/dev/tasks/python-wheels/github.osx.yml b/dev/tasks/python-wheels/github.osx.yml index cf99c84c60bfd..ce9613545eb54 100644 --- a/dev/tasks/python-wheels/github.osx.yml +++ b/dev/tasks/python-wheels/github.osx.yml @@ -50,6 +50,14 @@ jobs: run: | brew list + # CMake 3.29.1 that is pre-installed on the macOS image has a problem. + # See also: https://github.com/microsoft/vcpkg/issues/37968 + - name: Install CMake 3.29.0 + shell: bash + run: | + arrow/ci/scripts/install_cmake.sh $(arch) macos 3.29.0 ${PWD}/local + echo "${PWD}/local/bin" >> $GITHUB_PATH + - name: Retrieve VCPKG version from arrow/.env run: | vcpkg_version=$(cat "arrow/.env" | grep "VCPKG" | cut -d "=" -f2 | tr -d '"') diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index f98c0a2b48caa..da9d2cefe5f51 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -1049,14 +1049,13 @@ tasks: image: {{ image }} {% endfor %} - # Use azure to run valgrind tests to prevent OOM test-conda-cpp-valgrind: - ci: azure - template: docker-tests/azure.linux.yml + ci: github + template: docker-tests/github.linux.yml params: image: conda-cpp-valgrind -{% for ubuntu_version in ["20.04", "22.04", "24.04"] %} +{% for ubuntu_version in ["20.04", "22.04"] %} test-ubuntu-{{ ubuntu_version }}-cpp: ci: github template: docker-tests/github.linux.yml @@ -1074,13 +1073,25 @@ tasks: UBUNTU: 20.04 image: ubuntu-cpp-bundled + test-ubuntu-24.04-cpp: + ci: github + template: docker-tests/github.linux.yml + params: + env: + CLANG_TOOLS: 15 + LLVM: 15 + UBUNTU: 24.04 + image: ubuntu-cpp + test-ubuntu-24.04-cpp-gcc-14: ci: github template: docker-tests/github.linux.yml params: env: - UBUNTU: "24.04" + CLANG_TOOLS: 15 GCC_VERSION: 14 + LLVM: 15 + UBUNTU: 24.04 # rapidjson 1.1.0 has an error caught by gcc 14. 
# https://github.com/Tencent/rapidjson/issues/718 flags: -e CC=gcc-14 -e CXX=g++-14 -e RapidJSON_SOURCE=BUNDLED @@ -1215,8 +1226,8 @@ tasks: image: conda-python-cython2 test-debian-12-python-3-amd64: - ci: azure - template: docker-tests/azure.linux.yml + ci: github + template: docker-tests/github.linux.yml params: env: DEBIAN: 12 @@ -1233,8 +1244,8 @@ tasks: image: debian-python test-ubuntu-20.04-python-3: - ci: azure - template: docker-tests/azure.linux.yml + ci: github + template: docker-tests/github.linux.yml params: env: UBUNTU: 20.04 @@ -1249,16 +1260,16 @@ tasks: image: ubuntu-python test-fedora-39-python-3: - ci: azure - template: docker-tests/azure.linux.yml + ci: github + template: docker-tests/github.linux.yml params: env: FEDORA: 39 image: fedora-python test-r-linux-valgrind: - ci: azure - template: docker-tests/azure.linux.yml + ci: github + template: docker-tests/github.linux.yml params: env: ARROW_R_DEV: "TRUE" @@ -1419,16 +1430,16 @@ tasks: flags: "-e LIBARROW_MINIMAL=TRUE" test-ubuntu-r-sanitizer: - ci: azure - template: docker-tests/azure.linux.yml + ci: github + template: docker-tests/github.linux.yml params: env: R_PRUNE_DEPS: TRUE image: ubuntu-r-sanitizer test-fedora-r-clang-sanitizer: - ci: azure - template: docker-tests/azure.linux.yml + ci: github + template: docker-tests/github.linux.yml params: env: R_PRUNE_DEPS: TRUE @@ -1436,8 +1447,8 @@ tasks: {% for go_version, staticcheck in [("1.21", "v0.4.7"), ("1.22", "latest")] %} test-debian-12-go-{{ go_version }}: - ci: azure - template: docker-tests/azure.linux.yml + ci: github + template: docker-tests/github.linux.yml params: env: DEBIAN: 12 diff --git a/dev/tasks/vcpkg-tests/cpp-build-vcpkg.bat b/dev/tasks/vcpkg-tests/cpp-build-vcpkg.bat index 8f160ee7c4fd2..3b337bb175005 100644 --- a/dev/tasks/vcpkg-tests/cpp-build-vcpkg.bat +++ b/dev/tasks/vcpkg-tests/cpp-build-vcpkg.bat @@ -78,7 +78,8 @@ cmake --build . 
--target INSTALL --config Release || exit /B 1 @rem Test Arrow C++ library -ctest --output-on-failure ^ +ctest --build-config Release ^ + --output-on-failure ^ --parallel %NUMBER_OF_PROCESSORS% ^ --timeout 300 || exit /B 1 diff --git a/dev/tasks/vcpkg-tests/github.windows.yml b/dev/tasks/vcpkg-tests/github.windows.yml index 618c997c2527b..af12db595286f 100644 --- a/dev/tasks/vcpkg-tests/github.windows.yml +++ b/dev/tasks/vcpkg-tests/github.windows.yml @@ -15,14 +15,9 @@ # specific language governing permissions and limitations # under the License. -# NOTE: must set "Crossbow" as name to have the badge links working in the -# github comment reports! -name: Crossbow +{% import 'macros.jinja' as macros with context %} -on: - push: - branches: - - "*-github-*" +{{ macros.github_header() }} jobs: test-vcpkg-win: @@ -31,12 +26,14 @@ jobs: env: VCPKG_BINARY_SOURCES: 'clear;nuget,GitHub,readwrite' steps: - - name: Checkout Arrow + {{ macros.github_checkout_arrow()|indent }} + # CMake 3.29.1 that is pre-installed on the Windows image has a problem. 
+ # See also: https://github.com/microsoft/vcpkg/issues/37968 + - name: Install CMake 3.29.0 + shell: bash run: | - git clone --no-checkout {{ arrow.remote }} arrow - git -C arrow fetch -t {{ arrow.remote }} {{ arrow.branch }} - git -C arrow checkout FETCH_HEAD - git -C arrow submodule update --init --recursive + arrow/ci/scripts/install_cmake.sh amd64 windows 3.29.0 /c/cmake + echo "c:\\cmake\\bin" >> $GITHUB_PATH - name: Download Timezone Database shell: bash run: arrow/ci/scripts/download_tz_database.sh @@ -59,7 +56,7 @@ jobs: CALL setx PATH "%PATH%;C:\vcpkg" - name: Setup NuGet Credentials shell: bash - env: + env: GITHUB_TOKEN: {{ '${{ secrets.GITHUB_TOKEN }}' }} run: | `vcpkg fetch nuget | tail -n 1` \ diff --git a/docker-compose.yml b/docker-compose.yml index 46717557bc337..60edf1420bc0f 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -711,10 +711,11 @@ services: # Usage: # docker-compose run --rm conan # Parameters: - # CONAN: gcc11, gcc11-armv7, ... + # CONAN_BASE: gcc11, gcc11-armv7, ... + # CONAN_VERSION: 1.62.0 # See https://github.com/conan-io/conan-docker-tools#readme for # available images. 
- image: conanio/${CONAN} + image: conanio/${CONAN_BASE}:${CONAN_VERSION} user: root:root shm_size: *shm-size ulimits: *ulimits @@ -724,7 +725,7 @@ services: - .:/arrow:delegated command: >- /bin/bash -c " - /arrow/ci/scripts/install_sccache.sh unknown-linux-musl /usr/local/bin && + sudo /arrow/ci/scripts/install_sccache.sh unknown-linux-musl /usr/local/bin && /arrow/ci/scripts/conan_setup.sh && /arrow/ci/scripts/conan_build.sh /arrow /build" diff --git a/go/parquet/internal/utils/bit_reader_test.go b/go/parquet/internal/utils/bit_reader_test.go index 3e5d4ed724bc5..91202979520ef 100644 --- a/go/parquet/internal/utils/bit_reader_test.go +++ b/go/parquet/internal/utils/bit_reader_test.go @@ -59,6 +59,23 @@ func TestBitWriter(t *testing.T) { assert.Equal(t, byte(0xAA), buf[0]) assert.Equal(t, byte(0xCC), buf[1]) + + for i := 0; i < 3; i++ { + assert.True(t, bw.WriteVlqInt(uint64(i))) + } + assert.Equal(t, byte(0xAA), buf[0]) + assert.Equal(t, byte(0xCC), buf[1]) + assert.Equal(t, byte(0), buf[2]) + assert.Equal(t, byte(1), buf[3]) + assert.Equal(t, byte(2), buf[4]) +} + +func BenchmarkBitWriter(b *testing.B) { + buf := make([]byte, b.N) + bw := utils.NewBitWriter(utils.NewWriterAtBuffer(buf)) + for i := 0; i < b.N; i++ { + assert.True(b, bw.WriteVlqInt(uint64(1))) + } } func TestBitReader(t *testing.T) { diff --git a/go/parquet/internal/utils/bit_writer.go b/go/parquet/internal/utils/bit_writer.go index 106461d33e048..cf1d6cf13b113 100644 --- a/go/parquet/internal/utils/bit_writer.go +++ b/go/parquet/internal/utils/bit_writer.go @@ -75,6 +75,7 @@ type BitWriter struct { byteoffset int bitoffset uint raw [8]byte + buf [binary.MaxVarintLen64]byte } // NewBitWriter initializes a new bit writer to write to the passed in interface @@ -163,9 +164,8 @@ func (b *BitWriter) WriteAligned(val uint64, nbytes int) bool { // without buffering. 
func (b *BitWriter) WriteVlqInt(v uint64) bool { b.Flush(true) - var buf [binary.MaxVarintLen64]byte - nbytes := binary.PutUvarint(buf[:], v) - if _, err := b.wr.WriteAt(buf[:nbytes], int64(b.byteoffset)); err != nil { + nbytes := binary.PutUvarint(b.buf[:], v) + if _, err := b.wr.WriteAt(b.buf[:nbytes], int64(b.byteoffset)); err != nil { log.Println(err) return false }