Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/main' into dataset_encryption
Browse files Browse the repository at this point in the history
  • Loading branch information
jorisvandenbossche committed Oct 9, 2023
2 parents d2f1584 + 293819c commit 2610fbb
Show file tree
Hide file tree
Showing 164 changed files with 9,686 additions and 4,482 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ jobs:
shell: bash
run: |
gem install test-unit
pip install "cython<3" setuptools six pytest jira
pip install "cython>=0.29.31" setuptools six pytest jira
- name: Run Release Test
env:
ARROW_GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }}
Expand Down
13 changes: 0 additions & 13 deletions LICENSE.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1889,19 +1889,6 @@ for PyArrow. Ibis is released under the Apache License, Version 2.0.

--------------------------------------------------------------------------------

This project includes code from the autobrew project.

The following files are based on code from the autobrew project:
* r/tools/autobrew
* dev/tasks/homebrew-formulae/autobrew/apache-arrow.rb
* dev/tasks/homebrew-formulae/autobrew/apache-arrow-static.rb

Copyright (c) 2019, Jeroen Ooms
License: MIT
Homepage: https://github.com/autobrew/

--------------------------------------------------------------------------------

dev/tasks/homebrew-formulae/apache-arrow.rb has the following license:

BSD 2-Clause License
Expand Down
13 changes: 13 additions & 0 deletions ci/appveyor-cpp-build.bat
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,19 @@ set ARROW_HOME=%CONDA_PREFIX%\Library
@rem ARROW-3075; pkgconfig is broken for Parquet for now
set PARQUET_HOME=%CONDA_PREFIX%\Library

@rem Download IANA Timezone Database to a non-standard location to
@rem test the configurability of the timezone database path
curl https://data.iana.org/time-zones/releases/tzdata2021e.tar.gz --output tzdata.tar.gz || exit /B
mkdir %USERPROFILE%\Downloads\test\tzdata
tar --extract --file tzdata.tar.gz --directory %USERPROFILE%\Downloads\test\tzdata
curl https://raw.githubusercontent.com/unicode-org/cldr/master/common/supplemental/windowsZones.xml ^
--output %USERPROFILE%\Downloads\test\tzdata\windowsZones.xml || exit /B
@rem Remove the database from the default location
rmdir /s /q %USERPROFILE%\Downloads\tzdata
@rem Set the env var for the non-standard location of the database
@rem (only needed for testing purposes)
set PYARROW_TZDATA_PATH=%USERPROFILE%\Downloads\test\tzdata

python setup.py develop -q || exit /B

set PYTHONDEVMODE=1
Expand Down
2 changes: 1 addition & 1 deletion ci/conda_env_python.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
# don't add pandas here, because it is not a mandatory test dependency
boto3 # not a direct dependency of s3fs, but needed for our s3fs fixture
cffi
cython<3
cython>=0.29.31
cloudpickle
fsspec
hypothesis
Expand Down
4 changes: 2 additions & 2 deletions ci/scripts/install_pandas.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ pandas=$1
numpy=${2:-"latest"}

if [ "${numpy}" = "nightly" ]; then
pip install --extra-index-url https://pypi.anaconda.org/scipy-wheels-nightly/simple --pre numpy
pip install --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple --pre numpy
elif [ "${numpy}" = "latest" ]; then
pip install numpy
else
Expand All @@ -38,7 +38,7 @@ fi
if [ "${pandas}" = "upstream_devel" ]; then
pip install git+https://github.com/pandas-dev/pandas.git
elif [ "${pandas}" = "nightly" ]; then
pip install --extra-index-url https://pypi.anaconda.org/scipy-wheels-nightly/simple --pre pandas
pip install --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple --pre pandas
elif [ "${pandas}" = "latest" ]; then
pip install pandas
else
Expand Down
3 changes: 0 additions & 3 deletions ci/scripts/integration_dask.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,7 @@ python -c "import dask.dataframe"
# The "skip_with_pyarrow_strings" marker is meant to skip automatically, but that doesn't work with --pyargs, so de-selecting manually
pytest -v --pyargs dask.dataframe.tests.test_dataframe -m "not skip_with_pyarrow_strings"
pytest -v --pyargs dask.dataframe.io.tests.test_orc
# skip failing parquet tests
# test_pandas_timestamp_overflow_pyarrow is skipped because of GH-33321.
pytest -v --pyargs dask.dataframe.io.tests.test_parquet \
-k "not test_pandas_timestamp_overflow_pyarrow" \
-m "not skip_with_pyarrow_strings and not xfail_with_pyarrow_strings"
# this file contains parquet tests that use S3 filesystem
pytest -v --pyargs dask.bytes.tests.test_s3
6 changes: 6 additions & 0 deletions cpp/Brewfile
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@ brew "aws-sdk-cpp"
brew "bash"
brew "boost"
brew "brotli"
brew "bzip2"
brew "c-ares"
brew "curl"
brew "ccache"
brew "cmake"
brew "flatbuffers"
Expand All @@ -29,14 +31,18 @@ brew "googletest"
brew "grpc"
brew "llvm@14"
brew "lz4"
brew "mimalloc"
brew "ninja"
brew "node"
brew "openssl@3"
brew "pkg-config"
brew "protobuf"
brew "python"
brew "rapidjson"
brew "re2"
brew "snappy"
brew "thrift"
brew "utf8proc"
brew "wget"
brew "xsimd"
brew "zstd"
15 changes: 11 additions & 4 deletions cpp/cmake_modules/SetupCxxFlags.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -456,11 +456,18 @@ elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STRE
# Don't complain about optimization passes that were not possible
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-pass-failed")

# Avoid clang / libc++ error about C++17 aligned allocation on macOS.
# See https://chromium.googlesource.com/chromium/src/+/eee44569858fc650b635779c4e34be5cb0c73186%5E%21/#F0
# for details.
if(APPLE)
set(CXX_ONLY_FLAGS "${CXX_ONLY_FLAGS} -fno-aligned-new")
# Avoid clang / libc++ error about C++17 aligned allocation on macOS.
# See https://chromium.googlesource.com/chromium/src/+/eee44569858fc650b635779c4e34be5cb0c73186%5E%21/#F0
# for details.
string(APPEND CXX_ONLY_FLAGS " -fno-aligned-new")

if(CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 20)
# Avoid C++17 std::get 'not available' issue on macOS 10.13
# This will be required until at least R 4.4 is released and
# CRAN (hopefully) stops checking on 10.13
string(APPEND CXX_ONLY_FLAGS " -D_LIBCPP_DISABLE_AVAILABILITY")
endif()
endif()
endif()

Expand Down
21 changes: 21 additions & 0 deletions cpp/cmake_modules/ThirdpartyToolchain.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -1308,13 +1308,34 @@ macro(build_snappy)
set(SNAPPY_CMAKE_ARGS
${EP_COMMON_CMAKE_ARGS} -DSNAPPY_BUILD_TESTS=OFF -DSNAPPY_BUILD_BENCHMARKS=OFF
"-DCMAKE_INSTALL_PREFIX=${SNAPPY_PREFIX}")
# Snappy unconditionally enables Werror when building with clang; this can lead
# to build failures by way of new compiler warnings. This adds a flag to disable
# Werror to the very end of the invocation to override the snappy internal setting.
if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
foreach(CONFIG DEBUG MINSIZEREL RELEASE RELWITHDEBINFO)
list(APPEND
SNAPPY_CMAKE_ARGS
"-DCMAKE_CXX_FLAGS_${UPPERCASE_BUILD_TYPE}=${EP_CXX_FLAGS_${CONFIG}} -Wno-error"
)
endforeach()
endif()

if(APPLE AND CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 20)
# On macOS 10.13 we need to explicitly add <functional> to avoid a missing include error
# This can be removed once CRAN no longer checks on macOS 10.13
find_program(PATCH patch REQUIRED)
set(SNAPPY_PATCH_COMMAND ${PATCH} -p1 -i ${CMAKE_CURRENT_LIST_DIR}/snappy.diff)
else()
set(SNAPPY_PATCH_COMMAND)
endif()

externalproject_add(snappy_ep
${EP_COMMON_OPTIONS}
BUILD_IN_SOURCE 1
INSTALL_DIR ${SNAPPY_PREFIX}
URL ${SNAPPY_SOURCE_URL}
URL_HASH "SHA256=${ARROW_SNAPPY_BUILD_SHA256_CHECKSUM}"
PATCH_COMMAND ${SNAPPY_PATCH_COMMAND}
CMAKE_ARGS ${SNAPPY_CMAKE_ARGS}
BUILD_BYPRODUCTS "${SNAPPY_STATIC_LIB}")

Expand Down
12 changes: 12 additions & 0 deletions cpp/cmake_modules/snappy.diff
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
diff --git a/snappy.cc b/snappy.cc
index d414718..5b0d0d6 100644
--- a/snappy.cc
+++ b/snappy.cc
@@ -83,6 +83,7 @@
#include <string>
#include <utility>
#include <vector>
+#include <functional>

namespace snappy {

16 changes: 9 additions & 7 deletions cpp/src/arrow/adapters/orc/adapter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@ class ORCFileReader::Impl {

Status Init() {
int64_t nstripes = reader_->getNumberOfStripes();
stripes_.resize(nstripes);
stripes_.resize(static_cast<size_t>(nstripes));
std::unique_ptr<liborc::StripeInformation> stripe;
uint64_t first_row_of_stripe = 0;
for (int i = 0; i < nstripes; ++i) {
Expand All @@ -222,7 +222,9 @@ class ORCFileReader::Impl {

int64_t NumberOfRows() { return static_cast<int64_t>(reader_->getNumberOfRows()); }

StripeInformation GetStripeInformation(int64_t stripe) { return stripes_[stripe]; }
StripeInformation GetStripeInformation(int64_t stripe) {
return stripes_[static_cast<size_t>(stripe)];
}

FileVersion GetFileVersion() {
liborc::FileVersion orc_file_version = reader_->getFormatVersion();
Expand Down Expand Up @@ -365,7 +367,7 @@ class ORCFileReader::Impl {
liborc::RowReaderOptions opts = default_row_reader_options();
RETURN_NOT_OK(SelectStripe(&opts, stripe));
ARROW_ASSIGN_OR_RAISE(auto schema, ReadSchema(opts));
return ReadBatch(opts, schema, stripes_[stripe].num_rows);
return ReadBatch(opts, schema, stripes_[static_cast<size_t>(stripe)].num_rows);
}

Result<std::shared_ptr<RecordBatch>> ReadStripe(
Expand All @@ -374,7 +376,7 @@ class ORCFileReader::Impl {
RETURN_NOT_OK(SelectIndices(&opts, include_indices));
RETURN_NOT_OK(SelectStripe(&opts, stripe));
ARROW_ASSIGN_OR_RAISE(auto schema, ReadSchema(opts));
return ReadBatch(opts, schema, stripes_[stripe].num_rows);
return ReadBatch(opts, schema, stripes_[static_cast<size_t>(stripe)].num_rows);
}

Result<std::shared_ptr<RecordBatch>> ReadStripe(
Expand All @@ -383,15 +385,15 @@ class ORCFileReader::Impl {
RETURN_NOT_OK(SelectNames(&opts, include_names));
RETURN_NOT_OK(SelectStripe(&opts, stripe));
ARROW_ASSIGN_OR_RAISE(auto schema, ReadSchema(opts));
return ReadBatch(opts, schema, stripes_[stripe].num_rows);
return ReadBatch(opts, schema, stripes_[static_cast<size_t>(stripe)].num_rows);
}

Status SelectStripe(liborc::RowReaderOptions* opts, int64_t stripe) {
ARROW_RETURN_IF(stripe < 0 || stripe >= NumberOfStripes(),
Status::Invalid("Out of bounds stripe: ", stripe));

opts->range(static_cast<uint64_t>(stripes_[stripe].offset),
static_cast<uint64_t>(stripes_[stripe].length));
opts->range(static_cast<uint64_t>(stripes_[static_cast<size_t>(stripe)].offset),
static_cast<uint64_t>(stripes_[static_cast<size_t>(stripe)].length));
return Status::OK();
}

Expand Down
5 changes: 5 additions & 0 deletions cpp/src/arrow/array/array_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -388,7 +388,12 @@ static std::vector<std::shared_ptr<DataType>> TestArrayUtilitiesAgainstTheseType
large_utf8(),
list(utf8()),
list(int64()), // NOTE: Regression case for ARROW-9071/MakeArrayOfNull
list(large_utf8()),
list(list(int64())),
list(list(large_utf8())),
large_list(utf8()),
large_list(large_utf8()),
large_list(list(large_utf8())),
fixed_size_list(utf8(), 3),
fixed_size_list(int64(), 4),
dictionary(int32(), utf8()),
Expand Down
6 changes: 4 additions & 2 deletions cpp/src/arrow/array/util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -366,9 +366,11 @@ class NullArrayFactory {
}

template <typename T>
enable_if_var_size_list<T, Status> Visit(const T&) {
enable_if_var_size_list<T, Status> Visit(const T& type) {
// values array may be empty, but there must be at least one offset of 0
return MaxOf(sizeof(typename T::offset_type) * (length_ + 1));
RETURN_NOT_OK(MaxOf(sizeof(typename T::offset_type) * (length_ + 1)));
RETURN_NOT_OK(MaxOf(GetBufferLength(type.value_type(), length_)));
return Status::OK();
}

template <typename T>
Expand Down
4 changes: 3 additions & 1 deletion cpp/src/arrow/array/validate.cc
Original file line number Diff line number Diff line change
Expand Up @@ -713,8 +713,10 @@ struct ValidateArrayImpl {
}

// An empty list array can have 0 offsets
const auto required_offsets = (data.length > 0) ? data.length + data.offset + 1 : 0;
const auto offsets_byte_size = data.buffers[1]->size();
const auto required_offsets = ((data.length > 0) || (offsets_byte_size > 0))
? data.length + data.offset + 1
: 0;
if (offsets_byte_size / static_cast<int32_t>(sizeof(offset_type)) <
required_offsets) {
return Status::Invalid("Offsets buffer size (bytes): ", offsets_byte_size,
Expand Down
11 changes: 7 additions & 4 deletions cpp/src/arrow/dataset/dataset_writer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ class DatasetWriterFileQueue {
: options_(options), schema_(schema), writer_state_(writer_state) {}

void Start(util::AsyncTaskScheduler* file_tasks, const std::string& filename) {
file_tasks_ = std::move(file_tasks);
file_tasks_ = file_tasks;
// Because the scheduler runs one task at a time we know the writer will
// be opened before any attempt to write
file_tasks_->AddSimpleTask(
Expand Down Expand Up @@ -575,7 +575,7 @@ class DatasetWriter::DatasetWriterImpl {
}

protected:
Status CloseLargestFile() {
Status TryCloseLargestFile() {
std::shared_ptr<DatasetWriterDirectoryQueue> largest = nullptr;
uint64_t largest_num_rows = 0;
for (auto& dir_queue : directory_queues_) {
Expand All @@ -584,7 +584,10 @@ class DatasetWriter::DatasetWriterImpl {
largest = dir_queue.second;
}
}
DCHECK_NE(largest, nullptr);
if (largest == nullptr) {
// GH-38011: If all written files have written 0 rows, we should not close any file
return Status::OK();
}
return largest->FinishCurrentFile();
}

Expand Down Expand Up @@ -618,7 +621,7 @@ class DatasetWriter::DatasetWriterImpl {
backpressure = writer_state_.open_files_throttle.Acquire(1);
if (!backpressure.is_finished()) {
EVENT_ON_CURRENT_SPAN("DatasetWriter::Backpressure::TooManyOpenFiles");
RETURN_NOT_OK(CloseLargestFile());
RETURN_NOT_OK(TryCloseLargestFile());
break;
}
}
Expand Down
18 changes: 18 additions & 0 deletions cpp/src/arrow/extension/fixed_shape_tensor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,9 @@
#include "arrow/tensor.h"
#include "arrow/util/int_util_overflow.h"
#include "arrow/util/logging.h"
#include "arrow/util/print.h"
#include "arrow/util/sort.h"
#include "arrow/util/string.h"

#include <rapidjson/document.h>
#include <rapidjson/writer.h>
Expand Down Expand Up @@ -104,6 +106,22 @@ bool FixedShapeTensorType::ExtensionEquals(const ExtensionType& other) const {
permutation_equivalent;
}

std::string FixedShapeTensorType::ToString() const {
std::stringstream ss;
ss << "extension<" << this->extension_name()
<< "[value_type=" << value_type_->ToString()
<< ", shape=" << ::arrow::internal::PrintVector{shape_, ","};

if (!permutation_.empty()) {
ss << ", permutation=" << ::arrow::internal::PrintVector{permutation_, ","};
}
if (!dim_names_.empty()) {
ss << ", dim_names=[" << internal::JoinStrings(dim_names_, ",") << "]";
}
ss << "]>";
return ss.str();
}

std::string FixedShapeTensorType::Serialize() const {
rj::Document document;
document.SetObject();
Expand Down
1 change: 1 addition & 0 deletions cpp/src/arrow/extension/fixed_shape_tensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ class ARROW_EXPORT FixedShapeTensorType : public ExtensionType {
dim_names_(dim_names) {}

std::string extension_name() const override { return "arrow.fixed_shape_tensor"; }
std::string ToString() const override;

/// Number of dimensions of tensor elements
size_t ndim() { return shape_.size(); }
Expand Down
28 changes: 28 additions & 0 deletions cpp/src/arrow/extension/fixed_shape_tensor_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -434,4 +434,32 @@ TEST_F(TestExtensionType, ComputeStrides) {
ASSERT_EQ(ext_type_7->Serialize(), R"({"shape":[3,4,7],"permutation":[2,0,1]})");
}

TEST_F(TestExtensionType, ToString) {
auto exact_ext_type = internal::checked_pointer_cast<FixedShapeTensorType>(ext_type_);

auto ext_type_1 = internal::checked_pointer_cast<FixedShapeTensorType>(
fixed_shape_tensor(int16(), {3, 4, 7}));
auto ext_type_2 = internal::checked_pointer_cast<FixedShapeTensorType>(
fixed_shape_tensor(int32(), {3, 4, 7}, {1, 0, 2}));
auto ext_type_3 = internal::checked_pointer_cast<FixedShapeTensorType>(
fixed_shape_tensor(int64(), {3, 4, 7}, {}, {"C", "H", "W"}));

std::string result_1 = ext_type_1->ToString();
std::string expected_1 =
"extension<arrow.fixed_shape_tensor[value_type=int16, shape=[3,4,7]]>";
ASSERT_EQ(expected_1, result_1);

std::string result_2 = ext_type_2->ToString();
std::string expected_2 =
"extension<arrow.fixed_shape_tensor[value_type=int32, shape=[3,4,7], "
"permutation=[1,0,2]]>";
ASSERT_EQ(expected_2, result_2);

std::string result_3 = ext_type_3->ToString();
std::string expected_3 =
"extension<arrow.fixed_shape_tensor[value_type=int64, shape=[3,4,7], "
"dim_names=[C,H,W]]>";
ASSERT_EQ(expected_3, result_3);
}

} // namespace arrow
Loading

0 comments on commit 2610fbb

Please sign in to comment.