Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/main' into dataset_encryption
Browse files Browse the repository at this point in the history
  • Loading branch information
jorisvandenbossche committed Oct 9, 2023
2 parents d2f1584 + 293819c commit 2610fbb
Show file tree
Hide file tree
Showing 164 changed files with 9,686 additions and 4,482 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ jobs:
shell: bash
run: |
gem install test-unit
pip install "cython<3" setuptools six pytest jira
pip install "cython>=0.29.31" setuptools six pytest jira
- name: Run Release Test
env:
ARROW_GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }}
Expand Down
13 changes: 0 additions & 13 deletions LICENSE.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1889,19 +1889,6 @@ for PyArrow. Ibis is released under the Apache License, Version 2.0.

--------------------------------------------------------------------------------

This project includes code from the autobrew project.

The following files are based on code from the autobrew project:
* r/tools/autobrew
* dev/tasks/homebrew-formulae/autobrew/apache-arrow.rb
* dev/tasks/homebrew-formulae/autobrew/apache-arrow-static.rb

Copyright (c) 2019, Jeroen Ooms
License: MIT
Homepage: https://github.com/autobrew/

--------------------------------------------------------------------------------

dev/tasks/homebrew-formulae/apache-arrow.rb has the following license:

BSD 2-Clause License
Expand Down
13 changes: 13 additions & 0 deletions ci/appveyor-cpp-build.bat
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,19 @@ set ARROW_HOME=%CONDA_PREFIX%\Library
@rem ARROW-3075; pkgconfig is broken for Parquet for now
set PARQUET_HOME=%CONDA_PREFIX%\Library

@rem Download IANA Timezone Database to a non-standard location to
@rem test the configurability of the timezone database path
curl https://data.iana.org/time-zones/releases/tzdata2021e.tar.gz --output tzdata.tar.gz || exit /B
mkdir %USERPROFILE%\Downloads\test\tzdata
tar --extract --file tzdata.tar.gz --directory %USERPROFILE%\Downloads\test\tzdata
curl https://raw.githubusercontent.com/unicode-org/cldr/master/common/supplemental/windowsZones.xml ^
--output %USERPROFILE%\Downloads\test\tzdata\windowsZones.xml || exit /B
@rem Remove the database from the default location
rmdir /s /q %USERPROFILE%\Downloads\tzdata
@rem Set the env var for the non-standard location of the database
@rem (only needed for testing purposes)
set PYARROW_TZDATA_PATH=%USERPROFILE%\Downloads\test\tzdata

python setup.py develop -q || exit /B

set PYTHONDEVMODE=1
Expand Down
2 changes: 1 addition & 1 deletion ci/conda_env_python.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
# don't add pandas here, because it is not a mandatory test dependency
boto3 # not a direct dependency of s3fs, but needed for our s3fs fixture
cffi
cython<3
cython>=0.29.31
cloudpickle
fsspec
hypothesis
Expand Down
4 changes: 2 additions & 2 deletions ci/scripts/install_pandas.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ pandas=$1
numpy=${2:-"latest"}

if [ "${numpy}" = "nightly" ]; then
pip install --extra-index-url https://pypi.anaconda.org/scipy-wheels-nightly/simple --pre numpy
pip install --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple --pre numpy
elif [ "${numpy}" = "latest" ]; then
pip install numpy
else
Expand All @@ -38,7 +38,7 @@ fi
if [ "${pandas}" = "upstream_devel" ]; then
pip install git+https://github.com/pandas-dev/pandas.git
elif [ "${pandas}" = "nightly" ]; then
pip install --extra-index-url https://pypi.anaconda.org/scipy-wheels-nightly/simple --pre pandas
pip install --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple --pre pandas
elif [ "${pandas}" = "latest" ]; then
pip install pandas
else
Expand Down
3 changes: 0 additions & 3 deletions ci/scripts/integration_dask.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,7 @@ python -c "import dask.dataframe"
# The "skip_with_pyarrow_strings" marker is meant to skip automatically, but that doesn't work with --pyargs, so de-selecting manually
pytest -v --pyargs dask.dataframe.tests.test_dataframe -m "not skip_with_pyarrow_strings"
pytest -v --pyargs dask.dataframe.io.tests.test_orc
# skip failing parquet tests
# test_pandas_timestamp_overflow_pyarrow is skipped because of GH-33321.
pytest -v --pyargs dask.dataframe.io.tests.test_parquet \
-k "not test_pandas_timestamp_overflow_pyarrow" \
-m "not skip_with_pyarrow_strings and not xfail_with_pyarrow_strings"
# this file contains parquet tests that use S3 filesystem
pytest -v --pyargs dask.bytes.tests.test_s3
6 changes: 6 additions & 0 deletions cpp/Brewfile
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@ brew "aws-sdk-cpp"
brew "bash"
brew "boost"
brew "brotli"
brew "bzip2"
brew "c-ares"
brew "curl"
brew "ccache"
brew "cmake"
brew "flatbuffers"
Expand All @@ -29,14 +31,18 @@ brew "googletest"
brew "grpc"
brew "llvm@14"
brew "lz4"
brew "mimalloc"
brew "ninja"
brew "node"
brew "openssl@3"
brew "pkg-config"
brew "protobuf"
brew "python"
brew "rapidjson"
brew "re2"
brew "snappy"
brew "thrift"
brew "utf8proc"
brew "wget"
brew "xsimd"
brew "zstd"
15 changes: 11 additions & 4 deletions cpp/cmake_modules/SetupCxxFlags.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -456,11 +456,18 @@ elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STRE
# Don't complain about optimization passes that were not possible
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-pass-failed")

# Avoid clang / libc++ error about C++17 aligned allocation on macOS.
# See https://chromium.googlesource.com/chromium/src/+/eee44569858fc650b635779c4e34be5cb0c73186%5E%21/#F0
# for details.
if(APPLE)
set(CXX_ONLY_FLAGS "${CXX_ONLY_FLAGS} -fno-aligned-new")
# Avoid clang / libc++ error about C++17 aligned allocation on macOS.
# See https://chromium.googlesource.com/chromium/src/+/eee44569858fc650b635779c4e34be5cb0c73186%5E%21/#F0
# for details.
string(APPEND CXX_ONLY_FLAGS " -fno-aligned-new")

if(CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 20)
# Avoid C++17 std::get 'not available' issue on macOS 10.13
# This will be required until at least R 4.4 is released and
# CRAN (hopefully) stops checking on 10.13
string(APPEND CXX_ONLY_FLAGS " -D_LIBCPP_DISABLE_AVAILABILITY")
endif()
endif()
endif()

Expand Down
21 changes: 21 additions & 0 deletions cpp/cmake_modules/ThirdpartyToolchain.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -1308,13 +1308,34 @@ macro(build_snappy)
set(SNAPPY_CMAKE_ARGS
${EP_COMMON_CMAKE_ARGS} -DSNAPPY_BUILD_TESTS=OFF -DSNAPPY_BUILD_BENCHMARKS=OFF
"-DCMAKE_INSTALL_PREFIX=${SNAPPY_PREFIX}")
# Snappy unconditionally enables Werror when building with clang; this can lead
# to build failures by way of new compiler warnings. This adds a flag to disable
# Werror to the very end of the invocation to override the snappy internal setting.
if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
foreach(CONFIG DEBUG MINSIZEREL RELEASE RELWITHDEBINFO)
list(APPEND
SNAPPY_CMAKE_ARGS
"-DCMAKE_CXX_FLAGS_${UPPERCASE_BUILD_TYPE}=${EP_CXX_FLAGS_${CONFIG}} -Wno-error"
)
endforeach()
endif()

if(APPLE AND CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 20)
# On macOS 10.13 we need to explicitly add <functional> to avoid a missing include error
# This can be removed once CRAN no longer checks on macOS 10.13
find_program(PATCH patch REQUIRED)
set(SNAPPY_PATCH_COMMAND ${PATCH} -p1 -i ${CMAKE_CURRENT_LIST_DIR}/snappy.diff)
else()
set(SNAPPY_PATCH_COMMAND)
endif()

externalproject_add(snappy_ep
${EP_COMMON_OPTIONS}
BUILD_IN_SOURCE 1
INSTALL_DIR ${SNAPPY_PREFIX}
URL ${SNAPPY_SOURCE_URL}
URL_HASH "SHA256=${ARROW_SNAPPY_BUILD_SHA256_CHECKSUM}"
PATCH_COMMAND ${SNAPPY_PATCH_COMMAND}
CMAKE_ARGS ${SNAPPY_CMAKE_ARGS}
BUILD_BYPRODUCTS "${SNAPPY_STATIC_LIB}")

Expand Down
12 changes: 12 additions & 0 deletions cpp/cmake_modules/snappy.diff
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
diff --git a/snappy.cc b/snappy.cc
index d414718..5b0d0d6 100644
--- a/snappy.cc
+++ b/snappy.cc
@@ -83,6 +83,7 @@
#include <string>
#include <utility>
#include <vector>
+#include <functional>

namespace snappy {

16 changes: 9 additions & 7 deletions cpp/src/arrow/adapters/orc/adapter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@ class ORCFileReader::Impl {

Status Init() {
int64_t nstripes = reader_->getNumberOfStripes();
stripes_.resize(nstripes);
stripes_.resize(static_cast<size_t>(nstripes));
std::unique_ptr<liborc::StripeInformation> stripe;
uint64_t first_row_of_stripe = 0;
for (int i = 0; i < nstripes; ++i) {
Expand All @@ -222,7 +222,9 @@ class ORCFileReader::Impl {

int64_t NumberOfRows() { return static_cast<int64_t>(reader_->getNumberOfRows()); }

StripeInformation GetStripeInformation(int64_t stripe) { return stripes_[stripe]; }
StripeInformation GetStripeInformation(int64_t stripe) {
return stripes_[static_cast<size_t>(stripe)];
}

FileVersion GetFileVersion() {
liborc::FileVersion orc_file_version = reader_->getFormatVersion();
Expand Down Expand Up @@ -365,7 +367,7 @@ class ORCFileReader::Impl {
liborc::RowReaderOptions opts = default_row_reader_options();
RETURN_NOT_OK(SelectStripe(&opts, stripe));
ARROW_ASSIGN_OR_RAISE(auto schema, ReadSchema(opts));
return ReadBatch(opts, schema, stripes_[stripe].num_rows);
return ReadBatch(opts, schema, stripes_[static_cast<size_t>(stripe)].num_rows);
}

Result<std::shared_ptr<RecordBatch>> ReadStripe(
Expand All @@ -374,7 +376,7 @@ class ORCFileReader::Impl {
RETURN_NOT_OK(SelectIndices(&opts, include_indices));
RETURN_NOT_OK(SelectStripe(&opts, stripe));
ARROW_ASSIGN_OR_RAISE(auto schema, ReadSchema(opts));
return ReadBatch(opts, schema, stripes_[stripe].num_rows);
return ReadBatch(opts, schema, stripes_[static_cast<size_t>(stripe)].num_rows);
}

Result<std::shared_ptr<RecordBatch>> ReadStripe(
Expand All @@ -383,15 +385,15 @@ class ORCFileReader::Impl {
RETURN_NOT_OK(SelectNames(&opts, include_names));
RETURN_NOT_OK(SelectStripe(&opts, stripe));
ARROW_ASSIGN_OR_RAISE(auto schema, ReadSchema(opts));
return ReadBatch(opts, schema, stripes_[stripe].num_rows);
return ReadBatch(opts, schema, stripes_[static_cast<size_t>(stripe)].num_rows);
}

Status SelectStripe(liborc::RowReaderOptions* opts, int64_t stripe) {
ARROW_RETURN_IF(stripe < 0 || stripe >= NumberOfStripes(),
Status::Invalid("Out of bounds stripe: ", stripe));

opts->range(static_cast<uint64_t>(stripes_[stripe].offset),
static_cast<uint64_t>(stripes_[stripe].length));
opts->range(static_cast<uint64_t>(stripes_[static_cast<size_t>(stripe)].offset),
static_cast<uint64_t>(stripes_[static_cast<size_t>(stripe)].length));
return Status::OK();
}

Expand Down
5 changes: 5 additions & 0 deletions cpp/src/arrow/array/array_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -388,7 +388,12 @@ static std::vector<std::shared_ptr<DataType>> TestArrayUtilitiesAgainstTheseType
large_utf8(),
list(utf8()),
list(int64()), // NOTE: Regression case for ARROW-9071/MakeArrayOfNull
list(large_utf8()),
list(list(int64())),
list(list(large_utf8())),
large_list(utf8()),
large_list(large_utf8()),
large_list(list(large_utf8())),
fixed_size_list(utf8(), 3),
fixed_size_list(int64(), 4),
dictionary(int32(), utf8()),
Expand Down
6 changes: 4 additions & 2 deletions cpp/src/arrow/array/util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -366,9 +366,11 @@ class NullArrayFactory {
}

template <typename T>
enable_if_var_size_list<T, Status> Visit(const T&) {
enable_if_var_size_list<T, Status> Visit(const T& type) {
// values array may be empty, but there must be at least one offset of 0
return MaxOf(sizeof(typename T::offset_type) * (length_ + 1));
RETURN_NOT_OK(MaxOf(sizeof(typename T::offset_type) * (length_ + 1)));
RETURN_NOT_OK(MaxOf(GetBufferLength(type.value_type(), length_)));
return Status::OK();
}

template <typename T>
Expand Down
4 changes: 3 additions & 1 deletion cpp/src/arrow/array/validate.cc
Original file line number Diff line number Diff line change
Expand Up @@ -713,8 +713,10 @@ struct ValidateArrayImpl {
}

// An empty list array can have 0 offsets
const auto required_offsets = (data.length > 0) ? data.length + data.offset + 1 : 0;
const auto offsets_byte_size = data.buffers[1]->size();
const auto required_offsets = ((data.length > 0) || (offsets_byte_size > 0))
? data.length + data.offset + 1
: 0;
if (offsets_byte_size / static_cast<int32_t>(sizeof(offset_type)) <
required_offsets) {
return Status::Invalid("Offsets buffer size (bytes): ", offsets_byte_size,
Expand Down
11 changes: 7 additions & 4 deletions cpp/src/arrow/dataset/dataset_writer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ class DatasetWriterFileQueue {
: options_(options), schema_(schema), writer_state_(writer_state) {}

void Start(util::AsyncTaskScheduler* file_tasks, const std::string& filename) {
file_tasks_ = std::move(file_tasks);
file_tasks_ = file_tasks;
// Because the scheduler runs one task at a time we know the writer will
// be opened before any attempt to write
file_tasks_->AddSimpleTask(
Expand Down Expand Up @@ -575,7 +575,7 @@ class DatasetWriter::DatasetWriterImpl {
}

protected:
Status CloseLargestFile() {
Status TryCloseLargestFile() {
std::shared_ptr<DatasetWriterDirectoryQueue> largest = nullptr;
uint64_t largest_num_rows = 0;
for (auto& dir_queue : directory_queues_) {
Expand All @@ -584,7 +584,10 @@ class DatasetWriter::DatasetWriterImpl {
largest = dir_queue.second;
}
}
DCHECK_NE(largest, nullptr);
if (largest == nullptr) {
// GH-38011: If all written files have written 0 rows, we should not close any file
return Status::OK();
}
return largest->FinishCurrentFile();
}

Expand Down Expand Up @@ -618,7 +621,7 @@ class DatasetWriter::DatasetWriterImpl {
backpressure = writer_state_.open_files_throttle.Acquire(1);
if (!backpressure.is_finished()) {
EVENT_ON_CURRENT_SPAN("DatasetWriter::Backpressure::TooManyOpenFiles");
RETURN_NOT_OK(CloseLargestFile());
RETURN_NOT_OK(TryCloseLargestFile());
break;
}
}
Expand Down
18 changes: 18 additions & 0 deletions cpp/src/arrow/extension/fixed_shape_tensor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,9 @@
#include "arrow/tensor.h"
#include "arrow/util/int_util_overflow.h"
#include "arrow/util/logging.h"
#include "arrow/util/print.h"
#include "arrow/util/sort.h"
#include "arrow/util/string.h"

#include <rapidjson/document.h>
#include <rapidjson/writer.h>
Expand Down Expand Up @@ -104,6 +106,22 @@ bool FixedShapeTensorType::ExtensionEquals(const ExtensionType& other) const {
permutation_equivalent;
}

std::string FixedShapeTensorType::ToString() const {
std::stringstream ss;
ss << "extension<" << this->extension_name()
<< "[value_type=" << value_type_->ToString()
<< ", shape=" << ::arrow::internal::PrintVector{shape_, ","};

if (!permutation_.empty()) {
ss << ", permutation=" << ::arrow::internal::PrintVector{permutation_, ","};
}
if (!dim_names_.empty()) {
ss << ", dim_names=[" << internal::JoinStrings(dim_names_, ",") << "]";
}
ss << "]>";
return ss.str();
}

std::string FixedShapeTensorType::Serialize() const {
rj::Document document;
document.SetObject();
Expand Down
1 change: 1 addition & 0 deletions cpp/src/arrow/extension/fixed_shape_tensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ class ARROW_EXPORT FixedShapeTensorType : public ExtensionType {
dim_names_(dim_names) {}

std::string extension_name() const override { return "arrow.fixed_shape_tensor"; }
std::string ToString() const override;

/// Number of dimensions of tensor elements
size_t ndim() { return shape_.size(); }
Expand Down
28 changes: 28 additions & 0 deletions cpp/src/arrow/extension/fixed_shape_tensor_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -434,4 +434,32 @@ TEST_F(TestExtensionType, ComputeStrides) {
ASSERT_EQ(ext_type_7->Serialize(), R"({"shape":[3,4,7],"permutation":[2,0,1]})");
}

TEST_F(TestExtensionType, ToString) {
auto exact_ext_type = internal::checked_pointer_cast<FixedShapeTensorType>(ext_type_);

auto ext_type_1 = internal::checked_pointer_cast<FixedShapeTensorType>(
fixed_shape_tensor(int16(), {3, 4, 7}));
auto ext_type_2 = internal::checked_pointer_cast<FixedShapeTensorType>(
fixed_shape_tensor(int32(), {3, 4, 7}, {1, 0, 2}));
auto ext_type_3 = internal::checked_pointer_cast<FixedShapeTensorType>(
fixed_shape_tensor(int64(), {3, 4, 7}, {}, {"C", "H", "W"}));

std::string result_1 = ext_type_1->ToString();
std::string expected_1 =
"extension<arrow.fixed_shape_tensor[value_type=int16, shape=[3,4,7]]>";
ASSERT_EQ(expected_1, result_1);

std::string result_2 = ext_type_2->ToString();
std::string expected_2 =
"extension<arrow.fixed_shape_tensor[value_type=int32, shape=[3,4,7], "
"permutation=[1,0,2]]>";
ASSERT_EQ(expected_2, result_2);

std::string result_3 = ext_type_3->ToString();
std::string expected_3 =
"extension<arrow.fixed_shape_tensor[value_type=int64, shape=[3,4,7], "
"dim_names=[C,H,W]]>";
ASSERT_EQ(expected_3, result_3);
}

} // namespace arrow
Loading

0 comments on commit 2610fbb

Please sign in to comment.