Merge remote-tracking branch 'upstream/main' into apachegh-42222-copy-to
jorisvandenbossche committed Aug 14, 2024
2 parents edfb00f + ab432b1 commit 58ba96a
Showing 134 changed files with 6,796 additions and 918 deletions.
11 changes: 6 additions & 5 deletions .github/workflows/r.yml
@@ -86,18 +86,19 @@ jobs:
run: |
sudo apt-get install devscripts
- uses: r-lib/actions/setup-r@v2
# replace the SHA with v2 once INFRA-26031 is resolved
- uses: r-lib/actions/setup-r@732fb28088814627972f1ccbacc02561178cf391
with:
use-public-rspm: true
install-r: false

- uses: r-lib/actions/setup-r-dependencies@v2
- uses: r-lib/actions/setup-r-dependencies@732fb28088814627972f1ccbacc02561178cf391
with:
extra-packages: any::rcmdcheck
needs: check
working-directory: src/r

- uses: r-lib/actions/check-r-package@v2
- uses: r-lib/actions/check-r-package@732fb28088814627972f1ccbacc02561178cf391
with:
working-directory: src/r
env:
@@ -337,11 +338,11 @@ jobs:
cd r/windows
ls *.zip | xargs -n 1 unzip -uo
rm -rf *.zip
- uses: r-lib/actions/setup-r@v2
- uses: r-lib/actions/setup-r@732fb28088814627972f1ccbacc02561178cf391
with:
r-version: ${{ matrix.config.rversion }}
Ncpus: 2
- uses: r-lib/actions/setup-r-dependencies@v2
- uses: r-lib/actions/setup-r-dependencies@732fb28088814627972f1ccbacc02561178cf391
env:
GITHUB_PAT: "${{ github.token }}"
with:
17 changes: 9 additions & 8 deletions c_glib/arrow-cuda-glib/meson.build
@@ -58,14 +58,15 @@ libarrow_cuda_glib = library('arrow-cuda-glib',
arrow_cuda_glib = declare_dependency(link_with: libarrow_cuda_glib,
include_directories: base_include_directories,
dependencies: dependencies)

pkgconfig.generate(libarrow_cuda_glib,
description: 'C API for Apache Arrow CUDA based on GLib',
filebase: 'arrow-cuda-glib',
name: 'Apache Arrow CUDA GLib',
requires: ['arrow-glib', 'arrow-cuda'],
variables: pkgconfig_variables,
version: version)
if target_machine.system() != 'windows'
pkgconfig.generate(libarrow_cuda_glib,
description: 'C API for Apache Arrow CUDA based on GLib',
filebase: 'arrow-cuda-glib',
name: 'Apache Arrow CUDA GLib',
requires: ['arrow-glib', 'arrow-cuda'],
variables: pkgconfig_variables,
version: version)
endif

if have_gi
gir_dependencies = [
1 change: 1 addition & 0 deletions ci/docker/ubuntu-24.04-cpp.dockerfile
@@ -57,6 +57,7 @@ RUN latest_system_llvm=18 && \
clang-${llvm} \
clang-format-${clang_tools} \
clang-tidy-${clang_tools} \
libclang-rt-${llvm}-dev \
llvm-${llvm}-dev && \
apt-get clean && \
rm -rf /var/lib/apt/lists*
7 changes: 3 additions & 4 deletions ci/scripts/java_build.sh
@@ -72,9 +72,6 @@ if [ $ARROW_JAVA_SKIP_GIT_PLUGIN ]; then
mvn="${mvn} -Dmaven.gitcommitid.skip=true"
fi

# Use `2 * ncores` threads
mvn="${mvn} -T 2C"

# https://github.com/apache/arrow/issues/41429
# TODO: We want to do an out-of-source build. This is a workaround. We copy
# all needed files to the build directory from the source directory
@@ -98,10 +95,12 @@ if [ "${ARROW_JAVA_JNI}" = "ON" ]; then
mvn="${mvn} -Darrow.cpp.build.dir=${java_jni_dist_dir} -Parrow-jni"
fi

${mvn} clean install
# Use `2 * ncores` threads
${mvn} -T 2C clean install

if [ "${BUILD_DOCS_JAVA}" == "ON" ]; then
# HTTP pooling is turned off to avoid download issues: https://issues.apache.org/jira/browse/ARROW-11633
# GH-43378: Maven site plugins are not compatible with multithreading
mkdir -p ${build_dir}/docs/java/reference
${mvn} -Dcheckstyle.skip=true -Dhttp.keepAlive=false -Dmaven.wagon.http.pool=false clean install site
rsync -a target/site/apidocs/ ${build_dir}/docs/java/reference
10 changes: 3 additions & 7 deletions ci/scripts/java_test.sh
@@ -38,14 +38,10 @@ pushd ${source_dir}
${mvn} clean test

projects=()
if [ "${ARROW_DATASET}" = "ON" ]; then
projects+=(gandiva)
fi
if [ "${ARROW_GANDIVA}" = "ON" ]; then
projects+=(gandiva)
fi
if [ "${ARROW_ORC}" = "ON" ]; then
if [ "${ARROW_JAVA_JNI}" = "ON" ]; then
projects+=(adapter/orc)
projects+=(dataset)
projects+=(gandiva)
fi
if [ "${#projects[@]}" -gt 0 ]; then
${mvn} clean test \
4 changes: 4 additions & 0 deletions cpp/cmake_modules/FindThriftAlt.cmake
@@ -191,6 +191,10 @@ if(ThriftAlt_FOUND)
# thrift/windows/config.h for Visual C++.
set_target_properties(thrift::thrift PROPERTIES INTERFACE_LINK_LIBRARIES "ws2_32")
endif()
# Workaround: thrift.pc doesn't declare its Boost dependency.
if(TARGET Boost::headers)
target_link_libraries(thrift::thrift INTERFACE Boost::headers)
endif()

if(Thrift_COMPILER_FOUND)
add_executable(thrift::compiler IMPORTED)
5 changes: 5 additions & 0 deletions cpp/cmake_modules/UseCython.cmake
@@ -184,4 +184,9 @@ function(cython_add_module _name pyx_target_name generated_files)
add_dependencies(${_name} ${pyx_target_name})
endfunction()

execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "from Cython.Compiler.Version import version; print(version)"
OUTPUT_VARIABLE CYTHON_VERSION_OUTPUT
OUTPUT_STRIP_TRAILING_WHITESPACE)
set(CYTHON_VERSION "${CYTHON_VERSION_OUTPUT}")

include(CMakeParseArguments)
1 change: 1 addition & 0 deletions cpp/src/arrow/CMakeLists.txt
@@ -907,6 +907,7 @@ endif()
if(ARROW_JSON)
arrow_add_object_library(ARROW_JSON
extension/fixed_shape_tensor.cc
extension/opaque.cc
json/options.cc
json/chunked_builder.cc
json/chunker.cc
6 changes: 1 addition & 5 deletions cpp/src/arrow/array/statistics.h
@@ -20,10 +20,8 @@
#include <cstdint>
#include <optional>
#include <string>
#include <string_view>
#include <variant>

#include "arrow/util/float16.h"
#include "arrow/util/visibility.h"

namespace arrow {
@@ -34,9 +32,7 @@ namespace arrow {
/// as Apache Parquet may have statistics. Statistics associated with
/// a data source can be read through a unified API via this class.
struct ARROW_EXPORT ArrayStatistics {
using ValueType =
std::variant<bool, int8_t, uint8_t, int16_t, uint16_t, int32_t, uint32_t, int64_t,
uint64_t, util::Float16, float, double, std::string, std::string_view>;
using ValueType = std::variant<bool, int64_t, uint64_t, double, std::string>;

/// \brief The number of null values, may not be set
std::optional<int64_t> null_count = std::nullopt;
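The narrowed ValueType above keeps only bool, int64_t, uint64_t, double, and an owned std::string, so callers widen smaller integers and copy string data when filling in statistics. A minimal usage sketch (not part of this commit; it assumes only the ArrayStatistics fields visible in this diff and in the tests below):

#include <cstdint>
#include <iostream>
#include <string>
#include <variant>

#include "arrow/array/statistics.h"

int main() {
  arrow::ArrayStatistics stats;
  stats.null_count = 0;
  stats.min = static_cast<int64_t>(-3);  // e.g. an int32 minimum, widened to int64_t
  stats.max = std::string("zzz");        // strings are owned now, not string_view

  if (stats.min.has_value() && std::holds_alternative<int64_t>(*stats.min)) {
    std::cout << "min=" << std::get<int64_t>(*stats.min) << std::endl;
  }
  return 0;
}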
14 changes: 7 additions & 7 deletions cpp/src/arrow/array/statistics_test.cc
@@ -41,11 +41,11 @@ TEST(ArrayStatisticsTest, TestMin) {
ArrayStatistics statistics;
ASSERT_FALSE(statistics.min.has_value());
ASSERT_FALSE(statistics.is_min_exact.has_value());
statistics.min = static_cast<int32_t>(29);
statistics.min = static_cast<uint64_t>(29);
statistics.is_min_exact = true;
ASSERT_TRUE(statistics.min.has_value());
ASSERT_TRUE(std::holds_alternative<int32_t>(statistics.min.value()));
ASSERT_EQ(29, std::get<int32_t>(statistics.min.value()));
ASSERT_TRUE(std::holds_alternative<uint64_t>(statistics.min.value()));
ASSERT_EQ(29, std::get<uint64_t>(statistics.min.value()));
ASSERT_TRUE(statistics.is_min_exact.has_value());
ASSERT_TRUE(statistics.is_min_exact.value());
}
@@ -79,19 +79,19 @@ TEST(ArrayStatisticsTest, TestEquality) {
statistics2.distinct_count = 2929;
ASSERT_EQ(statistics1, statistics2);

statistics1.min = std::string_view("world");
statistics1.min = std::string("world");
ASSERT_NE(statistics1, statistics2);
statistics2.min = std::string_view("world");
statistics2.min = std::string("world");
ASSERT_EQ(statistics1, statistics2);

statistics1.is_min_exact = false;
ASSERT_NE(statistics1, statistics2);
statistics2.is_min_exact = false;
ASSERT_EQ(statistics1, statistics2);

statistics1.max = arrow::util::Float16(-29);
statistics1.max = static_cast<int64_t>(-29);
ASSERT_NE(statistics1, statistics2);
statistics2.max = arrow::util::Float16(-29);
statistics2.max = static_cast<int64_t>(-29);
ASSERT_EQ(statistics1, statistics2);

statistics1.is_max_exact = true;
23 changes: 23 additions & 0 deletions cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc
@@ -865,6 +865,25 @@ std::shared_ptr<CastFunction> GetCastToHalfFloat() {
return func;
}

struct NullExtensionTypeMatcher : public TypeMatcher {
~NullExtensionTypeMatcher() override = default;

bool Matches(const DataType& type) const override {
return type.id() == Type::EXTENSION &&
checked_cast<const ExtensionType&>(type).storage_id() == Type::NA;
}

std::string ToString() const override { return "extension<storage_type: null>"; }

bool Equals(const TypeMatcher& other) const override {
if (this == &other) {
return true;
}
auto casted = dynamic_cast<const NullExtensionTypeMatcher*>(&other);
return casted != nullptr;
}
};

} // namespace

std::vector<std::shared_ptr<CastFunction>> GetNumericCasts() {
@@ -875,6 +894,10 @@ std::vector<std::shared_ptr<CastFunction>> GetNumericCasts() {
auto cast_null = std::make_shared<CastFunction>("cast_null", Type::NA);
DCHECK_OK(cast_null->AddKernel(Type::DICTIONARY, {InputType(Type::DICTIONARY)}, null(),
OutputAllNull));
// Explicitly allow casting an extension type with a null backing array to null
DCHECK_OK(cast_null->AddKernel(
Type::EXTENSION, {InputType(std::make_shared<NullExtensionTypeMatcher>())}, null(),
OutputAllNull));
functions.push_back(cast_null);

functions.push_back(GetCastToInteger<Int8Type>("cast_int8"));
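With the matcher registered above, cast_null also accepts extension types whose storage type is null. A caller-side sketch of what this enables through the public compute Cast API (illustrative only; the helper name is invented):

#include <memory>

#include "arrow/api.h"
#include "arrow/compute/api.h"

// Cast an array whose type is an extension type backed by null storage to null.
arrow::Result<std::shared_ptr<arrow::Array>> CastExtensionToNull(
    const std::shared_ptr<arrow::Array>& ext_array) {
  ARROW_ASSIGN_OR_RAISE(
      arrow::Datum out,
      arrow::compute::Cast(arrow::Datum(ext_array), arrow::null()));
  return out.make_array();
}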
2 changes: 1 addition & 1 deletion cpp/src/arrow/dataset/dataset_writer_test.cc
@@ -157,7 +157,7 @@ class DatasetWriterTestFixture : public testing::Test {

std::shared_ptr<RecordBatch> ReadAsBatch(std::string_view data, int* num_batches) {
std::shared_ptr<io::RandomAccessFile> in_stream =
std::make_shared<io::BufferReader>(data);
std::make_shared<io::BufferReader>(std::make_shared<Buffer>(data));
EXPECT_OK_AND_ASSIGN(std::shared_ptr<ipc::RecordBatchFileReader> reader,
ipc::RecordBatchFileReader::Open(in_stream));
RecordBatchVector batches;
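The fixture now wraps the std::string_view in a Buffer before constructing the BufferReader instead of passing the view directly. The same pattern in isolation (a sketch, not code from this commit; the caller must keep the data alive because the Buffer is non-owning):

#include <memory>
#include <string_view>

#include "arrow/buffer.h"
#include "arrow/io/memory.h"

// `data` must outlive the returned reader: Buffer(std::string_view) does not
// copy, it only points at the caller's bytes.
std::shared_ptr<arrow::io::RandomAccessFile> MakeReader(std::string_view data) {
  auto buffer = std::make_shared<arrow::Buffer>(data);
  return std::make_shared<arrow::io::BufferReader>(std::move(buffer));
}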
6 changes: 6 additions & 0 deletions cpp/src/arrow/extension/CMakeLists.txt
@@ -21,4 +21,10 @@ add_arrow_test(test
PREFIX
"arrow-fixed-shape-tensor")

add_arrow_test(test
SOURCES
opaque_test.cc
PREFIX
"arrow-extension-opaque")

arrow_install_all_headers("arrow/extension")
109 changes: 109 additions & 0 deletions cpp/src/arrow/extension/opaque.cc
@@ -0,0 +1,109 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "arrow/extension/opaque.h"

#include <sstream>

#include "arrow/json/rapidjson_defs.h" // IWYU pragma: keep
#include "arrow/util/logging.h"

#include <rapidjson/document.h>
#include <rapidjson/error/en.h>
#include <rapidjson/writer.h>

namespace arrow::extension {

std::string OpaqueType::ToString(bool show_metadata) const {
std::stringstream ss;
ss << "extension<" << this->extension_name()
<< "[storage_type=" << storage_type_->ToString(show_metadata)
<< ", type_name=" << type_name_ << ", vendor_name=" << vendor_name_ << "]>";
return ss.str();
}

bool OpaqueType::ExtensionEquals(const ExtensionType& other) const {
if (extension_name() != other.extension_name()) {
return false;
}
const auto& opaque = internal::checked_cast<const OpaqueType&>(other);
return storage_type()->Equals(*opaque.storage_type()) &&
type_name() == opaque.type_name() && vendor_name() == opaque.vendor_name();
}

std::string OpaqueType::Serialize() const {
rapidjson::Document document;
document.SetObject();
rapidjson::Document::AllocatorType& allocator = document.GetAllocator();

rapidjson::Value type_name(rapidjson::StringRef(type_name_));
document.AddMember(rapidjson::Value("type_name", allocator), type_name, allocator);
rapidjson::Value vendor_name(rapidjson::StringRef(vendor_name_));
document.AddMember(rapidjson::Value("vendor_name", allocator), vendor_name, allocator);

rapidjson::StringBuffer buffer;
rapidjson::Writer<rapidjson::StringBuffer> writer(buffer);
document.Accept(writer);
return buffer.GetString();
}

Result<std::shared_ptr<DataType>> OpaqueType::Deserialize(
std::shared_ptr<DataType> storage_type, const std::string& serialized_data) const {
rapidjson::Document document;
const auto& parsed = document.Parse(serialized_data.data(), serialized_data.length());
if (parsed.HasParseError()) {
return Status::Invalid("Invalid serialized JSON data for OpaqueType: ",
rapidjson::GetParseError_En(parsed.GetParseError()), ": ",
serialized_data);
} else if (!document.IsObject()) {
return Status::Invalid("Invalid serialized JSON data for OpaqueType: not an object");
}
if (!document.HasMember("type_name")) {
return Status::Invalid(
"Invalid serialized JSON data for OpaqueType: missing type_name");
} else if (!document.HasMember("vendor_name")) {
return Status::Invalid(
"Invalid serialized JSON data for OpaqueType: missing vendor_name");
}

const auto& type_name = document["type_name"];
const auto& vendor_name = document["vendor_name"];
if (!type_name.IsString()) {
return Status::Invalid(
"Invalid serialized JSON data for OpaqueType: type_name is not a string");
} else if (!vendor_name.IsString()) {
return Status::Invalid(
"Invalid serialized JSON data for OpaqueType: vendor_name is not a string");
}

return opaque(std::move(storage_type), type_name.GetString(), vendor_name.GetString());
}

std::shared_ptr<Array> OpaqueType::MakeArray(std::shared_ptr<ArrayData> data) const {
DCHECK_EQ(data->type->id(), Type::EXTENSION);
DCHECK_EQ("arrow.opaque",
internal::checked_cast<const ExtensionType&>(*data->type).extension_name());
return std::make_shared<OpaqueArray>(data);
}

std::shared_ptr<DataType> opaque(std::shared_ptr<DataType> storage_type,
std::string type_name, std::string vendor_name) {
return std::make_shared<OpaqueType>(std::move(storage_type), std::move(type_name),
std::move(vendor_name));
}

} // namespace arrow::extension
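
A short usage sketch for the new extension type (not part of this commit; it relies only on the opaque() factory, Serialize(), and Deserialize() shown above):

#include <cassert>
#include <memory>
#include <string>

#include "arrow/api.h"
#include "arrow/extension/opaque.h"

int main() {
  // Describe a vendor-specific type whose values are stored as plain binary.
  std::shared_ptr<arrow::DataType> storage = arrow::binary();
  std::shared_ptr<arrow::DataType> type =
      arrow::extension::opaque(storage, "geometry", "acme.db");

  const auto& opaque_type = static_cast<const arrow::extension::OpaqueType&>(*type);

  // Serialize() emits the JSON produced above, e.g.
  // {"type_name":"geometry","vendor_name":"acme.db"}
  std::string payload = opaque_type.Serialize();

  // Deserialize() rebuilds an equal type from the storage type plus that JSON.
  std::shared_ptr<arrow::DataType> roundtripped =
      opaque_type.Deserialize(storage, payload).ValueOrDie();
  assert(roundtripped->Equals(*type));
  return 0;
}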