From f3c5fb98ae7673ad94b198b2da4c741013084e46 Mon Sep 17 00:00:00 2001 From: James Henderson Date: Wed, 27 Mar 2024 13:33:35 +0000 Subject: [PATCH 01/81] =?UTF-8?q?GH-40796:=20[Java]=20set=20`lastSet`=20in?= =?UTF-8?q?=20`ListVector.setNull`=20to=20avoid=20O(n=C2=B2)=20in=20ListVe?= =?UTF-8?q?ctors=20with=20lots=20of=20nulls=20(#40810)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Would benefit from someone with knowledge of the context double-checking this doesn't have nuances I'm not aware of - particularly, there's a comment on the field: `the maximum index that is actually set` which one _could_ read to mean 'excluding nulls'? ### Are these changes tested? Yes ### Are there any user-facing changes? No * GitHub Issue: #40796 Authored-by: James Henderson Signed-off-by: David Li --- .../arrow/vector/complex/ListVector.java | 1 + .../apache/arrow/vector/TestValueVector.java | 23 +++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java index 5154ac17279c5..7df659e4cc9da 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java @@ -856,6 +856,7 @@ public void setNull(int index) { offsetBuffer.setInt((i + 1) * OFFSET_WIDTH, currentOffset); } BitVectorHelper.unsetBit(validityBuffer, index); + lastSet = index; } /** diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java index 10091aebdd50b..ad84882c66275 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java @@ -2859,6 +2859,29 @@ public void testListVectorEquals() { } } + @Test + public void testListVectorSetNull() { + try (final ListVector vector = ListVector.empty("list", allocator)) { + UnionListWriter writer = vector.getWriter(); + writer.allocate(); + + writeListVector(writer, new int[] {1, 2}); + writeListVector(writer, new int[] {3, 4}); + writeListVector(writer, new int[] {5, 6}); + vector.setNull(3); + vector.setNull(4); + vector.setNull(5); + writer.setValueCount(6); + + assertEquals(vector.getObject(0), Arrays.asList(1, 2)); + assertEquals(vector.getObject(1), Arrays.asList(3, 4)); + assertEquals(vector.getObject(2), Arrays.asList(5, 6)); + assertTrue(vector.isNull(3)); + assertTrue(vector.isNull(4)); + assertTrue(vector.isNull(5)); + } + } + @Test public void testStructVectorEqualsWithNull() { From 83dc0a91d2f1e238a7e4d033d9373928bd8ab4a3 Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Thu, 28 Mar 2024 03:32:56 +1300 Subject: [PATCH 02/81] GH-40790: [C#] Account for offset and length when getting fields of a StructArray (#40805) ### Rationale for this change See #40790. The `StructArray.Fields` property currently returns the child arrays without accounting for the array offset and length. This meant that consumers would need to know to account for the offset and length themselves when accessing the child arrays, and this is inconsistent with the behaviour of Arrow APIs in other languages. ### What changes are included in this PR? Changes the behaviour of the `StructArray.Fields` property, so that the returned arrays are sliced if required. This behaviour is consistent with the C++ Arrow API, eg. 
see: https://github.com/apache/arrow/blob/f710ac52b049806515a14445b242c3ec819fb99d/cpp/src/arrow/array/array_nested.cc#L1019-L1020 I also checked that pyarrow behaves like this too: ```python import pyarrow as pa a = pa.array([0, 1, 2, 3, 4], type=pa.int32()) b = pa.array([0.0, 0.1, 0.2, 0.3, 0.4], type=pa.float32()) xs = pa.StructArray.from_arrays([a, b], names=["a", "b"]) slice = xs.slice(2, 3) assert len(slice) == 3 assert len(slice.field(0)) == 3 assert len(slice.field(1)) == 3 ``` ### Are these changes tested? Yes, I've added new unit tests. ### Are there any user-facing changes? Yes, this is a user-facing bug fix and behaviour change. **This PR includes breaking changes to public APIs.** The behaviour of `StructArray.Fields` has changed. If users were previously accounting for the array offset and length themselves, this will break existing code. I first tried to make this non-breaking, by introducing a new property to replace `Fields`, and marking that property as obsolete. But `StructArray` implements `IArrowRecord`, so the behaviour of the `IArrowRecord.Column` would either need to be kept as broken, or fixed with a breaking change. It seems simplest and most consistent to fix the behaviour for all methods. If users need to maintain compatibility across different Arrow versions, I'd suggest using a pattern like: ```c# var field = structArray.Fields[0]; if (field.Length != structArray.Length) { field = ArrowArrayFactory.Slice(field, structArray.Offset, structArray.Length); } ``` * GitHub Issue: #40790 Authored-by: Adam Reeve Signed-off-by: Curt Hagenlocher --- csharp/src/Apache.Arrow/Arrays/MapArray.cs | 10 +-- csharp/src/Apache.Arrow/Arrays/StructArray.cs | 10 ++- .../Apache.Arrow.Tests/StructArrayTests.cs | 80 +++++++++++++++++++ 3 files changed, 91 insertions(+), 9 deletions(-) diff --git a/csharp/src/Apache.Arrow/Arrays/MapArray.cs b/csharp/src/Apache.Arrow/Arrays/MapArray.cs index dad50981ea54d..c1dc9688b5a00 100644 --- a/csharp/src/Apache.Arrow/Arrays/MapArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/MapArray.cs @@ -155,10 +155,9 @@ public IEnumerable> GetTuples(int inde // Get key values int start = offsets[index]; int end = offsets[index + 1]; - StructArray array = KeyValues.Slice(start, end - start) as StructArray; - TKeyArray keyArray = array.Fields[0] as TKeyArray; - TValueArray valueArray = array.Fields[1] as TValueArray; + TKeyArray keyArray = KeyValues.Fields[0] as TKeyArray; + TValueArray valueArray = KeyValues.Fields[1] as TValueArray; for (int i = start; i < end; i++) { @@ -173,10 +172,9 @@ public IEnumerable> GetKeyValuePairs _fields; public IReadOnlyList Fields => - LazyInitializer.EnsureInitialized(ref _fields, () => InitializeFields()); + LazyInitializer.EnsureInitialized(ref _fields, InitializeFields); public StructArray( IArrowType dataType, int length, @@ -35,7 +35,6 @@ public StructArray( dataType, length, nullCount, offset, new[] { nullBitmapBuffer }, children.Select(child => child.Data))) { - _fields = children.ToArray(); } public StructArray(ArrayData data) @@ -65,7 +64,12 @@ private IReadOnlyList InitializeFields() IArrowArray[] result = new IArrowArray[Data.Children.Length]; for (int i = 0; i < Data.Children.Length; i++) { - result[i] = ArrowArrayFactory.BuildArray(Data.Children[i]); + var childData = Data.Children[i]; + if (Data.Offset != 0 || childData.Length != Data.Length) + { + childData = childData.Slice(Data.Offset, Data.Length); + } + result[i] = ArrowArrayFactory.BuildArray(childData); } return result; } diff --git 
a/csharp/test/Apache.Arrow.Tests/StructArrayTests.cs b/csharp/test/Apache.Arrow.Tests/StructArrayTests.cs index e2d0fa85137ec..ff5e8d2a5909b 100644 --- a/csharp/test/Apache.Arrow.Tests/StructArrayTests.cs +++ b/csharp/test/Apache.Arrow.Tests/StructArrayTests.cs @@ -17,6 +17,7 @@ using Apache.Arrow.Types; using System.Collections.Generic; using System.IO; +using System.Linq; using Xunit; namespace Apache.Arrow.Tests @@ -121,6 +122,85 @@ public void TestListOfStructArray() TestRoundTripRecordBatch(batch); } + [Fact] + public void TestSliceStructArray() + { + const int numRows = 10; + var fields = new List + { + new Field.Builder().Name("ints").DataType(new Int32Type()).Nullable(true).Build(), + new Field.Builder().Name("doubles").DataType(new DoubleType()).Nullable(true).Build(), + }; + var arrays = new List + { + new Int32Array.Builder().AppendRange(Enumerable.Range(0, numRows)).Build(), + new DoubleArray.Builder().AppendRange(Enumerable.Range(0, numRows).Select(i => i * 0.1)).Build(), + }; + + var nullBitmap = new ArrowBuffer.BitmapBuilder().AppendRange(true, numRows).Build(); + var array = new StructArray(new StructType(fields), numRows, arrays, nullBitmap, nullCount: 0); + + var slicedArray = (StructArray) array.Slice(3, 4); + + Assert.Equal(4, slicedArray.Length); + Assert.Equal(2, slicedArray.Fields.Count); + + var slicedInts = slicedArray.Fields[0]; + var expectedInts = Enumerable.Range(3, 4).Select(val => (int?) val).ToArray(); + Assert.Equal(expectedInts, (IReadOnlyList) slicedInts); + + var slicedDoubles = slicedArray.Fields[1]; + var expectedDoubles = Enumerable.Range(3, 4).Select(val => (double?) (val * 0.1)).ToArray(); + Assert.Equal(expectedDoubles, (IReadOnlyList) slicedDoubles); + } + + [Fact] + public void TestStructArrayConstructedWithOffset() + { + const int dataNumRows = 10; + const int arrayLength = 4; + const int arrayOffset = 3; + + var fields = new List + { + new Field.Builder().Name("ints").DataType(new Int32Type()).Nullable(true).Build(), + new Field.Builder().Name("doubles").DataType(new DoubleType()).Nullable(true).Build(), + }; + var arrays = new List + { + new Int32Array.Builder().AppendRange(Enumerable.Range(0, dataNumRows)).Build(), + new DoubleArray.Builder().AppendRange(Enumerable.Range(0, dataNumRows).Select(i => i * 0.1)).Build(), + }; + + var nullBitmap = new ArrowBuffer.BitmapBuilder().AppendRange(true, dataNumRows).Build(); + var array = new StructArray( + new StructType(fields), arrayLength, arrays, nullBitmap, nullCount: 0, offset: arrayOffset); + + Assert.Equal(4, array.Length); + Assert.Equal(3, array.Offset); + Assert.Equal(2, array.Fields.Count); + + var slicedInts = array.Fields[0]; + var expectedInts = Enumerable.Range(3, 4).Select(val => (int?) val).ToArray(); + Assert.Equal(expectedInts, (IReadOnlyList) slicedInts); + + var slicedDoubles = array.Fields[1]; + var expectedDoubles = Enumerable.Range(3, 4).Select(val => (double?) (val * 0.1)).ToArray(); + Assert.Equal(expectedDoubles, (IReadOnlyList) slicedDoubles); + + var subSlice = (StructArray) array.Slice(1, 2); + Assert.Equal(2, subSlice.Length); + Assert.Equal(2, subSlice.Fields.Count); + + var subSlicedInts = subSlice.Fields[0]; + var expectedSubSliceInts = Enumerable.Range(4, 2).Select(val => (int?) val).ToArray(); + Assert.Equal(expectedSubSliceInts, (IReadOnlyList) subSlicedInts); + + var subSlicedDoubles = subSlice.Fields[1]; + var expectedSubSliceDoubles = Enumerable.Range(4, 2).Select(val => (double?) 
(val * 0.1)).ToArray(); + Assert.Equal(expectedSubSliceDoubles, (IReadOnlyList) subSlicedDoubles); + } + private static void TestRoundTripRecordBatch(RecordBatch originalBatch) { using (MemoryStream stream = new MemoryStream()) From dc2c5c66f5234a92169da76613399135786dbffb Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Thu, 28 Mar 2024 05:27:36 +1300 Subject: [PATCH 03/81] MINOR: [C++] Remove misleading comment on FileKeyUnwrapper constructor (#40808) ### Rationale for this change I added this comment in #34181, but from the discussion in https://github.com/apache/arrow/pull/40732#discussion_r1535001401, I realised this comment was incorrect. The extra overload appears to just be a convenience as a `FileKeyMaterialStore` is already constructed in `KeyToolkit::RotateMasterKeys`, but the store isn't actually used by the `FileKeyUnwrapper` in that method, as only `FileKeyUnwrapper::GetDataEncryptionKey` is called, which bypasses the store. `RotateMasterKeys` does however rely on the `temp_key_material_store` passed to the `FileKeyWrapper` being used, which is possibly where this confusion came from. ### What changes are included in this PR? Removes an incorrect statement from a C++ header comment. ### Are these changes tested? NA ### Are there any user-facing changes? No Authored-by: Adam Reeve Signed-off-by: Antoine Pitrou --- cpp/src/parquet/encryption/file_key_unwrapper.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/parquet/encryption/file_key_unwrapper.h b/cpp/src/parquet/encryption/file_key_unwrapper.h index c60c0c71ba5e0..6147abbecd3e6 100644 --- a/cpp/src/parquet/encryption/file_key_unwrapper.h +++ b/cpp/src/parquet/encryption/file_key_unwrapper.h @@ -57,8 +57,7 @@ class PARQUET_EXPORT FileKeyUnwrapper : public DecryptionKeyRetriever { /// Constructor overload that takes a raw pointer to the KeyToolkit and /// accepts an existing key_material_store rather than using - /// the file path and file system to create one when needed. This is useful for key - /// rotation to allow accessing the key material store after it is used. + /// the file path and file system to create one when needed. FileKeyUnwrapper(KeyToolkit* key_toolkit, const KmsConnectionConfig& kms_connection_config, double cache_lifetime_seconds, From 515c61dd617e65c01a6e40e570487ad4ae9f151c Mon Sep 17 00:00:00 2001 From: James Henderson Date: Wed, 27 Mar 2024 18:37:16 +0000 Subject: [PATCH 04/81] GH-40773: [Java] add `DENSEUNION` case to StructWriters, resolves #40773 (#40809) ### What changes are included in this PR? Adding a `DENSEUNION` case to the `StructWriters` template so that one can create StructVectors with a DenseUnionVector child. ### Are these changes tested? Yes ### Are there any user-facing changes? 
No * GitHub Issue: #40773 Authored-by: James Henderson Signed-off-by: David Li --- .../src/main/codegen/templates/StructWriters.java | 6 ++++++ .../org/apache/arrow/vector/TestValueVector.java | 15 +++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/java/vector/src/main/codegen/templates/StructWriters.java b/java/vector/src/main/codegen/templates/StructWriters.java index 84e5d8113b321..b6dd2b75c526a 100644 --- a/java/vector/src/main/codegen/templates/StructWriters.java +++ b/java/vector/src/main/codegen/templates/StructWriters.java @@ -73,6 +73,12 @@ public class ${mode}StructWriter extends AbstractFieldWriter { map(child.getName(), arrowType.getKeysSorted()); break; } + case DENSEUNION: { + FieldType fieldType = new FieldType(addVectorAsNullable, MinorType.DENSEUNION.getType(), null, null); + DenseUnionWriter writer = new DenseUnionWriter(container.addOrGet(child.getName(), fieldType, DenseUnionVector.class), getNullableStructWriterFactory()); + fields.put(handleCase(child.getName()), writer); + break; + } case UNION: FieldType fieldType = new FieldType(addVectorAsNullable, MinorType.UNION.getType(), null, null); UnionWriter writer = new UnionWriter(container.addOrGet(child.getName(), fieldType, UnionVector.class), getNullableStructWriterFactory()); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java index ad84882c66275..3e53512f7338f 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java @@ -60,6 +60,7 @@ import org.apache.arrow.vector.testing.ValueVectorDataPopulator; import org.apache.arrow.vector.types.Types; import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.UnionMode; import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.FieldType; @@ -2974,6 +2975,20 @@ public void testStructVectorEqualsWithDiffChild() { } } + @Test + public void testStructVectorAcceptsDenseUnionChild() { + Field childField = new Field("child", + FieldType.notNullable(new ArrowType.Union(UnionMode.Dense, new int[] {})), + Collections.emptyList()); + Field structField = new Field("struct", + FieldType.notNullable(ArrowType.Struct.INSTANCE), + Collections.singletonList(childField)); + + try (FieldVector structVec = structField.createVector(allocator)) { + assertEquals(structField, structVec.getField()); + } + } + @Test public void testUnionVectorEquals() { try (final UnionVector vector1 = new UnionVector("union", allocator, /* field type */ null, /* call-back */ null); From 2146ab10e653f927a6e92d29ee0910f30f4cb996 Mon Sep 17 00:00:00 2001 From: sullis Date: Wed, 27 Mar 2024 13:32:45 -0700 Subject: [PATCH 05/81] MINOR: [Java] Bump Netty to 4.1.108.Final (#40491) ### Rationale for this change [Java] bump to latest version of Netty https://netty.io/news/2024/02/13/4-1-107-Final.html https://netty.io/news/2024/03/21/4-1-108-Final.html ### What changes are included in this PR? modified Java pom.xml ### Are these changes tested? GitHub Actions CI build ### Are there any user-facing changes? 
No Authored-by: sullis Signed-off-by: David Li --- java/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/pom.xml b/java/pom.xml index b064d07e1e0dc..add2823ccb0d2 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -33,7 +33,7 @@ 5.10.2 2.0.11 33.0.0-jre - 4.1.106.Final + 4.1.108.Final 1.61.1 3.23.1 2.17.0 From c9cb3fa85c1e9927fc473e1459a4fd5633614003 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 28 Mar 2024 09:38:49 +0900 Subject: [PATCH 06/81] GH-40586: [Dev][C++][Python][R] Use pre-commit for clang-format (#40587) ### Rationale for this change We can run `clang-format` easily than `archery lint` by using `pre-commit`: * We don't need to install `clang-format-14` separately because `pre-commit` prepare it automatically. * We don't need to run `cmake` to run `clang-format-14`. ### What changes are included in this PR? Add `clang-format` related `pre-commit` configurations. This doesn't change `archery lint` because our `pre-commit` configurations can't replace `archery lint` entirely yet. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. * GitHub Issue: #40586 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- .pre-commit-config.yaml | 44 ++++++++++++++++++++++ cpp/src/arrow/util/windows_compatibility.h | 1 - 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a08f219a52b62..2e598e0a95064 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -51,6 +51,26 @@ repos: hooks: - id: cython-lint args: [--no-pycodestyle] + - repo: https://github.com/pre-commit/mirrors-clang-format + rev: v14.0.6 + hooks: + - id: clang-format + name: C++ Format + types_or: + - c++ + # - json + # - proto + files: >- + ^cpp/ + exclude: >- + ( + ?\.grpc\.fb\.(cc|h)$| + ?\.pb\.(cc|h)$| + ?_generated.*\.(cc|h)$| + ?^cpp/src/arrow/vendored/| + ?^cpp/src/generated/| + ?^cpp/thirdparty/| + ) - repo: https://github.com/pre-commit/mirrors-clang-format rev: v14.0.6 hooks: @@ -65,6 +85,30 @@ repos: name: MATLAB (C++) Format files: >- ^matlab/src/cpp/ + - repo: https://github.com/pre-commit/mirrors-clang-format + rev: v14.0.6 + hooks: + - id: clang-format + name: Python (C++) Format + files: >- + ^python/pyarrow/src/ + exclude: >- + ( + ?\.grpc\.fb\.(cc|h)$| + ?.pb\.(cc|h)$| + ?^cpp/src/generated/| + ) + - repo: https://github.com/pre-commit/mirrors-clang-format + rev: v14.0.6 + hooks: + - id: clang-format + name: R (C++) Format + files: >- + ^r/src/ + exclude: >- + ( + ?^r/src/arrowExports\.cpp$| + ) - repo: https://github.com/cheshirekow/cmake-format-precommit rev: v0.6.13 hooks: diff --git a/cpp/src/arrow/util/windows_compatibility.h b/cpp/src/arrow/util/windows_compatibility.h index ea0d0167569e8..c97b2f3b76a7c 100644 --- a/cpp/src/arrow/util/windows_compatibility.h +++ b/cpp/src/arrow/util/windows_compatibility.h @@ -33,7 +33,6 @@ #endif #include -#include #include "arrow/util/windows_fixup.h" From b270dcdcdf7390a0486600374a900fa2b1b8d430 Mon Sep 17 00:00:00 2001 From: mwish Date: Thu, 28 Mar 2024 08:54:52 +0800 Subject: [PATCH 07/81] GH-40814: [C++] Thirdparty: bump zstd to 1.5.6 (#40837) ### Rationale for this change Zstd releases 1.5.6 here: https://github.com/facebook/zstd/releases/tag/v1.5.6 ### What changes are included in this PR? Change default zstd to 1.5.6 ### Are these changes tested? Already has test ### Are there any user-facing changes? 
no * GitHub Issue: #40814 Authored-by: mwish Signed-off-by: Sutou Kouhei --- cpp/thirdparty/versions.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/thirdparty/versions.txt b/cpp/thirdparty/versions.txt index 18bb6c9b6e09c..760b19f71e2e0 100644 --- a/cpp/thirdparty/versions.txt +++ b/cpp/thirdparty/versions.txt @@ -117,8 +117,8 @@ ARROW_XSIMD_BUILD_VERSION=9.0.1 ARROW_XSIMD_BUILD_SHA256_CHECKSUM=b1bb5f92167fd3a4f25749db0be7e61ed37e0a5d943490f3accdcd2cd2918cc0 ARROW_ZLIB_BUILD_VERSION=1.3.1 ARROW_ZLIB_BUILD_SHA256_CHECKSUM=9a93b2b7dfdac77ceba5a558a580e74667dd6fede4585b91eefb60f03b72df23 -ARROW_ZSTD_BUILD_VERSION=1.5.5 -ARROW_ZSTD_BUILD_SHA256_CHECKSUM=9c4396cc829cfae319a6e2615202e82aad41372073482fce286fac78646d3ee4 +ARROW_ZSTD_BUILD_VERSION=1.5.6 +ARROW_ZSTD_BUILD_SHA256_CHECKSUM=8c29e06cf42aacc1eafc4077ae2ec6c6fcb96a626157e0593d5e82a34fd403c1 # The first field is the name of the environment variable expected by cmake. From 3d5e9aaedecadee9daa86232ec58de422caecdb6 Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Wed, 27 Mar 2024 19:39:44 -0800 Subject: [PATCH 08/81] MINOR: [Docs] Fix broken link in acero/options.h docstring (#40811) ### Rationale for this change A "See also" link at https://arrow.apache.org/docs/cpp/api/acero.html#_CPPv4N5arrow5acero22TableSourceNodeOptionsE isn't automatically linked, probably because SourceNode itself isn't documented. ### What changes are included in this PR? I updated the string to be "SourceNodeOptions" so it links there, which I'm pretty sure is what was intended because TableSourceNode inherits from SourceNode and the docs for SourceNodeOptions documents the behavior of SourceNode. ### Are these changes tested? Yes, locally. ### Are there any user-facing changes? Just docs. Authored-by: Bryce Mecum Signed-off-by: Bryce Mecum --- cpp/src/arrow/acero/options.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/acero/options.h b/cpp/src/arrow/acero/options.h index 1ede3fbfc8ed0..4447e9c67a199 100644 --- a/cpp/src/arrow/acero/options.h +++ b/cpp/src/arrow/acero/options.h @@ -105,8 +105,8 @@ class ARROW_ACERO_EXPORT SourceNodeOptions : public ExecNodeOptions { /// \brief a node that generates data from a table already loaded in memory /// /// The table source node will slice off chunks, defined by `max_batch_size` -/// for parallel processing. The source node extends source node and so these -/// chunks will be iteratively processed in small batches. \see SourceNode +/// for parallel processing. The table source node extends source node and so these +/// chunks will be iteratively processed in small batches. \see SourceNodeOptions /// for details. class ARROW_ACERO_EXPORT TableSourceNodeOptions : public ExecNodeOptions { public: From 7da8dfe480a6afb3113a972a08adedf88dbf4d1c Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 28 Mar 2024 13:26:16 +0900 Subject: [PATCH 09/81] GH-40674: [GLib] Don't assume gint64 and int64_t use the same type (#40736) ### Rationale for this change GLib doesn't guarantee that `gint64` and `int64_t` use the same type: https://docs.gtk.org/glib/types.html#gint64 > Note that on platforms with more than one 64-bit standard integer > type, gint64 and int64_t are not necessarily implemented by the same > 64-bit integer type. For example, on a platform where both long and > long long are 64-bit, it might be the case that one of those types is > used for gint64 and the other is used for int64_t. ### What changes are included in this PR? Add explicit casts. 
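
For illustration (a minimal sketch, not part of the diff; `consume_values` and `pass_values` are hypothetical names), this is the kind of pointer incompatibility the added casts address on platforms where the two 64-bit types differ:

```cpp
// Minimal sketch, not part of this patch: assumes a platform where gint64 is
// defined as `long` while int64_t is `long long` -- both 64-bit, but distinct
// types, so pointers to them do not convert implicitly.
#include <cstdint>
#include <glib.h>

// Hypothetical callee taking the C standard type.
void consume_values(const int64_t *values, int64_t n_values);

// Caller working with the GLib type.
void pass_values(const gint64 *values, gint64 n_values) {
  // consume_values(values, n_values);  // may fail to compile: incompatible pointer types
  consume_values(reinterpret_cast<const int64_t *>(values),  // explicit cast, as in this patch
                 n_values);                                  // scalar width converts implicitly
}
```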
### Are these changes tested? Yes. ### Are there any user-facing changes? No. * GitHub Issue: #40674 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- c_glib/arrow-glib/array-builder.cpp | 6 ++++-- c_glib/arrow-glib/composite-array.cpp | 7 ++++--- c_glib/gandiva-glib/node.cpp | 6 ++++-- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/c_glib/arrow-glib/array-builder.cpp b/c_glib/arrow-glib/array-builder.cpp index 6d8ce4a35ac0a..b498ecb51cedb 100644 --- a/c_glib/arrow-glib/array-builder.cpp +++ b/c_glib/arrow-glib/array-builder.cpp @@ -4995,7 +4995,8 @@ garrow_binary_dictionary_array_builder_append_indices( auto append_function = [&arrow_builder](const gint64 *values, gint64 values_length, const uint8_t *valid_bytes) -> arrow::Status { - return arrow_builder->AppendIndices(values, values_length, valid_bytes); + auto int64_t_values = reinterpret_cast(values); + return arrow_builder->AppendIndices(int64_t_values, values_length, valid_bytes); }; return garrow_array_builder_append_values(values, values_length, @@ -5226,7 +5227,8 @@ garrow_string_dictionary_array_builder_append_indices( auto append_function = [&arrow_builder](const gint64 *values, gint64 values_length, const uint8_t *valid_bytes) -> arrow::Status { - return arrow_builder->AppendIndices(values, values_length, valid_bytes); + auto int64_t_values = reinterpret_cast(values); + return arrow_builder->AppendIndices(int64_t_values, values_length, valid_bytes); }; return garrow_array_builder_append_values(values, values_length, diff --git a/c_glib/arrow-glib/composite-array.cpp b/c_glib/arrow-glib/composite-array.cpp index cc254b26e1e4c..d49b393605453 100644 --- a/c_glib/arrow-glib/composite-array.cpp +++ b/c_glib/arrow-glib/composite-array.cpp @@ -591,9 +591,10 @@ garrow_large_list_array_get_value_length(GArrowLargeListArray *array, gint64 i) const gint64 * garrow_large_list_array_get_value_offsets(GArrowLargeListArray *array, gint64 *n_offsets) { - return garrow_base_list_array_get_value_offsets( - GARROW_ARRAY(array), - n_offsets); + auto value_offsets = + garrow_base_list_array_get_value_offsets(GARROW_ARRAY(array), + n_offsets); + return reinterpret_cast(value_offsets); } typedef struct GArrowStructArrayPrivate_ diff --git a/c_glib/gandiva-glib/node.cpp b/c_glib/gandiva-glib/node.cpp index e83dc41e9274b..fe75b0db03fe3 100644 --- a/c_glib/gandiva-glib/node.cpp +++ b/c_glib/gandiva-glib/node.cpp @@ -873,7 +873,8 @@ ggandiva_int64_literal_node_class_init(GGandivaInt64LiteralNodeClass *klass) GGandivaInt64LiteralNode * ggandiva_int64_literal_node_new(gint64 value) { - auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value); + auto int64_t_value = static_cast(value); + auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(int64_t_value); return GGANDIVA_INT64_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node, NULL)); } @@ -916,7 +917,8 @@ ggandiva_uint64_literal_node_class_init(GGandivaUInt64LiteralNodeClass *klass) GGandivaUInt64LiteralNode * ggandiva_uint64_literal_node_new(guint64 value) { - auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value); + auto uint64_t_value = static_cast(value); + auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(uint64_t_value); return GGANDIVA_UINT64_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node, NULL)); } From 6cecbab5172b2b339277dde741bfff455646eb32 Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Wed, 27 Mar 2024 21:13:39 -0800 Subject: [PATCH 10/81] GH-40806: [C++] Correctly report asimd/neon in GetRuntimeInfo (#40857) ### What 
changes are included in this PR? New case to conditional in `MakeSimdLevelString` which makes `GetRuntimeInfo` report correctly on respective CPUs. I chose to have it report "neon". Lowercase to match other strings and "neon" instead of "asimd" because I think that makes more sense to users. I'm not 100% sure which is more correct. Fixes #40806 ### Are these changes tested? We don't have automated tests for this. I did install the R package and, on my M1 laptop it reports 'neon' now instead of 'none' before: ```r > arrow_info() ... SIMD Level neon Detected SIMD Level neon ``` ### Are there any user-facing changes? No. * GitHub Issue: #40806 --- cpp/src/arrow/config.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/arrow/config.cc b/cpp/src/arrow/config.cc index 9e32e5437325f..1f852e84d3d5c 100644 --- a/cpp/src/arrow/config.cc +++ b/cpp/src/arrow/config.cc @@ -58,6 +58,8 @@ std::string MakeSimdLevelString(QueryFlagFunction&& query_flag) { return "avx"; } else if (query_flag(CpuInfo::SSE4_2)) { return "sse4_2"; + } else if (query_flag(CpuInfo::ASIMD)) { + return "neon"; } else { return "none"; } From a9b2cc2c962f064c3fa5504909f122e9bcabda3f Mon Sep 17 00:00:00 2001 From: Laurent Goujon Date: Thu, 28 Mar 2024 06:06:21 -0700 Subject: [PATCH 11/81] GH-40843: [Java] Cleanup protobuf-maven-plugin usage (#40844) ### Rationale for this change `protobuf-maven-plugin` usage in Arrow codebase does not follow plugins best practices like sharing the same output directory for different execution or not using test goals for generating test classes ### What changes are included in this PR? * Add protobuf-maven-plugin plugin to top level pom.xml under pluginManagement to define version and common configuration for all modules * Remove unnecessary executions of test-compile goal when no test protobufs are present * Remove use of outputDirectory and clearOutputDirectory and let the plugin choose it for each execution (the default output directory is based on the phase (main vs test) and the language/plugin-id) * Replace use of compile/compile-custom goals with test-compile/test-compile-custom when generating test protobufs ### Are these changes tested? As those changes are in the build system, they are covered by the build framework and tests run as part of the build ### Are there any user-facing changes? 
None * GitHub Issue: #40843 Authored-by: Laurent Goujon Signed-off-by: David Li --- java/dataset/pom.xml | 11 ++++------ java/flight/flight-core/pom.xml | 16 ++------------- .../src/test/{protobuf => proto}/perf.proto | 0 .../src/test/{protobuf => proto}/test.proto | 0 java/flight/pom.xml | 20 ------------------- java/gandiva/pom.xml | 11 ++++------ java/pom.xml | 10 ++++++++++ 7 files changed, 20 insertions(+), 48 deletions(-) rename java/flight/flight-core/src/test/{protobuf => proto}/perf.proto (100%) rename java/flight/flight-core/src/test/{protobuf => proto}/test.proto (100%) diff --git a/java/dataset/pom.xml b/java/dataset/pom.xml index a003fd18068ec..43b913167390f 100644 --- a/java/dataset/pom.xml +++ b/java/dataset/pom.xml @@ -177,18 +177,15 @@ org.xolstice.maven.plugins protobuf-maven-plugin - 0.6.1 - - com.google.protobuf:protoc:${protobuf.version}:exe:${os.detected.classifier} - - ../../cpp/src/jni/dataset/proto - + src compile - test-compile + + ../../cpp/src/jni/dataset/proto + diff --git a/java/flight/flight-core/pom.xml b/java/flight/flight-core/pom.xml index 98491e7ba091e..830caf8a28246 100644 --- a/java/flight/flight-core/pom.xml +++ b/java/flight/flight-core/pom.xml @@ -228,19 +228,11 @@ org.xolstice.maven.plugins protobuf-maven-plugin - 0.6.1 - - com.google.protobuf:protoc:${dep.protobuf-bom.version}:exe:${os.detected.classifier} - false - grpc-java - io.grpc:protoc-gen-grpc-java:${dep.grpc-bom.version}:exe:${os.detected.classifier} - src ${basedir}/../../../format/ - ${project.build.directory}/generated-sources/protobuf compile @@ -249,13 +241,9 @@ test - - ${basedir}/src/test/protobuf - ${project.build.directory}/generated-test-sources//protobuf - - compile - compile-custom + test-compile + test-compile-custom diff --git a/java/flight/flight-core/src/test/protobuf/perf.proto b/java/flight/flight-core/src/test/proto/perf.proto similarity index 100% rename from java/flight/flight-core/src/test/protobuf/perf.proto rename to java/flight/flight-core/src/test/proto/perf.proto diff --git a/java/flight/flight-core/src/test/protobuf/test.proto b/java/flight/flight-core/src/test/proto/test.proto similarity index 100% rename from java/flight/flight-core/src/test/protobuf/test.proto rename to java/flight/flight-core/src/test/proto/test.proto diff --git a/java/flight/pom.xml b/java/flight/pom.xml index 2f777ab42b756..5b9caafa82ef9 100644 --- a/java/flight/pom.xml +++ b/java/flight/pom.xml @@ -32,26 +32,6 @@ flight-integration-tests - - - - - org.xolstice.maven.plugins - protobuf-maven-plugin - 0.6.1 - - - com.google.protobuf:protoc:${dep.protobuf-bom.version}:exe:${os.detected.classifier} - - grpc-java - io.grpc:protoc-gen-grpc-java:${dep.grpc-bom.version}:exe:${os.detected.classifier} - - - - - - - pin-mockito-jdk8 diff --git a/java/gandiva/pom.xml b/java/gandiva/pom.xml index 819baee11edec..0d2a23345f6ea 100644 --- a/java/gandiva/pom.xml +++ b/java/gandiva/pom.xml @@ -132,18 +132,15 @@ org.xolstice.maven.plugins protobuf-maven-plugin - 0.6.1 - - com.google.protobuf:protoc:${protobuf.version}:exe:${os.detected.classifier} - - proto - + src compile - test-compile + + proto + diff --git a/java/pom.xml b/java/pom.xml index add2823ccb0d2..659ccfca08c76 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -637,6 +637,16 @@ + + org.xolstice.maven.plugins + protobuf-maven-plugin + 0.6.1 + + com.google.protobuf:protoc:${dep.protobuf-bom.version}:exe:${os.detected.classifier} + grpc-java + io.grpc:protoc-gen-grpc-java:${dep.grpc-bom.version}:exe:${os.detected.classifier} + + From 
edf7e57127766e0e2aa7d14db12d3d3f5f12ecbe Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 28 Mar 2024 12:21:14 -0300 Subject: [PATCH 12/81] MINOR: [C++][Azure][FS] Document some limitations and atomicity guarantees (#40838) ### Rationale for this change Documenting some details of the behavior of destructive filesystem operations. ### What changes are included in this PR? Only docstring changes. ### Are these changes tested? N/A. Authored-by: Felipe Oliveira Carvalho Signed-off-by: Felipe Oliveira Carvalho --- cpp/src/arrow/filesystem/azurefs.h | 42 +++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/filesystem/azurefs.h b/cpp/src/arrow/filesystem/azurefs.h index 308347426ae26..350014954f056 100644 --- a/cpp/src/arrow/filesystem/azurefs.h +++ b/cpp/src/arrow/filesystem/azurefs.h @@ -264,15 +264,35 @@ class ARROW_EXPORT AzureFileSystem : public FileSystem { Status CreateDir(const std::string& path, bool recursive) override; + /// \brief Delete a directory and its contents recursively. + /// + /// Atomicity is guaranteed only on Hierarchical Namespace Storage accounts. Status DeleteDir(const std::string& path) override; + /// \brief Non-atomically deletes the contents of a directory. + /// + /// This function can return a bad Status after only partially deleting the + /// contents of the directory. Status DeleteDirContents(const std::string& path, bool missing_dir_ok) override; + /// \brief Deletion of all the containers in the storage account (not + /// implemented for safety reasons). + /// + /// \return Status::NotImplemented Status DeleteRootDirContents() override; + /// \brief Deletes a file. + /// + /// Supported on both flat namespace and Hierarchical Namespace storage + /// accounts. A check is made to guarantee the parent directory doesn't + /// disappear after the blob is deleted and while this operation is running, + /// no other client can delete the parent directory due to the use of leases. + /// + /// This means applications can safely retry this operation without coordination to + /// guarantee only one client/process is trying to delete the same file. Status DeleteFile(const std::string& path) override; - /// \brief Move / rename a file or directory. + /// \brief Move/rename a file or directory. /// /// There are no files immediately at the root directory, so paths like /// "/segment" always refer to a container of the storage account and are @@ -282,6 +302,7 @@ class ARROW_EXPORT AzureFileSystem : public FileSystem { /// guarantees `dest` is not lost. /// /// Conditions for a successful move: + /// /// 1. `src` must exist. /// 2. `dest` can't contain a strict path prefix of `src`. More generally, /// a directory can't be made a subdirectory of itself. @@ -291,6 +312,25 @@ class ARROW_EXPORT AzureFileSystem : public FileSystem { /// 5. If `dest` already exists and it's a directory, `src` must also be a /// directory and `dest` must be empty. `dest` is then replaced by `src` /// and its contents. + /// + /// Leases are used to guarantee the pre-condition checks and the rename + /// operation are atomic: other clients can't invalidate the pre-condition in + /// the time between the checks and the actual rename operation. + /// + /// This is possible because Move() is only support on storage accounts with + /// Hierarchical Namespace Support enabled. 
+ /// + /// ## Limitations + /// + /// - Moves are not supported on storage accounts without + /// Hierarchical Namespace support enabled + /// - Moves across different containers are not supported + /// - Moving a path of the form `/container` is not supported as it would + /// require moving all the files in a container to another container. + /// The only exception is a `Move("/container_a", "/container_b")` where + /// both containers are empty or `container_b` doesn't even exist. + /// The atomicity of the emptiness checks followed by the renaming operation + /// is guaranteed by the use of leases. Status Move(const std::string& src, const std::string& dest) override; Status CopyFile(const std::string& src, const std::string& dest) override; From cf832b8b5dd91ca1b70519fa544f0a44ebdb3bce Mon Sep 17 00:00:00 2001 From: Rossi Sun Date: Thu, 28 Mar 2024 23:23:59 +0800 Subject: [PATCH 13/81] GH-40863: [C++] Fix TSAN link error for module library (#40864) ### Rationale for this change Module library `arrow_filesystem_example` is introduced in #39067 for filesystem testing: https://github.com/apache/arrow/blob/6cecbab5172b2b339277dde741bfff455646eb32/cpp/src/arrow/testing/CMakeLists.txt#L25 However when built with TSAN, linker flags such as `-fsanitize=thread` is not set, causing the link error in #40863. ### What changes are included in this PR? Add necessary linker flags for module library. ### Are these changes tested? Manually tested. ### Are there any user-facing changes? None. * GitHub Issue: #40863 Authored-by: Ruoxi Sun Signed-off-by: Antoine Pitrou --- cpp/cmake_modules/san-config.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/cmake_modules/san-config.cmake b/cpp/cmake_modules/san-config.cmake index 2221dc16665ac..8c2983e18b40a 100644 --- a/cpp/cmake_modules/san-config.cmake +++ b/cpp/cmake_modules/san-config.cmake @@ -78,6 +78,7 @@ if(${ARROW_USE_TSAN}) # Some of the above also need to be passed to the linker. set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -pie -fsanitize=thread") + set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} -pie -fsanitize=thread") # Strictly speaking, TSAN doesn't require dynamic linking. But it does # require all code to be position independent, and the easiest way to From 4f39e6eac9f24b37b0866c432c030de2eaef78e1 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Fri, 29 Mar 2024 01:17:33 +0800 Subject: [PATCH 14/81] GH-40507: [C++][ORC] Upgrade ORC to 2.0.0 (#40508) ### Rationale for this change This PR aims to upgrade to a new major version of Apache ORC: https://orc.apache.org/news/2024/03/08/ORC-2.0.0/ ### What changes are included in this PR? This PR upgrades ORC dependency from 1.9.2 to 2.0.0. ### Are these changes tested? Pass the CIs. ### Are there any user-facing changes? No. 
* GitHub Issue: #40507 Lead-authored-by: Antoine Pitrou Co-authored-by: Gang Wu Signed-off-by: Antoine Pitrou --- ci/scripts/python_wheel_macos_build.sh | 9 ++++++++- ci/scripts/python_wheel_manylinux_build.sh | 1 - cpp/thirdparty/versions.txt | 4 ++-- dev/tasks/python-wheels/github.osx.yml | 4 ++++ 4 files changed, 14 insertions(+), 4 deletions(-) diff --git a/ci/scripts/python_wheel_macos_build.sh b/ci/scripts/python_wheel_macos_build.sh index bea5409100770..a94dac40e931f 100755 --- a/ci/scripts/python_wheel_macos_build.sh +++ b/ci/scripts/python_wheel_macos_build.sh @@ -91,6 +91,13 @@ echo "=== (${PYTHON_VERSION}) Building Arrow C++ libraries ===" : ${VCPKG_FEATURE_FLAGS:=-manifests} : ${VCPKG_TARGET_TRIPLET:=${VCPKG_DEFAULT_TRIPLET:-x64-osx-static-${CMAKE_BUILD_TYPE}}} +echo "=== Protobuf compiler versions on PATH ===" +which -a protoc || echo "no protoc on PATH!" + +echo "=== Protobuf compiler version from vcpkg ===" +_pbc=${VCPKG_ROOT}/installed/${VCPKG_TARGET_TRIPLET}/tools/protobuf/protoc +echo "$_pbc: `$_pbc --version`" + mkdir -p ${build_dir}/build pushd ${build_dir}/build @@ -122,6 +129,7 @@ cmake \ -DARROW_SUBSTRAIT=${ARROW_SUBSTRAIT} \ -DARROW_TENSORFLOW=${ARROW_TENSORFLOW} \ -DARROW_USE_CCACHE=ON \ + -DARROW_VERBOSE_THIRDPARTY_BUILD=ON \ -DARROW_WITH_BROTLI=${ARROW_WITH_BROTLI} \ -DARROW_WITH_BZ2=${ARROW_WITH_BZ2} \ -DARROW_WITH_LZ4=${ARROW_WITH_LZ4} \ @@ -134,7 +142,6 @@ cmake \ -DCMAKE_INSTALL_PREFIX=${build_dir}/install \ -DCMAKE_OSX_ARCHITECTURES=${CMAKE_OSX_ARCHITECTURES} \ -DCMAKE_UNITY_BUILD=${CMAKE_UNITY_BUILD} \ - -DORC_PROTOBUF_EXECUTABLE=${VCPKG_ROOT}/installed/${VCPKG_TARGET_TRIPLET}/tools/protobuf/protoc \ -DORC_SOURCE=BUNDLED \ -DPARQUET_REQUIRE_ENCRYPTION=${PARQUET_REQUIRE_ENCRYPTION} \ -DVCPKG_MANIFEST_MODE=OFF \ diff --git a/ci/scripts/python_wheel_manylinux_build.sh b/ci/scripts/python_wheel_manylinux_build.sh index 4d4d4fb694e0b..6e29ef58d2318 100755 --- a/ci/scripts/python_wheel_manylinux_build.sh +++ b/ci/scripts/python_wheel_manylinux_build.sh @@ -123,7 +123,6 @@ cmake \ -DCMAKE_INSTALL_LIBDIR=lib \ -DCMAKE_INSTALL_PREFIX=/tmp/arrow-dist \ -DCMAKE_UNITY_BUILD=${CMAKE_UNITY_BUILD} \ - -DORC_PROTOBUF_EXECUTABLE=${VCPKG_ROOT}/installed/${VCPKG_TARGET_TRIPLET}/tools/protobuf/protoc \ -DORC_SOURCE=BUNDLED \ -DPARQUET_REQUIRE_ENCRYPTION=${PARQUET_REQUIRE_ENCRYPTION} \ -DVCPKG_MANIFEST_MODE=OFF \ diff --git a/cpp/thirdparty/versions.txt b/cpp/thirdparty/versions.txt index 760b19f71e2e0..4093b0ec43efd 100644 --- a/cpp/thirdparty/versions.txt +++ b/cpp/thirdparty/versions.txt @@ -90,8 +90,8 @@ ARROW_OPENTELEMETRY_BUILD_VERSION=v1.8.1 ARROW_OPENTELEMETRY_BUILD_SHA256_CHECKSUM=3d640201594b07f08dade9cd1017bd0b59674daca26223b560b9bb6bf56264c2 ARROW_OPENTELEMETRY_PROTO_BUILD_VERSION=v0.17.0 ARROW_OPENTELEMETRY_PROTO_BUILD_SHA256_CHECKSUM=f269fbcb30e17b03caa1decd231ce826e59d7651c0f71c3b28eb5140b4bb5412 -ARROW_ORC_BUILD_VERSION=1.9.2 -ARROW_ORC_BUILD_SHA256_CHECKSUM=7f46f2c184ecefd6791f1a53fb062286818bd8710c3f08b94dd3cac365e240ee +ARROW_ORC_BUILD_VERSION=2.0.0 +ARROW_ORC_BUILD_SHA256_CHECKSUM=9107730919c29eb39efaff1b9e36166634d1d4d9477e5fee76bfd6a8fec317df ARROW_PROTOBUF_BUILD_VERSION=v21.3 ARROW_PROTOBUF_BUILD_SHA256_CHECKSUM=2f723218f6cb709ae4cdc4fb5ed56a5951fc5d466f0128ce4c946b8c78c8c49f # Because of https://github.com/Tencent/rapidjson/pull/1323, we require diff --git a/dev/tasks/python-wheels/github.osx.yml b/dev/tasks/python-wheels/github.osx.yml index 11bdf031f51bd..e7b6d7898103b 100644 --- a/dev/tasks/python-wheels/github.osx.yml +++ 
b/dev/tasks/python-wheels/github.osx.yml @@ -47,6 +47,10 @@ jobs: brew install bash bison coreutils ninja echo "$(brew --prefix bison)/bin" >> $GITHUB_PATH + - name: Homebrew packages + run: | + brew list + - name: Retrieve VCPKG version from arrow/.env run: | vcpkg_version=$(cat "arrow/.env" | grep "VCPKG" | cut -d "=" -f2 | tr -d '"') From 683a78bb8a7a3ff2e252a70ef00d796a758b4527 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 28 Mar 2024 16:03:49 -0300 Subject: [PATCH 15/81] GH-40870: [C#] Update CompareValidityBuffer() to pass when unspecified final bits are not identical (#40873) ### Rationale for this change Before fixing nanoarrow's testing JSON reader to align with other implementations and properly zero out the last few bits, integration tests failed because C#'s `CompareValidityBuffer()` was comparing the bytes of the validity buffer (including undefined final bits that are maybe not identical due to uninitialized memory or because the arrays are slices). ### What changes are included in this PR? `CompareValidityBuffer()` now compares the memory for all except the last byte and compares the last byte bitwise. ### Are these changes tested? They should be but I am not sure exactly where to add the test! ### Are there any user-facing changes? No * GitHub Issue: #40870 Authored-by: Dewey Dunnington Signed-off-by: Dewey Dunnington --- .../Apache.Arrow.Tests/ArrowReaderVerifier.cs | 21 ++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs b/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs index 2e7488092c2cf..ceeab92860e6f 100644 --- a/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs +++ b/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs @@ -432,12 +432,27 @@ private void CompareValidityBuffer(int nullCount, int arrayLength, ArrowBuffer e { Assert.True(expectedValidityBuffer.Span.SequenceEqual(actualValidityBuffer.Span)); } - else if (nullCount != 0) + else if (nullCount != 0 && arrayLength > 0) { int validityBitmapByteCount = BitUtility.ByteCount(arrayLength); + ReadOnlySpan expectedSpanPartial = expectedValidityBuffer.Span.Slice(0, validityBitmapByteCount - 1); + ReadOnlySpan actualSpanPartial = actualValidityBuffer.Span.Slice(0, validityBitmapByteCount - 1); + + // Compare the first validityBitmapByteCount - 1 bytes Assert.True( - expectedValidityBuffer.Span.Slice(0, validityBitmapByteCount).SequenceEqual(actualValidityBuffer.Span.Slice(0, validityBitmapByteCount)), - "Validity buffers do not match."); + expectedSpanPartial.SequenceEqual(actualSpanPartial), + string.Format("First {0} bytes of validity buffer do not match", validityBitmapByteCount - 1)); + + // Compare the last byte bitwise (because there is no guarantee about the value of + // bits outside the range [0, arrayLength]) + ReadOnlySpan expectedSpanFull = expectedValidityBuffer.Span.Slice(0, validityBitmapByteCount); + ReadOnlySpan actualSpanFull = actualValidityBuffer.Span.Slice(0, validityBitmapByteCount); + for (int i = 8 * (validityBitmapByteCount - 1); i < arrayLength; i++) + { + Assert.True( + BitUtility.GetBit(expectedSpanFull, i) == BitUtility.GetBit(actualSpanFull, i), + string.Format("Bit at index {0}/{1} is not equal", i, arrayLength)); + } } } } From 1feb945c1dc61afeaa6bfd412d0c7eaa71a1c139 Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Thu, 28 Mar 2024 11:26:10 -0800 Subject: [PATCH 16/81] GH-40858: [R] Remove dangling commas from codegen.R (#40859) ### Rationale for this change This is a draft PR 
fixing https://github.com/apache/arrow/issues/40858, though I'm not sure how or why this broke (or worked correctly). Fixes #40858 ### Are these changes tested? These have been tested locally. * GitHub Issue: #40858 Authored-by: Bryce Mecum Signed-off-by: Bryce Mecum --- r/data-raw/codegen.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/r/data-raw/codegen.R b/r/data-raw/codegen.R index e8d53467d4589..4f027a3d9ddc7 100644 --- a/r/data-raw/codegen.R +++ b/r/data-raw/codegen.R @@ -145,7 +145,7 @@ cpp_functions_definitions <- arrow_exports %>% // {basename(file)} {ifdef_wrap(cpp11_wrapped, name, sexp_signature, decoration)} ", - sep = "\n", + sep = "\n" ) }) %>% glue_collapse(sep = "\n") @@ -176,7 +176,7 @@ arrow_exports_cpp <- paste0( static const R_CallMethodDef CallEntries[] = { ", glue::glue_collapse(glue::glue( - '\t\t{{ "_{features}_available", (DL_FUNC)& _{features}_available, 0 }},', + '\t\t{{ "_{features}_available", (DL_FUNC)& _{features}_available, 0 }},' ), sep = "\n"), glue::glue("\n {cpp_functions_registration} @@ -217,7 +217,7 @@ r_functions <- arrow_exports %>% ", list_params = glue_collapse_data(args, "{name}"), - sep = "\n", + sep = "\n" ) }) %>% glue_collapse(sep = "\n") From 950fbb62ce7388aad926c5af5861bf07f7db6de1 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Thu, 28 Mar 2024 15:59:14 -0400 Subject: [PATCH 17/81] GH-40733: [Go] Require Go 1.21 or later (#40848) ### Rationale for this change Bumping to require Go 1.21 or later as 1.20 is EOL * GitHub Issue: #40733 Authored-by: Matt Topol Signed-off-by: Matt Topol --- .env | 4 +- .github/workflows/go.yml | 28 +++++---- ci/docker/conda-integration.dockerfile | 2 +- ci/docker/debian-12-go.dockerfile | 4 +- dev/release/verify-release-candidate.sh | 6 +- dev/tasks/tasks.yml | 2 +- go/arrow/bitutil/bitutil.go | 35 +----------- .../bitutil/bitutil_bytes.go} | 26 ++++----- go/arrow/cdata/cdata_allocate.go | 57 +++++++++++++++++++ go/arrow/cdata/cdata_exports.go | 55 ------------------ go/arrow/compute/exec/span.go | 17 ------ go/arrow/compute/exec/span_offsets.go | 36 ++++++++++++ go/arrow/compute/fieldref.go | 17 ------ go/arrow/compute/fieldref_hash.go | 39 +++++++++++++ go/arrow/doc.go | 2 - .../flight/flightsql/driver/driver_test.go | 1 + go/arrow/memory/mallocator/mallocator.go | 11 ++-- go/arrow/memory/mallocator/mallocator_util.go | 26 +++++++++ go/go.mod | 2 +- go/internal/hashing/hash_string.go | 4 ++ go/internal/hashing/xxh3_memo_table.go | 9 +-- go/parquet/types.go | 44 +++----------- 22 files changed, 219 insertions(+), 208 deletions(-) rename go/{internal/hashing/hash_string_go1.19.go => arrow/bitutil/bitutil_bytes.go} (58%) create mode 100644 go/arrow/cdata/cdata_allocate.go create mode 100644 go/arrow/compute/exec/span_offsets.go create mode 100644 go/arrow/compute/fieldref_hash.go create mode 100644 go/arrow/memory/mallocator/mallocator_util.go diff --git a/.env b/.env index b5c66563f5f7d..298c100c094b0 100644 --- a/.env +++ b/.env @@ -58,8 +58,8 @@ CUDA=11.2.2 DASK=latest DOTNET=7.0 GCC_VERSION="" -GO=1.19.13 -STATICCHECK=v0.4.5 +GO=1.21.8 +STATICCHECK=v0.4.7 HDFS=3.2.1 JDK=8 KARTOTHEK=latest diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index 47148d9568c18..7ff781d35e8ec 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -59,13 +59,13 @@ jobs: { "arch-label": "AMD64", "arch": "amd64", - "go": "1.19", + "go": "1.21", "runs-on": "ubuntu-latest" }, { "arch-label": "AMD64", "arch": "amd64", - "go": "1.20", + "go": "1.22", "runs-on": "ubuntu-latest" } 
JSON @@ -75,13 +75,13 @@ jobs: { "arch-label": "ARM64", "arch": "arm64v8", - "go": "1.19", + "go": "1.21", "runs-on": ["self-hosted", "arm", "linux"] }, { "arch-label": "ARM64", "arch": "arm64v8", - "go": "1.20", + "go": "1.22", "runs-on": ["self-hosted", "arm", "linux"] } JSON @@ -169,10 +169,13 @@ jobs: uses: actions/checkout@v4 with: fetch-depth: 0 + - name: Get required Go version + run: | + (. .env && echo "GO_VERSION=${GO}") >> $GITHUB_ENV - name: Install Go uses: actions/setup-go@v5 with: - go-version: 1.19 + go-version: "${{ env.GO_VERSION }}" cache: true cache-dependency-path: go/go.sum - name: Run build @@ -188,7 +191,7 @@ jobs: strategy: fail-fast: false matrix: - go: [1.19, '1.20'] + go: ['1.21', '1.22'] env: GO: ${{ matrix.go }} steps: @@ -229,7 +232,7 @@ jobs: strategy: fail-fast: false matrix: - go: [1.19, '1.20'] + go: ['1.21', '1.22'] env: GO: ${{ matrix.go }} steps: @@ -268,7 +271,7 @@ jobs: strategy: fail-fast: false matrix: - go: [1.19, '1.20'] + go: ['1.21', '1.22'] steps: - name: Checkout Arrow uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 @@ -301,7 +304,7 @@ jobs: strategy: fail-fast: false matrix: - go: [1.19, '1.20'] + go: ['1.21', '1.22'] steps: - name: Checkout Arrow uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 @@ -359,7 +362,7 @@ jobs: strategy: fail-fast: false matrix: - go: [1.19, '1.20'] + go: ['1.21', '1.22'] env: ARROW_GO_TESTCGO: "1" steps: @@ -428,6 +431,9 @@ jobs: shell: msys2 {0} run: | ci/scripts/msys2_setup.sh cgo + - name: Get required Go version + run: | + (. .env && echo "GO_VERSION=${GO}") >> $GITHUB_ENV - name: Update CGO Env vars shell: msys2 {0} run: | @@ -437,7 +443,7 @@ jobs: - name: Install go uses: actions/setup-go@v5 with: - go-version: '1.19' + go-version: "${{ env.GO_VERSION }}" cache: true cache-dependency-path: go/go.sum - name: Install staticcheck diff --git a/ci/docker/conda-integration.dockerfile b/ci/docker/conda-integration.dockerfile index 8406a419c06ab..a747ccbc7262f 100644 --- a/ci/docker/conda-integration.dockerfile +++ b/ci/docker/conda-integration.dockerfile @@ -24,7 +24,7 @@ ARG maven=3.8.7 ARG node=16 ARG yarn=1.22 ARG jdk=8 -ARG go=1.19.13 +ARG go=1.21.8 # Install Archery and integration dependencies COPY ci/conda_env_archery.txt /arrow/ci/ diff --git a/ci/docker/debian-12-go.dockerfile b/ci/docker/debian-12-go.dockerfile index 7c077910a67a0..c958e6bdee211 100644 --- a/ci/docker/debian-12-go.dockerfile +++ b/ci/docker/debian-12-go.dockerfile @@ -16,8 +16,8 @@ # under the License. ARG arch=amd64 -ARG go=1.19 -ARG staticcheck=v0.4.5 +ARG go=1.21 +ARG staticcheck=v0.4.7 FROM ${arch}/golang:${go}-bookworm # FROM collects all the args, get back the staticcheck version arg diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index d74ce1f67066d..e7d78328ed16c 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -24,7 +24,7 @@ # - JDK >=8 # - gcc >= 4.8 # - Node.js >= 18 -# - Go >= 1.19 +# - Go >= 1.21 # - Docker # # If using a non-system Boost, set BOOST_ROOT and add Boost libraries to @@ -405,7 +405,7 @@ install_go() { return 0 fi - local version=1.19.13 + local version=1.21.8 show_info "Installing go version ${version}..." local arch="$(uname -m)" @@ -953,7 +953,7 @@ test_go() { show_header "Build and test Go libraries" maybe_setup_go - maybe_setup_conda compilers go=1.19 + maybe_setup_conda compilers go=1.21 pushd go go get -v ./... 
diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 15b687b2d2fad..2abfbc15174df 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -1415,7 +1415,7 @@ tasks: R_PRUNE_DEPS: TRUE image: fedora-r-clang-sanitizer - {% for go_version, staticcheck in [("1.19", "v0.4.5"), ("1.21", "latest")] %} + {% for go_version, staticcheck in [("1.21", "v0.4.7"), ("1.22", "latest")] %} test-debian-12-go-{{ go_version }}: ci: azure template: docker-tests/azure.linux.yml diff --git a/go/arrow/bitutil/bitutil.go b/go/arrow/bitutil/bitutil.go index 82747ee1417b8..6a8f75410363f 100644 --- a/go/arrow/bitutil/bitutil.go +++ b/go/arrow/bitutil/bitutil.go @@ -19,7 +19,6 @@ package bitutil import ( "math" "math/bits" - "reflect" "unsafe" "github.com/apache/arrow/go/v16/arrow/memory" @@ -99,8 +98,6 @@ func countSetBitsWithOffset(buf []byte, offset, n int) int { count := 0 beg := offset - end := offset + n - begU8 := roundUp(beg, uint64SizeBits) init := min(n, begU8-beg) @@ -110,27 +107,8 @@ func countSetBitsWithOffset(buf []byte, offset, n int) int { } } - nU64 := (n - init) / uint64SizeBits - begU64 := begU8 / uint64SizeBits - endU64 := begU64 + nU64 - bufU64 := bytesToUint64(buf) - if begU64 < len(bufU64) { - for _, v := range bufU64[begU64:endU64] { - count += bits.OnesCount64(v) - } - } - - // FIXME: use a fallback to bits.OnesCount8 - // before counting the tail bits. - - tail := beg + init + nU64*uint64SizeBits - for i := tail; i < end; i++ { - if BitIsSet(buf, i) { - count++ - } - } - - return count + begU64 := BytesForBits(int64(beg + init)) + return count + CountSetBits(buf[begU64:], 0, n-init) } func roundUp(v, f int) int { @@ -149,15 +127,6 @@ const ( uint64SizeBits = uint64SizeBytes * 8 ) -func bytesToUint64(b []byte) []uint64 { - if cap(b) < uint64SizeBytes { - return nil - } - - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - return unsafe.Slice((*uint64)(unsafe.Pointer(h.Data)), cap(b)/uint64SizeBytes)[:len(b)/uint64SizeBytes] -} - var ( // PrecedingBitmask is a convenience set of values as bitmasks for checking // prefix bits of a byte diff --git a/go/internal/hashing/hash_string_go1.19.go b/go/arrow/bitutil/bitutil_bytes.go similarity index 58% rename from go/internal/hashing/hash_string_go1.19.go rename to go/arrow/bitutil/bitutil_bytes.go index f38eb5c523dde..09dd5cbc67d39 100644 --- a/go/internal/hashing/hash_string_go1.19.go +++ b/go/arrow/bitutil/bitutil_bytes.go @@ -14,24 +14,24 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-//go:build !go1.20 && !tinygo +//go:build go1.20 || tinygo -package hashing +package bitutil import ( - "reflect" "unsafe" ) -func hashString(val string, alg uint64) uint64 { - if val == "" { - return Hash([]byte{}, alg) +func bytesToUint64(b []byte) []uint64 { + if len(b) < uint64SizeBytes { + return nil } - // highly efficient way to get byte slice without copy before - // the introduction of unsafe.StringData in go1.20 - // (https://stackoverflow.com/questions/59209493/how-to-use-unsafe-get-a-byte-slice-from-a-string-without-memory-copy) - const MaxInt32 = 1<<31 - 1 - buf := (*[MaxInt32]byte)(unsafe.Pointer((*reflect.StringHeader)( - unsafe.Pointer(&val)).Data))[: len(val)&MaxInt32 : len(val)&MaxInt32] - return Hash(buf, alg) + + ptr := unsafe.SliceData(b) + if ptr == nil { + return nil + } + + return unsafe.Slice((*uint64)(unsafe.Pointer(ptr)), + len(b)/uint64SizeBytes) } diff --git a/go/arrow/cdata/cdata_allocate.go b/go/arrow/cdata/cdata_allocate.go new file mode 100644 index 0000000000000..da0bd957de1df --- /dev/null +++ b/go/arrow/cdata/cdata_allocate.go @@ -0,0 +1,57 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +//go:build go1.20 || tinygo + +package cdata + +// #include +// #include "arrow/c/abi.h" +import "C" + +import ( + "unsafe" +) + +func allocateArrowSchemaArr(n int) (out []CArrowSchema) { + return unsafe.Slice((*CArrowSchema)(C.calloc(C.size_t(n), + C.sizeof_struct_ArrowSchema)), n) +} + +func allocateArrowSchemaPtrArr(n int) (out []*CArrowSchema) { + return unsafe.Slice((**CArrowSchema)(C.calloc(C.size_t(n), + C.size_t(unsafe.Sizeof((*CArrowSchema)(nil))))), n) +} + +func allocateArrowArrayArr(n int) (out []CArrowArray) { + return unsafe.Slice((*CArrowArray)(C.calloc(C.size_t(n), + C.sizeof_struct_ArrowArray)), n) +} + +func allocateArrowArrayPtrArr(n int) (out []*CArrowArray) { + return unsafe.Slice((**CArrowArray)(C.calloc(C.size_t(n), + C.size_t(unsafe.Sizeof((*CArrowArray)(nil))))), n) +} + +func allocateBufferPtrArr(n int) (out []*C.void) { + return unsafe.Slice((**C.void)(C.calloc(C.size_t(n), + C.size_t(unsafe.Sizeof((*C.void)(nil))))), n) +} + +func allocateBufferSizeArr(n int) (out []C.int64_t) { + return unsafe.Slice((*C.int64_t)(C.calloc(C.size_t(n), + C.sizeof_int64_t)), n) +} diff --git a/go/arrow/cdata/cdata_exports.go b/go/arrow/cdata/cdata_exports.go index d59c87712eedf..fecc8610bf2a0 100644 --- a/go/arrow/cdata/cdata_exports.go +++ b/go/arrow/cdata/cdata_exports.go @@ -39,7 +39,6 @@ import ( "bytes" "encoding/binary" "fmt" - "reflect" "runtime/cgo" "strconv" "strings" @@ -291,60 +290,6 @@ func (exp *schemaExporter) export(field arrow.Field) { exp.exportMeta(&field.Metadata) } -func allocateArrowSchemaArr(n int) (out []CArrowSchema) { - s := (*reflect.SliceHeader)(unsafe.Pointer(&out)) - s.Data = uintptr(C.calloc(C.size_t(n), C.sizeof_struct_ArrowSchema)) - s.Len = n - s.Cap = n - - return -} - -func allocateArrowSchemaPtrArr(n int) (out []*CArrowSchema) { - s := (*reflect.SliceHeader)(unsafe.Pointer(&out)) - s.Data = uintptr(C.calloc(C.size_t(n), C.size_t(unsafe.Sizeof((*CArrowSchema)(nil))))) - s.Len = n - s.Cap = n - - return -} - -func allocateArrowArrayArr(n int) (out []CArrowArray) { - s := (*reflect.SliceHeader)(unsafe.Pointer(&out)) - s.Data = uintptr(C.calloc(C.size_t(n), C.sizeof_struct_ArrowArray)) - s.Len = n - s.Cap = n - - return -} - -func allocateArrowArrayPtrArr(n int) (out []*CArrowArray) { - s := (*reflect.SliceHeader)(unsafe.Pointer(&out)) - s.Data = uintptr(C.calloc(C.size_t(n), C.size_t(unsafe.Sizeof((*CArrowArray)(nil))))) - s.Len = n - s.Cap = n - - return -} - -func allocateBufferPtrArr(n int) (out []*C.void) { - s := (*reflect.SliceHeader)(unsafe.Pointer(&out)) - s.Data = uintptr(C.calloc(C.size_t(n), C.size_t(unsafe.Sizeof((*C.void)(nil))))) - s.Len = n - s.Cap = n - - return -} - -func allocateBufferSizeArr(n int) (out []C.int64_t) { - s := (*reflect.SliceHeader)(unsafe.Pointer(&out)) - s.Data = uintptr(C.calloc(C.size_t(n), C.size_t(unsafe.Sizeof(int64(0))))) - s.Len = n - s.Cap = n - - return -} - func (exp *schemaExporter) finish(out *CArrowSchema) { out.dictionary = nil if exp.dict != nil { diff --git a/go/arrow/compute/exec/span.go b/go/arrow/compute/exec/span.go index 6f9bb240e3469..4425784f25c94 100644 --- a/go/arrow/compute/exec/span.go +++ b/go/arrow/compute/exec/span.go @@ -19,7 +19,6 @@ package exec import ( - "reflect" "sync/atomic" "unsafe" @@ -250,22 +249,6 @@ func (a *ArraySpan) resizeChildren(i int) { } } -// convenience function for populating the offsets buffer from a scalar -// value's size. 
-func setOffsetsForScalar[T int32 | int64](span *ArraySpan, buf []T, valueSize int64, bufidx int) { - buf[0] = 0 - buf[1] = T(valueSize) - - b := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) - s := (*reflect.SliceHeader)(unsafe.Pointer(&span.Buffers[bufidx].Buf)) - s.Data = b.Data - s.Len = 2 * int(unsafe.Sizeof(T(0))) - s.Cap = s.Len - - span.Buffers[bufidx].Owner = nil - span.Buffers[bufidx].SelfAlloc = false -} - // FillFromScalar populates this ArraySpan as if it were a 1 length array // with the single value equal to the passed in Scalar. func (a *ArraySpan) FillFromScalar(val scalar.Scalar) { diff --git a/go/arrow/compute/exec/span_offsets.go b/go/arrow/compute/exec/span_offsets.go new file mode 100644 index 0000000000000..d2d0398884c9d --- /dev/null +++ b/go/arrow/compute/exec/span_offsets.go @@ -0,0 +1,36 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build go1.20 || tinygo + +package exec + +import ( + "unsafe" +) + +// convenience function for populating the offsets buffer from a scalar +// value's size. +func setOffsetsForScalar[T int32 | int64](span *ArraySpan, buf []T, valueSize int64, bufidx int) { + buf[0] = 0 + buf[1] = T(valueSize) + + span.Buffers[bufidx].Buf = unsafe.Slice((*byte)(unsafe.Pointer(unsafe.SliceData(buf))), + 2*int(unsafe.Sizeof(T(0)))) + + span.Buffers[bufidx].Owner = nil + span.Buffers[bufidx].SelfAlloc = false +} diff --git a/go/arrow/compute/fieldref.go b/go/arrow/compute/fieldref.go index ab6d856f85f0d..0c55c36dab243 100644 --- a/go/arrow/compute/fieldref.go +++ b/go/arrow/compute/fieldref.go @@ -20,12 +20,10 @@ import ( "errors" "fmt" "hash/maphash" - "math/bits" "reflect" "strconv" "strings" "unicode" - "unsafe" "github.com/apache/arrow/go/v16/arrow" "github.com/apache/arrow/go/v16/arrow/array" @@ -168,21 +166,6 @@ func (f FieldPath) GetColumn(batch arrow.Record) (arrow.Array, error) { return f.getArray(batch.Columns()) } -func (f FieldPath) hash(h *maphash.Hash) { - raw := (*reflect.SliceHeader)(unsafe.Pointer(&f)).Data - - var b []byte - s := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - s.Data = raw - if bits.UintSize == 32 { - s.Len = arrow.Int32Traits.BytesRequired(len(f)) - } else { - s.Len = arrow.Int64Traits.BytesRequired(len(f)) - } - s.Cap = s.Len - h.Write(b) -} - func (f FieldPath) findAll(fields []arrow.Field) []FieldPath { _, err := f.GetFieldFromSlice(fields) if err == nil { diff --git a/go/arrow/compute/fieldref_hash.go b/go/arrow/compute/fieldref_hash.go new file mode 100644 index 0000000000000..dace05788bb46 --- /dev/null +++ b/go/arrow/compute/fieldref_hash.go @@ -0,0 +1,39 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build go1.20 || tinygo + +package compute + +import ( + "hash/maphash" + "math/bits" + "unsafe" + + "github.com/apache/arrow/go/v16/arrow" +) + +func (f FieldPath) hash(h *maphash.Hash) { + raw := unsafe.Pointer(unsafe.SliceData(f)) + var byteLen int + if bits.UintSize == 32 { + byteLen = arrow.Int32Traits.BytesRequired(len(f)) + } else { + byteLen = arrow.Int64Traits.BytesRequired(len(f)) + } + + h.Write(unsafe.Slice((*byte)(raw), byteLen)) +} diff --git a/go/arrow/doc.go b/go/arrow/doc.go index 2f7c8c2acf1ce..19f24c5d0b8c3 100644 --- a/go/arrow/doc.go +++ b/go/arrow/doc.go @@ -30,8 +30,6 @@ array is valid (not null). If the array has no null entries, it is possible to o # Requirements -Despite the go.mod stating go1.20, everything is able to be built with go1.19 or higher. - To build with tinygo include the noasm build tag. */ package arrow diff --git a/go/arrow/flight/flightsql/driver/driver_test.go b/go/arrow/flight/flightsql/driver/driver_test.go index 79955f6099f8a..11b9036519d79 100644 --- a/go/arrow/flight/flightsql/driver/driver_test.go +++ b/go/arrow/flight/flightsql/driver/driver_test.go @@ -619,6 +619,7 @@ func (s *SqlTestSuite) TestRowsPrematureCloseDuringNextLoop() { require.NoError(t, err) require.Equal(t, int64(rowCount), insertedRows) + time.Sleep(200 * time.Millisecond) // Do query const sqlSelectAll = `SELECT id, name, value FROM ` + tableName diff --git a/go/arrow/memory/mallocator/mallocator.go b/go/arrow/memory/mallocator/mallocator.go index a111f009ec52d..59d240a1063e8 100644 --- a/go/arrow/memory/mallocator/mallocator.go +++ b/go/arrow/memory/mallocator/mallocator.go @@ -30,7 +30,6 @@ package mallocator import "C" import ( - "reflect" "sync/atomic" "unsafe" ) @@ -70,18 +69,18 @@ func (alloc *Mallocator) Allocate(size int) []byte { } func (alloc *Mallocator) Free(b []byte) { - sh := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - C.free(unsafe.Pointer(sh.Data)) + sz := len(b) + C.free(getPtr(b)) // Subtract sh.Len via two's complement (since atomic doesn't offer subtract) - atomic.AddUint64(&alloc.allocatedBytes, ^(uint64(sh.Len) - 1)) + atomic.AddUint64(&alloc.allocatedBytes, ^(uint64(sz) - 1)) } func (alloc *Mallocator) Reallocate(size int, b []byte) []byte { if size < 0 { panic("mallocator: negative size") } - sh := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - ptr, err := C.realloc_and_initialize(unsafe.Pointer(sh.Data), C.size_t(sh.Cap), C.size_t(size)) + cp := cap(b) + ptr, err := C.realloc_and_initialize(getPtr(b), C.size_t(cp), C.size_t(size)) if err != nil { panic(err) } else if ptr == nil && size != 0 { diff --git a/go/arrow/memory/mallocator/mallocator_util.go b/go/arrow/memory/mallocator/mallocator_util.go new file mode 100644 index 0000000000000..0ab5f8f515e17 --- /dev/null +++ b/go/arrow/memory/mallocator/mallocator_util.go @@ -0,0 +1,26 @@ +// Licensed to the 
Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//go:build go1.20 || tinygo + +package mallocator + +import "unsafe" + +func getPtr(b []byte) unsafe.Pointer { + return unsafe.Pointer(unsafe.SliceData(b)) +} diff --git a/go/go.mod b/go/go.mod index 5c297c74d6080..2f788c5c26b02 100644 --- a/go/go.mod +++ b/go/go.mod @@ -16,7 +16,7 @@ module github.com/apache/arrow/go/v16 -go 1.20 +go 1.21 require ( github.com/JohnCGriffin/overflow v0.0.0-20211019200055-46fa312c352c diff --git a/go/internal/hashing/hash_string.go b/go/internal/hashing/hash_string.go index b772c7d7f8998..c8579c1ec5eaa 100644 --- a/go/internal/hashing/hash_string.go +++ b/go/internal/hashing/hash_string.go @@ -24,3 +24,7 @@ func hashString(val string, alg uint64) uint64 { buf := unsafe.Slice(unsafe.StringData(val), len(val)) return Hash(buf, alg) } + +func strToBytes(v string) []byte { + return unsafe.Slice(unsafe.StringData(v), len(v)) +} diff --git a/go/internal/hashing/xxh3_memo_table.go b/go/internal/hashing/xxh3_memo_table.go index 283bc1a953f05..fbb8b33531bbd 100644 --- a/go/internal/hashing/xxh3_memo_table.go +++ b/go/internal/hashing/xxh3_memo_table.go @@ -22,7 +22,6 @@ package hashing import ( "bytes" "math" - "reflect" "unsafe" ) @@ -183,13 +182,7 @@ func (BinaryMemoTable) valAsByteSlice(val interface{}) []byte { case ByteSlice: return v.Bytes() case string: - var out []byte - h := (*reflect.StringHeader)(unsafe.Pointer(&v)) - s := (*reflect.SliceHeader)(unsafe.Pointer(&out)) - s.Data = h.Data - s.Len = h.Len - s.Cap = h.Len - return out + return strToBytes(v) default: panic("invalid type for binarymemotable") } diff --git a/go/parquet/types.go b/go/parquet/types.go index 8742c3ba8bfba..5447e793b4ea6 100644 --- a/go/parquet/types.go +++ b/go/parquet/types.go @@ -95,27 +95,13 @@ type int96Traits struct{} func (int96Traits) BytesRequired(n int) int { return Int96SizeBytes * n } func (int96Traits) CastFromBytes(b []byte) []Int96 { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - var res []Int96 - s := (*reflect.SliceHeader)(unsafe.Pointer(&res)) - s.Data = h.Data - s.Len = h.Len / Int96SizeBytes - s.Cap = h.Cap / Int96SizeBytes - - return res + return unsafe.Slice((*Int96)(unsafe.Pointer(unsafe.SliceData(b))), + len(b)/Int96SizeBytes) } func (int96Traits) CastToBytes(b []Int96) []byte { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - var res []byte - s := (*reflect.SliceHeader)(unsafe.Pointer(&res)) - s.Data = h.Data - s.Len = h.Len * Int96SizeBytes - s.Cap = h.Cap * Int96SizeBytes - - return res + return unsafe.Slice((*byte)(unsafe.Pointer(unsafe.SliceData(b))), + len(b)*Int96SizeBytes) } // ByteArray is a type to be utilized for representing the Parquet ByteArray physical type, represented as a byte slice @@ -142,15 +128,8 @@ func (byteArrayTraits) BytesRequired(n int) int { } 
func (byteArrayTraits) CastFromBytes(b []byte) []ByteArray { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - var res []ByteArray - s := (*reflect.SliceHeader)(unsafe.Pointer(&res)) - s.Data = h.Data - s.Len = h.Len / ByteArraySizeBytes - s.Cap = h.Cap / ByteArraySizeBytes - - return res + return unsafe.Slice((*ByteArray)(unsafe.Pointer(unsafe.SliceData(b))), + len(b)/ByteArraySizeBytes) } // FixedLenByteArray is a go type to represent a FixedLengthByteArray as a byte slice @@ -177,15 +156,8 @@ func (fixedLenByteArrayTraits) BytesRequired(n int) int { } func (fixedLenByteArrayTraits) CastFromBytes(b []byte) []FixedLenByteArray { - h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - - var res []FixedLenByteArray - s := (*reflect.SliceHeader)(unsafe.Pointer(&res)) - s.Data = h.Data - s.Len = h.Len / FixedLenByteArraySizeBytes - s.Cap = h.Cap / FixedLenByteArraySizeBytes - - return res + return unsafe.Slice((*FixedLenByteArray)(unsafe.Pointer(unsafe.SliceData(b))), + len(b)/FixedLenByteArraySizeBytes) } // Creating our own enums allows avoiding the transitive dependency on the From 7d1111214d70e2fd069962efb4d8d42a2829e95b Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Thu, 28 Mar 2024 16:05:03 -0400 Subject: [PATCH 18/81] GH-40847: [Go] update readme (#40877) ### Rationale for this change Remove reference to deleted internal package * GitHub Issue: #40847 Authored-by: Matt Topol Signed-off-by: Matt Topol --- go/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/go/README.md b/go/README.md index 4a9e151ddf234..20bd7cd77575e 100644 --- a/go/README.md +++ b/go/README.md @@ -87,8 +87,8 @@ advanced optimizer and generate PLAN9 assembly functions from C/C++ code. The arrow package can be compiled without these optimizations using the `noasm` build tag. Alternatively, by configuring an environment variable, it is possible to dynamically configure which architecture optimizations are used at -runtime. See the `cpu` package [README](arrow/internal/cpu/README.md) for a -description of this environment variable. +runtime. We use the (cpu)[https://pkg.go.dev/golang.org/x/sys/cpu] package to +check dynamically for these features. ### Example Usage From 29314394d3c17e332cb3bb42464dd20888d88a74 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 29 Mar 2024 06:07:08 +0900 Subject: [PATCH 19/81] MINOR: [Java] Bump org.apache.maven.plugins:maven-surefire-plugin from 3.2.3 to 3.2.5 in /java (#40525) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [org.apache.maven.plugins:maven-surefire-plugin](https://github.com/apache/maven-surefire) from 3.2.3 to 3.2.5.
Release notes

Sourced from org.apache.maven.plugins:maven-surefire-plugin's releases.

3.2.5

JIRA link: Release Notes - Maven Surefire - Version 3.2.5

What's Changed

... (truncated)

Commits
  • 4b3a271 [maven-release-plugin] prepare release surefire-3.2.5
  • eb3f1d9 Bump org.codehaus.plexus:plexus-component-metadata from 2.1.1 to 2.2.0
  • 430c406 Bump org.assertj:assertj-core from 3.24.2 to 3.25.1
  • 2d92f2d [SUREFIRE-2231] JaCoCo 0.8.11 fails with old TestNG releases on Java 17+
  • 3290740 Bump org.apache.maven.plugins:maven-docck-plugin from 1.1 to 1.2
  • 25a9776 Bump net.java.dev.javacc:javacc from 7.0.12 to 7.0.13
  • 7752f7e Bump commons-io:commons-io from 2.15.0 to 2.15.1
  • 8874add Revert "Bump jacocoVersion from 0.8.8 to 0.8.11"
  • c0f7755 Fix formatting
  • e5f4545 Bump jacocoVersion from 0.8.8 to 0.8.11
  • Additional commits viewable in compare view
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- java/performance/pom.xml | 2 +- java/pom.xml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/java/performance/pom.xml b/java/performance/pom.xml index d3bba882a0898..3f69be32a20e5 100644 --- a/java/performance/pom.xml +++ b/java/performance/pom.xml @@ -207,7 +207,7 @@ maven-surefire-plugin - 3.2.3 + 3.2.5 diff --git a/java/pom.xml b/java/pom.xml index 659ccfca08c76..850b4d0508539 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -454,7 +454,7 @@ maven-surefire-plugin - 3.2.3 + 3.2.5 org.junit.jupiter From 50ca7a76d38e6ecf19589bc44f46bffd1db0d4c8 Mon Sep 17 00:00:00 2001 From: Dane Pitkin <48041712+danepitkin@users.noreply.github.com> Date: Thu, 28 Mar 2024 17:09:18 -0400 Subject: [PATCH 20/81] GH-40716: [Java][Integration] Fix test_package_java in verification scripts (#40724) ### Rationale for this change JPMS changed the location of JNI libs in the dist dir. ### What changes are included in this PR? * Update the dist path in the verification script ### Are these changes tested? CI ### Are there any user-facing changes? No * GitHub Issue: #40716 Authored-by: Dane Pitkin Signed-off-by: Sutou Kouhei --- dev/release/verify-release-candidate.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index e7d78328ed16c..f18b18aaa997c 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -642,8 +642,8 @@ test_package_java() { normalized_arch=x86_64 ;; esac - mkdir -p ${dist_dir}/${normalized_arch}/ - mv ${install_dir}/lib/* ${dist_dir}/${normalized_arch}/ + mkdir -p ${dist_dir} + mv ${install_dir}/lib/* ${dist_dir} mvn install \ -Darrow.c.jni.dist.dir=${dist_dir} \ -Parrow-c-data From ed8c3630dbe2261bed9123a4ccfc7df0e3f031bd Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Fri, 29 Mar 2024 08:29:28 +0100 Subject: [PATCH 21/81] GH-40841: [Docs][C++][Python] Add initial documentation for RecordBatch::Tensor conversion (#40842) ### Rationale for this change The work on the conversion from `Table`/`RecordBatch` to `Tensor` is progressing and we have to make sure to add information to the documentation. ### What changes are included in this PR? I propose to add - new page (`converting_recordbatch_to_tensor.rst`) in the `cpp/examples` section, - added section (Conversion of RecordBatch do Tensor) in the `docs/source/python/data.rst` the content above would be updated as the features are added in the future (row-major conversion, `Table::ToTensor`, DLPack support for `Tensor` class, etc.) ### Are these changes tested? It will be tested with the crossbow preview-docs job. ### Are there any user-facing changes? No, just documentation. 
* GitHub Issue: #40841 Lead-authored-by: AlenkaF Co-authored-by: Alenka Frim Co-authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- .../converting_recordbatch_to_tensor.rst | 46 ++++++++++++++++ docs/source/cpp/examples/index.rst | 1 + docs/source/python/data.rst | 52 +++++++++++++++++++ 3 files changed, 99 insertions(+) create mode 100644 docs/source/cpp/examples/converting_recordbatch_to_tensor.rst diff --git a/docs/source/cpp/examples/converting_recordbatch_to_tensor.rst b/docs/source/cpp/examples/converting_recordbatch_to_tensor.rst new file mode 100644 index 0000000000000..2be27096cf973 --- /dev/null +++ b/docs/source/cpp/examples/converting_recordbatch_to_tensor.rst @@ -0,0 +1,46 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. default-domain:: cpp +.. highlight:: cpp + +Conversion of ``RecordBatch`` to ``Tensor`` instances +===================================================== + +Arrow provides a method to convert ``RecordBatch`` objects to a ``Tensor`` +with two dimensions: + +.. code:: + + std::shared_ptr batch; + + ASSERT_OK_AND_ASSIGN(auto tensor, batch->ToTensor()); + ASSERT_OK(tensor->Validate()); + +The conversion supports signed and unsigned integer types plus float types. +In case the ``RecordBatch`` has null values the conversion succeeds if +``null_to_nan`` parameter is set to ``true``. In this case all +types will be promoted to a floating-point data type. + +.. code:: + + std::shared_ptr batch; + + ASSERT_OK_AND_ASSIGN(auto tensor, batch->ToTensor(/*null_to_nan=*/true)); + ASSERT_OK(tensor->Validate()); + +Currently only column-major conversion is supported. diff --git a/docs/source/cpp/examples/index.rst b/docs/source/cpp/examples/index.rst index b886a0d29e8da..90b00bbdf6ac7 100644 --- a/docs/source/cpp/examples/index.rst +++ b/docs/source/cpp/examples/index.rst @@ -27,3 +27,4 @@ Examples dataset_skyhook_scan_example row_columnar_conversion std::tuple-like ranges to Arrow + Converting RecordBatch to Tensor diff --git a/docs/source/python/data.rst b/docs/source/python/data.rst index 2cc33561d40b6..9156157fcd0c2 100644 --- a/docs/source/python/data.rst +++ b/docs/source/python/data.rst @@ -560,3 +560,55 @@ schema without having to get any of the batches.:: x: int64 It can also be sent between languages using the :ref:`C stream interface `. + +Conversion of RecordBatch do Tensor +----------------------------------- + +Each array of the ``RecordBatch`` has it's own contiguous memory that is not necessarily +adjacent to other arrays. A different memory structure that is used in machine learning +libraries is a two dimensional array (also called a 2-dim tensor or a matrix) which takes +only one contiguous block of memory. 
+ +For this reason there is a function ``pyarrow.RecordBatch.to_tensor()`` available +to efficiently convert tabular columnar data into a tensor. + +Data types supported in this conversion are unsigned, signed integer and float +types. Currently only column-major conversion is supported. + + >>> import pyarrow as pa + >>> arr1 = [1, 2, 3, 4, 5] + >>> arr2 = [10, 20, 30, 40, 50] + >>> batch = pa.RecordBatch.from_arrays( + ... [ + ... pa.array(arr1, type=pa.uint16()), + ... pa.array(arr2, type=pa.int16()), + ... ], ["a", "b"] + ... ) + >>> batch.to_tensor() + + type: int32 + shape: (9, 2) + strides: (4, 36) + >>> batch.to_tensor().to_numpy() + array([[ 1, 10], + [ 2, 20], + [ 3, 30], + [ 4, 40], + [ 5, 50]], dtype=int32) + +With ``null_to_nan`` set to ``True`` one can also convert data with +nulls. They will be converted to ``NaN``: + + >>> import pyarrow as pa + >>> batch = pa.record_batch( + ... [ + ... pa.array([1, 2, 3, 4, None], type=pa.int32()), + ... pa.array([10, 20, 30, 40, None], type=pa.float32()), + ... ], names = ["a", "b"] + ... ) + >>> batch.to_tensor(null_to_nan=True).to_numpy() + array([[ 1., 10.], + [ 2., 20.], + [ 3., 30.], + [ 4., 40.], + [nan, nan]]) From 96f686b81ba148f4d434846f0b9e161c538f131d Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Fri, 29 Mar 2024 08:30:03 +0100 Subject: [PATCH 22/81] GH-40061: [C++][Python] Basic conversion of RecordBatch to Arrow Tensor - add option to cast NULL to NaN (#40803) ### Rationale for this change The conversion from `RecordBatch` to `Tensor` class exists but it doesn't support record batches with validity bitmaps. This PR adds support for an option to convert null values to NaN. ### What changes are included in this PR? This PR adds a `nul_to_nan` option in `RecordBatch::ToTensor` so that null values are converted to NaN in the resulting `Tensor`. This for example works: ```python >>> import pyarrow as pa >>> batch = pa.record_batch( ... [ ... pa.array([1, 2, 3, 4, None], type=pa.int32()), ... pa.array([10, 20, 30, 40, None], type=pa.float32()), ... ], names = ["a", "b"] ... ) >>> batch pyarrow.RecordBatch a: int32 b: float ---- a: [1,2,3,4,null] b: [10,20,30,40,null] >>> batch.to_tensor(null_to_nan=True) type: double shape: (5, 2) strides: (8, 40) >>> batch.to_tensor(null_to_nan=True).to_numpy() array([[ 1., 10.], [ 2., 20.], [ 3., 30.], [ 4., 40.], [nan, nan]]) ``` but default would raise: ```python >>> batch.to_tensor() Traceback (most recent call last): File "", line 1, in File "pyarrow/table.pxi", line 3421, in pyarrow.lib.RecordBatch.to_tensor a: int32 File "pyarrow/error.pxi", line 154, in pyarrow.lib.pyarrow_internal_check_status return check_status(status) File "pyarrow/error.pxi", line 91, in pyarrow.lib.check_status raise convert_status(status) pyarrow.lib.ArrowTypeError: Can only convert a RecordBatch with no nulls. Set null_to_nan to true to convert nulls to nan ``` ### Are these changes tested? Yes. ### Are there any user-facing changes? No. 
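On the C++ side the new option is used the same way; a minimal sketch, assuming `batch` is a `std::shared_ptr<arrow::RecordBatch>` populated elsewhere with numeric columns that may contain nulls (mirroring the example added to the C++ docs):

```cpp
// Minimal sketch: batch is assumed to hold signed/unsigned integer or
// float columns that may contain null values.
std::shared_ptr<arrow::RecordBatch> batch;

// With null_to_nan=true, null slots become NaN and integer columns are
// promoted to an appropriate floating-point type; the default (false)
// returns a TypeError for batches containing nulls.
ASSERT_OK_AND_ASSIGN(auto tensor, batch->ToTensor(/*null_to_nan=*/true));
ASSERT_OK(tensor->Validate());
```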
* GitHub Issue: #40061 Lead-authored-by: AlenkaF Co-authored-by: Alenka Frim Co-authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- cpp/src/arrow/record_batch.cc | 47 ++++++++++++----- cpp/src/arrow/record_batch.h | 6 ++- cpp/src/arrow/record_batch_test.cc | 76 +++++++++++++++++++++++++++- python/pyarrow/includes/libarrow.pxd | 2 +- python/pyarrow/table.pxi | 49 ++++++++++++++++-- python/pyarrow/tests/test_table.py | 48 +++++++++++++++++- 6 files changed, 208 insertions(+), 20 deletions(-) diff --git a/cpp/src/arrow/record_batch.cc b/cpp/src/arrow/record_batch.cc index 0d8bda9b66e24..6f3b8e75a20d0 100644 --- a/cpp/src/arrow/record_batch.cc +++ b/cpp/src/arrow/record_batch.cc @@ -18,6 +18,7 @@ #include "arrow/record_batch.h" #include +#include #include #include #include @@ -261,12 +262,19 @@ struct ConvertColumnsToTensorVisitor { using In = typename T::c_type; auto in_values = ArraySpan(in_data).GetSpan(1, in_data.length); - if constexpr (std::is_same_v) { - memcpy(out_values, in_values.data(), in_values.size_bytes()); - out_values += in_values.size(); + if (in_data.null_count == 0) { + if constexpr (std::is_same_v) { + memcpy(out_values, in_values.data(), in_values.size_bytes()); + out_values += in_values.size(); + } else { + for (In in_value : in_values) { + *out_values++ = static_cast(in_value); + } + } } else { - for (In in_value : in_values) { - *out_values++ = static_cast(in_value); + for (int64_t i = 0; i < in_data.length; ++i) { + *out_values++ = + in_data.IsNull(i) ? static_cast(NAN) : static_cast(in_values[i]); } } return Status::OK(); @@ -286,16 +294,20 @@ inline void ConvertColumnsToTensor(const RecordBatch& batch, uint8_t* out) { } } -Result> RecordBatch::ToTensor(MemoryPool* pool) const { +Result> RecordBatch::ToTensor(bool null_to_nan, + MemoryPool* pool) const { if (num_columns() == 0) { return Status::TypeError( "Conversion to Tensor for RecordBatches without columns/schema is not " "supported."); } // Check for no validity bitmap of each field + // if null_to_nan conversion is set to false for (int i = 0; i < num_columns(); ++i) { - if (column(i)->null_count() > 0) { - return Status::TypeError("Can only convert a RecordBatch with no nulls."); + if (column(i)->null_count() > 0 && !null_to_nan) { + return Status::TypeError( + "Can only convert a RecordBatch with no nulls. 
Set null_to_nan to true to " + "convert nulls to NaN"); } } @@ -308,12 +320,12 @@ Result> RecordBatch::ToTensor(MemoryPool* pool) const { std::shared_ptr result_field = schema_->field(0); std::shared_ptr result_type = result_field->type(); - if (num_columns() > 1) { - Field::MergeOptions options; - options.promote_integer_to_float = true; - options.promote_integer_sign = true; - options.promote_numeric_width = true; + Field::MergeOptions options; + options.promote_integer_to_float = true; + options.promote_integer_sign = true; + options.promote_numeric_width = true; + if (num_columns() > 1) { for (int i = 1; i < num_columns(); ++i) { if (!is_numeric(column(i)->type()->id())) { return Status::TypeError("DataType is not supported: ", @@ -334,6 +346,15 @@ Result> RecordBatch::ToTensor(MemoryPool* pool) const { result_type = result_field->type(); } + // Check if result_type is signed or unsigned integer and null_to_nan is set to true + // Then all columns should be promoted to float type + if (is_integer(result_type->id()) && null_to_nan) { + ARROW_ASSIGN_OR_RAISE( + result_field, + result_field->MergeWith(field(result_field->name(), float32()), options)); + result_type = result_field->type(); + } + // Allocate memory ARROW_ASSIGN_OR_RAISE( std::shared_ptr result, diff --git a/cpp/src/arrow/record_batch.h b/cpp/src/arrow/record_batch.h index 16d721caad443..5202ff4abfa0b 100644 --- a/cpp/src/arrow/record_batch.h +++ b/cpp/src/arrow/record_batch.h @@ -85,8 +85,12 @@ class ARROW_EXPORT RecordBatch { /// Create a Tensor object with shape (number of rows, number of columns) and /// strides (type size in bytes, type size in bytes * number of rows). /// Generated Tensor will have column-major layout. + /// + /// \param[in] null_to_nan if true, convert nulls to NaN + /// \param[in] pool the memory pool to allocate the tensor buffer + /// \return the resulting Tensor Result> ToTensor( - MemoryPool* pool = default_memory_pool()) const; + bool null_to_nan = false, MemoryPool* pool = default_memory_pool()) const; /// \brief Construct record batch from struct array /// diff --git a/cpp/src/arrow/record_batch_test.cc b/cpp/src/arrow/record_batch_test.cc index 81154452d7229..7e0eb1d460555 100644 --- a/cpp/src/arrow/record_batch_test.cc +++ b/cpp/src/arrow/record_batch_test.cc @@ -667,7 +667,8 @@ TEST_F(TestRecordBatch, ToTensorUnsupportedMissing) { auto batch = RecordBatch::Make(schema, length, {a0, a1}); ASSERT_RAISES_WITH_MESSAGE(TypeError, - "Type error: Can only convert a RecordBatch with no nulls.", + "Type error: Can only convert a RecordBatch with no nulls. 
" + "Set null_to_nan to true to convert nulls to NaN", batch->ToTensor()); } @@ -740,6 +741,79 @@ TEST_F(TestRecordBatch, ToTensorSupportedNaN) { CheckTensor(tensor, 18, shape, f_strides); } +TEST_F(TestRecordBatch, ToTensorSupportedNullToNan) { + const int length = 9; + + // int32 + float32 = float64 + auto f0 = field("f0", int32()); + auto f1 = field("f1", float32()); + + std::vector> fields = {f0, f1}; + auto schema = ::arrow::schema(fields); + + auto a0 = ArrayFromJSON(int32(), "[null, 2, 3, 4, 5, 6, 7, 8, 9]"); + auto a1 = ArrayFromJSON(float32(), "[10, 20, 30, 40, null, 60, 70, 80, 90]"); + + auto batch = RecordBatch::Make(schema, length, {a0, a1}); + + ASSERT_OK_AND_ASSIGN(auto tensor, batch->ToTensor(/*null_to_nan=*/true)); + ASSERT_OK(tensor->Validate()); + + std::vector shape = {9, 2}; + const int64_t f64_size = sizeof(double); + std::vector f_strides = {f64_size, f64_size * shape[0]}; + std::shared_ptr tensor_expected = TensorFromJSON( + float64(), "[NaN, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, NaN, 60, 70, 80, 90]", + shape, f_strides); + + EXPECT_FALSE(tensor_expected->Equals(*tensor)); + EXPECT_TRUE(tensor_expected->Equals(*tensor, EqualOptions().nans_equal(true))); + + CheckTensor(tensor, 18, shape, f_strides); + + // int32 -> float64 + auto f2 = field("f2", int32()); + + std::vector> fields1 = {f0, f2}; + auto schema1 = ::arrow::schema(fields1); + + auto a2 = ArrayFromJSON(int32(), "[10, 20, 30, 40, null, 60, 70, 80, 90]"); + auto batch1 = RecordBatch::Make(schema1, length, {a0, a2}); + + ASSERT_OK_AND_ASSIGN(auto tensor1, batch1->ToTensor(/*null_to_nan=*/true)); + ASSERT_OK(tensor1->Validate()); + + EXPECT_FALSE(tensor_expected->Equals(*tensor1)); + EXPECT_TRUE(tensor_expected->Equals(*tensor1, EqualOptions().nans_equal(true))); + + CheckTensor(tensor1, 18, shape, f_strides); + + // int8 -> float32 + auto f3 = field("f3", int8()); + auto f4 = field("f4", int8()); + + std::vector> fields2 = {f3, f4}; + auto schema2 = ::arrow::schema(fields2); + + auto a3 = ArrayFromJSON(int8(), "[null, 2, 3, 4, 5, 6, 7, 8, 9]"); + auto a4 = ArrayFromJSON(int8(), "[10, 20, 30, 40, null, 60, 70, 80, 90]"); + auto batch2 = RecordBatch::Make(schema2, length, {a3, a4}); + + ASSERT_OK_AND_ASSIGN(auto tensor2, batch2->ToTensor(/*null_to_nan=*/true)); + ASSERT_OK(tensor2->Validate()); + + const int64_t f32_size = sizeof(float); + std::vector f_strides_2 = {f32_size, f32_size * shape[0]}; + std::shared_ptr tensor_expected_2 = TensorFromJSON( + float32(), "[NaN, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, NaN, 60, 70, 80, 90]", + shape, f_strides_2); + + EXPECT_FALSE(tensor_expected_2->Equals(*tensor2)); + EXPECT_TRUE(tensor_expected_2->Equals(*tensor2, EqualOptions().nans_equal(true))); + + CheckTensor(tensor2, 18, shape, f_strides_2); +} + TEST_F(TestRecordBatch, ToTensorSupportedTypesMixed) { const int length = 9; diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 9e5e3d3fa683b..aa50dd189a82d 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -984,7 +984,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: shared_ptr[CRecordBatch] Slice(int64_t offset) shared_ptr[CRecordBatch] Slice(int64_t offset, int64_t length) - CResult[shared_ptr[CTensor]] ToTensor() const + CResult[shared_ptr[CTensor]] ToTensor(c_bool null_to_nan, CMemoryPool* pool) const cdef cppclass CRecordBatchWithMetadata" arrow::RecordBatchWithMetadata": shared_ptr[CRecordBatch] batch diff --git a/python/pyarrow/table.pxi 
b/python/pyarrow/table.pxi index 1ab3fd04ed9f0..54fda1da7dcaf 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -3389,21 +3389,64 @@ cdef class RecordBatch(_Tabular): deref(c_record_batch).ToStructArray()) return pyarrow_wrap_array(c_array) - def to_tensor(self): + def to_tensor(self, c_bool null_to_nan=False, MemoryPool memory_pool=None): """ Convert to a :class:`~pyarrow.Tensor`. RecordBatches that can be converted have fields of type signed or unsigned - integer or float, including all bit-widths, with no validity bitmask. + integer or float, including all bit-widths. RecordBatches with validity bitmask + for any of the arrays can be converted with ``null_to_nan``turned to ``True``. + In this case null values are converted to NaN and signed or unsigned integer + type arrays are promoted to appropriate float type. + + Parameters + ---------- + null_to_nan : bool, default False + Whether to write null values in the result as ``NaN``. + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool + + Examples + -------- + >>> import pyarrow as pa + >>> batch = pa.record_batch( + ... [ + ... pa.array([1, 2, 3, 4, None], type=pa.int32()), + ... pa.array([10, 20, 30, 40, None], type=pa.float32()), + ... ], names = ["a", "b"] + ... ) + + >>> batch + pyarrow.RecordBatch + a: int32 + b: float + ---- + a: [1,2,3,4,null] + b: [10,20,30,40,null] + + >>> batch.to_tensor(null_to_nan=True) + + type: double + shape: (5, 2) + strides: (8, 40) + + >>> batch.to_tensor(null_to_nan=True).to_numpy() + array([[ 1., 10.], + [ 2., 20.], + [ 3., 30.], + [ 4., 40.], + [nan, nan]]) """ cdef: shared_ptr[CRecordBatch] c_record_batch shared_ptr[CTensor] c_tensor + CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool) c_record_batch = pyarrow_unwrap_batch(self) with nogil: c_tensor = GetResultValue( - deref(c_record_batch).ToTensor()) + deref(c_record_batch).ToTensor(null_to_nan, + pool)) return pyarrow_wrap_tensor(c_tensor) def _export_to_c(self, out_ptr, out_schema_ptr=0): diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index a7d917c2baf2d..8e30574188763 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -1061,7 +1061,7 @@ def test_recordbatch_to_tensor_null(): arr2 = [10, 20, 30, 40, 50, 60, 70, None, 90] batch = pa.RecordBatch.from_arrays( [ - pa.array(arr1, type=pa.float32()), + pa.array(arr1, type=pa.int32()), pa.array(arr2, type=pa.float32()), ], ["a", "b"] ) @@ -1071,6 +1071,52 @@ def test_recordbatch_to_tensor_null(): ): batch.to_tensor() + result = batch.to_tensor(null_to_nan=True) + + x = np.array([arr1, arr2], np.float64).transpose() + expected = pa.Tensor.from_numpy(x) + + np.testing.assert_equal(result.to_numpy(), x) + assert result.size == 18 + assert result.type == pa.float64() + assert result.shape == expected.shape + assert result.strides == expected.strides + + # int32 -> float64 + batch = pa.RecordBatch.from_arrays( + [ + pa.array(arr1, type=pa.int32()), + pa.array(arr2, type=pa.int32()), + ], ["a", "b"] + ) + + result = batch.to_tensor(null_to_nan=True) + + np.testing.assert_equal(result.to_numpy(), x) + assert result.size == 18 + assert result.type == pa.float64() + assert result.shape == expected.shape + assert result.strides == expected.strides + + # int8 -> float32 + batch = pa.RecordBatch.from_arrays( + [ + pa.array(arr1, type=pa.int8()), + pa.array(arr2, type=pa.int8()), + ], ["a", "b"] + ) + + result = batch.to_tensor(null_to_nan=True) + + x = 
np.array([arr1, arr2], np.float32).transpose() + expected = pa.Tensor.from_numpy(x) + + np.testing.assert_equal(result.to_numpy(), x) + assert result.size == 18 + assert result.type == pa.float32() + assert result.shape == expected.shape + assert result.strides == expected.strides + def test_recordbatch_to_tensor_empty(): batch = pa.RecordBatch.from_arrays( From d32e4b053e6fd70ff4f0e2a0552f2bf3b94647b3 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 29 Mar 2024 14:46:22 -0400 Subject: [PATCH 23/81] MINOR: [Java] Bump org.apache.hadoop dependencies from 3.3.6 to 3.4.0 in /java (#40890) Updates the Hadoop version to 3.4.0 to address vulnerabilities identified in https://deps.dev/maven/org.apache.hadoop%3Ahadoop-common/3.3.6 --- java/adapter/orc/pom.xml | 6 +++--- java/pom.xml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/java/adapter/orc/pom.xml b/java/adapter/orc/pom.xml index e7a2bfe872eb3..060aed5dcf156 100644 --- a/java/adapter/orc/pom.xml +++ b/java/adapter/orc/pom.xml @@ -58,7 +58,7 @@ org.apache.hadoop hadoop-client-runtime - 3.3.6 + ${dep.hadoop.version} test @@ -70,12 +70,12 @@ org.apache.hadoop hadoop-client-api - 3.3.6 + ${dep.hadoop.version} org.apache.hadoop hadoop-common - 3.3.6 + ${dep.hadoop.version} test diff --git a/java/pom.xml b/java/pom.xml index 850b4d0508539..b05b2d8f1425a 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -37,7 +37,7 @@ 1.61.1 3.23.1 2.17.0 - 3.3.6 + 3.4.0 23.5.26 1.11.3 From ce11e561d37db3cdbc8c55e000ca46256f504dc1 Mon Sep 17 00:00:00 2001 From: Kevin Gurney Date: Fri, 29 Mar 2024 16:57:39 -0400 Subject: [PATCH 24/81] GH-38659: [CI][MATLAB][Packaging] Add MATLAB `packaging` task to crossbow `tasks.yml` (#38660) ### Rationale for this change Per the following mailing list discussion: https://lists.apache.org/thread/0xyow40h7b1bptsppb0rxd4g9r1xpmh6 to integrate the MATLAB interface code with the existing Arrow release tooling, we first need to add a task to the [`packaging` group](https://github.com/apache/arrow/blob/1fd11d33cb56fd7eff4dce05edaba1c9d8a1dccd/dev/tasks/tasks.yml#L55) to crossbow. This packaging task will automatically create a [MLTBX file](https://www.mathworks.com/help/matlab/creating-help.html?s_tid=CRUX_lftnav) (the MATLAB equivalent to a Python binary wheel or Ruby gem) that can be installed via a "one-click" workflow in MATLAB. This will enable MATLAB users to install the interface without needing to build from source. ### Licensing For more information about licensing of the MLTBX file contents, please refer to the mailing list discussion and ASF Legal ticket linked below: 1. https://lists.apache.org/thread/zlpnncgvo6l4cvkxfxn7zt4q7qhptotw 2. https://issues.apache.org/jira/browse/LEGAL-665 ### What changes are included in this PR? 1. Added a `matlab` task to the [`packaging` group](https://github.com/apache/arrow/blob/1fd11d33cb56fd7eff4dce05edaba1c9d8a1dccd/dev/tasks/tasks.yml#L55) in `dev/tasks/tasks.yml`. 4. Added a new GitHub Actions workflow called `dev/tasks/matlab/github.yml` which builds the MATLAB interface code on all platforms (Windows, macOS, and Ubuntu 20.04) and packages the generated build artifacts into a single MLTBX file using [`matlab.addons.toolbox.packageToolbox`](https://www.mathworks.com/help/matlab/ref/matlab.addons.toolbox.packagetoolbox.html). 5. Changed the GitHub-hosted runner to `ubuntu-20.04` from `ubuntu-latest` for the MATLAB CI check (i.e. `.github/workflows/matlab.yml`). 
The rationale for this change is that we primarily develop and qualify against Debian 11 locally, but the CI check has been building against `ubuntu-latest` (i.e. `ubuntu-22.04`). There are two issues with using `ubuntu-22.04`. The first is that the version of `GLIBC` shipped with `ubuntu-22.04` is not fully compatible with the version of `GLIBC` shipped with `Debian 11`. This results in a runtime linker error when qualifying the packaged MATLAB interface code locally on Debian 11. The second issue with using `ubuntu-22.04` is that the system version of `GLIBCXX` is not fully compatible with the version of `GLIBCXX` bundled with MATLAB R2023a (this is a relatively common issue - e.g. see: https://www.mathworks.com/matlabcentral/answers/1907290-how-to-manually-select-the-libstdc-library-to-use-to-resolve-a-version-glibcxx_-not-found). Previously, we worked around this issue in GitHub Actions by using `LD_PRELOAD` before starting up MATLAB to run the unit tests. On the other hand, the version of `GLIBCXX` shipped with `ubuntu-20.04` **is** binary compatible with the version bundled with MATLAB R2023a. Therefore, we believe it would be better to use `ubuntu-20.04` in the MATLAB CI checks for the time being until we can qualify the MATLAB interface against `ubuntu-22.04`. ### Are these changes tested? Yes. 1. Successfully submitted a crossbow `packaging` job for the MATLAB interface by commenting `@ github-actions crossbow submit matlab`. Example of a successful packaging job: https://github.com/ursacomputing/crossbow/actions/runs/6893506432/job/18753227453. 2. Manually installed the resulting MLTBX file on macOS, Windows, Debian 11, and Ubuntu 20.04. Ran all tests under `matlab/test` using `runtests . IncludeSubFolders 1`. ### Are there any user-facing changes? No. ### Notes 1. While qualifying, we discovered that [MATLAB's programmatic packaging interface](https://www.mathworks.com/help/matlab/ref/matlab.addons.toolbox.packagetoolbox.html) does not properly include symbolic link files in the packaged MLTBX file. We've reported this bug to the relevant MathWorks development team. As a temporary workaround, we included a step to change the expected name of the Arrow C++ libraries (using `patchelf`/`install_name_tool`) which `libarrowproxy.so`/`libarrowproxy.dylib` depends on to `libarrow.so.1500.0.0`/`libarrow.1500.0.0.dylib` instead of `libarrow.so.1500`/`libarrow.1500.dylib`, respectively. Once this bug is resolved, we will remove this step from the workflow. ### Future Directions 1. Add tooling to upload release candidate (RC) MLTBX files to apache/arrow's GitHub Releases area and mark them as "Prerelease". In other words, modify https://github.com/apache/arrow/blob/main/dev/release/05-binary-upload.sh. 2. Add a post-release script to upload release MLTBX files to apache/arrow's GitHub Releases area (similar to how https://github.com/apache/arrow/blob/main/dev/release/post-09-python.sh works). 4. Enable nightly builds for the MATLAB interface. 6. Document how to qualify a MATLAB Arrow interface release. 7. Enable building and testing the MATLAB Arrow interface on multiple Ubuntu distributions simulatneously (e.g. 20.04 *and* 22.04). 
* Closes: #38659 * GitHub Issue: #38659 Lead-authored-by: Sarah Gilmore Co-authored-by: Kevin Gurney Signed-off-by: Kevin Gurney --- .github/workflows/matlab.yml | 28 +++-- dev/tasks/matlab/github.yml | 162 ++++++++++++++++++++++++++ dev/tasks/tasks.yml | 9 ++ matlab/CMakeLists.txt | 17 --- matlab/tools/packageMatlabInterface.m | 84 +++++++++++++ 5 files changed, 273 insertions(+), 27 deletions(-) create mode 100644 dev/tasks/matlab/github.yml create mode 100644 matlab/tools/packageMatlabInterface.m diff --git a/.github/workflows/matlab.yml b/.github/workflows/matlab.yml index eceeb551a0653..dfc734e043371 100644 --- a/.github/workflows/matlab.yml +++ b/.github/workflows/matlab.yml @@ -42,7 +42,23 @@ jobs: ubuntu: name: AMD64 Ubuntu 20.04 MATLAB - runs-on: ubuntu-latest + # Explicitly pin the Ubuntu version to 20.04 for the time being because: + # + # 1. The version of GLIBCXX shipped with Ubuntu 22.04 is not binary compatible + # with the GLIBCXX bundled with MATLAB R2023a. This is a relatively common + # issue. + # + # For example, see: + # + # https://www.mathworks.com/matlabcentral/answers/1907290-how-to-manually-select-the-libstdc-library-to-use-to-resolve-a-version-glibcxx_-not-found + # + # 2. The version of GLIBCXX shipped with Ubuntu 22.04 is not binary compatible with + # the version of GLIBCXX shipped with Debian 11. Several of the Arrow community + # members who work on the MATLAB bindings use Debian 11 locally for qualification. + # Using Ubuntu 20.04 eases development workflows for these community members. + # + # In the future, we can investigate adding support for building against more Linux (e.g. `ubuntu-22.04`) and MATLAB versions (e.g. R2023b). + runs-on: ubuntu-20.04 if: ${{ !contains(github.event.pull_request.title, 'WIP') }} steps: - name: Check out repository @@ -74,14 +90,6 @@ jobs: run: ci/scripts/matlab_build.sh $(pwd) - name: Run MATLAB Tests env: - # libarrow.so requires a more recent version of libstdc++.so - # than is bundled with MATLAB under /sys/os/glnxa64. - # Therefore, if a MEX function that depends on libarrow.so - # is executed within the MATLAB address space, runtime linking - # errors will occur. To work around this issue, we can explicitly - # force MATLAB to use the system libstdc++.so via LD_PRELOAD. - LD_PRELOAD: /usr/lib/x86_64-linux-gnu/libstdc++.so.6 - # Add the installation directory to the MATLAB Search Path by # setting the MATLABPATH environment variable. MATLABPATH: matlab/install/arrow_matlab @@ -89,7 +97,7 @@ jobs: with: select-by-folder: matlab/test macos: - name: AMD64 macOS 11 MATLAB + name: AMD64 macOS 12 MATLAB runs-on: macos-latest if: ${{ !contains(github.event.pull_request.title, 'WIP') }} steps: diff --git a/dev/tasks/matlab/github.yml b/dev/tasks/matlab/github.yml new file mode 100644 index 0000000000000..1cd3949efbcf8 --- /dev/null +++ b/dev/tasks/matlab/github.yml @@ -0,0 +1,162 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +{% import 'macros.jinja' as macros with context %} + +{{ macros.github_header() }} + +jobs: + + ubuntu: + name: AMD64 Ubuntu 20.04 MATLAB + runs-on: ubuntu-20.04 + steps: + {{ macros.github_checkout_arrow()|indent }} + - name: Install ninja-build + run: sudo apt-get update && sudo apt-get install ninja-build + - name: Install MATLAB + uses: matlab-actions/setup-matlab@v1 + with: + release: R2023a + - name: Build MATLAB Interface + env: + {{ macros.github_set_sccache_envvars()|indent(8) }} + run: arrow/ci/scripts/matlab_build.sh $(pwd)/arrow + - name: Change shared library dependency name + # MATLAB's programmatic packaging interface does not properly + # include symbolic link files in the package MLTBX - this is a + # bug. As a temporary workaround, change the expected name of the + # Arrow C++ library which libarrowproxy.so depends on. For example, + # change libarrow.so.1500 to libarrow.so.1500.0.0. + run: | + pushd arrow/matlab/install/arrow_matlab/+libmexclass/+proxy/ + SYMLINK_ARROW_LIB="$(find . -name 'libarrow.so.*' -type l | xargs basename)" + REGULAR_ARROW_LIB="$(echo libarrow.so.*.*)" + echo "SYMLINK_ARROW_LIB = ${SYMLINK_ARROW_LIB}" + echo "REGULAR_ARROW_LIB = ${REGULAR_ARROW_LIB}" + patchelf --replace-needed $SYMLINK_ARROW_LIB $REGULAR_ARROW_LIB libarrowproxy.so + popd + - name: Compress into single artifact + run: tar -cvzf matlab-arrow-ubuntu.tar.gz arrow/matlab/install/arrow_matlab + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + name: matlab-arrow-ubuntu.tar.gz + path: matlab-arrow-ubuntu.tar.gz + + macos: + name: AMD64 macOS 12 MATLAB + runs-on: macos-latest + steps: + {{ macros.github_checkout_arrow()|indent }} + - name: Install ninja-build + run: brew install ninja + - name: Install MATLAB + uses: matlab-actions/setup-matlab@v1 + with: + release: R2023a + - name: Build MATLAB Interface + env: + {{ macros.github_set_sccache_envvars()|indent(8) }} + run: arrow/ci/scripts/matlab_build.sh $(pwd)/arrow + - name: Change shared library dependency name + # MATLAB's programmatic packaging interface does not properly + # include symbolic link files in the package MLTBX - this is a + # bug. As a temporary workaround, change the expected name of the + # Arrow C++ library which libarrowproxy.dylib depends on. + # For example, change libarrow.1500.dylib to libarrow.1500.0.0.dylib. + run: | + pushd arrow/matlab/install/arrow_matlab/+libmexclass/+proxy + SYMLINK_ARROW_LIB="$(find . 
-name 'libarrow.*.dylib' -type l | xargs basename)" + REGULAR_ARROW_LIB="$(echo libarrow.*.*.dylib)" + echo "SYMLINK_ARROW_LIB = ${SYMLINK_ARROW_LIB}" + echo "REGULAR_ARROW_LIB = ${REGULAR_ARROW_LIB}" + install_name_tool -change @rpath/$SYMLINK_ARROW_LIB @rpath/$REGULAR_ARROW_LIB libarrowproxy.dylib + popd + - name: Compress into single artifact + run: tar -cvzf matlab-arrow-macos.tar.gz arrow/matlab/install/arrow_matlab + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + name: matlab-arrow-macos.tar.gz + path: matlab-arrow-macos.tar.gz + + windows: + name: AMD64 Windows 2022 MATLAB + runs-on: windows-2022 + steps: + {{ macros.github_checkout_arrow()|indent }} + - name: Install MATLAB + uses: matlab-actions/setup-matlab@v1 + with: + release: R2023a + - name: Install sccache + shell: bash + run: arrow/ci/scripts/install_sccache.sh pc-windows-msvc $(pwd)/sccache + - name: Build MATLAB Interface + shell: cmd + env: + {{ macros.github_set_sccache_envvars()|indent(8) }} + run: | + call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64 + bash -c "arrow/ci/scripts/matlab_build.sh $(pwd)/arrow" + - name: Compress into single artifact + shell: bash + run: tar -cvzf matlab-arrow-windows.tar.gz arrow/matlab/install/arrow_matlab + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + name: matlab-arrow-windows.tar.gz + path: matlab-arrow-windows.tar.gz + + package-mltbx: + name: Package MATLAB Toolbox (MLTBX) Files + runs-on: ubuntu-latest + needs: + - ubuntu + - macos + - windows + steps: + {{ macros.github_checkout_arrow(fetch_depth=0)|indent }} + - name: Download Artifacts + uses: actions/download-artifact@v4 + with: + path: artifacts-downloaded + - name: Decompress Artifacts + run: | + mv artifacts-downloaded/*/*.tar.gz . 
+ tar -xzvf matlab-arrow-ubuntu.tar.gz + tar -xzvf matlab-arrow-macos.tar.gz + tar -xzvf matlab-arrow-windows.tar.gz + - name: Copy LICENSE.txt and NOTICE.txt for packaging + run: | + cp arrow/LICENSE.txt arrow/matlab/install/arrow_matlab/LICENSE.txt + cp arrow/NOTICE.txt arrow/matlab/install/arrow_matlab/NOTICE.txt + - name: Install MATLAB + uses: matlab-actions/setup-matlab@v1 + with: + release: R2023a + - name: Run commands + env: + MATLABPATH: arrow/matlab/tools + ARROW_MATLAB_TOOLBOX_FOLDER: arrow/matlab/install/arrow_matlab + ARROW_MATLAB_TOOLBOX_OUTPUT_FOLDER: artifacts/matlab-dist + ARROW_MATLAB_TOOLBOX_VERSION: {{ arrow.no_rc_version }} + uses: matlab-actions/run-command@v1 + with: + command: packageMatlabInterface + {{ macros.github_upload_releases(["artifacts/matlab-dist/*.mltbx"])|indent }} diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 2abfbc15174df..5e1ef8d13b988 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -59,6 +59,7 @@ groups: - conan-* - debian-* - java-jars + - matlab - nuget - python-sdist - r-binary-packages @@ -665,6 +666,14 @@ tasks: params: formula: apache-arrow.rb + ############################## MATLAB Packages ################################ + + matlab: + ci: github + template: matlab/github.yml + artifacts: + - matlab-arrow-{no_rc_version}.mltbx + ############################## Arrow JAR's ################################## java-jars: diff --git a/matlab/CMakeLists.txt b/matlab/CMakeLists.txt index 206ecb318b3cc..b85f782d2d37a 100644 --- a/matlab/CMakeLists.txt +++ b/matlab/CMakeLists.txt @@ -201,9 +201,6 @@ get_filename_component(ARROW_SHARED_LIB_DIR ${ARROW_SHARED_LIB} DIRECTORY) get_filename_component(ARROW_SHARED_LIB_FILENAME ${ARROW_SHARED_LIB} NAME_WE) if(NOT Arrow_FOUND) - # If Arrow_FOUND is false, Arrow is built by the arrow_shared target and needs - # to be copied to CMAKE_PACKAGED_INSTALL_DIR. - if(APPLE) # Install libarrow.dylib (symlink) and the real files it points to. # on macOS, we need to match these files: libarrow.dylib @@ -226,20 +223,6 @@ if(NOT Arrow_FOUND) set(SHARED_LIBRARY_VERSION_REGEX ${ARROW_SHARED_LIB_FILENAME}${CMAKE_SHARED_LIBRARY_SUFFIX}) endif() - - # The subfolders cmake and pkgconfig are excluded as they will be empty. - # Note: The following CMake Issue suggests enabling an option to exclude all - # folders that would be empty after installation: - # https://gitlab.kitware.com/cmake/cmake/-/issues/17122 - - set(CMAKE_PACKAGED_INSTALL_DIR "${CMAKE_INSTALL_DIR}/+arrow") - - install(DIRECTORY "${ARROW_SHARED_LIB_DIR}/" - DESTINATION ${CMAKE_PACKAGED_INSTALL_DIR} - FILES_MATCHING - REGEX ${SHARED_LIBRARY_VERSION_REGEX} - PATTERN "cmake" EXCLUDE - PATTERN "pkgconfig" EXCLUDE) endif() # MATLAB_ADD_INSTALL_DIR_TO_STARTUP_FILE toggles whether an addpath command to add the install diff --git a/matlab/tools/packageMatlabInterface.m b/matlab/tools/packageMatlabInterface.m new file mode 100644 index 0000000000000..55b4d4241a569 --- /dev/null +++ b/matlab/tools/packageMatlabInterface.m @@ -0,0 +1,84 @@ +% Licensed to the Apache Software Foundation (ASF) under one +% or more contributor license agreements. See the NOTICE file +% distributed with this work for additional information +% regarding copyright ownership. The ASF licenses this file +% to you under the Apache License, Version 2.0 (the +% "License"); you may not use this file except in compliance +% with the License. 
You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, +% software distributed under the License is distributed on an +% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +% KIND, either express or implied. See the License for the +% specific language governing permissions and limitations +% under the License. + +toolboxFolder = string(getenv("ARROW_MATLAB_TOOLBOX_FOLDER")); +outputFolder = string(getenv("ARROW_MATLAB_TOOLBOX_OUTPUT_FOLDER")); +toolboxVersionRaw = string(getenv("ARROW_MATLAB_TOOLBOX_VERSION")); + +appendLicenseText(fullfile(toolboxFolder, "LICENSE.txt")); +appendNoticeText(fullfile(toolboxFolder, "NOTICE.txt")); + +% Output folder must exist. +mkdir(outputFolder); + +disp("Toolbox Folder: " + toolboxFolder); +disp("Output Folder: " + outputFolder); +disp("Toolbox Version Raw: " + toolboxVersionRaw); + + +% Note: This string processing heuristic may not be robust to future +% changes in the Arrow versioning scheme. +dotIdx = strfind(toolboxVersionRaw, "."); +numDots = numel(dotIdx); +if numDots >= 3 + toolboxVersion = extractBefore(toolboxVersionRaw, dotIdx(3)); +else + toolboxVersion = toolboxVersionRaw; +end + +disp("Toolbox Version:" + toolboxVersion); + +identifier = "ad1d0fe6-22d1-4969-9e6f-0ab5d0f12ce3"; +opts = matlab.addons.toolbox.ToolboxOptions(toolboxFolder, identifier); +opts.ToolboxName = "MATLAB Arrow Interface"; +opts.ToolboxVersion = toolboxVersion; +opts.AuthorName = "The Apache Software Foundation"; +opts.AuthorEmail = "dev@arrow.apache.org"; + +% Set the SupportedPlatforms +opts.SupportedPlatforms.Win64 = true; +opts.SupportedPlatforms.Maci64 = true; +opts.SupportedPlatforms.Glnxa64 = true; +opts.SupportedPlatforms.MatlabOnline = true; + +% Interface is only qualified against R2023a at the moment +opts.MinimumMatlabRelease = "R2023a"; +opts.MaximumMatlabRelease = "R2023a"; + +opts.OutputFile = fullfile(outputFolder, compose("matlab-arrow-%s.mltbx", toolboxVersionRaw)); +disp("Output File: " + opts.OutputFile); +matlab.addons.toolbox.packageToolbox(opts); + +function appendLicenseText(filename) + licenseText = [ ... + newline + "--------------------------------------------------------------------------------" + newline + "3rdparty dependency mathworks/libmexclass is redistributed as a dynamically" + "linked shared library in certain binary distributions, like the MATLAB" + "distribution." + newline + "Copyright: 2022-2024 The MathWorks, Inc. All rights reserved." + "Homepage: https://github.com/mathworks/libmexclass" + "License: 3-clause BSD" ]; + writelines(licenseText, filename, WriteMode="append"); +end + +function appendNoticeText(filename) + noticeText = [ ... + newline + "---------------------------------------------------------------------------------" + newline + "This product includes software from The MathWorks, Inc. (Apache 2.0)" + " * Copyright (C) 2024 The MathWorks, Inc."]; + writelines(noticeText, filename, WriteMode="append"); +end \ No newline at end of file From 9f0101ec14336b2baad45d57320fb56c71d9321b Mon Sep 17 00:00:00 2001 From: Laurent Goujon Date: Fri, 29 Mar 2024 18:29:21 -0700 Subject: [PATCH 25/81] GH-40878: [JAVA] Fix flight-sql-jdbc-driver shading issues (#40879) ### Rationale for this change The `flight-sql-jdbc-driver` jar is not shaded properly: * a reduced pom.xml file is not generated. 
The published pom.xml file declares dependencies which are actually present in the jar and should not be fetched externally * several classes/files are not relocated properly ### What changes are included in this PR? Fix pom.xml and relocations. Also removes annotations dependencies and include a integration test to prevent future breakage. ### Are these changes tested? Yes. A new integration test check the jar content ### Are there any user-facing changes? Yes. The published pom.xml file on Maven will be cleaned of any dependency * GitHub Issue: #40878 Authored-by: Laurent Goujon Signed-off-by: David Li --- java/flight/flight-sql-jdbc-driver/pom.xml | 51 ++++++- .../driver/jdbc/ITDriverJarValidation.java | 141 ++++++++++++++++++ 2 files changed, 184 insertions(+), 8 deletions(-) create mode 100644 java/flight/flight-sql-jdbc-driver/src/test/java/org/apache/arrow/driver/jdbc/ITDriverJarValidation.java diff --git a/java/flight/flight-sql-jdbc-driver/pom.xml b/java/flight/flight-sql-jdbc-driver/pom.xml index 84ec1ff8c1f95..53d929afa781c 100644 --- a/java/flight/flight-sql-jdbc-driver/pom.xml +++ b/java/flight/flight-sql-jdbc-driver/pom.xml @@ -148,13 +148,16 @@ - maven-surefire-plugin - - false - - ${project.basedir}/../../../testing/data - - + org.apache.maven.plugins + maven-failsafe-plugin + + + + integration-test + verify + + + org.apache.maven.plugins @@ -167,12 +170,22 @@ false - false + true false *:* + + + org.checkerframework:checker-qual + org.codehaus.mojo:animal-sniffer-annotations + javax.annotation:javax.annotation-api + com.google.android:annotations + com.google.errorprone:error_prone_annotations + com.google.code.findbugs:jsr305 + com.google.j2objc:j2objc-annotations + @@ -199,6 +212,14 @@ io. cfjd.io. + + net. + cfjd.net. + + + mozilla. + cfjd.mozilla. + META-INF.native.libnetty_ @@ -213,12 +234,25 @@ + + org.apache.arrow:arrow-vector + + codegen/** + + org.apache.calcite.avatica:* META-INF/services/java.sql.Driver + + org.eclipse.collections:* + + about.html + LICENSE-*-1.0.txt + + *:* @@ -227,6 +261,7 @@ **/*.DSA META-INF/native/libio_grpc_netty* META-INF/native/io_grpc_netty_shaded* + **/*.proto diff --git a/java/flight/flight-sql-jdbc-driver/src/test/java/org/apache/arrow/driver/jdbc/ITDriverJarValidation.java b/java/flight/flight-sql-jdbc-driver/src/test/java/org/apache/arrow/driver/jdbc/ITDriverJarValidation.java new file mode 100644 index 0000000000000..fdb580d493abf --- /dev/null +++ b/java/flight/flight-sql-jdbc-driver/src/test/java/org/apache/arrow/driver/jdbc/ITDriverJarValidation.java @@ -0,0 +1,141 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.driver.jdbc; + +import static org.junit.Assert.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +import java.io.File; +import java.io.IOException; +import java.net.JarURLConnection; +import java.net.URL; +import java.util.Enumeration; +import java.util.Set; +import java.util.concurrent.TimeUnit; +import java.util.jar.JarEntry; +import java.util.jar.JarFile; + +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.ErrorCollector; +import org.junit.rules.TestRule; +import org.junit.rules.Timeout; + +import com.google.common.collect.ImmutableSet; + +/** + * Check the content of the JDBC driver jar + * + * After shading everything should be either under org.apache.arrow.driver.jdbc., + * org.slf4j., or cfjd. packages + */ +public class ITDriverJarValidation { + /** + * Use this property to provide path to the JDBC driver jar. Can be used to run the test from an IDE + */ + public static final String JDBC_DRIVER_PATH_OVERRIDE = + System.getProperty("arrow-flight-jdbc-driver.jar.override"); + + /** + * List of allowed prefixes a jar entry may match. + */ + public static final Set ALLOWED_PREFIXES = ImmutableSet.of( + "org/apache/arrow/driver/jdbc/", + "cfjd/", + "org/slf4j/", + "META-INF/"); + + /** + * List of allowed files a jar entry may match. + */ + public static final Set ALLOWED_FILES = ImmutableSet.of( + "arrow-git.properties", + "properties/flight.properties"); + + // This method is designed to work with Maven failsafe plugin and expects the + // JDBC driver jar to be present in the test classpath (instead of the individual classes) + private static JarFile getJdbcJarFile() throws IOException { + // Check if an override has been set + if (JDBC_DRIVER_PATH_OVERRIDE != null) { + return new JarFile(new File(JDBC_DRIVER_PATH_OVERRIDE)); + } + + // Check classpath to find the driver jar + URL driverClassURL = ITDriverJarValidation.class.getClassLoader() + .getResource("org/apache/arrow/driver/jdbc/ArrowFlightJdbcDriver.class"); + + assertNotNull(driverClassURL, "Driver jar was not detected in the classpath"); + assertEquals("Driver jar was not detected in the classpath", "jar", driverClassURL.getProtocol()); + + JarURLConnection connection = (JarURLConnection) driverClassURL.openConnection(); + return connection.getJarFile(); + } + + @ClassRule + public static final TestRule CLASS_TIMEOUT = Timeout.builder().withTimeout(2, TimeUnit.MINUTES).build(); + + @Rule + public ErrorCollector collector = new ErrorCollector(); + + @Test + public void validateShadedJar() throws IOException { + // Validate the content of the jar to enforce all 3rd party dependencies have + // been shaded + try (JarFile jar = getJdbcJarFile()) { + for (Enumeration entries = jar.entries(); entries.hasMoreElements();) { + final JarEntry entry = entries.nextElement(); + if (entry.isDirectory()) { + // Directories are ignored + continue; + } + + try { + checkEntryAllowed(entry.getName()); + } catch (AssertionError e) { + collector.addError(e); + } + } + } + } + + /** + * Check if a jar entry is allowed. + * + *

+ * A jar entry is allowed if either it is part of the allowed files or it + * matches one of the allowed prefixes + * + * @param name the jar entry name + * @throws AssertionException if the entry is not allowed + */ + private void checkEntryAllowed(String name) { + // Check if there's a matching file entry first + if (ALLOWED_FILES.contains(name)) { + return; + } + + for (String prefix : ALLOWED_PREFIXES) { + if (name.startsWith(prefix)) { + return; + } + } + + throw new AssertionError("'" + name + "' is not an allowed jar entry"); + } +} From 17a536839ee20f80e80f93ec6ea714a301d12fdf Mon Sep 17 00:00:00 2001 From: Paul Date: Sun, 31 Mar 2024 10:11:08 -0500 Subject: [PATCH 26/81] GH-40893: [Java][FlightRPC] Support IntervalMonthDayNanoVector in FlightSQL JDBC Driver (#40894) ### Rationale for this change Fixes https://github.com/apache/arrow/issues/40893. ### What changes are included in this PR? - Support IntervalMonthDayNanoVector in FlightSQL JDBC Driver - Return PeriodDuration as JDBC Object type, because there is no good java.time type for this interval - Return an ISO-8601 interval as the stringified version of PeriodDuration - Make PeriodDuration implement TemporalAccessor for standardization ### Are these changes tested? Unit tests have been added that match those for other interval types. I'm unaware of any other types of tests worth adding to, but I'd be happy to if pointed there. ### Are there any user-facing changes? The only change users should noticed is that the FlightSQL JDBC Driver can now handle more query responses. * GitHub Issue: #40893 Authored-by: paul Signed-off-by: David Li --- .../ArrowFlightJdbcAccessorFactory.java | 4 + ...ArrowFlightJdbcIntervalVectorAccessor.java | 32 ++++++++ .../ArrowFlightJdbcAccessorFactoryTest.java | 14 ++++ ...wFlightJdbcIntervalVectorAccessorTest.java | 51 ++++++++++++- .../apache/arrow/vector/PeriodDuration.java | 73 ++++++++++++++++++- .../arrow/vector/TestPeriodDuration.java | 47 ++++++++++++ 6 files changed, 217 insertions(+), 4 deletions(-) diff --git a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/accessor/ArrowFlightJdbcAccessorFactory.java b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/accessor/ArrowFlightJdbcAccessorFactory.java index 813b40a8070f7..fa45d7a867c4a 100644 --- a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/accessor/ArrowFlightJdbcAccessorFactory.java +++ b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/accessor/ArrowFlightJdbcAccessorFactory.java @@ -51,6 +51,7 @@ import org.apache.arrow.vector.Float8Vector; import org.apache.arrow.vector.IntVector; import org.apache.arrow.vector.IntervalDayVector; +import org.apache.arrow.vector.IntervalMonthDayNanoVector; import org.apache.arrow.vector.IntervalYearVector; import org.apache.arrow.vector.LargeVarBinaryVector; import org.apache.arrow.vector.LargeVarCharVector; @@ -176,6 +177,9 @@ public static ArrowFlightJdbcAccessor createAccessor(ValueVector vector, } else if (vector instanceof IntervalYearVector) { return new ArrowFlightJdbcIntervalVectorAccessor(((IntervalYearVector) vector), getCurrentRow, setCursorWasNull); + } else if (vector instanceof IntervalMonthDayNanoVector) { + return new ArrowFlightJdbcIntervalVectorAccessor(((IntervalMonthDayNanoVector) vector), getCurrentRow, + setCursorWasNull); } else if (vector instanceof StructVector) { return new ArrowFlightJdbcStructVectorAccessor((StructVector) vector, getCurrentRow, setCursorWasNull); 
diff --git a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/accessor/impl/calendar/ArrowFlightJdbcIntervalVectorAccessor.java b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/accessor/impl/calendar/ArrowFlightJdbcIntervalVectorAccessor.java index 21d1c15712cdb..90b53bc856023 100644 --- a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/accessor/impl/calendar/ArrowFlightJdbcIntervalVectorAccessor.java +++ b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/accessor/impl/calendar/ArrowFlightJdbcIntervalVectorAccessor.java @@ -30,8 +30,11 @@ import org.apache.arrow.driver.jdbc.accessor.ArrowFlightJdbcAccessorFactory; import org.apache.arrow.vector.BaseFixedWidthVector; import org.apache.arrow.vector.IntervalDayVector; +import org.apache.arrow.vector.IntervalMonthDayNanoVector; import org.apache.arrow.vector.IntervalYearVector; +import org.apache.arrow.vector.PeriodDuration; import org.apache.arrow.vector.holders.NullableIntervalDayHolder; +import org.apache.arrow.vector.holders.NullableIntervalMonthDayNanoHolder; import org.apache.arrow.vector.holders.NullableIntervalYearHolder; /** @@ -96,6 +99,35 @@ public ArrowFlightJdbcIntervalVectorAccessor(IntervalYearVector vector, objectClass = java.time.Period.class; } + /** + * Instantiate an accessor for a {@link IntervalMonthDayNanoVector}. + * + * @param vector an instance of a IntervalMonthDayNanoVector. + * @param currentRowSupplier the supplier to track the rows. + * @param setCursorWasNull the consumer to set if value was null. + */ + public ArrowFlightJdbcIntervalVectorAccessor(IntervalMonthDayNanoVector vector, + IntSupplier currentRowSupplier, + ArrowFlightJdbcAccessorFactory.WasNullConsumer setCursorWasNull) { + super(currentRowSupplier, setCursorWasNull); + this.vector = vector; + stringGetter = (index) -> { + final NullableIntervalMonthDayNanoHolder holder = new NullableIntervalMonthDayNanoHolder(); + vector.get(index, holder); + if (holder.isSet == 0) { + return null; + } else { + final int months = holder.months; + final int days = holder.days; + final long nanos = holder.nanoseconds; + final Period period = Period.ofMonths(months).plusDays(days); + final Duration duration = Duration.ofNanos(nanos); + return new PeriodDuration(period, duration).toISO8601IntervalString(); + } + }; + objectClass = PeriodDuration.class; + } + @Override public Class getObjectClass() { return objectClass; diff --git a/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/accessor/ArrowFlightJdbcAccessorFactoryTest.java b/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/accessor/ArrowFlightJdbcAccessorFactoryTest.java index 4b3744372c0e8..ab7f215f5d102 100644 --- a/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/accessor/ArrowFlightJdbcAccessorFactoryTest.java +++ b/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/accessor/ArrowFlightJdbcAccessorFactoryTest.java @@ -41,6 +41,7 @@ import org.apache.arrow.driver.jdbc.utils.RootAllocatorTestRule; import org.apache.arrow.vector.DurationVector; import org.apache.arrow.vector.IntervalDayVector; +import org.apache.arrow.vector.IntervalMonthDayNanoVector; import org.apache.arrow.vector.IntervalYearVector; import org.apache.arrow.vector.LargeVarCharVector; import org.apache.arrow.vector.ValueVector; @@ -405,6 +406,19 @@ public void createAccessorForIntervalYearVector() { } } + @Test + public void 
createAccessorForIntervalMonthDayNanoVector() { + try (ValueVector valueVector = new IntervalMonthDayNanoVector("", + rootAllocatorTestRule.getRootAllocator())) { + ArrowFlightJdbcAccessor accessor = + ArrowFlightJdbcAccessorFactory.createAccessor(valueVector, GET_CURRENT_ROW, + (boolean wasNull) -> { + }); + + Assert.assertTrue(accessor instanceof ArrowFlightJdbcIntervalVectorAccessor); + } + } + @Test public void createAccessorForUnionVector() { try (ValueVector valueVector = new UnionVector("", rootAllocatorTestRule.getRootAllocator(), diff --git a/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/accessor/impl/calendar/ArrowFlightJdbcIntervalVectorAccessorTest.java b/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/accessor/impl/calendar/ArrowFlightJdbcIntervalVectorAccessorTest.java index 322b7d40bd6e1..956738168f083 100644 --- a/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/accessor/impl/calendar/ArrowFlightJdbcIntervalVectorAccessorTest.java +++ b/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/accessor/impl/calendar/ArrowFlightJdbcIntervalVectorAccessorTest.java @@ -24,6 +24,7 @@ import java.time.Duration; import java.time.Period; +import java.time.format.DateTimeParseException; import java.util.Arrays; import java.util.Collection; import java.util.function.Supplier; @@ -32,7 +33,9 @@ import org.apache.arrow.driver.jdbc.utils.AccessorTestUtils; import org.apache.arrow.driver.jdbc.utils.RootAllocatorTestRule; import org.apache.arrow.vector.IntervalDayVector; +import org.apache.arrow.vector.IntervalMonthDayNanoVector; import org.apache.arrow.vector.IntervalYearVector; +import org.apache.arrow.vector.PeriodDuration; import org.apache.arrow.vector.ValueVector; import org.junit.After; import org.junit.Assert; @@ -66,6 +69,9 @@ public class ArrowFlightJdbcIntervalVectorAccessorTest { } else if (vector instanceof IntervalYearVector) { return new ArrowFlightJdbcIntervalVectorAccessor((IntervalYearVector) vector, getCurrentRow, noOpWasNullConsumer); + } else if (vector instanceof IntervalMonthDayNanoVector) { + return new ArrowFlightJdbcIntervalVectorAccessor((IntervalMonthDayNanoVector) vector, + getCurrentRow, noOpWasNullConsumer); } return null; }; @@ -98,6 +104,17 @@ public static Collection data() { } return vector; }, "IntervalYearVector"}, + {(Supplier) () -> { + IntervalMonthDayNanoVector vector = + new IntervalMonthDayNanoVector("", rootAllocatorTestRule.getRootAllocator()); + + int valueCount = 10; + vector.setValueCount(valueCount); + for (int i = 0; i < valueCount; i++) { + vector.set(i, i + 1, (i + 1) * 10, (i + 1) * 100); + } + return vector; + }, "IntervalMonthDayNanoVector"}, }); } @@ -137,13 +154,31 @@ public void testShouldGetObjectReturnNull() throws Exception { } private String getStringOnVector(ValueVector vector, int index) { - String object = getExpectedObject(vector, index).toString(); + Object object = getExpectedObject(vector, index); if (object == null) { return null; } else if (vector instanceof IntervalDayVector) { - return formatIntervalDay(Duration.parse(object)); + return formatIntervalDay(Duration.parse(object.toString())); } else if (vector instanceof IntervalYearVector) { - return formatIntervalYear(Period.parse(object)); + return formatIntervalYear(Period.parse(object.toString())); + } else if (vector instanceof IntervalMonthDayNanoVector) { + String iso8601IntervalString = ((PeriodDuration) object).toISO8601IntervalString(); + String[] 
periodAndDuration = iso8601IntervalString.split("T"); + if (periodAndDuration.length == 1) { + // If there is no 'T', then either Period or Duration is zero, and the other one will successfully parse it + String periodOrDuration = periodAndDuration[0]; + try { + return new PeriodDuration(Period.parse(periodOrDuration), Duration.ZERO).toISO8601IntervalString(); + } catch (DateTimeParseException e) { + return new PeriodDuration(Period.ZERO, Duration.parse(periodOrDuration)).toISO8601IntervalString(); + } + } else { + // If there is a 'T', both Period and Duration are non-zero, and we just need to prepend the 'PT' to the + // duration for both to parse successfully + Period parse = Period.parse(periodAndDuration[0]); + Duration duration = Duration.parse("PT" + periodAndDuration[1]); + return new PeriodDuration(parse, duration).toISO8601IntervalString(); + } } return null; } @@ -225,6 +260,8 @@ private Class getExpectedObjectClassForVector(ValueVector vector) { return Duration.class; } else if (vector instanceof IntervalYearVector) { return Period.class; + } else if (vector instanceof IntervalMonthDayNanoVector) { + return PeriodDuration.class; } return null; } @@ -239,6 +276,10 @@ private void setAllNullOnVector(ValueVector vector) { for (int i = 0; i < valueCount; i++) { ((IntervalYearVector) vector).setNull(i); } + } else if (vector instanceof IntervalMonthDayNanoVector) { + for (int i = 0; i < valueCount; i++) { + ((IntervalMonthDayNanoVector) vector).setNull(i); + } } } @@ -247,6 +288,10 @@ private Object getExpectedObject(ValueVector vector, int currentRow) { return Duration.ofDays(currentRow + 1).plusMillis((currentRow + 1) * 1000L); } else if (vector instanceof IntervalYearVector) { return Period.ofMonths(currentRow + 1); + } else if (vector instanceof IntervalMonthDayNanoVector) { + Period period = Period.ofMonths(currentRow + 1).plusDays((currentRow + 1) * 10L); + Duration duration = Duration.ofNanos((currentRow + 1) * 100L); + return new PeriodDuration(period, duration); } return null; } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/PeriodDuration.java b/java/vector/src/main/java/org/apache/arrow/vector/PeriodDuration.java index ee48fe7972251..c94e4b534cac7 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/PeriodDuration.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/PeriodDuration.java @@ -17,8 +17,22 @@ package org.apache.arrow.vector; +import static java.time.temporal.ChronoUnit.DAYS; +import static java.time.temporal.ChronoUnit.MONTHS; +import static java.time.temporal.ChronoUnit.NANOS; +import static java.time.temporal.ChronoUnit.SECONDS; +import static java.time.temporal.ChronoUnit.YEARS; + import java.time.Duration; import java.time.Period; +import java.time.temporal.ChronoUnit; +import java.time.temporal.Temporal; +import java.time.temporal.TemporalAmount; +import java.time.temporal.TemporalUnit; +import java.time.temporal.UnsupportedTemporalTypeException; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; import org.apache.arrow.util.Preconditions; @@ -26,7 +40,10 @@ * Combination of Period and Duration for representing this interval type * as a POJO. 
*/ -public class PeriodDuration { +public class PeriodDuration implements TemporalAmount { + + private static final List SUPPORTED_UNITS = + Collections.unmodifiableList(Arrays.asList(YEARS, MONTHS, DAYS, SECONDS, NANOS)); private final Period period; private final Duration duration; @@ -43,6 +60,60 @@ public Duration getDuration() { return duration; } + @Override + public long get(TemporalUnit unit) { + if (unit instanceof ChronoUnit) { + switch ((ChronoUnit) unit) { + case YEARS: + return period.getYears(); + case MONTHS: + return period.getMonths(); + case DAYS: + return period.getDays(); + case SECONDS: + return duration.getSeconds(); + case NANOS: + return duration.getNano(); + default: + break; + } + } + throw new UnsupportedTemporalTypeException("Unsupported TemporalUnit: " + unit); + } + + @Override + public List getUnits() { + return SUPPORTED_UNITS; + } + + @Override + public Temporal addTo(Temporal temporal) { + return temporal.plus(period).plus(duration); + } + + @Override + public Temporal subtractFrom(Temporal temporal) { + return temporal.minus(period).minus(duration); + } + + /** + * Format this PeriodDuration as an ISO-8601 interval. + * + * @return An ISO-8601 formatted string representing the interval. + */ + public String toISO8601IntervalString() { + if (duration.isZero()) { + return period.toString(); + } + String durationString = duration.toString(); + if (period.isZero()) { + return durationString; + } + + // Remove 'P' from duration string and concatenate to produce an ISO-8601 representation + return period + durationString.substring(1); + } + @Override public String toString() { return period.toString() + " " + duration.toString(); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestPeriodDuration.java b/java/vector/src/test/java/org/apache/arrow/vector/TestPeriodDuration.java index c8965dec3b83b..2b9f4cca8c22f 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestPeriodDuration.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestPeriodDuration.java @@ -21,7 +21,10 @@ import static org.junit.Assert.assertNotEquals; import java.time.Duration; +import java.time.LocalDate; +import java.time.LocalDateTime; import java.time.Period; +import java.time.temporal.ChronoUnit; import org.junit.Test; @@ -43,4 +46,48 @@ public void testBasics() { assertNotEquals(pd1.hashCode(), pd3.hashCode()); } + @Test + public void testToISO8601IntervalString() { + assertEquals("P0D", + new PeriodDuration(Period.ZERO, Duration.ZERO).toISO8601IntervalString()); + assertEquals("P1Y2M3D", + new PeriodDuration(Period.of(1, 2, 3), Duration.ZERO).toISO8601IntervalString()); + assertEquals("PT0.000000123S", + new PeriodDuration(Period.ZERO, Duration.ofNanos(123)).toISO8601IntervalString()); + assertEquals("PT1.000000123S", + new PeriodDuration(Period.ZERO, Duration.ofSeconds(1).withNanos(123)).toISO8601IntervalString()); + assertEquals("PT1H1.000000123S", + new PeriodDuration(Period.ZERO, Duration.ofSeconds(3601).withNanos(123)).toISO8601IntervalString()); + assertEquals("PT24H1M1.000000123S", + new PeriodDuration(Period.ZERO, Duration.ofSeconds(86461).withNanos(123)).toISO8601IntervalString()); + assertEquals("P1Y2M3DT24H1M1.000000123S", + new PeriodDuration(Period.of(1, 2, 3), Duration.ofSeconds(86461).withNanos(123)).toISO8601IntervalString()); + + assertEquals("P-1Y-2M-3D", + new PeriodDuration(Period.of(-1, -2, -3), Duration.ZERO).toISO8601IntervalString()); + assertEquals("PT-0.000000123S", + new PeriodDuration(Period.ZERO, 
Duration.ofNanos(-123)).toISO8601IntervalString()); + assertEquals("PT-24H-1M-0.999999877S", + new PeriodDuration(Period.ZERO, Duration.ofSeconds(-86461).withNanos(123)).toISO8601IntervalString()); + assertEquals("P-1Y-2M-3DT-0.999999877S", + new PeriodDuration(Period.of(-1, -2, -3), Duration.ofSeconds(-1).withNanos(123)).toISO8601IntervalString()); + } + + @Test + public void testTemporalAccessor() { + LocalDate date = LocalDate.of(2024, 1, 2); + PeriodDuration pd1 = new PeriodDuration(Period.ofYears(1), Duration.ZERO); + assertEquals(LocalDate.of(2025, 1, 2), pd1.addTo(date)); + + LocalDateTime dateTime = LocalDateTime.of(2024, 1, 2, 3, 4); + PeriodDuration pd2 = new PeriodDuration(Period.ZERO, Duration.ofMinutes(1)); + assertEquals(LocalDateTime.of(2024, 1, 2, 3, 3), pd2.subtractFrom(dateTime)); + + PeriodDuration pd3 = new PeriodDuration(Period.of(1, 2, 3), Duration.ofSeconds(86461).withNanos(123)); + assertEquals(pd3.get(ChronoUnit.YEARS), 1); + assertEquals(pd3.get(ChronoUnit.MONTHS), 2); + assertEquals(pd3.get(ChronoUnit.DAYS), 3); + assertEquals(pd3.get(ChronoUnit.SECONDS), 86461); + assertEquals(pd3.get(ChronoUnit.NANOS), 123); + } } From 71321841eb6d94946de43cccb7f04afe5cf2aa10 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Mon, 1 Apr 2024 11:15:59 -0400 Subject: [PATCH 27/81] GH-40900: [Go] Fix Mallocator Weirdness (#40902) ### Rationale for this change With help from @ lidavidm and @ bkietz digging into the linked issue, we found the following: * Using `mtrace` and `strace` didn't produce much enlightenment to what was happening. * If the python adbc_driver_manager was built so that the cython lib is built using `CMAKE_BUILD_TYPE=Debug` then the crash/failure goes away * If the env var `MALLOC_MMAP_THRESHOLD_` is set to 128MB, the crash/failure goes away * It is only reproducible when calling through python, I haven't been able to reproduce it using pure Go * Calling `calloc` again after it fails, still fails * Calling `malloc` + `memset` immediately after the failing `calloc` works perfectly and doesn't fail anymore ### What changes are included in this PR? Adding a comment describing the situation and falling back to `malloc` + `memset` if `calloc` returns an error. If the pointer returned from `malloc` is `nil` then we surface the error. * GitHub Issue: #40900 Authored-by: Matt Topol Signed-off-by: Matt Topol --- go/arrow/memory/mallocator/mallocator.go | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/go/arrow/memory/mallocator/mallocator.go b/go/arrow/memory/mallocator/mallocator.go index 59d240a1063e8..9483bdfc2a05f 100644 --- a/go/arrow/memory/mallocator/mallocator.go +++ b/go/arrow/memory/mallocator/mallocator.go @@ -60,10 +60,19 @@ func (alloc *Mallocator) Allocate(size int) []byte { } ptr, err := C.calloc(C.size_t(size), 1) if err != nil { - panic(err) + // under some circumstances and allocation patterns, we can end up in a scenario + // where for some reason calloc return ENOMEM even though there is definitely memory + // available for use. So we attempt to fallback to simply doing malloc + memset in + // this case. If malloc returns a nil pointer, then we know we're out of memory + // and will surface the error. 
+ if ptr = C.malloc(C.size_t(size)); ptr == nil { + panic(err) + } + C.memset(ptr, 0, C.size_t(size)) } else if ptr == nil { panic("mallocator: out of memory") } + atomic.AddUint64(&alloc.allocatedBytes, uint64(size)) return unsafe.Slice((*byte)(ptr), size) } From 68241d8a86e9923cda2b758d10176b8dfb1cfea7 Mon Sep 17 00:00:00 2001 From: wayne Date: Mon, 1 Apr 2024 12:01:49 -0600 Subject: [PATCH 28/81] GH-40888: [Go][FlightRPC] support conversion from array.Duration in FlightSQL driver (#40889) ### Rationale for this change To enable the use of the flightsql driver's implementation of golang sql interfaces. ### What changes are included in this PR? A new switch branch for handling `array.Duration`. ### Are these changes tested? I manually tested and didn't add new unit tests because none of the other types handled in the same switch block are unit tested. ### Are there any user-facing changes? Just a more complete set of types handled by the sql driver. * GitHub Issue: #40888 Authored-by: wayne warren Signed-off-by: Matt Topol --- go/arrow/flight/flightsql/driver/utils.go | 4 ++++ go/arrow/flight/flightsql/driver/utils_test.go | 12 ++++++++++++ 2 files changed, 16 insertions(+) diff --git a/go/arrow/flight/flightsql/driver/utils.go b/go/arrow/flight/flightsql/driver/utils.go index a99c045e2ed02..84cf2110cca92 100644 --- a/go/arrow/flight/flightsql/driver/utils.go +++ b/go/arrow/flight/flightsql/driver/utils.go @@ -104,6 +104,10 @@ func fromArrowType(arr arrow.Array, idx int) (interface{}, error) { return v.ToTime(ts.TimeUnit()), nil case *array.Date64: return c.Value(idx).ToTime(), nil + case *array.Duration: + dt := arr.DataType().(*arrow.DurationType) + duration := time.Duration(c.Value(idx)) * dt.Unit.Multiplier() + return duration, nil case *array.DayTimeInterval: durationDays := time.Duration(c.Value(idx).Days*24) * time.Hour duration := time.Duration(c.Value(idx).Milliseconds) * time.Millisecond diff --git a/go/arrow/flight/flightsql/driver/utils_test.go b/go/arrow/flight/flightsql/driver/utils_test.go index 6b1adfed47503..8ea7921b64e79 100644 --- a/go/arrow/flight/flightsql/driver/utils_test.go +++ b/go/arrow/flight/flightsql/driver/utils_test.go @@ -50,6 +50,10 @@ func Test_fromArrowType(t *testing.T) { {Name: "f15-ts_us", Type: arrow.FixedWidthTypes.Timestamp_ns}, {Name: "f16-d64", Type: arrow.FixedWidthTypes.Date64}, {Name: "f17-dti", Type: arrow.FixedWidthTypes.DayTimeInterval}, + {Name: "f18-duration_s", Type: arrow.FixedWidthTypes.Duration_s}, + {Name: "f19-duration_ms", Type: arrow.FixedWidthTypes.Duration_ms}, + {Name: "f20-duration_us", Type: arrow.FixedWidthTypes.Duration_us}, + {Name: "f21-duration_ns", Type: arrow.FixedWidthTypes.Duration_ns}, } schema := arrow.NewSchema(fields, nil) @@ -90,6 +94,10 @@ func Test_fromArrowType(t *testing.T) { testTime := time.Now() b.Field(15).(*array.Date64Builder).Append(arrow.Date64FromTime(testTime)) b.Field(16).(*array.DayTimeIntervalBuilder).Append(arrow.DayTimeInterval{Days: 1, Milliseconds: 1000}) + b.Field(17).(*array.DurationBuilder).Append(1) + b.Field(18).(*array.DurationBuilder).Append(1) + b.Field(19).(*array.DurationBuilder).Append(1) + b.Field(20).(*array.DurationBuilder).Append(1) rec := b.NewRecord() defer rec.Release() @@ -123,4 +131,8 @@ func Test_fromArrowType(t *testing.T) { tf(t, 14, time.Date(1970, 1, 1, 12, 0, 0, 0, time.UTC)) // "f15-ts_us" tf(t, 15, testTime.In(time.UTC).Truncate(24*time.Hour)) // "f16-d64" tf(t, 16, time.Duration(24*time.Hour+time.Second)) // "f17-dti" + tf(t, 17, time.Duration(1000000000)) // 
"f18-duration_s" + tf(t, 18, time.Duration(1000000)) // "f19-duration_ms" + tf(t, 19, time.Duration(1000)) // "f20-duration_us" + tf(t, 20, time.Duration(1)) // "f21-duration_ns" } From e44dc29df9587a139fe539069c3dafc771256b90 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 1 Apr 2024 14:02:32 -0400 Subject: [PATCH 29/81] MINOR: [Go] Bump github.com/google/flatbuffers from 24.3.7+incompatible to 24.3.25+incompatible in /go (#40922) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [github.com/google/flatbuffers](https://github.com/google/flatbuffers) from 24.3.7+incompatible to 24.3.25+incompatible.

Release notes, sourced from github.com/google/flatbuffers's releases: v24.3.25

Full Changelog: https://github.com/google/flatbuffers/compare/v24.3.7...v24.3.25
[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=github.com/google/flatbuffers&package-manager=go_modules&previous-version=24.3.7+incompatible&new-version=24.3.25+incompatible)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. ---
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Matt Topol --- go/go.mod | 2 +- go/go.sum | 19 +++++++++++++++++-- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/go/go.mod b/go/go.mod index 2f788c5c26b02..9975ecfc69d34 100644 --- a/go/go.mod +++ b/go/go.mod @@ -25,7 +25,7 @@ require ( github.com/docopt/docopt-go v0.0.0-20180111231733-ee0de3bc6815 github.com/goccy/go-json v0.10.2 github.com/golang/snappy v0.0.4 - github.com/google/flatbuffers v24.3.7+incompatible + github.com/google/flatbuffers v24.3.25+incompatible github.com/klauspost/asmfmt v1.3.2 github.com/klauspost/compress v1.17.7 github.com/klauspost/cpuid/v2 v2.2.7 diff --git a/go/go.sum b/go/go.sum index 593746bcf9e4e..462c43021a29e 100644 --- a/go/go.sum +++ b/go/go.sum @@ -1,9 +1,11 @@ github.com/JohnCGriffin/overflow v0.0.0-20211019200055-46fa312c352c h1:RGWPOewvKIROun94nF7v2cua9qP+thov/7M50KEoeSU= github.com/JohnCGriffin/overflow v0.0.0-20211019200055-46fa312c352c/go.mod h1:X0CRv0ky0k6m906ixxpzmDRLvX58TFUKS2eePweuyxk= github.com/alecthomas/assert/v2 v2.3.0 h1:mAsH2wmvjsuvyBvAmCtm7zFsBlb8mIHx5ySLVdDZXL0= +github.com/alecthomas/assert/v2 v2.3.0/go.mod h1:pXcQ2Asjp247dahGEmsZ6ru0UVwnkhktn7S0bBDLxvQ= github.com/alecthomas/participle/v2 v2.1.0 h1:z7dElHRrOEEq45F2TG5cbQihMtNTv8vwldytDj7Wrz4= github.com/alecthomas/participle/v2 v2.1.0/go.mod h1:Y1+hAs8DHPmc3YUFzqllV+eSQ9ljPTk0ZkPMtEdAx2c= github.com/alecthomas/repr v0.2.0 h1:HAzS41CIzNW5syS8Mf9UwXhNH1J9aix/BvDRf1Ml2Yk= +github.com/alecthomas/repr v0.2.0/go.mod h1:Fr0507jx4eOXV7AlPV6AVZLYrLIuIeSOWtW57eE/O/4= github.com/andybalholm/brotli v1.1.0 h1:eLKJA0d02Lf0mVpIDgYnqXcUn0GqVmEFny3VuID1U3M= github.com/andybalholm/brotli v1.1.0/go.mod h1:sms7XGricyQI9K10gOSf56VKKWS4oLer58Q+mhRPtnY= github.com/apache/thrift v0.19.0 h1:sOqkWPzMj7w6XaYbJQG7m4sGqVolaW/0D28Ln7yPzMk= @@ -19,8 +21,11 @@ github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+m github.com/fatih/color v1.15.0 h1:kOqh6YHBtK8aywxGerMG2Eq3H6Qgoqeo13Bk2Mv/nBs= github.com/fatih/color v1.15.0/go.mod h1:0h5ZqXfHYED7Bhv2ZJamyIOUej9KtShiJESRwBDUSsw= github.com/go-playground/locales v0.13.0 h1:HyWk6mgj5qFqCT5fjGBuRArbVDfE4hi8+e8ceBS/t7Q= +github.com/go-playground/locales v0.13.0/go.mod h1:taPMhCMXrRLJO55olJkUXHZBHCxTMfnGwq/HNwmWNS8= github.com/go-playground/universal-translator v0.17.0 h1:icxd5fm+REJzpZx7ZfpaD876Lmtgy7VtROAbHHXk8no= +github.com/go-playground/universal-translator v0.17.0/go.mod h1:UkSxE5sNxxRwHyU+Scu5vgOQjsIJAF8j9muTVoKLVtA= github.com/go-playground/validator/v10 v10.4.1 h1:pH2c5ADXtd66mxoE0Zm9SUhxE20r7aM3F26W0hOn+GE= +github.com/go-playground/validator/v10 v10.4.1/go.mod h1:nlOn6nFhuKACm19sB/8EGNn9GlaMV7XkbRSipzJ0Ii4= github.com/goccy/go-json v0.10.2 h1:CrxCmQqYDkv1z7lO7Wbh2HN93uovUHgrECaO5ZrCXAU= github.com/goccy/go-json v0.10.2/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I= github.com/goccy/go-yaml v1.11.0 h1:n7Z+zx8S9f9KgzG6KtQKf+kwqXZlLNR2F6018Dgau54= @@ -30,12 +35,14 @@ github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM= github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= -github.com/google/flatbuffers v24.3.7+incompatible h1:BxGUkIQnOciBu33bd5BdvqY8Qvo0O/GR4SPhh7x9Ed0= -github.com/google/flatbuffers v24.3.7+incompatible/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8= 
+github.com/google/flatbuffers v24.3.25+incompatible h1:CX395cjN9Kke9mmalRoL3d81AtFUxJM+yDthflgJGkI= +github.com/google/flatbuffers v24.3.25+incompatible/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8= github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/pprof v0.0.0-20221118152302-e6195bd50e26 h1:Xim43kblpZXfIBQsbuBVKCudVG457BR2GZFIz3uw3hQ= +github.com/google/pprof v0.0.0-20221118152302-e6195bd50e26/go.mod h1:dDKJzRmX4S37WGHujM7tX//fmj1uioxKzKxz3lo4HJo= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/hamba/avro/v2 v2.20.1 h1:3WByQiVn7wT7d27WQq6pvBRC00FVOrniP6u67FLA/2E= @@ -43,6 +50,7 @@ github.com/hamba/avro/v2 v2.20.1/go.mod h1:xHiKXbISpb3Ovc809XdzWow+XGTn+Oyf/F9aZ github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= github.com/hexops/gotextdiff v1.0.3 h1:gitA9+qJrrTCsiCl7+kh75nPqQt1cx4ZkudSTLoUqJM= +github.com/hexops/gotextdiff v1.0.3/go.mod h1:pSWU5MAI3yDq+fZBTazCSJysOMbxWL1BSow5/V2vxeg= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/klauspost/asmfmt v1.3.2 h1:4Ri7ox3EwapiOjCki+hw14RyKk201CN4rzyCJRFLpK4= @@ -52,15 +60,18 @@ github.com/klauspost/compress v1.17.7/go.mod h1:Di0epgTjJY877eYKx5yC51cX2A2Vl2ib github.com/klauspost/cpuid/v2 v2.2.7 h1:ZWSB3igEs+d0qvnxR/ZBzXVmxkgt8DdzP6m9pfuVLDM= github.com/klauspost/cpuid/v2 v2.2.7/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZYpaMropDUws= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/leodido/go-urn v1.2.0 h1:hpXL4XnriNwQ/ABnpepYM/1vCLWNDfUNts8dX3xTG6Y= +github.com/leodido/go-urn v1.2.0/go.mod h1:+8+nEpDfqqsY+g338gtMEUOtuK+4dEMhiQEgxpxOKII= github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA= github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= github.com/mattn/go-isatty v0.0.19 h1:JITubQf0MOLdlGRuRq+jtsDlekdYPia9ZFsB8h/APPA= github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= github.com/mattn/go-sqlite3 v1.14.22 h1:2gZY6PC6kBnID23Tichd1K+Z0oS6nE/XwU+Vz/5o4kU= +github.com/mattn/go-sqlite3 v1.14.22/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y= github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8 h1:AMFGa4R4MiIpspGNG7Z948v4n35fFGB3RR3G/ry4FWs= github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8/go.mod h1:mC1jAcsrzbxHt8iiaC+zU4b1ylILSosueou12R++wfY= github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3 h1:+n/aFZefKZp7spd8DFdX7uMikMLXX4oubIzJF4kv/wI= @@ -99,9 +110,11 @@ github.com/tidwall/pretty v1.2.0/go.mod 
h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhso github.com/tidwall/sjson v1.2.5 h1:kLy8mja+1c9jlljvWTlSazM7cKDRfJuR/bOJhcY5NcY= github.com/tidwall/sjson v1.2.5/go.mod h1:Fvgq9kS/6ociJEDnK0Fk1cpYF4FIW6ZF7LAe+6jwd28= github.com/zeebo/assert v1.3.0 h1:g7C04CbJuIDKNPFHmsk4hwZDO5O+kntRxzaUoNXj+IQ= +github.com/zeebo/assert v1.3.0/go.mod h1:Pq9JiuJQpG8JLJdtkwrJESF0Foym2/D9XMU5ciN/wJ0= github.com/zeebo/xxh3 v1.0.2 h1:xZmwmqxHZA8AI603jOQ0tMqmBr9lPeFwGg6d+xy9DC0= github.com/zeebo/xxh3 v1.0.2/go.mod h1:5NWz9Sef7zIDm2JHfFlcQvNekmcEl9ekUZQQKCYaDcA= golang.org/x/crypto v0.21.0 h1:X31++rzVUdKhX5sWmSOFZxx8UW/ldWx55cbf08iNAMA= +golang.org/x/crypto v0.21.0/go.mod h1:0BP7YvVV9gBbVKyeTG0Gyn+gZm94bibOW5BjDEYAOMs= golang.org/x/exp v0.0.0-20240222234643-814bf88cf225 h1:LfspQV/FYTatPTr/3HzIcmiUFH7PGP+OQ6mgDYo3yuQ= golang.org/x/exp v0.0.0-20240222234643-814bf88cf225/go.mod h1:CxmFvTBINI24O/j8iY7H1xHzx2i4OsyguNBmN/uPtqc= golang.org/x/mod v0.16.0 h1:QX4fJ0Rr5cPQCF7O9lh9Se4pmwfwskqZfq5moyldzic= @@ -134,9 +147,11 @@ google.golang.org/protobuf v1.33.0 h1:uNO2rsAINq/JlFpSdYEKIZ0uKD/R9cpdv0T+yoGwGm google.golang.org/protobuf v1.33.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= modernc.org/fileutil v1.3.0 h1:gQ5SIzK3H9kdfai/5x41oQiKValumqNTDXMvKo62HvE= +modernc.org/fileutil v1.3.0/go.mod h1:XatxS8fZi3pS8/hKG2GH/ArUogfxjpEKs3Ku3aK4JyQ= modernc.org/gc/v3 v3.0.0-20240107210532-573471604cb6 h1:5D53IMaUuA5InSeMu9eJtlQXS2NxAhyWQvkKEgXZhHI= modernc.org/gc/v3 v3.0.0-20240107210532-573471604cb6/go.mod h1:Qz0X07sNOR1jWYCrJMEnbW/X55x206Q7Vt4mz6/wHp4= modernc.org/libc v1.41.0 h1:g9YAc6BkKlgORsUWj+JwqoB1wU3o4DE3bM3yvA3k+Gk= From 48ee2eabffb6059206176f8a53c19bec11e9d441 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 1 Apr 2024 12:42:27 -0700 Subject: [PATCH 30/81] MINOR: [C#] Bump Google.Protobuf from 3.26.0 to 3.26.1 in /csharp (#40923) Bumps [Google.Protobuf](https://github.com/protocolbuffers/protobuf) from 3.26.0 to 3.26.1.
Commits
  • 2434ef2 Updating version.json and repo version numbers to: 26.1
  • 49253b1 Merge pull request #16308 from protocolbuffers/cp-26x-3
  • 9bf69ec Fix validateFeatures to be called after resolved features are actually set to...
  • b752bc2 Merge pull request #16307 from protocolbuffers/cp-26x-2
  • f7d2326 Merge pull request #16309 from protocolbuffers/cp-26x-4
  • 2e51ff6 Cherry-pick required label handling in JRuby field descriptor from https://gi...
  • a2f5303 Update cmake stalenes
  • 6a177d2 Merge branch '26.x' into cp-26x-4
  • 2d3d8ba Expand cpp_features_proto_srcs visibility
  • e1092ee Merge pull request #16294 from protocolbuffers/cp-26x
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=Google.Protobuf&package-manager=nuget&previous-version=3.26.0&new-version=3.26.1)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. ---
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Curt Hagenlocher --- csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj b/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj index bd6ae7ad22b42..04b8a7dc734f0 100644 --- a/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj +++ b/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj @@ -5,7 +5,7 @@ - + From 9e320d7181fb5b7192d690b634a247c66132f864 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Tue, 2 Apr 2024 06:15:53 +0900 Subject: [PATCH 31/81] GH-39069: [C++][FS][Azure] Use the generic filesystem tests (#40567) ### Rationale for this change We should provide common spec for all filesystem API. ### What changes are included in this PR? Enable the generic filesystem tests. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. * GitHub Issue: #39069 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- cpp/src/arrow/filesystem/azurefs.cc | 117 +++++++-- cpp/src/arrow/filesystem/azurefs_test.cc | 319 +++++++++++++++-------- cpp/src/arrow/filesystem/test_util.cc | 30 ++- cpp/src/arrow/filesystem/test_util.h | 4 + 4 files changed, 333 insertions(+), 137 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 260478b068ed1..84733a824e7ba 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -1591,7 +1591,9 @@ class AzureFileSystem::Impl { if (info.type() == FileType::NotFound) { return PathNotFound(location); } - DCHECK_EQ(info.type(), FileType::Directory); + if (info.type() != FileType::Directory) { + return NotADir(location); + } return Status::OK(); } @@ -1818,8 +1820,67 @@ class AzureFileSystem::Impl { const AzureLocation& location, bool recursive) { DCHECK(!location.container.empty()); DCHECK(!location.path.empty()); - // Non-recursive CreateDir calls require the parent directory to exist. - if (!recursive) { + if (recursive) { + // Recursive CreateDir calls require that all path segments be + // either a directory or not found. + + // Check each path segment is a directory or not + // found. Nonexistent segments are collected to + // nonexistent_locations. We'll create directories for + // nonexistent segments later. + std::vector nonexistent_locations; + for (auto prefix = location; !prefix.path.empty(); prefix = prefix.parent()) { + ARROW_ASSIGN_OR_RAISE(auto info, GetFileInfo(container_client, prefix)); + if (info.type() == FileType::File) { + return NotADir(prefix); + } + if (info.type() == FileType::NotFound) { + nonexistent_locations.push_back(prefix); + } + } + // Ensure container exists + ARROW_ASSIGN_OR_RAISE(auto container, + AzureLocation::FromString(location.container)); + ARROW_ASSIGN_OR_RAISE(auto container_info, + GetContainerPropsAsFileInfo(container, container_client)); + if (container_info.type() == FileType::NotFound) { + try { + container_client.CreateIfNotExists(); + } catch (const Storage::StorageException& exception) { + return ExceptionToStatus(exception, "Failed to create directory '", + location.all, "': ", container_client.GetUrl()); + } + } + // Create nonexistent directories from shorter to longer: + // + // Example: + // + // * location: /container/a/b/c/d/ + // * Nonexistent path segments: + // * /container/a/ + // * /container/a/c/ + // * /container/a/c/d/ + // * target_locations: + // 1. 
/container/a/c/d/ + // 2. /container/a/c/ + // 3. /container/a/ + // + // Create order: + // 1. /container/a/ + // 2. /container/a/c/ + // 3. /container/a/c/d/ + for (size_t i = nonexistent_locations.size(); i > 0; --i) { + const auto& nonexistent_location = nonexistent_locations[i - 1]; + try { + create_if_not_exists(container_client, nonexistent_location); + } catch (const Storage::StorageException& exception) { + return ExceptionToStatus(exception, "Failed to create directory '", + location.all, "': ", container_client.GetUrl()); + } + } + return Status::OK(); + } else { + // Non-recursive CreateDir calls require the parent directory to exist. auto parent = location.parent(); if (!parent.path.empty()) { RETURN_NOT_OK(CheckDirExists(container_client, parent)); @@ -1827,28 +1888,17 @@ class AzureFileSystem::Impl { // If the parent location is just the container, we don't need to check if it // exists because the operation we perform below will fail if the container // doesn't exist and we can handle that error according to the recursive flag. - } - try { - create_if_not_exists(container_client, location); - return Status::OK(); - } catch (const Storage::StorageException& exception) { - if (IsContainerNotFound(exception)) { - try { - if (recursive) { - container_client.CreateIfNotExists(); - create_if_not_exists(container_client, location); - return Status::OK(); - } else { - auto parent = location.parent(); - return PathNotFound(parent); - } - } catch (const Storage::StorageException& second_exception) { - return ExceptionToStatus(second_exception, "Failed to create directory '", - location.all, "': ", container_client.GetUrl()); + try { + create_if_not_exists(container_client, location); + return Status::OK(); + } catch (const Storage::StorageException& exception) { + if (IsContainerNotFound(exception)) { + auto parent = location.parent(); + return PathNotFound(parent); } + return ExceptionToStatus(exception, "Failed to create directory '", location.all, + "': ", container_client.GetUrl()); } - return ExceptionToStatus(exception, "Failed to create directory '", location.all, - "': ", container_client.GetUrl()); } } @@ -2016,8 +2066,15 @@ class AzureFileSystem::Impl { bool found_dir_marker_blob = false; try { auto list_response = container_client.ListBlobs(options); - if (require_dir_to_exist && list_response.Blobs.empty()) { - return PathNotFound(location); + if (list_response.Blobs.empty()) { + if (require_dir_to_exist) { + return PathNotFound(location); + } else { + ARROW_ASSIGN_OR_RAISE(auto info, GetFileInfo(container_client, location)); + if (info.type() == FileType::File) { + return NotADir(location); + } + } } for (; list_response.HasPage(); list_response.MoveToNextPage()) { if (list_response.Blobs.empty()) { @@ -2732,6 +2789,16 @@ class AzureFileSystem::Impl { } auto dest_blob_client = GetBlobClient(dest.container, dest.path); auto src_url = GetBlobClient(src.container, src.path).GetUrl(); + if (!dest.path.empty()) { + auto dest_parent = dest.parent(); + if (!dest_parent.path.empty()) { + auto dest_container_client = GetBlobContainerClient(dest_parent.container); + ARROW_ASSIGN_OR_RAISE(auto info, GetFileInfo(dest_container_client, dest_parent)); + if (info.type() == FileType::File) { + return NotADir(dest_parent); + } + } + } try { dest_blob_client.CopyFromUri(src_url); } catch (const Storage::StorageException& exception) { diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 7ea5eb446bc12..24031e313f798 100644 --- 
a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -98,6 +98,7 @@ class BaseAzureEnv : public ::testing::Environment { virtual AzureBackend backend() const = 0; + virtual bool HasSubmitBatchBug() const { return false; } virtual bool WithHierarchicalNamespace() const { return false; } virtual Result GetDebugLogSize() { return 0; } @@ -207,6 +208,18 @@ class AzuriteEnv : public AzureEnvImpl { return self; } + /// Azurite has a bug that causes BlobContainerClient::SubmitBatch to fail on macOS. + /// SubmitBatch is used by: + /// - AzureFileSystem::DeleteDir + /// - AzureFileSystem::DeleteDirContents + bool HasSubmitBatchBug() const override { +#ifdef __APPLE__ + return true; +#else + return false; +#endif + } + Result GetDebugLogSize() override { ARROW_ASSIGN_OR_RAISE(auto exists, arrow::internal::FileExists(debug_log_path_)); if (!exists) { @@ -274,6 +287,186 @@ class AzureHierarchicalNSEnv : public AzureEnvImpl { bool WithHierarchicalNamespace() const final { return true; } }; +namespace { +Result MakeOptions(BaseAzureEnv* env) { + AzureOptions options; + options.account_name = env->account_name(); + switch (env->backend()) { + case AzureBackend::kAzurite: + options.blob_storage_authority = "127.0.0.1:10000"; + options.dfs_storage_authority = "127.0.0.1:10000"; + options.blob_storage_scheme = "http"; + options.dfs_storage_scheme = "http"; + break; + case AzureBackend::kAzure: + // Use the default values + break; + } + ARROW_EXPECT_OK(options.ConfigureAccountKeyCredential(env->account_key())); + return options; +} +} // namespace + +struct PreexistingData { + public: + using RNG = random::pcg32_fast; + + public: + const std::string container_name; + static constexpr char const* kObjectName = "test-object-name"; + + static constexpr char const* kLoremIpsum = R"""( +Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor +incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis +nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. +Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu +fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in +culpa qui officia deserunt mollit anim id est laborum. +)"""; + + public: + explicit PreexistingData(RNG& rng) : container_name{RandomContainerName(rng)} {} + + // Creates a path by concatenating the container name and the stem. 
+ std::string ContainerPath(std::string_view stem) const { return Path(stem); } + + // Short alias to ContainerPath() + std::string Path(std::string_view stem) const { + return ConcatAbstractPath(container_name, stem); + } + + std::string ObjectPath() const { return ContainerPath(kObjectName); } + std::string NotFoundObjectPath() const { return ContainerPath("not-found"); } + + std::string RandomDirectoryPath(RNG& rng) const { + return ContainerPath(RandomChars(32, rng)); + } + + // Utilities + static std::string RandomContainerName(RNG& rng) { return RandomChars(32, rng); } + + static std::string RandomChars(int count, RNG& rng) { + auto const fillers = std::string("abcdefghijlkmnopqrstuvwxyz0123456789"); + std::uniform_int_distribution d(0, static_cast(fillers.size()) - 1); + std::string s; + std::generate_n(std::back_inserter(s), count, [&] { return fillers[d(rng)]; }); + return s; + } + + static int RandomIndex(int end, RNG& rng) { + return std::uniform_int_distribution(0, end - 1)(rng); + } + + static std::string RandomLine(int lineno, int width, RNG& rng) { + auto line = std::to_string(lineno) + ": "; + line += RandomChars(width - static_cast(line.size()) - 1, rng); + line += '\n'; + return line; + } +}; + +class TestGeneric : public ::testing::Test, public GenericFileSystemTest { + public: + void TearDown() override { + if (azure_fs_) { + ASSERT_OK(azure_fs_->DeleteDir(container_name_)); + } + } + + protected: + void SetUpInternal(BaseAzureEnv* env) { + env_ = env; + random::pcg32_fast rng((std::random_device()())); + container_name_ = PreexistingData::RandomContainerName(rng); + ASSERT_OK_AND_ASSIGN(auto options, MakeOptions(env_)); + ASSERT_OK_AND_ASSIGN(azure_fs_, AzureFileSystem::Make(options)); + ASSERT_OK(azure_fs_->CreateDir(container_name_, true)); + fs_ = std::make_shared(container_name_, azure_fs_); + } + + std::shared_ptr GetEmptyFileSystem() override { return fs_; } + + bool have_implicit_directories() const override { return true; } + bool allow_write_file_over_dir() const override { return true; } + bool allow_read_dir_as_file() const override { return true; } + bool allow_move_dir() const override { return false; } + bool allow_move_file() const override { return true; } + bool allow_append_to_file() const override { return true; } + bool have_directory_mtimes() const override { return false; } + bool have_flaky_directory_tree_deletion() const override { return false; } + bool have_file_metadata() const override { return true; } + // calloc() used in libxml2's xmlNewGlobalState() is detected as a + // memory leak like the following. But it's a false positive. It's + // used in ListBlobsByHierarchy() for GetFileInfo() and it's freed + // in the call. This is detected as a memory leak only with + // generator API (GetFileInfoGenerator()) and not detected with + // non-generator API (GetFileInfo()). So this is a false positive. 
+ // + // ==2875409==ERROR: LeakSanitizer: detected memory leaks + // + // Direct leak of 968 byte(s) in 1 object(s) allocated from: + // #0 0x55d02c967bdc in calloc (build/debug/arrow-azurefs-test+0x17bbdc) (BuildId: + // 520690d1b20e860cc1feef665dce8196e64f955e) #1 0x7fa914b1cd1e in xmlNewGlobalState + // builddir/main/../../threads.c:580:10 #2 0x7fa914b1cd1e in xmlGetGlobalState + // builddir/main/../../threads.c:666:31 + bool have_false_positive_memory_leak_with_generator() const override { return true; } + + BaseAzureEnv* env_; + std::shared_ptr azure_fs_; + std::shared_ptr fs_; + + private: + std::string container_name_; +}; + +class TestAzuriteGeneric : public TestGeneric { + public: + void SetUp() override { + ASSERT_OK_AND_ASSIGN(auto env, AzuriteEnv::GetInstance()); + SetUpInternal(env); + } + + protected: + // Azurite doesn't support moving files over containers. + bool allow_move_file() const override { return false; } + // DeleteDir() doesn't work with Azurite on macOS + bool have_flaky_directory_tree_deletion() const override { + return env_->HasSubmitBatchBug(); + } +}; + +class TestAzureFlatNSGeneric : public TestGeneric { + public: + void SetUp() override { + auto env_result = AzureFlatNSEnv::GetInstance(); + if (env_result.status().IsCancelled()) { + GTEST_SKIP() << env_result.status().message(); + } + ASSERT_OK_AND_ASSIGN(auto env, env_result); + SetUpInternal(env); + } + + protected: + // Flat namespace account doesn't support moving files over containers. + bool allow_move_file() const override { return false; } +}; + +class TestAzureHierarchicalNSGeneric : public TestGeneric { + public: + void SetUp() override { + auto env_result = AzureHierarchicalNSEnv::GetInstance(); + if (env_result.status().IsCancelled()) { + GTEST_SKIP() << env_result.status().message(); + } + ASSERT_OK_AND_ASSIGN(auto env, env_result); + SetUpInternal(env); + } +}; + +GENERIC_FS_TEST_FUNCTIONS(TestAzuriteGeneric); +GENERIC_FS_TEST_FUNCTIONS(TestAzureFlatNSGeneric); +GENERIC_FS_TEST_FUNCTIONS(TestAzureHierarchicalNSGeneric); + TEST(AzureFileSystem, InitializingFilesystemWithoutAccountNameFails) { AzureOptions options; ASSERT_RAISES(Invalid, options.ConfigureAccountKeyCredential("account_key")); @@ -532,64 +725,6 @@ TEST_F(TestAzureOptions, FromUriInvalidQueryParameter) { TestFromUriInvalidQueryParameter(); } -struct PreexistingData { - public: - using RNG = random::pcg32_fast; - - public: - const std::string container_name; - static constexpr char const* kObjectName = "test-object-name"; - - static constexpr char const* kLoremIpsum = R"""( -Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor -incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis -nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. -Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu -fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in -culpa qui officia deserunt mollit anim id est laborum. -)"""; - - public: - explicit PreexistingData(RNG& rng) : container_name{RandomContainerName(rng)} {} - - // Creates a path by concatenating the container name and the stem. 
- std::string ContainerPath(std::string_view stem) const { return Path(stem); } - - // Short alias to ContainerPath() - std::string Path(std::string_view stem) const { - return ConcatAbstractPath(container_name, stem); - } - - std::string ObjectPath() const { return ContainerPath(kObjectName); } - std::string NotFoundObjectPath() const { return ContainerPath("not-found"); } - - std::string RandomDirectoryPath(RNG& rng) const { - return ContainerPath(RandomChars(32, rng)); - } - - // Utilities - static std::string RandomContainerName(RNG& rng) { return RandomChars(32, rng); } - - static std::string RandomChars(int count, RNG& rng) { - auto const fillers = std::string("abcdefghijlkmnopqrstuvwxyz0123456789"); - std::uniform_int_distribution d(0, static_cast(fillers.size()) - 1); - std::string s; - std::generate_n(std::back_inserter(s), count, [&] { return fillers[d(rng)]; }); - return s; - } - - static int RandomIndex(int end, RNG& rng) { - return std::uniform_int_distribution(0, end - 1)(rng); - } - - static std::string RandomLine(int lineno, int width, RNG& rng) { - auto line = std::to_string(lineno) + ": "; - line += RandomChars(width - static_cast(line.size()) - 1, rng); - line += '\n'; - return line; - } -}; - class TestAzureFileSystem : public ::testing::Test { protected: // Set in constructor @@ -621,24 +756,6 @@ class TestAzureFileSystem : public ::testing::Test { return fs(CachedHNSSupport(*env)); } - static Result MakeOptions(BaseAzureEnv* env) { - AzureOptions options; - options.account_name = env->account_name(); - switch (env->backend()) { - case AzureBackend::kAzurite: - options.blob_storage_authority = "127.0.0.1:10000"; - options.dfs_storage_authority = "127.0.0.1:10000"; - options.blob_storage_scheme = "http"; - options.dfs_storage_scheme = "http"; - break; - case AzureBackend::kAzure: - // Use the default values - break; - } - ARROW_EXPECT_OK(options.ConfigureAccountKeyCredential(env->account_key())); - return options; - } - void SetUp() override { auto make_options = [this]() -> Result { ARROW_ASSIGN_OR_RAISE(auto env, GetAzureEnv()); @@ -824,19 +941,6 @@ class TestAzureFileSystem : public ::testing::Test { "This test is affected by an Azurite issue: " "https://github.com/Azure/Azurite/pull/2302"; - /// Azurite has a bug that causes BlobContainerClient::SubmitBatch to fail on macOS. - /// SubmitBatch is used by: - /// - AzureFileSystem::DeleteDir - /// - AzureFileSystem::DeleteDirContents - bool HasSubmitBatchBug() const { -#ifdef __APPLE__ - EXPECT_OK_AND_ASSIGN(auto env, GetAzureEnv()); - return env->backend() == AzureBackend::kAzurite; -#else - return false; -#endif - } - static bool WithErrno(const Status& status, int expected_errno) { auto* detail = status.detail().get(); return detail && @@ -1059,9 +1163,7 @@ class TestAzureFileSystem : public ::testing::Test { auto path2 = data.Path("directory2"); ASSERT_OK(fs()->OpenOutputStream(path2)); - // CreateDir returns OK even if there is already a file or directory at this - // location. Whether or not this is the desired behaviour is debatable. 
- ASSERT_OK(fs()->CreateDir(path2)); + ASSERT_RAISES(IOError, fs()->CreateDir(path2)); AssertFileInfo(fs(), path2, FileType::File); } @@ -1070,7 +1172,8 @@ class TestAzureFileSystem : public ::testing::Test { } void TestDeleteDirSuccessEmpty() { - if (HasSubmitBatchBug()) { + ASSERT_OK_AND_ASSIGN(auto env, GetAzureEnv()); + if (env->HasSubmitBatchBug()) { GTEST_SKIP() << kSubmitBatchBugMessage; } auto data = SetUpPreexistingData(); @@ -1090,7 +1193,8 @@ class TestAzureFileSystem : public ::testing::Test { } void TestDeleteDirSuccessHaveBlob() { - if (HasSubmitBatchBug()) { + ASSERT_OK_AND_ASSIGN(auto env, GetAzureEnv()); + if (env->HasSubmitBatchBug()) { GTEST_SKIP() << kSubmitBatchBugMessage; } auto data = SetUpPreexistingData(); @@ -1105,7 +1209,8 @@ class TestAzureFileSystem : public ::testing::Test { } void TestNonEmptyDirWithTrailingSlash() { - if (HasSubmitBatchBug()) { + ASSERT_OK_AND_ASSIGN(auto env, GetAzureEnv()); + if (env->HasSubmitBatchBug()) { GTEST_SKIP() << kSubmitBatchBugMessage; } auto data = SetUpPreexistingData(); @@ -1120,7 +1225,8 @@ class TestAzureFileSystem : public ::testing::Test { } void TestDeleteDirSuccessHaveDirectory() { - if (HasSubmitBatchBug()) { + ASSERT_OK_AND_ASSIGN(auto env, GetAzureEnv()); + if (env->HasSubmitBatchBug()) { GTEST_SKIP() << kSubmitBatchBugMessage; } auto data = SetUpPreexistingData(); @@ -1135,7 +1241,8 @@ class TestAzureFileSystem : public ::testing::Test { } void TestDeleteDirContentsSuccessExist() { - if (HasSubmitBatchBug()) { + ASSERT_OK_AND_ASSIGN(auto env, GetAzureEnv()); + if (env->HasSubmitBatchBug()) { GTEST_SKIP() << kSubmitBatchBugMessage; } auto preexisting_data = SetUpPreexistingData(); @@ -1149,7 +1256,8 @@ class TestAzureFileSystem : public ::testing::Test { } void TestDeleteDirContentsSuccessExistWithTrailingSlash() { - if (HasSubmitBatchBug()) { + ASSERT_OK_AND_ASSIGN(auto env, GetAzureEnv()); + if (env->HasSubmitBatchBug()) { GTEST_SKIP() << kSubmitBatchBugMessage; } auto preexisting_data = SetUpPreexistingData(); @@ -1163,7 +1271,8 @@ class TestAzureFileSystem : public ::testing::Test { } void TestDeleteDirContentsSuccessNonexistent() { - if (HasSubmitBatchBug()) { + ASSERT_OK_AND_ASSIGN(auto env, GetAzureEnv()); + if (env->HasSubmitBatchBug()) { GTEST_SKIP() << kSubmitBatchBugMessage; } auto data = SetUpPreexistingData(); @@ -2174,7 +2283,8 @@ TEST_F(TestAzuriteFileSystem, DeleteDirSuccessContainer) { } TEST_F(TestAzuriteFileSystem, DeleteDirSuccessNonexistent) { - if (HasSubmitBatchBug()) { + ASSERT_OK_AND_ASSIGN(auto env, GetAzureEnv()); + if (env->HasSubmitBatchBug()) { GTEST_SKIP() << kSubmitBatchBugMessage; } auto data = SetUpPreexistingData(); @@ -2185,7 +2295,8 @@ TEST_F(TestAzuriteFileSystem, DeleteDirSuccessNonexistent) { } TEST_F(TestAzuriteFileSystem, DeleteDirSuccessHaveBlobs) { - if (HasSubmitBatchBug()) { + ASSERT_OK_AND_ASSIGN(auto env, GetAzureEnv()); + if (env->HasSubmitBatchBug()) { GTEST_SKIP() << kSubmitBatchBugMessage; } auto data = SetUpPreexistingData(); @@ -2213,7 +2324,8 @@ TEST_F(TestAzuriteFileSystem, DeleteDirUri) { } TEST_F(TestAzuriteFileSystem, DeleteDirContentsSuccessContainer) { - if (HasSubmitBatchBug()) { + ASSERT_OK_AND_ASSIGN(auto env, GetAzureEnv()); + if (env->HasSubmitBatchBug()) { GTEST_SKIP() << kSubmitBatchBugMessage; } auto data = SetUpPreexistingData(); @@ -2228,7 +2340,8 @@ TEST_F(TestAzuriteFileSystem, DeleteDirContentsSuccessContainer) { } TEST_F(TestAzuriteFileSystem, DeleteDirContentsSuccessDirectory) { - if (HasSubmitBatchBug()) { + ASSERT_OK_AND_ASSIGN(auto env, 
GetAzureEnv()); + if (env->HasSubmitBatchBug()) { GTEST_SKIP() << kSubmitBatchBugMessage; } auto data = SetUpPreexistingData(); diff --git a/cpp/src/arrow/filesystem/test_util.cc b/cpp/src/arrow/filesystem/test_util.cc index 040917dcd218a..19226ce01ae2f 100644 --- a/cpp/src/arrow/filesystem/test_util.cc +++ b/cpp/src/arrow/filesystem/test_util.cc @@ -252,8 +252,7 @@ void GenericFileSystemTest::TestCreateDir(FileSystem* fs) { } void GenericFileSystemTest::TestDeleteDir(FileSystem* fs) { - if (have_flaky_directory_tree_deletion()) - GTEST_SKIP() << "Flaky directory deletion on Windows"; + if (have_flaky_directory_tree_deletion()) GTEST_SKIP() << "Flaky directory deletion"; ASSERT_OK(fs->CreateDir("AB/CD/EF")); ASSERT_OK(fs->CreateDir("AB/GH/IJ")); @@ -281,8 +280,7 @@ void GenericFileSystemTest::TestDeleteDir(FileSystem* fs) { } void GenericFileSystemTest::TestDeleteDirContents(FileSystem* fs) { - if (have_flaky_directory_tree_deletion()) - GTEST_SKIP() << "Flaky directory deletion on Windows"; + if (have_flaky_directory_tree_deletion()) GTEST_SKIP() << "Flaky directory deletion"; ASSERT_OK(fs->CreateDir("AB/CD/EF")); ASSERT_OK(fs->CreateDir("AB/GH/IJ")); @@ -313,6 +311,8 @@ void GenericFileSystemTest::TestDeleteDirContents(FileSystem* fs) { } void GenericFileSystemTest::TestDeleteRootDirContents(FileSystem* fs) { + if (have_flaky_directory_tree_deletion()) GTEST_SKIP() << "Flaky directory deletion"; + ASSERT_OK(fs->CreateDir("AB/CD")); CreateFile(fs, "AB/abc", ""); @@ -323,9 +323,7 @@ void GenericFileSystemTest::TestDeleteRootDirContents(FileSystem* fs) { AssertAllDirs(fs, {"AB", "AB/CD"}); AssertAllFiles(fs, {"AB/abc"}); } else { - if (!have_flaky_directory_tree_deletion()) { - AssertAllDirs(fs, {}); - } + AssertAllDirs(fs, {}); AssertAllFiles(fs, {}); } } @@ -385,6 +383,10 @@ void GenericFileSystemTest::TestDeleteFiles(FileSystem* fs) { } void GenericFileSystemTest::TestMoveFile(FileSystem* fs) { + if (!allow_move_file()) { + GTEST_SKIP() << "Filesystem doesn't allow moving files"; + } + ASSERT_OK(fs->CreateDir("AB/CD")); ASSERT_OK(fs->CreateDir("EF")); CreateFile(fs, "abc", "data"); @@ -750,6 +752,12 @@ void GenericFileSystemTest::TestGetFileInfoSelector(FileSystem* fs) { } void GenericFileSystemTest::TestGetFileInfoGenerator(FileSystem* fs) { +#ifdef ADDRESS_SANITIZER + if (have_false_positive_memory_leak_with_generator()) { + GTEST_SKIP() << "Filesystem have false positive memory leak with generator"; + } +#endif + ASSERT_OK(fs->CreateDir("AB/CD")); CreateFile(fs, "abc", "data"); CreateFile(fs, "AB/def", "some data"); @@ -1177,8 +1185,12 @@ void GenericFileSystemTest::TestSpecialChars(FileSystem* fs) { AssertFileContents(fs, "Special and%different.txt", "data"); ASSERT_OK(fs->DeleteFile("Special and%different.txt")); - ASSERT_OK(fs->DeleteDir("Blank Char")); - AssertAllDirs(fs, {}); + if (have_flaky_directory_tree_deletion()) { + ASSERT_OK(fs->DeleteFile("Blank Char/Special%Char.txt")); + } else { + ASSERT_OK(fs->DeleteDir("Blank Char")); + AssertAllDirs(fs, {}); + } AssertAllFiles(fs, {}); } diff --git a/cpp/src/arrow/filesystem/test_util.h b/cpp/src/arrow/filesystem/test_util.h index 62b488e159a24..e70c787aa85c4 100644 --- a/cpp/src/arrow/filesystem/test_util.h +++ b/cpp/src/arrow/filesystem/test_util.h @@ -168,6 +168,8 @@ class ARROW_TESTING_EXPORT GenericFileSystemTest { virtual bool allow_write_file_over_dir() const { return false; } // - Whether the filesystem allows reading a directory virtual bool allow_read_dir_as_file() const { return false; } + // - Whether the filesystem 
allows moving a file + virtual bool allow_move_file() const { return true; } // - Whether the filesystem allows moving a directory virtual bool allow_move_dir() const { return true; } // - Whether the filesystem allows moving a directory "over" a non-empty destination @@ -182,6 +184,8 @@ class ARROW_TESTING_EXPORT GenericFileSystemTest { virtual bool have_flaky_directory_tree_deletion() const { return false; } // - Whether the filesystem stores some metadata alongside files virtual bool have_file_metadata() const { return false; } + // - Whether the filesystem has a false positive memory leak with generator + virtual bool have_false_positive_memory_leak_with_generator() const { return false; } void TestEmpty(FileSystem* fs); void TestNormalizePath(FileSystem* fs); From 06f305e5adb1fa660e16e0a8ed4421e4a8eb036d Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Tue, 2 Apr 2024 06:17:03 +0900 Subject: [PATCH 32/81] GH-40882: [C++] Suppress shorten-64-to-32 warnings in CUDA/Skyhook codes (#40883) ### Rationale for this change ```text cpp/src/arrow/gpu/cuda_memory.cc:497:72: error: implicit conversion loses integer precision: 'int64_t' (aka 'long') to 'int' [-Werror,-Wshorten-64-to-32] ARROW_ASSIGN_OR_RAISE(auto device, arrow::cuda::CudaDevice::Make(device_id)); ~~~~~ ^~~~~~~~~ ``` ```text cpp/src/arrow/gpu/cuda_memory.cc:508:68: error: implicit conversion loses integer precision: 'int64_t' (aka 'long') to 'int' [-Werror,-Wshorten-64-to-32] ARROW_ASSIGN_OR_RAISE(auto device, arrow::cuda::CudaDevice::Make(device_id)); ~~~~~ ^~~~~~~~~ ``` ```text cpp/src/skyhook/protocol/skyhook_protocol.cc:109:69: error: implicit conversion loses integer precision: 'int64_t' (aka 'long') to 'unsigned int' [-Werror,-Wshorten-64-to-32] bl->append(reinterpret_cast(buffer->data()), buffer->size()); ~~~~~~ ~~~~~~~~^~~~~~ ``` ```text cpp/src/skyhook/cls/cls_skyhook.cc:87:37: error: implicit conversion loses integer precision: 'int64_t' (aka 'long') to 'int' [-Werror,-Wshorten-64-to-32] cls_cxx_read(hctx_, position, nbytes, bl.get()); ~~~~~~~~~~~~ ^~~~~~ cpp/src/skyhook/cls/cls_skyhook.cc:87:27: error: implicit conversion loses integer precision: 'int64_t' (aka 'long') to 'int' [-Werror,-Wshorten-64-to-32] cls_cxx_read(hctx_, position, nbytes, bl.get()); ~~~~~~~~~~~~ ^~~~~~~~ ``` ```text cpp/src/skyhook/protocol/skyhook_protocol.cc:109:69: error: implicit conversion loses integer precision: 'int64_t' (aka 'long') to 'unsigned int' [-Werror,-Wshorten-64-to-32] bl->append(reinterpret_cast(buffer->data()), buffer->size()); ~~~~~~ ``` ### What changes are included in this PR? Add casts explicitly. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. 
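As an illustration of the fix pattern only (not taken from the Arrow sources), the sketch below shows how an explicit `static_cast` silences `-Wshorten-64-to-32` when a 64-bit value is passed to an API that takes a narrower type. `UseDevice` is a hypothetical stand-in for calls such as `arrow::cuda::CudaDevice::Make(int)` in the diff below:

```cpp
// Minimal sketch of the explicit-narrowing pattern applied in this patch.
// UseDevice() is a hypothetical stand-in for an external API that only
// accepts `int`, while the caller holds an int64_t id.
#include <cstdint>
#include <iostream>

void UseDevice(int device_id) { std::cout << "device " << device_id << "\n"; }

int main() {
  int64_t device_id = 0;
  // Passing device_id directly would warn under -Wshorten-64-to-32 (and fail
  // the build with -Werror); the explicit cast documents the intentional
  // narrowing at the call site.
  UseDevice(static_cast<int>(device_id));
  return 0;
}
```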
* GitHub Issue: #40882 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- cpp/src/arrow/gpu/cuda_memory.cc | 6 ++++-- cpp/src/skyhook/cls/cls_skyhook.cc | 2 +- cpp/src/skyhook/protocol/skyhook_protocol.cc | 3 ++- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/gpu/cuda_memory.cc b/cpp/src/arrow/gpu/cuda_memory.cc index 6972321006a9a..dcf0a31963e45 100644 --- a/cpp/src/arrow/gpu/cuda_memory.cc +++ b/cpp/src/arrow/gpu/cuda_memory.cc @@ -494,7 +494,8 @@ Result> DefaultMemoryMapper(ArrowDeviceType devic case ARROW_DEVICE_CUDA: case ARROW_DEVICE_CUDA_HOST: case ARROW_DEVICE_CUDA_MANAGED: { - ARROW_ASSIGN_OR_RAISE(auto device, arrow::cuda::CudaDevice::Make(device_id)); + ARROW_ASSIGN_OR_RAISE(auto device, + arrow::cuda::CudaDevice::Make(static_cast(device_id))); return device->default_memory_manager(); } default: @@ -505,7 +506,8 @@ Result> DefaultMemoryMapper(ArrowDeviceType devic namespace { Result> DefaultCUDADeviceMapper(int64_t device_id) { - ARROW_ASSIGN_OR_RAISE(auto device, arrow::cuda::CudaDevice::Make(device_id)); + ARROW_ASSIGN_OR_RAISE(auto device, + arrow::cuda::CudaDevice::Make(static_cast(device_id))); return device->default_memory_manager(); } diff --git a/cpp/src/skyhook/cls/cls_skyhook.cc b/cpp/src/skyhook/cls/cls_skyhook.cc index 24f80c79d5730..e021cb3c8248a 100644 --- a/cpp/src/skyhook/cls/cls_skyhook.cc +++ b/cpp/src/skyhook/cls/cls_skyhook.cc @@ -84,7 +84,7 @@ class RandomAccessObject : public arrow::io::RandomAccessFile { if (nbytes > 0) { std::shared_ptr bl = std::make_shared(); - cls_cxx_read(hctx_, position, nbytes, bl.get()); + cls_cxx_read(hctx_, static_cast(position), static_cast(nbytes), bl.get()); chunks_.push_back(bl); return std::make_shared((uint8_t*)bl->c_str(), bl->length()); } diff --git a/cpp/src/skyhook/protocol/skyhook_protocol.cc b/cpp/src/skyhook/protocol/skyhook_protocol.cc index 3b1234c6ed913..b91a9bfdd2ecb 100644 --- a/cpp/src/skyhook/protocol/skyhook_protocol.cc +++ b/cpp/src/skyhook/protocol/skyhook_protocol.cc @@ -106,7 +106,8 @@ arrow::Status SerializeTable(const std::shared_ptr& table, ARROW_RETURN_NOT_OK(writer->Close()); ARROW_ASSIGN_OR_RAISE(auto buffer, buffer_output_stream->Finish()); - bl->append(reinterpret_cast(buffer->data()), buffer->size()); + bl->append(reinterpret_cast(buffer->data()), + static_cast(buffer->size())); return arrow::Status::OK(); } From 757ee7a910b9380bd0821a34ac123dec2e53ced0 Mon Sep 17 00:00:00 2001 From: carehabit <165479941+carehabit@users.noreply.github.com> Date: Tue, 2 Apr 2024 08:08:24 +0800 Subject: [PATCH 33/81] MINOR: [Docs] Remove repetitive words (#40914) ### Rationale for this change ### What changes are included in this PR? ### Are these changes tested? ### Are there any user-facing changes? Authored-by: carehabit Signed-off-by: Sutou Kouhei --- cpp/src/arrow/vendored/datetime/tz.cpp | 2 +- cpp/src/arrow/vendored/pcg/pcg_random.hpp | 4 ++-- docs/source/developers/release.rst | 2 +- docs/source/format/ADBC.rst | 2 +- python/pyarrow/src/arrow/python/python_to_arrow.cc | 2 +- r/R/dplyr-arrange.R | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/vendored/datetime/tz.cpp b/cpp/src/arrow/vendored/datetime/tz.cpp index 6962a8b3c3572..e94c1bc8ae682 100644 --- a/cpp/src/arrow/vendored/datetime/tz.cpp +++ b/cpp/src/arrow/vendored/datetime/tz.cpp @@ -118,7 +118,7 @@ #include #include -// unistd.h is used on some platforms as part of the the means to get +// unistd.h is used on some platforms as part of the means to get // the current time zone. 
On Win32 windows.h provides a means to do it. // gcc/mingw supports unistd.h on Win32 but MSVC does not. diff --git a/cpp/src/arrow/vendored/pcg/pcg_random.hpp b/cpp/src/arrow/vendored/pcg/pcg_random.hpp index a864ba0a2c59b..e39e61e908a2a 100644 --- a/cpp/src/arrow/vendored/pcg/pcg_random.hpp +++ b/cpp/src/arrow/vendored/pcg/pcg_random.hpp @@ -1900,7 +1900,7 @@ typedef pcg_engines::ext_oneseq_xsh_rs_64_32<1,32,true> pcg32_k2_fast; // - the k variants are k-dimensionally equidistributed // - the c variants offer better crypographic security // -// (just how good the cryptographic security is is an open question) +// (just how good the cryptographic security is an open question) typedef pcg_engines::ext_setseq_xsh_rr_64_32<6,16,true> pcg32_k64; typedef pcg_engines::ext_mcg_xsh_rs_64_32<6,32,true> pcg32_k64_oneseq; @@ -1923,7 +1923,7 @@ typedef pcg_engines::ext_mcg_xsl_rr_128_64<5,128,false> pcg64_c32_fast; // - the k variants are k-dimensionally equidistributed // - the c variants offer better crypographic security // -// (just how good the cryptographic security is is an open question) +// (just how good the cryptographic security is an open question) typedef pcg_engines::ext_setseq_xsh_rr_64_32<10,16,true> pcg32_k1024; typedef pcg_engines::ext_oneseq_xsh_rs_64_32<10,32,true> pcg32_k1024_fast; diff --git a/docs/source/developers/release.rst b/docs/source/developers/release.rst index 09608f2834478..e7431ce0fb7b9 100644 --- a/docs/source/developers/release.rst +++ b/docs/source/developers/release.rst @@ -131,7 +131,7 @@ branch from main. Follow up Release Candidates will update the maintenance branch by cherry-picking specific commits. -For the the initial Release Candidate for a minor or a patch release we will create +For the initial Release Candidate for a minor or a patch release we will create a maintenance branch from the previous corresponding release. For example, for a 15.0.1 patch we will create a maint-15.0.1 branch from maint-15.0.0 and for a maint-15.0.2 we will create it from maint-15.0.1. Once the maintenance branch is diff --git a/docs/source/format/ADBC.rst b/docs/source/format/ADBC.rst index f90ab24d1b9c2..41aa08ddbfb32 100644 --- a/docs/source/format/ADBC.rst +++ b/docs/source/format/ADBC.rst @@ -92,7 +92,7 @@ implemented directly by a vendor-specific "driver" or a vendor-neutral Version 1.0.0 of the standard corresponds to tag adbc-1.0.0 of the repository ``apache/arrow-adbc``, which is commit -f044edf5256abfb4c091b0ad2acc73afea2c93c0_. Note that is is separate +f044edf5256abfb4c091b0ad2acc73afea2c93c0_. Note that is separate from releases of the actual implementations. See the language-specific pages for details: diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc b/python/pyarrow/src/arrow/python/python_to_arrow.cc index 902814a4e91f1..79da47567bf24 100644 --- a/python/pyarrow/src/arrow/python/python_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc @@ -405,7 +405,7 @@ class PyValue { RETURN_NOT_OK(PopulateMonthDayNano::Field( obj, &output.months, &found_attrs)); // on relativeoffset weeks is a property calculated from days. On - // DateOffset is is a field on its own. timedelta doesn't have a weeks + // DateOffset is a field on its own. timedelta doesn't have a weeks // attribute. 
PyObject* pandas_date_offset_type = internal::BorrowPandasDataOffsetType(); bool is_date_offset = pandas_date_offset_type == (PyObject*)Py_TYPE(obj); diff --git a/r/R/dplyr-arrange.R b/r/R/dplyr-arrange.R index e3e20f2cb3ac3..f91cd14211e0f 100644 --- a/r/R/dplyr-arrange.R +++ b/r/R/dplyr-arrange.R @@ -24,7 +24,7 @@ arrange.arrow_dplyr_query <- function(.data, ..., .by_group = FALSE) { exprs <- expand_across(.data, quos(...)) if (.by_group) { - # when the data is is grouped and .by_group is TRUE, order the result by + # when the data is grouped and .by_group is TRUE, order the result by # the grouping columns first exprs <- c(quos(!!!dplyr::groups(.data)), exprs) } From a0cfc258901942af27351f4ed20b3d233a9a1f0b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 2 Apr 2024 09:37:49 +0900 Subject: [PATCH 34/81] MINOR: [CI] Bump actions/setup-python from 5.0.0 to 5.1.0 (#40917) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [actions/setup-python](https://github.com/actions/setup-python) from 5.0.0 to 5.1.0.
Release notes (sourced from actions/setup-python's releases):

v5.1.0: What's Changed / New Contributors

Full Changelog: https://github.com/actions/setup-python/compare/v5.0.0...v5.1.0

Commits

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=actions/setup-python&package-manager=github_actions&previous-version=5.0.0&new-version=5.1.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)

Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`.

---

Dependabot commands and options

You can trigger Dependabot actions by commenting on this PR:

- `@ dependabot rebase` will rebase this PR
- `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it
- `@ dependabot merge` will merge this PR after your CI passes on it
- `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it
- `@ dependabot cancel merge` will cancel a previously requested merge and block automerging
- `@ dependabot reopen` will reopen this PR if it is closed
- `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually
- `@ dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency
- `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)
- `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)
- `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- .github/workflows/archery.yml | 2 +- .github/workflows/comment_bot.yml | 2 +- .github/workflows/cpp.yml | 4 ++-- .github/workflows/dev.yml | 4 ++-- .github/workflows/docs.yml | 2 +- .github/workflows/docs_light.yml | 2 +- .github/workflows/go.yml | 6 +++--- .github/workflows/integration.yml | 2 +- .github/workflows/java.yml | 2 +- .github/workflows/java_jni.yml | 4 ++-- .github/workflows/java_nightly.yml | 2 +- .github/workflows/js.yml | 2 +- .github/workflows/pr_bot.yml | 2 +- .github/workflows/python.yml | 4 ++-- .github/workflows/r.yml | 4 ++-- .github/workflows/r_nightly.yml | 2 +- .github/workflows/ruby.yml | 2 +- 17 files changed, 24 insertions(+), 24 deletions(-) diff --git a/.github/workflows/archery.yml b/.github/workflows/archery.yml index dbd24796db52b..cb783dd66c3fb 100644 --- a/.github/workflows/archery.yml +++ b/.github/workflows/archery.yml @@ -57,7 +57,7 @@ jobs: shell: bash run: git branch $ARCHERY_DEFAULT_BRANCH origin/$ARCHERY_DEFAULT_BRANCH || true - name: Setup Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v5.1.0 with: python-version: '3.12' - name: Install pygit2 binary wheel diff --git a/.github/workflows/comment_bot.yml b/.github/workflows/comment_bot.yml index 038a468a81276..a34856d2dc81a 100644 --- a/.github/workflows/comment_bot.yml +++ b/.github/workflows/comment_bot.yml @@ -41,7 +41,7 @@ jobs: # fetch the tags for version number generation fetch-depth: 0 - name: Set up Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: 3.12 - name: Install Archery and Crossbow dependencies diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml index 3036d06d5d7b2..e8e41f1bcb90c 100644 --- a/.github/workflows/cpp.yml +++ b/.github/workflows/cpp.yml @@ -237,7 +237,7 @@ jobs: $(brew --prefix bash)/bin/bash \ ci/scripts/install_minio.sh latest ${ARROW_HOME} - name: Set up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v5.1.0 with: python-version: 3.12 - name: Install Google Cloud Storage Testbench @@ -458,7 +458,7 @@ jobs: https://dl.min.io/server/minio/release/windows-amd64/archive/minio.RELEASE.2022-05-26T05-48-41Z chmod +x /usr/local/bin/minio.exe - name: Set up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v5.1.0 with: python-version: 3.9 - name: Install Google Cloud Storage Testbench diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index 3a48270a97c9a..37fda2e313ae2 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -42,7 +42,7 @@ jobs: with: fetch-depth: 0 - name: Setup Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: 3.12 - name: Install pre-commit @@ -101,7 +101,7 @@ jobs: with: fetch-depth: 0 - name: Install Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: '3.12' - name: Install Ruby diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 82b43ee2363b5..9c7701f25f756 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -51,7 +51,7 @@ jobs: key: ubuntu-docs-${{ hashFiles('cpp/**') }} restore-keys: 
ubuntu-docs- - name: Setup Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: 3.12 - name: Setup Archery diff --git a/.github/workflows/docs_light.yml b/.github/workflows/docs_light.yml index 306fc5135073d..6ec4c3d53d0e3 100644 --- a/.github/workflows/docs_light.yml +++ b/.github/workflows/docs_light.yml @@ -57,7 +57,7 @@ jobs: key: conda-docs-${{ hashFiles('cpp/**') }} restore-keys: conda-docs- - name: Setup Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: 3.12 - name: Setup Archery diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index 7ff781d35e8ec..7fca38528260f 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -201,7 +201,7 @@ jobs: fetch-depth: 0 submodules: recursive - name: Setup Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: 3.8 - name: Setup Archery @@ -241,7 +241,7 @@ jobs: with: fetch-depth: 0 - name: Setup Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: 3.8 - name: Setup Archery @@ -333,7 +333,7 @@ jobs: github.event_name == 'push' && github.repository == 'apache/arrow' && github.ref_name == 'main' - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: '3.10' - name: Run Benchmarks diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index adb6fb2b57c75..0f186ff6a4527 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -81,7 +81,7 @@ jobs: key: conda-${{ hashFiles('cpp/**') }} restore-keys: conda- - name: Setup Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: 3.8 - name: Setup Archery diff --git a/.github/workflows/java.yml b/.github/workflows/java.yml index a14977525b6c6..423f54cd93547 100644 --- a/.github/workflows/java.yml +++ b/.github/workflows/java.yml @@ -75,7 +75,7 @@ jobs: key: maven-${{ hashFiles('java/**') }} restore-keys: maven- - name: Setup Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: 3.8 - name: Setup Archery diff --git a/.github/workflows/java_jni.yml b/.github/workflows/java_jni.yml index 46f3381ed0e8f..790ffd5c650e0 100644 --- a/.github/workflows/java_jni.yml +++ b/.github/workflows/java_jni.yml @@ -69,7 +69,7 @@ jobs: key: java-jni-manylinux-2014-${{ hashFiles('cpp/**', 'java/**') }} restore-keys: java-jni-manylinux-2014- - name: Setup Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: 3.8 - name: Setup Archery @@ -109,7 +109,7 @@ jobs: key: maven-${{ hashFiles('java/**') }} restore-keys: maven- - name: Setup Python - uses: 
actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: 3.8 - name: Setup Archery diff --git a/.github/workflows/java_nightly.yml b/.github/workflows/java_nightly.yml index c535dc4a07de3..f40d4ce5b42d6 100644 --- a/.github/workflows/java_nightly.yml +++ b/.github/workflows/java_nightly.yml @@ -58,7 +58,7 @@ jobs: repository: ursacomputing/crossbow ref: main - name: Set up Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: cache: 'pip' python-version: 3.12 diff --git a/.github/workflows/js.yml b/.github/workflows/js.yml index 304eba41e4d37..dab89da44c861 100644 --- a/.github/workflows/js.yml +++ b/.github/workflows/js.yml @@ -51,7 +51,7 @@ jobs: with: fetch-depth: 0 - name: Setup Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: 3.8 - name: Setup Archery diff --git a/.github/workflows/pr_bot.yml b/.github/workflows/pr_bot.yml index 6af7dbe7680f5..e589610f536b3 100644 --- a/.github/workflows/pr_bot.yml +++ b/.github/workflows/pr_bot.yml @@ -82,7 +82,7 @@ jobs: # fetch the tags for version number generation fetch-depth: 0 - name: Set up Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: 3.12 - name: Install Archery and Crossbow dependencies diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 25d918bcc25aa..1147ac13e6f93 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -100,7 +100,7 @@ jobs: key: ${{ matrix.cache }}-${{ hashFiles('cpp/**') }} restore-keys: ${{ matrix.cache }}- - name: Setup Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: 3.8 - name: Setup Archery @@ -162,7 +162,7 @@ jobs: fetch-depth: 0 submodules: recursive - name: Setup Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v5.1.0 with: python-version: '3.11' - name: Install Dependencies diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index 8c47915b7b6d3..78677499f3e45 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -142,7 +142,7 @@ jobs: ubuntu-${{ matrix.ubuntu }}-r-${{ matrix.r }}-${{ hashFiles('cpp/src/**/*.cc','cpp/src/**/*.h)') }}- ubuntu-${{ matrix.ubuntu }}-r-${{ matrix.r }}- - name: Setup Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: 3.8 - name: Setup Archery @@ -203,7 +203,7 @@ jobs: fetch-depth: 0 submodules: recursive - name: Setup Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: 3.8 - name: Setup Archery diff --git a/.github/workflows/r_nightly.yml b/.github/workflows/r_nightly.yml index 6629b5c8a5673..af5382f90834c 100644 --- a/.github/workflows/r_nightly.yml +++ b/.github/workflows/r_nightly.yml @@ -60,7 +60,7 @@ jobs: repository: ursacomputing/crossbow ref: main - name: Set up Python - uses: 
actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: cache: 'pip' python-version: 3.12 diff --git a/.github/workflows/ruby.yml b/.github/workflows/ruby.yml index 74d56895f4c34..311c1c822baf6 100644 --- a/.github/workflows/ruby.yml +++ b/.github/workflows/ruby.yml @@ -82,7 +82,7 @@ jobs: key: ubuntu-${{ matrix.ubuntu }}-ruby-${{ hashFiles('cpp/**') }} restore-keys: ubuntu-${{ matrix.ubuntu }}-ruby- - name: Setup Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: 3.8 - name: Setup Archery From aaacefa6b6986916256e0e7002bfcfed293443c4 Mon Sep 17 00:00:00 2001 From: David Li Date: Mon, 1 Apr 2024 21:56:32 -0400 Subject: [PATCH 35/81] GH-40896: [Java] Remove runtime dependencies on Eclipse, logback (#40904) ### Rationale for this change Remove runtime dependencies on [Category B](https://apache.org/legal/resolved.html#category-b) dependencies. ### What changes are included in this PR? - logback: move to test-only - eclipse: remove dependency, vendor the Netty implementation we originally used I wanted to remove javax.annotation.Generated but gRPC doesn't yet let us do that (https://github.com/grpc/grpc-java/issues/9179). That's ~okay though since effectively that's a build only dependency. ### Are these changes tested? #40901 ### Are there any user-facing changes? No. **This PR contains a "Critical Fix".** License issues do not cause runtime issues but are important as an Apache project. * GitHub Issue: #40896 Authored-by: David Li Signed-off-by: Sutou Kouhei --- LICENSE.txt | 7 + dev/release/rat_exclude_files.txt | 2 + java/dev/checkstyle/suppressions.xml | 3 + java/tools/pom.xml | 2 +- java/vector/pom.xml | 5 - java/vector/src/main/java/module-info.java | 1 - .../arrow/vector/util/IntObjectHashMap.java | 736 ++++++++++++++++++ .../arrow/vector/util/IntObjectMap.java | 87 +++ .../arrow/vector/util/MapWithOrdinalImpl.java | 2 - .../vector/util/MultiMapWithOrdinal.java | 2 - 10 files changed, 836 insertions(+), 11 deletions(-) create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/util/IntObjectHashMap.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/util/IntObjectMap.java diff --git a/LICENSE.txt b/LICENSE.txt index 0423854567b26..7bb1330a1002b 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -2252,3 +2252,10 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +-------------------------------------------------------------------------------- +java/vector/src/main/java/org/apache/arrow/vector/util/IntObjectHashMap.java +java/vector/src/main/java/org/apache/arrow/vector/util/IntObjectMap.java + +These file are derived from code from Netty, which is made available under the +Apache License 2.0. 
diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index 4f86a12afe4fb..f4d7b411c4dc2 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -80,6 +80,8 @@ go/parquet/internal/gen-go/parquet/GoUnusedProtection__.go go/parquet/internal/gen-go/parquet/parquet-consts.go go/parquet/internal/gen-go/parquet/parquet.go go/parquet/version_string.go +java/vector/src/main/java/org/apache/arrow/vector/util/IntObjectMap.java +java/vector/src/main/java/org/apache/arrow/vector/util/IntObjectHashMap.java js/.npmignore js/closure-compiler-scripts/* js/src/fb/*.ts diff --git a/java/dev/checkstyle/suppressions.xml b/java/dev/checkstyle/suppressions.xml index a3536e2ca9212..e8669c54e61fd 100644 --- a/java/dev/checkstyle/suppressions.xml +++ b/java/dev/checkstyle/suppressions.xml @@ -36,6 +36,9 @@ + + + diff --git a/java/tools/pom.xml b/java/tools/pom.xml index 0688fae1ab78c..9b55f07c013d3 100644 --- a/java/tools/pom.xml +++ b/java/tools/pom.xml @@ -52,7 +52,7 @@ ch.qos.logback logback-classic 1.3.14 - runtime + test
com.fasterxml.jackson.core diff --git a/java/vector/pom.xml b/java/vector/pom.xml index 5cd6d0a00fcca..20af3dbd38443 100644 --- a/java/vector/pom.xml +++ b/java/vector/pom.xml @@ -74,11 +74,6 @@ org.slf4j slf4j-api - - org.eclipse.collections - eclipse-collections - 11.1.0 -
diff --git a/java/vector/src/main/java/module-info.java b/java/vector/src/main/java/module-info.java index 20f7094715f4d..e2ebcd1e86740 100644 --- a/java/vector/src/main/java/module-info.java +++ b/java/vector/src/main/java/module-info.java @@ -45,6 +45,5 @@ requires org.apache.arrow.format; requires org.apache.arrow.memory.core; requires org.apache.commons.codec; - requires org.eclipse.collections.impl; requires org.slf4j; } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/IntObjectHashMap.java b/java/vector/src/main/java/org/apache/arrow/vector/util/IntObjectHashMap.java new file mode 100644 index 0000000000000..f3d0fb628edf0 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/IntObjectHashMap.java @@ -0,0 +1,736 @@ +/* + * Copyright 2014 The Netty Project + * + * The Netty Project licenses this file to you under the Apache License, version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at: + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. + */ + +package org.apache.arrow.vector.util; + +import java.util.AbstractCollection; +import java.util.AbstractSet; +import java.util.Arrays; +import java.util.Collection; +import java.util.Iterator; +import java.util.Map; +import java.util.NoSuchElementException; +import java.util.Set; + +/** + * A vendored specialized copy of Netty's IntObjectHashMap for use within Arrow. + * Avoids requiring Netty in the Arrow core just for this one class. + * + * @param The value type stored in the map. + */ +class IntObjectHashMap implements IntObjectMap { + + /** + * Default initial capacity. Used if not specified in the constructor + */ + public static final int DEFAULT_CAPACITY = 8; + + /** + * Default load factor. Used if not specified in the constructor + */ + public static final float DEFAULT_LOAD_FACTOR = 0.5f; + + /** + * Placeholder for null values, so we can use the actual null to mean available. + * (Better than using a placeholder for available: less references for GC processing.) + */ + private static final Object NULL_VALUE = new Object(); + + /** + * The maximum number of elements allowed without allocating more space. + */ + private int maxSize; + + /** + * The load factor for the map. Used to calculate {@link #maxSize}. + */ + private final float loadFactor; + + private int[] keys; + private V[] values; + private int size; + private int mask; + + private final Set keySet = new KeySet(); + private final Set> entrySet = new EntrySet(); + private final Iterable> entries = new Iterable>() { + @Override + public Iterator> iterator() { + return new PrimitiveIterator(); + } + }; + + public IntObjectHashMap() { + this(DEFAULT_CAPACITY, DEFAULT_LOAD_FACTOR); + } + + public IntObjectHashMap(int initialCapacity) { + this(initialCapacity, DEFAULT_LOAD_FACTOR); + } + + public IntObjectHashMap(int initialCapacity, float loadFactor) { + if (loadFactor <= 0.0f || loadFactor > 1.0f) { + // Cannot exceed 1 because we can never store more than capacity elements; + // using a bigger loadFactor would trigger rehashing before the desired load is reached. 
+ throw new IllegalArgumentException("loadFactor must be > 0 and <= 1"); + } + + this.loadFactor = loadFactor; + + // Adjust the initial capacity if necessary. + int capacity = safeFindNextPositivePowerOfTwo(initialCapacity); + mask = capacity - 1; + + // Allocate the arrays. + keys = new int[capacity]; + @SuppressWarnings({"unchecked", "SuspiciousArrayCast"}) + V[] temp = (V[]) new Object[capacity]; + values = temp; + + // Initialize the maximum size value. + maxSize = calcMaxSize(capacity); + } + + private static T toExternal(T value) { + assert value != null : "null is not a legitimate internal value. Concurrent Modification?"; + return value == NULL_VALUE ? null : value; + } + + @SuppressWarnings("unchecked") + private static T toInternal(T value) { + return value == null ? (T) NULL_VALUE : value; + } + + @Override + public V get(int key) { + int index = indexOf(key); + return index == -1 ? null : toExternal(values[index]); + } + + @Override + public V put(int key, V value) { + int startIndex = hashIndex(key); + int index = startIndex; + + for (; ; ) { + if (values[index] == null) { + // Found empty slot, use it. + keys[index] = key; + values[index] = toInternal(value); + growSize(); + return null; + } + if (keys[index] == key) { + // Found existing entry with this key, just replace the value. + V previousValue = values[index]; + values[index] = toInternal(value); + return toExternal(previousValue); + } + + // Conflict, keep probing ... + if ((index = probeNext(index)) == startIndex) { + // Can only happen if the map was full at MAX_ARRAY_SIZE and couldn't grow. + throw new IllegalStateException("Unable to insert"); + } + } + } + + @Override + public void putAll(Map sourceMap) { + if (sourceMap instanceof IntObjectHashMap) { + // Optimization - iterate through the arrays. + @SuppressWarnings("unchecked") + IntObjectHashMap source = (IntObjectHashMap) sourceMap; + for (int i = 0; i < source.values.length; ++i) { + V sourceValue = source.values[i]; + if (sourceValue != null) { + put(source.keys[i], sourceValue); + } + } + return; + } + + // Otherwise, just add each entry. + for (Entry entry : sourceMap.entrySet()) { + put(entry.getKey(), entry.getValue()); + } + } + + @Override + public V remove(int key) { + int index = indexOf(key); + if (index == -1) { + return null; + } + + V prev = values[index]; + removeAt(index); + return toExternal(prev); + } + + @Override + public int size() { + return size; + } + + @Override + public boolean isEmpty() { + return size == 0; + } + + @Override + public void clear() { + Arrays.fill(keys, (int) 0); + Arrays.fill(values, null); + size = 0; + } + + @Override + public boolean containsKey(int key) { + return indexOf(key) >= 0; + } + + @Override + public boolean containsValue(Object value) { + @SuppressWarnings("unchecked") + V v1 = toInternal((V) value); + for (V v2 : values) { + // The map supports null values; this will be matched as NULL_VALUE.equals(NULL_VALUE). 
+ if (v2 != null && v2.equals(v1)) { + return true; + } + } + return false; + } + + @Override + public Iterable> entries() { + return entries; + } + + @Override + public Collection values() { + return new AbstractCollection() { + @Override + public Iterator iterator() { + return new Iterator() { + final PrimitiveIterator iter = new PrimitiveIterator(); + + @Override + public boolean hasNext() { + return iter.hasNext(); + } + + @Override + public V next() { + return iter.next().value(); + } + + @Override + public void remove() { + iter.remove(); + } + }; + } + + @Override + public int size() { + return size; + } + }; + } + + @Override + public int hashCode() { + // Hashcode is based on all non-zero, valid keys. We have to scan the whole keys + // array, which may have different lengths for two maps of same size(), so the + // capacity cannot be used as input for hashing but the size can. + int hash = size; + for (int key : keys) { + // 0 can be a valid key or unused slot, but won't impact the hashcode in either case. + // This way we can use a cheap loop without conditionals, or hard-to-unroll operations, + // or the devastatingly bad memory locality of visiting value objects. + // Also, it's important to use a hash function that does not depend on the ordering + // of terms, only their values; since the map is an unordered collection and + // entries can end up in different positions in different maps that have the same + // elements, but with different history of puts/removes, due to conflicts. + hash ^= hashCode(key); + } + return hash; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + if (!(obj instanceof IntObjectMap)) { + return false; + } + @SuppressWarnings("rawtypes") + IntObjectMap other = (IntObjectMap) obj; + if (size != other.size()) { + return false; + } + for (int i = 0; i < values.length; ++i) { + V value = values[i]; + if (value != null) { + int key = keys[i]; + Object otherValue = other.get(key); + if (value == NULL_VALUE) { + if (otherValue != null) { + return false; + } + } else if (!value.equals(otherValue)) { + return false; + } + } + } + return true; + } + + @Override + public boolean containsKey(Object key) { + return containsKey(objectToKey(key)); + } + + @Override + public V get(Object key) { + return get(objectToKey(key)); + } + + @Override + public V put(Integer key, V value) { + return put(objectToKey(key), value); + } + + @Override + public V remove(Object key) { + return remove(objectToKey(key)); + } + + @Override + public Set keySet() { + return keySet; + } + + @Override + public Set> entrySet() { + return entrySet; + } + + private int objectToKey(Object key) { + return (int) (Integer) key; + } + + /** + * Locates the index for the given key. This method probes using double hashing. + * + * @param key the key for an entry in the map. + * @return the index where the key was found, or {@code -1} if no entry is found for that key. + */ + private int indexOf(int key) { + int startIndex = hashIndex(key); + int index = startIndex; + + for (; ; ) { + if (values[index] == null) { + // It's available, so no chance that this value exists anywhere in the map. + return -1; + } + if (key == keys[index]) { + return index; + } + + // Conflict, keep probing ... + if ((index = probeNext(index)) == startIndex) { + return -1; + } + } + } + + /** + * Returns the hashed index for the given key. 
+ */ + private int hashIndex(int key) { + // The array lengths are always a power of two, so we can use a bitmask to stay inside the array bounds. + return hashCode(key) & mask; + } + + /** + * Returns the hash code for the key. + */ + private static int hashCode(int key) { + return key; + } + + /** + * Get the next sequential index after {@code index} and wraps if necessary. + */ + private int probeNext(int index) { + // The array lengths are always a power of two, so we can use a bitmask to stay inside the array bounds. + return (index + 1) & mask; + } + + /** + * Grows the map size after an insertion. If necessary, performs a rehash of the map. + */ + private void growSize() { + size++; + + if (size > maxSize) { + if (keys.length == Integer.MAX_VALUE) { + throw new IllegalStateException("Max capacity reached at size=" + size); + } + + // Double the capacity. + rehash(keys.length << 1); + } + } + + /** + * Removes entry at the given index position. Also performs opportunistic, incremental rehashing + * if necessary to not break conflict chains. + * + * @param index the index position of the element to remove. + * @return {@code true} if the next item was moved back. {@code false} otherwise. + */ + private boolean removeAt(final int index) { + --size; + // Clearing the key is not strictly necessary (for GC like in a regular collection), + // but recommended for security. The memory location is still fresh in the cache anyway. + keys[index] = 0; + values[index] = null; + + // In the interval from index to the next available entry, the arrays may have entries + // that are displaced from their base position due to prior conflicts. Iterate these + // entries and move them back if possible, optimizing future lookups. + // Knuth Section 6.4 Algorithm R, also used by the JDK's IdentityHashMap. + + int nextFree = index; + int i = probeNext(index); + for (V value = values[i]; value != null; value = values[i = probeNext(i)]) { + int key = keys[i]; + int bucket = hashIndex(key); + if (i < bucket && (bucket <= nextFree || nextFree <= i) || + bucket <= nextFree && nextFree <= i) { + // Move the displaced entry "back" to the first available position. + keys[nextFree] = key; + values[nextFree] = value; + // Put the first entry after the displaced entry + keys[i] = 0; + values[i] = null; + nextFree = i; + } + } + return nextFree != index; + } + + /** + * Calculates the maximum size allowed before rehashing. + */ + private int calcMaxSize(int capacity) { + // Clip the upper bound so that there will always be at least one available slot. + int upperBound = capacity - 1; + return Math.min(upperBound, (int) (capacity * loadFactor)); + } + + /** + * Rehashes the map for the given capacity. + * + * @param newCapacity the new capacity for the map. + */ + private void rehash(int newCapacity) { + int[] oldKeys = keys; + V[] oldVals = values; + + keys = new int[newCapacity]; + @SuppressWarnings({"unchecked", "SuspiciousArrayCast"}) + V[] temp = (V[]) new Object[newCapacity]; + values = temp; + + maxSize = calcMaxSize(newCapacity); + mask = newCapacity - 1; + + // Insert to the new arrays. + for (int i = 0; i < oldVals.length; ++i) { + V oldVal = oldVals[i]; + if (oldVal != null) { + // Inlined put(), but much simpler: we don't need to worry about + // duplicated keys, growing/rehashing, or failing to insert. + int oldKey = oldKeys[i]; + int index = hashIndex(oldKey); + + for (; ; ) { + if (values[index] == null) { + keys[index] = oldKey; + values[index] = oldVal; + break; + } + + // Conflict, keep probing. 
Can wrap around, but never reaches startIndex again. + index = probeNext(index); + } + } + } + } + + @Override + public String toString() { + if (isEmpty()) { + return "{}"; + } + StringBuilder sb = new StringBuilder(4 * size); + sb.append('{'); + boolean first = true; + for (int i = 0; i < values.length; ++i) { + V value = values[i]; + if (value != null) { + if (!first) { + sb.append(", "); + } + sb.append(keyToString(keys[i])).append('=').append(value == this ? "(this Map)" : + toExternal(value)); + first = false; + } + } + return sb.append('}').toString(); + } + + /** + * Helper method called by {@link #toString()} in order to convert a single map key into a string. + * This is protected to allow subclasses to override the appearance of a given key. + */ + protected String keyToString(int key) { + return Integer.toString(key); + } + + /** + * Set implementation for iterating over the entries of the map. + */ + private final class EntrySet extends AbstractSet> { + @Override + public Iterator> iterator() { + return new MapIterator(); + } + + @Override + public int size() { + return IntObjectHashMap.this.size(); + } + } + + /** + * Set implementation for iterating over the keys. + */ + private final class KeySet extends AbstractSet { + @Override + public int size() { + return IntObjectHashMap.this.size(); + } + + @Override + public boolean contains(Object o) { + return IntObjectHashMap.this.containsKey(o); + } + + @Override + public boolean remove(Object o) { + return IntObjectHashMap.this.remove(o) != null; + } + + @Override + public boolean retainAll(Collection retainedKeys) { + boolean changed = false; + for (Iterator> iter = entries().iterator(); iter.hasNext(); ) { + PrimitiveEntry entry = iter.next(); + if (!retainedKeys.contains(entry.key())) { + changed = true; + iter.remove(); + } + } + return changed; + } + + @Override + public void clear() { + IntObjectHashMap.this.clear(); + } + + @Override + public Iterator iterator() { + return new Iterator() { + private final Iterator> iter = entrySet.iterator(); + + @Override + public boolean hasNext() { + return iter.hasNext(); + } + + @Override + public Integer next() { + return iter.next().getKey(); + } + + @Override + public void remove() { + iter.remove(); + } + }; + } + } + + /** + * Iterator over primitive entries. Entry key/values are overwritten by each call to {@link #next()}. + */ + private final class PrimitiveIterator implements Iterator>, PrimitiveEntry { + private int prevIndex = -1; + private int nextIndex = -1; + private int entryIndex = -1; + + private void scanNext() { + while (++nextIndex != values.length && values[nextIndex] == null) { + } + } + + @Override + public boolean hasNext() { + if (nextIndex == -1) { + scanNext(); + } + return nextIndex != values.length; + } + + @Override + public PrimitiveEntry next() { + if (!hasNext()) { + throw new NoSuchElementException(); + } + + prevIndex = nextIndex; + scanNext(); + + // Always return the same Entry object, just change its index each time. + entryIndex = prevIndex; + return this; + } + + @Override + public void remove() { + if (prevIndex == -1) { + throw new IllegalStateException("next must be called before each remove."); + } + if (removeAt(prevIndex)) { + // removeAt may move elements "back" in the array if they have been displaced because their spot in the + // array was occupied when they were inserted. If this occurs then the nextIndex is now invalid and + // should instead point to the prevIndex which now holds an element which was "moved back". 
+ nextIndex = prevIndex; + } + prevIndex = -1; + } + + // Entry implementation. Since this implementation uses a single Entry, we coalesce that + // into the Iterator object (potentially making loop optimization much easier). + + @Override + public int key() { + return keys[entryIndex]; + } + + @Override + public V value() { + return toExternal(values[entryIndex]); + } + + @Override + public void setValue(V value) { + values[entryIndex] = toInternal(value); + } + } + + /** + * Iterator used by the {@link Map} interface. + */ + private final class MapIterator implements Iterator> { + private final PrimitiveIterator iter = new PrimitiveIterator(); + + @Override + public boolean hasNext() { + return iter.hasNext(); + } + + @Override + public Entry next() { + if (!hasNext()) { + throw new NoSuchElementException(); + } + + iter.next(); + + return new MapEntry(iter.entryIndex); + } + + @Override + public void remove() { + iter.remove(); + } + } + + /** + * A single entry in the map. + */ + final class MapEntry implements Entry { + private final int entryIndex; + + MapEntry(int entryIndex) { + this.entryIndex = entryIndex; + } + + @Override + public Integer getKey() { + verifyExists(); + return keys[entryIndex]; + } + + @Override + public V getValue() { + verifyExists(); + return toExternal(values[entryIndex]); + } + + @Override + public V setValue(V value) { + verifyExists(); + V prevValue = toExternal(values[entryIndex]); + values[entryIndex] = toInternal(value); + return prevValue; + } + + private void verifyExists() { + if (values[entryIndex] == null) { + throw new IllegalStateException("The map entry has been removed"); + } + } + } + + static int safeFindNextPositivePowerOfTwo(final int value) { + return value <= 0 ? 1 : value >= 0x40000000 ? 0x40000000 : findNextPositivePowerOfTwo(value); + } + + static int findNextPositivePowerOfTwo(final int value) { + assert value > Integer.MIN_VALUE && value < 0x40000000; + return 1 << (32 - Integer.numberOfLeadingZeros(value - 1)); + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/IntObjectMap.java b/java/vector/src/main/java/org/apache/arrow/vector/util/IntObjectMap.java new file mode 100644 index 0000000000000..5a9d2a5a52eb9 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/IntObjectMap.java @@ -0,0 +1,87 @@ +/* + * Copyright 2014 The Netty Project + * + * The Netty Project licenses this file to you under the Apache License, version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at: + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. + */ + +package org.apache.arrow.vector.util; + +import java.util.Iterator; +import java.util.Map; + +/** + * A vendored specialized copy of Netty's IntObjectMap for use within Arrow. + * Avoids requiring Netty in the Arrow core just for this one class. + * + * @param the value type stored in the map. + */ +interface IntObjectMap extends Map { + + /** + * A primitive entry in the map, provided by the iterator from {@link #entries()}. + * + * @param the value type stored in the map. + */ + interface PrimitiveEntry { + /** + * Gets the key for this entry. 
+ */ + int key(); + + /** + * Gets the value for this entry. + */ + V value(); + + /** + * Sets the value for this entry. + */ + void setValue(V value); + } + + /** + * Gets the value in the map with the specified key. + * + * @param key the key whose associated value is to be returned. + * @return the value or {@code null} if the key was not found in the map. + */ + V get(int key); + + /** + * Puts the given entry into the map. + * + * @param key the key of the entry. + * @param value the value of the entry. + * @return the previous value for this key or {@code null} if there was no previous mapping. + */ + V put(int key, V value); + + /** + * Removes the entry with the specified key. + * + * @param key the key for the entry to be removed from this map. + * @return the previous value for the key, or {@code null} if there was no mapping. + */ + V remove(int key); + + /** + * Gets an iterable to traverse over the primitive entries contained in this map. As an optimization, + * the {@link PrimitiveEntry}s returned by the {@link Iterator} may change as the {@link Iterator} + * progresses. The caller should not rely on {@link PrimitiveEntry} key/value stability. + */ + Iterable> entries(); + + /** + * Indicates whether or not this map contains a value for the specified key. + */ + boolean containsKey(int key); +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/MapWithOrdinalImpl.java b/java/vector/src/main/java/org/apache/arrow/vector/util/MapWithOrdinalImpl.java index 1f18587afdfd1..14b86c6129c81 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/MapWithOrdinalImpl.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/MapWithOrdinalImpl.java @@ -26,8 +26,6 @@ import java.util.Set; import java.util.stream.Collectors; -import org.eclipse.collections.impl.map.mutable.primitive.IntObjectHashMap; - /** * An implementation of map that supports constant time look-up by a generic key or an ordinal. * diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/MultiMapWithOrdinal.java b/java/vector/src/main/java/org/apache/arrow/vector/util/MultiMapWithOrdinal.java index f722a8a86772c..10566586b21c0 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/MultiMapWithOrdinal.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/MultiMapWithOrdinal.java @@ -25,8 +25,6 @@ import java.util.Set; import java.util.stream.Collectors; -import org.eclipse.collections.impl.map.mutable.primitive.IntObjectHashMap; - /** * An implementation of a multimap that supports constant time look-up by a generic key or an ordinal. * From 65dd5c7e23b0e4a7aa57a50f619ef5c017da0894 Mon Sep 17 00:00:00 2001 From: Laurent Goujon Date: Tue, 2 Apr 2024 00:02:15 -0700 Subject: [PATCH 36/81] MINOR: [Java] Fix maven-checkstyle-plugin configuration (#40850) ### Rationale for this change `maven-checkstyle-plugin` configuration refers to several unrecognized properties, causing build output to print several messages like: > [WARNING] Parameter 'format' is unknown for plugin 'maven-checkstyle-plugin:3.1.0:check (validate)' ### What changes are included in this PR? Fix checkstyle configuration and use the correct outputFileFormat and inputEncoding properties in place of the unrecognized format and encoding ones. ### Are these changes tested? As this is a build change with no code change, only via a local build + visual inspection of the build output ### Are there any user-facing changes? 
No Authored-by: Laurent Goujon Signed-off-by: David Li --- java/maven/pom.xml | 5 ++--- java/pom.xml | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/java/maven/pom.xml b/java/maven/pom.xml index ccc12f5397fb7..f6a6da3afe53e 100644 --- a/java/maven/pom.xml +++ b/java/maven/pom.xml @@ -257,13 +257,12 @@ ../dev/checkstyle/checkstyle.license ../dev/checkstyle/suppressions.xml true - UTF-8 + UTF-8 true ${checkstyle.failOnViolation} ${checkstyle.failOnViolation} warning - xml - html + xml ${project.build.directory}/test/checkstyle-errors.xml false diff --git a/java/pom.xml b/java/pom.xml index b05b2d8f1425a..610593580f720 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -327,13 +327,12 @@ dev/checkstyle/checkstyle.license dev/checkstyle/suppressions.xml true - UTF-8 + UTF-8 true ${checkstyle.failOnViolation} ${checkstyle.failOnViolation} warning - xml - html + xml ${project.build.directory}/test/checkstyle-errors.xml false From 549e1c4e66e9e8af2808d49d624ef443816a630a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 2 Apr 2024 03:02:42 -0400 Subject: [PATCH 37/81] MINOR: [Java] Bump org.apache.maven.plugins:maven-gpg-plugin from 3.1.0 to 3.2.2 in /java (#40921) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [org.apache.maven.plugins:maven-gpg-plugin](https://github.com/apache/maven-gpg-plugin) from 3.1.0 to 3.2.2.
Release notes

Sourced from org.apache.maven.plugins:maven-gpg-plugin's releases.

3.2.2

JIRA link

Release Notes - Maven GPG Plugin - Version 3.2.2


What's Changed

Full Changelog: https://github.com/apache/maven-gpg-plugin/compare/maven-gpg-plugin-3.2.1...maven-gpg-plugin-3.2.2

3.2.1

JIRA link

Release Notes - Maven GPG Plugin - Version 3.2.1

... (truncated)

Commits
  • ab97064 [maven-release-plugin] prepare release maven-gpg-plugin-3.2.2
  • 2be0a00 [MGPG-115] Show more info about key used to sign (#84)
  • 3631830 [MGPG-114] Allow max key size of 16KB (#83)
  • 528fab9 [MGPG-113] SignAndDeployFileMojo results in 401 (#82)
  • 770636b [maven-release-plugin] prepare for next development iteration
  • 5b69086 [maven-release-plugin] prepare release maven-gpg-plugin-3.2.1
  • 28d298c [MGPG-111] Fix dependencies (#81)
  • 75d8ed5 [MGPG-112] serverId def value was unintentionally dropped (#80)
  • 2a11a2d [maven-release-plugin] prepare for next development iteration
  • 4b23da8 [maven-release-plugin] prepare release maven-gpg-plugin-3.2.0
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=org.apache.maven.plugins:maven-gpg-plugin&package-manager=maven&previous-version=3.1.0&new-version=3.2.2)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR:
- `@ dependabot rebase` will rebase this PR
- `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it
- `@ dependabot merge` will merge this PR after your CI passes on it
- `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it
- `@ dependabot cancel merge` will cancel a previously requested merge and block automerging
- `@ dependabot reopen` will reopen this PR if it is closed
- `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually
- `@ dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency
- `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)
- `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)
- `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: David Li --- java/gandiva/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/gandiva/pom.xml b/java/gandiva/pom.xml index 0d2a23345f6ea..cb2deb07db42a 100644 --- a/java/gandiva/pom.xml +++ b/java/gandiva/pom.xml @@ -96,7 +96,7 @@ org.apache.maven.plugins maven-gpg-plugin - 3.1.0 + 3.2.2 sign-artifacts From 82f9403077547046e589d44d8682388ac618c75d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 2 Apr 2024 03:03:45 -0400 Subject: [PATCH 38/81] MINOR: [Java] Bump org.apache.maven.plugin-tools:maven-plugin-annotations from 3.6.0 to 3.11.0 in /java (#40524) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [org.apache.maven.plugin-tools:maven-plugin-annotations](https://github.com/apache/maven-plugin-tools) from 3.6.0 to 3.11.0.
Release notes

Sourced from org.apache.maven.plugin-tools:maven-plugin-annotations's releases.

3.11.0

Release Notes - Maven Plugin Tools - Version 3.11.0

Bug

  • [MPLUGIN-496] - Translation for keys report.plugin.goal.yes,no are missing
  • [MPLUGIN-499] - Deprecate descriptions are missing in description table

Improvement

  • [MPLUGIN-450] - Make goal prefix mandatory by default
  • [MPLUGIN-474] - Improve descriptor docs for requiredJavaVersion
  • [MPLUGIN-492] - Documentation for plugins in general: Goals comprises more than that
  • [MPLUGIN-495] - WARNINGs based on usage of @ Component for MavenSession/MavenProject instead of @ Parameter

Task

  • [MPLUGIN-493] - Consistently evaluate skip parameter in MavenReport#canGenerateReport()
  • [MPLUGIN-498] - Move section rendering to separate methods

Dependency upgrade

3.10.2

Release Notes - Maven Plugin Tools - Version 3.10.2

Bug

Dependency upgrade

  • [MPLUGIN-485] - Upgrade Parent to 40
  • [MPLUGIN-487] - Bump org.codehaus.plexus:plexus-java from 1.1.2 to 1.2.0
  • [MPLUGIN-488] - Bump asmVersion from 9.5 to 9.6
  • [MPLUGIN-489] - Bump antVersion from 1.10.13 to 1.10.14
  • [MPLUGIN-490] - Bump org.jsoup:jsoup from 1.16.1 to 1.16.2
  • [MPLUGIN-491] - Bump org.codehaus.plexus:plexus-testing from 1.1.0 to 1.2.0

3.10.1

... (truncated)

Commits
  • 4178d33 [maven-release-plugin] prepare release maven-plugin-tools-3.11.0
  • 25d920f [MNG-5695] document Maven 3.2.5+ scoped components usage
  • 6418490 [MPLUGIN-495] WARNINGs based on usage of @​Component for MavenSession/MavenPro...
  • 8b93d12 Bump org.jsoup:jsoup from 1.17.1 to 1.17.2
  • f4973ac Bump org.assertj:assertj-core from 3.24.2 to 3.25.1
  • 7dd3a25 [MPLUGIN-499] Add deprecate description in parameters table (#250)
  • 9bb13f0 [MPLUGIN-492] Documentation for plugins in general: Goals comprises more than...
  • fc41218 [MPLUGIN-498] Move section rendering to separate methods
  • ed4774b [MPLUGIN-450] Require goalPrefix to be valid (#240)
  • 331cf42 [MPLUGIN-497] Upgrade components
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=org.apache.maven.plugin-tools:maven-plugin-annotations&package-manager=maven&previous-version=3.6.0&new-version=3.11.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR:
- `@ dependabot rebase` will rebase this PR
- `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it
- `@ dependabot merge` will merge this PR after your CI passes on it
- `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it
- `@ dependabot cancel merge` will cancel a previously requested merge and block automerging
- `@ dependabot reopen` will reopen this PR if it is closed
- `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually
- `@ dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency
- `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)
- `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)
- `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: David Li --- java/maven/module-info-compiler-maven-plugin/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/maven/module-info-compiler-maven-plugin/pom.xml b/java/maven/module-info-compiler-maven-plugin/pom.xml index 6881018933d3f..910fede33ce3b 100644 --- a/java/maven/module-info-compiler-maven-plugin/pom.xml +++ b/java/maven/module-info-compiler-maven-plugin/pom.xml @@ -66,7 +66,7 @@ org.apache.maven.plugin-tools maven-plugin-annotations - 3.6.0 + 3.11.0 provided From 2b3d071cd17458363cf1550c4396ce67a12ef6a5 Mon Sep 17 00:00:00 2001 From: Vibhatha Lakmal Abeykoon Date: Tue, 2 Apr 2024 12:46:00 +0530 Subject: [PATCH 39/81] GH-40684: [Java][Docs] JNI module debugging with IntelliJ (#40685) ### Rationale for this change Adding documentation for debugging JNI-based Java modules. ### What changes are included in this PR? Documentation update for developer docs for Java development. ### Are these changes tested? Locally built the docs and it shows the expected content. ### Are there any user-facing changes? N/A * GitHub Issue: #40684 Lead-authored-by: Vibhatha Abeykoon Co-authored-by: Vibhatha Lakmal Abeykoon Signed-off-by: David Li --- docs/source/developers/java/building.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/source/developers/java/building.rst b/docs/source/developers/java/building.rst index 27e2de97328c3..c059ff676efb2 100644 --- a/docs/source/developers/java/building.rst +++ b/docs/source/developers/java/building.rst @@ -347,6 +347,11 @@ Arrow repository, and update the following settings: * If using IntelliJ's Maven integration to build, you may need to change ```` to ``false`` in the pom.xml files due to an `IntelliJ bug `__. +* To enable debugging JNI-based modules like ``dataset``, + activate specific profiles in the Maven tab under "Profiles". + Ensure the profiles ``arrow-c-data``, ``arrow-jni``, ``generate-libs-cdata-all-os``, + ``generate-libs-jni-macos-linux``, and ``jdk11+`` are enabled, so that the + IDE can build them and enable debugging. You may not need to update all of these settings if you build/test with the IntelliJ Maven integration instead of with IntelliJ directly. From 096cdad5b434a6aa6ccf066efb894a8e05353309 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 2 Apr 2024 03:18:55 -0400 Subject: [PATCH 40/81] MINOR: [Java] Bump io.grpc:grpc-bom from 1.61.1 to 1.62.2 in /java (#40920) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [io.grpc:grpc-bom](https://github.com/grpc/grpc-java) from 1.61.1 to 1.62.2.
Release notes

Sourced from io.grpc:grpc-bom's releases.

v1.62.2

gRPC Java 1.62.2 Release Notes

Note that this is the initial 1.62.x release

API Changes

  • services: Remove io.grpc.services.BinaryLogs, which was deprecated since 2021. io.grpc.protobuf.services.BinaryLogs should be used instead (#10832).
  • Allow users outside of io.grpc.xds package to create custom xDS resources (#10834) (6d96e6588)

New Features

  • api:Add ClientTransportFilter. Similarly to ServerTransportFilter, this will provide an observability hook and it allows direct modification of the transport's attributes. (#10646)

Improvements

  • java_grpc_library.bzl: Add support for Auto Exec Groups (cb03bd234). This is mostly a behind-the-scenes change to adjust to the newer way Bazel operates
  • java_grpc_library.bzl: Support runfiles for protoc and the plugin (65a6b3bc2). Neither binary uses runfiles, but the task will be ready if they need to in the future
  • xds: Add EC key support for XdsChannelCredentials/XdsServerCredentials (100d5a55f)
  • binder:Change log level from WARNING to FINER for expected exception during close with error, to reduce log spamming (#10899) (7ba0718bb)

Bug Fixes

  • xds: Fix a bug in WeightedRoundRobinLoadBalancer policy that could raise NullPointerException and further cause channel panic when picking a subchannel. This bug can only be triggered when connection can not be established and the channel reports TRANSIENT_FAILURE state. (#10868)

Dependencies

  • The protoc plugin no longer supports macOS Big Sur (macOS 11). Binaries are now built using Monterey (macOS 12)

Acknowledgements

Commits
  • 3e993a9 Bump version to 1.62.1
  • 1da945b Update README etc to reference 1.62.1
  • 7089f04 Change GAE interop tests to use java11 runtime (#10933)
  • 597f26e Bump version to 1.62.1-SNAPSHOT
  • 10eb91f Bump version to 1.62.0
  • 28dffe5 Update README etc to reference 1.62.0
  • 5ba8b71 util: MultiChildLoadBalance.shutdown() log to FINE (#10935)
  • 1795348 Remove semi-circular dependency between core and util
  • 95b847e interop-testing: Use separate event loops in RetryTest
  • 7ba0718 Change log level from WARNING to FINER for expected exception (#10899)
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=io.grpc:grpc-bom&package-manager=maven&previous-version=1.61.1&new-version=1.62.2)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR:
- `@ dependabot rebase` will rebase this PR
- `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it
- `@ dependabot merge` will merge this PR after your CI passes on it
- `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it
- `@ dependabot cancel merge` will cancel a previously requested merge and block automerging
- `@ dependabot reopen` will reopen this PR if it is closed
- `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually
- `@ dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency
- `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)
- `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)
- `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: David Li --- java/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/pom.xml b/java/pom.xml index 610593580f720..bdefbea2d8787 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -34,7 +34,7 @@ 2.0.11 33.0.0-jre 4.1.108.Final - 1.61.1 + 1.62.2 3.23.1 2.17.0 3.4.0 From 42b49df0f3dc1586ad38c608ec93f382a4f4e3c4 Mon Sep 17 00:00:00 2001 From: Laurent Goujon Date: Tue, 2 Apr 2024 00:53:56 -0700 Subject: [PATCH 41/81] GH-40907: [Java][FlightSQL] Shade slf4j-api in JDBC driver (#40908) ### Rationale for this change FlightSQL JDBC Driver does not shade slfj4 api which may come into conflict into the version used by an application. If the application uses slf4j 1.x, it may cause the application slf4j backend to not be loaded properly. The change configured maven-shade-plugin to also shade slf4j-api. To make sure log messages are still visible, slf4j-jdk14 is included as well so that all messages will be redirected to `java.util.logging` framework. The application can use jul-to-slf4j adapter to redirect log messages back to slf4j. ### What changes are included in this PR? Overrides `Driver#getParentLogger()` to return the root logger for the JDBC driver (which is `org.apache.arrow.driver.jdbc`). To make sure shaded dependencies loggers are included as well, change relocation from `cfjd.` to `org.apache.arrow.driver.jdbc.shaded. `(or `oaadj` for native libraries) ### Are these changes tested? Verifying that slf4j-api is shaded along with the other relocation changes are covered by `ITDriverJarValidation` ### Are there any user-facing changes? Yes. Driver will not expose directly slf4j api and the logger names for the shaded dependencies have been updated. For applications which were relying on configuring directly a slf4j logging backend for the driver, they may need to include `org.slf4j:slf4-api` and `org.slf4j:jul-to-slf4j` for logging configuration to work. 
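As a minimal sketch of that application-side setup (assuming the application already depends on `org.slf4j:jul-to-slf4j` and some slf4j backend; `SLF4JBridgeHandler` is the bridge's own API, not part of the driver or of this patch):

```java
import org.slf4j.bridge.SLF4JBridgeHandler;

public class DriverLoggingSetup {
  public static void main(String[] args) {
    // Drop the default java.util.logging handlers so driver records are not printed twice.
    SLF4JBridgeHandler.removeHandlersForRootLogger();
    // Route java.util.logging records, including the shaded driver's
    // 'org.apache.arrow.driver.jdbc' loggers, back to the application's slf4j backend.
    SLF4JBridgeHandler.install();

    // ... load the Flight SQL JDBC driver and open connections as usual ...
  }
}
```

Running this once at application startup is enough; inside the shaded jar the driver keeps logging through `java.util.logging` via the bundled `slf4j-jdk14` binding.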
* GitHub Issue: #40907 Authored-by: Laurent Goujon Signed-off-by: David Li --- .../driver/jdbc/ArrowFlightJdbcDriver.java | 10 +++++++++- java/flight/flight-sql-jdbc-driver/pom.xml | 20 +++++++++++-------- .../driver/jdbc/ITDriverJarValidation.java | 5 +---- java/pom.xml | 5 +++++ 4 files changed, 27 insertions(+), 13 deletions(-) diff --git a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/ArrowFlightJdbcDriver.java b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/ArrowFlightJdbcDriver.java index 183e3d5c7b055..d0daaa8bda155 100644 --- a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/ArrowFlightJdbcDriver.java +++ b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/ArrowFlightJdbcDriver.java @@ -31,6 +31,7 @@ import java.util.Objects; import java.util.Optional; import java.util.Properties; +import java.util.logging.Logger; import org.apache.arrow.driver.jdbc.utils.ArrowFlightConnectionConfigImpl.ArrowFlightConnectionProperty; import org.apache.arrow.driver.jdbc.utils.UrlParser; @@ -58,7 +59,7 @@ public class ArrowFlightJdbcDriver extends UnregisteredDriver { // Netty requires some extra properties to unlock some native memory management api // Setting this property if not already set externally // This has to be done before any netty class is being loaded - final String key = "cfjd.io.netty.tryReflectionSetAccessible"; + final String key = "io.netty.tryReflectionSetAccessible"; final String tryReflectionSetAccessible = System.getProperty(key); if (tryReflectionSetAccessible == null) { System.setProperty(key, Boolean.TRUE.toString()); @@ -67,6 +68,13 @@ public class ArrowFlightJdbcDriver extends UnregisteredDriver { new ArrowFlightJdbcDriver().register(); } + @Override + public Logger getParentLogger() { + // Return the logger associated with the driver package ('org.apache.arrow.driver.jdbc') + // When packaged in flight-sql-jdbc-driver, it will also apply to all shaded dependencies + return Logger.getLogger(getClass().getPackage().getName()); + } + @Override public ArrowFlightConnection connect(final String url, final Properties info) throws SQLException { diff --git a/java/flight/flight-sql-jdbc-driver/pom.xml b/java/flight/flight-sql-jdbc-driver/pom.xml index 53d929afa781c..2157c09eaf583 100644 --- a/java/flight/flight-sql-jdbc-driver/pom.xml +++ b/java/flight/flight-sql-jdbc-driver/pom.xml @@ -97,6 +97,11 @@ slf4j-api runtime + + org.slf4j + slf4j-jdk14 + runtime + io.netty @@ -190,17 +195,16 @@ com. - cfjd.com. + org.apache.arrow.driver.jdbc.shaded.com. com.sun.** org. - cfjd.org. + org.apache.arrow.driver.jdbc.shaded.org. org.apache.arrow.driver.jdbc.** - org.slf4j.** org.apache.arrow.flight.name org.apache.arrow.flight.version @@ -210,24 +214,24 @@ io. - cfjd.io. + org.apache.arrow.driver.jdbc.shaded.io. net. - cfjd.net. + org.apache.arrow.driver.jdbc.shaded.net. mozilla. - cfjd.mozilla. + org.apache.arrow.driver.jdbc.shaded.mozilla. 
META-INF.native.libnetty_ - META-INF.native.libcfjd_netty_ + META-INF.native.liboaadj_netty_ META-INF.native.netty_ - META-INF.native.cfjd_netty_ + META-INF.native.oaadj_netty_ diff --git a/java/flight/flight-sql-jdbc-driver/src/test/java/org/apache/arrow/driver/jdbc/ITDriverJarValidation.java b/java/flight/flight-sql-jdbc-driver/src/test/java/org/apache/arrow/driver/jdbc/ITDriverJarValidation.java index fdb580d493abf..0cae2fd5f5cb8 100644 --- a/java/flight/flight-sql-jdbc-driver/src/test/java/org/apache/arrow/driver/jdbc/ITDriverJarValidation.java +++ b/java/flight/flight-sql-jdbc-driver/src/test/java/org/apache/arrow/driver/jdbc/ITDriverJarValidation.java @@ -42,8 +42,7 @@ /** * Check the content of the JDBC driver jar * - * After shading everything should be either under org.apache.arrow.driver.jdbc., - * org.slf4j., or cfjd. packages + * After shading everything should be either under org.apache.arrow.driver.jdbc. package */ public class ITDriverJarValidation { /** @@ -57,8 +56,6 @@ public class ITDriverJarValidation { */ public static final Set ALLOWED_PREFIXES = ImmutableSet.of( "org/apache/arrow/driver/jdbc/", - "cfjd/", - "org/slf4j/", "META-INF/"); /** diff --git a/java/pom.xml b/java/pom.xml index bdefbea2d8787..8e9ddd5480ea8 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -680,6 +680,11 @@ slf4j-api ${dep.slf4j.version} + + org.slf4j + slf4j-jdk14 + ${dep.slf4j.version} + javax.annotation javax.annotation-api From 15522931377724c4e5ce6cc6151f88021de55a27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Tue, 2 Apr 2024 12:50:46 +0200 Subject: [PATCH 42/81] GH-40833: [Docs][Release] Make explicit in the documentation that verifying binaries is not required in order to case a vote (#40834) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change Based on the discussion on https://lists.apache.org/thread/ogp9dthp124oq0fmvlyzvjorjsyom03v making clear that binaries verification are not required in order to cast a positive vote for the release. ### What changes are included in this PR? Document the required process ### Are these changes tested? preview-docs job on archery will be run ### Are there any user-facing changes? No * GitHub Issue: #40833 Authored-by: Raúl Cumplido Signed-off-by: Raúl Cumplido --- .../developers/release_verification.rst | 25 +++++++++++++++---- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/docs/source/developers/release_verification.rst b/docs/source/developers/release_verification.rst index 53c8f54e5b5bd..ec474a5729b64 100644 --- a/docs/source/developers/release_verification.rst +++ b/docs/source/developers/release_verification.rst @@ -44,20 +44,36 @@ Linux and macOS In order to run the verification script either for the source release or the binary artifacts see the following guidelines: +Required source verification +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Individuals are REQUIRED to download all signed source code packages onto their +own hardware, validate all cryptographic signatures, compile as provided, +and test the result on their own platform in order to cast a +1 vote. + .. 
code-block:: # this will create and automatically clean up a temporary directory for the verification environment and will run the source verification TEST_DEFAULT=0 TEST_SOURCE=1 verify-release-candidate.sh $VERSION $RC_NUM - # this will create and automatically clean up a temporary directory for the verification environment and will run the binary verification - TEST_DEFAULT=0 TEST_BINARIES=1 dev/release/verify-release-candidate.sh $VERSION $RC_NUM - # to verify only certain implementations use the TEST_DEFAULT=0 and TEST_* variables # here are a couple of examples, but see the source code for the available options TEST_DEFAULT=0 TEST_CPP=1 verify-release-candidate.sh $VERSION $RC_NUM # only C++ tests TEST_DEFAULT=0 TEST_CPP=1 TEST_PYTHON=1 verify-release-candidate.sh $VERSION $RC_NUM # C++ and Python tests TEST_DEFAULT=0 TEST_INTEGRATION_CPP=1 TEST_INTEGRATION_JAVA=1 verify-release-candidate.sh $VERSION $RC_NUM # C++ and Java integration tests - + +Binary verification +^^^^^^^^^^^^^^^^^^^ + +The binaries are generated from the source that has been verified. Those binaries are +tested on CI but can be tested locally for further validation. It is not necessary to +test them in order to cast a positive vote. + +.. code-block:: + + # this will create and automatically clean up a temporary directory for the verification environment and will run the binary verification + TEST_DEFAULT=0 TEST_BINARIES=1 dev/release/verify-release-candidate.sh $VERSION $RC_NUM + # to verify certain binaries use the TEST_* variables as: TEST_DEFAULT=0 TEST_WHEELS=1 verify-release-candidate.sh $VERSION $RC_NUM # only Wheels TEST_DEFAULT=0 TEST_APT=1 verify-release-candidate.sh $VERSION $RC_NUM # only APT packages @@ -130,7 +146,6 @@ As an example: I've verified successfully the sources and binaries with: TEST_DEFAULT=0 TEST_SOURCE=1 dev/release/verify-release-candidate.sh 15.0.0 1 - TEST_DEFAULT=0 TEST_BINARIES=1 dev/release/verify-release-candidate.sh 15.0.0 1 with: * Python 3.10.12 * gcc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 From 5ddef639dfcaf62a02ed8c8d63103f22ae41a5ee Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 2 Apr 2024 04:19:03 -0700 Subject: [PATCH 43/81] GH-40038: [Java] Export non empty offset buffer for variable-size layout through C Data Interface (#40043) ### Rationale for this change We encountered an error when exchanging string array from Java to Rust through Arrow C data interface. At Rust side, it complains that the buffer at position 1 (offset buffer) is null. After tracing down and some debugging, it looks like the issue is Java Arrow `BaseVariableWidthVector` class assigns an empty offset buffer if the array is empty (value count 0). According to Arrow [spec](https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-layout) for variable size binary layout: > The offsets buffer contains length + 1 signed integers ... So for an empty string array, its offset buffer should be a buffer with one element (generally it is `0`). ### What changes are included in this PR? This patch replaces current empty offset buffer in variable-size layout vector classes when exporting arrays through C Data Interface. ### Are these changes tested? Added test cases. ### Are there any user-facing changes? 
No * Closes: #40038 Authored-by: Liang-Chi Hsieh Signed-off-by: David Li --- .../org/apache/arrow/c/ArrayExporter.java | 10 +---- .../org/apache/arrow/c/RoundtripTest.java | 18 +++++++- .../vector/BaseLargeVariableWidthVector.java | 35 ++++++++++++++-- .../arrow/vector/BaseVariableWidthVector.java | 35 ++++++++++++++-- .../org/apache/arrow/vector/FieldVector.java | 41 +++++++++++++++++++ .../complex/BaseRepeatedValueVector.java | 7 ++-- .../arrow/vector/complex/LargeListVector.java | 29 +++++++++++-- .../arrow/vector/complex/ListVector.java | 22 +++++++++- .../arrow/vector/complex/MapVector.java | 2 +- 9 files changed, 174 insertions(+), 25 deletions(-) diff --git a/java/c/src/main/java/org/apache/arrow/c/ArrayExporter.java b/java/c/src/main/java/org/apache/arrow/c/ArrayExporter.java index d6479a3ba4ca8..05ab3e5ff6063 100644 --- a/java/c/src/main/java/org/apache/arrow/c/ArrayExporter.java +++ b/java/c/src/main/java/org/apache/arrow/c/ArrayExporter.java @@ -98,15 +98,7 @@ void export(ArrowArray array, FieldVector vector, DictionaryProvider dictionaryP if (buffers != null) { data.buffers = new ArrayList<>(buffers.size()); data.buffers_ptrs = allocator.buffer((long) buffers.size() * Long.BYTES); - for (ArrowBuf arrowBuf : buffers) { - if (arrowBuf != null) { - arrowBuf.getReferenceManager().retain(); - data.buffers_ptrs.writeLong(arrowBuf.memoryAddress()); - } else { - data.buffers_ptrs.writeLong(NULL); - } - data.buffers.add(arrowBuf); - } + vector.exportCDataBuffers(data.buffers, data.buffers_ptrs, NULL); } if (dictionaryEncoding != null) { diff --git a/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java b/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java index a7e3cde2e7b4b..768394ef7ab60 100644 --- a/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java +++ b/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java @@ -33,6 +33,7 @@ import java.util.Map; import java.util.UUID; import java.util.stream.Collectors; +import java.util.stream.IntStream; import java.util.stream.Stream; import org.apache.arrow.memory.ArrowBuf; @@ -165,10 +166,25 @@ VectorSchemaRoot vectorSchemaRootRoundtrip(VectorSchemaRoot root) { } boolean roundtrip(FieldVector vector, Class clazz) { + List fieldBuffers = vector.getFieldBuffers(); + List orgRefCnts = fieldBuffers.stream().map(buf -> buf.refCnt()).collect(Collectors.toList()); + long orgMemorySize = allocator.getAllocatedMemory(); + + boolean result = false; try (ValueVector imported = vectorRoundtrip(vector)) { assertTrue(clazz.isInstance(imported), String.format("expected %s but was %s", clazz, imported.getClass())); - return VectorEqualsVisitor.vectorEquals(vector, imported); + result = VectorEqualsVisitor.vectorEquals(vector, imported); } + + // Check that the ref counts of the buffers are the same after the roundtrip + IntStream.range(0, orgRefCnts.size()).forEach(i -> { + ArrowBuf buf = fieldBuffers.get(i); + assertEquals(buf.refCnt(), orgRefCnts.get(i)); + }); + + assertEquals(orgMemorySize, allocator.getAllocatedMemory()); + + return result; } @Test diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java index c239edbcc3c29..34c9e73a0b072 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java @@ -336,6 +336,34 @@ public List getFieldBuffers() { return result; } + /** + * Export 
the buffers of the fields for C Data Interface. This method traverse the buffers and + * export buffer and buffer's memory address into a list of buffers and a pointer to the list of buffers. + */ + @Override + public void exportCDataBuffers(List buffers, ArrowBuf buffersPtr, long nullValue) { + // before flight/IPC, we must bring the vector to a consistent state. + // this is because, it is possible that the offset buffers of some trailing values + // are not updated. this may cause some data in the data buffer being lost. + // for details, please see TestValueVector#testUnloadVariableWidthVector. + fillHoles(valueCount); + + exportBuffer(validityBuffer, buffers, buffersPtr, nullValue, true); + + if (offsetBuffer.capacity() == 0) { + // Empty offset buffer is allowed for historical reason. + // To export it through C Data interface, we need to allocate a buffer with one offset. + // We set `retain = false` to explicitly not increase the ref count for the exported buffer. + // The ref count of the newly created buffer (i.e., 1) already represents the usage + // at imported side. + exportBuffer(allocateOffsetBuffer(OFFSET_WIDTH), buffers, buffersPtr, nullValue, false); + } else { + exportBuffer(offsetBuffer, buffers, buffersPtr, nullValue, true); + } + + exportBuffer(valueBuffer, buffers, buffersPtr, nullValue, true); + } + /** * Set the reader and writer indexes for the inner buffers. */ @@ -456,10 +484,11 @@ private void allocateBytes(final long valueBufferSize, final int valueCount) { } /* allocate offset buffer */ - private void allocateOffsetBuffer(final long size) { - offsetBuffer = allocator.buffer(size); + private ArrowBuf allocateOffsetBuffer(final long size) { + ArrowBuf offsetBuffer = allocator.buffer(size); offsetBuffer.readerIndex(0); initOffsetBuffer(); + return offsetBuffer; } /* allocate validity buffer */ @@ -760,7 +789,7 @@ private void splitAndTransferOffsetBuffer(int startIndex, int length, BaseLargeV final long start = getStartOffset(startIndex); final long end = getStartOffset(startIndex + length); final long dataLength = end - start; - target.allocateOffsetBuffer((long) (length + 1) * OFFSET_WIDTH); + target.offsetBuffer = target.allocateOffsetBuffer((long) (length + 1) * OFFSET_WIDTH); for (int i = 0; i < length + 1; i++) { final long relativeSourceOffset = getStartOffset(startIndex + i) - start; target.offsetBuffer.setLong((long) i * OFFSET_WIDTH, relativeSourceOffset); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java index 4cf495a349f02..6b82dd7729a6c 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java @@ -355,6 +355,34 @@ public List getFieldBuffers() { return result; } + /** + * Export the buffers of the fields for C Data Interface. This method traverse the buffers and + * export buffer and buffer's memory address into a list of buffers and a pointer to the list of buffers. + */ + @Override + public void exportCDataBuffers(List buffers, ArrowBuf buffersPtr, long nullValue) { + // before flight/IPC, we must bring the vector to a consistent state. + // this is because, it is possible that the offset buffers of some trailing values + // are not updated. this may cause some data in the data buffer being lost. + // for details, please see TestValueVector#testUnloadVariableWidthVector. 
+ fillHoles(valueCount); + + exportBuffer(validityBuffer, buffers, buffersPtr, nullValue, true); + + if (offsetBuffer.capacity() == 0) { + // Empty offset buffer is allowed for historical reason. + // To export it through C Data interface, we need to allocate a buffer with one offset. + // We set `retain = false` to explicitly not increase the ref count for the exported buffer. + // The ref count of the newly created buffer (i.e., 1) already represents the usage + // at imported side. + exportBuffer(allocateOffsetBuffer(OFFSET_WIDTH), buffers, buffersPtr, nullValue, false); + } else { + exportBuffer(offsetBuffer, buffers, buffersPtr, nullValue, true); + } + + exportBuffer(valueBuffer, buffers, buffersPtr, nullValue, true); + } + /** * Set the reader and writer indexes for the inner buffers. */ @@ -476,11 +504,12 @@ private void allocateBytes(final long valueBufferSize, final int valueCount) { } /* allocate offset buffer */ - private void allocateOffsetBuffer(final long size) { + private ArrowBuf allocateOffsetBuffer(final long size) { final int curSize = (int) size; - offsetBuffer = allocator.buffer(curSize); + ArrowBuf offsetBuffer = allocator.buffer(curSize); offsetBuffer.readerIndex(0); initOffsetBuffer(); + return offsetBuffer; } /* allocate validity buffer */ @@ -805,7 +834,7 @@ private void splitAndTransferOffsetBuffer(int startIndex, int length, BaseVariab (1 + length) * ((long) OFFSET_WIDTH)); target.offsetBuffer = transferBuffer(slicedOffsetBuffer, target.allocator); } else { - target.allocateOffsetBuffer((long) (length + 1) * OFFSET_WIDTH); + target.offsetBuffer = target.allocateOffsetBuffer((long) (length + 1) * OFFSET_WIDTH); for (int i = 0; i < length + 1; i++) { final int relativeSourceOffset = getStartOffset(startIndex + i) - start; target.offsetBuffer.setInt((long) i * OFFSET_WIDTH, relativeSourceOffset); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/FieldVector.java b/java/vector/src/main/java/org/apache/arrow/vector/FieldVector.java index 299828f6d9d08..04229563bcc67 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/FieldVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/FieldVector.java @@ -60,6 +60,47 @@ public interface FieldVector extends ValueVector { */ List getFieldBuffers(); + /** + * Export a given buffer and its memory address into a list of buffers and a pointer to the list of buffers. + * + * @param buffer the buffer to export + * @param buffers the list of buffers + * @param buffersPtr the pointer to the list of buffers + * @param nullValue the null value to use for null buffer + * @param retain whether to retain the buffer when exporting + */ + default void exportBuffer( + ArrowBuf buffer, + List buffers, + ArrowBuf buffersPtr, + long nullValue, + boolean retain) { + if (buffer != null) { + if (retain) { + buffer.getReferenceManager().retain(); + } + buffersPtr.writeLong(buffer.memoryAddress()); + } else { + buffersPtr.writeLong(nullValue); + } + buffers.add(buffer); + } + + /** + * Export the buffers of the fields for C Data Interface. This method traverse the buffers and + * export buffer and buffer's memory address into a list of buffers and a pointer to the list of buffers. + * + * By default, when exporting a buffer, it will increase ref count for exported buffer that counts + * the usage at imported side. 
+ */ + default void exportCDataBuffers(List buffers, ArrowBuf buffersPtr, long nullValue) { + List fieldBuffers = getFieldBuffers(); + + for (ArrowBuf arrowBuf : fieldBuffers) { + exportBuffer(arrowBuf, buffers, buffersPtr, nullValue, true); + } + } + /** * Get the inner vectors. * diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java index 8ba2e48dc2fa3..7906d90c2fff0 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java @@ -83,7 +83,7 @@ public String getName() { public boolean allocateNewSafe() { boolean dataAlloc = false; try { - allocateOffsetBuffer(offsetAllocationSizeInBytes); + offsetBuffer = allocateOffsetBuffer(offsetAllocationSizeInBytes); dataAlloc = vector.allocateNewSafe(); } catch (Exception e) { e.printStackTrace(); @@ -97,12 +97,13 @@ public boolean allocateNewSafe() { return dataAlloc; } - protected void allocateOffsetBuffer(final long size) { + protected ArrowBuf allocateOffsetBuffer(final long size) { final int curSize = (int) size; - offsetBuffer = allocator.buffer(curSize); + ArrowBuf offsetBuffer = allocator.buffer(curSize); offsetBuffer.readerIndex(0); offsetAllocationSizeInBytes = curSize; offsetBuffer.setZero(0, offsetBuffer.capacity()); + return offsetBuffer; } @Override diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListVector.java index b934cbd81db16..b29b72ad2b1a0 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListVector.java @@ -287,6 +287,26 @@ public List getFieldBuffers() { return result; } + /** + * Export the buffers of the fields for C Data Interface. This method traverse the buffers and + * export buffer and buffer's memory address into a list of buffers and a pointer to the list of buffers. + */ + @Override + public void exportCDataBuffers(List buffers, ArrowBuf buffersPtr, long nullValue) { + exportBuffer(validityBuffer, buffers, buffersPtr, nullValue, true); + + if (offsetBuffer.capacity() == 0) { + // Empty offset buffer is allowed for historical reason. + // To export it through C Data interface, we need to allocate a buffer with one offset. + // We set `retain = false` to explicitly not increase the ref count for the exported buffer. + // The ref count of the newly created buffer (i.e., 1) already represents the usage + // at imported side. + exportBuffer(allocateOffsetBuffer(OFFSET_WIDTH), buffers, buffersPtr, nullValue, false); + } else { + exportBuffer(offsetBuffer, buffers, buffersPtr, nullValue, true); + } + } + /** * Set the reader and writer indexes for the inner buffers. 
*/ @@ -343,7 +363,7 @@ public boolean allocateNewSafe() { /* allocate offset and data buffer */ boolean dataAlloc = false; try { - allocateOffsetBuffer(offsetAllocationSizeInBytes); + offsetBuffer = allocateOffsetBuffer(offsetAllocationSizeInBytes); dataAlloc = vector.allocateNewSafe(); } catch (Exception e) { e.printStackTrace(); @@ -371,11 +391,12 @@ private void allocateValidityBuffer(final long size) { validityBuffer.setZero(0, validityBuffer.capacity()); } - protected void allocateOffsetBuffer(final long size) { - offsetBuffer = allocator.buffer(size); + protected ArrowBuf allocateOffsetBuffer(final long size) { + ArrowBuf offsetBuffer = allocator.buffer(size); offsetBuffer.readerIndex(0); offsetAllocationSizeInBytes = size; offsetBuffer.setZero(0, offsetBuffer.capacity()); + return offsetBuffer; } /** @@ -656,7 +677,7 @@ public void splitAndTransfer(int startIndex, int length) { final long startPoint = offsetBuffer.getLong((long) startIndex * OFFSET_WIDTH); final long sliceLength = offsetBuffer.getLong((long) (startIndex + length) * OFFSET_WIDTH) - startPoint; to.clear(); - to.allocateOffsetBuffer((length + 1) * OFFSET_WIDTH); + to.offsetBuffer = to.allocateOffsetBuffer((length + 1) * OFFSET_WIDTH); /* splitAndTransfer offset buffer */ for (int i = 0; i < length + 1; i++) { final long relativeOffset = offsetBuffer.getLong((long) (startIndex + i) * OFFSET_WIDTH) - startPoint; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java index 7df659e4cc9da..91275ae73d2c3 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java @@ -242,6 +242,26 @@ public List getFieldBuffers() { return result; } + /** + * Export the buffers of the fields for C Data Interface. This method traverse the buffers and + * export buffer and buffer's memory address into a list of buffers and a pointer to the list of buffers. + */ + @Override + public void exportCDataBuffers(List buffers, ArrowBuf buffersPtr, long nullValue) { + exportBuffer(validityBuffer, buffers, buffersPtr, nullValue, true); + + if (offsetBuffer.capacity() == 0) { + // Empty offset buffer is allowed for historical reason. + // To export it through C Data interface, we need to allocate a buffer with one offset. + // We set `retain = false` to explicitly not increase the ref count for the exported buffer. + // The ref count of the newly created buffer (i.e., 1) already represents the usage + // at imported side. + exportBuffer(allocateOffsetBuffer(OFFSET_WIDTH), buffers, buffersPtr, nullValue, false); + } else { + exportBuffer(offsetBuffer, buffers, buffersPtr, nullValue, true); + } + } + /** * Set the reader and writer indexes for the inner buffers. 
*/ @@ -535,7 +555,7 @@ public void splitAndTransfer(int startIndex, int length) { final int startPoint = offsetBuffer.getInt(startIndex * OFFSET_WIDTH); final int sliceLength = offsetBuffer.getInt((startIndex + length) * OFFSET_WIDTH) - startPoint; to.clear(); - to.allocateOffsetBuffer((length + 1) * OFFSET_WIDTH); + to.offsetBuffer = to.allocateOffsetBuffer((length + 1) * OFFSET_WIDTH); /* splitAndTransfer offset buffer */ for (int i = 0; i < length + 1; i++) { final int relativeOffset = offsetBuffer.getInt((startIndex + i) * OFFSET_WIDTH) - startPoint; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java index e082b2f43be64..c49f138b64c6b 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java @@ -209,7 +209,7 @@ public void splitAndTransfer(int startIndex, int length) { final int startPoint = offsetBuffer.getInt(startIndex * OFFSET_WIDTH); final int sliceLength = offsetBuffer.getInt((startIndex + length) * OFFSET_WIDTH) - startPoint; to.clear(); - to.allocateOffsetBuffer((length + 1) * OFFSET_WIDTH); + to.offsetBuffer = to.allocateOffsetBuffer((length + 1) * OFFSET_WIDTH); /* splitAndTransfer offset buffer */ for (int i = 0; i < length + 1; i++) { final int relativeOffset = offsetBuffer.getInt((startIndex + i) * OFFSET_WIDTH) - startPoint; From 8163d026b3c56253d9e33c0129fac5d9ba573c53 Mon Sep 17 00:00:00 2001 From: ZhangHuiGui <106943008+ZhangHuiGui@users.noreply.github.com> Date: Wed, 3 Apr 2024 00:01:52 +0800 Subject: [PATCH 44/81] GH-40431: [C++] Move key_hash/key_map/light_array related files to internal for prevent using by users (#40484) ### Rationale for this change These files expose implementation details and APIs that are not meant for third-party use. This PR explicitly marks them internal, which also avoids having them installed. ### Are these changes tested? By existing builds and tests. ### Are there any user-facing changes? No, except hiding some header files that were not supposed to be included externally. 
* GitHub Issue: #40431 Lead-authored-by: ZhangHuiGui Co-authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/src/arrow/CMakeLists.txt | 10 +++++----- cpp/src/arrow/acero/asof_join_node.cc | 4 ++-- cpp/src/arrow/acero/bloom_filter_test.cc | 2 +- cpp/src/arrow/acero/hash_join_node.cc | 2 +- cpp/src/arrow/acero/hash_join_node.h | 3 ++- cpp/src/arrow/acero/schema_util.h | 14 +++++++------- cpp/src/arrow/acero/swiss_join.cc | 2 +- cpp/src/arrow/acero/swiss_join_internal.h | 4 ++-- .../compute/{key_hash.cc => key_hash_internal.cc} | 4 ++-- .../compute/{key_hash.h => key_hash_internal.h} | 2 +- ...{key_hash_avx2.cc => key_hash_internal_avx2.cc} | 2 +- cpp/src/arrow/compute/key_hash_test.cc | 2 +- .../compute/{key_map.cc => key_map_internal.cc} | 2 +- .../compute/{key_map.h => key_map_internal.h} | 0 .../{key_map_avx2.cc => key_map_internal_avx2.cc} | 2 +- .../{light_array.cc => light_array_internal.cc} | 2 +- .../{light_array.h => light_array_internal.h} | 0 cpp/src/arrow/compute/light_array_test.cc | 2 +- cpp/src/arrow/compute/row/compare_internal.h | 2 +- cpp/src/arrow/compute/row/encode_internal.h | 4 ++-- cpp/src/arrow/compute/row/grouper.cc | 4 ++-- cpp/src/arrow/compute/row/row_internal.h | 2 +- cpp/src/arrow/compute/util.cc | 4 ++-- cpp/src/arrow/compute/util.h | 8 ++++++-- 24 files changed, 44 insertions(+), 39 deletions(-) rename cpp/src/arrow/compute/{key_hash.cc => key_hash_internal.cc} (99%) rename cpp/src/arrow/compute/{key_hash.h => key_hash_internal.h} (99%) rename cpp/src/arrow/compute/{key_hash_avx2.cc => key_hash_internal_avx2.cc} (99%) rename cpp/src/arrow/compute/{key_map.cc => key_map_internal.cc} (99%) rename cpp/src/arrow/compute/{key_map.h => key_map_internal.h} (100%) rename cpp/src/arrow/compute/{key_map_avx2.cc => key_map_internal_avx2.cc} (99%) rename cpp/src/arrow/compute/{light_array.cc => light_array_internal.cc} (99%) rename cpp/src/arrow/compute/{light_array.h => light_array_internal.h} (100%) diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 4bf1008af4cd0..617bfedabf373 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -689,9 +689,9 @@ set(ARROW_COMPUTE_SRCS compute/function.cc compute/function_internal.cc compute/kernel.cc - compute/key_hash.cc - compute/key_map.cc - compute/light_array.cc + compute/key_hash_internal.cc + compute/key_map_internal.cc + compute/light_array_internal.cc compute/ordering.cc compute/registry.cc compute/kernels/codegen_internal.cc @@ -717,8 +717,8 @@ set(ARROW_COMPUTE_SRCS compute/row/row_internal.cc compute/util.cc) -append_runtime_avx2_src(ARROW_COMPUTE_SRCS compute/key_hash_avx2.cc) -append_runtime_avx2_bmi2_src(ARROW_COMPUTE_SRCS compute/key_map_avx2.cc) +append_runtime_avx2_src(ARROW_COMPUTE_SRCS compute/key_hash_internal_avx2.cc) +append_runtime_avx2_bmi2_src(ARROW_COMPUTE_SRCS compute/key_map_internal_avx2.cc) append_runtime_avx2_src(ARROW_COMPUTE_SRCS compute/row/compare_internal_avx2.cc) append_runtime_avx2_src(ARROW_COMPUTE_SRCS compute/row/encode_internal_avx2.cc) append_runtime_avx2_bmi2_src(ARROW_COMPUTE_SRCS compute/util_avx2.cc) diff --git a/cpp/src/arrow/acero/asof_join_node.cc b/cpp/src/arrow/acero/asof_join_node.cc index cf0d475c1d770..48cc83dd3d6a9 100644 --- a/cpp/src/arrow/acero/asof_join_node.cc +++ b/cpp/src/arrow/acero/asof_join_node.cc @@ -45,8 +45,8 @@ #include "arrow/compute/function_internal.h" #endif #include "arrow/acero/time_series_util.h" -#include "arrow/compute/key_hash.h" -#include "arrow/compute/light_array.h" +#include 
"arrow/compute/key_hash_internal.h" +#include "arrow/compute/light_array_internal.h" #include "arrow/record_batch.h" #include "arrow/result.h" #include "arrow/status.h" diff --git a/cpp/src/arrow/acero/bloom_filter_test.cc b/cpp/src/arrow/acero/bloom_filter_test.cc index bad331cfd99d1..a2d6e9575a1aa 100644 --- a/cpp/src/arrow/acero/bloom_filter_test.cc +++ b/cpp/src/arrow/acero/bloom_filter_test.cc @@ -27,7 +27,7 @@ #include "arrow/acero/task_util.h" #include "arrow/acero/test_util_internal.h" #include "arrow/acero/util.h" -#include "arrow/compute/key_hash.h" +#include "arrow/compute/key_hash_internal.h" #include "arrow/util/bitmap_ops.h" #include "arrow/util/config.h" #include "arrow/util/cpu_info.h" diff --git a/cpp/src/arrow/acero/hash_join_node.cc b/cpp/src/arrow/acero/hash_join_node.cc index c0179fd160e4e..b49364300dac8 100644 --- a/cpp/src/arrow/acero/hash_join_node.cc +++ b/cpp/src/arrow/acero/hash_join_node.cc @@ -27,7 +27,7 @@ #include "arrow/acero/options.h" #include "arrow/acero/schema_util.h" #include "arrow/acero/util.h" -#include "arrow/compute/key_hash.h" +#include "arrow/compute/key_hash_internal.h" #include "arrow/util/checked_cast.h" #include "arrow/util/future.h" #include "arrow/util/thread_pool.h" diff --git a/cpp/src/arrow/acero/hash_join_node.h b/cpp/src/arrow/acero/hash_join_node.h index cca64d59830b2..ad60019ceabc4 100644 --- a/cpp/src/arrow/acero/hash_join_node.h +++ b/cpp/src/arrow/acero/hash_join_node.h @@ -17,6 +17,7 @@ #pragma once +#include #include #include "arrow/acero/options.h" @@ -88,7 +89,7 @@ class ARROW_ACERO_EXPORT HashJoinSchema { const Expression& filter); bool PayloadIsEmpty(int side) { - ARROW_DCHECK(side == 0 || side == 1); + assert(side == 0 || side == 1); return proj_maps[side].num_cols(HashJoinProjection::PAYLOAD) == 0; } diff --git a/cpp/src/arrow/acero/schema_util.h b/cpp/src/arrow/acero/schema_util.h index 6760022feb4be..db3076a58841a 100644 --- a/cpp/src/arrow/acero/schema_util.h +++ b/cpp/src/arrow/acero/schema_util.h @@ -17,13 +17,13 @@ #pragma once +#include #include #include #include #include -#include "arrow/compute/light_array.h" // for KeyColumnMetadata -#include "arrow/type.h" // for DataType, FieldRef, Field and Schema +#include "arrow/type.h" // for DataType, FieldRef, Field and Schema namespace arrow { @@ -47,8 +47,8 @@ struct SchemaProjectionMap { const int* source_to_base; const int* base_to_target; inline int get(int i) const { - ARROW_DCHECK(i >= 0 && i < num_cols); - ARROW_DCHECK(source_to_base[i] != kMissingField); + assert(i >= 0 && i < num_cols); + assert(source_to_base[i] != kMissingField); return base_to_target[source_to_base[i]]; } }; @@ -66,7 +66,7 @@ class SchemaProjectionMaps { Status Init(ProjectionIdEnum full_schema_handle, const Schema& schema, const std::vector& projection_handles, const std::vector*>& projections) { - ARROW_DCHECK(projection_handles.size() == projections.size()); + assert(projection_handles.size() == projections.size()); ARROW_RETURN_NOT_OK(RegisterSchema(full_schema_handle, schema)); for (size_t i = 0; i < projections.size(); ++i) { ARROW_RETURN_NOT_OK( @@ -174,7 +174,7 @@ class SchemaProjectionMaps { } } // We should never get here - ARROW_DCHECK(false); + assert(false); return -1; } @@ -207,7 +207,7 @@ class SchemaProjectionMaps { break; } } - ARROW_DCHECK(field_id != SchemaProjectionMap::kMissingField); + assert(field_id != SchemaProjectionMap::kMissingField); mapping[i] = field_id; inverse_mapping[field_id] = i; } diff --git a/cpp/src/arrow/acero/swiss_join.cc 
b/cpp/src/arrow/acero/swiss_join.cc index 68b0e37b01aa9..61c8bfe95414e 100644 --- a/cpp/src/arrow/acero/swiss_join.cc +++ b/cpp/src/arrow/acero/swiss_join.cc @@ -25,7 +25,7 @@ #include "arrow/acero/util.h" #include "arrow/array/util.h" // MakeArrayFromScalar #include "arrow/compute/kernels/row_encoder_internal.h" -#include "arrow/compute/key_hash.h" +#include "arrow/compute/key_hash_internal.h" #include "arrow/compute/row/compare_internal.h" #include "arrow/compute/row/encode_internal.h" #include "arrow/util/bit_util.h" diff --git a/cpp/src/arrow/acero/swiss_join_internal.h b/cpp/src/arrow/acero/swiss_join_internal.h index aa36a61109274..dceb74abe4f1b 100644 --- a/cpp/src/arrow/acero/swiss_join_internal.h +++ b/cpp/src/arrow/acero/swiss_join_internal.h @@ -23,8 +23,8 @@ #include "arrow/acero/schema_util.h" #include "arrow/acero/task_util.h" #include "arrow/compute/kernels/row_encoder_internal.h" -#include "arrow/compute/key_map.h" -#include "arrow/compute/light_array.h" +#include "arrow/compute/key_map_internal.h" +#include "arrow/compute/light_array_internal.h" #include "arrow/compute/row/encode_internal.h" namespace arrow { diff --git a/cpp/src/arrow/compute/key_hash.cc b/cpp/src/arrow/compute/key_hash_internal.cc similarity index 99% rename from cpp/src/arrow/compute/key_hash.cc rename to cpp/src/arrow/compute/key_hash_internal.cc index 1902b9ce9a88e..a0002efb3faf3 100644 --- a/cpp/src/arrow/compute/key_hash.cc +++ b/cpp/src/arrow/compute/key_hash_internal.cc @@ -15,14 +15,14 @@ // specific language governing permissions and limitations // under the License. -#include "arrow/compute/key_hash.h" +#include "arrow/compute/key_hash_internal.h" #include #include #include -#include "arrow/compute/light_array.h" +#include "arrow/compute/light_array_internal.h" #include "arrow/util/bit_util.h" #include "arrow/util/ubsan.h" diff --git a/cpp/src/arrow/compute/key_hash.h b/cpp/src/arrow/compute/key_hash_internal.h similarity index 99% rename from cpp/src/arrow/compute/key_hash.h rename to cpp/src/arrow/compute/key_hash_internal.h index 1173df5ed103e..7d226f52086b1 100644 --- a/cpp/src/arrow/compute/key_hash.h +++ b/cpp/src/arrow/compute/key_hash_internal.h @@ -23,7 +23,7 @@ #include -#include "arrow/compute/light_array.h" +#include "arrow/compute/light_array_internal.h" #include "arrow/compute/util.h" namespace arrow { diff --git a/cpp/src/arrow/compute/key_hash_avx2.cc b/cpp/src/arrow/compute/key_hash_internal_avx2.cc similarity index 99% rename from cpp/src/arrow/compute/key_hash_avx2.cc rename to cpp/src/arrow/compute/key_hash_internal_avx2.cc index aec2800c647d7..4def87bd7aa20 100644 --- a/cpp/src/arrow/compute/key_hash_avx2.cc +++ b/cpp/src/arrow/compute/key_hash_internal_avx2.cc @@ -17,7 +17,7 @@ #include -#include "arrow/compute/key_hash.h" +#include "arrow/compute/key_hash_internal.h" #include "arrow/util/bit_util.h" namespace arrow { diff --git a/cpp/src/arrow/compute/key_hash_test.cc b/cpp/src/arrow/compute/key_hash_test.cc index c998df7169c4a..4e5d869cb7db6 100644 --- a/cpp/src/arrow/compute/key_hash_test.cc +++ b/cpp/src/arrow/compute/key_hash_test.cc @@ -23,7 +23,7 @@ #include #include "arrow/array/builder_binary.h" -#include "arrow/compute/key_hash.h" +#include "arrow/compute/key_hash_internal.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/util.h" #include "arrow/util/cpu_info.h" diff --git a/cpp/src/arrow/compute/key_map.cc b/cpp/src/arrow/compute/key_map_internal.cc similarity index 99% rename from cpp/src/arrow/compute/key_map.cc rename to 
cpp/src/arrow/compute/key_map_internal.cc index a027ec811cf24..9e6d60ab5032b 100644 --- a/cpp/src/arrow/compute/key_map.cc +++ b/cpp/src/arrow/compute/key_map_internal.cc @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -#include "arrow/compute/key_map.h" +#include "arrow/compute/key_map_internal.h" #include #include diff --git a/cpp/src/arrow/compute/key_map.h b/cpp/src/arrow/compute/key_map_internal.h similarity index 100% rename from cpp/src/arrow/compute/key_map.h rename to cpp/src/arrow/compute/key_map_internal.h diff --git a/cpp/src/arrow/compute/key_map_avx2.cc b/cpp/src/arrow/compute/key_map_internal_avx2.cc similarity index 99% rename from cpp/src/arrow/compute/key_map_avx2.cc rename to cpp/src/arrow/compute/key_map_internal_avx2.cc index 3526a6cb0f344..8c98166f49269 100644 --- a/cpp/src/arrow/compute/key_map_avx2.cc +++ b/cpp/src/arrow/compute/key_map_internal_avx2.cc @@ -17,7 +17,7 @@ #include -#include "arrow/compute/key_map.h" +#include "arrow/compute/key_map_internal.h" #include "arrow/util/logging.h" namespace arrow { diff --git a/cpp/src/arrow/compute/light_array.cc b/cpp/src/arrow/compute/light_array_internal.cc similarity index 99% rename from cpp/src/arrow/compute/light_array.cc rename to cpp/src/arrow/compute/light_array_internal.cc index b225e04b05cea..4f235925d0fb6 100644 --- a/cpp/src/arrow/compute/light_array.cc +++ b/cpp/src/arrow/compute/light_array_internal.cc @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -#include "arrow/compute/light_array.h" +#include "arrow/compute/light_array_internal.h" #include diff --git a/cpp/src/arrow/compute/light_array.h b/cpp/src/arrow/compute/light_array_internal.h similarity index 100% rename from cpp/src/arrow/compute/light_array.h rename to cpp/src/arrow/compute/light_array_internal.h diff --git a/cpp/src/arrow/compute/light_array_test.cc b/cpp/src/arrow/compute/light_array_test.cc index ecc5f3ad37931..08f36ee606025 100644 --- a/cpp/src/arrow/compute/light_array_test.cc +++ b/cpp/src/arrow/compute/light_array_test.cc @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-#include "arrow/compute/light_array.h" +#include "arrow/compute/light_array_internal.h" #include #include diff --git a/cpp/src/arrow/compute/row/compare_internal.h b/cpp/src/arrow/compute/row/compare_internal.h index db953fbe11271..b039ca97ff978 100644 --- a/cpp/src/arrow/compute/row/compare_internal.h +++ b/cpp/src/arrow/compute/row/compare_internal.h @@ -19,7 +19,7 @@ #include -#include "arrow/compute/light_array.h" +#include "arrow/compute/light_array_internal.h" #include "arrow/compute/row/encode_internal.h" #include "arrow/compute/row/row_internal.h" #include "arrow/compute/util.h" diff --git a/cpp/src/arrow/compute/row/encode_internal.h b/cpp/src/arrow/compute/row/encode_internal.h index 6091fb66982af..2afc150530b9e 100644 --- a/cpp/src/arrow/compute/row/encode_internal.h +++ b/cpp/src/arrow/compute/row/encode_internal.h @@ -22,8 +22,8 @@ #include #include "arrow/array/data.h" -#include "arrow/compute/key_map.h" -#include "arrow/compute/light_array.h" +#include "arrow/compute/key_map_internal.h" +#include "arrow/compute/light_array_internal.h" #include "arrow/compute/row/row_internal.h" #include "arrow/compute/util.h" #include "arrow/memory_pool.h" diff --git a/cpp/src/arrow/compute/row/grouper.cc b/cpp/src/arrow/compute/row/grouper.cc index 5e23eda16fda2..756c70967ac6f 100644 --- a/cpp/src/arrow/compute/row/grouper.cc +++ b/cpp/src/arrow/compute/row/grouper.cc @@ -26,8 +26,8 @@ #include "arrow/compute/api_vector.h" #include "arrow/compute/function.h" #include "arrow/compute/kernels/row_encoder_internal.h" -#include "arrow/compute/key_hash.h" -#include "arrow/compute/light_array.h" +#include "arrow/compute/key_hash_internal.h" +#include "arrow/compute/light_array_internal.h" #include "arrow/compute/registry.h" #include "arrow/compute/row/compare_internal.h" #include "arrow/compute/row/grouper_internal.h" diff --git a/cpp/src/arrow/compute/row/row_internal.h b/cpp/src/arrow/compute/row/row_internal.h index c9194267aa3fe..3220b7ffe6e40 100644 --- a/cpp/src/arrow/compute/row/row_internal.h +++ b/cpp/src/arrow/compute/row/row_internal.h @@ -20,7 +20,7 @@ #include #include "arrow/buffer.h" -#include "arrow/compute/light_array.h" +#include "arrow/compute/light_array_internal.h" #include "arrow/memory_pool.h" #include "arrow/status.h" #include "arrow/util/logging.h" diff --git a/cpp/src/arrow/compute/util.cc b/cpp/src/arrow/compute/util.cc index 2058ba9f30757..b0c863b26a062 100644 --- a/cpp/src/arrow/compute/util.cc +++ b/cpp/src/arrow/compute/util.cc @@ -32,7 +32,7 @@ using internal::CpuInfo; namespace util { void TempVectorStack::alloc(uint32_t num_bytes, uint8_t** data, int* id) { - int64_t new_top = top_ + PaddedAllocationSize(num_bytes) + 2 * sizeof(uint64_t); + int64_t new_top = top_ + EstimatedAllocationSize(num_bytes); // Stack overflow check (see GH-39582). // XXX cannot return a regular Status because most consumers do not either. 
ARROW_CHECK_LE(new_top, buffer_size_) << "TempVectorStack::alloc overflow"; @@ -48,7 +48,7 @@ void TempVectorStack::alloc(uint32_t num_bytes, uint8_t** data, int* id) { void TempVectorStack::release(int id, uint32_t num_bytes) { ARROW_DCHECK(num_vectors_ == id + 1); - int64_t size = PaddedAllocationSize(num_bytes) + 2 * sizeof(uint64_t); + int64_t size = EstimatedAllocationSize(num_bytes); ARROW_DCHECK(reinterpret_cast(buffer_->mutable_data() + top_)[-1] == kGuard2); ARROW_DCHECK(top_ >= size); diff --git a/cpp/src/arrow/compute/util.h b/cpp/src/arrow/compute/util.h index e2a2e71b8521d..88dce160ce936 100644 --- a/cpp/src/arrow/compute/util.h +++ b/cpp/src/arrow/compute/util.h @@ -89,7 +89,7 @@ class ARROW_EXPORT TempVectorStack { Status Init(MemoryPool* pool, int64_t size) { num_vectors_ = 0; top_ = 0; - buffer_size_ = PaddedAllocationSize(size) + kPadding + 2 * sizeof(uint64_t); + buffer_size_ = EstimatedAllocationSize(size); ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateResizableBuffer(size, pool)); // Ensure later operations don't accidentally read uninitialized memory. std::memset(buffer->mutable_data(), 0xFF, size); @@ -98,7 +98,11 @@ class ARROW_EXPORT TempVectorStack { } private: - int64_t PaddedAllocationSize(int64_t num_bytes) { + static int64_t EstimatedAllocationSize(int64_t size) { + return PaddedAllocationSize(size) + 2 * sizeof(uint64_t); + } + + static int64_t PaddedAllocationSize(int64_t num_bytes) { // Round up allocation size to multiple of 8 bytes // to avoid returning temp vectors with unaligned address. // From 782c52f2f1802b2c6b112f8695c6e6a9246fa855 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=87=8E=E9=B9=BF?= <359391169@qq.com> Date: Wed, 3 Apr 2024 04:53:40 +0800 Subject: [PATCH 45/81] MINOR: [Docs][Java] Fix wrong method usage in example (#40940) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change `DictionaryProvider` doesn't have `get` method. ### What changes are included in this PR? It should be `lookup` not `get`. ### Are these changes tested? Yes. ### Are there any user-facing changes? Yes. Authored-by: 野鹿 <359391169@qq.com> Signed-off-by: Sutou Kouhei --- docs/source/java/vector.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/java/vector.rst b/docs/source/java/vector.rst index 377db71659c13..abbbd1a236d6d 100644 --- a/docs/source/java/vector.rst +++ b/docs/source/java/vector.rst @@ -342,7 +342,7 @@ This is where the ``DictionaryEncoding``'s 'id' attribute comes in. This value i // now we want to decode our vector, so we retrieve its dictionary from the provider Field f = vsr.getField(encoded.getName()); DictionaryEncoding encoding = f.getDictionary(); - Dictionary dictionary = provider.get(encoding.getId()); + Dictionary dictionary = provider.lookup(encoding.getId()); As you can see, a ``DictionaryProvider`` is handy for managing the dictionaries associated with a ``VectorSchemaRoot``. More importantly, it helps package the dictionaries for a ``VectorSchemaRoot`` when it's written. The classes ``ArrowFileWriter`` and ``ArrowStreamWriter`` both accept an optional ``DictionaryProvider`` argument for that purpose. You can find example code for writing dictionaries in the documentation for (:doc:`ipc`). ``ArrowReader`` and its subclasses also implement the ``DictionaryProvider`` interface, so you can retrieve the actual dictionaries when reading a file. 
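To put the corrected call in context, the decode path this documentation page is describing looks roughly like the sketch below. Assumptions: `vsr`, `encoded` and `provider` are set up as in the surrounding documentation example, and `DictionaryEncoder.decode` is used for the final step; this is an illustrative sketch rather than the exact docs snippet.

```java
import org.apache.arrow.vector.ValueVector;
import org.apache.arrow.vector.dictionary.Dictionary;
import org.apache.arrow.vector.dictionary.DictionaryEncoder;
import org.apache.arrow.vector.types.pojo.DictionaryEncoding;
import org.apache.arrow.vector.types.pojo.Field;

// retrieve the dictionary registered under the encoding's id -- lookup(), not get()
Field field = vsr.getField(encoded.getName());
DictionaryEncoding encoding = field.getDictionary();
Dictionary dictionary = provider.lookup(encoding.getId());

// decode the dictionary-encoded vector back into its original values
try (ValueVector decoded = DictionaryEncoder.decode(encoded, dictionary)) {
    // use decoded ...
}
```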
From 41a989c81616aea103e554521fa6d6209ffa248d Mon Sep 17 00:00:00 2001 From: Laurent Goujon Date: Tue, 2 Apr 2024 17:37:21 -0700 Subject: [PATCH 46/81] GH-40952: [Java][FlightSQL] Clean up flight-sql-jdbc-driver dependencies (#40953) ### Rationale for this change Module `flight-sql-jdbc-driver` declares multiple dependencies, some (joda) are not used anymore (but still packaged). ### What changes are included in this PR? Clean up list of dependencies declared in flight-sql-jdbc-driver modules as all of them are transitive dependencies from flight-sql-jdbc-core. ### Are these changes tested? Yes: build + existing shading test ### Are there any user-facing changes? No * GitHub Issue: #40952 Authored-by: Laurent Goujon Signed-off-by: David Li --- java/flight/flight-sql-jdbc-driver/pom.xml | 111 +-------------------- 1 file changed, 1 insertion(+), 110 deletions(-) diff --git a/java/flight/flight-sql-jdbc-driver/pom.xml b/java/flight/flight-sql-jdbc-driver/pom.xml index 2157c09eaf583..b586a0e46f51f 100644 --- a/java/flight/flight-sql-jdbc-driver/pom.xml +++ b/java/flight/flight-sql-jdbc-driver/pom.xml @@ -28,126 +28,24 @@ https://arrow.apache.org - - - org.hamcrest - hamcrest - 2.2 - test - - org.apache.arrow flight-sql-jdbc-core runtime - - org.bouncycastle - bcpkix-jdk15on - 1.70 - runtime - - - - - org.apache.arrow - arrow-memory-core - runtime - - - - org.apache.arrow - flight-sql - runtime - - - - org.mockito - mockito-core - ${mockito.core.version} - test - - - - org.mockito - mockito-inline - ${mockito.inline.version} - test - - - - org.apache.arrow - flight-core - - - io.netty - netty-transport-native-kqueue - - - io.netty - netty-transport-native-epoll - - - runtime - - - - org.slf4j - slf4j-api - runtime - org.slf4j slf4j-jdk14 runtime - - io.netty - netty-common - runtime - - - - joda-time - joda-time - 2.12.6 - runtime - - - - org.apache.calcite.avatica - avatica - 1.24.0 - runtime - - - - com.google.protobuf - protobuf-java - runtime - - - - org.apache.arrow - arrow-vector - ${arrow.vector.classifier} - runtime - - com.google.guava guava + runtime - - - commons-io - commons-io - 2.15.1 - test - @@ -250,13 +148,6 @@ META-INF/services/java.sql.Driver - - org.eclipse.collections:* - - about.html - LICENSE-*-1.0.txt - - *:* From be3b78902ff2871e637af2c340e24b75853e32e7 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Wed, 3 Apr 2024 10:33:31 +0200 Subject: [PATCH 47/81] GH-40954: [CI] Fix use of obsolete docker-compose command on Github Actions (#40949) ### Rationale for this change The `docker-compose` utility is progressively being removed from GHA-provided runners: https://github.com/actions/runner-images/issues/9557 ### What changes are included in this PR? Use `docker` client CLI directly instead of `docker-compose` where possible. ### Are these changes tested? Yes, this should fix the sporadic CI failures because of the above migration. ### Are there any user-facing changes? No, except additional optional env var `ARCHERY_DEBUG`. 
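As a usage sketch (the image name `ubuntu-cpp` is just the example already used in archery's own help text), the plain-Docker path can be selected either through the relocated `--using-docker-cli` flag or through the environment variables the workflows now export:

```sh
# explicit flag on the `docker` group (it moved up from the individual subcommands)
archery --debug docker --using-docker-cli run ubuntu-cpp

# equivalent via environment variables, as the GitHub Actions workflows do
ARCHERY_DEBUG=1 ARCHERY_USE_DOCKER_CLI=1 archery docker run ubuntu-cpp
```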
* GitHub Issue: #40954 Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- .github/workflows/archery.yml | 2 + .github/workflows/cpp.yml | 5 ++ .github/workflows/dev.yml | 4 + .github/workflows/docs.yml | 2 + .github/workflows/docs_light.yml | 2 + .github/workflows/go.yml | 9 +++ .github/workflows/integration.yml | 2 + .github/workflows/java.yml | 2 + .github/workflows/java_jni.yml | 2 + .github/workflows/js.yml | 4 + .github/workflows/python.yml | 2 + .github/workflows/r.yml | 2 + .github/workflows/ruby.yml | 2 + .github/workflows/swift.yml | 2 + dev/archery/archery/cli.py | 1 + dev/archery/archery/docker/cli.py | 79 ++++++------------- dev/archery/archery/docker/core.py | 50 +++++++----- .../archery/docker/tests/test_docker_cli.py | 19 +---- dev/tasks/java-jars/github.yml | 2 + dev/tasks/linux-packages/github.linux.yml | 1 + dev/tasks/macros.jinja | 4 + dev/tasks/python-wheels/github.linux.yml | 2 + dev/tasks/tasks.yml | 4 + 23 files changed, 117 insertions(+), 87 deletions(-) diff --git a/.github/workflows/archery.yml b/.github/workflows/archery.yml index cb783dd66c3fb..2aa6b0dcbb91d 100644 --- a/.github/workflows/archery.yml +++ b/.github/workflows/archery.yml @@ -32,7 +32,9 @@ on: - 'docker-compose.yml' env: + ARCHERY_DEBUG: 1 ARCHERY_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} + ARCHERY_USE_DOCKER_CLI: 1 concurrency: group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml index e8e41f1bcb90c..1d10be3b5bc82 100644 --- a/.github/workflows/cpp.yml +++ b/.github/workflows/cpp.yml @@ -53,6 +53,7 @@ permissions: contents: read env: + ARCHERY_DEBUG: 1 ARROW_ENABLE_TIMING_TESTS: OFF DOCKER_VOLUME_PREFIX: ".docker/" @@ -94,6 +95,7 @@ jobs: cat <> "$GITHUB_OUTPUT" { "arch": "arm64v8", + "archery-use-docker-cli": "0", "clang-tools": "10", "image": "ubuntu-cpp", "llvm": "10", @@ -118,6 +120,9 @@ jobs: include: ${{ fromJson(needs.docker-targets.outputs.targets) }} env: ARCH: ${{ matrix.arch }} + # By default, use Docker CLI because docker-compose v1 is obsolete, + # except where the Docker client version is too old. 
+ ARCHERY_USE_DOCKER_CLI: ${{ matrix.archery-use-docker-cli || '1' }} ARROW_SIMD_LEVEL: ${{ matrix.simd-level }} CLANG_TOOLS: ${{ matrix.clang-tools }} LLVM: ${{ matrix.llvm }} diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index 37fda2e313ae2..8af5832f15948 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -29,6 +29,10 @@ concurrency: permissions: contents: read +env: + ARCHERY_DEBUG: 1 + ARCHERY_USE_DOCKER_CLI: 1 + jobs: lint: diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 9c7701f25f756..fe49e275d908d 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -24,6 +24,8 @@ permissions: contents: read env: + ARCHERY_DEBUG: 1 + ARCHERY_USE_DOCKER_CLI: 1 ARROW_ENABLE_TIMING_TESTS: OFF DOCKER_VOLUME_PREFIX: ".docker/" diff --git a/.github/workflows/docs_light.yml b/.github/workflows/docs_light.yml index 6ec4c3d53d0e3..376c87651d2d0 100644 --- a/.github/workflows/docs_light.yml +++ b/.github/workflows/docs_light.yml @@ -33,6 +33,8 @@ permissions: contents: read env: + ARCHERY_DEBUG: 1 + ARCHERY_USE_DOCKER_CLI: 1 ARROW_ENABLE_TIMING_TESTS: OFF DOCKER_VOLUME_PREFIX: ".docker/" diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index 7fca38528260f..11dc29dcae54e 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -41,6 +41,10 @@ concurrency: permissions: contents: read +env: + ARCHERY_DEBUG: 1 + ARCHERY_USE_DOCKER_CLI: 1 + jobs: docker-targets: @@ -75,12 +79,14 @@ jobs: { "arch-label": "ARM64", "arch": "arm64v8", + "archery-use-docker-cli": "0", "go": "1.21", "runs-on": ["self-hosted", "arm", "linux"] }, { "arch-label": "ARM64", "arch": "arm64v8", + "archery-use-docker-cli": "0", "go": "1.22", "runs-on": ["self-hosted", "arm", "linux"] } @@ -101,6 +107,9 @@ jobs: include: ${{ fromJson(needs.docker-targets.outputs.targets) }} env: ARCH: ${{ matrix.arch }} + # By default, use Docker CLI because docker-compose v1 is obsolete, + # except where the Docker client version is too old. 
+ ARCHERY_USE_DOCKER_CLI: ${{ matrix.archery-use-docker-cli || '1' }} GO: ${{ matrix.go }} steps: - name: Checkout Arrow diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 0f186ff6a4527..2c3499c160f9c 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -51,6 +51,8 @@ permissions: contents: read env: + ARCHERY_DEBUG: 1 + ARCHERY_USE_DOCKER_CLI: 1 DOCKER_VOLUME_PREFIX: ".docker/" jobs: diff --git a/.github/workflows/java.yml b/.github/workflows/java.yml index 423f54cd93547..611e202ca0624 100644 --- a/.github/workflows/java.yml +++ b/.github/workflows/java.yml @@ -45,6 +45,8 @@ permissions: contents: read env: + ARCHERY_DEBUG: 1 + ARCHERY_USE_DOCKER_CLI: 1 DOCKER_VOLUME_PREFIX: ".docker/" jobs: diff --git a/.github/workflows/java_jni.yml b/.github/workflows/java_jni.yml index 790ffd5c650e0..958216ac7669d 100644 --- a/.github/workflows/java_jni.yml +++ b/.github/workflows/java_jni.yml @@ -45,6 +45,8 @@ permissions: contents: read env: + ARCHERY_DEBUG: 1 + ARCHERY_USE_DOCKER_CLI: 1 DOCKER_VOLUME_PREFIX: ".docker/" jobs: diff --git a/.github/workflows/js.yml b/.github/workflows/js.yml index dab89da44c861..c9b7d7b742d88 100644 --- a/.github/workflows/js.yml +++ b/.github/workflows/js.yml @@ -38,6 +38,10 @@ concurrency: permissions: contents: read +env: + ARCHERY_DEBUG: 1 + ARCHERY_USE_DOCKER_CLI: 1 + jobs: docker: diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 1147ac13e6f93..2db9b17e895b0 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -41,6 +41,8 @@ permissions: contents: read env: + ARCHERY_DEBUG: 1 + ARCHERY_USE_DOCKER_CLI: 1 DOCKER_VOLUME_PREFIX: ".docker/" jobs: diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index 78677499f3e45..05c85fa6dc2c2 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -51,6 +51,8 @@ permissions: contents: read env: + ARCHERY_DEBUG: 1 + ARCHERY_USE_DOCKER_CLI: 1 DOCKER_VOLUME_PREFIX: ".docker/" jobs: diff --git a/.github/workflows/ruby.yml b/.github/workflows/ruby.yml index 311c1c822baf6..ea3e61d55787d 100644 --- a/.github/workflows/ruby.yml +++ b/.github/workflows/ruby.yml @@ -53,6 +53,8 @@ permissions: contents: read env: + ARCHERY_DEBUG: 1 + ARCHERY_USE_DOCKER_CLI: 1 DOCKER_VOLUME_PREFIX: ".docker/" jobs: diff --git a/.github/workflows/swift.yml b/.github/workflows/swift.yml index f55e9e77503c0..3f039315b505a 100644 --- a/.github/workflows/swift.yml +++ b/.github/workflows/swift.yml @@ -41,6 +41,8 @@ permissions: contents: read env: + ARCHERY_DEBUG: 1 + ARCHERY_USE_DOCKER_CLI: 1 DOCKER_VOLUME_PREFIX: ".docker/" jobs: diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py index 32921afb2e61b..5fa41e28a3208 100644 --- a/dev/archery/archery/cli.py +++ b/dev/archery/archery/cli.py @@ -44,6 +44,7 @@ @click.group(context_settings={"help_option_names": ["-h", "--help"]}) @click.option("--debug", type=BOOL, is_flag=True, default=False, + envvar='ARCHERY_DEBUG', help="Increase logging with debugging output.") @click.option("--pdb", type=BOOL, is_flag=True, default=False, help="Invoke pdb on uncaught exception.") diff --git a/dev/archery/archery/docker/cli.py b/dev/archery/archery/docker/cli.py index 20d9a16138bac..e6baf0ca1f002 100644 --- a/dev/archery/archery/docker/cli.py +++ b/dev/archery/archery/docker/cli.py @@ -46,10 +46,19 @@ def _execute(self, *args, **kwargs): callback=validate_arrow_sources, help="Specify Arrow source directory.") @click.option('--dry-run/--execute', 
default=False, - help="Display the docker-compose commands instead of executing " - "them.") + help="Display the docker commands instead of executing them.") +@click.option('--using-docker-cli', default=False, is_flag=True, + envvar='ARCHERY_USE_DOCKER_CLI', + help="Use docker CLI directly for building instead of calling " + "docker-compose. This may help to reuse cached layers.") +@click.option('--using-docker-buildx', default=False, is_flag=True, + envvar='ARCHERY_USE_DOCKER_BUILDX', + help="Use buildx with docker CLI directly for building instead " + "of calling docker-compose or the plain docker build " + "command. This option makes the build cache reusable " + "across hosts.") @click.pass_context -def docker(ctx, src, dry_run): +def docker(ctx, src, dry_run, using_docker_cli, using_docker_buildx): """ Interact with docker-compose based builds. """ @@ -64,7 +73,10 @@ def docker(ctx, src, dry_run): # take the docker-compose parameters like PYTHON, PANDAS, UBUNTU from the # environment variables to keep the usage similar to docker-compose + using_docker_cli |= using_docker_buildx compose = DockerCompose(config_path, params=os.environ, + using_docker=using_docker_cli, + using_buildx=using_docker_buildx, debug=ctx.obj.get('debug', False)) if dry_run: _mock_compose_calls(compose) @@ -83,24 +95,19 @@ def check_config(obj): @docker.command('pull') @click.argument('image') -@click.option('--using-docker-cli', default=False, is_flag=True, - envvar='ARCHERY_USE_DOCKER_CLI', - help="Use docker CLI directly for pulling instead of calling " - "docker-compose. This may help to reuse cached layers.") @click.option('--pull-leaf/--no-leaf', default=True, help="Whether to pull leaf images too.") @click.option('--ignore-pull-failures/--no-ignore-pull-failures', default=True, help="Whether to ignore pull failures.") @click.pass_obj -def docker_pull(obj, image, *, using_docker_cli, pull_leaf, - ignore_pull_failures): +def docker_pull(obj, image, *, pull_leaf, ignore_pull_failures): """ Execute docker-compose pull. """ compose = obj['compose'] try: - compose.pull(image, pull_leaf=pull_leaf, using_docker=using_docker_cli, + compose.pull(image, pull_leaf=pull_leaf, ignore_pull_failures=ignore_pull_failures) except UndefinedImage as e: raise click.ClickException( @@ -115,16 +122,6 @@ def docker_pull(obj, image, *, using_docker_cli, pull_leaf, @click.argument('image') @click.option('--force-pull/--no-pull', default=True, help="Whether to force pull the image and its ancestor images") -@click.option('--using-docker-cli', default=False, is_flag=True, - envvar='ARCHERY_USE_DOCKER_CLI', - help="Use docker CLI directly for building instead of calling " - "docker-compose. This may help to reuse cached layers.") -@click.option('--using-docker-buildx', default=False, is_flag=True, - envvar='ARCHERY_USE_DOCKER_BUILDX', - help="Use buildx with docker CLI directly for building instead " - "of calling docker-compose or the plain docker build " - "command. This option makes the build cache reusable " - "across hosts.") @click.option('--use-cache/--no-cache', default=True, help="Whether to use cache when building the image and its " "ancestor images") @@ -133,22 +130,17 @@ def docker_pull(obj, image, *, using_docker_cli, pull_leaf, "passed as the argument. 
To disable caching for both the " "image and its ancestors use --no-cache option.") @click.pass_obj -def docker_build(obj, image, *, force_pull, using_docker_cli, - using_docker_buildx, use_cache, use_leaf_cache): +def docker_build(obj, image, *, force_pull, use_cache, use_leaf_cache): """ Execute docker-compose builds. """ compose = obj['compose'] - using_docker_cli |= using_docker_buildx try: if force_pull: - compose.pull(image, pull_leaf=use_leaf_cache, - using_docker=using_docker_cli) + compose.pull(image, pull_leaf=use_leaf_cache) compose.build(image, use_cache=use_cache, use_leaf_cache=use_leaf_cache, - using_docker=using_docker_cli, - using_buildx=using_docker_buildx, pull_parents=force_pull) except UndefinedImage as e: raise click.ClickException( @@ -172,16 +164,6 @@ def docker_build(obj, image, *, force_pull, using_docker_cli, help="Whether to force build the image and its ancestor images") @click.option('--build-only', default=False, is_flag=True, help="Pull and/or build the image, but do not run it") -@click.option('--using-docker-cli', default=False, is_flag=True, - envvar='ARCHERY_USE_DOCKER_CLI', - help="Use docker CLI directly for building instead of calling " - "docker-compose. This may help to reuse cached layers.") -@click.option('--using-docker-buildx', default=False, is_flag=True, - envvar='ARCHERY_USE_DOCKER_BUILDX', - help="Use buildx with docker CLI directly for building instead " - "of calling docker-compose or the plain docker build " - "command. This option makes the build cache reusable " - "across hosts.") @click.option('--use-cache/--no-cache', default=True, help="Whether to use cache when building the image and its " "ancestor images") @@ -191,7 +173,7 @@ def docker_build(obj, image, *, force_pull, using_docker_cli, "image and its ancestors use --no-cache option.") @click.option('--resource-limit', default=None, help="A CPU/memory limit preset to mimic CI environments like " - "GitHub Actions. Implies --using-docker-cli. Note that " + "GitHub Actions. Mandates --using-docker-cli. Note that " "exporting ARCHERY_DOCKER_BIN=\"sudo docker\" is likely " "required, unless Docker is configured with cgroups v2 " "(else Docker will silently ignore the limits).") @@ -199,8 +181,8 @@ def docker_build(obj, image, *, force_pull, using_docker_cli, help="Set volume within the container") @click.pass_obj def docker_run(obj, image, command, *, env, user, force_pull, force_build, - build_only, using_docker_cli, using_docker_buildx, use_cache, - use_leaf_cache, resource_limit, volume): + build_only, use_cache, use_leaf_cache, resource_limit, + volume): """ Execute docker-compose builds. 
@@ -234,18 +216,14 @@ def docker_run(obj, image, command, *, env, user, force_pull, force_build, archery docker run ubuntu-cpp bash """ compose = obj['compose'] - using_docker_cli |= using_docker_buildx env = dict(kv.split('=', 1) for kv in env) try: if force_pull: - compose.pull(image, pull_leaf=use_leaf_cache, - using_docker=using_docker_cli) + compose.pull(image, pull_leaf=use_leaf_cache) if force_build: compose.build(image, use_cache=use_cache, - use_leaf_cache=use_leaf_cache, - using_docker=using_docker_cli, - using_buildx=using_docker_buildx) + use_leaf_cache=use_leaf_cache) if build_only: return compose.run( @@ -253,7 +231,6 @@ def docker_run(obj, image, command, *, env, user, force_pull, force_build, command=command, env=env, user=user, - using_docker=using_docker_cli, resource_limit=resource_limit, volumes=volume ) @@ -273,15 +250,11 @@ def docker_run(obj, image, command, *, env, user, force_pull, force_build, @click.option('--password', '-p', required=False, envvar='ARCHERY_DOCKER_PASSWORD', help='Docker repository password') -@click.option('--using-docker-cli', default=False, is_flag=True, - help="Use docker CLI directly for building instead of calling " - "docker-compose. This may help to reuse cached layers.") @click.pass_obj -def docker_compose_push(obj, image, user, password, using_docker_cli): +def docker_compose_push(obj, image, user, password): """Push the generated docker-compose image.""" compose = obj['compose'] - compose.push(image, user=user, password=password, - using_docker=using_docker_cli) + compose.push(image, user=user, password=password) @docker.command('images') diff --git a/dev/archery/archery/docker/core.py b/dev/archery/archery/docker/core.py index 184d9808759b8..38720e5856a14 100644 --- a/dev/archery/archery/docker/core.py +++ b/dev/archery/archery/docker/core.py @@ -58,12 +58,21 @@ class UndefinedImage(Exception): class ComposeConfig: - def __init__(self, config_path, dotenv_path, compose_bin, params=None): + def __init__(self, config_path, dotenv_path, compose_bin, + using_docker=False, using_buildx=False, + params=None, debug=False): + self.using_docker = using_docker + self.using_buildx = using_buildx + self.debug = debug config_path = _ensure_path(config_path) if dotenv_path: dotenv_path = _ensure_path(dotenv_path) else: dotenv_path = config_path.parent / '.env' + if self.debug: + # Log docker version + Docker().run('version') + self._read_env(dotenv_path, params) self._read_config(config_path, compose_bin) @@ -122,8 +131,13 @@ def _read_config(self, config_path, compose_bin): ) # trigger docker-compose's own validation - compose = Command('docker-compose') - args = ['--file', str(config_path), 'config'] + if self.using_docker: + compose = Docker() + args = ['compose'] + else: + compose = Command('docker-compose') + args = [] + args += ['--file', str(config_path), 'config'] result = compose.run(*args, env=self.env, check=False, stderr=subprocess.PIPE, stdout=subprocess.PIPE) @@ -164,12 +178,13 @@ def __init__(self, docker_bin=None): class DockerCompose(Command): def __init__(self, config_path, dotenv_path=None, compose_bin=None, - params=None, debug=False): + using_docker=False, using_buildx=False, params=None, + debug=False): compose_bin = default_bin(compose_bin, 'docker-compose') self.config = ComposeConfig(config_path, dotenv_path, compose_bin, - params) + params=params, using_docker=using_docker, + using_buildx=using_buildx, debug=debug) self.bin = compose_bin - self.debug = debug self.pull_memory = set() def clear_pull_memory(self): @@ -215,14 
+230,13 @@ def _execute_docker(self, *args, **kwargs): ) ) - def pull(self, service_name, pull_leaf=True, using_docker=False, - ignore_pull_failures=True): + def pull(self, service_name, pull_leaf=True, ignore_pull_failures=True): def _pull(service): args = ['pull'] if service['image'] in self.pull_memory: return - if using_docker: + if self.config.using_docker: try: self._execute_docker(*args, service['image']) except Exception as e: @@ -245,7 +259,7 @@ def _pull(service): _pull(service) def build(self, service_name, use_cache=True, use_leaf_cache=True, - using_docker=False, using_buildx=False, pull_parents=True): + pull_parents=True): def _build(service, use_cache): if 'build' not in service: # nothing to do @@ -273,7 +287,7 @@ def _build(service, use_cache): if self.config.env.get('BUILDKIT_INLINE_CACHE') == '1': args.extend(['--build-arg', 'BUILDKIT_INLINE_CACHE=1']) - if using_buildx: + if self.config.using_buildx: for k, v in service['build'].get('args', {}).items(): args.extend(['--build-arg', '{}={}'.format(k, v)]) @@ -295,9 +309,9 @@ def _build(service, use_cache): service['build'].get('context', '.') ]) self._execute_docker("buildx", "build", *args) - elif using_docker: + elif self.config.using_docker: # better for caching - if self.debug: + if self.config.debug: args.append("--progress=plain") for k, v in service['build'].get('args', {}).items(): args.extend(['--build-arg', '{}={}'.format(k, v)]) @@ -310,7 +324,7 @@ def _build(service, use_cache): ]) self._execute_docker("build", *args) else: - if self.debug: + if self.config.debug: args.append("--progress=plain") self._execute_compose("build", *args, service['name']) @@ -322,7 +336,7 @@ def _build(service, use_cache): _build(service, use_cache=use_cache and use_leaf_cache) def run(self, service_name, command=None, *, env=None, volumes=None, - user=None, using_docker=False, resource_limit=None): + user=None, resource_limit=None): service = self.config.get(service_name) args = [] @@ -337,7 +351,7 @@ def run(self, service_name, command=None, *, env=None, volumes=None, for volume in volumes: args.extend(['--volume', volume]) - if using_docker or service['need_gpu'] or resource_limit: + if self.config.using_docker or service['need_gpu'] or resource_limit: # use gpus, requires docker>=19.03 if service['need_gpu']: args.extend(['--gpus', 'all']) @@ -399,9 +413,9 @@ def run(self, service_name, command=None, *, env=None, volumes=None, args.append(command) self._execute_compose('run', '--rm', *args) - def push(self, service_name, user=None, password=None, using_docker=False): + def push(self, service_name, user=None, password=None): def _push(service): - if using_docker: + if self.config.using_docker: return self._execute_docker('push', service['image']) else: return self._execute_compose('push', service['name']) diff --git a/dev/archery/archery/docker/tests/test_docker_cli.py b/dev/archery/archery/docker/tests/test_docker_cli.py index ab39c7b9dbb4a..c117a3edfff65 100644 --- a/dev/archery/archery/docker/tests/test_docker_cli.py +++ b/dev/archery/archery/docker/tests/test_docker_cli.py @@ -33,14 +33,12 @@ def test_docker_run_with_custom_command(run, build, pull): assert result.exit_code == 0 pull.assert_called_once_with( - "ubuntu-cpp", pull_leaf=True, using_docker=False + "ubuntu-cpp", pull_leaf=True, ) build.assert_called_once_with( "ubuntu-cpp", use_cache=True, use_leaf_cache=True, - using_docker=False, - using_buildx=False ) run.assert_called_once_with( "ubuntu-cpp", @@ -48,7 +46,6 @@ def test_docker_run_with_custom_command(run, 
build, pull): env={}, resource_limit=None, user=None, - using_docker=False, volumes=(), ) @@ -75,14 +72,12 @@ def test_docker_run_options(run, build, pull): result = CliRunner().invoke(docker, args) assert result.exit_code == 0 pull.assert_called_once_with( - "ubuntu-cpp", pull_leaf=True, using_docker=False + "ubuntu-cpp", pull_leaf=True, ) build.assert_called_once_with( "ubuntu-cpp", use_cache=True, use_leaf_cache=True, - using_docker=False, - using_buildx=False ) run.assert_called_once_with( "ubuntu-cpp", @@ -90,7 +85,6 @@ def test_docker_run_options(run, build, pull): env={"ARROW_GANDIVA": "OFF", "ARROW_FLIGHT": "ON"}, resource_limit=None, user="root", - using_docker=False, volumes=( "./build:/build", "./ccache:/ccache:delegated", @@ -126,7 +120,6 @@ def test_docker_limit_options(run): env={"ARROW_GANDIVA": "OFF", "ARROW_FLIGHT": "ON"}, resource_limit="github", user="root", - using_docker=False, volumes=( "./build:/build", "./ccache:/ccache:delegated", @@ -145,7 +138,6 @@ def test_docker_run_without_pulling_or_building(run): env={}, resource_limit=None, user=None, - using_docker=False, volumes=(), ) @@ -157,14 +149,12 @@ def test_docker_run_only_pulling_and_building(build, pull): result = CliRunner().invoke(docker, args) assert result.exit_code == 0 pull.assert_called_once_with( - "ubuntu-cpp", pull_leaf=True, using_docker=False + "ubuntu-cpp", pull_leaf=True, ) build.assert_called_once_with( "ubuntu-cpp", use_cache=True, use_leaf_cache=True, - using_docker=False, - using_buildx=False ) @@ -187,8 +177,6 @@ def test_docker_run_without_build_cache(run, build): "ubuntu-cpp", use_cache=False, use_leaf_cache=False, - using_docker=False, - using_buildx=False ) run.assert_called_once_with( "ubuntu-cpp", @@ -196,6 +184,5 @@ def test_docker_run_without_build_cache(run, build): env={}, resource_limit=None, user="me", - using_docker=False, volumes=(), ) diff --git a/dev/tasks/java-jars/github.yml b/dev/tasks/java-jars/github.yml index 03cbcc7c98fcc..0437ee7864979 100644 --- a/dev/tasks/java-jars/github.yml +++ b/dev/tasks/java-jars/github.yml @@ -30,6 +30,7 @@ jobs: ARCH: {{ '${{ matrix.platform.archery_arch }}' }} ARCH_ALIAS: {{ '${{ matrix.platform.archery_arch_alias }}' }} ARCH_SHORT: {{ '${{ matrix.platform.archery_arch_short }}' }} + ARCHERY_USE_DOCKER_CLI: {{ "${{matrix.platform.archery_use_docker_cli || '1'}}" }} strategy: fail-fast: false matrix: @@ -44,6 +45,7 @@ jobs: archery_arch: "arm64v8" archery_arch_alias: "aarch64" archery_arch_short: "arm64" + archery_use_docker_cli: "0" steps: {{ macros.github_checkout_arrow()|indent }} {{ macros.github_free_space()|indent }} diff --git a/dev/tasks/linux-packages/github.linux.yml b/dev/tasks/linux-packages/github.linux.yml index 6de3edfce07e1..9e24835b8b627 100644 --- a/dev/tasks/linux-packages/github.linux.yml +++ b/dev/tasks/linux-packages/github.linux.yml @@ -29,6 +29,7 @@ jobs: {% endif %} env: ARCHITECTURE: {{ architecture }} + ARCHERY_USE_DOCKER_CLI: {{ '0' if architecture == 'arm64' else '1' }} steps: {{ macros.github_checkout_arrow()|indent }} {{ macros.github_login_dockerhub()|indent }} diff --git a/dev/tasks/macros.jinja b/dev/tasks/macros.jinja index bcafe53066ef8..f55a7f9481e56 100644 --- a/dev/tasks/macros.jinja +++ b/dev/tasks/macros.jinja @@ -23,6 +23,10 @@ on: push: branches: - "*-github-*" + +env: + ARCHERY_DEBUG: 1 + ARCHERY_USE_DOCKER_CLI: 1 {% endmacro %} {%- macro github_checkout_arrow(fetch_depth=1, submodules="recursive", action_v="4") -%} diff --git a/dev/tasks/python-wheels/github.linux.yml 
b/dev/tasks/python-wheels/github.linux.yml index 41b18684cee10..0ff3c56b695eb 100644 --- a/dev/tasks/python-wheels/github.linux.yml +++ b/dev/tasks/python-wheels/github.linux.yml @@ -31,8 +31,10 @@ jobs: # archery uses these environment variables {% if arch == "amd64" %} ARCH: amd64 + ARCHERY_USE_DOCKER_CLI: 1 {% else %} ARCH: arm64v8 + ARCHERY_USE_DOCKER_CLI: 0 {% endif %} PYTHON: "{{ python_version }}" diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 5e1ef8d13b988..cf46cb8c6ad70 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -1456,12 +1456,16 @@ tasks: ci: github template: docker-tests/github.cuda.yml params: + env: + ARCHERY_USE_DOCKER_CLI: 0 image: ubuntu-cuda-cpp test-cuda-python: ci: github template: docker-tests/github.cuda.yml params: + env: + ARCHERY_USE_DOCKER_CLI: 0 image: ubuntu-cuda-python ############################## Fuzz tests ################################# From 469430f50085b4b462fcc7db84a08cb554c698cd Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Wed, 3 Apr 2024 13:51:49 +0200 Subject: [PATCH 48/81] MINOR: [CI] Avoid docker-compose on Azure (#40972) ### Rationale for this change Same as PR #40949, but for Crossbow builds hosted on Azure Pipelines. ### Are these changes tested? Yes, they fix CI tests. ### Are there any user-facing changes? No. Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- .github/workflows/archery.yml | 2 +- dev/archery/requirements-test.txt | 1 + dev/tasks/docker-tests/azure.linux.yml | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/archery.yml b/.github/workflows/archery.yml index 2aa6b0dcbb91d..c698baba2c816 100644 --- a/.github/workflows/archery.yml +++ b/.github/workflows/archery.yml @@ -61,7 +61,7 @@ jobs: - name: Setup Python uses: actions/setup-python@v5.1.0 with: - python-version: '3.12' + python-version: '3.9' - name: Install pygit2 binary wheel run: pip install pygit2 --only-binary pygit2 - name: Install Archery, Crossbow- and Test Dependencies diff --git a/dev/archery/requirements-test.txt b/dev/archery/requirements-test.txt index 208ec64cdf026..e3e62a993c2a2 100644 --- a/dev/archery/requirements-test.txt +++ b/dev/archery/requirements-test.txt @@ -1,2 +1,3 @@ +docker-compose pytest responses diff --git a/dev/tasks/docker-tests/azure.linux.yml b/dev/tasks/docker-tests/azure.linux.yml index be03957e925ed..b66bfbdfe940a 100644 --- a/dev/tasks/docker-tests/azure.linux.yml +++ b/dev/tasks/docker-tests/azure.linux.yml @@ -46,7 +46,7 @@ jobs: displayName: Setup Archery - script: | - archery docker run \ + archery --debug docker --using-docker-cli run \ -e ARROW_DOCS_VERSION="{{ arrow.no_rc_version }}" \ -e SETUPTOOLS_SCM_PRETEND_VERSION="{{ arrow.no_rc_version }}" \ {{ flags|default("") }} \ From e0d73c5fd521e9cf79f8c0bbe74285ab8fd4b6d2 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 3 Apr 2024 11:14:27 -0300 Subject: [PATCH 49/81] GH-38828: [R] Ensure that streams can be written to socket connections (#38897) ### Rationale for this change Currently we can't write to socket connection from R. This is a very useful way to send Arrow data around and should work! ### What changes are included in this PR? Implements `Tell()` for non-seekable output streams. Apparently some Arrow code calls this to figure out how many bytes have been written. ### Are these changes tested? I'm not quite sure how to test this...all output streams we can easily test are seekable. 
We could try to spin up a socket server on another thread (like the reprex below) but I'm worried that will be flaky. ### Are there any user-facing changes? Yes (something that should have previously worked now works), although there is no place where we currently document anything about how connections can be used. ``` r tmp <- tempfile() proc <- callr::r_bg(function() { server <- function() { library(arrow) while (TRUE) { writeLines("Listening...") con <- socketConnection(host = "localhost", port = 6011, blocking = TRUE, server = TRUE, open = "r+b") socketTimeout(con, 3600) data <- arrow::read_ipc_stream(con, as_data_frame = FALSE) print(head(as.data.frame(data))) } } server() }, stdout = tmp) Sys.sleep(0.5) library(arrow, warn.conflicts = FALSE) #> Some features are not enabled in this build of Arrow. Run `arrow_info()` for more information. rb <- arrow::record_batch(iris) socketDriver <- socketConnection(host = "localhost", port = "6011", blocking = TRUE, server = FALSE, open = "w+b") write_ipc_stream(rb, socketDriver) Sys.sleep(0.5) cat(brio::read_file(tmp)) #> Listening... #> Sepal.Length Sepal.Width Petal.Length Petal.Width Species #> 1 5.1 3.5 1.4 0.2 setosa #> 2 4.9 3.0 1.4 0.2 setosa #> 3 4.7 3.2 1.3 0.2 setosa #> 4 4.6 3.1 1.5 0.2 setosa #> 5 5.0 3.6 1.4 0.2 setosa #> 6 5.4 3.9 1.7 0.4 setosa #> Listening... # Shutdown server proc$interrupt() #> [1] TRUE Sys.sleep(0.5) proc$is_alive() #> [1] FALSE ``` Created on 2023-11-27 with [reprex v2.0.2](https://reprex.tidyverse.org) * Closes: #38828 * GitHub Issue: #38828 Authored-by: Dewey Dunnington Signed-off-by: Dewey Dunnington --- r/R/csv.R | 7 ++--- r/R/feather.R | 2 +- r/R/ipc-stream.R | 4 +-- r/R/parquet.R | 2 +- r/man/read_delim_arrow.Rd | 5 ++-- r/man/read_feather.Rd | 4 +-- r/man/read_ipc_stream.Rd | 4 +-- r/man/read_json_arrow.Rd | 5 ++-- r/man/read_parquet.Rd | 4 +-- r/man/write_csv_arrow.Rd | 2 +- r/man/write_feather.Rd | 2 +- r/man/write_ipc_stream.Rd | 2 +- r/man/write_parquet.Rd | 2 +- r/src/io.cpp | 57 ++++++++++++++++++++++++++++++--------- 14 files changed, 69 insertions(+), 33 deletions(-) diff --git a/r/R/csv.R b/r/R/csv.R index 03540006ca0a2..733547570391d 100644 --- a/r/R/csv.R +++ b/r/R/csv.R @@ -78,8 +78,9 @@ #' `col_names`, and the CSV file has a header row that would otherwise be used #' to identify column names, you'll need to add `skip = 1` to skip that row. #' -#' @param file A character file name or URI, literal data (either a single string or a [raw] vector), -#' an Arrow input stream, or a `FileSystem` with path (`SubTreeFileSystem`). +#' @param file A character file name or URI, connection, literal data (either a +#' single string or a [raw] vector), an Arrow input stream, or a `FileSystem` +#' with path (`SubTreeFileSystem`). #' #' If a file name, a memory-mapped Arrow [InputStream] will be opened and #' closed when finished; compression will be detected from the file extension @@ -894,7 +895,7 @@ readr_to_csv_convert_options <- function(na, #' Write CSV file to disk #' #' @param x `data.frame`, [RecordBatch], or [Table] -#' @param sink A string file path, URI, or [OutputStream], or path in a file +#' @param sink A string file path, connection, URI, or [OutputStream], or path in a file #' system (`SubTreeFileSystem`) #' @param file file name. Specify this or `sink`, not both. 
#' @param include_header Whether to write an initial header line with column names diff --git a/r/R/feather.R b/r/R/feather.R index 474fc6118e44f..aa08dfdbc96a5 100644 --- a/r/R/feather.R +++ b/r/R/feather.R @@ -29,7 +29,7 @@ #' [write_ipc_file()] can only write V2 files. #' #' @param x `data.frame`, [RecordBatch], or [Table] -#' @param sink A string file path, URI, or [OutputStream], or path in a file +#' @param sink A string file path, connection, URI, or [OutputStream], or path in a file #' system (`SubTreeFileSystem`) #' @param version integer Feather file version, Version 1 or Version 2. Version 2 is the default. #' @param chunk_size For V2 files, the number of rows that each chunk of data diff --git a/r/R/ipc-stream.R b/r/R/ipc-stream.R index 37ef0bbaf2126..26a61a790f936 100644 --- a/r/R/ipc-stream.R +++ b/r/R/ipc-stream.R @@ -82,8 +82,8 @@ write_to_raw <- function(x, format = c("stream", "file")) { #' a "stream" format and a "file" format, known as Feather. `read_ipc_stream()` #' and [read_feather()] read those formats, respectively. #' -#' @param file A character file name or URI, `raw` vector, an Arrow input stream, -#' or a `FileSystem` with path (`SubTreeFileSystem`). +#' @param file A character file name or URI, connection, `raw` vector, an +#' Arrow input stream, or a `FileSystem` with path (`SubTreeFileSystem`). #' If a file name or URI, an Arrow [InputStream] will be opened and #' closed when finished. If an input stream is provided, it will be left #' open. diff --git a/r/R/parquet.R b/r/R/parquet.R index d92e913cb5db3..0ee6c62601c1d 100644 --- a/r/R/parquet.R +++ b/r/R/parquet.R @@ -90,7 +90,7 @@ read_parquet <- function(file, #' article} for examples of this. #' #' @param x `data.frame`, [RecordBatch], or [Table] -#' @param sink A string file path, URI, or [OutputStream], or path in a file +#' @param sink A string file path, connection, URI, or [OutputStream], or path in a file #' system (`SubTreeFileSystem`) #' @param chunk_size how many rows of data to write to disk at once. This #' directly corresponds to how many rows will be in each row group in diff --git a/r/man/read_delim_arrow.Rd b/r/man/read_delim_arrow.Rd index b56d445c9e2e3..f946785e4a41e 100644 --- a/r/man/read_delim_arrow.Rd +++ b/r/man/read_delim_arrow.Rd @@ -90,8 +90,9 @@ read_tsv_arrow( ) } \arguments{ -\item{file}{A character file name or URI, literal data (either a single string or a \link{raw} vector), -an Arrow input stream, or a \code{FileSystem} with path (\code{SubTreeFileSystem}). +\item{file}{A character file name or URI, connection, literal data (either a +single string or a \link{raw} vector), an Arrow input stream, or a \code{FileSystem} +with path (\code{SubTreeFileSystem}). If a file name, a memory-mapped Arrow \link{InputStream} will be opened and closed when finished; compression will be detected from the file extension diff --git a/r/man/read_feather.Rd b/r/man/read_feather.Rd index c3b4a54158c7f..95661d9778576 100644 --- a/r/man/read_feather.Rd +++ b/r/man/read_feather.Rd @@ -10,8 +10,8 @@ read_feather(file, col_select = NULL, as_data_frame = TRUE, mmap = TRUE) read_ipc_file(file, col_select = NULL, as_data_frame = TRUE, mmap = TRUE) } \arguments{ -\item{file}{A character file name or URI, \code{raw} vector, an Arrow input stream, -or a \code{FileSystem} with path (\code{SubTreeFileSystem}). +\item{file}{A character file name or URI, connection, \code{raw} vector, an +Arrow input stream, or a \code{FileSystem} with path (\code{SubTreeFileSystem}). 
If a file name or URI, an Arrow \link{InputStream} will be opened and closed when finished. If an input stream is provided, it will be left open.} diff --git a/r/man/read_ipc_stream.Rd b/r/man/read_ipc_stream.Rd index db930b52bde18..49d3949bfcf22 100644 --- a/r/man/read_ipc_stream.Rd +++ b/r/man/read_ipc_stream.Rd @@ -7,8 +7,8 @@ read_ipc_stream(file, as_data_frame = TRUE, ...) } \arguments{ -\item{file}{A character file name or URI, \code{raw} vector, an Arrow input stream, -or a \code{FileSystem} with path (\code{SubTreeFileSystem}). +\item{file}{A character file name or URI, connection, \code{raw} vector, an +Arrow input stream, or a \code{FileSystem} with path (\code{SubTreeFileSystem}). If a file name or URI, an Arrow \link{InputStream} will be opened and closed when finished. If an input stream is provided, it will be left open.} diff --git a/r/man/read_json_arrow.Rd b/r/man/read_json_arrow.Rd index 9230a9a017495..f289d3356e559 100644 --- a/r/man/read_json_arrow.Rd +++ b/r/man/read_json_arrow.Rd @@ -13,8 +13,9 @@ read_json_arrow( ) } \arguments{ -\item{file}{A character file name or URI, literal data (either a single string or a \link{raw} vector), -an Arrow input stream, or a \code{FileSystem} with path (\code{SubTreeFileSystem}). +\item{file}{A character file name or URI, connection, literal data (either a +single string or a \link{raw} vector), an Arrow input stream, or a \code{FileSystem} +with path (\code{SubTreeFileSystem}). If a file name, a memory-mapped Arrow \link{InputStream} will be opened and closed when finished; compression will be detected from the file extension diff --git a/r/man/read_parquet.Rd b/r/man/read_parquet.Rd index 4f1936529531a..95ee4ac5a8ceb 100644 --- a/r/man/read_parquet.Rd +++ b/r/man/read_parquet.Rd @@ -14,8 +14,8 @@ read_parquet( ) } \arguments{ -\item{file}{A character file name or URI, \code{raw} vector, an Arrow input stream, -or a \code{FileSystem} with path (\code{SubTreeFileSystem}). +\item{file}{A character file name or URI, connection, \code{raw} vector, an +Arrow input stream, or a \code{FileSystem} with path (\code{SubTreeFileSystem}). If a file name or URI, an Arrow \link{InputStream} will be opened and closed when finished. If an input stream is provided, it will be left open.} diff --git a/r/man/write_csv_arrow.Rd b/r/man/write_csv_arrow.Rd index 9fcca49fadc05..9f9fde74cc996 100644 --- a/r/man/write_csv_arrow.Rd +++ b/r/man/write_csv_arrow.Rd @@ -19,7 +19,7 @@ write_csv_arrow( \arguments{ \item{x}{\code{data.frame}, \link{RecordBatch}, or \link{Table}} -\item{sink}{A string file path, URI, or \link{OutputStream}, or path in a file +\item{sink}{A string file path, connection, URI, or \link{OutputStream}, or path in a file system (\code{SubTreeFileSystem})} \item{file}{file name. Specify this or \code{sink}, not both.} diff --git a/r/man/write_feather.Rd b/r/man/write_feather.Rd index 0d3a7da3b90b4..823bd2224eac2 100644 --- a/r/man/write_feather.Rd +++ b/r/man/write_feather.Rd @@ -25,7 +25,7 @@ write_ipc_file( \arguments{ \item{x}{\code{data.frame}, \link{RecordBatch}, or \link{Table}} -\item{sink}{A string file path, URI, or \link{OutputStream}, or path in a file +\item{sink}{A string file path, connection, URI, or \link{OutputStream}, or path in a file system (\code{SubTreeFileSystem})} \item{version}{integer Feather file version, Version 1 or Version 2. 
Version 2 is the default.} diff --git a/r/man/write_ipc_stream.Rd b/r/man/write_ipc_stream.Rd index 094e3ad11a0c8..da9bb6bcacb45 100644 --- a/r/man/write_ipc_stream.Rd +++ b/r/man/write_ipc_stream.Rd @@ -9,7 +9,7 @@ write_ipc_stream(x, sink, ...) \arguments{ \item{x}{\code{data.frame}, \link{RecordBatch}, or \link{Table}} -\item{sink}{A string file path, URI, or \link{OutputStream}, or path in a file +\item{sink}{A string file path, connection, URI, or \link{OutputStream}, or path in a file system (\code{SubTreeFileSystem})} \item{...}{extra parameters passed to \code{write_feather()}.} diff --git a/r/man/write_parquet.Rd b/r/man/write_parquet.Rd index 480abb12fcf4a..954c692dad2f1 100644 --- a/r/man/write_parquet.Rd +++ b/r/man/write_parquet.Rd @@ -22,7 +22,7 @@ write_parquet( \arguments{ \item{x}{\code{data.frame}, \link{RecordBatch}, or \link{Table}} -\item{sink}{A string file path, URI, or \link{OutputStream}, or path in a file +\item{sink}{A string file path, connection, URI, or \link{OutputStream}, or path in a file system (\code{SubTreeFileSystem})} \item{chunk_size}{how many rows of data to write to disk at once. This diff --git a/r/src/io.cpp b/r/src/io.cpp index 4d5ee31794ae8..2f36d51dcd7ab 100644 --- a/r/src/io.cpp +++ b/r/src/io.cpp @@ -212,11 +212,16 @@ void io___BufferOutputStream__Write( class RConnectionFileInterface : public virtual arrow::io::FileInterface { public: explicit RConnectionFileInterface(cpp11::sexp connection_sexp) - : connection_sexp_(connection_sexp), closed_(false) { + : connection_sexp_(connection_sexp), + closed_(false), + seekable_(false), + bytes_written_(0), + bytes_read_(0) { check_closed(); + seekable_ = check_seekable(); } - arrow::Status Close() { + arrow::Status Close() override { if (closed_) { return arrow::Status::OK(); } @@ -227,11 +232,21 @@ class RConnectionFileInterface : public virtual arrow::io::FileInterface { "close() on R connection"); } - arrow::Result Tell() const { + arrow::Result Tell() const override { if (closed()) { return arrow::Status::IOError("R connection is closed"); } + // R connections use seek() with no additional arguments as a tell() + // implementation; however, non-seekable connections will error if you + // do this. This heuristic allows Tell() to return a reasonable value + // (used by at least the IPC writer). 
+ if (!seekable_ && bytes_written_ > 0) { + return bytes_written_; + } else if (!seekable_) { + return bytes_read_; + } + return SafeCallIntoR( [&]() { cpp11::sexp result = cpp11::package("base")["seek"](connection_sexp_); @@ -240,7 +255,7 @@ class RConnectionFileInterface : public virtual arrow::io::FileInterface { "tell() on R connection"); } - bool closed() const { return closed_; } + bool closed() const override { return closed_; } protected: cpp11::sexp connection_sexp_; @@ -261,13 +276,14 @@ class RConnectionFileInterface : public virtual arrow::io::FileInterface { return SafeCallIntoR( [&] { cpp11::function read_bin = cpp11::package("base")["readBin"]; - cpp11::writable::raws ptype((R_xlen_t)0); + cpp11::writable::raws ptype(static_cast(0)); cpp11::integers n = cpp11::as_sexp(static_cast(nbytes)); cpp11::sexp result = read_bin(connection_sexp_, ptype, n); int64_t result_size = cpp11::safe[Rf_xlength](result); memcpy(out, cpp11::safe[RAW](result), result_size); + bytes_read_++; return result_size; }, "readBin() on R connection"); @@ -294,6 +310,7 @@ class RConnectionFileInterface : public virtual arrow::io::FileInterface { cpp11::function write_bin = cpp11::package("base")["writeBin"]; write_bin(data_raw, connection_sexp_); + bytes_written_ += nbytes; }, "writeBin() on R connection"); } @@ -312,6 +329,9 @@ class RConnectionFileInterface : public virtual arrow::io::FileInterface { private: bool closed_; + bool seekable_; + int64_t bytes_written_; + int64_t bytes_read_; bool check_closed() { if (closed_) { @@ -333,6 +353,15 @@ class RConnectionFileInterface : public virtual arrow::io::FileInterface { return closed_; } + + bool check_seekable() { + auto is_seekable_result = SafeCallIntoR([&] { + cpp11::sexp result = cpp11::package("base")["isSeekable"](connection_sexp_); + return cpp11::as_cpp(result); + }); + + return is_seekable_result.ok() && *is_seekable_result; + } }; class RConnectionInputStream : public virtual arrow::io::InputStream, @@ -341,9 +370,11 @@ class RConnectionInputStream : public virtual arrow::io::InputStream, explicit RConnectionInputStream(cpp11::sexp connection_sexp) : RConnectionFileInterface(connection_sexp) {} - arrow::Result Read(int64_t nbytes, void* out) { return ReadBase(nbytes, out); } + arrow::Result Read(int64_t nbytes, void* out) override { + return ReadBase(nbytes, out); + } - arrow::Result> Read(int64_t nbytes) { + arrow::Result> Read(int64_t nbytes) override { return ReadBase(nbytes); } }; @@ -373,13 +404,15 @@ class RConnectionRandomAccessFile : public arrow::io::RandomAccessFile, } } - arrow::Result GetSize() { return size_; } + arrow::Result GetSize() override { return size_; } - arrow::Status Seek(int64_t pos) { return SeekBase(pos); } + arrow::Status Seek(int64_t pos) override { return SeekBase(pos); } - arrow::Result Read(int64_t nbytes, void* out) { return ReadBase(nbytes, out); } + arrow::Result Read(int64_t nbytes, void* out) override { + return ReadBase(nbytes, out); + } - arrow::Result> Read(int64_t nbytes) { + arrow::Result> Read(int64_t nbytes) override { return ReadBase(nbytes); } @@ -393,7 +426,7 @@ class RConnectionOutputStream : public arrow::io::OutputStream, explicit RConnectionOutputStream(cpp11::sexp connection_sexp) : RConnectionFileInterface(connection_sexp) {} - arrow::Status Write(const void* data, int64_t nbytes) { + arrow::Status Write(const void* data, int64_t nbytes) override { return WriteBase(data, nbytes); } }; From 8c09c7f1816c03e82761daa44159ebe9c21742eb Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: 
Wed, 3 Apr 2024 15:02:13 -0300 Subject: [PATCH 50/81] GH-40983: [C++] Fix unused function build error (#40984) ### Rationale for this change Make sure local `CompressedInputArguments` is only defined when it's being used. ### What changes are included in this PR? Wrapping definition and usage under the same `#ifdef`. ### Are these changes tested? By building with `-Werror` and running all existing tests. * GitHub Issue: #40983 Authored-by: Felipe Oliveira Carvalho Signed-off-by: mwish --- cpp/src/arrow/io/compressed_benchmark.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/io/compressed_benchmark.cc b/cpp/src/arrow/io/compressed_benchmark.cc index 52a30d8cb0887..ae9ecd57143f3 100644 --- a/cpp/src/arrow/io/compressed_benchmark.cc +++ b/cpp/src/arrow/io/compressed_benchmark.cc @@ -170,6 +170,7 @@ static void CompressedInputStreamNonZeroCopyBufferReturnedByCallee( BufferReadMode::ReturnedByCallee>(state, kCompression); } +#ifdef ARROW_WITH_LZ4 static void CompressedInputArguments(::benchmark::internal::Benchmark* b) { b->ArgNames({"num_bytes", "batch_size"}) ->Args({8 * 1024, 8 * 1024}) @@ -180,7 +181,6 @@ static void CompressedInputArguments(::benchmark::internal::Benchmark* b) { ->Args({1024 * 1024, 1024 * 1024}); } -#ifdef ARROW_WITH_LZ4 // Benchmark LZ4 because it's lightweight, which makes benchmarking focused on the // overhead of the compression input stream. BENCHMARK_TEMPLATE(CompressedInputStreamZeroCopyBufferProvidedByCaller, From a4acb643437af2323f683e51d6043907fed496a9 Mon Sep 17 00:00:00 2001 From: mwish Date: Thu, 4 Apr 2024 03:57:41 +0800 Subject: [PATCH 51/81] GH-40872: [C++][Parquet] Encoding: Optimize DecodeArrow/Decode(bitmap) for PlainBooleanDecoder (#40876) ### Rationale for this change This is for enhance boolean decoding. I optimized the `DecodeArrow` for PlainBoolean ### What changes are included in this PR? Optimize DecodeArrow/Decode(bitmap) for PlainBooleanDecoder, and add benchmarks ### Are these changes tested? Yes ### Are there any user-facing changes? Minor optimization. And `Decode` boolean will change the syntax * GitHub Issue: #40872 Lead-authored-by: mwish Co-authored-by: Antoine Pitrou Signed-off-by: mwish --- cpp/src/parquet/column_reader.cc | 3 +- cpp/src/parquet/encoding.cc | 79 ++++++--- cpp/src/parquet/encoding.h | 4 +- cpp/src/parquet/encoding_benchmark.cc | 236 ++++++++++++++++++++++---- 4 files changed, 261 insertions(+), 61 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 3fb224154c4ec..af489c70a5233 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -1316,7 +1316,8 @@ class TypedRecordReader : public TypedColumnReaderImpl, levels_position_ = 0; levels_capacity_ = 0; read_dense_for_nullable_ = read_dense_for_nullable; - uses_values_ = !(descr->physical_type() == Type::BYTE_ARRAY); + // BYTE_ARRAY values are not stored in the `values_` buffer. 
+ uses_values_ = descr->physical_type() != Type::BYTE_ARRAY; if (uses_values_) { values_ = AllocateBuffer(pool); diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 3eed88f08b22a..f16e9b34fc682 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -55,6 +55,7 @@ namespace bit_util = arrow::bit_util; using arrow::Status; using arrow::VisitNullBitmapInline; using arrow::internal::AddWithOverflow; +using arrow::internal::BitBlockCounter; using arrow::internal::checked_cast; using arrow::internal::MultiplyWithOverflow; using arrow::internal::SafeSignedSubtract; @@ -1173,13 +1174,15 @@ class PlainBooleanDecoder : public DecoderImpl, virtual public BooleanDecoder { private: std::unique_ptr<::arrow::bit_util::BitReader> bit_reader_; + int total_num_values_{0}; }; PlainBooleanDecoder::PlainBooleanDecoder(const ColumnDescriptor* descr) : DecoderImpl(descr, Encoding::PLAIN) {} void PlainBooleanDecoder::SetData(int num_values, const uint8_t* data, int len) { - num_values_ = num_values; + DecoderImpl::SetData(num_values, data, len); + total_num_values_ = num_values; bit_reader_ = std::make_unique(data, len); } @@ -1188,19 +1191,52 @@ int PlainBooleanDecoder::DecodeArrow( typename EncodingTraits::Accumulator* builder) { int values_decoded = num_values - null_count; if (ARROW_PREDICT_FALSE(num_values_ < values_decoded)) { + // A too large `num_values` was requested. + ParquetException::EofException(); + } + if (ARROW_PREDICT_FALSE(!bit_reader_->Advance(values_decoded))) { ParquetException::EofException(); } - PARQUET_THROW_NOT_OK(builder->Reserve(num_values)); - - VisitNullBitmapInline( - valid_bits, valid_bits_offset, num_values, null_count, - [&]() { - bool value; - ARROW_IGNORE_EXPR(bit_reader_->GetValue(1, &value)); - builder->UnsafeAppend(value); - }, - [&]() { builder->UnsafeAppendNull(); }); + if (null_count == 0) { + // FastPath: can copy the data directly + PARQUET_THROW_NOT_OK(builder->AppendValues(data_, values_decoded, NULLPTR, + total_num_values_ - num_values_)); + } else { + // Handle nulls by BitBlockCounter + PARQUET_THROW_NOT_OK(builder->Reserve(num_values)); + BitBlockCounter bit_counter(valid_bits, valid_bits_offset, num_values); + int64_t value_position = 0; + int64_t valid_bits_offset_position = valid_bits_offset; + int64_t previous_value_offset = 0; + while (value_position < num_values) { + auto block = bit_counter.NextWord(); + if (block.AllSet()) { + // GH-40978: We don't have UnsafeAppendValues for booleans currently, + // so using `AppendValues` here. + PARQUET_THROW_NOT_OK( + builder->AppendValues(data_, block.length, NULLPTR, previous_value_offset)); + previous_value_offset += block.length; + } else if (block.NoneSet()) { + // GH-40978: We don't have UnsafeAppendNulls for booleans currently, + // so using `AppendNulls` here. 
+ PARQUET_THROW_NOT_OK(builder->AppendNulls(block.length)); + } else { + for (int64_t i = 0; i < block.length; ++i) { + if (bit_util::GetBit(valid_bits, valid_bits_offset_position + i)) { + bool value = bit_util::GetBit( + data_, total_num_values_ - num_values_ + previous_value_offset); + builder->UnsafeAppend(value); + previous_value_offset += 1; + } else { + builder->UnsafeAppendNull(); + } + } + } + value_position += block.length; + valid_bits_offset_position += block.length; + } + } num_values_ -= values_decoded; return values_decoded; @@ -1214,18 +1250,15 @@ inline int PlainBooleanDecoder::DecodeArrow( int PlainBooleanDecoder::Decode(uint8_t* buffer, int max_values) { max_values = std::min(max_values, num_values_); - bool val; - ::arrow::internal::BitmapWriter bit_writer(buffer, 0, max_values); - for (int i = 0; i < max_values; ++i) { - if (!bit_reader_->GetValue(1, &val)) { - ParquetException::EofException(); - } - if (val) { - bit_writer.Set(); - } - bit_writer.Next(); + if (ARROW_PREDICT_FALSE(!bit_reader_->Advance(max_values))) { + ParquetException::EofException(); } - bit_writer.Finish(); + // Copy the data directly + // Parquet's boolean encoding is bit-packed using LSB. So + // we can directly copy the data to the buffer. + ::arrow::internal::CopyBitmap(this->data_, /*offset=*/total_num_values_ - num_values_, + /*length=*/max_values, /*dest=*/buffer, + /*dest_offset=*/0); num_values_ -= max_values; return max_values; } @@ -1692,7 +1725,7 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { } protected: - Status IndexInBounds(int32_t index) { + Status IndexInBounds(int32_t index) const { if (ARROW_PREDICT_TRUE(0 <= index && index < dictionary_length_)) { return Status::OK(); } diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h index de47bb7deb839..602009189595e 100644 --- a/cpp/src/parquet/encoding.h +++ b/cpp/src/parquet/encoding.h @@ -400,7 +400,9 @@ class BooleanDecoder : virtual public TypedDecoder { /// \brief Decode and bit-pack values into a buffer /// /// \param[in] buffer destination for decoded values - /// This buffer will contain bit-packed values. + /// This buffer will contain bit-packed values. If + /// max_values is not a multiple of 8, the trailing bits + /// of the last byte will be undefined. /// \param[in] max_values max values to decode. /// \return The number of values decoded. Should be identical to max_values except /// at the end of the current data page. 
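The doc comment above spells out the bit-packed output contract of `BooleanDecoder::Decode`. As a minimal sketch of what that contract means for a caller (an illustration only, not part of the patch; it assumes Arrow's public `bit_util` helpers), the output is read as an LSB-ordered bitmap and any trailing bits of the last byte are simply never consulted:

```cpp
// Sketch: unpack the first `num_decoded` booleans from the bit-packed buffer
// filled by BooleanDecoder::Decode(). When num_decoded is not a multiple of 8,
// the trailing bits of the last byte are undefined, so they are never read.
#include <cstdint>
#include <vector>

#include "arrow/util/bit_util.h"

std::vector<bool> UnpackDecodedBooleans(const uint8_t* buffer, int num_decoded) {
  std::vector<bool> out;
  out.reserve(num_decoded);
  for (int i = 0; i < num_decoded; ++i) {
    out.push_back(arrow::bit_util::GetBit(buffer, i));
  }
  return out;
}
```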
diff --git a/cpp/src/parquet/encoding_benchmark.cc b/cpp/src/parquet/encoding_benchmark.cc index 61959b659f633..9c07d262b350e 100644 --- a/cpp/src/parquet/encoding_benchmark.cc +++ b/cpp/src/parquet/encoding_benchmark.cc @@ -66,6 +66,7 @@ static void BM_PlainEncodingBoolean(benchmark::State& state) { typed_encoder->FlushValues(); } state.SetBytesProcessed(state.iterations() * state.range(0) * sizeof(bool)); + state.SetItemsProcessed(state.iterations() * state.range(0)); } BENCHMARK(BM_PlainEncodingBoolean)->Range(MIN_RANGE, MAX_RANGE); @@ -86,11 +87,34 @@ static void BM_PlainDecodingBoolean(benchmark::State& state) { } state.SetBytesProcessed(state.iterations() * state.range(0) * sizeof(bool)); + state.SetItemsProcessed(state.iterations() * state.range(0)); delete[] output; } BENCHMARK(BM_PlainDecodingBoolean)->Range(MIN_RANGE, MAX_RANGE); +static void BM_PlainDecodingBooleanToBitmap(benchmark::State& state) { + std::vector values(state.range(0), true); + int64_t bitmap_bytes = ::arrow::bit_util::BytesForBits(state.range(0)); + std::vector output(bitmap_bytes, 0); + auto encoder = MakeEncoder(Type::BOOLEAN, Encoding::PLAIN); + auto typed_encoder = dynamic_cast(encoder.get()); + typed_encoder->Put(values, static_cast(values.size())); + std::shared_ptr buf = encoder->FlushValues(); + + for (auto _ : state) { + auto decoder = MakeTypedDecoder(Encoding::PLAIN); + decoder->SetData(static_cast(values.size()), buf->data(), + static_cast(buf->size())); + decoder->Decode(output.data(), static_cast(values.size())); + } + // Still set `BytesProcessed` to byte level. + state.SetBytesProcessed(state.iterations() * bitmap_bytes); + state.SetItemsProcessed(state.iterations() * state.range(0)); +} + +BENCHMARK(BM_PlainDecodingBooleanToBitmap)->Range(MIN_RANGE, MAX_RANGE); + static void BM_PlainEncodingInt64(benchmark::State& state) { std::vector values(state.range(0), 64); auto encoder = MakeTypedEncoder(Encoding::PLAIN); @@ -1097,8 +1121,11 @@ BENCHMARK(BM_DictDecodingByteArray)->Apply(ByteArrayCustomArguments); using ::arrow::BinaryBuilder; using ::arrow::BinaryDictionary32Builder; -class BenchmarkDecodeArrow : public ::benchmark::Fixture { +template +class BenchmarkDecodeArrowBase : public ::benchmark::Fixture { public: + virtual ~BenchmarkDecodeArrowBase() = default; + void SetUp(const ::benchmark::State& state) override { num_values_ = static_cast(state.range()); InitDataInputs(); @@ -1111,37 +1138,18 @@ class BenchmarkDecodeArrow : public ::benchmark::Fixture { values_.clear(); } - void InitDataInputs() { - // Generate a random string dictionary without any nulls so that this dataset can - // be used for benchmarking the DecodeArrowNonNull API - constexpr int repeat_factor = 8; - constexpr int64_t min_length = 2; - constexpr int64_t max_length = 10; - ::arrow::random::RandomArrayGenerator rag(0); - input_array_ = rag.StringWithRepeats(num_values_, num_values_ / repeat_factor, - min_length, max_length, /*null_probability=*/0); - valid_bits_ = input_array_->null_bitmap_data(); - total_size_ = input_array_->data()->buffers[2]->size(); - - values_.reserve(num_values_); - const auto& binary_array = static_cast(*input_array_); - for (int64_t i = 0; i < binary_array.length(); i++) { - auto view = binary_array.GetView(i); - values_.emplace_back(static_cast(view.length()), - reinterpret_cast(view.data())); - } - } - + virtual void InitDataInputs() = 0; virtual void DoEncodeArrow() = 0; virtual void DoEncodeLowLevel() = 0; - - virtual std::unique_ptr InitializeDecoder() = 0; + virtual std::unique_ptr> 
InitializeDecoder() = 0; + virtual typename EncodingTraits::Accumulator CreateAccumulator() = 0; void EncodeArrowBenchmark(benchmark::State& state) { for (auto _ : state) { DoEncodeArrow(); } state.SetBytesProcessed(state.iterations() * total_size_); + state.SetItemsProcessed(state.iterations() * num_values_); } void EncodeLowLevelBenchmark(benchmark::State& state) { @@ -1149,26 +1157,27 @@ class BenchmarkDecodeArrow : public ::benchmark::Fixture { DoEncodeLowLevel(); } state.SetBytesProcessed(state.iterations() * total_size_); + state.SetItemsProcessed(state.iterations() * num_values_); } void DecodeArrowDenseBenchmark(benchmark::State& state) { for (auto _ : state) { auto decoder = InitializeDecoder(); - typename EncodingTraits::Accumulator acc; - acc.builder.reset(new BinaryBuilder); + auto acc = CreateAccumulator(); decoder->DecodeArrow(num_values_, 0, valid_bits_, 0, &acc); } state.SetBytesProcessed(state.iterations() * total_size_); + state.SetItemsProcessed(state.iterations() * num_values_); } void DecodeArrowNonNullDenseBenchmark(benchmark::State& state) { for (auto _ : state) { auto decoder = InitializeDecoder(); - typename EncodingTraits::Accumulator acc; - acc.builder.reset(new BinaryBuilder); + auto acc = CreateAccumulator(); decoder->DecodeArrowNonNull(num_values_, &acc); } state.SetBytesProcessed(state.iterations() * total_size_); + state.SetItemsProcessed(state.iterations() * num_values_); } void DecodeArrowDictBenchmark(benchmark::State& state) { @@ -1179,6 +1188,7 @@ class BenchmarkDecodeArrow : public ::benchmark::Fixture { } state.SetBytesProcessed(state.iterations() * total_size_); + state.SetItemsProcessed(state.iterations() * num_values_); } void DecodeArrowNonNullDictBenchmark(benchmark::State& state) { @@ -1189,20 +1199,56 @@ class BenchmarkDecodeArrow : public ::benchmark::Fixture { } state.SetBytesProcessed(state.iterations() * total_size_); + state.SetItemsProcessed(state.iterations() * num_values_); } protected: - int num_values_; + int num_values_{0}; std::shared_ptr<::arrow::Array> input_array_; - std::vector values_; - uint64_t total_size_; - const uint8_t* valid_bits_; + uint64_t total_size_{0}; + const uint8_t* valid_bits_{nullptr}; std::shared_ptr buffer_; + std::vector values_; +}; + +class BenchmarkDecodeArrowByteArray : public BenchmarkDecodeArrowBase { + public: + using ByteArrayAccumulator = typename EncodingTraits::Accumulator; + + ByteArrayAccumulator CreateAccumulator() final { + ByteArrayAccumulator acc; + acc.builder = std::make_unique(default_memory_pool()); + return acc; + } + + void InitDataInputs() final { + // Generate a random string dictionary without any nulls so that this dataset can + // be used for benchmarking the DecodeArrowNonNull API + constexpr int repeat_factor = 8; + constexpr int64_t min_length = 2; + constexpr int64_t max_length = 10; + ::arrow::random::RandomArrayGenerator rag(0); + input_array_ = rag.StringWithRepeats(num_values_, num_values_ / repeat_factor, + min_length, max_length, /*null_probability=*/0); + valid_bits_ = input_array_->null_bitmap_data(); + total_size_ = input_array_->data()->buffers[2]->size(); + + values_.reserve(num_values_); + const auto& binary_array = static_cast(*input_array_); + for (int64_t i = 0; i < binary_array.length(); i++) { + auto view = binary_array.GetView(i); + values_.emplace_back(static_cast(view.length()), + reinterpret_cast(view.data())); + } + } + + protected: + std::vector values_; }; // ---------------------------------------------------------------------- // Benchmark Decoding 
from Plain Encoding -class BM_ArrowBinaryPlain : public BenchmarkDecodeArrow { +class BM_ArrowBinaryPlain : public BenchmarkDecodeArrowByteArray { public: void DoEncodeArrow() override { auto encoder = MakeTypedEncoder(Encoding::PLAIN); @@ -1251,7 +1297,7 @@ BENCHMARK_REGISTER_F(BM_ArrowBinaryPlain, DecodeArrowNonNull_Dict) // ---------------------------------------------------------------------- // Benchmark Decoding from Dictionary Encoding -class BM_ArrowBinaryDict : public BenchmarkDecodeArrow { +class BM_ArrowBinaryDict : public BenchmarkDecodeArrowByteArray { public: template void DoEncode(PutValuesFunc&& put_values) { @@ -1319,7 +1365,7 @@ class BM_ArrowBinaryDict : public BenchmarkDecodeArrow { } void TearDown(const ::benchmark::State& state) override { - BenchmarkDecodeArrow::TearDown(state); + BenchmarkDecodeArrowByteArray::TearDown(state); dict_buffer_.reset(); descr_.reset(); } @@ -1327,7 +1373,7 @@ class BM_ArrowBinaryDict : public BenchmarkDecodeArrow { protected: std::unique_ptr descr_; std::shared_ptr dict_buffer_; - int num_dict_entries_; + int num_dict_entries_{0}; }; BENCHMARK_DEFINE_F(BM_ArrowBinaryDict, EncodeArrow) @@ -1373,4 +1419,122 @@ BENCHMARK_DEFINE_F(BM_ArrowBinaryDict, DecodeArrowNonNull_Dict) BENCHMARK_REGISTER_F(BM_ArrowBinaryDict, DecodeArrowNonNull_Dict) ->Range(MIN_RANGE, MAX_RANGE); +class BenchmarkDecodeArrowBoolean : public BenchmarkDecodeArrowBase { + public: + void InitDataInputs() final { + // Generate a random boolean array with `null_probability_`. + ::arrow::random::RandomArrayGenerator rag(0); + input_array_ = rag.Boolean(num_values_, /*true_probability=*/0.5, null_probability_); + valid_bits_ = input_array_->null_bitmap_data(); + + // Arrow uses a bitmap representation for boolean arrays, + // so, we uses this as "total_size" for the benchmark. 
+ total_size_ = ::arrow::bit_util::BytesForBits(num_values_); + + values_.reserve(num_values_); + const auto& boolean_array = static_cast(*input_array_); + for (int64_t i = 0; i < boolean_array.length(); i++) { + values_.push_back(boolean_array.Value(i)); + } + } + + typename EncodingTraits::Accumulator CreateAccumulator() final { + return typename EncodingTraits::Accumulator(); + } + + void DoEncodeLowLevel() final { ParquetException::NYI(); } + + void DecodeArrowWithNullDenseBenchmark(benchmark::State& state); + + protected: + void DoEncodeArrowImpl(Encoding::type encoding) { + auto encoder = MakeTypedEncoder(encoding); + encoder->Put(*input_array_); + buffer_ = encoder->FlushValues(); + } + + std::unique_ptr> InitializeDecoderImpl( + Encoding::type encoding) const { + auto decoder = MakeTypedDecoder(encoding); + decoder->SetData(num_values_, buffer_->data(), static_cast(buffer_->size())); + return decoder; + } + + protected: + double null_probability_ = 0.0; +}; + +void BenchmarkDecodeArrowBoolean::DecodeArrowWithNullDenseBenchmark( + benchmark::State& state) { + // Change null_probability + null_probability_ = static_cast(state.range(1)) / 10000; + InitDataInputs(); + this->DoEncodeArrow(); + int num_values_with_nulls = this->num_values_; + + for (auto _ : state) { + auto decoder = this->InitializeDecoder(); + auto acc = this->CreateAccumulator(); + decoder->DecodeArrow( + num_values_with_nulls, + /*null_count=*/static_cast(this->input_array_->null_count()), + this->valid_bits_, 0, &acc); + } + state.SetBytesProcessed(state.iterations() * static_cast(total_size_)); + state.SetItemsProcessed(state.iterations() * state.range(0)); +} + +class BM_DecodeArrowBooleanPlain : public BenchmarkDecodeArrowBoolean { + public: + void DoEncodeArrow() final { DoEncodeArrowImpl(Encoding::PLAIN); } + + std::unique_ptr> InitializeDecoder() override { + return InitializeDecoderImpl(Encoding::PLAIN); + } +}; + +class BM_DecodeArrowBooleanRle : public BenchmarkDecodeArrowBoolean { + public: + void DoEncodeArrow() final { DoEncodeArrowImpl(Encoding::RLE); } + + std::unique_ptr> InitializeDecoder() override { + return InitializeDecoderImpl(Encoding::RLE); + } +}; + +static void BooleanWithNullCustomArguments(benchmark::internal::Benchmark* b) { + b->ArgsProduct({ + benchmark::CreateRange(MIN_RANGE, MAX_RANGE, /*multi=*/4), + {1, 100, 1000, 5000, 10000}, + }) + ->ArgNames({"num_values", "null_in_ten_thousand"}); +} + +BENCHMARK_DEFINE_F(BM_DecodeArrowBooleanRle, DecodeArrow)(benchmark::State& state) { + DecodeArrowDenseBenchmark(state); +} +BENCHMARK_REGISTER_F(BM_DecodeArrowBooleanRle, DecodeArrow)->Range(MIN_RANGE, MAX_RANGE); +BENCHMARK_DEFINE_F(BM_DecodeArrowBooleanRle, DecodeArrowNonNull) +(benchmark::State& state) { DecodeArrowNonNullDenseBenchmark(state); } +BENCHMARK_REGISTER_F(BM_DecodeArrowBooleanRle, DecodeArrowNonNull) + ->Range(MIN_RANGE, MAX_RANGE); +// TODO(mwish): RleBoolean not implemented DecodeArrow with null slots yet. 
+// BENCHMARK_DEFINE_F(BM_DecodeArrowBooleanRle, DecodeArrowWithNull) +//(benchmark::State& state) { DecodeArrowWithNullDenseBenchmark(state); } +// BENCHMARK_REGISTER_F(BM_DecodeArrowBooleanRle, DecodeArrowWithNull) +// ->Apply(BooleanWithNullCustomArguments); + +BENCHMARK_DEFINE_F(BM_DecodeArrowBooleanPlain, DecodeArrow) +(benchmark::State& state) { DecodeArrowDenseBenchmark(state); } +BENCHMARK_REGISTER_F(BM_DecodeArrowBooleanPlain, DecodeArrow) + ->Range(MIN_RANGE, MAX_RANGE); +BENCHMARK_DEFINE_F(BM_DecodeArrowBooleanPlain, DecodeArrowNonNull) +(benchmark::State& state) { DecodeArrowNonNullDenseBenchmark(state); } +BENCHMARK_REGISTER_F(BM_DecodeArrowBooleanPlain, DecodeArrowNonNull) + ->Range(MIN_RANGE, MAX_RANGE); +BENCHMARK_DEFINE_F(BM_DecodeArrowBooleanPlain, DecodeArrowWithNull) +(benchmark::State& state) { DecodeArrowWithNullDenseBenchmark(state); } +BENCHMARK_REGISTER_F(BM_DecodeArrowBooleanPlain, DecodeArrowWithNull) + ->Apply(BooleanWithNullCustomArguments); + } // namespace parquet From a5f80184750fb09fd232f2acbe5558b03b45a156 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 4 Apr 2024 06:41:04 +0900 Subject: [PATCH 52/81] GH-40961: [GLib] Suppress warnings for Vala examples on macOS (#40962) ### Rationale for this change There are some warnings for Vala examples on macOS: ```text FAILED: example/vala/read-file.p/meson-generated_read-file.c.o ccache cc -Iexample/vala/read-file.p -Iexample/vala -I../../c_glib/example/vala -I/Users/runner/work/arrow/arrow/build/c_glib -I/Users/runner/work/arrow/arrow/c_glib -Iarrow-glib -I../../c_glib/arrow-glib -I/usr/local/Cellar/glib/2.80.0_2/include -I/usr/local/Cellar/glib/2.80.0_2/include/glib-2.0 -I/usr/local/Cellar/glib/2.80.0_2/lib/glib-2.0/include -I/usr/local/opt/gettext/include -I/usr/local/Cellar/pcre2/10.43/include -I/Library/Developer/CommandLineTools/SDKs/MacOSX12.sdk/usr/include/ffi -fdiagnostics-color=always -Wall -Winvalid-pch -Werror -std=c99 -O0 -g -DARROW_NO_DEPRECATED_API -MD -MQ example/vala/read-file.p/meson-generated_read-file.c.o -MF example/vala/read-file.p/meson-generated_read-file.c.o.d -o example/vala/read-file.p/meson-generated_read-file.c.o -c example/vala/read-file.p/read-file.c ../../c_glib/example/vala/read-file.vala:123:61: error: format specifies type 'long long' but the argument has type 'gint' (aka 'int') [-Werror,-Wformat] fprintf (_tmp2_, "columns[%" G_GINT64_FORMAT "](%s): ", nth_column, _tmp3_); ~~~~~~~~~~~~~~~~~~ ^~~~~~~~~~ 1 error generated. 
``` ```text FAILED: example/vala/read-stream.p/meson-generated_read-stream.c.o ccache cc -Iexample/vala/read-stream.p -Iexample/vala -I../../c_glib/example/vala -I/Users/runner/work/arrow/arrow/build/c_glib -I/Users/runner/work/arrow/arrow/c_glib -Iarrow-glib -I../../c_glib/arrow-glib -I/usr/local/Cellar/glib/2.80.0_2/include -I/usr/local/Cellar/glib/2.80.0_2/include/glib-2.0 -I/usr/local/Cellar/glib/2.80.0_2/lib/glib-2.0/include -I/usr/local/opt/gettext/include -I/usr/local/Cellar/pcre2/10.43/include -I/Library/Developer/CommandLineTools/SDKs/MacOSX12.sdk/usr/include/ffi -fdiagnostics-color=always -Wall -Winvalid-pch -Werror -std=c99 -O0 -g -DARROW_NO_DEPRECATED_API -MD -MQ example/vala/read-stream.p/meson-generated_read-stream.c.o -MF example/vala/read-stream.p/meson-generated_read-stream.c.o.d -o example/vala/read-stream.p/meson-generated_read-stream.c.o -c example/vala/read-stream.p/read-stream.c ../../c_glib/example/vala/read-stream.vala:123:61: error: format specifies type 'long long' but the argument has type 'gint' (aka 'int') [-Werror,-Wformat] fprintf (_tmp2_, "columns[%" G_GINT64_FORMAT "](%s): ", nth_column, _tmp3_); ~~~~~~~~~~~~~~~~~~ ^~~~~~~~~~ 1 error generated. ``` ```text FAILED: example/vala/write-file.p/meson-generated_write-file.c.o ccache cc -Iexample/vala/write-file.p -Iexample/vala -I../../c_glib/example/vala -I/Users/runner/work/arrow/arrow/build/c_glib -I/Users/runner/work/arrow/arrow/c_glib -Iarrow-glib -I../../c_glib/arrow-glib -I/usr/local/Cellar/glib/2.80.0_2/include -I/usr/local/Cellar/glib/2.80.0_2/include/glib-2.0 -I/usr/local/Cellar/glib/2.80.0_2/lib/glib-2.0/include -I/usr/local/opt/gettext/include -I/usr/local/Cellar/pcre2/10.43/include -I/Library/Developer/CommandLineTools/SDKs/MacOSX12.sdk/usr/include/ffi -fdiagnostics-color=always -Wall -Winvalid-pch -Werror -std=c99 -O0 -g -DARROW_NO_DEPRECATED_API -MD -MQ example/vala/write-file.p/meson-generated_write-file.c.o -MF example/vala/write-file.p/meson-generated_write-file.c.o.d -o example/vala/write-file.p/meson-generated_write-file.c.o -c example/vala/write-file.p/write-file.c write-file.c:373:8: error: variable '_tmp45__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp45__length1; ^ write-file.c:504:8: error: variable '_tmp57__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp57__length1; ^ write-file.c:635:8: error: variable '_tmp69__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp69__length1; ^ write-file.c:766:8: error: variable '_tmp81__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp81__length1; ^ write-file.c:897:8: error: variable '_tmp93__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp93__length1; ^ write-file.c:1028:8: error: variable '_tmp105__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp105__length1; ^ write-file.c:1159:8: error: variable '_tmp117__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp117__length1; ^ write-file.c:1290:8: error: variable '_tmp129__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp129__length1; ^ write-file.c:1421:8: error: variable '_tmp141__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp141__length1; ^ write-file.c:1552:8: error: variable '_tmp153__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp153__length1; ^ 10 errors generated. ``` ### What changes are included in this PR? 
* Fix wrong format string * Disable `unused-but-set-variable` warning ### Are these changes tested? Yes. ### Are there any user-facing changes? No. * GitHub Issue: #40961 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- c_glib/example/vala/meson.build | 6 +++++- c_glib/example/vala/read-file.vala | 4 ++-- c_glib/example/vala/read-stream.vala | 4 ++-- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/c_glib/example/vala/meson.build b/c_glib/example/vala/meson.build index 474f0b1e9a51a..ff65a7328f171 100644 --- a/c_glib/example/vala/meson.build +++ b/c_glib/example/vala/meson.build @@ -18,11 +18,15 @@ # under the License. if generate_vapi + c_flags = [ + '-Wunused-but-set-variable', + ] + c_flags = meson.get_compiler('c').get_supported_arguments(c_flags) vala_example_executable_kwargs = { 'c_args': [ '-I' + project_build_root, '-I' + project_source_root, - ], + ] + c_flags, 'dependencies': [ arrow_glib_vapi, dependency('gio-2.0'), diff --git a/c_glib/example/vala/read-file.vala b/c_glib/example/vala/read-file.vala index a0a06275c4b24..287eddac76352 100644 --- a/c_glib/example/vala/read-file.vala +++ b/c_glib/example/vala/read-file.vala @@ -119,8 +119,8 @@ void print_array(GArrow.Array array) { void print_record_batch(GArrow.RecordBatch record_batch) { var n_columns = record_batch.get_n_columns(); - for (var nth_column = 0; nth_column < n_columns; nth_column++) { - stdout.printf("columns[%" + int64.FORMAT + "](%s): ", + for (int nth_column = 0; nth_column < n_columns; nth_column++) { + stdout.printf("columns[%d](%s): ", nth_column, record_batch.get_column_name(nth_column)); var array = record_batch.get_column_data(nth_column); diff --git a/c_glib/example/vala/read-stream.vala b/c_glib/example/vala/read-stream.vala index c58dc848930a8..4520c8609bdaf 100644 --- a/c_glib/example/vala/read-stream.vala +++ b/c_glib/example/vala/read-stream.vala @@ -119,8 +119,8 @@ void print_array(GArrow.Array array) { void print_record_batch(GArrow.RecordBatch record_batch) { var n_columns = record_batch.get_n_columns(); - for (var nth_column = 0; nth_column < n_columns; nth_column++) { - stdout.printf("columns[%" + int64.FORMAT + "](%s): ", + for (int nth_column = 0; nth_column < n_columns; nth_column++) { + stdout.printf("columns[%d](%s): ", nth_column, record_batch.get_column_name(nth_column)); var array = record_batch.get_column_data(nth_column); From 5b09059ae40164c173a6ef593ab7b463b24d431d Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Wed, 3 Apr 2024 18:31:19 -0400 Subject: [PATCH 53/81] GH-40851: [JS] Fix nullcount and make vectors created from typed arrays not nullable (#40852) * GitHub Issue: #40851 --- js/src/data.ts | 13 ++++---- js/src/vector.ts | 2 +- js/test/unit/vector/vector-tests.ts | 46 ++++++++++++++++++++++++++--- 3 files changed, 51 insertions(+), 10 deletions(-) diff --git a/js/src/data.ts b/js/src/data.ts index 6f8792508858b..45fcc35d37676 100644 --- a/js/src/data.ts +++ b/js/src/data.ts @@ -109,7 +109,10 @@ export class Data { let nullCount = this._nullCount; let nullBitmap: Uint8Array | undefined; if (nullCount <= kUnknownNullCount && (nullBitmap = this.nullBitmap)) { - this._nullCount = nullCount = this.length - popcnt_bit_range(nullBitmap, this.offset, this.offset + this.length); + this._nullCount = nullCount = nullBitmap.length === 0 ? 
+ // no null bitmap, so all values are valid + 0 : + this.length - popcnt_bit_range(nullBitmap, this.offset, this.offset + this.length); } return nullCount; } @@ -177,16 +180,16 @@ export class Data { // if we have a nullBitmap, truncate + slice and set it over the pre-filled 1s if (this.nullCount > 0) { nullBitmap.set(truncateBitmap(offset, length, this.nullBitmap), 0); + Object.assign(this, { nullBitmap }); + } else { + Object.assign(this, { nullBitmap, _nullCount: 0 }); } - Object.assign(this, { nullBitmap, _nullCount: -1 }); } const byte = nullBitmap[byteOffset]; prev = (byte & mask) !== 0; - value ? - (nullBitmap[byteOffset] = byte | mask) : - (nullBitmap[byteOffset] = byte & ~mask); + nullBitmap[byteOffset] = value ? (byte | mask) : (byte & ~mask); } if (prev !== !!value) { diff --git a/js/src/vector.ts b/js/src/vector.ts index a7c103bc326ee..1b0d9a05796f0 100644 --- a/js/src/vector.ts +++ b/js/src/vector.ts @@ -445,7 +445,7 @@ export function makeVector(init: any) { if (init instanceof DataView) { init = new Uint8Array(init.buffer); } - const props = { offset: 0, length: init.length, nullCount: 0, data: init }; + const props = { offset: 0, length: init.length, nullCount: -1, data: init }; if (init instanceof Int8Array) { return new Vector([makeData({ ...props, type: new dtypes.Int8 })]); } if (init instanceof Int16Array) { return new Vector([makeData({ ...props, type: new dtypes.Int16 })]); } if (init instanceof Int32Array) { return new Vector([makeData({ ...props, type: new dtypes.Int32 })]); } diff --git a/js/test/unit/vector/vector-tests.ts b/js/test/unit/vector/vector-tests.ts index bfcf0d8547861..a10d7c757ca17 100644 --- a/js/test/unit/vector/vector-tests.ts +++ b/js/test/unit/vector/vector-tests.ts @@ -16,7 +16,7 @@ // under the License. 
import { - Bool, DateDay, DateMillisecond, Dictionary, Float64, Int32, List, makeVector, Struct, Timestamp, TimeUnit, Utf8, LargeUtf8, util, Vector, vectorFromArray + Bool, DateDay, DateMillisecond, Dictionary, Float64, Int32, List, makeVector, Struct, Timestamp, TimeUnit, Utf8, LargeUtf8, util, Vector, vectorFromArray, makeData } from 'apache-arrow'; describe(`makeVectorFromArray`, () => { @@ -33,6 +33,47 @@ describe(`makeVectorFromArray`, () => { }); }); +describe(`basic vector methods`, () => { + test(`not nullable`, () => { + const vector = makeVector([makeData({ data: new Int32Array([1, 2, 3]), nullCount: -1, type: new Int32() })]); + expect(vector.nullable).toBe(false); + expect(vector.nullCount).toBe(0); + }); + + test(`nullable`, () => { + const vector = makeVector([makeData({ data: new Int32Array([1, 2, 3]), nullCount: 0, type: new Int32() })]); + expect(vector.nullable).toBe(true); + expect(vector.nullCount).toBe(0); + expect(vector.isValid(0)).toBe(true); + + // set a value to null + vector.set(0, null); + expect(vector.nullable).toBe(true); + expect(vector.nullCount).toBe(1); + expect(vector.isValid(0)).toBe(false); + + // set the same value to null which should not change anything + vector.set(0, null); + expect(vector.nullable).toBe(true); + expect(vector.nullCount).toBe(1); + + // set a different value to null + vector.set(1, null); + expect(vector.nullable).toBe(true); + expect(vector.nullCount).toBe(2); + + // set first value to non-null + vector.set(0, 1); + expect(vector.nullable).toBe(true); + expect(vector.nullCount).toBe(1); + + // set last null to non-null + vector.set(1, 2); + expect(vector.nullable).toBe(true); + expect(vector.nullCount).toBe(0); + }); +}); + describe(`StructVector`, () => { test(`makeVectorFromArray`, () => { const values: { a?: number; b?: string | null; c?: boolean | null }[] = [ @@ -108,7 +149,6 @@ describe(`DateVector`, () => { }); describe(`DictionaryVector`, () => { - const dictionary = ['foo', 'bar', 'baz']; const extras = ['abc', '123']; // values to search for that should NOT be found const dictionary_vec = vectorFromArray(dictionary, new Utf8).memoize(); @@ -117,7 +157,6 @@ describe(`DictionaryVector`, () => { const validity = Array.from({ length: indices.length }, () => Math.random() > 0.2); describe(`index with nullCount == 0`, () => { - const values = indices.map((d) => dictionary[d]); const vector = makeVector({ data: indices, @@ -133,7 +172,6 @@ describe(`DictionaryVector`, () => { }); describe(`index with nullCount > 0`, () => { - const nullBitmap = util.packBools(validity); const nullCount = validity.reduce((acc, d) => acc + (d ? 0 : 1), 0); const values = indices.map((d, i) => validity[i] ? 
dictionary[d] : null); From 2caec860894945e8bfe5b557c825ba962a6a16bd Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Wed, 3 Apr 2024 18:32:12 -0400 Subject: [PATCH 54/81] GH-40891: [JS] Store Dates as TimestampMillisecond (#40892) Fixes #40891 Tested with ```ts const date = new Date("2023-03-29T12:34:56Z"); console.log("original", date) console.log("=> vec") const vec = arrow.vectorFromArray([date]) console.log(vec.toArray()) console.log(vec.toJSON()) console.log(vec.type) console.log(vec.get(0)) console.log("=> vec2") const vec2 = arrow.vectorFromArray([date], new arrow.DateMillisecond) console.log(vec2.toArray()) console.log(vec.toJSON()) console.log(vec2.type) console.log(vec2.get(0)) console.log("=> table") const table = arrow.tableFromJSON([{ date }]) console.log(table.toArray()) console.log(table.schema.fields[0].type) console.log(table.getChildAt(0)?.get(0)) console.log("=> table2") const table2 = arrow.tableFromIPC(arrow.tableToIPC(table)); console.log(table2.toArray()) console.log(table2.schema.fields[0].type) console.log(table2.getChildAt(0)?.get(0)) console.log("=> table3") const table3 = new arrow.Table({ dates: vec2 }) console.log(table3.toArray()) console.log(table3.schema.fields[0].type) console.log(table3.getChildAt(0)?.get(0)) ``` ``` => table [ {"date": Wed Mar 29 2023 08:34:56 GMT-0400 (Eastern Daylight Time)} ] TimestampMillisecond { typeId: 10, unit: 1, timezone: undefined, toString: [Function: toString], ArrayType: [class Int32Array], [Symbol(Symbol.toStringTag)]: "Timestamp", children: null, OffsetArrayType: [class Int32Array], } 2023-03-29T12:34:56.000Z => table2 [ {"date": Wed Mar 29 2023 08:34:56 GMT-0400 (Eastern Daylight Time)} ] Timestamp_ { typeId: 10, unit: 1, timezone: null, toString: [Function: toString], ArrayType: [class Int32Array], children: null, OffsetArrayType: [class Int32Array], } 2023-03-29T12:34:56.000Z => table3 [ {"dates": Wed Mar 29 2023 08:34:56 GMT-0400 (Eastern Daylight Time)} ] DateMillisecond { typeId: 8, unit: 1, toString: [Function: toString], ArrayType: [class Int32Array], [Symbol(Symbol.toStringTag)]: "Date", children: null, OffsetArrayType: [class Int32Array], } 2023-03-29T12:34:56.000Z ``` * GitHub Issue: #40891 --- js/src/factories.ts | 4 ++-- js/src/type.ts | 14 +++++++++++++- js/test/unit/vector/date-vector-tests.ts | 19 ++++++++++++++----- 3 files changed, 29 insertions(+), 8 deletions(-) diff --git a/js/src/factories.ts b/js/src/factories.ts index aa54498c50bc0..657ae1b95ab92 100644 --- a/js/src/factories.ts +++ b/js/src/factories.ts @@ -65,7 +65,7 @@ export function makeBuilder(option export function vectorFromArray(values: readonly (null | undefined)[], type?: dtypes.Null): Vector; export function vectorFromArray(values: readonly (null | undefined | boolean)[], type?: dtypes.Bool): Vector; export function vectorFromArray = dtypes.Dictionary>(values: readonly (null | undefined | string)[], type?: T): Vector; -export function vectorFromArray(values: readonly (null | undefined | Date)[], type?: T): Vector; +export function vectorFromArray(values: readonly (null | undefined | Date)[], type?: T): Vector; export function vectorFromArray(values: readonly (null | undefined | number)[], type: T): Vector; export function vectorFromArray(values: readonly (null | undefined | bigint)[], type?: T): Vector; export function vectorFromArray(values: readonly (null | undefined | number)[], type?: T): Vector; @@ -145,7 +145,7 @@ function inferType(value: readonly unknown[]): dtypes.DataType { } else if (booleansCount + nullsCount === 
value.length) { return new dtypes.Bool; } else if (datesCount + nullsCount === value.length) { - return new dtypes.DateMillisecond; + return new dtypes.TimestampMillisecond; } else if (arraysCount + nullsCount === value.length) { const array = value as Array[]; const childType = inferType(array[array.findIndex((ary) => ary != null)]); diff --git a/js/src/type.ts b/js/src/type.ts index ae3aefa025999..a42552d65ad27 100644 --- a/js/src/type.ts +++ b/js/src/type.ts @@ -349,7 +349,19 @@ export class Date_ extends DataType { /** @ignore */ export class DateDay extends Date_ { constructor() { super(DateUnit.DAY); } } -/** @ignore */ +/** + * A signed 64-bit date representing the elapsed time since UNIX epoch (1970-01-01) in milliseconds. + * According to the specification, this should be treated as the number of days, in milliseconds, since the UNIX epoch. + * Therefore, values must be evenly divisible by `86_400_000` (the number of milliseconds in a standard day). + * + * Practically, validation that values of this type are evenly divisible by `86_400_000` is not enforced by this library + * for performance and usability reasons. + * + * Users should prefer to use {@link DateDay} to cleanly represent the number of days. For JS dates, + * {@link TimestampMillisecond} is the preferred type. + * + * @ignore + */ export class DateMillisecond extends Date_ { constructor() { super(DateUnit.MILLISECOND); } } /** @ignore */ diff --git a/js/test/unit/vector/date-vector-tests.ts b/js/test/unit/vector/date-vector-tests.ts index f8b4c1c7976d2..e5cd49933eac5 100644 --- a/js/test/unit/vector/date-vector-tests.ts +++ b/js/test/unit/vector/date-vector-tests.ts @@ -15,10 +15,19 @@ // specific language governing permissions and limitations // under the License. -import { DateDay, DateMillisecond, RecordBatchReader, Table, vectorFromArray } from 'apache-arrow'; +import { DateDay, DateMillisecond, TimestampMillisecond, RecordBatchReader, Table, vectorFromArray } from 'apache-arrow'; + +describe(`TimestampVector`, () => { + test(`Dates are stored in TimestampMillisecond`, () => { + const date = new Date('2023-02-01T12:34:56Z'); + const vec = vectorFromArray([date]); + expect(vec.type).toBeInstanceOf(TimestampMillisecond); + expect(vec.get(0)).toBe(date.valueOf()); + }); +}); describe(`DateVector`, () => { - it('returns days since the epoch as correct JS Dates', () => { + test(`returns days since the epoch as correct JS Dates`, () => { const table = new Table(RecordBatchReader.from(test_data)); const expectedMillis = expectedMillis32(); const date32 = table.getChildAt(0)!; @@ -28,7 +37,7 @@ describe(`DateVector`, () => { } }); - it('returns millisecond longs since the epoch as correct JS Dates', () => { + test(`returns millisecond longs since the epoch as correct JS Dates`, () => { const table = new Table(RecordBatchReader.from(test_data)); const expectedMillis = expectedMillis64(); const date64 = table.getChildAt(1)!; @@ -38,9 +47,9 @@ describe(`DateVector`, () => { } }); - it('returns the same date that was in the vector', () => { + test(`returns the same date that was in the vector`, () => { const dates = [new Date(1950, 1, 0)]; - const vec = vectorFromArray(dates); + const vec = vectorFromArray(dates, new DateMillisecond()); for (const date of vec) { expect(date).toEqual(dates.shift()); } From 89d5d8b40a5074a7ae30430b3fa95f7a9daf16da Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Wed, 3 Apr 2024 17:14:06 -0800 Subject: [PATCH 55/81] MINOR: [R] Replace use of show_query in test-duckdb.R with dbplyr::sql_build 
(#40955) ### Rationale for this change I just ran the R package tests and saw a printed query mixed amongst testthat output: ``` [ FAIL 0 | WARN 0 | SKIP 1 | PASS 15 ] SELECT * FROM arrow_010 ``` I thought it would be good to silence this in some way. ### What changes are included in this PR? I silenced this by changing out the call to `dplyr::show_query` for `dbplyr::sql_build` which produces different output but (1) doesn't print as as side-effect, (2) is specifically made for testing, and (3) still produces output we can use in this test. For reference, this is what show_query prints (assuming via `cat`): ``` > show_query(table_four) SELECT * FROM arrow_011 ``` Whereas `sql_build`: ``` > dbplyr::sql_build(table_four) [1] `arrow_011` ``` ### Are these changes tested? Yes, but just manually on my system. ### Are there any user-facing changes? No. Authored-by: Bryce Mecum Signed-off-by: Bryce Mecum --- r/tests/testthat/test-duckdb.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/tests/testthat/test-duckdb.R b/r/tests/testthat/test-duckdb.R index 33ab1ecc7aa4d..dd7b6dba7fde0 100644 --- a/r/tests/testthat/test-duckdb.R +++ b/r/tests/testthat/test-duckdb.R @@ -281,7 +281,7 @@ test_that("to_duckdb passing a connection", { to_duckdb(con = con_separate, auto_disconnect = FALSE) # Generates a query like SELECT * FROM arrow_xxx - table_four_query <- paste(show_query(table_four), collapse = "\n") + table_four_query <- paste(dbplyr::sql_build(table_four), collapse = "\n") table_four_name <- stringr::str_extract(table_four_query, "arrow_[0-9]{3}") expect_false(is.na(table_four_name)) From 36ed0328b43ca39533e58a889f8e091d1f1ca7dc Mon Sep 17 00:00:00 2001 From: James Henderson Date: Thu, 4 Apr 2024 05:39:37 +0100 Subject: [PATCH 56/81] GH-24826: [Java] Add DUV.setOffset method (#40985) ### Are these changes tested? Yes ### Are there any user-facing changes? 
Yes - the addition of a public DUV.setOffset method * GitHub Issue: #24826 Authored-by: James Henderson Signed-off-by: David Li --- .../codegen/templates/DenseUnionVector.java | 8 ++++ .../arrow/vector/TestDenseUnionVector.java | 38 +++++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/java/vector/src/main/codegen/templates/DenseUnionVector.java b/java/vector/src/main/codegen/templates/DenseUnionVector.java index c23caf3bb5a03..8edd167152d7f 100644 --- a/java/vector/src/main/codegen/templates/DenseUnionVector.java +++ b/java/vector/src/main/codegen/templates/DenseUnionVector.java @@ -908,6 +908,14 @@ private int getTypeBufferValueCapacity() { return (int) typeBuffer.capacity() / TYPE_WIDTH; } + public void setOffset(int index, int offset) { + while (index >= getOffsetBufferValueCapacity()) { + reallocOffsetBuffer(); + } + + offsetBuffer.setInt((long) index * OFFSET_WIDTH, offset); + } + private long getOffsetBufferValueCapacity() { return offsetBuffer.capacity() / OFFSET_WIDTH; } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestDenseUnionVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestDenseUnionVector.java index 8fd33eb5a8432..2c29861561bb7 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestDenseUnionVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestDenseUnionVector.java @@ -99,6 +99,44 @@ public void testDenseUnionVector() throws Exception { } } + @Test + public void testSetOffset() { + try (DenseUnionVector duv = DenseUnionVector.empty("foo", allocator)) { + duv.allocateNew(); + byte i32TypeId = duv.registerNewTypeId(Field.notNullable("i32", MinorType.INT.getType())); + byte f64TypeId = duv.registerNewTypeId(Field.notNullable("f64", MinorType.FLOAT8.getType())); + + IntVector i32Vector = ((IntVector) duv.addVector(i32TypeId, new IntVector("i32", allocator))); + Float8Vector f64Vector = ((Float8Vector) duv.addVector(f64TypeId, new Float8Vector("f64", allocator))); + + i32Vector.allocateNew(3); + f64Vector.allocateNew(1); + + duv.setTypeId(0, i32TypeId); + duv.setOffset(0, 0); + i32Vector.set(0, 42); + + duv.setTypeId(1, i32TypeId); + duv.setOffset(1, 1); + i32Vector.set(1, 43); + + duv.setTypeId(2, f64TypeId); + duv.setOffset(2, 0); + f64Vector.set(0, 3.14); + + duv.setTypeId(3, i32TypeId); + duv.setOffset(3, 2); + i32Vector.set(2, 44); + + duv.setValueCount(4); + + assertEquals(42, duv.getObject(0)); + assertEquals(43, duv.getObject(1)); + assertEquals(3.14, duv.getObject(2)); + assertEquals(44, duv.getObject(3)); + } + } + @Test public void testTransfer() throws Exception { try (DenseUnionVector srcVector = new DenseUnionVector(EMPTY_SCHEMA_PATH, allocator, null, null)) { From 26631d7504420ff00a827d40273b589c6d38860f Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 4 Apr 2024 10:29:26 +0200 Subject: [PATCH 57/81] GH-40806: [C++] Revert changes from PR #40857 (#40980) Revert changes from https://github.com/apache/arrow/pull/40857. `GetRuntimeInfo` returns the SIMD level for dynamic dispatch, but Neon currently does not participate in dynamic dispatch (actually, Neon should be available by default on all modern Arm CPUs AFAIU). 
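For reference, a minimal sketch (an illustration only, not part of this patch) of how the dispatch level discussed above can be inspected through the public `arrow::GetRuntimeInfo()` API, assuming the `simd_level` / `detected_simd_level` fields exposed in `arrow/config.h`:

```cpp
#include <iostream>

#include "arrow/config.h"

int main() {
  // simd_level is the level selected for dynamic dispatch at runtime;
  // detected_simd_level is what the CPU/OS reports as available. With this
  // revert, Arm builds report "none" here again, because Neon does not go
  // through the dynamic-dispatch machinery.
  arrow::RuntimeInfo info = arrow::GetRuntimeInfo();
  std::cout << "simd_level: " << info.simd_level << "\n"
            << "detected_simd_level: " << info.detected_simd_level << "\n";
  return 0;
}
```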
Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/src/arrow/config.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/cpp/src/arrow/config.cc b/cpp/src/arrow/config.cc index 1f852e84d3d5c..9e32e5437325f 100644 --- a/cpp/src/arrow/config.cc +++ b/cpp/src/arrow/config.cc @@ -58,8 +58,6 @@ std::string MakeSimdLevelString(QueryFlagFunction&& query_flag) { return "avx"; } else if (query_flag(CpuInfo::SSE4_2)) { return "sse4_2"; - } else if (query_flag(CpuInfo::ASIMD)) { - return "neon"; } else { return "none"; } From 640c10191a51f6d0f408c72f45dbf5d94ec0b9d7 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Thu, 4 Apr 2024 02:51:58 -0700 Subject: [PATCH 58/81] GH-40224: [C++] Fix: improve the backpressure handling in the dataset writer (#40722) ### Rationale for this change The dataset writer would fire the resume callback as soon as the underlying dataset writer's queues freed up, even if there were pending tasks. Backpressure is not applied immediately and so a few tasks will always trickle in. If backpressure is pausing and then resuming frequently this can lead to a buildup of pending tasks and uncontrolled memory growth. ### What changes are included in this PR? The resume callback is not called until all pending write tasks have completed. ### Are these changes tested? There is quite an extensive set of tests for the dataset writer already and they continue to pass. I ran them on repeat, with and without stress, and did not see any issues. However, the underlying problem (dataset writer can have uncontrolled memory growth) is still not tested as it is quite difficult to test. I was able to run the setup described in the issue to reproduce the issue. With this fix the repartitioning task completes for me. ### Are there any user-facing changes? No * GitHub Issue: #40224 Authored-by: Weston Pace Signed-off-by: Antoine Pitrou --- cpp/src/arrow/dataset/dataset_writer.cc | 36 ++++++++++++++++++++----- cpp/src/arrow/util/async_util.cc | 7 +++++ cpp/src/arrow/util/async_util.h | 3 +++ cpp/src/arrow/util/async_util_test.cc | 1 + 4 files changed, 41 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/dataset/dataset_writer.cc b/cpp/src/arrow/dataset/dataset_writer.cc index 34731d19ab3eb..754386275d60c 100644 --- a/cpp/src/arrow/dataset/dataset_writer.cc +++ b/cpp/src/arrow/dataset/dataset_writer.cc @@ -515,7 +515,7 @@ class DatasetWriter::DatasetWriterImpl { std::function finish_callback, uint64_t max_rows_queued) : scheduler_(scheduler), write_tasks_(util::MakeThrottledAsyncTaskGroup( - scheduler_, 1, /*queue=*/nullptr, + scheduler_, /*max_concurrent_cost=*/1, /*queue=*/nullptr, [finish_callback = std::move(finish_callback)] { finish_callback(); return Status::OK(); @@ -541,6 +541,23 @@ class DatasetWriter::DatasetWriterImpl { } } + void ResumeIfNeeded() { + if (!paused_) { + return; + } + bool needs_resume = false; + { + std::lock_guard lg(mutex_); + if (!write_tasks_ || write_tasks_->QueueSize() == 0) { + needs_resume = true; + } + } + if (needs_resume) { + paused_ = false; + resume_callback_(); + } + } + void WriteRecordBatch(std::shared_ptr batch, const std::string& directory, const std::string& prefix) { write_tasks_->AddSimpleTask( @@ -549,11 +566,14 @@ class DatasetWriter::DatasetWriterImpl { WriteAndCheckBackpressure(std::move(batch), directory, prefix); if (!has_room.is_finished()) { // We don't have to worry about sequencing backpressure here since - // task_group_ serves as our sequencer. 
If batches continue to arrive after - // we pause they will queue up in task_group_ until we free up and call - // Resume + // task_group_ serves as our sequencer. If batches continue to arrive + // after we pause they will queue up in task_group_ until we free up and + // call Resume pause_callback_(); - return has_room.Then([this] { resume_callback_(); }); + paused_ = true; + return has_room.Then([this] { ResumeIfNeeded(); }); + } else { + ResumeIfNeeded(); } return has_room; }, @@ -571,6 +591,9 @@ class DatasetWriter::DatasetWriterImpl { return Future<>::MakeFinished(); }, "DatasetWriter::FinishAll"sv); + // Reset write_tasks_ to signal that we are done adding tasks, this will allow + // us to invoke the finish callback once the tasks wrap up. + std::lock_guard lg(mutex_); write_tasks_.reset(); } @@ -660,7 +683,7 @@ class DatasetWriter::DatasetWriterImpl { } util::AsyncTaskScheduler* scheduler_ = nullptr; - std::unique_ptr write_tasks_; + std::unique_ptr write_tasks_; Future<> finish_fut_ = Future<>::Make(); FileSystemDatasetWriteOptions write_options_; DatasetWriterState writer_state_; @@ -670,6 +693,7 @@ class DatasetWriter::DatasetWriterImpl { std::unordered_map> directory_queues_; std::mutex mutex_; + bool paused_ = false; Status err_; }; diff --git a/cpp/src/arrow/util/async_util.cc b/cpp/src/arrow/util/async_util.cc index 63e27bfbe5773..fbd45eadac2cd 100644 --- a/cpp/src/arrow/util/async_util.cc +++ b/cpp/src/arrow/util/async_util.cc @@ -118,6 +118,8 @@ class FifoQueue : public ThrottledAsyncTaskScheduler::Queue { void Purge() override { tasks_.clear(); } + std::size_t Size() const override { return tasks_.size(); } + private: std::list> tasks_; }; @@ -332,6 +334,10 @@ class ThrottledAsyncTaskSchedulerImpl void Pause() override { throttle_->Pause(); } void Resume() override { throttle_->Resume(); } + std::size_t QueueSize() override { + std::lock_guard lk(mutex_); + return queue_->Size(); + } const util::tracing::Span& span() const override { return target_->span(); } private: @@ -499,6 +505,7 @@ class ThrottledAsyncTaskGroup : public ThrottledAsyncTaskScheduler { : throttle_(std::move(throttle)), task_group_(std::move(task_group)) {} void Pause() override { throttle_->Pause(); } void Resume() override { throttle_->Resume(); } + std::size_t QueueSize() override { return throttle_->QueueSize(); } const util::tracing::Span& span() const override { return task_group_->span(); } bool AddTask(std::unique_ptr task) override { return task_group_->AddTask(std::move(task)); diff --git a/cpp/src/arrow/util/async_util.h b/cpp/src/arrow/util/async_util.h index 7a675da59facd..d9ed63bdbce22 100644 --- a/cpp/src/arrow/util/async_util.h +++ b/cpp/src/arrow/util/async_util.h @@ -226,6 +226,7 @@ class ARROW_EXPORT ThrottledAsyncTaskScheduler : public AsyncTaskScheduler { virtual bool Empty() = 0; /// Purge the queue of all items virtual void Purge() = 0; + virtual std::size_t Size() const = 0; }; class Throttle { @@ -277,6 +278,8 @@ class ARROW_EXPORT ThrottledAsyncTaskScheduler : public AsyncTaskScheduler { /// Allows task to be submitted again. If there is a max_concurrent_cost limit then /// it will still apply. 
virtual void Resume() = 0; + /// Return the number of tasks queued but not yet submitted + virtual std::size_t QueueSize() = 0; /// Create a throttled view of a scheduler /// diff --git a/cpp/src/arrow/util/async_util_test.cc b/cpp/src/arrow/util/async_util_test.cc index 313ca91912335..1f9aad453e9c4 100644 --- a/cpp/src/arrow/util/async_util_test.cc +++ b/cpp/src/arrow/util/async_util_test.cc @@ -680,6 +680,7 @@ class PriorityQueue : public ThrottledAsyncTaskScheduler::Queue { queue_.pop(); } } + std::size_t Size() const { return queue_.size(); } private: std::priority_queue, From ad6758900da1706d3cbfd59e5fe7d1d548c4235b Mon Sep 17 00:00:00 2001 From: James Henderson Date: Thu, 4 Apr 2024 12:34:40 +0100 Subject: [PATCH 59/81] MINOR: [Java] `DenseUnionVector.empty` should create not-nullable DUVs (#41001) ### Rationale for this change DUVs do not have a validity vector, so cannot be set to null - `isNull`, for example, always returns false. This change ensures vectors created through `DenseUnionVector.empty` reflect this in their FieldType. ### What changes are included in this PR? `DenseUnionVector.empty` now creates DUVs with a not-nullable `FieldType` ### Are these changes tested? I haven't added an explicit test for this as it would essentially be testing `FieldType.notNullable` - confirmed that it doesn't break any existing tests. ### Are there any user-facing changes? Yes, strictly speaking this is a public change, correcting a bug in a public API. **This PR includes breaking changes to public APIs.** Authored-by: James Henderson Signed-off-by: David Li --- java/vector/src/main/codegen/templates/DenseUnionVector.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/vector/src/main/codegen/templates/DenseUnionVector.java b/java/vector/src/main/codegen/templates/DenseUnionVector.java index 8edd167152d7f..27fd8e9798b67 100644 --- a/java/vector/src/main/codegen/templates/DenseUnionVector.java +++ b/java/vector/src/main/codegen/templates/DenseUnionVector.java @@ -124,7 +124,7 @@ public class DenseUnionVector extends AbstractContainerVector implements FieldVe ArrowType.Struct.INSTANCE, /*dictionary*/ null, /*metadata*/ null); public static DenseUnionVector empty(String name, BufferAllocator allocator) { - FieldType fieldType = FieldType.nullable(new ArrowType.Union( + FieldType fieldType = FieldType.notNullable(new ArrowType.Union( UnionMode.Dense, null)); return new DenseUnionVector(name, allocator, fieldType, null); } From bbeeb33a2fb65f40caf6c3176ee377de2b9de6e5 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Thu, 4 Apr 2024 14:38:16 +0200 Subject: [PATCH 60/81] GH-40974: [CI][Python] CI failures on Python builds due to pytest_cython (#40975) ### Rationale for this change We are seeing sporadic CI failures on Python builds due to `pytest_cython`. ### What changes are included in this PR? Upper pin of 0.3.0 added for `pytest-cython`. ### Are these changes tested? With a green CI. ### Are there any user-facing changes? No. 
* GitHub Issue: #40974 Authored-by: AlenkaF Signed-off-by: Sutou Kouhei --- ci/conda_env_sphinx.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ci/conda_env_sphinx.txt b/ci/conda_env_sphinx.txt index 6899f9c36a7f6..0a356d5722c42 100644 --- a/ci/conda_env_sphinx.txt +++ b/ci/conda_env_sphinx.txt @@ -29,5 +29,8 @@ sphinx-copybutton sphinxcontrib-jquery sphinx==6.2 # Requirement for doctest-cython -pytest-cython +# Needs upper pin of 0.3.0, see: +# https://github.com/lgpage/pytest-cython/issues/67 +# With 0.3.* bug fix release, the pin can be removed +pytest-cython==0.2.2 pandas From b99b00dd66586cf54b04ce6a51eb1cf68b1510a3 Mon Sep 17 00:00:00 2001 From: mwish Date: Thu, 4 Apr 2024 21:35:00 +0800 Subject: [PATCH 61/81] GH-40994: [C++][Parquet] RleBooleanDecoder supports DecodeArrow with nulls (#40995) ### Rationale for this change Supports DecodeArrow with nulls in RleBooleanDecoder ### What changes are included in this PR? Supports DecodeArrow with nulls in RleBooleanDecoder ### Are these changes tested? Yes ### Are there any user-facing changes? currently not * GitHub Issue: #40994 Lead-authored-by: mwish Co-authored-by: mwish Signed-off-by: Antoine Pitrou --- cpp/src/parquet/encoding.cc | 63 ++++++++++++++++++++------- cpp/src/parquet/encoding_benchmark.cc | 9 ++-- cpp/src/parquet/encoding_test.cc | 46 +++++++++---------- 3 files changed, 74 insertions(+), 44 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index f16e9b34fc682..6e93b493392c9 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -3143,27 +3143,58 @@ class RleBooleanDecoder : public DecoderImpl, virtual public BooleanDecoder { int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, typename EncodingTraits::Accumulator* out) override { - if (null_count != 0) { - // TODO(ARROW-34660): implement DecodeArrow with null slots. - ParquetException::NYI("RleBoolean DecodeArrow with null slots"); + if (null_count == num_values) { + PARQUET_THROW_NOT_OK(out->AppendNulls(null_count)); + return 0; } constexpr int kBatchSize = 1024; std::array values; - int sum_decode_count = 0; - do { - int current_batch = std::min(kBatchSize, num_values); - int decoded_count = decoder_->GetBatch(values.data(), current_batch); - if (decoded_count == 0) { - break; + const int num_non_null_values = num_values - null_count; + // Remaining non-null boolean values to read from decoder. + // We decode from `decoder_` with maximum 1024 size batches. + int num_remain_non_null_values = num_non_null_values; + int current_index_in_batch = 0; + int current_batch_size = 0; + auto next_boolean_batch = [&]() { + DCHECK_GT(num_remain_non_null_values, 0); + DCHECK_EQ(current_index_in_batch, current_batch_size); + current_batch_size = std::min(num_remain_non_null_values, kBatchSize); + int decoded_count = decoder_->GetBatch(values.data(), current_batch_size); + if (ARROW_PREDICT_FALSE(decoded_count != current_batch_size)) { + // required values is more than values in decoder. + ParquetException::EofException(); } - sum_decode_count += decoded_count; - PARQUET_THROW_NOT_OK(out->Reserve(sum_decode_count)); - for (int i = 0; i < decoded_count; ++i) { - PARQUET_THROW_NOT_OK(out->Append(values[i])); + num_remain_non_null_values -= current_batch_size; + current_index_in_batch = 0; + }; + + // Reserve all values including nulls first + PARQUET_THROW_NOT_OK(out->Reserve(num_values)); + if (null_count == 0) { + // Fast-path for not having nulls. 
+ do { + next_boolean_batch(); + PARQUET_THROW_NOT_OK( + out->AppendValues(values.begin(), values.begin() + current_batch_size)); + num_values -= current_batch_size; + current_index_in_batch = 0; + } while (num_values > 0); + return num_non_null_values; + } + auto next_value = [&]() -> bool { + if (current_index_in_batch == current_batch_size) { + next_boolean_batch(); + DCHECK_GT(current_batch_size, 0); } - num_values -= decoded_count; - } while (num_values > 0); - return sum_decode_count; + DCHECK_LT(current_index_in_batch, current_batch_size); + bool value = values[current_index_in_batch]; + ++current_index_in_batch; + return value; + }; + VisitNullBitmapInline( + valid_bits, valid_bits_offset, num_values, null_count, + [&]() { out->UnsafeAppend(next_value()); }, [&]() { out->UnsafeAppendNull(); }); + return num_non_null_values; } int DecodeArrow( diff --git a/cpp/src/parquet/encoding_benchmark.cc b/cpp/src/parquet/encoding_benchmark.cc index 9c07d262b350e..a858c53e931d8 100644 --- a/cpp/src/parquet/encoding_benchmark.cc +++ b/cpp/src/parquet/encoding_benchmark.cc @@ -1518,11 +1518,10 @@ BENCHMARK_DEFINE_F(BM_DecodeArrowBooleanRle, DecodeArrowNonNull) (benchmark::State& state) { DecodeArrowNonNullDenseBenchmark(state); } BENCHMARK_REGISTER_F(BM_DecodeArrowBooleanRle, DecodeArrowNonNull) ->Range(MIN_RANGE, MAX_RANGE); -// TODO(mwish): RleBoolean not implemented DecodeArrow with null slots yet. -// BENCHMARK_DEFINE_F(BM_DecodeArrowBooleanRle, DecodeArrowWithNull) -//(benchmark::State& state) { DecodeArrowWithNullDenseBenchmark(state); } -// BENCHMARK_REGISTER_F(BM_DecodeArrowBooleanRle, DecodeArrowWithNull) -// ->Apply(BooleanWithNullCustomArguments); +BENCHMARK_DEFINE_F(BM_DecodeArrowBooleanRle, DecodeArrowWithNull) +(benchmark::State& state) { DecodeArrowWithNullDenseBenchmark(state); } +BENCHMARK_REGISTER_F(BM_DecodeArrowBooleanRle, DecodeArrowWithNull) + ->Apply(BooleanWithNullCustomArguments); BENCHMARK_DEFINE_F(BM_DecodeArrowBooleanPlain, DecodeArrow) (benchmark::State& state) { DecodeArrowDenseBenchmark(state); } diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index ea0029f4c7d7f..bb5126ce251d4 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -602,7 +602,7 @@ TEST(PlainEncodingAdHoc, ArrowBinaryDirectPut) { // Check that one can put several Arrow arrays into a given encoder // and decode to the right values (see GH-36939) -TEST(PlainBooleanArrayEncoding, AdHocRoundTrip) { +TEST(BooleanArrayEncoding, AdHocRoundTrip) { std::vector> arrays{ ::arrow::ArrayFromJSON(::arrow::boolean(), R"([])"), ::arrow::ArrayFromJSON(::arrow::boolean(), R"([false, null, true])"), @@ -610,27 +610,29 @@ TEST(PlainBooleanArrayEncoding, AdHocRoundTrip) { ::arrow::ArrayFromJSON(::arrow::boolean(), R"([true, null, false])"), }; - auto encoder = MakeTypedEncoder(Encoding::PLAIN, - /*use_dictionary=*/false); - for (const auto& array : arrays) { - encoder->Put(*array); - } - auto buffer = encoder->FlushValues(); - auto decoder = MakeTypedDecoder(Encoding::PLAIN); - EXPECT_OK_AND_ASSIGN(auto expected, ::arrow::Concatenate(arrays)); - decoder->SetData(static_cast(expected->length()), buffer->data(), - static_cast(buffer->size())); - - ::arrow::BooleanBuilder builder; - ASSERT_EQ(static_cast(expected->length() - expected->null_count()), - decoder->DecodeArrow(static_cast(expected->length()), - static_cast(expected->null_count()), - expected->null_bitmap_data(), 0, &builder)); + for (auto encoding : {Encoding::PLAIN, Encoding::RLE}) { + auto 
encoder = MakeTypedEncoder(encoding,
+                                                 /*use_dictionary=*/false);
+    for (const auto& array : arrays) {
+      encoder->Put(*array);
+    }
+    auto buffer = encoder->FlushValues();
+    auto decoder = MakeTypedDecoder(encoding);
+    EXPECT_OK_AND_ASSIGN(auto expected, ::arrow::Concatenate(arrays));
+    decoder->SetData(static_cast(expected->length()), buffer->data(),
+                     static_cast(buffer->size()));
+
+    ::arrow::BooleanBuilder builder;
+    ASSERT_EQ(static_cast(expected->length() - expected->null_count()),
+              decoder->DecodeArrow(static_cast(expected->length()),
+                                   static_cast(expected->null_count()),
+                                   expected->null_bitmap_data(), 0, &builder));
 
-  std::shared_ptr<::arrow::Array> result;
-  ASSERT_OK(builder.Finish(&result));
-  ASSERT_EQ(expected->length(), result->length());
-  ::arrow::AssertArraysEqual(*expected, *result, /*verbose=*/true);
+    std::shared_ptr<::arrow::Array> result;
+    ASSERT_OK(builder.Finish(&result));
+    ASSERT_EQ(expected->length(), result->length());
+    ::arrow::AssertArraysEqual(*expected, *result, /*verbose=*/true);
+  }
 }
 
 template 
@@ -963,8 +965,6 @@ TYPED_TEST(EncodingAdHocTyped, ByteStreamSplitArrowDirectPut) {
 }
 
 TYPED_TEST(EncodingAdHocTyped, RleArrowDirectPut) {
-  // TODO: test with nulls once RleBooleanDecoder::DecodeArrow supports them
-  this->null_probability_ = 0;
   for (auto seed : {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}) {
     this->Rle(seed);
   }

From 72d20ad719021c5513620e23a0a65fb724f0e299 Mon Sep 17 00:00:00 2001
From: Clif Houck
Date: Thu, 4 Apr 2024 08:59:30 -0500
Subject: [PATCH 62/81] GH-20213: [C++] Implement cast to/from halffloat
 (#40067)

### Rationale for this change

### What changes are included in this PR?

This PR implements casting to and from float16 types using the vendored float16 library included in Arrow at `cpp/arrow/util/float16.*`.

### Are these changes tested?

Unit tests are included in this PR.

### Are there any user-facing changes?

Yes: casts to and from float16 will now work.

* Closes: #20213

### TODO
- [x] Add casts to/from float64.
- [x] String <-> float16 casts.
- [x] Integer <-> float16 casts.
- [x] Tests.
- [x] Update https://github.com/apache/arrow/blob/main/docs/source/status.rst about half float.
- [x] Rebase.
- [x] Run clang format over this PR.
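### Usage sketch

As an illustration only (this snippet is not part of the diff), a minimal sketch of how the new kernels can be reached through the generic `arrow::compute::Cast` API; the `HalfFloatToDouble` helper name is hypothetical:

```c++
// Minimal sketch, assuming this PR's HALF_FLOAT cast kernels are registered.
#include <arrow/api.h>
#include <arrow/compute/cast.h>

arrow::Result<std::shared_ptr<arrow::Array>> HalfFloatToDouble(
    const std::shared_ptr<arrow::Array>& half_floats) {
  // `half_floats` is assumed to have type float16(); widening to float64 is
  // served by the CastFloatingToFloating kernel registered in GetCastToFloating.
  ARROW_ASSIGN_OR_RAISE(arrow::Datum casted,
                        arrow::compute::Cast(half_floats, arrow::float64()));
  return casted.make_array();
}
```

With an integer output type, the same call exercises the new `WasTruncated` specialization for half floats, so a safe cast still reports values that cannot be represented exactly.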
* GitHub Issue: #20213 Authored-by: Clif Houck Signed-off-by: Sutou Kouhei --- c_glib/test/test-half-float-scalar.rb | 2 +- cpp/src/arrow/compare.cc | 30 +++++ .../compute/kernels/scalar_cast_internal.cc | 70 ++++++++++++ .../compute/kernels/scalar_cast_numeric.cc | 103 +++++++++++++++--- .../compute/kernels/scalar_cast_string.cc | 4 + .../arrow/compute/kernels/scalar_cast_test.cc | 25 +++-- cpp/src/arrow/ipc/json_simple.cc | 32 +++++- cpp/src/arrow/ipc/json_simple_test.cc | 35 +++++- cpp/src/arrow/record_batch_test.cc | 3 + cpp/src/arrow/type_traits.h | 1 + cpp/src/arrow/util/formatting.cc | 11 ++ cpp/src/arrow/util/formatting.h | 7 ++ cpp/src/arrow/util/value_parsing.cc | 14 +++ cpp/src/arrow/util/value_parsing.h | 17 +++ docs/source/status.rst | 11 +- 15 files changed, 325 insertions(+), 40 deletions(-) diff --git a/c_glib/test/test-half-float-scalar.rb b/c_glib/test/test-half-float-scalar.rb index ac41f91ece621..3073d84d796cf 100644 --- a/c_glib/test/test-half-float-scalar.rb +++ b/c_glib/test/test-half-float-scalar.rb @@ -41,7 +41,7 @@ def test_equal end def test_to_s - assert_equal("[\n #{@half_float}\n]", @scalar.to_s) + assert_equal("1.0009765625", @scalar.to_s) end def test_value diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index bb632e2eb912d..e983b47e39dc4 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -44,6 +44,7 @@ #include "arrow/util/bitmap_ops.h" #include "arrow/util/bitmap_reader.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/float16.h" #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" @@ -59,6 +60,7 @@ using internal::BitmapReader; using internal::BitmapUInt64Reader; using internal::checked_cast; using internal::OptionalBitmapEquals; +using util::Float16; // ---------------------------------------------------------------------- // Public method implementations @@ -95,6 +97,30 @@ struct FloatingEquality { const T epsilon; }; +// For half-float equality. 
+template +struct FloatingEquality { + explicit FloatingEquality(const EqualOptions& options) + : epsilon(static_cast(options.atol())) {} + + bool operator()(uint16_t x, uint16_t y) const { + Float16 f_x = Float16::FromBits(x); + Float16 f_y = Float16::FromBits(y); + if (x == y) { + return Flags::signed_zeros_equal || (f_x.signbit() == f_y.signbit()); + } + if (Flags::nans_equal && f_x.is_nan() && f_y.is_nan()) { + return true; + } + if (Flags::approximate && (fabs(f_x.ToFloat() - f_y.ToFloat()) <= epsilon)) { + return true; + } + return false; + } + + const float epsilon; +}; + template struct FloatingEqualityDispatcher { const EqualOptions& options; @@ -259,6 +285,8 @@ class RangeDataEqualsImpl { Status Visit(const DoubleType& type) { return CompareFloating(type); } + Status Visit(const HalfFloatType& type) { return CompareFloating(type); } + // Also matches StringType Status Visit(const BinaryType& type) { return CompareBinary(type); } @@ -863,6 +891,8 @@ class ScalarEqualsVisitor { Status Visit(const DoubleScalar& left) { return CompareFloating(left); } + Status Visit(const HalfFloatScalar& left) { return CompareFloating(left); } + template enable_if_t::value, Status> Visit(const T& left) { const auto& right = checked_cast(right_); diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc b/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc index 8cf5a04addb00..d8c4088759643 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc @@ -19,10 +19,13 @@ #include "arrow/compute/cast_internal.h" #include "arrow/compute/kernels/common_internal.h" #include "arrow/extension_type.h" +#include "arrow/type_traits.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/float16.h" namespace arrow { +using arrow::util::Float16; using internal::checked_cast; using internal::PrimitiveScalarBase; @@ -47,6 +50,42 @@ struct CastPrimitive { } }; +// Converting floating types to half float. +template +struct CastPrimitive> { + static void Exec(const ArraySpan& arr, ArraySpan* out) { + using InT = typename InType::c_type; + const InT* in_values = arr.GetValues(1); + uint16_t* out_values = out->GetValues(1); + for (int64_t i = 0; i < arr.length; ++i) { + *out_values++ = Float16(*in_values++).bits(); + } + } +}; + +// Converting from half float to other floating types. 
+template <> +struct CastPrimitive> { + static void Exec(const ArraySpan& arr, ArraySpan* out) { + const uint16_t* in_values = arr.GetValues(1); + float* out_values = out->GetValues(1); + for (int64_t i = 0; i < arr.length; ++i) { + *out_values++ = Float16::FromBits(*in_values++).ToFloat(); + } + } +}; + +template <> +struct CastPrimitive> { + static void Exec(const ArraySpan& arr, ArraySpan* out) { + const uint16_t* in_values = arr.GetValues(1); + double* out_values = out->GetValues(1); + for (int64_t i = 0; i < arr.length; ++i) { + *out_values++ = Float16::FromBits(*in_values++).ToDouble(); + } + } +}; + template struct CastPrimitive::value>> { // memcpy output @@ -56,6 +95,33 @@ struct CastPrimitive: } }; +// Cast int to half float +template +struct CastPrimitive> { + static void Exec(const ArraySpan& arr, ArraySpan* out) { + using InT = typename InType::c_type; + const InT* in_values = arr.GetValues(1); + uint16_t* out_values = out->GetValues(1); + for (int64_t i = 0; i < arr.length; ++i) { + float temp = static_cast(*in_values++); + *out_values++ = Float16(temp).bits(); + } + } +}; + +// Cast half float to int +template +struct CastPrimitive> { + static void Exec(const ArraySpan& arr, ArraySpan* out) { + using OutT = typename OutType::c_type; + const uint16_t* in_values = arr.GetValues(1); + OutT* out_values = out->GetValues(1); + for (int64_t i = 0; i < arr.length; ++i) { + *out_values++ = static_cast(Float16::FromBits(*in_values++).ToFloat()); + } + } +}; + template void CastNumberImpl(Type::type out_type, const ArraySpan& input, ArraySpan* out) { switch (out_type) { @@ -79,6 +145,8 @@ void CastNumberImpl(Type::type out_type, const ArraySpan& input, ArraySpan* out) return CastPrimitive::Exec(input, out); case Type::DOUBLE: return CastPrimitive::Exec(input, out); + case Type::HALF_FLOAT: + return CastPrimitive::Exec(input, out); default: break; } @@ -109,6 +177,8 @@ void CastNumberToNumberUnsafe(Type::type in_type, Type::type out_type, return CastNumberImpl(out_type, input, out); case Type::DOUBLE: return CastNumberImpl(out_type, input, out); + case Type::HALF_FLOAT: + return CastNumberImpl(out_type, input, out); default: DCHECK(false); break; diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc b/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc index b054e57f04d12..3df86e7d6936c 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc @@ -23,6 +23,7 @@ #include "arrow/compute/kernels/util_internal.h" #include "arrow/scalar.h" #include "arrow/util/bit_block_counter.h" +#include "arrow/util/float16.h" #include "arrow/util/int_util.h" #include "arrow/util/value_parsing.h" @@ -34,6 +35,7 @@ using internal::IntegersCanFit; using internal::OptionalBitBlockCounter; using internal::ParseValue; using internal::PrimitiveScalarBase; +using util::Float16; namespace compute { namespace internal { @@ -56,18 +58,37 @@ Status CastFloatingToFloating(KernelContext*, const ExecSpan& batch, ExecResult* // ---------------------------------------------------------------------- // Implement fast safe floating point to integer cast +// +template +struct WasTruncated { + static bool Check(OutT out_val, InT in_val) { + return static_cast(out_val) != in_val; + } + + static bool CheckMaybeNull(OutT out_val, InT in_val, bool is_valid) { + return is_valid && static_cast(out_val) != in_val; + } +}; + +// Half float to int +template +struct WasTruncated { + using OutT = typename OutType::c_type; + static bool Check(OutT out_val, 
uint16_t in_val) { + return static_cast(out_val) != Float16::FromBits(in_val).ToFloat(); + } + + static bool CheckMaybeNull(OutT out_val, uint16_t in_val, bool is_valid) { + return is_valid && static_cast(out_val) != Float16::FromBits(in_val).ToFloat(); + } +}; // InType is a floating point type we are planning to cast to integer template ARROW_DISABLE_UBSAN("float-cast-overflow") Status CheckFloatTruncation(const ArraySpan& input, const ArraySpan& output) { - auto WasTruncated = [&](OutT out_val, InT in_val) -> bool { - return static_cast(out_val) != in_val; - }; - auto WasTruncatedMaybeNull = [&](OutT out_val, InT in_val, bool is_valid) -> bool { - return is_valid && static_cast(out_val) != in_val; - }; auto GetErrorMessage = [&](InT val) { return Status::Invalid("Float value ", val, " was truncated converting to ", *output.type); @@ -86,26 +107,28 @@ Status CheckFloatTruncation(const ArraySpan& input, const ArraySpan& output) { if (block.popcount == block.length) { // Fast path: branchless for (int64_t i = 0; i < block.length; ++i) { - block_out_of_bounds |= WasTruncated(out_data[i], in_data[i]); + block_out_of_bounds |= + WasTruncated::Check(out_data[i], in_data[i]); } } else if (block.popcount > 0) { // Indices have nulls, must only boundscheck non-null values for (int64_t i = 0; i < block.length; ++i) { - block_out_of_bounds |= WasTruncatedMaybeNull( + block_out_of_bounds |= WasTruncated::CheckMaybeNull( out_data[i], in_data[i], bit_util::GetBit(bitmap, offset_position + i)); } } if (ARROW_PREDICT_FALSE(block_out_of_bounds)) { if (input.GetNullCount() > 0) { for (int64_t i = 0; i < block.length; ++i) { - if (WasTruncatedMaybeNull(out_data[i], in_data[i], - bit_util::GetBit(bitmap, offset_position + i))) { + if (WasTruncated::CheckMaybeNull( + out_data[i], in_data[i], + bit_util::GetBit(bitmap, offset_position + i))) { return GetErrorMessage(in_data[i]); } } } else { for (int64_t i = 0; i < block.length; ++i) { - if (WasTruncated(out_data[i], in_data[i])) { + if (WasTruncated::Check(out_data[i], in_data[i])) { return GetErrorMessage(in_data[i]); } } @@ -151,6 +174,9 @@ Status CheckFloatToIntTruncation(const ExecValue& input, const ExecResult& outpu return CheckFloatToIntTruncationImpl(input.array, *output.array_span()); case Type::DOUBLE: return CheckFloatToIntTruncationImpl(input.array, *output.array_span()); + case Type::HALF_FLOAT: + return CheckFloatToIntTruncationImpl(input.array, + *output.array_span()); default: break; } @@ -293,6 +319,15 @@ struct CastFunctor< } }; +template <> +struct CastFunctor> { + static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + return applicator::ScalarUnaryNotNull>::Exec(ctx, batch, + out); + } +}; + // ---------------------------------------------------------------------- // Decimal to integer @@ -689,6 +724,10 @@ std::shared_ptr GetCastToInteger(std::string name) { DCHECK_OK(func->AddKernel(in_ty->id(), {in_ty}, out_ty, CastFloatingToInteger)); } + // Cast from half-float + DCHECK_OK(func->AddKernel(Type::HALF_FLOAT, {InputType(Type::HALF_FLOAT)}, out_ty, + CastFloatingToInteger)); + // From other numbers to integer AddCommonNumberCasts(out_ty, func.get()); @@ -715,6 +754,10 @@ std::shared_ptr GetCastToFloating(std::string name) { DCHECK_OK(func->AddKernel(in_ty->id(), {in_ty}, out_ty, CastFloatingToFloating)); } + // From half-float to float/double + DCHECK_OK(func->AddKernel(Type::HALF_FLOAT, {InputType(Type::HALF_FLOAT)}, out_ty, + CastFloatingToFloating)); + // From other numbers to floating point 
AddCommonNumberCasts(out_ty, func.get()); @@ -723,6 +766,7 @@ std::shared_ptr GetCastToFloating(std::string name) { CastFunctor::Exec)); DCHECK_OK(func->AddKernel(Type::DECIMAL256, {InputType(Type::DECIMAL256)}, out_ty, CastFunctor::Exec)); + return func; } @@ -795,6 +839,32 @@ std::shared_ptr GetCastToDecimal256() { return func; } +std::shared_ptr GetCastToHalfFloat() { + // HalfFloat is a bit brain-damaged for now + auto func = std::make_shared("func", Type::HALF_FLOAT); + AddCommonCasts(Type::HALF_FLOAT, float16(), func.get()); + + // Casts from integer to floating point + for (const std::shared_ptr& in_ty : IntTypes()) { + DCHECK_OK(func->AddKernel(in_ty->id(), {in_ty}, + TypeTraits::type_singleton(), + CastIntegerToFloating)); + } + + // Cast from other strings to half float. + for (const std::shared_ptr& in_ty : BaseBinaryTypes()) { + auto exec = GenerateVarBinaryBase(*in_ty); + DCHECK_OK(func->AddKernel(in_ty->id(), {in_ty}, + TypeTraits::type_singleton(), exec)); + } + + DCHECK_OK(func.get()->AddKernel(Type::FLOAT, {InputType(Type::FLOAT)}, float16(), + CastFloatingToFloating)); + DCHECK_OK(func.get()->AddKernel(Type::DOUBLE, {InputType(Type::DOUBLE)}, float16(), + CastFloatingToFloating)); + return func; +} + } // namespace std::vector> GetNumericCasts() { @@ -830,13 +900,14 @@ std::vector> GetNumericCasts() { functions.push_back(GetCastToInteger("cast_uint64")); // HalfFloat is a bit brain-damaged for now - auto cast_half_float = - std::make_shared("cast_half_float", Type::HALF_FLOAT); - AddCommonCasts(Type::HALF_FLOAT, float16(), cast_half_float.get()); + auto cast_half_float = GetCastToHalfFloat(); functions.push_back(cast_half_float); - functions.push_back(GetCastToFloating("cast_float")); - functions.push_back(GetCastToFloating("cast_double")); + auto cast_float = GetCastToFloating("cast_float"); + functions.push_back(cast_float); + + auto cast_double = GetCastToFloating("cast_double"); + functions.push_back(cast_double); functions.push_back(GetCastToDecimal128()); functions.push_back(GetCastToDecimal256()); diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc index a6576e4e4c26f..3a8352a9b870f 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc @@ -437,6 +437,10 @@ void AddNumberToStringCasts(CastFunction* func) { GenerateNumeric(*in_ty), NullHandling::COMPUTED_NO_PREALLOCATE)); } + + DCHECK_OK(func->AddKernel(Type::HALF_FLOAT, {float16()}, out_ty, + NumericToStringCastFunctor::Exec, + NullHandling::COMPUTED_NO_PREALLOCATE)); } template diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index a8acf68f66c8b..af62b4da2caa5 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -389,7 +389,7 @@ TEST(Cast, ToIntDowncastUnsafe) { } TEST(Cast, FloatingToInt) { - for (auto from : {float32(), float64()}) { + for (auto from : {float16(), float32(), float64()}) { for (auto to : {int32(), int64()}) { // float to int no truncation CheckCast(ArrayFromJSON(from, "[1.0, null, 0.0, -1.0, 5.0]"), @@ -407,6 +407,15 @@ TEST(Cast, FloatingToInt) { } } +TEST(Cast, FloatingToFloating) { + for (auto from : {float16(), float32(), float64()}) { + for (auto to : {float16(), float32(), float64()}) { + CheckCast(ArrayFromJSON(from, "[1.0, 0.0, -1.0, 5.0]"), + ArrayFromJSON(to, "[1.0, 0.0, -1.0, 5.0]")); + } + } +} + TEST(Cast, 
IntToFloating) { for (auto from : {uint32(), int32()}) { std::string two_24 = "[16777216, 16777217]"; @@ -2220,14 +2229,12 @@ TEST(Cast, IntToString) { } TEST(Cast, FloatingToString) { - for (auto string_type : {utf8(), large_utf8()}) { - CheckCast( - ArrayFromJSON(float32(), "[0.0, -0.0, 1.5, -Inf, Inf, NaN, null]"), - ArrayFromJSON(string_type, R"(["0", "-0", "1.5", "-inf", "inf", "nan", null])")); - - CheckCast( - ArrayFromJSON(float64(), "[0.0, -0.0, 1.5, -Inf, Inf, NaN, null]"), - ArrayFromJSON(string_type, R"(["0", "-0", "1.5", "-inf", "inf", "nan", null])")); + for (auto float_type : {float16(), float32(), float64()}) { + for (auto string_type : {utf8(), large_utf8()}) { + CheckCast(ArrayFromJSON(float_type, "[0.0, -0.0, 1.5, -Inf, Inf, NaN, null]"), + ArrayFromJSON(string_type, + R"(["0", "-0", "1.5", "-inf", "inf", "nan", null])")); + } } } diff --git a/cpp/src/arrow/ipc/json_simple.cc b/cpp/src/arrow/ipc/json_simple.cc index ceeabe01677ed..9fd449831c980 100644 --- a/cpp/src/arrow/ipc/json_simple.cc +++ b/cpp/src/arrow/ipc/json_simple.cc @@ -36,6 +36,7 @@ #include "arrow/type_traits.h" #include "arrow/util/checked_cast.h" #include "arrow/util/decimal.h" +#include "arrow/util/float16.h" #include "arrow/util/logging.h" #include "arrow/util/value_parsing.h" @@ -52,6 +53,7 @@ namespace rj = arrow::rapidjson; namespace arrow { using internal::ParseValue; +using util::Float16; namespace ipc { namespace internal { @@ -232,9 +234,9 @@ enable_if_physical_signed_integer ConvertNumber(const rj::Value& json // Convert single unsigned integer value template -enable_if_physical_unsigned_integer ConvertNumber(const rj::Value& json_obj, - const DataType& type, - typename T::c_type* out) { +enable_if_unsigned_integer ConvertNumber(const rj::Value& json_obj, + const DataType& type, + typename T::c_type* out) { if (json_obj.IsUint64()) { uint64_t v64 = json_obj.GetUint64(); *out = static_cast(v64); @@ -249,6 +251,30 @@ enable_if_physical_unsigned_integer ConvertNumber(const rj::Value& js } } +// Convert float16/HalfFloatType +template +enable_if_half_float ConvertNumber(const rj::Value& json_obj, + const DataType& type, uint16_t* out) { + if (json_obj.IsDouble()) { + double f64 = json_obj.GetDouble(); + *out = Float16(f64).bits(); + return Status::OK(); + } else if (json_obj.IsUint()) { + uint32_t u32t = json_obj.GetUint(); + double f64 = static_cast(u32t); + *out = Float16(f64).bits(); + return Status::OK(); + } else if (json_obj.IsInt()) { + int32_t i32t = json_obj.GetInt(); + double f64 = static_cast(i32t); + *out = Float16(f64).bits(); + return Status::OK(); + } else { + *out = static_cast(0); + return JSONTypeError("unsigned int", json_obj.GetType()); + } +} + // Convert single floating point value template enable_if_physical_floating_point ConvertNumber(const rj::Value& json_obj, diff --git a/cpp/src/arrow/ipc/json_simple_test.cc b/cpp/src/arrow/ipc/json_simple_test.cc index ea3a9ae1a14a9..b3f7fc5b3458b 100644 --- a/cpp/src/arrow/ipc/json_simple_test.cc +++ b/cpp/src/arrow/ipc/json_simple_test.cc @@ -44,6 +44,7 @@ #include "arrow/util/bitmap_builders.h" #include "arrow/util/checked_cast.h" #include "arrow/util/decimal.h" +#include "arrow/util/float16.h" #if defined(_MSC_VER) // "warning C4307: '+': integral constant overflow" @@ -51,6 +52,9 @@ #endif namespace arrow { + +using util::Float16; + namespace ipc { namespace internal { namespace json { @@ -185,6 +189,21 @@ class TestIntegers : public ::testing::Test { TYPED_TEST_SUITE_P(TestIntegers); +template +std::vector 
TestIntegersMutateIfNeeded( + std::vector data) { + return data; +} + +// TODO: This works, but is it the right way to do this? +template <> +std::vector TestIntegersMutateIfNeeded( + std::vector data) { + std::for_each(data.begin(), data.end(), + [](HalfFloatType::c_type& value) { value = Float16(value).bits(); }); + return data; +} + TYPED_TEST_P(TestIntegers, Basics) { using T = TypeParam; using c_type = typename T::c_type; @@ -193,16 +212,17 @@ TYPED_TEST_P(TestIntegers, Basics) { auto type = this->type(); AssertJSONArray(type, "[]", {}); - AssertJSONArray(type, "[4, 0, 5]", {4, 0, 5}); - AssertJSONArray(type, "[4, null, 5]", {true, false, true}, {4, 0, 5}); + AssertJSONArray(type, "[4, 0, 5]", TestIntegersMutateIfNeeded({4, 0, 5})); + AssertJSONArray(type, "[4, null, 5]", {true, false, true}, + TestIntegersMutateIfNeeded({4, 0, 5})); // Test limits const auto min_val = std::numeric_limits::min(); const auto max_val = std::numeric_limits::max(); std::string json_string = JSONArray(0, 1, min_val); - AssertJSONArray(type, json_string, {0, 1, min_val}); + AssertJSONArray(type, json_string, TestIntegersMutateIfNeeded({0, 1, min_val})); json_string = JSONArray(0, 1, max_val); - AssertJSONArray(type, json_string, {0, 1, max_val}); + AssertJSONArray(type, json_string, TestIntegersMutateIfNeeded({0, 1, max_val})); } TYPED_TEST_P(TestIntegers, Errors) { @@ -269,7 +289,12 @@ INSTANTIATE_TYPED_TEST_SUITE_P(TestUInt8, TestIntegers, UInt8Type); INSTANTIATE_TYPED_TEST_SUITE_P(TestUInt16, TestIntegers, UInt16Type); INSTANTIATE_TYPED_TEST_SUITE_P(TestUInt32, TestIntegers, UInt32Type); INSTANTIATE_TYPED_TEST_SUITE_P(TestUInt64, TestIntegers, UInt64Type); -INSTANTIATE_TYPED_TEST_SUITE_P(TestHalfFloat, TestIntegers, HalfFloatType); +// FIXME: I understand that HalfFloatType is backed by a uint16_t, but does it +// make sense to run this test over it? +// The way ConvertNumber for HalfFloatType is currently written, it allows the +// conversion of floating point notation to a half float, which causes failures +// in this test, one example is asserting 0.0 cannot be parsed as a half float. 
+// INSTANTIATE_TYPED_TEST_SUITE_P(TestHalfFloat, TestIntegers, HalfFloatType); template class TestStrings : public ::testing::Test { diff --git a/cpp/src/arrow/record_batch_test.cc b/cpp/src/arrow/record_batch_test.cc index 7e0eb1d460555..95f601465b440 100644 --- a/cpp/src/arrow/record_batch_test.cc +++ b/cpp/src/arrow/record_batch_test.cc @@ -36,11 +36,14 @@ #include "arrow/testing/gtest_util.h" #include "arrow/testing/random.h" #include "arrow/type.h" +#include "arrow/util/float16.h" #include "arrow/util/iterator.h" #include "arrow/util/key_value_metadata.h" namespace arrow { +using util::Float16; + class TestRecordBatch : public ::testing::Test {}; TEST_F(TestRecordBatch, Equals) { diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index ed66c9367dc36..8caf4400fe86d 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -305,6 +305,7 @@ struct TypeTraits { using BuilderType = HalfFloatBuilder; using ScalarType = HalfFloatScalar; using TensorType = HalfFloatTensor; + using CType = uint16_t; static constexpr int64_t bytes_required(int64_t elements) { return elements * static_cast(sizeof(uint16_t)); diff --git a/cpp/src/arrow/util/formatting.cc b/cpp/src/arrow/util/formatting.cc index c16d42ce5cfe2..c5a7e03f8573a 100644 --- a/cpp/src/arrow/util/formatting.cc +++ b/cpp/src/arrow/util/formatting.cc @@ -18,10 +18,12 @@ #include "arrow/util/formatting.h" #include "arrow/util/config.h" #include "arrow/util/double_conversion.h" +#include "arrow/util/float16.h" #include "arrow/util/logging.h" namespace arrow { +using util::Float16; using util::double_conversion::DoubleToStringConverter; static constexpr int kMinBufferSize = DoubleToStringConverter::kBase10MaximalLength + 1; @@ -87,5 +89,14 @@ int FloatToStringFormatter::FormatFloat(double v, char* out_buffer, int out_size return builder.position(); } +int FloatToStringFormatter::FormatFloat(uint16_t v, char* out_buffer, int out_size) { + DCHECK_GE(out_size, kMinBufferSize); + util::double_conversion::StringBuilder builder(out_buffer, out_size); + bool result = impl_->converter_.ToShortest(Float16::FromBits(v).ToFloat(), &builder); + DCHECK(result); + ARROW_UNUSED(result); + return builder.position(); +} + } // namespace internal } // namespace arrow diff --git a/cpp/src/arrow/util/formatting.h b/cpp/src/arrow/util/formatting.h index 71bae74629e35..6125f792ff988 100644 --- a/cpp/src/arrow/util/formatting.h +++ b/cpp/src/arrow/util/formatting.h @@ -268,6 +268,7 @@ class ARROW_EXPORT FloatToStringFormatter { // Returns the number of characters written int FormatFloat(float v, char* out_buffer, int out_size); int FormatFloat(double v, char* out_buffer, int out_size); + int FormatFloat(uint16_t v, char* out_buffer, int out_size); protected: struct Impl; @@ -301,6 +302,12 @@ class FloatToStringFormatterMixin : public FloatToStringFormatter { } }; +template <> +class StringFormatter : public FloatToStringFormatterMixin { + public: + using FloatToStringFormatterMixin::FloatToStringFormatterMixin; +}; + template <> class StringFormatter : public FloatToStringFormatterMixin { public: diff --git a/cpp/src/arrow/util/value_parsing.cc b/cpp/src/arrow/util/value_parsing.cc index f6a24ac1467f8..e84aac995e35f 100644 --- a/cpp/src/arrow/util/value_parsing.cc +++ b/cpp/src/arrow/util/value_parsing.cc @@ -22,8 +22,11 @@ #include #include +#include "arrow/util/float16.h" #include "arrow/vendored/fast_float/fast_float.h" +using arrow::util::Float16; + namespace arrow { namespace internal { @@ -43,6 +46,17 @@ bool 
StringToFloat(const char* s, size_t length, char decimal_point, double* out return res.ec == std::errc() && res.ptr == s + length; } +// Half float +bool StringToFloat(const char* s, size_t length, char decimal_point, uint16_t* out) { + ::arrow_vendored::fast_float::parse_options options{ + ::arrow_vendored::fast_float::chars_format::general, decimal_point}; + float temp_out; + const auto res = + ::arrow_vendored::fast_float::from_chars_advanced(s, s + length, temp_out, options); + *out = Float16::FromFloat(temp_out).bits(); + return res.ec == std::errc() && res.ptr == s + length; +} + // ---------------------------------------------------------------------- // strptime-like parsing diff --git a/cpp/src/arrow/util/value_parsing.h b/cpp/src/arrow/util/value_parsing.h index b3c711840f3e2..609906052cd20 100644 --- a/cpp/src/arrow/util/value_parsing.h +++ b/cpp/src/arrow/util/value_parsing.h @@ -135,6 +135,9 @@ bool StringToFloat(const char* s, size_t length, char decimal_point, float* out) ARROW_EXPORT bool StringToFloat(const char* s, size_t length, char decimal_point, double* out); +ARROW_EXPORT +bool StringToFloat(const char* s, size_t length, char decimal_point, uint16_t* out); + template <> struct StringConverter { using value_type = float; @@ -163,6 +166,20 @@ struct StringConverter { const char decimal_point; }; +template <> +struct StringConverter { + using value_type = uint16_t; + + explicit StringConverter(char decimal_point = '.') : decimal_point(decimal_point) {} + + bool Convert(const HalfFloatType&, const char* s, size_t length, value_type* out) { + return ARROW_PREDICT_TRUE(StringToFloat(s, length, decimal_point, out)); + } + + private: + const char decimal_point; +}; + // NOTE: HalfFloatType would require a half<->float conversion library inline uint8_t ParseDecimalDigit(char c) { return static_cast(c - '0'); } diff --git a/docs/source/status.rst b/docs/source/status.rst index 9af2fd1921e22..71d33eaa6520c 100644 --- a/docs/source/status.rst +++ b/docs/source/status.rst @@ -40,7 +40,7 @@ Data Types +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | UInt8/16/32/64 | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| Float16 | ✓ (1) | ✓ (2) | ✓ | ✓ | ✓ (3)| ✓ | ✓ | | +| Float16 | ✓ | ✓ (1) | ✓ | ✓ | ✓ (2)| ✓ | ✓ | | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Float32/64 | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ @@ -104,7 +104,7 @@ Data Types | Data type | C++ | Java | Go | JavaScript | C# | Rust | Julia | Swift | | (special) | | | | | | | | | +===================+=======+=======+=======+============+=======+=======+=======+=======+ -| Dictionary | ✓ | ✓ (4) | ✓ | ✓ | ✓ | ✓ (3) | ✓ | | +| Dictionary | ✓ | ✓ (3) | ✓ | ✓ | ✓ | ✓ (3) | ✓ | | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Extension | ✓ | ✓ | ✓ | | | ✓ | ✓ | | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ @@ -113,10 +113,9 @@ Data Types Notes: -* \(1) Casting to/from Float16 in C++ is not supported. -* \(2) Casting to/from Float16 in Java is not supported. -* \(3) Float16 support in C# is only available when targeting .NET 6+. -* \(4) Nested dictionaries not supported +* \(1) Casting to/from Float16 in Java is not supported. 
+* \(2) Float16 support in C# is only available when targeting .NET 6+. +* \(3) Nested dictionaries not supported .. seealso:: The :ref:`format_columnar` specification. From 1a1d2c87d11c48569fa603cc157e6bddc1f7d306 Mon Sep 17 00:00:00 2001 From: Joe Marshall Date: Fri, 5 Apr 2024 03:34:32 +0100 Subject: [PATCH 63/81] GH-23221: [C++] Add support for building with Emscripten (#37821) Split from #37696 This is just the cmake changes to enable building on emscripten. Changes are: 1) Support for target system "emscripten" 2) Cmake preset for building libarrow ` ninja-release-python-emscripten` (same as `ninja-release-python`, but with emscripten support) 3) Override file for cmake on emscripten, to set various build parameters that need setting to make it build there. 4) Changes in pyarrow cmake so it works if you are building libarrow as shared library, and also an option to enable the cmake file there to just dump the current arrow configuration, which is useful for cross-compile builds. * Closes: #23221 Lead-authored-by: Joe Marshall Co-authored-by: Sutou Kouhei Co-authored-by: Joris Van den Bossche Co-authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- ci/docker/ubuntu-22.04-cpp.dockerfile | 16 +- ci/scripts/cpp_build.sh | 232 ++++++++++-------- ci/scripts/cpp_test.sh | 4 + ci/scripts/go_bench.sh | 0 cpp/CMakePresets.json | 46 +++- cpp/build-support/emscripten-test-init.js | 24 ++ cpp/cmake_modules/BuildUtils.cmake | 4 +- cpp/cmake_modules/SetupCxxFlags.cmake | 89 ++++++- cpp/cmake_modules/ThirdpartyToolchain.cmake | 184 +++++++++++--- cpp/src/arrow/array/array_dict_test.cc | 2 +- .../compute/kernels/scalar_string_test.cc | 11 + .../compute/kernels/scalar_temporal_test.cc | 9 +- cpp/src/arrow/filesystem/localfs_test.cc | 3 + cpp/src/arrow/io/file.cc | 6 + cpp/src/arrow/io/file_test.cc | 15 ++ cpp/src/arrow/ipc/read_write_test.cc | 7 + cpp/src/arrow/util/async_generator_test.cc | 8 + cpp/src/arrow/util/atfork_test.cc | 20 ++ cpp/src/arrow/util/cache_test.cc | 5 + cpp/src/arrow/util/cancel_test.cc | 24 ++ cpp/src/arrow/util/counting_semaphore_test.cc | 13 + cpp/src/arrow/util/future_test.cc | 16 ++ cpp/src/arrow/util/io_util.cc | 9 +- cpp/src/arrow/util/io_util_test.cc | 35 ++- cpp/src/arrow/util/mutex.cc | 6 +- cpp/src/arrow/util/rle_encoding_test.cc | 16 +- cpp/src/arrow/util/value_parsing_test.cc | 5 + dev/tasks/tasks.yml | 8 + docker-compose.yml | 26 ++ docs/source/developers/cpp/emscripten.rst | 99 ++++++++ docs/source/developers/cpp/index.rst | 1 + 31 files changed, 784 insertions(+), 159 deletions(-) mode change 100644 => 100755 ci/scripts/go_bench.sh create mode 100644 cpp/build-support/emscripten-test-init.js create mode 100644 docs/source/developers/cpp/emscripten.rst diff --git a/ci/docker/ubuntu-22.04-cpp.dockerfile b/ci/docker/ubuntu-22.04-cpp.dockerfile index e8416c1378a9a..eb189841cd344 100644 --- a/ci/docker/ubuntu-22.04-cpp.dockerfile +++ b/ci/docker/ubuntu-22.04-cpp.dockerfile @@ -65,6 +65,7 @@ RUN latest_system_llvm=14 && \ RUN apt-get update -y -q && \ apt-get install -y -q --no-install-recommends \ autoconf \ + bzip2 \ ca-certificates \ ccache \ cmake \ @@ -115,10 +116,20 @@ RUN apt-get update -y -q && \ rapidjson-dev \ rsync \ tzdata \ - wget && \ + wget \ + xz-utils && \ apt-get clean && \ rm -rf /var/lib/apt/lists* +# install emscripten using EMSDK +ARG emscripten_version="3.1.45" +RUN cd ~ && git clone https://github.com/emscripten-core/emsdk.git && \ + cd emsdk && \ + ./emsdk install ${emscripten_version} && \ + ./emsdk activate ${emscripten_version} && \ + 
echo "Installed emsdk to:" ~/emsdk + + ARG gcc_version="" RUN if [ "${gcc_version}" = "" ]; then \ apt-get update -y -q && \ @@ -151,6 +162,9 @@ RUN if [ "${gcc_version}" = "" ]; then \ update-alternatives --set c++ /usr/bin/g++; \ fi +# make sure zlib is cached in the EMSDK folder +RUN source ~/emsdk/emsdk_env.sh && embuilder --pic build zlib + COPY ci/scripts/install_minio.sh /arrow/ci/scripts/ RUN /arrow/ci/scripts/install_minio.sh latest /usr/local diff --git a/ci/scripts/cpp_build.sh b/ci/scripts/cpp_build.sh index 1e09924a5e576..52c89acb9a76a 100755 --- a/ci/scripts/cpp_build.sh +++ b/ci/scripts/cpp_build.sh @@ -92,112 +92,132 @@ esac mkdir -p ${build_dir} pushd ${build_dir} -cmake \ - -Dabsl_SOURCE=${absl_SOURCE:-} \ - -DARROW_ACERO=${ARROW_ACERO:-OFF} \ - -DARROW_AZURE=${ARROW_AZURE:-OFF} \ - -DARROW_BOOST_USE_SHARED=${ARROW_BOOST_USE_SHARED:-ON} \ - -DARROW_BUILD_BENCHMARKS_REFERENCE=${ARROW_BUILD_BENCHMARKS:-OFF} \ - -DARROW_BUILD_BENCHMARKS=${ARROW_BUILD_BENCHMARKS:-OFF} \ - -DARROW_BUILD_EXAMPLES=${ARROW_BUILD_EXAMPLES:-OFF} \ - -DARROW_BUILD_INTEGRATION=${ARROW_BUILD_INTEGRATION:-OFF} \ - -DARROW_BUILD_SHARED=${ARROW_BUILD_SHARED:-ON} \ - -DARROW_BUILD_STATIC=${ARROW_BUILD_STATIC:-ON} \ - -DARROW_BUILD_TESTS=${ARROW_BUILD_TESTS:-OFF} \ - -DARROW_BUILD_UTILITIES=${ARROW_BUILD_UTILITIES:-ON} \ - -DARROW_COMPUTE=${ARROW_COMPUTE:-ON} \ - -DARROW_CSV=${ARROW_CSV:-ON} \ - -DARROW_CUDA=${ARROW_CUDA:-OFF} \ - -DARROW_CXXFLAGS=${ARROW_CXXFLAGS:-} \ - -DARROW_CXX_FLAGS_DEBUG="${ARROW_CXX_FLAGS_DEBUG:-}" \ - -DARROW_CXX_FLAGS_RELEASE="${ARROW_CXX_FLAGS_RELEASE:-}" \ - -DARROW_CXX_FLAGS_RELWITHDEBINFO="${ARROW_CXX_FLAGS_RELWITHDEBINFO:-}" \ - -DARROW_C_FLAGS_DEBUG="${ARROW_C_FLAGS_DEBUG:-}" \ - -DARROW_C_FLAGS_RELEASE="${ARROW_C_FLAGS_RELEASE:-}" \ - -DARROW_C_FLAGS_RELWITHDEBINFO="${ARROW_C_FLAGS_RELWITHDEBINFO:-}" \ - -DARROW_DATASET=${ARROW_DATASET:-OFF} \ - -DARROW_DEPENDENCY_SOURCE=${ARROW_DEPENDENCY_SOURCE:-AUTO} \ - -DARROW_ENABLE_THREADING=${ARROW_ENABLE_THREADING:-ON} \ - -DARROW_ENABLE_TIMING_TESTS=${ARROW_ENABLE_TIMING_TESTS:-ON} \ - -DARROW_EXTRA_ERROR_CONTEXT=${ARROW_EXTRA_ERROR_CONTEXT:-OFF} \ - -DARROW_FILESYSTEM=${ARROW_FILESYSTEM:-ON} \ - -DARROW_FLIGHT=${ARROW_FLIGHT:-OFF} \ - -DARROW_FLIGHT_SQL=${ARROW_FLIGHT_SQL:-OFF} \ - -DARROW_FUZZING=${ARROW_FUZZING:-OFF} \ - -DARROW_GANDIVA_PC_CXX_FLAGS=${ARROW_GANDIVA_PC_CXX_FLAGS:-} \ - -DARROW_GANDIVA=${ARROW_GANDIVA:-OFF} \ - -DARROW_GCS=${ARROW_GCS:-OFF} \ - -DARROW_HDFS=${ARROW_HDFS:-ON} \ - -DARROW_INSTALL_NAME_RPATH=${ARROW_INSTALL_NAME_RPATH:-ON} \ - -DARROW_JEMALLOC=${ARROW_JEMALLOC:-ON} \ - -DARROW_JSON=${ARROW_JSON:-ON} \ - -DARROW_LARGE_MEMORY_TESTS=${ARROW_LARGE_MEMORY_TESTS:-OFF} \ - -DARROW_MIMALLOC=${ARROW_MIMALLOC:-OFF} \ - -DARROW_NO_DEPRECATED_API=${ARROW_NO_DEPRECATED_API:-OFF} \ - -DARROW_ORC=${ARROW_ORC:-OFF} \ - -DARROW_PARQUET=${ARROW_PARQUET:-OFF} \ - -DARROW_RUNTIME_SIMD_LEVEL=${ARROW_RUNTIME_SIMD_LEVEL:-MAX} \ - -DARROW_S3=${ARROW_S3:-OFF} \ - -DARROW_SIMD_LEVEL=${ARROW_SIMD_LEVEL:-DEFAULT} \ - -DARROW_SKYHOOK=${ARROW_SKYHOOK:-OFF} \ - -DARROW_SUBSTRAIT=${ARROW_SUBSTRAIT:-OFF} \ - -DARROW_TEST_LINKAGE=${ARROW_TEST_LINKAGE:-shared} \ - -DARROW_TEST_MEMCHECK=${ARROW_TEST_MEMCHECK:-OFF} \ - -DARROW_USE_ASAN=${ARROW_USE_ASAN:-OFF} \ - -DARROW_USE_CCACHE=${ARROW_USE_CCACHE:-ON} \ - -DARROW_USE_GLOG=${ARROW_USE_GLOG:-OFF} \ - -DARROW_USE_LD_GOLD=${ARROW_USE_LD_GOLD:-OFF} \ - -DARROW_USE_MOLD=${ARROW_USE_MOLD:-OFF} \ - -DARROW_USE_PRECOMPILED_HEADERS=${ARROW_USE_PRECOMPILED_HEADERS:-OFF} \ - 
-DARROW_USE_STATIC_CRT=${ARROW_USE_STATIC_CRT:-OFF} \ - -DARROW_USE_TSAN=${ARROW_USE_TSAN:-OFF} \ - -DARROW_USE_UBSAN=${ARROW_USE_UBSAN:-OFF} \ - -DARROW_VERBOSE_THIRDPARTY_BUILD=${ARROW_VERBOSE_THIRDPARTY_BUILD:-OFF} \ - -DARROW_WITH_BROTLI=${ARROW_WITH_BROTLI:-OFF} \ - -DARROW_WITH_BZ2=${ARROW_WITH_BZ2:-OFF} \ - -DARROW_WITH_LZ4=${ARROW_WITH_LZ4:-OFF} \ - -DARROW_WITH_OPENTELEMETRY=${ARROW_WITH_OPENTELEMETRY:-OFF} \ - -DARROW_WITH_MUSL=${ARROW_WITH_MUSL:-OFF} \ - -DARROW_WITH_SNAPPY=${ARROW_WITH_SNAPPY:-OFF} \ - -DARROW_WITH_UCX=${ARROW_WITH_UCX:-OFF} \ - -DARROW_WITH_UTF8PROC=${ARROW_WITH_UTF8PROC:-ON} \ - -DARROW_WITH_ZLIB=${ARROW_WITH_ZLIB:-OFF} \ - -DARROW_WITH_ZSTD=${ARROW_WITH_ZSTD:-OFF} \ - -DAWSSDK_SOURCE=${AWSSDK_SOURCE:-} \ - -DAzure_SOURCE=${Azure_SOURCE:-} \ - -Dbenchmark_SOURCE=${benchmark_SOURCE:-} \ - -DBOOST_SOURCE=${BOOST_SOURCE:-} \ - -DBrotli_SOURCE=${Brotli_SOURCE:-} \ - -DBUILD_WARNING_LEVEL=${BUILD_WARNING_LEVEL:-CHECKIN} \ - -Dc-ares_SOURCE=${cares_SOURCE:-} \ - -DCMAKE_BUILD_TYPE=${ARROW_BUILD_TYPE:-debug} \ - -DCMAKE_VERBOSE_MAKEFILE=${CMAKE_VERBOSE_MAKEFILE:-OFF} \ - -DCMAKE_C_FLAGS="${CFLAGS:-}" \ - -DCMAKE_CXX_FLAGS="${CXXFLAGS:-}" \ - -DCMAKE_CXX_STANDARD="${CMAKE_CXX_STANDARD:-17}" \ - -DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR:-lib} \ - -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX:-${ARROW_HOME}} \ - -DCMAKE_UNITY_BUILD=${CMAKE_UNITY_BUILD:-OFF} \ - -Dgflags_SOURCE=${gflags_SOURCE:-} \ - -Dgoogle_cloud_cpp_storage_SOURCE=${google_cloud_cpp_storage_SOURCE:-} \ - -DgRPC_SOURCE=${gRPC_SOURCE:-} \ - -DGTest_SOURCE=${GTest_SOURCE:-} \ - -Dlz4_SOURCE=${lz4_SOURCE:-} \ - -DORC_SOURCE=${ORC_SOURCE:-} \ - -DPARQUET_BUILD_EXAMPLES=${PARQUET_BUILD_EXAMPLES:-OFF} \ - -DPARQUET_BUILD_EXECUTABLES=${PARQUET_BUILD_EXECUTABLES:-OFF} \ - -DPARQUET_REQUIRE_ENCRYPTION=${PARQUET_REQUIRE_ENCRYPTION:-ON} \ - -DProtobuf_SOURCE=${Protobuf_SOURCE:-} \ - -DRapidJSON_SOURCE=${RapidJSON_SOURCE:-} \ - -Dre2_SOURCE=${re2_SOURCE:-} \ - -DSnappy_SOURCE=${Snappy_SOURCE:-} \ - -DThrift_SOURCE=${Thrift_SOURCE:-} \ - -Dutf8proc_SOURCE=${utf8proc_SOURCE:-} \ - -Dzstd_SOURCE=${zstd_SOURCE:-} \ - -Dxsimd_SOURCE=${xsimd_SOURCE:-} \ - -G "${CMAKE_GENERATOR:-Ninja}" \ - ${ARROW_CMAKE_ARGS} \ - ${source_dir} +if [ "${ARROW_EMSCRIPTEN:-OFF}" = "ON" ]; then + if [ "${UBUNTU}" = "20.04" ]; then + echo "arrow emscripten build is not supported on Ubuntu 20.04, run with UBUNTU=22.04" + exit -1 + fi + n_jobs=2 # Emscripten build fails on docker unless this is set really low + source ~/emsdk/emsdk_env.sh + emcmake cmake \ + --preset=ninja-${ARROW_BUILD_TYPE:-debug}-emscripten \ + -DCMAKE_VERBOSE_MAKEFILE=${CMAKE_VERBOSE_MAKEFILE:-OFF} \ + -DCMAKE_C_FLAGS="${CFLAGS:-}" \ + -DCMAKE_CXX_FLAGS="${CXXFLAGS:-}" \ + -DCMAKE_CXX_STANDARD="${CMAKE_CXX_STANDARD:-17}" \ + -DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR:-lib} \ + -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX:-${ARROW_HOME}} \ + -DCMAKE_UNITY_BUILD=${CMAKE_UNITY_BUILD:-OFF} \ + ${ARROW_CMAKE_ARGS} \ + ${source_dir} +else + cmake \ + -Dabsl_SOURCE=${absl_SOURCE:-} \ + -DARROW_ACERO=${ARROW_ACERO:-OFF} \ + -DARROW_AZURE=${ARROW_AZURE:-OFF} \ + -DARROW_BOOST_USE_SHARED=${ARROW_BOOST_USE_SHARED:-ON} \ + -DARROW_BUILD_BENCHMARKS_REFERENCE=${ARROW_BUILD_BENCHMARKS:-OFF} \ + -DARROW_BUILD_BENCHMARKS=${ARROW_BUILD_BENCHMARKS:-OFF} \ + -DARROW_BUILD_EXAMPLES=${ARROW_BUILD_EXAMPLES:-OFF} \ + -DARROW_BUILD_INTEGRATION=${ARROW_BUILD_INTEGRATION:-OFF} \ + -DARROW_BUILD_SHARED=${ARROW_BUILD_SHARED:-ON} \ + -DARROW_BUILD_STATIC=${ARROW_BUILD_STATIC:-ON} \ + 
-DARROW_BUILD_TESTS=${ARROW_BUILD_TESTS:-OFF} \ + -DARROW_BUILD_UTILITIES=${ARROW_BUILD_UTILITIES:-ON} \ + -DARROW_COMPUTE=${ARROW_COMPUTE:-ON} \ + -DARROW_CSV=${ARROW_CSV:-ON} \ + -DARROW_CUDA=${ARROW_CUDA:-OFF} \ + -DARROW_CXXFLAGS=${ARROW_CXXFLAGS:-} \ + -DARROW_CXX_FLAGS_DEBUG="${ARROW_CXX_FLAGS_DEBUG:-}" \ + -DARROW_CXX_FLAGS_RELEASE="${ARROW_CXX_FLAGS_RELEASE:-}" \ + -DARROW_CXX_FLAGS_RELWITHDEBINFO="${ARROW_CXX_FLAGS_RELWITHDEBINFO:-}" \ + -DARROW_C_FLAGS_DEBUG="${ARROW_C_FLAGS_DEBUG:-}" \ + -DARROW_C_FLAGS_RELEASE="${ARROW_C_FLAGS_RELEASE:-}" \ + -DARROW_C_FLAGS_RELWITHDEBINFO="${ARROW_C_FLAGS_RELWITHDEBINFO:-}" \ + -DARROW_DATASET=${ARROW_DATASET:-OFF} \ + -DARROW_DEPENDENCY_SOURCE=${ARROW_DEPENDENCY_SOURCE:-AUTO} \ + -DARROW_ENABLE_THREADING=${ARROW_ENABLE_THREADING:-ON} \ + -DARROW_ENABLE_TIMING_TESTS=${ARROW_ENABLE_TIMING_TESTS:-ON} \ + -DARROW_EXTRA_ERROR_CONTEXT=${ARROW_EXTRA_ERROR_CONTEXT:-OFF} \ + -DARROW_FILESYSTEM=${ARROW_FILESYSTEM:-ON} \ + -DARROW_FLIGHT=${ARROW_FLIGHT:-OFF} \ + -DARROW_FLIGHT_SQL=${ARROW_FLIGHT_SQL:-OFF} \ + -DARROW_FUZZING=${ARROW_FUZZING:-OFF} \ + -DARROW_GANDIVA_PC_CXX_FLAGS=${ARROW_GANDIVA_PC_CXX_FLAGS:-} \ + -DARROW_GANDIVA=${ARROW_GANDIVA:-OFF} \ + -DARROW_GCS=${ARROW_GCS:-OFF} \ + -DARROW_HDFS=${ARROW_HDFS:-ON} \ + -DARROW_INSTALL_NAME_RPATH=${ARROW_INSTALL_NAME_RPATH:-ON} \ + -DARROW_JEMALLOC=${ARROW_JEMALLOC:-ON} \ + -DARROW_JSON=${ARROW_JSON:-ON} \ + -DARROW_LARGE_MEMORY_TESTS=${ARROW_LARGE_MEMORY_TESTS:-OFF} \ + -DARROW_MIMALLOC=${ARROW_MIMALLOC:-OFF} \ + -DARROW_NO_DEPRECATED_API=${ARROW_NO_DEPRECATED_API:-OFF} \ + -DARROW_ORC=${ARROW_ORC:-OFF} \ + -DARROW_PARQUET=${ARROW_PARQUET:-OFF} \ + -DARROW_RUNTIME_SIMD_LEVEL=${ARROW_RUNTIME_SIMD_LEVEL:-MAX} \ + -DARROW_S3=${ARROW_S3:-OFF} \ + -DARROW_SIMD_LEVEL=${ARROW_SIMD_LEVEL:-DEFAULT} \ + -DARROW_SKYHOOK=${ARROW_SKYHOOK:-OFF} \ + -DARROW_SUBSTRAIT=${ARROW_SUBSTRAIT:-OFF} \ + -DARROW_TEST_LINKAGE=${ARROW_TEST_LINKAGE:-shared} \ + -DARROW_TEST_MEMCHECK=${ARROW_TEST_MEMCHECK:-OFF} \ + -DARROW_USE_ASAN=${ARROW_USE_ASAN:-OFF} \ + -DARROW_USE_CCACHE=${ARROW_USE_CCACHE:-ON} \ + -DARROW_USE_GLOG=${ARROW_USE_GLOG:-OFF} \ + -DARROW_USE_LD_GOLD=${ARROW_USE_LD_GOLD:-OFF} \ + -DARROW_USE_MOLD=${ARROW_USE_MOLD:-OFF} \ + -DARROW_USE_PRECOMPILED_HEADERS=${ARROW_USE_PRECOMPILED_HEADERS:-OFF} \ + -DARROW_USE_STATIC_CRT=${ARROW_USE_STATIC_CRT:-OFF} \ + -DARROW_USE_TSAN=${ARROW_USE_TSAN:-OFF} \ + -DARROW_USE_UBSAN=${ARROW_USE_UBSAN:-OFF} \ + -DARROW_VERBOSE_THIRDPARTY_BUILD=${ARROW_VERBOSE_THIRDPARTY_BUILD:-OFF} \ + -DARROW_WITH_BROTLI=${ARROW_WITH_BROTLI:-OFF} \ + -DARROW_WITH_BZ2=${ARROW_WITH_BZ2:-OFF} \ + -DARROW_WITH_LZ4=${ARROW_WITH_LZ4:-OFF} \ + -DARROW_WITH_OPENTELEMETRY=${ARROW_WITH_OPENTELEMETRY:-OFF} \ + -DARROW_WITH_MUSL=${ARROW_WITH_MUSL:-OFF} \ + -DARROW_WITH_SNAPPY=${ARROW_WITH_SNAPPY:-OFF} \ + -DARROW_WITH_UCX=${ARROW_WITH_UCX:-OFF} \ + -DARROW_WITH_UTF8PROC=${ARROW_WITH_UTF8PROC:-ON} \ + -DARROW_WITH_ZLIB=${ARROW_WITH_ZLIB:-OFF} \ + -DARROW_WITH_ZSTD=${ARROW_WITH_ZSTD:-OFF} \ + -DAWSSDK_SOURCE=${AWSSDK_SOURCE:-} \ + -DAzure_SOURCE=${Azure_SOURCE:-} \ + -Dbenchmark_SOURCE=${benchmark_SOURCE:-} \ + -DBOOST_SOURCE=${BOOST_SOURCE:-} \ + -DBrotli_SOURCE=${Brotli_SOURCE:-} \ + -DBUILD_WARNING_LEVEL=${BUILD_WARNING_LEVEL:-CHECKIN} \ + -Dc-ares_SOURCE=${cares_SOURCE:-} \ + -DCMAKE_BUILD_TYPE=${ARROW_BUILD_TYPE:-debug} \ + -DCMAKE_VERBOSE_MAKEFILE=${CMAKE_VERBOSE_MAKEFILE:-OFF} \ + -DCMAKE_C_FLAGS="${CFLAGS:-}" \ + -DCMAKE_CXX_FLAGS="${CXXFLAGS:-}" \ + -DCMAKE_CXX_STANDARD="${CMAKE_CXX_STANDARD:-17}" \ + 
-DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR:-lib} \ + -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX:-${ARROW_HOME}} \ + -DCMAKE_UNITY_BUILD=${CMAKE_UNITY_BUILD:-OFF} \ + -Dgflags_SOURCE=${gflags_SOURCE:-} \ + -Dgoogle_cloud_cpp_storage_SOURCE=${google_cloud_cpp_storage_SOURCE:-} \ + -DgRPC_SOURCE=${gRPC_SOURCE:-} \ + -DGTest_SOURCE=${GTest_SOURCE:-} \ + -Dlz4_SOURCE=${lz4_SOURCE:-} \ + -DORC_SOURCE=${ORC_SOURCE:-} \ + -DPARQUET_BUILD_EXAMPLES=${PARQUET_BUILD_EXAMPLES:-OFF} \ + -DPARQUET_BUILD_EXECUTABLES=${PARQUET_BUILD_EXECUTABLES:-OFF} \ + -DPARQUET_REQUIRE_ENCRYPTION=${PARQUET_REQUIRE_ENCRYPTION:-ON} \ + -DProtobuf_SOURCE=${Protobuf_SOURCE:-} \ + -DRapidJSON_SOURCE=${RapidJSON_SOURCE:-} \ + -Dre2_SOURCE=${re2_SOURCE:-} \ + -DSnappy_SOURCE=${Snappy_SOURCE:-} \ + -DThrift_SOURCE=${Thrift_SOURCE:-} \ + -Dutf8proc_SOURCE=${utf8proc_SOURCE:-} \ + -Dzstd_SOURCE=${zstd_SOURCE:-} \ + -Dxsimd_SOURCE=${xsimd_SOURCE:-} \ + -G "${CMAKE_GENERATOR:-Ninja}" \ + ${ARROW_CMAKE_ARGS} \ + ${source_dir} +fi export CMAKE_BUILD_PARALLEL_LEVEL=${CMAKE_BUILD_PARALLEL_LEVEL:-$[${n_jobs} + 1]} time cmake --build . --target install diff --git a/ci/scripts/cpp_test.sh b/ci/scripts/cpp_test.sh index f388825fd0a98..2c640f2c1fb6a 100755 --- a/ci/scripts/cpp_test.sh +++ b/ci/scripts/cpp_test.sh @@ -80,6 +80,10 @@ case "$(uname)" in ;; esac +if [ "${ARROW_EMSCRIPTEN:-OFF}" = "ON" ]; then + n_jobs=1 # avoid spurious fails on emscripten due to loading too many big executables +fi + pushd ${build_dir} if [ -z "${PYTHON}" ] && ! which python > /dev/null 2>&1; then diff --git a/ci/scripts/go_bench.sh b/ci/scripts/go_bench.sh old mode 100644 new mode 100755 diff --git a/cpp/CMakePresets.json b/cpp/CMakePresets.json index 9d99b3b2a79e0..13d1241990c31 100644 --- a/cpp/CMakePresets.json +++ b/cpp/CMakePresets.json @@ -46,6 +46,32 @@ "CMAKE_BUILD_TYPE": "RelWithDebInfo" } }, + { + "name": "features-emscripten", + "hidden": true, + "cacheVariables": { + "ARROW_ACERO": "ON", + "ARROW_BUILD_SHARED": "OFF", + "ARROW_BUILD_STATIC": "ON", + "ARROW_CUDA": "OFF", + "ARROW_DEPENDENCY_SOURCE": "BUNDLED", + "ARROW_DEPENDENCY_USE_SHARED": "OFF", + "ARROW_ENABLE_THREADING": "OFF", + "ARROW_FLIGHT": "OFF", + "ARROW_IPC": "ON", + "ARROW_JEMALLOC": "OFF", + "ARROW_MIMALLOC": "OFF", + "ARROW_ORC": "ON", + "ARROW_RUNTIME_SIMD_LEVEL": "NONE", + "ARROW_S3": "OFF", + "ARROW_SIMD_LEVEL": "NONE", + "ARROW_SUBSTRAIT": "ON", + "ARROW_WITH_BROTLI": "ON", + "ARROW_WITH_OPENTELEMETRY": "OFF", + "ARROW_WITH_SNAPPY": "ON", + "CMAKE_C_BYTE_ORDER": "LITTLE_ENDIAN" + } + }, { "name": "features-minimal", "hidden": true, @@ -341,6 +367,24 @@ "displayName": "Release build with CUDA integration", "cacheVariables": {} }, + { + "name": "ninja-debug-emscripten", + "inherits": [ + "features-emscripten", + "base-debug" + ], + "displayName": "Debug build which builds an Emscripten library", + "cacheVariables": {} + }, + { + "name": "ninja-release-emscripten", + "inherits": [ + "features-emscripten", + "base-release" + ], + "displayName": "Release build which builds an Emscripten library", + "cacheVariables": {} + }, { "name": "ninja-release-flight", "inherits": [ @@ -447,4 +491,4 @@ } } ] -} +} \ No newline at end of file diff --git a/cpp/build-support/emscripten-test-init.js b/cpp/build-support/emscripten-test-init.js new file mode 100644 index 0000000000000..bbb542a29f021 --- /dev/null +++ b/cpp/build-support/emscripten-test-init.js @@ -0,0 +1,24 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +var Module = { +}; + +// make sure tests can access the current parquet test data files +Module.preRun = () => {ENV.PARQUET_TEST_DATA = process.env.PARQUET_TEST_DATA; + ENV.ARROW_TEST_DATA = process.env.ARROW_TEST_DATA; +}; \ No newline at end of file diff --git a/cpp/cmake_modules/BuildUtils.cmake b/cpp/cmake_modules/BuildUtils.cmake index 7a45e9cca59de..e7523add27223 100644 --- a/cpp/cmake_modules/BuildUtils.cmake +++ b/cpp/cmake_modules/BuildUtils.cmake @@ -760,8 +760,8 @@ function(ADD_TEST_CASE REL_TEST_NAME) valgrind --suppressions=valgrind.supp --tool=memcheck --gen-suppressions=all \ --num-callers=500 --leak-check=full --leak-check-heuristics=stdstring \ --error-exitcode=1 ${TEST_PATH} ${ARG_TEST_ARGUMENTS}") - elseif(WIN32) - add_test(${TEST_NAME} ${TEST_PATH} ${ARG_TEST_ARGUMENTS}) + elseif(WIN32 OR CMAKE_SYSTEM_NAME STREQUAL "Emscripten") + add_test(NAME ${TEST_NAME} COMMAND ${TEST_NAME} ${ARG_TEST_ARGUMENTS}) else() add_test(${TEST_NAME} ${BUILD_SUPPORT_DIR}/run-test.sh diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake b/cpp/cmake_modules/SetupCxxFlags.cmake index 90decb4224ec6..1d709fe98d7fe 100644 --- a/cpp/cmake_modules/SetupCxxFlags.cmake +++ b/cpp/cmake_modules/SetupCxxFlags.cmake @@ -24,7 +24,9 @@ include(CheckCXXSourceCompiles) message(STATUS "System processor: ${CMAKE_SYSTEM_PROCESSOR}") if(NOT DEFINED ARROW_CPU_FLAG) - if(CMAKE_SYSTEM_PROCESSOR MATCHES "AMD64|amd64|X86|x86|i[3456]86|x64") + if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") + set(ARROW_CPU_FLAG "emscripten") + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "AMD64|amd64|X86|x86|i[3456]86|x64") set(ARROW_CPU_FLAG "x86") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|ARM64|arm64") set(ARROW_CPU_FLAG "aarch64") @@ -312,7 +314,12 @@ if("${BUILD_WARNING_LEVEL}" STREQUAL "CHECKIN") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wall") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wextra") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wdocumentation") - set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wshorten-64-to-32") + if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") + # size_t is 32 bit in Emscripten wasm32 - ignore conversion errors + set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-shorten-64-to-32") + else() + set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wshorten-64-to-32") + endif() set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-missing-braces") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-unused-parameter") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-constant-logical-operand") @@ -692,17 +699,36 @@ if(NOT MSVC) set(C_DEBUG_FLAGS "") set(CXX_DEBUG_FLAGS "") if(NOT MSVC) - if(NOT CMAKE_C_FLAGS_DEBUG MATCHES "-O") - string(APPEND C_DEBUG_FLAGS " -O0") - endif() - if(NOT CMAKE_CXX_FLAGS_DEBUG MATCHES "-O") - string(APPEND CXX_DEBUG_FLAGS " -O0") - endif() - if(ARROW_GGDB_DEBUG) - string(APPEND C_DEBUG_FLAGS " -ggdb") - string(APPEND 
CXX_DEBUG_FLAGS " -ggdb") - string(APPEND C_RELWITHDEBINFO_FLAGS " -ggdb") - string(APPEND CXX_RELWITHDEBINFO_FLAGS " -ggdb") + if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") + # with -g it uses DWARF debug info, which is really slow to build + # on emscripten (and uses tons of memory) + string(REPLACE "-g" " " CMAKE_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG}) + string(REPLACE "-g" " " CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) + string(APPEND C_DEBUG_FLAGS " -g2") + string(APPEND CXX_DEBUG_FLAGS " -g2") + string(APPEND C_RELWITHDEBINFO_FLAGS " -g2") + string(APPEND CXX_RELWITHDEBINFO_FLAGS " -g2") + # without -O1, emscripten executables are *MASSIVE*. Don't use -O0 + if(NOT CMAKE_C_FLAGS_DEBUG MATCHES "-O") + string(APPEND C_DEBUG_FLAGS " -O1") + endif() + if(NOT CMAKE_CXX_FLAGS_DEBUG MATCHES "-O") + string(APPEND CXX_DEBUG_FLAGS " -O1") + endif() + else() + if(NOT CMAKE_C_FLAGS_DEBUG MATCHES "-O") + string(APPEND C_DEBUG_FLAGS " -O0") + endif() + if(NOT CMAKE_CXX_FLAGS_DEBUG MATCHES "-O") + string(APPEND CXX_DEBUG_FLAGS " -O0") + endif() + + if(ARROW_GGDB_DEBUG) + string(APPEND C_DEBUG_FLAGS " -ggdb") + string(APPEND CXX_DEBUG_FLAGS " -ggdb") + string(APPEND C_RELWITHDEBINFO_FLAGS " -ggdb") + string(APPEND CXX_RELWITHDEBINFO_FLAGS " -ggdb") + endif() endif() endif() @@ -733,3 +759,40 @@ if(MSVC) set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${MSVC_LINKER_FLAGS}") endif() endif() + +if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") + # flags are: + # 1) We force *everything* to build as position independent + # 2) And with support for C++ exceptions + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC -fexceptions") + # deprecated-literal-operator error is thrown in datetime (vendored lib in arrow) + set(CMAKE_CXX_FLAGS + "${CMAKE_CXX_FLAGS} -fPIC -fexceptions -Wno-error=deprecated-literal-operator") + + # flags for creating shared libraries (only used in pyarrow, because + # Emscripten builds libarrow as static) + # flags are: + # 1) Tell it to use JavaScript / WebAssembly 64 bit number support. 
+ # 2) Tell it to build with support for C++ exceptions + # 3) Skip linker flags error which happens with -soname parameter + set(ARROW_EMSCRIPTEN_LINKER_FLAGS "-sWASM_BIGINT=1 -fexceptions -Wno-error=linkflags") + set(CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS + "-sSIDE_MODULE=1 ${ARROW_EMSCRIPTEN_LINKER_FLAGS}") + set(CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS + "-sSIDE_MODULE=1 ${ARROW_EMSCRIPTEN_LINKER_FLAGS}") + set(CMAKE_SHARED_LINKER_FLAGS "-sSIDE_MODULE=1 ${ARROW_EMSCRIPTEN_LINKER_FLAGS}") + if(ARROW_TESTING) + # flags for building test executables for use in node + if("${CMAKE_BUILD_TYPE}" STREQUAL "RELEASE") + set(CMAKE_EXE_LINKER_FLAGS + "${ARROW_EMSCRIPTEN_LINKER_FLAGS} -sALLOW_MEMORY_GROWTH -lnodefs.js -lnoderawfs.js --pre-js ${BUILD_SUPPORT_DIR}/emscripten-test-init.js" + ) + else() + set(CMAKE_EXE_LINKER_FLAGS + "${ARROW_EMSCRIPTEN_LINKER_FLAGS} -sERROR_ON_WASM_CHANGES_AFTER_LINK=1 -sALLOW_MEMORY_GROWTH -lnodefs.js -lnoderawfs.js --pre-js ${BUILD_SUPPORT_DIR}/emscripten-test-init.js" + ) + endif() + else() + set(CMAKE_EXE_LINKER_FLAGS "${ARROW_EMSCRIPTEN_LINKER_FLAGS} -sALLOW_MEMORY_GROWTH") + endif() +endif() diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index ad7344b09dd4e..4a67eac1d4d59 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -976,6 +976,23 @@ set(EP_COMMON_CMAKE_ARGS -DCMAKE_OSX_SYSROOT=${CMAKE_OSX_SYSROOT} -DCMAKE_VERBOSE_MAKEFILE=${CMAKE_VERBOSE_MAKEFILE}) +# if building with a toolchain file, pass that through +if(CMAKE_TOOLCHAIN_FILE) + list(APPEND EP_COMMON_CMAKE_ARGS -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}) +endif() + +# and crosscompiling emulator (for try_run() ) +if(CMAKE_CROSSCOMPILING_EMULATOR) + string(REPLACE ";" ${EP_LIST_SEPARATOR} EP_CMAKE_CROSSCOMPILING_EMULATOR + "${CMAKE_CROSSCOMPILING_EMULATOR}") + list(APPEND EP_COMMON_CMAKE_ARGS + -DCMAKE_CROSSCOMPILING_EMULATOR=${EP_CMAKE_CROSSCOMPILING_EMULATOR}) +endif() + +if(CMAKE_PROJECT_INCLUDE) + list(APPEND EP_COMMON_CMAKE_ARGS -DCMAKE_PROJECT_INCLUDE=${CMAKE_PROJECT_INCLUDE}) +endif() + # Enable s/ccache if set by parent. if(CMAKE_C_COMPILER_LAUNCHER AND CMAKE_CXX_COMPILER_LAUNCHER) list(APPEND EP_COMMON_CMAKE_ARGS @@ -1349,6 +1366,14 @@ macro(build_snappy) set(SNAPPY_PATCH_COMMAND) endif() + if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") + # ignore linker flag errors, as Snappy sets + # -Werror -Wall, and Emscripten doesn't support -soname + list(APPEND SNAPPY_CMAKE_ARGS + "-DCMAKE_SHARED_LINKER_FLAGS=${CMAKE_SHARED_LINKER_FLAGS}" + "-Wno-error=linkflags") + endif() + externalproject_add(snappy_ep ${EP_COMMON_OPTIONS} BUILD_IN_SOURCE 1 @@ -1394,6 +1419,7 @@ macro(build_brotli) message(STATUS "Building brotli from source") set(BROTLI_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/brotli_ep/src/brotli_ep-install") set(BROTLI_INCLUDE_DIR "${BROTLI_PREFIX}/include") + set(BROTLI_LIB_DIR "${BROTLI_PREFIX}/lib") set(BROTLI_STATIC_LIBRARY_ENC "${BROTLI_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}brotlienc-static${CMAKE_STATIC_LIBRARY_SUFFIX}" ) @@ -1405,6 +1431,26 @@ macro(build_brotli) ) set(BROTLI_CMAKE_ARGS ${EP_COMMON_CMAKE_ARGS} "-DCMAKE_INSTALL_PREFIX=${BROTLI_PREFIX}") + set(BROTLI_EP_OPTIONS) + if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") + # "cmake install" is disabled for Brotli on Emscripten, so the + # default INSTALL_COMMAND fails. We need to disable the default + # INSTALL_COMMAND. 
+ list(APPEND + BROTLI_EP_OPTIONS + INSTALL_COMMAND + ${CMAKE_COMMAND} + -E + true) + + set(BROTLI_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/brotli_ep-prefix/src/brotli_ep-build) + set(BROTLI_BUILD_LIBS + "${BROTLI_BUILD_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}brotlienc-static${CMAKE_STATIC_LIBRARY_SUFFIX}" + "${BROTLI_BUILD_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}brotlidec-static${CMAKE_STATIC_LIBRARY_SUFFIX}" + "${BROTLI_BUILD_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}brotlicommon-static${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) + endif() + externalproject_add(brotli_ep ${EP_COMMON_OPTIONS} URL ${BROTLI_SOURCE_URL} @@ -1414,7 +1460,20 @@ macro(build_brotli) "${BROTLI_STATIC_LIBRARY_COMMON}" ${BROTLI_BUILD_BYPRODUCTS} CMAKE_ARGS ${BROTLI_CMAKE_ARGS} - STEP_TARGETS headers_copy) + STEP_TARGETS headers_copy ${BROTLI_EP_OPTIONS}) + + if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") + # Copy the libraries to our install directory manually. + set(BROTLI_BUILD_INCLUDE_DIR + ${CMAKE_CURRENT_BINARY_DIR}/brotli_ep-prefix/src/brotli_ep/c/include/brotli) + add_custom_command(TARGET brotli_ep + POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different + ${BROTLI_BUILD_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}*${CMAKE_STATIC_LIBRARY_SUFFIX} + ${BROTLI_LIB_DIR} + COMMAND ${CMAKE_COMMAND} -E copy_directory + ${BROTLI_BUILD_INCLUDE_DIR} ${BROTLI_INCLUDE_DIR}/brotli) + endif() file(MAKE_DIRECTORY "${BROTLI_INCLUDE_DIR}") @@ -1657,6 +1716,9 @@ macro(build_thrift) if(DEFINED BOOST_ROOT) list(APPEND THRIFT_CMAKE_ARGS "-DBOOST_ROOT=${BOOST_ROOT}") endif() + if(DEFINED Boost_INCLUDE_DIR) + list(APPEND THRIFT_CMAKE_ARGS "-DBoost_INCLUDE_DIR=${Boost_INCLUDE_DIR}") + endif() if(DEFINED Boost_NAMESPACE) list(APPEND THRIFT_CMAKE_ARGS "-DBoost_NAMESPACE=${Boost_NAMESPACE}") endif() @@ -1798,6 +1860,36 @@ macro(build_protobuf) add_dependencies(arrow::protobuf::protoc protobuf_ep) list(APPEND ARROW_BUNDLED_STATIC_LIBS arrow::protobuf::libprotobuf) + + if(CMAKE_CROSSCOMPILING) + # If we are cross compiling, we need to build protoc for the host + # system also, as it is used when building Arrow + # We do this by calling CMake as a child process + # with CXXFLAGS / CFLAGS and CMake flags cleared. + set(PROTOBUF_HOST_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/protobuf_ep_host-install") + set(PROTOBUF_HOST_COMPILER "${PROTOBUF_HOST_PREFIX}/bin/protoc") + + set(PROTOBUF_HOST_CMAKE_ARGS + "-DCMAKE_CXX_FLAGS=" + "-DCMAKE_C_FLAGS=" + "-DCMAKE_INSTALL_PREFIX=${PROTOBUF_HOST_PREFIX}" + -Dprotobuf_BUILD_TESTS=OFF + -Dprotobuf_DEBUG_POSTFIX=) + + externalproject_add(protobuf_ep_host + ${EP_COMMON_OPTIONS} + CMAKE_ARGS ${PROTOBUF_HOST_CMAKE_ARGS} + BUILD_BYPRODUCTS "${PROTOBUF_HOST_COMPILER}" + BUILD_IN_SOURCE 1 + URL ${PROTOBUF_SOURCE_URL} + URL_HASH "SHA256=${ARROW_PROTOBUF_BUILD_SHA256_CHECKSUM}") + + add_executable(arrow::protobuf::host_protoc IMPORTED) + set_target_properties(arrow::protobuf::host_protoc + PROPERTIES IMPORTED_LOCATION "${PROTOBUF_HOST_COMPILER}") + + add_dependencies(arrow::protobuf::host_protoc protobuf_ep_host) + endif() endmacro() if(ARROW_WITH_PROTOBUF) @@ -1862,7 +1954,11 @@ if(ARROW_WITH_PROTOBUF) else() set(ARROW_PROTOBUF_LIBPROTOC protobuf::libprotoc) endif() - if(TARGET arrow::protobuf::protoc) + if(TARGET arrow::protobuf::host_protoc) + # make sure host protoc is used for compiling protobuf files + # during build of e.g. 
orc + set(ARROW_PROTOBUF_PROTOC arrow::protobuf::host_protoc) + elseif(TARGET arrow::protobuf::protoc) set(ARROW_PROTOBUF_PROTOC arrow::protobuf::protoc) else() if(NOT TARGET protobuf::protoc) @@ -2164,8 +2260,15 @@ function(build_gtest) if(APPLE) string(APPEND CMAKE_CXX_FLAGS " -Wno-unused-value" " -Wno-ignored-attributes") endif() - set(BUILD_SHARED_LIBS ON) - set(BUILD_STATIC_LIBS OFF) + # If we're building static libs for Emscripten, we need to build *everything* as + # static libs. + if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") + set(BUILD_SHARED_LIBS OFF) + set(BUILD_STATIC_LIBS ON) + else() + set(BUILD_SHARED_LIBS ON) + set(BUILD_STATIC_LIBS OFF) + endif() # We need to use "cache" variable to override the default # INSTALL_GTEST option by this value. See also: # https://cmake.org/cmake/help/latest/policy/CMP0077.html @@ -2403,37 +2506,58 @@ endif() macro(build_zlib) message(STATUS "Building ZLIB from source") - set(ZLIB_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/zlib_ep/src/zlib_ep-install") - if(MSVC) - if(${UPPERCASE_BUILD_TYPE} STREQUAL "DEBUG") - set(ZLIB_STATIC_LIB_NAME zlibstaticd.lib) - else() - set(ZLIB_STATIC_LIB_NAME zlibstatic.lib) + + # ensure zlib is built with -fpic + # and make sure that the build finds the version in Emscripten ports + # - n.b. the actual linking happens because -sUSE_ZLIB=1 is + # set in the compiler variables, but cmake expects + # it to exist at configuration time if we aren't building it as + # bundled. We need to do this for all packages + # not just zlib as some depend on zlib, but we don't rebuild + # if it exists already + if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") + # build zlib using Emscripten ports + if(NOT EXISTS ${EMSCRIPTEN_SYSROOT}/lib/wasm32-emscripten/pic/libz.a) + execute_process(COMMAND embuilder --pic --force build zlib) endif() + add_library(ZLIB::ZLIB STATIC IMPORTED) + set_property(TARGET ZLIB::ZLIB + PROPERTY IMPORTED_LOCATION + "${EMSCRIPTEN_SYSROOT}/lib/wasm32-emscripten/pic/libz.a") + list(APPEND ARROW_BUNDLED_STATIC_LIBS ZLIB::ZLIB) else() - set(ZLIB_STATIC_LIB_NAME libz.a) - endif() - set(ZLIB_STATIC_LIB "${ZLIB_PREFIX}/lib/${ZLIB_STATIC_LIB_NAME}") - set(ZLIB_CMAKE_ARGS ${EP_COMMON_CMAKE_ARGS} "-DCMAKE_INSTALL_PREFIX=${ZLIB_PREFIX}") + set(ZLIB_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/zlib_ep/src/zlib_ep-install") + if(MSVC) + if(${UPPERCASE_BUILD_TYPE} STREQUAL "DEBUG") + set(ZLIB_STATIC_LIB_NAME zlibstaticd.lib) + else() + set(ZLIB_STATIC_LIB_NAME zlibstatic.lib) + endif() + else() + set(ZLIB_STATIC_LIB_NAME libz.a) + endif() + set(ZLIB_STATIC_LIB "${ZLIB_PREFIX}/lib/${ZLIB_STATIC_LIB_NAME}") + set(ZLIB_CMAKE_ARGS ${EP_COMMON_CMAKE_ARGS} "-DCMAKE_INSTALL_PREFIX=${ZLIB_PREFIX}") - externalproject_add(zlib_ep - ${EP_COMMON_OPTIONS} - URL ${ZLIB_SOURCE_URL} - URL_HASH "SHA256=${ARROW_ZLIB_BUILD_SHA256_CHECKSUM}" - BUILD_BYPRODUCTS "${ZLIB_STATIC_LIB}" - CMAKE_ARGS ${ZLIB_CMAKE_ARGS}) + externalproject_add(zlib_ep + ${EP_COMMON_OPTIONS} + URL ${ZLIB_SOURCE_URL} + URL_HASH "SHA256=${ARROW_ZLIB_BUILD_SHA256_CHECKSUM}" + BUILD_BYPRODUCTS "${ZLIB_STATIC_LIB}" + CMAKE_ARGS ${ZLIB_CMAKE_ARGS}) - file(MAKE_DIRECTORY "${ZLIB_PREFIX}/include") + file(MAKE_DIRECTORY "${ZLIB_PREFIX}/include") - add_library(ZLIB::ZLIB STATIC IMPORTED) - set(ZLIB_LIBRARIES ${ZLIB_STATIC_LIB}) - set(ZLIB_INCLUDE_DIRS "${ZLIB_PREFIX}/include") - set_target_properties(ZLIB::ZLIB PROPERTIES IMPORTED_LOCATION ${ZLIB_LIBRARIES}) - target_include_directories(ZLIB::ZLIB BEFORE INTERFACE "${ZLIB_INCLUDE_DIRS}") + add_library(ZLIB::ZLIB STATIC IMPORTED) + 
set(ZLIB_LIBRARIES ${ZLIB_STATIC_LIB}) + set(ZLIB_INCLUDE_DIRS "${ZLIB_PREFIX}/include") + set_target_properties(ZLIB::ZLIB PROPERTIES IMPORTED_LOCATION ${ZLIB_LIBRARIES}) + target_include_directories(ZLIB::ZLIB BEFORE INTERFACE "${ZLIB_INCLUDE_DIRS}") - add_dependencies(ZLIB::ZLIB zlib_ep) + add_dependencies(ZLIB::ZLIB zlib_ep) + list(APPEND ARROW_BUNDLED_STATIC_LIBS ZLIB::ZLIB) + endif() - list(APPEND ARROW_BUNDLED_STATIC_LIBS ZLIB::ZLIB) set(ZLIB_VENDORED TRUE) endmacro() @@ -4390,6 +4514,10 @@ macro(build_orc) "-DPROTOBUF_LIBRARY=$" "-DPROTOC_LIBRARY=$" "-DSNAPPY_HOME=${ORC_SNAPPY_ROOT}" + "-DSNAPPY_LIBRARY=$" + "-DLZ4_LIBRARY=$" + "-DLZ4_STATIC_LIBRARY=$" + "-DLZ4_INCLUDE_DIR=${ORC_LZ4_ROOT}/include" "-DSNAPPY_INCLUDE_DIR=${ORC_SNAPPY_INCLUDE_DIR}" "-DZSTD_HOME=${ORC_ZSTD_ROOT}" "-DZSTD_INCLUDE_DIR=$" diff --git a/cpp/src/arrow/array/array_dict_test.cc b/cpp/src/arrow/array/array_dict_test.cc index 4ae9e3d6dcbfc..22d6d1fc5ae92 100644 --- a/cpp/src/arrow/array/array_dict_test.cc +++ b/cpp/src/arrow/array/array_dict_test.cc @@ -1129,7 +1129,7 @@ TEST(TestDictionary, Validate) { arr = std::make_shared(dict_type, indices, MakeArray(invalid_data)); ASSERT_RAISES(Invalid, arr->ValidateFull()); -#if !defined(__APPLE__) && !defined(ARROW_VALGRIND) +#if !defined(__APPLE__) && !defined(ARROW_VALGRIND) && !defined(__EMSCRIPTEN__) // GH-35712: ASSERT_DEATH would make testing slow on macOS. ASSERT_DEATH( { diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index 26289a7f787e1..c7dbdef2436c3 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -1988,6 +1988,11 @@ TYPED_TEST(TestBaseBinaryKernels, ExtractRegexInvalid) { #endif TYPED_TEST(TestStringKernels, Strptime) { +#ifdef __EMSCRIPTEN__ + GTEST_SKIP() << "Skipping some strptime tests due to emscripten bug " + "https://github.com/emscripten-core/emscripten/issues/20466"; +#endif + std::string input1 = R"(["5/1/2020", null, null, "12/13/1900", null])"; std::string input2 = R"(["5-1-2020", "12/13/1900"])"; std::string input3 = R"(["5/1/2020", "AA/BB/CCCC"])"; @@ -2008,6 +2013,7 @@ TYPED_TEST(TestStringKernels, Strptime) { this->CheckUnary("strptime", input4, unit, output4, &options); options.format = "%m/%d/%Y %%z"; + // emscripten bug https://github.com/emscripten-core/emscripten/issues/20466 this->CheckUnary("strptime", input5, unit, output1, &options); options.error_is_null = false; @@ -2019,6 +2025,11 @@ TYPED_TEST(TestStringKernels, Strptime) { } TYPED_TEST(TestStringKernels, StrptimeZoneOffset) { +#ifdef __EMSCRIPTEN__ + GTEST_SKIP() + << "Emscripten bug https://github.com/emscripten-core/emscripten/issues/20467"; +#endif + if (!arrow::internal::kStrptimeSupportsZone) { GTEST_SKIP() << "strptime does not support %z on this platform"; } diff --git a/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc b/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc index 8dac6525fe2e6..8da8c760ea22b 100644 --- a/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc @@ -2143,7 +2143,10 @@ TEST_F(ScalarTemporalTest, StrftimeCLocale) { TEST_F(ScalarTemporalTest, StrftimeOtherLocale) { #ifdef _WIN32 GTEST_SKIP() << "There is a known bug in strftime for locales on Windows (ARROW-15922)"; -#else +#elif defined(__EMSCRIPTEN__) + GTEST_SKIP() << "Emscripten doesn't build with multiple locales as default"; +#endif + if (!LocaleExists("fr_FR.UTF-8")) { 
GTEST_SKIP() << "locale 'fr_FR.UTF-8' doesn't exist on this system"; } @@ -2155,10 +2158,12 @@ TEST_F(ScalarTemporalTest, StrftimeOtherLocale) { ["01 janvier 1970 00:00:59,123", "18 août 2021 15:11:50,456", null])"; CheckScalarUnary("strftime", timestamp(TimeUnit::MILLI, "UTC"), milliseconds, utf8(), expected, &options); -#endif } TEST_F(ScalarTemporalTest, StrftimeInvalidLocale) { +#ifdef __EMSCRIPTEN__ + GTEST_SKIP() << "Emscripten doesn't build with multiple locales as default"; +#endif auto options = StrftimeOptions("%d %B %Y %H:%M:%S", "nonexistent"); const char* seconds = R"(["1970-01-01T00:00:59", null])"; auto arr = ArrayFromJSON(timestamp(TimeUnit::SECOND, "UTC"), seconds); diff --git a/cpp/src/arrow/filesystem/localfs_test.cc b/cpp/src/arrow/filesystem/localfs_test.cc index f90833a88d118..b76c7ebad45db 100644 --- a/cpp/src/arrow/filesystem/localfs_test.cc +++ b/cpp/src/arrow/filesystem/localfs_test.cc @@ -138,6 +138,9 @@ TEST(FileSystemFromUri, LinkedRegisteredFactory) { } TEST(FileSystemFromUri, LoadedRegisteredFactory) { +#ifdef __EMSCRIPTEN__ + GTEST_SKIP() << "Emscripten dynamic library testing disabled"; +#endif // Since the registrar's definition is in libarrow_filesystem_example.so, // its factory will be registered only after the library is dynamically loaded. std::string path; diff --git a/cpp/src/arrow/io/file.cc b/cpp/src/arrow/io/file.cc index 00426f9957b1f..cc3a5187059e9 100644 --- a/cpp/src/arrow/io/file.cc +++ b/cpp/src/arrow/io/file.cc @@ -398,8 +398,14 @@ class MemoryMappedFile::MemoryMap ~Region() { if (data_ != nullptr) { +#ifndef __EMSCRIPTEN__ int result = munmap(data(), static_cast(size_)); + // emscripten erroneously reports failures in munmap + // https://github.com/emscripten-core/emscripten/issues/20459 ARROW_CHECK_EQ(result, 0) << "munmap failed"; +#else + munmap(data(), static_cast(size_)); +#endif } } diff --git a/cpp/src/arrow/io/file_test.cc b/cpp/src/arrow/io/file_test.cc index e7e7ba949c9fd..af414891b950e 100644 --- a/cpp/src/arrow/io/file_test.cc +++ b/cpp/src/arrow/io/file_test.cc @@ -42,6 +42,7 @@ #include "arrow/status.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/util.h" +#include "arrow/util/config.h" #include "arrow/util/future.h" #include "arrow/util/io_util.h" @@ -486,6 +487,10 @@ TEST_F(TestReadableFile, CustomMemoryPool) { } TEST_F(TestReadableFile, ThreadSafety) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + std::string data = "foobar"; { std::ofstream stream; @@ -540,6 +545,9 @@ class TestPipeIO : public ::testing::Test { }; TEST_F(TestPipeIO, TestWrite) { +#ifdef __EMSCRIPTEN__ + GTEST_SKIP() << "Pipes not supported on Emscripten"; +#endif std::string data1 = "test", data2 = "data!"; std::shared_ptr file; uint8_t buffer[10]; @@ -570,6 +578,9 @@ TEST_F(TestPipeIO, TestWrite) { } TEST_F(TestPipeIO, ReadableFileFails) { +#ifdef __EMSCRIPTEN__ + GTEST_SKIP() << "Pipes not supported on Emscripten"; +#endif // ReadableFile fails on non-seekable fd ASSERT_RAISES(IOError, ReadableFile::Open(pipe_.rfd.fd())); } @@ -1048,6 +1059,10 @@ TEST_F(TestMemoryMappedFile, CastableToFileInterface) { } TEST_F(TestMemoryMappedFile, ThreadSafety) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + std::string data = "foobar"; std::string path = TempFile("ipc-multithreading-test"); CreateFile(path, static_cast(data.size())); diff --git a/cpp/src/arrow/ipc/read_write_test.cc b/cpp/src/arrow/ipc/read_write_test.cc index 
c5075299a3e35..ff7838cc39d72 100644 --- a/cpp/src/arrow/ipc/read_write_test.cc +++ b/cpp/src/arrow/ipc/read_write_test.cc @@ -1046,6 +1046,9 @@ class RecursionLimits : public ::testing::Test, public io::MemoryMapFixture { }; TEST_F(RecursionLimits, WriteLimit) { +#ifdef __EMSCRIPTEN__ + GTEST_SKIP() << "This crashes the Emscripten runtime."; +#endif int32_t metadata_length = -1; int64_t body_length = -1; std::shared_ptr schema; @@ -1078,6 +1081,10 @@ TEST_F(RecursionLimits, ReadLimit) { // Test fails with a structured exception on Windows + Debug #if !defined(_WIN32) || defined(NDEBUG) TEST_F(RecursionLimits, StressLimit) { +#ifdef __EMSCRIPTEN__ + GTEST_SKIP() << "This crashes the Emscripten runtime."; +#endif + auto CheckDepth = [this](int recursion_depth, bool* it_works) { int32_t metadata_length = -1; int64_t body_length = -1; diff --git a/cpp/src/arrow/util/async_generator_test.cc b/cpp/src/arrow/util/async_generator_test.cc index 2b74313db279b..afb03b67209a6 100644 --- a/cpp/src/arrow/util/async_generator_test.cc +++ b/cpp/src/arrow/util/async_generator_test.cc @@ -399,6 +399,10 @@ TEST(TestAsyncUtil, MapParallelStress) { } TEST(TestAsyncUtil, MapQueuingFailStress) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + constexpr int NTASKS = 10; constexpr int NITEMS = 10; for (bool slow : {true, false}) { @@ -1872,6 +1876,10 @@ TEST(PushGenerator, DanglingProducer) { } TEST(PushGenerator, Stress) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + const int NTHREADS = 20; const int NVALUES = 2000; const int NFUTURES = NVALUES + 100; diff --git a/cpp/src/arrow/util/atfork_test.cc b/cpp/src/arrow/util/atfork_test.cc index 004e28e19514a..750f4d138793b 100644 --- a/cpp/src/arrow/util/atfork_test.cc +++ b/cpp/src/arrow/util/atfork_test.cc @@ -35,6 +35,7 @@ #include "arrow/testing/gtest_util.h" #include "arrow/util/atfork_internal.h" +#include "arrow/util/config.h" #include "arrow/util/io_util.h" #include "arrow/util/logging.h" @@ -109,6 +110,10 @@ class TestAtFork : public ::testing::Test { #ifndef _WIN32 TEST_F(TestAtFork, EmptyHandlers) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + auto handlers = std::make_shared(); RegisterAtFork(handlers); @@ -130,6 +135,10 @@ TEST_F(TestAtFork, EmptyHandlers) { } TEST_F(TestAtFork, SingleThread) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + auto handlers1 = std::make_shared(PushBefore(1), PushParentAfter(11), PushChildAfter(21)); auto handlers2 = std::make_shared(PushBefore(2), PushParentAfter(12), @@ -188,6 +197,10 @@ TEST_F(TestAtFork, SingleThread) { // https://github.com/google/sanitizers/issues/950. 
TEST_F(TestAtFork, MultipleThreads) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + const int kNumThreads = 5; const int kNumIterations = 40; const int kParentAfterAddend = 10000; @@ -245,6 +258,9 @@ TEST_F(TestAtFork, NestedChild) { #ifdef __APPLE__ GTEST_SKIP() << "Nested fork is not supported on macOS"; #endif +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif auto handlers1 = std::make_shared(PushBefore(1), PushParentAfter(11), PushChildAfter(21)); @@ -286,6 +302,10 @@ TEST_F(TestAtFork, NestedChild) { #ifdef _WIN32 TEST_F(TestAtFork, NoOp) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + auto handlers = std::make_shared(PushBefore(1), PushParentAfter(11), PushChildAfter(21)); diff --git a/cpp/src/arrow/util/cache_test.cc b/cpp/src/arrow/util/cache_test.cc index 6b71baa369b9b..264bfe68ec5d2 100644 --- a/cpp/src/arrow/util/cache_test.cc +++ b/cpp/src/arrow/util/cache_test.cc @@ -26,6 +26,7 @@ #include "arrow/testing/gtest_util.h" #include "arrow/util/cache_internal.h" +#include "arrow/util/config.h" namespace arrow { namespace internal { @@ -255,6 +256,10 @@ TYPED_TEST(TestMemoizeLru, Basics) { this->TestBasics(); } class TestMemoizeLruThreadSafe : public TestMemoizeLru {}; TEST_F(TestMemoizeLruThreadSafe, Threads) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + using V = IntValue; Callable c; diff --git a/cpp/src/arrow/util/cancel_test.cc b/cpp/src/arrow/util/cancel_test.cc index 45f6cde4f5579..713418f15a0cc 100644 --- a/cpp/src/arrow/util/cancel_test.cc +++ b/cpp/src/arrow/util/cancel_test.cc @@ -232,6 +232,10 @@ class SignalCancelTest : public CancelTest { }; TEST_F(SignalCancelTest, Register) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + RegisterHandler(); TriggerSignal(); @@ -239,6 +243,10 @@ TEST_F(SignalCancelTest, Register) { } TEST_F(SignalCancelTest, RegisterUnregister) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + // The signal stop source was set up but no handler was registered, // so the token shouldn't be signalled. 
TriggerSignal(); @@ -261,6 +269,10 @@ TEST_F(SignalCancelTest, RegisterUnregister) { #if !(defined(_WIN32) || defined(ARROW_VALGRIND) || defined(ADDRESS_SANITIZER) || \ defined(THREAD_SANITIZER)) TEST_F(SignalCancelTest, ForkSafetyUnregisteredHandlers) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + RunInChild([&]() { // Child TriggerSignal(); @@ -284,6 +296,10 @@ TEST_F(SignalCancelTest, ForkSafetyUnregisteredHandlers) { } TEST_F(SignalCancelTest, ForkSafetyRegisteredHandlers) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + RegisterHandler(); RunInChild([&]() { @@ -307,6 +323,10 @@ TEST_F(SignalCancelTest, ForkSafetyRegisteredHandlers) { #endif TEST_F(CancelTest, ThreadedPollSuccess) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + constexpr int kNumThreads = 10; std::vector results(kNumThreads); @@ -339,6 +359,10 @@ TEST_F(CancelTest, ThreadedPollSuccess) { } TEST_F(CancelTest, ThreadedPollCancel) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + constexpr int kNumThreads = 10; std::vector results(kNumThreads); diff --git a/cpp/src/arrow/util/counting_semaphore_test.cc b/cpp/src/arrow/util/counting_semaphore_test.cc index a5fa9f6bde891..4de11ce852a03 100644 --- a/cpp/src/arrow/util/counting_semaphore_test.cc +++ b/cpp/src/arrow/util/counting_semaphore_test.cc @@ -22,12 +22,17 @@ #include #include "arrow/testing/gtest_util.h" +#include "arrow/util/config.h" #include "gtest/gtest.h" namespace arrow { namespace util { TEST(CountingSemaphore, Basic) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + CountingSemaphore semaphore; std::atomic acquired{false}; std::atomic started{false}; @@ -50,6 +55,10 @@ TEST(CountingSemaphore, Basic) { } TEST(CountingSemaphore, CloseAborts) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + CountingSemaphore semaphore; std::atomic cleanup{false}; std::thread acquirer([&] { @@ -64,6 +73,10 @@ TEST(CountingSemaphore, CloseAborts) { } TEST(CountingSemaphore, Stress) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + constexpr uint32_t NTHREADS = 10; CountingSemaphore semaphore; std::vector max_allowed_cases = {1, 3}; diff --git a/cpp/src/arrow/util/future_test.cc b/cpp/src/arrow/util/future_test.cc index 87891e48efa5e..2ed2b69aed524 100644 --- a/cpp/src/arrow/util/future_test.cc +++ b/cpp/src/arrow/util/future_test.cc @@ -415,6 +415,10 @@ TEST(FutureRefTest, HeadRemoved) { } TEST(FutureStressTest, Callback) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + #ifdef ARROW_VALGRIND const int NITERS = 2; #else @@ -471,6 +475,10 @@ TEST(FutureStressTest, Callback) { } TEST(FutureStressTest, TryAddCallback) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + for (unsigned int n = 0; n < 1; n++) { auto fut = Future<>::Make(); std::atomic callbacks_added(0); @@ -527,6 +535,10 @@ TEST(FutureStressTest, TryAddCallback) { } TEST(FutureStressTest, DeleteAfterWait) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + constexpr int kNumTasks = 100; for (int i = 0; i < kNumTasks; i++) { { @@ -1543,6 +1555,10 @@ TEST(FnOnceTest, MoveOnlyDataType) { } TEST(FutureTest, MatcherExamples) { +#ifndef ARROW_ENABLE_THREADING + 
GTEST_SKIP() << "Test requires threading support"; +#endif + EXPECT_THAT(Future::MakeFinished(Status::Invalid("arbitrary error")), Finishes(Raises(StatusCode::Invalid))); diff --git a/cpp/src/arrow/util/io_util.cc b/cpp/src/arrow/util/io_util.cc index 5928ebcb88959..d48f9eb97d562 100644 --- a/cpp/src/arrow/util/io_util.cc +++ b/cpp/src/arrow/util/io_util.cc @@ -95,6 +95,7 @@ #include "arrow/result.h" #include "arrow/util/atfork_internal.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/config.h" #include "arrow/util/io_util.h" #include "arrow/util/logging.h" #include "arrow/util/mutex.h" @@ -1485,6 +1486,7 @@ Status MemoryMapRemap(void* addr, size_t old_size, size_t new_size, int fildes, } Status MemoryAdviseWillNeed(const std::vector& regions) { +#ifndef __EMSCRIPTEN__ const auto page_size = static_cast(GetPageSize()); DCHECK_GT(page_size, 0); const size_t page_mask = ~(page_size - 1); @@ -1543,6 +1545,9 @@ Status MemoryAdviseWillNeed(const std::vector& regions) { #else return Status::OK(); #endif +#else + return Status::OK(); +#endif } // @@ -2067,7 +2072,9 @@ Status SendSignal(int signum) { } Status SendSignalToThread(int signum, uint64_t thread_id) { -#ifdef _WIN32 +#ifndef ARROW_ENABLE_THREADING + return Status::NotImplemented("Can't send signal with no threads"); +#elif defined(_WIN32) return Status::NotImplemented("Cannot send signal to specific thread on Windows"); #else // Have to use a C-style cast because pthread_t can be a pointer *or* integer type diff --git a/cpp/src/arrow/util/io_util_test.cc b/cpp/src/arrow/util/io_util_test.cc index d0569c799561f..73213bf9ce48a 100644 --- a/cpp/src/arrow/util/io_util_test.cc +++ b/cpp/src/arrow/util/io_util_test.cc @@ -40,6 +40,7 @@ #include "arrow/buffer.h" #include "arrow/testing/gtest_util.h" #include "arrow/util/bit_util.h" +#include "arrow/util/config.h" #include "arrow/util/cpu_info.h" #include "arrow/util/io_util.h" #include "arrow/util/logging.h" @@ -146,8 +147,8 @@ TEST(MemoryAdviseWillNeed, Basics) { ASSERT_OK(MemoryAdviseWillNeed({{addr1, 0}, {addr2 + 1, 0}})); // Should probably fail - // (but on Windows, MemoryAdviseWillNeed can be a no-op) -#ifndef _WIN32 + // (but on Windows or Emscripten, MemoryAdviseWillNeed can be a no-op) +#if !defined(_WIN32) && !defined(__EMSCRIPTEN__) ASSERT_RAISES(IOError, MemoryAdviseWillNeed({{nullptr, std::numeric_limits::max()}})); #endif @@ -368,6 +369,10 @@ TestSelfPipe* TestSelfPipe::instance_; TEST_F(TestSelfPipe, MakeAndShutdown) {} TEST_F(TestSelfPipe, WaitAndSend) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + StartReading(); SleepABit(); AssertPayloadsEventually({}); @@ -380,6 +385,10 @@ TEST_F(TestSelfPipe, WaitAndSend) { } TEST_F(TestSelfPipe, SendAndWait) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + self_pipe_->Send(123456789123456789ULL); StartReading(); SleepABit(); @@ -390,6 +399,10 @@ TEST_F(TestSelfPipe, SendAndWait) { } TEST_F(TestSelfPipe, WaitAndShutdown) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + StartReading(); SleepABit(); ASSERT_OK(self_pipe_->Shutdown()); @@ -401,6 +414,9 @@ TEST_F(TestSelfPipe, WaitAndShutdown) { } TEST_F(TestSelfPipe, ShutdownAndWait) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif self_pipe_->Send(123456789123456789ULL); ASSERT_OK(self_pipe_->Shutdown()); StartReading(); @@ -413,6 +429,10 @@ TEST_F(TestSelfPipe, ShutdownAndWait) { } 
TEST_F(TestSelfPipe, WaitAndSendFromSignal) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + signal_received_.store(0); SignalHandlerGuard guard(SIGINT, &HandleSignal); @@ -431,6 +451,10 @@ TEST_F(TestSelfPipe, WaitAndSendFromSignal) { } TEST_F(TestSelfPipe, SendFromSignalAndWait) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + signal_received_.store(0); SignalHandlerGuard guard(SIGINT, &HandleSignal); @@ -450,6 +474,10 @@ TEST_F(TestSelfPipe, SendFromSignalAndWait) { #if !(defined(_WIN32) || defined(ARROW_VALGRIND) || defined(ADDRESS_SANITIZER) || \ defined(THREAD_SANITIZER)) TEST_F(TestSelfPipe, ForkSafety) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + self_pipe_->Send(123456789123456789ULL); auto child_pid = fork(); @@ -1025,6 +1053,9 @@ TEST_F(TestSendSignal, Generic) { } TEST_F(TestSendSignal, ToThread) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "SendSignalToThread requires threading"; +#endif #ifdef _WIN32 uint64_t dummy_thread_id = 42; ASSERT_RAISES(NotImplemented, SendSignalToThread(SIGINT, dummy_thread_id)); diff --git a/cpp/src/arrow/util/mutex.cc b/cpp/src/arrow/util/mutex.cc index 9f82ad45b0740..bbf2a9a93e692 100644 --- a/cpp/src/arrow/util/mutex.cc +++ b/cpp/src/arrow/util/mutex.cc @@ -24,6 +24,7 @@ #include #endif +#include "arrow/util/config.h" #include "arrow/util/logging.h" namespace arrow { @@ -35,9 +36,12 @@ struct Mutex::Impl { Mutex::Guard::Guard(Mutex* locked) : locked_(locked, [](Mutex* locked) { +#ifdef ARROW_ENABLE_THREADING DCHECK(!locked->impl_->mutex_.try_lock()); +#endif locked->impl_->mutex_.unlock(); - }) {} + }) { +} Mutex::Guard Mutex::TryLock() { DCHECK_NE(impl_, nullptr); diff --git a/cpp/src/arrow/util/rle_encoding_test.cc b/cpp/src/arrow/util/rle_encoding_test.cc index 01d1ffd767fc9..26984e5f7735d 100644 --- a/cpp/src/arrow/util/rle_encoding_test.cc +++ b/cpp/src/arrow/util/rle_encoding_test.cc @@ -214,7 +214,14 @@ TEST(BitUtil, RoundTripIntValues) { void ValidateRle(const std::vector& values, int bit_width, uint8_t* expected_encoding, int expected_len) { const int len = 64 * 1024; +#ifdef __EMSCRIPTEN__ + // don't make this on the stack as it is + // too big for emscripten + std::vector buffer_vec(static_cast(len)); + uint8_t* buffer = buffer_vec.data(); +#else uint8_t buffer[len]; +#endif EXPECT_LE(expected_len, len); RleEncoder encoder(buffer, len, bit_width); @@ -227,7 +234,7 @@ void ValidateRle(const std::vector& values, int bit_width, if (expected_len != -1) { EXPECT_EQ(encoded_len, expected_len); } - if (expected_encoding != NULL) { + if (expected_encoding != NULL && encoded_len == expected_len) { EXPECT_EQ(memcmp(buffer, expected_encoding, encoded_len), 0); } @@ -256,7 +263,14 @@ void ValidateRle(const std::vector& values, int bit_width, // the returned values are not all the same bool CheckRoundTrip(const std::vector& values, int bit_width) { const int len = 64 * 1024; +#ifdef __EMSCRIPTEN__ + // don't make this on the stack as it is + // too big for emscripten + std::vector buffer_vec(static_cast(len)); + uint8_t* buffer = buffer_vec.data(); +#else uint8_t buffer[len]; +#endif RleEncoder encoder(buffer, len, bit_width); for (size_t i = 0; i < values.size(); ++i) { bool result = encoder.Put(values[i]); diff --git a/cpp/src/arrow/util/value_parsing_test.cc b/cpp/src/arrow/util/value_parsing_test.cc index 30c5e6aae74ba..92d727019aaf5 100644 --- a/cpp/src/arrow/util/value_parsing_test.cc +++ 
b/cpp/src/arrow/util/value_parsing_test.cc @@ -794,6 +794,11 @@ TEST(TimestampParser, StrptimeZoneOffset) { if (!kStrptimeSupportsZone) { GTEST_SKIP() << "strptime does not support %z on this platform"; } +#ifdef __EMSCRIPTEN__ + GTEST_SKIP() << "Test temporarily disabled due to emscripten bug " + "https://github.com/emscripten-core/emscripten/issues/20467 "; +#endif + std::string format = "%Y-%d-%m %H:%M:%S%z"; auto parser = TimestampParser::MakeStrptime(format); diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index cf46cb8c6ad70..f6ba03552dcce 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -1164,6 +1164,14 @@ tasks: flags: "-e ARROW_CSV=ON -e ARROW_PARQUET=ON" image: ubuntu-cpp-minimal + test-ubuntu-22.04-cpp-emscripten: + ci: github + template: docker-tests/github.linux.yml + params: + env: + UBUNTU: 22.04 + image: ubuntu-cpp-emscripten + {% for python_version in ["3.8", "3.9", "3.10", "3.11", "3.12"] %} test-conda-python-{{ python_version }}: ci: github diff --git a/docker-compose.yml b/docker-compose.yml index 9b0610fe553b5..46717557bc337 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -157,6 +157,7 @@ x-hierarchy: - ubuntu-csharp - ubuntu-cpp-sanitizer - ubuntu-cpp-thread-sanitizer + - ubuntu-cpp-emscripten - ubuntu-r-sanitizer - ubuntu-r-valgrind - ubuntu-swift @@ -652,6 +653,31 @@ services: ARROW_USE_TSAN: "ON" command: *cpp-command + ubuntu-cpp-emscripten: + # Usage: + # docker-compose build ubuntu-cpp-emscripten + # docker-compose run --rm ubuntu-cpp-emscripten + # Parameters: + # ARCH: amd64, arm64v8, ... + # UBUNTU: 22.04 + image: ${REPO}:${ARCH}-ubuntu-${UBUNTU}-cpp + build: + context: . + dockerfile: ci/docker/ubuntu-${UBUNTU}-cpp.dockerfile + cache_from: + - ${REPO}:${ARCH}-ubuntu-${UBUNTU}-cpp + args: + arch: ${ARCH} + clang_tools: ${CLANG_TOOLS} + llvm: ${LLVM} + shm_size: *shm-size + volumes: *ubuntu-volumes + environment: + <<: [*common, *ccache, *sccache, *cpp] + ARROW_EMSCRIPTEN: "ON" + UBUNTU: + command: *cpp-command + fedora-cpp: # Usage: # docker-compose build fedora-cpp diff --git a/docs/source/developers/cpp/emscripten.rst b/docs/source/developers/cpp/emscripten.rst new file mode 100644 index 0000000000000..b4c563aae1a3b --- /dev/null +++ b/docs/source/developers/cpp/emscripten.rst @@ -0,0 +1,99 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + + +.. highlight:: console + +.. _developers-cpp-emscripten: +=============================================== +Cross compiling for WebAssembly with Emscripten +=============================================== + +Prerequisites +------------- +You need CMake and compilers etc. installed as per the normal build instructions. 
Before building with Emscripten, you also need to install Emscripten and +activate it using the commands below (see https://emscripten.org/docs/getting_started/downloads.html for details). + +.. code:: shell + + git clone https://github.com/emscripten-core/emsdk.git + cd emsdk + # replace with the desired EMSDK version. + # e.g. for Pyodide 0.24, you need EMSDK version 3.1.45 + ./emsdk install + ./emsdk activate + source ./emsdk_env.sh + +If you want to build PyArrow for `Pyodide `_, you +need ``pyodide-build`` installed via ``pip``, and to be running with the +same version of Python that Pyodide is built for, along with the same +versions of emsdk tools. + +.. code:: shell + + # install Pyodide build tools. + # e.g. for version 0.24 of Pyodide: + pip install pyodide-build==0.24 + +Then build with the ``ninja-release-emscripten`` CMake preset, +like below: + +.. code:: shell + + emcmake cmake --preset "ninja-release-emscripten" + ninja install + +This will install a built static library version of ``libarrow`` it into the +Emscripten sysroot cache, meaning you can build things that depend on it +and they will find ``libarrow``. + +e.g. if you want to build for Pyodide, run the commands above, and then +go to ``arrow/python`` and run + +.. code:: shell + + pyodide build + +It should make a wheel targeting the currently enabled version of +Pyodide (i.e. the version corresponding to the currently installed +``pyodide-build``) in the ``dist`` subdirectory. + + +Manual Build +------------ + +If you want to manually build for Emscripten, take a look at the +``CMakePresets.json`` file in the ``arrow/cpp`` directory for a list of things +you will need to override. In particular you will need: + +#. Build dependencies set to ``BUNDLED``, so it uses properly cross + compiled build dependencies. + +#. ``CMAKE_TOOLCHAIN_FILE`` set by using ``emcmake cmake`` instead of just ``cmake``. + +#. You will quite likely need to set ``ARROW_ENABLE_THREADING`` to ``OFF`` + for builds targeting single threaded Emscripten environments such as + Pyodide. + +#. ``ARROW_FLIGHT`` and anything else that uses network probably won't + work. + +#. ``ARROW_JEMALLOC`` and ``ARROW_MIMALLOC`` again probably need to be + ``OFF`` + +#. ``ARROW_BUILD_STATIC`` set to ``ON`` and ``ARROW_BUILD_SHARED`` set to + ``OFF`` is most likely to work. diff --git a/docs/source/developers/cpp/index.rst b/docs/source/developers/cpp/index.rst index 36c9778bea1b0..603e1607dc543 100644 --- a/docs/source/developers/cpp/index.rst +++ b/docs/source/developers/cpp/index.rst @@ -27,5 +27,6 @@ C++ Development building development windows + emscripten conventions fuzzing From 139afe592a16e6fd168abc9f28b6c140071b4d4e Mon Sep 17 00:00:00 2001 From: Dane Pitkin <48041712+danepitkin@users.noreply.github.com> Date: Thu, 4 Apr 2024 19:51:29 -0700 Subject: [PATCH 64/81] GH-40775: [Benchmarking][Java] Fix conbench timeout (#40786) ### Rationale for this change The java build script has been recently updated and it is affecting conbench, which is now seeing timeouts when building java. The logs are producing 100s of GB of data due to an unnecessary debug log msg. ### What changes are included in this PR? * Delete log message on write to memory ### Are these changes tested? Yes, via conbench ### Are there any user-facing changes? 
No * GitHub Issue: #40775 Authored-by: Dane Pitkin Signed-off-by: Sutou Kouhei --- .../org/apache/arrow/memory/netty/TestNettyAllocator.java | 7 +++++-- .../java/org/apache/arrow/vector/ipc/WriteChannel.java | 3 --- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/java/memory/memory-netty/src/test/java/org/apache/arrow/memory/netty/TestNettyAllocator.java b/java/memory/memory-netty/src/test/java/org/apache/arrow/memory/netty/TestNettyAllocator.java index 07fdc3f784e43..a6da36bb35aa7 100644 --- a/java/memory/memory-netty/src/test/java/org/apache/arrow/memory/netty/TestNettyAllocator.java +++ b/java/memory/memory-netty/src/test/java/org/apache/arrow/memory/netty/TestNettyAllocator.java @@ -69,9 +69,12 @@ public void testMemoryUsage() { break; } } - assertTrue("Log messages are:\n" + + synchronized (memoryLogsAppender.list) { + assertTrue("Log messages are:\n" + memoryLogsAppender.list.stream().map(ILoggingEvent::toString).collect(Collectors.joining("\n")), - result); + result); + } + } finally { memoryLogsAppender.stop(); logger.detachAppender(memoryLogsAppender); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ipc/WriteChannel.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/WriteChannel.java index 9ad71f6fe8847..73bc2ecb12714 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ipc/WriteChannel.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/WriteChannel.java @@ -105,9 +105,6 @@ public long align() throws IOException { */ public long write(ByteBuffer buffer) throws IOException { long length = buffer.remaining(); - if (LOGGER.isDebugEnabled()) { - LOGGER.debug("Writing buffer with size: {}", length); - } while (buffer.hasRemaining()) { out.write(buffer); } From 074d45f1c133e82da09ff7e6da706a3ad73c293a Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Fri, 5 Apr 2024 10:54:14 +0200 Subject: [PATCH 65/81] GH-39440: [Python] Calling pyarrow.dataset.ParquetFileFormat.make_write_options as a class method results in a segfault (#40976) ### Rationale for this change Calling `make_write_options()` method as class instead of instance method results in segfault. ### What changes are included in this PR? Adds a type check on `self` and raises an error if not `ParquetFileFormat`. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. 
* GitHub Issue: #39440 Lead-authored-by: AlenkaF Co-authored-by: Alenka Frim Co-authored-by: Antoine Pitrou Signed-off-by: AlenkaF --- python/pyarrow/_dataset_parquet.pyx | 4 ++++ python/pyarrow/tests/test_dataset.py | 13 +++++++++++++ 2 files changed, 17 insertions(+) diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx index 58ef6145cf7d1..a55e889ba8246 100644 --- a/python/pyarrow/_dataset_parquet.pyx +++ b/python/pyarrow/_dataset_parquet.pyx @@ -198,6 +198,10 @@ cdef class ParquetFileFormat(FileFormat): ------- pyarrow.dataset.FileWriteOptions """ + # Safeguard from calling make_write_options as a static class method + if not isinstance(self, ParquetFileFormat): + raise TypeError("make_write_options() should be called on " + "an instance of ParquetFileFormat") opts = FileFormat.make_write_options(self) ( opts).update(**kwargs) return opts diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index 3d77214c174c5..6bba7240c05df 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -5630,3 +5630,16 @@ def test_checksum_write_dataset_read_dataset_to_table(tempdir): corrupted_dir_path, format=pq_read_format_crc ).to_table() + + +def test_make_write_options_error(): + # GH-39440 + msg = ("make_write_options\\(\\) should be called on an " + "instance of ParquetFileFormat") + with pytest.raises(TypeError, match=msg): + pa.dataset.ParquetFileFormat.make_write_options(43) + + pformat = pa.dataset.ParquetFileFormat() + msg = "make_write_options\\(\\) takes exactly 0 positional arguments" + with pytest.raises(TypeError, match=msg): + pformat.make_write_options(43) From 110efed4d0d8cf16dc201a422809efdca6de9518 Mon Sep 17 00:00:00 2001 From: mwish Date: Fri, 5 Apr 2024 20:01:52 +0800 Subject: [PATCH 66/81] GH-41024: [C++] IO: fixing compiling in gcc 7.5.0 (#41025) ### Rationale for this change Previous pr ( https://github.com/apache/arrow/pull/39807 ) remove std::move when returning value, however, it's not allowed in some old compilers ### What changes are included in this PR? add std::move for return, and add reason for that ### Are these changes tested? Should test by other ci ### Are there any user-facing changes? no * GitHub Issue: #41024 Authored-by: mwish Signed-off-by: Sutou Kouhei --- cpp/src/arrow/io/compressed.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/io/compressed.cc b/cpp/src/arrow/io/compressed.cc index d06101748dc0c..5faa4d095eb1e 100644 --- a/cpp/src/arrow/io/compressed.cc +++ b/cpp/src/arrow/io/compressed.cc @@ -405,7 +405,9 @@ class CompressedInputStream::Impl { ARROW_ASSIGN_OR_RAISE(auto buf, AllocateResizableBuffer(nbytes, pool_)); ARROW_ASSIGN_OR_RAISE(int64_t bytes_read, Read(nbytes, buf->mutable_data())); RETURN_NOT_OK(buf->Resize(bytes_read)); - return buf; + // Using std::move because the some compiler might has issue below: + // https://wg21.cmeerw.net/cwg/issue1579 + return std::move(buf); } const std::shared_ptr& raw() const { return raw_; } From 925ca66474692c46c6dcdda29a732b5e7d628b64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Fri, 5 Apr 2024 14:26:20 +0200 Subject: [PATCH 67/81] GH-40549: [Java] Revert bump org.apache.maven.plugins:maven-shade-plugin from 3.2.4 to 3.5.2 in /java (#40462)" (#41006) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 8ee9679d401183220a4566681ca7ef9e887ba4d2. 
### Rationale for this change Spark integration tests are failing due to this bump. ### What changes are included in this PR? Revert bump org.apache.maven.plugins:maven-shade-plugin from 3.2.4 to 3.5.2 ### Are these changes tested? Via CI. ### Are there any user-facing changes? They shouldn't * GitHub Issue: #40549 Authored-by: Raúl Cumplido Signed-off-by: Raúl Cumplido --- java/flight/flight-core/pom.xml | 2 +- java/pom.xml | 2 +- java/vector/pom.xml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/java/flight/flight-core/pom.xml b/java/flight/flight-core/pom.xml index 830caf8a28246..9ea6393f0f6df 100644 --- a/java/flight/flight-core/pom.xml +++ b/java/flight/flight-core/pom.xml @@ -164,7 +164,7 @@ issues in the arrow-tools tests looking up FlatBuffer dependencies. --> - 3.5.2 + 3.2.4 shade-main diff --git a/java/pom.xml b/java/pom.xml index 8e9ddd5480ea8..b6eb774f580d4 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -449,7 +449,7 @@ org.apache.maven.plugins maven-shade-plugin - 3.5.2 + 3.5.1 maven-surefire-plugin diff --git a/java/vector/pom.xml b/java/vector/pom.xml index 20af3dbd38443..436ffd15b297d 100644 --- a/java/vector/pom.xml +++ b/java/vector/pom.xml @@ -179,7 +179,7 @@ issues in the arrow-tools tests looking up FlatBuffer dependencies. --> - 3.5.2 + 3.2.4 package From 7da285e39a89269f8aa175369d8da4e577805319 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Fri, 5 Apr 2024 15:16:33 +0200 Subject: [PATCH 68/81] GH-41005: [CI] HDFS and skyhook tests require docker compose usage because they require multiple containers (#41027) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change We are currently using docker run but we require linked containers to be also started. ### What changes are included in this PR? Use docker compose instead of docker ### Are these changes tested? Via archery ### Are there any user-facing changes? No * GitHub Issue: #41005 Authored-by: Raúl Cumplido Signed-off-by: Sutou Kouhei --- dev/tasks/tasks.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index f6ba03552dcce..fc8d03b3d5bfb 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -1090,6 +1090,7 @@ tasks: template: docker-tests/github.linux.yml params: env: + ARCHERY_USE_DOCKER_CLI: 0 UBUNTU: 20.04 flags: -e ARROW_SKYHOOK=ON image: ubuntu-cpp @@ -1534,8 +1535,9 @@ tasks: template: docker-tests/github.linux.yml params: env: - PYTHON: "3.10" + ARCHERY_USE_DOCKER_CLI: 0 HDFS: "{{ hdfs_version }}" + PYTHON: "3.10" image: conda-python-hdfs {% endfor %} From 17e51ccaace120cbcef3bfc8bf28553a1e7985a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Fri, 5 Apr 2024 16:04:56 +0200 Subject: [PATCH 69/81] GH-41007: [CI][Archery] Correctly interpolate environment variables from docker compose when using docker cli on archery docker (#41026) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change Currently our verification jobs are failing due to environment variables not being correctly interpolated. docker compose expects a double `$$` sign on the command as explained here: https://docs.docker.com/compose/compose-file/12-interpolation/ When we use `ARCHERY_USE_DOCKER_CLI=1` we are using `docker run` instead of `docker compose run`. The behaviour of the command changes and we have to update change the environment variable. 
### What changes are included in this PR? Use correct docker compose binary when using `docker cli` and remove double $$. ### Are these changes tested? Via archery ### Are there any user-facing changes? No * GitHub Issue: #41007 Authored-by: Raúl Cumplido Signed-off-by: Sutou Kouhei --- dev/archery/archery/docker/cli.py | 4 +++- dev/archery/archery/docker/core.py | 4 ++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/dev/archery/archery/docker/cli.py b/dev/archery/archery/docker/cli.py index e6baf0ca1f002..7053db2afccff 100644 --- a/dev/archery/archery/docker/cli.py +++ b/dev/archery/archery/docker/cli.py @@ -77,7 +77,9 @@ def docker(ctx, src, dry_run, using_docker_cli, using_docker_buildx): compose = DockerCompose(config_path, params=os.environ, using_docker=using_docker_cli, using_buildx=using_docker_buildx, - debug=ctx.obj.get('debug', False)) + debug=ctx.obj.get('debug', False), + compose_bin=("docker compose" if using_docker_cli + else "docker-compose")) if dry_run: _mock_compose_calls(compose) ctx.obj['compose'] = compose diff --git a/dev/archery/archery/docker/core.py b/dev/archery/archery/docker/core.py index 38720e5856a14..0b49111dd6944 100644 --- a/dev/archery/archery/docker/core.py +++ b/dev/archery/archery/docker/core.py @@ -402,6 +402,10 @@ def run(self, service_name, command=None, *, env=None, volumes=None, # on the docker-compose yaml file. if isinstance(cmd, list): cmd = shlex.join(cmd) + # Match behaviour from docker compose + # to interpolate environment variables + # https://docs.docker.com/compose/compose-file/12-interpolation/ + cmd = cmd.replace("$$", "$") args.extend(shlex.split(cmd)) # execute as a plain docker cli command From 83359d6958273be534f376f40976196c14675c8b Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 5 Apr 2024 23:35:59 +0900 Subject: [PATCH 70/81] GH-40962: [GLib] Add missing "no" to suppress warnings (#41030) ### Rationale for this change There are some `unused-but-set-variable` warnings for Vala examples on macOS: ```text FAILED: example/vala/write-file.p/meson-generated_write-file.c.o ccache cc -Iexample/vala/write-file.p -Iexample/vala -I../../c_glib/example/vala -I/Users/runner/work/arrow/arrow/build/c_glib -I/Users/runner/work/arrow/arrow/c_glib -Iarrow-glib -I../../c_glib/arrow-glib -I/usr/local/Cellar/glib/2.80.0_2/include -I/usr/local/Cellar/glib/2.80.0_2/include/glib-2.0 -I/usr/local/Cellar/glib/2.80.0_2/lib/glib-2.0/include -I/usr/local/opt/gettext/include -I/usr/local/Cellar/pcre2/10.43/include -I/Library/Developer/CommandLineTools/SDKs/MacOSX12.sdk/usr/include/ffi -fdiagnostics-color=always -Wall -Winvalid-pch -Werror -std=c99 -O0 -g -DARROW_NO_DEPRECATED_API -MD -MQ example/vala/write-file.p/meson-generated_write-file.c.o -MF example/vala/write-file.p/meson-generated_write-file.c.o.d -o example/vala/write-file.p/meson-generated_write-file.c.o -c example/vala/write-file.p/write-file.c write-file.c:373:8: error: variable '_tmp45__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp45__length1; ^ write-file.c:504:8: error: variable '_tmp57__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp57__length1; ^ write-file.c:635:8: error: variable '_tmp69__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp69__length1; ^ write-file.c:766:8: error: variable '_tmp81__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp81__length1; ^ write-file.c:897:8: error: variable '_tmp93__length1' set but not used [-Werror,-Wunused-but-set-variable] gint 
_tmp93__length1; ^ write-file.c:1028:8: error: variable '_tmp105__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp105__length1; ^ write-file.c:1159:8: error: variable '_tmp117__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp117__length1; ^ write-file.c:1290:8: error: variable '_tmp129__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp129__length1; ^ write-file.c:1421:8: error: variable '_tmp141__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp141__length1; ^ write-file.c:1552:8: error: variable '_tmp153__length1' set but not used [-Werror,-Wunused-but-set-variable] gint _tmp153__length1; ^ 10 errors generated. ``` ### What changes are included in this PR? Add missing `no` to option that suppress this warning: -Wunused-but-set-variable -> -Wno-unused-but-set-variable ### Are these changes tested? Yes. ### Are there any user-facing changes? No. * GitHub Issue: #40962 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- c_glib/example/vala/meson.build | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/c_glib/example/vala/meson.build b/c_glib/example/vala/meson.build index ff65a7328f171..b7eb86200ddd6 100644 --- a/c_glib/example/vala/meson.build +++ b/c_glib/example/vala/meson.build @@ -19,7 +19,7 @@ if generate_vapi c_flags = [ - '-Wunused-but-set-variable', + '-Wno-unused-but-set-variable', ] c_flags = meson.get_compiler('c').get_supported_arguments(c_flags) vala_example_executable_kwargs = { From bf3c1b99c3398e3ed390e4eb7f2895f64e9abf80 Mon Sep 17 00:00:00 2001 From: Dane Pitkin <48041712+danepitkin@users.noreply.github.com> Date: Fri, 5 Apr 2024 12:09:42 -0700 Subject: [PATCH 71/81] GH-41039: [Python] ListView pandas tests should use np.nan instead of None (#41040) ### Rationale for this change ListView pandas conversion tests are failing in upstream development branches of pandas/numpy. ### What changes are included in this PR? * Use np.nan instead of None when comparing with an expected Pandas Series. ### Are these changes tested? Yes, unit tests updated. ### Are there any user-facing changes? No. 
* GitHub Issue: #41039 Authored-by: Dane Pitkin Signed-off-by: AlenkaF --- python/pyarrow/tests/test_pandas.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 90b9bd8b8c453..3678b4e57a9a8 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -2571,7 +2571,7 @@ def test_list_view_to_pandas_with_null_values(self, klass): ) actual = arr.to_pandas() - expected = pd.Series([[1, None], [], None]) + expected = pd.Series([[1, np.nan], [], None]) tm.assert_series_equal(actual, expected) @@ -2593,7 +2593,7 @@ def test_list_view_to_pandas_multiple_chunks(self, klass): arr = pa.chunked_array([arr1, arr2]) actual = arr.to_pandas() - expected = pd.Series([[3, 4], [2, 3], [1, 2], [5, 6, 7], [6, 7, None], None]) + expected = pd.Series([[3, 4], [2, 3], [1, 2], [5, 6, 7], [6, 7, np.nan], None]) tm.assert_series_equal(actual, expected) From 02bc6537f5989bc37896429394e8c6b0cb843889 Mon Sep 17 00:00:00 2001 From: Austin Dickey Date: Fri, 5 Apr 2024 16:19:16 -0500 Subject: [PATCH 72/81] GH-41015: [JS] [Benchmarking] allow JS benchmarks to run more portably (#41031) --- js/package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/js/package.json b/js/package.json index bb70fd0a395b0..773cf23e35b36 100644 --- a/js/package.json +++ b/js/package.json @@ -11,7 +11,7 @@ "build": "cross-env NODE_NO_WARNINGS=1 gulp build", "clean": "cross-env NODE_NO_WARNINGS=1 gulp clean", "debug": "cross-env NODE_NO_WARNINGS=1 gulp debug", - "perf": "perf/index.ts", + "perf": "node --no-warnings --loader ts-node/esm/transpile-only perf/index.ts", "test:integration": "bin/integration.ts --mode validate", "release": "./npm-release.sh", "clean:all": "yarn clean && yarn clean:testdata", From be3baf2697fa449d527e475749d0619f4f77bead Mon Sep 17 00:00:00 2001 From: Dane Pitkin <48041712+danepitkin@users.noreply.github.com> Date: Fri, 5 Apr 2024 15:05:11 -0700 Subject: [PATCH 73/81] GH-40680: [Java] Test JDK 22 in CI (#41038) ### Rationale for this change JDK 22 is released. Let's ensure Arrow java CI tests this version successfully. ### What changes are included in this PR? * Add JDK 22 to CI ### Are these changes tested? Tested by CI jobs ### Are there any user-facing changes? No * GitHub Issue: #40680 Authored-by: Dane Pitkin Signed-off-by: David Li --- .github/workflows/java.yml | 4 ++-- docs/source/java/install.rst | 10 +++++----- java/pom.xml | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/java.yml b/.github/workflows/java.yml index 611e202ca0624..e92d3f4fc5877 100644 --- a/.github/workflows/java.yml +++ b/.github/workflows/java.yml @@ -58,8 +58,8 @@ jobs: strategy: fail-fast: false matrix: - jdk: [8, 11, 17, 21] - maven: [3.9.5] + jdk: [8, 11, 17, 21, 22] + maven: [3.9.6] image: [java] env: JDK: ${{ matrix.jdk }} diff --git a/docs/source/java/install.rst b/docs/source/java/install.rst index 7ac1a4990f37d..a551edc36c477 100644 --- a/docs/source/java/install.rst +++ b/docs/source/java/install.rst @@ -29,8 +29,8 @@ Java modules are regularly built and tested on macOS and Linux distributions. Java Compatibility ================== -Java modules are compatible with JDK 8 and above. -Currently, JDK 8, 11, 17, and 21 are tested in CI. +Java modules are compatible with JDK 8 and above. Currently, JDK versions +8, 11, 17, and 21 are tested in CI. The latest JDK is also tested in CI. 
When using Java 9 or later, some JDK internals must be exposed by adding ``--add-opens=java.base/java.nio=org.apache.arrow.memory.core,ALL-UNNAMED`` to the ``java`` command: @@ -61,7 +61,7 @@ Modifying the command above for Flight: $ env _JAVA_OPTIONS="--add-reads=org.apache.arrow.flight.core=ALL-UNNAMED --add-opens=java.base/java.nio=org.apache.arrow.memory.core,ALL-UNNAMED" java -jar ... Otherwise, you may see errors like ``java.lang.IllegalAccessError: superclass access check failed: class -org.apache.arrow.flight.ArrowMessage$ArrowBufRetainingCompositeByteBuf (in module org.apache.arrow.flight.core) +org.apache.arrow.flight.ArrowMessage$ArrowBufRetainingCompositeByteBuf (in module org.apache.arrow.flight.core) cannot access class io.netty.buffer.CompositeByteBuf (in unnamed module ...) because module org.apache.arrow.flight.core does not read unnamed module ... @@ -74,8 +74,8 @@ Modifying the command above for arrow-memory: # Indirectly via environment variables $ env _JAVA_OPTIONS="--add-opens=java.base/java.nio=org.apache.arrow.dataset,org.apache.arrow.memory.core,ALL-UNNAMED" java -jar ... -Otherwise you may see errors such as ``java.lang.RuntimeException: java.lang.reflect.InaccessibleObjectException: -Unable to make static void java.nio.Bits.reserveMemory(long,long) accessible: module +Otherwise you may see errors such as ``java.lang.RuntimeException: java.lang.reflect.InaccessibleObjectException: +Unable to make static void java.nio.Bits.reserveMemory(long,long) accessible: module java.base does not "opens java.nio" to module org.apache.arrow.dataset`` If using Maven and Surefire for unit testing, :ref:`this argument must diff --git a/java/pom.xml b/java/pom.xml index b6eb774f580d4..95b27922eafa9 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -46,7 +46,7 @@ 9+181-r4173-1 2.24.0 3.12.1 - 5.5.0 + 5.11.0 5.2.0 3.42.0 From 3886cf1d43701afe25f3d008b81ae62e880e7878 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Fri, 5 Apr 2024 17:55:28 -0500 Subject: [PATCH 74/81] GH-40991: [R] Prefer r-universe, add a startup message (#41019) ### Rationale for this change If someone loads a version of Arrow on macOS with features disabled, warn them on startup that they can use `install_arrow()`. By default, prefer R-Universe in `install_arrow()` ### What changes are included in this PR? ^^^ ### Are these changes tested? Yes ### Are there any user-facing changes? 
Yes **This PR contains a "Critical Fix".** * GitHub Issue: #40991 Lead-authored-by: Jonathan Keane Co-authored-by: Neal Richardson Signed-off-by: Jonathan Keane --- r/DESCRIPTION | 2 +- r/R/arrow-info.R | 3 +- r/R/arrow-package.R | 71 ++++++++++++++++++++++++++---------------- r/R/install-arrow.R | 14 ++------- r/man/arrow-package.Rd | 4 +-- r/man/format_schema.Rd | 18 +++++++++++ 6 files changed, 70 insertions(+), 42 deletions(-) create mode 100644 r/man/format_schema.Rd diff --git a/r/DESCRIPTION b/r/DESCRIPTION index 6062a8c4f4689..c9f84e2e794c4 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -43,7 +43,7 @@ Imports: utils, vctrs Roxygen: list(markdown = TRUE, r6 = FALSE, load = "source") -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.1 Config/testthat/edition: 3 Config/build/bootstrap: TRUE Suggests: diff --git a/r/R/arrow-info.R b/r/R/arrow-info.R index 54145b2ee1204..916b6683fbcce 100644 --- a/r/R/arrow-info.R +++ b/r/R/arrow-info.R @@ -139,7 +139,8 @@ arrow_with_json <- function() { some_features_are_off <- function(features) { # `features` is a named logical vector (as in arrow_info()$capabilities) # Let's exclude some less relevant ones - blocklist <- c("lzo", "bz2", "brotli", "substrait") + # jemalloc is only included because it is sometimes disabled in our build process + blocklist <- c("lzo", "bz2", "brotli", "substrait", "jemalloc") # Return TRUE if any of the other features are FALSE !all(features[setdiff(names(features), blocklist)]) } diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index 54e237192e080..f6977e626276b 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -182,37 +182,54 @@ configure_tzdb <- function() { .onAttach <- function(libname, pkgname) { # Just to be extra safe, let's wrap this in a try(); # we don't want a failed startup message to prevent the package from loading - try({ - # On macOS only, Check if we are running in under emulation, and warn this will not work - if (on_rosetta()) { - packageStartupMessage( - paste( - "Warning:", - " It appears that you are running R and Arrow in emulation (i.e. you're", - " running an Intel version of R on a non-Intel mac). This configuration is", - " not supported by arrow, you should install a native (arm64) build of R", - " and use arrow with that. See https://cran.r-project.org/bin/macosx/", - "", - sep = "\n" + try( + { + # On macOS only, Check if we are running in under emulation, and warn this will not work + if (on_rosetta()) { + packageStartupMessage( + paste( + "Warning:", + " It appears that you are running R and Arrow in emulation (i.e. you're", + " running an Intel version of R on a non-Intel mac). This configuration is", + " not supported by arrow, you should install a native (arm64) build of R", + " and use arrow with that. See https://cran.r-project.org/bin/macosx/", + "", + sep = "\n" + ) ) - ) - } + } - features <- arrow_info()$capabilities - # That has all of the #ifdef features, plus the compression libs and the - # string libraries (but not the memory allocators, they're added elsewhere) - # - # Let's print a message if some are off - if (some_features_are_off(features)) { - packageStartupMessage( - paste( - "Some features are not enabled in this build of Arrow.", - "Run `arrow_info()` for more information." 
+ features <- arrow_info()$capabilities + # That has all of the #ifdef features, plus the compression libs and the + # string libraries (but not the memory allocators, they're added elsewhere) + # + # Let's print a message if some are off + if (some_features_are_off(features)) { + packageStartupMessage( + paste( + "Some features are not enabled in this build of Arrow.", + "Run `arrow_info()` for more information." + ) ) - ) - } - }, silent = TRUE) + # On macOS binaries from CRAN can be hobbled. They sometimes restrict access to our + # dependency source downloading even though we can build from source on their machines. + # They also refuse to allow libarrow binaries to be downloaded, so instead distribute + # hobbled arrow binaries + # If on macOS, and features are disabled, advise that reinstalling might help + if (identical(tolower(Sys.info()[["sysname"]]), "darwin")) { + packageStartupMessage( + paste0( + "The repository you retrieved Arrow from did not include all of Arrow's features.\n", + "You can install a fully-featured version by running:\n", + "`install.packages('arrow', repos = 'https://apache.r-universe.dev')`." + ) + ) + } + } + }, + silent = TRUE + ) } # Clean up the StopSource that was registered in .onLoad() so that if the diff --git a/r/R/install-arrow.R b/r/R/install-arrow.R index 74d3a96454777..44b876537490c 100644 --- a/r/R/install-arrow.R +++ b/r/R/install-arrow.R @@ -76,22 +76,14 @@ install_arrow <- function(nightly = FALSE, ARROW_R_DEV = verbose, ARROW_USE_PKG_CONFIG = use_system ) - # On the M1, we can't use the usual autobrew, which pulls Intel dependencies - apple_m1 <- grepl("arm-apple|aarch64.*darwin", R.Version()$platform) - # On Rosetta, we have to build without JEMALLOC, so we also can't autobrew - rosetta <- on_rosetta() - if (rosetta) { + # On Rosetta, we have to build without JEMALLOC + if (on_rosetta()) { Sys.setenv(ARROW_JEMALLOC = "OFF") - } - if (apple_m1 || rosetta) { Sys.setenv(FORCE_BUNDLED_BUILD = "true") } opts <- list() - if (apple_m1 || rosetta) { - # Skip binaries (esp. 
for rosetta) - opts$pkgType <- "source" - } else if (isTRUE(binary)) { + if (isTRUE(binary)) { # Unless otherwise directed, don't consider newer source packages when # options(pkgType) == "both" (default on win/mac) opts$install.packages.check.source <- "no" diff --git a/r/man/arrow-package.Rd b/r/man/arrow-package.Rd index 15f672a1fe949..41ec7956fe815 100644 --- a/r/man/arrow-package.Rd +++ b/r/man/arrow-package.Rd @@ -18,15 +18,15 @@ Useful links: } \author{ -\strong{Maintainer}: Nic Crane \email{thisisnic@gmail.com} +\strong{Maintainer}: Jonathan Keane \email{jkeane@gmail.com} Authors: \itemize{ \item Neal Richardson \email{neal.p.richardson@gmail.com} \item Ian Cook \email{ianmcook@gmail.com} + \item Nic Crane \email{thisisnic@gmail.com} \item Dewey Dunnington \email{dewey@fishandwhistle.net} (\href{https://orcid.org/0000-0002-9415-4582}{ORCID}) \item Romain François (\href{https://orcid.org/0000-0002-2444-4226}{ORCID}) - \item Jonathan Keane \email{jkeane@gmail.com} \item Dragoș Moldovan-Grünfeld \email{dragos.mold@gmail.com} \item Jeroen Ooms \email{jeroen@berkeley.edu} \item Jacob Wujciak-Jens \email{jacob@wujciak.de} diff --git a/r/man/format_schema.Rd b/r/man/format_schema.Rd new file mode 100644 index 0000000000000..d1c81e3fe7623 --- /dev/null +++ b/r/man/format_schema.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/schema.R +\name{format_schema} +\alias{format_schema} +\title{Get a string representing a Dataset or RecordBatchReader object's schema} +\usage{ +format_schema(obj) +} +\arguments{ +\item{obj}{a Dataset or RecordBatchReader} +} +\value{ +A string containing a formatted representation of the schema of \code{obj} +} +\description{ +Get a string representing a Dataset or RecordBatchReader object's schema +} +\keyword{internal} From df7da77acbecdc617b8fa03863cbdb38321b629c Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sat, 6 Apr 2024 08:13:55 +0900 Subject: [PATCH 75/81] GH-40855: [C++][ORC] Fix `std::filesystem` related link error with ORC 2.0.0 or later (#41023) ### Rationale for this change If we use `std::filesystem`, we need `-lstdc++fs` with GCC 8 and `-lc++fs` for clang 7. We don't want to maintain CMake code for GCC 8/clang 7. ### What changes are included in this PR? * Don't use `std::filesystem` with ORC 2.0.0 or later. * Use missing `-lstdc++fs`/`-lc++fs` with bundled ORC 2.0.0. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. 
* GitHub Issue: #40855 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- cpp/cmake_modules/FindorcAlt.cmake | 12 +++++++++++- cpp/cmake_modules/ThirdpartyToolchain.cmake | 14 ++++++++++++++ cpp/src/arrow/CMakeLists.txt | 4 ++++ cpp/src/arrow/adapters/orc/adapter.cc | 14 ++++++++++---- 4 files changed, 39 insertions(+), 5 deletions(-) diff --git a/cpp/cmake_modules/FindorcAlt.cmake b/cpp/cmake_modules/FindorcAlt.cmake index dc3b978cf4037..289416678ad39 100644 --- a/cpp/cmake_modules/FindorcAlt.cmake +++ b/cpp/cmake_modules/FindorcAlt.cmake @@ -29,6 +29,7 @@ endif() find_package(orc ${find_package_args}) if(orc_FOUND) set(orcAlt_FOUND TRUE) + set(orcAlt_VERSION ${orc_VERSION}) return() endif() @@ -51,8 +52,17 @@ else() NAMES orc/orc-config.hh PATH_SUFFIXES ${ARROW_INCLUDE_PATH_SUFFIXES}) endif() +if(ORC_INCLUDE_DIR) + file(READ "${ORC_INCLUDE_DIR}/orc/orc-config.hh" ORC_CONFIG_HH_CONTENT) + string(REGEX MATCH "#define ORC_VERSION \"[0-9.]+\"" ORC_VERSION_DEFINITION + "${ORC_CONFIG_HH_CONTENT}") + string(REGEX MATCH "[0-9.]+" ORC_VERSION "${ORC_VERSION_DEFINITION}") +endif() -find_package_handle_standard_args(orcAlt REQUIRED_VARS ORC_STATIC_LIB ORC_INCLUDE_DIR) +find_package_handle_standard_args( + orcAlt + REQUIRED_VARS ORC_STATIC_LIB ORC_INCLUDE_DIR + VERSION_VAR ORC_VERSION) if(orcAlt_FOUND) if(NOT TARGET orc::orc) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 4a67eac1d4d59..7d54ccccf7c19 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -4559,6 +4559,15 @@ macro(build_orc) endif() target_link_libraries(orc::orc INTERFACE ${CMAKE_DL_LIBS}) endif() + if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "9") + target_link_libraries(orc::orc INTERFACE stdc++fs) + endif() + elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "8") + target_link_libraries(orc::orc INTERFACE c++fs) + endif() + endif() add_dependencies(orc::orc orc_ep) @@ -4568,6 +4577,11 @@ endmacro() if(ARROW_ORC) resolve_dependency(orc HAVE_ALT TRUE) target_link_libraries(orc::orc INTERFACE ${ARROW_PROTOBUF_LIBPROTOBUF}) + if(ORC_VENDORED) + set(ARROW_ORC_VERSION ${ARROW_ORC_BUILD_VERSION}) + else() + set(ARROW_ORC_VERSION ${orcAlt_VERSION}) + endif() message(STATUS "Found ORC static library: ${ORC_STATIC_LIB}") message(STATUS "Found ORC headers: ${ORC_INCLUDE_DIR}") endif() diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 617bfedabf373..026bb5c77e066 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -899,6 +899,10 @@ if(ARROW_ORC) adapters/orc/util.cc) foreach(ARROW_ORC_TARGET ${ARROW_ORC_TARGETS}) target_link_libraries(${ARROW_ORC_TARGET} PRIVATE orc::orc) + if(ARROW_ORC_VERSION VERSION_LESS "2.0.0") + target_compile_definitions(${ARROW_ORC_TARGET} + PRIVATE ARROW_ORC_NEED_TIME_ZONE_DATABASE_CHECK) + endif() endforeach() else() set(ARROW_ORC_TARGET_SHARED) diff --git a/cpp/src/arrow/adapters/orc/adapter.cc b/cpp/src/arrow/adapters/orc/adapter.cc index 127ec49ba990f..98784450b3cce 100644 --- a/cpp/src/arrow/adapters/orc/adapter.cc +++ b/cpp/src/arrow/adapters/orc/adapter.cc @@ -18,13 +18,16 @@ #include "arrow/adapters/orc/adapter.h" #include -#include #include #include #include #include #include +#ifdef ARROW_ORC_NEED_TIME_ZONE_DATABASE_CHECK +#include +#endif + #include "arrow/adapters/orc/util.h" #include "arrow/builder.h" #include "arrow/io/interfaces.h" @@ -183,11 
+186,9 @@ liborc::RowReaderOptions DefaultRowReaderOptions() { return options; } +#ifdef ARROW_ORC_NEED_TIME_ZONE_DATABASE_CHECK // Proactively check timezone database availability for ORC versions older than 2.0.0 Status CheckTimeZoneDatabaseAvailability() { - if (GetOrcMajorVersion() >= 2) { - return Status::OK(); - } auto tz_dir = std::getenv("TZDIR"); bool is_tzdb_avaiable = tz_dir != nullptr ? std::filesystem::exists(tz_dir) @@ -200,6 +201,7 @@ Status CheckTimeZoneDatabaseAvailability() { } return Status::OK(); } +#endif } // namespace @@ -559,7 +561,9 @@ ORCFileReader::~ORCFileReader() {} Result> ORCFileReader::Open( const std::shared_ptr& file, MemoryPool* pool) { +#ifdef ARROW_ORC_NEED_TIME_ZONE_DATABASE_CHECK RETURN_NOT_OK(CheckTimeZoneDatabaseAvailability()); +#endif auto result = std::unique_ptr(new ORCFileReader()); RETURN_NOT_OK(result->impl_->Open(file, pool)); return std::move(result); @@ -826,7 +830,9 @@ ORCFileWriter::ORCFileWriter() { impl_.reset(new ORCFileWriter::Impl()); } Result> ORCFileWriter::Open( io::OutputStream* output_stream, const WriteOptions& writer_options) { +#ifdef ARROW_ORC_NEED_TIME_ZONE_DATABASE_CHECK RETURN_NOT_OK(CheckTimeZoneDatabaseAvailability()); +#endif std::unique_ptr result = std::unique_ptr(new ORCFileWriter()); Status status = result->impl_->Open(output_stream, writer_options); From 6f6b3b49c013ab2982fbd8b50313c3b0539bc7ca Mon Sep 17 00:00:00 2001 From: "Marcus D. Hanwell" Date: Fri, 5 Apr 2024 19:20:13 -0400 Subject: [PATCH 76/81] GH-40400: [C++] Add support for LLD (#40927) ### Rationale for this change Add support for the LLD LLVM linker, it is faster than the default linker. ### What changes are included in this PR? Added `ARROW_USE_LLD` as a CMake option, test if GCC supports the flags and then set the linker flags if so when the option is enabled. ### Are these changes tested? When this is enabled then the libraries and tests will use the linker flag. ### Are there any user-facing changes? No * GitHub Issue: #40400 Authored-by: Marcus D. 
Hanwell Signed-off-by: Sutou Kouhei --- ci/docker/ubuntu-20.04-cpp.dockerfile | 2 ++ ci/scripts/cpp_build.sh | 1 + cpp/cmake_modules/DefineOptions.cmake | 2 ++ cpp/cmake_modules/SetupCxxFlags.cmake | 28 +++++++++++++++++++++++++++ 4 files changed, 33 insertions(+) diff --git a/ci/docker/ubuntu-20.04-cpp.dockerfile b/ci/docker/ubuntu-20.04-cpp.dockerfile index 3e3b7ac3a6d99..124256378b287 100644 --- a/ci/docker/ubuntu-20.04-cpp.dockerfile +++ b/ci/docker/ubuntu-20.04-cpp.dockerfile @@ -101,6 +101,7 @@ RUN apt-get update -y -q && \ libutf8proc-dev \ libxml2-dev \ libzstd-dev \ + lld \ make \ ninja-build \ nlohmann-json3-dev \ @@ -164,6 +165,7 @@ ENV absl_SOURCE=BUNDLED \ ARROW_SUBSTRAIT=ON \ ARROW_USE_ASAN=OFF \ ARROW_USE_CCACHE=ON \ + ARROW_USE_LLD=ON \ ARROW_USE_UBSAN=OFF \ ARROW_WITH_BROTLI=ON \ ARROW_WITH_BZ2=ON \ diff --git a/ci/scripts/cpp_build.sh b/ci/scripts/cpp_build.sh index 52c89acb9a76a..e28ceae8801f0 100755 --- a/ci/scripts/cpp_build.sh +++ b/ci/scripts/cpp_build.sh @@ -166,6 +166,7 @@ else -DARROW_USE_CCACHE=${ARROW_USE_CCACHE:-ON} \ -DARROW_USE_GLOG=${ARROW_USE_GLOG:-OFF} \ -DARROW_USE_LD_GOLD=${ARROW_USE_LD_GOLD:-OFF} \ + -DARROW_USE_LLD=${ARROW_USE_LLD:-OFF} \ -DARROW_USE_MOLD=${ARROW_USE_MOLD:-OFF} \ -DARROW_USE_PRECOMPILED_HEADERS=${ARROW_USE_PRECOMPILED_HEADERS:-OFF} \ -DARROW_USE_STATIC_CRT=${ARROW_USE_STATIC_CRT:-OFF} \ diff --git a/cpp/cmake_modules/DefineOptions.cmake b/cpp/cmake_modules/DefineOptions.cmake index 5b8bcb3ac6965..dc0e5da63adb7 100644 --- a/cpp/cmake_modules/DefineOptions.cmake +++ b/cpp/cmake_modules/DefineOptions.cmake @@ -170,6 +170,8 @@ takes precedence over ccache if a storage backend is configured" ON) define_option(ARROW_USE_LD_GOLD "Use ld.gold for linking on Linux (if available)" OFF) + define_option(ARROW_USE_LLD "Use the LLVM lld for linking (if available)" OFF) + define_option(ARROW_USE_MOLD "Use mold for linking on Linux (if available)" OFF) define_option(ARROW_USE_PRECOMPILED_HEADERS "Use precompiled headers when compiling" diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake b/cpp/cmake_modules/SetupCxxFlags.cmake index 1d709fe98d7fe..3e580dfc109cf 100644 --- a/cpp/cmake_modules/SetupCxxFlags.cmake +++ b/cpp/cmake_modules/SetupCxxFlags.cmake @@ -663,6 +663,34 @@ if(NOT WIN32 AND NOT APPLE) endif() endif() +if(ARROW_USE_LLD) + find_program(LD_LLD ld.lld) + if(LD_LLD) + unset(LLD_LINKER_FLAGS) + if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "9.1.0") + set(LLD_LINKER_FLAGS "-fuse-ld=lld") + else() + message(STATUS "Need GCC 9.1.0 or later to use LLD linker: ${CMAKE_CXX_COMPILER_VERSION}" + ) + endif() + elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + set(LLD_LINKER_FLAGS "--ld-path=${LD_LLD}") + else() + message(STATUS "Using the default linker because compiler doesn't support LLD: ${CMAKE_CXX_COMPILER_ID}" + ) + endif() + if(LLD_LINKER_FLAGS) + message(STATUS "Using optional LLVM LLD linker") + string(APPEND CMAKE_EXE_LINKER_FLAGS " ${LLD_LINKER_FLAGS}") + string(APPEND CMAKE_MODULE_LINKER_FLAGS " ${LLD_LINKER_FLAGS}") + string(APPEND CMAKE_SHARED_LINKER_FLAGS " ${LLD_LINKER_FLAGS}") + else() + message(STATUS "Using the default linker because the LLD isn't supported") + endif() + endif() +endif() + # compiler flags for different build types (run 'cmake -DCMAKE_BUILD_TYPE= .') # For all builds: # For CMAKE_BUILD_TYPE=Debug From 6aa33210ecd9773f18ceb775306b79de132b6c6f Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Fri, 5 Apr 2024 23:05:12 -0300 Subject: [PATCH 77/81] GH-41044: 
[C++] formatting.h: Make sure space is allocated for the 'Z' when formatting timestamps (#41045) ### What changes are included in this PR? A test that reproduces an issue found by the fuzzer and a fix for it. ### Are these changes tested? - A test - Comments clarifying somethings around `formatting.h` - Increasing the size of the local buffer used to format timestamps The issue was introduced only recently (unreleased): #39272 * GitHub Issue: #41044 Authored-by: Felipe Oliveira Carvalho Signed-off-by: Felipe Oliveira Carvalho --- cpp/src/arrow/util/formatting.h | 11 ++++++++--- cpp/src/arrow/util/formatting_util_test.cc | 8 ++++++++ 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/util/formatting.h b/cpp/src/arrow/util/formatting.h index 6125f792ff988..dd9af907ecc37 100644 --- a/cpp/src/arrow/util/formatting.h +++ b/cpp/src/arrow/util/formatting.h @@ -126,8 +126,10 @@ namespace detail { ARROW_EXPORT extern const char digit_pairs[]; // Based on fmtlib's format_int class: -// Write digits from right to left into a stack allocated buffer -inline void FormatOneChar(char c, char** cursor) { *--*cursor = c; } +// Write digits from right to left into a stack allocated buffer. +// \pre *cursor points to the byte after the one that will be written. +// \post *cursor points to the byte that was written. +inline void FormatOneChar(char c, char** cursor) { *(--(*cursor)) = c; } template void FormatOneDigit(Int value, char** cursor) { @@ -326,6 +328,7 @@ class StringFormatter : public FloatToStringFormatterMixin constexpr size_t BufferSizeHH_MM_SS() { + // "23:59:59" ("." "9"+)? return detail::Digits10(23) + 1 + detail::Digits10(59) + 1 + detail::Digits10(59) + 1 + detail::Digits10(Duration::period::den) - 1; } @@ -505,8 +509,9 @@ class StringFormatter { timepoint_days -= days(1); } + // YYYY_MM_DD " " HH_MM_SS "Z"? constexpr size_t buffer_size = - detail::BufferSizeYYYY_MM_DD() + 1 + detail::BufferSizeHH_MM_SS(); + detail::BufferSizeYYYY_MM_DD() + 1 + detail::BufferSizeHH_MM_SS() + 1; std::array buffer; char* cursor = buffer.data() + buffer_size; diff --git a/cpp/src/arrow/util/formatting_util_test.cc b/cpp/src/arrow/util/formatting_util_test.cc index 13f57a495d639..fcbeec347d32a 100644 --- a/cpp/src/arrow/util/formatting_util_test.cc +++ b/cpp/src/arrow/util/formatting_util_test.cc @@ -533,6 +533,14 @@ TEST(Formatting, Timestamp) { } } + { + constexpr int64_t kMillisInDay = 24 * 60 * 60 * 1000; + auto ty = timestamp(TimeUnit::MILLI, "+01:00"); + StringFormatter formatter(ty.get()); + AssertFormatting(formatter, -15000 * 365 * kMillisInDay + 1, + "-13021-12-17 00:00:00.001Z"); + } + { auto ty = timestamp(TimeUnit::MILLI, "Pacific/Maruesas"); StringFormatter formatter(ty.get()); From 8fd3ce9e64565935b17c87bd00037d8e96528cce Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Sun, 7 Apr 2024 12:23:43 -0300 Subject: [PATCH 78/81] GH-40898: [C#] Do not import length-zero buffers from C Data Interface Arrays (#41054) ### Rationale for this change When implementing integration tests for nanoarrow, it was observed that C# never released arrays where `array->buffers[i]` was `NULL` (including any buffers of any recursive child arrays). This is allowed ( https://arrow.apache.org/docs/format/CDataInterface.html#c.ArrowArray.buffers ); however, every other implementation appears to allocate even for length zero buffers (including nanoarrow after https://github.com/apache/arrow-nanoarrow/pull/399 ). ### What changes are included in this PR? 
`AddMemory()` is replaced with `ArrowBuffer.Empty` if the length of the imported buffer would have been 0 bytes. For other buffers (or anywhere I saw dereferencing a buffer pointer), I added a `Debug.Assert` just to be sure. ### Are these changes tested? I'm not sure what the best way to test them is! They won't be tested in the nanoarrow integration tests since at the point that they run, nanoarrow will no longer export arrays that would trigger this. ### Are there any user-facing changes? No * GitHub Issue: #40898 Authored-by: Dewey Dunnington Signed-off-by: Curt Hagenlocher --- .../src/Apache.Arrow/C/CArrowArrayImporter.cs | 41 +++++++++++++------ 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs b/csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs index fbb2be661fc5d..abe02dcbb591f 100644 --- a/csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs +++ b/csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs @@ -17,6 +17,7 @@ using System; using System.Collections.Generic; +using System.Diagnostics; using System.Runtime.InteropServices; using Apache.Arrow.Memory; using Apache.Arrow.Types; @@ -36,7 +37,7 @@ public static class CArrowArrayImporter /// Typically, you will allocate an uninitialized CArrowArray pointer, /// pass that to external function, and then use this method to import /// the result. - /// + /// /// /// CArrowArray* importedPtr = CArrowArray.Create(); /// foreign_export_function(importedPtr); @@ -71,7 +72,7 @@ public static unsafe IArrowArray ImportArray(CArrowArray* ptr, IArrowType type) /// Typically, you will allocate an uninitialized CArrowArray pointer, /// pass that to external function, and then use this method to import /// the result. - /// + /// /// /// CArrowArray* importedPtr = CArrowArray.Create(); /// foreign_export_function(importedPtr); @@ -256,6 +257,19 @@ private ArrowBuffer ImportValidityBuffer(CArrowArray* cArray) return (cArray->buffers[0] == null) ? 
ArrowBuffer.Empty : new ArrowBuffer(AddMemory((IntPtr)cArray->buffers[0], 0, validityLength)); } + private ArrowBuffer ImportCArrayBuffer(CArrowArray* cArray, int i, int lengthBytes) + { + if (lengthBytes > 0) + { + Debug.Assert(cArray->buffers[i] != null); + return new ArrowBuffer(AddMemory((IntPtr)cArray->buffers[i], 0, lengthBytes)); + } + else + { + return ArrowBuffer.Empty; + } + } + private ArrowBuffer[] ImportByteArrayBuffers(CArrowArray* cArray) { if (cArray->n_buffers != 3) @@ -266,12 +280,13 @@ private ArrowBuffer[] ImportByteArrayBuffers(CArrowArray* cArray) int length = checked((int)cArray->length); int offsetsLength = (length + 1) * 4; int* offsets = (int*)cArray->buffers[1]; + Debug.Assert(offsets != null); int valuesLength = offsets[length]; ArrowBuffer[] buffers = new ArrowBuffer[3]; buffers[0] = ImportValidityBuffer(cArray); - buffers[1] = new ArrowBuffer(AddMemory((IntPtr)cArray->buffers[1], 0, offsetsLength)); - buffers[2] = new ArrowBuffer(AddMemory((IntPtr)cArray->buffers[2], 0, valuesLength)); + buffers[1] = ImportCArrayBuffer(cArray, 1, offsetsLength); + buffers[2] = ImportCArrayBuffer(cArray, 2, valuesLength); return buffers; } @@ -289,10 +304,10 @@ private ArrowBuffer[] ImportByteArrayViewBuffers(CArrowArray* cArray) long* bufferLengths = (long*)cArray->buffers[cArray->n_buffers - 1]; ArrowBuffer[] buffers = new ArrowBuffer[cArray->n_buffers - 1]; buffers[0] = ImportValidityBuffer(cArray); - buffers[1] = new ArrowBuffer(AddMemory((IntPtr)cArray->buffers[1], 0, viewsLength)); + buffers[1] = ImportCArrayBuffer(cArray, 1, viewsLength); for (int i = 2; i < buffers.Length; i++) { - buffers[i] = new ArrowBuffer(AddMemory((IntPtr)cArray->buffers[i], 0, checked((int)bufferLengths[i - 2]))); + buffers[i] = ImportCArrayBuffer(cArray, i, checked((int)bufferLengths[i - 2])); } return buffers; @@ -310,7 +325,7 @@ private ArrowBuffer[] ImportListBuffers(CArrowArray* cArray) ArrowBuffer[] buffers = new ArrowBuffer[2]; buffers[0] = ImportValidityBuffer(cArray); - buffers[1] = new ArrowBuffer(AddMemory((IntPtr)cArray->buffers[1], 0, offsetsLength)); + buffers[1] = ImportCArrayBuffer(cArray, 1, offsetsLength); return buffers; } @@ -327,8 +342,8 @@ private ArrowBuffer[] ImportListViewBuffers(CArrowArray* cArray) ArrowBuffer[] buffers = new ArrowBuffer[3]; buffers[0] = ImportValidityBuffer(cArray); - buffers[1] = new ArrowBuffer(AddMemory((IntPtr)cArray->buffers[1], 0, offsetsLength)); - buffers[2] = new ArrowBuffer(AddMemory((IntPtr)cArray->buffers[2], 0, offsetsLength)); + buffers[1] = ImportCArrayBuffer(cArray, 1, offsetsLength); + buffers[2] = ImportCArrayBuffer(cArray, 2, offsetsLength); return buffers; } @@ -356,8 +371,8 @@ private ArrowBuffer[] ImportDenseUnionBuffers(CArrowArray* cArray) int offsetsLength = length * 4; ArrowBuffer[] buffers = new ArrowBuffer[2]; - buffers[0] = new ArrowBuffer(AddMemory((IntPtr)cArray->buffers[0], 0, length)); - buffers[1] = new ArrowBuffer(AddMemory((IntPtr)cArray->buffers[1], 0, offsetsLength)); + buffers[0] = ImportCArrayBuffer(cArray, 0, length); + buffers[1] = ImportCArrayBuffer(cArray, 1, offsetsLength); return buffers; } @@ -370,7 +385,7 @@ private ArrowBuffer[] ImportSparseUnionBuffers(CArrowArray* cArray) } ArrowBuffer[] buffers = new ArrowBuffer[1]; - buffers[0] = new ArrowBuffer(AddMemory((IntPtr)cArray->buffers[0], 0, checked((int)cArray->length))); + buffers[0] = ImportCArrayBuffer(cArray, 0, checked((int)cArray->length)); return buffers; } @@ -392,7 +407,7 @@ private ArrowBuffer[] ImportFixedWidthBuffers(CArrowArray* cArray, int 
bitWidth) ArrowBuffer[] buffers = new ArrowBuffer[2]; buffers[0] = ImportValidityBuffer(cArray); - buffers[1] = new ArrowBuffer(AddMemory((IntPtr)cArray->buffers[1], 0, valuesLength)); + buffers[1] = ImportCArrayBuffer(cArray, 1, valuesLength); return buffers; } From 6c1417220151fe91e6ea9c5e6af7916863f40bfe Mon Sep 17 00:00:00 2001 From: James Duong Date: Mon, 8 Apr 2024 08:05:16 +0900 Subject: [PATCH 79/81] GH-40745: [Java][FlightRPC] Support configuring backpressure threshold (#41051) ### Rationale for this change gRPC uses a default backpressure threshold that is too low for services that send large amounts of data such as Arrow Flight. This causes excessive blocking and reduces throughput. ### What changes are included in this PR? * Update to grpc-java 1.63.0 * Add to FlightServer.Builder an option to set the number of bytes queued before blocking due to backpressure. Set the default to 10MB instead of gRPC's default of 64K. * Add a ServerInterceptor for automating setting the backpressure threshold on ServerCalls. ### Are these changes tested? Tested through existing unit tests. ### Are there any user-facing changes? The FlightServer.Builder class has an extra configuration option to let users change the backpressure threshold themselves. * GitHub Issue: #40745 Authored-by: James Duong Signed-off-by: David Li --- .../org/apache/arrow/flight/FlightServer.java | 15 +++++++ ...erverBackpressureThresholdInterceptor.java | 43 +++++++++++++++++++ java/pom.xml | 2 +- 3 files changed, 59 insertions(+), 1 deletion(-) create mode 100644 java/flight/flight-core/src/main/java/org/apache/arrow/flight/grpc/ServerBackpressureThresholdInterceptor.java diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightServer.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightServer.java index d873f7d2828d0..dc545c131828a 100644 --- a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightServer.java +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightServer.java @@ -42,6 +42,7 @@ import org.apache.arrow.flight.auth2.Auth2Constants; import org.apache.arrow.flight.auth2.CallHeaderAuthenticator; import org.apache.arrow.flight.auth2.ServerCallHeaderAuthMiddleware; +import org.apache.arrow.flight.grpc.ServerBackpressureThresholdInterceptor; import org.apache.arrow.flight.grpc.ServerInterceptorAdapter; import org.apache.arrow.flight.grpc.ServerInterceptorAdapter.KeyFactory; import org.apache.arrow.memory.BufferAllocator; @@ -79,6 +80,9 @@ public class FlightServer implements AutoCloseable { /** The maximum size of an individual gRPC message. This effectively disables the limit. */ static final int MAX_GRPC_MESSAGE_SIZE = Integer.MAX_VALUE; + /** The default number of bytes that can be queued on an output stream before blocking. */ + public static final int DEFAULT_BACKPRESSURE_THRESHOLD = 10 * 1024 * 1024; // 10MB + /** Create a new instance from a gRPC server. For internal use only. 
*/ private FlightServer(Location location, Server server, ExecutorService grpcExecutor) { this.location = location; @@ -179,6 +183,7 @@ public static final class Builder { private CallHeaderAuthenticator headerAuthenticator = CallHeaderAuthenticator.NO_OP; private ExecutorService executor = null; private int maxInboundMessageSize = MAX_GRPC_MESSAGE_SIZE; + private int backpressureThreshold = DEFAULT_BACKPRESSURE_THRESHOLD; private InputStream certChain; private InputStream key; private InputStream mTlsCACert; @@ -300,6 +305,7 @@ public FlightServer build() { .addService( ServerInterceptors.intercept( flightService, + new ServerBackpressureThresholdInterceptor(backpressureThreshold), new ServerAuthInterceptor(authHandler))); // Allow hooking into the gRPC builder. This is not guaranteed to be available on all Arrow versions or @@ -336,6 +342,15 @@ public Builder maxInboundMessageSize(int maxMessageSize) { return this; } + /** + * Set the number of bytes that may be queued on a server output stream before writes are blocked. + */ + public Builder backpressureThreshold(int backpressureThreshold) { + Preconditions.checkArgument(backpressureThreshold > 0); + this.backpressureThreshold = backpressureThreshold; + return this; + } + /** * A small utility function to ensure that InputStream attributes. * are closed if they are not null diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/grpc/ServerBackpressureThresholdInterceptor.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/grpc/ServerBackpressureThresholdInterceptor.java new file mode 100644 index 0000000000000..bd42fbc8ad6a4 --- /dev/null +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/grpc/ServerBackpressureThresholdInterceptor.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.flight.grpc; + +import io.grpc.Metadata; +import io.grpc.ServerCall; +import io.grpc.ServerCallHandler; +import io.grpc.ServerInterceptor; + +/** + * An interceptor for specifying the number of bytes that can be queued before a call with an output stream + * gets blocked by backpressure. 
+ */ +public class ServerBackpressureThresholdInterceptor implements ServerInterceptor { + + private final int numBytes; + + public ServerBackpressureThresholdInterceptor(int numBytes) { + this.numBytes = numBytes; + } + + @Override + public ServerCall.Listener interceptCall(ServerCall call, Metadata headers, + ServerCallHandler next) { + call.setOnReadyThreshold(numBytes); + return next.startCall(call, headers); + } +} diff --git a/java/pom.xml b/java/pom.xml index 95b27922eafa9..9892061677d09 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -34,7 +34,7 @@ 2.0.11 33.0.0-jre 4.1.108.Final - 1.62.2 + 1.63.0 3.23.1 2.17.0 3.4.0 From 805327e48d81cc1ccd3ae4de749547c4f1f4842c Mon Sep 17 00:00:00 2001 From: normanj-bitquill <78755797+normanj-bitquill@users.noreply.github.com> Date: Sun, 7 Apr 2024 18:17:09 -0500 Subject: [PATCH 80/81] GH-40249: [Java] Fix NPE in ArrowDatabaseMetadata (#40988) ### Rationale for this change When retrieving database metadata using the JDBC driver, some data such as SQL keywords could be null. Before this change, an NPE would be thrown when trying to convert the list of SQL keywords into a String. ### What changes are included in this PR? The following database metadata fields: * SQL keywords * Numeric functions * String functions * System functions * Time/date functions will convert to an empty string when they are null. ### Are these changes tested? A unit test has been added to verify that the fields above are converted to the empty string when null, with no exceptions thrown. ### Are there any user-facing changes? The fields above will now return an empty string rather than throw an NPE. * GitHub Issue: #40249 Authored-by: Norman Jordan Signed-off-by: David Li --- .../driver/jdbc/ArrowDatabaseMetadata.java | 24 +++++++++++++------ .../jdbc/ArrowDatabaseMetadataTest.java | 19 ++++++++++++++- 2 files changed, 35 insertions(+), 8 deletions(-) diff --git a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/ArrowDatabaseMetadata.java b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/ArrowDatabaseMetadata.java index d68b8070e2bb7..4af3e55ee11c0 100644 --- a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/ArrowDatabaseMetadata.java +++ b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/ArrowDatabaseMetadata.java @@ -49,6 +49,7 @@ import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.Optional; import java.util.concurrent.atomic.AtomicBoolean; import java.util.regex.Pattern; import java.util.stream.Collectors; @@ -197,31 +198,36 @@ public boolean isReadOnly() throws SQLException { @Override public String getSQLKeywords() throws SQLException { return convertListSqlInfoToString( - getSqlInfoAndCacheIfCacheIsEmpty(SqlInfo.SQL_KEYWORDS, List.class)); + getSqlInfoAndCacheIfCacheIsEmpty(SqlInfo.SQL_KEYWORDS, List.class)) + .orElse(""); } @Override public String getNumericFunctions() throws SQLException { return convertListSqlInfoToString( - getSqlInfoAndCacheIfCacheIsEmpty(SqlInfo.SQL_NUMERIC_FUNCTIONS, List.class)); + getSqlInfoAndCacheIfCacheIsEmpty(SqlInfo.SQL_NUMERIC_FUNCTIONS, List.class)) + .orElse(""); } @Override public String getStringFunctions() throws SQLException { return convertListSqlInfoToString( - getSqlInfoAndCacheIfCacheIsEmpty(SqlInfo.SQL_STRING_FUNCTIONS, List.class)); + getSqlInfoAndCacheIfCacheIsEmpty(SqlInfo.SQL_STRING_FUNCTIONS, List.class)) + .orElse(""); } @Override public String getSystemFunctions() throws 
SQLException { return convertListSqlInfoToString( - getSqlInfoAndCacheIfCacheIsEmpty(SqlInfo.SQL_SYSTEM_FUNCTIONS, List.class)); + getSqlInfoAndCacheIfCacheIsEmpty(SqlInfo.SQL_SYSTEM_FUNCTIONS, List.class)) + .orElse(""); } @Override public String getTimeDateFunctions() throws SQLException { return convertListSqlInfoToString( - getSqlInfoAndCacheIfCacheIsEmpty(SqlInfo.SQL_DATETIME_FUNCTIONS, List.class)); + getSqlInfoAndCacheIfCacheIsEmpty(SqlInfo.SQL_DATETIME_FUNCTIONS, List.class)) + .orElse(""); } @Override @@ -753,8 +759,12 @@ private T getSqlInfoAndCacheIfCacheIsEmpty(final SqlInfo sqlInfoCommand, return desiredType.cast(cachedSqlInfo.get(sqlInfoCommand)); } - private String convertListSqlInfoToString(final List sqlInfoList) { - return sqlInfoList.stream().map(Object::toString).collect(Collectors.joining(", ")); + private Optional convertListSqlInfoToString(final List sqlInfoList) { + if (sqlInfoList == null) { + return Optional.empty(); + } else { + return Optional.of(sqlInfoList.stream().map(Object::toString).collect(Collectors.joining(", "))); + } } private boolean getSqlInfoEnumOptionAndCacheIfCacheIsEmpty( diff --git a/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/ArrowDatabaseMetadataTest.java b/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/ArrowDatabaseMetadataTest.java index 0d930f4c44e1f..51334c77486bf 100644 --- a/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/ArrowDatabaseMetadataTest.java +++ b/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/ArrowDatabaseMetadataTest.java @@ -95,9 +95,14 @@ public class ArrowDatabaseMetadataTest { public static final boolean EXPECTED_MAX_ROW_SIZE_INCLUDES_BLOBS = false; private static final MockFlightSqlProducer FLIGHT_SQL_PRODUCER = new MockFlightSqlProducer(); + private static final MockFlightSqlProducer FLIGHT_SQL_PRODUCER_EMPTY_SQLINFO = + new MockFlightSqlProducer(); @ClassRule public static final FlightServerTestRule FLIGHT_SERVER_TEST_RULE = FlightServerTestRule .createStandardTestRule(FLIGHT_SQL_PRODUCER); + @ClassRule + public static final FlightServerTestRule FLIGHT_SERVER_EMPTY_SQLINFO_TEST_RULE = + FlightServerTestRule.createStandardTestRule(FLIGHT_SQL_PRODUCER_EMPTY_SQLINFO); private static final int ROW_COUNT = 10; private static final List> EXPECTED_GET_CATALOGS_RESULTS = range(0, ROW_COUNT) @@ -604,7 +609,7 @@ public static void setUpBeforeClass() throws SQLException { @AfterClass public static void tearDown() throws Exception { - AutoCloseables.close(connection, FLIGHT_SQL_PRODUCER); + AutoCloseables.close(connection, FLIGHT_SQL_PRODUCER, FLIGHT_SQL_PRODUCER_EMPTY_SQLINFO); } @@ -1420,4 +1425,16 @@ public void testSqlToRegexLike() { Assert.assertEquals("\\*", ArrowDatabaseMetadata.sqlToRegexLike("*")); Assert.assertEquals("T\\*E.S.*T", ArrowDatabaseMetadata.sqlToRegexLike("T*E_S%T")); } + + @Test + public void testEmptySqlInfo() throws Exception { + try (final Connection testConnection = FLIGHT_SERVER_EMPTY_SQLINFO_TEST_RULE.getConnection(false)) { + final DatabaseMetaData metaData = testConnection.getMetaData(); + collector.checkThat(metaData.getSQLKeywords(), is("")); + collector.checkThat(metaData.getNumericFunctions(), is("")); + collector.checkThat(metaData.getStringFunctions(), is("")); + collector.checkThat(metaData.getSystemFunctions(), is("")); + collector.checkThat(metaData.getTimeDateFunctions(), is("")); + } + } } From 84f6edef697fd0fa0f5fce252c017a31e4ba3944 Mon Sep 17 00:00:00 2001 From: James 
Henderson Date: Mon, 8 Apr 2024 00:34:27 +0100 Subject: [PATCH 81/81] GH-40999: [Java] Fix AIOOBE trying to splitAndTransfer DUV within nullable struct (#41000) We add a `typeId >= 0` guard to `DUV.TransferImpl.splitAndTransfer` to fix #40999. ### Are these changes tested? Yes ### Are there any user-facing changes? No * GitHub Issue: #40999 Authored-by: James Henderson Signed-off-by: David Li --- .../codegen/templates/DenseUnionVector.java | 10 ++++--- .../arrow/vector/TestDenseUnionVector.java | 28 +++++++++++++++++++ 2 files changed, 34 insertions(+), 4 deletions(-) diff --git a/java/vector/src/main/codegen/templates/DenseUnionVector.java b/java/vector/src/main/codegen/templates/DenseUnionVector.java index 27fd8e9798b67..42e96f7aca335 100644 --- a/java/vector/src/main/codegen/templates/DenseUnionVector.java +++ b/java/vector/src/main/codegen/templates/DenseUnionVector.java @@ -676,10 +676,12 @@ public void splitAndTransfer(int startIndex, int length) { for (int i = startIndex; i < startIndex + length; i++) { byte typeId = typeBuffer.getByte(i); - to.offsetBuffer.setInt((long) (i - startIndex) * OFFSET_WIDTH, typeCounts[typeId]); - typeCounts[typeId] += 1; - if (typeStarts[typeId] == -1) { - typeStarts[typeId] = offsetBuffer.getInt((long) i * OFFSET_WIDTH); + if (typeId >= 0) { + to.offsetBuffer.setInt((long) (i - startIndex) * OFFSET_WIDTH, typeCounts[typeId]); + typeCounts[typeId] += 1; + if (typeStarts[typeId] == -1) { + typeStarts[typeId] = offsetBuffer.getInt((long) i * OFFSET_WIDTH); + } } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestDenseUnionVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestDenseUnionVector.java index 2c29861561bb7..0621fd4527520 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestDenseUnionVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestDenseUnionVector.java @@ -363,6 +363,34 @@ public void testSplitAndTransferWithMixedVectors() throws Exception { } } + @Test + public void testSplitAndTransferDuvInStruct() { + try (StructVector struct = StructVector.empty("struct", allocator)) { + DenseUnionVector duv = struct.addOrGet("duv", + FieldType.notNullable(MinorType.DENSEUNION.getType()), + DenseUnionVector.class); + byte i32TypeId = duv.registerNewTypeId(Field.notNullable("i32", MinorType.INT.getType())); + duv.addVector(i32TypeId, new IntVector("i32", allocator)); + + struct.setIndexDefined(0); + duv.setTypeId(0, i32TypeId); + duv.setSafe(0, newIntHolder(42)); + + struct.setNull(1); + struct.setValueCount(2); + + try (StructVector dest = StructVector.empty("dest", allocator)) { + TransferPair pair = struct.makeTransferPair(dest); + pair.splitAndTransfer(0, 2); + + assertEquals(2, dest.getValueCount()); + assertFalse(dest.isNull(0)); + assertEquals(42, dest.getObject(0).get("duv")); + assertTrue(dest.isNull(1)); + } + } + } + @Test public void testGetFieldTypeInfo() throws Exception { Map metadata = new HashMap<>();