Skip to content

Commit

Permalink
apacheGH-39852: [Python] Support creating Binary/StringView arrays fr…
Browse files Browse the repository at this point in the history
…om python objects
  • Loading branch information
jorisvandenbossche committed Jan 30, 2024
1 parent 787afa1 commit 8622dd3
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 38 deletions.
64 changes: 53 additions & 11 deletions python/pyarrow/src/arrow/python/python_to_arrow.cc
Original file line number Diff line number Diff line change
Expand Up @@ -486,6 +486,10 @@ class PyValue {
return view.ParseString(obj);
}

static Status Convert(const BinaryViewType*, const O&, I obj, PyBytesView& view) {
return view.ParseString(obj);
}

static Status Convert(const FixedSizeBinaryType* type, const O&, I obj,
PyBytesView& view) {
ARROW_RETURN_NOT_OK(view.ParseString(obj));
Expand All @@ -499,8 +503,8 @@ class PyValue {
}

template <typename T>
static enable_if_string<T, Status> Convert(const T*, const O& options, I obj,
PyBytesView& view) {
static enable_if_t<is_string_type<T>::value || is_string_view_type<T>::value, Status>
Convert(const T*, const O& options, I obj, PyBytesView& view) {
if (options.strict) {
// Strict conversion, force output to be unicode / utf8 and validate that
// any binary values are utf8
Expand Down Expand Up @@ -570,18 +574,12 @@ struct PyConverterTrait;

template <typename T>
struct PyConverterTrait<
T,
enable_if_t<(!is_nested_type<T>::value && !is_interval_type<T>::value &&
!is_extension_type<T>::value && !is_binary_view_like_type<T>::value) ||
std::is_same<T, MonthDayNanoIntervalType>::value>> {
T, enable_if_t<(!is_nested_type<T>::value && !is_interval_type<T>::value &&
!is_extension_type<T>::value) ||
std::is_same<T, MonthDayNanoIntervalType>::value>> {
using type = PyPrimitiveConverter<T>;
};

template <typename T>
struct PyConverterTrait<T, enable_if_binary_view_like<T>> {
// not implemented
};

template <typename T>
struct PyConverterTrait<T, enable_if_list_like<T>> {
using type = PyListConverter<T>;
Expand Down Expand Up @@ -745,6 +743,50 @@ class PyPrimitiveConverter<T, enable_if_base_binary<T>>
bool observed_binary_ = false;
};

template <typename T>
class PyPrimitiveConverter<T, enable_if_t<is_binary_view_like_type<T>::value>>
: public PrimitiveConverter<T, PyConverter> {
public:
Status Append(PyObject* value) override {
if (PyValue::IsNull(this->options_, value)) {
this->primitive_builder_->UnsafeAppendNull();
} else if (arrow::py::is_scalar(value)) {
ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> scalar,
arrow::py::unwrap_scalar(value));
ARROW_RETURN_NOT_OK(this->primitive_builder_->AppendScalar(*scalar));
} else {
ARROW_RETURN_NOT_OK(
PyValue::Convert(this->primitive_type_, this->options_, value, view_));
if (!view_.is_utf8) {
// observed binary value
observed_binary_ = true;
}
// Since we don't know the varying length input size in advance, we need to
// reserve space in the value builder one by one. ReserveData raises CapacityError
// if the value would not fit into the array.
ARROW_RETURN_NOT_OK(this->primitive_builder_->ReserveData(view_.size));
this->primitive_builder_->UnsafeAppend(view_.bytes,
static_cast<int64_t>(view_.size));
}
return Status::OK();
}

Result<std::shared_ptr<Array>> ToArray() override {
ARROW_ASSIGN_OR_RAISE(auto array, (PrimitiveConverter<T, PyConverter>::ToArray()));
if (observed_binary_) {
// if we saw any non-unicode, cast results to BinaryArray
auto binary_type = TypeTraits<typename T::PhysicalType>::type_singleton();
return array->View(binary_type);
} else {
return array;
}
}

protected:
PyBytesView view_;
bool observed_binary_ = false;
};

template <typename U>
class PyDictionaryConverter<U, enable_if_has_c_type<U>>
: public DictionaryConverter<U, PyConverter> {
Expand Down
19 changes: 16 additions & 3 deletions python/pyarrow/tests/test_convert_builtin.py
Original file line number Diff line number Diff line change
Expand Up @@ -763,6 +763,16 @@ def test_sequence_unicode():
assert arr.to_pylist() == data


@pytest.mark.parametrize("ty", [pa.string(), pa.large_string(), pa.string_view()])
def test_sequence_unicode_explicit_type(ty):
data = ['foo', 'bar', None, 'mañana']
arr = pa.array(data, type=ty)
assert len(arr) == 4
assert arr.null_count == 1
assert arr.type == ty
assert arr.to_pylist() == data


def check_array_mixed_unicode_bytes(binary_type, string_type):
values = ['qux', b'foo', bytearray(b'barz')]
b_values = [b'qux', b'foo', b'barz']
Expand All @@ -787,6 +797,7 @@ def check_array_mixed_unicode_bytes(binary_type, string_type):
def test_array_mixed_unicode_bytes():
check_array_mixed_unicode_bytes(pa.binary(), pa.string())
check_array_mixed_unicode_bytes(pa.large_binary(), pa.large_string())
check_array_mixed_unicode_bytes(pa.binary_view(), pa.string_view())


@pytest.mark.large_memory
Expand Down Expand Up @@ -818,7 +829,7 @@ def test_large_binary_value(ty):


@pytest.mark.large_memory
@pytest.mark.parametrize("ty", [pa.binary(), pa.string()])
@pytest.mark.parametrize("ty", [pa.binary(), pa.string(), pa.string_view()])
def test_string_too_large(ty):
# Construct a binary array with a single value larger than 4GB
s = b"0123456789abcdefghijklmnopqrstuvwxyz"
Expand All @@ -836,15 +847,15 @@ def test_sequence_bytes():
u1.decode('utf-8'), # unicode gets encoded,
bytearray(b'bar'),
None]
for ty in [None, pa.binary(), pa.large_binary()]:
for ty in [None, pa.binary(), pa.large_binary(), pa.binary_view()]:
arr = pa.array(data, type=ty)
assert len(arr) == 6
assert arr.null_count == 1
assert arr.type == ty or pa.binary()
assert arr.to_pylist() == [b'foo', b'dada', b'data', u1, b'bar', None]


@pytest.mark.parametrize("ty", [pa.string(), pa.large_string()])
@pytest.mark.parametrize("ty", [pa.string(), pa.large_string(), pa.string_view()])
def test_sequence_utf8_to_unicode(ty):
# ARROW-1225
data = [b'foo', None, b'bar']
Expand Down Expand Up @@ -2431,6 +2442,8 @@ def test_array_from_pylist_offset_overflow():
pa.binary(3)),
([b"a"], [pa.scalar("a", type=pa.large_binary())], pa.large_binary()),
(["a"], [pa.scalar("a", type=pa.large_string())], pa.large_string()),
([b"a"], [pa.scalar("a", type=pa.binary_view())], pa.binary_view()),
(["a"], [pa.scalar("a", type=pa.string_view())], pa.string_view()),
(
["a"],
[pa.scalar("a", type=pa.dictionary(pa.int64(), pa.string()))],
Expand Down
28 changes: 4 additions & 24 deletions python/pyarrow/tests/test_scalars.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,8 @@
(b"bytes", None, pa.BinaryScalar),
("largestring", pa.large_string(), pa.LargeStringScalar),
(b"largebytes", pa.large_binary(), pa.LargeBinaryScalar),
# TODO(GH-39633) pa.scalar(..) requires python->arrow conversion to be implemented
# ("string_view", pa.string_view(), pa.StringViewScalar),
# (b"bytes_view", pa.binary_view(), pa.BinaryViewScalar),
("string_view", pa.string_view(), pa.StringViewScalar),
(b"bytes_view", pa.binary_view(), pa.BinaryViewScalar),
(b"abc", pa.binary(3), pa.FixedSizeBinaryScalar),
([1, 2, 3], None, pa.ListScalar),
([1, 2, 3, 4], pa.large_list(pa.int8()), pa.LargeListScalar),
Expand Down Expand Up @@ -492,7 +491,7 @@ def test_month_day_nano_interval():
@pytest.mark.parametrize(('ty', 'scalar_typ'), [
(pa.string(), pa.StringScalar),
(pa.large_string(), pa.LargeStringScalar),
# (pa.string_view(), pa.StringViewScalar),
(pa.string_view(), pa.StringViewScalar),
])
def test_string(value, ty, scalar_typ):
s = pa.scalar(value, type=ty)
Expand All @@ -507,30 +506,11 @@ def test_string(value, ty, scalar_typ):
assert buf.to_pybytes() == value.encode()


@pytest.mark.parametrize('value', ['foo', 'mañana'])
def test_string_view(value):
# TODO: replace with normal scalar construction
builder = pa.lib.StringViewBuilder()
builder.append(value)
arr = builder.finish()

s = arr[0]
assert isinstance(s, pa.StringViewScalar)
assert s.as_py() == value
assert s.as_py() != 'something'
assert repr(value) in repr(s)
assert str(s) == str(value)

buf = s.as_buffer()
assert isinstance(buf, pa.Buffer)
assert buf.to_pybytes() == value.encode()


@pytest.mark.parametrize('value', [b'foo', b'bar'])
@pytest.mark.parametrize(('ty', 'scalar_typ'), [
(pa.binary(), pa.BinaryScalar),
(pa.large_binary(), pa.LargeBinaryScalar),
# (pa.binary_view(), pa.BinaryViewScalar),
(pa.binary_view(), pa.BinaryViewScalar),
])
def test_binary(value, ty, scalar_typ):
s = pa.scalar(value, type=ty)
Expand Down

0 comments on commit 8622dd3

Please sign in to comment.