From 2d37d2b9ea8ea09fbec71d098b739db2ed596650 Mon Sep 17 00:00:00 2001
From: fenfeng9
Date: Sat, 6 Jun 2026 23:47:09 +0800
Subject: [PATCH] Fix count nullness for sliced union arrays

GH-50113: count could report incorrect null counts for sliced sparse
and dense union arrays.

LogicalSparseUnionNullCount and LogicalDenseUnionNullCount now index
the type-id and value-offset pointers obtained from
ArraySpan::GetValues with `i` instead of `span.offset + i`. For sparse
unions the child validity is now looked up at `span.offset + i`
instead of `i`.

Regression tests cover unsliced and sliced sparse/dense unions in both
the C++ count kernel tests and the pyarrow compute tests.
---
 .../arrow/compute/kernels/aggregate_test.cc   | 30 ++++++++++++++
 cpp/src/arrow/util/union_util.cc              |  8 ++--
 python/pyarrow/tests/test_compute.py          | 39 +++++++++++++++++++
 3 files changed, 73 insertions(+), 4 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc
index 6783475db345..bfc3442a73d5 100644
--- a/cpp/src/arrow/compute/kernels/aggregate_test.cc
+++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc
@@ -951,6 +951,36 @@ TEST(TestCountKernel, RunEndEncodedNulls) {
   ValidateCount(*array->Slice(3, 6), {3, 3});
 }
 
+TEST(TestCountKernel, SparseUnionSlicedNulls) {
+  // GH-50113: Sliced unions can report incorrect null counts in count.
+  auto type_ids = ArrayFromJSON(int8(), "[0, 1, 0, 0, 1, 1]");
+  ArrayVector children = {
+      ArrayFromJSON(float64(), "[0.5, 99.0, null, 3.0, 88.0, 77.0]"),
+      ArrayFromJSON(boolean(), "[false, null, true, false, true, false]")};
+  ASSERT_OK_AND_ASSIGN(auto array,
+                       SparseUnionArray::Make(*type_ids, std::move(children)));
+
+  // Logical array: [0.5, null, null, 3.0, true, false].
+  ValidateCount(*array, {4, 2});
+  // Logical slice: [null, null, 3.0, true].
+  ValidateCount(*array->Slice(1, 4), {2, 2});
+}
+
+TEST(TestCountKernel, DenseUnionSlicedNulls) {
+  // GH-50113: Sliced unions can report incorrect null counts in count.
+  auto type_ids = ArrayFromJSON(int8(), "[0, 1, 0, 0, 1, 1]");
+  auto value_offsets = ArrayFromJSON(int32(), "[0, 0, 1, 2, 1, 2]");
+  ArrayVector children = {ArrayFromJSON(float64(), "[0.5, null, 3.0]"),
+                          ArrayFromJSON(boolean(), "[null, true, false]")};
+  ASSERT_OK_AND_ASSIGN(
+      auto array, DenseUnionArray::Make(*type_ids, *value_offsets, std::move(children)));
+
+  // Logical array: [0.5, null, null, 3.0, true, false].
+  ValidateCount(*array, {4, 2});
+  // Logical slice: [null, null, 3.0, true].
+  ValidateCount(*array->Slice(1, 4), {2, 2});
+}
+
 template <typename ArrowType>
 class TestRandomNumericCountKernel : public ::testing::Test {};
 
diff --git a/cpp/src/arrow/util/union_util.cc b/cpp/src/arrow/util/union_util.cc
index 7f2a150db855..6b4d752d8685 100644
--- a/cpp/src/arrow/util/union_util.cc
+++ b/cpp/src/arrow/util/union_util.cc
@@ -33,9 +33,9 @@ int64_t LogicalSparseUnionNullCount(const ArraySpan& span) {
   const int8_t* types = span.GetValues<int8_t>(1);  // NOLINT
   int64_t null_count = 0;
   for (int64_t i = 0; i < span.length; i++) {
-    const int8_t child_id = sparse_union_type->child_ids()[types[span.offset + i]];
+    const int8_t child_id = sparse_union_type->child_ids()[types[i]];
 
-    null_count += span.child_data[child_id].IsNull(i);
+    null_count += span.child_data[child_id].IsNull(span.offset + i);
   }
   return null_count;
 }
@@ -48,8 +48,8 @@ int64_t LogicalDenseUnionNullCount(const ArraySpan& span) {
   const int32_t* offsets = span.GetValues<int32_t>(2);  // NOLINT
   int64_t null_count = 0;
   for (int64_t i = 0; i < span.length; i++) {
-    const int8_t child_id = dense_union_type->child_ids()[types[span.offset + i]];
-    const int32_t offset = offsets[span.offset + i];
+    const int8_t child_id = dense_union_type->child_ids()[types[i]];
+    const int32_t offset = offsets[i];
     null_count += span.child_data[child_id].IsNull(offset);
   }
   return null_count;
diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py
index 1c82d6c944bd..4e7f506ba54d 100644
--- a/python/pyarrow/tests/test_compute.py
+++ b/python/pyarrow/tests/test_compute.py
@@ -2889,6 +2889,45 @@ def test_count_run_end_encoded_nulls():
     assert pc.count(arr.slice(3, 6), mode="only_null").as_py() == 3
 
 
+def test_count_sparse_union_sliced_nulls():
+    # GH-50113: Sliced unions can report incorrect null counts in count.
+    arr = pa.UnionArray.from_sparse(
+        pa.array([0, 1, 0, 0, 1, 1], type=pa.int8()),
+        [
+            pa.array([0.5, 99.0, None, 3.0, 88.0, 77.0]),
+            pa.array([False, None, True, False, True, False]),
+        ]
+    )
+
+    # Logical array: [0.5, None, None, 3.0, True, False].
+    assert pc.count(arr, mode="only_valid").as_py() == 4
+    assert pc.count(arr, mode="only_null").as_py() == 2
+    assert pc.count(arr, mode="all").as_py() == 6
+    # Logical slice: [None, None, 3.0, True].
+    assert pc.count(arr.slice(1, 4), mode="only_valid").as_py() == 2
+    assert pc.count(arr.slice(1, 4), mode="only_null").as_py() == 2
+
+
+def test_count_dense_union_sliced_nulls():
+    # GH-50113: Sliced unions can report incorrect null counts in count.
+    arr = pa.UnionArray.from_dense(
+        pa.array([0, 1, 0, 0, 1, 1], type=pa.int8()),
+        pa.array([0, 0, 1, 2, 1, 2], type=pa.int32()),
+        [
+            pa.array([0.5, None, 3.0]),
+            pa.array([None, True, False]),
+        ]
+    )
+
+    # Logical array: [0.5, None, None, 3.0, True, False].
+    assert pc.count(arr, mode="only_valid").as_py() == 4
+    assert pc.count(arr, mode="only_null").as_py() == 2
+    assert pc.count(arr, mode="all").as_py() == 6
+    # Logical slice: [None, None, 3.0, True].
+    assert pc.count(arr.slice(1, 4), mode="only_valid").as_py() == 2
+    assert pc.count(arr.slice(1, 4), mode="only_null").as_py() == 2
+
+
 def test_index():
     arr = pa.array([0, 1, None, 3, 4], type=pa.int64())
     assert pc.index(arr, pa.scalar(0)).as_py() == 0