From 8be0e31b52a935cff24c93bc5097698b1bc9e99e Mon Sep 17 00:00:00 2001 From: fenfeng9 Date: Thu, 4 Jun 2026 16:33:50 +0800 Subject: [PATCH 1/3] GH-50043: fix hash_any/hash_all on sliced boolean arrays --- cpp/src/arrow/acero/hash_aggregate_test.cc | 37 +++++++++++++++++++ .../arrow/compute/kernels/hash_aggregate.cc | 3 +- python/pyarrow/tests/test_table.py | 31 ++++++++++++++++ 3 files changed, 70 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/acero/hash_aggregate_test.cc b/cpp/src/arrow/acero/hash_aggregate_test.cc index 12d24429cb6c..9ab0d24b0de9 100644 --- a/cpp/src/arrow/acero/hash_aggregate_test.cc +++ b/cpp/src/arrow/acero/hash_aggregate_test.cc @@ -2157,6 +2157,43 @@ TEST_P(GroupBy, AnyAndAll) { } } +TEST_P(GroupBy, AnyAllSlicedNullableBoolean) { + auto table = TableFromJSON(schema({field("any_arg", boolean()), + field("all_arg", boolean()), field("key", int64())}), + {R"([ + [true, false, 99], + [false, true, 10], + [null, null, 10] + ])"}); + auto sliced = table->Slice(1); + + // GH-50043: hash_any/hash_all should respect the slice offset. + // After Slice(1), any_arg=[false, null] and all_arg=[true, null]. + for (bool use_threads : {true, false}) { + SCOPED_TRACE(use_threads ? "parallel/merged" : "serial"); + + ASSERT_OK_AND_ASSIGN(Datum actual, GroupByTest({sliced->GetColumnByName("any_arg"), + sliced->GetColumnByName("all_arg")}, + {sliced->GetColumnByName("key")}, + { + {"hash_any", nullptr}, + {"hash_all", nullptr}, + }, + use_threads)); + ValidateOutput(actual); + + Datum expected = ArrayFromJSON(struct_({ + field("key_0", int64()), + field("hash_any", boolean()), + field("hash_all", boolean()), + }), + R"([ + [10, false, true] + ])"); + AssertDatumsEqual(expected, actual, /*verbose=*/true); + } +} + TEST_P(GroupBy, AnyAllScalar) { BatchesWithSchema input; input.batches = { diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate.cc b/cpp/src/arrow/compute/kernels/hash_aggregate.cc index 8359945319cd..d07096236d8c 100644 --- a/cpp/src/arrow/compute/kernels/hash_aggregate.cc +++ b/cpp/src/arrow/compute/kernels/hash_aggregate.cc @@ -1261,7 +1261,8 @@ struct GroupedBooleanAggregator : public GroupedAggregator { input.buffers[0].data, input.offset, input.length, [&](int64_t position) { counts[*g]++; - Impl::UpdateGroupWith(reduced, *g, bit_util::GetBit(bitmap, position)); + Impl::UpdateGroupWith(reduced, *g, + bit_util::GetBit(bitmap, input.offset + position)); g++; }, [&] { bit_util::SetBitTo(no_nulls, *g++, false); }); diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index b65fb7d952c8..0c052dbaa624 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -2976,6 +2976,37 @@ def sorted_by_keys(d): } +@pytest.mark.acero +def test_group_by_sliced_any_all(): + # GH-50043: hash_any/hash_all produce incorrect results on sliced boolean arrays + # Row 0 will be discarded by slice, should not affect aggregation + table = pa.table( + { + "g": [99, 10, 10], + "any_arg": [True, False, None], + "all_arg": [False, True, None], + } + ) + sliced = table.slice(1) + + # any(False, None) = False, all(True, None) = True + result = sliced.group_by("g", use_threads=False).aggregate( + [ + ("any_arg", "any"), + ("all_arg", "all"), + ] + ) + + expected = pa.table( + { + "g": [10], + "any_arg_any": [False], + "all_arg_all": [True], + } + ) + assert result.equals(expected) + + @pytest.mark.acero def test_table_group_by_first(): # "first" is an ordered aggregation -> requires to specify use_threads=False From 22dd4b3f0fb2be07c5fa0e7802bf0f419951b648 Mon Sep 17 00:00:00 2001 From: fenfeng9 Date: Thu, 4 Jun 2026 19:48:09 +0800 Subject: [PATCH 2/3] GH-50043: update regression tests --- cpp/src/arrow/acero/hash_aggregate_test.cc | 28 +++++++++++----------- python/pyarrow/tests/test_table.py | 19 ++++++++------- 2 files changed, 24 insertions(+), 23 deletions(-) diff --git a/cpp/src/arrow/acero/hash_aggregate_test.cc b/cpp/src/arrow/acero/hash_aggregate_test.cc index 9ab0d24b0de9..f94a86822eb9 100644 --- a/cpp/src/arrow/acero/hash_aggregate_test.cc +++ b/cpp/src/arrow/acero/hash_aggregate_test.cc @@ -2172,22 +2172,22 @@ TEST_P(GroupBy, AnyAllSlicedNullableBoolean) { for (bool use_threads : {true, false}) { SCOPED_TRACE(use_threads ? "parallel/merged" : "serial"); - ASSERT_OK_AND_ASSIGN(Datum actual, GroupByTest({sliced->GetColumnByName("any_arg"), - sliced->GetColumnByName("all_arg")}, - {sliced->GetColumnByName("key")}, - { - {"hash_any", nullptr}, - {"hash_all", nullptr}, - }, - use_threads)); + ASSERT_OK_AND_ASSIGN(auto actual, GroupByTest({sliced->GetColumnByName("any_arg"), + sliced->GetColumnByName("all_arg")}, + {sliced->GetColumnByName("key")}, + { + {"hash_any", nullptr}, + {"hash_all", nullptr}, + }, + use_threads)); ValidateOutput(actual); - Datum expected = ArrayFromJSON(struct_({ - field("key_0", int64()), - field("hash_any", boolean()), - field("hash_all", boolean()), - }), - R"([ + auto expected = ArrayFromJSON(struct_({ + field("key_0", int64()), + field("hash_any", boolean()), + field("hash_all", boolean()), + }), + R"([ [10, false, true] ])"); AssertDatumsEqual(expected, actual, /*verbose=*/true); diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index 0c052dbaa624..c6dbbc5145d1 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -2989,14 +2989,6 @@ def test_group_by_sliced_any_all(): ) sliced = table.slice(1) - # any(False, None) = False, all(True, None) = True - result = sliced.group_by("g", use_threads=False).aggregate( - [ - ("any_arg", "any"), - ("all_arg", "all"), - ] - ) - expected = pa.table( { "g": [10], @@ -3004,7 +2996,16 @@ def test_group_by_sliced_any_all(): "all_arg_all": [True], } ) - assert result.equals(expected) + + # any(False, None) = False, all(True, None) = True + for use_threads in [False, True]: + result = sliced.group_by("g", use_threads=use_threads).aggregate( + [ + ("any_arg", "any"), + ("all_arg", "all"), + ] + ) + assert result.equals(expected) @pytest.mark.acero From 183d474daa520f80d11db662cb39fc9fcc3ce715 Mon Sep 17 00:00:00 2001 From: fenfeng9 Date: Thu, 4 Jun 2026 20:14:49 +0800 Subject: [PATCH 3/3] GH-50043: update regression tests --- cpp/src/arrow/acero/hash_aggregate_test.cc | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/acero/hash_aggregate_test.cc b/cpp/src/arrow/acero/hash_aggregate_test.cc index f94a86822eb9..442dcd588345 100644 --- a/cpp/src/arrow/acero/hash_aggregate_test.cc +++ b/cpp/src/arrow/acero/hash_aggregate_test.cc @@ -2169,6 +2169,15 @@ TEST_P(GroupBy, AnyAllSlicedNullableBoolean) { // GH-50043: hash_any/hash_all should respect the slice offset. // After Slice(1), any_arg=[false, null] and all_arg=[true, null]. + auto expected = ArrayFromJSON(struct_({ + field("key_0", int64()), + field("hash_any", boolean()), + field("hash_all", boolean()), + }), + R"([ + [10, false, true] + ])"); + for (bool use_threads : {true, false}) { SCOPED_TRACE(use_threads ? "parallel/merged" : "serial"); @@ -2182,14 +2191,6 @@ TEST_P(GroupBy, AnyAllSlicedNullableBoolean) { use_threads)); ValidateOutput(actual); - auto expected = ArrayFromJSON(struct_({ - field("key_0", int64()), - field("hash_any", boolean()), - field("hash_all", boolean()), - }), - R"([ - [10, false, true] - ])"); AssertDatumsEqual(expected, actual, /*verbose=*/true); } }