Skip to content

Commit e6255d8

Browse files
Backport PR #51542 on branch 2.0.x (API: ArrowExtensionArray.value_counts returns pyarrow.int64 type) (#51620)
Backport PR #51542: API: ArrowExtensionArray.value_counts returns pyarrow.int64 type

Co-authored-by: Matthew Roeschke <[email protected]>
1 parent 42c3ab1 commit e6255d8

File tree

7 files changed

+57
-75
lines changed

7 files changed

+57
-75
lines changed

doc/source/whatsnew/v2.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -818,6 +818,7 @@ Other API changes
818818
- :class:`DataFrame` and :class:`DataFrameGroupBy` aggregations (e.g. "sum") with object-dtype columns no longer infer non-object dtypes for their results, explicitly call ``result.infer_objects(copy=False)`` on the result to obtain the old behavior (:issue:`51205`, :issue:`49603`)
819819
- Division by zero with :class:`ArrowDtype` dtypes returns ``-inf``, ``nan``, or ``inf`` depending on the numerator, instead of raising (:issue:`51541`)
820820
- Added :func:`pandas.api.types.is_any_real_numeric_dtype` to check for real numeric dtypes (:issue:`51152`)
821+
- :meth:`~arrays.ArrowExtensionArray.value_counts` now returns a :class:`Series` of counts with :class:`ArrowDtype` of ``pyarrow.int64`` type instead of ``"Int64"`` dtype (:issue:`51462`)
821822

822823
.. note::
823824

pandas/core/algorithms.py

+3
Original file line numberDiff line numberDiff line change
@@ -873,6 +873,9 @@ def value_counts(
873873
result.name = name
874874
result.index.name = index_name
875875
counts = result._values
876+
if not isinstance(counts, np.ndarray):
877+
# e.g. ArrowExtensionArray
878+
counts = np.asarray(counts)
876879

877880
elif isinstance(values, ABCMultiIndex):
878881
# GH49558

pandas/core/arrays/arrow/array.py

+2-3
Original file line numberDiff line numberDiff line change
@@ -983,12 +983,11 @@ def value_counts(self, dropna: bool = True) -> Series:
983983
if pa.types.is_duration(pa_type):
984984
values = values.cast(pa_type)
985985

986-
# No missing values so we can adhere to the interface and return a numpy array.
987-
counts = np.array(counts)
986+
counts = ArrowExtensionArray(counts)
988987

989988
index = Index(type(self)(values))
990989

991-
return Series(counts, index=index, name="count").astype("Int64")
990+
return Series(counts, index=index, name="count")
992991

993992
@classmethod
994993
def _concat_same_type(

pandas/tests/arrays/string_/test_string.py

+11-3
Original file line numberDiff line numberDiff line change
@@ -453,20 +453,28 @@ def test_arrow_load_from_zero_chunks(dtype, string_storage2):
453453

454454

455455
def test_value_counts_na(dtype):
456+
if getattr(dtype, "storage", "") == "pyarrow":
457+
exp_dtype = "int64[pyarrow]"
458+
else:
459+
exp_dtype = "Int64"
456460
arr = pd.array(["a", "b", "a", pd.NA], dtype=dtype)
457461
result = arr.value_counts(dropna=False)
458-
expected = pd.Series([2, 1, 1], index=arr[[0, 1, 3]], dtype="Int64", name="count")
462+
expected = pd.Series([2, 1, 1], index=arr[[0, 1, 3]], dtype=exp_dtype, name="count")
459463
tm.assert_series_equal(result, expected)
460464

461465
result = arr.value_counts(dropna=True)
462-
expected = pd.Series([2, 1], index=arr[:2], dtype="Int64", name="count")
466+
expected = pd.Series([2, 1], index=arr[:2], dtype=exp_dtype, name="count")
463467
tm.assert_series_equal(result, expected)
464468

465469

466470
def test_value_counts_with_normalize(dtype):
471+
if getattr(dtype, "storage", "") == "pyarrow":
472+
exp_dtype = "double[pyarrow]"
473+
else:
474+
exp_dtype = "Float64"
467475
ser = pd.Series(["a", "b", "a", pd.NA], dtype=dtype)
468476
result = ser.value_counts(normalize=True)
469-
expected = pd.Series([2, 1], index=ser[:2], dtype="Float64", name="proportion") / 3
477+
expected = pd.Series([2, 1], index=ser[:2], dtype=exp_dtype, name="proportion") / 3
470478
tm.assert_series_equal(result, expected)
471479

472480

pandas/tests/base/test_value_counts.py

+10-4
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,11 @@ def test_value_counts(index_or_series_obj):
3939
expected.index.name = obj.name
4040

4141
if not isinstance(result.dtype, np.dtype):
42-
# i.e IntegerDtype
43-
expected = expected.astype("Int64")
42+
if getattr(obj.dtype, "storage", "") == "pyarrow":
43+
expected = expected.astype("int64[pyarrow]")
44+
else:
45+
# i.e IntegerDtype
46+
expected = expected.astype("Int64")
4447

4548
# TODO(GH#32514): Order of entries with the same count is inconsistent
4649
# on CI (gh-32449)
@@ -90,8 +93,11 @@ def test_value_counts_null(null_obj, index_or_series_obj):
9093
result = result.sort_index()
9194

9295
if not isinstance(result.dtype, np.dtype):
93-
# i.e IntegerDtype
94-
expected = expected.astype("Int64")
96+
if getattr(obj.dtype, "storage", "") == "pyarrow":
97+
expected = expected.astype("int64[pyarrow]")
98+
else:
99+
# i.e IntegerDtype
100+
expected = expected.astype("Int64")
95101
tm.assert_series_equal(result, expected)
96102

97103
expected[null_obj] = 3

pandas/tests/extension/test_arrow.py

+19-3
Original file line numberDiff line numberDiff line change
@@ -793,9 +793,25 @@ def test_diff(self, data, periods, request):
793793
)
794794
super().test_diff(data, periods)
795795

796-
@pytest.mark.parametrize("dropna", [True, False])
797-
def test_value_counts(self, all_data, dropna, request):
798-
super().test_value_counts(all_data, dropna)
796+
def test_value_counts_returns_pyarrow_int64(self, data):
797+
# GH 51462
798+
data = data[:10]
799+
result = data.value_counts()
800+
assert result.dtype == ArrowDtype(pa.int64())
801+
802+
def test_value_counts_with_normalize(self, data, request):
803+
data = data[:10].unique()
804+
values = np.array(data[~data.isna()])
805+
ser = pd.Series(data, dtype=data.dtype)
806+
807+
result = ser.value_counts(normalize=True).sort_index()
808+
809+
expected = pd.Series(
810+
[1 / len(values)] * len(values), index=result.index, name="proportion"
811+
)
812+
expected = expected.astype("double[pyarrow]")
813+
814+
self.assert_series_equal(result, expected)
799815

800816
def test_argmin_argmax(
801817
self, data_for_sorting, data_missing_for_sorting, na_value, request

pandas/tests/extension/test_string.py

+11-62
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
import numpy as np
1919
import pytest
2020

21-
from pandas.compat import pa_version_under7p0
2221
from pandas.errors import PerformanceWarning
2322

2423
import pandas as pd
@@ -196,70 +195,20 @@ def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna):
196195

197196

198197
class TestMethods(base.BaseMethodsTests):
199-
def test_argsort(self, data_for_sorting):
200-
with tm.maybe_produces_warning(
201-
PerformanceWarning,
202-
pa_version_under7p0
203-
and getattr(data_for_sorting.dtype, "storage", "") == "pyarrow",
204-
check_stacklevel=False,
205-
):
206-
super().test_argsort(data_for_sorting)
198+
def test_value_counts_with_normalize(self, data):
199+
data = data[:10].unique()
200+
values = np.array(data[~data.isna()])
201+
ser = pd.Series(data, dtype=data.dtype)
207202

208-
def test_argsort_missing(self, data_missing_for_sorting):
209-
with tm.maybe_produces_warning(
210-
PerformanceWarning,
211-
pa_version_under7p0
212-
and getattr(data_missing_for_sorting.dtype, "storage", "") == "pyarrow",
213-
check_stacklevel=False,
214-
):
215-
super().test_argsort_missing(data_missing_for_sorting)
216-
217-
def test_argmin_argmax(
218-
self, data_for_sorting, data_missing_for_sorting, na_value, request
219-
):
220-
super().test_argmin_argmax(data_for_sorting, data_missing_for_sorting, na_value)
221-
222-
@pytest.mark.parametrize(
223-
"op_name, skipna, expected",
224-
[
225-
("idxmax", True, 0),
226-
("idxmin", True, 2),
227-
("argmax", True, 0),
228-
("argmin", True, 2),
229-
("idxmax", False, np.nan),
230-
("idxmin", False, np.nan),
231-
("argmax", False, -1),
232-
("argmin", False, -1),
233-
],
234-
)
235-
def test_argreduce_series(
236-
self, data_missing_for_sorting, op_name, skipna, expected, request
237-
):
238-
super().test_argreduce_series(
239-
data_missing_for_sorting, op_name, skipna, expected
240-
)
203+
result = ser.value_counts(normalize=True).sort_index()
241204

242-
@pytest.mark.parametrize("dropna", [True, False])
243-
def test_value_counts(self, all_data, dropna, request):
244-
all_data = all_data[:10]
245-
if dropna:
246-
other = all_data[~all_data.isna()]
205+
expected = pd.Series(
206+
[1 / len(values)] * len(values), index=result.index, name="proportion"
207+
)
208+
if getattr(data.dtype, "storage", "") == "pyarrow":
209+
expected = expected.astype("double[pyarrow]")
247210
else:
248-
other = all_data
249-
with tm.maybe_produces_warning(
250-
PerformanceWarning,
251-
pa_version_under7p0
252-
and getattr(all_data.dtype, "storage", "") == "pyarrow"
253-
and not (dropna and "data_missing" in request.node.nodeid),
254-
):
255-
result = pd.Series(all_data).value_counts(dropna=dropna).sort_index()
256-
with tm.maybe_produces_warning(
257-
PerformanceWarning,
258-
pa_version_under7p0
259-
and getattr(other.dtype, "storage", "") == "pyarrow"
260-
and not (dropna and "data_missing" in request.node.nodeid),
261-
):
262-
expected = pd.Series(other).value_counts(dropna=dropna).sort_index()
211+
expected = expected.astype("Float64")
263212

264213
self.assert_series_equal(result, expected)
265214

0 commit comments

Comments
 (0)