Skip to content

Backport PR #51542 on branch 2.0.x (API: ArrowExtensionArray.value_counts returns pyarrow.int64 type) #51620

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -818,6 +818,7 @@ Other API changes
- :class:`DataFrame` and :class:`DataFrameGroupBy` aggregations (e.g. "sum") with object-dtype columns no longer infer non-object dtypes for their results, explicitly call ``result.infer_objects(copy=False)`` on the result to obtain the old behavior (:issue:`51205`, :issue:`49603`)
- Division by zero with :class:`ArrowDtype` dtypes returns ``-inf``, ``nan``, or ``inf`` depending on the numerator, instead of raising (:issue:`51541`)
- Added :func:`pandas.api.types.is_any_real_numeric_dtype` to check for real numeric dtypes (:issue:`51152`)
- :meth:`~arrays.ArrowExtensionArray.value_counts` now returns data with :class:`ArrowDtype` with ``pyarrow.int64`` type instead of ``"Int64"`` type (:issue:`51462`)

.. note::

Expand Down
3 changes: 3 additions & 0 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -873,6 +873,9 @@ def value_counts(
result.name = name
result.index.name = index_name
counts = result._values
if not isinstance(counts, np.ndarray):
# e.g. ArrowExtensionArray
counts = np.asarray(counts)

elif isinstance(values, ABCMultiIndex):
# GH49558
Expand Down
5 changes: 2 additions & 3 deletions pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -983,12 +983,11 @@ def value_counts(self, dropna: bool = True) -> Series:
if pa.types.is_duration(pa_type):
values = values.cast(pa_type)

# No missing values so we can adhere to the interface and return a numpy array.
counts = np.array(counts)
counts = ArrowExtensionArray(counts)

index = Index(type(self)(values))

return Series(counts, index=index, name="count").astype("Int64")
return Series(counts, index=index, name="count")

@classmethod
def _concat_same_type(
Expand Down
14 changes: 11 additions & 3 deletions pandas/tests/arrays/string_/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -453,20 +453,28 @@ def test_arrow_load_from_zero_chunks(dtype, string_storage2):


def test_value_counts_na(dtype):
if getattr(dtype, "storage", "") == "pyarrow":
exp_dtype = "int64[pyarrow]"
else:
exp_dtype = "Int64"
arr = pd.array(["a", "b", "a", pd.NA], dtype=dtype)
result = arr.value_counts(dropna=False)
expected = pd.Series([2, 1, 1], index=arr[[0, 1, 3]], dtype="Int64", name="count")
expected = pd.Series([2, 1, 1], index=arr[[0, 1, 3]], dtype=exp_dtype, name="count")
tm.assert_series_equal(result, expected)

result = arr.value_counts(dropna=True)
expected = pd.Series([2, 1], index=arr[:2], dtype="Int64", name="count")
expected = pd.Series([2, 1], index=arr[:2], dtype=exp_dtype, name="count")
tm.assert_series_equal(result, expected)


def test_value_counts_with_normalize(dtype):
if getattr(dtype, "storage", "") == "pyarrow":
exp_dtype = "double[pyarrow]"
else:
exp_dtype = "Float64"
ser = pd.Series(["a", "b", "a", pd.NA], dtype=dtype)
result = ser.value_counts(normalize=True)
expected = pd.Series([2, 1], index=ser[:2], dtype="Float64", name="proportion") / 3
expected = pd.Series([2, 1], index=ser[:2], dtype=exp_dtype, name="proportion") / 3
tm.assert_series_equal(result, expected)


Expand Down
14 changes: 10 additions & 4 deletions pandas/tests/base/test_value_counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,11 @@ def test_value_counts(index_or_series_obj):
expected.index.name = obj.name

if not isinstance(result.dtype, np.dtype):
# i.e IntegerDtype
expected = expected.astype("Int64")
if getattr(obj.dtype, "storage", "") == "pyarrow":
expected = expected.astype("int64[pyarrow]")
else:
# i.e IntegerDtype
expected = expected.astype("Int64")

# TODO(GH#32514): Order of entries with the same count is inconsistent
# on CI (gh-32449)
Expand Down Expand Up @@ -90,8 +93,11 @@ def test_value_counts_null(null_obj, index_or_series_obj):
result = result.sort_index()

if not isinstance(result.dtype, np.dtype):
# i.e IntegerDtype
expected = expected.astype("Int64")
if getattr(obj.dtype, "storage", "") == "pyarrow":
expected = expected.astype("int64[pyarrow]")
else:
# i.e IntegerDtype
expected = expected.astype("Int64")
tm.assert_series_equal(result, expected)

expected[null_obj] = 3
Expand Down
22 changes: 19 additions & 3 deletions pandas/tests/extension/test_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -793,9 +793,25 @@ def test_diff(self, data, periods, request):
)
super().test_diff(data, periods)

@pytest.mark.parametrize("dropna", [True, False])
def test_value_counts(self, all_data, dropna, request):
super().test_value_counts(all_data, dropna)
def test_value_counts_returns_pyarrow_int64(self, data):
# GH 51462
data = data[:10]
result = data.value_counts()
assert result.dtype == ArrowDtype(pa.int64())

def test_value_counts_with_normalize(self, data, request):
data = data[:10].unique()
values = np.array(data[~data.isna()])
ser = pd.Series(data, dtype=data.dtype)

result = ser.value_counts(normalize=True).sort_index()

expected = pd.Series(
[1 / len(values)] * len(values), index=result.index, name="proportion"
)
expected = expected.astype("double[pyarrow]")

self.assert_series_equal(result, expected)

def test_argmin_argmax(
self, data_for_sorting, data_missing_for_sorting, na_value, request
Expand Down
73 changes: 11 additions & 62 deletions pandas/tests/extension/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
import numpy as np
import pytest

from pandas.compat import pa_version_under7p0
from pandas.errors import PerformanceWarning

import pandas as pd
Expand Down Expand Up @@ -196,70 +195,20 @@ def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna):


class TestMethods(base.BaseMethodsTests):
def test_argsort(self, data_for_sorting):
with tm.maybe_produces_warning(
PerformanceWarning,
pa_version_under7p0
and getattr(data_for_sorting.dtype, "storage", "") == "pyarrow",
check_stacklevel=False,
):
super().test_argsort(data_for_sorting)
def test_value_counts_with_normalize(self, data):
data = data[:10].unique()
values = np.array(data[~data.isna()])
ser = pd.Series(data, dtype=data.dtype)

def test_argsort_missing(self, data_missing_for_sorting):
with tm.maybe_produces_warning(
PerformanceWarning,
pa_version_under7p0
and getattr(data_missing_for_sorting.dtype, "storage", "") == "pyarrow",
check_stacklevel=False,
):
super().test_argsort_missing(data_missing_for_sorting)

def test_argmin_argmax(
self, data_for_sorting, data_missing_for_sorting, na_value, request
):
super().test_argmin_argmax(data_for_sorting, data_missing_for_sorting, na_value)

@pytest.mark.parametrize(
"op_name, skipna, expected",
[
("idxmax", True, 0),
("idxmin", True, 2),
("argmax", True, 0),
("argmin", True, 2),
("idxmax", False, np.nan),
("idxmin", False, np.nan),
("argmax", False, -1),
("argmin", False, -1),
],
)
def test_argreduce_series(
self, data_missing_for_sorting, op_name, skipna, expected, request
):
super().test_argreduce_series(
data_missing_for_sorting, op_name, skipna, expected
)
result = ser.value_counts(normalize=True).sort_index()

@pytest.mark.parametrize("dropna", [True, False])
def test_value_counts(self, all_data, dropna, request):
all_data = all_data[:10]
if dropna:
other = all_data[~all_data.isna()]
expected = pd.Series(
[1 / len(values)] * len(values), index=result.index, name="proportion"
)
if getattr(data.dtype, "storage", "") == "pyarrow":
expected = expected.astype("double[pyarrow]")
else:
other = all_data
with tm.maybe_produces_warning(
PerformanceWarning,
pa_version_under7p0
and getattr(all_data.dtype, "storage", "") == "pyarrow"
and not (dropna and "data_missing" in request.node.nodeid),
):
result = pd.Series(all_data).value_counts(dropna=dropna).sort_index()
with tm.maybe_produces_warning(
PerformanceWarning,
pa_version_under7p0
and getattr(other.dtype, "storage", "") == "pyarrow"
and not (dropna and "data_missing" in request.node.nodeid),
):
expected = pd.Series(other).value_counts(dropna=dropna).sort_index()
expected = expected.astype("Float64")

self.assert_series_equal(result, expected)

Expand Down