Skip to content

API: ArrowExtensionArray.value_counts returns pyarrow.int64 type #51542

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Feb 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -818,6 +818,7 @@ Other API changes
- :class:`DataFrame` and :class:`DataFrameGroupBy` aggregations (e.g. "sum") with object-dtype columns no longer infer non-object dtypes for their results, explicitly call ``result.infer_objects(copy=False)`` on the result to obtain the old behavior (:issue:`51205`, :issue:`49603`)
- Division by zero with :class:`ArrowDtype` dtypes returns ``-inf``, ``nan``, or ``inf`` depending on the numerator, instead of raising (:issue:`51541`)
- Added :func:`pandas.api.types.is_any_real_numeric_dtype` to check for real numeric dtypes (:issue:`51152`)
- :meth:`~arrays.ArrowExtensionArray.value_counts` now returns data with :class:`ArrowDtype` with ``pyarrow.int64`` type instead of ``"Int64"`` type (:issue:`51462`)

.. note::

Expand Down
3 changes: 3 additions & 0 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -873,6 +873,9 @@ def value_counts(
result.name = name
result.index.name = index_name
counts = result._values
if not isinstance(counts, np.ndarray):
# e.g. ArrowExtensionArray
counts = np.asarray(counts)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why is this needed? I'd expect pd.value_counts to behave like obj.value_counts

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For normalize below, counts.sum() is called, but ._values — which is an ArrowExtensionArray — doesn't have sum.

I think it was assumed that counts is a numpy array here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If EAs implement value_counts, should they also handle normalize?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it was assumed that counts is a numpy array here?

For a long time I wanted value_counts to have int64 dtype and get rid of Int64 cases, but eventually gave up on that. So i think at least Int64 is possible. IIRC value_counts is only kinda-sorta part of the EA interface so the exact requirements aren't pinned down.

If EA's implement value_counts they should also handle normalize?

If the normalization logic can live exclusively here, I think I'd prefer that.

which is an ArrowExtensionArray doesn't have sum.

Out of scope, but I'd just put sum and the other most-basic reductions (say, all the ones in the np.ndarray namespace) on EA and let it raise TypeError for non-supported dtypes.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would you prefer me to put sum on ArrowExtensionArray so this works without the if statement or should we leave in this check?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i guess cleaner to leave this and can do another api change later if called for


elif isinstance(values, ABCMultiIndex):
# GH49558
Expand Down
5 changes: 2 additions & 3 deletions pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -983,12 +983,11 @@ def value_counts(self, dropna: bool = True) -> Series:
if pa.types.is_duration(pa_type):
values = values.cast(pa_type)

# No missing values so we can adhere to the interface and return a numpy array.
counts = np.array(counts)
counts = ArrowExtensionArray(counts)

index = Index(type(self)(values))

return Series(counts, index=index, name="count").astype("Int64")
return Series(counts, index=index, name="count")

@classmethod
def _concat_same_type(
Expand Down
14 changes: 11 additions & 3 deletions pandas/tests/arrays/string_/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -453,20 +453,28 @@ def test_arrow_load_from_zero_chunks(dtype, string_storage2):


def test_value_counts_na(dtype):
if getattr(dtype, "storage", "") == "pyarrow":
exp_dtype = "int64[pyarrow]"
else:
exp_dtype = "Int64"
arr = pd.array(["a", "b", "a", pd.NA], dtype=dtype)
result = arr.value_counts(dropna=False)
expected = pd.Series([2, 1, 1], index=arr[[0, 1, 3]], dtype="Int64", name="count")
expected = pd.Series([2, 1, 1], index=arr[[0, 1, 3]], dtype=exp_dtype, name="count")
tm.assert_series_equal(result, expected)

result = arr.value_counts(dropna=True)
expected = pd.Series([2, 1], index=arr[:2], dtype="Int64", name="count")
expected = pd.Series([2, 1], index=arr[:2], dtype=exp_dtype, name="count")
tm.assert_series_equal(result, expected)


def test_value_counts_with_normalize(dtype):
if getattr(dtype, "storage", "") == "pyarrow":
exp_dtype = "double[pyarrow]"
else:
exp_dtype = "Float64"
ser = pd.Series(["a", "b", "a", pd.NA], dtype=dtype)
result = ser.value_counts(normalize=True)
expected = pd.Series([2, 1], index=ser[:2], dtype="Float64", name="proportion") / 3
expected = pd.Series([2, 1], index=ser[:2], dtype=exp_dtype, name="proportion") / 3
tm.assert_series_equal(result, expected)


Expand Down
14 changes: 10 additions & 4 deletions pandas/tests/base/test_value_counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,11 @@ def test_value_counts(index_or_series_obj):
expected.index.name = obj.name

if not isinstance(result.dtype, np.dtype):
# i.e IntegerDtype
expected = expected.astype("Int64")
if getattr(obj.dtype, "storage", "") == "pyarrow":
expected = expected.astype("int64[pyarrow]")
else:
# i.e IntegerDtype
expected = expected.astype("Int64")

# TODO(GH#32514): Order of entries with the same count is inconsistent
# on CI (gh-32449)
Expand Down Expand Up @@ -90,8 +93,11 @@ def test_value_counts_null(null_obj, index_or_series_obj):
result = result.sort_index()

if not isinstance(result.dtype, np.dtype):
# i.e IntegerDtype
expected = expected.astype("Int64")
if getattr(obj.dtype, "storage", "") == "pyarrow":
expected = expected.astype("int64[pyarrow]")
else:
# i.e IntegerDtype
expected = expected.astype("Int64")
tm.assert_series_equal(result, expected)

expected[null_obj] = 3
Expand Down
22 changes: 19 additions & 3 deletions pandas/tests/extension/test_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -785,9 +785,25 @@ def test_diff(self, data, periods, request):
)
super().test_diff(data, periods)

@pytest.mark.parametrize("dropna", [True, False])
def test_value_counts(self, all_data, dropna, request):
super().test_value_counts(all_data, dropna)
def test_value_counts_returns_pyarrow_int64(self, data):
    # GH 51462: the counts should come back as ArrowDtype(pyarrow.int64),
    # not the masked "Int64" dtype.
    subset = data[:10]
    counts = subset.value_counts()
    assert counts.dtype == ArrowDtype(pa.int64())

def test_value_counts_with_normalize(self, data, request):
    # After unique(), every distinct non-NA value occurs exactly once, so
    # each normalized count must be 1/n — in double[pyarrow] dtype.
    uniques = data[:10].unique()
    non_na = np.array(uniques[~uniques.isna()])
    ser = pd.Series(uniques, dtype=uniques.dtype)

    result = ser.value_counts(normalize=True).sort_index()

    n = len(non_na)
    expected = pd.Series(
        [1 / n] * n, index=result.index, name="proportion"
    ).astype("double[pyarrow]")

    self.assert_series_equal(result, expected)

def test_argmin_argmax(
self, data_for_sorting, data_missing_for_sorting, na_value, request
Expand Down
73 changes: 11 additions & 62 deletions pandas/tests/extension/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
import numpy as np
import pytest

from pandas.compat import pa_version_under7p0
from pandas.errors import PerformanceWarning

import pandas as pd
Expand Down Expand Up @@ -196,70 +195,20 @@ def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna):


class TestMethods(base.BaseMethodsTests):
def test_argsort(self, data_for_sorting):
with tm.maybe_produces_warning(
PerformanceWarning,
pa_version_under7p0
and getattr(data_for_sorting.dtype, "storage", "") == "pyarrow",
check_stacklevel=False,
):
super().test_argsort(data_for_sorting)
def test_value_counts_with_normalize(self, data):
data = data[:10].unique()
values = np.array(data[~data.isna()])
ser = pd.Series(data, dtype=data.dtype)

def test_argsort_missing(self, data_missing_for_sorting):
with tm.maybe_produces_warning(
PerformanceWarning,
pa_version_under7p0
and getattr(data_missing_for_sorting.dtype, "storage", "") == "pyarrow",
check_stacklevel=False,
):
super().test_argsort_missing(data_missing_for_sorting)

def test_argmin_argmax(
self, data_for_sorting, data_missing_for_sorting, na_value, request
):
super().test_argmin_argmax(data_for_sorting, data_missing_for_sorting, na_value)

@pytest.mark.parametrize(
"op_name, skipna, expected",
[
("idxmax", True, 0),
("idxmin", True, 2),
("argmax", True, 0),
("argmin", True, 2),
("idxmax", False, np.nan),
("idxmin", False, np.nan),
("argmax", False, -1),
("argmin", False, -1),
],
)
def test_argreduce_series(
self, data_missing_for_sorting, op_name, skipna, expected, request
):
super().test_argreduce_series(
data_missing_for_sorting, op_name, skipna, expected
)
result = ser.value_counts(normalize=True).sort_index()

@pytest.mark.parametrize("dropna", [True, False])
def test_value_counts(self, all_data, dropna, request):
all_data = all_data[:10]
if dropna:
other = all_data[~all_data.isna()]
expected = pd.Series(
[1 / len(values)] * len(values), index=result.index, name="proportion"
)
if getattr(data.dtype, "storage", "") == "pyarrow":
expected = expected.astype("double[pyarrow]")
else:
other = all_data
with tm.maybe_produces_warning(
PerformanceWarning,
pa_version_under7p0
and getattr(all_data.dtype, "storage", "") == "pyarrow"
and not (dropna and "data_missing" in request.node.nodeid),
):
result = pd.Series(all_data).value_counts(dropna=dropna).sort_index()
with tm.maybe_produces_warning(
PerformanceWarning,
pa_version_under7p0
and getattr(other.dtype, "storage", "") == "pyarrow"
and not (dropna and "data_missing" in request.node.nodeid),
):
expected = pd.Series(other).value_counts(dropna=dropna).sort_index()
expected = expected.astype("Float64")

self.assert_series_equal(result, expected)

Expand Down