Skip to content

TST: Split and simplify test_value_counts_unique_nunique #32281

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
100 changes: 49 additions & 51 deletions pandas/tests/base/test_ops.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import collections
from datetime import datetime, timedelta
from io import StringIO
import sys
Expand Down Expand Up @@ -28,7 +29,6 @@
Series,
Timedelta,
TimedeltaIndex,
Timestamp,
)
import pandas._testing as tm

Expand All @@ -39,6 +39,23 @@ def allow_na_ops(obj: Any) -> bool:
return not is_bool_index and obj._can_hold_na


def repeat_values(obj):
    """
    Repeat the entries of ``obj`` so that the entry at position ``i``
    appears ``i + 1`` times; occurrence counts therefore increase along
    the original ordering.

    Parameters
    ----------
    obj : pd.Index or pd.Series
        Object whose entries are repeated.

    Returns
    -------
    pd.Index or pd.Series
        For an Index, the result of ``obj.repeat``. For a Series, a new
        object of ``type(obj)`` built from the repeated values with the
        index labels repeated alongside them.

    Raises
    ------
    TypeError
        If ``obj`` is neither a pd.Index nor a pd.Series.
    """
    klass = type(obj)

    # Reject unsupported inputs up front, before anything calls len() on them
    if not isinstance(obj, (pd.Index, pd.Series)):
        raise TypeError(f"Unexpected type: {klass}")

    # position i -> i + 1 copies
    counts = range(1, len(obj) + 1)

    if isinstance(obj, pd.Index):
        return obj.repeat(counts)

    # Series: expand the positions, then gather the values via take
    positions = np.repeat(np.arange(len(obj)), counts)
    repeated_values = obj.values.take(positions)
    repeated_index = obj.index.repeat(counts)
    return klass(repeated_values, index=repeated_index)


class Ops:
def setup_method(self, method):
self.bool_index = tm.makeBoolIndex(10, name="a")
Expand Down Expand Up @@ -205,63 +222,44 @@ def test_ndarray_compat_properties(self, index_or_series_obj):
assert Index([1]).item() == 1
assert Series([1]).item() == 1

def test_value_counts_unique_nunique(self, index_or_series_obj):
orig = index_or_series_obj
obj = orig.copy()
klass = type(obj)
values = obj._values

if orig.duplicated().any():
pytest.xfail(
"The test implementation isn't flexible enough to deal"
" with duplicated values. This isn't a bug in the"
" application code, but in the test code."
)
def test_unique(self, index_or_series_obj):
obj = repeat_values(index_or_series_obj)
result = obj.unique()

# create repeated values, 'n'th element is repeated by n+1 times
if isinstance(obj, Index):
expected_index = Index(obj[::-1])
expected_index.name = None
obj = obj.repeat(range(1, len(obj) + 1))
# dict.fromkeys preserves the order
unique_values = list(dict.fromkeys(obj.values))
if isinstance(obj, pd.MultiIndex):
expected = pd.MultiIndex.from_tuples(unique_values)
expected.names = obj.names
tm.assert_index_equal(result, expected)
elif isinstance(obj, pd.Index):
expected = pd.Index(unique_values, dtype=obj.dtype)
if is_datetime64tz_dtype(obj):
expected = expected.normalize()
tm.assert_index_equal(result, expected)
else:
expected_index = Index(values[::-1])
idx = obj.index.repeat(range(1, len(obj) + 1))
# take-based repeat
indices = np.repeat(np.arange(len(obj)), range(1, len(obj) + 1))
rep = values.take(indices)
obj = klass(rep, index=idx)

# check values has the same dtype as the original
assert obj.dtype == orig.dtype
expected = np.array(unique_values)
tm.assert_numpy_array_equal(result, expected)

expected_s = Series(
range(len(orig), 0, -1), index=expected_index, dtype="int64"
)
def test_nunique(self, index_or_series_obj):
    """
    Check that ``nunique(dropna=False)`` on the repeated fixture object
    equals the number of entries returned by ``unique()``.
    """
    repeated = repeat_values(index_or_series_obj)
    expected_count = len(repeated.unique())
    assert repeated.nunique(dropna=False) == expected_count

def test_value_counts(self, index_or_series_obj):
obj = repeat_values(index_or_series_obj)
result = obj.value_counts()
tm.assert_series_equal(result, expected_s)
assert result.index.name is None

result = obj.unique()
if isinstance(obj, Index):
assert isinstance(result, type(obj))
tm.assert_index_equal(result, orig)
assert result.dtype == orig.dtype
elif is_datetime64tz_dtype(obj):
# datetimetz Series returns array of Timestamp
assert result[0] == orig[0]
for r in result:
assert isinstance(r, Timestamp)

tm.assert_numpy_array_equal(
result.astype(object), orig._values.astype(object)
)
else:
tm.assert_numpy_array_equal(result, orig.values)
assert result.dtype == orig.dtype
counter = collections.Counter(obj)
expected = pd.Series(dict(counter.most_common()), dtype=np.int64)
expected.index = expected.index.astype(obj.dtype)
if isinstance(obj, pd.MultiIndex):
expected.index = pd.Index(expected.index)

# dropna=True would break for MultiIndex
assert obj.nunique(dropna=False) == len(np.unique(obj.values))
# sort_index to avoid switched order when values share the same count
result = result.sort_index()
expected = expected.sort_index()
tm.assert_series_equal(result, expected)

@pytest.mark.parametrize("null_obj", [np.nan, None])
def test_value_counts_unique_nunique_null(self, null_obj, index_or_series_obj):
Expand Down