TST: Split and simplify test_value_counts_unique_nunique #32281

Merged
282 changes: 126 additions & 156 deletions pandas/tests/base/test_ops.py
@@ -1,3 +1,4 @@
import collections
from datetime import datetime, timedelta
from io import StringIO
import sys
@@ -15,7 +16,6 @@
is_datetime64_dtype,
is_datetime64tz_dtype,
is_object_dtype,
is_period_dtype,
needs_i8_conversion,
)

@@ -26,11 +26,9 @@
Index,
Interval,
IntervalIndex,
PeriodIndex,
Series,
Timedelta,
TimedeltaIndex,
Timestamp,
)
import pandas._testing as tm

@@ -207,180 +205,152 @@ def test_ndarray_compat_properties(self, index_or_series_obj):
assert Index([1]).item() == 1
assert Series([1]).item() == 1

def test_value_counts_unique_nunique(self, index_or_series_obj):
orig = index_or_series_obj
obj = orig.copy()
klass = type(obj)
values = obj._values

if orig.duplicated().any():
pytest.xfail(
"The test implementation isn't flexible enough to deal "
"with duplicated values. This isn't a bug in the "
"application code, but in the test code."
)
def test_unique(self, index_or_series_obj):
obj = index_or_series_obj
obj = np.repeat(obj, range(1, len(obj) + 1))
result = obj.unique()

# create repeated values, 'n'th element is repeated by n+1 times
if isinstance(obj, Index):
expected_index = Index(obj[::-1])
expected_index.name = None
obj = obj.repeat(range(1, len(obj) + 1))
# dict.fromkeys preserves the order
unique_values = list(dict.fromkeys(obj.values))
if isinstance(obj, pd.MultiIndex):
expected = pd.MultiIndex.from_tuples(unique_values)
expected.names = obj.names
tm.assert_index_equal(result, expected)
elif isinstance(obj, pd.Index):
expected = pd.Index(unique_values, dtype=obj.dtype)
if is_datetime64tz_dtype(obj):
expected = expected.normalize()
tm.assert_index_equal(result, expected)
else:
expected_index = Index(values[::-1])
idx = obj.index.repeat(range(1, len(obj) + 1))
# take-based repeat
indices = np.repeat(np.arange(len(obj)), range(1, len(obj) + 1))
rep = values.take(indices)
obj = klass(rep, index=idx)

# check values has the same dtype as the original
assert obj.dtype == orig.dtype

expected_s = Series(
range(len(orig), 0, -1), index=expected_index, dtype="int64"
)
expected = np.array(unique_values)
tm.assert_numpy_array_equal(result, expected)

result = obj.value_counts()
tm.assert_series_equal(result, expected_s)
assert result.index.name is None
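
The dict.fromkeys trick in the new test_unique above builds the expected uniques while keeping first-seen order (dicts preserve insertion order in Python 3.7+), which a plain set would not guarantee. A minimal illustration with made-up values:

values = ["b", "a", "b", "c", "a"]
ordered_uniques = list(dict.fromkeys(values))  # keeps first-seen order
assert ordered_uniques == ["b", "a", "c"]
assert set(values) == {"a", "b", "c"}  # a set gives no ordering guarantee
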
@pytest.mark.parametrize("null_obj", [np.nan, None])
Contributor:
can you use nulls_fixture

SaturnFromTitan (Contributor Author), Mar 3, 2020:
I tried using nulls_fixture and unique_nulls_fixture. Nearly all configurations break for pd.NaT and pd.NA though...

Contributor:
Ok, yeah, we need to test these; can you create an issue? We will want to add these even if they need xfailing for now, as there is no testing on them.

SaturnFromTitan (Contributor Author):
xref: #32437
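
For context on the thread above: nulls_fixture is defined in pandas' top-level conftest and parametrizes a test over the supported null values (np.nan, None, pd.NaT, pd.NA, ...). A rough sketch of what the suggested change could look like, with the NaT/NA cases xfailed for now (the xfail condition is illustrative, not part of this PR):

import pandas as pd
import pytest

def test_unique_null(self, nulls_fixture, index_or_series_obj):
    # hypothetical variant: nulls_fixture replaces @pytest.mark.parametrize("null_obj", [np.nan, None])
    null_obj = nulls_fixture
    if null_obj is pd.NaT or null_obj is pd.NA:
        # most configurations currently break for these; tracked in GH #32437
        pytest.xfail("pd.NaT / pd.NA cases not handled yet")
    # ... rest of the test body unchanged
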

def test_unique_null(self, null_obj, index_or_series_obj):
obj = index_or_series_obj

if not allow_na_ops(obj):
pytest.skip("type doesn't allow for NA operations")
elif len(obj) < 1:
pytest.skip("Test doesn't make sense on empty data")
elif isinstance(obj, pd.MultiIndex):
pytest.skip(f"MultiIndex can't hold '{null_obj}'")

values = obj.values
if needs_i8_conversion(obj):
values[0:2] = iNaT
else:
values[0:2] = null_obj

klass = type(obj)
repeated_values = np.repeat(values, range(1, len(values) + 1))
obj = klass(repeated_values, dtype=obj.dtype)
result = obj.unique()
if isinstance(obj, Index):
assert isinstance(result, type(obj))
tm.assert_index_equal(result, orig)
assert result.dtype == orig.dtype
elif is_datetime64tz_dtype(obj):
# datetimetz Series returns array of Timestamp
assert result[0] == orig[0]
for r in result:
assert isinstance(r, Timestamp)

tm.assert_numpy_array_equal(
result.astype(object), orig._values.astype(object)
)

unique_values_raw = dict.fromkeys(obj.values)
# because np.nan == np.nan is False, but None == None is True
# np.nan would be duplicated, whereas None wouldn't
unique_values_not_null = [
val for val in unique_values_raw if not pd.isnull(val)
]
unique_values = [null_obj] + unique_values_not_null

if isinstance(obj, pd.Index):
expected = pd.Index(unique_values, dtype=obj.dtype)
if is_datetime64tz_dtype(obj):
result = result.normalize()
expected = expected.normalize()
elif isinstance(obj, pd.CategoricalIndex):
expected = expected.set_categories(unique_values_not_null)
tm.assert_index_equal(result, expected)
else:
tm.assert_numpy_array_equal(result, orig.values)
assert result.dtype == orig.dtype
expected = np.array(unique_values, dtype=obj.dtype)
tm.assert_numpy_array_equal(result, expected)

# dropna=True would break for MultiIndex
assert obj.nunique(dropna=False) == len(np.unique(obj.values))
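
The filtering into unique_values_not_null above is needed because NaN does not compare equal to itself while None is a singleton, so a NaN read back out of a numpy array can appear more than once among the dict.fromkeys keys, whereas None cannot. A small illustration of that asymmetry:

nan_keys = dict.fromkeys([float("nan"), float("nan")])
none_keys = dict.fromkeys([None, None])
assert len(nan_keys) == 2   # two distinct NaN objects stay as separate keys
assert len(none_keys) == 1  # None deduplicates because it is a singleton
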
def test_nunique(self, index_or_series_obj):
obj = index_or_series_obj
obj = np.repeat(obj, range(1, len(obj) + 1))
expected = len(obj.unique())
assert obj.nunique(dropna=False) == expected

@pytest.mark.parametrize("null_obj", [np.nan, None])
def test_value_counts_unique_nunique_null(self, null_obj, index_or_series_obj):
orig = index_or_series_obj
obj = orig.copy()
klass = type(obj)
values = obj._ndarray_values
num_values = len(orig)
def test_nunique_null(self, null_obj, index_or_series_obj):
Contributor:
same as above (nulls_fixture)

obj = index_or_series_obj

if not allow_na_ops(obj):
pytest.skip("type doesn't allow for NA operations")
elif isinstance(orig, (pd.CategoricalIndex, pd.IntervalIndex)):
pytest.skip(f"values of {klass} cannot be changed")
elif isinstance(orig, pd.MultiIndex):
pytest.skip("MultiIndex doesn't support isna")
elif orig.duplicated().any():
pytest.xfail(
"The test implementation isn't flexible enough to deal "
"with duplicated values. This isn't a bug in the "
"application code, but in the test code."
)

# special assign to the numpy array
if is_datetime64tz_dtype(obj):
if isinstance(obj, DatetimeIndex):
v = obj.asi8
v[0:2] = iNaT
values = obj._shallow_copy(v)
else:
obj = obj.copy()
obj[0:2] = pd.NaT
values = obj._values
elif isinstance(obj, pd.MultiIndex):
pytest.skip(f"MultiIndex can't hold '{null_obj}'")

elif is_period_dtype(obj):
values[0:2] = iNaT
parr = type(obj._data)(values, dtype=obj.dtype)
values = obj._shallow_copy(parr)
elif needs_i8_conversion(obj):
values = obj.values
if needs_i8_conversion(obj):
values[0:2] = iNaT
values = obj._shallow_copy(values)
else:
values[0:2] = null_obj

# check values has the same dtype as the original
assert values.dtype == obj.dtype

# create repeated values, 'n'th element is repeated by n+1
# times
if isinstance(obj, (DatetimeIndex, PeriodIndex)):
expected_index = obj.copy()
expected_index.name = None
klass = type(obj)
repeated_values = np.repeat(values, range(1, len(values) + 1))
obj = klass(repeated_values, dtype=obj.dtype)

# attach name to klass
obj = klass(values.repeat(range(1, len(obj) + 1)))
obj.name = "a"
else:
if isinstance(obj, DatetimeIndex):
expected_index = orig._values._shallow_copy(values)
else:
expected_index = Index(values)
expected_index.name = None
obj = obj.repeat(range(1, len(obj) + 1))
obj.name = "a"

# check values has the same dtype as the original
assert obj.dtype == orig.dtype

# check values correctly have NaN
nanloc = np.zeros(len(obj), dtype=np.bool)
nanloc[:3] = True
if isinstance(obj, Index):
tm.assert_numpy_array_equal(pd.isna(obj), nanloc)
if isinstance(obj, pd.CategoricalIndex):
assert obj.nunique() == len(obj.categories)
assert obj.nunique(dropna=False) == len(obj.categories) + 1
else:
exp = Series(nanloc, obj.index, name="a")
tm.assert_series_equal(pd.isna(obj), exp)

expected_data = list(range(num_values, 2, -1))
expected_data_na = expected_data.copy()
if expected_data_na:
expected_data_na.append(3)
expected_s_na = Series(
expected_data_na,
index=expected_index[num_values - 1 : 0 : -1],
dtype="int64",
name="a",
)
expected_s = Series(
expected_data,
index=expected_index[num_values - 1 : 1 : -1],
dtype="int64",
name="a",
)
num_unique_values = len(obj.unique())
assert obj.nunique() == max(0, num_unique_values - 1)
assert obj.nunique(dropna=False) == max(0, num_unique_values)

result_s_na = obj.value_counts(dropna=False)
tm.assert_series_equal(result_s_na, expected_s_na)
assert result_s_na.index.name is None
assert result_s_na.name == "a"
result_s = obj.value_counts()
tm.assert_series_equal(obj.value_counts(), expected_s)
assert result_s.index.name is None
assert result_s.name == "a"
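
The new test_nunique_null asserts that nunique() ignores the injected null while nunique(dropna=False) counts it, with CategoricalIndex special-cased because its categories never include the null. The dropna behaviour on a plain Series, as a quick sanity check:

import numpy as np
import pandas as pd

ser = pd.Series([1.0, 1.0, 2.0, np.nan])
assert ser.nunique() == 2              # NaN dropped by default
assert ser.nunique(dropna=False) == 3  # NaN counted as its own value
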
def test_value_counts(self, index_or_series_obj):
obj = index_or_series_obj
obj = np.repeat(obj, range(1, len(obj) + 1))
result = obj.value_counts()

result = obj.unique()
if isinstance(obj, Index):
tm.assert_index_equal(result, Index(values[1:], name="a"))
elif is_datetime64tz_dtype(obj):
# unable to compare NaT / nan
tm.assert_extension_array_equal(result[1:], values[2:])
assert result[0] is pd.NaT
elif len(obj) > 0:
tm.assert_numpy_array_equal(result[1:], values[2:])

assert pd.isna(result[0])
assert result.dtype == orig.dtype

assert obj.nunique() == max(0, num_values - 2)
assert obj.nunique(dropna=False) == max(0, num_values - 1)
counter = collections.Counter(obj)
expected = pd.Series(dict(counter.most_common()), dtype=np.int64, name=obj.name)
expected.index = expected.index.astype(obj.dtype)
if isinstance(obj, pd.MultiIndex):
expected.index = pd.Index(expected.index)

# sort_index to avoid switched order when values share the same count
result = result.sort_index()
expected = expected.sort_index()
tm.assert_series_equal(result, expected)
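
test_value_counts builds its expectation independently of pandas by counting with collections.Counter, and both sides are sorted by index because value_counts only orders by count, so ties can come back in either order. The same pattern on a toy Series (toy data, not from the test suite):

import collections
import pandas as pd

ser = pd.Series(["a", "b", "b", "c", "c"])
result = ser.value_counts().sort_index()
expected = pd.Series(dict(collections.Counter(ser).most_common()), dtype="int64").sort_index()
pd.testing.assert_series_equal(result, expected, check_names=False)
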

@pytest.mark.parametrize("null_obj", [np.nan, None])
def test_value_counts_null(self, null_obj, index_or_series_obj):
orig = index_or_series_obj
obj = orig.copy()

if not allow_na_ops(obj):
pytest.skip("type doesn't allow for NA operations")
elif len(obj) < 1:
pytest.skip("Test doesn't make sense on empty data")
elif isinstance(orig, pd.MultiIndex):
pytest.skip(f"MultiIndex can't hold '{null_obj}'")

values = obj.values
if needs_i8_conversion(obj):
values[0:2] = iNaT
else:
values[0:2] = null_obj

klass = type(obj)
repeated_values = np.repeat(values, range(1, len(values) + 1))
obj = klass(repeated_values, dtype=obj.dtype)

# because np.nan == np.nan is False, but None == None is True
# np.nan would be duplicated, whereas None wouldn't
counter = collections.Counter(obj.dropna())
expected = pd.Series(dict(counter.most_common()), dtype=np.int64)
expected.index = expected.index.astype(obj.dtype)

tm.assert_series_equal(obj.value_counts(), expected)

# can't use expected[null_obj] = 3 as
# IntervalIndex doesn't allow assignment
new_entry = pd.Series({np.nan: 3}, dtype=np.int64)
expected = expected.append(new_entry)
tm.assert_series_equal(obj.value_counts(dropna=False), expected)
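
The dropna=False branch above appends the null row to the expectation with Series.append because item assignment such as expected[null_obj] = 3 is not possible for every index type (IntervalIndex, for example). The behaviour being asserted, reduced to a minimal example:

import numpy as np
import pandas as pd

ser = pd.Series([1.0, 1.0, np.nan, np.nan, np.nan])
assert ser.value_counts().to_dict() == {1.0: 2}   # NaN is dropped by default
with_na = ser.value_counts(dropna=False)
assert pd.isna(with_na.index).any()               # NaN gets its own row
assert with_na.sum() == len(ser)
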

def test_value_counts_inferred(self, index_or_series):
klass = index_or_series