Skip to content

Commit 0d04683

Browse files
TST: Split and simplify test_value_counts_unique_nunique (#32281)
1 parent c5f0ebf commit 0d04683

File tree

1 file changed

+126
-156
lines changed

1 file changed

+126
-156
lines changed

pandas/tests/base/test_ops.py

+126-156
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import collections
12
from datetime import datetime, timedelta
23
from io import StringIO
34
import sys
@@ -15,7 +16,6 @@
1516
is_datetime64_dtype,
1617
is_datetime64tz_dtype,
1718
is_object_dtype,
18-
is_period_dtype,
1919
needs_i8_conversion,
2020
)
2121

@@ -26,11 +26,9 @@
2626
Index,
2727
Interval,
2828
IntervalIndex,
29-
PeriodIndex,
3029
Series,
3130
Timedelta,
3231
TimedeltaIndex,
33-
Timestamp,
3432
)
3533
import pandas._testing as tm
3634

@@ -207,180 +205,152 @@ def test_ndarray_compat_properties(self, index_or_series_obj):
207205
assert Index([1]).item() == 1
208206
assert Series([1]).item() == 1
209207

210-
def test_value_counts_unique_nunique(self, index_or_series_obj):
211-
orig = index_or_series_obj
212-
obj = orig.copy()
213-
klass = type(obj)
214-
values = obj._values
215-
216-
if orig.duplicated().any():
217-
pytest.xfail(
218-
"The test implementation isn't flexible enough to deal "
219-
"with duplicated values. This isn't a bug in the "
220-
"application code, but in the test code."
221-
)
208+
def test_unique(self, index_or_series_obj):
209+
obj = index_or_series_obj
210+
obj = np.repeat(obj, range(1, len(obj) + 1))
211+
result = obj.unique()
222212

223-
# create repeated values, 'n'th element is repeated by n+1 times
224-
if isinstance(obj, Index):
225-
expected_index = Index(obj[::-1])
226-
expected_index.name = None
227-
obj = obj.repeat(range(1, len(obj) + 1))
213+
# dict.fromkeys preserves the order
214+
unique_values = list(dict.fromkeys(obj.values))
215+
if isinstance(obj, pd.MultiIndex):
216+
expected = pd.MultiIndex.from_tuples(unique_values)
217+
expected.names = obj.names
218+
tm.assert_index_equal(result, expected)
219+
elif isinstance(obj, pd.Index):
220+
expected = pd.Index(unique_values, dtype=obj.dtype)
221+
if is_datetime64tz_dtype(obj):
222+
expected = expected.normalize()
223+
tm.assert_index_equal(result, expected)
228224
else:
229-
expected_index = Index(values[::-1])
230-
idx = obj.index.repeat(range(1, len(obj) + 1))
231-
# take-based repeat
232-
indices = np.repeat(np.arange(len(obj)), range(1, len(obj) + 1))
233-
rep = values.take(indices)
234-
obj = klass(rep, index=idx)
235-
236-
# check values has the same dtype as the original
237-
assert obj.dtype == orig.dtype
238-
239-
expected_s = Series(
240-
range(len(orig), 0, -1), index=expected_index, dtype="int64"
241-
)
225+
expected = np.array(unique_values)
226+
tm.assert_numpy_array_equal(result, expected)
242227

243-
result = obj.value_counts()
244-
tm.assert_series_equal(result, expected_s)
245-
assert result.index.name is None
228+
@pytest.mark.parametrize("null_obj", [np.nan, None])
229+
def test_unique_null(self, null_obj, index_or_series_obj):
230+
obj = index_or_series_obj
231+
232+
if not allow_na_ops(obj):
233+
pytest.skip("type doesn't allow for NA operations")
234+
elif len(obj) < 1:
235+
pytest.skip("Test doesn't make sense on empty data")
236+
elif isinstance(obj, pd.MultiIndex):
237+
pytest.skip(f"MultiIndex can't hold '{null_obj}'")
238+
239+
values = obj.values
240+
if needs_i8_conversion(obj):
241+
values[0:2] = iNaT
242+
else:
243+
values[0:2] = null_obj
246244

245+
klass = type(obj)
246+
repeated_values = np.repeat(values, range(1, len(values) + 1))
247+
obj = klass(repeated_values, dtype=obj.dtype)
247248
result = obj.unique()
248-
if isinstance(obj, Index):
249-
assert isinstance(result, type(obj))
250-
tm.assert_index_equal(result, orig)
251-
assert result.dtype == orig.dtype
252-
elif is_datetime64tz_dtype(obj):
253-
# datetimetz Series returns array of Timestamp
254-
assert result[0] == orig[0]
255-
for r in result:
256-
assert isinstance(r, Timestamp)
257-
258-
tm.assert_numpy_array_equal(
259-
result.astype(object), orig._values.astype(object)
260-
)
249+
250+
unique_values_raw = dict.fromkeys(obj.values)
251+
# because np.nan == np.nan is False, but None == None is True
252+
# np.nan would be duplicated, whereas None wouldn't
253+
unique_values_not_null = [
254+
val for val in unique_values_raw if not pd.isnull(val)
255+
]
256+
unique_values = [null_obj] + unique_values_not_null
257+
258+
if isinstance(obj, pd.Index):
259+
expected = pd.Index(unique_values, dtype=obj.dtype)
260+
if is_datetime64tz_dtype(obj):
261+
result = result.normalize()
262+
expected = expected.normalize()
263+
elif isinstance(obj, pd.CategoricalIndex):
264+
expected = expected.set_categories(unique_values_not_null)
265+
tm.assert_index_equal(result, expected)
261266
else:
262-
tm.assert_numpy_array_equal(result, orig.values)
263-
assert result.dtype == orig.dtype
267+
expected = np.array(unique_values, dtype=obj.dtype)
268+
tm.assert_numpy_array_equal(result, expected)
264269

265-
# dropna=True would break for MultiIndex
266-
assert obj.nunique(dropna=False) == len(np.unique(obj.values))
270+
def test_nunique(self, index_or_series_obj):
271+
obj = index_or_series_obj
272+
obj = np.repeat(obj, range(1, len(obj) + 1))
273+
expected = len(obj.unique())
274+
assert obj.nunique(dropna=False) == expected
267275

268276
@pytest.mark.parametrize("null_obj", [np.nan, None])
269-
def test_value_counts_unique_nunique_null(self, null_obj, index_or_series_obj):
270-
orig = index_or_series_obj
271-
obj = orig.copy()
272-
klass = type(obj)
273-
values = obj._ndarray_values
274-
num_values = len(orig)
277+
def test_nunique_null(self, null_obj, index_or_series_obj):
278+
obj = index_or_series_obj
275279

276280
if not allow_na_ops(obj):
277281
pytest.skip("type doesn't allow for NA operations")
278-
elif isinstance(orig, (pd.CategoricalIndex, pd.IntervalIndex)):
279-
pytest.skip(f"values of {klass} cannot be changed")
280-
elif isinstance(orig, pd.MultiIndex):
281-
pytest.skip("MultiIndex doesn't support isna")
282-
elif orig.duplicated().any():
283-
pytest.xfail(
284-
"The test implementation isn't flexible enough to deal "
285-
"with duplicated values. This isn't a bug in the "
286-
"application code, but in the test code."
287-
)
288-
289-
# special assign to the numpy array
290-
if is_datetime64tz_dtype(obj):
291-
if isinstance(obj, DatetimeIndex):
292-
v = obj.asi8
293-
v[0:2] = iNaT
294-
values = obj._shallow_copy(v)
295-
else:
296-
obj = obj.copy()
297-
obj[0:2] = pd.NaT
298-
values = obj._values
282+
elif isinstance(obj, pd.MultiIndex):
283+
pytest.skip(f"MultiIndex can't hold '{null_obj}'")
299284

300-
elif is_period_dtype(obj):
301-
values[0:2] = iNaT
302-
parr = type(obj._data)(values, dtype=obj.dtype)
303-
values = obj._shallow_copy(parr)
304-
elif needs_i8_conversion(obj):
285+
values = obj.values
286+
if needs_i8_conversion(obj):
305287
values[0:2] = iNaT
306-
values = obj._shallow_copy(values)
307288
else:
308289
values[0:2] = null_obj
309290

310-
# check values has the same dtype as the original
311-
assert values.dtype == obj.dtype
312-
313-
# create repeated values, 'n'th element is repeated by n+1
314-
# times
315-
if isinstance(obj, (DatetimeIndex, PeriodIndex)):
316-
expected_index = obj.copy()
317-
expected_index.name = None
291+
klass = type(obj)
292+
repeated_values = np.repeat(values, range(1, len(values) + 1))
293+
obj = klass(repeated_values, dtype=obj.dtype)
318294

319-
# attach name to klass
320-
obj = klass(values.repeat(range(1, len(obj) + 1)))
321-
obj.name = "a"
322-
else:
323-
if isinstance(obj, DatetimeIndex):
324-
expected_index = orig._values._shallow_copy(values)
325-
else:
326-
expected_index = Index(values)
327-
expected_index.name = None
328-
obj = obj.repeat(range(1, len(obj) + 1))
329-
obj.name = "a"
330-
331-
# check values has the same dtype as the original
332-
assert obj.dtype == orig.dtype
333-
334-
# check values correctly have NaN
335-
nanloc = np.zeros(len(obj), dtype=np.bool)
336-
nanloc[:3] = True
337-
if isinstance(obj, Index):
338-
tm.assert_numpy_array_equal(pd.isna(obj), nanloc)
295+
if isinstance(obj, pd.CategoricalIndex):
296+
assert obj.nunique() == len(obj.categories)
297+
assert obj.nunique(dropna=False) == len(obj.categories) + 1
339298
else:
340-
exp = Series(nanloc, obj.index, name="a")
341-
tm.assert_series_equal(pd.isna(obj), exp)
342-
343-
expected_data = list(range(num_values, 2, -1))
344-
expected_data_na = expected_data.copy()
345-
if expected_data_na:
346-
expected_data_na.append(3)
347-
expected_s_na = Series(
348-
expected_data_na,
349-
index=expected_index[num_values - 1 : 0 : -1],
350-
dtype="int64",
351-
name="a",
352-
)
353-
expected_s = Series(
354-
expected_data,
355-
index=expected_index[num_values - 1 : 1 : -1],
356-
dtype="int64",
357-
name="a",
358-
)
299+
num_unique_values = len(obj.unique())
300+
assert obj.nunique() == max(0, num_unique_values - 1)
301+
assert obj.nunique(dropna=False) == max(0, num_unique_values)
359302

360-
result_s_na = obj.value_counts(dropna=False)
361-
tm.assert_series_equal(result_s_na, expected_s_na)
362-
assert result_s_na.index.name is None
363-
assert result_s_na.name == "a"
364-
result_s = obj.value_counts()
365-
tm.assert_series_equal(obj.value_counts(), expected_s)
366-
assert result_s.index.name is None
367-
assert result_s.name == "a"
303+
def test_value_counts(self, index_or_series_obj):
304+
obj = index_or_series_obj
305+
obj = np.repeat(obj, range(1, len(obj) + 1))
306+
result = obj.value_counts()
368307

369-
result = obj.unique()
370-
if isinstance(obj, Index):
371-
tm.assert_index_equal(result, Index(values[1:], name="a"))
372-
elif is_datetime64tz_dtype(obj):
373-
# unable to compare NaT / nan
374-
tm.assert_extension_array_equal(result[1:], values[2:])
375-
assert result[0] is pd.NaT
376-
elif len(obj) > 0:
377-
tm.assert_numpy_array_equal(result[1:], values[2:])
378-
379-
assert pd.isna(result[0])
380-
assert result.dtype == orig.dtype
381-
382-
assert obj.nunique() == max(0, num_values - 2)
383-
assert obj.nunique(dropna=False) == max(0, num_values - 1)
308+
counter = collections.Counter(obj)
309+
expected = pd.Series(dict(counter.most_common()), dtype=np.int64, name=obj.name)
310+
expected.index = expected.index.astype(obj.dtype)
311+
if isinstance(obj, pd.MultiIndex):
312+
expected.index = pd.Index(expected.index)
313+
314+
# sort_index to avoid switched order when values share the same count
315+
result = result.sort_index()
316+
expected = expected.sort_index()
317+
tm.assert_series_equal(result, expected)
318+
319+
@pytest.mark.parametrize("null_obj", [np.nan, None])
320+
def test_value_counts_null(self, null_obj, index_or_series_obj):
321+
orig = index_or_series_obj
322+
obj = orig.copy()
323+
324+
if not allow_na_ops(obj):
325+
pytest.skip("type doesn't allow for NA operations")
326+
elif len(obj) < 1:
327+
pytest.skip("Test doesn't make sense on empty data")
328+
elif isinstance(orig, pd.MultiIndex):
329+
pytest.skip(f"MultiIndex can't hold '{null_obj}'")
330+
331+
values = obj.values
332+
if needs_i8_conversion(obj):
333+
values[0:2] = iNaT
334+
else:
335+
values[0:2] = null_obj
336+
337+
klass = type(obj)
338+
repeated_values = np.repeat(values, range(1, len(values) + 1))
339+
obj = klass(repeated_values, dtype=obj.dtype)
340+
341+
# because np.nan == np.nan is False, but None == None is True
342+
# np.nan would be duplicated, whereas None wouldn't
343+
counter = collections.Counter(obj.dropna())
344+
expected = pd.Series(dict(counter.most_common()), dtype=np.int64)
345+
expected.index = expected.index.astype(obj.dtype)
346+
347+
tm.assert_series_equal(obj.value_counts(), expected)
348+
349+
# can't use expected[null_obj] = 3 as
350+
# IntervalIndex doesn't allow assignment
351+
new_entry = pd.Series({np.nan: 3}, dtype=np.int64)
352+
expected = expected.append(new_entry)
353+
tm.assert_series_equal(obj.value_counts(dropna=False), expected)
384354

385355
def test_value_counts_inferred(self, index_or_series):
386356
klass = index_or_series

0 commit comments

Comments
 (0)