diff --git a/pandas/tests/base/test_ops.py b/pandas/tests/base/test_ops.py
index 39dca1a9742df..8f48d0a3e8378 100644
--- a/pandas/tests/base/test_ops.py
+++ b/pandas/tests/base/test_ops.py
@@ -1,3 +1,4 @@
+import collections
 from datetime import datetime, timedelta
 from io import StringIO
 import sys
@@ -15,7 +16,6 @@
     is_datetime64_dtype,
     is_datetime64tz_dtype,
     is_object_dtype,
-    is_period_dtype,
     needs_i8_conversion,
 )
 
@@ -26,11 +26,9 @@
     Index,
     Interval,
     IntervalIndex,
-    PeriodIndex,
     Series,
     Timedelta,
     TimedeltaIndex,
-    Timestamp,
 )
 import pandas._testing as tm
 
@@ -207,180 +205,152 @@ def test_ndarray_compat_properties(self, index_or_series_obj):
         assert Index([1]).item() == 1
         assert Series([1]).item() == 1
 
-    def test_value_counts_unique_nunique(self, index_or_series_obj):
-        orig = index_or_series_obj
-        obj = orig.copy()
-        klass = type(obj)
-        values = obj._values
-
-        if orig.duplicated().any():
-            pytest.xfail(
-                "The test implementation isn't flexible enough to deal "
-                "with duplicated values. This isn't a bug in the "
-                "application code, but in the test code."
-            )
+    def test_unique(self, index_or_series_obj):
+        obj = index_or_series_obj
+        obj = np.repeat(obj, range(1, len(obj) + 1))
+        result = obj.unique()
 
-        # create repeated values, 'n'th element is repeated by n+1 times
-        if isinstance(obj, Index):
-            expected_index = Index(obj[::-1])
-            expected_index.name = None
-            obj = obj.repeat(range(1, len(obj) + 1))
+        # dict.fromkeys preserves the order
+        unique_values = list(dict.fromkeys(obj.values))
+        if isinstance(obj, pd.MultiIndex):
+            expected = pd.MultiIndex.from_tuples(unique_values)
+            expected.names = obj.names
+            tm.assert_index_equal(result, expected)
+        elif isinstance(obj, pd.Index):
+            expected = pd.Index(unique_values, dtype=obj.dtype)
+            if is_datetime64tz_dtype(obj):
+                expected = expected.normalize()
+            tm.assert_index_equal(result, expected)
         else:
-            expected_index = Index(values[::-1])
-            idx = obj.index.repeat(range(1, len(obj) + 1))
-            # take-based repeat
-            indices = np.repeat(np.arange(len(obj)), range(1, len(obj) + 1))
-            rep = values.take(indices)
-            obj = klass(rep, index=idx)
-
-        # check values has the same dtype as the original
-        assert obj.dtype == orig.dtype
-
-        expected_s = Series(
-            range(len(orig), 0, -1), index=expected_index, dtype="int64"
-        )
+            expected = np.array(unique_values)
+            tm.assert_numpy_array_equal(result, expected)
 
-        result = obj.value_counts()
-        tm.assert_series_equal(result, expected_s)
-        assert result.index.name is None
+    @pytest.mark.parametrize("null_obj", [np.nan, None])
+    def test_unique_null(self, null_obj, index_or_series_obj):
+        obj = index_or_series_obj
+
+        if not allow_na_ops(obj):
+            pytest.skip("type doesn't allow for NA operations")
+        elif len(obj) < 1:
+            pytest.skip("Test doesn't make sense on empty data")
+        elif isinstance(obj, pd.MultiIndex):
+            pytest.skip(f"MultiIndex can't hold '{null_obj}'")
+
+        values = obj.values
+        if needs_i8_conversion(obj):
+            values[0:2] = iNaT
+        else:
+            values[0:2] = null_obj
+        klass = type(obj)
+        repeated_values = np.repeat(values, range(1, len(values) + 1))
+        obj = klass(repeated_values, dtype=obj.dtype)
         result = obj.unique()
-        if isinstance(obj, Index):
-            assert isinstance(result, type(obj))
-            tm.assert_index_equal(result, orig)
-            assert result.dtype == orig.dtype
-        elif is_datetime64tz_dtype(obj):
-            # datetimetz Series returns array of Timestamp
-            assert result[0] == orig[0]
-            for r in result:
-                assert isinstance(r, Timestamp)
-
-            tm.assert_numpy_array_equal(
-                result.astype(object), orig._values.astype(object)
-            )
+
+        unique_values_raw = dict.fromkeys(obj.values)
+        # because np.nan == np.nan is False, but None == None is True
+        # np.nan would be duplicated, whereas None wouldn't
+        unique_values_not_null = [
+            val for val in unique_values_raw if not pd.isnull(val)
+        ]
+        unique_values = [null_obj] + unique_values_not_null
+
+        if isinstance(obj, pd.Index):
+            expected = pd.Index(unique_values, dtype=obj.dtype)
+            if is_datetime64tz_dtype(obj):
+                result = result.normalize()
+                expected = expected.normalize()
+            elif isinstance(obj, pd.CategoricalIndex):
+                expected = expected.set_categories(unique_values_not_null)
+            tm.assert_index_equal(result, expected)
         else:
-            tm.assert_numpy_array_equal(result, orig.values)
-            assert result.dtype == orig.dtype
+            expected = np.array(unique_values, dtype=obj.dtype)
+            tm.assert_numpy_array_equal(result, expected)
 
-        # dropna=True would break for MultiIndex
-        assert obj.nunique(dropna=False) == len(np.unique(obj.values))
+    def test_nunique(self, index_or_series_obj):
+        obj = index_or_series_obj
+        obj = np.repeat(obj, range(1, len(obj) + 1))
+        expected = len(obj.unique())
+        assert obj.nunique(dropna=False) == expected
 
     @pytest.mark.parametrize("null_obj", [np.nan, None])
-    def test_value_counts_unique_nunique_null(self, null_obj, index_or_series_obj):
-        orig = index_or_series_obj
-        obj = orig.copy()
-        klass = type(obj)
-        values = obj._ndarray_values
-        num_values = len(orig)
+    def test_nunique_null(self, null_obj, index_or_series_obj):
+        obj = index_or_series_obj
 
         if not allow_na_ops(obj):
             pytest.skip("type doesn't allow for NA operations")
-        elif isinstance(orig, (pd.CategoricalIndex, pd.IntervalIndex)):
-            pytest.skip(f"values of {klass} cannot be changed")
-        elif isinstance(orig, pd.MultiIndex):
-            pytest.skip("MultiIndex doesn't support isna")
-        elif orig.duplicated().any():
-            pytest.xfail(
-                "The test implementation isn't flexible enough to deal "
-                "with duplicated values. This isn't a bug in the "
-                "application code, but in the test code."
-            )
-
-        # special assign to the numpy array
-        if is_datetime64tz_dtype(obj):
-            if isinstance(obj, DatetimeIndex):
-                v = obj.asi8
-                v[0:2] = iNaT
-                values = obj._shallow_copy(v)
-            else:
-                obj = obj.copy()
-                obj[0:2] = pd.NaT
-                values = obj._values
+        elif isinstance(obj, pd.MultiIndex):
+            pytest.skip(f"MultiIndex can't hold '{null_obj}'")
 
-        elif is_period_dtype(obj):
-            values[0:2] = iNaT
-            parr = type(obj._data)(values, dtype=obj.dtype)
-            values = obj._shallow_copy(parr)
-        elif needs_i8_conversion(obj):
+        values = obj.values
+        if needs_i8_conversion(obj):
             values[0:2] = iNaT
-            values = obj._shallow_copy(values)
         else:
             values[0:2] = null_obj
 
-        # check values has the same dtype as the original
-        assert values.dtype == obj.dtype
-
-        # create repeated values, 'n'th element is repeated by n+1
-        # times
-        if isinstance(obj, (DatetimeIndex, PeriodIndex)):
-            expected_index = obj.copy()
-            expected_index.name = None
+        klass = type(obj)
+        repeated_values = np.repeat(values, range(1, len(values) + 1))
+        obj = klass(repeated_values, dtype=obj.dtype)
 
-            # attach name to klass
-            obj = klass(values.repeat(range(1, len(obj) + 1)))
-            obj.name = "a"
-        else:
-            if isinstance(obj, DatetimeIndex):
-                expected_index = orig._values._shallow_copy(values)
-            else:
-                expected_index = Index(values)
-            expected_index.name = None
-            obj = obj.repeat(range(1, len(obj) + 1))
-            obj.name = "a"
-
-        # check values has the same dtype as the original
-        assert obj.dtype == orig.dtype
-
-        # check values correctly have NaN
-        nanloc = np.zeros(len(obj), dtype=np.bool)
-        nanloc[:3] = True
-        if isinstance(obj, Index):
-            tm.assert_numpy_array_equal(pd.isna(obj), nanloc)
+        if isinstance(obj, pd.CategoricalIndex):
+            assert obj.nunique() == len(obj.categories)
+            assert obj.nunique(dropna=False) == len(obj.categories) + 1
         else:
-            exp = Series(nanloc, obj.index, name="a")
-            tm.assert_series_equal(pd.isna(obj), exp)
-
-        expected_data = list(range(num_values, 2, -1))
-        expected_data_na = expected_data.copy()
-        if expected_data_na:
-            expected_data_na.append(3)
-        expected_s_na = Series(
-            expected_data_na,
-            index=expected_index[num_values - 1 : 0 : -1],
-            dtype="int64",
-            name="a",
-        )
-        expected_s = Series(
-            expected_data,
-            index=expected_index[num_values - 1 : 1 : -1],
-            dtype="int64",
-            name="a",
-        )
+            num_unique_values = len(obj.unique())
+            assert obj.nunique() == max(0, num_unique_values - 1)
+            assert obj.nunique(dropna=False) == max(0, num_unique_values)
 
-        result_s_na = obj.value_counts(dropna=False)
-        tm.assert_series_equal(result_s_na, expected_s_na)
-        assert result_s_na.index.name is None
-        assert result_s_na.name == "a"
-        result_s = obj.value_counts()
-        tm.assert_series_equal(obj.value_counts(), expected_s)
-        assert result_s.index.name is None
-        assert result_s.name == "a"
+    def test_value_counts(self, index_or_series_obj):
+        obj = index_or_series_obj
+        obj = np.repeat(obj, range(1, len(obj) + 1))
+        result = obj.value_counts()
 
-        result = obj.unique()
-        if isinstance(obj, Index):
-            tm.assert_index_equal(result, Index(values[1:], name="a"))
-        elif is_datetime64tz_dtype(obj):
-            # unable to compare NaT / nan
-            tm.assert_extension_array_equal(result[1:], values[2:])
-            assert result[0] is pd.NaT
-        elif len(obj) > 0:
-            tm.assert_numpy_array_equal(result[1:], values[2:])
-
-        assert pd.isna(result[0])
-        assert result.dtype == orig.dtype
-
-        assert obj.nunique() == max(0, num_values - 2)
-        assert obj.nunique(dropna=False) == max(0, num_values - 1)
+        counter = collections.Counter(obj)
+        expected = pd.Series(dict(counter.most_common()), dtype=np.int64, name=obj.name)
+        expected.index = expected.index.astype(obj.dtype)
+        if isinstance(obj, pd.MultiIndex):
+            expected.index = pd.Index(expected.index)
+
+        # sort_index to avoid switched order when values share the same count
+        result = result.sort_index()
+        expected = expected.sort_index()
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("null_obj", [np.nan, None])
+    def test_value_counts_null(self, null_obj, index_or_series_obj):
+        orig = index_or_series_obj
+        obj = orig.copy()
+
+        if not allow_na_ops(obj):
+            pytest.skip("type doesn't allow for NA operations")
+        elif len(obj) < 1:
+            pytest.skip("Test doesn't make sense on empty data")
+        elif isinstance(orig, pd.MultiIndex):
+            pytest.skip(f"MultiIndex can't hold '{null_obj}'")
+
+        values = obj.values
+        if needs_i8_conversion(obj):
+            values[0:2] = iNaT
+        else:
+            values[0:2] = null_obj
+
+        klass = type(obj)
+        repeated_values = np.repeat(values, range(1, len(values) + 1))
+        obj = klass(repeated_values, dtype=obj.dtype)
+
+        # because np.nan == np.nan is False, but None == None is True
+        # np.nan would be duplicated, whereas None wouldn't
+        counter = collections.Counter(obj.dropna())
+        expected = pd.Series(dict(counter.most_common()), dtype=np.int64)
+        expected.index = expected.index.astype(obj.dtype)
+
+        tm.assert_series_equal(obj.value_counts(), expected)
+
+        # can't use expected[null_obj] = 3 as
+        # IntervalIndex doesn't allow assignment
+        new_entry = pd.Series({np.nan: 3}, dtype=np.int64)
+        expected = expected.append(new_entry)
+        tm.assert_series_equal(obj.value_counts(dropna=False), expected)
 
     def test_value_counts_inferred(self, index_or_series):
         klass = index_or_series