@@ -1,3 +1,4 @@
+import collections
 from datetime import datetime, timedelta
 from io import StringIO
 import sys
@@ -15,7 +16,6 @@
     is_datetime64_dtype,
     is_datetime64tz_dtype,
     is_object_dtype,
-    is_period_dtype,
     needs_i8_conversion,
 )
 
@@ -26,11 +26,9 @@
     Index,
     Interval,
     IntervalIndex,
-    PeriodIndex,
     Series,
     Timedelta,
     TimedeltaIndex,
-    Timestamp,
 )
 import pandas._testing as tm
 
@@ -207,180 +205,152 @@ def test_ndarray_compat_properties(self, index_or_series_obj):
         assert Index([1]).item() == 1
         assert Series([1]).item() == 1
 
-    def test_value_counts_unique_nunique(self, index_or_series_obj):
-        orig = index_or_series_obj
-        obj = orig.copy()
-        klass = type(obj)
-        values = obj._values
-
-        if orig.duplicated().any():
-            pytest.xfail(
-                "The test implementation isn't flexible enough to deal "
-                "with duplicated values. This isn't a bug in the "
-                "application code, but in the test code."
-            )
+    def test_unique(self, index_or_series_obj):
+        obj = index_or_series_obj
+        obj = np.repeat(obj, range(1, len(obj) + 1))
+        result = obj.unique()
 
-        # create repeated values, 'n'th element is repeated by n+1 times
-        if isinstance(obj, Index):
-            expected_index = Index(obj[::-1])
-            expected_index.name = None
-            obj = obj.repeat(range(1, len(obj) + 1))
+        # dict.fromkeys preserves the order
+        unique_values = list(dict.fromkeys(obj.values))
+        if isinstance(obj, pd.MultiIndex):
+            expected = pd.MultiIndex.from_tuples(unique_values)
+            expected.names = obj.names
+            tm.assert_index_equal(result, expected)
+        elif isinstance(obj, pd.Index):
+            expected = pd.Index(unique_values, dtype=obj.dtype)
+            if is_datetime64tz_dtype(obj):
+                expected = expected.normalize()
+            tm.assert_index_equal(result, expected)
         else:
-            expected_index = Index(values[::-1])
-            idx = obj.index.repeat(range(1, len(obj) + 1))
-            # take-based repeat
-            indices = np.repeat(np.arange(len(obj)), range(1, len(obj) + 1))
-            rep = values.take(indices)
-            obj = klass(rep, index=idx)
-
-        # check values has the same dtype as the original
-        assert obj.dtype == orig.dtype
-
-        expected_s = Series(
-            range(len(orig), 0, -1), index=expected_index, dtype="int64"
-        )
+            expected = np.array(unique_values)
+            tm.assert_numpy_array_equal(result, expected)
 
-        result = obj.value_counts()
-        tm.assert_series_equal(result, expected_s)
-        assert result.index.name is None
+    @pytest.mark.parametrize("null_obj", [np.nan, None])
+    def test_unique_null(self, null_obj, index_or_series_obj):
+        obj = index_or_series_obj
+
+        if not allow_na_ops(obj):
+            pytest.skip("type doesn't allow for NA operations")
+        elif len(obj) < 1:
+            pytest.skip("Test doesn't make sense on empty data")
+        elif isinstance(obj, pd.MultiIndex):
+            pytest.skip(f"MultiIndex can't hold '{null_obj}'")
+
+        values = obj.values
+        if needs_i8_conversion(obj):
+            values[0:2] = iNaT
+        else:
+            values[0:2] = null_obj
 
+        klass = type(obj)
+        repeated_values = np.repeat(values, range(1, len(values) + 1))
+        obj = klass(repeated_values, dtype=obj.dtype)
         result = obj.unique()
-        if isinstance(obj, Index):
-            assert isinstance(result, type(obj))
-            tm.assert_index_equal(result, orig)
-            assert result.dtype == orig.dtype
-        elif is_datetime64tz_dtype(obj):
-            # datetimetz Series returns array of Timestamp
-            assert result[0] == orig[0]
-            for r in result:
-                assert isinstance(r, Timestamp)
-
-            tm.assert_numpy_array_equal(
-                result.astype(object), orig._values.astype(object)
-            )
+
+        unique_values_raw = dict.fromkeys(obj.values)
+        # because np.nan == np.nan is False, but None == None is True
+        # np.nan would be duplicated, whereas None wouldn't
+        unique_values_not_null = [
+            val for val in unique_values_raw if not pd.isnull(val)
+        ]
+        unique_values = [null_obj] + unique_values_not_null
+
+        if isinstance(obj, pd.Index):
+            expected = pd.Index(unique_values, dtype=obj.dtype)
+            if is_datetime64tz_dtype(obj):
+                result = result.normalize()
+                expected = expected.normalize()
+            elif isinstance(obj, pd.CategoricalIndex):
+                expected = expected.set_categories(unique_values_not_null)
+            tm.assert_index_equal(result, expected)
         else:
-            tm.assert_numpy_array_equal(result, orig.values)
-            assert result.dtype == orig.dtype
+            expected = np.array(unique_values, dtype=obj.dtype)
+            tm.assert_numpy_array_equal(result, expected)
 
-        # dropna=True would break for MultiIndex
-        assert obj.nunique(dropna=False) == len(np.unique(obj.values))
+    def test_nunique(self, index_or_series_obj):
+        obj = index_or_series_obj
+        obj = np.repeat(obj, range(1, len(obj) + 1))
+        expected = len(obj.unique())
+        assert obj.nunique(dropna=False) == expected
 
     @pytest.mark.parametrize("null_obj", [np.nan, None])
-    def test_value_counts_unique_nunique_null(self, null_obj, index_or_series_obj):
-        orig = index_or_series_obj
-        obj = orig.copy()
-        klass = type(obj)
-        values = obj._ndarray_values
-        num_values = len(orig)
+    def test_nunique_null(self, null_obj, index_or_series_obj):
+        obj = index_or_series_obj
 
         if not allow_na_ops(obj):
             pytest.skip("type doesn't allow for NA operations")
-        elif isinstance(orig, (pd.CategoricalIndex, pd.IntervalIndex)):
-            pytest.skip(f"values of {klass} cannot be changed")
-        elif isinstance(orig, pd.MultiIndex):
-            pytest.skip("MultiIndex doesn't support isna")
-        elif orig.duplicated().any():
-            pytest.xfail(
-                "The test implementation isn't flexible enough to deal "
-                "with duplicated values. This isn't a bug in the "
-                "application code, but in the test code."
-            )
-
-        # special assign to the numpy array
-        if is_datetime64tz_dtype(obj):
-            if isinstance(obj, DatetimeIndex):
-                v = obj.asi8
-                v[0:2] = iNaT
-                values = obj._shallow_copy(v)
-            else:
-                obj = obj.copy()
-                obj[0:2] = pd.NaT
-                values = obj._values
+        elif isinstance(obj, pd.MultiIndex):
+            pytest.skip(f"MultiIndex can't hold '{null_obj}'")
 
-        elif is_period_dtype(obj):
-            values[0:2] = iNaT
-            parr = type(obj._data)(values, dtype=obj.dtype)
-            values = obj._shallow_copy(parr)
-        elif needs_i8_conversion(obj):
+        values = obj.values
+        if needs_i8_conversion(obj):
             values[0:2] = iNaT
-            values = obj._shallow_copy(values)
         else:
             values[0:2] = null_obj
 
-        # check values has the same dtype as the original
-        assert values.dtype == obj.dtype
-
-        # create repeated values, 'n'th element is repeated by n+1
-        # times
-        if isinstance(obj, (DatetimeIndex, PeriodIndex)):
-            expected_index = obj.copy()
-            expected_index.name = None
+        klass = type(obj)
+        repeated_values = np.repeat(values, range(1, len(values) + 1))
+        obj = klass(repeated_values, dtype=obj.dtype)
 
-            # attach name to klass
-            obj = klass(values.repeat(range(1, len(obj) + 1)))
-            obj.name = "a"
-        else:
-            if isinstance(obj, DatetimeIndex):
-                expected_index = orig._values._shallow_copy(values)
-            else:
-                expected_index = Index(values)
-            expected_index.name = None
-            obj = obj.repeat(range(1, len(obj) + 1))
-            obj.name = "a"
-
-        # check values has the same dtype as the original
-        assert obj.dtype == orig.dtype
-
-        # check values correctly have NaN
-        nanloc = np.zeros(len(obj), dtype=np.bool)
-        nanloc[:3] = True
-        if isinstance(obj, Index):
-            tm.assert_numpy_array_equal(pd.isna(obj), nanloc)
+        if isinstance(obj, pd.CategoricalIndex):
+            assert obj.nunique() == len(obj.categories)
+            assert obj.nunique(dropna=False) == len(obj.categories) + 1
         else:
-            exp = Series(nanloc, obj.index, name="a")
-            tm.assert_series_equal(pd.isna(obj), exp)
-
-        expected_data = list(range(num_values, 2, -1))
-        expected_data_na = expected_data.copy()
-        if expected_data_na:
-            expected_data_na.append(3)
-        expected_s_na = Series(
-            expected_data_na,
-            index=expected_index[num_values - 1 : 0 : -1],
-            dtype="int64",
-            name="a",
-        )
-        expected_s = Series(
-            expected_data,
-            index=expected_index[num_values - 1 : 1 : -1],
-            dtype="int64",
-            name="a",
-        )
+            num_unique_values = len(obj.unique())
+            assert obj.nunique() == max(0, num_unique_values - 1)
+            assert obj.nunique(dropna=False) == max(0, num_unique_values)
 
-        result_s_na = obj.value_counts(dropna=False)
-        tm.assert_series_equal(result_s_na, expected_s_na)
-        assert result_s_na.index.name is None
-        assert result_s_na.name == "a"
-        result_s = obj.value_counts()
-        tm.assert_series_equal(obj.value_counts(), expected_s)
-        assert result_s.index.name is None
-        assert result_s.name == "a"
+    def test_value_counts(self, index_or_series_obj):
+        obj = index_or_series_obj
+        obj = np.repeat(obj, range(1, len(obj) + 1))
+        result = obj.value_counts()
 
-        result = obj.unique()
-        if isinstance(obj, Index):
-            tm.assert_index_equal(result, Index(values[1:], name="a"))
-        elif is_datetime64tz_dtype(obj):
-            # unable to compare NaT / nan
-            tm.assert_extension_array_equal(result[1:], values[2:])
-            assert result[0] is pd.NaT
-        elif len(obj) > 0:
-            tm.assert_numpy_array_equal(result[1:], values[2:])
-
-            assert pd.isna(result[0])
-            assert result.dtype == orig.dtype
-
-        assert obj.nunique() == max(0, num_values - 2)
-        assert obj.nunique(dropna=False) == max(0, num_values - 1)
+        counter = collections.Counter(obj)
+        expected = pd.Series(dict(counter.most_common()), dtype=np.int64, name=obj.name)
+        expected.index = expected.index.astype(obj.dtype)
+        if isinstance(obj, pd.MultiIndex):
+            expected.index = pd.Index(expected.index)
+
+        # sort_index to avoid switched order when values share the same count
+        result = result.sort_index()
+        expected = expected.sort_index()
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("null_obj", [np.nan, None])
+    def test_value_counts_null(self, null_obj, index_or_series_obj):
+        orig = index_or_series_obj
+        obj = orig.copy()
+
+        if not allow_na_ops(obj):
+            pytest.skip("type doesn't allow for NA operations")
+        elif len(obj) < 1:
+            pytest.skip("Test doesn't make sense on empty data")
+        elif isinstance(orig, pd.MultiIndex):
+            pytest.skip(f"MultiIndex can't hold '{null_obj}'")
+
+        values = obj.values
+        if needs_i8_conversion(obj):
+            values[0:2] = iNaT
+        else:
+            values[0:2] = null_obj
+
+        klass = type(obj)
+        repeated_values = np.repeat(values, range(1, len(values) + 1))
+        obj = klass(repeated_values, dtype=obj.dtype)
+
+        # because np.nan == np.nan is False, but None == None is True
+        # np.nan would be duplicated, whereas None wouldn't
+        counter = collections.Counter(obj.dropna())
+        expected = pd.Series(dict(counter.most_common()), dtype=np.int64)
+        expected.index = expected.index.astype(obj.dtype)
+
+        tm.assert_series_equal(obj.value_counts(), expected)
+
+        # can't use expected[null_obj] = 3 as
+        # IntervalIndex doesn't allow assignment
+        new_entry = pd.Series({np.nan: 3}, dtype=np.int64)
+        expected = expected.append(new_entry)
+        tm.assert_series_equal(obj.value_counts(dropna=False), expected)
 
     def test_value_counts_inferred(self, index_or_series):
         klass = index_or_series
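
Note (not part of the commit): the new test_value_counts builds its expected Series with collections.Counter instead of hand-computed slices. A minimal standalone sketch of that pattern, using a small unnamed integer Index as assumed, illustrative data:

    import collections

    import numpy as np
    import pandas as pd

    # Assumed input: a small integer Index with unique values.
    obj = pd.Index([10, 20, 30])

    # As in the tests, the n-th element is repeated n + 1 times:
    # 10 once, 20 twice, 30 three times.
    obj = np.repeat(obj, range(1, len(obj) + 1))

    result = obj.value_counts()

    # Build the expected counts independently of value_counts.
    counter = collections.Counter(obj)
    expected = pd.Series(dict(counter.most_common()), dtype=np.int64)
    expected.index = expected.index.astype(obj.dtype)

    # Sort both sides so ties in the counts cannot change the order.
    pd.testing.assert_series_equal(
        result.sort_index(), expected.sort_index(), check_names=False
    )

check_names=False is used here only to keep the sketch version-agnostic; the test itself passes name=obj.name and compares names as usual.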
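
A second hedged sketch, with assumed data, of the arithmetic behind the dropna=False expectation in test_value_counts_null: the first two positions are nulled and the n-th element is then repeated n + 1 times, so the null appears 1 + 2 = 3 times:

    import numpy as np
    import pandas as pd

    values = np.array([1.0, 2.0, 3.0])
    values[0:2] = np.nan
    # n-th element repeated n + 1 times -> [nan, nan, nan, 3.0, 3.0, 3.0]
    repeated = np.repeat(values, range(1, len(values) + 1))

    ser = pd.Series(repeated)
    assert ser.value_counts().to_dict() == {3.0: 3}           # NaN dropped by default
    assert ser.value_counts(dropna=False).sum() == len(ser)   # NaN counted as well
    assert ser.isna().sum() == 3

This is why the test appends new_entry = pd.Series({np.nan: 3}, dtype=np.int64) before comparing against value_counts(dropna=False).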