DEPR: accepting non-standard sequences in core.algorithms functions (pandas-dev#52986)

jbrockmendel · topper-123 · commit 1430e42f8577 · 2023-06-05T23:01:57.000+01:00
* DEPR: accepting non-standard sequences in core.algorithms functions

* GH ref

* fix doctests

* typo fixup
diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
@@ -255,6 +255,7 @@ Deprecations
 - Deprecated :meth:`DataFrame.swapaxes` and :meth:`Series.swapaxes`, use :meth:`DataFrame.transpose` or :meth:`Series.transpose` instead (:issue:`51946`)
 - Deprecated ``freq`` parameter in :class:`PeriodArray` constructor, pass ``dtype`` instead (:issue:`52462`)
 - Deprecated allowing non-standard inputs in :func:`take`, pass either a ``numpy.ndarray``, :class:`ExtensionArray`, :class:`Index`, or :class:`Series` (:issue:`52981`)
+- Deprecated allowing non-standard sequences for :func:`isin`, :func:`value_counts`, :func:`unique`, :func:`factorize`, cast to one of ``numpy.ndarray``, :class:`Index`, :class:`ExtensionArray`, or :class:`Series` before calling (:issue:`52986`)
 - Deprecated behavior of :class:`DataFrame` reductions ``sum``, ``prod``, ``std``, ``var``, ``sem`` with ``axis=None``, in a future version this will operate over both axes returning a scalar instead of behaving like ``axis=0``; note this also affects numpy functions e.g. ``np.sum(df)`` (:issue:`21597`)
 - Deprecated behavior of :func:`concat` when :class:`DataFrame` has columns that are all-NA, in a future version these will not be discarded when determining the resulting dtype (:issue:`40893`)
 - Deprecated behavior of :meth:`Series.dt.to_pydatetime`, in a future version this will return a :class:`Series` containing python ``datetime`` objects instead of an ``ndarray`` of datetimes; this matches the behavior of other :meth:`Series.dt` properties (:issue:`20306`)
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -213,11 +213,22 @@ def _reconstruct_data(
     return values
 
 
-def _ensure_arraylike(values) -> ArrayLike:
+def _ensure_arraylike(values, func_name: str) -> ArrayLike:
     """
     ensure that we are arraylike if not already
     """
-    if not is_array_like(values):
+    if not isinstance(values, (ABCIndex, ABCSeries, ABCExtensionArray, np.ndarray)):
+        # GH#52986
+        if func_name != "isin-targets":
+            # Make an exception for the comps argument in isin.
+            warnings.warn(
+                f"{func_name} with argument that is not not a Series, Index, "
+                "ExtensionArray, or np.ndarray is deprecated and will raise in a "
+                "future version.",
+                FutureWarning,
+                stacklevel=find_stack_level(),
+            )
+
         inferred = lib.infer_dtype(values, skipna=False)
         if inferred in ["mixed", "string", "mixed-integer"]:
             # "mixed-integer" to ensure we do not cast ["ss", 42] to str GH#22160
@@ -357,7 +368,7 @@ def unique(values):
             dtype='datetime64[ns, US/Eastern]',
             freq=None)
 
-    >>> pd.unique(list("baabc"))
+    >>> pd.unique(np.array(list("baabc"), dtype="O"))
     array(['b', 'a', 'c'], dtype=object)
 
     An unordered Categorical will return categories in the
@@ -383,7 +394,7 @@ def unique(values):
 
     An array of tuples
 
-    >>> pd.unique([("a", "b"), ("b", "a"), ("a", "c"), ("b", "a")])
+    >>> pd.unique(pd.Series([("a", "b"), ("b", "a"), ("a", "c"), ("b", "a")]).values)
     array([('a', 'b'), ('b', 'a'), ('a', 'c')], dtype=object)
     """
     return unique_with_mask(values)
@@ -414,7 +425,7 @@ def nunique_ints(values: ArrayLike) -> int:
 
 def unique_with_mask(values, mask: npt.NDArray[np.bool_] | None = None):
     """See algorithms.unique for docs. Takes a mask for masked arrays."""
-    values = _ensure_arraylike(values)
+    values = _ensure_arraylike(values, func_name="unique")
 
     if isinstance(values.dtype, ExtensionDtype):
         # Dispatch to extension dtype's unique.
@@ -466,7 +477,7 @@ def isin(comps: ListLike, values: ListLike) -> npt.NDArray[np.bool_]:
 
     if not isinstance(values, (ABCIndex, ABCSeries, ABCExtensionArray, np.ndarray)):
         orig_values = list(values)
-        values = _ensure_arraylike(orig_values)
+        values = _ensure_arraylike(orig_values, func_name="isin-targets")
 
         if (
             len(values) > 0
@@ -483,7 +494,7 @@ def isin(comps: ListLike, values: ListLike) -> npt.NDArray[np.bool_]:
     else:
         values = extract_array(values, extract_numpy=True, extract_range=True)
 
-    comps_array = _ensure_arraylike(comps)
+    comps_array = _ensure_arraylike(comps, func_name="isin")
     comps_array = extract_array(comps_array, extract_numpy=True)
     if not isinstance(comps_array, np.ndarray):
         # i.e. Extension Array
@@ -669,7 +680,7 @@ def factorize(
     ``pd.factorize(values)``. The results are identical for methods like
     :meth:`Series.factorize`.
 
-    >>> codes, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'])
+    >>> codes, uniques = pd.factorize(np.array(['b', 'b', 'a', 'c', 'b'], dtype="O"))
     >>> codes
     array([0, 0, 1, 2, 0])
     >>> uniques
@@ -678,7 +689,8 @@ def factorize(
     With ``sort=True``, the `uniques` will be sorted, and `codes` will be
     shuffled so that the relationship is the maintained.
 
-    >>> codes, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'], sort=True)
+    >>> codes, uniques = pd.factorize(np.array(['b', 'b', 'a', 'c', 'b'], dtype="O"),
+    ...                               sort=True)
     >>> codes
     array([1, 1, 0, 2, 1])
     >>> uniques
@@ -688,7 +700,7 @@ def factorize(
     the `codes` with the sentinel value ``-1`` and missing values are not
     included in `uniques`.
 
-    >>> codes, uniques = pd.factorize(['b', None, 'a', 'c', 'b'])
+    >>> codes, uniques = pd.factorize(np.array(['b', None, 'a', 'c', 'b'], dtype="O"))
     >>> codes
     array([ 0, -1,  1,  2,  0])
     >>> uniques
@@ -746,7 +758,7 @@ def factorize(
     if isinstance(values, (ABCIndex, ABCSeries)):
         return values.factorize(sort=sort, use_na_sentinel=use_na_sentinel)
 
-    values = _ensure_arraylike(values)
+    values = _ensure_arraylike(values, func_name="factorize")
     original = values
 
     if (
@@ -880,7 +892,7 @@ def value_counts(
             counts = result._values
 
         else:
-            values = _ensure_arraylike(values)
+            values = _ensure_arraylike(values, func_name="value_counts")
             keys, counts = value_counts_arraylike(values, dropna)
             if keys.dtype == np.float16:
                 keys = keys.astype(np.float32)
@@ -981,7 +993,7 @@ def mode(
     -------
     np.ndarray or ExtensionArray
     """
-    values = _ensure_arraylike(values)
+    values = _ensure_arraylike(values, func_name="mode")
     original = values
 
     if needs_i8_conversion(values.dtype):
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -439,6 +439,9 @@ def __init__(
                 values = arr
 
         if dtype.categories is None:
+            if not isinstance(values, ABCIndex):
+                # in particular RangeIndex xref test_index_equal_range_categories
+                values = sanitize_array(values, None)
             try:
                 codes, categories = factorize(values, sort=True)
             except TypeError as err:
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
@@ -73,6 +73,7 @@
 from pandas.core.dtypes.generic import (
     ABCDataFrame,
     ABCDatetimeIndex,
+    ABCSeries,
     ABCTimedeltaIndex,
 )
 from pandas.core.dtypes.inference import is_array_like
@@ -83,9 +84,13 @@
 
 import pandas.core.algorithms as algos
 from pandas.core.array_algos.putmask import validate_putmask
-from pandas.core.arrays import Categorical
+from pandas.core.arrays import (
+    Categorical,
+    ExtensionArray,
+)
 from pandas.core.arrays.categorical import factorize_from_iterables
 import pandas.core.common as com
+from pandas.core.construction import sanitize_array
 import pandas.core.indexes.base as ibase
 from pandas.core.indexes.base import (
     Index,
@@ -3404,6 +3409,8 @@ def _reorder_indexer(
                 new_order = np.arange(n)[indexer]
             elif is_list_like(k):
                 # Generate a map with all level codes as sorted initially
+                if not isinstance(k, (np.ndarray, ExtensionArray, Index, ABCSeries)):
+                    k = sanitize_array(k, None)
                 k = algos.unique(k)
                 key_order_map = np.ones(len(self.levels[i]), dtype=np.uint64) * len(
                     self.levels[i]
diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py
@@ -651,7 +651,7 @@ def _infer_precision(base_precision: int, bins) -> int:
     Infer an appropriate precision for _round_frac
     """
     for precision in range(base_precision, 20):
-        levels = [_round_frac(b, precision) for b in bins]
+        levels = np.asarray([_round_frac(b, precision) for b in bins])
         if algos.unique(levels).size == bins.size:
             return precision
     return base_precision  # default
diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
@@ -243,6 +243,9 @@ def _maybe_cache(
         if not should_cache(arg):
             return cache_array
 
+        if not isinstance(arg, (np.ndarray, ExtensionArray, Index, ABCSeries)):
+            arg = np.array(arg)
+
         unique_dates = unique(arg)
         if len(unique_dates) < len(arg):
             cache_dates = convert_listlike(unique_dates, format)
diff --git a/pandas/tests/indexes/period/methods/test_factorize.py b/pandas/tests/indexes/period/methods/test_factorize.py
@@ -39,10 +39,12 @@ def test_factorize(self):
         tm.assert_numpy_array_equal(arr, exp_arr)
         tm.assert_index_equal(idx, exp_idx)
 
-    def test_factorize_complex(self):
+    def test_factorize_complex(self):  # TODO: what is this test doing here?
         # GH 17927
         array = [1, 2, 2 + 1j]
-        labels, uniques = factorize(array)
+        msg = "factorize with argument that is not not a Series"
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            labels, uniques = factorize(array)
 
         expected_labels = np.array([0, 1, 2], dtype=np.intp)
         tm.assert_numpy_array_equal(labels, expected_labels)
diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py
@@ -721,14 +721,17 @@ def test_ismember_tuple_with_nans():
     # GH-41836
     values = [("a", float("nan")), ("b", 1)]
     comps = [("a", float("nan"))]
-    result = isin(values, comps)
+
+    msg = "isin with argument that is not not a Series"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        result = isin(values, comps)
     expected = np.array([True, False], dtype=np.bool_)
     tm.assert_numpy_array_equal(result, expected)
 
 
 def test_float_complex_int_are_equal_as_objects():
     values = ["a", 5, 5.0, 5.0 + 0j]
     comps = list(range(129))
-    result = isin(values, comps)
+    result = isin(np.array(values, dtype=object), np.asarray(comps))
     expected = np.array([False, True, True, True], dtype=np.bool_)
     tm.assert_numpy_array_equal(result, expected)
diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py
@@ -393,7 +393,7 @@ def test_cut_duplicates_bin(kwargs, msg):
             cut(values, bins, **kwargs)
     else:
         result = cut(values, bins, **kwargs)
-        expected = cut(values, pd.unique(bins))
+        expected = cut(values, pd.unique(np.asarray(bins)))
         tm.assert_series_equal(result, expected)
 
 
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py