Skip to content

Commit 1430e42

Browse files
jbrockmendel authored and
topper-123 committed
DEPR: accepting non-standard sequences in core.algorithms functions (pandas-dev#52986)
* DEPR: accepting non-standard sequences in core.algorithms functions * GH ref * fix doctests * typo fixup
1 parent 84fafb2 commit 1430e42

File tree

10 files changed

+112
-55
lines changed

10 files changed

+112
-55
lines changed

doc/source/whatsnew/v2.1.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,7 @@ Deprecations
255255
- Deprecated :meth:`DataFrame.swapaxes` and :meth:`Series.swapaxes`, use :meth:`DataFrame.transpose` or :meth:`Series.transpose` instead (:issue:`51946`)
256256
- Deprecated ``freq`` parameter in :class:`PeriodArray` constructor, pass ``dtype`` instead (:issue:`52462`)
257257
- Deprecated allowing non-standard inputs in :func:`take`, pass either a ``numpy.ndarray``, :class:`ExtensionArray`, :class:`Index`, or :class:`Series` (:issue:`52981`)
258+
- Deprecated allowing non-standard sequences for :func:`isin`, :func:`value_counts`, :func:`unique`, :func:`factorize`, cast to one of ``numpy.ndarray``, :class:`Index`, :class:`ExtensionArray`, or :class:`Series` before calling (:issue:`52986`)
258259
- Deprecated behavior of :class:`DataFrame` reductions ``sum``, ``prod``, ``std``, ``var``, ``sem`` with ``axis=None``, in a future version this will operate over both axes returning a scalar instead of behaving like ``axis=0``; note this also affects numpy functions e.g. ``np.sum(df)`` (:issue:`21597`)
259260
- Deprecated behavior of :func:`concat` when :class:`DataFrame` has columns that are all-NA, in a future version these will not be discarded when determining the resulting dtype (:issue:`40893`)
260261
- Deprecated behavior of :meth:`Series.dt.to_pydatetime`, in a future version this will return a :class:`Series` containing python ``datetime`` objects instead of an ``ndarray`` of datetimes; this matches the behavior of other :meth:`Series.dt` properties (:issue:`20306`)

pandas/core/algorithms.py

+25-13
Original file line numberDiff line numberDiff line change
@@ -213,11 +213,22 @@ def _reconstruct_data(
213213
return values
214214

215215

216-
def _ensure_arraylike(values) -> ArrayLike:
216+
def _ensure_arraylike(values, func_name: str) -> ArrayLike:
217217
"""
218218
ensure that we are arraylike if not already
219219
"""
220-
if not is_array_like(values):
220+
if not isinstance(values, (ABCIndex, ABCSeries, ABCExtensionArray, np.ndarray)):
221+
# GH#52986
222+
if func_name != "isin-targets":
223+
# Make an exception for the comps argument in isin.
224+
warnings.warn(
225+
f"{func_name} with argument that is not not a Series, Index, "
226+
"ExtensionArray, or np.ndarray is deprecated and will raise in a "
227+
"future version.",
228+
FutureWarning,
229+
stacklevel=find_stack_level(),
230+
)
231+
221232
inferred = lib.infer_dtype(values, skipna=False)
222233
if inferred in ["mixed", "string", "mixed-integer"]:
223234
# "mixed-integer" to ensure we do not cast ["ss", 42] to str GH#22160
@@ -357,7 +368,7 @@ def unique(values):
357368
dtype='datetime64[ns, US/Eastern]',
358369
freq=None)
359370
360-
>>> pd.unique(list("baabc"))
371+
>>> pd.unique(np.array(list("baabc"), dtype="O"))
361372
array(['b', 'a', 'c'], dtype=object)
362373
363374
An unordered Categorical will return categories in the
@@ -383,7 +394,7 @@ def unique(values):
383394
384395
An array of tuples
385396
386-
>>> pd.unique([("a", "b"), ("b", "a"), ("a", "c"), ("b", "a")])
397+
>>> pd.unique(pd.Series([("a", "b"), ("b", "a"), ("a", "c"), ("b", "a")]).values)
387398
array([('a', 'b'), ('b', 'a'), ('a', 'c')], dtype=object)
388399
"""
389400
return unique_with_mask(values)
@@ -414,7 +425,7 @@ def nunique_ints(values: ArrayLike) -> int:
414425

415426
def unique_with_mask(values, mask: npt.NDArray[np.bool_] | None = None):
416427
"""See algorithms.unique for docs. Takes a mask for masked arrays."""
417-
values = _ensure_arraylike(values)
428+
values = _ensure_arraylike(values, func_name="unique")
418429

419430
if isinstance(values.dtype, ExtensionDtype):
420431
# Dispatch to extension dtype's unique.
@@ -466,7 +477,7 @@ def isin(comps: ListLike, values: ListLike) -> npt.NDArray[np.bool_]:
466477

467478
if not isinstance(values, (ABCIndex, ABCSeries, ABCExtensionArray, np.ndarray)):
468479
orig_values = list(values)
469-
values = _ensure_arraylike(orig_values)
480+
values = _ensure_arraylike(orig_values, func_name="isin-targets")
470481

471482
if (
472483
len(values) > 0
@@ -483,7 +494,7 @@ def isin(comps: ListLike, values: ListLike) -> npt.NDArray[np.bool_]:
483494
else:
484495
values = extract_array(values, extract_numpy=True, extract_range=True)
485496

486-
comps_array = _ensure_arraylike(comps)
497+
comps_array = _ensure_arraylike(comps, func_name="isin")
487498
comps_array = extract_array(comps_array, extract_numpy=True)
488499
if not isinstance(comps_array, np.ndarray):
489500
# i.e. Extension Array
@@ -669,7 +680,7 @@ def factorize(
669680
``pd.factorize(values)``. The results are identical for methods like
670681
:meth:`Series.factorize`.
671682
672-
>>> codes, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'])
683+
>>> codes, uniques = pd.factorize(np.array(['b', 'b', 'a', 'c', 'b'], dtype="O"))
673684
>>> codes
674685
array([0, 0, 1, 2, 0])
675686
>>> uniques
@@ -678,7 +689,8 @@ def factorize(
678689
With ``sort=True``, the `uniques` will be sorted, and `codes` will be
679690
shuffled so that the relationship is maintained.
680691
681-
>>> codes, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'], sort=True)
692+
>>> codes, uniques = pd.factorize(np.array(['b', 'b', 'a', 'c', 'b'], dtype="O"),
693+
... sort=True)
682694
>>> codes
683695
array([1, 1, 0, 2, 1])
684696
>>> uniques
@@ -688,7 +700,7 @@ def factorize(
688700
the `codes` with the sentinel value ``-1`` and missing values are not
689701
included in `uniques`.
690702
691-
>>> codes, uniques = pd.factorize(['b', None, 'a', 'c', 'b'])
703+
>>> codes, uniques = pd.factorize(np.array(['b', None, 'a', 'c', 'b'], dtype="O"))
692704
>>> codes
693705
array([ 0, -1, 1, 2, 0])
694706
>>> uniques
@@ -746,7 +758,7 @@ def factorize(
746758
if isinstance(values, (ABCIndex, ABCSeries)):
747759
return values.factorize(sort=sort, use_na_sentinel=use_na_sentinel)
748760

749-
values = _ensure_arraylike(values)
761+
values = _ensure_arraylike(values, func_name="factorize")
750762
original = values
751763

752764
if (
@@ -880,7 +892,7 @@ def value_counts(
880892
counts = result._values
881893

882894
else:
883-
values = _ensure_arraylike(values)
895+
values = _ensure_arraylike(values, func_name="value_counts")
884896
keys, counts = value_counts_arraylike(values, dropna)
885897
if keys.dtype == np.float16:
886898
keys = keys.astype(np.float32)
@@ -981,7 +993,7 @@ def mode(
981993
-------
982994
np.ndarray or ExtensionArray
983995
"""
984-
values = _ensure_arraylike(values)
996+
values = _ensure_arraylike(values, func_name="mode")
985997
original = values
986998

987999
if needs_i8_conversion(values.dtype):

pandas/core/arrays/categorical.py

+3
Original file line numberDiff line numberDiff line change
@@ -439,6 +439,9 @@ def __init__(
439439
values = arr
440440

441441
if dtype.categories is None:
442+
if not isinstance(values, ABCIndex):
443+
# in particular RangeIndex xref test_index_equal_range_categories
444+
values = sanitize_array(values, None)
442445
try:
443446
codes, categories = factorize(values, sort=True)
444447
except TypeError as err:

pandas/core/indexes/multi.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@
7373
from pandas.core.dtypes.generic import (
7474
ABCDataFrame,
7575
ABCDatetimeIndex,
76+
ABCSeries,
7677
ABCTimedeltaIndex,
7778
)
7879
from pandas.core.dtypes.inference import is_array_like
@@ -83,9 +84,13 @@
8384

8485
import pandas.core.algorithms as algos
8586
from pandas.core.array_algos.putmask import validate_putmask
86-
from pandas.core.arrays import Categorical
87+
from pandas.core.arrays import (
88+
Categorical,
89+
ExtensionArray,
90+
)
8791
from pandas.core.arrays.categorical import factorize_from_iterables
8892
import pandas.core.common as com
93+
from pandas.core.construction import sanitize_array
8994
import pandas.core.indexes.base as ibase
9095
from pandas.core.indexes.base import (
9196
Index,
@@ -3404,6 +3409,8 @@ def _reorder_indexer(
34043409
new_order = np.arange(n)[indexer]
34053410
elif is_list_like(k):
34063411
# Generate a map with all level codes as sorted initially
3412+
if not isinstance(k, (np.ndarray, ExtensionArray, Index, ABCSeries)):
3413+
k = sanitize_array(k, None)
34073414
k = algos.unique(k)
34083415
key_order_map = np.ones(len(self.levels[i]), dtype=np.uint64) * len(
34093416
self.levels[i]

pandas/core/reshape/tile.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -651,7 +651,7 @@ def _infer_precision(base_precision: int, bins) -> int:
651651
Infer an appropriate precision for _round_frac
652652
"""
653653
for precision in range(base_precision, 20):
654-
levels = [_round_frac(b, precision) for b in bins]
654+
levels = np.asarray([_round_frac(b, precision) for b in bins])
655655
if algos.unique(levels).size == bins.size:
656656
return precision
657657
return base_precision # default

pandas/core/tools/datetimes.py

+3
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,9 @@ def _maybe_cache(
243243
if not should_cache(arg):
244244
return cache_array
245245

246+
if not isinstance(arg, (np.ndarray, ExtensionArray, Index, ABCSeries)):
247+
arg = np.array(arg)
248+
246249
unique_dates = unique(arg)
247250
if len(unique_dates) < len(arg):
248251
cache_dates = convert_listlike(unique_dates, format)

pandas/tests/indexes/period/methods/test_factorize.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -39,10 +39,12 @@ def test_factorize(self):
3939
tm.assert_numpy_array_equal(arr, exp_arr)
4040
tm.assert_index_equal(idx, exp_idx)
4141

42-
def test_factorize_complex(self):
42+
def test_factorize_complex(self): # TODO: why is this test in the period factorize tests?
4343
# GH 17927
4444
array = [1, 2, 2 + 1j]
45-
labels, uniques = factorize(array)
45+
msg = "factorize with argument that is not not a Series"
46+
with tm.assert_produces_warning(FutureWarning, match=msg):
47+
labels, uniques = factorize(array)
4648

4749
expected_labels = np.array([0, 1, 2], dtype=np.intp)
4850
tm.assert_numpy_array_equal(labels, expected_labels)

pandas/tests/libs/test_hashtable.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -721,14 +721,17 @@ def test_ismember_tuple_with_nans():
721721
# GH-41836
722722
values = [("a", float("nan")), ("b", 1)]
723723
comps = [("a", float("nan"))]
724-
result = isin(values, comps)
724+
725+
msg = "isin with argument that is not not a Series"
726+
with tm.assert_produces_warning(FutureWarning, match=msg):
727+
result = isin(values, comps)
725728
expected = np.array([True, False], dtype=np.bool_)
726729
tm.assert_numpy_array_equal(result, expected)
727730

728731

729732
def test_float_complex_int_are_equal_as_objects():
730733
values = ["a", 5, 5.0, 5.0 + 0j]
731734
comps = list(range(129))
732-
result = isin(values, comps)
735+
result = isin(np.array(values, dtype=object), np.asarray(comps))
733736
expected = np.array([False, True, True, True], dtype=np.bool_)
734737
tm.assert_numpy_array_equal(result, expected)

pandas/tests/reshape/test_cut.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -393,7 +393,7 @@ def test_cut_duplicates_bin(kwargs, msg):
393393
cut(values, bins, **kwargs)
394394
else:
395395
result = cut(values, bins, **kwargs)
396-
expected = cut(values, pd.unique(bins))
396+
expected = cut(values, pd.unique(np.asarray(bins)))
397397
tm.assert_series_equal(result, expected)
398398

399399

0 commit comments

Comments
 (0)