Skip to content

DEPR: accepting non-standard sequences in core.algorithms functions #52986

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Jun 1, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,7 @@ Deprecations
- Deprecated :meth:`DataFrame.swapaxes` and :meth:`Series.swapaxes`, use :meth:`DataFrame.transpose` or :meth:`Series.transpose` instead (:issue:`51946`)
- Deprecated ``freq`` parameter in :class:`PeriodArray` constructor, pass ``dtype`` instead (:issue:`52462`)
- Deprecated allowing non-standard inputs in :func:`take`, pass either a ``numpy.ndarray``, :class:`ExtensionArray`, :class:`Index`, or :class:`Series` (:issue:`52981`)
- Deprecated allowing non-standard sequences for :func:`isin`, :func:`value_counts`, :func:`unique`, :func:`factorize`, cast to one of ``numpy.ndarray``, :class:`Index`, :class:`ExtensionArray`, or :class:`Series` before calling (:issue:`52986`)
- Deprecated behavior of :class:`DataFrame` reductions ``sum``, ``prod``, ``std``, ``var``, ``sem`` with ``axis=None``, in a future version this will operate over both axes returning a scalar instead of behaving like ``axis=0``; note this also affects numpy functions e.g. ``np.sum(df)`` (:issue:`21597`)
- Deprecated behavior of :func:`concat` when :class:`DataFrame` has columns that are all-NA, in a future version these will not be discarded when determining the resulting dtype (:issue:`40893`)
- Deprecated behavior of :meth:`Series.dt.to_pydatetime`, in a future version this will return a :class:`Series` containing python ``datetime`` objects instead of an ``ndarray`` of datetimes; this matches the behavior of other :meth:`Series.dt` properties (:issue:`20306`)
Expand Down
38 changes: 25 additions & 13 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,11 +212,22 @@ def _reconstruct_data(
return values


def _ensure_arraylike(values) -> ArrayLike:
def _ensure_arraylike(values, func_name: str) -> ArrayLike:
"""
ensure that we are arraylike if not already
"""
if not is_array_like(values):
if not isinstance(values, (ABCIndex, ABCSeries, ABCExtensionArray, np.ndarray)):
# GH#52986
if func_name != "isin-targets":
# Make an exception for the comps argument in isin.
warnings.warn(
f"{func_name} with argument that is not not a Series, Index, "
"ExtensionArray, or np.ndarray is deprecated and will raise in a "
"future version.",
FutureWarning,
stacklevel=find_stack_level(),
)

inferred = lib.infer_dtype(values, skipna=False)
if inferred in ["mixed", "string", "mixed-integer"]:
# "mixed-integer" to ensure we do not cast ["ss", 42] to str GH#22160
Expand Down Expand Up @@ -356,7 +367,7 @@ def unique(values):
dtype='datetime64[ns, US/Eastern]',
freq=None)

>>> pd.unique(list("baabc"))
>>> pd.unique(np.array(list("baabc"), dtype="O"))
array(['b', 'a', 'c'], dtype=object)

An unordered Categorical will return categories in the
Expand All @@ -382,7 +393,7 @@ def unique(values):

An array of tuples

>>> pd.unique([("a", "b"), ("b", "a"), ("a", "c"), ("b", "a")])
>>> pd.unique(pd.Series([("a", "b"), ("b", "a"), ("a", "c"), ("b", "a")]).values)
array([('a', 'b'), ('b', 'a'), ('a', 'c')], dtype=object)
"""
return unique_with_mask(values)
Expand Down Expand Up @@ -413,7 +424,7 @@ def nunique_ints(values: ArrayLike) -> int:

def unique_with_mask(values, mask: npt.NDArray[np.bool_] | None = None):
"""See algorithms.unique for docs. Takes a mask for masked arrays."""
values = _ensure_arraylike(values)
values = _ensure_arraylike(values, func_name="unique")

if isinstance(values.dtype, ExtensionDtype):
# Dispatch to extension dtype's unique.
Expand Down Expand Up @@ -465,7 +476,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> npt.NDArray[np.bool_]:

if not isinstance(values, (ABCIndex, ABCSeries, ABCExtensionArray, np.ndarray)):
orig_values = list(values)
values = _ensure_arraylike(orig_values)
values = _ensure_arraylike(orig_values, func_name="isin-targets")

if (
len(values) > 0
Expand All @@ -482,7 +493,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> npt.NDArray[np.bool_]:
else:
values = extract_array(values, extract_numpy=True, extract_range=True)

comps_array = _ensure_arraylike(comps)
comps_array = _ensure_arraylike(comps, func_name="isin")
comps_array = extract_array(comps_array, extract_numpy=True)
if not isinstance(comps_array, np.ndarray):
# i.e. Extension Array
Expand Down Expand Up @@ -668,7 +679,7 @@ def factorize(
``pd.factorize(values)``. The results are identical for methods like
:meth:`Series.factorize`.

>>> codes, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'])
>>> codes, uniques = pd.factorize(np.array(['b', 'b', 'a', 'c', 'b'], dtype="O"))
>>> codes
array([0, 0, 1, 2, 0])
>>> uniques
Expand All @@ -677,7 +688,8 @@ def factorize(
With ``sort=True``, the `uniques` will be sorted, and `codes` will be
shuffled so that the relationship is the maintained.

>>> codes, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'], sort=True)
>>> codes, uniques = pd.factorize(np.array(['b', 'b', 'a', 'c', 'b'], dtype="O"),
... sort=True)
>>> codes
array([1, 1, 0, 2, 1])
>>> uniques
Expand All @@ -687,7 +699,7 @@ def factorize(
the `codes` with the sentinel value ``-1`` and missing values are not
included in `uniques`.

>>> codes, uniques = pd.factorize(['b', None, 'a', 'c', 'b'])
>>> codes, uniques = pd.factorize(np.array(['b', None, 'a', 'c', 'b'], dtype="O"))
>>> codes
array([ 0, -1, 1, 2, 0])
>>> uniques
Expand Down Expand Up @@ -745,7 +757,7 @@ def factorize(
if isinstance(values, (ABCIndex, ABCSeries)):
return values.factorize(sort=sort, use_na_sentinel=use_na_sentinel)

values = _ensure_arraylike(values)
values = _ensure_arraylike(values, func_name="factorize")
original = values

if (
Expand Down Expand Up @@ -879,7 +891,7 @@ def value_counts(
counts = result._values

else:
values = _ensure_arraylike(values)
values = _ensure_arraylike(values, func_name="value_counts")
keys, counts = value_counts_arraylike(values, dropna)
if keys.dtype == np.float16:
keys = keys.astype(np.float32)
Expand Down Expand Up @@ -980,7 +992,7 @@ def mode(
-------
np.ndarray or ExtensionArray
"""
values = _ensure_arraylike(values)
values = _ensure_arraylike(values, func_name="mode")
original = values

if needs_i8_conversion(values.dtype):
Expand Down
3 changes: 3 additions & 0 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -439,6 +439,9 @@ def __init__(
values = arr

if dtype.categories is None:
if not isinstance(values, ABCIndex):
# in particular RangeIndex xref test_index_equal_range_categories
values = sanitize_array(values, None)
try:
codes, categories = factorize(values, sort=True)
except TypeError as err:
Expand Down
9 changes: 8 additions & 1 deletion pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@
from pandas.core.dtypes.generic import (
ABCDataFrame,
ABCDatetimeIndex,
ABCSeries,
ABCTimedeltaIndex,
)
from pandas.core.dtypes.inference import is_array_like
Expand All @@ -83,9 +84,13 @@

import pandas.core.algorithms as algos
from pandas.core.array_algos.putmask import validate_putmask
from pandas.core.arrays import Categorical
from pandas.core.arrays import (
Categorical,
ExtensionArray,
)
from pandas.core.arrays.categorical import factorize_from_iterables
import pandas.core.common as com
from pandas.core.construction import sanitize_array
import pandas.core.indexes.base as ibase
from pandas.core.indexes.base import (
Index,
Expand Down Expand Up @@ -3404,6 +3409,8 @@ def _reorder_indexer(
new_order = np.arange(n)[indexer]
elif is_list_like(k):
# Generate a map with all level codes as sorted initially
if not isinstance(k, (np.ndarray, ExtensionArray, Index, ABCSeries)):
k = sanitize_array(k, None)
k = algos.unique(k)
key_order_map = np.ones(len(self.levels[i]), dtype=np.uint64) * len(
self.levels[i]
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/reshape/tile.py
Original file line number Diff line number Diff line change
def _infer_precision(base_precision: int, bins) -> int:
    """
    Infer an appropriate precision for _round_frac.

    Scans precisions from ``base_precision`` upward and returns the first
    one at which rounding the bin edges keeps them all distinct; falls back
    to ``base_precision`` if none below 20 does.
    """
    for candidate in range(base_precision, 20):
        rounded_edges = np.asarray([_round_frac(edge, candidate) for edge in bins])
        # Rounding must not collapse any two bin edges into one value.
        if algos.unique(rounded_edges).size == bins.size:
            return candidate
    # No precision below 20 keeps every edge distinct; use the default.
    return base_precision
3 changes: 3 additions & 0 deletions pandas/core/tools/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,9 @@ def _maybe_cache(
if not should_cache(arg):
return cache_array

if not isinstance(arg, (np.ndarray, ExtensionArray, Index, ABCSeries)):
arg = np.array(arg)

unique_dates = unique(arg)
if len(unique_dates) < len(arg):
cache_dates = convert_listlike(unique_dates, format)
Expand Down
6 changes: 4 additions & 2 deletions pandas/tests/indexes/period/methods/test_factorize.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,12 @@ def test_factorize(self):
tm.assert_numpy_array_equal(arr, exp_arr)
tm.assert_index_equal(idx, exp_idx)

def test_factorize_complex(self):
def test_factorize_complex(self): # TODO: WTF is this test doing here?s
# GH 17927
array = [1, 2, 2 + 1j]
labels, uniques = factorize(array)
msg = "factorize with argument that is not not a Series"
with tm.assert_produces_warning(FutureWarning, match=msg):
labels, uniques = factorize(array)

expected_labels = np.array([0, 1, 2], dtype=np.intp)
tm.assert_numpy_array_equal(labels, expected_labels)
Expand Down
7 changes: 5 additions & 2 deletions pandas/tests/libs/test_hashtable.py
Original file line number Diff line number Diff line change
def test_ismember_tuple_with_nans():
    # GH-41836: a tuple containing NaN must still be matched by isin.
    haystack = [("a", float("nan")), ("b", 1)]
    needles = [("a", float("nan"))]

    # GH#52986: passing plain lists to isin is deprecated.
    depr_msg = "isin with argument that is not not a Series"
    with tm.assert_produces_warning(FutureWarning, match=depr_msg):
        outcome = isin(haystack, needles)
    tm.assert_numpy_array_equal(outcome, np.array([True, False], dtype=np.bool_))


def test_float_complex_int_are_equal_as_objects():
    # int 5, float 5.0, and complex 5+0j all compare equal under object
    # equality, so each should be found in range(129); the string should not.
    candidates = np.array(["a", 5, 5.0, 5.0 + 0j], dtype=object)
    targets = np.asarray(list(range(129)))
    outcome = isin(candidates, targets)
    tm.assert_numpy_array_equal(
        outcome, np.array([False, True, True, True], dtype=np.bool_)
    )
2 changes: 1 addition & 1 deletion pandas/tests/reshape/test_cut.py
Original file line number Diff line number Diff line change
Expand Up @@ -393,7 +393,7 @@ def test_cut_duplicates_bin(kwargs, msg):
cut(values, bins, **kwargs)
else:
result = cut(values, bins, **kwargs)
expected = cut(values, pd.unique(bins))
expected = cut(values, pd.unique(np.asarray(bins)))
tm.assert_series_equal(result, expected)


Expand Down
Loading