diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 5c181d02e0102..eb72f5b8d9e9b 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -252,6 +252,7 @@ Deprecations - Deprecated :meth:`DataFrame.swapaxes` and :meth:`Series.swapaxes`, use :meth:`DataFrame.transpose` or :meth:`Series.transpose` instead (:issue:`51946`) - Deprecated ``freq`` parameter in :class:`PeriodArray` constructor, pass ``dtype`` instead (:issue:`52462`) - Deprecated allowing non-standard inputs in :func:`take`, pass either a ``numpy.ndarray``, :class:`ExtensionArray`, :class:`Index`, or :class:`Series` (:issue:`52981`) +- Deprecated allowing non-standard sequences for :func:`isin`, :func:`value_counts`, :func:`unique`, :func:`factorize`, cast to one of ``numpy.ndarray``, :class:`Index`, :class:`ExtensionArray`, or :class:`Series` before calling (:issue:`52986`) - Deprecated behavior of :class:`DataFrame` reductions ``sum``, ``prod``, ``std``, ``var``, ``sem`` with ``axis=None``, in a future version this will operate over both axes returning a scalar instead of behaving like ``axis=0``; note this also affects numpy functions e.g. 
``np.sum(df)`` (:issue:`21597`) - Deprecated behavior of :func:`concat` when :class:`DataFrame` has columns that are all-NA, in a future version these will not be discarded when determining the resulting dtype (:issue:`40893`) - Deprecated behavior of :meth:`Series.dt.to_pydatetime`, in a future version this will return a :class:`Series` containing python ``datetime`` objects instead of an ``ndarray`` of datetimes; this matches the behavior of other :meth:`Series.dt` properties (:issue:`20306`) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 33bd56e5b1068..f1bc1b6e9561f 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -212,11 +212,22 @@ def _reconstruct_data( return values -def _ensure_arraylike(values) -> ArrayLike: +def _ensure_arraylike(values, func_name: str) -> ArrayLike: """ ensure that we are arraylike if not already """ - if not is_array_like(values): + if not isinstance(values, (ABCIndex, ABCSeries, ABCExtensionArray, np.ndarray)): + # GH#52986 + if func_name != "isin-targets": + # Make an exception for the comps argument in isin. 
+ warnings.warn( + f"{func_name} with argument that is not not a Series, Index, " + "ExtensionArray, or np.ndarray is deprecated and will raise in a " + "future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) + inferred = lib.infer_dtype(values, skipna=False) if inferred in ["mixed", "string", "mixed-integer"]: # "mixed-integer" to ensure we do not cast ["ss", 42] to str GH#22160 @@ -356,7 +367,7 @@ def unique(values): dtype='datetime64[ns, US/Eastern]', freq=None) - >>> pd.unique(list("baabc")) + >>> pd.unique(np.array(list("baabc"), dtype="O")) array(['b', 'a', 'c'], dtype=object) An unordered Categorical will return categories in the @@ -382,7 +393,7 @@ def unique(values): An array of tuples - >>> pd.unique([("a", "b"), ("b", "a"), ("a", "c"), ("b", "a")]) + >>> pd.unique(pd.Series([("a", "b"), ("b", "a"), ("a", "c"), ("b", "a")]).values) array([('a', 'b'), ('b', 'a'), ('a', 'c')], dtype=object) """ return unique_with_mask(values) @@ -413,7 +424,7 @@ def nunique_ints(values: ArrayLike) -> int: def unique_with_mask(values, mask: npt.NDArray[np.bool_] | None = None): """See algorithms.unique for docs. Takes a mask for masked arrays.""" - values = _ensure_arraylike(values) + values = _ensure_arraylike(values, func_name="unique") if isinstance(values.dtype, ExtensionDtype): # Dispatch to extension dtype's unique. 
@@ -465,7 +476,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> npt.NDArray[np.bool_]: if not isinstance(values, (ABCIndex, ABCSeries, ABCExtensionArray, np.ndarray)): orig_values = list(values) - values = _ensure_arraylike(orig_values) + values = _ensure_arraylike(orig_values, func_name="isin-targets") if ( len(values) > 0 @@ -482,7 +493,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> npt.NDArray[np.bool_]: else: values = extract_array(values, extract_numpy=True, extract_range=True) - comps_array = _ensure_arraylike(comps) + comps_array = _ensure_arraylike(comps, func_name="isin") comps_array = extract_array(comps_array, extract_numpy=True) if not isinstance(comps_array, np.ndarray): # i.e. Extension Array @@ -668,7 +679,7 @@ def factorize( ``pd.factorize(values)``. The results are identical for methods like :meth:`Series.factorize`. - >>> codes, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b']) + >>> codes, uniques = pd.factorize(np.array(['b', 'b', 'a', 'c', 'b'], dtype="O")) >>> codes array([0, 0, 1, 2, 0]) >>> uniques @@ -677,7 +688,8 @@ def factorize( With ``sort=True``, the `uniques` will be sorted, and `codes` will be shuffled so that the relationship is the maintained. - >>> codes, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'], sort=True) + >>> codes, uniques = pd.factorize(np.array(['b', 'b', 'a', 'c', 'b'], dtype="O"), + ... sort=True) >>> codes array([1, 1, 0, 2, 1]) >>> uniques @@ -687,7 +699,7 @@ def factorize( the `codes` with the sentinel value ``-1`` and missing values are not included in `uniques`. 
- >>> codes, uniques = pd.factorize(['b', None, 'a', 'c', 'b']) + >>> codes, uniques = pd.factorize(np.array(['b', None, 'a', 'c', 'b'], dtype="O")) >>> codes array([ 0, -1, 1, 2, 0]) >>> uniques @@ -745,7 +757,7 @@ def factorize( if isinstance(values, (ABCIndex, ABCSeries)): return values.factorize(sort=sort, use_na_sentinel=use_na_sentinel) - values = _ensure_arraylike(values) + values = _ensure_arraylike(values, func_name="factorize") original = values if ( @@ -879,7 +891,7 @@ def value_counts( counts = result._values else: - values = _ensure_arraylike(values) + values = _ensure_arraylike(values, func_name="value_counts") keys, counts = value_counts_arraylike(values, dropna) if keys.dtype == np.float16: keys = keys.astype(np.float32) @@ -980,7 +992,7 @@ def mode( ------- np.ndarray or ExtensionArray """ - values = _ensure_arraylike(values) + values = _ensure_arraylike(values, func_name="mode") original = values if needs_i8_conversion(values.dtype): diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 6eb21fae29612..e9f8eb9c3f23f 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -439,6 +439,9 @@ def __init__( values = arr if dtype.categories is None: + if not isinstance(values, ABCIndex): + # in particular RangeIndex xref test_index_equal_range_categories + values = sanitize_array(values, None) try: codes, categories = factorize(values, sort=True) except TypeError as err: diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 7a151cb811cbd..a1c240f72a28b 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -73,6 +73,7 @@ from pandas.core.dtypes.generic import ( ABCDataFrame, ABCDatetimeIndex, + ABCSeries, ABCTimedeltaIndex, ) from pandas.core.dtypes.inference import is_array_like @@ -83,9 +84,13 @@ import pandas.core.algorithms as algos from pandas.core.array_algos.putmask import validate_putmask -from pandas.core.arrays import 
Categorical +from pandas.core.arrays import ( + Categorical, + ExtensionArray, +) from pandas.core.arrays.categorical import factorize_from_iterables import pandas.core.common as com +from pandas.core.construction import sanitize_array import pandas.core.indexes.base as ibase from pandas.core.indexes.base import ( Index, @@ -3404,6 +3409,8 @@ def _reorder_indexer( new_order = np.arange(n)[indexer] elif is_list_like(k): # Generate a map with all level codes as sorted initially + if not isinstance(k, (np.ndarray, ExtensionArray, Index, ABCSeries)): + k = sanitize_array(k, None) k = algos.unique(k) key_order_map = np.ones(len(self.levels[i]), dtype=np.uint64) * len( self.levels[i] diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 83d004c8b8e3e..43eea7c669ce7 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -651,7 +651,7 @@ def _infer_precision(base_precision: int, bins) -> int: Infer an appropriate precision for _round_frac """ for precision in range(base_precision, 20): - levels = [_round_frac(b, precision) for b in bins] + levels = np.asarray([_round_frac(b, precision) for b in bins]) if algos.unique(levels).size == bins.size: return precision return base_precision # default diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 74210a1ce5ad8..dec0331bdc9cd 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -243,6 +243,9 @@ def _maybe_cache( if not should_cache(arg): return cache_array + if not isinstance(arg, (np.ndarray, ExtensionArray, Index, ABCSeries)): + arg = np.array(arg) + unique_dates = unique(arg) if len(unique_dates) < len(arg): cache_dates = convert_listlike(unique_dates, format) diff --git a/pandas/tests/indexes/period/methods/test_factorize.py b/pandas/tests/indexes/period/methods/test_factorize.py index 9e297d6caca27..7705da02bb7d2 100644 --- a/pandas/tests/indexes/period/methods/test_factorize.py +++ 
b/pandas/tests/indexes/period/methods/test_factorize.py @@ -39,10 +39,12 @@ def test_factorize(self): tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) - def test_factorize_complex(self): + def test_factorize_complex(self): # TODO: does this test belong in this module? # GH 17927 array = [1, 2, 2 + 1j] - labels, uniques = factorize(array) + msg = "factorize with argument that is not not a Series" + with tm.assert_produces_warning(FutureWarning, match=msg): + labels, uniques = factorize(array) expected_labels = np.array([0, 1, 2], dtype=np.intp) tm.assert_numpy_array_equal(labels, expected_labels) diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index 0685897e723a2..df3f74fa9bc7c 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -721,7 +721,10 @@ def test_ismember_tuple_with_nans(): # GH-41836 values = [("a", float("nan")), ("b", 1)] comps = [("a", float("nan"))] - result = isin(values, comps) + + msg = "isin with argument that is not not a Series" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = isin(values, comps) expected = np.array([True, False], dtype=np.bool_) tm.assert_numpy_array_equal(result, expected) @@ -729,6 +732,6 @@ def test_ismember_tuple_with_nans(): def test_float_complex_int_are_equal_as_objects(): values = ["a", 5, 5.0, 5.0 + 0j] comps = list(range(129)) - result = isin(values, comps) + result = isin(np.array(values, dtype=object), np.asarray(comps)) expected = np.array([False, True, True, True], dtype=np.bool_) tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index e47e36ab55f10..28a696be53e7f 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -393,7 +393,7 @@ def test_cut_duplicates_bin(kwargs, msg): cut(values, bins, **kwargs) else: result = cut(values, bins, **kwargs) - expected = cut(values, 
pd.unique(bins)) + expected = cut(values, pd.unique(np.asarray(bins))) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index af2cdb8cac9e8..6d09488df06e2 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -94,12 +94,11 @@ def test_series_factorize_use_na_sentinel_false(self): tm.assert_index_equal(uniques, expected_uniques) def test_basic(self): - codes, uniques = algos.factorize(["a", "b", "b", "a", "a", "c", "c", "c"]) + items = np.array(["a", "b", "b", "a", "a", "c", "c", "c"], dtype=object) + codes, uniques = algos.factorize(items) tm.assert_numpy_array_equal(uniques, np.array(["a", "b", "c"], dtype=object)) - codes, uniques = algos.factorize( - ["a", "b", "b", "a", "a", "c", "c", "c"], sort=True - ) + codes, uniques = algos.factorize(items, sort=True) exp = np.array([0, 1, 1, 0, 0, 2, 2, 2], dtype=np.intp) tm.assert_numpy_array_equal(codes, exp) exp = np.array(["a", "b", "c"], dtype=object) @@ -249,7 +248,9 @@ def test_factorizer_object_with_nan(self): ) def test_factorize_tuple_list(self, data, expected_codes, expected_uniques): # GH9454 - codes, uniques = pd.factorize(data) + msg = "factorize with argument that is not not a Series" + with tm.assert_produces_warning(FutureWarning, match=msg): + codes, uniques = pd.factorize(data) tm.assert_numpy_array_equal(codes, np.array(expected_codes, dtype=np.intp)) @@ -462,7 +463,9 @@ def test_factorize_use_na_sentinel(self, sort, data, uniques): def test_object_factorize_use_na_sentinel_false( self, data, expected_codes, expected_uniques ): - codes, uniques = algos.factorize(data, use_na_sentinel=False) + codes, uniques = algos.factorize( + np.array(data, dtype=object), use_na_sentinel=False + ) tm.assert_numpy_array_equal(uniques, expected_uniques, strict_nan=True) tm.assert_numpy_array_equal(codes, expected_codes, strict_nan=True) @@ -485,7 +488,9 @@ def test_object_factorize_use_na_sentinel_false( def 
test_int_factorize_use_na_sentinel_false( self, data, expected_codes, expected_uniques ): - codes, uniques = algos.factorize(data, use_na_sentinel=False) + msg = "factorize with argument that is not not a Series" + with tm.assert_produces_warning(FutureWarning, match=msg): + codes, uniques = algos.factorize(data, use_na_sentinel=False) tm.assert_numpy_array_equal(uniques, expected_uniques, strict_nan=True) tm.assert_numpy_array_equal(codes, expected_codes, strict_nan=True) @@ -532,8 +537,10 @@ def test_objects(self): def test_object_refcount_bug(self): lst = ["A", "B", "C", "D", "E"] - for i in range(1000): - len(algos.unique(lst)) + msg = "unique with argument that is not not a Series" + with tm.assert_produces_warning(FutureWarning, match=msg): + for i in range(1000): + len(algos.unique(lst)) def test_on_index_object(self): mindex = MultiIndex.from_arrays( @@ -655,7 +662,7 @@ def test_uint64_overflow(self): def test_nan_in_object_array(self): duplicated_items = ["a", np.nan, "c", "c"] - result = pd.unique(duplicated_items) + result = pd.unique(np.array(duplicated_items, dtype=object)) expected = np.array(["a", np.nan, "c"], dtype=object) tm.assert_numpy_array_equal(result, expected) @@ -782,7 +789,9 @@ def test_order_of_appearance(self): ) tm.assert_index_equal(result, expected) - result = pd.unique(list("aabc")) + msg = "unique with argument that is not not a Series, Index," + with tm.assert_produces_warning(FutureWarning, match=msg): + result = pd.unique(list("aabc")) expected = np.array(["a", "b", "c"], dtype=object) tm.assert_numpy_array_equal(result, expected) @@ -799,7 +808,9 @@ def test_order_of_appearance(self): ) def test_tuple_with_strings(self, arg, expected): # see GH 17108 - result = pd.unique(arg) + msg = "unique with argument that is not not a Series" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = pd.unique(arg) tm.assert_numpy_array_equal(result, expected) def test_obj_none_preservation(self): @@ -886,7 +897,9 @@ def 
test_invalid(self): algos.isin([1], 1) def test_basic(self): - result = algos.isin([1, 2], [1]) + msg = "isin with argument that is not not a Series" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = algos.isin([1, 2], [1]) expected = np.array([True, False]) tm.assert_numpy_array_equal(result, expected) @@ -906,7 +919,8 @@ def test_basic(self): expected = np.array([True, False]) tm.assert_numpy_array_equal(result, expected) - result = algos.isin(["a", "b"], ["a"]) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = algos.isin(["a", "b"], ["a"]) expected = np.array([True, False]) tm.assert_numpy_array_equal(result, expected) @@ -918,7 +932,8 @@ def test_basic(self): expected = np.array([True, False]) tm.assert_numpy_array_equal(result, expected) - result = algos.isin(["a", "b"], [1]) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = algos.isin(["a", "b"], [1]) expected = np.array([False, False]) tm.assert_numpy_array_equal(result, expected) @@ -1003,18 +1018,20 @@ def test_same_nan_is_in(self): # nan is special, because from " a is b" doesn't follow "a == b" # at least, isin() should follow python's "np.nan in [nan] == True" # casting to -> np.float64 -> another float-object somewhere on - # the way could lead jepardize this behavior + # the way could jeopardize this behavior comps = [np.nan] # could be casted to float64 values = [np.nan] expected = np.array([True]) - result = algos.isin(comps, values) + msg = "isin with argument that is not not a Series" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = algos.isin(comps, values) tm.assert_numpy_array_equal(expected, result) def test_same_nan_is_in_large(self): # https://github.com/pandas-dev/pandas/issues/22205 s = np.tile(1.0, 1_000_001) s[0] = np.nan - result = algos.isin(s, [np.nan, 1]) + result = algos.isin(s, np.array([np.nan, 1])) expected = np.ones(len(s), dtype=bool) tm.assert_numpy_array_equal(result, 
expected) @@ -1023,7 +1040,7 @@ def test_same_nan_is_in_large_series(self): s = np.tile(1.0, 1_000_001) series = Series(s) s[0] = np.nan - result = series.isin([np.nan, 1]) + result = series.isin(np.array([np.nan, 1])) expected = Series(np.ones(len(s), dtype=bool)) tm.assert_series_equal(result, expected) @@ -1041,10 +1058,13 @@ def __hash__(self): return 0 a, b = LikeNan(), LikeNan() - # same object -> True - tm.assert_numpy_array_equal(algos.isin([a], [a]), np.array([True])) - # different objects -> False - tm.assert_numpy_array_equal(algos.isin([a], [b]), np.array([False])) + + msg = "isin with argument that is not not a Series" + with tm.assert_produces_warning(FutureWarning, match=msg): + # same object -> True + tm.assert_numpy_array_equal(algos.isin([a], [a]), np.array([True])) + # different objects -> False + tm.assert_numpy_array_equal(algos.isin([a], [b]), np.array([False])) def test_different_nans(self): # GH 22160 @@ -1055,7 +1075,7 @@ def test_different_nans(self): assert comps[0] is not values[0] # different nan-objects # as list of python-objects: - result = algos.isin(comps, values) + result = algos.isin(np.array(comps), values) tm.assert_numpy_array_equal(np.array([True]), result) # as object-array: @@ -1076,7 +1096,9 @@ def test_no_cast(self): comps = ["ss", 42] values = ["42"] expected = np.array([False, False]) - result = algos.isin(comps, values) + msg = "isin with argument that is not not a Series, Index" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = algos.isin(comps, values) tm.assert_numpy_array_equal(expected, result) @pytest.mark.parametrize("empty", [[], Series(dtype=object), np.array([])]) @@ -1130,7 +1152,7 @@ def test_isin_nan_df_string_search(self): """Comparing df with nan value (np.nan,2) with a string at isin() ("NaN") -> should not match values because np.nan is not equal str NaN""" df = DataFrame({"values": [np.nan, 2]}) - result = df.isin(["NaN"]) + result = df.isin(np.array(["NaN"], dtype=object)) 
expected_false = DataFrame({"values": [False, False]}) tm.assert_frame_equal(result, expected_false) @@ -1138,7 +1160,7 @@ def test_isin_float_df_string_search(self): """Comparing df with floats (1.4245,2.32441) with a string at isin() ("1.4245") -> should not match values because float 1.4245 is not equal str 1.4245""" df = DataFrame({"values": [1.4245, 2.32441]}) - result = df.isin(["1.4245"]) + result = df.isin(np.array(["1.4245"], dtype=object)) expected_false = DataFrame({"values": [False, False]}) tm.assert_frame_equal(result, expected_false) @@ -1182,10 +1204,10 @@ def test_value_counts_bins(self): tm.assert_series_equal(result, expected) def test_value_counts_dtypes(self): - result = algos.value_counts([1, 1.0]) + result = algos.value_counts(np.array([1, 1.0])) assert len(result) == 1 - result = algos.value_counts([1, 1.0], bins=1) + result = algos.value_counts(np.array([1, 1.0]), bins=1) assert len(result) == 1 result = algos.value_counts(Series([1, 1.0, "1"])) # object @@ -1193,7 +1215,7 @@ def test_value_counts_dtypes(self): msg = "bins argument only works with numeric data" with pytest.raises(TypeError, match=msg): - algos.value_counts(["1", 1], bins=1) + algos.value_counts(np.array(["1", 1], dtype=object), bins=1) def test_value_counts_nat(self): td = Series([np.timedelta64(10000), NaT], dtype="timedelta64[ns]") @@ -1588,7 +1610,9 @@ def test_unique_tuples(self, arr, uniques): expected = np.empty(len(uniques), dtype=object) expected[:] = uniques - result = pd.unique(arr) + msg = "unique with argument that is not not a Series" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = pd.unique(arr) tm.assert_numpy_array_equal(result, expected) @pytest.mark.parametrize( @@ -1603,7 +1627,9 @@ def test_unique_tuples(self, arr, uniques): ) def test_unique_complex_numbers(self, array, expected): # GH 17927 - result = pd.unique(array) + msg = "unique with argument that is not not a Series" + with tm.assert_produces_warning(FutureWarning, 
match=msg): + result = pd.unique(array) tm.assert_numpy_array_equal(result, expected) @@ -2206,7 +2232,7 @@ def test_int64_add_overflow(): class TestMode: def test_no_mode(self): exp = Series([], dtype=np.float64, index=Index([], dtype=int)) - tm.assert_numpy_array_equal(algos.mode([]), exp.values) + tm.assert_numpy_array_equal(algos.mode(np.array([])), exp.values) @pytest.mark.parametrize("dt", np.typecodes["AllInteger"] + np.typecodes["Float"]) def test_mode_single(self, dt): @@ -2229,10 +2255,10 @@ def test_mode_single(self, dt): def test_mode_obj_int(self): exp = Series([1], dtype=int) - tm.assert_numpy_array_equal(algos.mode([1]), exp.values) + tm.assert_numpy_array_equal(algos.mode(exp.values), exp.values) exp = Series(["a", "b", "c"], dtype=object) - tm.assert_numpy_array_equal(algos.mode(["a", "b", "c"]), exp.values) + tm.assert_numpy_array_equal(algos.mode(exp.values), exp.values) @pytest.mark.parametrize("dt", np.typecodes["AllInteger"] + np.typecodes["Float"]) def test_number_mode(self, dt):