From 47e3db13b5752844633248a8e87c21cef17c08f7 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Wed, 7 Jul 2021 18:16:18 +0530 Subject: [PATCH 01/20] BUG: pd.cut with duplicate index Series lowest included --- pandas/core/reshape/tile.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 7db30dc1ba9b9..e76683c3e731e 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -421,7 +421,10 @@ def _bins_to_cuts( ids = ensure_platform_int(bins.searchsorted(x, side=side)) if include_lowest: - ids[x == bins[0]] = 1 + if isinstance(x, ABCSeries): + ids[x.values == bins[0]] = 1 + else: + ids[x == bins[0]] = 1 na_mask = isna(x) | (ids == len(bins)) | (ids == 0) has_nas = na_mask.any() From a9b6e5f3e1193077c3937478e14e9f697f4c0527 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 7 Jul 2021 08:39:33 -0700 Subject: [PATCH 02/20] REF: disambiguate get_loc_level k variable (#42378) --- pandas/core/generic.py | 18 ++++++------------ pandas/core/indexes/multi.py | 29 ++++++++++++++++++----------- 2 files changed, 24 insertions(+), 23 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index c15aa3cf697da..ebe6ba2f26d5b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3765,18 +3765,12 @@ class animal locomotion self._consolidate_inplace() if isinstance(index, MultiIndex): - try: - loc, new_index = index._get_loc_level(key, level=0) - except TypeError as err: - raise TypeError( - f"Expected label or tuple of labels, got {key}" - ) from err - else: - if not drop_level: - if lib.is_integer(loc): - new_index = index[loc : loc + 1] - else: - new_index = index[loc] + loc, new_index = index._get_loc_level(key, level=0) + if not drop_level: + if lib.is_integer(loc): + new_index = index[loc : loc + 1] + else: + new_index = index[loc] else: loc = index.get_loc(key) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 
04592e319f7ed..05580b03feba1 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2988,6 +2988,7 @@ def maybe_mi_droplevels(indexer, levels): if isinstance(key, tuple) and level == 0: try: + # Check if this tuple is a single key in our first level if key in self.levels[0]: indexer = self._get_level_indexer(key, level=level) new_index = maybe_mi_droplevels(indexer, [0]) @@ -3021,19 +3022,25 @@ def maybe_mi_droplevels(indexer, levels): indexer = None for i, k in enumerate(key): if not isinstance(k, slice): - k = self._get_level_indexer(k, level=i) - if isinstance(k, slice): - # everything - if k.start == 0 and k.stop == len(self): - k = slice(None, None) + loc_level = self._get_level_indexer(k, level=i) + if isinstance(loc_level, slice): + if com.is_null_slice(loc_level) or com.is_full_slice( + loc_level, len(self) + ): + # everything + continue + else: + raise TypeError( + f"Expected label or tuple of labels, got {key}" + ) else: - k_index = k + k_index = loc_level - if isinstance(k, slice): - if k == slice(None, None): - continue - else: - raise TypeError(key) + elif com.is_null_slice(k): + # taking everything, does not affect `indexer` below + continue + else: + raise TypeError(f"Expected label or tuple of labels, got {key}") if indexer is None: indexer = k_index From 0a9cfcf8da8d64aeca2574aa5d7caa9dcd3f1779 Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Wed, 7 Jul 2021 11:48:31 -0500 Subject: [PATCH 03/20] CI: update vm image version for Azure (#42419) --- azure-pipelines.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 9b8b1e106fcf9..4cdd495fe0c31 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -21,12 +21,12 @@ jobs: - template: ci/azure/posix.yml parameters: name: macOS - vmImage: macOS-10.14 + vmImage: macOS-10.15 - template: ci/azure/windows.yml parameters: name: Windows - vmImage: vs2017-win2016 + vmImage: windows-2019 - job: py38_32bit pool: 
From 62147ea4a6df78e889243130ed20a2cc30293472 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Thu, 8 Jul 2021 11:04:30 +0530 Subject: [PATCH 04/20] added tests --- pandas/tests/reshape/test_cut.py | 45 ++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index 944205c66c3e6..480232ef77128 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -691,3 +691,48 @@ def test_cut_no_warnings(): labels = [f"{i} - {i + 9}" for i in range(0, 100, 10)] with tm.assert_produces_warning(False): df["group"] = cut(df.value, range(0, 105, 10), right=False, labels=labels) + + +def test_cut_with_duplicated_index_lowest_included(): + # GH 42425 + expected = Series( + [Interval(-0.001, 2, closed="right")] * 3 + + [Interval(2, 4, closed="right"), Interval(-0.001, 2, closed="right")], + index=[0, 1, 2, 3, 0], + dtype="category", + ).cat.as_ordered() + + s = Series([0, 1, 2, 3, 0], index=[0, 1, 2, 3, 0]) + result = cut(s, bins=[0, 2, 4], include_lowest=True) + tm.assert_series_equal(result, expected) + + +def df_from_series_with_nonexact_categoricalindices_frompdcut(): + # GH 42424 + + ser = Series(range(0, 100)) + ser1 = cut(ser, 10).value_counts().head(5) + ser2 = cut(ser, 10).value_counts().tail(5) + result = DataFrame({"1": ser1, "2": ser2}) + + index = pd.CategoricalIndex( + [ + Interval(-0.099, 9.9, closed="right"), + Interval(9.9, 19.8, closed="right"), + Interval(19.8, 29.7, closed="right"), + Interval(29.7, 39.6, closed="right"), + Interval(39.6, 49.5, closed="right"), + Interval(49.5, 59.4, closed="right"), + Interval(59.4, 69.3, closed="right"), + Interval(69.3, 79.2, closed="right"), + Interval(79.2, 89.1, closed="right"), + Interval(89.1, 99, closed="right"), + ], + ordered=True, + ) + + expected = DataFrame( + {"1": [10] * 5 + [np.nan] * 5, "2": [np.nan] * 5 + [10] * 5}, index=index + ) + + tm.assert_frame_equal(expected, result) From 
f8bcd67ac687fc5677cc61f2d4868835aaefe64a Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Thu, 8 Jul 2021 11:49:47 +0530 Subject: [PATCH 05/20] updated whatsnew --- doc/source/whatsnew/v1.3.1.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.1.rst b/doc/source/whatsnew/v1.3.1.rst index 9c17a22bf6d52..b0d313000b2d5 100644 --- a/doc/source/whatsnew/v1.3.1.rst +++ b/doc/source/whatsnew/v1.3.1.rst @@ -24,7 +24,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- +- Fixed bugs pertaining to :meth:`pandas.cut` operations on :class:`Series` with duplicate indices(:issue:`42185`) and non-exact :meth:`pandas.CategoricalIndex` (:issue:`42425`) - .. --------------------------------------------------------------------------- From da4ea96ded2cacb3726113123ad9e01714a69cae Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Thu, 8 Jul 2021 12:00:55 +0530 Subject: [PATCH 06/20] corrected # GH code to 42185 --- pandas/tests/reshape/test_cut.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index 480232ef77128..db12b2fb8f135 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -694,7 +694,7 @@ def test_cut_no_warnings(): def test_cut_with_duplicated_index_lowest_included(): - # GH 42425 + # GH 42185 expected = Series( [Interval(-0.001, 2, closed="right")] * 3 + [Interval(2, 4, closed="right"), Interval(-0.001, 2, closed="right")], From 2500a23fbc2235b6f3dc5d6ffa4d4e0731ba9b5b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 8 Jul 2021 05:42:13 -0700 Subject: [PATCH 07/20] DEPR: treating dt64 as UTC in Timestamp constructor (#42288) --- doc/source/whatsnew/v1.4.0.rst | 1 + pandas/_libs/tslibs/timestamps.pyx | 13 +++++++++++++ pandas/core/arrays/datetimes.py | 5 +++++ .../scalar/timestamp/test_constructors.py | 18 ++++++++++++++++++ 4 files changed, 37 insertions(+) diff --git a/doc/source/whatsnew/v1.4.0.rst 
b/doc/source/whatsnew/v1.4.0.rst index 5b09b62fa9e88..ae00309ba964d 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -150,6 +150,7 @@ Deprecations - Deprecated :meth:`Index.is_type_compatible` (:issue:`42113`) - Deprecated ``method`` argument in :meth:`Index.get_loc`, use ``index.get_indexer([label], method=...)`` instead (:issue:`42269`) - Deprecated treating integer keys in :meth:`Series.__setitem__` as positional when the index is a :class:`Float64Index` not containing the key, a :class:`IntervalIndex` with no entries containing the key, or a :class:`MultiIndex` with leading :class:`Float64Index` level not containing the key (:issue:`33469`) +- Deprecated treating ``numpy.datetime64`` objects as UTC times when passed to the :class:`Timestamp` constructor along with a timezone. In a future version, these will be treated as wall-times. To retain the old behavior, use ``Timestamp(dt64).tz_localize("UTC").tz_convert(tz)`` (:issue:`24559`) - .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 88a2706a35aa2..e4e9df5176459 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -1329,6 +1329,19 @@ class Timestamp(_Timestamp): "the tz parameter. Use tz_convert instead.") tzobj = maybe_get_tz(tz) + if tzobj is not None and is_datetime64_object(ts_input): + # GH#24559, GH#42288 In the future we will treat datetime64 as + # wall-time (consistent with DatetimeIndex) + warnings.warn( + "In a future version, when passing a np.datetime64 object and " + "a timezone to Timestamp, the datetime64 will be interpreted " + "as a wall time, not a UTC time. 
To interpret as a UTC time, " + "use `Timestamp(dt64).tz_localize('UTC').tz_convert(tz)`", + FutureWarning, + stacklevel=1, + ) + # Once this deprecation is enforced, we can do + # return Timestamp(ts_input).tz_localize(tzobj) ts = convert_to_tsobject(ts_input, tzobj, unit, 0, 0, nanosecond or 0) if ts.value == NPY_NAT: diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 0a517b22d1bb3..8513bbb044e83 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -509,6 +509,11 @@ def _check_compatible_with(self, other, setitem: bool = False): # Descriptive Properties def _box_func(self, x) -> Timestamp | NaTType: + if isinstance(x, np.datetime64): + # GH#42228 + # Argument 1 to "signedinteger" has incompatible type "datetime64"; + # expected "Union[SupportsInt, Union[str, bytes], SupportsIndex]" + x = np.int64(x) # type: ignore[arg-type] ts = Timestamp(x, tz=self.tz) # Non-overlapping identity check (left operand type: "Timestamp", # right operand type: "NaTType") diff --git a/pandas/tests/scalar/timestamp/test_constructors.py b/pandas/tests/scalar/timestamp/test_constructors.py index c6c475f5b87a2..b3deb1a57e5c3 100644 --- a/pandas/tests/scalar/timestamp/test_constructors.py +++ b/pandas/tests/scalar/timestamp/test_constructors.py @@ -24,6 +24,24 @@ class TestTimestampConstructors: + def test_constructor_datetime64_with_tz(self): + # GH#42288, GH#24559 + dt = np.datetime64("1970-01-01 05:00:00") + tzstr = "UTC+05:00" + + msg = "interpreted as a wall time" + with tm.assert_produces_warning(FutureWarning, match=msg): + ts = Timestamp(dt, tz=tzstr) + + # Check that we match the old behavior + alt = Timestamp(dt).tz_localize("UTC").tz_convert(tzstr) + assert ts == alt + + # Check that we *don't* match the future behavior + assert ts.hour != 5 + expected_future = Timestamp(dt).tz_localize(tzstr) + assert ts != expected_future + def test_constructor(self): base_str = "2014-07-01 09:00" base_dt = datetime(2014, 7, 1, 
9) From 82eb38061a6b3c1cb605dedfd00cb023182e7611 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 8 Jul 2021 05:42:31 -0700 Subject: [PATCH 08/20] PERF/REGR: revert #41785 (#42338) --- doc/source/whatsnew/v1.3.1.rst | 1 + pandas/_libs/lib.pyi | 1 + pandas/_libs/lib.pyx | 22 ++++++++++++++++++++ pandas/core/dtypes/cast.py | 15 +++++++++++++ pandas/core/internals/construction.py | 20 ++++++++++++++---- pandas/tests/dtypes/cast/test_dict_compat.py | 14 +++++++++++++ 6 files changed, 69 insertions(+), 4 deletions(-) create mode 100644 pandas/tests/dtypes/cast/test_dict_compat.py diff --git a/doc/source/whatsnew/v1.3.1.rst b/doc/source/whatsnew/v1.3.1.rst index 9c17a22bf6d52..65719a11243f8 100644 --- a/doc/source/whatsnew/v1.3.1.rst +++ b/doc/source/whatsnew/v1.3.1.rst @@ -16,6 +16,7 @@ Fixed regressions ~~~~~~~~~~~~~~~~~ - Pandas could not be built on PyPy (:issue:`42355`) - :class:`DataFrame` constructed with with an older version of pandas could not be unpickled (:issue:`42345`) +- Performance regression in constructing a :class:`DataFrame` from a dictionary of dictionaries (:issue:`42338`) - .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index d39c5ac5d2967..5be50f16af003 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -51,6 +51,7 @@ def is_string_array(values: np.ndarray, skipna: bool = False): ... def is_float_array(values: np.ndarray, skipna: bool = False): ... def is_integer_array(values: np.ndarray, skipna: bool = False): ... def is_bool_array(values: np.ndarray, skipna: bool = False): ... +def fast_multiget(mapping: dict, keys: np.ndarray, default=np.nan) -> np.ndarray: ... def fast_unique_multiple_list_gen(gen: Generator, sort: bool = True) -> list: ... def fast_unique_multiple_list(lists: list, sort: bool = True) -> list: ... def fast_unique_multiple(arrays: list, sort: bool = True) -> list: ... 
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index c48b8ed5aec9c..4ab2497be94d5 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2955,6 +2955,28 @@ def to_object_array_tuples(rows: object) -> np.ndarray: return result +@cython.wraparound(False) +@cython.boundscheck(False) +def fast_multiget(dict mapping, ndarray keys, default=np.nan) -> np.ndarray: + cdef: + Py_ssize_t i, n = len(keys) + object val + ndarray[object] output = np.empty(n, dtype='O') + + if n == 0: + # kludge, for Series + return np.empty(0, dtype='f8') + + for i in range(n): + val = keys[i] + if val in mapping: + output[i] = mapping[val] + else: + output[i] = default + + return maybe_convert_objects(output) + + def is_bool_list(obj: list) -> bool: """ Check if this list contains only bool or np.bool_ objects. diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 9f0fefc9a10d0..15a18d5027274 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -781,6 +781,21 @@ def infer_dtype_from_scalar(val, pandas_dtype: bool = False) -> tuple[DtypeObj, return dtype, val +def dict_compat(d: dict[Scalar, Scalar]) -> dict[Scalar, Scalar]: + """ + Convert datetimelike-keyed dicts to a Timestamp-keyed dict. 
+ + Parameters + ---------- + d: dict-like object + + Returns + ------- + dict + """ + return {maybe_box_datetimelike(key): value for key, value in d.items()} + + def infer_dtype_from_array( arr, pandas_dtype: bool = False ) -> tuple[DtypeObj, ArrayLike]: diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 7bef7ae9b39d7..5e327adfb8905 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -26,6 +26,7 @@ from pandas.core.dtypes.cast import ( construct_1d_arraylike_from_scalar, + dict_compat, maybe_cast_to_datetime, maybe_convert_platform, maybe_infer_to_datetimelike, @@ -59,7 +60,6 @@ TimedeltaArray, ) from pandas.core.construction import ( - create_series_with_explicit_dtype, ensure_wrapped_if_datetimelike, extract_array, range_to_ndarray, @@ -67,7 +67,9 @@ ) from pandas.core.indexes import base as ibase from pandas.core.indexes.api import ( + DatetimeIndex, Index, + TimedeltaIndex, ensure_index, get_objs_combined_axis, union_indexes, @@ -556,6 +558,7 @@ def convert(v): def _homogenize(data, index: Index, dtype: DtypeObj | None) -> list[ArrayLike]: + oindex = None homogenized = [] for val in data: @@ -570,9 +573,18 @@ def _homogenize(data, index: Index, dtype: DtypeObj | None) -> list[ArrayLike]: val = val._values else: if isinstance(val, dict): - # see test_constructor_subclass_dict - # test_constructor_dict_datetime64_index - val = create_series_with_explicit_dtype(val, index=index)._values + # GH#41785 this _should_ be equivalent to (but faster than) + # val = create_series_with_explicit_dtype(val, index=index)._values + if oindex is None: + oindex = index.astype("O") + + if isinstance(index, (DatetimeIndex, TimedeltaIndex)): + # see test_constructor_dict_datetime64_index + val = dict_compat(val) + else: + # see test_constructor_subclass_dict + val = dict(val) + val = lib.fast_multiget(val, oindex._values, default=np.nan) val = sanitize_array( val, index, dtype=dtype, 
copy=False, raise_cast_failure=False diff --git a/pandas/tests/dtypes/cast/test_dict_compat.py b/pandas/tests/dtypes/cast/test_dict_compat.py new file mode 100644 index 0000000000000..13dc82d779f95 --- /dev/null +++ b/pandas/tests/dtypes/cast/test_dict_compat.py @@ -0,0 +1,14 @@ +import numpy as np + +from pandas.core.dtypes.cast import dict_compat + +from pandas import Timestamp + + +def test_dict_compat(): + data_datetime64 = {np.datetime64("1990-03-15"): 1, np.datetime64("2015-03-15"): 2} + data_unchanged = {1: 2, 3: 4, 5: 6} + expected = {Timestamp("1990-3-15"): 1, Timestamp("2015-03-15"): 2} + assert dict_compat(data_datetime64) == expected + assert dict_compat(expected) == expected + assert dict_compat(data_unchanged) == data_unchanged From ec0fdb7bfa7f192fcf5166c47a47af7488f8bac6 Mon Sep 17 00:00:00 2001 From: Shoham Debnath Date: Thu, 8 Jul 2021 18:24:28 +0530 Subject: [PATCH 09/20] Update doc/source/whatsnew/v1.3.1.rst Co-authored-by: Jeff Reback --- doc/source/whatsnew/v1.3.1.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.1.rst b/doc/source/whatsnew/v1.3.1.rst index b0d313000b2d5..38b628c91fae4 100644 --- a/doc/source/whatsnew/v1.3.1.rst +++ b/doc/source/whatsnew/v1.3.1.rst @@ -24,7 +24,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- Fixed bugs pertaining to :meth:`pandas.cut` operations on :class:`Series` with duplicate indices(:issue:`42185`) and non-exact :meth:`pandas.CategoricalIndex` (:issue:`42425`) +- Bug in :meth:`pandas.cut` on :class:`Series` with duplicate indices (:issue:`42185`) and non-exact :meth:`pandas.CategoricalIndex` (:issue:`42425`) - .. 
--------------------------------------------------------------------------- From 35b338e9ca877224cb3138a9d5cea1f538353a26 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 8 Jul 2021 05:58:05 -0700 Subject: [PATCH 10/20] BUG: .loc failing to drop first level (#42435) --- pandas/core/indexes/multi.py | 10 +++++++--- pandas/core/indexing.py | 17 ++++++++++++----- pandas/tests/frame/indexing/test_xs.py | 13 +++++++------ pandas/tests/indexing/test_loc.py | 12 ++++++++++++ pandas/tests/series/test_arithmetic.py | 2 +- 5 files changed, 39 insertions(+), 15 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 05580b03feba1..8903d29782610 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3030,16 +3030,20 @@ def maybe_mi_droplevels(indexer, levels): # everything continue else: - raise TypeError( - f"Expected label or tuple of labels, got {key}" - ) + # e.g. test_xs_IndexSlice_argument_not_implemented + k_index = np.zeros(len(self), dtype=bool) + k_index[loc_level] = True + else: k_index = loc_level elif com.is_null_slice(k): # taking everything, does not affect `indexer` below continue + else: + # FIXME: this message can be inaccurate, e.g. 
+ # test_series_varied_multiindex_alignment raise TypeError(f"Expected label or tuple of labels, got {key}") if indexer is None: diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index e362213c52bd5..387dcca6897b7 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -882,17 +882,21 @@ def _getitem_nested_tuple(self, tup: tuple): if self.name != "loc": # This should never be reached, but lets be explicit about it raise ValueError("Too many indices") - if isinstance(self.obj, ABCSeries) and any( - isinstance(k, tuple) for k in tup - ): - # GH#35349 Raise if tuple in tuple for series - raise ValueError("Too many indices") if all(is_hashable(x) or com.is_null_slice(x) for x in tup): # GH#10521 Series should reduce MultiIndex dimensions instead of # DataFrame, IndexingError is not raised when slice(None,None,None) # with one row. with suppress(IndexingError): return self._handle_lowerdim_multi_index_axis0(tup) + elif isinstance(self.obj, ABCSeries) and any( + isinstance(k, tuple) for k in tup + ): + # GH#35349 Raise if tuple in tuple for series + # Do this after the all-hashable-or-null-slice check so that + # we are only getting non-hashable tuples, in particular ones + # that themselves contain a slice entry + # See test_loc_series_getitem_too_many_dimensions + raise ValueError("Too many indices") # this is a series with a multi-index specified a tuple of # selectors @@ -1127,6 +1131,9 @@ def _handle_lowerdim_multi_index_axis0(self, tup: tuple): return self._get_label(tup, axis=axis) except TypeError as err: # slices are unhashable + # FIXME: this raises when we have a DatetimeIndex first level and a + # string for the first tup entry + # see test_partial_slicing_with_multiindex raise IndexingError("No label returned") from err except KeyError as ek: diff --git a/pandas/tests/frame/indexing/test_xs.py b/pandas/tests/frame/indexing/test_xs.py index a76aec9ebda44..d2704876c31c5 100644 --- a/pandas/tests/frame/indexing/test_xs.py +++ 
b/pandas/tests/frame/indexing/test_xs.py @@ -318,12 +318,13 @@ def test_xs_IndexSlice_argument_not_implemented(self, klass): if klass is Series: obj = obj[0] - msg = ( - "Expected label or tuple of labels, got " - r"\(\('foo', 'qux', 0\), slice\(None, None, None\)\)" - ) - with pytest.raises(TypeError, match=msg): - obj.xs(IndexSlice[("foo", "qux", 0), :]) + expected = obj.iloc[-2:].droplevel(0) + + result = obj.xs(IndexSlice[("foo", "qux", 0), :]) + tm.assert_equal(result, expected) + + result = obj.loc[IndexSlice[("foo", "qux", 0), :]] + tm.assert_equal(result, expected) @pytest.mark.parametrize("klass", [DataFrame, Series]) def test_xs_levels_raises(self, klass): diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index e96b25418d408..2e0f48849530a 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1663,6 +1663,18 @@ def test_loc_multiindex_levels_contain_values_not_in_index_anymore(self, lt_valu with pytest.raises(KeyError, match=r"\['b'\] not in index"): df.loc[df["a"] < lt_value, :].loc[["b"], :] + def test_loc_drops_level(self): + # Based on test_series_varied_multiindex_alignment, where + # this used to fail to drop the first level + mi = MultiIndex.from_product( + [list("ab"), list("xy"), [1, 2]], names=["ab", "xy", "num"] + ) + ser = Series(range(8), index=mi) + + loc_result = ser.loc["a", :, :] + expected = ser.index.droplevel(0)[:4] + tm.assert_index_equal(loc_result.index, expected) + class TestLocSetitemWithExpansion: @pytest.mark.slow diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index 6b7c2c698c211..4d1c75da72399 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -924,7 +924,7 @@ def test_series_varied_multiindex_alignment(): [1000 * i for i in range(1, 5)], index=pd.MultiIndex.from_product([list("xy"), [1, 2]], names=["xy", "num"]), ) - result = s1.loc[pd.IndexSlice["a", :, :]] + s2 + 
result = s1.loc[pd.IndexSlice[["a"], :, :]] + s2 expected = Series( [1000, 2001, 3002, 4003], index=pd.MultiIndex.from_tuples( From 8d64fe9d515a819df49e12d2ef14b1a5bc353a74 Mon Sep 17 00:00:00 2001 From: neelmraman <42706631+neelmraman@users.noreply.github.com> Date: Thu, 8 Jul 2021 08:59:20 -0400 Subject: [PATCH 11/20] =?UTF-8?q?BUG:=20truncate=20has=20incorrect=20behav?= =?UTF-8?q?ior=20when=20index=20has=20only=20one=20unique=20v=E2=80=A6=20(?= =?UTF-8?q?#42366)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- doc/source/whatsnew/v1.4.0.rst | 2 +- pandas/core/generic.py | 2 +- pandas/tests/frame/methods/test_truncate.py | 10 ++++++++++ pandas/tests/series/methods/test_truncate.py | 8 ++++++++ 4 files changed, 20 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index ae00309ba964d..3f557b6a7c424 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -213,7 +213,7 @@ Interval Indexing ^^^^^^^^ - Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`DatetimeIndex` when passing a string, the return type depended on whether the index was monotonic (:issue:`24892`) -- +- Bug in :meth:`DataFrame.truncate` and :meth:`Series.truncate` when the object's Index has a length greater than one but only one unique value (:issue:`42365`) Missing ^^^^^^^ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ebe6ba2f26d5b..3b8458fac2942 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9382,7 +9382,7 @@ def truncate( if before is not None and after is not None and before > after: raise ValueError(f"Truncate: {after} must be after {before}") - if len(ax) > 1 and ax.is_monotonic_decreasing: + if len(ax) > 1 and ax.is_monotonic_decreasing and ax.nunique() > 1: before, after = after, before slicer = [slice(None, None)] * self._AXIS_LEN diff --git a/pandas/tests/frame/methods/test_truncate.py 
b/pandas/tests/frame/methods/test_truncate.py index 210e86067566a..f2d60240009c5 100644 --- a/pandas/tests/frame/methods/test_truncate.py +++ b/pandas/tests/frame/methods/test_truncate.py @@ -148,3 +148,13 @@ def test_truncate_multiindex(self, frame_or_series): expected = expected["col"] tm.assert_equal(result, expected) + + def test_truncate_index_only_one_unique_value(self, frame_or_series): + # GH 42365 + obj = Series(0, index=date_range("2021-06-30", "2021-06-30")).repeat(5) + if frame_or_series is DataFrame: + obj = obj.to_frame(name="a") + + truncated = obj.truncate("2021-06-28", "2021-07-01") + + tm.assert_equal(truncated, obj) diff --git a/pandas/tests/series/methods/test_truncate.py b/pandas/tests/series/methods/test_truncate.py index ca5c3e2639097..a3a27a744b180 100644 --- a/pandas/tests/series/methods/test_truncate.py +++ b/pandas/tests/series/methods/test_truncate.py @@ -55,3 +55,11 @@ def test_truncate_one_element_series(self): # the input Series and the expected Series are the same tm.assert_series_equal(result, series) + + def test_truncate_index_only_one_unique_value(self): + # GH 42365 + obj = Series(0, index=date_range("2021-06-30", "2021-06-30")).repeat(5) + + truncated = obj.truncate("2021-06-28", "2021-07-01") + + tm.assert_series_equal(truncated, obj) From 487aafbc367957837c2c8eebdddd77d0659e1cad Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Thu, 8 Jul 2021 08:00:43 -0500 Subject: [PATCH 12/20] CLN: clean doc validation script (#42436) --- scripts/validate_docstrings.py | 45 ++++++++++++++-------------------- 1 file changed, 18 insertions(+), 27 deletions(-) diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py index 9b65204403612..7562895d9db3e 100755 --- a/scripts/validate_docstrings.py +++ b/scripts/validate_docstrings.py @@ -17,37 +17,26 @@ import argparse import doctest -import glob import importlib +import io import json -import os +import pathlib import subprocess import sys import tempfile -try: - from io 
import StringIO -except ImportError: - from cStringIO import StringIO +import matplotlib +import matplotlib.pyplot as plt +import numpy +from numpydoc.validate import ( + Docstring, + validate, +) -# Template backend makes matplotlib to not plot anything. This is useful -# to avoid that plot windows are open from the doctests while running the -# script. Setting here before matplotlib is loaded. -# We don't warn for the number of open plots, as none is actually being opened -os.environ["MPLBACKEND"] = "Template" -import matplotlib # isort:skip +import pandas -matplotlib.rc("figure", max_open_warning=10000) - -import numpy # isort:skip - -BASE_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - -sys.path.insert(0, os.path.join(BASE_PATH)) -import pandas # isort:skip - -sys.path.insert(1, os.path.join(BASE_PATH, "doc", "sphinxext")) -from numpydoc.validate import validate, Docstring # isort:skip +# With template backend, matplotlib plots nothing +matplotlib.use("template") PRIVATE_CLASSES = ["NDFrame", "IndexOpsMixin"] @@ -157,7 +146,7 @@ def examples_errors(self): context = {"np": numpy, "pd": pandas} error_msgs = "" for test in finder.find(self.raw_doc, self.name, globs=context): - f = StringIO() + f = io.StringIO() runner.run(test, out=f.write) error_msgs += f.getvalue() return error_msgs @@ -263,6 +252,7 @@ def pandas_validate(func_name: str): if doc.non_hyphenated_array_like(): result["errors"].append(pandas_error("GL05")) + plt.close("all") return result @@ -288,13 +278,14 @@ def validate_all(prefix, ignore_deprecated=False): result = {} seen = {} - api_doc_fnames = os.path.join(BASE_PATH, "doc", "source", "reference", "*.rst") + base_path = pathlib.Path(__file__).parent.parent + api_doc_fnames = pathlib.Path(base_path, "doc", "source", "reference") api_items = [] - for api_doc_fname in glob.glob(api_doc_fnames): + for api_doc_fname in api_doc_fnames.glob("*.rst"): with open(api_doc_fname) as f: api_items += list(get_api_items(f)) - for 
func_name, func_obj, section, subsection in api_items: + for func_name, _, section, subsection in api_items: if prefix and not func_name.startswith(prefix): continue doc_info = pandas_validate(func_name) From afeb35ea6654acb26fb47a30babd226827dca080 Mon Sep 17 00:00:00 2001 From: David <63334933+DavidDeLord@users.noreply.github.com> Date: Thu, 8 Jul 2021 06:08:56 -0700 Subject: [PATCH 13/20] Fix Formatting Issue (#42438) --- pandas/core/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 6e6e8b6d8b178..9eab934d7db83 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -201,7 +201,7 @@ class Series(base.IndexOpsMixin, generic.NDFrame): methods from ndarray have been overridden to automatically exclude missing data (currently represented as NaN). - Operations between Series (+, -, /, *, **) align values based on their + Operations between Series (+, -, /, \\*, \\*\\*) align values based on their associated index values-- they need not be the same length. The result index will be the sorted union of the two indexes. From 29e6dc047bcd204bcb16a535e36a57cf2a2bde09 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Thu, 8 Jul 2021 09:09:36 -0400 Subject: [PATCH 14/20] DOC: Add more instructions for updating the whatsnew (#42427) --- doc/source/development/contributing_codebase.rst | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst index e812aaa760a8f..721b1af126709 100644 --- a/doc/source/development/contributing_codebase.rst +++ b/doc/source/development/contributing_codebase.rst @@ -812,7 +812,21 @@ Changes should be reflected in the release notes located in ``doc/source/whatsne This file contains an ongoing change log for each release. 
Add an entry to this file to document your fix, enhancement or (unavoidable) breaking change. Make sure to include the GitHub issue number when adding your entry (using ``:issue:`1234``` where ``1234`` is the -issue/pull request number). +issue/pull request number). Your entry should be written using full sentences and proper +grammar. + +When mentioning parts of the API, use a Sphinx ``:func:``, ``:meth:``, or ``:class:`` +directive as appropriate. Not all public API functions and methods have a +documentation page; ideally links would only be added if they resolve. You can +usually find similar examples by checking the release notes for one of the previous +versions. + +If your code is a bugfix, add your entry to the relevant bugfix section. Avoid +adding to the ``Other`` section; only in rare cases should entries go there. +Being as concise as possible, the description of the bug should include how the +user may encounter it and an indication of the bug itself, e.g. +"produces incorrect results" or "incorrectly raises". It may be necessary to also +indicate the new behavior. If your code is an enhancement, it is most likely necessary to add usage examples to the existing documentation. This can be done following the section From 4653b6acf449f159bf4521f8dc1691a1d10f64f8 Mon Sep 17 00:00:00 2001 From: Shoham Debnath Date: Thu, 8 Jul 2021 18:41:02 +0530 Subject: [PATCH 15/20] DOC fix the incorrect doc style in 1.2.1 (#42386) --- doc/source/whatsnew/v1.0.0.rst | 90 ++++++++++++++++++++-------------- doc/source/whatsnew/v1.2.1.rst | 32 +++++++----- 2 files changed, 71 insertions(+), 51 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index b87274307431b..03dfe475475a1 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -338,19 +338,20 @@ maps labels to their new names along the default axis, is allowed to be passed b *pandas 0.25.x* -.. code-block:: python +.. 
code-block:: ipython - >>> df = pd.DataFrame([[1]]) - >>> df.rename({0: 1}, {0: 2}) + In [1]: df = pd.DataFrame([[1]]) + In [2]: df.rename({0: 1}, {0: 2}) + Out[2]: FutureWarning: ...Use named arguments to resolve ambiguity... 2 1 1 *pandas 1.0.0* -.. code-block:: python +.. code-block:: ipython - >>> df.rename({0: 1}, {0: 2}) + In [3]: df.rename({0: 1}, {0: 2}) Traceback (most recent call last): ... TypeError: rename() takes from 1 to 2 positional arguments but 3 were given @@ -359,26 +360,28 @@ Note that errors will now be raised when conflicting or potentially ambiguous ar *pandas 0.25.x* -.. code-block:: python +.. code-block:: ipython - >>> df.rename({0: 1}, index={0: 2}) + In [4]: df.rename({0: 1}, index={0: 2}) + Out[4]: 0 1 1 - >>> df.rename(mapper={0: 1}, index={0: 2}) + In [5]: df.rename(mapper={0: 1}, index={0: 2}) + Out[5]: 0 2 1 *pandas 1.0.0* -.. code-block:: python +.. code-block:: ipython - >>> df.rename({0: 1}, index={0: 2}) + In [6]: df.rename({0: 1}, index={0: 2}) Traceback (most recent call last): ... TypeError: Cannot specify both 'mapper' and any of 'index' or 'columns' - >>> df.rename(mapper={0: 1}, index={0: 2}) + In [7]: df.rename(mapper={0: 1}, index={0: 2}) Traceback (most recent call last): ... TypeError: Cannot specify both 'mapper' and any of 'index' or 'columns' @@ -405,12 +408,12 @@ Extended verbose info output for :class:`~pandas.DataFrame` *pandas 0.25.x* -.. code-block:: python +.. code-block:: ipython - >>> df = pd.DataFrame({"int_col": [1, 2, 3], + In [1]: df = pd.DataFrame({"int_col": [1, 2, 3], ... "text_col": ["a", "b", "c"], ... "float_col": [0.0, 0.1, 0.2]}) - >>> df.info(verbose=True) + In [2]: df.info(verbose=True) RangeIndex: 3 entries, 0 to 2 Data columns (total 3 columns): @@ -440,14 +443,16 @@ Extended verbose info output for :class:`~pandas.DataFrame` *pandas 0.25.x* -.. code-block:: python +.. 
code-block:: ipython - >>> pd.array(["a", None]) + In [1]: pd.array(["a", None]) + Out[1]: ['a', None] Length: 2, dtype: object - >>> pd.array([1, None]) + In [2]: pd.array([1, None]) + Out[2]: [1, None] Length: 2, dtype: object @@ -470,15 +475,17 @@ As a reminder, you can specify the ``dtype`` to disable all inference. *pandas 0.25.x* -.. code-block:: python +.. code-block:: ipython - >>> a = pd.array([1, 2, None], dtype="Int64") - >>> a + In [1]: a = pd.array([1, 2, None], dtype="Int64") + In [2]: a + Out[2]: [1, 2, NaN] Length: 3, dtype: Int64 - >>> a[2] + In [3]: a[2] + Out[3]: nan *pandas 1.0.0* @@ -499,9 +506,10 @@ will now raise. *pandas 0.25.x* -.. code-block:: python +.. code-block:: ipython - >>> np.asarray(a, dtype="float") + In [1]: np.asarray(a, dtype="float") + Out[1]: array([ 1., 2., nan]) *pandas 1.0.0* @@ -525,9 +533,10 @@ will now be ``pd.NA`` instead of ``np.nan`` in presence of missing values *pandas 0.25.x* -.. code-block:: python +.. code-block:: ipython - >>> pd.Series(a).sum(skipna=False) + In [1]: pd.Series(a).sum(skipna=False) + Out[1]: nan *pandas 1.0.0* @@ -543,9 +552,10 @@ integer dtype for the values. *pandas 0.25.x* -.. code-block:: python +.. code-block:: ipython - >>> pd.Series([2, 1, 1, None], dtype="Int64").value_counts().dtype + In [1]: pd.Series([2, 1, 1, None], dtype="Int64").value_counts().dtype + Out[1]: dtype('int64') *pandas 1.0.0* @@ -565,15 +575,17 @@ Comparison operations on a :class:`arrays.IntegerArray` now returns a *pandas 0.25.x* -.. code-block:: python +.. code-block:: ipython - >>> a = pd.array([1, 2, None], dtype="Int64") - >>> a + In [1]: a = pd.array([1, 2, None], dtype="Int64") + In [2]: a + Out[2]: [1, 2, NaN] Length: 3, dtype: Int64 - >>> a > 1 + In [3]: a > 1 + Out[3]: array([False, True, False]) *pandas 1.0.0* @@ -640,9 +652,10 @@ scalar values in the result are instances of the extension dtype's scalar type. *pandas 0.25.x* -.. code-block:: python +.. 
code-block:: ipython - >>> df.resample("2D").agg(lambda x: 'a').A.dtype + In [1]: df.resample("2D").agg(lambda x: 'a').A.dtype + Out[1]: CategoricalDtype(categories=['a', 'b'], ordered=False) *pandas 1.0.0* @@ -657,9 +670,10 @@ depending on how the results are cast back to the original dtype. *pandas 0.25.x* -.. code-block:: python +.. code-block:: ipython - >>> df.resample("2D").agg(lambda x: 'c') + In [1]: df.resample("2D").agg(lambda x: 'c') + Out[1]: A 0 NaN @@ -871,10 +885,10 @@ matplotlib directly rather than :meth:`~DataFrame.plot`. To use pandas formatters with a matplotlib plot, specify -.. code-block:: python +.. code-block:: ipython - >>> import pandas as pd - >>> pd.options.plotting.matplotlib.register_converters = True + In [1]: import pandas as pd + In [2]: pd.options.plotting.matplotlib.register_converters = True Note that plots created by :meth:`DataFrame.plot` and :meth:`Series.plot` *do* register the converters automatically. The only behavior change is when plotting a date-like object via ``matplotlib.pyplot.plot`` diff --git a/doc/source/whatsnew/v1.2.1.rst b/doc/source/whatsnew/v1.2.1.rst index bfe30d52e2aff..34e28eab6d4bf 100644 --- a/doc/source/whatsnew/v1.2.1.rst +++ b/doc/source/whatsnew/v1.2.1.rst @@ -52,20 +52,23 @@ DataFrame / Series combination) would ignore the indices, only match the inputs by shape, and use the index/columns of the first DataFrame for the result: -.. code-block:: python +.. code-block:: ipython - >>> df1 = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, index=[0, 1]) - ...
df2 = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, index=[1, 2]) - >>> df1 + In [1]: df1 = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, index=[0, 1]) + In [2]: df2 = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, index=[1, 2]) + In [3]: df1 + Out[3]: a b 0 1 3 1 2 4 - >>> df2 + In [4]: df2 + Out[4]: a b 1 1 3 2 2 4 - >>> np.add(df1, df2) + In [5]: np.add(df1, df2) + Out[5]: a b 0 2 6 1 4 8 @@ -73,9 +76,10 @@ the result: This contrasts with how other pandas operations work, which first align the inputs: -.. code-block:: python +.. code-block:: ipython - >>> df1 + df2 + In [6]: df1 + df2 + Out[6]: a b 0 NaN NaN 1 3.0 7.0 @@ -94,9 +98,10 @@ objects (eg ``np.add(s1, s2)``) already aligns and continues to do so. To avoid the warning and keep the current behaviour of ignoring the indices, convert one of the arguments to a NumPy array: -.. code-block:: python +.. code-block:: ipython - >>> np.add(df1, np.asarray(df2)) + In [7]: np.add(df1, np.asarray(df2)) + Out[7]: a b 0 2 6 1 4 8 @@ -104,10 +109,11 @@ convert one of the arguments to a NumPy array: To obtain the future behaviour and silence the warning, you can align manually before passing the arguments to the ufunc: -.. code-block:: python +.. 
code-block:: ipython - >>> df1, df2 = df1.align(df2) - >>> np.add(df1, df2) + In [8]: df1, df2 = df1.align(df2) + In [9]: np.add(df1, df2) + Out[9]: a b 0 NaN NaN 1 3.0 7.0 From 824047340bbf153c1a55b26e234e231766496096 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Wed, 7 Jul 2021 18:16:18 +0530 Subject: [PATCH 16/20] BUG: pd.cut with duplicate index Series lowest inclued --- pandas/core/reshape/tile.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 7db30dc1ba9b9..e76683c3e731e 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -421,7 +421,10 @@ def _bins_to_cuts( ids = ensure_platform_int(bins.searchsorted(x, side=side)) if include_lowest: - ids[x == bins[0]] = 1 + if isinstance(x, ABCSeries): + ids[x.values == bins[0]] = 1 + else: + ids[x == bins[0]] = 1 na_mask = isna(x) | (ids == len(bins)) | (ids == 0) has_nas = na_mask.any() From 52f53fdb0c9cba7d58a8067db3ede7db8ad3652c Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Thu, 8 Jul 2021 11:04:30 +0530 Subject: [PATCH 17/20] added tests --- pandas/tests/reshape/test_cut.py | 45 ++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index 944205c66c3e6..480232ef77128 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -691,3 +691,48 @@ def test_cut_no_warnings(): labels = [f"{i} - {i + 9}" for i in range(0, 100, 10)] with tm.assert_produces_warning(False): df["group"] = cut(df.value, range(0, 105, 10), right=False, labels=labels) + + +def test_cut_with_duplicated_index_lowest_included(): + # GH 42425 + expected = Series( + [Interval(-0.001, 2, closed="right")] * 3 + + [Interval(2, 4, closed="right"), Interval(-0.001, 2, closed="right")], + index=[0, 1, 2, 3, 0], + dtype="category", + ).cat.as_ordered() + + s = Series([0, 1, 2, 3, 0], index=[0, 1, 2, 3, 0]) + result = 
cut(s, bins=[0, 2, 4], include_lowest=True) + tm.assert_series_equal(result, expected) + + +def df_from_series_with_nonexact_categoricalindices_frompdcut(): + # GH 42424 + + ser = Series(range(0, 100)) + ser1 = cut(ser, 10).value_counts().head(5) + ser2 = cut(ser, 10).value_counts().tail(5) + result = DataFrame({"1": ser1, "2": ser2}) + + index = pd.CategoricalIndex( + [ + Interval(-0.099, 9.9, closed="right"), + Interval(9.9, 19.8, closed="right"), + Interval(19.8, 29.7, closed="right"), + Interval(29.7, 39.6, closed="right"), + Interval(39.6, 49.5, closed="right"), + Interval(49.5, 59.4, closed="right"), + Interval(59.4, 69.3, closed="right"), + Interval(69.3, 79.2, closed="right"), + Interval(79.2, 89.1, closed="right"), + Interval(89.1, 99, closed="right"), + ], + ordered=True, + ) + + expected = DataFrame( + {"1": [10] * 5 + [np.nan] * 5, "2": [np.nan] * 5 + [10] * 5}, index=index + ) + + tm.assert_frame_equal(expected, result) From 9cb8dbb85e97c7da25d7820dda55cbf388238a46 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Thu, 8 Jul 2021 11:49:47 +0530 Subject: [PATCH 18/20] updated whatsnew --- doc/source/whatsnew/v1.3.1.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.1.rst b/doc/source/whatsnew/v1.3.1.rst index 65719a11243f8..f6bae22abf098 100644 --- a/doc/source/whatsnew/v1.3.1.rst +++ b/doc/source/whatsnew/v1.3.1.rst @@ -25,7 +25,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- +- Fixed bugs pertaining to :func:`pandas.cut` operations on :class:`Series` with duplicate indices (:issue:`42185`) and non-exact :class:`pandas.CategoricalIndex` (:issue:`42425`) - ..
--------------------------------------------------------------------------- From a74c2f620200da97a2d91c3c5d9fe0a809e7f80e Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Thu, 8 Jul 2021 12:00:55 +0530 Subject: [PATCH 19/20] corrected # GH code to 42185 --- pandas/tests/reshape/test_cut.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index 480232ef77128..db12b2fb8f135 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -694,7 +694,7 @@ def test_cut_no_warnings(): def test_cut_with_duplicated_index_lowest_included(): - # GH 42425 + # GH 42185 expected = Series( [Interval(-0.001, 2, closed="right")] * 3 + [Interval(2, 4, closed="right"), Interval(-0.001, 2, closed="right")], From 57b5ddb7b517c55cf34ee564f693fbfb4167d996 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Thu, 8 Jul 2021 18:53:24 +0530 Subject: [PATCH 20/20] updated test func name and code --- pandas/core/reshape/tile.py | 4 +--- pandas/tests/reshape/test_cut.py | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index e76683c3e731e..0b6fec866e591 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -422,9 +422,7 @@ def _bins_to_cuts( if include_lowest: if isinstance(x, ABCSeries): - ids[x.values == bins[0]] = 1 - else: - ids[x == bins[0]] = 1 + ids[np.asarray(x) == bins[0]] = 1 na_mask = isna(x) | (ids == len(bins)) | (ids == 0) has_nas = na_mask.any() diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index db12b2fb8f135..127be504e82d5 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -707,7 +707,7 @@ def test_cut_with_duplicated_index_lowest_included(): tm.assert_series_equal(result, expected) -def df_from_series_with_nonexact_categoricalindices_frompdcut(): +def test_cut_with_nonexact_categorical_indices(): # 
GH 42424 ser = Series(range(0, 100))