From f520eaea50e77422dda6cd5c0aaa9f02fa88ea18 Mon Sep 17 00:00:00 2001 From: "tobias.pitters" Date: Sat, 16 Jul 2022 18:44:57 +0200 Subject: [PATCH 1/9] fix regression when loc is used to create a new element on an categorical series --- pandas/core/dtypes/cast.py | 3 +++ pandas/core/indexing.py | 2 ++ pandas/tests/indexing/test_indexing.py | 7 +++++++ 3 files changed, 12 insertions(+) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 769656d1c4755..66d112e53278e 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -646,6 +646,9 @@ def _maybe_promote(dtype: np.dtype, fill_value=np.nan): return np.dtype("object"), fill_value + elif isinstance(dtype, CategoricalDtype): + return object, ensure_object(fill_value) + elif is_float(fill_value): if issubclass(dtype.type, np.bool_): dtype = np.dtype(np.object_) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index fa1ad7ce3c874..5754728dbbf0e 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1777,6 +1777,7 @@ def _setitem_with_indexer(self, indexer, value, name="iloc"): indexer, missing = convert_missing_indexer(indexer) if missing: + # import pdb; pdb.set_trace() self._setitem_with_indexer_missing(indexer, value) return @@ -2111,6 +2112,7 @@ def _setitem_with_indexer_missing(self, indexer, value): # We should not cast, if we have object dtype because we can # set timedeltas into object series curr_dtype = self.obj.dtype + # import pdb; pdb.set_trace() curr_dtype = getattr(curr_dtype, "numpy_dtype", curr_dtype) new_dtype = maybe_promote(curr_dtype, value)[0] else: diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 9d10e487e0cc2..02b9666e31dcb 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -1019,3 +1019,10 @@ def test_ser_list_indexer_exceeds_dimensions(indexer_li): res = indexer_li(ser)[[0, 0]] exp = Series([10, 10], index=Index([0, 0])) tm.assert_series_equal(res, exp) + + +def test_additional_element_to_categorical_series_loc(): + result = Series(["a", "b", "c"], dtype="category") + result.loc[3] = 0 + expected = Series(["a", "b", "c", 0], dtype="object") + tm.assert_series_equal(result, expected) From 4edc9563d50a2e85095e779cdd45eb3a658be570 Mon Sep 17 00:00:00 2001 From: "tobias.pitters" Date: Sat, 16 Jul 2022 18:48:06 +0200 Subject: [PATCH 2/9] add whatsnew --- doc/source/whatsnew/v1.5.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 22a5f2a08362f..a66128775f984 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -908,7 +908,7 @@ Indexing - Bug in :meth:`NDFrame.xs`, :meth:`DataFrame.iterrows`, :meth:`DataFrame.loc` and :meth:`DataFrame.iloc` not always propagating metadata (:issue:`28283`) - Bug in :meth:`DataFrame.sum` min_count changes dtype if input contains NaNs (:issue:`46947`) - Bug in :class:`IntervalTree` that lead to an infinite recursion. (:issue:`46658`) -- +- Bug in :meth:`DataFrame.loc` when creating a new element on a :class:`Series` with dtype :class:`CategoricalDtype` (:issue:`47677`) Missing ^^^^^^^ From a0e29346becb1dcec0df9e160bf728221182be8b Mon Sep 17 00:00:00 2001 From: "tobias.pitters" Date: Sun, 17 Jul 2022 17:01:03 +0200 Subject: [PATCH 3/9] fix enlarging by scalar --- pandas/core/dtypes/cast.py | 5 ++++- pandas/core/indexing.py | 12 +++++++++--- pandas/tests/indexing/test_indexing.py | 7 +++++++ 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 66d112e53278e..093b51a38d067 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -647,7 +647,10 @@ def _maybe_promote(dtype: np.dtype, fill_value=np.nan): return np.dtype("object"), fill_value elif isinstance(dtype, CategoricalDtype): - return object, ensure_object(fill_value) + if fill_value in dtype.categories: + return "category", fill_value + else: + return object, ensure_object(fill_value) elif is_float(fill_value): if issubclass(dtype.type, np.bool_): diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 5754728dbbf0e..88a784f054750 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -41,6 +41,7 @@ is_sequence, ) from pandas.core.dtypes.concat import concat_compat +from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.generic import ( ABCDataFrame, ABCSeries, @@ -1777,7 +1778,6 @@ def _setitem_with_indexer(self, indexer, value, name="iloc"): indexer, missing = convert_missing_indexer(indexer) if missing: - # import pdb; pdb.set_trace() self._setitem_with_indexer_missing(indexer, value) return @@ -2112,7 +2112,6 @@ def _setitem_with_indexer_missing(self, indexer, value): # We should not cast, if we have object dtype because we can # set timedeltas into object series curr_dtype = self.obj.dtype - # import pdb; pdb.set_trace() curr_dtype = getattr(curr_dtype, "numpy_dtype", curr_dtype) new_dtype = maybe_promote(curr_dtype, value)[0] else: @@ -2121,9 +2120,16 @@ def _setitem_with_indexer_missing(self, indexer, value): new_values = Series([value], dtype=new_dtype)._values if len(self.obj._values): + # GH#47677 handle enlarging with a scalar as a special case + if ( + isinstance(self.obj.dtype, CategoricalDtype) + and new_dtype == "category" + ): + new_values = Series(self.obj.tolist() + [value], dtype="category") # GH#22717 handle casting compatibility that np.concatenate # does incorrectly - new_values = concat_compat([self.obj._values, new_values]) + else: + new_values = concat_compat([self.obj._values, new_values]) self.obj._mgr = self.obj._constructor( new_values, index=new_index, name=self.obj.name )._mgr diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 02b9666e31dcb..6ca22946ac9c5 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -1026,3 +1026,10 @@ def test_additional_element_to_categorical_series_loc(): result.loc[3] = 0 expected = Series(["a", "b", "c", 0], dtype="object") tm.assert_series_equal(result, expected) + + +def test_additional_categorical_element_loc(): + result = Series(["a", "b", "c"], dtype="category") + result.loc[3] = "a" + expected = Series(["a", "b", "c", "a"], dtype="category") + tm.assert_series_equal(result, expected) From 5acbf42c214db4c6fa65957f06c12dfb5088eedb Mon Sep 17 00:00:00 2001 From: "tobias.pitters" Date: Mon, 18 Jul 2022 08:14:35 +0200 Subject: [PATCH 4/9] update due to PR discussions --- doc/source/whatsnew/v1.5.0.rst | 2 +- pandas/core/dtypes/cast.py | 2 +- pandas/core/indexing.py | 9 +-------- pandas/tests/indexing/test_indexing.py | 14 -------------- pandas/tests/indexing/test_loc.py | 18 ++++++++++++++++++ 5 files changed, 21 insertions(+), 24 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index a66128775f984..f1c3c5793d25d 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -908,7 +908,7 @@ Indexing - Bug in :meth:`NDFrame.xs`, :meth:`DataFrame.iterrows`, :meth:`DataFrame.loc` and :meth:`DataFrame.iloc` not always propagating metadata (:issue:`28283`) - Bug in :meth:`DataFrame.sum` min_count changes dtype if input contains NaNs (:issue:`46947`) - Bug in :class:`IntervalTree` that lead to an infinite recursion. (:issue:`46658`) -- Bug in :meth:`DataFrame.loc` when creating a new element on a :class:`Series` with dtype :class:`CategoricalDtype` (:issue:`47677`) +- Bug in :meth:`DataFrame.loc` when enlarging a :class:`Series` with dtype :class:`CategoricalDtype` with a scalar (:issue:`47677`) Missing ^^^^^^^ diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 093b51a38d067..8055373e6ec03 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -648,7 +648,7 @@ def _maybe_promote(dtype: np.dtype, fill_value=np.nan): elif isinstance(dtype, CategoricalDtype): if fill_value in dtype.categories: - return "category", fill_value + return dtype, fill_value else: return object, ensure_object(fill_value) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 88a784f054750..fc4d7c66c3fba 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -41,7 +41,6 @@ is_sequence, ) from pandas.core.dtypes.concat import concat_compat -from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.generic import ( ABCDataFrame, ABCSeries, @@ -2121,15 +2120,9 @@ def _setitem_with_indexer_missing(self, indexer, value): if len(self.obj._values): # GH#47677 handle enlarging with a scalar as a special case - if ( - isinstance(self.obj.dtype, CategoricalDtype) - and new_dtype == "category" - ): - new_values = Series(self.obj.tolist() + [value], dtype="category") # GH#22717 handle casting compatibility that np.concatenate # does incorrectly - else: - new_values = concat_compat([self.obj._values, new_values]) + new_values = concat_compat([self.obj._values, new_values]) self.obj._mgr = self.obj._constructor( new_values, index=new_index, name=self.obj.name )._mgr diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 6ca22946ac9c5..9d10e487e0cc2 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -1019,17 +1019,3 @@ def test_ser_list_indexer_exceeds_dimensions(indexer_li): res = indexer_li(ser)[[0, 0]] exp = Series([10, 10], index=Index([0, 0])) tm.assert_series_equal(res, exp) - - -def test_additional_element_to_categorical_series_loc(): - result = Series(["a", "b", "c"], dtype="category") - result.loc[3] = 0 - expected = Series(["a", "b", "c", 0], dtype="object") - tm.assert_series_equal(result, expected) - - -def test_additional_categorical_element_loc(): - result = Series(["a", "b", "c"], dtype="category") - result.loc[3] = "a" - expected = Series(["a", "b", "c", "a"], dtype="category") - tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 4c38a2219372d..37bee2caafc10 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1820,6 +1820,24 @@ def test_loc_getitem_sorted_index_level_with_duplicates(self): result = df.loc[("foo", "bar")] tm.assert_frame_equal(result, expected) + def test_additional_element_to_categorical_series_loc(self): + result = Series(["a", "b", "c"], dtype="category") + result.loc[3] = 0 + expected = Series(["a", "b", "c", 0], dtype="object") + tm.assert_series_equal(result, expected) + + def test_additional_categorical_element_loc(self): + result = Series(["a", "b", "c"], dtype="category") + result.loc[3] = "a" + expected = Series(["a", "b", "c", "a"], dtype="category") + tm.assert_series_equal(result, expected) + + def test_loc_enlarge_category_series_with_np_nan(self): + result = Series(["a", "b", "c"], dtype="category") + result.loc[3] = np.nan + expected = Series(["a", "b", "c", np.nan], dtype=object) + tm.assert_series_equal(expected, result) + def test_loc_getitem_preserves_index_level_category_dtype(self): # GH#15166 df = DataFrame( From 59538cd2d6136ccb0519162adf422914d6beeb66 Mon Sep 17 00:00:00 2001 From: "tobias.pitters" Date: Mon, 18 Jul 2022 09:55:17 +0200 Subject: [PATCH 5/9] remove unnecessary comment --- pandas/core/indexing.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index fc4d7c66c3fba..fa1ad7ce3c874 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -2119,7 +2119,6 @@ def _setitem_with_indexer_missing(self, indexer, value): new_values = Series([value], dtype=new_dtype)._values if len(self.obj._values): - # GH#47677 handle enlarging with a scalar as a special case # GH#22717 handle casting compatibility that np.concatenate # does incorrectly new_values = concat_compat([self.obj._values, new_values]) From efdcf1cae3f4993bb10313f4c44633a958dd2541 Mon Sep 17 00:00:00 2001 From: "tobias.pitters" Date: Mon, 18 Jul 2022 17:09:22 +0200 Subject: [PATCH 6/9] WIP: fix nan for enlarging --- pandas/core/dtypes/cast.py | 6 ++++-- pandas/tests/indexing/test_loc.py | 14 +++++++++++--- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 8055373e6ec03..a88e6a0fb0179 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -591,7 +591,9 @@ def _maybe_promote(dtype: np.dtype, fill_value=np.nan): return dtype, fv elif isna(fill_value): - dtype = _dtype_obj + # preserve dtype in case of categoricaldtype + if not isinstance(dtype, CategoricalDtype): + dtype = _dtype_obj if fill_value is None: # but we retain e.g. pd.NA fill_value = np.nan @@ -647,7 +649,7 @@ def _maybe_promote(dtype: np.dtype, fill_value=np.nan): return np.dtype("object"), fill_value elif isinstance(dtype, CategoricalDtype): - if fill_value in dtype.categories: + if fill_value in dtype.categories or isna(fill_value): return dtype, fill_value else: return object, ensure_object(fill_value) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 37bee2caafc10..c95f91e94b3fd 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1832,10 +1832,18 @@ def test_additional_categorical_element_loc(self): expected = Series(["a", "b", "c", "a"], dtype="category") tm.assert_series_equal(result, expected) - def test_loc_enlarge_category_series_with_np_nan(self): + @pytest.mark.parametrize("na", (np.nan, pd.NA, None)) + def test_loc_enlarge_category_series_with_nan(self, na): result = Series(["a", "b", "c"], dtype="category") - result.loc[3] = np.nan - expected = Series(["a", "b", "c", np.nan], dtype=object) + result.loc[3] = na + expected = Series(["a", "b", "c", na], dtype="category") + tm.assert_series_equal(expected, result) + + @pytest.mark.parametrize("na", (np.nan, pd.NA, None)) + def test_loc_set_nan_into_category_series(self, na): + result = Series(["a", "b", "c", "d"], dtype="category") + result.loc[3] = na + expected = Series(["a", "b", "c", na], dtype="category") tm.assert_series_equal(expected, result) def test_loc_getitem_preserves_index_level_category_dtype(self): From 83f75ceb983235f11675f7761b182a27b56d5e58 Mon Sep 17 00:00:00 2001 From: "tobias.pitters" Date: Mon, 18 Jul 2022 18:08:11 +0200 Subject: [PATCH 7/9] add tests; fix nan in _maybe_promote --- pandas/tests/indexing/test_loc.py | 40 +++++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index c95f91e94b3fd..882eff456e1a1 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -18,11 +18,16 @@ import pandas as pd from pandas import ( Categorical, + CategoricalDtype, CategoricalIndex, DataFrame, DatetimeIndex, + Float32Dtype, + Float64Dtype, Index, IndexSlice, + Int32Dtype, + Int64Dtype, MultiIndex, Period, PeriodIndex, @@ -1821,30 +1826,45 @@ def test_loc_getitem_sorted_index_level_with_duplicates(self): tm.assert_frame_equal(result, expected) def test_additional_element_to_categorical_series_loc(self): + # GH#47677 result = Series(["a", "b", "c"], dtype="category") result.loc[3] = 0 expected = Series(["a", "b", "c", 0], dtype="object") tm.assert_series_equal(result, expected) def test_additional_categorical_element_loc(self): + # GH#47677 result = Series(["a", "b", "c"], dtype="category") result.loc[3] = "a" expected = Series(["a", "b", "c", "a"], dtype="category") tm.assert_series_equal(result, expected) - @pytest.mark.parametrize("na", (np.nan, pd.NA, None)) - def test_loc_enlarge_category_series_with_nan(self, na): - result = Series(["a", "b", "c"], dtype="category") - result.loc[3] = na - expected = Series(["a", "b", "c", na], dtype="category") - tm.assert_series_equal(expected, result) + @pytest.mark.parametrize( + "dtype", + (np.int64, np.float64, Int64Dtype, Int32Dtype, Float64Dtype, Float32Dtype), + ) + def test_loc_set_nan_in_categorical_series(self, dtype): + # GH#47677 + srs = Series([1, 2, 3], dtype=CategoricalDtype(Index([1, 2, 3], dtype=dtype))) + # enlarge + srs.loc[3] = np.nan + assert srs.values.dtype._categories.dtype == dtype + # set into + srs.loc[1] = np.nan + assert srs.values.dtype._categories.dtype == dtype @pytest.mark.parametrize("na", (np.nan, pd.NA, None)) - def test_loc_set_nan_into_category_series(self, na): - result = Series(["a", "b", "c", "d"], dtype="category") - result.loc[3] = na + def test_loc_consistency_series_enlarge_set_into(self, na): + # GH#47677 + srs_enlarge = Series(["a", "b", "c"], dtype="category") + srs_enlarge.loc[3] = na + + srs_setinto = Series(["a", "b", "c", "a"], dtype="category") + srs_setinto.loc[3] = na + + tm.assert_series_equal(srs_enlarge, srs_setinto) expected = Series(["a", "b", "c", na], dtype="category") - tm.assert_series_equal(expected, result) + tm.assert_series_equal(srs_enlarge, expected) def test_loc_getitem_preserves_index_level_category_dtype(self): # GH#15166 From 7bfb9b758c253730b6459e9fc8847cf1cea6201c Mon Sep 17 00:00:00 2001 From: "tobias.pitters" Date: Mon, 18 Jul 2022 18:17:37 +0200 Subject: [PATCH 8/9] remove unnecessary statement --- pandas/core/dtypes/cast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index a88e6a0fb0179..00b18ef154f27 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -649,7 +649,7 @@ def _maybe_promote(dtype: np.dtype, fill_value=np.nan): return np.dtype("object"), fill_value elif isinstance(dtype, CategoricalDtype): - if fill_value in dtype.categories or isna(fill_value): + if fill_value in dtype.categories: return dtype, fill_value else: return object, ensure_object(fill_value) From d25f7ebfac3f39eb6d17ce01cf56f6ead826e98d Mon Sep 17 00:00:00 2001 From: "tobias.pitters" Date: Mon, 18 Jul 2022 18:55:31 +0200 Subject: [PATCH 9/9] use any_numeric_ea_dtype for tests --- pandas/tests/indexing/test_loc.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 882eff456e1a1..60c4ee5518047 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -22,12 +22,8 @@ CategoricalIndex, DataFrame, DatetimeIndex, - Float32Dtype, - Float64Dtype, Index, IndexSlice, - Int32Dtype, - Int64Dtype, MultiIndex, Period, PeriodIndex, @@ -1839,19 +1835,18 @@ def test_additional_categorical_element_loc(self): expected = Series(["a", "b", "c", "a"], dtype="category") tm.assert_series_equal(result, expected) - @pytest.mark.parametrize( - "dtype", - (np.int64, np.float64, Int64Dtype, Int32Dtype, Float64Dtype, Float32Dtype), - ) - def test_loc_set_nan_in_categorical_series(self, dtype): + def test_loc_set_nan_in_categorical_series(self, any_numeric_ea_dtype): # GH#47677 - srs = Series([1, 2, 3], dtype=CategoricalDtype(Index([1, 2, 3], dtype=dtype))) + srs = Series( + [1, 2, 3], + dtype=CategoricalDtype(Index([1, 2, 3], dtype=any_numeric_ea_dtype)), + ) # enlarge srs.loc[3] = np.nan - assert srs.values.dtype._categories.dtype == dtype + assert srs.values.dtype._categories.dtype == any_numeric_ea_dtype # set into srs.loc[1] = np.nan - assert srs.values.dtype._categories.dtype == dtype + assert srs.values.dtype._categories.dtype == any_numeric_ea_dtype @pytest.mark.parametrize("na", (np.nan, pd.NA, None)) def test_loc_consistency_series_enlarge_set_into(self, na):