From 4487bec4c98bdfc2ac79dcc85e2c36552c59431e Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Tue, 5 May 2020 22:13:48 +0200 Subject: [PATCH 01/60] Add fix to raise error when category value is not predefined --- pandas/core/generic.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b550857252466..b9b407ea6e495 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -70,6 +70,7 @@ is_float, is_list_like, is_number, + is_categorical_dtype, is_numeric_dtype, is_object_dtype, is_re_compilable, @@ -5226,6 +5227,19 @@ def __setattr__(self, name: str, value) -> None: After regular attribute access, try setting the name This allows simpler access to columns for interactive use. """ + # Fix for handling assignment to a Catagorical dtype + # with a category which is not predefined. + if isinstance(value, BlockManager) and is_categorical_dtype(self): + if len(value) > 0: + new_value = value.as_array()[-1] + else: + new_value = None + if not pd.isna(new_value) and new_value not in self.dtype.categories.values: + raise ValueError( + "Cannot setitem on a Categorical with a new " + "category, set the categories first" + ) + # first try regular attribute access via __getattribute__, so that # e.g. ``obj.x`` and ``obj.x = 4`` will always reference/modify # the same attribute. From 10098ab2d464882eda2ac4e9289a5af5cebc1963 Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Tue, 5 May 2020 23:02:51 +0200 Subject: [PATCH 02/60] Fix linting --- pandas/core/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b9b407ea6e495..d76f371563bff 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -63,6 +63,7 @@ ensure_str, is_bool, is_bool_dtype, + is_categorical_dtype, is_datetime64_any_dtype, is_datetime64tz_dtype, is_dict_like, @@ -70,7 +71,6 @@ is_float, is_list_like, is_number, - is_categorical_dtype, is_numeric_dtype, is_object_dtype, is_re_compilable, From cb34580a94f4b97313c0adb05925a432ef6db014 Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Tue, 5 May 2020 23:25:05 +0200 Subject: [PATCH 03/60] Added new test --- pandas/tests/arrays/categorical/test_indexing.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/tests/arrays/categorical/test_indexing.py b/pandas/tests/arrays/categorical/test_indexing.py index abfae189bb4d7..90b73104153da 100644 --- a/pandas/tests/arrays/categorical/test_indexing.py +++ b/pandas/tests/arrays/categorical/test_indexing.py @@ -234,6 +234,12 @@ def test_where_ordered_differs_rasies(self): with pytest.raises(ValueError, match="without identical categories"): ser.where([True, False, True], other) + def test_loc_new_category_raises(self): + ser = pd.Series(Categorical(["a", "b", "c"])) + msg = "Cannot setitem on a Categorical with a new category" + with pytest.raises(ValueError, match=msg): + ser.loc[3] = "d" + @pytest.mark.parametrize("index", [True, False]) def test_mask_with_boolean(index): From c627fa69ae9231f0355d3a40f31d993e6449bd1b Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Sat, 16 May 2020 18:06:47 +0200 Subject: [PATCH 04/60] Add test case for unused categories --- pandas/tests/arrays/categorical/test_indexing.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pandas/tests/arrays/categorical/test_indexing.py b/pandas/tests/arrays/categorical/test_indexing.py index 90b73104153da..5282e424a6103 100644 --- a/pandas/tests/arrays/categorical/test_indexing.py +++ b/pandas/tests/arrays/categorical/test_indexing.py @@ -240,6 +240,15 @@ def test_loc_new_category_raises(self): with pytest.raises(ValueError, match=msg): ser.loc[3] = "d" + def test_unused_category_retention(self): + # Init case + exp_cats = Index(["a", "b", "c", "d"]) + cat1 = Series(Categorical(["a", "b", "c"], categories=exp_cats)) + tm.assert_index_equal(cat1.cat.categories, exp_cats) + + # Modify case + cat1.loc[0] = "b" + tm.assert_index_equal(cat1.cat.categories, exp_cats) @pytest.mark.parametrize("index", [True, False]) def test_mask_with_boolean(index): From ba3a75107639fe9344e4e583ea8e1a8f84921bd0 Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Sat, 16 May 2020 18:08:03 +0200 Subject: [PATCH 05/60] Remove trailing whitespace --- pandas/tests/arrays/categorical/test_indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/arrays/categorical/test_indexing.py b/pandas/tests/arrays/categorical/test_indexing.py index 5282e424a6103..d2acd39f3d089 100644 --- a/pandas/tests/arrays/categorical/test_indexing.py +++ b/pandas/tests/arrays/categorical/test_indexing.py @@ -248,7 +248,7 @@ def test_unused_category_retention(self): # Modify case cat1.loc[0] = "b" - tm.assert_index_equal(cat1.cat.categories, exp_cats) + tm.assert_index_equal(cat1.cat.categories, exp_cats) @pytest.mark.parametrize("index", [True, False]) def test_mask_with_boolean(index): From 51dcdfedb10bb1f19a2c49b290ac484283128ae8 Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Sat, 16 May 2020 22:01:23 +0200 Subject: [PATCH 06/60] Fix linting --- pandas/tests/arrays/categorical/test_indexing.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/arrays/categorical/test_indexing.py b/pandas/tests/arrays/categorical/test_indexing.py index d2acd39f3d089..dd2302b2bca59 100644 --- a/pandas/tests/arrays/categorical/test_indexing.py +++ b/pandas/tests/arrays/categorical/test_indexing.py @@ -249,6 +249,7 @@ def test_unused_category_retention(self): # Modify case cat1.loc[0] = "b" tm.assert_index_equal(cat1.cat.categories, exp_cats) + @pytest.mark.parametrize("index", [True, False]) def test_mask_with_boolean(index): From 9057b262008862c65cdb8181f7bac8db9f74a1b9 Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Sat, 16 May 2020 22:24:23 +0200 Subject: [PATCH 07/60] Fix linting --- pandas/tests/arrays/categorical/test_indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/arrays/categorical/test_indexing.py b/pandas/tests/arrays/categorical/test_indexing.py index dd2302b2bca59..7ed85da4c8502 100644 --- a/pandas/tests/arrays/categorical/test_indexing.py +++ b/pandas/tests/arrays/categorical/test_indexing.py @@ -249,7 +249,7 @@ def test_unused_category_retention(self): # Modify case cat1.loc[0] = "b" tm.assert_index_equal(cat1.cat.categories, exp_cats) - + @pytest.mark.parametrize("index", [True, False]) def test_mask_with_boolean(index): From 06fdc3ea4ca95983310ee69c8d415506b6dd99e3 Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Sat, 23 May 2020 21:02:54 +0200 Subject: [PATCH 08/60] Remove temporary fix from generic.py --- pandas/core/generic.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 3e9f87eb5e0a7..c3e6b1e989555 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5229,19 +5229,6 @@ def __setattr__(self, name: str, value) -> None: After regular attribute access, try setting the name This allows simpler access to columns for interactive use. """ - # Fix for handling assignment to a Catagorical dtype - # with a category which is not predefined. - if isinstance(value, BlockManager) and is_categorical_dtype(self): - if len(value) > 0: - new_value = value.as_array()[-1] - else: - new_value = None - if not pd.isna(new_value) and new_value not in self.dtype.categories.values: - raise ValueError( - "Cannot setitem on a Categorical with a new " - "category, set the categories first" - ) - # first try regular attribute access via __getattribute__, so that # e.g. ``obj.x`` and ``obj.x = 4`` will always reference/modify # the same attribute. From 582c02344e1085af4d886e90523c05596fc05a9b Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Sun, 24 May 2020 18:50:12 +0200 Subject: [PATCH 09/60] First fix try through indexing.py --- pandas/core/indexing.py | 39 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index b857a59195695..f898bb11a306b 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -20,6 +20,8 @@ from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.generic import ABCDataFrame, ABCMultiIndex, ABCSeries from pandas.core.dtypes.missing import _infer_fill_value, isna +from pandas.core.dtypes.cast import find_common_type +from pandas.core.dtypes.common import is_categorical_dtype import pandas.core.common as com from pandas.core.indexers import ( @@ -1790,7 +1792,7 @@ def _setitem_with_indexer_missing(self, indexer, value): """ Insert new row(s) or column(s) into the Series or DataFrame. """ - from pandas import Series + from pandas import Series, DataFrame # reindex the axis to the new value # and set inplace @@ -1815,8 +1817,21 @@ def _setitem_with_indexer_missing(self, indexer, value): # GH#22717 handle casting compatibility that np.concatenate # does incorrectly new_values = concat_compat([self.obj._values, new_values]) + if is_object_dtype(new_values.dtype): + dtype = self.obj.dtype + else: + dtype = find_common_type([self.obj.dtype, new_values.dtype]) + else: + dtype = None + + if is_categorical_dtype(self.obj.dtype): + if (~np.in1d(new_values, self.obj.dtypes.categories.values)).any(): + raise ValueError( + "Cannot setitem on a Categorical with a new category" + ) + self.obj._mgr = self.obj._constructor( - new_values, index=new_index, name=self.obj.name + new_values, index=new_index, name=self.obj.name, dtype=dtype )._mgr self.obj._maybe_update_cacher(clear=True) @@ -1838,7 +1853,25 @@ def _setitem_with_indexer_missing(self, indexer, value): if len(value) != len(self.obj.columns): raise ValueError("cannot set a row with mismatched columns") - value = Series(value, index=self.obj.columns, name=indexer) + if len(set(self.obj.dtypes)) > 1: + value = list(value) + for i in range(len(self.obj.columns)): + value[i] = Series(data=[value[i]], dtype=self.obj.dtypes[i]) + if is_categorical_dtype(self.obj.dtypes[i]): + if ( + ~np.in1d( + value[i].values, + self.obj.dtypes[i].categories.values, + ) + ).any(): + raise ValueError( + "Cannot setitem on a Categorical with a new category" + ) + value = dict(zip(self.obj.columns, value)) + value = DataFrame(value) + value.index = [indexer] + else: + value = Series(value, index=self.obj.columns, name=indexer) self.obj._mgr = self.obj.append(value)._mgr self.obj._maybe_update_cacher(clear=True) From 730fc2b1a730ca4d4a10948f54c47c426848b6fd Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Sun, 24 May 2020 19:33:50 +0200 Subject: [PATCH 10/60] Fix lint --- pandas/core/generic.py | 1 - pandas/core/indexing.py | 24 +++++++++--------------- 2 files changed, 9 insertions(+), 16 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 98c9924a932ac..8aa8f8bb60654 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -65,7 +65,6 @@ ensure_str, is_bool, is_bool_dtype, - is_categorical_dtype, is_datetime64_any_dtype, is_datetime64tz_dtype, is_dict_like, diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index f898bb11a306b..18b401021605b 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1794,6 +1794,13 @@ def _setitem_with_indexer_missing(self, indexer, value): """ from pandas import Series, DataFrame + def check_valid_categorical(new_values, obj_dtype): + if is_categorical_dtype(obj_dtype): + if (~np.in1d(new_values, obj_dtype.categories.values)).any(): + raise ValueError( + "Cannot setitem on a Categorical with a new category" + ) + # reindex the axis to the new value # and set inplace if self.ndim == 1: @@ -1824,11 +1831,7 @@ def _setitem_with_indexer_missing(self, indexer, value): else: dtype = None - if is_categorical_dtype(self.obj.dtype): - if (~np.in1d(new_values, self.obj.dtypes.categories.values)).any(): - raise ValueError( - "Cannot setitem on a Categorical with a new category" - ) + check_valid_categorical(new_values, self.obj.dtype) self.obj._mgr = self.obj._constructor( new_values, index=new_index, name=self.obj.name, dtype=dtype @@ -1857,16 +1860,7 @@ def _setitem_with_indexer_missing(self, indexer, value): value = list(value) for i in range(len(self.obj.columns)): value[i] = Series(data=[value[i]], dtype=self.obj.dtypes[i]) - if is_categorical_dtype(self.obj.dtypes[i]): - if ( - ~np.in1d( - value[i].values, - self.obj.dtypes[i].categories.values, - ) - ).any(): - raise ValueError( - "Cannot setitem on a Categorical with a new category" - ) + check_valid_categorical(value[i], self.obj.dtypes[i]) value = dict(zip(self.obj.columns, value)) value = DataFrame(value) value.index = [indexer] From c275eb95f0b6d3a2917679a2d1d675f62c0f10ce Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Sun, 24 May 2020 19:59:09 +0200 Subject: [PATCH 11/60] Fix import ordering --- pandas/core/indexing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 18b401021605b..175ae26eb38b9 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -7,7 +7,9 @@ from pandas.errors import AbstractMethodError from pandas.util._decorators import doc +from pandas.core.dtypes.cast import find_common_type from pandas.core.dtypes.common import ( + is_categorical_dtype, is_hashable, is_integer, is_iterator, @@ -20,8 +22,6 @@ from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.generic import ABCDataFrame, ABCMultiIndex, ABCSeries from pandas.core.dtypes.missing import _infer_fill_value, isna -from pandas.core.dtypes.cast import find_common_type -from pandas.core.dtypes.common import is_categorical_dtype import pandas.core.common as com from pandas.core.indexers import ( From 944ae24dbe53905dafd1de5e4b5fd2d452fc2a12 Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Sun, 24 May 2020 22:13:00 +0200 Subject: [PATCH 12/60] Fix Update --- pandas/core/indexing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 175ae26eb38b9..dab4c470c6ec9 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1825,14 +1825,14 @@ def check_valid_categorical(new_values, obj_dtype): # does incorrectly new_values = concat_compat([self.obj._values, new_values]) if is_object_dtype(new_values.dtype): - dtype = self.obj.dtype + dtype = None else: dtype = find_common_type([self.obj.dtype, new_values.dtype]) else: dtype = None check_valid_categorical(new_values, self.obj.dtype) - + self.obj._mgr = self.obj._constructor( new_values, index=new_index, name=self.obj.name, dtype=dtype )._mgr From 8372bdbeff5ec238653ce6887b348f2fcc998aa0 Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Sun, 24 May 2020 22:38:56 +0200 Subject: [PATCH 13/60] Fix lint --- pandas/core/indexing.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index dab4c470c6ec9..b64926f635fae 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1832,7 +1832,6 @@ def check_valid_categorical(new_values, obj_dtype): dtype = None check_valid_categorical(new_values, self.obj.dtype) - self.obj._mgr = self.obj._constructor( new_values, index=new_index, name=self.obj.name, dtype=dtype )._mgr From 0e5e41838b412c1bb18b2d7fbc9eba6e80850e5d Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Sun, 24 May 2020 23:16:55 +0200 Subject: [PATCH 14/60] Include more related test cases --- .../tests/arrays/categorical/test_indexing.py | 23 ++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/pandas/tests/arrays/categorical/test_indexing.py b/pandas/tests/arrays/categorical/test_indexing.py index 7ed85da4c8502..9e74cdc29da74 100644 --- a/pandas/tests/arrays/categorical/test_indexing.py +++ b/pandas/tests/arrays/categorical/test_indexing.py @@ -3,6 +3,8 @@ import pandas as pd from pandas import Categorical, CategoricalIndex, Index, PeriodIndex, Series +from pandas.core.dtypes.common import is_categorical_dtype + import pandas._testing as tm import pandas.core.common as com from pandas.tests.arrays.categorical.common import TestCategorical @@ -234,7 +236,7 @@ def test_where_ordered_differs_rasies(self): with pytest.raises(ValueError, match="without identical categories"): ser.where([True, False, True], other) - def test_loc_new_category_raises(self): + def test_loc_new_category_series_raises(self): ser = pd.Series(Categorical(["a", "b", "c"])) msg = "Cannot setitem on a Categorical with a new category" with pytest.raises(ValueError, match=msg): @@ -250,6 +252,25 @@ def test_unused_category_retention(self): cat1.loc[0] = "b" tm.assert_index_equal(cat1.cat.categories, exp_cats) + def test_loc_new_category_row_raises(self): + data = { + "int": [0, 1, 2], + "cat": pd.Categorical(["a", "b", "c"], categories=["a", "b", "c"]), + } + df = pd.DataFrame.from_dict(data) + msg = "Cannot setitem on a Categorical with a new category" + with pytest.raises(ValueError, match=msg): + df.loc[3] = [3, "d"] + + def test_loc_new_row_category_dtype_retention(self): + data = { + "int": [0, 1, 2], + "cat": pd.Categorical(["a", "b", "c"], categories=["a", "b", "c"]), + } + df = pd.DataFrame.from_dict(data) + df.loc[3] = [3, "c"] + assert is_categorical_dtype(df["cat"]) + @pytest.mark.parametrize("index", [True, False]) def test_mask_with_boolean(index): From eea359acde96e95e722f6cda1c3e7d7e06a33e6d Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Sun, 24 May 2020 23:38:05 +0200 Subject: [PATCH 15/60] Fix linting --- pandas/tests/arrays/categorical/test_indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/arrays/categorical/test_indexing.py b/pandas/tests/arrays/categorical/test_indexing.py index 9e74cdc29da74..fe306e3c1f0e3 100644 --- a/pandas/tests/arrays/categorical/test_indexing.py +++ b/pandas/tests/arrays/categorical/test_indexing.py @@ -2,8 +2,8 @@ import pytest import pandas as pd -from pandas import Categorical, CategoricalIndex, Index, PeriodIndex, Series from pandas.core.dtypes.common import is_categorical_dtype +from pandas import Categorical, CategoricalIndex, Index, PeriodIndex, Series import pandas._testing as tm import pandas.core.common as com From 5f72d4ec744c66ee9312e4251d918cd03dbfa9a7 Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Sun, 24 May 2020 23:56:28 +0200 Subject: [PATCH 16/60] Update test_indexing.py --- pandas/tests/arrays/categorical/test_indexing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/arrays/categorical/test_indexing.py b/pandas/tests/arrays/categorical/test_indexing.py index fe306e3c1f0e3..5307118af0d01 100644 --- a/pandas/tests/arrays/categorical/test_indexing.py +++ b/pandas/tests/arrays/categorical/test_indexing.py @@ -1,10 +1,10 @@ import numpy as np import pytest -import pandas as pd from pandas.core.dtypes.common import is_categorical_dtype -from pandas import Categorical, CategoricalIndex, Index, PeriodIndex, Series +import pandas as pd +from pandas import Categorical, CategoricalIndex, Index, PeriodIndex, Series import pandas._testing as tm import pandas.core.common as com from pandas.tests.arrays.categorical.common import TestCategorical From 26f474bd0afa5ac3be8189777adef3fb7862fc36 Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Thu, 8 Oct 2020 23:23:22 +0300 Subject: [PATCH 17/60] import missing dtypes function --- pandas/core/indexing.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 16b0c096a0624..d8856ace5c9d5 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -13,6 +13,7 @@ from pandas.core.dtypes.cast import find_common_type from pandas.core.dtypes.common import ( + is_categorical_dtype, is_array_like, is_hashable, is_integer, From 215943e8f5c8e2a1e69253de8e4fe24f676bdb66 Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Thu, 8 Oct 2020 23:32:12 +0300 Subject: [PATCH 18/60] Fix linting --- pandas/core/indexing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index d8856ace5c9d5..5c960f90b1459 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -13,8 +13,8 @@ from pandas.core.dtypes.cast import find_common_type from pandas.core.dtypes.common import ( - is_categorical_dtype, is_array_like, + is_categorical_dtype, is_hashable, is_integer, is_iterator, @@ -1844,7 +1844,7 @@ def _setitem_with_indexer_missing(self, indexer, value): """ Insert new row(s) or column(s) into the Series or DataFrame. """ - from pandas import Series, DataFrame + from pandas import DataFrame, Series def check_valid_categorical(new_values, obj_dtype): if is_categorical_dtype(obj_dtype): From 5bacde9276d753546b20627c6b838c52e2d6f28e Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Sat, 10 Oct 2020 13:12:56 +0300 Subject: [PATCH 19/60] Include requested changes - Moved the tests for this PR to /tests/series/test_categorical.py - Applied the code changes based on feedback --- .../tests/arrays/categorical/test_indexing.py | 36 ------------- pandas/tests/series/test_categorical.py | 50 +++++++++++++++++++ 2 files changed, 50 insertions(+), 36 deletions(-) create mode 100644 pandas/tests/series/test_categorical.py diff --git a/pandas/tests/arrays/categorical/test_indexing.py b/pandas/tests/arrays/categorical/test_indexing.py index 686aa9ccf764a..209341a4b8634 100644 --- a/pandas/tests/arrays/categorical/test_indexing.py +++ b/pandas/tests/arrays/categorical/test_indexing.py @@ -1,7 +1,6 @@ import numpy as np import pytest -from pandas.core.dtypes.common import is_categorical_dtype import pandas as pd from pandas import Categorical, CategoricalIndex, Index, PeriodIndex, Series @@ -244,41 +243,6 @@ def test_where_ordered_differs_rasies(self): with pytest.raises(ValueError, match="without identical categories"): ser.where([True, False, True], other) - def test_loc_new_category_series_raises(self): - ser = pd.Series(Categorical(["a", "b", "c"])) - msg = "Cannot setitem on a Categorical with a new category" - with pytest.raises(ValueError, match=msg): - ser.loc[3] = "d" - - def test_unused_category_retention(self): - # Init case - exp_cats = Index(["a", "b", "c", "d"]) - cat1 = Series(Categorical(["a", "b", "c"], categories=exp_cats)) - tm.assert_index_equal(cat1.cat.categories, exp_cats) - - # Modify case - cat1.loc[0] = "b" - tm.assert_index_equal(cat1.cat.categories, exp_cats) - - def test_loc_new_category_row_raises(self): - data = { - "int": [0, 1, 2], - "cat": pd.Categorical(["a", "b", "c"], categories=["a", "b", "c"]), - } - df = pd.DataFrame.from_dict(data) - msg = "Cannot setitem on a Categorical with a new category" - with pytest.raises(ValueError, match=msg): - df.loc[3] = [3, "d"] - - def test_loc_new_row_category_dtype_retention(self): - data = { - "int": [0, 1, 2], - "cat": pd.Categorical(["a", "b", "c"], categories=["a", "b", "c"]), - } - df = pd.DataFrame.from_dict(data) - df.loc[3] = [3, "c"] - assert is_categorical_dtype(df["cat"]) - @pytest.mark.parametrize("index", [True, False]) def test_mask_with_boolean(index): diff --git a/pandas/tests/series/test_categorical.py b/pandas/tests/series/test_categorical.py new file mode 100644 index 0000000000000..cbac23d3adcd9 --- /dev/null +++ b/pandas/tests/series/test_categorical.py @@ -0,0 +1,50 @@ +import pytest +import pandas as pd +import pandas._testing as tm + +from pandas import Categorical, Index + + +class TestCategoricalSeries: + def test_loc_new_category_series_raises(self): + ser = pd.Series(Categorical(["a", "b", "c"])) + msg = "Cannot setitem on a Categorical with a new category" + with pytest.raises(ValueError, match=msg): + ser.loc[3] = "d" + + def test_unused_category_retention(self): + # Init case + exp_cats = Index(["a", "b", "c", "d"]) + ser = pd.Series(Categorical(["a", "b", "c"], categories=exp_cats)) + tm.assert_index_equal(ser.cat.categories, exp_cats) + + # Modify case + ser.loc[0] = "b" + expected = pd.Series(Categorical(["b", "b", "c"], categories=exp_cats)) + tm.assert_index_equal(ser.cat.categories, exp_cats) + tm.assert_series_equal(ser, expected) + + def test_loc_new_category_row_raises(self): + data = { + "int": [0, 1, 2], + "cat": Categorical(["a", "b", "c"], categories=["a", "b", "c"]), + } + df = pd.DataFrame(data) + msg = "Cannot setitem on a Categorical with a new category" + with pytest.raises(ValueError, match=msg): + df.loc[3] = [3, "d"] + + def test_loc_new_row_category_dtype_retention(self): + df_data = { + "int": [0, 1, 2], + "cat": pd.Categorical(["a", "b", "c"], categories=["a", "b", "c"]), + } + df = pd.DataFrame(df_data) + df.loc[3] = [3, "c"] + + expected_data = { + "int": [0, 1, 2, 3], + "cat": pd.Categorical(["a", "b", "c", "c"], categories=["a", "b", "c"]), + } + expected = pd.DataFrame(expected_data) + tm.assert_frame_equal(df, expected) From 96e4318d0d1702fd0cadb1e7387fe890be337f3b Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Sat, 10 Oct 2020 13:54:19 +0300 Subject: [PATCH 20/60] Fix import ordering/format --- pandas/tests/arrays/categorical/test_indexing.py | 1 - pandas/tests/series/test_categorical.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/tests/arrays/categorical/test_indexing.py b/pandas/tests/arrays/categorical/test_indexing.py index 209341a4b8634..2c4dd8fe64057 100644 --- a/pandas/tests/arrays/categorical/test_indexing.py +++ b/pandas/tests/arrays/categorical/test_indexing.py @@ -1,7 +1,6 @@ import numpy as np import pytest - import pandas as pd from pandas import Categorical, CategoricalIndex, Index, PeriodIndex, Series import pandas._testing as tm diff --git a/pandas/tests/series/test_categorical.py b/pandas/tests/series/test_categorical.py index cbac23d3adcd9..28d1dd54b81cf 100644 --- a/pandas/tests/series/test_categorical.py +++ b/pandas/tests/series/test_categorical.py @@ -1,8 +1,8 @@ import pytest -import pandas as pd -import pandas._testing as tm +import pandas as pd from pandas import Categorical, Index +import pandas._testing as tm class TestCategoricalSeries: From e7ce2464b556bdc1325da7d061aaac80dade592f Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Sun, 11 Oct 2020 22:48:56 +0300 Subject: [PATCH 21/60] Update test_categorical.py --- pandas/tests/series/test_categorical.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/pandas/tests/series/test_categorical.py b/pandas/tests/series/test_categorical.py index 28d1dd54b81cf..630ae45ebb3d4 100644 --- a/pandas/tests/series/test_categorical.py +++ b/pandas/tests/series/test_categorical.py @@ -25,26 +25,24 @@ def test_unused_category_retention(self): tm.assert_series_equal(ser, expected) def test_loc_new_category_row_raises(self): - data = { + df = pd.DataFrame({ "int": [0, 1, 2], "cat": Categorical(["a", "b", "c"], categories=["a", "b", "c"]), - } - df = pd.DataFrame(data) + }) msg = "Cannot setitem on a Categorical with a new category" with pytest.raises(ValueError, match=msg): df.loc[3] = [3, "d"] def test_loc_new_row_category_dtype_retention(self): - df_data = { + df = pd.DataFrame({ "int": [0, 1, 2], "cat": pd.Categorical(["a", "b", "c"], categories=["a", "b", "c"]), - } - df = pd.DataFrame(df_data) + }) df.loc[3] = [3, "c"] - expected_data = { + expected = pd.DataFrame({ "int": [0, 1, 2, 3], "cat": pd.Categorical(["a", "b", "c", "c"], categories=["a", "b", "c"]), - } - expected = pd.DataFrame(expected_data) + }) + tm.assert_frame_equal(df, expected) From 31ef609e0571e03d68cdff54655f43d9772961e3 Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Sun, 11 Oct 2020 22:56:58 +0300 Subject: [PATCH 22/60] Fix format --- pandas/tests/series/test_categorical.py | 30 +++++++++++++++---------- 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/pandas/tests/series/test_categorical.py b/pandas/tests/series/test_categorical.py index 630ae45ebb3d4..20ba11a36d046 100644 --- a/pandas/tests/series/test_categorical.py +++ b/pandas/tests/series/test_categorical.py @@ -25,24 +25,30 @@ def test_unused_category_retention(self): tm.assert_series_equal(ser, expected) def test_loc_new_category_row_raises(self): - df = pd.DataFrame({ - "int": [0, 1, 2], - "cat": Categorical(["a", "b", "c"], categories=["a", "b", "c"]), - }) + df = pd.DataFrame( + { + "int": [0, 1, 2], + "cat": Categorical(["a", "b", "c"], categories=["a", "b", "c"]), + } + ) msg = "Cannot setitem on a Categorical with a new category" with pytest.raises(ValueError, match=msg): df.loc[3] = [3, "d"] def test_loc_new_row_category_dtype_retention(self): - df = pd.DataFrame({ - "int": [0, 1, 2], - "cat": pd.Categorical(["a", "b", "c"], categories=["a", "b", "c"]), - }) + df = pd.DataFrame( + { + "int": [0, 1, 2], + "cat": pd.Categorical(["a", "b", "c"], categories=["a", "b", "c"]), + } + ) df.loc[3] = [3, "c"] - expected = pd.DataFrame({ - "int": [0, 1, 2, 3], - "cat": pd.Categorical(["a", "b", "c", "c"], categories=["a", "b", "c"]), - }) + expected = pd.DataFrame( + { + "int": [0, 1, 2, 3], + "cat": pd.Categorical(["a", "b", "c", "c"], categories=["a", "b", "c"]), + } + ) tm.assert_frame_equal(df, expected) From ce3f46318d23849fe7d78549b9a08d4351fbedda Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Sun, 11 Oct 2020 23:05:08 +0300 Subject: [PATCH 23/60] Remove commas --- pandas/tests/series/test_categorical.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/series/test_categorical.py b/pandas/tests/series/test_categorical.py index 20ba11a36d046..ed33c44ec26b4 100644 --- a/pandas/tests/series/test_categorical.py +++ b/pandas/tests/series/test_categorical.py @@ -28,7 +28,7 @@ def test_loc_new_category_row_raises(self): df = pd.DataFrame( { "int": [0, 1, 2], - "cat": Categorical(["a", "b", "c"], categories=["a", "b", "c"]), + "cat": Categorical(["a", "b", "c"], categories=["a", "b", "c"]) } ) msg = "Cannot setitem on a Categorical with a new category" @@ -39,7 +39,7 @@ def test_loc_new_row_category_dtype_retention(self): df = pd.DataFrame( { "int": [0, 1, 2], - "cat": pd.Categorical(["a", "b", "c"], categories=["a", "b", "c"]), + "cat": pd.Categorical(["a", "b", "c"], categories=["a", "b", "c"]) } ) df.loc[3] = [3, "c"] @@ -47,7 +47,7 @@ def test_loc_new_row_category_dtype_retention(self): expected = pd.DataFrame( { "int": [0, 1, 2, 3], - "cat": pd.Categorical(["a", "b", "c", "c"], categories=["a", "b", "c"]), + "cat": pd.Categorical(["a", "b", "c", "c"], categories=["a", "b", "c"]) } ) From a825269e7db073e57cf00d5d4285f3eb036bd841 Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Sun, 11 Oct 2020 23:15:01 +0300 Subject: [PATCH 24/60] Update test_categorical.py --- pandas/tests/series/test_categorical.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/series/test_categorical.py b/pandas/tests/series/test_categorical.py index ed33c44ec26b4..d8d6e08ca931d 100644 --- a/pandas/tests/series/test_categorical.py +++ b/pandas/tests/series/test_categorical.py @@ -28,7 +28,7 @@ def test_loc_new_category_row_raises(self): df = pd.DataFrame( { "int": [0, 1, 2], - "cat": Categorical(["a", "b", "c"], categories=["a", "b", "c"]) + "cat": Categorical(["a", "b", "c"], categories=["a", "b", "c"]), } ) msg = "Cannot setitem on a Categorical with a new category" @@ -39,7 +39,7 @@ def test_loc_new_row_category_dtype_retention(self): df = pd.DataFrame( { "int": [0, 1, 2], - "cat": pd.Categorical(["a", "b", "c"], categories=["a", "b", "c"]) + "cat": pd.Categorical(["a", "b", "c"], categories=["a", "b", "c"]), } ) df.loc[3] = [3, "c"] @@ -47,8 +47,8 @@ def test_loc_new_row_category_dtype_retention(self): expected = pd.DataFrame( { "int": [0, 1, 2, 3], - "cat": pd.Categorical(["a", "b", "c", "c"], categories=["a", "b", "c"]) + "cat": pd.Categorical(["a", "b", "c", "c"], categories=["a", "b", "c"]), } ) - + tm.assert_frame_equal(df, expected) From 72726a03296d09a008a81e6401a4b6a3a0729c7e Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Sun, 18 Oct 2020 00:37:08 +0300 Subject: [PATCH 25/60] Update solution --- pandas/core/dtypes/cast.py | 20 +++++++++ pandas/core/dtypes/concat.py | 2 +- pandas/core/indexing.py | 59 +++++++++---------------- pandas/tests/series/test_categorical.py | 18 ++++---- 4 files changed, 51 insertions(+), 48 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index e550309461de4..ad661d6f8d908 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -71,6 +71,7 @@ pandas_dtype, ) from pandas.core.dtypes.dtypes import ( + CategoricalDtype, DatetimeTZDtype, ExtensionDtype, IntervalDtype, @@ -1581,6 +1582,25 @@ def find_common_type(types: List[DtypeObj]) -> DtypeObj: # get unique types (dict.fromkeys is used as order-preserving set()) types = list(dict.fromkeys(types).keys()) + # If set of dtypes contains only categoricals (with the exception of strings) + # then the common dtype will be the categorical (in case it's the only one) + is_cat_or_str = lambda x: is_categorical_dtype(x) or is_string_dtype(x) + if all(is_cat_or_str(t) for t in types) and not any(is_object_dtype(t) for t in types): + # Return union of the categorical dtypes? + cat_dtypes = [] + for t in types: + if is_categorical_dtype(t): + cat_dtypes.append(t) + if len(cat_dtypes) > 0: + dtype_ref = cat_dtypes[0] + cat_dtypes_same = True + for dtype in cat_dtypes: + if not is_dtype_equal(dtype, dtype_ref): + cat_dtypes_same = False + break + if cat_dtypes_same: + return dtype_ref + if any(isinstance(t, ExtensionDtype) for t in types): for t in types: if isinstance(t, ExtensionDtype): diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 60fd959701821..5b12a90135fe8 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -281,7 +281,7 @@ def union_categoricals( def _maybe_unwrap(x): if isinstance(x, (ABCCategoricalIndex, ABCSeries)): return x._values - elif isinstance(x, Categorical): + elif isinstance(x, Categorical) or is_categorical_dtype(x): return x else: raise TypeError("all components to combine must be Categorical") diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 3a21202722372..6701974359cce 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -11,10 +11,8 @@ from pandas.errors import AbstractMethodError, InvalidIndexError from pandas.util._decorators import doc -from pandas.core.dtypes.cast import find_common_type from pandas.core.dtypes.common import ( is_array_like, - is_categorical_dtype, is_hashable, is_integer, is_iterator, @@ -665,14 +663,9 @@ def _ensure_listlike_indexer(self, key, axis=None, value=None): and not com.is_bool_indexer(key) and all(is_hashable(k) for k in key) ): - for i, k in enumerate(key): + for k in key: if k not in self.obj: - if value is None: - self.obj[k] = np.nan - elif is_list_like(value): - self.obj[k] = value[i] - else: - self.obj[k] = value + self.obj[k] = np.nan def __setitem__(self, key, value): if isinstance(key, tuple): @@ -1542,15 +1535,14 @@ def _setitem_with_indexer(self, indexer, value): info_axis = self.obj._info_axis_number # maybe partial set - take_split_path = not self.obj._mgr.is_single_block + take_split_path = len(self.obj._mgr.blocks) > 1 # if there is only one block/type, still have to take split path # unless the block is one-dimensional or it can hold the value if not take_split_path and self.obj._mgr.blocks: - if self.ndim > 1: - # in case of dict, keys are indices + (blk,) = self.obj._mgr.blocks + if 1 < blk.ndim: # in case of dict, keys are indices val = list(value.values()) if isinstance(value, dict) else value - blk = self.obj._mgr.blocks[0] take_split_path = not blk._can_hold_element(val) # if we have any multi-indexes that have non-trivial slices @@ -1584,7 +1576,10 @@ def _setitem_with_indexer(self, indexer, value): # must have all defined axes if we have a scalar # or a list-like on the non-info axes if we have a # list-like - if not len(self.obj): + len_non_info_axes = ( + len(_ax) for _i, _ax in enumerate(self.obj.axes) if _i != i + ) + if any(not l for l in len_non_info_axes): if not is_list_like_indexer(value): raise ValueError( "cannot set a frame with no " @@ -1769,7 +1764,7 @@ def _setitem_with_indexer(self, indexer, value): self._setitem_single_column(loc, value, pi) else: - self._setitem_single_block(indexer, value) + self._setitem_single_block_inplace(indexer, value) def _setitem_single_column(self, loc: int, value, plane_indexer): # positional setting on column loc @@ -1796,9 +1791,10 @@ def _setitem_single_column(self, loc: int, value, plane_indexer): # reset the sliced object if unique self.obj._iset_item(loc, ser) - def _setitem_single_block(self, indexer, value): + def _setitem_single_block_inplace(self, indexer, value): """ - _setitem_with_indexer for the case when we have a single Block. + _setitem_with_indexer for the case when we have a single Block + and the value can be set into it without casting. """ from pandas import Series @@ -1847,13 +1843,7 @@ def _setitem_with_indexer_missing(self, indexer, value): Insert new row(s) or column(s) into the Series or DataFrame. """ from pandas import DataFrame, Series - - def check_valid_categorical(new_values, obj_dtype): - if is_categorical_dtype(obj_dtype): - if (~np.in1d(new_values, obj_dtype.categories.values)).any(): - raise ValueError( - "Cannot setitem on a Categorical with a new category" - ) + from pandas.core.dtypes.cast import find_common_type # reindex the axis to the new value # and set inplace @@ -1878,16 +1868,8 @@ def check_valid_categorical(new_values, obj_dtype): # GH#22717 handle casting compatibility that np.concatenate # does incorrectly new_values = concat_compat([self.obj._values, new_values]) - if is_object_dtype(new_values.dtype): - dtype = None - else: - dtype = find_common_type([self.obj.dtype, new_values.dtype]) - else: - dtype = None - - check_valid_categorical(new_values, self.obj.dtype) self.obj._mgr = self.obj._constructor( - new_values, index=new_index, name=self.obj.name, dtype=dtype + new_values, index=new_index, name=self.obj.name )._mgr self.obj._maybe_update_cacher(clear=True) @@ -1915,13 +1897,14 @@ def check_valid_categorical(new_values, obj_dtype): if len(set(self.obj.dtypes)) > 1: value = list(value) for i in range(len(self.obj.columns)): - value[i] = Series(data=[value[i]], dtype=self.obj.dtypes[i]) - check_valid_categorical(value[i], self.obj.dtypes[i]) - value = dict(zip(self.obj.columns, value)) - value = DataFrame(value) + dtype = find_common_type([self.obj.dtypes[i], type(value[i])]) + value[i] = Series(data=[value[i]], dtype=dtype) + value = DataFrame(dict(zip(self.obj.columns, value))) value.index = [indexer] else: - value = Series(value, index=self.obj.columns, name=indexer) + dtype = find_common_type([self.obj.dtypes[0], type(value)]) + value = Series(value, index=self.obj.columns, name=indexer, dtype=dtype) + self.obj._mgr = self.obj.append(value)._mgr self.obj._maybe_update_cacher(clear=True) diff --git a/pandas/tests/series/test_categorical.py b/pandas/tests/series/test_categorical.py index d8d6e08ca931d..69589cc598e27 100644 --- a/pandas/tests/series/test_categorical.py +++ b/pandas/tests/series/test_categorical.py @@ -6,12 +6,6 @@ class TestCategoricalSeries: - def test_loc_new_category_series_raises(self): - ser = pd.Series(Categorical(["a", "b", "c"])) - msg = "Cannot setitem on a Categorical with a new category" - with pytest.raises(ValueError, match=msg): - ser.loc[3] = "d" - def test_unused_category_retention(self): # Init case exp_cats = Index(["a", "b", "c", "d"]) @@ -31,9 +25,15 @@ def test_loc_new_category_row_raises(self): "cat": Categorical(["a", "b", "c"], categories=["a", "b", "c"]), } ) - msg = "Cannot setitem on a Categorical with a new category" - with pytest.raises(ValueError, match=msg): - df.loc[3] = [3, "d"] + df.loc[3] = [3, "d"] + + expected = pd.DataFrame( + { + "int": [0, 1, 2, 3], + "cat": Categorical(["a", "b", "c", pd.NA], categories=["a", "b", "c"]), + } + ) + tm.assert_frame_equal(df, expected) def test_loc_new_row_category_dtype_retention(self): df = pd.DataFrame( From 51e2032044c67517f4ac96d931c28fa6d943d89b Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Sun, 18 Oct 2020 00:39:44 +0300 Subject: [PATCH 26/60] Fix lint --- pandas/core/dtypes/cast.py | 6 ++++-- pandas/core/indexing.py | 5 +++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 423f140fe5d6a..656ab1c8d31ed 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1569,7 +1569,9 @@ def find_common_type(types: List[DtypeObj]) -> DtypeObj: # If set of dtypes contains only categoricals (with the exception of strings) # then the common dtype will be the categorical (in case it's the only one) is_cat_or_str = lambda x: is_categorical_dtype(x) or is_string_dtype(x) - if all(is_cat_or_str(t) for t in types) and not any(is_object_dtype(t) for t in types): + if all(is_cat_or_str(t) for t in types) and not any( + is_object_dtype(t) for t in types + ): # Return union of the categorical dtypes? cat_dtypes = [] for t in types: @@ -1584,7 +1586,7 @@ def find_common_type(types: List[DtypeObj]) -> DtypeObj: break if cat_dtypes_same: return dtype_ref - + if any(isinstance(t, ExtensionDtype) for t in types): for t in types: if isinstance(t, ExtensionDtype): diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 6701974359cce..0d923c79fe5c5 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1903,8 +1903,9 @@ def _setitem_with_indexer_missing(self, indexer, value): value.index = [indexer] else: dtype = find_common_type([self.obj.dtypes[0], type(value)]) - value = Series(value, index=self.obj.columns, name=indexer, dtype=dtype) - + value = Series( + value, index=self.obj.columns, name=indexer, dtype=dtype + ) self.obj._mgr = self.obj.append(value)._mgr self.obj._maybe_update_cacher(clear=True) From 7d643573dcafbdd36e4cd2be27470b5522cfd474 Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Sun, 18 Oct 2020 00:51:42 +0300 Subject: [PATCH 27/60] Fix format issues --- pandas/core/dtypes/cast.py | 1 - pandas/core/indexing.py | 21 +++++---------------- pandas/tests/series/test_categorical.py | 2 -- 3 files changed, 5 insertions(+), 19 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 656ab1c8d31ed..8e318e2674ff0 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -72,7 +72,6 @@ pandas_dtype, ) from pandas.core.dtypes.dtypes import ( - CategoricalDtype, DatetimeTZDtype, ExtensionDtype, IntervalDtype, diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 0d923c79fe5c5..5e654a44dcec1 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -11,28 +11,17 @@ from pandas.errors import AbstractMethodError, InvalidIndexError from pandas.util._decorators import doc -from pandas.core.dtypes.common import ( - is_array_like, - is_hashable, - is_integer, - is_iterator, - is_list_like, - is_numeric_dtype, - is_object_dtype, - is_scalar, - is_sequence, -) +from pandas.core.dtypes.common import (is_array_like, is_hashable, is_integer, + is_iterator, is_list_like, is_numeric_dtype, + is_object_dtype, is_scalar, is_sequence) from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.generic import ABCDataFrame, ABCMultiIndex, ABCSeries from pandas.core.dtypes.missing import infer_fill_value, isna import pandas.core.common as com from pandas.core.construction import array as pd_array -from pandas.core.indexers import ( - check_array_indexer, - is_list_like_indexer, - length_of_indexer, -) +from pandas.core.indexers import (check_array_indexer, is_list_like_indexer, + length_of_indexer) from pandas.core.indexes.api import Index if TYPE_CHECKING: diff --git a/pandas/tests/series/test_categorical.py b/pandas/tests/series/test_categorical.py index 69589cc598e27..21f71d6a9d6d5 100644 --- a/pandas/tests/series/test_categorical.py +++ b/pandas/tests/series/test_categorical.py @@ -1,5 +1,3 @@ -import pytest - import pandas as pd from pandas import Categorical, Index import pandas._testing as tm From d68f2152988e5ed558f7276eb084efa4fe12af4d Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Sun, 18 Oct 2020 01:01:39 +0300 Subject: [PATCH 28/60] Update indexing.py --- pandas/core/indexing.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 5e654a44dcec1..ac3b82494b5f3 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -11,17 +11,28 @@ from pandas.errors import AbstractMethodError, InvalidIndexError from pandas.util._decorators import doc -from pandas.core.dtypes.common import (is_array_like, is_hashable, is_integer, - is_iterator, is_list_like, is_numeric_dtype, - is_object_dtype, is_scalar, is_sequence) +from pandas.core.dtypes.common import ( + is_array_like, + is_hashable, + is_integer, + is_iterator, + is_list_like, + is_numeric_dtype, + is_object_dtype, + is_scalar, + is_sequence, +) from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.generic import ABCDataFrame, ABCMultiIndex, ABCSeries from pandas.core.dtypes.missing import infer_fill_value, isna import pandas.core.common as com from pandas.core.construction import array as pd_array -from pandas.core.indexers import (check_array_indexer, is_list_like_indexer, - length_of_indexer) +from pandas.core.indexers import ( + check_array_indexer, + is_list_like_indexer, + length_of_indexer, +) from pandas.core.indexes.api import Index if TYPE_CHECKING: @@ -1831,9 +1842,10 @@ def _setitem_with_indexer_missing(self, indexer, value): """ Insert new row(s) or column(s) into the Series or DataFrame. """ - from pandas import DataFrame, Series from pandas.core.dtypes.cast import find_common_type + from pandas import DataFrame, Series + # reindex the axis to the new value # and set inplace if self.ndim == 1: From 5ea8ab1ae33978357bbbc1b5365aa96ee8f894a5 Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Sun, 18 Oct 2020 10:40:20 +0300 Subject: [PATCH 29/60] Update indexing.py --- pandas/core/indexing.py | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index ac3b82494b5f3..9554dc7eef56f 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -663,9 +663,14 @@ def _ensure_listlike_indexer(self, key, axis=None, value=None): and not com.is_bool_indexer(key) and all(is_hashable(k) for k in key) ): - for k in key: + for i, k in enumerate(key): if k not in self.obj: - self.obj[k] = np.nan + if value is None: + self.obj[k] = np.nan + elif is_list_like(value): + self.obj[k] = value[i] + else: + self.obj[k] = value def __setitem__(self, key, value): if isinstance(key, tuple): @@ -1535,14 +1540,15 @@ def _setitem_with_indexer(self, indexer, value): info_axis = self.obj._info_axis_number # maybe partial set - take_split_path = len(self.obj._mgr.blocks) > 1 + take_split_path = not self.obj._mgr.is_single_block # if there is only one block/type, still have to take split path # unless the block is one-dimensional or it can hold the value if not take_split_path and self.obj._mgr.blocks: - (blk,) = self.obj._mgr.blocks - if 1 < blk.ndim: # in case of dict, keys are indices + if self.ndim > 1: + # in case of dict, keys are indices val = list(value.values()) if isinstance(value, dict) else value + blk = self.obj._mgr.blocks[0] take_split_path = not blk._can_hold_element(val) # if we have any multi-indexes that have non-trivial slices @@ -1576,10 +1582,7 @@ def _setitem_with_indexer(self, indexer, value): # must have all defined axes if we have a scalar # or a list-like on the non-info axes if we have a # list-like - len_non_info_axes = ( - len(_ax) for _i, _ax in enumerate(self.obj.axes) if _i != i - ) - if any(not l for l in len_non_info_axes): + if not len(self.obj): if not is_list_like_indexer(value): raise ValueError( "cannot set a frame with no " @@ -1764,7 +1767,7 @@ def _setitem_with_indexer(self, indexer, value): self._setitem_single_column(loc, value, pi) else: - self._setitem_single_block_inplace(indexer, value) + self._setitem_single_block(indexer, value) def _setitem_single_column(self, loc: int, value, plane_indexer): # positional setting on column loc @@ -1791,10 +1794,9 @@ def _setitem_single_column(self, loc: int, value, plane_indexer): # reset the sliced object if unique self.obj._iset_item(loc, ser) - def _setitem_single_block_inplace(self, indexer, value): + def _setitem_single_block(self, indexer, value): """ - _setitem_with_indexer for the case when we have a single Block - and the value can be set into it without casting. + _setitem_with_indexer for the case when we have a single Block. """ from pandas import Series @@ -2381,4 +2383,4 @@ def maybe_numeric_slice(df, slice_, include_bool: bool = False): if include_bool: dtypes.append(bool) slice_ = IndexSlice[:, df.select_dtypes(include=dtypes).columns] - return slice_ + return slice_ \ No newline at end of file From b08efc127c92c07b935fc4ab9fa2b39afe9f8eda Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Sun, 18 Oct 2020 10:43:09 +0300 Subject: [PATCH 30/60] Update indexing.py --- pandas/core/indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 9554dc7eef56f..490f0e2fbd307 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -2383,4 +2383,4 @@ def maybe_numeric_slice(df, slice_, include_bool: bool = False): if include_bool: dtypes.append(bool) slice_ = IndexSlice[:, df.select_dtypes(include=dtypes).columns] - return slice_ \ No newline at end of file + return slice_ From 69f4e626fb1d58ab48d42ec9ec7c6f87aa9698b7 Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Sun, 18 Oct 2020 10:44:36 +0300 Subject: [PATCH 31/60] Update test_categorical.py --- pandas/tests/series/test_categorical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/series/test_categorical.py b/pandas/tests/series/test_categorical.py index 21f71d6a9d6d5..469682c1b57ac 100644 --- a/pandas/tests/series/test_categorical.py +++ b/pandas/tests/series/test_categorical.py @@ -16,7 +16,7 @@ def test_unused_category_retention(self): tm.assert_index_equal(ser.cat.categories, exp_cats) tm.assert_series_equal(ser, expected) - def test_loc_new_category_row_raises(self): + def test_loc_new_category_nan_value(self): df = pd.DataFrame( { "int": [0, 1, 2], From 4c33040e9e9c2c8d721fc24b036a1c99b8f29bfe Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Sun, 18 Oct 2020 20:20:06 +0300 Subject: [PATCH 32/60] Update concat.py --- pandas/core/dtypes/concat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 5b12a90135fe8..60fd959701821 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -281,7 +281,7 @@ def union_categoricals( def _maybe_unwrap(x): if isinstance(x, (ABCCategoricalIndex, ABCSeries)): return x._values - elif isinstance(x, Categorical) or is_categorical_dtype(x): + elif isinstance(x, Categorical): return x else: raise TypeError("all components to combine must be Categorical") From e9367366c7af0980d83c7248f2b77c159531ec6b Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Sun, 18 Oct 2020 20:22:26 +0300 Subject: [PATCH 33/60] Update cast.py --- pandas/core/dtypes/cast.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 8e318e2674ff0..5e7712281f748 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1578,12 +1578,12 @@ def find_common_type(types: List[DtypeObj]) -> DtypeObj: cat_dtypes.append(t) if len(cat_dtypes) > 0: dtype_ref = cat_dtypes[0] - cat_dtypes_same = True + cat_dtypes_equal = True for dtype in cat_dtypes: if not is_dtype_equal(dtype, dtype_ref): - cat_dtypes_same = False + cat_dtypes_equal = False break - if cat_dtypes_same: + if cat_dtypes_equal: return dtype_ref if any(isinstance(t, ExtensionDtype) for t in types): From 8031f8fdd143fdcf1bc2ab32ca9c3ce08de4079a Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Sun, 18 Oct 2020 20:55:10 +0300 Subject: [PATCH 34/60] Update cast.py --- pandas/core/dtypes/cast.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 5e7712281f748..a9fbadcd29de3 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1578,12 +1578,7 @@ def find_common_type(types: List[DtypeObj]) -> DtypeObj: cat_dtypes.append(t) if len(cat_dtypes) > 0: dtype_ref = cat_dtypes[0] - cat_dtypes_equal = True - for dtype in cat_dtypes: - if not is_dtype_equal(dtype, dtype_ref): - cat_dtypes_equal = False - break - if cat_dtypes_equal: + if all(is_dtype_equal(dtype, dtype_ref) for dtype in cat_dtypes): return dtype_ref if any(isinstance(t, ExtensionDtype) for t in types): From c08c6c0ff7070f93a3074d8e6078d07b34e6c1d4 Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Wed, 4 Nov 2020 20:34:34 +0200 Subject: [PATCH 35/60] Update test_categorical.py --- pandas/tests/series/test_categorical.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/series/test_categorical.py b/pandas/tests/series/test_categorical.py index 469682c1b57ac..a0ec9a1f8be98 100644 --- a/pandas/tests/series/test_categorical.py +++ b/pandas/tests/series/test_categorical.py @@ -37,7 +37,7 @@ def test_loc_new_row_category_dtype_retention(self): df = pd.DataFrame( { "int": [0, 1, 2], - "cat": pd.Categorical(["a", "b", "c"], categories=["a", "b", "c"]), + "cat": Categorical(["a", "b", "c"], categories=["a", "b", "c"]), } ) df.loc[3] = [3, "c"] @@ -45,7 +45,7 @@ def test_loc_new_row_category_dtype_retention(self): expected = pd.DataFrame( { "int": [0, 1, 2, 3], - "cat": pd.Categorical(["a", "b", "c", "c"], categories=["a", "b", "c"]), + "cat": Categorical(["a", "b", "c", "c"], categories=["a", "b", "c"]), } ) From c862d991035609103c6e7b95ab822ae8071c5df2 Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Sat, 28 Nov 2020 21:47:04 +0200 Subject: [PATCH 36/60] Revert previous approach and include concat changes --- pandas/core/dtypes/cast.py | 6 ++---- pandas/core/dtypes/concat.py | 13 +++++++++++-- pandas/core/indexing.py | 17 ++--------------- 3 files changed, 15 insertions(+), 21 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 13ef014eaa47d..96a590b1924f2 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1603,10 +1603,8 @@ def find_common_type(types: List[DtypeObj]) -> DtypeObj: # If set of dtypes contains only categoricals (with the exception of strings) # then the common dtype will be the categorical (in case it's the only one) is_cat_or_str = lambda x: is_categorical_dtype(x) or is_string_dtype(x) - if all(is_cat_or_str(t) for t in types) and not any( - is_object_dtype(t) for t in types - ): - # Return union of the categorical dtypes? + if all(is_cat_or_str(t) for t in types): + # Should we extend this to use the union of categorical dtypes? cat_dtypes = [] for t in types: if is_categorical_dtype(t): diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 63e3440558c75..9ab87c28a911a 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -7,7 +7,11 @@ from pandas._typing import ArrayLike, DtypeObj -from pandas.core.dtypes.cast import find_common_type +from pandas.core.dtypes.cast import ( + convert_dtypes, + find_common_type, + maybe_downcast_to_dtype, +) from pandas.core.dtypes.common import ( is_categorical_dtype, is_dtype_equal, @@ -142,7 +146,12 @@ def is_nonempty(x) -> bool: # we ignore axis here, as internally concatting with EAs is always # for axis=0 if not single_dtype: - target_dtype = find_common_type([x.dtype for x in to_concat]) + conv_types = [convert_dtypes(x) for x in to_concat] + for i in range(len(to_concat)): + if conv_types[i] == "string": + conv_types[i] = np.dtype(str) + to_concat[i] = maybe_downcast_to_dtype(to_concat[i], conv_types[i]) + target_dtype = find_common_type(conv_types) to_concat = [_cast_to_common_type(arr, target_dtype) for arr in to_concat] if isinstance(to_concat[0], ExtensionArray): diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 4e11f2becb8b2..6aa031af64833 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1860,9 +1860,7 @@ def _setitem_with_indexer_missing(self, indexer, value): """ Insert new row(s) or column(s) into the Series or DataFrame. """ - from pandas.core.dtypes.cast import find_common_type - - from pandas import DataFrame, Series + from pandas import Series # reindex the axis to the new value # and set inplace @@ -1914,18 +1912,7 @@ def _setitem_with_indexer_missing(self, indexer, value): if len(value) != len(self.obj.columns): raise ValueError("cannot set a row with mismatched columns") - if len(set(self.obj.dtypes)) > 1: - value = list(value) - for i in range(len(self.obj.columns)): - dtype = find_common_type([self.obj.dtypes[i], type(value[i])]) - value[i] = Series(data=[value[i]], dtype=dtype) - value = DataFrame(dict(zip(self.obj.columns, value))) - value.index = [indexer] - else: - dtype = find_common_type([self.obj.dtypes[0], type(value)]) - value = Series( - value, index=self.obj.columns, name=indexer, dtype=dtype - ) + value = Series(value, index=self.obj.columns, name=indexer) self.obj._mgr = self.obj.append(value)._mgr self.obj._maybe_update_cacher(clear=True) From 5baa314c0961d17cacfea5f6c0583be8ade96bee Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Sat, 28 Nov 2020 22:05:57 +0200 Subject: [PATCH 37/60] Remove non-required convertion --- pandas/core/dtypes/concat.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 9ab87c28a911a..8e4b1020a2555 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -147,10 +147,6 @@ def is_nonempty(x) -> bool: # for axis=0 if not single_dtype: conv_types = [convert_dtypes(x) for x in to_concat] - for i in range(len(to_concat)): - if conv_types[i] == "string": - conv_types[i] = np.dtype(str) - to_concat[i] = maybe_downcast_to_dtype(to_concat[i], conv_types[i]) target_dtype = find_common_type(conv_types) to_concat = [_cast_to_common_type(arr, target_dtype) for arr in to_concat] From cb5d8e49a2eebda89f7bf00223ac82065b52867d Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Sat, 28 Nov 2020 22:12:56 +0200 Subject: [PATCH 38/60] Update concat.py --- pandas/core/dtypes/concat.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 8e4b1020a2555..266d9bf72754d 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -9,8 +9,7 @@ from pandas.core.dtypes.cast import ( convert_dtypes, - find_common_type, - maybe_downcast_to_dtype, + find_common_type ) from pandas.core.dtypes.common import ( is_categorical_dtype, From 7d7da20908ccebc93fb6b236d1002623766a2e70 Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Sat, 28 Nov 2020 22:18:58 +0200 Subject: [PATCH 39/60] Update concat.py --- pandas/core/dtypes/concat.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 266d9bf72754d..e17aa72de6285 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -7,10 +7,7 @@ from pandas._typing import ArrayLike, DtypeObj -from pandas.core.dtypes.cast import ( - convert_dtypes, - find_common_type -) +from pandas.core.dtypes.cast import convert_dtypes, find_common_type from pandas.core.dtypes.common import ( is_categorical_dtype, is_dtype_equal, From ecad50f07cd533ff06ef7a8fd7093e3a1e2cb127 Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Sun, 6 Dec 2020 17:15:38 +0200 Subject: [PATCH 40/60] Update cast.py --- pandas/core/dtypes/cast.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 96a590b1924f2..edd0e9d4cb75f 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1602,8 +1602,10 @@ def find_common_type(types: List[DtypeObj]) -> DtypeObj: # If set of dtypes contains only categoricals (with the exception of strings) # then the common dtype will be the categorical (in case it's the only one) - is_cat_or_str = lambda x: is_categorical_dtype(x) or is_string_dtype(x) - if all(is_cat_or_str(t) for t in types): + is_cat_or_str = lambda x: is_categorical_dtype(x) | is_string_dtype(x) + if all(is_cat_or_str(t) for t in types) and not any( + is_object_dtype(t) for t in types + ): # Should we extend this to use the union of categorical dtypes? cat_dtypes = [] for t in types: From af5e1414a9a48b5431c557e56a02e2514565e93d Mon Sep 17 00:00:00 2001 From: chrispe Date: Mon, 15 Feb 2021 11:07:53 +0200 Subject: [PATCH 41/60] Add new version with raise --- pandas/core/dtypes/cast.py | 16 ---- pandas/core/dtypes/concat.py | 79 +++++++++++++++- pandas/tests/series/test_categorical.py | 116 ++++++++++++++++++------ 3 files changed, 162 insertions(+), 49 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 51291f6401c98..e27c519304e2e 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1609,22 +1609,6 @@ def find_common_type(types: List[DtypeObj]) -> DtypeObj: # get unique types (dict.fromkeys is used as order-preserving set()) types = list(dict.fromkeys(types).keys()) - # If set of dtypes contains only categoricals (with the exception of strings) - # then the common dtype will be the categorical (in case it's the only one) - is_cat_or_str = lambda x: is_categorical_dtype(x) | is_string_dtype(x) - if all(is_cat_or_str(t) for t in types) and not any( - is_object_dtype(t) for t in types - ): - # Should we extend this to use the union of categorical dtypes? - cat_dtypes = [] - for t in types: - if is_categorical_dtype(t): - cat_dtypes.append(t) - if len(cat_dtypes) > 0: - dtype_ref = cat_dtypes[0] - if all(is_dtype_equal(dtype, dtype_ref) for dtype in cat_dtypes): - return dtype_ref - if any(isinstance(t, ExtensionDtype) for t in types): for t in types: if isinstance(t, ExtensionDtype): diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 70ad212ccd980..7bec779f6607c 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -7,7 +7,7 @@ from pandas._typing import ArrayLike, DtypeObj -from pandas.core.dtypes.cast import convert_dtypes, find_common_type +from pandas.core.dtypes.cast import find_common_type from pandas.core.dtypes.common import ( is_categorical_dtype, is_dtype_equal, @@ -15,6 +15,7 @@ is_sparse, ) from pandas.core.dtypes.generic import ABCCategoricalIndex, ABCSeries +from pandas.core.dtypes.missing import isna from pandas.core.arrays import ExtensionArray from pandas.core.arrays.sparse import SparseArray @@ -61,6 +62,69 @@ def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike: return arr.astype(dtype, copy=False) +def _can_cast_to_categorical(to_cast): + """ + Evaluates if a list of arrays can be casted to a single categorical dtype. + The categorical dtype to cast to, is determined by any of the arrays which + is already of categorical dtype. If no such array exists, or if the existing + categorical dtype does not contain any of the unique values of the other arrays, + then it will return False. + + Parameters + ---------- + to_cast : array of arrays + + Returns + ------- + True if possible to cast to a single categorical dtype, False otherwise. + """ + if len(to_cast) == 0: + raise ValueError("No arrays to cast") + + types = [x.dtype for x in to_cast] + + # If any of the arrays is of categorical dtype, then we will use it as a reference. + # If no such array exists, then we just return. + if any(is_categorical_dtype(t) for t in types): + cat_dtypes = [] + for t in types: + if ( + is_categorical_dtype(t) + and len(t.categories.values) > 0 + and any(isna(t.categories.values) == False) + ): + categorical_values_dtype = t.categories.values.dtype + if all( + is_categorical_dtype(x) or np.can_cast(categorical_values_dtype, x) + for x in types + ): + cat_dtypes.append(t) + if len(cat_dtypes) == 0 or any( + not is_dtype_equal(dtype, cat_dtypes[0]) for dtype in cat_dtypes[1:] + ): + return False + else: + return False + + def categorical_contains_values(categorical_dtype, x): + unique_values = np.unique(x[~isna(x)]) + if any( + val not in categorical_dtype.categories for val in unique_values.tolist() + ): + return False + return True + + if not all( + categorical_contains_values(to_cast[0].dtype, other) or len(other) == 0 + for other in to_cast[1:] + ): + raise ValueError( + "Cannot concat on a Categorical with a new category, set the categories first" + ) + + return True + + def concat_compat(to_concat, axis: int = 0, ea_compat_axis: bool = False): """ provide concatenation of an array of arrays each of which is a single @@ -108,8 +172,17 @@ def is_nonempty(x) -> bool: # we ignore axis here, as internally concatting with EAs is always # for axis=0 if not single_dtype: - conv_types = [convert_dtypes(x) for x in to_concat] - target_dtype = find_common_type(conv_types) + # Special case for handling concat with categorical series. + # We need to make sure that categorical dtype is preserved + # when an array of valid values is given (GH#25383) + if ( + isinstance(to_concat[0], ExtensionArray) + and all(x.shape[0] == 1 for x in to_concat[1:]) + and _can_cast_to_categorical(to_concat) + ): + target_dtype = to_concat[0].dtype + else: + target_dtype = find_common_type([x.dtype for x in to_concat]) to_concat = [_cast_to_common_type(arr, target_dtype) for arr in to_concat] if isinstance(to_concat[0], ExtensionArray): diff --git a/pandas/tests/series/test_categorical.py b/pandas/tests/series/test_categorical.py index a0ec9a1f8be98..058da1bcc16a2 100644 --- a/pandas/tests/series/test_categorical.py +++ b/pandas/tests/series/test_categorical.py @@ -1,39 +1,27 @@ +import pytest + +from pandas.core.dtypes.concat import _can_cast_to_categorical + import pandas as pd -from pandas import Categorical, Index +from pandas import Categorical import pandas._testing as tm class TestCategoricalSeries: - def test_unused_category_retention(self): - # Init case - exp_cats = Index(["a", "b", "c", "d"]) - ser = pd.Series(Categorical(["a", "b", "c"], categories=exp_cats)) - tm.assert_index_equal(ser.cat.categories, exp_cats) - - # Modify case - ser.loc[0] = "b" - expected = pd.Series(Categorical(["b", "b", "c"], categories=exp_cats)) - tm.assert_index_equal(ser.cat.categories, exp_cats) - tm.assert_series_equal(ser, expected) - - def test_loc_new_category_nan_value(self): - df = pd.DataFrame( - { - "int": [0, 1, 2], - "cat": Categorical(["a", "b", "c"], categories=["a", "b", "c"]), - } - ) - df.loc[3] = [3, "d"] + def test_setitem_undefined_category_raises(self): + ser = pd.Series(Categorical(["a", "b", "c"])) + msg = r"Cannot setitem on a Categorical with a new category, set the categories first" + with pytest.raises(ValueError, match=msg): + ser.loc[2] = "d" - expected = pd.DataFrame( - { - "int": [0, 1, 2, 3], - "cat": Categorical(["a", "b", "c", pd.NA], categories=["a", "b", "c"]), - } - ) - tm.assert_frame_equal(df, expected) + def test_concat_undefined_category_raises(self): + ser = pd.Series(Categorical(["a", "b", "c"])) + msg = r"Cannot concat on a Categorical with a new category, set the categories first" + with pytest.raises(ValueError, match=msg): + ser.loc[3] = "d" - def test_loc_new_row_category_dtype_retention(self): + def test_loc_category_dtype_retention(self): + # Case 1 df = pd.DataFrame( { "int": [0, 1, 2], @@ -41,7 +29,6 @@ def test_loc_new_row_category_dtype_retention(self): } ) df.loc[3] = [3, "c"] - expected = pd.DataFrame( { "int": [0, 1, 2, 3], @@ -50,3 +37,72 @@ def test_loc_new_row_category_dtype_retention(self): ) tm.assert_frame_equal(df, expected) + + # Case 2 + ser = pd.Series(Categorical(["a", "b", "c"])) + ser.loc[3] = "c" + expected = pd.Series(Categorical(["a", "b", "c", "c"])) + tm.assert_series_equal(ser, expected) + + # Case 3 + ser = pd.Series(Categorical([1, 2, 3])) + ser.loc[3] = 3 + expected = pd.Series(Categorical([1, 2, 3, 3])) + tm.assert_series_equal(ser, expected) + + # Case 4 + ser = pd.Series(Categorical([1, 2, 3])) + ser.loc[3] = pd.NA + expected = pd.Series(Categorical([1, 2, 3, pd.NA])) + tm.assert_series_equal(ser, expected) + + def test_can_cast_to_categorical(self): + # Case 1: + # Series of identical categorical dtype should + # be able to concat to categorical + ser1 = pd.Series(Categorical(["a", "b", "c"])) + ser2 = pd.Series(Categorical(["a", "b", "c"])) + arr = [ser1, ser2] + assert _can_cast_to_categorical(arr) == True + + # Case 2: + # Series of non-identical categorical dtype should + # not be able to concat to categoorical + ser1 = pd.Series(Categorical(["a", "b", "c"])) + ser2 = pd.Series(Categorical(["a", "b", "d"])) + arr = [ser1, ser2] + assert _can_cast_to_categorical(arr) == False + + # Concat of a categorical series with a series + # containing only values identical to the + # categorical values should be possible + + # Case 3: For string categorical values + ser1 = pd.Series(Categorical(["a", "b", "c"])) + ser2 = pd.Series(["a", "a", "b"]) + arr = [ser1, ser2] + assert _can_cast_to_categorical(arr) == True + + # Case 4: For int categorical values + ser1 = pd.Series(Categorical([1, 2, 3])) + ser2 = pd.Series([1, 2]) + arr = [ser1, ser2] + assert _can_cast_to_categorical(arr) == True + + # The rest should raise because not all values + # are present in the categorical. + + # Case 5 + ser1 = pd.Series(Categorical([1, 2, 3])) + ser2 = pd.Series([3, 4]) + arr = [ser1, ser2] + msg = r"Cannot concat on a Categorical with a new category, set the categories first" + with pytest.raises(ValueError, match=msg): + _can_cast_to_categorical(arr) + + # Case 6 + ser1 = pd.Series(Categorical(["a", "b", "c"])) + ser2 = pd.Series(["d", "e"]) + arr = [ser1, ser2] + with pytest.raises(ValueError, match=msg): + _can_cast_to_categorical(arr) From 6d45570b6fbbb1d492f3d8461e6f704eaaf1e168 Mon Sep 17 00:00:00 2001 From: chrispe Date: Mon, 15 Feb 2021 11:24:31 +0200 Subject: [PATCH 42/60] Add format fixes --- pandas/core/dtypes/concat.py | 5 +++-- pandas/tests/series/test_categorical.py | 18 ++++++++++-------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 7bec779f6607c..e4d61be8c895c 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -91,7 +91,7 @@ def _can_cast_to_categorical(to_cast): if ( is_categorical_dtype(t) and len(t.categories.values) > 0 - and any(isna(t.categories.values) == False) + and any(~isna(t.categories.values)) ): categorical_values_dtype = t.categories.values.dtype if all( @@ -119,7 +119,8 @@ def categorical_contains_values(categorical_dtype, x): for other in to_cast[1:] ): raise ValueError( - "Cannot concat on a Categorical with a new category, set the categories first" + "Cannot concat on a Categorical with a new category, " + "set the categories first" ) return True diff --git a/pandas/tests/series/test_categorical.py b/pandas/tests/series/test_categorical.py index 058da1bcc16a2..8f35290fdf396 100644 --- a/pandas/tests/series/test_categorical.py +++ b/pandas/tests/series/test_categorical.py @@ -10,13 +10,15 @@ class TestCategoricalSeries: def test_setitem_undefined_category_raises(self): ser = pd.Series(Categorical(["a", "b", "c"])) - msg = r"Cannot setitem on a Categorical with a new category, set the categories first" + msg = "Cannot setitem on a Categorical with a new category, "\ + "set the categories first" with pytest.raises(ValueError, match=msg): ser.loc[2] = "d" def test_concat_undefined_category_raises(self): ser = pd.Series(Categorical(["a", "b", "c"])) - msg = r"Cannot concat on a Categorical with a new category, set the categories first" + msg = "Cannot concat on a Categorical with a new category, "\ + "set the categories first" with pytest.raises(ValueError, match=msg): ser.loc[3] = "d" @@ -35,7 +37,6 @@ def test_loc_category_dtype_retention(self): "cat": Categorical(["a", "b", "c", "c"], categories=["a", "b", "c"]), } ) - tm.assert_frame_equal(df, expected) # Case 2 @@ -63,7 +64,7 @@ def test_can_cast_to_categorical(self): ser1 = pd.Series(Categorical(["a", "b", "c"])) ser2 = pd.Series(Categorical(["a", "b", "c"])) arr = [ser1, ser2] - assert _can_cast_to_categorical(arr) == True + assert _can_cast_to_categorical(arr) is True # Case 2: # Series of non-identical categorical dtype should @@ -71,7 +72,7 @@ def test_can_cast_to_categorical(self): ser1 = pd.Series(Categorical(["a", "b", "c"])) ser2 = pd.Series(Categorical(["a", "b", "d"])) arr = [ser1, ser2] - assert _can_cast_to_categorical(arr) == False + assert _can_cast_to_categorical(arr) is False # Concat of a categorical series with a series # containing only values identical to the @@ -81,13 +82,13 @@ def test_can_cast_to_categorical(self): ser1 = pd.Series(Categorical(["a", "b", "c"])) ser2 = pd.Series(["a", "a", "b"]) arr = [ser1, ser2] - assert _can_cast_to_categorical(arr) == True + assert _can_cast_to_categorical(arr) is True # Case 4: For int categorical values ser1 = pd.Series(Categorical([1, 2, 3])) ser2 = pd.Series([1, 2]) arr = [ser1, ser2] - assert _can_cast_to_categorical(arr) == True + assert _can_cast_to_categorical(arr) is True # The rest should raise because not all values # are present in the categorical. @@ -96,7 +97,8 @@ def test_can_cast_to_categorical(self): ser1 = pd.Series(Categorical([1, 2, 3])) ser2 = pd.Series([3, 4]) arr = [ser1, ser2] - msg = r"Cannot concat on a Categorical with a new category, set the categories first" + msg = "Cannot concat on a Categorical with a new category, "\ + "set the categories first" with pytest.raises(ValueError, match=msg): _can_cast_to_categorical(arr) From 31612ed4bde8302a36f41e305a2abcc8adc4bc41 Mon Sep 17 00:00:00 2001 From: chrispe Date: Mon, 15 Feb 2021 11:31:04 +0200 Subject: [PATCH 43/60] Update test_categorical.py --- pandas/tests/series/test_categorical.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/pandas/tests/series/test_categorical.py b/pandas/tests/series/test_categorical.py index 8f35290fdf396..44aa318c1d3d6 100644 --- a/pandas/tests/series/test_categorical.py +++ b/pandas/tests/series/test_categorical.py @@ -10,15 +10,19 @@ class TestCategoricalSeries: def test_setitem_undefined_category_raises(self): ser = pd.Series(Categorical(["a", "b", "c"])) - msg = "Cannot setitem on a Categorical with a new category, "\ - "set the categories first" + msg = ( + "Cannot setitem on a Categorical with a new category, " + "set the categories first" + ) with pytest.raises(ValueError, match=msg): ser.loc[2] = "d" def test_concat_undefined_category_raises(self): ser = pd.Series(Categorical(["a", "b", "c"])) - msg = "Cannot concat on a Categorical with a new category, "\ - "set the categories first" + msg = ( + "Cannot concat on a Categorical with a new category, " + "set the categories first" + ) with pytest.raises(ValueError, match=msg): ser.loc[3] = "d" @@ -97,8 +101,10 @@ def test_can_cast_to_categorical(self): ser1 = pd.Series(Categorical([1, 2, 3])) ser2 = pd.Series([3, 4]) arr = [ser1, ser2] - msg = "Cannot concat on a Categorical with a new category, "\ - "set the categories first" + msg = ( + "Cannot concat on a Categorical with a new category, " + "set the categories first" + ) with pytest.raises(ValueError, match=msg): _can_cast_to_categorical(arr) From 6d9e6670874ca503389df9308fe59fd55affe5ef Mon Sep 17 00:00:00 2001 From: chrispe Date: Sat, 20 Feb 2021 13:58:49 +0200 Subject: [PATCH 44/60] Update --- pandas/core/dtypes/cast.py | 26 +++++++- pandas/core/dtypes/concat.py | 87 +++++-------------------- pandas/tests/series/test_categorical.py | 58 +---------------- 3 files changed, 41 insertions(+), 130 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 669bfe08d42b0..deb9dec8fbb6a 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1639,13 +1639,14 @@ def sanitize_to_nanoseconds(values: np.ndarray) -> np.ndarray: return values -def find_common_type(types: List[DtypeObj]) -> DtypeObj: +def find_common_type(types: List[DtypeObj], prio_cat_dtype: bool = False) -> DtypeObj: """ Find a common data type among the given dtypes. Parameters ---------- types : list of dtypes + prio_cat_dtype: set priority towards finding a categorical dtype Returns ------- @@ -1661,6 +1662,29 @@ def find_common_type(types: List[DtypeObj]) -> DtypeObj: first = types[0] + # categorical dtypes should not be casted to a new dtype + # if priority is set accodring to prio_cat_dtype + if prio_cat_dtype: + if any(is_categorical_dtype(t) for t in types): + cat_dtypes = [] + for t in types: + if ( + is_categorical_dtype(t) + and len(t.categories.values) > 0 + and any(~isna(t.categories.values)) + ): + categorical_values_dtype = t.categories.values.dtype + if all( + is_categorical_dtype(x) + or np.can_cast(categorical_values_dtype, x) + for x in types + ): + cat_dtypes.append(t) + if len(cat_dtypes) > 0: + dtype_ref = cat_dtypes[0] + if all(is_dtype_equal(dtype, dtype_ref) for dtype in cat_dtypes[1:]): + return dtype_ref + # workaround for find_common_type([np.dtype('datetime64[ns]')] * 2) # => object if all(is_dtype_equal(first, t) for t in types[1:]): diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index b8aa163fc027f..644fa4b75ca87 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -21,6 +21,7 @@ ABCCategoricalIndex, ABCSeries, ) +from pandas.core.dtypes.missing import isna from pandas.core.arrays import ExtensionArray from pandas.core.arrays.sparse import SparseArray @@ -35,6 +36,14 @@ def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike: Helper function for `arr.astype(common_dtype)` but handling all special cases. """ + if is_categorical_dtype(dtype): + unique_values = np.unique(arr[~isna(arr)]) + if any(val not in dtype.categories for val in unique_values.tolist()): + raise ValueError( + "Cannot setitem on a Categorical with a new category, " + "set the categories first" + ) + if ( is_categorical_dtype(arr.dtype) and isinstance(dtype, np.dtype) @@ -70,70 +79,6 @@ def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike: return arr.astype(dtype, copy=False) -def _can_cast_to_categorical(to_cast): - """ - Evaluates if a list of arrays can be casted to a single categorical dtype. - The categorical dtype to cast to, is determined by any of the arrays which - is already of categorical dtype. If no such array exists, or if the existing - categorical dtype does not contain any of the unique values of the other arrays, - then it will return False. - - Parameters - ---------- - to_cast : array of arrays - - Returns - ------- - True if possible to cast to a single categorical dtype, False otherwise. - """ - if len(to_cast) == 0: - raise ValueError("No arrays to cast") - - types = [x.dtype for x in to_cast] - - # If any of the arrays is of categorical dtype, then we will use it as a reference. - # If no such array exists, then we just return. - if any(is_categorical_dtype(t) for t in types): - cat_dtypes = [] - for t in types: - if ( - is_categorical_dtype(t) - and len(t.categories.values) > 0 - and any(~isna(t.categories.values)) - ): - categorical_values_dtype = t.categories.values.dtype - if all( - is_categorical_dtype(x) or np.can_cast(categorical_values_dtype, x) - for x in types - ): - cat_dtypes.append(t) - if len(cat_dtypes) == 0 or any( - not is_dtype_equal(dtype, cat_dtypes[0]) for dtype in cat_dtypes[1:] - ): - return False - else: - return False - - def categorical_contains_values(categorical_dtype, x): - unique_values = np.unique(x[~isna(x)]) - if any( - val not in categorical_dtype.categories for val in unique_values.tolist() - ): - return False - return True - - if not all( - categorical_contains_values(to_cast[0].dtype, other) or len(other) == 0 - for other in to_cast[1:] - ): - raise ValueError( - "Cannot concat on a Categorical with a new category, " - "set the categories first" - ) - - return True - - def concat_compat(to_concat, axis: int = 0, ea_compat_axis: bool = False): """ provide concatenation of an array of arrays each of which is a single @@ -184,14 +129,12 @@ def is_nonempty(x) -> bool: # Special case for handling concat with categorical series. # We need to make sure that categorical dtype is preserved # when an array of valid values is given (GH#25383) - if ( - isinstance(to_concat[0], ExtensionArray) - and all(x.shape[0] == 1 for x in to_concat[1:]) - and _can_cast_to_categorical(to_concat) - ): - target_dtype = to_concat[0].dtype - else: - target_dtype = find_common_type([x.dtype for x in to_concat]) + use_index_expansion = len(to_concat) == 2 and all( + x.shape[0] == 1 for x in to_concat[1:] + ) + target_dtype = find_common_type( + [x.dtype for x in to_concat], prio_cat_dtype=use_index_expansion + ) to_concat = [_cast_to_common_type(arr, target_dtype) for arr in to_concat] if isinstance(to_concat[0], ExtensionArray): diff --git a/pandas/tests/series/test_categorical.py b/pandas/tests/series/test_categorical.py index 44aa318c1d3d6..ab5afeaef7381 100644 --- a/pandas/tests/series/test_categorical.py +++ b/pandas/tests/series/test_categorical.py @@ -1,7 +1,5 @@ import pytest -from pandas.core.dtypes.concat import _can_cast_to_categorical - import pandas as pd from pandas import Categorical import pandas._testing as tm @@ -20,7 +18,7 @@ def test_setitem_undefined_category_raises(self): def test_concat_undefined_category_raises(self): ser = pd.Series(Categorical(["a", "b", "c"])) msg = ( - "Cannot concat on a Categorical with a new category, " + "Cannot setitem on a Categorical with a new category, " "set the categories first" ) with pytest.raises(ValueError, match=msg): @@ -60,57 +58,3 @@ def test_loc_category_dtype_retention(self): ser.loc[3] = pd.NA expected = pd.Series(Categorical([1, 2, 3, pd.NA])) tm.assert_series_equal(ser, expected) - - def test_can_cast_to_categorical(self): - # Case 1: - # Series of identical categorical dtype should - # be able to concat to categorical - ser1 = pd.Series(Categorical(["a", "b", "c"])) - ser2 = pd.Series(Categorical(["a", "b", "c"])) - arr = [ser1, ser2] - assert _can_cast_to_categorical(arr) is True - - # Case 2: - # Series of non-identical categorical dtype should - # not be able to concat to categoorical - ser1 = pd.Series(Categorical(["a", "b", "c"])) - ser2 = pd.Series(Categorical(["a", "b", "d"])) - arr = [ser1, ser2] - assert _can_cast_to_categorical(arr) is False - - # Concat of a categorical series with a series - # containing only values identical to the - # categorical values should be possible - - # Case 3: For string categorical values - ser1 = pd.Series(Categorical(["a", "b", "c"])) - ser2 = pd.Series(["a", "a", "b"]) - arr = [ser1, ser2] - assert _can_cast_to_categorical(arr) is True - - # Case 4: For int categorical values - ser1 = pd.Series(Categorical([1, 2, 3])) - ser2 = pd.Series([1, 2]) - arr = [ser1, ser2] - assert _can_cast_to_categorical(arr) is True - - # The rest should raise because not all values - # are present in the categorical. - - # Case 5 - ser1 = pd.Series(Categorical([1, 2, 3])) - ser2 = pd.Series([3, 4]) - arr = [ser1, ser2] - msg = ( - "Cannot concat on a Categorical with a new category, " - "set the categories first" - ) - with pytest.raises(ValueError, match=msg): - _can_cast_to_categorical(arr) - - # Case 6 - ser1 = pd.Series(Categorical(["a", "b", "c"])) - ser2 = pd.Series(["d", "e"]) - arr = [ser1, ser2] - with pytest.raises(ValueError, match=msg): - _can_cast_to_categorical(arr) From e0da6558ea6e72386c43cb946908337031408fd4 Mon Sep 17 00:00:00 2001 From: chrispe Date: Sat, 20 Feb 2021 16:31:29 +0200 Subject: [PATCH 45/60] Use prio_cat_dtype only for EAs --- pandas/core/dtypes/concat.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 644fa4b75ca87..aac8b505d2596 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -37,6 +37,8 @@ def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike: cases. """ if is_categorical_dtype(dtype): + # if casting an array to a categorical dtype, then we need to ensure + # that its unique values are predefined as categories in that dtype unique_values = np.unique(arr[~isna(arr)]) if any(val not in dtype.categories for val in unique_values.tolist()): raise ValueError( @@ -121,23 +123,21 @@ def is_nonempty(x) -> bool: all_empty = not len(non_empties) single_dtype = len({x.dtype for x in to_concat}) == 1 any_ea = any(is_extension_array_dtype(x.dtype) for x in to_concat) + first_ea = isinstance(to_concat[0], ExtensionArray) + arr_index_expansion = ( + first_ea and len(to_concat) == 2 and to_concat[1].shape[0] == 1 + ) if any_ea: # we ignore axis here, as internally concatting with EAs is always # for axis=0 if not single_dtype: - # Special case for handling concat with categorical series. - # We need to make sure that categorical dtype is preserved - # when an array of valid values is given (GH#25383) - use_index_expansion = len(to_concat) == 2 and all( - x.shape[0] == 1 for x in to_concat[1:] - ) target_dtype = find_common_type( - [x.dtype for x in to_concat], prio_cat_dtype=use_index_expansion + [x.dtype for x in to_concat], prio_cat_dtype=arr_index_expansion ) to_concat = [_cast_to_common_type(arr, target_dtype) for arr in to_concat] - if isinstance(to_concat[0], ExtensionArray): + if first_ea: cls = type(to_concat[0]) return cls._concat_same_type(to_concat) else: From 92d1f143989e3ec2399de0fee2f05e2049040068 Mon Sep 17 00:00:00 2001 From: chrispe Date: Sat, 20 Feb 2021 17:37:47 +0200 Subject: [PATCH 46/60] Revert usage of first_ea --- pandas/core/dtypes/concat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index aac8b505d2596..8e268689fb959 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -137,7 +137,7 @@ def is_nonempty(x) -> bool: ) to_concat = [_cast_to_common_type(arr, target_dtype) for arr in to_concat] - if first_ea: + if isinstance(to_concat[0], ExtensionArray): cls = type(to_concat[0]) return cls._concat_same_type(to_concat) else: From 9b9b382e22fc0698894cafb02e0153608d3fbe1e Mon Sep 17 00:00:00 2001 From: chrispe Date: Sat, 20 Feb 2021 19:24:50 +0200 Subject: [PATCH 47/60] Fix mypy errors --- pandas/core/dtypes/cast.py | 21 +++++++++------------ pandas/core/dtypes/concat.py | 3 ++- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index deb9dec8fbb6a..c9eeb8c9bce22 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -88,6 +88,7 @@ is_unsigned_integer_dtype, ) from pandas.core.dtypes.dtypes import ( + CategoricalDtype, DatetimeTZDtype, ExtensionDtype, IntervalDtype, @@ -1668,18 +1669,14 @@ def find_common_type(types: List[DtypeObj], prio_cat_dtype: bool = False) -> Dty if any(is_categorical_dtype(t) for t in types): cat_dtypes = [] for t in types: - if ( - is_categorical_dtype(t) - and len(t.categories.values) > 0 - and any(~isna(t.categories.values)) - ): - categorical_values_dtype = t.categories.values.dtype - if all( - is_categorical_dtype(x) - or np.can_cast(categorical_values_dtype, x) - for x in types - ): - cat_dtypes.append(t) + if isinstance(t, CategoricalDtype): + if any(~isna(t.categories.values)): + cat_values_dtype = t.categories.values.dtype + if all( + is_categorical_dtype(x) or np.can_cast(cat_values_dtype, x) + for x in types + ): + cat_dtypes.append(t) if len(cat_dtypes) > 0: dtype_ref = cat_dtypes[0] if all(is_dtype_equal(dtype, dtype_ref) for dtype in cat_dtypes[1:]): diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 8e268689fb959..60a5a0b9d0e15 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -17,6 +17,7 @@ is_extension_array_dtype, is_sparse, ) +from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.generic import ( ABCCategoricalIndex, ABCSeries, @@ -36,7 +37,7 @@ def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike: Helper function for `arr.astype(common_dtype)` but handling all special cases. """ - if is_categorical_dtype(dtype): + if isinstance(dtype, CategoricalDtype): # if casting an array to a categorical dtype, then we need to ensure # that its unique values are predefined as categories in that dtype unique_values = np.unique(arr[~isna(arr)]) From d3df994e91e9d1265f1c2d0e1ee2250944736947 Mon Sep 17 00:00:00 2001 From: chrispe Date: Sat, 20 Feb 2021 20:12:40 +0200 Subject: [PATCH 48/60] Use unique1d in _cast_to_common_type --- pandas/core/dtypes/concat.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 60a5a0b9d0e15..b822e64a81f86 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -10,6 +10,7 @@ DtypeObj, ) +from pandas.core.algorithms import unique1d from pandas.core.dtypes.cast import find_common_type from pandas.core.dtypes.common import ( is_categorical_dtype, @@ -40,7 +41,7 @@ def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike: if isinstance(dtype, CategoricalDtype): # if casting an array to a categorical dtype, then we need to ensure # that its unique values are predefined as categories in that dtype - unique_values = np.unique(arr[~isna(arr)]) + unique_values = unique1d(arr[~isna(arr)]) if any(val not in dtype.categories for val in unique_values.tolist()): raise ValueError( "Cannot setitem on a Categorical with a new category, " From 41aa9e37bb6686b7069c9ad75e9a52f4c6362baf Mon Sep 17 00:00:00 2001 From: chrispe Date: Sat, 20 Feb 2021 20:33:39 +0200 Subject: [PATCH 49/60] Fix isort error --- pandas/core/dtypes/concat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index b822e64a81f86..dc841bf050798 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -10,7 +10,6 @@ DtypeObj, ) -from pandas.core.algorithms import unique1d from pandas.core.dtypes.cast import find_common_type from pandas.core.dtypes.common import ( is_categorical_dtype, @@ -25,6 +24,7 @@ ) from pandas.core.dtypes.missing import isna +from pandas.core.algorithms import unique1d from pandas.core.arrays import ExtensionArray from pandas.core.arrays.sparse import SparseArray from pandas.core.construction import ( From ca0eb1fbd009121964ed80e761eb6f98eac88573 Mon Sep 17 00:00:00 2001 From: chrispe Date: Sat, 20 Feb 2021 22:19:17 +0200 Subject: [PATCH 50/60] Renamed input variable for find_common_type --- pandas/core/dtypes/cast.py | 14 +++++++++----- pandas/core/dtypes/concat.py | 2 +- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index c9eeb8c9bce22..cb971f28e6a55 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1640,14 +1640,16 @@ def sanitize_to_nanoseconds(values: np.ndarray) -> np.ndarray: return values -def find_common_type(types: List[DtypeObj], prio_cat_dtype: bool = False) -> DtypeObj: +def find_common_type( + types: List[DtypeObj], promote_categorical: bool = False +) -> DtypeObj: """ Find a common data type among the given dtypes. Parameters ---------- types : list of dtypes - prio_cat_dtype: set priority towards finding a categorical dtype + promote_categorical : find if possible, a categorical dtype that fits all the dtypes Returns ------- @@ -1663,9 +1665,11 @@ def find_common_type(types: List[DtypeObj], prio_cat_dtype: bool = False) -> Dty first = types[0] - # categorical dtypes should not be casted to a new dtype - # if priority is set accodring to prio_cat_dtype - if prio_cat_dtype: + # We will first try to find a common categorical dtype + # if promote_categorical is set to True. This is used + # to preserve the categorical dtype (since categorical + # values can consist of multiple dtypes). + if promote_categorical: if any(is_categorical_dtype(t) for t in types): cat_dtypes = [] for t in types: diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index dc841bf050798..73f0d70c1f5f7 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -135,7 +135,7 @@ def is_nonempty(x) -> bool: # for axis=0 if not single_dtype: target_dtype = find_common_type( - [x.dtype for x in to_concat], prio_cat_dtype=arr_index_expansion + [x.dtype for x in to_concat], promote_categorical=arr_index_expansion ) to_concat = [_cast_to_common_type(arr, target_dtype) for arr in to_concat] From 931d6c804786c23c128781a8e57e6d04731a47a7 Mon Sep 17 00:00:00 2001 From: chrispe Date: Sun, 7 Mar 2021 19:31:40 +0200 Subject: [PATCH 51/60] Remove new argument in find_common_type --- pandas/core/dtypes/cast.py | 43 +++++++++++++------------ pandas/core/dtypes/concat.py | 8 +---- pandas/tests/series/test_categorical.py | 6 ---- 3 files changed, 24 insertions(+), 33 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 2c1a6e7b5f710..5ad5d7865429e 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1769,16 +1769,13 @@ def ensure_nanosecond_dtype(dtype: DtypeObj) -> DtypeObj: return dtype -def find_common_type( - types: List[DtypeObj], promote_categorical: bool = False -) -> DtypeObj: +def find_common_type(types: List[DtypeObj]) -> DtypeObj: """ Find a common data type among the given dtypes. Parameters ---------- types : list of dtypes - promote_categorical : find if possible, a categorical dtype that fits all the dtypes Returns ------- @@ -1798,22 +1795,28 @@ def find_common_type( # if promote_categorical is set to True. This is used # to preserve the categorical dtype (since categorical # values can consist of multiple dtypes). - if promote_categorical: - if any(is_categorical_dtype(t) for t in types): - cat_dtypes = [] - for t in types: - if isinstance(t, CategoricalDtype): - if any(~isna(t.categories.values)): - cat_values_dtype = t.categories.values.dtype - if all( - is_categorical_dtype(x) or np.can_cast(cat_values_dtype, x) - for x in types - ): - cat_dtypes.append(t) - if len(cat_dtypes) > 0: - dtype_ref = cat_dtypes[0] - if all(is_dtype_equal(dtype, dtype_ref) for dtype in cat_dtypes[1:]): - return dtype_ref + if any(is_categorical_dtype(t) for t in types): + cat_dtypes = [] + for t in types: + if isinstance(t, CategoricalDtype): + if any(~isna(t.categories.values)): + cat_values_dtype = t.categories.values.dtype + if all( + ( + is_categorical_dtype(x) + or ( + is_numeric_dtype(cat_values_dtype) + and is_numeric_dtype(x) + ) + or np.can_cast(x, cat_values_dtype) + ) + for x in types + ): + cat_dtypes.append(t) + if len(cat_dtypes) > 0: + dtype_ref = cat_dtypes[0] + if all(is_dtype_equal(dtype, dtype_ref) for dtype in cat_dtypes[1:]): + return dtype_ref # workaround for find_common_type([np.dtype('datetime64[ns]')] * 2) # => object diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index c01e0d5f0f6d3..f6049dc8d295d 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -125,18 +125,12 @@ def is_nonempty(x) -> bool: all_empty = not len(non_empties) single_dtype = len({x.dtype for x in to_concat}) == 1 any_ea = any(is_extension_array_dtype(x.dtype) for x in to_concat) - first_ea = isinstance(to_concat[0], ExtensionArray) - arr_index_expansion = ( - first_ea and len(to_concat) == 2 and to_concat[1].shape[0] == 1 - ) if any_ea: # we ignore axis here, as internally concatting with EAs is always # for axis=0 if not single_dtype: - target_dtype = find_common_type( - [x.dtype for x in to_concat], promote_categorical=arr_index_expansion - ) + target_dtype = find_common_type([x.dtype for x in to_concat]) to_concat = [_cast_to_common_type(arr, target_dtype) for arr in to_concat] if isinstance(to_concat[0], ExtensionArray): diff --git a/pandas/tests/series/test_categorical.py b/pandas/tests/series/test_categorical.py index ab5afeaef7381..9dba345c3db36 100644 --- a/pandas/tests/series/test_categorical.py +++ b/pandas/tests/series/test_categorical.py @@ -52,9 +52,3 @@ def test_loc_category_dtype_retention(self): ser.loc[3] = 3 expected = pd.Series(Categorical([1, 2, 3, 3])) tm.assert_series_equal(ser, expected) - - # Case 4 - ser = pd.Series(Categorical([1, 2, 3])) - ser.loc[3] = pd.NA - expected = pd.Series(Categorical([1, 2, 3, pd.NA])) - tm.assert_series_equal(ser, expected) From 8065ddb51af54b61e56f7b9e683f95a790d62f55 Mon Sep 17 00:00:00 2001 From: chrispe Date: Sat, 13 Mar 2021 13:16:17 +0200 Subject: [PATCH 52/60] Add check to _get_common_dtype --- pandas/core/dtypes/cast.py | 28 ---------------------------- pandas/core/dtypes/dtypes.py | 19 +++++++++++++++++++ 2 files changed, 19 insertions(+), 28 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 5ad5d7865429e..6eca89e1a8744 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -88,7 +88,6 @@ pandas_dtype, ) from pandas.core.dtypes.dtypes import ( - CategoricalDtype, DatetimeTZDtype, ExtensionDtype, IntervalDtype, @@ -1791,33 +1790,6 @@ def find_common_type(types: List[DtypeObj]) -> DtypeObj: first = types[0] - # We will first try to find a common categorical dtype - # if promote_categorical is set to True. This is used - # to preserve the categorical dtype (since categorical - # values can consist of multiple dtypes). - if any(is_categorical_dtype(t) for t in types): - cat_dtypes = [] - for t in types: - if isinstance(t, CategoricalDtype): - if any(~isna(t.categories.values)): - cat_values_dtype = t.categories.values.dtype - if all( - ( - is_categorical_dtype(x) - or ( - is_numeric_dtype(cat_values_dtype) - and is_numeric_dtype(x) - ) - or np.can_cast(x, cat_values_dtype) - ) - for x in types - ): - cat_dtypes.append(t) - if len(cat_dtypes) > 0: - dtype_ref = cat_dtypes[0] - if all(is_dtype_equal(dtype, dtype_ref) for dtype in cat_dtypes[1:]): - return dtype_ref - # workaround for find_common_type([np.dtype('datetime64[ns]')] * 2) # => object if all(is_dtype_equal(first, t) for t in types[1:]): diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index da3a9269cf2c4..76691108f7dc5 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -614,6 +614,25 @@ def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: elif any(non_init_cats): return None + # case for compatible dtypes with categories.dtype + from pandas.core.dtypes.common import is_dtype_equal, is_extension_array_dtype + + non_identical_cat_dtype = [ + isinstance(x, CategoricalDtype) and x != self for x in dtypes + ] + if not any(non_identical_cat_dtype): + non_cat_dtypes_compat = [ + isinstance(x, CategoricalDtype) + or is_dtype_equal(x, self.categories.dtype) + or ( + not is_extension_array_dtype(x) + and np.can_cast(x, self.categories.dtype) + ) + for x in dtypes + ] + if all(non_cat_dtypes_compat): + return self + # categorical is aware of Sparse -> extract sparse subdtypes dtypes = [x.subtype if isinstance(x, SparseDtype) else x for x in dtypes] # extract the categories' dtype From b21326b39ca46d4930d1618122af4a66cda4c9cf Mon Sep 17 00:00:00 2001 From: chrispe Date: Sat, 13 Mar 2021 13:30:39 +0200 Subject: [PATCH 53/60] Update dtypes.py --- pandas/core/dtypes/dtypes.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 3eb1cbee3f9ba..b5f849261bf5c 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -623,8 +623,11 @@ def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: return None # case for compatible dtypes with categories.dtype - from pandas.core.dtypes.common import is_dtype_equal, is_extension_array_dtype - + from pandas.core.dtypes.common import ( + is_dtype_equal, + is_extension_array_dtype + ) + non_identical_cat_dtype = [ isinstance(x, CategoricalDtype) and x != self for x in dtypes ] From 335fc0695c938599bfcbfbdc3d960cf6ab6ac62f Mon Sep 17 00:00:00 2001 From: chrispe Date: Sat, 13 Mar 2021 13:48:05 +0200 Subject: [PATCH 54/60] Update dtypes.py --- pandas/core/dtypes/dtypes.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index b5f849261bf5c..3eb1cbee3f9ba 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -623,11 +623,8 @@ def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: return None # case for compatible dtypes with categories.dtype - from pandas.core.dtypes.common import ( - is_dtype_equal, - is_extension_array_dtype - ) - + from pandas.core.dtypes.common import is_dtype_equal, is_extension_array_dtype + non_identical_cat_dtype = [ isinstance(x, CategoricalDtype) and x != self for x in dtypes ] From 950dcc4ed6db7517b64404f3e3c1bbad8d8f0eb0 Mon Sep 17 00:00:00 2001 From: chrispe Date: Sat, 13 Mar 2021 14:03:28 +0200 Subject: [PATCH 55/60] Update dtypes.py --- pandas/core/dtypes/dtypes.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 3eb1cbee3f9ba..e9579ce0276bc 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -42,6 +42,10 @@ ExtensionDtype, register_extension_dtype, ) +from pandas.core.dtypes.common import ( + is_dtype_equal, + is_extension_array_dtype, +) from pandas.core.dtypes.generic import ( ABCCategoricalIndex, ABCIndex, @@ -623,8 +627,6 @@ def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: return None # case for compatible dtypes with categories.dtype - from pandas.core.dtypes.common import is_dtype_equal, is_extension_array_dtype - non_identical_cat_dtype = [ isinstance(x, CategoricalDtype) and x != self for x in dtypes ] From 2ee1df89d8b18762fcc6d83895c53ed3d973cad3 Mon Sep 17 00:00:00 2001 From: chrispe Date: Sat, 13 Mar 2021 14:43:10 +0200 Subject: [PATCH 56/60] Update dtypes.py --- pandas/core/dtypes/dtypes.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index e9579ce0276bc..30ea2b7d3ac23 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -42,10 +42,6 @@ ExtensionDtype, register_extension_dtype, ) -from pandas.core.dtypes.common import ( - is_dtype_equal, - is_extension_array_dtype, -) from pandas.core.dtypes.generic import ( ABCCategoricalIndex, ABCIndex, @@ -633,9 +629,9 @@ def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: if not any(non_identical_cat_dtype): non_cat_dtypes_compat = [ isinstance(x, CategoricalDtype) - or is_dtype_equal(x, self.categories.dtype) + or x == self.categories.dtype or ( - not is_extension_array_dtype(x) + not isinstance(x, ExtensionDtype) and np.can_cast(x, self.categories.dtype) ) for x in dtypes From 17120f0f0fc46c074162287b5ddc9bf0d24370ea Mon Sep 17 00:00:00 2001 From: chrispe Date: Sat, 13 Mar 2021 17:39:08 +0200 Subject: [PATCH 57/60] Test --- pandas/core/indexes/category.py | 43 +++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 62941a23c6459..e9a4cc32af588 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -620,6 +620,49 @@ def map(self, mapper): mapped = self._values.map(mapper) return Index(mapped, name=self.name) + def insert(self, loc: int, item): + """ + Make new Index inserting new item at location. Follows + Python list.append semantics for negative values. + + Parameters + ---------- + loc : int + item : object + + Returns + ------- + new_index : Index + + Raises + ------ + ValueError if the item is not valid for this dtype. + """ + from pandas.core.dtypes.cast import ( + find_common_type, + infer_dtype_from, + ) + arr = self._data + try: + code = arr._validate_scalar(item) + except (ValueError, TypeError): + # e.g. trying to insert an integer into a DatetimeIndex + # We cannot keep the same dtype, so cast to the (often object) + # minimal shared dtype before doing the insert. + dtype, _ = infer_dtype_from(item, pandas_dtype=True) + dtype = find_common_type([self.dtype.categories.dtype, dtype]) + return self.astype(dtype).insert(loc, item) + else: + new_vals = np.concatenate( + ( + arr._ndarray[:loc], + np.asarray([code], dtype=arr._ndarray.dtype), + arr._ndarray[loc:], + ) + ) + new_arr = arr._from_backing_data(new_vals) + return type(self)._simple_new(new_arr, name=self.name) + def _concat(self, to_concat: List[Index], name: Hashable) -> Index: # if calling index is category, don't check dtype of others try: From 439b49f6ea976af1e9fec15cad0456f125601ef5 Mon Sep 17 00:00:00 2001 From: chrispe Date: Sat, 13 Mar 2021 20:57:11 +0200 Subject: [PATCH 58/60] Add flag in get_common_type --- pandas/core/dtypes/cast.py | 8 +++++- pandas/core/dtypes/concat.py | 4 ++- pandas/core/dtypes/dtypes.py | 16 ++++++------ pandas/core/indexes/category.py | 43 --------------------------------- 4 files changed, 17 insertions(+), 54 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 44650500e0f65..f5db5c8f2f981 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1849,7 +1849,9 @@ def ensure_nanosecond_dtype(dtype: DtypeObj) -> DtypeObj: return dtype -def find_common_type(types: List[DtypeObj]) -> DtypeObj: +def find_common_type( + types: List[DtypeObj], downcast_cat_dtype: Optional[bool] = True +) -> DtypeObj: """ Find a common data type among the given dtypes. @@ -1876,6 +1878,10 @@ def find_common_type(types: List[DtypeObj]) -> DtypeObj: if all(is_dtype_equal(first, t) for t in types[1:]): return first + # downcast categorical to the dtype of their categories + if downcast_cat_dtype and not all(is_categorical_dtype(t) for t in types): + types = [t.categories.dtype if is_categorical_dtype(t) else t for t in types] + # get unique types (dict.fromkeys is used as order-preserving set()) types = list(dict.fromkeys(types).keys()) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index f0265ecbe06b9..6110a442da008 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -134,7 +134,9 @@ def is_nonempty(x) -> bool: # we ignore axis here, as internally concatting with EAs is always # for axis=0 if not single_dtype: - target_dtype = find_common_type([x.dtype for x in to_concat]) + target_dtype = find_common_type( + [x.dtype for x in to_concat], downcast_cat_dtype=False + ) to_concat = [_cast_to_common_type(arr, target_dtype) for arr in to_concat] if isinstance(to_concat[0], ExtensionArray): diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 30ea2b7d3ac23..a27d9599158e5 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -605,7 +605,6 @@ def _is_boolean(self) -> bool: def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: from pandas.core.arrays.sparse import SparseDtype - # check if we have all categorical dtype with identical categories if all(isinstance(x, CategoricalDtype) for x in dtypes): first = dtypes[0] @@ -627,16 +626,15 @@ def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: isinstance(x, CategoricalDtype) and x != self for x in dtypes ] if not any(non_identical_cat_dtype): + non_cat_dtypes = [ + x.categories.dtype if isinstance(x, CategoricalDtype) else x for x in dtypes + ] non_cat_dtypes_compat = [ - isinstance(x, CategoricalDtype) - or x == self.categories.dtype - or ( - not isinstance(x, ExtensionDtype) - and np.can_cast(x, self.categories.dtype) - ) - for x in dtypes + not isinstance(x, ExtensionDtype) + and np.can_cast(x, self.categories.dtype) + for x in non_cat_dtypes ] - if all(non_cat_dtypes_compat): + if all(non_cat_dtypes_compat) and self.categories is not None and len(self.categories) > 0: return self # categorical is aware of Sparse -> extract sparse subdtypes diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index e9a4cc32af588..62941a23c6459 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -620,49 +620,6 @@ def map(self, mapper): mapped = self._values.map(mapper) return Index(mapped, name=self.name) - def insert(self, loc: int, item): - """ - Make new Index inserting new item at location. Follows - Python list.append semantics for negative values. - - Parameters - ---------- - loc : int - item : object - - Returns - ------- - new_index : Index - - Raises - ------ - ValueError if the item is not valid for this dtype. - """ - from pandas.core.dtypes.cast import ( - find_common_type, - infer_dtype_from, - ) - arr = self._data - try: - code = arr._validate_scalar(item) - except (ValueError, TypeError): - # e.g. trying to insert an integer into a DatetimeIndex - # We cannot keep the same dtype, so cast to the (often object) - # minimal shared dtype before doing the insert. - dtype, _ = infer_dtype_from(item, pandas_dtype=True) - dtype = find_common_type([self.dtype.categories.dtype, dtype]) - return self.astype(dtype).insert(loc, item) - else: - new_vals = np.concatenate( - ( - arr._ndarray[:loc], - np.asarray([code], dtype=arr._ndarray.dtype), - arr._ndarray[loc:], - ) - ) - new_arr = arr._from_backing_data(new_vals) - return type(self)._simple_new(new_arr, name=self.name) - def _concat(self, to_concat: List[Index], name: Hashable) -> Index: # if calling index is category, don't check dtype of others try: From c6e3435103f7f3412abcc1176d72e9fcf0670d46 Mon Sep 17 00:00:00 2001 From: chrispe Date: Sat, 13 Mar 2021 22:29:00 +0200 Subject: [PATCH 59/60] Revert --- pandas/core/dtypes/cast.py | 24 ++++++++++++++++++++---- pandas/core/dtypes/concat.py | 6 +++++- pandas/core/dtypes/dtypes.py | 19 ++----------------- 3 files changed, 27 insertions(+), 22 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index f5db5c8f2f981..d0fa6239fc1cb 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -88,6 +88,7 @@ pandas_dtype, ) from pandas.core.dtypes.dtypes import ( + CategoricalDtype, DatetimeTZDtype, ExtensionDtype, IntervalDtype, @@ -1850,7 +1851,7 @@ def ensure_nanosecond_dtype(dtype: DtypeObj) -> DtypeObj: def find_common_type( - types: List[DtypeObj], downcast_cat_dtype: Optional[bool] = True + types: List[DtypeObj], promote_categorical: Optional[bool] = False ) -> DtypeObj: """ Find a common data type among the given dtypes. @@ -1858,6 +1859,7 @@ def find_common_type( Parameters ---------- types : list of dtypes + promote_categorical : find if possible, a categorical dtype that fits all the dtypes Returns ------- @@ -1878,9 +1880,23 @@ def find_common_type( if all(is_dtype_equal(first, t) for t in types[1:]): return first - # downcast categorical to the dtype of their categories - if downcast_cat_dtype and not all(is_categorical_dtype(t) for t in types): - types = [t.categories.dtype if is_categorical_dtype(t) else t for t in types] + # special case for categorical + if promote_categorical: + if any(is_categorical_dtype(t) for t in types): + cat_dtypes = [] + for t in types: + if isinstance(t, CategoricalDtype) and t.categories is not None: + if any(~isna(t.categories.values)): + cat_values_dtype = t.categories.values.dtype + if all( + is_categorical_dtype(x) or np.can_cast(x, cat_values_dtype) + for x in types + ): + cat_dtypes.append(t) + if len(cat_dtypes) > 0: + dtype_ref = cat_dtypes[0] + if all(is_dtype_equal(dtype, dtype_ref) for dtype in cat_dtypes[1:]): + return dtype_ref # get unique types (dict.fromkeys is used as order-preserving set()) types = list(dict.fromkeys(types).keys()) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 6110a442da008..1e27b93e5abf1 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -129,13 +129,17 @@ def is_nonempty(x) -> bool: all_empty = not len(non_empties) single_dtype = len({x.dtype for x in to_concat}) == 1 any_ea = any(is_extension_array_dtype(x.dtype) for x in to_concat) + first_ea = isinstance(to_concat[0], ExtensionArray) + arr_index_expansion = ( + first_ea and len(to_concat) == 2 and to_concat[1].shape[0] == 1 + ) if any_ea: # we ignore axis here, as internally concatting with EAs is always # for axis=0 if not single_dtype: target_dtype = find_common_type( - [x.dtype for x in to_concat], downcast_cat_dtype=False + [x.dtype for x in to_concat], promote_categorical=arr_index_expansion ) to_concat = [_cast_to_common_type(arr, target_dtype) for arr in to_concat] diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index a27d9599158e5..2b94822a630c1 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -605,6 +605,7 @@ def _is_boolean(self) -> bool: def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: from pandas.core.arrays.sparse import SparseDtype + # check if we have all categorical dtype with identical categories if all(isinstance(x, CategoricalDtype) for x in dtypes): first = dtypes[0] @@ -621,22 +622,6 @@ def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: elif any(non_init_cats): return None - # case for compatible dtypes with categories.dtype - non_identical_cat_dtype = [ - isinstance(x, CategoricalDtype) and x != self for x in dtypes - ] - if not any(non_identical_cat_dtype): - non_cat_dtypes = [ - x.categories.dtype if isinstance(x, CategoricalDtype) else x for x in dtypes - ] - non_cat_dtypes_compat = [ - not isinstance(x, ExtensionDtype) - and np.can_cast(x, self.categories.dtype) - for x in non_cat_dtypes - ] - if all(non_cat_dtypes_compat) and self.categories is not None and len(self.categories) > 0: - return self - # categorical is aware of Sparse -> extract sparse subdtypes dtypes = [x.subtype if isinstance(x, SparseDtype) else x for x in dtypes] # extract the categories' dtype @@ -1380,4 +1365,4 @@ def itemsize(self) -> int: """ The element size of this data-type object. """ - return self._dtype.itemsize + return self._dtype.itemsize \ No newline at end of file From fc40817f21b5d7d569faf2ac9313e6036948a06d Mon Sep 17 00:00:00 2001 From: chrispe Date: Sat, 13 Mar 2021 22:30:14 +0200 Subject: [PATCH 60/60] Update dtypes.py --- pandas/core/dtypes/dtypes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 2b94822a630c1..d44d2a564fb78 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -1365,4 +1365,4 @@ def itemsize(self) -> int: """ The element size of this data-type object. """ - return self._dtype.itemsize \ No newline at end of file + return self._dtype.itemsize