From d396b1c7de883734e5cbfe433dd61ade82773158 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 13 Jan 2022 18:05:24 -0800 Subject: [PATCH 1/5] BUG: categorical-with-nas to float64 instead of object --- pandas/core/dtypes/concat.py | 18 +++++++----------- .../tests/reshape/concat/test_append_common.py | 10 +++++----- 2 files changed, 12 insertions(+), 16 deletions(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 83c2668242129..26eb6d7f3375a 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -40,17 +40,13 @@ def cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike: """ if is_dtype_equal(arr.dtype, dtype): return arr - if ( - is_categorical_dtype(arr.dtype) - and isinstance(dtype, np.dtype) - and np.issubdtype(dtype, np.integer) - ): - # problem case: categorical of int -> gives int as result dtype, - # but categorical can contain NAs -> fall back to object dtype - try: - return arr.astype(dtype, copy=False) - except ValueError: - return arr.astype(object, copy=False) + + if isinstance(dtype, np.dtype) and dtype.kind in ["i", "u"]: + + if is_categorical_dtype(arr.dtype) and arr._hasnans: + # problem case: categorical of int -> gives int as result dtype, + # but categorical can contain NAs -> float64 instead + dtype = np.dtype(np.float64) if is_sparse(arr) and not is_sparse(dtype): # problem case: SparseArray.astype(dtype) doesn't follow the specified diff --git a/pandas/tests/reshape/concat/test_append_common.py b/pandas/tests/reshape/concat/test_append_common.py index f36bee9dd8dec..36bca1c2b654e 100644 --- a/pandas/tests/reshape/concat/test_append_common.py +++ b/pandas/tests/reshape/concat/test_append_common.py @@ -508,7 +508,7 @@ def test_concat_categorical(self): s1 = Series([10, 11, np.nan], dtype="category") s2 = Series([np.nan, 1, 3, 2], dtype="category") - exp = Series([10, 11, np.nan, np.nan, 1, 3, 2], dtype="object") + exp = Series([10, 11, np.nan, np.nan, 1, 3, 2], dtype=np.float64) tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1._append(s2, ignore_index=True), exp) @@ -529,12 +529,12 @@ def test_concat_categorical_coercion(self): s1 = Series([1, 2, np.nan], dtype="category") s2 = Series([2, 1, 2]) - exp = Series([1, 2, np.nan, 2, 1, 2], dtype="object") + exp = Series([1, 2, np.nan, 2, 1, 2], dtype=np.float64) tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1._append(s2, ignore_index=True), exp) # result shouldn't be affected by 1st elem dtype - exp = Series([2, 1, 2, 1, 2, np.nan], dtype="object") + exp = Series([2, 1, 2, 1, 2, np.nan], dtype=np.float64) tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) tm.assert_series_equal(s2._append(s1, ignore_index=True), exp) @@ -554,11 +554,11 @@ def test_concat_categorical_coercion(self): s1 = Series([10, 11, np.nan], dtype="category") s2 = Series([1, 3, 2]) - exp = Series([10, 11, np.nan, 1, 3, 2], dtype="object") + exp = Series([10, 11, np.nan, 1, 3, 2], dtype=np.float64) tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1._append(s2, ignore_index=True), exp) - exp = Series([1, 3, 2, 10, 11, np.nan], dtype="object") + exp = Series([1, 3, 2, 10, 11, np.nan], dtype=np.float64) tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) tm.assert_series_equal(s2._append(s1, ignore_index=True), exp) From 7f8b6b74cc58ee5f1d60e7c0b61571c0c0e54f80 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 13 Jan 2022 19:38:20 -0800 Subject: [PATCH 2/5] whatsnew --- doc/source/whatsnew/v1.5.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 9a6455d4d012f..3252818460238 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -191,7 +191,7 @@ Groupby/resample/rolling Reshaping ^^^^^^^^^ -- +- Bug in :func:`concat` between a :class:`Series` with integer dtype and another with :class:`CategoricalDtype` with integer categories and containing ``NaN`` values casting to object dtype instead of ``float64`` (:issue:`??`) - Sparse From ac594b52a9164b7ae9dd88d1612e65ab64a69515 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 13 Jan 2022 19:40:10 -0800 Subject: [PATCH 3/5] GH ref --- doc/source/whatsnew/v1.5.0.rst | 2 +- pandas/core/dtypes/concat.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 3252818460238..acdac0949ee34 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -191,7 +191,7 @@ Groupby/resample/rolling Reshaping ^^^^^^^^^ -- Bug in :func:`concat` between a :class:`Series` with integer dtype and another with :class:`CategoricalDtype` with integer categories and containing ``NaN`` values casting to object dtype instead of ``float64`` (:issue:`??`) +- Bug in :func:`concat` between a :class:`Series` with integer dtype and another with :class:`CategoricalDtype` with integer categories and containing ``NaN`` values casting to object dtype instead of ``float64`` (:issue:`45359`) - Sparse diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 26eb6d7f3375a..66d0b5ad5c93b 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -46,6 +46,7 @@ def cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike: if is_categorical_dtype(arr.dtype) and arr._hasnans: # problem case: categorical of int -> gives int as result dtype, # but categorical can contain NAs -> float64 instead + # GH#45359 dtype = np.dtype(np.float64) if is_sparse(arr) and not is_sparse(dtype): From 6408153f779b7edd6ec59169fe044aa94ba828c7 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 13 Jan 2022 20:10:43 -0800 Subject: [PATCH 4/5] mypy fixup --- pandas/core/dtypes/concat.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 66d0b5ad5c93b..40f5af9d0c0dd 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -30,6 +30,7 @@ ) if TYPE_CHECKING: + from pandas import Categorical from pandas.core.arrays.sparse import SparseArray @@ -43,7 +44,7 @@ def cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike: if isinstance(dtype, np.dtype) and dtype.kind in ["i", "u"]: - if is_categorical_dtype(arr.dtype) and arr._hasnans: + if is_categorical_dtype(arr.dtype) and cast("Categorical", arr)._hasnans: # problem case: categorical of int -> gives int as result dtype, # but categorical can contain NAs -> float64 instead # GH#45359 From 639383254c717a9e2d6698ec6990c73a9c9e9820 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 14 Jan 2022 11:10:02 -0800 Subject: [PATCH 5/5] fix concat_compat call --- pandas/core/indexes/category.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index a04605071a862..c2bcd90ff10fb 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -576,7 +576,7 @@ def _concat(self, to_concat: list[Index], name: Hashable) -> Index: # not all to_concat elements are among our categories (or NA) from pandas.core.dtypes.concat import concat_compat - res = concat_compat(to_concat) + res = concat_compat([x._values for x in to_concat]) return Index(res, name=name) else: cat = self._data._from_backing_data(codes)