From bd316c34cb8fb5455b1b938c38722d5e215ec93e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 10 Apr 2020 16:23:05 -0500 Subject: [PATCH 1/5] BUG: Fixed concat with reindex and extension types Closes https://github.com/pandas-dev/pandas/issues/27692 Closes https://github.com/pandas-dev/pandas/issues/33027 --- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/core/dtypes/concat.py | 8 +++-- pandas/core/internals/concat.py | 36 ++++++++++++++++++---- pandas/core/reshape/concat.py | 6 +++- pandas/tests/extension/base/reshaping.py | 13 ++++++++ pandas/tests/extension/test_categorical.py | 3 +- 6 files changed, 57 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 2f4e961ff433f..96126c1133a2a 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -583,7 +583,7 @@ ExtensionArray ^^^^^^^^^^^^^^ - Fixed bug where :meth:`Serires.value_counts` would raise on empty input of ``Int64`` dtype (:issue:`33317`) -- +- Fixed bug in :func:`concat` when concatenating DataFrames with non-overlaping columns resulting in object-dtype columns rather than preserving the extension dtype (:issue:`27692`, :issue:`33027`) Other diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 301c9bb7b3f5c..b21ed51816edc 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -68,7 +68,7 @@ def get_dtype_kinds(l): return typs -def concat_compat(to_concat, axis: int = 0): +def concat_compat(to_concat, axis: int = 0, ignore_2d_ea: bool = False): """ provide concatenation of an array of arrays each of which is a single 'normalized' dtypes (in that for example, if it's object, then it is a @@ -122,7 +122,11 @@ def is_nonempty(x) -> bool: any_ea = any(is_extension_array_dtype(x.dtype) for x in to_concat) if any_ea and axis == 1: - to_concat = [np.atleast_2d(x.astype("object")) for x in to_concat] + if single_dtype and ignore_2d_ea: + cls = type(to_concat[0]) + return cls._concat_same_type(to_concat) + else: + to_concat = [np.atleast_2d(x.astype("object")) for x in to_concat] elif any_ea and single_dtype and axis == 0: cls = type(to_concat[0]) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 720e6799a3bf3..40671bf9c7638 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -28,7 +28,7 @@ def concatenate_block_managers( - mgrs_indexers, axes, concat_axis: int, copy: bool + mgrs_indexers, axes, concat_axis: int, copy: bool, ignore_2d_ea: bool = False, ) -> BlockManager: """ Concatenate block managers into one. @@ -65,7 +65,9 @@ def concatenate_block_managers( b.mgr_locs = placement else: b = make_block( - _concatenate_join_units(join_units, concat_axis, copy=copy), + _concatenate_join_units( + join_units, concat_axis, copy=copy, ignore_2d_ea=ignore_2d_ea + ), placement=placement, ) blocks.append(b) @@ -247,6 +249,16 @@ def get_reindexed_values(self, empty_dtype, upcasted_na): pass elif getattr(self.block, "is_extension", False): pass + elif is_extension_array_dtype(empty_dtype): + missing_arr = empty_dtype.construct_array_type()._from_sequence( + [], dtype=empty_dtype + ) + ncols, nrows = self.shape + assert ncols == 1, ncols + empty_arr = -1 * np.ones((nrows,), dtype="int8") + return missing_arr.take( + empty_arr, allow_fill=True, fill_value=fill_value + ) else: missing_arr = np.empty(self.shape, dtype=empty_dtype) missing_arr.fill(fill_value) @@ -280,7 +292,7 @@ def get_reindexed_values(self, empty_dtype, upcasted_na): return values -def _concatenate_join_units(join_units, concat_axis, copy): +def _concatenate_join_units(join_units, concat_axis, copy, ignore_2d_ea=False): """ Concatenate values from several join units along selected axis. """ @@ -307,7 +319,9 @@ def _concatenate_join_units(join_units, concat_axis, copy): else: concat_values = concat_values.copy() else: - concat_values = concat_compat(to_concat, axis=concat_axis) + concat_values = concat_compat( + to_concat, axis=concat_axis, ignore_2d_ea=ignore_2d_ea + ) return concat_values @@ -344,6 +358,7 @@ def _get_empty_dtype_and_na(join_units): upcast_classes = defaultdict(list) null_upcast_classes = defaultdict(list) + for dtype, unit in zip(dtypes, join_units): if dtype is None: continue @@ -352,6 +367,11 @@ def _get_empty_dtype_and_na(join_units): upcast_cls = "category" elif is_datetime64tz_dtype(dtype): upcast_cls = "datetimetz" + + # may need to move sparse back up + elif is_extension_array_dtype(dtype): + upcast_cls = "extension" + elif issubclass(dtype.type, np.bool_): upcast_cls = "bool" elif issubclass(dtype.type, np.object_): @@ -362,8 +382,6 @@ def _get_empty_dtype_and_na(join_units): upcast_cls = "timedelta" elif is_sparse(dtype): upcast_cls = dtype.subtype.name - elif is_extension_array_dtype(dtype): - upcast_cls = "object" elif is_float_dtype(dtype) or is_numeric_dtype(dtype): upcast_cls = dtype.name else: @@ -379,6 +397,12 @@ def _get_empty_dtype_and_na(join_units): if not upcast_classes: upcast_classes = null_upcast_classes + if "extension" in upcast_classes: + if len(upcast_classes) == 1: + cls = upcast_classes["extension"][0] + return cls, cls.na_value + else: + return np.dtype("object"), np.nan # TODO: de-duplicate with maybe_promote? # create the result diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index a868e663b06a5..19ad6d17d4d91 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -495,7 +495,11 @@ def get_result(self): mgrs_indexers.append((obj._mgr, indexers)) new_data = concatenate_block_managers( - mgrs_indexers, self.new_axes, concat_axis=self.bm_axis, copy=self.copy + mgrs_indexers, + self.new_axes, + concat_axis=self.bm_axis, + copy=self.copy, + ignore_2d_ea=self.bm_axis == 1 and self._is_frame, ) if not self.copy: new_data._consolidate_inplace() diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index c9445ceec2c77..c4628980e1621 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -107,6 +107,19 @@ def test_concat_extension_arrays_copy_false(self, data, na_value): result = pd.concat([df1, df2], axis=1, copy=False) self.assert_frame_equal(result, expected) + def test_concat_with_reindex(self, data): + # GH-33027 + a = pd.DataFrame({"a": data[:5]}) + b = pd.DataFrame({"b": data[:5]}) + result = pd.concat([a, b], ignore_index=True) + expected = pd.DataFrame( + { + "a": data.take(list(range(5)) + ([-1] * 5), allow_fill=True), + "b": data.take(([-1] * 5) + list(range(5)), allow_fill=True), + } + ) + self.assert_frame_equal(result, expected) + def test_align(self, data, na_value): a = data[:3] b = data[2:5] diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 059d3453995bd..ed3e55dafab0f 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -93,7 +93,8 @@ class TestConstructors(base.BaseConstructorsTests): class TestReshaping(base.BaseReshapingTests): - pass + def test_concat_with_reindex(self, data): + pytest.xfail(reason="Deliberate?") class TestGetitem(base.BaseGetitemTests): From 7fc30a0893bc9fae214beba83324ebf8a282a5a7 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 28 May 2020 16:26:46 -0500 Subject: [PATCH 2/5] rebase --- pandas/core/dtypes/concat.py | 11 ++++++++--- pandas/tests/extension/test_categorical.py | 2 +- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index ca3a41813f3d3..f464ee8146671 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -98,7 +98,7 @@ def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike: return arr.astype(dtype, copy=False) -def concat_compat(to_concat, axis: int = 0): +def concat_compat(to_concat, axis: int = 0, ignore_2d_ea: bool = True): """ provide concatenation of an array of arrays each of which is a single 'normalized' dtypes (in that for example, if it's object, then it is a @@ -153,8 +153,13 @@ def is_nonempty(x) -> bool: return concat_datetime(to_concat, axis=axis, typs=typs) elif any_ea and axis == 1: - to_concat = [np.atleast_2d(x.astype("object")) for x in to_concat] - return np.concatenate(to_concat, axis=axis) + if single_dtype and ignore_2d_ea: + # TODO(EA2D): special-casing not needed with 2D EAs + cls = type(to_concat[0]) + return cls._concat_same_type(to_concat) + else: + to_concat = [np.atleast_2d(x.astype("object")) for x in to_concat] + return np.concatenate(to_concat, axis=axis) elif all_empty: # we have all empties, but may need to coerce the result dtype to diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 05fd2ad12015f..f7b572a70073a 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -94,7 +94,7 @@ class TestConstructors(base.BaseConstructorsTests): class TestReshaping(base.BaseReshapingTests): def test_concat_with_reindex(self, data): - pytest.xfail(reason="Deliberate?") + pytest.xfail(reason="Deliberately upcast to object?") class TestGetitem(base.BaseGetitemTests): From 187551c1de55b74913e7e69986dd19cc35446b62 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 19 Jun 2020 14:34:18 -0500 Subject: [PATCH 3/5] fixup --- pandas/core/internals/concat.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 02de292b8af94..367420ab0e624 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -303,7 +303,7 @@ def get_reindexed_values(self, empty_dtype, upcasted_na): return values -def _concatenate_join_units(join_units, concat_axis, copy, ignore_2d_ea=False): +def _concatenate_join_units(join_units, concat_axis, copy): """ Concatenate values from several join units along selected axis. """ @@ -339,9 +339,7 @@ def _concatenate_join_units(join_units, concat_axis, copy, ignore_2d_ea=False): # 2D to put it a non-EA Block concat_values = np.atleast_2d(concat_values) else: - concat_values = concat_compat( - to_concat, axis=concat_axis, ignore_2d_ea=ignore_2d_ea - ) + concat_values = concat_compat(to_concat, axis=concat_axis,) return concat_values From a83a5fe7af7e15d94e540c0cd791d0b710a213c5 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 19 Jun 2020 14:35:33 -0500 Subject: [PATCH 4/5] cleanup --- pandas/core/internals/concat.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 367420ab0e624..94910c1314e32 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -376,7 +376,6 @@ def _get_empty_dtype_and_na(join_units): upcast_classes = defaultdict(list) null_upcast_classes = defaultdict(list) - for dtype, unit in zip(dtypes, join_units): if dtype is None: continue @@ -386,7 +385,6 @@ def _get_empty_dtype_and_na(join_units): elif is_datetime64tz_dtype(dtype): upcast_cls = "datetimetz" - # may need to move sparse back up elif is_extension_array_dtype(dtype): upcast_cls = "extension" From 13c1fe7e52043ab0a2ef5afbba94ce50e8253e7a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 22 Jun 2020 09:25:39 -0500 Subject: [PATCH 5/5] fixups --- pandas/core/internals/concat.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 94910c1314e32..2cc7461986c8f 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -266,7 +266,7 @@ def get_reindexed_values(self, empty_dtype, upcasted_na): ) ncols, nrows = self.shape assert ncols == 1, ncols - empty_arr = -1 * np.ones((nrows,), dtype="int8") + empty_arr = -1 * np.ones((nrows,), dtype=np.intp) return missing_arr.take( empty_arr, allow_fill=True, fill_value=fill_value ) @@ -413,16 +413,15 @@ def _get_empty_dtype_and_na(join_units): if not upcast_classes: upcast_classes = null_upcast_classes + # TODO: de-duplicate with maybe_promote? + # create the result if "extension" in upcast_classes: if len(upcast_classes) == 1: cls = upcast_classes["extension"][0] return cls, cls.na_value else: return np.dtype("object"), np.nan - - # TODO: de-duplicate with maybe_promote? - # create the result - if "object" in upcast_classes: + elif "object" in upcast_classes: return np.dtype(np.object_), np.nan elif "bool" in upcast_classes: if has_none_blocks: