diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 567b6853bd633..0476c34db3773 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1073,6 +1073,7 @@ ExtensionArray ^^^^^^^^^^^^^^ - Fixed bug where :meth:`Series.value_counts` would raise on empty input of ``Int64`` dtype (:issue:`33317`) +- Fixed bug in :func:`concat` when concatenating DataFrames with non-overlaping columns resulting in object-dtype columns rather than preserving the extension dtype (:issue:`27692`, :issue:`33027`) - Fixed bug where :meth:`StringArray.isna` would return ``False`` for NA values when ``pandas.options.mode.use_inf_as_na`` was set to ``True`` (:issue:`33655`) - Fixed bug in :class:`Series` construction with EA dtype and index but no data or scalar data fails (:issue:`26469`) - Fixed bug that caused :meth:`Series.__repr__()` to crash for extension types whose elements are multidimensional arrays (:issue:`33770`). diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index fd8c5f5e27c02..2cc7461986c8f 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -29,7 +29,7 @@ def concatenate_block_managers( - mgrs_indexers, axes, concat_axis: int, copy: bool + mgrs_indexers, axes, concat_axis: int, copy: bool, ) -> BlockManager: """ Concatenate block managers into one. @@ -76,7 +76,7 @@ def concatenate_block_managers( b = make_block(values, placement=placement, ndim=blk.ndim) else: b = make_block( - _concatenate_join_units(join_units, concat_axis, copy=copy), + _concatenate_join_units(join_units, concat_axis, copy=copy,), placement=placement, ) blocks.append(b) @@ -260,6 +260,16 @@ def get_reindexed_values(self, empty_dtype, upcasted_na): pass elif getattr(self.block, "is_extension", False): pass + elif is_extension_array_dtype(empty_dtype): + missing_arr = empty_dtype.construct_array_type()._from_sequence( + [], dtype=empty_dtype + ) + ncols, nrows = self.shape + assert ncols == 1, ncols + empty_arr = -1 * np.ones((nrows,), dtype=np.intp) + return missing_arr.take( + empty_arr, allow_fill=True, fill_value=fill_value + ) else: missing_arr = np.empty(self.shape, dtype=empty_dtype) missing_arr.fill(fill_value) @@ -329,7 +339,7 @@ def _concatenate_join_units(join_units, concat_axis, copy): # 2D to put it a non-EA Block concat_values = np.atleast_2d(concat_values) else: - concat_values = concat_compat(to_concat, axis=concat_axis) + concat_values = concat_compat(to_concat, axis=concat_axis,) return concat_values @@ -374,6 +384,10 @@ def _get_empty_dtype_and_na(join_units): upcast_cls = "category" elif is_datetime64tz_dtype(dtype): upcast_cls = "datetimetz" + + elif is_extension_array_dtype(dtype): + upcast_cls = "extension" + elif issubclass(dtype.type, np.bool_): upcast_cls = "bool" elif issubclass(dtype.type, np.object_): @@ -384,8 +398,6 @@ def _get_empty_dtype_and_na(join_units): upcast_cls = "timedelta" elif is_sparse(dtype): upcast_cls = dtype.subtype.name - elif is_extension_array_dtype(dtype): - upcast_cls = "object" elif is_float_dtype(dtype) or is_numeric_dtype(dtype): upcast_cls = dtype.name else: @@ -401,10 +413,15 @@ def _get_empty_dtype_and_na(join_units): if not upcast_classes: upcast_classes = null_upcast_classes - # TODO: de-duplicate with maybe_promote? # create the result - if "object" in upcast_classes: + if "extension" in upcast_classes: + if len(upcast_classes) == 1: + cls = upcast_classes["extension"][0] + return cls, cls.na_value + else: + return np.dtype("object"), np.nan + elif "object" in upcast_classes: return np.dtype(np.object_), np.nan elif "bool" in upcast_classes: if has_none_blocks: diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index db7e9265ac21d..3a7ad61694224 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -497,7 +497,7 @@ def get_result(self): mgrs_indexers.append((obj._mgr, indexers)) new_data = concatenate_block_managers( - mgrs_indexers, self.new_axes, concat_axis=self.bm_axis, copy=self.copy + mgrs_indexers, self.new_axes, concat_axis=self.bm_axis, copy=self.copy, ) if not self.copy: new_data._consolidate_inplace() diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index cd932e842e00c..3774e018a8e51 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -107,6 +107,19 @@ def test_concat_extension_arrays_copy_false(self, data, na_value): result = pd.concat([df1, df2], axis=1, copy=False) self.assert_frame_equal(result, expected) + def test_concat_with_reindex(self, data): + # GH-33027 + a = pd.DataFrame({"a": data[:5]}) + b = pd.DataFrame({"b": data[:5]}) + result = pd.concat([a, b], ignore_index=True) + expected = pd.DataFrame( + { + "a": data.take(list(range(5)) + ([-1] * 5), allow_fill=True), + "b": data.take(([-1] * 5) + list(range(5)), allow_fill=True), + } + ) + self.assert_frame_equal(result, expected) + def test_align(self, data, na_value): a = data[:3] b = data[2:5] diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index d1211e477fe3e..f7b572a70073a 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -93,7 +93,8 @@ class TestConstructors(base.BaseConstructorsTests): class TestReshaping(base.BaseReshapingTests): - pass + def test_concat_with_reindex(self, data): + pytest.xfail(reason="Deliberately upcast to object?") class TestGetitem(base.BaseGetitemTests):