Skip to content

Commit bf12604

Browse files
BUG: Fixed concat with reindex and extension types (#33522)
* BUG: Fixed concat with reindex and extension types Closes #27692 Closes #33027 * rebase * fixup * cleanup * fixups
1 parent d9a09ca commit bf12604

File tree

5 files changed

+41
-9
lines changed

5 files changed

+41
-9
lines changed

doc/source/whatsnew/v1.1.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -1123,6 +1123,7 @@ ExtensionArray
11231123
^^^^^^^^^^^^^^
11241124

11251125
- Fixed bug where :meth:`Series.value_counts` would raise on empty input of ``Int64`` dtype (:issue:`33317`)
1126+
- Fixed bug in :func:`concat` when concatenating DataFrames with non-overlapping columns resulting in object-dtype columns rather than preserving the extension dtype (:issue:`27692`, :issue:`33027`)
11261127
- Fixed bug where :meth:`StringArray.isna` would return ``False`` for NA values when ``pandas.options.mode.use_inf_as_na`` was set to ``True`` (:issue:`33655`)
11271128
- Fixed bug where :class:`Series` construction with an EA dtype and an index, but with no data or only scalar data, would fail (:issue:`26469`)
11281129
- Fixed bug that caused :meth:`Series.__repr__()` to crash for extension types whose elements are multidimensional arrays (:issue:`33770`).

pandas/core/internals/concat.py

+24-7
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929

3030

3131
def concatenate_block_managers(
32-
mgrs_indexers, axes, concat_axis: int, copy: bool
32+
mgrs_indexers, axes, concat_axis: int, copy: bool,
3333
) -> BlockManager:
3434
"""
3535
Concatenate block managers into one.
@@ -76,7 +76,7 @@ def concatenate_block_managers(
7676
b = make_block(values, placement=placement, ndim=blk.ndim)
7777
else:
7878
b = make_block(
79-
_concatenate_join_units(join_units, concat_axis, copy=copy),
79+
_concatenate_join_units(join_units, concat_axis, copy=copy,),
8080
placement=placement,
8181
)
8282
blocks.append(b)
@@ -260,6 +260,16 @@ def get_reindexed_values(self, empty_dtype, upcasted_na):
260260
pass
261261
elif getattr(self.block, "is_extension", False):
262262
pass
263+
elif is_extension_array_dtype(empty_dtype):
264+
missing_arr = empty_dtype.construct_array_type()._from_sequence(
265+
[], dtype=empty_dtype
266+
)
267+
ncols, nrows = self.shape
268+
assert ncols == 1, ncols
269+
empty_arr = -1 * np.ones((nrows,), dtype=np.intp)
270+
return missing_arr.take(
271+
empty_arr, allow_fill=True, fill_value=fill_value
272+
)
263273
else:
264274
missing_arr = np.empty(self.shape, dtype=empty_dtype)
265275
missing_arr.fill(fill_value)
@@ -329,7 +339,7 @@ def _concatenate_join_units(join_units, concat_axis, copy):
329339
# 2D to put it a non-EA Block
330340
concat_values = np.atleast_2d(concat_values)
331341
else:
332-
concat_values = concat_compat(to_concat, axis=concat_axis)
342+
concat_values = concat_compat(to_concat, axis=concat_axis,)
333343

334344
return concat_values
335345

@@ -374,6 +384,10 @@ def _get_empty_dtype_and_na(join_units):
374384
upcast_cls = "category"
375385
elif is_datetime64tz_dtype(dtype):
376386
upcast_cls = "datetimetz"
387+
388+
elif is_extension_array_dtype(dtype):
389+
upcast_cls = "extension"
390+
377391
elif issubclass(dtype.type, np.bool_):
378392
upcast_cls = "bool"
379393
elif issubclass(dtype.type, np.object_):
@@ -384,8 +398,6 @@ def _get_empty_dtype_and_na(join_units):
384398
upcast_cls = "timedelta"
385399
elif is_sparse(dtype):
386400
upcast_cls = dtype.subtype.name
387-
elif is_extension_array_dtype(dtype):
388-
upcast_cls = "object"
389401
elif is_float_dtype(dtype) or is_numeric_dtype(dtype):
390402
upcast_cls = dtype.name
391403
else:
@@ -401,10 +413,15 @@ def _get_empty_dtype_and_na(join_units):
401413

402414
if not upcast_classes:
403415
upcast_classes = null_upcast_classes
404-
405416
# TODO: de-duplicate with maybe_promote?
406417
# create the result
407-
if "object" in upcast_classes:
418+
if "extension" in upcast_classes:
419+
if len(upcast_classes) == 1:
420+
cls = upcast_classes["extension"][0]
421+
return cls, cls.na_value
422+
else:
423+
return np.dtype("object"), np.nan
424+
elif "object" in upcast_classes:
408425
return np.dtype(np.object_), np.nan
409426
elif "bool" in upcast_classes:
410427
if has_none_blocks:

pandas/core/reshape/concat.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -500,7 +500,7 @@ def get_result(self):
500500
mgrs_indexers.append((obj._mgr, indexers))
501501

502502
new_data = concatenate_block_managers(
503-
mgrs_indexers, self.new_axes, concat_axis=self.bm_axis, copy=self.copy
503+
mgrs_indexers, self.new_axes, concat_axis=self.bm_axis, copy=self.copy,
504504
)
505505
if not self.copy:
506506
new_data._consolidate_inplace()

pandas/tests/extension/base/reshaping.py

+13
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,19 @@ def test_concat_extension_arrays_copy_false(self, data, na_value):
107107
result = pd.concat([df1, df2], axis=1, copy=False)
108108
self.assert_frame_equal(result, expected)
109109

110+
def test_concat_with_reindex(self, data):
111+
# GH-33027
112+
a = pd.DataFrame({"a": data[:5]})
113+
b = pd.DataFrame({"b": data[:5]})
114+
result = pd.concat([a, b], ignore_index=True)
115+
expected = pd.DataFrame(
116+
{
117+
"a": data.take(list(range(5)) + ([-1] * 5), allow_fill=True),
118+
"b": data.take(([-1] * 5) + list(range(5)), allow_fill=True),
119+
}
120+
)
121+
self.assert_frame_equal(result, expected)
122+
110123
def test_align(self, data, na_value):
111124
a = data[:3]
112125
b = data[2:5]

pandas/tests/extension/test_categorical.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,8 @@ class TestConstructors(base.BaseConstructorsTests):
9393

9494

9595
class TestReshaping(base.BaseReshapingTests):
96-
pass
96+
def test_concat_with_reindex(self, data):
97+
pytest.xfail(reason="Deliberately upcast to object?")
9798

9899

99100
class TestGetitem(base.BaseGetitemTests):

0 commit comments

Comments
 (0)