Skip to content

Commit bd316c3

Browse files
committed
BUG: Fixed concat with reindex and extension types
Closes pandas-dev#27692 Closes pandas-dev#33027
1 parent 70ca246 commit bd316c3

File tree

6 files changed

+57
-11
lines changed

6 files changed

+57
-11
lines changed

doc/source/whatsnew/v1.1.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -583,7 +583,7 @@ ExtensionArray
583583
^^^^^^^^^^^^^^
584584

585585
- Fixed bug where :meth:`Serires.value_counts` would raise on empty input of ``Int64`` dtype (:issue:`33317`)
586-
-
586+
- Fixed bug in :func:`concat` when concatenating DataFrames with non-overlaping columns resulting in object-dtype columns rather than preserving the extension dtype (:issue:`27692`, :issue:`33027`)
587587

588588

589589
Other

pandas/core/dtypes/concat.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ def get_dtype_kinds(l):
6868
return typs
6969

7070

71-
def concat_compat(to_concat, axis: int = 0):
71+
def concat_compat(to_concat, axis: int = 0, ignore_2d_ea: bool = False):
7272
"""
7373
provide concatenation of an array of arrays each of which is a single
7474
'normalized' dtypes (in that for example, if it's object, then it is a
@@ -122,7 +122,11 @@ def is_nonempty(x) -> bool:
122122
any_ea = any(is_extension_array_dtype(x.dtype) for x in to_concat)
123123

124124
if any_ea and axis == 1:
125-
to_concat = [np.atleast_2d(x.astype("object")) for x in to_concat]
125+
if single_dtype and ignore_2d_ea:
126+
cls = type(to_concat[0])
127+
return cls._concat_same_type(to_concat)
128+
else:
129+
to_concat = [np.atleast_2d(x.astype("object")) for x in to_concat]
126130

127131
elif any_ea and single_dtype and axis == 0:
128132
cls = type(to_concat[0])

pandas/core/internals/concat.py

+30-6
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828

2929

3030
def concatenate_block_managers(
31-
mgrs_indexers, axes, concat_axis: int, copy: bool
31+
mgrs_indexers, axes, concat_axis: int, copy: bool, ignore_2d_ea: bool = False,
3232
) -> BlockManager:
3333
"""
3434
Concatenate block managers into one.
@@ -65,7 +65,9 @@ def concatenate_block_managers(
6565
b.mgr_locs = placement
6666
else:
6767
b = make_block(
68-
_concatenate_join_units(join_units, concat_axis, copy=copy),
68+
_concatenate_join_units(
69+
join_units, concat_axis, copy=copy, ignore_2d_ea=ignore_2d_ea
70+
),
6971
placement=placement,
7072
)
7173
blocks.append(b)
@@ -247,6 +249,16 @@ def get_reindexed_values(self, empty_dtype, upcasted_na):
247249
pass
248250
elif getattr(self.block, "is_extension", False):
249251
pass
252+
elif is_extension_array_dtype(empty_dtype):
253+
missing_arr = empty_dtype.construct_array_type()._from_sequence(
254+
[], dtype=empty_dtype
255+
)
256+
ncols, nrows = self.shape
257+
assert ncols == 1, ncols
258+
empty_arr = -1 * np.ones((nrows,), dtype="int8")
259+
return missing_arr.take(
260+
empty_arr, allow_fill=True, fill_value=fill_value
261+
)
250262
else:
251263
missing_arr = np.empty(self.shape, dtype=empty_dtype)
252264
missing_arr.fill(fill_value)
@@ -280,7 +292,7 @@ def get_reindexed_values(self, empty_dtype, upcasted_na):
280292
return values
281293

282294

283-
def _concatenate_join_units(join_units, concat_axis, copy):
295+
def _concatenate_join_units(join_units, concat_axis, copy, ignore_2d_ea=False):
284296
"""
285297
Concatenate values from several join units along selected axis.
286298
"""
@@ -307,7 +319,9 @@ def _concatenate_join_units(join_units, concat_axis, copy):
307319
else:
308320
concat_values = concat_values.copy()
309321
else:
310-
concat_values = concat_compat(to_concat, axis=concat_axis)
322+
concat_values = concat_compat(
323+
to_concat, axis=concat_axis, ignore_2d_ea=ignore_2d_ea
324+
)
311325

312326
return concat_values
313327

@@ -344,6 +358,7 @@ def _get_empty_dtype_and_na(join_units):
344358

345359
upcast_classes = defaultdict(list)
346360
null_upcast_classes = defaultdict(list)
361+
347362
for dtype, unit in zip(dtypes, join_units):
348363
if dtype is None:
349364
continue
@@ -352,6 +367,11 @@ def _get_empty_dtype_and_na(join_units):
352367
upcast_cls = "category"
353368
elif is_datetime64tz_dtype(dtype):
354369
upcast_cls = "datetimetz"
370+
371+
# may need to move sparse back up
372+
elif is_extension_array_dtype(dtype):
373+
upcast_cls = "extension"
374+
355375
elif issubclass(dtype.type, np.bool_):
356376
upcast_cls = "bool"
357377
elif issubclass(dtype.type, np.object_):
@@ -362,8 +382,6 @@ def _get_empty_dtype_and_na(join_units):
362382
upcast_cls = "timedelta"
363383
elif is_sparse(dtype):
364384
upcast_cls = dtype.subtype.name
365-
elif is_extension_array_dtype(dtype):
366-
upcast_cls = "object"
367385
elif is_float_dtype(dtype) or is_numeric_dtype(dtype):
368386
upcast_cls = dtype.name
369387
else:
@@ -379,6 +397,12 @@ def _get_empty_dtype_and_na(join_units):
379397

380398
if not upcast_classes:
381399
upcast_classes = null_upcast_classes
400+
if "extension" in upcast_classes:
401+
if len(upcast_classes) == 1:
402+
cls = upcast_classes["extension"][0]
403+
return cls, cls.na_value
404+
else:
405+
return np.dtype("object"), np.nan
382406

383407
# TODO: de-duplicate with maybe_promote?
384408
# create the result

pandas/core/reshape/concat.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -495,7 +495,11 @@ def get_result(self):
495495
mgrs_indexers.append((obj._mgr, indexers))
496496

497497
new_data = concatenate_block_managers(
498-
mgrs_indexers, self.new_axes, concat_axis=self.bm_axis, copy=self.copy
498+
mgrs_indexers,
499+
self.new_axes,
500+
concat_axis=self.bm_axis,
501+
copy=self.copy,
502+
ignore_2d_ea=self.bm_axis == 1 and self._is_frame,
499503
)
500504
if not self.copy:
501505
new_data._consolidate_inplace()

pandas/tests/extension/base/reshaping.py

+13
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,19 @@ def test_concat_extension_arrays_copy_false(self, data, na_value):
107107
result = pd.concat([df1, df2], axis=1, copy=False)
108108
self.assert_frame_equal(result, expected)
109109

110+
def test_concat_with_reindex(self, data):
111+
# GH-33027
112+
a = pd.DataFrame({"a": data[:5]})
113+
b = pd.DataFrame({"b": data[:5]})
114+
result = pd.concat([a, b], ignore_index=True)
115+
expected = pd.DataFrame(
116+
{
117+
"a": data.take(list(range(5)) + ([-1] * 5), allow_fill=True),
118+
"b": data.take(([-1] * 5) + list(range(5)), allow_fill=True),
119+
}
120+
)
121+
self.assert_frame_equal(result, expected)
122+
110123
def test_align(self, data, na_value):
111124
a = data[:3]
112125
b = data[2:5]

pandas/tests/extension/test_categorical.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,8 @@ class TestConstructors(base.BaseConstructorsTests):
9393

9494

9595
class TestReshaping(base.BaseReshapingTests):
96-
pass
96+
def test_concat_with_reindex(self, data):
97+
pytest.xfail(reason="Deliberate?")
9798

9899

99100
class TestGetitem(base.BaseGetitemTests):

0 commit comments

Comments
 (0)