Skip to content

BUG: Fixed concat with reindex and extension types #33522

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Jul 1, 2020
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -583,7 +583,7 @@ ExtensionArray
^^^^^^^^^^^^^^

- Fixed bug where :meth:`Series.value_counts` would raise on empty input of ``Int64`` dtype (:issue:`33317`)
-
- Fixed bug in :func:`concat` when concatenating DataFrames with non-overlapping columns resulting in object-dtype columns rather than preserving the extension dtype (:issue:`27692`, :issue:`33027`)


Other
Expand Down
8 changes: 6 additions & 2 deletions pandas/core/dtypes/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def get_dtype_kinds(l):
return typs


def concat_compat(to_concat, axis: int = 0):
def concat_compat(to_concat, axis: int = 0, ignore_2d_ea: bool = False):
"""
provide concatenation of an array of arrays each of which is a single
'normalized' dtypes (in that for example, if it's object, then it is a
Expand Down Expand Up @@ -122,7 +122,11 @@ def is_nonempty(x) -> bool:
any_ea = any(is_extension_array_dtype(x.dtype) for x in to_concat)

if any_ea and axis == 1:
to_concat = [np.atleast_2d(x.astype("object")) for x in to_concat]
if single_dtype and ignore_2d_ea:
cls = type(to_concat[0])
return cls._concat_same_type(to_concat)
else:
to_concat = [np.atleast_2d(x.astype("object")) for x in to_concat]

elif any_ea and single_dtype and axis == 0:
cls = type(to_concat[0])
Expand Down
36 changes: 30 additions & 6 deletions pandas/core/internals/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@


def concatenate_block_managers(
mgrs_indexers, axes, concat_axis: int, copy: bool
mgrs_indexers, axes, concat_axis: int, copy: bool, ignore_2d_ea: bool = False,
) -> BlockManager:
"""
Concatenate block managers into one.
Expand Down Expand Up @@ -65,7 +65,9 @@ def concatenate_block_managers(
b.mgr_locs = placement
else:
b = make_block(
_concatenate_join_units(join_units, concat_axis, copy=copy),
_concatenate_join_units(
join_units, concat_axis, copy=copy, ignore_2d_ea=ignore_2d_ea
),
placement=placement,
)
blocks.append(b)
Expand Down Expand Up @@ -247,6 +249,16 @@ def get_reindexed_values(self, empty_dtype, upcasted_na):
pass
elif getattr(self.block, "is_extension", False):
pass
elif is_extension_array_dtype(empty_dtype):
missing_arr = empty_dtype.construct_array_type()._from_sequence(
[], dtype=empty_dtype
)
ncols, nrows = self.shape
assert ncols == 1, ncols
empty_arr = -1 * np.ones((nrows,), dtype="int8")
return missing_arr.take(
empty_arr, allow_fill=True, fill_value=fill_value
)
else:
missing_arr = np.empty(self.shape, dtype=empty_dtype)
missing_arr.fill(fill_value)
Expand Down Expand Up @@ -280,7 +292,7 @@ def get_reindexed_values(self, empty_dtype, upcasted_na):
return values


def _concatenate_join_units(join_units, concat_axis, copy):
def _concatenate_join_units(join_units, concat_axis, copy, ignore_2d_ea=False):
"""
Concatenate values from several join units along selected axis.
"""
Expand All @@ -307,7 +319,9 @@ def _concatenate_join_units(join_units, concat_axis, copy):
else:
concat_values = concat_values.copy()
else:
concat_values = concat_compat(to_concat, axis=concat_axis)
concat_values = concat_compat(
to_concat, axis=concat_axis, ignore_2d_ea=ignore_2d_ea
)

return concat_values

Expand Down Expand Up @@ -344,6 +358,7 @@ def _get_empty_dtype_and_na(join_units):

upcast_classes = defaultdict(list)
null_upcast_classes = defaultdict(list)

for dtype, unit in zip(dtypes, join_units):
if dtype is None:
continue
Expand All @@ -352,6 +367,11 @@ def _get_empty_dtype_and_na(join_units):
upcast_cls = "category"
elif is_datetime64tz_dtype(dtype):
upcast_cls = "datetimetz"

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sort of dumb, but no blank lines before/after

# may need to move sparse back up
elif is_extension_array_dtype(dtype):
upcast_cls = "extension"

elif issubclass(dtype.type, np.bool_):
upcast_cls = "bool"
elif issubclass(dtype.type, np.object_):
Expand All @@ -362,8 +382,6 @@ def _get_empty_dtype_and_na(join_units):
upcast_cls = "timedelta"
elif is_sparse(dtype):
upcast_cls = dtype.subtype.name
elif is_extension_array_dtype(dtype):
upcast_cls = "object"
elif is_float_dtype(dtype) or is_numeric_dtype(dtype):
upcast_cls = dtype.name
else:
Expand All @@ -379,6 +397,12 @@ def _get_empty_dtype_and_na(join_units):

if not upcast_classes:
upcast_classes = null_upcast_classes
if "extension" in upcast_classes:
if len(upcast_classes) == 1:
cls = upcast_classes["extension"][0]
return cls, cls.na_value
else:
return np.dtype("object"), np.nan

# TODO: de-duplicate with maybe_promote?
# create the result
Expand Down
6 changes: 5 additions & 1 deletion pandas/core/reshape/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -495,7 +495,11 @@ def get_result(self):
mgrs_indexers.append((obj._mgr, indexers))

new_data = concatenate_block_managers(
mgrs_indexers, self.new_axes, concat_axis=self.bm_axis, copy=self.copy
mgrs_indexers,
self.new_axes,
concat_axis=self.bm_axis,
copy=self.copy,
ignore_2d_ea=self.bm_axis == 1 and self._is_frame,
)
if not self.copy:
new_data._consolidate_inplace()
Expand Down
13 changes: 13 additions & 0 deletions pandas/tests/extension/base/reshaping.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,19 @@ def test_concat_extension_arrays_copy_false(self, data, na_value):
result = pd.concat([df1, df2], axis=1, copy=False)
self.assert_frame_equal(result, expected)

def test_concat_with_reindex(self, data):
# GH-33027
# Check that row-wise concat of frames with disjoint column names keeps
# the extension values and fills the absent rows with missing values.
# Each frame holds the first five elements of ``data`` under its own column.
a = pd.DataFrame({"a": data[:5]})
b = pd.DataFrame({"b": data[:5]})
# Default axis, so the frames are stacked; "a" is absent from ``b`` and
# "b" is absent from ``a``.
result = pd.concat([a, b], ignore_index=True)
# Expected columns are built with ``take``: a -1 index combined with
# allow_fill=True marks a filled (missing) slot, so "a" is five values
# followed by five fills and "b" is the mirror image.
expected = pd.DataFrame(
{
"a": data.take(list(range(5)) + ([-1] * 5), allow_fill=True),
"b": data.take(([-1] * 5) + list(range(5)), allow_fill=True),
}
)
self.assert_frame_equal(result, expected)

def test_align(self, data, na_value):
a = data[:3]
b = data[2:5]
Expand Down
3 changes: 2 additions & 1 deletion pandas/tests/extension/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,8 @@ class TestConstructors(base.BaseConstructorsTests):


class TestReshaping(base.BaseReshapingTests):
pass
def test_concat_with_reindex(self, data):
pytest.xfail(reason="Deliberate?")


class TestGetitem(base.BaseGetitemTests):
Expand Down