Skip to content

BUG: Fixed concat with reindex and extension types #33522

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Jul 1, 2020
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1073,6 +1073,7 @@ ExtensionArray
^^^^^^^^^^^^^^

- Fixed bug where :meth:`Series.value_counts` would raise on empty input of ``Int64`` dtype (:issue:`33317`)
- Fixed bug in :func:`concat` when concatenating DataFrames with non-overlaping columns resulting in object-dtype columns rather than preserving the extension dtype (:issue:`27692`, :issue:`33027`)
- Fixed bug where :meth:`StringArray.isna` would return ``False`` for NA values when ``pandas.options.mode.use_inf_as_na`` was set to ``True`` (:issue:`33655`)
- Fixed bug in :class:`Series` construction with EA dtype and index but no data or scalar data fails (:issue:`26469`)
- Fixed bug that caused :meth:`Series.__repr__()` to crash for extension types whose elements are multidimensional arrays (:issue:`33770`).
Expand Down
31 changes: 24 additions & 7 deletions pandas/core/internals/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@


def concatenate_block_managers(
mgrs_indexers, axes, concat_axis: int, copy: bool
mgrs_indexers, axes, concat_axis: int, copy: bool,
) -> BlockManager:
"""
Concatenate block managers into one.
Expand Down Expand Up @@ -76,7 +76,7 @@ def concatenate_block_managers(
b = make_block(values, placement=placement, ndim=blk.ndim)
else:
b = make_block(
_concatenate_join_units(join_units, concat_axis, copy=copy),
_concatenate_join_units(join_units, concat_axis, copy=copy,),
placement=placement,
)
blocks.append(b)
Expand Down Expand Up @@ -260,6 +260,16 @@ def get_reindexed_values(self, empty_dtype, upcasted_na):
pass
elif getattr(self.block, "is_extension", False):
pass
elif is_extension_array_dtype(empty_dtype):
missing_arr = empty_dtype.construct_array_type()._from_sequence(
[], dtype=empty_dtype
)
ncols, nrows = self.shape
assert ncols == 1, ncols
empty_arr = -1 * np.ones((nrows,), dtype=np.intp)
return missing_arr.take(
empty_arr, allow_fill=True, fill_value=fill_value
)
else:
missing_arr = np.empty(self.shape, dtype=empty_dtype)
missing_arr.fill(fill_value)
Expand Down Expand Up @@ -329,7 +339,7 @@ def _concatenate_join_units(join_units, concat_axis, copy):
# 2D to put it a non-EA Block
concat_values = np.atleast_2d(concat_values)
else:
concat_values = concat_compat(to_concat, axis=concat_axis)
concat_values = concat_compat(to_concat, axis=concat_axis,)

return concat_values

Expand Down Expand Up @@ -374,6 +384,10 @@ def _get_empty_dtype_and_na(join_units):
upcast_cls = "category"
elif is_datetime64tz_dtype(dtype):
upcast_cls = "datetimetz"

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sort of dumb, but no blank lines before/after

elif is_extension_array_dtype(dtype):
upcast_cls = "extension"

elif issubclass(dtype.type, np.bool_):
upcast_cls = "bool"
elif issubclass(dtype.type, np.object_):
Expand All @@ -384,8 +398,6 @@ def _get_empty_dtype_and_na(join_units):
upcast_cls = "timedelta"
elif is_sparse(dtype):
upcast_cls = dtype.subtype.name
elif is_extension_array_dtype(dtype):
upcast_cls = "object"
elif is_float_dtype(dtype) or is_numeric_dtype(dtype):
upcast_cls = dtype.name
else:
Expand All @@ -401,10 +413,15 @@ def _get_empty_dtype_and_na(join_units):

if not upcast_classes:
upcast_classes = null_upcast_classes

# TODO: de-duplicate with maybe_promote?
# create the result
if "object" in upcast_classes:
if "extension" in upcast_classes:
if len(upcast_classes) == 1:
cls = upcast_classes["extension"][0]
return cls, cls.na_value
else:
return np.dtype("object"), np.nan
elif "object" in upcast_classes:
return np.dtype(np.object_), np.nan
elif "bool" in upcast_classes:
if has_none_blocks:
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/reshape/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -497,7 +497,7 @@ def get_result(self):
mgrs_indexers.append((obj._mgr, indexers))

new_data = concatenate_block_managers(
mgrs_indexers, self.new_axes, concat_axis=self.bm_axis, copy=self.copy
mgrs_indexers, self.new_axes, concat_axis=self.bm_axis, copy=self.copy,
)
if not self.copy:
new_data._consolidate_inplace()
Expand Down
13 changes: 13 additions & 0 deletions pandas/tests/extension/base/reshaping.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,19 @@ def test_concat_extension_arrays_copy_false(self, data, na_value):
result = pd.concat([df1, df2], axis=1, copy=False)
self.assert_frame_equal(result, expected)

def test_concat_with_reindex(self, data):
# GH-33027
a = pd.DataFrame({"a": data[:5]})
b = pd.DataFrame({"b": data[:5]})
result = pd.concat([a, b], ignore_index=True)
expected = pd.DataFrame(
{
"a": data.take(list(range(5)) + ([-1] * 5), allow_fill=True),
"b": data.take(([-1] * 5) + list(range(5)), allow_fill=True),
}
)
self.assert_frame_equal(result, expected)

def test_align(self, data, na_value):
a = data[:3]
b = data[2:5]
Expand Down
3 changes: 2 additions & 1 deletion pandas/tests/extension/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,8 @@ class TestConstructors(base.BaseConstructorsTests):


class TestReshaping(base.BaseReshapingTests):
pass
def test_concat_with_reindex(self, data):
pytest.xfail(reason="Deliberately upcast to object?")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we have an issue for this? (it seems we shouldn't cast to object when only needing to append NaNs?)



class TestGetitem(base.BaseGetitemTests):
Expand Down