Skip to content

Commit c5b8006

Browse files
jorisvandenbossche authored and wesbarnett committed
BUG: Fix concat of frames with extension types (no reindexed columns) (pandas-dev#34339)
1 parent e6e0889 commit c5b8006

File tree

7 files changed

+50
-12
lines changed

7 files changed

+50
-12
lines changed

doc/source/whatsnew/v1.1.0.rst

+3
Original file line numberDiff line numberDiff line change
@@ -292,6 +292,9 @@ Other enhancements
292292
- :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now accept an ``errors`` argument (:issue:`22610`)
293293
- :meth:`groupby.transform` now allows ``func`` to be ``pad``, ``backfill`` and ``cumcount`` (:issue:`31269`).
294294
- :meth:`~pandas.io.json.read_json` now accepts an ``nrows`` parameter (:issue:`33916`).
295+
- :func:`concat` and :meth:`~DataFrame.append` now preserve extension dtypes, for example
296+
combining a nullable integer column with a numpy integer column will no longer
297+
result in object dtype but preserve the integer dtype (:issue:`33607`, :issue:`34339`).
295298
- :meth:`~pandas.io.gbq.read_gbq` now allows disabling the progress bar (:issue:`33360`).
296299
- :meth:`~pandas.io.gbq.read_gbq` now supports the ``max_results`` kwarg from ``pandas-gbq`` (:issue:`34639`).
297300
- :meth:`DataFrame.to_html` and :meth:`DataFrame.to_string`'s ``col_space`` parameter now accepts a list or dict to change only some specific columns' width (:issue:`28917`).

pandas/core/arrays/integer.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -93,10 +93,14 @@ def construct_array_type(cls) -> Type["IntegerArray"]:
9393

9494
def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]:
9595
# for now only handle other integer types
96-
if not all(isinstance(t, _IntegerDtype) for t in dtypes):
96+
if not all(
97+
isinstance(t, _IntegerDtype)
98+
or (isinstance(t, np.dtype) and np.issubdtype(t, np.integer))
99+
for t in dtypes
100+
):
97101
return None
98102
np_dtype = np.find_common_type(
99-
[t.numpy_dtype for t in dtypes], [] # type: ignore
103+
[t.numpy_dtype if isinstance(t, BaseMaskedDtype) else t for t in dtypes], []
100104
)
101105
if np.issubdtype(np_dtype, np.integer):
102106
return _dtypes[str(np_dtype)]

pandas/core/dtypes/concat.py

+1-5
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ def is_nonempty(x) -> bool:
147147
single_dtype = len({x.dtype for x in to_concat}) == 1
148148
any_ea = any(is_extension_array_dtype(x.dtype) for x in to_concat)
149149

150-
if any_ea and axis == 0:
150+
if any_ea:
151151
if not single_dtype:
152152
target_dtype = find_common_type([x.dtype for x in to_concat])
153153
to_concat = [_cast_to_common_type(arr, target_dtype) for arr in to_concat]
@@ -161,10 +161,6 @@ def is_nonempty(x) -> bool:
161161
elif _contains_datetime or "timedelta" in typs:
162162
return concat_datetime(to_concat, axis=axis, typs=typs)
163163

164-
elif any_ea and axis == 1:
165-
to_concat = [np.atleast_2d(x.astype("object")) for x in to_concat]
166-
return np.concatenate(to_concat, axis=axis)
167-
168164
elif all_empty:
169165
# we have all empties, but may need to coerce the result dtype to
170166
# object if we have non-numeric type operands (numpy would otherwise

pandas/core/internals/concat.py

+10-1
Original file line numberDiff line numberDiff line change
@@ -319,6 +319,15 @@ def _concatenate_join_units(join_units, concat_axis, copy):
319319
concat_values = concat_values.copy()
320320
else:
321321
concat_values = concat_values.copy()
322+
elif any(isinstance(t, ExtensionArray) for t in to_concat):
323+
# concatting with at least one EA means we are concatting a single column
324+
# the non-EA values are 2D arrays with shape (1, n)
325+
to_concat = [t if isinstance(t, ExtensionArray) else t[0, :] for t in to_concat]
326+
concat_values = concat_compat(to_concat, axis=concat_axis)
327+
if not isinstance(concat_values, ExtensionArray):
328+
# if the result of concat is not an EA but an ndarray, reshape to
329+
# 2D to put it in a non-EA Block
330+
concat_values = np.atleast_2d(concat_values)
322331
else:
323332
concat_values = concat_compat(to_concat, axis=concat_axis)
324333

@@ -443,7 +452,7 @@ def _is_uniform_join_units(join_units: List[JoinUnit]) -> bool:
443452
# cannot necessarily join
444453
return (
445454
# all blocks need to have the same type
446-
all(isinstance(ju.block, type(join_units[0].block)) for ju in join_units)
455+
all(type(ju.block) is type(join_units[0].block) for ju in join_units)
447456
and # noqa
448457
# no blocks that would get missing values (can lead to type upcasts)
449458
# unless we're an extension dtype.

pandas/tests/extension/base/reshaping.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -62,11 +62,11 @@ def test_concat_mixed_dtypes(self, data):
6262
self.assert_series_equal(result, expected)
6363

6464
# simple test for just EA and one other
65-
result = pd.concat([df1, df2])
65+
result = pd.concat([df1, df2.astype(object)])
6666
expected = pd.concat([df1.astype("object"), df2.astype("object")])
6767
self.assert_frame_equal(result, expected)
6868

69-
result = pd.concat([df1["A"], df2["A"]])
69+
result = pd.concat([df1["A"], df2["A"].astype(object)])
7070
expected = pd.concat([df1["A"].astype("object"), df2["A"].astype("object")])
7171
self.assert_series_equal(result, expected)
7272

pandas/tests/indexing/test_indexing.py

+14-2
Original file line numberDiff line numberDiff line change
@@ -1006,12 +1006,24 @@ def test_extension_array_cross_section():
10061006

10071007

10081008
def test_extension_array_cross_section_converts():
1009+
# all numeric columns -> numeric series
10091010
df = pd.DataFrame(
1010-
{"A": pd.core.arrays.integer_array([1, 2]), "B": np.array([1, 2])},
1011+
{"A": pd.array([1, 2], dtype="Int64"), "B": np.array([1, 2])}, index=["a", "b"],
1012+
)
1013+
result = df.loc["a"]
1014+
expected = pd.Series([1, 1], dtype="Int64", index=["A", "B"], name="a")
1015+
tm.assert_series_equal(result, expected)
1016+
1017+
result = df.iloc[0]
1018+
tm.assert_series_equal(result, expected)
1019+
1020+
# mixed columns -> object series
1021+
df = pd.DataFrame(
1022+
{"A": pd.array([1, 2], dtype="Int64"), "B": np.array(["a", "b"])},
10111023
index=["a", "b"],
10121024
)
10131025
result = df.loc["a"]
1014-
expected = pd.Series([1, 1], dtype=object, index=["A", "B"], name="a")
1026+
expected = pd.Series([1, "a"], dtype=object, index=["A", "B"], name="a")
10151027
tm.assert_series_equal(result, expected)
10161028

10171029
result = df.iloc[0]

pandas/tests/reshape/test_concat.py

+14
Original file line numberDiff line numberDiff line change
@@ -2843,3 +2843,17 @@ def test_concat_preserves_subclass(obj):
28432843

28442844
result = concat([obj, obj])
28452845
assert isinstance(result, type(obj))
2846+
2847+
2848+
def test_concat_frame_axis0_extension_dtypes():
2849+
# preserve extension dtype (through common_dtype mechanism)
2850+
df1 = pd.DataFrame({"a": pd.array([1, 2, 3], dtype="Int64")})
2851+
df2 = pd.DataFrame({"a": np.array([4, 5, 6])})
2852+
2853+
result = pd.concat([df1, df2], ignore_index=True)
2854+
expected = pd.DataFrame({"a": [1, 2, 3, 4, 5, 6]}, dtype="Int64")
2855+
tm.assert_frame_equal(result, expected)
2856+
2857+
result = pd.concat([df2, df1], ignore_index=True)
2858+
expected = pd.DataFrame({"a": [4, 5, 6, 1, 2, 3]}, dtype="Int64")
2859+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)