Skip to content

Commit 1ac1391

Browse files
authored
BUG: concat losing columns dtypes for join=outer (#47586)
1 parent b81f431 commit 1ac1391

File tree

3 files changed

+42
-4
lines changed

3 files changed

+42
-4
lines changed

doc/source/whatsnew/v1.5.0.rst

+1
Original file line number | Diff line number | Diff line change
@@ -996,6 +996,7 @@ Reshaping
996996
- Bug in :func:`get_dummies` that selected object and categorical dtypes but not string (:issue:`44965`)
997997
- Bug in :meth:`DataFrame.align` when aligning a :class:`MultiIndex` to a :class:`Series` with another :class:`MultiIndex` (:issue:`46001`)
998998
- Bug in concatenation with ``IntegerDtype``, or ``FloatingDtype`` arrays where the resulting dtype did not mirror the behavior of the non-nullable dtypes (:issue:`46379`)
999+
- Bug in :func:`concat` losing dtype of columns when ``join="outer"`` and ``sort=True`` (:issue:`47329`)
9991000
- Bug in :func:`concat` not sorting the column names when ``None`` is included (:issue:`47331`)
10001001
- Bug in :func:`concat` with identical key leads to error when indexing :class:`MultiIndex` (:issue:`46519`)
10011002
- Bug in :meth:`DataFrame.join` with a list when using suffixes to join DataFrames with duplicate column names (:issue:`46396`)

pandas/core/indexes/api.py

+30-4
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,7 @@
1111
)
1212
from pandas.errors import InvalidIndexError
1313

14+
from pandas.core.dtypes.cast import find_common_type
1415
from pandas.core.dtypes.common import is_dtype_equal
1516

1617
from pandas.core.algorithms import safe_sort
@@ -223,7 +224,7 @@ def union_indexes(indexes, sort: bool | None = True) -> Index:
223224

224225
indexes, kind = _sanitize_and_check(indexes)
225226

226-
def _unique_indices(inds) -> Index:
227+
def _unique_indices(inds, dtype) -> Index:
227228
"""
228229
Convert indexes to lists and concatenate them, removing duplicates.
229230
@@ -243,7 +244,30 @@ def conv(i):
243244
i = i.tolist()
244245
return i
245246

246-
return Index(lib.fast_unique_multiple_list([conv(i) for i in inds], sort=sort))
247+
return Index(
248+
lib.fast_unique_multiple_list([conv(i) for i in inds], sort=sort),
249+
dtype=dtype,
250+
)
251+
252+
def _find_common_index_dtype(inds):
    """
    Find a common dtype for the indexes to pass through to the resulting index.

    Parameters
    ----------
    inds : list of Index or list objects
        Only ``Index`` entries contribute a dtype; any other entry is ignored.

    Returns
    -------
    The common dtype, or None if ``inds`` contains no Index objects.
    """
    # Read from the ``inds`` argument rather than relying on a variable of the
    # enclosing scope, so the helper depends only on what it is given.
    dtypes = [idx.dtype for idx in inds if isinstance(idx, Index)]
    if not dtypes:
        # No Index objects were given, so there is no dtype to pass through.
        return None
    return find_common_type(dtypes)
247271

248272
if kind == "special":
249273
result = indexes[0]
@@ -283,16 +307,18 @@ def conv(i):
283307
return result
284308

285309
elif kind == "array":
310+
dtype = _find_common_index_dtype(indexes)
286311
index = indexes[0]
287312
if not all(index.equals(other) for other in indexes[1:]):
288-
index = _unique_indices(indexes)
313+
index = _unique_indices(indexes, dtype)
289314

290315
name = get_unanimous_names(*indexes)[0]
291316
if name != index.name:
292317
index = index.rename(name)
293318
return index
294319
else: # kind='list'
295-
return _unique_indices(indexes)
320+
dtype = _find_common_index_dtype(indexes)
321+
return _unique_indices(indexes, dtype)
296322

297323

298324
def _sanitize_and_check(indexes):

pandas/tests/reshape/concat/test_index.py

+11
Original file line number | Diff line number | Diff line change
@@ -398,3 +398,14 @@ def test_concat_range_index_result(self):
398398
tm.assert_frame_equal(result, expected)
399399
expected_index = pd.RangeIndex(0, 2)
400400
tm.assert_index_equal(result.index, expected_index, exact=True)
401+
402+
@pytest.mark.parametrize("dtype", ["Int64", "object"])
403+
def test_concat_index_keep_dtype(self, dtype):
404+
# GH#47329
# Both inputs use a columns Index built with the parametrized dtype; the
# result of an outer-join concat with sort=True is expected to carry a
# columns Index of that same dtype.
405+
df1 = DataFrame([[0, 1, 1]], columns=Index([1, 2, 3], dtype=dtype))
406+
df2 = DataFrame([[0, 1]], columns=Index([1, 2], dtype=dtype))
407+
result = concat([df1, df2], ignore_index=True, join="outer", sort=True)
408+
expected = DataFrame(
409+
[[0, 1, 1.0], [0, 1, np.nan]], columns=Index([1, 2, 3], dtype=dtype)
410+
)
411+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)