Skip to content

Commit bcf2406

Browse files
authored
BUG: DataFrame.append with timedelta64 (#39574)
1 parent 934fc81 commit bcf2406

File tree

5 files changed

+103
-132
lines changed

5 files changed

+103
-132
lines changed

doc/source/whatsnew/v1.3.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -431,6 +431,7 @@ Reshaping
431431
- Bug in :meth:`DataFrame.apply` that would give incorrect results when used with a string argument and an unsupported ``axis=1``; it now raises a ``ValueError`` instead (:issue:`39211`)
432432
- Bug in :meth:`DataFrame.sort_values` not reshaping index correctly after sorting on columns, when ``ignore_index=True`` (:issue:`39464`)
433433
- Bug in :meth:`DataFrame.append` returning incorrect dtypes with combinations of ``ExtensionDtype`` dtypes (:issue:`39454`)
434+
- Bug in :meth:`DataFrame.append` returning incorrect dtypes with combinations of ``datetime64`` and ``timedelta64`` dtypes (:issue:`39574`)
434435

435436
Sparse
436437
^^^^^^

pandas/core/dtypes/concat.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike:
6161
return arr.astype(dtype, copy=False)
6262

6363

64-
def concat_compat(to_concat, axis: int = 0):
64+
def concat_compat(to_concat, axis: int = 0, ea_compat_axis: bool = False):
6565
"""
6666
provide concatenation of an array of arrays each of which is a single
6767
'normalized' dtypes (in that for example, if it's object, then it is a
@@ -72,6 +72,9 @@ def concat_compat(to_concat, axis: int = 0):
7272
----------
7373
to_concat : array of arrays
7474
axis : axis to provide concatenation
75+
ea_compat_axis : bool, default False
76+
For ExtensionArray compat, behave as if axis == 1 when determining
77+
whether to drop empty arrays.
7578
7679
Returns
7780
-------
@@ -91,7 +94,8 @@ def is_nonempty(x) -> bool:
9194
# marginal given that it would still require shape & dtype calculation and
9295
# np.concatenate which has them both implemented is compiled.
9396
non_empties = [x for x in to_concat if is_nonempty(x)]
94-
if non_empties and axis == 0:
97+
if non_empties and axis == 0 and not ea_compat_axis:
98+
# ea_compat_axis see GH#39574
9599
to_concat = non_empties
96100

97101
kinds = {obj.dtype.kind for obj in to_concat}

pandas/core/internals/concat.py

+44-118
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,8 @@
11
from __future__ import annotations
22

3-
from collections import defaultdict
43
import copy
54
import itertools
6-
from typing import TYPE_CHECKING, Dict, List, Sequence, cast
5+
from typing import TYPE_CHECKING, Dict, List, Sequence
76

87
import numpy as np
98

@@ -14,16 +13,13 @@
1413
from pandas.core.dtypes.cast import ensure_dtype_can_hold_na, find_common_type
1514
from pandas.core.dtypes.common import (
1615
is_categorical_dtype,
17-
is_datetime64_dtype,
1816
is_datetime64tz_dtype,
17+
is_dtype_equal,
1918
is_extension_array_dtype,
20-
is_float_dtype,
21-
is_numeric_dtype,
2219
is_sparse,
23-
is_timedelta64_dtype,
2420
)
2521
from pandas.core.dtypes.concat import concat_compat
26-
from pandas.core.dtypes.missing import isna_all
22+
from pandas.core.dtypes.missing import is_valid_na_for_dtype, isna_all
2723

2824
import pandas.core.algorithms as algos
2925
from pandas.core.arrays import DatetimeArray, ExtensionArray
@@ -33,7 +29,6 @@
3329

3430
if TYPE_CHECKING:
3531
from pandas import Index
36-
from pandas.core.arrays.sparse.dtype import SparseDtype
3732

3833

3934
def concatenate_block_managers(
@@ -232,6 +227,29 @@ def dtype(self):
232227
return blk.dtype
233228
return ensure_dtype_can_hold_na(blk.dtype)
234229

230+
def is_valid_na_for(self, dtype: DtypeObj) -> bool:
231+
"""
232+
Check that we are all-NA of a type/dtype that is compatible with this dtype.
233+
Augments `self.is_na` with an additional check of the type of NA values.
234+
"""
235+
if not self.is_na:
236+
return False
237+
if self.block is None:
238+
return True
239+
240+
if self.dtype == object:
241+
values = self.block.values
242+
return all(is_valid_na_for_dtype(x, dtype) for x in values.ravel(order="K"))
243+
244+
if self.dtype.kind == dtype.kind == "M" and not is_dtype_equal(
245+
self.dtype, dtype
246+
):
247+
# fill_values match but we should not cast self.block.values to dtype
248+
return False
249+
250+
na_value = self.block.fill_value
251+
return is_valid_na_for_dtype(na_value, dtype)
252+
235253
@cache_readonly
236254
def is_na(self) -> bool:
237255
if self.block is None:
@@ -262,7 +280,7 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
262280
else:
263281
fill_value = upcasted_na
264282

265-
if self.is_na:
283+
if self.is_valid_na_for(empty_dtype):
266284
blk_dtype = getattr(self.block, "dtype", None)
267285

268286
if blk_dtype == np.dtype(object):
@@ -276,10 +294,9 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
276294
if is_datetime64tz_dtype(blk_dtype) or is_datetime64tz_dtype(
277295
empty_dtype
278296
):
279-
if self.block is None:
280-
# TODO(EA2D): special case unneeded with 2D EAs
281-
i8values = np.full(self.shape[1], fill_value.value)
282-
return DatetimeArray(i8values, dtype=empty_dtype)
297+
# TODO(EA2D): special case unneeded with 2D EAs
298+
i8values = np.full(self.shape[1], fill_value.value)
299+
return DatetimeArray(i8values, dtype=empty_dtype)
283300
elif is_categorical_dtype(blk_dtype):
284301
pass
285302
elif is_extension_array_dtype(blk_dtype):
@@ -295,6 +312,8 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
295312
empty_arr, allow_fill=True, fill_value=fill_value
296313
)
297314
else:
315+
# NB: we should never get here with empty_dtype integer or bool;
316+
# if we did, the missing_arr.fill would cast to gibberish
298317
missing_arr = np.empty(self.shape, dtype=empty_dtype)
299318
missing_arr.fill(fill_value)
300319
return missing_arr
@@ -362,14 +381,12 @@ def _concatenate_join_units(
362381
# concatting with at least one EA means we are concatting a single column
363382
# the non-EA values are 2D arrays with shape (1, n)
364383
to_concat = [t if isinstance(t, ExtensionArray) else t[0, :] for t in to_concat]
365-
concat_values = concat_compat(to_concat, axis=0)
366-
if not isinstance(concat_values, ExtensionArray) or (
367-
isinstance(concat_values, DatetimeArray) and concat_values.tz is None
368-
):
384+
concat_values = concat_compat(to_concat, axis=0, ea_compat_axis=True)
385+
if not is_extension_array_dtype(concat_values.dtype):
369386
# if the result of concat is not an EA but an ndarray, reshape to
370387
# 2D to put it a non-EA Block
371-
# special case DatetimeArray, which *is* an EA, but is put in a
372-
# consolidated 2D block
388+
# special case DatetimeArray/TimedeltaArray, which *is* an EA, but
389+
# is put in a consolidated 2D block
373390
concat_values = np.atleast_2d(concat_values)
374391
else:
375392
concat_values = concat_compat(to_concat, axis=concat_axis)
@@ -419,108 +436,17 @@ def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> DtypeObj:
419436
return empty_dtype
420437

421438
has_none_blocks = any(unit.block is None for unit in join_units)
422-
dtypes = [None if unit.block is None else unit.dtype for unit in join_units]
423439

424-
filtered_dtypes = [
440+
dtypes = [
425441
unit.dtype for unit in join_units if unit.block is not None and not unit.is_na
426442
]
427-
if not len(filtered_dtypes):
428-
filtered_dtypes = [unit.dtype for unit in join_units if unit.block is not None]
429-
dtype_alt = find_common_type(filtered_dtypes)
430-
431-
upcast_classes = _get_upcast_classes(join_units, dtypes)
432-
433-
if is_extension_array_dtype(dtype_alt):
434-
return dtype_alt
435-
elif dtype_alt == object:
436-
return dtype_alt
437-
438-
# TODO: de-duplicate with maybe_promote?
439-
# create the result
440-
if "extension" in upcast_classes:
441-
return np.dtype("object")
442-
elif "bool" in upcast_classes:
443-
if has_none_blocks:
444-
return np.dtype(np.object_)
445-
else:
446-
return np.dtype(np.bool_)
447-
elif "datetimetz" in upcast_classes:
448-
# GH-25014. We use NaT instead of iNaT, since this eventually
449-
# ends up in DatetimeArray.take, which does not allow iNaT.
450-
dtype = upcast_classes["datetimetz"]
451-
return dtype[0]
452-
elif "datetime" in upcast_classes:
453-
return np.dtype("M8[ns]")
454-
elif "timedelta" in upcast_classes:
455-
return np.dtype("m8[ns]")
456-
else:
457-
try:
458-
common_dtype = np.find_common_type(upcast_classes, [])
459-
except TypeError:
460-
# At least one is an ExtensionArray
461-
return np.dtype(np.object_)
462-
else:
463-
if is_float_dtype(common_dtype):
464-
return common_dtype
465-
elif is_numeric_dtype(common_dtype):
466-
if has_none_blocks:
467-
return np.dtype(np.float64)
468-
else:
469-
return common_dtype
470-
471-
msg = "invalid dtype determination in get_concat_dtype"
472-
raise AssertionError(msg)
473-
474-
475-
def _get_upcast_classes(
476-
join_units: Sequence[JoinUnit],
477-
dtypes: Sequence[DtypeObj],
478-
) -> Dict[str, List[DtypeObj]]:
479-
"""Create mapping between upcast class names and lists of dtypes."""
480-
upcast_classes: Dict[str, List[DtypeObj]] = defaultdict(list)
481-
null_upcast_classes: Dict[str, List[DtypeObj]] = defaultdict(list)
482-
for dtype, unit in zip(dtypes, join_units):
483-
if dtype is None:
484-
continue
485-
486-
upcast_cls = _select_upcast_cls_from_dtype(dtype)
487-
# Null blocks should not influence upcast class selection, unless there
488-
# are only null blocks, when same upcasting rules must be applied to
489-
# null upcast classes.
490-
if unit.is_na:
491-
null_upcast_classes[upcast_cls].append(dtype)
492-
else:
493-
upcast_classes[upcast_cls].append(dtype)
494-
495-
if not upcast_classes:
496-
upcast_classes = null_upcast_classes
497-
498-
return upcast_classes
499-
500-
501-
def _select_upcast_cls_from_dtype(dtype: DtypeObj) -> str:
502-
"""Select upcast class name based on dtype."""
503-
if is_categorical_dtype(dtype):
504-
return "extension"
505-
elif is_datetime64tz_dtype(dtype):
506-
return "datetimetz"
507-
elif is_extension_array_dtype(dtype):
508-
return "extension"
509-
elif issubclass(dtype.type, np.bool_):
510-
return "bool"
511-
elif issubclass(dtype.type, np.object_):
512-
return "object"
513-
elif is_datetime64_dtype(dtype):
514-
return "datetime"
515-
elif is_timedelta64_dtype(dtype):
516-
return "timedelta"
517-
elif is_sparse(dtype):
518-
dtype = cast("SparseDtype", dtype)
519-
return dtype.subtype.name
520-
elif is_float_dtype(dtype) or is_numeric_dtype(dtype):
521-
return dtype.name
522-
else:
523-
return "float"
443+
if not len(dtypes):
444+
dtypes = [unit.dtype for unit in join_units if unit.block is not None]
445+
446+
dtype = find_common_type(dtypes)
447+
if has_none_blocks:
448+
dtype = ensure_dtype_can_hold_na(dtype)
449+
return dtype
524450

525451

526452
def _is_uniform_join_units(join_units: List[JoinUnit]) -> bool:

pandas/tests/reshape/concat/test_append.py

+21-7
Original file line numberDiff line numberDiff line change
@@ -334,9 +334,9 @@ def test_append_missing_column_proper_upcast(self, sort):
334334
def test_append_empty_frame_to_series_with_dateutil_tz(self):
335335
# GH 23682
336336
date = Timestamp("2018-10-24 07:30:00", tz=dateutil.tz.tzutc())
337-
s = Series({"date": date, "a": 1.0, "b": 2.0})
337+
ser = Series({"date": date, "a": 1.0, "b": 2.0})
338338
df = DataFrame(columns=["c", "d"])
339-
result_a = df.append(s, ignore_index=True)
339+
result_a = df.append(ser, ignore_index=True)
340340
expected = DataFrame(
341341
[[np.nan, np.nan, 1.0, 2.0, date]], columns=["c", "d", "a", "b", "date"]
342342
)
@@ -350,13 +350,12 @@ def test_append_empty_frame_to_series_with_dateutil_tz(self):
350350
)
351351
expected["c"] = expected["c"].astype(object)
352352
expected["d"] = expected["d"].astype(object)
353-
354-
result_b = result_a.append(s, ignore_index=True)
353+
result_b = result_a.append(ser, ignore_index=True)
355354
tm.assert_frame_equal(result_b, expected)
356355

357356
# column order is different
358357
expected = expected[["c", "d", "date", "a", "b"]]
359-
result = df.append([s, s], ignore_index=True)
358+
result = df.append([ser, ser], ignore_index=True)
360359
tm.assert_frame_equal(result, expected)
361360

362361
def test_append_empty_tz_frame_with_datetime64ns(self):
@@ -378,12 +377,27 @@ def test_append_empty_tz_frame_with_datetime64ns(self):
378377
@pytest.mark.parametrize(
379378
"dtype_str", ["datetime64[ns, UTC]", "datetime64[ns]", "Int64", "int64"]
380379
)
381-
def test_append_empty_frame_with_timedelta64ns_nat(self, dtype_str):
380+
@pytest.mark.parametrize("val", [1, "NaT"])
381+
def test_append_empty_frame_with_timedelta64ns_nat(self, dtype_str, val):
382382
# https://github.com/pandas-dev/pandas/issues/35460
383383
df = DataFrame(columns=["a"]).astype(dtype_str)
384384

385-
other = DataFrame({"a": [np.timedelta64("NaT", "ns")]})
385+
other = DataFrame({"a": [np.timedelta64(val, "ns")]})
386386
result = df.append(other, ignore_index=True)
387387

388388
expected = other.astype(object)
389389
tm.assert_frame_equal(result, expected)
390+
391+
@pytest.mark.parametrize(
392+
"dtype_str", ["datetime64[ns, UTC]", "datetime64[ns]", "Int64", "int64"]
393+
)
394+
@pytest.mark.parametrize("val", [1, "NaT"])
395+
def test_append_frame_with_timedelta64ns_nat(self, dtype_str, val):
396+
# https://github.com/pandas-dev/pandas/issues/35460
397+
df = DataFrame({"a": pd.array([1], dtype=dtype_str)})
398+
399+
other = DataFrame({"a": [np.timedelta64(val, "ns")]})
400+
result = df.append(other, ignore_index=True)
401+
402+
expected = DataFrame({"a": [df.iloc[0, 0], other.iloc[0, 0]]}, dtype=object)
403+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)