diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index 99ae60859b68c..71874aea77e58 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -430,6 +430,7 @@ Reshaping
 - Bug in :meth:`DataFrame.apply` would give incorrect results when used with a string argument and ``axis=1`` when the axis argument was not supported and now raises a ``ValueError`` instead (:issue:`39211`)
 - Bug in :meth:`DataFrame.sort_values` not reshaping index correctly after sorting on columns, when ``ignore_index=True`` (:issue:`39464`)
 - Bug in :meth:`DataFrame.append` returning incorrect dtypes with combinations of ``ExtensionDtype`` dtypes (:issue:`39454`)
+- Bug in :meth:`DataFrame.append` returning incorrect dtypes with combinations of ``datetime64`` and ``timedelta64`` dtypes (:issue:`39574`)
 
 Sparse
 ^^^^^^
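[Reviewer note: not part of the patch. A minimal sketch of the GH#39574 bug the whatsnew entry above describes, using only public API; with this fix, mismatched datetime64/timedelta64 kinds fall back to object dtype instead of one operand being silently coerced.]

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"a": [pd.Timestamp("2021-01-01")]})  # datetime64[ns]
    other = pd.DataFrame({"a": [np.timedelta64(1, "ns")]})  # timedelta64[ns]

    result = df.append(other, ignore_index=True)
    # Fixed behavior: incompatible kinds are preserved via object dtype
    assert result["a"].dtype == object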
diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py
index 3bded4dd2834b..b766392e35601 100644
--- a/pandas/core/dtypes/concat.py
+++ b/pandas/core/dtypes/concat.py
@@ -61,7 +61,7 @@ def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike:
     return arr.astype(dtype, copy=False)
 
 
-def concat_compat(to_concat, axis: int = 0):
+def concat_compat(to_concat, axis: int = 0, ea_compat_axis: bool = False):
     """
     provide concatenation of an array of arrays each of which is a single
     'normalized' dtypes (in that for example, if it's object, then it is a
@@ -72,6 +72,9 @@ def concat_compat(to_concat, axis: int = 0):
     ----------
     to_concat : array of arrays
     axis : axis to provide concatenation
+    ea_compat_axis : bool, default False
+        For ExtensionArray compat, behave as if axis == 1 when determining
+        whether to drop empty arrays.
 
     Returns
     -------
@@ -91,7 +94,8 @@ def is_nonempty(x) -> bool:
     # marginal given that it would still require shape & dtype calculation and
     # np.concatenate which has them both implemented is compiled.
     non_empties = [x for x in to_concat if is_nonempty(x)]
-    if non_empties and axis == 0:
+    if non_empties and axis == 0 and not ea_compat_axis:
+        # ea_compat_axis see GH#39574
        to_concat = non_empties
 
     kinds = {obj.dtype.kind for obj in to_concat}
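[Reviewer note: not part of the patch. A sketch of what the new ea_compat_axis keyword changes; concat_compat is a private pandas helper, so this illustrates the intended semantics rather than supported usage.]

    import numpy as np
    from pandas.core.dtypes.concat import concat_compat

    dt64 = np.array([], dtype="M8[ns]")   # empty datetime64 column
    td64 = np.array([1], dtype="m8[ns]")  # non-empty timedelta64 column

    # Default: empty arrays are dropped for axis=0 concats, so only the
    # timedelta values take part in dtype resolution.
    assert concat_compat([dt64, td64]).dtype == np.dtype("m8[ns]")

    # ea_compat_axis=True keeps the empties (as axis=1 already did), so the
    # mixed dt64/td64 combination is detected and coerced to object.
    assert concat_compat([dt64, td64], ea_compat_axis=True).dtype == np.dtype(object)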
diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py
index f7705dd00c3be..a2c930f6d9b22 100644
--- a/pandas/core/internals/concat.py
+++ b/pandas/core/internals/concat.py
@@ -1,9 +1,8 @@
 from __future__ import annotations
 
-from collections import defaultdict
 import copy
 import itertools
-from typing import TYPE_CHECKING, Dict, List, Sequence, cast
+from typing import TYPE_CHECKING, Dict, List, Sequence
 
 import numpy as np
 
@@ -14,16 +13,13 @@
 from pandas.core.dtypes.cast import ensure_dtype_can_hold_na, find_common_type
 from pandas.core.dtypes.common import (
     is_categorical_dtype,
-    is_datetime64_dtype,
     is_datetime64tz_dtype,
+    is_dtype_equal,
     is_extension_array_dtype,
-    is_float_dtype,
-    is_numeric_dtype,
     is_sparse,
-    is_timedelta64_dtype,
 )
 from pandas.core.dtypes.concat import concat_compat
-from pandas.core.dtypes.missing import isna_all
+from pandas.core.dtypes.missing import is_valid_na_for_dtype, isna_all
 
 import pandas.core.algorithms as algos
 from pandas.core.arrays import DatetimeArray, ExtensionArray
@@ -33,7 +29,6 @@
 
 if TYPE_CHECKING:
     from pandas import Index
-    from pandas.core.arrays.sparse.dtype import SparseDtype
 
 
 def concatenate_block_managers(
@@ -232,6 +227,29 @@ def dtype(self):
             return blk.dtype
         return ensure_dtype_can_hold_na(blk.dtype)
 
+    def is_valid_na_for(self, dtype: DtypeObj) -> bool:
+        """
+        Check that we are all-NA of a type/dtype that is compatible with this dtype.
+        Augments `self.is_na` with an additional check of the type of NA values.
+        """
+        if not self.is_na:
+            return False
+        if self.block is None:
+            return True
+
+        if self.dtype == object:
+            values = self.block.values
+            return all(is_valid_na_for_dtype(x, dtype) for x in values.ravel(order="K"))
+
+        if self.dtype.kind == dtype.kind == "M" and not is_dtype_equal(
+            self.dtype, dtype
+        ):
+            # fill_values match but we should not cast self.block.values to dtype
+            return False
+
+        na_value = self.block.fill_value
+        return is_valid_na_for_dtype(na_value, dtype)
+
     @cache_readonly
     def is_na(self) -> bool:
         if self.block is None:
@@ -262,7 +280,7 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
         else:
             fill_value = upcasted_na
 
-            if self.is_na:
+            if self.is_valid_na_for(empty_dtype):
                 blk_dtype = getattr(self.block, "dtype", None)
 
                 if blk_dtype == np.dtype(object):
@@ -276,10 +294,9 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
                 if is_datetime64tz_dtype(blk_dtype) or is_datetime64tz_dtype(
                     empty_dtype
                 ):
-                    if self.block is None:
-                        # TODO(EA2D): special case unneeded with 2D EAs
-                        i8values = np.full(self.shape[1], fill_value.value)
-                        return DatetimeArray(i8values, dtype=empty_dtype)
+                    # TODO(EA2D): special case unneeded with 2D EAs
+                    i8values = np.full(self.shape[1], fill_value.value)
+                    return DatetimeArray(i8values, dtype=empty_dtype)
                 elif is_categorical_dtype(blk_dtype):
                     pass
                 elif is_extension_array_dtype(blk_dtype):
@@ -295,6 +312,8 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
                         empty_arr, allow_fill=True, fill_value=fill_value
                     )
                 else:
+                    # NB: we should never get here with empty_dtype integer or bool;
+                    # if we did, the missing_arr.fill would cast to gibberish
                     missing_arr = np.empty(self.shape, dtype=empty_dtype)
                     missing_arr.fill(fill_value)
                     return missing_arr
@@ -362,14 +381,12 @@ def _concatenate_join_units(
         # concatting with at least one EA means we are concatting a single column
         # the non-EA values are 2D arrays with shape (1, n)
         to_concat = [t if isinstance(t, ExtensionArray) else t[0, :] for t in to_concat]
-        concat_values = concat_compat(to_concat, axis=0)
-        if not isinstance(concat_values, ExtensionArray) or (
-            isinstance(concat_values, DatetimeArray) and concat_values.tz is None
-        ):
+        concat_values = concat_compat(to_concat, axis=0, ea_compat_axis=True)
+        if not is_extension_array_dtype(concat_values.dtype):
             # if the result of concat is not an EA but an ndarray, reshape to
             # 2D to put it a non-EA Block
-            # special case DatetimeArray, which *is* an EA, but is put in a
-            # consolidated 2D block
+            # special case DatetimeArray/TimedeltaArray, which *is* an EA, but
+            # is put in a consolidated 2D block
             concat_values = np.atleast_2d(concat_values)
     else:
         concat_values = concat_compat(to_concat, axis=concat_axis)
@@ -419,108 +436,17 @@ def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> DtypeObj:
         return empty_dtype
 
     has_none_blocks = any(unit.block is None for unit in join_units)
-    dtypes = [None if unit.block is None else unit.dtype for unit in join_units]
 
-    filtered_dtypes = [
+    dtypes = [
         unit.dtype for unit in join_units if unit.block is not None and not unit.is_na
     ]
-    if not len(filtered_dtypes):
-        filtered_dtypes = [unit.dtype for unit in join_units if unit.block is not None]
-    dtype_alt = find_common_type(filtered_dtypes)
-
-    upcast_classes = _get_upcast_classes(join_units, dtypes)
-
-    if is_extension_array_dtype(dtype_alt):
-        return dtype_alt
-    elif dtype_alt == object:
-        return dtype_alt
-
-    # TODO: de-duplicate with maybe_promote?
-    # create the result
-    if "extension" in upcast_classes:
-        return np.dtype("object")
-    elif "bool" in upcast_classes:
-        if has_none_blocks:
-            return np.dtype(np.object_)
-        else:
-            return np.dtype(np.bool_)
-    elif "datetimetz" in upcast_classes:
-        # GH-25014. We use NaT instead of iNaT, since this eventually
-        # ends up in DatetimeArray.take, which does not allow iNaT.
-        dtype = upcast_classes["datetimetz"]
-        return dtype[0]
-    elif "datetime" in upcast_classes:
-        return np.dtype("M8[ns]")
-    elif "timedelta" in upcast_classes:
-        return np.dtype("m8[ns]")
-    else:
-        try:
-            common_dtype = np.find_common_type(upcast_classes, [])
-        except TypeError:
-            # At least one is an ExtensionArray
-            return np.dtype(np.object_)
-        else:
-            if is_float_dtype(common_dtype):
-                return common_dtype
-            elif is_numeric_dtype(common_dtype):
-                if has_none_blocks:
-                    return np.dtype(np.float64)
-                else:
-                    return common_dtype
-
-    msg = "invalid dtype determination in get_concat_dtype"
-    raise AssertionError(msg)
-
-
-def _get_upcast_classes(
-    join_units: Sequence[JoinUnit],
-    dtypes: Sequence[DtypeObj],
-) -> Dict[str, List[DtypeObj]]:
-    """Create mapping between upcast class names and lists of dtypes."""
-    upcast_classes: Dict[str, List[DtypeObj]] = defaultdict(list)
-    null_upcast_classes: Dict[str, List[DtypeObj]] = defaultdict(list)
-    for dtype, unit in zip(dtypes, join_units):
-        if dtype is None:
-            continue
-
-        upcast_cls = _select_upcast_cls_from_dtype(dtype)
-        # Null blocks should not influence upcast class selection, unless there
-        # are only null blocks, when same upcasting rules must be applied to
-        # null upcast classes.
-        if unit.is_na:
-            null_upcast_classes[upcast_cls].append(dtype)
-        else:
-            upcast_classes[upcast_cls].append(dtype)
-
-    if not upcast_classes:
-        upcast_classes = null_upcast_classes
-
-    return upcast_classes
-
-
-def _select_upcast_cls_from_dtype(dtype: DtypeObj) -> str:
-    """Select upcast class name based on dtype."""
-    if is_categorical_dtype(dtype):
-        return "extension"
-    elif is_datetime64tz_dtype(dtype):
-        return "datetimetz"
-    elif is_extension_array_dtype(dtype):
-        return "extension"
-    elif issubclass(dtype.type, np.bool_):
-        return "bool"
-    elif issubclass(dtype.type, np.object_):
-        return "object"
-    elif is_datetime64_dtype(dtype):
-        return "datetime"
-    elif is_timedelta64_dtype(dtype):
-        return "timedelta"
-    elif is_sparse(dtype):
-        dtype = cast("SparseDtype", dtype)
-        return dtype.subtype.name
-    elif is_float_dtype(dtype) or is_numeric_dtype(dtype):
-        return dtype.name
-    else:
-        return "float"
+    if not len(dtypes):
+        dtypes = [unit.dtype for unit in join_units if unit.block is not None]
+
+    dtype = find_common_type(dtypes)
+    if has_none_blocks:
+        dtype = ensure_dtype_can_hold_na(dtype)
+    return dtype
 
 
 def _is_uniform_join_units(join_units: List[JoinUnit]) -> bool:
diff --git a/pandas/tests/reshape/concat/test_append.py b/pandas/tests/reshape/concat/test_append.py
index dd6dbd79113e5..81d5526f5bd15 100644
--- a/pandas/tests/reshape/concat/test_append.py
+++ b/pandas/tests/reshape/concat/test_append.py
@@ -334,9 +334,9 @@ def test_append_missing_column_proper_upcast(self, sort):
     def test_append_empty_frame_to_series_with_dateutil_tz(self):
         # GH 23682
         date = Timestamp("2018-10-24 07:30:00", tz=dateutil.tz.tzutc())
-        s = Series({"date": date, "a": 1.0, "b": 2.0})
+        ser = Series({"date": date, "a": 1.0, "b": 2.0})
         df = DataFrame(columns=["c", "d"])
-        result_a = df.append(s, ignore_index=True)
+        result_a = df.append(ser, ignore_index=True)
         expected = DataFrame(
             [[np.nan, np.nan, 1.0, 2.0, date]], columns=["c", "d", "a", "b", "date"]
         )
@@ -350,13 +350,12 @@ def test_append_empty_frame_to_series_with_dateutil_tz(self):
         )
         expected["c"] = expected["c"].astype(object)
         expected["d"] = expected["d"].astype(object)
-
-        result_b = result_a.append(s, ignore_index=True)
+        result_b = result_a.append(ser, ignore_index=True)
         tm.assert_frame_equal(result_b, expected)
 
         # column order is different
         expected = expected[["c", "d", "date", "a", "b"]]
-        result = df.append([s, s], ignore_index=True)
+        result = df.append([ser, ser], ignore_index=True)
         tm.assert_frame_equal(result, expected)
 
     def test_append_empty_tz_frame_with_datetime64ns(self):
@@ -378,12 +377,27 @@ def test_append_empty_tz_frame_with_datetime64ns(self):
     @pytest.mark.parametrize(
         "dtype_str", ["datetime64[ns, UTC]", "datetime64[ns]", "Int64", "int64"]
     )
-    def test_append_empty_frame_with_timedelta64ns_nat(self, dtype_str):
+    @pytest.mark.parametrize("val", [1, "NaT"])
+    def test_append_empty_frame_with_timedelta64ns_nat(self, dtype_str, val):
         # https://github.com/pandas-dev/pandas/issues/35460
         df = DataFrame(columns=["a"]).astype(dtype_str)
 
-        other = DataFrame({"a": [np.timedelta64("NaT", "ns")]})
+        other = DataFrame({"a": [np.timedelta64(val, "ns")]})
         result = df.append(other, ignore_index=True)
 
         expected = other.astype(object)
         tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "dtype_str", ["datetime64[ns, UTC]", "datetime64[ns]", "Int64", "int64"]
+    )
+    @pytest.mark.parametrize("val", [1, "NaT"])
+    def test_append_frame_with_timedelta64ns_nat(self, dtype_str, val):
+        # https://github.com/pandas-dev/pandas/issues/35460
+        df = DataFrame({"a": pd.array([1], dtype=dtype_str)})
+
+        other = DataFrame({"a": [np.timedelta64(val, "ns")]})
+        result = df.append(other, ignore_index=True)
+
+        expected = DataFrame({"a": [df.iloc[0, 0], other.iloc[0, 0]]}, dtype=object)
+        tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/reshape/merge/test_multi.py b/pandas/tests/reshape/merge/test_multi.py
index 673c97740594f..f47f4e1577277 100644
--- a/pandas/tests/reshape/merge/test_multi.py
+++ b/pandas/tests/reshape/merge/test_multi.py
@@ -528,10 +528,8 @@ def test_merge_datetime_multi_index_empty_df(self, merge_type):
         tm.assert_frame_equal(results_merge, expected)
         tm.assert_frame_equal(results_join, expected)
 
-    def test_join_multi_levels(self):
-
-        # GH 3662
-        # merge multi-levels
+    @pytest.fixture
+    def household(self):
         household = DataFrame(
             {
                 "household_id": [1, 2, 3],
@@ -540,6 +538,10 @@ def test_join_multi_levels(self):
                 "male": [0, 1, 0],
                 "wealth": [196087.3, 316851.2, 294750],
             },
             columns=["household_id", "male", "wealth"],
         ).set_index("household_id")
+        return household
+
+    @pytest.fixture
+    def portfolio(self):
         portfolio = DataFrame(
             {
                 "household_id": [1, 2, 2, 3, 3, 3, 4],
@@ -565,7 +567,10 @@ def test_join_multi_levels(self):
             },
             columns=["household_id", "asset_id", "name", "share"],
         ).set_index(["household_id", "asset_id"])
-        result = household.join(portfolio, how="inner")
+        return portfolio
+
+    @pytest.fixture
+    def expected(self):
         expected = (
             DataFrame(
                 {
@@ -601,8 +606,21 @@ def test_join_multi_levels(self):
             .set_index(["household_id", "asset_id"])
             .reindex(columns=["male", "wealth", "name", "share"])
         )
+        return expected
+
+    def test_join_multi_levels(self, portfolio, household, expected):
+        portfolio = portfolio.copy()
+        household = household.copy()
+
+        # GH 3662
+        # merge multi-levels
+        result = household.join(portfolio, how="inner")
         tm.assert_frame_equal(result, expected)
 
+    def test_join_multi_levels_merge_equivalence(self, portfolio, household, expected):
+        portfolio = portfolio.copy()
+        household = household.copy()
+
         # equivalency
         result = merge(
             household.reset_index(),
             portfolio.reset_index(),
             on=["household_id"],
             how="inner",
         ).set_index(["household_id", "asset_id"])
         tm.assert_frame_equal(result, expected)
 
@@ -612,6 +630,10 @@ def test_join_multi_levels(self):
+    def test_join_multi_levels_outer(self, portfolio, household, expected):
+        portfolio = portfolio.copy()
+        household = household.copy()
+
         result = household.join(portfolio, how="outer")
         expected = concat(
             [
@@ -630,6 +652,10 @@ def test_join_multi_levels(self):
         ).reindex(columns=expected.columns)
         tm.assert_frame_equal(result, expected)
 
+    def test_join_multi_levels_invalid(self, portfolio, household):
+        portfolio = portfolio.copy()
+        household = household.copy()
+
         # invalid cases
         household.index.name = "foo"
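[Reviewer note: not part of the patch. The _get_empty_dtype rewrite further up drops the hand-rolled "upcast class" table in favor of the shared casting helpers; a sketch of the two helpers it now leans on, with illustrative values checked against pandas 1.3-era behavior.]

    import numpy as np
    from pandas.core.dtypes.cast import ensure_dtype_can_hold_na, find_common_type

    # find_common_type applies pandas' promotion rules; mixed dt64/td64 has
    # no common numpy dtype, so it falls back to object (the GH#39574 case).
    assert find_common_type([np.dtype("M8[ns]"), np.dtype("m8[ns]")]) == object
    assert find_common_type([np.dtype("int64"), np.dtype("float64")]) == np.dtype("float64")

    # ensure_dtype_can_hold_na widens the result when some join units are
    # all-NA placeholders (block is None) that must be filled with NaN/NaT.
    assert ensure_dtype_can_hold_na(np.dtype("int64")) == np.dtype("float64")
    assert ensure_dtype_can_hold_na(np.dtype("bool")) == object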
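[Reviewer note: not part of the patch. On the test_multi.py refactor above: the new fixtures are function-scoped, and every split test still copies them before mutating, so the invalid-cases test cannot leak its renamed index into the others. The pattern in isolation, with hypothetical names:]

    import pandas as pd
    import pytest

    @pytest.fixture
    def household():
        # Function-scoped: rebuilt for each test that requests it.
        return pd.DataFrame({"household_id": [1, 2, 3]}).set_index("household_id")

    def test_rename_index(household):
        household = household.copy()  # mutate a copy, never the fixture value
        household.index.name = "foo"
        assert household.index.name == "foo"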