From b3fb477074af15590d383b439ac87b1712977178 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 2 Feb 2021 20:12:49 -0800 Subject: [PATCH 1/9] BUG: DataFrame.append with timedelta64 --- pandas/core/dtypes/concat.py | 4 +- pandas/core/internals/concat.py | 133 ++++----------------- pandas/tests/frame/methods/test_append.py | 6 +- pandas/tests/indexing/test_partial.py | 3 +- pandas/tests/reshape/concat/test_append.py | 34 ++++-- pandas/tests/reshape/merge/test_merge.py | 1 + pandas/tests/reshape/merge/test_multi.py | 36 +++++- 7 files changed, 87 insertions(+), 130 deletions(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 624e71a5cf760..802d39ae5a5ac 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -97,7 +97,7 @@ def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike: return arr.astype(dtype, copy=False) -def concat_compat(to_concat, axis: int = 0): +def concat_compat(to_concat, axis: int = 0, pretend_axis1: bool = False): """ provide concatenation of an array of arrays each of which is a single 'normalized' dtypes (in that for example, if it's object, then it is a @@ -127,7 +127,7 @@ def is_nonempty(x) -> bool: # marginal given that it would still require shape & dtype calculation and # np.concatenate which has them both implemented is compiled. non_empties = [x for x in to_concat if is_nonempty(x)] - if non_empties and axis == 0: + if non_empties and axis == 0 and not pretend_axis1: to_concat = non_empties typs = _get_dtype_kinds(to_concat) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 3dcfa85ed5c08..69ac54740c1fc 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -1,9 +1,8 @@ from __future__ import annotations -from collections import defaultdict import copy import itertools -from typing import TYPE_CHECKING, Dict, List, Sequence, cast +from typing import TYPE_CHECKING, Dict, List, Sequence import numpy as np @@ -15,26 +14,21 @@ from pandas.core.dtypes.common import ( get_dtype, is_categorical_dtype, - is_datetime64_dtype, is_datetime64tz_dtype, is_extension_array_dtype, - is_float_dtype, - is_numeric_dtype, is_sparse, - is_timedelta64_dtype, ) from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.missing import isna_all import pandas.core.algorithms as algos -from pandas.core.arrays import DatetimeArray, ExtensionArray +from pandas.core.arrays import DatetimeArray, ExtensionArray, TimedeltaArray from pandas.core.internals.array_manager import ArrayManager from pandas.core.internals.blocks import make_block from pandas.core.internals.managers import BlockManager if TYPE_CHECKING: from pandas import Index - from pandas.core.arrays.sparse.dtype import SparseDtype def concatenate_block_managers( @@ -296,6 +290,8 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike: empty_arr, allow_fill=True, fill_value=fill_value ) else: + # NB: we should never get here with empty_dtype integer or bool; + # if we did, the missing_arr.fill would cast to gibberish missing_arr = np.empty(self.shape, dtype=empty_dtype) missing_arr.fill(fill_value) return missing_arr @@ -363,9 +359,11 @@ def _concatenate_join_units( # concatting with at least one EA means we are concatting a single column # the non-EA values are 2D arrays with shape (1, n) to_concat = [t if isinstance(t, ExtensionArray) else t[0, :] for t in to_concat] - concat_values = concat_compat(to_concat, axis=0) - if not isinstance(concat_values, ExtensionArray) or ( - isinstance(concat_values, DatetimeArray) and concat_values.tz is None + concat_values = concat_compat(to_concat, axis=0, pretend_axis1=True) + if ( + not isinstance(concat_values, ExtensionArray) + or (isinstance(concat_values, DatetimeArray) and concat_values.tz is None) + or isinstance(concat_values, TimedeltaArray) ): # if the result of concat is not an EA but an ndarray, reshape to # 2D to put it a non-EA Block @@ -421,107 +419,18 @@ def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> DtypeObj: has_none_blocks = any(unit.block is None for unit in join_units) dtypes = [None if unit.block is None else unit.dtype for unit in join_units] - - filtered_dtypes = [ - unit.dtype for unit in join_units if unit.block is not None and not unit.is_na - ] - if not len(filtered_dtypes): - filtered_dtypes = [unit.dtype for unit in join_units if unit.block is not None] - dtype_alt = find_common_type(filtered_dtypes) - - upcast_classes = _get_upcast_classes(join_units, dtypes) - - if is_extension_array_dtype(dtype_alt): - return dtype_alt - elif dtype_alt == object: - return dtype_alt - - # TODO: de-duplicate with maybe_promote? - # create the result - if "extension" in upcast_classes: - return np.dtype("object") - elif "bool" in upcast_classes: - if has_none_blocks: - return np.dtype(np.object_) - else: - return np.dtype(np.bool_) - elif "datetimetz" in upcast_classes: - # GH-25014. We use NaT instead of iNaT, since this eventually - # ends up in DatetimeArray.take, which does not allow iNaT. - dtype = upcast_classes["datetimetz"] - return dtype[0] - elif "datetime" in upcast_classes: - return np.dtype("M8[ns]") - elif "timedelta" in upcast_classes: - return np.dtype("m8[ns]") - else: - try: - common_dtype = np.find_common_type(upcast_classes, []) - except TypeError: - # At least one is an ExtensionArray - return np.dtype(np.object_) - else: - if is_float_dtype(common_dtype): - return common_dtype - elif is_numeric_dtype(common_dtype): - if has_none_blocks: - return np.dtype(np.float64) - else: - return common_dtype - - msg = "invalid dtype determination in get_concat_dtype" - raise AssertionError(msg) - - -def _get_upcast_classes( - join_units: Sequence[JoinUnit], - dtypes: Sequence[DtypeObj], -) -> Dict[str, List[DtypeObj]]: - """Create mapping between upcast class names and lists of dtypes.""" - upcast_classes: Dict[str, List[DtypeObj]] = defaultdict(list) - null_upcast_classes: Dict[str, List[DtypeObj]] = defaultdict(list) - for dtype, unit in zip(dtypes, join_units): - if dtype is None: - continue - - upcast_cls = _select_upcast_cls_from_dtype(dtype) - # Null blocks should not influence upcast class selection, unless there - # are only null blocks, when same upcasting rules must be applied to - # null upcast classes. - if unit.is_na: - null_upcast_classes[upcast_cls].append(dtype) - else: - upcast_classes[upcast_cls].append(dtype) - - if not upcast_classes: - upcast_classes = null_upcast_classes - - return upcast_classes - - -def _select_upcast_cls_from_dtype(dtype: DtypeObj) -> str: - """Select upcast class name based on dtype.""" - if is_categorical_dtype(dtype): - return "extension" - elif is_datetime64tz_dtype(dtype): - return "datetimetz" - elif is_extension_array_dtype(dtype): - return "extension" - elif issubclass(dtype.type, np.bool_): - return "bool" - elif issubclass(dtype.type, np.object_): - return "object" - elif is_datetime64_dtype(dtype): - return "datetime" - elif is_timedelta64_dtype(dtype): - return "timedelta" - elif is_sparse(dtype): - dtype = cast("SparseDtype", dtype) - return dtype.subtype.name - elif is_float_dtype(dtype) or is_numeric_dtype(dtype): - return dtype.name - else: - return "float" + dtypes = [x for x in dtypes if x is not None] + + dtype = find_common_type(dtypes) + if has_none_blocks: + if not isinstance(dtype, np.dtype): + # EA dtype + pass + elif dtype.kind in ["i", "u"]: + dtype = np.dtype(np.float64) + elif dtype.kind == "b": + dtype = np.dtype(object) + return dtype def _is_uniform_join_units(join_units: List[JoinUnit]) -> bool: diff --git a/pandas/tests/frame/methods/test_append.py b/pandas/tests/frame/methods/test_append.py index 36c875b8abe6f..44e21ba616a45 100644 --- a/pandas/tests/frame/methods/test_append.py +++ b/pandas/tests/frame/methods/test_append.py @@ -165,7 +165,7 @@ def test_append_dtypes(self): df2 = DataFrame({"bar": np.nan}, index=range(1, 2)) result = df1.append(df2) expected = DataFrame( - {"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")} + {"bar": Series([Timestamp("20130101"), np.nan], dtype="object")} ) tm.assert_frame_equal(result, expected) @@ -173,7 +173,7 @@ def test_append_dtypes(self): df2 = DataFrame({"bar": np.nan}, index=range(1, 2), dtype=object) result = df1.append(df2) expected = DataFrame( - {"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")} + {"bar": Series([Timestamp("20130101"), np.nan], dtype="object")} ) tm.assert_frame_equal(result, expected) @@ -181,7 +181,7 @@ def test_append_dtypes(self): df2 = DataFrame({"bar": Timestamp("20130101")}, index=range(1, 2)) result = df1.append(df2) expected = DataFrame( - {"bar": Series([np.nan, Timestamp("20130101")], dtype="M8[ns]")} + {"bar": Series([np.nan, Timestamp("20130101")], dtype="object")} ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index ad2d7250d9d6c..f9ec6feaab776 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -160,7 +160,8 @@ def test_partial_setting_mixed_dtype(self): df = DataFrame(columns=["A", "B"]) df.loc[0] = Series(1, index=["B"]) - exp = DataFrame([[np.nan, 1]], columns=["A", "B"], index=[0], dtype="float64") + # TODO: having this be float64 would not be unreasonable + exp = DataFrame([[np.nan, 1]], columns=["A", "B"], index=[0], dtype="object") tm.assert_frame_equal(df, exp) # list-like must conform diff --git a/pandas/tests/reshape/concat/test_append.py b/pandas/tests/reshape/concat/test_append.py index dd6dbd79113e5..97086a83ae6e5 100644 --- a/pandas/tests/reshape/concat/test_append.py +++ b/pandas/tests/reshape/concat/test_append.py @@ -334,15 +334,17 @@ def test_append_missing_column_proper_upcast(self, sort): def test_append_empty_frame_to_series_with_dateutil_tz(self): # GH 23682 date = Timestamp("2018-10-24 07:30:00", tz=dateutil.tz.tzutc()) - s = Series({"date": date, "a": 1.0, "b": 2.0}) + ser = Series({"date": date, "a": 1.0, "b": 2.0}) df = DataFrame(columns=["c", "d"]) - result_a = df.append(s, ignore_index=True) + result_a = df.append(ser, ignore_index=True) expected = DataFrame( [[np.nan, np.nan, 1.0, 2.0, date]], columns=["c", "d", "a", "b", "date"] ) # These columns get cast to object after append expected["c"] = expected["c"].astype(object) expected["d"] = expected["d"].astype(object) + expected["date"] = expected["date"].astype(object) + # TODO: "date" might make sense to keep as dt64tz tm.assert_frame_equal(result_a, expected) expected = DataFrame( @@ -350,13 +352,16 @@ def test_append_empty_frame_to_series_with_dateutil_tz(self): ) expected["c"] = expected["c"].astype(object) expected["d"] = expected["d"].astype(object) - - result_b = result_a.append(s, ignore_index=True) + expected["date"] = expected["date"].astype(object) + # TODO: "date" might make sense to keep as dt64tz + result_b = result_a.append(ser, ignore_index=True) tm.assert_frame_equal(result_b, expected) # column order is different expected = expected[["c", "d", "date", "a", "b"]] - result = df.append([s, s], ignore_index=True) + dtype = Series([date]).dtype + expected["date"] = expected["date"].astype(dtype) + result = df.append([ser, ser], ignore_index=True) tm.assert_frame_equal(result, expected) def test_append_empty_tz_frame_with_datetime64ns(self): @@ -378,12 +383,27 @@ def test_append_empty_tz_frame_with_datetime64ns(self): @pytest.mark.parametrize( "dtype_str", ["datetime64[ns, UTC]", "datetime64[ns]", "Int64", "int64"] ) - def test_append_empty_frame_with_timedelta64ns_nat(self, dtype_str): + @pytest.mark.parametrize("val", [1, "NaT"]) + def test_append_empty_frame_with_timedelta64ns_nat(self, dtype_str, val): # https://github.com/pandas-dev/pandas/issues/35460 df = DataFrame(columns=["a"]).astype(dtype_str) - other = DataFrame({"a": [np.timedelta64("NaT", "ns")]}) + other = DataFrame({"a": [np.timedelta64(val, "ns")]}) result = df.append(other, ignore_index=True) expected = other.astype(object) tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "dtype_str", ["datetime64[ns, UTC]", "datetime64[ns]", "Int64", "int64"] + ) + @pytest.mark.parametrize("val", [1, "NaT"]) + def test_append_frame_with_timedelta64ns_nat(self, dtype_str, val): + # https://github.com/pandas-dev/pandas/issues/35460 + df = DataFrame({"a": pd.array([1], dtype=dtype_str)}) + + other = DataFrame({"a": [np.timedelta64(val, "ns")]}) + result = df.append(other, ignore_index=True) + + expected = DataFrame({"a": [df.iloc[0, 0], other.iloc[0, 0]]}, dtype=object) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index da3ac81c4aa17..9163cab237278 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -670,6 +670,7 @@ def test_join_append_timedeltas(self): "t": [timedelta(0, 22500), timedelta(0, 22500)], } ) + expected = expected.astype(object) tm.assert_frame_equal(result, expected) td = np.timedelta64(300000000) diff --git a/pandas/tests/reshape/merge/test_multi.py b/pandas/tests/reshape/merge/test_multi.py index 673c97740594f..f47f4e1577277 100644 --- a/pandas/tests/reshape/merge/test_multi.py +++ b/pandas/tests/reshape/merge/test_multi.py @@ -528,10 +528,8 @@ def test_merge_datetime_multi_index_empty_df(self, merge_type): tm.assert_frame_equal(results_merge, expected) tm.assert_frame_equal(results_join, expected) - def test_join_multi_levels(self): - - # GH 3662 - # merge multi-levels + @pytest.fixture + def household(self): household = DataFrame( { "household_id": [1, 2, 3], @@ -540,6 +538,10 @@ def test_join_multi_levels(self): }, columns=["household_id", "male", "wealth"], ).set_index("household_id") + return household + + @pytest.fixture + def portfolio(self): portfolio = DataFrame( { "household_id": [1, 2, 2, 3, 3, 3, 4], @@ -565,7 +567,10 @@ def test_join_multi_levels(self): }, columns=["household_id", "asset_id", "name", "share"], ).set_index(["household_id", "asset_id"]) - result = household.join(portfolio, how="inner") + return portfolio + + @pytest.fixture + def expected(self): expected = ( DataFrame( { @@ -601,8 +606,21 @@ def test_join_multi_levels(self): .set_index(["household_id", "asset_id"]) .reindex(columns=["male", "wealth", "name", "share"]) ) + return expected + + def test_join_multi_levels(self, portfolio, household, expected): + portfolio = portfolio.copy() + household = household.copy() + + # GH 3662 + # merge multi-levels + result = household.join(portfolio, how="inner") tm.assert_frame_equal(result, expected) + def test_join_multi_levels_merge_equivalence(self, portfolio, household, expected): + portfolio = portfolio.copy() + household = household.copy() + # equivalency result = merge( household.reset_index(), @@ -612,6 +630,10 @@ def test_join_multi_levels(self): ).set_index(["household_id", "asset_id"]) tm.assert_frame_equal(result, expected) + def test_join_multi_levels_outer(self, portfolio, household, expected): + portfolio = portfolio.copy() + household = household.copy() + result = household.join(portfolio, how="outer") expected = concat( [ @@ -630,6 +652,10 @@ def test_join_multi_levels(self): ).reindex(columns=expected.columns) tm.assert_frame_equal(result, expected) + def test_join_multi_levels_invalid(self, portfolio, household): + portfolio = portfolio.copy() + household = household.copy() + # invalid cases household.index.name = "foo" From 512a50c4a88fa8fe43a32d76d26212e6cf09a4cf Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 7 Feb 2021 09:23:42 -0800 Subject: [PATCH 2/9] pretend_axi1 -> ea_compat_axis --- pandas/core/dtypes/concat.py | 7 +++++-- pandas/core/internals/concat.py | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index f253cf3323dca..fa7d03a8dc9c8 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -61,7 +61,7 @@ def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike: return arr.astype(dtype, copy=False) -def concat_compat(to_concat, axis: int = 0, pretend_axis1: bool = False): +def concat_compat(to_concat, axis: int = 0, ea_compat_axis: bool = False): """ provide concatenation of an array of arrays each of which is a single 'normalized' dtypes (in that for example, if it's object, then it is a @@ -72,6 +72,8 @@ def concat_compat(to_concat, axis: int = 0, pretend_axis1: bool = False): ---------- to_concat : array of arrays axis : axis to provide concatenation + ea_compat_axis : bool, default False + For ExtensionArray compat, behave as if axis == 1 Returns ------- @@ -91,7 +93,8 @@ def is_nonempty(x) -> bool: # marginal given that it would still require shape & dtype calculation and # np.concatenate which has them both implemented is compiled. non_empties = [x for x in to_concat if is_nonempty(x)] - if non_empties and axis == 0 and not pretend_axis1: + if non_empties and axis == 0 and not ea_compat_axis: + # ea_compat_axis see GH#39574 to_concat = non_empties kinds = {obj.dtype.kind for obj in to_concat} diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 69ac54740c1fc..53d2b45c50dee 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -359,7 +359,7 @@ def _concatenate_join_units( # concatting with at least one EA means we are concatting a single column # the non-EA values are 2D arrays with shape (1, n) to_concat = [t if isinstance(t, ExtensionArray) else t[0, :] for t in to_concat] - concat_values = concat_compat(to_concat, axis=0, pretend_axis1=True) + concat_values = concat_compat(to_concat, axis=0, ea_compat_axis=True) if ( not isinstance(concat_values, ExtensionArray) or (isinstance(concat_values, DatetimeArray) and concat_values.tz is None) From 1c63c05d4842dd71bcde19ffa221a532c6e7d3aa Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 8 Feb 2021 17:50:32 -0800 Subject: [PATCH 3/9] check compatibility in JoinUnit.is_na --- pandas/core/dtypes/concat.py | 3 ++- pandas/core/internals/concat.py | 30 ++++++++++++++++++++--- pandas/tests/frame/methods/test_append.py | 6 ++--- pandas/tests/indexing/test_partial.py | 3 +-- pandas/tests/reshape/merge/test_merge.py | 1 - 5 files changed, 32 insertions(+), 11 deletions(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index fa7d03a8dc9c8..0119c3bdb1b87 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -73,7 +73,8 @@ def concat_compat(to_concat, axis: int = 0, ea_compat_axis: bool = False): to_concat : array of arrays axis : axis to provide concatenation ea_compat_axis : bool, default False - For ExtensionArray compat, behave as if axis == 1 + For ExtensionArray compat, behave as if axis == 1 when determining + whether to drop empty arrays. Returns ------- diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 53d2b45c50dee..8e8050ca51e92 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -19,7 +19,7 @@ is_sparse, ) from pandas.core.dtypes.concat import concat_compat -from pandas.core.dtypes.missing import isna_all +from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna_all import pandas.core.algorithms as algos from pandas.core.arrays import DatetimeArray, ExtensionArray, TimedeltaArray @@ -227,6 +227,24 @@ def dtype(self): else: return get_dtype(maybe_promote(self.block.dtype, self.block.fill_value)[0]) + def is_valid_na_for(self, dtype: DtypeObj) -> bool: + """ + Check that we are all-NA of a type/dtype that is compatible with this dtype. + """ + if not self.is_na: + return False + if self.block is None: + return True + + if self.dtype == object: + values = self.block.values + return all( + is_valid_nat_for_dtype(x, dtype) for x in values.ravel(order="K") + ) + + na_value = self.block.fill_value + return is_valid_nat_for_dtype(na_value, dtype) + @cache_readonly def is_na(self) -> bool: if self.block is None: @@ -257,7 +275,7 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike: else: fill_value = upcasted_na - if self.is_na: + if self.is_valid_na_for(empty_dtype): blk_dtype = getattr(self.block, "dtype", None) if blk_dtype == np.dtype(object): @@ -418,8 +436,12 @@ def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> DtypeObj: return empty_dtype has_none_blocks = any(unit.block is None for unit in join_units) - dtypes = [None if unit.block is None else unit.dtype for unit in join_units] - dtypes = [x for x in dtypes if x is not None] + + dtypes = [ + unit.dtype for unit in join_units if unit.block is not None and not unit.is_na + ] + if not len(dtypes): + dtypes = [unit.dtype for unit in join_units if unit.block is not None] dtype = find_common_type(dtypes) if has_none_blocks: diff --git a/pandas/tests/frame/methods/test_append.py b/pandas/tests/frame/methods/test_append.py index 44e21ba616a45..36c875b8abe6f 100644 --- a/pandas/tests/frame/methods/test_append.py +++ b/pandas/tests/frame/methods/test_append.py @@ -165,7 +165,7 @@ def test_append_dtypes(self): df2 = DataFrame({"bar": np.nan}, index=range(1, 2)) result = df1.append(df2) expected = DataFrame( - {"bar": Series([Timestamp("20130101"), np.nan], dtype="object")} + {"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")} ) tm.assert_frame_equal(result, expected) @@ -173,7 +173,7 @@ def test_append_dtypes(self): df2 = DataFrame({"bar": np.nan}, index=range(1, 2), dtype=object) result = df1.append(df2) expected = DataFrame( - {"bar": Series([Timestamp("20130101"), np.nan], dtype="object")} + {"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")} ) tm.assert_frame_equal(result, expected) @@ -181,7 +181,7 @@ def test_append_dtypes(self): df2 = DataFrame({"bar": Timestamp("20130101")}, index=range(1, 2)) result = df1.append(df2) expected = DataFrame( - {"bar": Series([np.nan, Timestamp("20130101")], dtype="object")} + {"bar": Series([np.nan, Timestamp("20130101")], dtype="M8[ns]")} ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index f9ec6feaab776..ad2d7250d9d6c 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -160,8 +160,7 @@ def test_partial_setting_mixed_dtype(self): df = DataFrame(columns=["A", "B"]) df.loc[0] = Series(1, index=["B"]) - # TODO: having this be float64 would not be unreasonable - exp = DataFrame([[np.nan, 1]], columns=["A", "B"], index=[0], dtype="object") + exp = DataFrame([[np.nan, 1]], columns=["A", "B"], index=[0], dtype="float64") tm.assert_frame_equal(df, exp) # list-like must conform diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 9163cab237278..da3ac81c4aa17 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -670,7 +670,6 @@ def test_join_append_timedeltas(self): "t": [timedelta(0, 22500), timedelta(0, 22500)], } ) - expected = expected.astype(object) tm.assert_frame_equal(result, expected) td = np.timedelta64(300000000) From 7de38000965cd68e5d40395a8c4f131ed506ac7a Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 10 Feb 2021 09:49:47 -0800 Subject: [PATCH 4/9] avoid object dtype --- pandas/core/internals/concat.py | 22 +++++++++++++--------- pandas/tests/reshape/concat/test_append.py | 4 ---- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 8e8050ca51e92..1aef830248018 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -15,11 +15,12 @@ get_dtype, is_categorical_dtype, is_datetime64tz_dtype, + is_dtype_equal, is_extension_array_dtype, is_sparse, ) from pandas.core.dtypes.concat import concat_compat -from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna_all +from pandas.core.dtypes.missing import is_valid_na_for_dtype, isna_all import pandas.core.algorithms as algos from pandas.core.arrays import DatetimeArray, ExtensionArray, TimedeltaArray @@ -238,12 +239,16 @@ def is_valid_na_for(self, dtype: DtypeObj) -> bool: if self.dtype == object: values = self.block.values - return all( - is_valid_nat_for_dtype(x, dtype) for x in values.ravel(order="K") - ) + return all(is_valid_na_for_dtype(x, dtype) for x in values.ravel(order="K")) + + if self.dtype.kind == dtype.kind == "M" and not is_dtype_equal( + self.dtype, dtype + ): + # fill_values match but we should not case self.block.values to dtype + return False na_value = self.block.fill_value - return is_valid_nat_for_dtype(na_value, dtype) + return is_valid_na_for_dtype(na_value, dtype) @cache_readonly def is_na(self) -> bool: @@ -289,10 +294,9 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike: if is_datetime64tz_dtype(blk_dtype) or is_datetime64tz_dtype( empty_dtype ): - if self.block is None: - # TODO(EA2D): special case unneeded with 2D EAs - i8values = np.full(self.shape[1], fill_value.value) - return DatetimeArray(i8values, dtype=empty_dtype) + # TODO(EA2D): special case unneeded with 2D EAs + i8values = np.full(self.shape[1], fill_value.value) + return DatetimeArray(i8values, dtype=empty_dtype) elif is_categorical_dtype(blk_dtype): pass elif is_extension_array_dtype(blk_dtype): diff --git a/pandas/tests/reshape/concat/test_append.py b/pandas/tests/reshape/concat/test_append.py index 97086a83ae6e5..5eff6193f20e4 100644 --- a/pandas/tests/reshape/concat/test_append.py +++ b/pandas/tests/reshape/concat/test_append.py @@ -343,8 +343,6 @@ def test_append_empty_frame_to_series_with_dateutil_tz(self): # These columns get cast to object after append expected["c"] = expected["c"].astype(object) expected["d"] = expected["d"].astype(object) - expected["date"] = expected["date"].astype(object) - # TODO: "date" might make sense to keep as dt64tz tm.assert_frame_equal(result_a, expected) expected = DataFrame( @@ -352,8 +350,6 @@ def test_append_empty_frame_to_series_with_dateutil_tz(self): ) expected["c"] = expected["c"].astype(object) expected["d"] = expected["d"].astype(object) - expected["date"] = expected["date"].astype(object) - # TODO: "date" might make sense to keep as dt64tz result_b = result_a.append(ser, ignore_index=True) tm.assert_frame_equal(result_b, expected) From dbb59e73ba1ae8bb4b2bea4b293d0a145b4cc693 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 11 Feb 2021 19:31:04 -0800 Subject: [PATCH 5/9] REF: re-use helper func --- pandas/core/internals/concat.py | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 7bd9022ee0a50..cd98295cdb571 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -22,7 +22,7 @@ from pandas.core.dtypes.missing import is_valid_na_for_dtype, isna_all import pandas.core.algorithms as algos -from pandas.core.arrays import DatetimeArray, ExtensionArray, TimedeltaArray +from pandas.core.arrays import DatetimeArray, ExtensionArray from pandas.core.internals.array_manager import ArrayManager from pandas.core.internals.blocks import make_block from pandas.core.internals.managers import BlockManager @@ -381,15 +381,11 @@ def _concatenate_join_units( # the non-EA values are 2D arrays with shape (1, n) to_concat = [t if isinstance(t, ExtensionArray) else t[0, :] for t in to_concat] concat_values = concat_compat(to_concat, axis=0, ea_compat_axis=True) - if ( - not isinstance(concat_values, ExtensionArray) - or (isinstance(concat_values, DatetimeArray) and concat_values.tz is None) - or isinstance(concat_values, TimedeltaArray) - ): + if not is_extension_array_dtype(concat_values.dtype): # if the result of concat is not an EA but an ndarray, reshape to # 2D to put it a non-EA Block - # special case DatetimeArray, which *is* an EA, but is put in a - # consolidated 2D block + # special case DatetimeArray/TimedeltaArray, which *is* an EA, but + # is put in a consolidated 2D block concat_values = np.atleast_2d(concat_values) else: concat_values = concat_compat(to_concat, axis=concat_axis) @@ -448,13 +444,7 @@ def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> DtypeObj: dtype = find_common_type(dtypes) if has_none_blocks: - if not isinstance(dtype, np.dtype): - # EA dtype - pass - elif dtype.kind in ["i", "u"]: - dtype = np.dtype(np.float64) - elif dtype.kind == "b": - dtype = np.dtype(object) + dtype = ensure_dtype_can_hold_na(dtype) return dtype From f40cf7c6fc713af8a6f212548de85b2fd72d569b Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 12 Feb 2021 10:19:03 -0800 Subject: [PATCH 6/9] whatsnew --- doc/source/whatsnew/v1.3.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 99ae60859b68c..71874aea77e58 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -430,6 +430,7 @@ Reshaping - Bug in :meth:`DataFrame.apply` would give incorrect results when used with a string argument and ``axis=1`` when the axis argument was not supported and now raises a ``ValueError`` instead (:issue:`39211`) - Bug in :meth:`DataFrame.sort_values` not reshaping index correctly after sorting on columns, when ``ignore_index=True`` (:issue:`39464`) - Bug in :meth:`DataFrame.append` returning incorrect dtypes with combinations of ``ExtensionDtype`` dtypes (:issue:`39454`) +- Bug in :meth:`DataFrame.append` returning incorrect dtypes with combinations of ``datetime64`` and ``timedelta64`` dtypes (:issue:`39574`) Sparse ^^^^^^ From 73150047f56bfeabaca03030db526c5c2a427208 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 12 Feb 2021 10:26:26 -0800 Subject: [PATCH 7/9] Update pandas/core/internals/concat.py Co-authored-by: Joris Van den Bossche --- pandas/core/internals/concat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index cd98295cdb571..50fca395ecb29 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -243,7 +243,7 @@ def is_valid_na_for(self, dtype: DtypeObj) -> bool: if self.dtype.kind == dtype.kind == "M" and not is_dtype_equal( self.dtype, dtype ): - # fill_values match but we should not case self.block.values to dtype + # fill_values match but we should not cast self.block.values to dtype return False na_value = self.block.fill_value From faf6c35cf8d3044815683ad7832373cd21f3d15c Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 12 Feb 2021 10:40:44 -0800 Subject: [PATCH 8/9] Update pandas/core/internals/concat.py Co-authored-by: Joris Van den Bossche --- pandas/core/internals/concat.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 50fca395ecb29..a2c930f6d9b22 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -230,6 +230,7 @@ def dtype(self): def is_valid_na_for(self, dtype: DtypeObj) -> bool: """ Check that we are all-NA of a type/dtype that is compatible with this dtype. + Augments `self.is_na` with an additional check of the type of NA values. """ if not self.is_na: return False From 3ed534ca7aaf420317d16687aa735520e7bd4fa2 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 12 Feb 2021 11:09:16 -0800 Subject: [PATCH 9/9] revert no-longer necessary --- pandas/tests/reshape/concat/test_append.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/tests/reshape/concat/test_append.py b/pandas/tests/reshape/concat/test_append.py index 5eff6193f20e4..81d5526f5bd15 100644 --- a/pandas/tests/reshape/concat/test_append.py +++ b/pandas/tests/reshape/concat/test_append.py @@ -355,8 +355,6 @@ def test_append_empty_frame_to_series_with_dateutil_tz(self): # column order is different expected = expected[["c", "d", "date", "a", "b"]] - dtype = Series([date]).dtype - expected["date"] = expected["date"].astype(dtype) result = df.append([ser, ser], ignore_index=True) tm.assert_frame_equal(result, expected)