Skip to content

BUG: Series.combine_first with datetime-tz dtype (#21469) #21544

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 0 additions & 13 deletions pandas/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -410,19 +410,6 @@ def _apply_if_callable(maybe_callable, obj, **kwargs):
return maybe_callable


def _where_compat(mask, arr1, arr2):
if arr1.dtype == _NS_DTYPE and arr2.dtype == _NS_DTYPE:
new_vals = np.where(mask, arr1.view('i8'), arr2.view('i8'))
return new_vals.view(_NS_DTYPE)

if arr1.dtype == _NS_DTYPE:
arr1 = tslib.ints_to_pydatetime(arr1.view('i8'))
if arr2.dtype == _NS_DTYPE:
arr2 = tslib.ints_to_pydatetime(arr2.view('i8'))

return np.where(mask, arr1, arr2)


def _dict_compat(d):
"""
Helper function to convert datetimelike-keyed dicts to Timestamp-keyed dict
Expand Down
24 changes: 22 additions & 2 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,16 @@
is_hashable,
is_iterator,
is_dict_like,
is_dtype_equal,
is_scalar,
_is_unorderable_exception,
_ensure_platform_int,
pandas_dtype)
pandas_dtype,
needs_i8_conversion)
from pandas.core.dtypes.generic import (
ABCSparseArray, ABCDataFrame, ABCIndexClass)
from pandas.core.dtypes.cast import (
find_common_type, maybe_downcast_to_dtype,
maybe_upcast, infer_dtype_from_scalar,
maybe_convert_platform,
maybe_cast_to_datetime, maybe_castable,
Expand Down Expand Up @@ -2304,7 +2307,24 @@ def combine_first(self, other):
other = other.reindex(new_index, copy=False)
# TODO: do we need name?
name = ops.get_op_result_name(self, other) # noqa
rs_vals = com._where_compat(isna(this), other._values, this._values)
if not is_dtype_equal(this.dtype, other.dtype):
new_dtype = find_common_type([this.dtype, other.dtype])
if not is_dtype_equal(this.dtype, new_dtype):
this = this.astype(new_dtype)
if not is_dtype_equal(other.dtype, new_dtype):
other = other.astype(new_dtype)

if needs_i8_conversion(this.dtype):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you can just use the in-built

other.where(isna(this), this) which already knows how to handle the dtypes

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jreback In theory that should work, but where is broken on datetime tz values on both Series and DataFrame, see #21546. I considered trying to fix where for this PR, but since the current combine_first impl doesn't point to where and where is a problem on both Series and DataFrame I thought better to separate the issues.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

then convert using pd.Index and use .where

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

really would prefer to fix #21546 first in any event.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jreback I'm taking a look at #21546. Agree it makes sense ultimately to fix the underlying where issue and keep the implementation of combine_first simpler as a result. I just posted a comment over there with some questions about the contract for the dimension of the return value of Block.values and Block.get_values that appears to be driving some of the issues over there (DatetimeTZBlock internally is using values with ndim==1 even for DataFrame blocks, which have ndim==2).

mask = isna(this)
this_values = this.values.view('i8')
other_values = other.values.view('i8')
else:
this_values = this.values
other_values = other.values
mask = isna(this_values)

rs_vals = np.where(mask, other_values, this_values)
rs_vals = maybe_downcast_to_dtype(rs_vals, this.dtype)
return self._constructor(rs_vals, index=new_index).__finalize__(self)

def update(self, other):
Expand Down
13 changes: 13 additions & 0 deletions pandas/tests/series/test_combine_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,19 @@ def get_result_type(dtype, dtype2):
]).dtype
assert result.kind == expected

def test_combine_first_dt_tz_values(self):
dts1 = pd.date_range('20150101', '20150105', tz='America/New_York')
df1 = pd.DataFrame({'date': dts1})
dts2 = pd.date_range('20160514', '20160518', tz='America/New_York')
df2 = pd.DataFrame({'date': dts2}, index=range(3, 8))
result = df1.date.combine_first(df2.date)
exp_vals = pd.DatetimeIndex(['20150101', '20150102', '20150103',
'20150104', '20150105', '20160516',
'20160517', '20160518'],
tz='America/New_York')
exp = pd.Series(exp_vals, name='date')
assert_series_equal(exp, result)

def test_concat_empty_series_dtypes(self):

# booleans
Expand Down