From aa9f4571a3fa003952c88c6538b0fa1fd846bb67 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Tue, 1 Jan 2019 11:32:31 -0800 Subject: [PATCH 1/2] implement fillna from 24024, with fixes and tests --- pandas/core/arrays/datetimelike.py | 50 +++++++++++++++++++++++- pandas/core/arrays/period.py | 39 +----------------- pandas/core/missing.py | 10 ++++- pandas/tests/arrays/test_datetimelike.py | 14 +++++++ pandas/tests/arrays/test_datetimes.py | 17 ++++++++ 5 files changed, 90 insertions(+), 40 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 98a1f1b925447..ab5621d857e89 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -16,6 +16,7 @@ from pandas.errors import ( AbstractMethodError, NullFrequencyError, PerformanceWarning) from pandas.util._decorators import Appender, Substitution +from pandas.util._validators import validate_fillna_kwargs from pandas.core.dtypes.common import ( is_bool_dtype, is_categorical_dtype, is_datetime64_any_dtype, @@ -25,9 +26,10 @@ is_string_dtype, is_timedelta64_dtype, is_unsigned_integer_dtype, needs_i8_conversion, pandas_dtype) from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries +from pandas.core.dtypes.inference import is_array_like from pandas.core.dtypes.missing import isna -from pandas.core import nanops +from pandas.core import missing, nanops from pandas.core.algorithms import ( checked_add_with_arr, take, unique1d, value_counts) import pandas.core.common as com @@ -787,6 +789,52 @@ def _maybe_mask_results(self, result, fill_value=iNaT, convert=None): result[self._isnan] = fill_value return result + def fillna(self, value=None, method=None, limit=None): + # TODO(GH-20300): remove this + # Just overriding to ensure that we avoid an astype(object). + # Either 20300 or a `_values_for_fillna` would avoid this duplication. + if isinstance(value, ABCSeries): + value = value.array + + value, method = validate_fillna_kwargs(value, method) + + mask = self.isna() + + if is_array_like(value): + if len(value) != len(self): + raise ValueError("Length of 'value' does not match. Got ({}) " + " expected {}".format(len(value), len(self))) + value = value[mask] + + if mask.any(): + if method is not None: + if method == 'pad': + func = missing.pad_1d + else: + func = missing.backfill_1d + + values = self._data + if not is_period_dtype(self): + # For PeriodArray self._data is i8, which gets copied + # by `func`. Otherwise we need to make a copy manually + # to avoid modifying `self` in-place. + values = values.copy() + + new_values = func(values, limit=limit, + mask=mask) + if is_datetime64tz_dtype(self): + # we need to pass int64 values to the constructor to avoid + # re-localizing incorrectly + new_values = new_values.view("i8") + new_values = type(self)(new_values, dtype=self.dtype) + else: + # fill with value + new_values = self.copy() + new_values[mask] = value + else: + new_values = self.copy() + return new_values + # ------------------------------------------------------------------ # Frequency Properties/Methods diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 5a74f04c237d0..ff637dc6331a8 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -12,10 +12,9 @@ from pandas._libs.tslibs.timedeltas import Timedelta, delta_to_nanoseconds import pandas.compat as compat from pandas.util._decorators import Appender, cache_readonly -from pandas.util._validators import validate_fillna_kwargs from pandas.core.dtypes.common import ( - _TD_DTYPE, ensure_object, is_array_like, is_datetime64_dtype, + _TD_DTYPE, ensure_object, is_datetime64_dtype, is_float_dtype, is_list_like, is_period_dtype, pandas_dtype) from pandas.core.dtypes.dtypes import PeriodDtype from pandas.core.dtypes.generic import ABCIndexClass, ABCPeriodIndex, ABCSeries @@ -24,7 +23,6 @@ import pandas.core.algorithms as algos from pandas.core.arrays import datetimelike as dtl import pandas.core.common as com -from pandas.core.missing import backfill_1d, pad_1d from pandas.tseries import frequencies from pandas.tseries.offsets import DateOffset, Tick, _delta_to_tick @@ -381,41 +379,6 @@ def _validate_fill_value(self, fill_value): "Got '{got}'.".format(got=fill_value)) return fill_value - def fillna(self, value=None, method=None, limit=None): - # TODO(#20300) - # To avoid converting to object, we re-implement here with the changes - # 1. Passing `_data` to func instead of self.astype(object) - # 2. Re-boxing output of 1. - # #20300 should let us do this kind of logic on ExtensionArray.fillna - # and we can use it. - - if isinstance(value, ABCSeries): - value = value._values - - value, method = validate_fillna_kwargs(value, method) - - mask = self.isna() - - if is_array_like(value): - if len(value) != len(self): - raise ValueError("Length of 'value' does not match. Got ({}) " - " expected {}".format(len(value), len(self))) - value = value[mask] - - if mask.any(): - if method is not None: - func = pad_1d if method == 'pad' else backfill_1d - new_values = func(self._data, limit=limit, - mask=mask) - new_values = type(self)(new_values, freq=self.freq) - else: - # fill with value - new_values = self.copy() - new_values[mask] = value - else: - new_values = self.copy() - return new_values - # -------------------------------------------------------------------- def _time_shift(self, periods, freq=None): diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 1012639fe0f9d..ee9aa9e229126 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -13,7 +13,7 @@ from pandas.core.dtypes.common import ( ensure_float64, is_datetime64_dtype, is_datetime64tz_dtype, is_float_dtype, is_integer, is_integer_dtype, is_numeric_v_string_like, is_scalar, - needs_i8_conversion) + is_timedelta64_dtype, needs_i8_conversion) from pandas.core.dtypes.missing import isna @@ -481,6 +481,10 @@ def pad_1d(values, limit=None, mask=None, dtype=None): _method = algos.pad_inplace_float64 elif values.dtype == np.object_: _method = algos.pad_inplace_object + elif is_timedelta64_dtype(values): + # NaTs are treated identically to datetime64, so we can dispatch + # to that implementation + _method = _pad_1d_datetime if _method is None: raise ValueError('Invalid dtype for pad_1d [{name}]' @@ -507,6 +511,10 @@ def backfill_1d(values, limit=None, mask=None, dtype=None): _method = algos.backfill_inplace_float64 elif values.dtype == np.object_: _method = algos.backfill_inplace_object + elif is_timedelta64_dtype(values): + # NaTs are treated identically to datetime64, so we can dispatch + # to that implementation + _method = _backfill_1d_datetime if _method is None: raise ValueError('Invalid dtype for backfill_1d [{name}]' diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 9ef331be32417..348ac4579ffb5 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -164,6 +164,20 @@ def test_reduce_invalid(self): with pytest.raises(TypeError, match='cannot perform'): arr._reduce("not a method") + @pytest.mark.parametrize('method', ['pad', 'backfill']) + def test_fillna_method_doesnt_change_orig(self, method): + data = np.arange(10, dtype='i8') * 24 * 3600 * 10**9 + arr = self.array_cls(data, freq='D') + arr[4] = pd.NaT + + fill_value = arr[3] if method == 'pad' else arr[5] + + result = arr.fillna(method=method) + assert result[4] == fill_value + + # check that the original was not changed + assert arr[4] is pd.NaT + def test_searchsorted(self): data = np.arange(10, dtype='i8') * 24 * 3600 * 10**9 arr = self.array_cls(data, freq='D') diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index 9f0954d328f89..8a833d8197381 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -138,6 +138,23 @@ def test_value_counts_preserves_tz(self): index=[pd.NaT, dti[0], dti[1]]) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize('method', ['pad', 'backfill']) + def test_fillna_preserves_tz(self, method): + dti = pd.date_range('2000-01-01', periods=5, freq='D', tz='US/Central') + arr = DatetimeArray(dti, copy=True) + arr[2] = pd.NaT + + fill_val = dti[1] if method == 'pad' else dti[3] + expected = DatetimeArray([dti[0], dti[1], fill_val, dti[3], dti[4]], + freq=None, tz='US/Central') + + result = arr.fillna(method=method) + tm.assert_extension_array_equal(result, expected) + + # assert that arr and dti were not modified in-place + assert arr[2] is pd.NaT + assert dti[2] == pd.Timestamp('2000-01-03', tz='US/Central') + class TestSequenceToDT64NS(object): From 52e01ae3cfa3f92fa75bd66a7bc7d4fb85f280ff Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 1 Jan 2019 13:08:35 -0800 Subject: [PATCH 2/2] Isort fixup --- pandas/core/arrays/period.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index ff637dc6331a8..7199d88d4bde5 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -14,8 +14,8 @@ from pandas.util._decorators import Appender, cache_readonly from pandas.core.dtypes.common import ( - _TD_DTYPE, ensure_object, is_datetime64_dtype, - is_float_dtype, is_list_like, is_period_dtype, pandas_dtype) + _TD_DTYPE, ensure_object, is_datetime64_dtype, is_float_dtype, + is_list_like, is_period_dtype, pandas_dtype) from pandas.core.dtypes.dtypes import PeriodDtype from pandas.core.dtypes.generic import ABCIndexClass, ABCPeriodIndex, ABCSeries from pandas.core.dtypes.missing import isna, notna