From c4604df534abb15a9127adb887066e35dc16a3cc Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 3 Dec 2018 14:23:25 -0600 Subject: [PATCH 01/19] API: Added ExtensionArray.where We need some way to do `.where` on EA object for DatetimeArray. Adding it to the interface is, I think, the easiest way. Initially I started to write a version on ExtensionBlock, but it proved to be unwieldy. to write a version that performed well for all types. It *may* be possible to do using `_ndarray_values` but we'd need a few more things around that (missing values, converting an arbitrary array to the "same' ndarary_values, error handling, re-constructing). It seemed easier to push this down to the array. The implementation on ExtensionArray is readable, but likely slow since it'll involve a conversion to object-dtype. Closes #24077 --- doc/source/whatsnew/v0.24.0.rst | 3 ++ pandas/core/arrays/base.py | 35 ++++++++++++++++++ pandas/core/arrays/categorical.py | 28 ++++++++++++++ pandas/core/arrays/interval.py | 12 ++++++ pandas/core/arrays/period.py | 22 +++++++++++ pandas/core/arrays/sparse.py | 14 +++++++ pandas/core/dtypes/base.py | 5 +++ pandas/core/indexes/category.py | 6 +-- pandas/core/internals/blocks.py | 27 +++++++++++++- .../tests/arrays/categorical/test_indexing.py | 26 +++++++++++++ pandas/tests/arrays/interval/test_interval.py | 12 +++++- pandas/tests/arrays/test_period.py | 15 ++++++++ pandas/tests/extension/base/methods.py | 37 +++++++++++++++++++ pandas/tests/extension/json/test_json.py | 7 ++++ pandas/tests/extension/test_sparse.py | 22 +++++++++++ 15 files changed, 264 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index eab5956735f12..5b2a5314108e9 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -994,6 +994,7 @@ update the ``ExtensionDtype._metadata`` tuple to match the signature of your - :meth:`Series.astype` and :meth:`DataFrame.astype` now dispatch to :meth:`ExtensionArray.astype` (:issue:`21185:`). - Slicing a single row of a ``DataFrame`` with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`) - Added :meth:`pandas.api.types.register_extension_dtype` to register an extension type with pandas (:issue:`22664`) +- Added :meth:`pandas.api.extensions.ExtensionArray.where` (:issue:`24077`) - Bug when concatenating multiple ``Series`` with different extension dtypes not casting to object dtype (:issue:`22994`) - Series backed by an ``ExtensionArray`` now work with :func:`util.hash_pandas_object` (:issue:`23066`) - Updated the ``.type`` attribute for ``PeriodDtype``, ``DatetimeTZDtype``, and ``IntervalDtype`` to be instances of the dtype (``Period``, ``Timestamp``, and ``Interval`` respectively) (:issue:`22938`) @@ -1236,6 +1237,7 @@ Performance Improvements - Improved performance of :meth:`DatetimeIndex.normalize` and :meth:`Timestamp.normalize` for timezone naive or UTC datetimes (:issue:`23634`) - Improved performance of :meth:`DatetimeIndex.tz_localize` and various ``DatetimeIndex`` attributes with dateutil UTC timezone (:issue:`23772`) - Improved performance of :class:`Categorical` constructor for `Series` objects (:issue:`23814`) +- Improved performance of :meth:`~DataFrame.where` for Categorical data (:issue:`24077`) .. _whatsnew_0240.docs: @@ -1262,6 +1264,7 @@ Categorical - In meth:`Series.unstack`, specifying a ``fill_value`` not present in the categories now raises a ``TypeError`` rather than ignoring the ``fill_value`` (:issue:`23284`) - Bug when resampling :meth:`Dataframe.resample()` and aggregating on categorical data, the categorical dtype was getting lost. (:issue:`23227`) - Bug in many methods of the ``.str``-accessor, which always failed on calling the ``CategoricalIndex.str`` constructor (:issue:`23555`, :issue:`23556`) +- Bug in :meth:`Series.where` losing the categorical dtype for categorical data (:issue:`24077`) Datetimelike ^^^^^^^^^^^^ diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 9c6aa4a12923f..294c5e99d66f4 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -64,6 +64,7 @@ class ExtensionArray(object): * unique * factorize / _values_for_factorize * argsort / _values_for_argsort + * where The remaining methods implemented on this class should be performant, as they only compose abstract methods. Still, a more efficient @@ -661,6 +662,40 @@ def take(self, indices, allow_fill=False, fill_value=None): # pandas.api.extensions.take raise AbstractMethodError(self) + def where(self, cond, other): + """ + Replace values where the condition is False. + + Parameters + ---------- + cond : ndarray or ExtensionArray + The mask indicating which values should be kept (True) + or replaced from `other` (False). + + other : ndarray, ExtensionArray, or scalar + Entries where `cond` is False are replaced with + corresponding value from `other`. + + Notes + ----- + Note that `cond` and `other` *cannot* be a Series, Index, or callable. + When used from, e.g., :meth:`Series.where`, pandas will unbox + Series and Indexes, and will apply callables before they arrive here. + + Returns + ------- + ExtensionArray + Same dtype as the original. + + See Also + -------- + Series.where : Similar method for Series. + DataFrame.where : Similar method for DataFrame. + """ + return type(self)._from_sequence(np.where(cond, self, other), + dtype=self.dtype, + copy=False) + def copy(self, deep=False): # type: (bool) -> ExtensionArray """ diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 938ca53b5fdce..76d956861a9b6 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1906,6 +1906,34 @@ def take_nd(self, indexer, allow_fill=None, fill_value=None): take = take_nd + def where(self, cond, other): + # n.b. this now preserves the type + codes = self._codes + + if is_scalar(other) and isna(other): + other = -1 + elif is_scalar(other): + item = self.categories.get_indexer([other]).item() + + if item == -1: + raise ValueError("The value '{}' is not present in " + "this Categorical's categories".format(other)) + other = item + + elif is_categorical_dtype(other): + if not is_dtype_equal(self, other): + raise TypeError("The type of 'other' does not match.") + other = _get_codes_for_values(other, self.categories) + # get the codes from other that match our categories + pass + else: + other = np.where(isna(other), -1, other) + + new_codes = np.where(cond, codes, other) + return type(self).from_codes(new_codes, + categories=self.categories, + ordered=self.ordered) + def _slice(self, slicer): """ Return a slice of myself. diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 785fb02c4d95d..fd35f01641750 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -777,6 +777,18 @@ def take(self, indices, allow_fill=False, fill_value=None, axis=None, return self._shallow_copy(left_take, right_take) + def where(self, cond, other): + if is_scalar(other) and isna(other): + lother = other + rother = other + else: + self._check_closed_matches(other, name='other') + lother = other.left + rother = other.right + left = np.where(cond, self.left, lother) + right = np.where(cond, self.right, rother) + return self._shallow_copy(left, right) + def value_counts(self, dropna=True): """ Returns a Series containing counts of each interval. diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 4d466ef7281b7..0dff368fcd5f0 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -4,6 +4,7 @@ import numpy as np +from pandas._libs import lib from pandas._libs.tslibs import NaT, iNaT, period as libperiod from pandas._libs.tslibs.fields import isleapyear_arr from pandas._libs.tslibs.period import ( @@ -241,6 +242,11 @@ def _generate_range(cls, start, end, periods, freq, fields): return subarr, freq + def _check_compatible_with(self, other): + if self.freqstr != other.freqstr: + msg = DIFFERENT_FREQ_INDEX.format(self.freqstr, other.freqstr) + raise IncompatibleFrequency(msg) + # -------------------------------------------------------------------- # Data / Attributes @@ -341,6 +347,22 @@ def to_timestamp(self, freq=None, how='start'): # -------------------------------------------------------------------- # Array-like / EA-Interface Methods + def where(self, cond, other): + # TODO(DatetimeArray): move to DatetimeLikeArrayMixin + # n.b. _ndarray_values candidate. + i8 = self.asi8 + if lib.is_scalar(other): + if isna(other): + other = iNaT + elif isinstance(other, Period): + self._check_compatible_with(other) + other = other.ordinal + elif isinstance(other, type(self)): + self._check_compatible_with(other) + other = other.asi8 + result = np.where(cond, i8, other) + return type(self)._simple_new(result, dtype=self.dtype) + def _formatter(self, boxed=False): if boxed: return str diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index 134466d769ada..3897b4efc480b 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -1063,6 +1063,20 @@ def take(self, indices, allow_fill=False, fill_value=None): return type(self)(result, fill_value=self.fill_value, kind=self.kind, **kwargs) + def where(self, cond, other): + if is_scalar(other): + result_dtype = np.result_type(self.dtype.subtype, other) + elif isinstance(other, type(self)): + result_dtype = np.result_type(self.dtype.subtype, + other.dtype.subtype) + else: + result_dtype = np.result_type(self.dtype.subtype, other.dtype) + + dtype = self.dtype.update_dtype(result_dtype) + # TODO: avoid converting to dense. + values = np.where(cond, self, other) + return type(self)(values, dtype=dtype) + def _take_with_fill(self, indices, fill_value=None): if fill_value is None: fill_value = self.dtype.na_value diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index aa81e88abf28e..e271e11398678 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -26,6 +26,11 @@ class _DtypeOpsMixin(object): na_value = np.nan _metadata = () + @property + def _ndarray_na_value(self): + """Private method internal to pandas""" + raise AbstractMethodError(self) + def __eq__(self, other): """Check whether 'other' is equal to self. diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 6d26894514a9c..94f932d5e8123 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -501,11 +501,7 @@ def _can_reindex(self, indexer): @Appender(_index_shared_docs['where']) def where(self, cond, other=None): - if other is None: - other = self._na_value - values = np.where(cond, self.values, other) - - cat = Categorical(values, dtype=self.dtype) + cat = self.values.where(cond, other=other) return self._shallow_copy(cat, **self._get_attributes_dict()) def reindex(self, target, method=None, level=None, limit=None, diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 198e832ca4603..40952c0ae0688 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -28,7 +28,8 @@ from pandas.core.dtypes.dtypes import ( CategoricalDtype, DatetimeTZDtype, ExtensionDtype, PandasExtensionDtype) from pandas.core.dtypes.generic import ( - ABCDatetimeIndex, ABCExtensionArray, ABCIndexClass, ABCSeries) + ABCDataFrame, ABCDatetimeIndex, ABCExtensionArray, ABCIndexClass, + ABCSeries) from pandas.core.dtypes.missing import ( _isna_compat, array_equivalent, is_null_datelike_scalar, isna, notna) @@ -1967,6 +1968,30 @@ def shift(self, periods, axis=0): placement=self.mgr_locs, ndim=self.ndim)] + def where(self, other, cond, align=True, errors='raise', + try_cast=False, axis=0, transpose=False): + if isinstance(other, (ABCIndexClass, ABCSeries)): + other = other.array + + if isinstance(cond, ABCDataFrame): + assert cond.shape[1] == 1 + cond = cond.iloc[:, 0].array + + if isinstance(other, ABCDataFrame): + assert other.shape[1] == 1 + other = other.iloc[:, 0].array + + if isinstance(cond, (ABCIndexClass, ABCSeries)): + cond = cond.array + + if lib.is_scalar(other) and isna(other): + # The default `other` for Series / Frame is np.nan + # we want to replace that with the correct NA value + # for the type + other = self.dtype.na_value + result = self.values.where(cond, other) + return self.make_block_same_class(result, placement=self.mgr_locs) + @property def _ftype(self): return getattr(self.values, '_pandas_ftype', Block._ftype) diff --git a/pandas/tests/arrays/categorical/test_indexing.py b/pandas/tests/arrays/categorical/test_indexing.py index 8df5728f7d895..73a09b9a67e71 100644 --- a/pandas/tests/arrays/categorical/test_indexing.py +++ b/pandas/tests/arrays/categorical/test_indexing.py @@ -122,6 +122,32 @@ def test_get_indexer_non_unique(self, idx_values, key_values, key_class): tm.assert_numpy_array_equal(expected, result) tm.assert_numpy_array_equal(exp_miss, res_miss) + def test_where_raises(self): + arr = Categorical(['a', 'b', 'c']) + with pytest.raises(ValueError, match="The value 'd'"): + arr.where([True, False, True], 'd') + + def test_where_unobserved_categories(self): + arr = Categorical(['a', 'b', 'c'], categories=['d', 'c', 'b', 'a']) + result = arr.where([True, True, False], other='b') + expected = Categorical(['a', 'b', 'b'], categories=arr.categories) + tm.assert_categorical_equal(result, expected) + + def test_where_other_categorical(self): + arr = Categorical(['a', 'b', 'c'], categories=['d', 'c', 'b', 'a']) + other = Categorical(['b', 'c', 'a'], categories=['a', 'c', 'b', 'd']) + result = arr.where([True, False, True], other) + expected = Categorical(['a', 'c', 'c'], dtype=arr.dtype) + tm.assert_categorical_equal(result, expected) + + def test_where_ordered_differs_rasies(self): + arr = Categorical(['a', 'b', 'c'], categories=['d', 'c', 'b', 'a'], + ordered=True) + other = Categorical(['b', 'c', 'a'], categories=['a', 'c', 'b', 'd'], + ordered=True) + with pytest.raises(TypeError, match="The type of"): + arr.where([True, False, True], other) + @pytest.mark.parametrize("index", [True, False]) def test_mask_with_boolean(index): diff --git a/pandas/tests/arrays/interval/test_interval.py b/pandas/tests/arrays/interval/test_interval.py index a04579dbbb6b1..1bc8f7087e54e 100644 --- a/pandas/tests/arrays/interval/test_interval.py +++ b/pandas/tests/arrays/interval/test_interval.py @@ -2,7 +2,7 @@ import numpy as np import pytest -from pandas import Index, IntervalIndex, date_range, timedelta_range +from pandas import Index, Interval, IntervalIndex, date_range, timedelta_range from pandas.core.arrays import IntervalArray import pandas.util.testing as tm @@ -50,6 +50,16 @@ def test_set_closed(self, closed, new_closed): expected = IntervalArray.from_breaks(range(10), closed=new_closed) tm.assert_extension_array_equal(result, expected) + @pytest.mark.parametrize('other', [ + Interval(0, 1, closed='right'), + IntervalArray.from_breaks([1, 2, 3, 4], closed='right'), + ]) + def test_where_raises(self, other): + arr = IntervalArray.from_breaks([1, 2, 3, 4], closed='left') + match = "'other.closed' is 'right', expected 'left'." + with pytest.raises(ValueError, match=match): + arr.where([True, False, True], other=other) + class TestSetitem(object): diff --git a/pandas/tests/arrays/test_period.py b/pandas/tests/arrays/test_period.py index bf139bb0ce616..f439a268d08ed 100644 --- a/pandas/tests/arrays/test_period.py +++ b/pandas/tests/arrays/test_period.py @@ -197,6 +197,21 @@ def test_sub_period(): arr - other +# ---------------------------------------------------------------------------- +# Methods + +@pytest.mark.parametrize('other', [ + pd.Period('2000', freq='H'), + period_array(['2000', '2001', '2000'], freq='H') +]) +def test_where_different_freq_raises(other): + arr = period_array(['2000', '2001', '2002'], freq='D') + cond = np.array([True, False, True]) + with pytest.raises(IncompatibleFrequency, + match="Input has different freq=H"): + arr.where(cond, other) + + # ---------------------------------------------------------------------------- # Printing diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index e9a89c1af2f22..c3654ffbd64dc 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -198,3 +198,40 @@ def test_hash_pandas_object_works(self, data, as_frame): a = pd.util.hash_pandas_object(data) b = pd.util.hash_pandas_object(data) self.assert_equal(a, b) + + @pytest.mark.parametrize("as_frame", [True, False]) + def test_where_series(self, data, na_value, as_frame): + assert data[0] != data[1] + cls = type(data) + a, b = data[:2] + + ser = pd.Series(cls._from_sequence([a, a, b, b], dtype=data.dtype)) + cond = np.array([True, True, False, False]) + + if as_frame: + ser = ser.to_frame(name='a') + # TODO: alignment is broken for ndarray `cond` + cond = pd.DataFrame({"a": cond}) + + result = ser.where(cond) + expected = pd.Series(cls._from_sequence([a, a, na_value, na_value], + dtype=data.dtype)) + + if as_frame: + expected = expected.to_frame(name='a') + self.assert_equal(result, expected) + + # array other + cond = np.array([True, False, True, True]) + other = cls._from_sequence([a, b, a, b], dtype=data.dtype) + if as_frame: + # TODO: alignment is broken for ndarray `cond` + other = pd.DataFrame({"a": other}) + # TODO: alignment is broken for array `other` + cond = pd.DataFrame({"a": cond}) + result = ser.where(cond, other) + expected = pd.Series(cls._from_sequence([a, b, b, b], + dtype=data.dtype)) + if as_frame: + expected = expected.to_frame(name='a') + self.assert_equal(result, expected) diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index a941b562fe1ec..4571f3f6d4040 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -221,6 +221,13 @@ def test_combine_add(self, data_repeated): def test_hash_pandas_object_works(self, data, kind): super().test_hash_pandas_object_works(data, kind) + @pytest.mark.skip(reason="broadcasting error") + def test_where_series(self, data, na_value): + # Fails with + # *** ValueError: operands could not be broadcast together + # with shapes (4,) (4,) (0,) + super().test_where_series(data, na_value) + class TestCasting(BaseJSON, base.BaseCastingTests): @pytest.mark.skip(reason="failing on np.array(self, dtype=str)") diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 891e5f4dd9a95..75327a8b9affe 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -255,6 +255,28 @@ def test_fillna_copy_series(self, data_missing): def test_fillna_length_mismatch(self, data_missing): pass + def test_where_series(self, data, na_value): + assert data[0] != data[1] + cls = type(data) + a, b = data[:2] + + ser = pd.Series(cls._from_sequence([a, a, b, b], dtype=data.dtype)) + + cond = np.array([True, True, False, False]) + result = ser.where(cond) + # new_dtype is the only difference + new_dtype = SparseDtype('float64', 0.0) + expected = pd.Series(cls._from_sequence([a, a, na_value, na_value], + dtype=new_dtype)) + self.assert_series_equal(result, expected) + + other = cls._from_sequence([a, b, a, b]) + cond = np.array([True, False, True, True]) + result = ser.where(cond, other) + expected = pd.Series(cls._from_sequence([a, b, b, b], + dtype=data.dtype)) + self.assert_series_equal(result, expected) + class TestCasting(BaseSparseTests, base.BaseCastingTests): pass From 56470c31a71e5f416ab3b9af21347781209432cb Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 5 Dec 2018 11:39:48 -0600 Subject: [PATCH 02/19] Fixups: * Ensure data generated OK. * Remove erroneous comments about alignment. That was user error. --- pandas/core/arrays/interval.py | 3 +-- pandas/tests/extension/base/methods.py | 5 +---- pandas/tests/extension/conftest.py | 6 +++++- pandas/tests/extension/test_categorical.py | 7 ++++++- pandas/tests/extension/test_sparse.py | 2 ++ 5 files changed, 15 insertions(+), 8 deletions(-) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index fd35f01641750..ce209f71aca2f 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -779,8 +779,7 @@ def take(self, indices, allow_fill=False, fill_value=None, axis=None, def where(self, cond, other): if is_scalar(other) and isna(other): - lother = other - rother = other + lother = rother = other else: self._check_closed_matches(other, name='other') lother = other.left diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index c3654ffbd64dc..9820b421ce9cd 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -210,8 +210,7 @@ def test_where_series(self, data, na_value, as_frame): if as_frame: ser = ser.to_frame(name='a') - # TODO: alignment is broken for ndarray `cond` - cond = pd.DataFrame({"a": cond}) + cond = cond.reshape(-1, 1) result = ser.where(cond) expected = pd.Series(cls._from_sequence([a, a, na_value, na_value], @@ -225,9 +224,7 @@ def test_where_series(self, data, na_value, as_frame): cond = np.array([True, False, True, True]) other = cls._from_sequence([a, b, a, b], dtype=data.dtype) if as_frame: - # TODO: alignment is broken for ndarray `cond` other = pd.DataFrame({"a": other}) - # TODO: alignment is broken for array `other` cond = pd.DataFrame({"a": cond}) result = ser.where(cond, other) expected = pd.Series(cls._from_sequence([a, b, b, b], diff --git a/pandas/tests/extension/conftest.py b/pandas/tests/extension/conftest.py index 7758bd01840ae..5349dd919f2a2 100644 --- a/pandas/tests/extension/conftest.py +++ b/pandas/tests/extension/conftest.py @@ -11,7 +11,11 @@ def dtype(): @pytest.fixture def data(): - """Length-100 array for this type.""" + """Length-100 array for this type. + + * data[0] and data[1] should both be non missing + * data[0] and data[1] should not gbe equal + """ raise NotImplementedError diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 5b873b337880e..ce9b2f2435231 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -25,7 +25,12 @@ def make_data(): - return np.random.choice(list(string.ascii_letters), size=100) + while True: + values = np.random.choice(list(string.ascii_letters), size=100) + # ensure we meet the requirement + if values[0] != values[1]: + break + return values @pytest.fixture diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 75327a8b9affe..4bc49e8b1b8c0 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -13,6 +13,8 @@ def make_data(fill_value): data = np.random.uniform(size=100) else: data = np.random.randint(1, 100, size=100) + if data[0] == data[1]: + data[0] += 1 data[2::3] = fill_value return data From 6f79282127d923deaa30ef0bcb703e5d89d243a7 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 5 Dec 2018 12:45:54 -0600 Subject: [PATCH 03/19] 32-bit compat --- pandas/tests/extension/test_sparse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 4bc49e8b1b8c0..aafc192bf8497 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -267,7 +267,7 @@ def test_where_series(self, data, na_value): cond = np.array([True, True, False, False]) result = ser.where(cond) # new_dtype is the only difference - new_dtype = SparseDtype('float64', 0.0) + new_dtype = SparseDtype('float', 0.0) expected = pd.Series(cls._from_sequence([a, a, na_value, na_value], dtype=new_dtype)) self.assert_series_equal(result, expected) From a69dbb3c31cf6007cae3be629855f381703eafe4 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 5 Dec 2018 15:49:17 -0600 Subject: [PATCH 04/19] warn for categorical --- pandas/core/arrays/categorical.py | 21 ++++++++++++++++--- pandas/core/internals/blocks.py | 11 ++++++++++ .../tests/arrays/categorical/test_indexing.py | 20 +++++++++++------- 3 files changed, 42 insertions(+), 10 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 76d956861a9b6..3e6a8368949d1 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1,5 +1,6 @@ # pylint: disable=E1101,W0232 +import reprlib import textwrap from warnings import warn @@ -1909,6 +1910,15 @@ def take_nd(self, indexer, allow_fill=None, fill_value=None): def where(self, cond, other): # n.b. this now preserves the type codes = self._codes + object_msg = ( + "Implicitly converting categorical to object-dtype ndarray. " + "The values `{}' are not present in this categorical's " + "categories. A future version of pandas will raise a ValueError " + "when 'other' contains different categories.\n\n" + "To preserve the current behavior, add the new categories to " + "the categorical before calling 'where', or convert the " + "categorical to a different dtype." + ) if is_scalar(other) and isna(other): other = -1 @@ -1916,13 +1926,18 @@ def where(self, cond, other): item = self.categories.get_indexer([other]).item() if item == -1: - raise ValueError("The value '{}' is not present in " - "this Categorical's categories".format(other)) + # note: when removing this, also remove CategoricalBlock.where + warn(object_msg.format(other), FutureWarning, stacklevel=2) + return np.where(cond, self, other) + other = item elif is_categorical_dtype(other): if not is_dtype_equal(self, other): - raise TypeError("The type of 'other' does not match.") + extra = list(other.categories.difference(self.categories)) + warn(object_msg.format(reprlib.repr(extra)), FutureWarning, + stacklevel=2) + return np.where(cond, self, other) other = _get_codes_for_values(other, self.categories) # get the codes from other that match our categories pass diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 40952c0ae0688..2e297d1ccbd57 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2671,6 +2671,17 @@ def concat_same_type(self, to_concat, placement=None): values, placement=placement or slice(0, len(values), 1), ndim=self.ndim) + def where(self, other, cond, align=True, errors='raise', + try_cast=False, axis=0, transpose=False): + result = super(CategoricalBlock, self).where( + other, cond, align, errors, try_cast, axis, transpose + ) + if result.values.dtype != self.values.dtype: + # For backwards compatability, we allow upcasting to object. + # This fallback will be removed in the future. + result = result.astype(object) + return result + class DatetimeBlock(DatetimeLikeBlockMixin, Block): __slots__ = () diff --git a/pandas/tests/arrays/categorical/test_indexing.py b/pandas/tests/arrays/categorical/test_indexing.py index 73a09b9a67e71..2ef91ad2426be 100644 --- a/pandas/tests/arrays/categorical/test_indexing.py +++ b/pandas/tests/arrays/categorical/test_indexing.py @@ -122,11 +122,6 @@ def test_get_indexer_non_unique(self, idx_values, key_values, key_class): tm.assert_numpy_array_equal(expected, result) tm.assert_numpy_array_equal(exp_miss, res_miss) - def test_where_raises(self): - arr = Categorical(['a', 'b', 'c']) - with pytest.raises(ValueError, match="The value 'd'"): - arr.where([True, False, True], 'd') - def test_where_unobserved_categories(self): arr = Categorical(['a', 'b', 'c'], categories=['d', 'c', 'b', 'a']) result = arr.where([True, True, False], other='b') @@ -140,13 +135,24 @@ def test_where_other_categorical(self): expected = Categorical(['a', 'c', 'c'], dtype=arr.dtype) tm.assert_categorical_equal(result, expected) + def test_where_warns(self): + arr = Categorical(['a', 'b', 'c']) + with tm.assert_produces_warning(FutureWarning): + result = arr.where([True, False, True], 'd') + + expected = np.array(['a', 'd', 'c'], dtype='object') + tm.assert_numpy_array_equal(result, expected) + def test_where_ordered_differs_rasies(self): arr = Categorical(['a', 'b', 'c'], categories=['d', 'c', 'b', 'a'], ordered=True) other = Categorical(['b', 'c', 'a'], categories=['a', 'c', 'b', 'd'], ordered=True) - with pytest.raises(TypeError, match="The type of"): - arr.where([True, False, True], other) + with tm.assert_produces_warning(FutureWarning): + result = arr.where([True, False, True], other) + + expected = np.array(['a', 'c', 'c'], dtype=object) + tm.assert_numpy_array_equal(result, expected) @pytest.mark.parametrize("index", [True, False]) From 911a2daf74d20290965b80159793911f2c2d6249 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 5 Dec 2018 15:55:24 -0600 Subject: [PATCH 05/19] debug 32-bit issue --- pandas/tests/extension/test_sparse.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index aafc192bf8497..764d58c263933 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -10,9 +10,9 @@ def make_data(fill_value): if np.isnan(fill_value): - data = np.random.uniform(size=100) + data = np.random.uniform(size=100).astype('float64') else: - data = np.random.randint(1, 100, size=100) + data = np.random.randint(1, 100, size=100, dtype='int64') if data[0] == data[1]: data[0] += 1 From badb5be34672888fcb1267ad4254b14ebb94d447 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 6 Dec 2018 06:21:44 -0600 Subject: [PATCH 06/19] compat, revert --- pandas/compat/__init__.py | 3 +++ pandas/core/arrays/categorical.py | 4 ++-- pandas/tests/extension/test_sparse.py | 4 ++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index f9c659106a516..f3748acfe6b1b 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -116,6 +116,7 @@ def get_range_parameters(data): reduce = functools.reduce long = int unichr = chr + import reprlib # This was introduced in Python 3.3, but we don't support # Python 3.x < 3.5, so checking PY3 is safe. @@ -271,6 +272,7 @@ class to receive bound method class_types = type, text_type = str binary_type = bytes + import reprlib def u(s): return s @@ -323,6 +325,7 @@ def set_function_name(f, name, cls): class_types = (type, types.ClassType) text_type = unicode binary_type = str + import repr as reprlib def u(s): return unicode(s, "unicode_escape") diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 3e6a8368949d1..e336fc0d4a82e 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1,6 +1,5 @@ # pylint: disable=E1101,W0232 -import reprlib import textwrap from warnings import warn @@ -1935,7 +1934,8 @@ def where(self, cond, other): elif is_categorical_dtype(other): if not is_dtype_equal(self, other): extra = list(other.categories.difference(self.categories)) - warn(object_msg.format(reprlib.repr(extra)), FutureWarning, + warn(object_msg.format(compat.reprlib.repr(extra)), + FutureWarning, stacklevel=2) return np.where(cond, self, other) other = _get_codes_for_values(other, self.categories) diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 764d58c263933..aafc192bf8497 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -10,9 +10,9 @@ def make_data(fill_value): if np.isnan(fill_value): - data = np.random.uniform(size=100).astype('float64') + data = np.random.uniform(size=100) else: - data = np.random.randint(1, 100, size=100, dtype='int64') + data = np.random.randint(1, 100, size=100) if data[0] == data[1]: data[0] += 1 From edff47ee7f84aa86849c7ed3fc39969319ed7ea4 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 6 Dec 2018 08:15:21 -0600 Subject: [PATCH 07/19] 32-bit compat --- pandas/tests/extension/test_sparse.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index aafc192bf8497..10eccea64069f 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -266,13 +266,13 @@ def test_where_series(self, data, na_value): cond = np.array([True, True, False, False]) result = ser.where(cond) - # new_dtype is the only difference + new_dtype = SparseDtype('float', 0.0) expected = pd.Series(cls._from_sequence([a, a, na_value, na_value], dtype=new_dtype)) self.assert_series_equal(result, expected) - other = cls._from_sequence([a, b, a, b]) + other = cls._from_sequence([a, b, a, b], dtype=data.dtype) cond = np.array([True, False, True, True]) result = ser.where(cond, other) expected = pd.Series(cls._from_sequence([a, b, b, b], From d90f3849b360b6fe1680a856c8f918226fc16833 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 6 Dec 2018 09:17:43 -0600 Subject: [PATCH 08/19] deprecation note for categorical --- doc/source/whatsnew/v0.24.0.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index b25086e530173..c5b3e0602932d 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1137,6 +1137,8 @@ Deprecations - :func:`pandas.types.is_datetimetz` is deprecated in favor of `pandas.types.is_datetime64tz` (:issue:`23917`) - Creating a :class:`TimedeltaIndex` or :class:`DatetimeIndex` by passing range arguments `start`, `end`, and `periods` is deprecated in favor of :func:`timedelta_range` and :func:`date_range` (:issue:`23919`) - Passing a string alias like ``'datetime64[ns, UTC]'`` as the `unit` parameter to :class:`DatetimeTZDtype` is deprecated. Use :class:`DatetimeTZDtype.construct_from_string` instead (:issue:`23990`). +- In :meth:`Series.where` with Categorical data, providing an ``other`` that is not present in the categories is deprecated. Convert the categorical to a different dtype first, or add the ``other`` to the categories first (:issue:`24077`). + .. _whatsnew_0240.deprecations.datetimelike_int_ops: From 5e144149ef4dba21df299cf65ed4e4641ee53cdc Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 6 Dec 2018 09:18:54 -0600 Subject: [PATCH 09/19] where versionadded --- pandas/core/arrays/base.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 294c5e99d66f4..3eb101b0267b2 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -666,6 +666,8 @@ def where(self, cond, other): """ Replace values where the condition is False. + .. versionadded :: 0.24.0 + Parameters ---------- cond : ndarray or ExtensionArray From 033ac9ceb00bf5212bec971ca76092584579b89b Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 7 Dec 2018 06:30:18 -0600 Subject: [PATCH 10/19] Setitem-based where --- doc/source/whatsnew/v0.24.0.rst | 1 + pandas/core/arrays/base.py | 36 ------- pandas/core/arrays/categorical.py | 55 +++-------- pandas/core/arrays/interval.py | 11 --- pandas/core/arrays/period.py | 17 ---- pandas/core/arrays/sparse.py | 19 ---- pandas/core/internals/blocks.py | 83 ++++++++++++++-- .../tests/arrays/categorical/test_indexing.py | 96 +++++++++++++++---- pandas/tests/arrays/interval/test_interval.py | 8 +- pandas/tests/arrays/test_period.py | 4 +- 10 files changed, 174 insertions(+), 156 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index debd855751b56..1f906247e6d59 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1310,6 +1310,7 @@ Datetimelike - Bug in :class:`DatetimeIndex` where calling ``np.array(dtindex, dtype=object)`` would incorrectly return an array of ``long`` objects (:issue:`23524`) - Bug in :class:`Index` where passing a timezone-aware :class:`DatetimeIndex` and `dtype=object` would incorrectly raise a ``ValueError`` (:issue:`23524`) - Bug in :class:`Index` where calling ``np.array(dtindex, dtype=object)`` on a timezone-naive :class:`DatetimeIndex` would return an array of ``datetime`` objects instead of :class:`Timestamp` objects, potentially losing nanosecond portions of the timestamps (:issue:`23524`) +- Bug in :class:`Categorical.__setitem__` not allowing setting with another ``Categorical`` when both are undordered and have the same categories, but in a different order (:issue:`24142`) Timedelta ^^^^^^^^^ diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 3eb101b0267b2..6e1e35bb4fb02 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -662,42 +662,6 @@ def take(self, indices, allow_fill=False, fill_value=None): # pandas.api.extensions.take raise AbstractMethodError(self) - def where(self, cond, other): - """ - Replace values where the condition is False. - - .. versionadded :: 0.24.0 - - Parameters - ---------- - cond : ndarray or ExtensionArray - The mask indicating which values should be kept (True) - or replaced from `other` (False). - - other : ndarray, ExtensionArray, or scalar - Entries where `cond` is False are replaced with - corresponding value from `other`. - - Notes - ----- - Note that `cond` and `other` *cannot* be a Series, Index, or callable. - When used from, e.g., :meth:`Series.where`, pandas will unbox - Series and Indexes, and will apply callables before they arrive here. - - Returns - ------- - ExtensionArray - Same dtype as the original. - - See Also - -------- - Series.where : Similar method for Series. - DataFrame.where : Similar method for DataFrame. - """ - return type(self)._from_sequence(np.where(cond, self, other), - dtype=self.dtype, - copy=False) - def copy(self, deep=False): # type: (bool) -> ExtensionArray """ diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index e336fc0d4a82e..f217acb4a7a1e 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1906,49 +1906,6 @@ def take_nd(self, indexer, allow_fill=None, fill_value=None): take = take_nd - def where(self, cond, other): - # n.b. this now preserves the type - codes = self._codes - object_msg = ( - "Implicitly converting categorical to object-dtype ndarray. " - "The values `{}' are not present in this categorical's " - "categories. A future version of pandas will raise a ValueError " - "when 'other' contains different categories.\n\n" - "To preserve the current behavior, add the new categories to " - "the categorical before calling 'where', or convert the " - "categorical to a different dtype." - ) - - if is_scalar(other) and isna(other): - other = -1 - elif is_scalar(other): - item = self.categories.get_indexer([other]).item() - - if item == -1: - # note: when removing this, also remove CategoricalBlock.where - warn(object_msg.format(other), FutureWarning, stacklevel=2) - return np.where(cond, self, other) - - other = item - - elif is_categorical_dtype(other): - if not is_dtype_equal(self, other): - extra = list(other.categories.difference(self.categories)) - warn(object_msg.format(compat.reprlib.repr(extra)), - FutureWarning, - stacklevel=2) - return np.where(cond, self, other) - other = _get_codes_for_values(other, self.categories) - # get the codes from other that match our categories - pass - else: - other = np.where(isna(other), -1, other) - - new_codes = np.where(cond, codes, other) - return type(self).from_codes(new_codes, - categories=self.categories, - ordered=self.ordered) - def _slice(self, slicer): """ Return a slice of myself. @@ -2121,11 +2078,21 @@ def __setitem__(self, key, value): `Categorical` does not have the same categories """ + if isinstance(value, (ABCIndexClass, ABCSeries)): + value = value.array + # require identical categories set if isinstance(value, Categorical): - if not value.categories.equals(self.categories): + if not is_dtype_equal(self, value): raise ValueError("Cannot set a Categorical with another, " "without identical categories") + if not self.categories.equals(value.categories): + new_codes = _recode_for_categories( + value.codes, value.categories, self.categories + ) + value = Categorical.from_codes(new_codes, + categories=self.categories, + ordered=self.ordered) rvalue = value if is_list_like(value) else [value] diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index ce209f71aca2f..785fb02c4d95d 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -777,17 +777,6 @@ def take(self, indices, allow_fill=False, fill_value=None, axis=None, return self._shallow_copy(left_take, right_take) - def where(self, cond, other): - if is_scalar(other) and isna(other): - lother = rother = other - else: - self._check_closed_matches(other, name='other') - lother = other.left - rother = other.right - left = np.where(cond, self.left, lother) - right = np.where(cond, self.right, rother) - return self._shallow_copy(left, right) - def value_counts(self, dropna=True): """ Returns a Series containing counts of each interval. diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 5506711594f88..c1f9d3b946171 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -4,7 +4,6 @@ import numpy as np -from pandas._libs import lib from pandas._libs.tslibs import NaT, iNaT, period as libperiod from pandas._libs.tslibs.fields import isleapyear_arr from pandas._libs.tslibs.period import ( @@ -347,22 +346,6 @@ def to_timestamp(self, freq=None, how='start'): # -------------------------------------------------------------------- # Array-like / EA-Interface Methods - def where(self, cond, other): - # TODO(DatetimeArray): move to DatetimeLikeArrayMixin - # n.b. _ndarray_values candidate. - i8 = self.asi8 - if lib.is_scalar(other): - if isna(other): - other = iNaT - elif isinstance(other, Period): - self._check_compatible_with(other) - other = other.ordinal - elif isinstance(other, type(self)): - self._check_compatible_with(other) - other = other.asi8 - result = np.where(cond, i8, other) - return type(self)._simple_new(result, dtype=self.dtype) - def _formatter(self, boxed=False): if boxed: return str diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index 3897b4efc480b..1788bdd80153b 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -704,11 +704,6 @@ def __array__(self, dtype=None, copy=True): out[self.sp_index.to_int_index().indices] = self.sp_values return out - def __setitem__(self, key, value): - # I suppose we could allow setting of non-fill_value elements. - msg = "SparseArray does not support item assignment via setitem" - raise TypeError(msg) - @classmethod def _from_sequence(cls, scalars, dtype=None, copy=False): return cls(scalars, dtype=dtype) @@ -1063,20 +1058,6 @@ def take(self, indices, allow_fill=False, fill_value=None): return type(self)(result, fill_value=self.fill_value, kind=self.kind, **kwargs) - def where(self, cond, other): - if is_scalar(other): - result_dtype = np.result_type(self.dtype.subtype, other) - elif isinstance(other, type(self)): - result_dtype = np.result_type(self.dtype.subtype, - other.dtype.subtype) - else: - result_dtype = np.result_type(self.dtype.subtype, other.dtype) - - dtype = self.dtype.update_dtype(result_dtype) - # TODO: avoid converting to dense. - values = np.where(cond, self, other) - return type(self)(values, dtype=dtype) - def _take_with_fill(self, indices, fill_value=None): if fill_value is None: fill_value = self.dtype.na_value diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index c6ff7c9f0dca9..5f435a0455657 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1970,6 +1970,7 @@ def shift(self, periods, axis=0): def where(self, other, cond, align=True, errors='raise', try_cast=False, axis=0, transpose=False): + # rough attempt to see if if isinstance(other, (ABCIndexClass, ABCSeries)): other = other.array @@ -1989,7 +1990,33 @@ def where(self, other, cond, align=True, errors='raise', # we want to replace that with the correct NA value # for the type other = self.dtype.na_value - result = self.values.where(cond, other) + + if is_sparse(self.values): + # ugly workaround for ensure that the dtype is OK + # after we insert NaNs. + if is_sparse(other): + otype = other.dtype.subtype + else: + otype = other + dtype = self.dtype.update_dtype( + np.result_type(self.values.dtype.subtype, otype) + ) + else: + dtype = self.dtype + + if self._holder.__setitem__ is ExtensionArray.__setitem__: + # the array doesn't implement setitem, so convert to ndarray + result = self._holder._from_sequence( + np.where(cond, self.values, other), + dtype=dtype, + ) + else: + result = self.values.copy() + icond = ~cond + if lib.is_scalar(other): + result[icond] = other + else: + result[icond] = other[icond] return self.make_block_same_class(result, placement=self.mgr_locs) @property @@ -2673,13 +2700,55 @@ def concat_same_type(self, to_concat, placement=None): def where(self, other, cond, align=True, errors='raise', try_cast=False, axis=0, transpose=False): - result = super(CategoricalBlock, self).where( - other, cond, align, errors, try_cast, axis, transpose + # This can all be deleted in favor of ExtensionBlock.where once + # we enforce the deprecation. + object_msg = ( + "Implicitly converting categorical to object-dtype ndarray. " + "The values `{}' are not present in this categorical's " + "categories. A future version of pandas will raise a ValueError " + "when 'other' contains different categories.\n\n" + "To preserve the current behavior, add the new categories to " + "the categorical before calling 'where', or convert the " + "categorical to a different dtype." ) - if result.values.dtype != self.values.dtype: - # For backwards compatability, we allow upcasting to object. - # This fallback will be removed in the future. - result = result.astype(object) + + scalar_other = lib.is_scalar(other) + categorical_other = is_categorical_dtype(other) + if isinstance(other, ABCDataFrame): + # should be 1d + assert other.shape[1] == 1 + other = other.iloc[:, 0] + + if isinstance(other, (ABCSeries, ABCIndexClass)): + other = other._values + + do_as_object = ( + # Two categoricals with different dtype (ignoring order) + (categorical_other and not is_dtype_equal(self.values, other)) or + # a not-na scalar not present in our categories + (scalar_other and (other not in self.values.categories + and notna(other))) or + # an array not present in our categories + (not scalar_other and + (self.values.categories.get_indexer( + other[notna(other)]) < 0).any()) + ) + + if do_as_object: + if scalar_other: + msg = object_msg.format(other) + else: + msg = compat.reprlib.repr(other) + + warnings.warn(msg, FutureWarning, stacklevel=6) + result = self.astype(object).where(other, cond, align=align, + errors=errors, + try_cast=try_cast, + axis=axis, transpose=transpose) + else: + result = super(CategoricalBlock, self).where( + other, cond, align, errors, try_cast, axis, transpose + ) return result diff --git a/pandas/tests/arrays/categorical/test_indexing.py b/pandas/tests/arrays/categorical/test_indexing.py index 2ef91ad2426be..44b4589d5a663 100644 --- a/pandas/tests/arrays/categorical/test_indexing.py +++ b/pandas/tests/arrays/categorical/test_indexing.py @@ -3,6 +3,7 @@ import numpy as np import pytest +import pandas as pd from pandas import Categorical, CategoricalIndex, Index, PeriodIndex, Series import pandas.core.common as com from pandas.tests.arrays.categorical.common import TestCategorical @@ -43,6 +44,45 @@ def test_setitem(self): tm.assert_categorical_equal(c, expected) + @pytest.mark.parametrize('other', [ + pd.Categorical(['b', 'a']), + pd.Categorical(['b', 'a'], categories=['b', 'a']), + ]) + def test_setitem_same_but_unordered(self, other): + # GH-24142 + target = pd.Categorical(['a', 'b'], categories=['a', 'b']) + mask = np.array([True, False]) + target[mask] = other[mask] + expected = pd.Categorical(['b', 'b'], categories=['a', 'b']) + tm.assert_categorical_equal(target, expected) + + @pytest.mark.parametrize('other', [ + pd.Categorical(['b', 'a'], categories=['b', 'a', 'c']), + pd.Categorical(['b', 'a'], categories=['a', 'b', 'c']), + pd.Categorical(['a', 'a'], categories=['a']), + pd.Categorical(['b', 'b'], categories=['b']), + ]) + def test_setitem_different_unordered_raises(self, other): + # GH-24142 + target = pd.Categorical(['a', 'b'], categories=['a', 'b']) + mask = np.array([True, False]) + with pytest.raises(ValueError): + target[mask] = other[mask] + + @pytest.mark.parametrize('other', [ + pd.Categorical(['b', 'a']), + pd.Categorical(['b', 'a'], categories=['b', 'a'], ordered=True), + pd.Categorical(['b', 'a'], categories=['a', 'b', 'c'], ordered=True), + ]) + def test_setitem_same_ordered_rasies(self, other): + # Gh-24142 + target = pd.Categorical(['a', 'b'], categories=['a', 'b'], + ordered=True) + mask = np.array([True, False]) + + with pytest.raises(ValueError): + target[mask] = other[mask] + class TestCategoricalIndexing(object): @@ -122,37 +162,59 @@ def test_get_indexer_non_unique(self, idx_values, key_values, key_class): tm.assert_numpy_array_equal(expected, result) tm.assert_numpy_array_equal(exp_miss, res_miss) + def test_where_unobserved_nan(self): + ser = pd.Series(pd.Categorical(['a', 'b'])) + result = ser.where([True, False]) + expected = pd.Series(pd.Categorical(['a', None], + categories=['a', 'b'])) + tm.assert_series_equal(result, expected) + + # all NA + ser = pd.Series(pd.Categorical(['a', 'b'])) + result = ser.where([False, False]) + expected = pd.Series(pd.Categorical([None, None], + categories=['a', 'b'])) + tm.assert_series_equal(result, expected) + def test_where_unobserved_categories(self): - arr = Categorical(['a', 'b', 'c'], categories=['d', 'c', 'b', 'a']) - result = arr.where([True, True, False], other='b') - expected = Categorical(['a', 'b', 'b'], categories=arr.categories) - tm.assert_categorical_equal(result, expected) + ser = pd.Series( + Categorical(['a', 'b', 'c'], categories=['d', 'c', 'b', 'a']) + ) + result = ser.where([True, True, False], other='b') + expected = pd.Series( + Categorical(['a', 'b', 'b'], categories=ser.cat.categories) + ) + tm.assert_series_equal(result, expected) def test_where_other_categorical(self): - arr = Categorical(['a', 'b', 'c'], categories=['d', 'c', 'b', 'a']) + ser = pd.Series( + Categorical(['a', 'b', 'c'], categories=['d', 'c', 'b', 'a']) + ) other = Categorical(['b', 'c', 'a'], categories=['a', 'c', 'b', 'd']) - result = arr.where([True, False, True], other) - expected = Categorical(['a', 'c', 'c'], dtype=arr.dtype) - tm.assert_categorical_equal(result, expected) + result = ser.where([True, False, True], other) + expected = pd.Series(Categorical(['a', 'c', 'c'], dtype=ser.dtype)) + tm.assert_series_equal(result, expected) def test_where_warns(self): - arr = Categorical(['a', 'b', 'c']) + ser = pd.Series(Categorical(['a', 'b', 'c'])) with tm.assert_produces_warning(FutureWarning): - result = arr.where([True, False, True], 'd') + result = ser.where([True, False, True], 'd') - expected = np.array(['a', 'd', 'c'], dtype='object') - tm.assert_numpy_array_equal(result, expected) + expected = pd.Series(np.array(['a', 'd', 'c'], dtype='object')) + tm.assert_series_equal(result, expected) def test_where_ordered_differs_rasies(self): - arr = Categorical(['a', 'b', 'c'], categories=['d', 'c', 'b', 'a'], - ordered=True) + ser = pd.Series( + Categorical(['a', 'b', 'c'], categories=['d', 'c', 'b', 'a'], + ordered=True) + ) other = Categorical(['b', 'c', 'a'], categories=['a', 'c', 'b', 'd'], ordered=True) with tm.assert_produces_warning(FutureWarning): - result = arr.where([True, False, True], other) + result = ser.where([True, False, True], other) - expected = np.array(['a', 'c', 'c'], dtype=object) - tm.assert_numpy_array_equal(result, expected) + expected = pd.Series(np.array(['a', 'c', 'c'], dtype=object)) + tm.assert_series_equal(result, expected) @pytest.mark.parametrize("index", [True, False]) diff --git a/pandas/tests/arrays/interval/test_interval.py b/pandas/tests/arrays/interval/test_interval.py index 1bc8f7087e54e..9604010571294 100644 --- a/pandas/tests/arrays/interval/test_interval.py +++ b/pandas/tests/arrays/interval/test_interval.py @@ -2,6 +2,7 @@ import numpy as np import pytest +import pandas as pd from pandas import Index, Interval, IntervalIndex, date_range, timedelta_range from pandas.core.arrays import IntervalArray import pandas.util.testing as tm @@ -55,10 +56,11 @@ def test_set_closed(self, closed, new_closed): IntervalArray.from_breaks([1, 2, 3, 4], closed='right'), ]) def test_where_raises(self, other): - arr = IntervalArray.from_breaks([1, 2, 3, 4], closed='left') - match = "'other.closed' is 'right', expected 'left'." + ser = pd.Series(IntervalArray.from_breaks([1, 2, 3, 4], + closed='left')) + match = "'value.closed' is 'right', expected 'left'." with pytest.raises(ValueError, match=match): - arr.where([True, False, True], other=other) + ser.where([True, False, True], other=other) class TestSetitem(object): diff --git a/pandas/tests/arrays/test_period.py b/pandas/tests/arrays/test_period.py index f439a268d08ed..4425cc8eb1139 100644 --- a/pandas/tests/arrays/test_period.py +++ b/pandas/tests/arrays/test_period.py @@ -205,11 +205,11 @@ def test_sub_period(): period_array(['2000', '2001', '2000'], freq='H') ]) def test_where_different_freq_raises(other): - arr = period_array(['2000', '2001', '2002'], freq='D') + ser = pd.Series(period_array(['2000', '2001', '2002'], freq='D')) cond = np.array([True, False, True]) with pytest.raises(IncompatibleFrequency, match="Input has different freq=H"): - arr.where(cond, other) + ser.where(cond, other) # ---------------------------------------------------------------------------- From 9e0d87d71dd3fafdd2fb4d30c3ea4cdb52e1849a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 7 Dec 2018 07:18:58 -0600 Subject: [PATCH 11/19] update docs, cleanup --- doc/source/whatsnew/v0.24.0.rst | 1 - pandas/core/arrays/base.py | 3 ++- pandas/core/arrays/period.py | 5 ----- pandas/core/dtypes/base.py | 5 ----- pandas/core/indexes/category.py | 9 ++++++++- pandas/core/internals/blocks.py | 3 +-- 6 files changed, 11 insertions(+), 15 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 1f906247e6d59..deabc8949126c 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -994,7 +994,6 @@ update the ``ExtensionDtype._metadata`` tuple to match the signature of your - :meth:`Series.astype` and :meth:`DataFrame.astype` now dispatch to :meth:`ExtensionArray.astype` (:issue:`21185:`). - Slicing a single row of a ``DataFrame`` with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`) - Added :meth:`pandas.api.types.register_extension_dtype` to register an extension type with pandas (:issue:`22664`) -- Added :meth:`pandas.api.extensions.ExtensionArray.where` (:issue:`24077`) - Bug when concatenating multiple ``Series`` with different extension dtypes not casting to object dtype (:issue:`22994`) - Series backed by an ``ExtensionArray`` now work with :func:`util.hash_pandas_object` (:issue:`23066`) - Updated the ``.type`` attribute for ``PeriodDtype``, ``DatetimeTZDtype``, and ``IntervalDtype`` to be instances of the dtype (``Period``, ``Timestamp``, and ``Interval`` respectively) (:issue:`22938`) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 6e1e35bb4fb02..da2125749434f 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -64,7 +64,6 @@ class ExtensionArray(object): * unique * factorize / _values_for_factorize * argsort / _values_for_argsort - * where The remaining methods implemented on this class should be performant, as they only compose abstract methods. Still, a more efficient @@ -221,6 +220,8 @@ def __setitem__(self, key, value): # example, a string like '2018-01-01' is coerced to a datetime # when setting on a datetime64ns array. In general, if the # __init__ method coerces that value, then so should __setitem__ + # Note, also, that Series/DataFrame.where internally use __setitem__ + # on a copy of the data. raise NotImplementedError(_not_implemented_message.format( type(self), '__setitem__') ) diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index c1f9d3b946171..d9dde1c699761 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -241,11 +241,6 @@ def _generate_range(cls, start, end, periods, freq, fields): return subarr, freq - def _check_compatible_with(self, other): - if self.freqstr != other.freqstr: - msg = DIFFERENT_FREQ_INDEX.format(self.freqstr, other.freqstr) - raise IncompatibleFrequency(msg) - # -------------------------------------------------------------------- # Data / Attributes diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index d453a909fc42e..ab1cb9cf2499a 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -26,11 +26,6 @@ class _DtypeOpsMixin(object): na_value = np.nan _metadata = () - @property - def _ndarray_na_value(self): - """Private method internal to pandas""" - raise AbstractMethodError(self) - def __eq__(self, other): """Check whether 'other' is equal to self. diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 94f932d5e8123..bebde5f779dc7 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -501,7 +501,14 @@ def _can_reindex(self, indexer): @Appender(_index_shared_docs['where']) def where(self, cond, other=None): - cat = self.values.where(cond, other=other) + # TODO: Investigate an alternative implementation with + # 1. copy the underyling Categorical + # 2. setitem with `cond` and `other` + # 3. Rebuild CategoricalIndex. + if other is None: + other = self._na_value + values = np.where(cond, self.values, other) + cat = Categorical(values, dtype=self.dtype) return self._shallow_copy(cat, **self._get_attributes_dict()) def reindex(self, target, method=None, level=None, limit=None, diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 5f435a0455657..2fa9b47eb6277 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1970,7 +1970,6 @@ def shift(self, periods, axis=0): def where(self, other, cond, align=True, errors='raise', try_cast=False, axis=0, transpose=False): - # rough attempt to see if if isinstance(other, (ABCIndexClass, ABCSeries)): other = other.array @@ -2004,8 +2003,8 @@ def where(self, other, cond, align=True, errors='raise', else: dtype = self.dtype + # rough heuristic to see if the other array implements setitem if self._holder.__setitem__ is ExtensionArray.__setitem__: - # the array doesn't implement setitem, so convert to ndarray result = self._holder._from_sequence( np.where(cond, self.values, other), dtype=dtype, From e05a597416b2fd7396440257546a17b1f9b42b1f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 7 Dec 2018 08:25:38 -0600 Subject: [PATCH 12/19] wip --- pandas/tests/arrays/sparse/test_array.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index b8cef92f6a6d4..e36e73f682371 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -357,10 +357,10 @@ def setitem(): def setslice(): self.arr[1:5] = 2 - with pytest.raises(TypeError, match="item assignment"): + with pytest.raises(TypeError, match="does not implement"): setitem() - with pytest.raises(TypeError, match="item assignment"): + with pytest.raises(TypeError, match="does not implement"): setslice() def test_constructor_from_too_large_array(self): From 796332c91851fdc454afb83249eb35fc33233f98 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 7 Dec 2018 11:10:05 -0600 Subject: [PATCH 13/19] cleanup --- doc/source/whatsnew/v0.24.0.rst | 2 +- pandas/core/arrays/sparse.py | 7 ++++ pandas/core/internals/blocks.py | 41 ++++++------------------ pandas/tests/arrays/sparse/test_array.py | 4 +-- 4 files changed, 20 insertions(+), 34 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index deabc8949126c..71ea258c2bcc2 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1136,7 +1136,7 @@ Deprecations - :func:`pandas.types.is_datetimetz` is deprecated in favor of `pandas.types.is_datetime64tz` (:issue:`23917`) - Creating a :class:`TimedeltaIndex` or :class:`DatetimeIndex` by passing range arguments `start`, `end`, and `periods` is deprecated in favor of :func:`timedelta_range` and :func:`date_range` (:issue:`23919`) - Passing a string alias like ``'datetime64[ns, UTC]'`` as the `unit` parameter to :class:`DatetimeTZDtype` is deprecated. Use :class:`DatetimeTZDtype.construct_from_string` instead (:issue:`23990`). -- In :meth:`Series.where` with Categorical data, providing an ``other`` that is not present in the categories is deprecated. Convert the categorical to a different dtype first, or add the ``other`` to the categories first (:issue:`24077`). +- In :meth:`Series.where` with Categorical data, providing an ``other`` that is not present in the categories is deprecated. Convert the categorical to a different dtype or add the ``other`` to the categories first (:issue:`24077`). .. _whatsnew_0240.deprecations.datetimelike_int_ops: diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index 1788bdd80153b..4966638a4cf90 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -704,6 +704,13 @@ def __array__(self, dtype=None, copy=True): out[self.sp_index.to_int_index().indices] = self.sp_values return out + def __setitem__(self, key, value): + # I suppose we could allow setting of non-fill_value elements. + # TODO(SparseArray.__setitem__): remove special cases in + # ExtensionBlock.where + msg = "SparseArray does not support item assignment via setitem" + raise TypeError(msg) + @classmethod def _from_sequence(cls, scalars, dtype=None, copy=False): return cls(scalars, dtype=dtype) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 2fa9b47eb6277..897096809a7cd 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -34,7 +34,7 @@ _isna_compat, array_equivalent, is_null_datelike_scalar, isna, notna) import pandas.core.algorithms as algos -from pandas.core.arrays import Categorical, ExtensionArray +from pandas.core.arrays import Categorical, ExtensionArray, SparseArray from pandas.core.base import PandasObject import pandas.core.common as com from pandas.core.indexes.datetimes import DatetimeIndex @@ -2004,7 +2004,8 @@ def where(self, other, cond, align=True, errors='raise', dtype = self.dtype # rough heuristic to see if the other array implements setitem - if self._holder.__setitem__ is ExtensionArray.__setitem__: + if (self._holder.__setitem__ is ExtensionArray.__setitem__ + or self._holder.__setitem__ is SparseArray.__setitem__): result = self._holder._from_sequence( np.where(cond, self.values, other), dtype=dtype, @@ -2710,31 +2711,13 @@ def where(self, other, cond, align=True, errors='raise', "the categorical before calling 'where', or convert the " "categorical to a different dtype." ) - - scalar_other = lib.is_scalar(other) - categorical_other = is_categorical_dtype(other) - if isinstance(other, ABCDataFrame): - # should be 1d - assert other.shape[1] == 1 - other = other.iloc[:, 0] - - if isinstance(other, (ABCSeries, ABCIndexClass)): - other = other._values - - do_as_object = ( - # Two categoricals with different dtype (ignoring order) - (categorical_other and not is_dtype_equal(self.values, other)) or - # a not-na scalar not present in our categories - (scalar_other and (other not in self.values.categories - and notna(other))) or - # an array not present in our categories - (not scalar_other and - (self.values.categories.get_indexer( - other[notna(other)]) < 0).any()) - ) - - if do_as_object: - if scalar_other: + try: + # Attempt to do preserve categorical dtype. + result = super(CategoricalBlock, self).where( + other, cond, align, errors, try_cast, axis, transpose + ) + except (TypeError, ValueError): + if lib.is_scalar(other): msg = object_msg.format(other) else: msg = compat.reprlib.repr(other) @@ -2744,10 +2727,6 @@ def where(self, other, cond, align=True, errors='raise', errors=errors, try_cast=try_cast, axis=axis, transpose=transpose) - else: - result = super(CategoricalBlock, self).where( - other, cond, align, errors, try_cast, axis, transpose - ) return result diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index e36e73f682371..7d8cc34ae1462 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -357,10 +357,10 @@ def setitem(): def setslice(): self.arr[1:5] = 2 - with pytest.raises(TypeError, match="does not implement"): + with pytest.raises(TypeError, match="assignment via setitem"): setitem() - with pytest.raises(TypeError, match="does not implement"): + with pytest.raises(TypeError, match="assignment via setitem"): setslice() def test_constructor_from_too_large_array(self): From 6edd2866eb3c6f2f4c08ddc6258e5e188864e0c6 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 7 Dec 2018 12:20:47 -0600 Subject: [PATCH 14/19] py2 compat --- pandas/core/internals/blocks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 897096809a7cd..687e0c5f5640b 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2004,8 +2004,8 @@ def where(self, other, cond, align=True, errors='raise', dtype = self.dtype # rough heuristic to see if the other array implements setitem - if (self._holder.__setitem__ is ExtensionArray.__setitem__ - or self._holder.__setitem__ is SparseArray.__setitem__): + if (self._holder.__setitem__ == ExtensionArray.__setitem__ + or self._holder.__setitem__ == SparseArray.__setitem__): result = self._holder._from_sequence( np.where(cond, self.values, other), dtype=dtype, From 4de8bb573f99594b0b1ef688c6167ac91c23a2c8 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 7 Dec 2018 16:14:41 -0600 Subject: [PATCH 15/19] Updated * Unxfail (most) of the new combine_first tests * Removed stale comment * group conditions --- pandas/core/internals/blocks.py | 11 +++++------ pandas/tests/extension/base/methods.py | 1 - pandas/tests/extension/json/test_json.py | 4 ++++ pandas/tests/extension/test_sparse.py | 7 +++++++ 4 files changed, 16 insertions(+), 7 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index f006596480517..cab63126a6928 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1887,7 +1887,6 @@ def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None): new_values = self.values.take(indexer, fill_value=fill_value, allow_fill=True) - # if we are a 1-dim object, then always place at 0 if self.ndim == 1 and new_mgr_locs is None: new_mgr_locs = [0] else: @@ -1973,15 +1972,15 @@ def where(self, other, cond, align=True, errors='raise', if isinstance(other, (ABCIndexClass, ABCSeries)): other = other.array + elif isinstance(other, ABCDataFrame): + assert other.shape[1] == 1 + other = other.iloc[:, 0].array + if isinstance(cond, ABCDataFrame): assert cond.shape[1] == 1 cond = cond.iloc[:, 0].array - if isinstance(other, ABCDataFrame): - assert other.shape[1] == 1 - other = other.iloc[:, 0].array - - if isinstance(cond, (ABCIndexClass, ABCSeries)): + elif isinstance(cond, (ABCIndexClass, ABCSeries)): cond = cond.array if lib.is_scalar(other) and isna(other): diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index f77abfeb1275e..cb62df5b86370 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -164,7 +164,6 @@ def test_combine_add(self, data_repeated): orig_data1._from_sequence([a + val for a in list(orig_data1)])) self.assert_series_equal(result, expected) - @pytest.mark.xfail(reason="GH-24147", strict=True) def test_combine_first(self, data): # https://github.com/pandas-dev/pandas/issues/24147 a = pd.Series(data[:3]) diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 4571f3f6d4040..a35997b07fd83 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -217,6 +217,10 @@ def test_combine_le(self, data_repeated): def test_combine_add(self, data_repeated): pass + @pytest.mark.skip(reason="combine for JSONArray not supported") + def test_combine_first(self, data): + pass + @unhashable def test_hash_pandas_object_works(self, data, kind): super().test_hash_pandas_object_works(data, kind) diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 10eccea64069f..ea849a78cda12 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -279,6 +279,13 @@ def test_where_series(self, data, na_value): dtype=data.dtype)) self.assert_series_equal(result, expected) + def test_combine_first(self, data): + if data.dtype.subtype == 'int': + # Right now this is upcasted to float, just like combine_first + # for Series[int] + pytest.skip("TODO(SparseArray.__setitem__ will preserve dtype.") + super(TestMethods, self).test_combine_first(data) + class TestCasting(BaseSparseTests, base.BaseCastingTests): pass From f98a82c470ed60005e7d046c94df9c247711f22a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 9 Dec 2018 14:44:01 -0600 Subject: [PATCH 16/19] Clarify --- pandas/core/internals/blocks.py | 35 ++++++++++++++++----------------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 4a854758e6a56..7f2bdb3ade95f 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -34,7 +34,7 @@ _isna_compat, array_equivalent, is_null_datelike_scalar, isna, notna) import pandas.core.algorithms as algos -from pandas.core.arrays import Categorical, ExtensionArray, SparseArray +from pandas.core.arrays import Categorical, ExtensionArray from pandas.core.base import PandasObject import pandas.core.common as com from pandas.core.indexes.datetimes import DatetimeIndex @@ -1969,10 +1969,13 @@ def shift(self, periods, axis=0): def where(self, other, cond, align=True, errors='raise', try_cast=False, axis=0, transpose=False): + # Extract the underlying arrays. if isinstance(other, (ABCIndexClass, ABCSeries)): other = other.array elif isinstance(other, ABCDataFrame): + # ExtensionArrays are 1-D, so if we get here then + # `other` should be a DataFrame with a single column. assert other.shape[1] == 1 other = other.iloc[:, 0].array @@ -1990,32 +1993,28 @@ def where(self, other, cond, align=True, errors='raise', other = self.dtype.na_value if is_sparse(self.values): - # ugly workaround for ensure that the dtype is OK - # after we insert NaNs. - if is_sparse(other): - otype = other.dtype.subtype - else: - otype = other - dtype = self.dtype.update_dtype( - np.result_type(self.values.dtype.subtype, otype) - ) + # We need to re-infer the type of the data after doing the + # where, for cases where the subtypes don't match + dtype = None else: dtype = self.dtype - # rough heuristic to see if the other array implements setitem - if (self._holder.__setitem__ == ExtensionArray.__setitem__ - or self._holder.__setitem__ == SparseArray.__setitem__): - result = self._holder._from_sequence( - np.where(cond, self.values, other), - dtype=dtype, - ) - else: + try: result = self.values.copy() icond = ~cond if lib.is_scalar(other): result[icond] = other else: result[icond] = other[icond] + except (NotImplementedError, TypeError): + # NotImplementedError for class not implementing `__setitem__` + # TypeError for SparseArray, which implements just to raise + # a TypeError + result = self._holder._from_sequence( + np.where(cond, self.values, other), + dtype=dtype, + ) + return self.make_block_same_class(result, placement=self.mgr_locs) @property From 8d9b20bf9031e90b9fc7a637e626b3eedae7132b Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 10 Dec 2018 07:22:36 -0600 Subject: [PATCH 17/19] Simplify error message --- pandas/compat/__init__.py | 3 --- pandas/core/internals/blocks.py | 13 ++++--------- 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index f3748acfe6b1b..f9c659106a516 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -116,7 +116,6 @@ def get_range_parameters(data): reduce = functools.reduce long = int unichr = chr - import reprlib # This was introduced in Python 3.3, but we don't support # Python 3.x < 3.5, so checking PY3 is safe. @@ -272,7 +271,6 @@ class to receive bound method class_types = type, text_type = str binary_type = bytes - import reprlib def u(s): return s @@ -325,7 +323,6 @@ def set_function_name(f, name, cls): class_types = (type, types.ClassType) text_type = unicode binary_type = str - import repr as reprlib def u(s): return unicode(s, "unicode_escape") diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 7f2bdb3ade95f..c9962e444ea06 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2714,9 +2714,9 @@ def where(self, other, cond, align=True, errors='raise', # we enforce the deprecation. object_msg = ( "Implicitly converting categorical to object-dtype ndarray. " - "The values `{}' are not present in this categorical's " - "categories. A future version of pandas will raise a ValueError " - "when 'other' contains different categories.\n\n" + "One or more of the values in 'other' are not present in this " + "categorical's categories. A future version of pandas will raise " + "a ValueError when 'other' contains different categories.\n\n" "To preserve the current behavior, add the new categories to " "the categorical before calling 'where', or convert the " "categorical to a different dtype." @@ -2727,12 +2727,7 @@ def where(self, other, cond, align=True, errors='raise', other, cond, align, errors, try_cast, axis, transpose ) except (TypeError, ValueError): - if lib.is_scalar(other): - msg = object_msg.format(other) - else: - msg = compat.reprlib.repr(other) - - warnings.warn(msg, FutureWarning, stacklevel=6) + warnings.warn(object_msg, FutureWarning, stacklevel=6) result = self.astype(object).where(other, cond, align=align, errors=errors, try_cast=try_cast, From c0351fd6c55c1d4a00cb0ac1440808b011484aae Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 10 Dec 2018 07:27:40 -0600 Subject: [PATCH 18/19] sparse whatsnew --- doc/source/whatsnew/v0.24.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 785670ce7c6f0..a18c26f911f1d 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -675,6 +675,7 @@ changes were made: - ``SparseDataFrame.combine`` and ``DataFrame.combine_first`` no longer supports combining a sparse column with a dense column while preserving the sparse subtype. The result will be an object-dtype SparseArray. - Setting :attr:`SparseArray.fill_value` to a fill value with a different dtype is now allowed. - ``DataFrame[column]`` is now a :class:`Series` with sparse values, rather than a :class:`SparseSeries`, when slicing a single column with sparse values (:issue:`23559`). +- The result of :meth:`Series.where` is now a ``Series`` with sparse values, like with other extension arrays (:issue:`24077`) Some new warnings are issued for operations that require or are likely to materialize a large dense array: From 539d3cb5aec33eb5fc7534eb09b6f0715a809abc Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 10 Dec 2018 08:10:23 -0600 Subject: [PATCH 19/19] updates --- pandas/core/internals/blocks.py | 2 ++ pandas/tests/extension/test_categorical.py | 10 ++++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index c9962e444ea06..1383ce09bc2d0 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1993,6 +1993,7 @@ def where(self, other, cond, align=True, errors='raise', other = self.dtype.na_value if is_sparse(self.values): + # TODO(SparseArray.__setitem__): remove this if condition # We need to re-infer the type of the data after doing the # where, for cases where the subtypes don't match dtype = None @@ -2710,6 +2711,7 @@ def concat_same_type(self, to_concat, placement=None): def where(self, other, cond, align=True, errors='raise', try_cast=False, axis=0, transpose=False): + # TODO(CategoricalBlock.where): # This can all be deleted in favor of ExtensionBlock.where once # we enforce the deprecation. object_msg = ( diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index ce9b2f2435231..6106bc3d58620 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -27,7 +27,9 @@ def make_data(): while True: values = np.random.choice(list(string.ascii_letters), size=100) - # ensure we meet the requirement + # ensure we meet the requirements + # 1. first two not null + # 2. first and second are different if values[0] != values[1]: break return values @@ -40,7 +42,11 @@ def dtype(): @pytest.fixture def data(): - """Length-100 PeriodArray for semantics test.""" + """Length-100 array for this type. + + * data[0] and data[1] should both be non missing + * data[0] and data[1] should not gbe equal + """ return Categorical(make_data())