Skip to content

Commit abd019a

Browse files
committed
Squashed commit of the following:
commit 9e0d87d Author: Tom Augspurger <[email protected]> Date: Fri Dec 7 07:18:58 2018 -0600 update docs, cleanup commit 1271d3d Merge: 033ac9c f74fc59 Author: Tom Augspurger <[email protected]> Date: Fri Dec 7 07:12:49 2018 -0600 Merge remote-tracking branch 'upstream/master' into ea-where commit 033ac9c Author: Tom Augspurger <[email protected]> Date: Fri Dec 7 06:30:18 2018 -0600 Setitem-based where commit e9665b8 Merge: 5e14414 03134cb Author: Tom Augspurger <[email protected]> Date: Thu Dec 6 21:38:42 2018 -0600 Merge remote-tracking branch 'upstream/master' into ea-where commit 5e14414 Author: Tom Augspurger <[email protected]> Date: Thu Dec 6 09:18:54 2018 -0600 where versionadded commit d90f384 Author: Tom Augspurger <[email protected]> Date: Thu Dec 6 09:17:43 2018 -0600 deprecation note for categorical commit 4715ef6 Merge: edff47e b78aa8d Author: Tom Augspurger <[email protected]> Date: Thu Dec 6 08:15:26 2018 -0600 Merge remote-tracking branch 'upstream/master' into ea-where commit edff47e Author: Tom Augspurger <[email protected]> Date: Thu Dec 6 08:15:21 2018 -0600 32-bit compat commit badb5be Author: Tom Augspurger <[email protected]> Date: Thu Dec 6 06:21:44 2018 -0600 compat, revert commit 911a2da Author: Tom Augspurger <[email protected]> Date: Wed Dec 5 15:55:24 2018 -0600 debug 32-bit issue commit a69dbb3 Author: Tom Augspurger <[email protected]> Date: Wed Dec 5 15:49:17 2018 -0600 warn for categorical commit 6f79282 Author: Tom Augspurger <[email protected]> Date: Wed Dec 5 12:45:54 2018 -0600 32-bit compat commit 56470c3 Author: Tom Augspurger <[email protected]> Date: Wed Dec 5 11:39:48 2018 -0600 Fixups: * Ensure data generated OK. * Remove erroneous comments about alignment. That was user error. commit c4604df Author: Tom Augspurger <[email protected]> Date: Mon Dec 3 14:23:25 2018 -0600 API: Added ExtensionArray.where We need some way to do `.where` on EA object for DatetimeArray. Adding it to the interface is, I think, the easiest way. 
Initially I started to write a version on ExtensionBlock, but it proved unwieldy to write a version that performed well for all types. It *may* be possible to do using `_ndarray_values`, but we'd need a few more things around that (missing values, converting an arbitrary array to the "same" `_ndarray_values`, error handling, re-constructing). It seemed easier to push this down to the array. The implementation on ExtensionArray is readable, but likely slow since it'll involve a conversion to object-dtype. Closes pandas-dev#24077
1 parent c22a30a commit abd019a

File tree

11 files changed

+191
-39
lines changed

11 files changed

+191
-39
lines changed

doc/source/whatsnew/v0.24.0.rst

+3
Original file line numberDiff line numberDiff line change
@@ -1137,6 +1137,8 @@ Deprecations
11371137
- :func:`pandas.types.is_datetimetz` is deprecated in favor of `pandas.types.is_datetime64tz` (:issue:`23917`)
11381138
- Creating a :class:`TimedeltaIndex` or :class:`DatetimeIndex` by passing range arguments `start`, `end`, and `periods` is deprecated in favor of :func:`timedelta_range` and :func:`date_range` (:issue:`23919`)
11391139
- Passing a string alias like ``'datetime64[ns, UTC]'`` as the `unit` parameter to :class:`DatetimeTZDtype` is deprecated. Use :class:`DatetimeTZDtype.construct_from_string` instead (:issue:`23990`).
1140+
- In :meth:`Series.where` with Categorical data, providing an ``other`` that is not present in the categories is deprecated. Convert the categorical to a different dtype first, or add the ``other`` to the categories first (:issue:`24077`).
1141+
11401142

11411143
.. _whatsnew_0240.deprecations.datetimelike_int_ops:
11421144

@@ -1308,6 +1310,7 @@ Datetimelike
13081310
- Bug in :class:`DatetimeIndex` where calling ``np.array(dtindex, dtype=object)`` would incorrectly return an array of ``long`` objects (:issue:`23524`)
13091311
- Bug in :class:`Index` where passing a timezone-aware :class:`DatetimeIndex` and `dtype=object` would incorrectly raise a ``ValueError`` (:issue:`23524`)
13101312
- Bug in :class:`Index` where calling ``np.array(dtindex, dtype=object)`` on a timezone-naive :class:`DatetimeIndex` would return an array of ``datetime`` objects instead of :class:`Timestamp` objects, potentially losing nanosecond portions of the timestamps (:issue:`23524`)
1313+
- Bug in :class:`Categorical.__setitem__` not allowing setting with another ``Categorical`` when both are unordered and have the same categories, but in a different order (:issue:`24142`)
13111314

13121315
Timedelta
13131316
^^^^^^^^^

pandas/compat/__init__.py

+3
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,7 @@ def get_range_parameters(data):
116116
reduce = functools.reduce
117117
long = int
118118
unichr = chr
119+
import reprlib
119120

120121
# This was introduced in Python 3.3, but we don't support
121122
# Python 3.x < 3.5, so checking PY3 is safe.
@@ -271,6 +272,7 @@ class to receive bound method
271272
class_types = type,
272273
text_type = str
273274
binary_type = bytes
275+
import reprlib
274276

275277
def u(s):
276278
return s
@@ -323,6 +325,7 @@ def set_function_name(f, name, cls):
323325
class_types = (type, types.ClassType)
324326
text_type = unicode
325327
binary_type = str
328+
import repr as reprlib
326329

327330
def u(s):
328331
return unicode(s, "unicode_escape")

pandas/core/arrays/base.py

+2
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,8 @@ def __setitem__(self, key, value):
221221
# example, a string like '2018-01-01' is coerced to a datetime
222222
# when setting on a datetime64ns array. In general, if the
223223
# __init__ method coerces that value, then so should __setitem__
224+
# Note, also, that Series/DataFrame.where internally use __setitem__
225+
# on a copy of the data.
224226
raise NotImplementedError(_not_implemented_message.format(
225227
type(self), '__setitem__')
226228
)

pandas/core/arrays/categorical.py

+11-1
Original file line numberDiff line numberDiff line change
@@ -2121,11 +2121,21 @@ def __setitem__(self, key, value):
21212121
`Categorical` does not have the same categories
21222122
"""
21232123

2124+
if isinstance(value, (ABCIndexClass, ABCSeries)):
2125+
value = value.array
2126+
21242127
# require identical categories set
21252128
if isinstance(value, Categorical):
2126-
if not value.categories.equals(self.categories):
2129+
if not is_dtype_equal(self, value):
21272130
raise ValueError("Cannot set a Categorical with another, "
21282131
"without identical categories")
2132+
if not self.categories.equals(value.categories):
2133+
new_codes = _recode_for_categories(
2134+
value.codes, value.categories, self.categories
2135+
)
2136+
value = Categorical.from_codes(new_codes,
2137+
categories=self.categories,
2138+
ordered=self.ordered)
21292139

21302140
rvalue = value if is_list_like(value) else [value]
21312141

pandas/core/arrays/sparse.py

-5
Original file line numberDiff line numberDiff line change
@@ -704,11 +704,6 @@ def __array__(self, dtype=None, copy=True):
704704
out[self.sp_index.to_int_index().indices] = self.sp_values
705705
return out
706706

707-
def __setitem__(self, key, value):
708-
# I suppose we could allow setting of non-fill_value elements.
709-
msg = "SparseArray does not support item assignment via setitem"
710-
raise TypeError(msg)
711-
712707
@classmethod
713708
def _from_sequence(cls, scalars, dtype=None, copy=False):
714709
return cls(scalars, dtype=dtype)

pandas/core/indexes/category.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -501,7 +501,14 @@ def _can_reindex(self, indexer):
501501

502502
@Appender(_index_shared_docs['where'])
503503
def where(self, cond, other=None):
504-
cat = self.values.where(cond, other=other)
504+
# TODO: Investigate an alternative implementation with
505+
# 1. copy the underlying Categorical
506+
# 2. setitem with `cond` and `other`
507+
# 3. Rebuild CategoricalIndex.
508+
if other is None:
509+
other = self._na_value
510+
values = np.where(cond, self.values, other)
511+
cat = Categorical(values, dtype=self.dtype)
505512
return self._shallow_copy(cat, **self._get_attributes_dict())
506513

507514
def reindex(self, target, method=None, level=None, limit=None,

pandas/core/internals/blocks.py

+75-7
Original file line numberDiff line numberDiff line change
@@ -1991,7 +1991,33 @@ def where(self, other, cond, align=True, errors='raise',
19911991
# we want to replace that with the correct NA value
19921992
# for the type
19931993
other = self.dtype.na_value
1994-
result = self.values.where(cond, other)
1994+
1995+
if is_sparse(self.values):
1996+
# ugly workaround to ensure that the dtype is OK
1997+
# after we insert NaNs.
1998+
if is_sparse(other):
1999+
otype = other.dtype.subtype
2000+
else:
2001+
otype = other
2002+
dtype = self.dtype.update_dtype(
2003+
np.result_type(self.values.dtype.subtype, otype)
2004+
)
2005+
else:
2006+
dtype = self.dtype
2007+
2008+
# rough heuristic to see if the other array implements setitem
2009+
if self._holder.__setitem__ is ExtensionArray.__setitem__:
2010+
result = self._holder._from_sequence(
2011+
np.where(cond, self.values, other),
2012+
dtype=dtype,
2013+
)
2014+
else:
2015+
result = self.values.copy()
2016+
icond = ~cond
2017+
if lib.is_scalar(other):
2018+
result[icond] = other
2019+
else:
2020+
result[icond] = other[icond]
19952021
return self.make_block_same_class(result, placement=self.mgr_locs)
19962022

19972023
@property
@@ -2701,13 +2727,55 @@ def concat_same_type(self, to_concat, placement=None):
27012727

27022728
def where(self, other, cond, align=True, errors='raise',
27032729
try_cast=False, axis=0, transpose=False):
2704-
result = super(CategoricalBlock, self).where(
2705-
other, cond, align, errors, try_cast, axis, transpose
2730+
# This can all be deleted in favor of ExtensionBlock.where once
2731+
# we enforce the deprecation.
2732+
object_msg = (
2733+
"Implicitly converting categorical to object-dtype ndarray. "
2734+
"The values `{}' are not present in this categorical's "
2735+
"categories. A future version of pandas will raise a ValueError "
2736+
"when 'other' contains different categories.\n\n"
2737+
"To preserve the current behavior, add the new categories to "
2738+
"the categorical before calling 'where', or convert the "
2739+
"categorical to a different dtype."
27062740
)
2707-
if result.values.dtype != self.values.dtype:
2708-
# For backwards compatability, we allow upcasting to object.
2709-
# This fallback will be removed in the future.
2710-
result = result.astype(object)
2741+
2742+
scalar_other = lib.is_scalar(other)
2743+
categorical_other = is_categorical_dtype(other)
2744+
if isinstance(other, ABCDataFrame):
2745+
# should be 1d
2746+
assert other.shape[1] == 1
2747+
other = other.iloc[:, 0]
2748+
2749+
if isinstance(other, (ABCSeries, ABCIndexClass)):
2750+
other = other._values
2751+
2752+
do_as_object = (
2753+
# Two categoricals with different dtype (ignoring order)
2754+
(categorical_other and not is_dtype_equal(self.values, other)) or
2755+
# a not-na scalar not present in our categories
2756+
(scalar_other and (other not in self.values.categories
2757+
and notna(other))) or
2758+
# an array not present in our categories
2759+
(not scalar_other and
2760+
(self.values.categories.get_indexer(
2761+
other[notna(other)]) < 0).any())
2762+
)
2763+
2764+
if do_as_object:
2765+
if scalar_other:
2766+
msg = object_msg.format(other)
2767+
else:
2768+
msg = compat.reprlib.repr(other)
2769+
2770+
warnings.warn(msg, FutureWarning, stacklevel=6)
2771+
result = self.astype(object).where(other, cond, align=align,
2772+
errors=errors,
2773+
try_cast=try_cast,
2774+
axis=axis, transpose=transpose)
2775+
else:
2776+
result = super(CategoricalBlock, self).where(
2777+
other, cond, align, errors, try_cast, axis, transpose
2778+
)
27112779
return result
27122780

27132781

pandas/tests/arrays/categorical/test_indexing.py

+79-17
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import numpy as np
44
import pytest
55

6+
import pandas as pd
67
from pandas import Categorical, CategoricalIndex, Index, PeriodIndex, Series
78
import pandas.core.common as com
89
from pandas.tests.arrays.categorical.common import TestCategorical
@@ -43,6 +44,45 @@ def test_setitem(self):
4344

4445
tm.assert_categorical_equal(c, expected)
4546

47+
@pytest.mark.parametrize('other', [
48+
pd.Categorical(['b', 'a']),
49+
pd.Categorical(['b', 'a'], categories=['b', 'a']),
50+
])
51+
def test_setitem_same_but_unordered(self, other):
52+
# GH-24142
53+
target = pd.Categorical(['a', 'b'], categories=['a', 'b'])
54+
mask = np.array([True, False])
55+
target[mask] = other[mask]
56+
expected = pd.Categorical(['b', 'b'], categories=['a', 'b'])
57+
tm.assert_categorical_equal(target, expected)
58+
59+
@pytest.mark.parametrize('other', [
60+
pd.Categorical(['b', 'a'], categories=['b', 'a', 'c']),
61+
pd.Categorical(['b', 'a'], categories=['a', 'b', 'c']),
62+
pd.Categorical(['a', 'a'], categories=['a']),
63+
pd.Categorical(['b', 'b'], categories=['b']),
64+
])
65+
def test_setitem_different_unordered_raises(self, other):
66+
# GH-24142
67+
target = pd.Categorical(['a', 'b'], categories=['a', 'b'])
68+
mask = np.array([True, False])
69+
with pytest.raises(ValueError):
70+
target[mask] = other[mask]
71+
72+
@pytest.mark.parametrize('other', [
73+
pd.Categorical(['b', 'a']),
74+
pd.Categorical(['b', 'a'], categories=['b', 'a'], ordered=True),
75+
pd.Categorical(['b', 'a'], categories=['a', 'b', 'c'], ordered=True),
76+
])
77+
def test_setitem_same_ordered_rasies(self, other):
78+
# GH-24142
79+
target = pd.Categorical(['a', 'b'], categories=['a', 'b'],
80+
ordered=True)
81+
mask = np.array([True, False])
82+
83+
with pytest.raises(ValueError):
84+
target[mask] = other[mask]
85+
4686

4787
class TestCategoricalIndexing(object):
4888

@@ -122,37 +162,59 @@ def test_get_indexer_non_unique(self, idx_values, key_values, key_class):
122162
tm.assert_numpy_array_equal(expected, result)
123163
tm.assert_numpy_array_equal(exp_miss, res_miss)
124164

165+
def test_where_unobserved_nan(self):
166+
ser = pd.Series(pd.Categorical(['a', 'b']))
167+
result = ser.where([True, False])
168+
expected = pd.Series(pd.Categorical(['a', None],
169+
categories=['a', 'b']))
170+
tm.assert_series_equal(result, expected)
171+
172+
# all NA
173+
ser = pd.Series(pd.Categorical(['a', 'b']))
174+
result = ser.where([False, False])
175+
expected = pd.Series(pd.Categorical([None, None],
176+
categories=['a', 'b']))
177+
tm.assert_series_equal(result, expected)
178+
125179
def test_where_unobserved_categories(self):
126-
arr = Categorical(['a', 'b', 'c'], categories=['d', 'c', 'b', 'a'])
127-
result = arr.where([True, True, False], other='b')
128-
expected = Categorical(['a', 'b', 'b'], categories=arr.categories)
129-
tm.assert_categorical_equal(result, expected)
180+
ser = pd.Series(
181+
Categorical(['a', 'b', 'c'], categories=['d', 'c', 'b', 'a'])
182+
)
183+
result = ser.where([True, True, False], other='b')
184+
expected = pd.Series(
185+
Categorical(['a', 'b', 'b'], categories=ser.cat.categories)
186+
)
187+
tm.assert_series_equal(result, expected)
130188

131189
def test_where_other_categorical(self):
132-
arr = Categorical(['a', 'b', 'c'], categories=['d', 'c', 'b', 'a'])
190+
ser = pd.Series(
191+
Categorical(['a', 'b', 'c'], categories=['d', 'c', 'b', 'a'])
192+
)
133193
other = Categorical(['b', 'c', 'a'], categories=['a', 'c', 'b', 'd'])
134-
result = arr.where([True, False, True], other)
135-
expected = Categorical(['a', 'c', 'c'], dtype=arr.dtype)
136-
tm.assert_categorical_equal(result, expected)
194+
result = ser.where([True, False, True], other)
195+
expected = pd.Series(Categorical(['a', 'c', 'c'], dtype=ser.dtype))
196+
tm.assert_series_equal(result, expected)
137197

138198
def test_where_warns(self):
139-
arr = Categorical(['a', 'b', 'c'])
199+
ser = pd.Series(Categorical(['a', 'b', 'c']))
140200
with tm.assert_produces_warning(FutureWarning):
141-
result = arr.where([True, False, True], 'd')
201+
result = ser.where([True, False, True], 'd')
142202

143-
expected = np.array(['a', 'd', 'c'], dtype='object')
144-
tm.assert_numpy_array_equal(result, expected)
203+
expected = pd.Series(np.array(['a', 'd', 'c'], dtype='object'))
204+
tm.assert_series_equal(result, expected)
145205

146206
def test_where_ordered_differs_rasies(self):
147-
arr = Categorical(['a', 'b', 'c'], categories=['d', 'c', 'b', 'a'],
148-
ordered=True)
207+
ser = pd.Series(
208+
Categorical(['a', 'b', 'c'], categories=['d', 'c', 'b', 'a'],
209+
ordered=True)
210+
)
149211
other = Categorical(['b', 'c', 'a'], categories=['a', 'c', 'b', 'd'],
150212
ordered=True)
151213
with tm.assert_produces_warning(FutureWarning):
152-
result = arr.where([True, False, True], other)
214+
result = ser.where([True, False, True], other)
153215

154-
expected = np.array(['a', 'c', 'c'], dtype=object)
155-
tm.assert_numpy_array_equal(result, expected)
216+
expected = pd.Series(np.array(['a', 'c', 'c'], dtype=object))
217+
tm.assert_series_equal(result, expected)
156218

157219

158220
@pytest.mark.parametrize("index", [True, False])

pandas/tests/arrays/interval/test_interval.py

+5-3
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import numpy as np
33
import pytest
44

5+
import pandas as pd
56
from pandas import Index, Interval, IntervalIndex, date_range, timedelta_range
67
from pandas.core.arrays import IntervalArray
78
import pandas.util.testing as tm
@@ -55,10 +56,11 @@ def test_set_closed(self, closed, new_closed):
5556
IntervalArray.from_breaks([1, 2, 3, 4], closed='right'),
5657
])
5758
def test_where_raises(self, other):
58-
arr = IntervalArray.from_breaks([1, 2, 3, 4], closed='left')
59-
match = "'other.closed' is 'right', expected 'left'."
59+
ser = pd.Series(IntervalArray.from_breaks([1, 2, 3, 4],
60+
closed='left'))
61+
match = "'value.closed' is 'right', expected 'left'."
6062
with pytest.raises(ValueError, match=match):
61-
arr.where([True, False, True], other=other)
63+
ser.where([True, False, True], other=other)
6264

6365

6466
class TestSetitem(object):

pandas/tests/arrays/test_period.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -207,11 +207,11 @@ def test_sub_period():
207207
period_array(['2000', '2001', '2000'], freq='H')
208208
])
209209
def test_where_different_freq_raises(other):
210-
arr = period_array(['2000', '2001', '2002'], freq='D')
210+
ser = pd.Series(period_array(['2000', '2001', '2002'], freq='D'))
211211
cond = np.array([True, False, True])
212212
with pytest.raises(IncompatibleFrequency,
213213
match="Input has different freq=H"):
214-
arr.where(cond, other)
214+
ser.where(cond, other)
215215

216216

217217
# ----------------------------------------------------------------------------

pandas/tests/extension/test_sparse.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ def make_data(fill_value):
1212
if np.isnan(fill_value):
1313
data = np.random.uniform(size=100).astype('float64')
1414
else:
15-
data = np.random.randint(1, 100, size=100, dtype='int64')
15+
data = np.random.randint(1, 100, size=100)
1616
if data[0] == data[1]:
1717
data[0] += 1
1818

@@ -266,13 +266,13 @@ def test_where_series(self, data, na_value):
266266

267267
cond = np.array([True, True, False, False])
268268
result = ser.where(cond)
269-
# new_dtype is the only difference
269+
270270
new_dtype = SparseDtype('float', 0.0)
271271
expected = pd.Series(cls._from_sequence([a, a, na_value, na_value],
272272
dtype=new_dtype))
273273
self.assert_series_equal(result, expected)
274274

275-
other = cls._from_sequence([a, b, a, b])
275+
other = cls._from_sequence([a, b, a, b], dtype=data.dtype)
276276
cond = np.array([True, False, True, True])
277277
result = ser.where(cond, other)
278278
expected = pd.Series(cls._from_sequence([a, b, b, b],

0 commit comments

Comments
 (0)