Skip to content

Commit 0646ad5

Browse files
committed
Merge pull request #7892 from immerrr/add-level-kwarg-for-index-isin
API: add 'level' kwarg to 'Index.isin' method
2 parents 50f0959 + 7708590 commit 0646ad5

File tree

4 files changed

+185
-33
lines changed

4 files changed

+185
-33
lines changed

doc/source/indexing.rst

+21-10
Original file line numberDiff line numberDiff line change
@@ -582,7 +582,7 @@ and :ref:`Advanced Indexing <indexing.advanced>` you may select along more than
582582
.. _indexing.basics.indexing_isin:
583583

584584
Indexing with isin
585-
~~~~~~~~~~~~~~~~~~
585+
------------------
586586

587587
Consider the ``isin`` method of Series, which returns a boolean vector that is
588588
true wherever the Series elements exist in the passed list. This allows you to
@@ -591,13 +591,30 @@ select rows where one or more columns have values you want:
591591
.. ipython:: python
592592
593593
s = Series(np.arange(5),index=np.arange(5)[::-1],dtype='int64')
594-
595594
s
595+
s.isin([2, 4, 6])
596+
s[s.isin([2, 4, 6])]
597+
598+
The same method is available for ``Index`` objects and is useful when you
599+
don't know which of the sought labels are actually present:
596600

597-
s.isin([2, 4])
601+
.. ipython:: python
602+
603+
s[s.index.isin([2, 4, 6])]
598604
599-
s[s.isin([2, 4])]
605+
# compare it to the following
606+
s[[2, 4, 6]]
600607
608+
In addition, ``MultiIndex`` allows selecting a single level to use in the
609+
membership check:
610+
611+
.. ipython:: python
612+
613+
s_mi = Series(np.arange(6),
614+
index=pd.MultiIndex.from_product([[0, 1], ['a', 'b', 'c']]))
615+
s_mi
616+
s_mi.iloc[s_mi.index.isin([(1, 'a'), (2, 'b'), (0, 'c')])]
617+
s_mi.iloc[s_mi.index.isin(['a', 'c', 'e'], level=1)]
601618
602619
DataFrame also has an ``isin`` method. When calling ``isin``, pass a set of
603620
values as either an array or dict. If values is an array, ``isin`` returns
@@ -1622,12 +1639,6 @@ with duplicates dropped.
16221639
idx1.sym_diff(idx2)
16231640
idx1 ^ idx2
16241641
1625-
The ``isin`` method of Index objects
1626-
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1627-
1628-
One additional operation is the ``isin`` method that works analogously to the
1629-
``Series.isin`` method found :ref:`here <indexing.boolean>`.
1630-
16311642
.. _indexing.hierarchical:
16321643

16331644
Hierarchical indexing (MultiIndex)

doc/source/v0.15.0.txt

+13
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,19 @@ API changes
129129
strings must contain 244 or fewer characters. Attempting to write Stata
130130
dta files with strings longer than 244 characters raises a ``ValueError``. (:issue:`7858`)
131131

132+
- ``Index.isin`` now supports a ``level`` argument to specify which index level
133+
to use for membership tests (:issue:`7892`, :issue:`7890`)
134+
135+
.. code-block:: python
136+
137+
In [1]: idx = pd.MultiIndex.from_product([[0, 1], ['a', 'b', 'c']])
138+
139+
In [2]: idx.values
140+
Out[2]: array([(0, 'a'), (0, 'b'), (0, 'c'), (1, 'a'), (1, 'b'), (1, 'c')], dtype=object)
141+
142+
In [3]: idx.isin(['a', 'c', 'e'], level=1)
143+
Out[3]: array([ True, False, True, True, False, True], dtype=bool)
144+
132145

133146
.. _whatsnew_0150.cat:
134147

pandas/core/index.py

+58-22
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
import pandas.index as _index
1313
from pandas.lib import Timestamp, is_datetime_array
1414
from pandas.core.base import FrozenList, FrozenNDArray, IndexOpsMixin
15-
from pandas.util.decorators import cache_readonly, deprecate
15+
from pandas.util.decorators import cache_readonly, deprecate, Appender
1616
from pandas.core.common import isnull, array_equivalent
1717
import pandas.core.common as com
1818
from pandas.core.common import (_values_from_object, is_float, is_integer,
@@ -687,13 +687,29 @@ def _engine(self):
687687
# property, for now, slow to look up
688688
return self._engine_type(lambda: self.values, len(self))
689689

690+
def _validate_index_level(self, level):
691+
"""
692+
Validate index level.
693+
694+
For a single-level Index, getting the level number is a no-op, but the
695+
same validation must still be performed as in MultiIndex.
696+
697+
"""
698+
if isinstance(level, int):
699+
if level < 0 and level != -1:
700+
raise IndexError("Too many levels: Index has only 1 level,"
701+
" %d is not a valid level number" % (level,))
702+
elif level > 0:
703+
raise IndexError("Too many levels:"
704+
" Index has only 1 level, not %d" %
705+
(level + 1))
706+
elif level != self.name:
707+
raise KeyError('Level %s must be same as name (%s)'
708+
% (level, self.name))
709+
690710
def _get_level_number(self, level):
691-
if not isinstance(level, int):
692-
if level != self.name:
693-
raise AssertionError('Level %s must be same as name (%s)'
694-
% (level, self.name))
695-
level = 0
696-
return level
711+
self._validate_index_level(level)
712+
return 0
697713

698714
@cache_readonly
699715
def inferred_type(self):
@@ -1271,7 +1287,7 @@ def get_level_values(self, level):
12711287
values : ndarray
12721288
"""
12731289
# checks that level refers to the only level of this single-level Index
1274-
self._get_level_number(level)
1290+
self._validate_index_level(level)
12751291
return self
12761292

12771293
def get_indexer(self, target, method=None, limit=None):
@@ -1370,20 +1386,34 @@ def groupby(self, to_groupby):
13701386
def map(self, mapper):
13711387
return self._arrmap(self.values, mapper)
13721388

1373-
def isin(self, values):
1389+
def isin(self, values, level=None):
13741390
"""
13751391
Compute boolean array of whether each index value is found in the
13761392
passed set of values
13771393
13781394
Parameters
13791395
----------
13801396
values : set or sequence of values
1397+
Sought values.
1398+
level : str or int, optional
1399+
Name or position of the index level to use (if the index is a
1400+
MultiIndex).
1401+
1402+
Notes
1403+
-----
1404+
If `level` is specified:
1405+
1406+
- if it is the name of one *and only one* index level, use that level;
1407+
- otherwise it should be a number indicating level position.
13811408
13821409
Returns
13831410
-------
13841411
is_contained : ndarray (boolean dtype)
1412+
13851413
"""
13861414
value_set = set(values)
1415+
if level is not None:
1416+
self._validate_index_level(level)
13871417
return lib.ismember(self._array_values(), value_set)
13881418

13891419
def _array_values(self):
@@ -2149,20 +2179,11 @@ def hasnans(self):
21492179
def is_unique(self):
21502180
return super(Float64Index, self).is_unique and self._nan_idxs.size < 2
21512181

2152-
def isin(self, values):
2153-
"""
2154-
Compute boolean array of whether each index value is found in the
2155-
passed set of values
2156-
2157-
Parameters
2158-
----------
2159-
values : set or sequence of values
2160-
2161-
Returns
2162-
-------
2163-
is_contained : ndarray (boolean dtype)
2164-
"""
2182+
@Appender(Index.isin.__doc__)
2183+
def isin(self, values, level=None):
21652184
value_set = set(values)
2185+
if level is not None:
2186+
self._validate_index_level(level)
21662187
return lib.ismember_nans(self._array_values(), value_set,
21672188
isnull(list(value_set)).any())
21682189

@@ -4052,6 +4073,21 @@ def _wrap_joined_index(self, joined, other):
40524073
names = self.names if self.names == other.names else None
40534074
return MultiIndex.from_tuples(joined, names=names)
40544075

4076+
@Appender(Index.isin.__doc__)
4077+
def isin(self, values, level=None):
4078+
if level is None:
4079+
return lib.ismember(self._array_values(), set(values))
4080+
else:
4081+
num = self._get_level_number(level)
4082+
levs = self.levels[num]
4083+
labs = self.labels[num]
4084+
4085+
sought_labels = levs.isin(values).nonzero()[0]
4086+
if levs.size == 0:
4087+
return np.zeros(len(labs), dtype=np.bool_)
4088+
else:
4089+
return np.lib.arraysetops.in1d(labs, sought_labels)
4090+
40554091

40564092
# For utility purposes
40574093

pandas/tests/test_index.py

+93-1
Original file line numberDiff line numberDiff line change
@@ -840,7 +840,7 @@ def test_get_set_value(self):
840840
self.assertEqual(values[67], 10)
841841

842842
def test_isin(self):
843-
values = ['foo', 'bar']
843+
values = ['foo', 'bar', 'quux']
844844

845845
idx = Index(['qux', 'baz', 'foo', 'bar'])
846846
result = idx.isin(values)
@@ -853,6 +853,49 @@ def test_isin(self):
853853
self.assertEqual(len(result), 0)
854854
self.assertEqual(result.dtype, np.bool_)
855855

856+
def test_isin_nan(self):
857+
self.assert_numpy_array_equal(
858+
Index(['a', np.nan]).isin([np.nan]), [False, True])
859+
self.assert_numpy_array_equal(
860+
Index(['a', pd.NaT]).isin([pd.NaT]), [False, True])
861+
self.assert_numpy_array_equal(
862+
Index(['a', np.nan]).isin([float('nan')]), [False, False])
863+
self.assert_numpy_array_equal(
864+
Index(['a', np.nan]).isin([pd.NaT]), [False, False])
865+
# Float64Index overrides isin, so must be checked separately
866+
self.assert_numpy_array_equal(
867+
Float64Index([1.0, np.nan]).isin([np.nan]), [False, True])
868+
self.assert_numpy_array_equal(
869+
Float64Index([1.0, np.nan]).isin([float('nan')]), [False, True])
870+
self.assert_numpy_array_equal(
871+
Float64Index([1.0, np.nan]).isin([pd.NaT]), [False, True])
872+
873+
def test_isin_level_kwarg(self):
874+
def check_idx(idx):
875+
values = idx.tolist()[-2:] + ['nonexisting']
876+
877+
expected = np.array([False, False, True, True])
878+
self.assert_numpy_array_equal(expected, idx.isin(values, level=0))
879+
self.assert_numpy_array_equal(expected, idx.isin(values, level=-1))
880+
881+
self.assertRaises(IndexError, idx.isin, values, level=1)
882+
self.assertRaises(IndexError, idx.isin, values, level=10)
883+
self.assertRaises(IndexError, idx.isin, values, level=-2)
884+
885+
self.assertRaises(KeyError, idx.isin, values, level=1.0)
886+
self.assertRaises(KeyError, idx.isin, values, level='foobar')
887+
888+
idx.name = 'foobar'
889+
self.assert_numpy_array_equal(expected,
890+
idx.isin(values, level='foobar'))
891+
892+
self.assertRaises(KeyError, idx.isin, values, level='xyzzy')
893+
self.assertRaises(KeyError, idx.isin, values, level=np.nan)
894+
895+
check_idx(Index(['qux', 'baz', 'foo', 'bar']))
896+
# Float64Index overrides isin, so must be checked separately
897+
check_idx(Float64Index([1.0, 2.0, 3.0, 4.0]))
898+
856899
def test_boolean_cmp(self):
857900
values = [1, 2, 3, 4]
858901

@@ -2948,6 +2991,55 @@ def test_level_setting_resets_attributes(self):
29482991
# if this fails, probably didn't reset the cache correctly.
29492992
assert not ind.is_monotonic
29502993

2994+
def test_isin(self):
2995+
values = [('foo', 2), ('bar', 3), ('quux', 4)]
2996+
2997+
idx = MultiIndex.from_arrays([['qux', 'baz', 'foo', 'bar'],
2998+
np.arange(4)])
2999+
result = idx.isin(values)
3000+
expected = np.array([False, False, True, True])
3001+
self.assert_numpy_array_equal(result, expected)
3002+
3003+
# empty, return dtype bool
3004+
idx = MultiIndex.from_arrays([[], []])
3005+
result = idx.isin(values)
3006+
self.assertEqual(len(result), 0)
3007+
self.assertEqual(result.dtype, np.bool_)
3008+
3009+
def test_isin_nan(self):
3010+
idx = MultiIndex.from_arrays([['foo', 'bar'], [1.0, np.nan]])
3011+
self.assert_numpy_array_equal(idx.isin([('bar', np.nan)]),
3012+
[False, False])
3013+
self.assert_numpy_array_equal(idx.isin([('bar', float('nan'))]),
3014+
[False, False])
3015+
3016+
def test_isin_level_kwarg(self):
3017+
idx = MultiIndex.from_arrays([['qux', 'baz', 'foo', 'bar'],
3018+
np.arange(4)])
3019+
3020+
vals_0 = ['foo', 'bar', 'quux']
3021+
vals_1 = [2, 3, 10]
3022+
3023+
expected = np.array([False, False, True, True])
3024+
self.assert_numpy_array_equal(expected, idx.isin(vals_0, level=0))
3025+
self.assert_numpy_array_equal(expected, idx.isin(vals_0, level=-2))
3026+
3027+
self.assert_numpy_array_equal(expected, idx.isin(vals_1, level=1))
3028+
self.assert_numpy_array_equal(expected, idx.isin(vals_1, level=-1))
3029+
3030+
self.assertRaises(IndexError, idx.isin, vals_0, level=5)
3031+
self.assertRaises(IndexError, idx.isin, vals_0, level=-5)
3032+
3033+
self.assertRaises(KeyError, idx.isin, vals_0, level=1.0)
3034+
self.assertRaises(KeyError, idx.isin, vals_1, level=-1.0)
3035+
self.assertRaises(KeyError, idx.isin, vals_1, level='A')
3036+
3037+
idx.names = ['A', 'B']
3038+
self.assert_numpy_array_equal(expected, idx.isin(vals_0, level='A'))
3039+
self.assert_numpy_array_equal(expected, idx.isin(vals_1, level='B'))
3040+
3041+
self.assertRaises(KeyError, idx.isin, vals_1, level='C')
3042+
29513043

29523044
def test_get_combined_index():
29533045
from pandas.core.index import _get_combined_index

0 commit comments

Comments
 (0)