Skip to content

Commit b538892

Browse files
committed
Merge pull request #5199 from TomAugspurger/isin_dfs
ENH/API: Accept DataFrame for isin
2 parents 97f5878 + 38b8fca commit b538892

File tree

3 files changed

+132
-31
lines changed

3 files changed

+132
-31
lines changed

doc/source/v0.13.0.txt

+3-2
Original file line numberDiff line numberDiff line change
@@ -476,15 +476,16 @@ Enhancements
476476
t = Timestamp('20130101 09:01:02')
477477
t + pd.datetools.Nano(123)
478478

479-
- A new method, ``isin`` for DataFrames, plays nicely with boolean indexing. See :ref:`the docs<indexing.basics.indexing_isin>` for more.
479+
- DataFrames have a new method, ``isin``, which plays nicely with boolean indexing. The argument to ``isin`` (the object the DataFrame is compared against) can be a DataFrame, Series, dict, or array of values. See :ref:`the docs<indexing.basics.indexing_isin>` for more.
480480

481481
To get the rows where any of the conditions are met:
482482

483483
.. ipython:: python
484484

485485
dfi = DataFrame({'A': [1, 2, 3, 4], 'B': ['a', 'b', 'f', 'n']})
486486
dfi
487-
mask = dfi.isin({'A': [1, 2], 'B': ['e', 'f']})
487+
other = DataFrame({'A': [1, 3, 3, 7], 'B': ['e', 'f', 'f', 'e']})
488+
mask = dfi.isin(other)
488489
mask
489490
dfi[mask.any(1)]
490491

pandas/core/frame.py

+51-15
Original file line numberDiff line numberDiff line change
@@ -4239,35 +4239,71 @@ def to_period(self, freq=None, axis=0, copy=True):
42394239

42404240
return self._constructor(new_data)
42414241

4242-
4243-
def isin(self, values, iloc=False):
4242+
def isin(self, values):
42444243
"""
4245-
Return boolean DataFrame showing whether each element in the DataFrame is
4246-
contained in values.
4244+
Return boolean DataFrame showing whether each element in the
4245+
DataFrame is contained in values.
42474246
42484247
Parameters
42494248
----------
4250-
values : iterable or dictionary of columns to values
4251-
iloc : boolean, if passing a dict as values, describe columns using integer
4252-
locations (default is to use labels)
4249+
values : iterable, Series, DataFrame or dictionary
4250+
The result will only be true at a location if all the
4251+
labels match. If `values` is a Series, that's the index. If
4252+
`values` is a dictionary, the keys must be the column names,
4253+
which must match. If `values` is a DataFrame,
4254+
then both the index and column labels must match.
42534255
42544256
Returns
42554257
-------
42564258
42574259
DataFrame of booleans
4260+
4261+
Examples
4262+
--------
4263+
When ``values`` is a list:
4264+
4265+
>>> df = DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'f']})
4266+
>>> df.isin([1, 3, 12, 'a'])
4267+
A B
4268+
0 True True
4269+
1 False False
4270+
2 True False
4271+
4272+
When ``values`` is a dict:
4273+
4274+
>>> df = DataFrame({'A': [1, 2, 3], 'B': [1, 4, 7]})
4275+
>>> df.isin({'A': [1, 3], 'B': [4, 7, 12]})
4276+
A B
4277+
0 True False # Note that B didn't match the 1 here.
4278+
1 False True
4279+
2 True True
4280+
4281+
When ``values`` is a Series or DataFrame:
4282+
4283+
>>> df = DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'f']})
4284+
>>> other = DataFrame({'A': [1, 3, 3, 2], 'B': ['e', 'f', 'f', 'e']})
4285+
>>> df.isin(other)
4286+
A B
4287+
0 True False
4288+
1 False False # Column A in `other` has a 3, but not at index 1.
4289+
2 True True
42584290
"""
42594291
if isinstance(values, dict):
42604292
from collections import defaultdict
42614293
from pandas.tools.merge import concat
42624294
values = defaultdict(list, values)
4263-
if iloc:
4264-
return concat((self.iloc[:, [i]].isin(values[i])
4265-
for i, col in enumerate(self.columns)), axis=1)
4266-
else:
4267-
return concat((self.iloc[:, [i]].isin(values[col])
4268-
for i, col in enumerate(self.columns)), axis=1)
4269-
4270-
4295+
return concat((self.iloc[:, [i]].isin(values[col])
4296+
for i, col in enumerate(self.columns)), axis=1)
4297+
elif isinstance(values, Series):
4298+
if not values.index.is_unique:
4299+
raise ValueError("ValueError: cannot compute isin with"
4300+
" a duplicate axis.")
4301+
return self.eq(values.reindex_like(self), axis='index')
4302+
elif isinstance(values, DataFrame):
4303+
if not (values.columns.is_unique and values.index.is_unique):
4304+
raise ValueError("ValueError: cannot compute isin with"
4305+
" a duplicate axis.")
4306+
return self.eq(values.reindex_like(self))
42714307
else:
42724308
if not is_list_like(values):
42734309
raise TypeError("only list-like or dict-like objects are"

pandas/tests/test_frame.py

+78-14
Original file line numberDiff line numberDiff line change
@@ -11431,20 +11431,6 @@ def test_isin_dict(self):
1143111431
result = df.isin(d)
1143211432
assert_frame_equal(result, expected)
1143311433

11434-
# iloc
11435-
df = DataFrame({'A': ['a', 'b', 'c'], 'B': ['a', 'e', 'f']})
11436-
d = {0: ['a']}
11437-
expected = DataFrame(False, df.index, df.columns)
11438-
11439-
# without using iloc
11440-
result = df.isin(d)
11441-
assert_frame_equal(result, expected)
11442-
11443-
# using iloc
11444-
result = df.isin(d, iloc=True)
11445-
expected.iloc[0, 0] = True
11446-
assert_frame_equal(result, expected)
11447-
1144811434
def test_isin_with_string_scalar(self):
1144911435
#GH4763
1145011436
df = DataFrame({'vals': [1, 2, 3, 4], 'ids': ['a', 'b', 'f', 'n'],
@@ -11456,6 +11442,84 @@ def test_isin_with_string_scalar(self):
1145611442
with tm.assertRaises(TypeError):
1145711443
df.isin('aaa')
1145811444

11445+
def test_isin_df(self):
11446+
df1 = DataFrame({'A': [1, 2, 3, 4], 'B': [2, np.nan, 4, 4]})
11447+
df2 = DataFrame({'A': [0, 2, 12, 4], 'B': [2, np.nan, 4, 5]})
11448+
expected = DataFrame(False, df1.index, df1.columns)
11449+
result = df1.isin(df2)
11450+
expected['A'].loc[[1, 3]] = True
11451+
expected['B'].loc[[0, 2]] = True
11452+
assert_frame_equal(result, expected)
11453+
11454+
# partial overlapping columns
11455+
df2.columns = ['A', 'C']
11456+
result = df1.isin(df2)
11457+
expected['B'] = False
11458+
assert_frame_equal(result, expected)
11459+
11460+
def test_isin_df_dupe_values(self):
    """``isin`` raises ValueError when the frame passed in has duplicate labels.

    Three variants are exercised in turn: duplicate columns only,
    duplicate index only, and both axes duplicated.
    """
    df1 = DataFrame({'A': [1, 2, 3, 4], 'B': [2, np.nan, 4, 4]})
    rows = [[0, 2], [12, 4], [2, np.nan], [4, 5]]
    repeated_index = [0, 0, 1, 1]

    # just cols duped
    dupe_cols = DataFrame(rows, columns=['B', 'B'])
    # just index duped
    dupe_index = DataFrame(rows, columns=['A', 'B'], index=repeated_index)
    # cols and index
    dupe_both = DataFrame(rows, columns=['B', 'B'], index=repeated_index)

    for bad_values in (dupe_cols, dupe_index, dupe_both):
        with tm.assertRaises(ValueError):
            df1.isin(bad_values)
11478+
11479+
def test_isin_dupe_self(self):
11480+
other = DataFrame({'A': [1, 0, 1, 0], 'B': [1, 1, 0, 0]})
11481+
df = DataFrame([[1, 1], [1, 0], [0, 0]], columns=['A','A'])
11482+
result = df.isin(other)
11483+
expected = DataFrame(False, index=df.index, columns=df.columns)
11484+
expected.loc[0] = True
11485+
expected.iloc[1, 1] = True
11486+
assert_frame_equal(result, expected)
11487+
11488+
11489+
def test_isin_against_series(self):
11490+
df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [2, np.nan, 4, 4]},
11491+
index=['a', 'b', 'c', 'd'])
11492+
s = pd.Series([1, 3, 11, 4], index=['a', 'b', 'c', 'd'])
11493+
expected = DataFrame(False, index=df.index, columns=df.columns)
11494+
expected['A'].loc['a'] = True
11495+
expected.loc['d'] = True
11496+
result = df.isin(s)
11497+
assert_frame_equal(result, expected)
11498+
11499+
def test_isin_multiIndex(self):
11500+
idx = MultiIndex.from_tuples([(0, 'a', 'foo'), (0, 'a', 'bar'),
11501+
(0, 'b', 'bar'), (0, 'b', 'baz'),
11502+
(2, 'a', 'foo'), (2, 'a', 'bar'),
11503+
(2, 'c', 'bar'), (2, 'c', 'baz'),
11504+
(1, 'b', 'foo'), (1, 'b', 'bar'),
11505+
(1, 'c', 'bar'), (1, 'c', 'baz')])
11506+
df1 = DataFrame({'A': np.ones(12),
11507+
'B': np.zeros(12)}, index=idx)
11508+
df2 = DataFrame({'A': [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1],
11509+
'B': [1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1]})
11510+
# against regular index
11511+
expected = DataFrame(False, index=df1.index, columns=df1.columns)
11512+
result = df1.isin(df2)
11513+
assert_frame_equal(result, expected)
11514+
11515+
df2.index = idx
11516+
expected = df2.values.astype(np.bool)
11517+
expected[:, 1] = ~expected[:, 1]
11518+
expected = DataFrame(expected, columns=['A', 'B'], index=idx)
11519+
11520+
result = df1.isin(df2)
11521+
assert_frame_equal(result, expected)
11522+
1145911523
def test_to_csv_date_format(self):
1146011524
from pandas import to_datetime
1146111525
pname = '__tmp_to_csv_date_format__'

0 commit comments

Comments
 (0)