Skip to content

ENH/API: Accept DataFrame for isin #5199

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Oct 17, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions doc/source/v0.13.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -476,15 +476,16 @@ Enhancements
t = Timestamp('20130101 09:01:02')
t + pd.datetools.Nano(123)

- A new method, ``isin`` for DataFrames, plays nicely with boolean indexing. See :ref:`the docs<indexing.basics.indexing_isin>` for more.
- A new ``isin`` method for DataFrames plays nicely with boolean indexing. The argument to ``isin`` (the values the DataFrame is compared against) can be a DataFrame, Series, dict, or array of values. See :ref:`the docs<indexing.basics.indexing_isin>` for more.

To get the rows where any of the conditions are met:

.. ipython:: python

dfi = DataFrame({'A': [1, 2, 3, 4], 'B': ['a', 'b', 'f', 'n']})
dfi
mask = dfi.isin({'A': [1, 2], 'B': ['e', 'f']})
other = DataFrame({'A': [1, 3, 3, 7], 'B': ['e', 'f', 'f', 'e']})
mask = dfi.isin(other)
mask
dfi[mask.any(1)]

Expand Down
66 changes: 51 additions & 15 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4239,35 +4239,71 @@ def to_period(self, freq=None, axis=0, copy=True):

return self._constructor(new_data)


def isin(self, values, iloc=False):
def isin(self, values):
"""
Return boolean DataFrame showing whether each element in the DataFrame is
contained in values.
Return boolean DataFrame showing whether each element in the
DataFrame is contained in values.

Parameters
----------
values : iterable or dictionary of columns to values
iloc : boolean, if passing a dict as values, describe columns using integer
locations (default is to use labels)
values : iterable, Series, DataFrame or dictionary
The result will only be true at a location if all the
labels match. If `values` is a Series, that's the index. If
`values` is a dictionary, the keys must be the column names,
which must match. If `values` is a DataFrame,
then both the index and column labels must match.

Returns
-------

DataFrame of booleans

Examples
--------
When ``values`` is a list:

>>> df = DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'f']})
>>> df.isin([1, 3, 12, 'a'])
A B
0 True True
1 False False
2 True False

When ``values`` is a dict:

>>> df = DataFrame({'A': [1, 2, 3], 'B': [1, 4, 7]})
>>> df.isin({'A': [1, 3], 'B': [4, 7, 12]})
A B
0 True False # Note that B didn't match the 1 here.
1 False True
2 True True

When ``values`` is a Series or DataFrame:

>>> df = DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'f']})
>>> other = DataFrame({'A': [1, 3, 3, 2], 'B': ['e', 'f', 'f', 'e']})
>>> df.isin(other)
A B
0 True False
1 False False # Column A in `other` has a 3, but not at index 1.
2 True True
"""
if isinstance(values, dict):
from collections import defaultdict
from pandas.tools.merge import concat
values = defaultdict(list, values)
if iloc:
return concat((self.iloc[:, [i]].isin(values[i])
for i, col in enumerate(self.columns)), axis=1)
else:
return concat((self.iloc[:, [i]].isin(values[col])
for i, col in enumerate(self.columns)), axis=1)


return concat((self.iloc[:, [i]].isin(values[col])
for i, col in enumerate(self.columns)), axis=1)
elif isinstance(values, Series):
if not values.index.is_unique:
raise ValueError("ValueError: cannot compute isin with"
" a duplicate axis.")
return self.eq(values.reindex_like(self), axis='index')
elif isinstance(values, DataFrame):
if not (values.columns.is_unique and values.index.is_unique):
raise ValueError("ValueError: cannot compute isin with"
" a duplicate axis.")
return self.eq(values.reindex_like(self))
else:
if not is_list_like(values):
raise TypeError("only list-like or dict-like objects are"
Expand Down
92 changes: 78 additions & 14 deletions pandas/tests/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -11431,20 +11431,6 @@ def test_isin_dict(self):
result = df.isin(d)
assert_frame_equal(result, expected)

# iloc
df = DataFrame({'A': ['a', 'b', 'c'], 'B': ['a', 'e', 'f']})
d = {0: ['a']}
expected = DataFrame(False, df.index, df.columns)

# without using iloc
result = df.isin(d)
assert_frame_equal(result, expected)

# using iloc
result = df.isin(d, iloc=True)
expected.iloc[0, 0] = True
assert_frame_equal(result, expected)

def test_isin_with_string_scalar(self):
#GH4763
df = DataFrame({'vals': [1, 2, 3, 4], 'ids': ['a', 'b', 'f', 'n'],
Expand All @@ -11456,6 +11442,84 @@ def test_isin_with_string_scalar(self):
with tm.assertRaises(TypeError):
df.isin('aaa')

def test_isin_df(self):
df1 = DataFrame({'A': [1, 2, 3, 4], 'B': [2, np.nan, 4, 4]})
df2 = DataFrame({'A': [0, 2, 12, 4], 'B': [2, np.nan, 4, 5]})
expected = DataFrame(False, df1.index, df1.columns)
result = df1.isin(df2)
expected['A'].loc[[1, 3]] = True
expected['B'].loc[[0, 2]] = True
assert_frame_equal(result, expected)

# partial overlapping columns
df2.columns = ['A', 'C']
result = df1.isin(df2)
expected['B'] = False
assert_frame_equal(result, expected)

def test_isin_df_dupe_values(self):
    """``isin`` must raise ValueError when ``values`` has a duplicated axis.

    Three comparison frames are tried in turn: duplicated columns only,
    duplicated index only, and both axes duplicated.
    """
    df1 = DataFrame({'A': [1, 2, 3, 4], 'B': [2, np.nan, 4, 4]})
    rows = [[0, 2], [12, 4], [2, np.nan], [4, 5]]
    dupe_index = [0, 0, 1, 1]
    others = (
        # just cols duped
        DataFrame(rows, columns=['B', 'B']),
        # just index duped
        DataFrame(rows, columns=['A', 'B'], index=dupe_index),
        # cols and index
        DataFrame(rows, columns=['B', 'B'], index=dupe_index),
    )
    for other in others:
        with tm.assertRaises(ValueError):
            df1.isin(other)

def test_isin_dupe_self(self):
other = DataFrame({'A': [1, 0, 1, 0], 'B': [1, 1, 0, 0]})
df = DataFrame([[1, 1], [1, 0], [0, 0]], columns=['A','A'])
result = df.isin(other)
expected = DataFrame(False, index=df.index, columns=df.columns)
expected.loc[0] = True
expected.iloc[1, 1] = True
assert_frame_equal(result, expected)


def test_isin_against_series(self):
df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [2, np.nan, 4, 4]},
index=['a', 'b', 'c', 'd'])
s = pd.Series([1, 3, 11, 4], index=['a', 'b', 'c', 'd'])
expected = DataFrame(False, index=df.index, columns=df.columns)
expected['A'].loc['a'] = True
expected.loc['d'] = True
result = df.isin(s)
assert_frame_equal(result, expected)

def test_isin_multiIndex(self):
idx = MultiIndex.from_tuples([(0, 'a', 'foo'), (0, 'a', 'bar'),
(0, 'b', 'bar'), (0, 'b', 'baz'),
(2, 'a', 'foo'), (2, 'a', 'bar'),
(2, 'c', 'bar'), (2, 'c', 'baz'),
(1, 'b', 'foo'), (1, 'b', 'bar'),
(1, 'c', 'bar'), (1, 'c', 'baz')])
df1 = DataFrame({'A': np.ones(12),
'B': np.zeros(12)}, index=idx)
df2 = DataFrame({'A': [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1],
'B': [1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1]})
# against regular index
expected = DataFrame(False, index=df1.index, columns=df1.columns)
result = df1.isin(df2)
assert_frame_equal(result, expected)

df2.index = idx
expected = df2.values.astype(np.bool)
expected[:, 1] = ~expected[:, 1]
expected = DataFrame(expected, columns=['A', 'B'], index=idx)

result = df1.isin(df2)
assert_frame_equal(result, expected)

def test_to_csv_date_format(self):
from pandas import to_datetime
pname = '__tmp_to_csv_date_format__'
Expand Down