Skip to content

ENH: Dataframe isin2 #4258

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jul 24, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions doc/source/indexing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -456,6 +456,39 @@ and :ref:`Advanced Indexing <indexing.advanced>` you may select along more than

df2.loc[criterion & (df2['b'] == 'x'),'b':'c']

DataFrame also has an ``isin`` method. When calling ``isin``, pass the values
to test for as either an array or a ``dict``. If ``values`` is an array,
``isin`` returns a DataFrame of booleans with the same shape as the original
DataFrame, holding ``True`` wherever the element is one of the given values.

.. ipython:: python

df = DataFrame({'vals': [1, 2, 3, 4], 'ids': ['a', 'b', 'f', 'n'],
'ids2': ['a', 'n', 'c', 'n']})

values = ['a', 'b', 1, 3]

df.isin(values)

Oftentimes you'll want to match certain values with certain columns.
Just make values a ``dict`` where the key is the column, and the value is
a list of items you want to check for.

.. ipython:: python

values = {'ids': ['a', 'b'], 'vals': [1, 3]}

df.isin(values)

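You can also describe columns by integer location: pass ``iloc=True`` and the
keys of the ``dict`` are interpreted as column positions instead of labels: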

.. ipython:: python

values = {0: ['a', 'b']}

df.isin(values, iloc=True)


Where and Masking
~~~~~~~~~~~~~~~~~

Expand Down
1 change: 1 addition & 0 deletions doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ pandas 0.12
- Access to historical Google Finance data in pandas.io.data (:issue:`3814`)
- DataFrame plotting methods can sample column colors from a Matplotlib
colormap via the ``colormap`` keyword. (:issue:`3860`)
- Added ``isin`` method to DataFrame (:issue:`4211`)

**Improvements to existing features**

Expand Down
35 changes: 35 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -5481,6 +5481,41 @@ def to_period(self, freq=None, axis=0, copy=True):

return self._constructor(new_data)


def isin(self, values, iloc=False):
    """
    Test each element of the DataFrame for membership in ``values``.

    Parameters
    ----------
    values : iterable or dict
        If an iterable, every element of the frame is checked against it.
        If a dict, it maps a column to the values checked for in that
        column; columns without an entry are checked against an empty list.
    iloc : boolean, default False
        Only used when ``values`` is a dict. If True, the dict keys are
        integer column positions; otherwise they are column labels.

    Returns
    -------
    DataFrame of booleans
    """
    if not isinstance(values, dict):
        # Flat membership test over every cell, then restore the 2-D shape.
        flat_mask = lib.ismember(self.values.ravel(), set(values))
        return DataFrame(flat_mask.reshape(self.shape),
                         self.index,
                         self.columns)

    from collections import defaultdict
    from pandas.tools.merge import concat

    # Work on a defaultdict copy so that looking up a column with no entry
    # yields an empty list rather than raising KeyError.
    lookup = defaultdict(list, values)

    # Select each column by position (as a one-column frame) so that
    # duplicate column labels are still handled one column at a time.
    column_masks = (
        self.iloc[:, [pos]].isin(lookup[pos if iloc else label])
        for pos, label in enumerate(self.columns)
    )
    return concat(column_masks, axis=1)

#----------------------------------------------------------------------
# Deprecated stuff

Expand Down
49 changes: 49 additions & 0 deletions pandas/tests/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -10633,6 +10633,55 @@ def _check_f(base, f):
f = lambda x: x.rename({1: 'foo'}, inplace=True)
_check_f(data.copy()['c'], f)

def test_isin(self):
    # GH #4211
    # isin with a list of values must agree with applying isin to each
    # row separately and stacking the per-row results back into a frame.
    frame = DataFrame({'vals': [1, 2, 3, 4], 'ids': ['a', 'b', 'f', 'n'],
                       'ids2': ['a', 'n', 'c', 'n']},
                      index=['foo', 'bar', 'baz', 'qux'])
    candidates = ['a', 'b', 'c']

    row_masks = [frame.loc[label].isin(candidates) for label in frame.index]
    expected = DataFrame(row_masks)

    result = frame.isin(candidates)
    assert_frame_equal(result, expected)

def test_isin_empty(self):
    # An empty list of values matches nothing: every cell must be False.
    frame = DataFrame({'A': ['a', 'b', 'c'], 'B': ['a', 'e', 'f']})
    all_false = pd.DataFrame(False, frame.index, frame.columns)

    result = frame.isin([])
    assert_frame_equal(result, all_false)

def test_isin_dict(self):
    # --- dict keyed by column label -----------------------------------
    frame = DataFrame({'A': ['a', 'b', 'c'], 'B': ['a', 'e', 'f']})
    by_label = {'A': ['a']}

    expected = DataFrame(False, frame.index, frame.columns)
    expected.loc[0, 'A'] = True

    assert_frame_equal(frame.isin(by_label), expected)

    # --- non unique columns: same dict against duplicate 'A' labels ---
    frame = DataFrame({'A': ['a', 'b', 'c'], 'B': ['a', 'e', 'f']})
    frame.columns = ['A', 'A']
    expected = DataFrame(False, frame.index, frame.columns)
    expected.loc[0, 'A'] = True

    assert_frame_equal(frame.isin(by_label), expected)

    # --- dict keyed by integer position -------------------------------
    frame = DataFrame({'A': ['a', 'b', 'c'], 'B': ['a', 'e', 'f']})
    by_position = {0: ['a']}
    expected = DataFrame(False, frame.index, frame.columns)

    # without iloc the key 0 is treated as a label, which matches no
    # column, so nothing is flagged
    assert_frame_equal(frame.isin(by_position), expected)

    # with iloc=True the key 0 addresses the first column
    result = frame.isin(by_position, iloc=True)
    expected.iloc[0, 0] = True
    assert_frame_equal(result, expected)


if __name__ == '__main__':
# unittest.main()
Expand Down