diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst
index e8d9fd52cf352..213a7ab659dae 100644
--- a/doc/source/indexing.rst
+++ b/doc/source/indexing.rst
@@ -456,6 +456,39 @@ and :ref:`Advanced Indexing ` you may select along more than
 
    df2.loc[criterion & (df2['b'] == 'x'),'b':'c']
 
+DataFrame also has an ``isin`` method. When calling ``isin``, pass the values
+to look for as either an array-like or a ``dict``. If ``values`` is array-like,
+``isin`` returns a DataFrame of booleans with the same shape as the original
+DataFrame, with ``True`` wherever the element is in the sequence of values.
+
+.. ipython:: python
+
+   df = DataFrame({'vals': [1, 2, 3, 4], 'ids': ['a', 'b', 'f', 'n'],
+                   'ids2': ['a', 'n', 'c', 'n']})
+
+   values = ['a', 'b', 1, 3]
+
+   df.isin(values)
+
+Oftentimes you'll want to match certain values with certain columns.
+Just make ``values`` a ``dict`` where the key is the column, and the value is
+a list of items you want to check for.
+
+.. ipython:: python
+
+   values = {'ids': ['a', 'b'], 'vals': [1, 3]}
+
+   df.isin(values)
+
+You can also identify columns by integer position by passing ``iloc=True``:
+
+.. ipython:: python
+
+   values = {0: ['a', 'b']}
+
+   df.isin(values, iloc=True)
+
+
 Where and Masking
 ~~~~~~~~~~~~~~~~~
 
diff --git a/doc/source/release.rst b/doc/source/release.rst
index 3b7d25789aa40..d03cdac14676a 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -54,6 +54,7 @@ pandas 0.12
   - Access to historical Google Finance data in pandas.io.data (:issue:`3814`)
   - DataFrame plotting methods can sample column colors from a Matplotlib
     colormap via the ``colormap`` keyword. (:issue:`3860`)
+  - Added ``isin`` method to DataFrame (:issue:`4211`)
 
 **Improvements to existing features**
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 401a7746953cb..22dc27ff977d9 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -5481,6 +5481,46 @@ def to_period(self, freq=None, axis=0, copy=True):
 
         return self._constructor(new_data)
 
+
+    def isin(self, values, iloc=False):
+        """
+        Return a boolean DataFrame showing whether each element of this
+        DataFrame is contained in ``values``.
+
+        Parameters
+        ----------
+        values : iterable or dict
+            If an iterable, every element of the frame is tested against
+            that single collection of values.  If a dict, each key names a
+            column and maps to the iterable of values to test that column
+            against; columns without an entry come back all False.
+        iloc : boolean, default False
+            Only consulted when ``values`` is a dict.  If True the dict keys
+            are integer column positions, otherwise they are column labels.
+
+        Returns
+        -------
+        DataFrame of booleans
+            Same shape, index and columns as the caller.
+        """
+        if isinstance(values, dict):
+            from pandas.tools.merge import concat
+            if not len(self.columns):
+                # concat refuses an empty sequence of frames; with no
+                # columns no dict entry can match, so reuse the array path
+                return self.isin([])
+            frames = []
+            for i, col in enumerate(self.columns):
+                key = i if iloc else col
+                # slice by position so duplicate column labels stay separate
+                frames.append(self.iloc[:, [i]].isin(values.get(key, [])))
+            return concat(frames, axis=1)
+        else:
+            return DataFrame(lib.ismember(self.values.ravel(),
+                                          set(values)).reshape(self.shape),
+                             self.index,
+                             self.columns)
+
     #----------------------------------------------------------------------
     # Deprecated stuff
 
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
index a9df56a498f63..577cbfe9dc744 100644
--- a/pandas/tests/test_frame.py
+++ b/pandas/tests/test_frame.py
@@ -10633,6 +10633,61 @@ def _check_f(base, f):
         f = lambda x: x.rename({1: 'foo'}, inplace=True)
         _check_f(data.copy()['c'], f)
 
+    def test_isin(self):
+        # GH #4211
+        df = DataFrame({'vals': [1, 2, 3, 4], 'ids': ['a', 'b', 'f', 'n'],
+                        'ids2': ['a', 'n', 'c', 'n']},
+                       index=['foo', 'bar', 'baz', 'qux'])
+        other = ['a', 'b', 'c']
+
+        result = df.isin(other)
+        expected = DataFrame([df.loc[s].isin(other) for s in df.index])
+        assert_frame_equal(result, expected)
+
+    def test_isin_empty(self):
+        df = DataFrame({'A': ['a', 'b', 'c'], 'B': ['a', 'e', 'f']})
+        result = df.isin([])
+        expected = DataFrame(False, df.index, df.columns)
+        assert_frame_equal(result, expected)
+
+    def test_isin_dict(self):
+        df = DataFrame({'A': ['a', 'b', 'c'], 'B': ['a', 'e', 'f']})
+        d = {'A': ['a']}
+
+        expected = DataFrame(False, df.index, df.columns)
+        expected.loc[0, 'A'] = True
+
+        result = df.isin(d)
+        assert_frame_equal(result, expected)
+
+        # non unique columns
+        df = DataFrame({'A': ['a', 'b', 'c'], 'B': ['a', 'e', 'f']})
+        df.columns = ['A', 'A']
+        expected = DataFrame(False, df.index, df.columns)
+        expected.loc[0, 'A'] = True
+        result = df.isin(d)
+        assert_frame_equal(result, expected)
+
+        # iloc
+        df = DataFrame({'A': ['a', 'b', 'c'], 'B': ['a', 'e', 'f']})
+        d = {0: ['a']}
+        expected = DataFrame(False, df.index, df.columns)
+
+        # without using iloc
+        result = df.isin(d)
+        assert_frame_equal(result, expected)
+
+        # using iloc
+        result = df.isin(d, iloc=True)
+        expected.iloc[0, 0] = True
+        assert_frame_equal(result, expected)
+
+    def test_isin_dict_no_columns(self):
+        # a frame without columns gives the dict path nothing to concat
+        df = DataFrame(index=['foo', 'bar'])
+        result = df.isin({'A': ['a']})
+        self.assertEqual(result.shape, (2, 0))
+
 
 if __name__ == '__main__':
     # unittest.main()