diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index e8d9fd52cf352..4f8fc5e78ece3 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -456,6 +456,36 @@ and :ref:`Advanced Indexing <indexing.advanced>` you may select along more than df2.loc[criterion & (df2['b'] == 'x'),'b':'c'] +*New in 0.12.0* + +DataFrame also has an ``isin`` method. When calling ``isin``, pass the values to +look for as either an array or a dict. If ``values`` is an array, ``isin`` returns +a DataFrame of booleans that is the same shape as the original DataFrame, with ``True`` +wherever the element is in the sequence of values. + +.. ipython:: python + + df = DataFrame({'vals': [1, 2, 3, 4], 'ids': ['a', 'b', 'f', 'n'], + 'ids2': ['a', 'n', 'c', 'n']}) + + values = ['a', 'b', 1, 3] + + df.isin(values) + +Oftentimes you'll want to match certain values with certain columns or rows. +Just make ``values`` a ``dict`` where the key is the row or column label, and the value is +a list of items you want to check for. When ``values`` is a ``dict`` you must set ``axis`` to 0 for +row-wise or 1 for column-wise matching. + +.. ipython:: python + + df = DataFrame({'vals': [1, 2, 3, 4], 'ids': ['a', 'b', 'f', 'n'], + 'ids2': ['a', 'n', 'c', 'n']}) + + values = {'ids': ['a', 'b'], 'vals': [1, 3]} + + df.isin(values, axis=1) + Where and Masking ~~~~~~~~~~~~~~~~~ diff --git a/doc/source/release.rst b/doc/source/release.rst index 3b7d25789aa40..d03cdac14676a 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -54,6 +54,7 @@ pandas 0.12 - Access to historical Google Finance data in pandas.io.data (:issue:`3814`) - DataFrame plotting methods can sample column colors from a Matplotlib colormap via the ``colormap`` keyword. 
def isin(self, values, axis=None):
    """
    Return a boolean DataFrame showing whether each element of the
    DataFrame is contained in the passed values.

    Parameters
    ----------
    values : iterable or dict of {label: iterable}
        If not a dict, every element of the frame is tested against
        ``values``. If a dict, each key selects one column (``axis=1``)
        or one row (``axis=0``) and only that column/row is tested
        against the values stored under the key.
    axis : {None, 0, 1}
        Compute isin row-wise (axis=0) or column-wise (axis=1).
        Mandatory if values is a dict, ignored otherwise.

    Returns
    -------
    bools : DataFrame of booleans
        Same shape and labels as the caller when ``values`` is not a
        dict; otherwise only the columns (``axis=1``) or rows
        (``axis=0``) named by the dict keys, in dict iteration order.

    Raises
    ------
    TypeError
        If ``values`` is a dict and ``axis`` is neither 0 nor 1, or if
        ``values`` is not iterable.
    """
    if not isinstance(values, dict):
        # A one-shot iterator (e.g. a generator) would be exhausted after
        # the first column, so materialise it once up front.
        if iter(values) is values:
            values = list(values)
        # Delegate to Series.isin column by column so that this branch and
        # the dict branch below share the same membership semantics.
        return self.apply(lambda column: column.isin(values))

    # Validate before doing any work; ``axis`` has no sensible default
    # when the keys could name either rows or columns.
    if axis not in (0, 1):
        raise TypeError('Axis must be "0" or "1" when values is a dict. '
                        'Got "%s" instead.' % str(axis))

    # Function-level import, as in the original (presumably to avoid a
    # circular import at module load time).
    from pandas import concat

    if axis == 1:
        # One boolean column per requested column label.
        return concat([self[col].isin(vals)
                       for col, vals in values.items()], axis=1)

    # axis == 0: each selected row comes back as a Series indexed by the
    # column labels; stack them as columns, then transpose back to rows.
    return concat([self.loc[row].isin(vals)
                   for row, vals in values.items()], axis=1).T
'baz']).reindex_like(result_ind) + + assert_frame_equal(result_ind, expected_ind) + + self.assertRaises(TypeError, df.isin, ind_other) if __name__ == '__main__': # unittest.main()