From 0309899dd7855969850c217f2b4a510602dd2aac Mon Sep 17 00:00:00 2001 From: TomAugspurger Date: Mon, 15 Jul 2013 17:47:59 -0500 Subject: [PATCH 1/3] ENH: Add `isin` method to DataFrame. docs. to be rebased ENH: Add isin method to DataFrame Basic tests. Added method and fixed tests. ENH: Add ordered argument to df.isin() Expects a sequence of arrays. Updated release notes for df.isin() CLN: cleanup going to remove ordered argument. Using a dict for ordered matching. Docs BUG: fixed subselection length check issues. Updated release notes for df.isin() remove merge conflict note --- doc/source/indexing.rst | 30 +++++++++++++++++++++++ doc/source/release.rst | 1 + pandas/core/frame.py | 32 ++++++++++++++++++++++++ pandas/tests/test_frame.py | 50 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 113 insertions(+) diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index e8d9fd52cf352..4f8fc5e78ece3 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -456,6 +456,36 @@ and :ref:`Advanced Indexing ` you may select along more than df2.loc[criterion & (df2['b'] == 'x'),'b':'c'] +*New in 0.12.0* + +DataFrame also has an ``isin`` method. When calling ``isin``, pass a set of +values as either an array or dict. If values is just an array, ``isin`` returns +a DataFrame of booleans that is the same shape as the original DataFrame, with Trues +wherever the element is in the sequence of values. + +.. ipython:: python + + df = DataFrame({'vals': [1, 2, 3, 4], 'ids': ['a', 'b', 'f', 'n'], + 'ids2': ['a', 'n', 'c', 'n']}) + + values = ['a', 'b', 1, 3] + + df.isin(values) + +Oftentimes you'll want to match certain values with certain columns or rows. +Just make values a ``dict`` where the key is the row or column, and the value is +a list of items you want to check for. Make sure to set axis equal to 0 for +row-wise or 1 for column-wise matching. + +.. ipython:: python + + df = DataFrame({'vals': [1, 2, 3, 4], 'ids': ['a', 'b', 'f', 'n'], + 'ids2': ['a', 'n', 'c', 'n']}) + + values = {'ids': ['a', 'b'], 'vals': [1, 3]} + + df.isin(values, axis=1) + Where and Masking ~~~~~~~~~~~~~~~~~ diff --git a/doc/source/release.rst b/doc/source/release.rst index 3b7d25789aa40..d03cdac14676a 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -54,6 +54,7 @@ pandas 0.12 - Access to historical Google Finance data in pandas.io.data (:issue:`3814`) - DataFrame plotting methods can sample column colors from a Matplotlib colormap via the ``colormap`` keyword. (:issue:`3860`) + - Added ``isin`` method to DataFrame (:issue:`4211`) **Improvements to existing features** diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 401a7746953cb..702baa9550a00 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5481,6 +5481,38 @@ def to_period(self, freq=None, axis=0, copy=True): return self._constructor(new_data) + def isin(self, values, axis=None): + """ + Return boolean vector showing whether elements in the DataFrame are + exactly contained in the passed sequence of values. + + Parameters + ---------- + values : sequence (array-like) or dict of {label: sequence}. + axis : {None, 0, 1} + Compute isin row-wise (axis=0) or column-wise (axis=1) + Mandatory if values is a dict, ignored otherwise. + + Returns + ------- + + bools : Series of booleans + """ + if not isinstance(values, dict): + return self.applymap(values.__contains__) + + else: + from pandas.tools.merge import concat + if axis == 1: + return concat((self[col].isin(vals) for col, vals in + values.iteritems()), axis=1) + elif axis == 0: + return concat((self.loc[row].isin(vals) for row, vals in + values.iteritems()), axis=1).T + else: + raise TypeError('Axis must be "0" or "1" when values is a dict ' + 'Got "%s" instead.' % str(axis)) + #---------------------------------------------------------------------- # Deprecated stuff diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index a9df56a498f63..07aa4fd13e1a1 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -10633,6 +10633,56 @@ def _check_f(base, f): f = lambda x: x.rename({1: 'foo'}, inplace=True) _check_f(data.copy()['c'], f) + def test_isin(self): + # GH #4211 + df = DataFrame({'vals': [1, 2, 3, 4], 'ids': ['a', 'b', 'f', 'n'], + 'ids2': ['a', 'n', 'c', 'n']}, + index=['foo', 'bar', 'baz', 'qux']) + other = ['a', 'b', 'c'] + result_none = df[['ids', 'ids2']].isin(other) + expected_none = DataFrame({'ids': [True, True, False, False], + 'ids2': [True, False, True, False]}, + index=['foo', 'bar', 'baz', 'qux']) + + assert_frame_equal(result_none, expected_none) + + # axis = None + result_none_full = df.isin(other) + expected_none_full = DataFrame({'ids': [True, True, False, False], + 'ids2': [True, False, True, False], + 'vals': [False, False, False, False]}, + index=['foo', 'bar', 'baz', 'qux']) + + assert_frame_equal(result_none_full, expected_none_full) + + def test_isin_dict(self): + df = DataFrame({'A': ['a', 'b', 'c', 'd'], 'B': [1, 2, 3, 4], + 'C': [1, 5, 7, 8]}, + index=['foo', 'bar', 'baz', 'qux']) + other = {'A': ('a', 'b'), 'B': (1, 3)} + result = df.isin(other, axis=1) + expected = DataFrame({'A': [True, True, False, False], + 'B': [True, False, True, False]}, + index=['foo', 'bar', 'baz', 'qux']) + assert_frame_equal(result, expected) + + def test_isin_row(self): + df = DataFrame({'A': ['a', 'b', 'c', 'd'], 'B': [1, 2, 3, 4], + 'C': [1, 5, 7, 8]}, + index=['foo', 'bar', 'baz', 'qux']) + ind_other = {'foo': ['a', 1, 1], + 'bar': ['d', 2, 1], + 'baz': ['nn', 'nn', 'nn']} + + result_ind = df.isin(ind_other, axis=0) + expected_ind = DataFrame({'A': [True, False, False], + 'B': [True, True, False], + 'C': [True, False, False]}, + index=['foo', 'bar', 'baz']).reindex_like(result_ind) + + assert_frame_equal(result_ind, expected_ind) + + self.assertRaises(TypeError, df.isin, ind_other) if __name__ == '__main__': # unittest.main() From ab1b17e54bbe8e6de62632b3c0a222eb89c43c46 Mon Sep 17 00:00:00 2001 From: Andy Hayden Date: Tue, 16 Jul 2013 01:13:05 +0100 Subject: [PATCH 2/3] ENH tweak DataFrame isin method --- doc/source/indexing.rst | 15 ++++------ pandas/core/frame.py | 36 +++++++++++------------- pandas/tests/test_frame.py | 56 ++++++++++++++------------------------ 3 files changed, 42 insertions(+), 65 deletions(-) diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index 4f8fc5e78ece3..d3d6fe367a0bd 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -456,11 +456,9 @@ and :ref:`Advanced Indexing ` you may select along more than df2.loc[criterion & (df2['b'] == 'x'),'b':'c'] -*New in 0.12.0* - DataFrame also has an ``isin`` method. When calling ``isin``, pass a set of -values as either an array or dict. If values is just an array, ``isin`` returns -a DataFrame of booleans that is the same shape as the original DataFrame, with Trues +values as either an array or dict. If values is an array, ``isin`` returns +a DataFrame of booleans that is the same shape as the original DataFrame, with True wherever the element is in the sequence of values. .. ipython:: python @@ -472,10 +470,9 @@ wherever the element is in the sequence of values. df.isin(values) -Oftentimes you'll want to match certain values with certain columns or rows. -Just make values a ``dict`` where the key is the row or column, and the value is -a list of items you want to check for. Make sure to set axis equal to 0 for -row-wise or 1 for column-wise matching. +Oftentimes you'll want to match certain values with certain columns. +Just make values a ``dict`` where the key is the column, and the value is +a list of items you want to check for. .. ipython:: python @@ -484,7 +481,7 @@ row-wise or 1 for column-wise matching. values = {'ids': ['a', 'b'], 'vals': [1, 3]} - df.isin(values, axis=1) + df.isin(values) Where and Masking ~~~~~~~~~~~~~~~~~ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 702baa9550a00..331deccaf80e3 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5481,37 +5481,33 @@ def to_period(self, freq=None, axis=0, copy=True): return self._constructor(new_data) - def isin(self, values, axis=None): + + def isin(self, values): """ - Return boolean vector showing whether elements in the DataFrame are - exactly contained in the passed sequence of values. + Return boolean DataFrame showing whether each elements in the DataFrame is + contained in items. Parameters ---------- - values : sequence (array-like) or dict of {label: sequence}. - axis : {None, 0, 1} - Compute isin row-wise (axis=0) or column-wise (axis=1) - Mandatory if values is a dict, ignored otherwise. + values : iterable or dictionary of columns to values Returns ------- - bools : Series of booleans + DataFrame of booleans """ - if not isinstance(values, dict): - return self.applymap(values.__contains__) + if isinstance(values, dict): + from collections import defaultdict + from pandas.tools.merge import concat + values = defaultdict(list, values) + return concat((self.iloc[:, [i]].isin(values[ind] or values[i]) + for i, ind in enumerate(self.columns)), axis=1) else: - from pandas.tools.merge import concat - if axis == 1: - return concat((self[col].isin(vals) for col, vals in - values.iteritems()), axis=1) - elif axis == 0: - return concat((self.loc[row].isin(vals) for row, vals in - values.iteritems()), axis=1).T - else: - raise TypeError('Axis must be "0" or "1" when values is a dict ' - 'Got "%s" instead.' % str(axis)) + return DataFrame(lib.ismember(self.values.ravel(), + set(values)).reshape(self.shape), + self.index, + self.columns) #---------------------------------------------------------------------- # Deprecated stuff diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 07aa4fd13e1a1..916a38ae872d5 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -10639,50 +10639,34 @@ def test_isin(self): 'ids2': ['a', 'n', 'c', 'n']}, index=['foo', 'bar', 'baz', 'qux']) other = ['a', 'b', 'c'] - result_none = df[['ids', 'ids2']].isin(other) - expected_none = DataFrame({'ids': [True, True, False, False], - 'ids2': [True, False, True, False]}, - index=['foo', 'bar', 'baz', 'qux']) - assert_frame_equal(result_none, expected_none) - - # axis = None - result_none_full = df.isin(other) - expected_none_full = DataFrame({'ids': [True, True, False, False], - 'ids2': [True, False, True, False], - 'vals': [False, False, False, False]}, - index=['foo', 'bar', 'baz', 'qux']) + result = df.isin(other) + expected = DataFrame([df.loc[s].isin(other) for s in df.index]) + assert_frame_equal(result, expected) - assert_frame_equal(result_none_full, expected_none_full) + def test_isin_empty(self): + df = DataFrame({'A': ['a', 'b', 'c'], 'B': ['a', 'e', 'f']}) + result = df.isin([]) + expected = pd.DataFrame(False, df.index, df.columns) + assert_frame_equal(result, expected) def test_isin_dict(self): - df = DataFrame({'A': ['a', 'b', 'c', 'd'], 'B': [1, 2, 3, 4], - 'C': [1, 5, 7, 8]}, - index=['foo', 'bar', 'baz', 'qux']) - other = {'A': ('a', 'b'), 'B': (1, 3)} - result = df.isin(other, axis=1) - expected = DataFrame({'A': [True, True, False, False], - 'B': [True, False, True, False]}, - index=['foo', 'bar', 'baz', 'qux']) - assert_frame_equal(result, expected) + df = DataFrame({'A': ['a', 'b', 'c'], 'B': ['a', 'e', 'f']}) + d = {'A': ['a']} - def test_isin_row(self): - df = DataFrame({'A': ['a', 'b', 'c', 'd'], 'B': [1, 2, 3, 4], - 'C': [1, 5, 7, 8]}, - index=['foo', 'bar', 'baz', 'qux']) - ind_other = {'foo': ['a', 1, 1], - 'bar': ['d', 2, 1], - 'baz': ['nn', 'nn', 'nn']} + expected = DataFrame(False, df.index, df.columns) + expected.loc[0, 'A'] = True - result_ind = df.isin(ind_other, axis=0) - expected_ind = DataFrame({'A': [True, False, False], - 'B': [True, True, False], - 'C': [True, False, False]}, - index=['foo', 'bar', 'baz']).reindex_like(result_ind) + result = df.isin(d) + assert_frame_equal(result, expected) - assert_frame_equal(result_ind, expected_ind) + # non unique columns + df.columns = ['A', 'A'] + expected = DataFrame(False, df.index, df.columns) + expected.loc[0, 'A'] = True + result = df.isin(d) + assert_frame_equal(result, expected) - self.assertRaises(TypeError, df.isin, ind_other) if __name__ == '__main__': # unittest.main() From 60b623fd1fb311d8e9cf2a2f97d7160e32656248 Mon Sep 17 00:00:00 2001 From: Andy Hayden Date: Tue, 16 Jul 2013 14:15:06 +0100 Subject: [PATCH 3/3] ENH add iloc argument to DataFrame isin --- doc/source/indexing.rst | 12 +++++++++--- pandas/core/frame.py | 17 ++++++++++++----- pandas/tests/test_frame.py | 15 +++++++++++++++ 3 files changed, 36 insertions(+), 8 deletions(-) diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index d3d6fe367a0bd..213a7ab659dae 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -476,13 +476,19 @@ a list of items you want to check for. .. ipython:: python - df = DataFrame({'vals': [1, 2, 3, 4], 'ids': ['a', 'b', 'f', 'n'], - 'ids2': ['a', 'n', 'c', 'n']}) - values = {'ids': ['a', 'b'], 'vals': [1, 3]} df.isin(values) +You can also describe columns using integer location: + +.. ipython:: python + + values = {0: ['a', 'b']} + + df.isin(values, iloc=True) + + Where and Masking ~~~~~~~~~~~~~~~~~ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 331deccaf80e3..22dc27ff977d9 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5482,14 +5482,16 @@ def to_period(self, freq=None, axis=0, copy=True): return self._constructor(new_data) - def isin(self, values): + def isin(self, values, iloc=False): """ - Return boolean DataFrame showing whether each elements in the DataFrame is - contained in items. + Return boolean DataFrame showing whether each element in the DataFrame is + contained in values. Parameters ---------- values : iterable or dictionary of columns to values + iloc : boolean, if passing a dict as values, describe columns using integer + locations (default is to use labels) Returns ------- @@ -5500,8 +5502,13 @@ def isin(self, values): from collections import defaultdict from pandas.tools.merge import concat values = defaultdict(list, values) - return concat((self.iloc[:, [i]].isin(values[ind] or values[i]) - for i, ind in enumerate(self.columns)), axis=1) + if iloc: + return concat((self.iloc[:, [i]].isin(values[i]) + for i, col in enumerate(self.columns)), axis=1) + else: + return concat((self.iloc[:, [i]].isin(values[col]) + for i, col in enumerate(self.columns)), axis=1) + else: return DataFrame(lib.ismember(self.values.ravel(), diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 916a38ae872d5..577cbfe9dc744 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -10661,12 +10661,27 @@ def test_isin_dict(self): assert_frame_equal(result, expected) # non unique columns + df = DataFrame({'A': ['a', 'b', 'c'], 'B': ['a', 'e', 'f']}) df.columns = ['A', 'A'] expected = DataFrame(False, df.index, df.columns) expected.loc[0, 'A'] = True result = df.isin(d) assert_frame_equal(result, expected) + # iloc + df = DataFrame({'A': ['a', 'b', 'c'], 'B': ['a', 'e', 'f']}) + d = {0: ['a']} + expected = DataFrame(False, df.index, df.columns) + + # without using iloc + result = df.isin(d) + assert_frame_equal(result, expected) + + # using iloc + result = df.isin(d, iloc=True) + expected.iloc[0, 0] = True + assert_frame_equal(result, expected) + if __name__ == '__main__': # unittest.main()