diff --git a/doc/source/v0.13.0.txt b/doc/source/v0.13.0.txt index b2cda7d1f4041..4c1ece032310f 100644 --- a/doc/source/v0.13.0.txt +++ b/doc/source/v0.13.0.txt @@ -476,7 +476,7 @@ Enhancements t = Timestamp('20130101 09:01:02') t + pd.datetools.Nano(123) -- A new method, ``isin`` for DataFrames, plays nicely with boolean indexing. See :ref:`the docs` for more. +- A new method, ``isin`` for DataFrames, which plays nicely with boolean indexing. The argument to ``isin``, what we're comparing the DataFrame to, can be a DataFrame, Series, dict, or array of values. See :ref:`the docs` for more. To get the rows where any of the conditions are met: @@ -484,7 +484,8 @@ Enhancements dfi = DataFrame({'A': [1, 2, 3, 4], 'B': ['a', 'b', 'f', 'n']}) dfi - mask = dfi.isin({'A': [1, 2], 'B': ['e', 'f']}) + other = DataFrame({'A': [1, 3, 3, 7], 'B': ['e', 'f', 'f', 'e']}) + mask = dfi.isin(other) mask dfi[mask.any(1)] diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ece38e18e3688..e6ad5bf550f7f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4239,35 +4239,71 @@ def to_period(self, freq=None, axis=0, copy=True): return self._constructor(new_data) - - def isin(self, values, iloc=False): + def isin(self, values): """ - Return boolean DataFrame showing whether each element in the DataFrame is - contained in values. + Return boolean DataFrame showing whether each element in the + DataFrame is contained in values. Parameters ---------- - values : iterable or dictionary of columns to values - iloc : boolean, if passing a dict as values, describe columns using integer - locations (default is to use labels) + values : iterable, Series, DataFrame or dictionary + The result will only be true at a location if all the + labels match. If `values` is a Series, that's the index. If + `values` is a dictionary, the keys must be the column names, + which must match. If `values` is a DataFrame, + then both the index and column labels must match. Returns ------- DataFrame of booleans + + Examples + -------- + When ``values`` is a list: + + >>> df = DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'f']}) + >>> df.isin([1, 3, 12, 'a']) + A B + 0 True True + 1 False False + 2 True False + + When ``values`` is a dict: + + >>> df = DataFrame({'A': [1, 2, 3], 'B': [1, 4, 7]}) + >>> df.isin({'A': [1, 3], 'B': [4, 7, 12]}) + A B + 0 True False # Note that B didn't match the 1 here. + 1 False True + 2 True True + + When ``values`` is a Series or DataFrame: + + >>> df = DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'f']}) + >>> other = DataFrame({'A': [1, 3, 3, 2], 'B': ['e', 'f', 'f', 'e']}) + >>> df.isin(other) + A B + 0 True False + 1 False False # Column A in `other` has a 3, but not at index 1. + 2 True True """ if isinstance(values, dict): from collections import defaultdict from pandas.tools.merge import concat values = defaultdict(list, values) - if iloc: - return concat((self.iloc[:, [i]].isin(values[i]) - for i, col in enumerate(self.columns)), axis=1) - else: - return concat((self.iloc[:, [i]].isin(values[col]) - for i, col in enumerate(self.columns)), axis=1) - - + return concat((self.iloc[:, [i]].isin(values[col]) + for i, col in enumerate(self.columns)), axis=1) + elif isinstance(values, Series): + if not values.index.is_unique: + raise ValueError("ValueError: cannot compute isin with" + " a duplicate axis.") + return self.eq(values.reindex_like(self), axis='index') + elif isinstance(values, DataFrame): + if not (values.columns.is_unique and values.index.is_unique): + raise ValueError("ValueError: cannot compute isin with" + " a duplicate axis.") + return self.eq(values.reindex_like(self)) else: if not is_list_like(values): raise TypeError("only list-like or dict-like objects are" diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 3f5eef8c04f7d..f6db680d30061 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -11431,20 +11431,6 @@ def test_isin_dict(self): result = df.isin(d) assert_frame_equal(result, expected) - # iloc - df = DataFrame({'A': ['a', 'b', 'c'], 'B': ['a', 'e', 'f']}) - d = {0: ['a']} - expected = DataFrame(False, df.index, df.columns) - - # without using iloc - result = df.isin(d) - assert_frame_equal(result, expected) - - # using iloc - result = df.isin(d, iloc=True) - expected.iloc[0, 0] = True - assert_frame_equal(result, expected) - def test_isin_with_string_scalar(self): #GH4763 df = DataFrame({'vals': [1, 2, 3, 4], 'ids': ['a', 'b', 'f', 'n'], @@ -11456,6 +11442,84 @@ def test_isin_with_string_scalar(self): with tm.assertRaises(TypeError): df.isin('aaa') + def test_isin_df(self): + df1 = DataFrame({'A': [1, 2, 3, 4], 'B': [2, np.nan, 4, 4]}) + df2 = DataFrame({'A': [0, 2, 12, 4], 'B': [2, np.nan, 4, 5]}) + expected = DataFrame(False, df1.index, df1.columns) + result = df1.isin(df2) + expected['A'].loc[[1, 3]] = True + expected['B'].loc[[0, 2]] = True + assert_frame_equal(result, expected) + + # partial overlapping columns + df2.columns = ['A', 'C'] + result = df1.isin(df2) + expected['B'] = False + assert_frame_equal(result, expected) + + def test_isin_df_dupe_values(self): + df1 = DataFrame({'A': [1, 2, 3, 4], 'B': [2, np.nan, 4, 4]}) + # just cols duped + df2 = DataFrame([[0, 2], [12, 4], [2, np.nan], [4, 5]], + columns=['B', 'B']) + with tm.assertRaises(ValueError): + df1.isin(df2) + + # just index duped + df2 = DataFrame([[0, 2], [12, 4], [2, np.nan], [4, 5]], + columns=['A', 'B'], index=[0, 0, 1, 1]) + with tm.assertRaises(ValueError): + df1.isin(df2) + + # cols and index: + df2.columns = ['B', 'B'] + with tm.assertRaises(ValueError): + df1.isin(df2) + + def test_isin_dupe_self(self): + other = DataFrame({'A': [1, 0, 1, 0], 'B': [1, 1, 0, 0]}) + df = DataFrame([[1, 1], [1, 0], [0, 0]], columns=['A','A']) + result = df.isin(other) + expected = DataFrame(False, index=df.index, columns=df.columns) + expected.loc[0] = True + expected.iloc[1, 1] = True + assert_frame_equal(result, expected) + + + def test_isin_against_series(self): + df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [2, np.nan, 4, 4]}, + index=['a', 'b', 'c', 'd']) + s = pd.Series([1, 3, 11, 4], index=['a', 'b', 'c', 'd']) + expected = DataFrame(False, index=df.index, columns=df.columns) + expected['A'].loc['a'] = True + expected.loc['d'] = True + result = df.isin(s) + assert_frame_equal(result, expected) + + def test_isin_multiIndex(self): + idx = MultiIndex.from_tuples([(0, 'a', 'foo'), (0, 'a', 'bar'), + (0, 'b', 'bar'), (0, 'b', 'baz'), + (2, 'a', 'foo'), (2, 'a', 'bar'), + (2, 'c', 'bar'), (2, 'c', 'baz'), + (1, 'b', 'foo'), (1, 'b', 'bar'), + (1, 'c', 'bar'), (1, 'c', 'baz')]) + df1 = DataFrame({'A': np.ones(12), + 'B': np.zeros(12)}, index=idx) + df2 = DataFrame({'A': [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], + 'B': [1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1]}) + # against regular index + expected = DataFrame(False, index=df1.index, columns=df1.columns) + result = df1.isin(df2) + assert_frame_equal(result, expected) + + df2.index = idx + expected = df2.values.astype(np.bool) + expected[:, 1] = ~expected[:, 1] + expected = DataFrame(expected, columns=['A', 'B'], index=idx) + + result = df1.isin(df2) + assert_frame_equal(result, expected) + def test_to_csv_date_format(self): from pandas import to_datetime pname = '__tmp_to_csv_date_format__'