diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py
index 1819cfa2725db..50789d64edda1 100644
--- a/asv_bench/benchmarks/frame_methods.py
+++ b/asv_bench/benchmarks/frame_methods.py
@@ -412,21 +412,35 @@ def time_frame_nunique(self):
 class Duplicated(object):

     goal_time = 0.2
+    params = (['first', 'last', False], [True, False])
+    param_names = ['keep', 'return_inverse']
+
+    def setup(self, keep, return_inverse):
+        if keep is False and return_inverse:
+            raise NotImplementedError

-    def setup(self):
         n = (1 << 20)
         t = date_range('2015-01-01', freq='S', periods=(n // 64))
         xs = np.random.randn(n // 64).round(2)
         self.df = DataFrame({'a': np.random.randint(-1 << 8, 1 << 8, n),
                              'b': np.random.choice(t, n),
                              'c': np.random.choice(xs, n)})
-        self.df2 = DataFrame(np.random.randn(1000, 100).astype(str)).T
+        # df2 will not have any duplicates
+        self.df2 = DataFrame(np.random.randn(100, 1000).astype(str))
+
+        df3 = DataFrame(np.random.randint(0, 10, (2 ** 18, 5)),
+                        columns=list('ABCDE'))
+        df3.loc[:, 'F'] = Series('', index=df3.index).str.cat(df3.astype(str))
+        self.df3 = df3
+
+    def time_frame_duplicated(self, keep, return_inverse):
+        self.df.duplicated(keep=keep, return_inverse=return_inverse)

-    def time_frame_duplicated(self):
-        self.df.duplicated()
+    def time_frame_duplicated_wide(self, keep, return_inverse):
+        self.df2.duplicated(keep=keep, return_inverse=return_inverse)

-    def time_frame_duplicated_wide(self):
-        self.df2.duplicated()
+    def time_frame_duplicated_mixed(self, keep, return_inverse):
+        self.df3.duplicated(keep=keep, return_inverse=return_inverse)


 class XS(object):
diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py
index f1703e163917a..a3a7f7f17d332 100644
--- a/asv_bench/benchmarks/index_object.py
+++ b/asv_bench/benchmarks/index_object.py
@@ -84,6 +84,24 @@ def time_modulo(self, dtype):
         self.index % 2


+class Duplicated(object):
+
+    goal_time = 0.2
+    params = (['first', 'last', False], [True, False])
+    param_names = ['keep', 'return_inverse']
+
+    def setup(self, keep, return_inverse):
+        if keep is False and return_inverse:
+            raise NotImplementedError
+
+        n, k = 200, 1000
+        base = tm.makeStringIndex(n)
+        self.idx = Index(base[np.random.choice(n, k * n)])
+
+    def time_duplicated(self, keep, return_inverse):
+        self.idx.duplicated(keep=keep, return_inverse=return_inverse)
+
+
 class Range(object):

     goal_time = 0.2
diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py
index 0c92214795557..ac73c2d9c72dc 100644
--- a/asv_bench/benchmarks/multiindex_object.py
+++ b/asv_bench/benchmarks/multiindex_object.py
@@ -83,17 +83,22 @@ def time_is_monotonic(self):
 class Duplicated(object):

     goal_time = 0.2
+    params = (['first', 'last', False], [True, False])
+    param_names = ['keep', 'return_inverse']

-    def setup(self):
-        n, k = 200, 5000
+    def setup(self, keep, return_inverse):
+        if keep is False and return_inverse:
+            raise NotImplementedError
+
+        n, k = 200, 1000
         levels = [np.arange(n),
                   tm.makeStringIndex(n).values,
                   1000 + np.arange(n)]
         labels = [np.random.choice(n, (k * n)) for lev in levels]
         self.mi = MultiIndex(levels=levels, labels=labels)

-    def time_duplicated(self):
-        self.mi.duplicated()
+    def time_duplicated(self, keep, return_inverse):
+        self.mi.duplicated(keep=keep, return_inverse=return_inverse)


 class Sortlevel(object):
diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py
index a26c5d89bc483..cc08355b61e88 100644
--- a/asv_bench/benchmarks/series_methods.py
+++ b/asv_bench/benchmarks/series_methods.py
@@ -192,3 +192,21 @@ def setup(self):

     def time_series_datetimeindex_repr(self):
         getattr(self.s, 'a', None)
+
+
+class Duplicated(object):
+
+    goal_time = 0.2
+    params = (['first', 'last', False], [True, False])
+    param_names = ['keep', 'return_inverse']
+
+    def setup(self, keep, return_inverse):
+        if keep is False and return_inverse:
+            raise NotImplementedError
+
+        n, k = 200, 1000
+        base = tm.makeStringIndex(n)
+        self.s = Series(base[np.random.choice(n, k * n)])
+
+    def time_series_duplicated(self, keep, return_inverse):
+        self.s.duplicated(keep=keep, return_inverse=return_inverse)
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
index 618d7454c67fe..c477c66604993 100644
--- a/doc/source/whatsnew/v0.24.0.txt
+++ b/doc/source/whatsnew/v0.24.0.txt
@@ -163,6 +163,52 @@ This is the same behavior as ``Series.values`` for categorical data.

 See :ref:`whatsnew_0240.api_breaking.interval_values` for more.

+.. _whatsnew_0240.enhancements.duplicated_inverse:
+
+The ``duplicated`` method has gained the ``return_inverse`` keyword
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The :meth:`~DataFrame.duplicated` method of ``Series``, ``DataFrame`` and all flavors of ``Index`` has gained a ``return_inverse`` keyword,
+which is ``False`` by default. Specifying ``return_inverse=True`` adds a second object to the output (which therefore becomes a tuple)
+that allows reconstructing the original object from the deduplicated, unique subset (:issue:`21357`).
+
+For ``Index`` objects, the inverse is an ``np.ndarray``:
+
+.. ipython:: python
+
+    idx = pd.Index(['a', 'b', 'b', 'c', 'a'])
+    isduplicate, inverse = idx.duplicated(return_inverse=True)  # default: keep='first'
+    isduplicate
+    inverse
+
+The original ``Index`` can then be reconstructed as follows:
+
+.. ipython:: python
+
+    unique = idx[~isduplicate]  # same as idx.drop_duplicates()
+    unique
+
+    reconstruct = unique[inverse]
+    reconstruct.equals(idx)
+
+For ``DataFrame`` and ``Series``, the inverse needs to take the original index into account as well, and is therefore a ``Series``,
+which maps each entry of the original index to the corresponding entry in the index of the deduplicated, unique subset.
+
+.. ipython:: python
+
+    df = pd.DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']},
+                      index=[1, 4, 9, 16, 25])
+    df
+    isduplicate, inverse = df.duplicated(keep='last', return_inverse=True)
+    isduplicate
+    inverse
+
+    unique = df.loc[~isduplicate]  # same as df.drop_duplicates(keep='last')
+    unique
+    reconstruct = unique.reindex(inverse.values).set_index(inverse.index)
+    reconstruct.equals(df)
+
+
 .. _whatsnew_0240.enhancements.other:

 Other Enhancements
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index d39e9e08e2947..4c8dbb8dafdb2 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -771,7 +771,7 @@ def _value_counts_arraylike(values, dropna):
     return keys, counts


-def duplicated(values, keep='first'):
+def duplicated(values, keep='first', return_inverse=False):
     """
     Return boolean ndarray denoting duplicate values.

@@ -786,16 +786,69 @@ def duplicated(values, keep='first'):
          occurrence.
        - ``last`` : Mark duplicates as ``True`` except for the last
          occurrence.
-        - False : Mark all duplicates as ``True``.
+        - False : Mark all duplicates as ``True``. This option is not
+          compatible with ``return_inverse``.
+    return_inverse : boolean, default False
+        If True, also return the selection of (integer) indices from the array
+        of unique values (created e.g. by selecting the boolean complement of
+        the first output, or by using `.drop_duplicates` with the same
+        `keep`-parameter) that can be used to reconstruct "values".
+
+        .. versionadded:: 0.24.0

     Returns
     -------
-    duplicated : ndarray
+    duplicated : ndarray or tuple of ndarray if ``return_inverse`` is True
     """
+    if return_inverse and keep is False:
+        raise ValueError("The parameters return_inverse=True and "
+                         "keep=False cannot be used together (impossible "
+                         "to calculate an inverse when discarding all "
+                         "instances of a duplicate).")
+
     values, dtype, ndtype = _ensure_data(values)
     f = getattr(htable, "duplicated_{dtype}".format(dtype=ndtype))
-    return f(values, keep=keep)
+    isduplicate = f(values, keep=keep)
+    if not return_inverse:
+        return isduplicate
+    elif not isduplicate.any():
+        # no need to calculate inverse if no duplicates
+        inverse = np.arange(len(values))
+        return isduplicate, inverse
+
+    if keep == 'first':
+        # values2unique: original indices to indices of ARRAY of unique values
+        # unique2values: reduplication from array of uniques to original array
+        # these fit together in the sense that values[values2unique] are the
+        # unique values and values[values2unique][unique2values] == values
+        _, values2unique, unique2values = np.unique(values, return_index=True,
+                                                    return_inverse=True)
+    elif keep == 'last':
+        # np.unique takes the first occurrence per unique value, so we flip
+        # "values" around so that the last occurrence becomes the first
+        values = values[::-1]
+        _, values2unique, unique2values = np.unique(values, return_index=True,
+                                                    return_inverse=True)
+        # the values in "values" correspond(ed) to the index of "values",
+        # which is simply np.arange(len(values)).
+        # By flipping "values" around, we need to do the same for the index,
+        # _because values2unique and unique2values are relative to that order_.
+        # Finally, to fit with the original order again, we need to flip the
+        # result around one last time.
+        values2unique = np.arange(len(values))[::-1][values2unique]
+        unique2values = unique2values[::-1]
+
+    # np.unique yields a ___sorted___ list of uniques, and values2unique /
+    # unique2values are relative to this order. To restore the original order,
+    # we argsort values2unique, because values2unique would be ordered if
+    # np.unique had not sorted implicitly.
+    # The first argsort gives the permutation from values2unique to its sorted
+    # form, but we need the inverse permutation (the map from the unsorted
+    # uniques to values2unique, from which we can continue with unique2values).
+    # This inversion (as a permutation) is achieved by the second argsort.
+    inverse = np.argsort(np.argsort(values2unique))[unique2values]
+    return isduplicate, inverse


 def mode(values, dropna=True):
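The double-argsort trick in the hunk above is the subtle part of the patch. As a standalone illustration (plain NumPy, a minimal sketch rather than the patch itself), here is how the inverse is derived and why the round trip holds:

    import numpy as np

    values = np.array(list('baacab'))

    # np.unique sorts its uniques; return_index / return_inverse are
    # relative to that sorted order, not to first-occurrence order.
    _, values2unique, unique2values = np.unique(values, return_index=True,
                                                return_inverse=True)

    # argsort of an argsort inverts a permutation: it maps each sorted
    # unique back to its position in first-occurrence order.
    inverse = np.argsort(np.argsort(values2unique))[unique2values]

    uniques = values[np.sort(values2unique)]   # uniques in occurrence order
    assert (uniques[inverse] == values).all()  # reconstruction round-trips
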
diff --git a/pandas/core/base.py b/pandas/core/base.py
index 26fea89b45ae1..e9ae6004be5a0 100644
--- a/pandas/core/base.py
+++ b/pandas/core/base.py
@@ -1259,16 +1259,40 @@ def drop_duplicates(self, keep='first', inplace=False):
         else:
             return result

-    def duplicated(self, keep='first'):
+    def duplicated(self, keep='first', return_inverse=False):
         from pandas.core.algorithms import duplicated
+
+        if return_inverse and keep is False:
+            raise ValueError("The parameters return_inverse=True and "
+                             "keep=False cannot be used together (impossible "
+                             "to calculate an inverse when discarding all "
+                             "instances of a duplicate).")
+
         if isinstance(self, ABCIndexClass):
             if self.is_unique:
-                return np.zeros(len(self), dtype=np.bool)
-            return duplicated(self, keep=keep)
-        else:
+                isduplicate = np.zeros(len(self), dtype=np.bool)
+                if not return_inverse:
+                    return isduplicate
+                return isduplicate, np.arange(len(self))
+            # core.algorithms.duplicated has the same output signature as
+            # Index.duplicated -> no need to distinguish cases here
+            return duplicated(self, keep=keep, return_inverse=return_inverse)
+
+        # Series case
+        if not return_inverse:
             return self._constructor(duplicated(self, keep=keep),
                                      index=self.index).__finalize__(self)

+        # return_inverse = True
+        isduplicate_array, inverse_array = duplicated(self, keep=keep,
+                                                      return_inverse=True)
+        isduplicate = self._constructor(isduplicate_array,
+                                        index=self.index).__finalize__(self)
+        inverse = self._constructor(
+            self.loc[~isduplicate_array].index[inverse_array],
+            index=self.index)
+        return isduplicate, inverse
+
     # ----------------------------------------------------------------------
     # abstracts
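For a Series, the hunk above translates the positional inverse returned by the array-level helper into index labels. A minimal sketch of that translation, assuming this patch is applied (note that pandas.core.algorithms is internal API):

    import pandas as pd
    from pandas.core.algorithms import duplicated  # internal; per this patch

    s = pd.Series(['a', 'b', 'b', 'c', 'a'], index=[1, 4, 9, 16, 25])

    # positional result: inv_arr[i] is the position of row i within the
    # array of unique values
    isdup_arr, inv_arr = duplicated(s.values, keep='first',
                                    return_inverse=True)

    # translate positions to labels: take the labels of the unique rows
    # and select them positionally, keeping the original index alongside
    inverse = pd.Series(s.loc[~isdup_arr].index[inv_arr], index=s.index)
    print(inverse.tolist())  # [1, 4, 4, 16, 1]
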
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 81d5c112885ec..d4ff421d4793a 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -4384,7 +4384,7 @@ def drop_duplicates(self, subset=None, keep='first', inplace=False):
         else:
             return self[-duplicated]

-    def duplicated(self, subset=None, keep='first'):
+    def duplicated(self, subset=None, keep='first', return_inverse=False):
         """
         Return boolean Series denoting duplicate rows, optionally only
         considering certain columns

@@ -4399,14 +4399,156 @@ def duplicated(self, subset=None, keep='first'):
              first occurrence.
            - ``last`` : Mark duplicates as ``True`` except for the last
              occurrence.
-            - False : Mark all duplicates as ``True``.
+            - False : Mark all duplicates as ``True``. This option is not
+              compatible with ``return_inverse``.
+        return_inverse : boolean, default False
+            If True, also return a Series mapping the index of the current
+            DataFrame to the index after deduplication (created e.g. by using
+            `.drop_duplicates` or by selecting everything that is not
+            duplicate). This allows reconstruction of the original DataFrame
+            from the subset of deduplicated (=unique) values, see example
+            below.
+
+            .. versionadded:: 0.24.0

         Returns
         -------
-        duplicated : Series
+        duplicated : Series or tuple of Series if return_inverse is True
+
+        Notes
+        -----
+        The `return_inverse` keyword works as expected for
+        ``keep='first'|'last'``, but cannot be used together with
+        ``keep=False`` (since discarding all duplicates makes it impossible
+        to construct an inverse).
+
+        Examples
+        --------
+        By default, for each set of duplicated values, the first occurrence
+        is set to False and all others to True:
+
+        >>> data = {'species': ['lama', 'cow', 'lama', 'ant', 'lama', 'bee'],
+        ...         'type': ['mammal'] * 3 + ['insect', 'mammal', 'insect']}
+        >>> animals = pd.DataFrame(data, index=[1, 4, 9, 16, 25, 36])
+        >>> animals
+           species    type
+        1     lama  mammal
+        4      cow  mammal
+        9     lama  mammal
+        16     ant  insect
+        25    lama  mammal
+        36     bee  insect
+        >>>
+        >>> animals.duplicated()  # default: keep='first'
+        1     False
+        4     False
+        9      True
+        16    False
+        25     True
+        36    False
+        dtype: bool
+
+        By using `'last'`, the last occurrence of each set of duplicated
+        values is set to False and all others to True:
+
+        >>> animals.duplicated(keep='last')
+        1      True
+        4     False
+        9      True
+        16    False
+        25    False
+        36    False
+        dtype: bool
+
+        By specifying `keep=False`, all duplicates are set to True:
+
+        >>> animals.duplicated(keep=False)
+        1      True
+        4     False
+        9      True
+        16    False
+        25     True
+        36    False
+        dtype: bool
+
+        By specifying the `subset` keyword, the duplicates are calculated
+        based on just the given subset of columns:
+
+        >>> animals.duplicated(subset=['type'])  # default: keep='first'
+        1     False
+        4      True
+        9      True
+        16    False
+        25     True
+        36     True
+        dtype: bool
+
+        Using the keyword `return_inverse=True`, the output becomes a tuple
+        of `Series`:
+
+        >>> isduplicate, inverse = animals.duplicated(return_inverse=True)
+        >>> inverse
+        1      1
+        4      4
+        9      1
+        16    16
+        25     1
+        36    36
+        dtype: int64
+
+        This can be used to reconstruct the original object from its unique
+        elements as follows:
+
+        >>> # same as animals.drop_duplicates()
+        >>> animals_unique = animals.loc[~isduplicate]
+        >>> animals_unique
+           species    type
+        1     lama  mammal
+        4      cow  mammal
+        16     ant  insect
+        36     bee  insect
+        >>>
+        >>> reconstruct = animals_unique.reindex(inverse)
+        >>> reconstruct
+           species    type
+        1     lama  mammal
+        4      cow  mammal
+        1     lama  mammal
+        16     ant  insect
+        1     lama  mammal
+        36     bee  insect
+
+        We see that the values of `animals` get reconstructed correctly, but
+        the index does not match yet -- consequently, the last step is to
+        set the index correctly:
+
+        >>> reconstruct = reconstruct.set_index(inverse.index)
+        >>> reconstruct
+           species    type
+        1     lama  mammal
+        4      cow  mammal
+        9     lama  mammal
+        16     ant  insect
+        25    lama  mammal
+        36     bee  insect
+        >>>
+        >>> reconstruct.equals(animals)
+        True
+
+        See Also
+        --------
+        pandas.Index.duplicated : Equivalent method on pandas.Index
+        pandas.Series.duplicated : Equivalent method on pandas.Series
+        pandas.DataFrame.drop_duplicates : Remove duplicate values
         """
         from pandas.core.sorting import get_group_index
-        from pandas._libs.hashtable import duplicated_int64, _SIZE_HINT_LIMIT
+        from pandas._libs.hashtable import _SIZE_HINT_LIMIT
+        from pandas.core.algorithms import duplicated
+
+        if return_inverse and keep is False:
+            raise ValueError("The parameters return_inverse=True and "
+                             "keep=False cannot be used together (impossible "
+                             "to calculate an inverse when discarding all "
+                             "instances of a duplicate).")

         if self.empty:
             return Series()

@@ -4435,7 +4577,16 @@ def f(vals):
         labels, shape = map(list, zip(*map(f, vals)))

         ids = get_group_index(labels, shape, sort=False, xnull=False)
-        return Series(duplicated_int64(ids, keep), index=self.index)
+        if not return_inverse:
+            return Series(duplicated(ids, keep=keep), index=self.index)
+
+        # return_inverse = True
+        isduplicated_array, inverse_array = duplicated(ids, keep=keep,
+                                                       return_inverse=True)
+        isduplicated = Series(isduplicated_array, index=self.index)
+        inverse = Series(self.loc[~isduplicated_array].index[inverse_array],
+                         index=self.index)
+        return isduplicated, inverse

     # ----------------------------------------------------------------------
     # Sorting
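As the hunks above show, DataFrame.duplicated does not hash full rows directly: it factorizes each column and combines the per-column codes into a single int64 id per row via get_group_index, then defers to algorithms.duplicated on those ids. A rough sketch of the idea, with np.ravel_multi_index standing in for get_group_index (which additionally guards against int64 overflow):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']})

    # per-column integer codes plus the number of distinct codes per column
    # (for simplicity this sketch ignores NaNs, which factorize codes as -1)
    codes, shape = zip(*((pd.factorize(df[col])[0], df[col].nunique())
                         for col in df))

    # combine the codes into one id per row; equal ids <=> equal rows
    ids = np.ravel_multi_index(codes, dims=shape)
    print(pd.Series(ids).duplicated().tolist())
    # [False, False, True, False, True]
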
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index b42bbdafcab45..0a9ad8098f2e7 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -4587,7 +4587,7 @@ def drop_duplicates(self, keep='first'):
         """
         return super(Index, self).drop_duplicates(keep=keep)

-    def duplicated(self, keep='first'):
+    def duplicated(self, keep='first', return_inverse=False):
         """
         Indicate duplicate index values.

@@ -4604,7 +4604,27 @@ def duplicated(self, keep='first'):
              occurrence.
            - 'last' : Mark duplicates as ``True`` except for the last
              occurrence.
-            - ``False`` : Mark all duplicates as ``True``.
+            - ``False`` : Mark all duplicates as ``True``. This option is not
+              compatible with ``return_inverse``.
+        return_inverse : boolean, default False
+            If True, also return the selection of (integer) indices from the
+            Index with unique values (created e.g. by selecting the boolean
+            complement of the first output, or by using `.drop_duplicates`
+            with the same `keep`-parameter). This allows reconstruction of
+            the original Index from the subset of unique values, see example
+            below.
+
+            .. versionadded:: 0.24.0
+
+        Returns
+        -------
+        duplicated : ndarray or tuple of ndarray if return_inverse is True
+
+        Notes
+        -----
+        The `return_inverse` keyword works as expected for
+        ``keep='first'|'last'``, but cannot be used together with
+        ``keep=False`` (since discarding all duplicates makes it impossible
+        to construct an inverse).

         Examples
         --------
@@ -4620,20 +4640,37 @@ def duplicated(self, keep='first'):
         >>> idx.duplicated(keep='first')
         array([False, False,  True, False,  True])

-        By using 'last', the last occurrence of each set of duplicated values
-        is set on False and all others on True:
+        By using `'last'`, the last occurrence of each set of duplicated
+        values is set to False and all others to True:

         >>> idx.duplicated(keep='last')
         array([ True, False,  True, False, False])

-        By setting keep on ``False``, all duplicates are True:
+        By specifying `keep=False`, all duplicates are set to True:

         >>> idx.duplicated(keep=False)
         array([ True, False,  True, False,  True])

-        Returns
-        -------
-        numpy.ndarray
+        Using the keyword `return_inverse=True`, the output becomes a tuple
+        of `np.ndarray`:
+
+        >>> isduplicate, inverse = idx.duplicated(return_inverse=True)
+        >>> inverse
+        array([0, 1, 0, 2, 0], dtype=int64)
+
+        This can be used to reconstruct the original object from its unique
+        elements as follows:
+
+        >>> idx_unique = idx[~isduplicate]  # same as idx.drop_duplicates()
+        >>> idx_unique
+        Index(['lama', 'cow', 'beetle'], dtype='object')
+        >>>
+        >>> reconstruct = idx_unique[inverse]
+        >>> reconstruct
+        Index(['lama', 'cow', 'lama', 'beetle', 'lama'], dtype='object')
+        >>>
+        >>> reconstruct.equals(idx)
+        True

         See Also
         --------
@@ -4641,7 +4678,8 @@ def duplicated(self, keep='first'):
         pandas.DataFrame.duplicated : Equivalent method on pandas.DataFrame
         pandas.Index.drop_duplicates : Remove duplicate values from Index
         """
-        return super(Index, self).duplicated(keep=keep)
+        return super(Index, self).duplicated(keep=keep,
+                                             return_inverse=return_inverse)

     _index_shared_docs['fillna'] = """
         Fill NA/NaN values with the specified value
diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
index 45703c220a4be..28f67b4d21fdb 100644
--- a/pandas/core/indexes/category.py
+++ b/pandas/core/indexes/category.py
@@ -403,10 +403,10 @@ def unique(self, level=None):
                                 ordered=result.ordered)

     @Appender(Index.duplicated.__doc__)
-    def duplicated(self, keep='first'):
-        from pandas._libs.hashtable import duplicated_int64
+    def duplicated(self, keep='first', return_inverse=False):
+        from pandas.core.algorithms import duplicated
         codes = self.codes.astype('i8')
-        return duplicated_int64(codes, keep)
+        return duplicated(codes, keep=keep, return_inverse=return_inverse)

     def _to_safe_for_reshape(self):
         """ convert to object if we are a categorical """
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index ad38f037b6578..a333208a6f632 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -923,14 +923,19 @@ def f(k, stringify):
         return hash_tuple(key)

     @Appender(Index.duplicated.__doc__)
-    def duplicated(self, keep='first'):
+    def duplicated(self, keep='first', return_inverse=False):
         from pandas.core.sorting import get_group_index
-        from pandas._libs.hashtable import duplicated_int64
+        from pandas.core.algorithms import duplicated
+
+        if return_inverse and keep is False:
+            raise ValueError("The parameters return_inverse=True and "
+                             "keep=False cannot be used together (impossible "
+                             "to calculate an inverse when discarding all "
+                             "instances of a duplicate).")

         shape = map(len, self.levels)
         ids = get_group_index(self.labels, shape, sort=False, xnull=False)
-
-        return duplicated_int64(ids, keep)
+        return duplicated(ids, keep=keep, return_inverse=return_inverse)

     def fillna(self, value=None, downcast=None):
         """
diff --git a/pandas/core/series.py b/pandas/core/series.py
index fdb9ef59c1d3e..c5cc536496dd8 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -1585,7 +1585,7 @@ def drop_duplicates(self, keep='first', inplace=False):
         """
         return super(Series, self).drop_duplicates(keep=keep, inplace=inplace)

-    def duplicated(self, keep='first'):
+    def duplicated(self, keep='first', return_inverse=False):
         """
         Indicate duplicate Series values.

@@ -1600,56 +1600,122 @@ def duplicated(self, keep='first'):
              occurrence.
            - 'last' : Mark duplicates as ``True`` except for the last
              occurrence.
-            - ``False`` : Mark all duplicates as ``True``.
+            - ``False`` : Mark all duplicates as ``True``. This option is not
+              compatible with ``return_inverse``.
+        return_inverse : boolean, default False
+            If True, also return a Series that maps the index of the current
+            Series to the index of the subset of unique values (created e.g.
+            by selecting the boolean complement of the first output, or by
+            using `.drop_duplicates` with the same `keep`-parameter). This
+            allows reconstruction of the original Series from the subset of
+            unique values, see example below.
+
+            .. versionadded:: 0.24.0
+
+        Returns
+        -------
+        duplicated : Series or tuple of Series if return_inverse is True
+
+        Notes
+        -----
+        The `return_inverse` keyword works as expected for
+        ``keep='first'|'last'``, but cannot be used together with
+        ``keep=False`` (since discarding all duplicates makes it impossible
+        to construct an inverse).

         Examples
         --------
         By default, for each set of duplicated values, the first occurrence is
         set on False and all others on True:

-        >>> animals = pd.Series(['lama', 'cow', 'lama', 'beetle', 'lama'])
+        >>> animals = pd.Series(['lama', 'cow', 'lama', 'beetle', 'lama'],
+        ...                     index=[1, 4, 9, 16, 25])
         >>> animals.duplicated()
-        0    False
-        1    False
-        2     True
-        3    False
-        4     True
+        1     False
+        4     False
+        9      True
+        16    False
+        25     True
         dtype: bool

         which is equivalent to

         >>> animals.duplicated(keep='first')
-        0    False
-        1    False
-        2     True
-        3    False
-        4     True
+        1     False
+        4     False
+        9      True
+        16    False
+        25     True
         dtype: bool

-        By using 'last', the last occurrence of each set of duplicated values
-        is set on False and all others on True:
+        By using `'last'`, the last occurrence of each set of duplicated
+        values is set to False and all others to True:

         >>> animals.duplicated(keep='last')
-        0     True
-        1    False
-        2     True
-        3    False
-        4    False
+        1      True
+        4     False
+        9      True
+        16    False
+        25    False
         dtype: bool

-        By setting keep on ``False``, all duplicates are True:
+        By specifying `keep=False`, all duplicates are set to True:

         >>> animals.duplicated(keep=False)
-        0     True
-        1    False
-        2     True
-        3    False
-        4     True
+        1      True
+        4     False
+        9      True
+        16    False
+        25     True
         dtype: bool

-        Returns
-        -------
-        pandas.core.series.Series
+        Using the keyword `return_inverse=True`, the output becomes a tuple
+        of `Series`:
+
+        >>> isduplicate, inverse = animals.duplicated(return_inverse=True)
+        >>> inverse
+        1      1
+        4      4
+        9      1
+        16    16
+        25     1
+        dtype: int64
+
+        This can be used to reconstruct the original object from its unique
+        elements as follows:
+
+        >>> # same as animals.drop_duplicates()
+        >>> animals_unique = animals.loc[~isduplicate]
+        >>> animals_unique
+        1       lama
+        4        cow
+        16    beetle
+        dtype: object
+        >>>
+        >>> reconstruct = animals_unique.reindex(inverse)
+        >>> reconstruct
+        1       lama
+        4        cow
+        1       lama
+        16    beetle
+        1       lama
+        dtype: object
+
+        We see that the values of `animals` get reconstructed correctly, but
+        the index does not match yet -- consequently, the last step is to
+        set the index correctly:
+
+        >>> reconstruct.index = inverse.index
+        >>> reconstruct
+        1       lama
+        4        cow
+        9       lama
+        16    beetle
+        25      lama
+        dtype: object
+        >>>
+        >>> reconstruct.equals(animals)
+        True

         See Also
         --------
@@ -1657,7 +1723,8 @@ def duplicated(self, keep='first'):
         pandas.DataFrame.duplicated : Equivalent method on pandas.DataFrame
         pandas.Series.drop_duplicates : Remove duplicate values from Series
         """
-        return super(Series, self).duplicated(keep=keep)
+        return super(Series, self).duplicated(keep=keep,
+                                              return_inverse=return_inverse)

     def idxmin(self, axis=0, skipna=True, *args, **kwargs):
         """
diff --git a/pandas/tests/frame/test_duplicates.py b/pandas/tests/frame/test_duplicates.py
index 3478d66b919a6..9e1521a4cfff6 100644
--- a/pandas/tests/frame/test_duplicates.py
+++ b/pandas/tests/frame/test_duplicates.py
@@ -88,6 +88,70 @@ def test_duplicated_subset(subset, keep):
     tm.assert_series_equal(result, expected)


+@pytest.mark.parametrize('keep, expected_inv_values', [
+    ('first', [1, 4, 4, 16, 1]),
+    ('last', [25, 9, 9, 16, 25])
+])
+def test_duplicated_inverse(keep, expected_inv_values):
+    # GH 21357
+    # check that return_inverse kwarg does not affect outcome;
+    # index of inverse must be correctly transformed as well
+    idx = [1, 4, 9, 16, 25]
+    df = DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']},
+                   index=idx)
+
+    expected_isdup = df.duplicated(keep=keep)
+    expected_inv = Series(expected_inv_values, index=idx)
+    result_isdup, result_inv = df.duplicated(keep=keep,
+                                             return_inverse=True)
+    tm.assert_series_equal(result_isdup, expected_isdup)
+    tm.assert_series_equal(result_inv, expected_inv)
+
+    # test that result_inv works (and fits together with expected_isdup)
+    unique = df.loc[~expected_isdup]
+    reconstr = unique.reindex(result_inv).set_index(result_inv.index)
+    tm.assert_frame_equal(reconstr, df)
+
+
+def test_duplicated_inverse_raises():
+    df = DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']})
+
+    rgx = 'The parameters return_inverse=True and keep=False cannot be.*'
+    with tm.assert_raises_regex(ValueError, rgx):
+        df.duplicated(keep=False, return_inverse=True)
+
+
+@pytest.mark.parametrize('keep', ['first', 'last'])
+@pytest.mark.parametrize('subset', [['A', 'B', 'C'], ['A', 'B'], ['A']])
+def test_duplicated_inverse_large(subset, keep):
+    # unsorted index (through .sample); important to check correct
+    # 'first'/'last' functionality of return_inverse
+    df = DataFrame(np.random.randint(0, 10, (10000, 3)),
+                   columns=list('ABC')).sample(5000)
+
+    expected_isdup = df.duplicated(keep=keep, subset=subset)
+    result_isdup, result_inv = df.duplicated(keep=keep, subset=subset,
+                                             return_inverse=True)
+    tm.assert_series_equal(result_isdup, expected_isdup)
+
+    unique = df.loc[~expected_isdup, subset]
+    reconstr = unique.reindex(result_inv).set_index(result_inv.index)
+    tm.assert_frame_equal(reconstr, df[subset])
+
+
+@pytest.mark.parametrize('keep', ['first', 'last'])
+def test_duplicated_inverse_fastpath(keep):
+    df = DataFrame({'A': range(10)})  # no duplicates
+
+    expected_isdup = df.duplicated(keep=keep)
+    result_isdup, result_inv = df.duplicated(keep=keep,
+                                             return_inverse=True)
+    tm.assert_series_equal(result_isdup, expected_isdup)
+
+    expected_inv = Series(range(10))
+    tm.assert_series_equal(result_inv, expected_inv)
+
+
 def test_drop_duplicates():
     df = DataFrame({'AAA': ['foo', 'bar', 'foo', 'bar',
                             'foo', 'bar', 'bar', 'foo'],
diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py
index 49a247608ab0b..f8a9538f8416c 100644
--- a/pandas/tests/indexes/common.py
+++ b/pandas/tests/indexes/common.py
@@ -383,6 +383,71 @@ def test_duplicated(self, indices, keep):
             result = idx.duplicated(keep=keep)
             tm.assert_numpy_array_equal(result, expected)

+    @pytest.mark.parametrize('keep', ['first', 'last'])
+    def test_duplicated_inverse(self, indices, keep):
+        # GH 21357
+        # check that return_inverse kwarg does not affect outcome
+        if type(indices) is not self._holder:
+            pytest.skip('Can only check if we have the correct type')
+        if not len(indices) or isinstance(indices, MultiIndex):
+            # MultiIndex tested separately in:
+            # tests/indexes/multi/test_unique_and_duplicates
+            pytest.skip('Skip check for empty Index and MultiIndex')
+
+        idx = self._holder(indices)
+        if idx.has_duplicates:
+            # We need to be able to control creation of duplicates here.
+            # This is slightly circular, as drop_duplicates depends on
+            # duplicated, but in the end, it all works out because we
+            # cross-check with Series.duplicated
+            idx = idx.drop_duplicates()
+
+        n, k = len(idx), 10
+        duplicated_selection = np.random.choice(n, k * n)
+        idx = self._holder(idx.values[duplicated_selection])
+
+        expected_isdup = idx.duplicated(keep=keep)
+        result_isdup, result_inv = idx.duplicated(keep=keep,
+                                                  return_inverse=True)
+        tm.assert_numpy_array_equal(result_isdup, expected_isdup)
+
+        # the following tests the correctness of result_inv in two ways:
+        # - it needs to fit together with expected_isdup
+        # - it needs to correctly reconstruct the object
+        unique = idx[~expected_isdup]
+        reconstr = unique[result_inv]
+        tm.assert_index_equal(reconstr, idx)
+
+    def test_duplicated_inverse_raises(self, indices):
+        if type(indices) is not self._holder:
+            pytest.skip('Can only check if we have the correct type')
+
+        rgx = 'The parameters return_inverse=True and keep=False cannot be.*'
+        with tm.assert_raises_regex(ValueError, rgx):
+            self._holder(indices).duplicated(keep=False,
+                                             return_inverse=True)
+
+    @pytest.mark.parametrize('keep', ['first', 'last'])
+    def test_duplicated_inverse_fastpath(self, indices, keep):
+        if type(indices) is not self._holder:
+            pytest.skip('Can only check if we have the correct type')
+        if not len(indices) or isinstance(indices, MultiIndex):
+            # MultiIndex tested separately in:
+            # tests/indexes/multi/test_unique_and_duplicates
+            pytest.skip('Skip check for empty Index and MultiIndex')
+
+        idx = self._holder(indices)
+        if idx.has_duplicates:
+            # fastpath only possible if no duplicates
+            idx = idx.drop_duplicates()
+
+        expected_isdup = idx.duplicated(keep=keep)
+        result_isdup, result_inv = idx.duplicated(keep=keep,
+                                                  return_inverse=True)
+        tm.assert_numpy_array_equal(result_isdup, expected_isdup)
+
+        expected_inv = np.arange(len(idx))
+        tm.assert_numpy_array_equal(result_inv, expected_inv)
+
     def test_unique(self, indices):
         # don't test a MultiIndex here (as its tested separated)
         # don't test a CategoricalIndex because categories change (GH 18291)
diff --git a/pandas/tests/indexes/multi/test_duplicates.py b/pandas/tests/indexes/multi/test_duplicates.py
index 54a12137c9457..7065c65e40127 100644
--- a/pandas/tests/indexes/multi/test_duplicates.py
+++ b/pandas/tests/indexes/multi/test_duplicates.py
@@ -211,29 +211,6 @@ def f(a):
     check(8, True)


-@pytest.mark.parametrize('keep, expected', [
-    ('first', np.array([False, False, False, True, True, False])),
-    ('last', np.array([False, True, True, False, False, False])),
-    (False, np.array([False, True, True, True, True, False]))
-])
-def test_duplicated(idx_dup, keep, expected):
-    result = idx_dup.duplicated(keep=keep)
-    tm.assert_numpy_array_equal(result, expected)
-
-
-@pytest.mark.parametrize('keep', ['first', 'last', False])
-def test_duplicated_large(keep):
-    # GH 9125
-    n, k = 200, 5000
-    levels = [np.arange(n), tm.makeStringIndex(n), 1000 + np.arange(n)]
-    labels = [np.random.choice(n, k * n) for lev in levels]
-    mi = MultiIndex(levels=levels, labels=labels)
-
-    result = mi.duplicated(keep=keep)
-    expected = hashtable.duplicated_object(mi.values, keep=keep)
-    tm.assert_numpy_array_equal(result, expected)
-
-
 def test_get_duplicates():
     # GH5873
     for a in [101, 102]:
@@ -263,3 +240,85 @@ def test_get_duplicates():

     tm.assert_numpy_array_equal(mi.duplicated(),
                                 np.zeros(len(mi), dtype='bool'))
+
+
+@pytest.mark.parametrize('keep, expected', [
+    ('first', np.array([False, False, False, True, True, False])),
+    ('last', np.array([False, True, True, False, False, False])),
+    (False, np.array([False, True, True, True, True, False]))
+])
+def test_duplicated(idx_dup, keep, expected):
+    result = idx_dup.duplicated(keep=keep)
+    tm.assert_numpy_array_equal(result, expected)
+
+
+@pytest.mark.parametrize('keep', ['first', 'last', False])
+def test_duplicated_large(keep):
+    # GH 9125
+    n, k = 200, 5000
+    levels = [np.arange(n), tm.makeStringIndex(n), 1000 + np.arange(n)]
+    labels = [np.random.choice(n, k * n) for lev in levels]
+    mi = MultiIndex(levels=levels, labels=labels)
+
+    result = mi.duplicated(keep=keep)
+    expected = hashtable.duplicated_object(mi.values, keep=keep)
+    tm.assert_numpy_array_equal(result, expected)
+
+
+@pytest.mark.parametrize('keep', ['first', 'last'])
+def test_duplicated_inverse(idx_dup, keep):
+    # GH 21357
+    # check that return_inverse kwarg does not affect outcome;
+    # index of inverse must be correctly transformed as well
+
+    expected_isdup = idx_dup.duplicated(keep=keep)
+    expected_inv = np.array([0, 1, 2, 1, 2, 3], dtype='int64')
+    result_isdup, result_inv = idx_dup.duplicated(keep=keep,
+                                                  return_inverse=True)
+    tm.assert_numpy_array_equal(result_isdup, expected_isdup)
+    tm.assert_numpy_array_equal(result_inv, expected_inv)
+
+    # test that result_inv works (and fits together with expected_isdup)
+    unique = MultiIndex.from_tuples(idx_dup.values[~expected_isdup])
+    reconstr = MultiIndex.from_tuples(unique.values[result_inv],
+                                      names=idx_dup.names)
+    tm.assert_index_equal(reconstr, idx_dup)
+
+
+def test_duplicated_inverse_raises(idx_dup):
+    rgx = 'The parameters return_inverse=True and keep=False cannot be.*'
+    with tm.assert_raises_regex(ValueError, rgx):
+        idx_dup.duplicated(keep=False, return_inverse=True)
+
+
+@pytest.mark.parametrize('keep', ['first', 'last'])
+def test_duplicated_inverse_large(keep):
+    n, k = 200, 5000
+    levels = [np.arange(n), tm.makeStringIndex(n), 1000 + np.arange(n)]
+    labels = [np.random.choice(n, k * n) for lev in levels]
+    mi = MultiIndex(levels=levels, labels=labels)
+
+    expected_isdup = mi.duplicated(keep=keep)
+    result_isdup, result_inv = mi.duplicated(keep=keep,
+                                             return_inverse=True)
+    tm.assert_numpy_array_equal(result_isdup, expected_isdup)
+
+    # test that result_inv works (and fits together with expected_isdup)
+    unique = MultiIndex.from_tuples(mi.values[~expected_isdup])
+    reconstr = MultiIndex.from_tuples(unique.values[result_inv],
+                                      names=mi.names)
+    tm.assert_index_equal(reconstr, mi)
+
+
+@pytest.mark.parametrize('keep', ['first', 'last'])
+def test_duplicated_inverse_fastpath(idx_dup, keep):
+    # fastpath is only taken if there are no duplicates
+    mi = idx_dup.drop_duplicates()
+
+    expected_isdup = mi.duplicated(keep=keep)
+    result_isdup, result_inv = mi.duplicated(keep=keep,
+                                             return_inverse=True)
+    tm.assert_numpy_array_equal(result_isdup, expected_isdup)
+
+    expected_inv = np.arange(4)
+    tm.assert_numpy_array_equal(result_inv, expected_inv)
diff --git a/pandas/tests/series/test_duplicates.py b/pandas/tests/series/test_duplicates.py
index 2e4d64188307c..61a628081702d 100644
--- a/pandas/tests/series/test_duplicates.py
+++ b/pandas/tests/series/test_duplicates.py
@@ -138,3 +138,67 @@ def test_duplicated_nan_none(keep, expected):

     result = s.duplicated(keep=keep)
     tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize('keep, expected_inv_values', [
+    ('first', [1, 4, 4, 16, 1]),
+    ('last', [25, 9, 9, 16, 25])
+])
+def test_duplicated_inverse(keep, expected_inv_values):
+    # GH 21357
+    # check that return_inverse kwarg does not affect outcome;
+    # index of inverse must be correctly transformed as well
+    idx = [1, 4, 9, 16, 25]
+    s = Series(['a', 'b', 'b', 'c', 'a'], index=idx)
+
+    expected_isdup = s.duplicated(keep=keep)
+    expected_inv = Series(expected_inv_values, index=idx)
+    result_isdup, result_inv = s.duplicated(keep=keep,
+                                            return_inverse=True)
+    tm.assert_series_equal(result_isdup, expected_isdup)
+    tm.assert_series_equal(result_inv, expected_inv)
+
+    # test that result_inv works (and fits together with expected_isdup)
+    unique = s.loc[~expected_isdup]
+    reconstr = unique.reindex(result_inv)
+    # Series has no set_index (GH21684)
+    reconstr.index = result_inv.index
+    tm.assert_series_equal(reconstr, s)
+
+
+def test_duplicated_inverse_raises():
+    s = Series(['a', 'b', 'b', 'c', 'a'])
+
+    rgx = 'The parameters return_inverse=True and keep=False cannot be.*'
+    with tm.assert_raises_regex(ValueError, rgx):
+        s.duplicated(keep=False, return_inverse=True)
+
+
+@pytest.mark.parametrize('keep', ['first', 'last'])
+def test_duplicated_inverse_large(keep):
+    # unsorted index important to check 'first'/'last' functionality
+    s = Series(np.random.randint(0, 1000, 10000)).sample(5000)
+
+    expected_isdup = s.duplicated(keep=keep)
+    result_isdup, result_inv = s.duplicated(keep=keep, return_inverse=True)
+    tm.assert_series_equal(result_isdup, expected_isdup)
+
+    # test that result_inv works (and fits together with expected_isdup)
+    unique = s.loc[~expected_isdup]
+    reconstr = unique.reindex(result_inv)
+    # Series has no set_index (GH21684)
+    reconstr.index = result_inv.index
+    tm.assert_series_equal(reconstr, s)
+
+
+@pytest.mark.parametrize('keep', ['first', 'last'])
+def test_duplicated_inverse_fastpath(keep):
+    s = Series(range(10))  # no duplicates
+
+    expected_isdup = s.duplicated(keep=keep)
+    result_isdup, result_inv = s.duplicated(keep=keep,
+                                            return_inverse=True)
+    tm.assert_series_equal(result_isdup, expected_isdup)
+
+    expected_inv = Series(range(10))
+    tm.assert_series_equal(result_inv, expected_inv)
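The tests above all exercise the same round-trip invariant. A quick standalone check of that invariant, as one might run it against a build with this patch applied (a sketch, not part of the test suite):

    import numpy as np
    import pandas as pd

    # dropping duplicates and re-expanding via the inverse must reproduce
    # the original object exactly, for either value of `keep`
    s = pd.Series(np.random.randint(0, 10, 100)).sample(frac=1)  # shuffled index

    for keep in ('first', 'last'):
        isdup, inverse = s.duplicated(keep=keep, return_inverse=True)
        unique = s.loc[~isdup]
        reconstr = unique.reindex(inverse)
        reconstr.index = inverse.index  # Series has no set_index (GH 21684)
        assert reconstr.equals(s)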