Skip to content

Commit 7da9178

Browse files
committed
simplified DataFrame.duplicated a bit
1 parent 1ab0e5f commit 7da9178

File tree

2 files changed

+9
-14
lines changed

2 files changed

+9
-14
lines changed

pandas/core/algorithms.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ def _unique_generic(values, table_type, type_caster):
9595

9696

9797

98-
def factorize(values, sort=False, order=None, na_sentinel=-1):
98+
def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
9999
"""
100100
Encode input values as an enumerated type or categorical variable
101101
@@ -106,8 +106,9 @@ def factorize(values, sort=False, order=None, na_sentinel=-1):
106106
sort : boolean, default False
107107
Sort by values
108108
order : deprecated
109-
na_sentinel: int, default -1
109+
na_sentinel : int, default -1
110110
Value to mark "not found"
111+
size_hint : hint to the hashtable sizer
111112
112113
Returns
113114
-------
@@ -129,7 +130,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1):
129130
is_timedelta = com.is_timedelta64_dtype(vals)
130131
(hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables)
131132

132-
table = hash_klass(len(vals))
133+
table = hash_klass(size_hint or len(vals))
133134
uniques = vec_klass()
134135
labels = table.get_labels(vals, uniques, 0, na_sentinel)
135136

pandas/core/frame.py

+5-11
Original file line numberDiff line numberDiff line change
@@ -2832,18 +2832,12 @@ def duplicated(self, subset=None, take_last=False):
28322832
duplicated : Series
28332833
"""
28342834
from pandas.core.groupby import get_group_index
2835+
from pandas.core.algorithms import factorize
28352836
from pandas.hashtable import duplicated_int64, _SIZE_HINT_LIMIT
28362837

2837-
size_hint = min(len(self), _SIZE_HINT_LIMIT)
2838-
2839-
def factorize(vals):
2840-
(hash_klass, vec_klass), vals = \
2841-
algos._get_data_algo(vals, algos._hashtables)
2842-
2843-
uniques, table = vec_klass(), hash_klass(size_hint)
2844-
labels = table.get_labels(vals, uniques, 0, -1)
2845-
2846-
return labels.astype('i8', copy=False), len(uniques)
2838+
def f(vals):
2839+
labels, shape = factorize(vals, size_hint=min(len(self), _SIZE_HINT_LIMIT))
2840+
return labels.astype('i8',copy=False), len(shape)
28472841

28482842
if subset is None:
28492843
subset = self.columns
@@ -2853,7 +2847,7 @@ def factorize(vals):
28532847
subset = subset,
28542848

28552849
vals = (self[col].values for col in subset)
2856-
labels, shape = map(list, zip( * map(factorize, vals)))
2850+
labels, shape = map(list, zip( * map(f, vals)))
28572851

28582852
ids = get_group_index(labels, shape, sort=False, xnull=False)
28592853
return Series(duplicated_int64(ids, take_last), index=self.index)

0 commit comments

Comments
 (0)