Skip to content

Commit 923e35c

Browse files
committed
simplified DataFrame.duplicated a bit
1 parent 4122098 commit 923e35c

File tree

2 files changed

+9
-14
lines changed

2 files changed

+9
-14
lines changed

pandas/core/algorithms.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ def _unique_generic(values, table_type, type_caster):
9595

9696

9797

98-
def factorize(values, sort=False, order=None, na_sentinel=-1):
98+
def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
9999
"""
100100
Encode input values as an enumerated type or categorical variable
101101
@@ -106,8 +106,9 @@ def factorize(values, sort=False, order=None, na_sentinel=-1):
106106
sort : boolean, default False
107107
Sort by values
108108
order : deprecated
109-
na_sentinel: int, default -1
109+
na_sentinel : int, default -1
110110
Value to mark "not found"
111+
size_hint : hint to the hashtable sizer
111112
112113
Returns
113114
-------
@@ -129,7 +130,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1):
129130
is_timedelta = com.is_timedelta64_dtype(vals)
130131
(hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables)
131132

132-
table = hash_klass(len(vals))
133+
table = hash_klass(size_hint or len(vals))
133134
uniques = vec_klass()
134135
labels = table.get_labels(vals, uniques, 0, na_sentinel)
135136

pandas/core/frame.py

+5-11
Original file line numberDiff line numberDiff line change
@@ -2750,18 +2750,12 @@ def duplicated(self, subset=None, take_last=False):
27502750
duplicated : Series
27512751
"""
27522752
from pandas.core.groupby import get_group_index
2753+
from pandas.core.algorithms import factorize
27532754
from pandas.hashtable import duplicated_int64, _SIZE_HINT_LIMIT
27542755

2755-
size_hint = min(len(self), _SIZE_HINT_LIMIT)
2756-
2757-
def factorize(vals):
2758-
(hash_klass, vec_klass), vals = \
2759-
algos._get_data_algo(vals, algos._hashtables)
2760-
2761-
uniques, table = vec_klass(), hash_klass(size_hint)
2762-
labels = table.get_labels(vals, uniques, 0, -1)
2763-
2764-
return labels.astype('i8', copy=False), len(uniques)
2756+
def f(vals):
2757+
labels, shape = factorize(vals, size_hint=min(len(self), _SIZE_HINT_LIMIT))
2758+
return labels.astype('i8',copy=False), len(shape)
27652759

27662760
if subset is None:
27672761
subset = self.columns
@@ -2771,7 +2765,7 @@ def factorize(vals):
27712765
subset = subset,
27722766

27732767
vals = (self[col].values for col in subset)
2774-
labels, shape = map(list, zip( * map(factorize, vals)))
2768+
labels, shape = map(list, zip( * map(f, vals)))
27752769

27762770
ids = get_group_index(labels, shape, sort=False, xnull=False)
27772771
return Series(duplicated_int64(ids, take_last), index=self.index)

0 commit comments

Comments
 (0)