Skip to content

Commit f122e2e

Browse files
authored
PERF: Reject non-string object arrays faster in factorize (#51921)
1 parent ecc6ead commit f122e2e

File tree

3 files changed

+8
-2
lines changed

3 files changed

+8
-2
lines changed

asv_bench/benchmarks/algorithms.py

+6 -1
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,7 @@ class Factorize:
2323
"uint",
2424
"float",
2525
"object",
26+
"object_str",
2627
"datetime64[ns]",
2728
"datetime64[ns, tz]",
2829
"Int64",
@@ -46,7 +47,8 @@ def setup(self, unique, sort, dtype):
4647
"int": pd.Index(np.arange(N), dtype="int64"),
4748
"uint": pd.Index(np.arange(N), dtype="uint64"),
4849
"float": pd.Index(np.random.randn(N), dtype="float64"),
49-
"object": string_index,
50+
"object_str": string_index,
51+
"object": pd.Index(np.arange(N), dtype="object"),
5052
"datetime64[ns]": pd.date_range("2011-01-01", freq="H", periods=N),
5153
"datetime64[ns, tz]": pd.date_range(
5254
"2011-01-01", freq="H", periods=N, tz="Asia/Tokyo"
@@ -62,6 +64,9 @@ def setup(self, unique, sort, dtype):
6264
def time_factorize(self, unique, sort, dtype):
6365
pd.factorize(self.data, sort=sort)
6466

67+
def peakmem_factorize(self, unique, sort, dtype):
68+
pd.factorize(self.data, sort=sort)
69+
6570

6671
class Duplicated:
6772
params = [

doc/source/whatsnew/v2.1.0.rst

+1
Original file line number | Diff line number | Diff line change
@@ -117,6 +117,7 @@ Performance improvements
117117
- Performance improvement when parsing strings to ``boolean[pyarrow]`` dtype (:issue:`51730`)
118118
- Performance improvement when searching an :class:`Index` sliced from other indexes (:issue:`51738`)
119119
- Performance improvement in :meth:`Series.combine_first` (:issue:`51777`)
120+
- Performance improvement in :func:`factorize` for object columns not containing strings (:issue:`51921`)
120121

121122
.. ---------------------------------------------------------------------------
122123
.. _whatsnew_210.bug_fixes:

pandas/core/algorithms.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -293,7 +293,7 @@ def _check_object_for_strings(values: np.ndarray) -> str:
293293
# it's cheaper to use a String Hash Table than Object; we infer
294294
# including nulls because that is the only difference between
295295
# StringHashTable and ObjectHashtable
296-
if lib.infer_dtype(values, skipna=False) in ["string"]:
296+
if lib.is_string_array(values, skipna=False):
297297
ndtype = "string"
298298
return ndtype
299299

0 commit comments

Comments
 (0)