Skip to content

Commit ed77ca3

Browse files
committed
PERF: Reject non-string object arrays faster in factorize
1 parent 6169cba commit ed77ca3

File tree

3 files changed

+8
-2
lines changed

3 files changed

+8
-2
lines changed

asv_bench/benchmarks/algorithms.py

+6-1
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,7 @@ class Factorize:
2323
"uint",
2424
"float",
2525
"object",
26+
"object_str",
2627
"datetime64[ns]",
2728
"datetime64[ns, tz]",
2829
"Int64",
@@ -46,7 +47,8 @@ def setup(self, unique, sort, dtype):
4647
"int": pd.Index(np.arange(N), dtype="int64"),
4748
"uint": pd.Index(np.arange(N), dtype="uint64"),
4849
"float": pd.Index(np.random.randn(N), dtype="float64"),
49-
"object": string_index,
50+
"object_str": string_index,
51+
"object": pd.Index(np.arange(N), dtype="object"),
5052
"datetime64[ns]": pd.date_range("2011-01-01", freq="H", periods=N),
5153
"datetime64[ns, tz]": pd.date_range(
5254
"2011-01-01", freq="H", periods=N, tz="Asia/Tokyo"
@@ -62,6 +64,9 @@ def setup(self, unique, sort, dtype):
6264
def time_factorize(self, unique, sort, dtype):
6365
pd.factorize(self.data, sort=sort)
6466

67+
def peakmem_factorize(self, unique, sort, dtype):
68+
pd.factorize(self.data, sort=sort)
69+
6570

6671
class Duplicated:
6772
params = [

doc/source/whatsnew/v2.1.0.rst

+1
Original file line number | Diff line number | Diff line change
@@ -116,6 +116,7 @@ Performance improvements
116116
- Performance improvement when parsing strings to ``boolean[pyarrow]`` dtype (:issue:`51730`)
117117
- Performance improvement when searching an :class:`Index` sliced from other indexes (:issue:`51738`)
118118
- Performance improvement in :meth:`Series.combine_first` (:issue:`51777`)
119+
- Performance improvement in :func:`factorize` for object columns not containing strings (:issue:`51921`)
119120

120121
.. ---------------------------------------------------------------------------
121122
.. _whatsnew_210.bug_fixes:

pandas/core/algorithms.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -292,7 +292,7 @@ def _check_object_for_strings(values: np.ndarray) -> str:
292292
# it's cheaper to use a String Hash Table than Object; we infer
293293
# including nulls because that is the only difference between
294294
# StringHashTable and ObjectHashtable
295-
if lib.infer_dtype(values, skipna=False) in ["string"]:
295+
if lib.is_string_array(values, skipna=False):
296296
ndtype = "string"
297297
return ndtype
298298

0 commit comments

Comments
 (0)