Skip to content

Commit 150cf99

Browse files
authored
PERF: avoid unnecessary copies in algorithms (#46109)
1 parent 3b946d3 commit 150cf99

File tree

3 files changed

+13 -26 lines changed

3 files changed

+13 -26 lines changed

pandas/core/algorithms.py

+9 -22
Original file line number | Diff line number | Diff line change
@@ -294,25 +294,6 @@ def _get_hashtable_algo(values: np.ndarray):
294294
return htable, values
295295

296296

297-
def _get_values_for_rank(values: ArrayLike) -> np.ndarray:
298-
299-
values = _ensure_data(values)
300-
if values.dtype.kind in ["i", "u", "f"]:
301-
# rank_t includes only object, int64, uint64, float64
302-
dtype = values.dtype.kind + "8"
303-
values = values.astype(dtype, copy=False)
304-
return values
305-
306-
307-
def _get_data_algo(values: ArrayLike):
308-
values = _get_values_for_rank(values)
309-
310-
ndtype = _check_object_for_strings(values)
311-
htable = _hashtables.get(ndtype, _hashtables["object"])
312-
313-
return htable, values
314-
315-
316297
def _check_object_for_strings(values: np.ndarray) -> str:
317298
"""
318299
Check if we can use string hashtable instead of object hashtable.
@@ -562,7 +543,7 @@ def factorize_array(
562543
codes : ndarray[np.intp]
563544
uniques : ndarray
564545
"""
565-
hash_klass, values = _get_data_algo(values)
546+
hash_klass, values = _get_hashtable_algo(values)
566547

567548
table = hash_klass(size_hint or len(values))
568549
uniques, codes = table.factorize(
@@ -1020,7 +1001,13 @@ def rank(
10201001
(e.g. 1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1).
10211002
"""
10221003
is_datetimelike = needs_i8_conversion(values.dtype)
1023-
values = _get_values_for_rank(values)
1004+
values = _ensure_data(values)
1005+
1006+
if values.dtype.kind in ["i", "u", "f"]:
1007+
# rank_t includes only object, int64, uint64, float64
1008+
dtype = values.dtype.kind + "8"
1009+
values = values.astype(dtype, copy=False)
1010+
10241011
if values.ndim == 1:
10251012
ranks = algos.rank_1d(
10261013
values,
@@ -1765,7 +1752,7 @@ def safe_sort(
17651752

17661753
if sorter is None:
17671754
# mixed types
1768-
hash_klass, values = _get_data_algo(values)
1755+
hash_klass, values = _get_hashtable_algo(values)
17691756
t = hash_klass(len(values))
17701757
t.map_locations(values)
17711758
sorter = ensure_platform_int(t.lookup(ordered))

pandas/core/arrays/categorical.py

+3 -3
Original file line number | Diff line number | Diff line change
@@ -2315,9 +2315,9 @@ def _values_for_factorize(self):
23152315

23162316
@classmethod
23172317
def _from_factorized(cls, uniques, original):
2318-
return original._constructor(
2319-
original.categories.take(uniques), dtype=original.dtype
2320-
)
2318+
# ensure we have the same itemsize for codes
2319+
codes = coerce_indexer_dtype(uniques, original.dtype.categories)
2320+
return original._from_backing_data(codes)
23212321

23222322
def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray:
23232323
# make sure we have correct itemsize for resulting codes

pandas/core/arrays/numpy_.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -111,7 +111,7 @@ def _from_sequence(
111111

112112
@classmethod
113113
def _from_factorized(cls, values, original) -> PandasArray:
114-
return cls(values)
114+
return original._from_backing_data(values)
115115

116116
def _from_backing_data(self, arr: np.ndarray) -> PandasArray:
117117
return type(self)(arr)

0 commit comments

Comments
 (0)