Skip to content

Commit 4c3af48

Browse files
PERF: fix merging on datetimelike columns to not use object-dtype factorizer (#53231)
* PERF: fix merging on datetimelike columns to not use object-dtype factorizer
* don't try to match resos for Arrow extension array
* fix typing
* try fix typing
* update dtypes check
* add whatsnew
1 parent 11b297c commit 4c3af48

File tree

3 files changed

+43
-1
lines changed

3 files changed

+43
-1
lines changed

asv_bench/benchmarks/join_merge.py

+32
Original file line numberDiff line numberDiff line change
@@ -324,6 +324,38 @@ def time_i8merge(self, how):
324324
merge(self.left, self.right, how=how)
325325

326326

327+
class MergeDatetime:
328+
params = [
329+
[
330+
("ns", "ns"),
331+
("ms", "ms"),
332+
("ns", "ms"),
333+
],
334+
[None, "Europe/Brussels"],
335+
]
336+
param_names = ["units", "tz"]
337+
338+
def setup(self, units, tz):
339+
unit_left, unit_right = units
340+
N = 10_000
341+
keys = Series(date_range("2012-01-01", freq="T", periods=N, tz=tz))
342+
self.left = DataFrame(
343+
{
344+
"key": keys.sample(N * 10, replace=True).dt.as_unit(unit_left),
345+
"value1": np.random.randn(N * 10),
346+
}
347+
)
348+
self.right = DataFrame(
349+
{
350+
"key": keys[:8000].dt.as_unit(unit_right),
351+
"value2": np.random.randn(8000),
352+
}
353+
)
354+
355+
def time_merge(self, units, tz):
356+
merge(self.left, self.right)
357+
358+
327359
class MergeCategoricals:
328360
def setup(self):
329361
self.left_object = DataFrame(

doc/source/whatsnew/v2.0.3.rst

+1
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ including other versions of pandas.
1313

1414
Fixed regressions
1515
~~~~~~~~~~~~~~~~~
16+
- Fixed performance regression in merging on datetime-like columns (:issue:`53231`)
1617
-
1718

1819
.. ---------------------------------------------------------------------------

pandas/core/reshape/merge.py

+10-1
Original file line numberDiff line numberDiff line change
@@ -2358,7 +2358,9 @@ def _factorize_keys(
23582358
rk = extract_array(rk, extract_numpy=True, extract_range=True)
23592359
# TODO: if either is a RangeIndex, we can likely factorize more efficiently?
23602360

2361-
if isinstance(lk.dtype, DatetimeTZDtype) and isinstance(rk.dtype, DatetimeTZDtype):
2361+
if (
2362+
isinstance(lk.dtype, DatetimeTZDtype) and isinstance(rk.dtype, DatetimeTZDtype)
2363+
) or (lib.is_np_dtype(lk.dtype, "M") and lib.is_np_dtype(rk.dtype, "M")):
23622364
# Extract the ndarray (UTC-localized) values
23632365
# Note: we dont need the dtypes to match, as these can still be compared
23642366
lk, rk = cast("DatetimeArray", lk)._ensure_matching_resos(rk)
@@ -2391,6 +2393,13 @@ def _factorize_keys(
23912393
# "_values_for_factorize"
23922394
rk, _ = rk._values_for_factorize() # type: ignore[union-attr]
23932395

2396+
if needs_i8_conversion(lk.dtype) and lk.dtype == rk.dtype:
2397+
# GH#23917 TODO: Needs tests for non-matching dtypes
2398+
# GH#23917 TODO: needs tests for case where lk is integer-dtype
2399+
# and rk is datetime-dtype
2400+
lk = np.asarray(lk, dtype=np.int64)
2401+
rk = np.asarray(rk, dtype=np.int64)
2402+
23942403
klass, lk, rk = _convert_arrays_and_get_rizer_klass(lk, rk)
23952404

23962405
rizer = klass(max(len(lk), len(rk)))

0 commit comments

Comments
 (0)