From 2b96945c08578301bd8ed54bfb30f69ddf66f188 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 12 Aug 2023 09:32:34 +0200 Subject: [PATCH 1/5] Speed up StringDtype arrow implementation --- pandas/core/reshape/merge.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 6987a0ac7bf6b..3015b3ab448a7 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -76,6 +76,7 @@ na_value_for_dtype, ) +import pandas as pd from pandas import ( ArrowDtype, Categorical, @@ -2407,13 +2408,20 @@ def _factorize_keys( or is_string_dtype(lk.dtype) and not sort ) + or is_string_dtype(lk.dtype) + and lk.dtype.storage == "pyarrow" ): lk, _ = lk._values_for_factorize() # error: Item "ndarray" of "Union[Any, ndarray]" has no attribute # "_values_for_factorize" rk, _ = rk._values_for_factorize() # type: ignore[union-attr] - elif isinstance(lk.dtype, ArrowDtype) and is_string_dtype(lk.dtype): + elif ( + isinstance(lk.dtype, ArrowDtype) + and is_string_dtype(lk.dtype) + or isinstance(lk.dtype, pd.StringDtype) + and lk.dtype.storage == "pyarrow" + ): import pyarrow as pa import pyarrow.compute as pc From c05918e283f462d4f9ecbaf2a4283c3b619cebcb Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 12 Aug 2023 10:30:04 +0200 Subject: [PATCH 2/5] Fixups --- pandas/core/reshape/merge.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 3015b3ab448a7..303b311f6dd7d 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -76,7 +76,6 @@ na_value_for_dtype, ) -import pandas as pd from pandas import ( ArrowDtype, Categorical, @@ -2408,19 +2407,15 @@ def _factorize_keys( or is_string_dtype(lk.dtype) and not sort ) - or is_string_dtype(lk.dtype) - and lk.dtype.storage == "pyarrow" + or (is_string_dtype(lk.dtype) and lk.dtype.storage == "pyarrow") ): lk, _ = lk._values_for_factorize() # error: Item "ndarray" of "Union[Any, ndarray]" has no attribute # "_values_for_factorize" rk, _ = rk._values_for_factorize() # type: ignore[union-attr] - elif ( - isinstance(lk.dtype, ArrowDtype) - and is_string_dtype(lk.dtype) - or isinstance(lk.dtype, pd.StringDtype) - and lk.dtype.storage == "pyarrow" + elif (isinstance(lk.dtype, ArrowDtype) and is_string_dtype(lk.dtype)) or ( + is_string_dtype(lk.dtype) and lk.dtype.storage == "pyarrow" ): import pyarrow as pa import pyarrow.compute as pc From 97cbdbb67353f8ee14567426379fe89c3b34fd32 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 12 Aug 2023 11:25:49 +0200 Subject: [PATCH 3/5] Fixups --- pandas/core/reshape/merge.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 303b311f6dd7d..01ccda8a661d1 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2407,7 +2407,7 @@ def _factorize_keys( or is_string_dtype(lk.dtype) and not sort ) - or (is_string_dtype(lk.dtype) and lk.dtype.storage == "pyarrow") + or (is_string_dtype(lk.dtype) and lk.dtype.storage == "pyarrow") # type: ignore[attr-defined] # noqa: E501 ): lk, _ = lk._values_for_factorize() @@ -2415,7 +2415,7 @@ def _factorize_keys( # "_values_for_factorize" rk, _ = rk._values_for_factorize() # type: ignore[union-attr] elif (isinstance(lk.dtype, ArrowDtype) and is_string_dtype(lk.dtype)) or ( - is_string_dtype(lk.dtype) and lk.dtype.storage == "pyarrow" + is_string_dtype(lk.dtype) and lk.dtype.storage == "pyarrow" # type: ignore[attr-defined] # noqa: E501 ): import pyarrow as pa import pyarrow.compute as pc From f592c48b2af8f12582cc985d2883b910882502ca Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 16 Aug 2023 18:56:48 +0200 Subject: [PATCH 4/5] Update --- pandas/core/reshape/merge.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 01ccda8a661d1..bceb8d1481ba1 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -90,6 +90,7 @@ ExtensionArray, ) from pandas.core.arrays._mixins import NDArrayBackedExtensionArray +from pandas.core.arrays.string_ import StringDtype import pandas.core.common as com from pandas.core.construction import ( ensure_wrapped_if_datetimelike, @@ -2407,7 +2408,7 @@ def _factorize_keys( or is_string_dtype(lk.dtype) and not sort ) - or (is_string_dtype(lk.dtype) and lk.dtype.storage == "pyarrow") # type: ignore[attr-defined] # noqa: E501 + or (isinstance(lk.dtype, StringDtype) and lk.dtype.storage == "pyarrow") ): lk, _ = lk._values_for_factorize() @@ -2415,7 +2416,7 @@ def _factorize_keys( # "_values_for_factorize" rk, _ = rk._values_for_factorize() # type: ignore[union-attr] elif (isinstance(lk.dtype, ArrowDtype) and is_string_dtype(lk.dtype)) or ( - is_string_dtype(lk.dtype) and lk.dtype.storage == "pyarrow" # type: ignore[attr-defined] # noqa: E501 + isinstance(lk.dtype, StringDtype) and lk.dtype.storage == "pyarrow" ): import pyarrow as pa import pyarrow.compute as pc From b0b9962852cddc54eda276728447ff5abb305151 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 21 Aug 2023 22:21:45 +0200 Subject: [PATCH 5/5] Update --- pandas/core/reshape/merge.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index bceb8d1481ba1..c2cb9d643ca87 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2400,22 +2400,7 @@ def _factorize_keys( rk = ensure_int64(rk.codes) elif isinstance(lk, ExtensionArray) and lk.dtype == rk.dtype: - if not isinstance(lk, BaseMaskedArray) and not ( - # exclude arrow dtypes that would get cast to object - isinstance(lk.dtype, ArrowDtype) - and ( - is_numeric_dtype(lk.dtype.numpy_dtype) - or is_string_dtype(lk.dtype) - and not sort - ) - or (isinstance(lk.dtype, StringDtype) and lk.dtype.storage == "pyarrow") - ): - lk, _ = lk._values_for_factorize() - - # error: Item "ndarray" of "Union[Any, ndarray]" has no attribute - # "_values_for_factorize" - rk, _ = rk._values_for_factorize() # type: ignore[union-attr] - elif (isinstance(lk.dtype, ArrowDtype) and is_string_dtype(lk.dtype)) or ( + if (isinstance(lk.dtype, ArrowDtype) and is_string_dtype(lk.dtype)) or ( isinstance(lk.dtype, StringDtype) and lk.dtype.storage == "pyarrow" ): import pyarrow as pa @@ -2440,6 +2425,21 @@ def _factorize_keys( return rlab, llab, count return llab, rlab, count + if not isinstance(lk, BaseMaskedArray) and not ( + # exclude arrow dtypes that would get cast to object + isinstance(lk.dtype, ArrowDtype) + and ( + is_numeric_dtype(lk.dtype.numpy_dtype) + or is_string_dtype(lk.dtype) + and not sort + ) + ): + lk, _ = lk._values_for_factorize() + + # error: Item "ndarray" of "Union[Any, ndarray]" has no attribute + # "_values_for_factorize" + rk, _ = rk._values_for_factorize() # type: ignore[union-attr] + if needs_i8_conversion(lk.dtype) and lk.dtype == rk.dtype: # GH#23917 TODO: Needs tests for non-matching dtypes # GH#23917 TODO: needs tests for case where lk is integer-dtype