From 1aa9150a48fa370f878d6242e9b00b7488fefc29 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 11 Aug 2023 15:08:11 +0200 Subject: [PATCH 1/3] Speed up string inference in maybe_convert_objects --- pandas/_libs/lib.pyx | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 38695fbb8222b..0df739462b99d 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2680,14 +2680,13 @@ def maybe_convert_objects(ndarray[object] objects, seen.object_ = True elif seen.str_: - if is_string_array(objects, skipna=True): - if using_pyarrow_string_dtype(): - import pyarrow as pa + if using_pyarrow_string_dtype() and is_string_array(objects, skipna=True): + import pyarrow as pa - from pandas.core.dtypes.dtypes import ArrowDtype + from pandas.core.dtypes.dtypes import ArrowDtype - dtype = ArrowDtype(pa.string()) - return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) + dtype = ArrowDtype(pa.string()) + return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) seen.object_ = True elif seen.interval_: From 81d9fb5777bdc8d403f3c97309c27dc67eaae3ff Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 12 Aug 2023 09:32:34 +0200 Subject: [PATCH 2/3] Speed up StringDtype arrow implementation --- pandas/core/reshape/merge.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 6987a0ac7bf6b..3015b3ab448a7 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -76,6 +76,7 @@ na_value_for_dtype, ) +import pandas as pd from pandas import ( ArrowDtype, Categorical, @@ -2407,13 +2408,20 @@ def _factorize_keys( or is_string_dtype(lk.dtype) and not sort ) + or is_string_dtype(lk.dtype) + and lk.dtype.storage == "pyarrow" ): lk, _ = lk._values_for_factorize() # error: Item "ndarray" of "Union[Any, ndarray]" has no attribute # "_values_for_factorize" rk, _ = rk._values_for_factorize() # type: ignore[union-attr] - elif isinstance(lk.dtype, ArrowDtype) and is_string_dtype(lk.dtype): + elif ( + isinstance(lk.dtype, ArrowDtype) + and is_string_dtype(lk.dtype) + or isinstance(lk.dtype, pd.StringDtype) + and lk.dtype.storage == "pyarrow" + ): import pyarrow as pa import pyarrow.compute as pc From 4115867aca66312fa6a32edf8581ad1279e170d9 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 12 Aug 2023 09:34:52 +0200 Subject: [PATCH 3/3] Revert "Speed up StringDtype arrow implementation" This reverts commit 81d9fb5777bdc8d403f3c97309c27dc67eaae3ff. --- pandas/core/reshape/merge.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 3015b3ab448a7..6987a0ac7bf6b 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -76,7 +76,6 @@ na_value_for_dtype, ) -import pandas as pd from pandas import ( ArrowDtype, Categorical, @@ -2408,20 +2407,13 @@ def _factorize_keys( or is_string_dtype(lk.dtype) and not sort ) - or is_string_dtype(lk.dtype) - and lk.dtype.storage == "pyarrow" ): lk, _ = lk._values_for_factorize() # error: Item "ndarray" of "Union[Any, ndarray]" has no attribute # "_values_for_factorize" rk, _ = rk._values_for_factorize() # type: ignore[union-attr] - elif ( - isinstance(lk.dtype, ArrowDtype) - and is_string_dtype(lk.dtype) - or isinstance(lk.dtype, pd.StringDtype) - and lk.dtype.storage == "pyarrow" - ): + elif isinstance(lk.dtype, ArrowDtype) and is_string_dtype(lk.dtype): import pyarrow as pa import pyarrow.compute as pc