From 505fcd622aaad41a992fe1fe794fd90f2f9d10b9 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Tue, 13 Jun 2023 07:32:02 -0400 Subject: [PATCH 1/2] Series.str.join to support ArrowDtype(pa.string()) --- doc/source/whatsnew/v2.1.0.rst | 1 + pandas/core/arrays/arrow/array.py | 7 ++++++- pandas/tests/extension/test_arrow.py | 7 +++++++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index ba5334b2f4fa8..5a3ddd66c8b82 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -109,6 +109,7 @@ Other enhancements - Many read/to_* functions, such as :meth:`DataFrame.to_pickle` and :func:`read_csv`, support forwarding compression arguments to lzma.LZMAFile (:issue:`52979`) - Performance improvement in :func:`concat` with homogeneous ``np.float64`` or ``np.float32`` dtypes (:issue:`52685`) - Performance improvement in :meth:`DataFrame.filter` when ``items`` is given (:issue:`52941`) +- :meth:`Series.str.join` now supports ``ArrowDtype(pa.string())`` (:issue:`#####`) .. --------------------------------------------------------------------------- .. _whatsnew_210.notable_bug_fixes: diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 817d5d0932744..601b418296e7f 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2073,7 +2073,12 @@ def _str_get(self, i: int): return type(self)(result) def _str_join(self, sep: str): - return type(self)(pc.binary_join(self._pa_array, sep)) + if pa.types.is_string(self._pa_array.type): + result = self._apply_elementwise(list) + result = pa.chunked_array(result, type=pa.list_(pa.string())) + else: + result = self._pa_array + return type(self)(pc.binary_join(result, sep)) def _str_partition(self, sep: str, expand: bool): predicate = lambda val: val.partition(sep) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 8625500a83e79..05d6e0e8722c5 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -2026,6 +2026,13 @@ def test_str_join(): tm.assert_series_equal(result, expected) +def test_str_join_string_type(): + ser = pd.Series(ArrowExtensionArray(pa.array(["abc", "123", None]))) + result = ser.str.join("=") + expected = pd.Series(["a=b=c", "1=2=3", None], dtype=ArrowDtype(pa.string())) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( "start, stop, step, exp", [ From 7dc086084b10d687aea7358a8251d12f0afd146c Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Tue, 13 Jun 2023 07:36:09 -0400 Subject: [PATCH 2/2] gh ref --- doc/source/whatsnew/v2.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 5a3ddd66c8b82..b37ef3e1bec0d 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -102,6 +102,7 @@ Other enhancements - :meth:`DataFrame.stack` gained the ``sort`` keyword to dictate whether the resulting :class:`MultiIndex` levels are sorted (:issue:`15105`) - :meth:`DataFrame.unstack` gained the ``sort`` keyword to dictate whether the resulting :class:`MultiIndex` levels are sorted (:issue:`15105`) - :meth:`DataFrameGroupby.agg` and :meth:`DataFrameGroupby.transform` now support grouping by multiple keys when the index is not a :class:`MultiIndex` for ``engine="numba"`` (:issue:`53486`) +- :meth:`Series.str.join` now supports ``ArrowDtype(pa.string())`` (:issue:`53646`) - :meth:`SeriesGroupby.agg` and :meth:`DataFrameGroupby.agg` now support passing in multiple functions for ``engine="numba"`` (:issue:`53486`) - :meth:`SeriesGroupby.transform` and :meth:`DataFrameGroupby.transform` now support passing in a string as the function for ``engine="numba"`` (:issue:`53579`) - Added ``engine_kwargs`` parameter to :meth:`DataFrame.to_excel` (:issue:`53220`) @@ -109,7 +110,6 @@ Other enhancements - Many read/to_* functions, such as :meth:`DataFrame.to_pickle` and :func:`read_csv`, support forwarding compression arguments to lzma.LZMAFile (:issue:`52979`) - Performance improvement in :func:`concat` with homogeneous ``np.float64`` or ``np.float32`` dtypes (:issue:`52685`) - Performance improvement in :meth:`DataFrame.filter` when ``items`` is given (:issue:`52941`) -- :meth:`Series.str.join` now supports ``ArrowDtype(pa.string())`` (:issue:`#####`) .. --------------------------------------------------------------------------- .. _whatsnew_210.notable_bug_fixes: