From 8381b24f615d6f0ad70f9708aa096fb38cd95ba1 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 2 Mar 2023 11:16:15 +0100 Subject: [PATCH 1/7] BUG: indexing empty pyarrow backed object returning corrupt object --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/_testing/__init__.py | 1 + pandas/core/arrays/arrow/array.py | 2 +- pandas/tests/reshape/concat/test_series.py | 9 +++++++++ 4 files changed, 12 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index cfa3c5c54b953..40c87df7f3a1b 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -1247,6 +1247,7 @@ Indexing - Bug in :meth:`DataFrame.compare` does not recognize differences when comparing ``NA`` with value in nullable dtypes (:issue:`48939`) - Bug in :meth:`Series.rename` with :class:`MultiIndex` losing extension array dtypes (:issue:`21055`) - Bug in :meth:`DataFrame.isetitem` coercing extension array dtypes in :class:`DataFrame` to object (:issue:`49922`) +- Bug in :meth:`Series.__getitem__` returning corrupt object when selecting from an empty pyarrow backed object (:issue:`51734`) - Bug in :class:`BusinessHour` would cause creation of :class:`DatetimeIndex` to fail when no opening hour was included in the index (:issue:`49835`) Missing diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 69ca809e4f498..f9add5c2c5d88 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -252,6 +252,7 @@ else: FLOAT_PYARROW_DTYPES_STR_REPR = [] ALL_INT_PYARROW_DTYPES_STR_REPR = [] + ALL_PYARROW_DTYPES = [] EMPTY_STRING_PATTERN = re.compile("^$") diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index bb4bdae188fd2..c3fac3b63ba65 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -349,7 +349,7 @@ def __getitem__(self, item: PositionalIndexer): pa_dtype = pa.string() else: pa_dtype = self._dtype.pyarrow_dtype - return 
type(self)(pa.chunked_array([], type=pa_dtype)) + return type(self)(pa.array([], type=pa_dtype, from_pandas=True)) elif is_integer_dtype(item.dtype): return self.take(item) elif is_bool_dtype(item.dtype): diff --git a/pandas/tests/reshape/concat/test_series.py b/pandas/tests/reshape/concat/test_series.py index c5d3a8a7c74d1..f4fcb9a60fe39 100644 --- a/pandas/tests/reshape/concat/test_series.py +++ b/pandas/tests/reshape/concat/test_series.py @@ -2,6 +2,7 @@ import pytest from pandas import ( + ArrowDtype, DataFrame, DatetimeIndex, Index, @@ -151,3 +152,11 @@ def test_concat_series_length_one_reversed(self, frame_or_series): obj = frame_or_series([100]) result = concat([obj.iloc[::-1]]) tm.assert_equal(result, obj) + + @pytest.mark.parametrize("dtype", tm.ALL_PYARROW_DTYPES) + def test_concat_empty_arrow_backed_series(self, dtype): + # GH#51734 + ser = Series([], dtype=ArrowDtype(dtype)) + expected = ser.copy() + result = concat([ser[np.array([], dtype=np.bool_)]]) + tm.assert_series_equal(result, expected) From 8d0abf874899cc20de77381a1f7d0626333c9e80 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 2 Mar 2023 11:18:44 +0100 Subject: [PATCH 2/7] remove from pandas --- pandas/core/arrays/arrow/array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index c3fac3b63ba65..bdfd07b1b501d 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -349,7 +349,7 @@ def __getitem__(self, item: PositionalIndexer): pa_dtype = pa.string() else: pa_dtype = self._dtype.pyarrow_dtype - return type(self)(pa.array([], type=pa_dtype, from_pandas=True)) + return type(self)(pa.array([], type=pa_dtype)) elif is_integer_dtype(item.dtype): return self.take(item) elif is_bool_dtype(item.dtype): From 198c175655c0864adea878837c1cac145df28930 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 2 Mar 2023 14:51:18 +0100 Subject: [PATCH 3/7] Fix no pyarrow 
installed error --- pandas/_testing/__init__.py | 1 - pandas/tests/reshape/concat/test_series.py | 3 +++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index f2f0aaffcd6b5..a82a00d3de70a 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -253,7 +253,6 @@ else: FLOAT_PYARROW_DTYPES_STR_REPR = [] ALL_INT_PYARROW_DTYPES_STR_REPR = [] - ALL_PYARROW_DTYPES = [] EMPTY_STRING_PATTERN = re.compile("^$") diff --git a/pandas/tests/reshape/concat/test_series.py b/pandas/tests/reshape/concat/test_series.py index f4fcb9a60fe39..309212fa68285 100644 --- a/pandas/tests/reshape/concat/test_series.py +++ b/pandas/tests/reshape/concat/test_series.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import ( ArrowDtype, DataFrame, @@ -153,6 +155,7 @@ def test_concat_series_length_one_reversed(self, frame_or_series): result = concat([obj.iloc[::-1]]) tm.assert_equal(result, obj) + @td.skip_if_no("pyarrow", min_version="7.0.0") @pytest.mark.parametrize("dtype", tm.ALL_PYARROW_DTYPES) def test_concat_empty_arrow_backed_series(self, dtype): # GH#51734 From 7b6dbca6a704703ff050fa3a3722cde52d077fe7 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 2 Mar 2023 15:24:04 +0100 Subject: [PATCH 4/7] Fix --- pandas/_testing/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index a82a00d3de70a..f2f0aaffcd6b5 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -253,6 +253,7 @@ else: FLOAT_PYARROW_DTYPES_STR_REPR = [] ALL_INT_PYARROW_DTYPES_STR_REPR = [] + ALL_PYARROW_DTYPES = [] EMPTY_STRING_PATTERN = re.compile("^$") From c4d580ff6222669d3308d2c5ff1d8cc7bbb2b48b Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 2 Mar 2023 23:47:11 +0100 Subject: [PATCH 5/7] Move test --- pandas/tests/extension/test_arrow.py | 8 ++++++++ 
pandas/tests/reshape/concat/test_series.py | 12 ------------ 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index f9af3a3063386..068c8b6adf451 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -2156,3 +2156,11 @@ def test_boolean_reduce_series_all_null(all_boolean_reductions, skipna): else: expected = pd.NA assert result is expected + + +def test_concat_empty_arrow_backed_series(dtype): + # GH#51734 + ser = pd.Series([], dtype=dtype) + expected = ser.copy() + result = pd.concat([ser[np.array([], dtype=np.bool_)]]) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/reshape/concat/test_series.py b/pandas/tests/reshape/concat/test_series.py index 309212fa68285..c5d3a8a7c74d1 100644 --- a/pandas/tests/reshape/concat/test_series.py +++ b/pandas/tests/reshape/concat/test_series.py @@ -1,10 +1,7 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - from pandas import ( - ArrowDtype, DataFrame, DatetimeIndex, Index, @@ -154,12 +151,3 @@ def test_concat_series_length_one_reversed(self, frame_or_series): obj = frame_or_series([100]) result = concat([obj.iloc[::-1]]) tm.assert_equal(result, obj) - - @td.skip_if_no("pyarrow", min_version="7.0.0") - @pytest.mark.parametrize("dtype", tm.ALL_PYARROW_DTYPES) - def test_concat_empty_arrow_backed_series(self, dtype): - # GH#51734 - ser = Series([], dtype=ArrowDtype(dtype)) - expected = ser.copy() - result = concat([ser[np.array([], dtype=np.bool_)]]) - tm.assert_series_equal(result, expected) From 59d39d5bb2a2afe2f1c8aeb3d9bf992d91962525 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 3 Mar 2023 16:32:26 +0100 Subject: [PATCH 6/7] Refactor --- pandas/core/arrays/arrow/array.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 
bdfd07b1b501d..7378a92de4415 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -349,7 +349,7 @@ def __getitem__(self, item: PositionalIndexer): pa_dtype = pa.string() else: pa_dtype = self._dtype.pyarrow_dtype - return type(self)(pa.array([], type=pa_dtype)) + return type(self)(pa.chunked_array([], type=pa_dtype)) elif is_integer_dtype(item.dtype): return self.take(item) elif is_bool_dtype(item.dtype): @@ -1012,7 +1012,11 @@ def _concat_same_type( ArrowExtensionArray """ chunks = [array for ea in to_concat for array in ea._data.iterchunks()] - arr = pa.chunked_array(chunks) + if to_concat[0].dtype == "string": + pa_dtype = pa.string() + else: + pa_dtype = to_concat[0].dtype.pyarrow_dtype + arr = pa.chunked_array(chunks, type=pa_dtype) return cls(arr) def _accumulate( From e70427c1581068fdc2e46696bbd57badfcaea93e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 7 Mar 2023 22:28:34 +0000 Subject: [PATCH 7/7] Add comment --- pandas/core/arrays/arrow/array.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 7378a92de4415..af5e6687c5de1 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1013,6 +1013,7 @@ def _concat_same_type( """ chunks = [array for ea in to_concat for array in ea._data.iterchunks()] if to_concat[0].dtype == "string": + # StringDtype has no attribute pyarrow_dtype pa_dtype = pa.string() else: pa_dtype = to_concat[0].dtype.pyarrow_dtype