From dd630ff22766ce4b07442fd889b8896cbd02089c Mon Sep 17 00:00:00 2001 From: Sanjith Chockan Date: Tue, 11 Jul 2023 04:17:53 +0000 Subject: [PATCH 1/6] checking for value type when parquet is partitioned --- pandas/core/arrays/string_arrow.py | 5 ++++- pandas/io/parquet.py | 1 - 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 12f4b5486b6b9..4a70fcf6b5a93 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -118,7 +118,10 @@ def __init__(self, values) -> None: super().__init__(values) self._dtype = StringDtype(storage="pyarrow") - if not pa.types.is_string(self._pa_array.type): + if not pa.types.is_string(self._pa_array.type) and not ( + pa.types.is_dictionary(self._pa_array.type) + and pa.types.is_string(self._pa_array.type.value_type) + ): raise ValueError( "ArrowStringArray requires a PyArrow (chunked) array of string type" ) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index e8670757e1669..bb024f2451a0e 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -183,7 +183,6 @@ def write( from_pandas_kwargs["preserve_index"] = index table = self.api.Table.from_pandas(df, **from_pandas_kwargs) - path_or_handle, handles, filesystem = _get_path_or_handle( path, filesystem, From d754d1a7b1bdd2b098b3a7fa32a48e535b636d3a Mon Sep 17 00:00:00 2001 From: Sanjith Chockan Date: Wed, 12 Jul 2023 00:53:31 +0000 Subject: [PATCH 2/6] whatsnew formatting --- pandas/io/parquet.py | 1 + .../tests/arrays/string_/test_string_arrow.py | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index bb024f2451a0e..e8670757e1669 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -183,6 +183,7 @@ def write( from_pandas_kwargs["preserve_index"] = index table = self.api.Table.from_pandas(df, **from_pandas_kwargs) + path_or_handle, handles, filesystem = _get_path_or_handle( 
path, filesystem, diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index 45098e12ccb38..a29a9f150c095 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -69,6 +69,24 @@ def test_constructor_not_string_type_raises(array, chunked): ArrowStringArray(arr) +@skip_if_no_pyarrow +@pytest.mark.parametrize("chunked", [True, False]) +def test_constructor_not_string_type_value_dictionary_raises(chunked): + import pyarrow as pa + + array = pa + + arr = array.array([1, 2, 3], array.dictionary(array.int32(), array.int32())) + if chunked: + arr = pa.chunked_array(arr) + + msg = re.escape( + "ArrowStringArray requires a PyArrow (chunked) array of string type" + ) + with pytest.raises(ValueError, match=msg): + ArrowStringArray(arr) + + @skip_if_no_pyarrow def test_from_sequence_wrong_dtype_raises(): with pd.option_context("string_storage", "python"): From 465c9d77af642a3a246c30ce5e1b883c46bf0be4 Mon Sep 17 00:00:00 2001 From: Sanjith Chockan Date: Wed, 12 Jul 2023 04:28:32 +0000 Subject: [PATCH 3/6] added whatsnew --- doc/source/whatsnew/v2.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 0f669beaa036f..4352f412d5804 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -533,13 +533,13 @@ Sparse ExtensionArray ^^^^^^^^^^^^^^ +- Bug in :class:`ArrowStringArray` constructor raises value error on dictionary on values with string type (:issue:`54074`) - Bug in :class:`DataFrame` constructor not copying :class:`Series` with extension dtype when given in dict (:issue:`53744`) - Bug in :class:`~arrays.ArrowExtensionArray` converting pandas non-nanosecond temporal objects from non-zero values to zero values (:issue:`53171`) - Bug in :meth:`Series.quantile` for pyarrow temporal types raising ArrowInvalid (:issue:`52678`) - Bug in 
:meth:`Series.rank` returning wrong order for small values with ``Float64`` dtype (:issue:`52471`) - Bug in :meth:`~arrays.ArrowExtensionArray.__iter__` and :meth:`~arrays.ArrowExtensionArray.__getitem__` returning python datetime and timedelta objects for non-nano dtypes (:issue:`53326`) - Bug where the ``__from_arrow__`` method of masked ExtensionDtypes(e.g. :class:`Float64Dtype`, :class:`BooleanDtype`) would not accept pyarrow arrays of type ``pyarrow.null()`` (:issue:`52223`) -- Styler ^^^^^^ From 5423fc47478bf66a00297d3ba8592bd212ff8d97 Mon Sep 17 00:00:00 2001 From: Sanjith Chockan Date: Thu, 13 Jul 2023 04:42:45 +0000 Subject: [PATCH 4/6] removed aliasing and skip_if_no_arrow decorator --- pandas/tests/arrays/string_/test_string_arrow.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index a29a9f150c095..7144f81638f8a 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -69,14 +69,11 @@ def test_constructor_not_string_type_raises(array, chunked): ArrowStringArray(arr) -@skip_if_no_pyarrow @pytest.mark.parametrize("chunked", [True, False]) def test_constructor_not_string_type_value_dictionary_raises(chunked): - import pyarrow as pa - - array = pa + pa = pytest.importorskip("pyarrow") - arr = array.array([1, 2, 3], array.dictionary(array.int32(), array.int32())) + arr = pa.array([1, 2, 3], pa.dictionary(pa.int32(), pa.int32())) if chunked: arr = pa.chunked_array(arr) From e184bc65973278a260c47713bd1b41a9cad97553 Mon Sep 17 00:00:00 2001 From: Sanjith Chockan Date: Thu, 13 Jul 2023 06:55:16 +0000 Subject: [PATCH 5/6] added unit test for valid pyarrow array of dictionary with string value_type --- pandas/tests/arrays/string_/test_string_arrow.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py 
b/pandas/tests/arrays/string_/test_string_arrow.py index 7144f81638f8a..0f899f4c8e876 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -84,6 +84,18 @@ def test_constructor_not_string_type_value_dictionary_raises(chunked): ArrowStringArray(arr) +@pytest.mark.parametrize("chunked", [True, False]) +def test_constructor_valid_string_type_value_dictionary(chunked): + pa = pytest.importorskip("pyarrow") + + arr = pa.array(["1", "2", "3"], pa.dictionary(pa.int32(), pa.utf8())) + if chunked: + arr = pa.chunked_array(arr) + + arr = ArrowStringArray(arr) + assert pa.types.is_string(arr._pa_array.type.value_type) + + @skip_if_no_pyarrow def test_from_sequence_wrong_dtype_raises(): with pd.option_context("string_storage", "python"): From 9bdb61186d87ca6b9f2a84b69bdb2ccec8a21baa Mon Sep 17 00:00:00 2001 From: Sanjith Chockan Date: Thu, 13 Jul 2023 06:58:36 +0000 Subject: [PATCH 6/6] updated whatsnew with suggested changes --- doc/source/whatsnew/v2.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 4352f412d5804..255c707a04a94 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -533,7 +533,7 @@ Sparse ExtensionArray ^^^^^^^^^^^^^^ -- Bug in :class:`ArrowStringArray` constructor raises value error on dictionary on values with string type (:issue:`54074`) +- Bug in :class:`ArrowStringArray` constructor raising ``ValueError`` with dictionary types of strings (:issue:`54074`) - Bug in :class:`DataFrame` constructor not copying :class:`Series` with extension dtype when given in dict (:issue:`53744`) - Bug in :class:`~arrays.ArrowExtensionArray` converting pandas non-nanosecond temporal objects from non-zero values to zero values (:issue:`53171`) - Bug in :meth:`Series.quantile` for pyarrow temporal types raising ArrowInvalid (:issue:`52678`)