diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 80c8ad9a8b2eb..e7a6e873baec1 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -219,7 +219,7 @@ Conversion - Bug in :meth:`DataFrame.eval` incorrectly raising an ``AttributeError`` when there are negative values in function call (:issue:`46471`) - Bug in :meth:`Series.convert_dtypes` not converting dtype to nullable dtype when :class:`Series` contains ``NA`` and has dtype ``object`` (:issue:`48791`) - Bug where any :class:`ExtensionDtype` subclass with ``kind="M"`` would be interpreted as a timezone type (:issue:`34986`) -- +- Bug in :class:`.arrays.ArrowExtensionArray` that would raise ``NotImplementedError`` when passed a sequence of strings or binary (:issue:`49172`) Strings ^^^^^^^ diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index da4db1fcc9e04..6cce1137e707b 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -201,8 +201,11 @@ SIGNED_INT_PYARROW_DTYPES = [pa.int8(), pa.int16(), pa.int32(), pa.int64()] ALL_INT_PYARROW_DTYPES = UNSIGNED_INT_PYARROW_DTYPES + SIGNED_INT_PYARROW_DTYPES + # pa.float16 doesn't seem supported + # https://github.com/apache/arrow/blob/master/python/pyarrow/src/arrow/python/helpers.cc#L86 FLOAT_PYARROW_DTYPES = [pa.float32(), pa.float64()] - STRING_PYARROW_DTYPES = [pa.string(), pa.utf8()] + STRING_PYARROW_DTYPES = [pa.string()] + BINARY_PYARROW_DTYPES = [pa.binary()] TIME_PYARROW_DTYPES = [ pa.time32("s"), @@ -225,6 +228,8 @@ ALL_PYARROW_DTYPES = ( ALL_INT_PYARROW_DTYPES + FLOAT_PYARROW_DTYPES + + STRING_PYARROW_DTYPES + + BINARY_PYARROW_DTYPES + TIME_PYARROW_DTYPES + DATE_PYARROW_DTYPES + DATETIME_PYARROW_DTYPES diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index f957ff63feeac..ea33bf58bacda 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -220,8 +220,13 @@ def _from_sequence_of_strings( Construct a new ExtensionArray from a sequence of strings. """ pa_type = to_pyarrow_type(dtype) - if pa_type is None: - # Let pyarrow try to infer or raise + if ( + pa_type is None + or pa.types.is_binary(pa_type) + or pa.types.is_string(pa_type) + ): + # pa_type is None: Let pa.array infer + # pa_type is string/binary: scalars already correct type scalars = strings elif pa.types.is_timestamp(pa_type): from pandas.core.tools.datetimes import to_datetime diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index d1f695bc06224..9a6b24583c525 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -16,6 +16,10 @@ time, timedelta, ) +from io import ( + BytesIO, + StringIO, +) import numpy as np import pytest @@ -90,6 +94,10 @@ def data(dtype): + [None] + [time(0, 5), time(5, 0)] ) + elif pa.types.is_string(pa_dtype): + data = ["a", "b"] * 4 + [None] + ["1", "2"] * 44 + [None] + ["!", ">"] + elif pa.types.is_binary(pa_dtype): + data = [b"a", b"b"] * 4 + [None] + [b"1", b"2"] * 44 + [None] + [b"!", b">"] else: raise NotImplementedError return pd.array(data, dtype=dtype) @@ -155,6 +163,14 @@ def data_for_grouping(dtype): A = time(0, 0) B = time(0, 12) C = time(12, 12) + elif pa.types.is_string(pa_dtype): + A = "a" + B = "b" + C = "c" + elif pa.types.is_binary(pa_dtype): + A = b"a" + B = b"b" + C = b"c" else: raise NotImplementedError return pd.array([B, B, None, None, A, A, B, C], dtype=dtype) @@ -203,17 +219,30 @@ def na_value(): class TestBaseCasting(base.BaseCastingTests): - pass + def test_astype_str(self, data, request): + pa_dtype = data.dtype.pyarrow_dtype + if pa.types.is_binary(pa_dtype): + request.node.add_marker( + pytest.mark.xfail( + reason=f"For {pa_dtype} .astype(str) decodes.", + ) + ) + super().test_astype_str(data) class TestConstructors(base.BaseConstructorsTests): def test_from_dtype(self, data, request): pa_dtype = data.dtype.pyarrow_dtype - if pa.types.is_timestamp(pa_dtype) and pa_dtype.tz: + if (pa.types.is_timestamp(pa_dtype) and pa_dtype.tz) or pa.types.is_string( + pa_dtype + ): + if pa.types.is_string(pa_dtype): + reason = "ArrowDtype(pa.string()) != StringDtype('pyarrow')" + else: + reason = f"pyarrow.type_for_alias cannot infer {pa_dtype}" request.node.add_marker( pytest.mark.xfail( - raises=NotImplementedError, - reason=f"pyarrow.type_for_alias cannot infer {pa_dtype}", + reason=reason, ) ) super().test_from_dtype(data) @@ -302,7 +331,7 @@ class TestGetitemTests(base.BaseGetitemTests): reason=( "data.dtype.type return pyarrow.DataType " "but this (intentionally) returns " - "Python scalars or pd.Na" + "Python scalars or pd.NA" ) ) def test_getitem_scalar(self, data): @@ -361,7 +390,11 @@ def test_reduce_series(self, data, all_numeric_reductions, skipna, request): or pa.types.is_boolean(pa_dtype) ) and not ( all_numeric_reductions in {"min", "max"} - and (pa.types.is_temporal(pa_dtype) and not pa.types.is_duration(pa_dtype)) + and ( + (pa.types.is_temporal(pa_dtype) and not pa.types.is_duration(pa_dtype)) + or pa.types.is_string(pa_dtype) + or pa.types.is_binary(pa_dtype) + ) ): request.node.add_marker(xfail_mark) elif pa.types.is_boolean(pa_dtype) and all_numeric_reductions in { @@ -494,6 +527,16 @@ def test_construct_from_string_own_name(self, dtype, request): reason=f"pyarrow.type_for_alias cannot infer {pa_dtype}", ) ) + elif pa.types.is_string(pa_dtype): + request.node.add_marker( + pytest.mark.xfail( + raises=TypeError, + reason=( + "Still support StringDtype('pyarrow') " + "over ArrowDtype(pa.string())" + ), + ) + ) super().test_construct_from_string_own_name(dtype) def test_is_dtype_from_name(self, dtype, request): @@ -505,6 +548,15 @@ def test_is_dtype_from_name(self, dtype, request): reason=f"pyarrow.type_for_alias cannot infer {pa_dtype}", ) ) + elif pa.types.is_string(pa_dtype): + request.node.add_marker( + pytest.mark.xfail( + reason=( + "Still support StringDtype('pyarrow') " + "over ArrowDtype(pa.string())" + ), + ) + ) super().test_is_dtype_from_name(dtype) def test_construct_from_string(self, dtype, request): @@ -516,6 +568,16 @@ def test_construct_from_string(self, dtype, request): reason=f"pyarrow.type_for_alias cannot infer {pa_dtype}", ) ) + elif pa.types.is_string(pa_dtype): + request.node.add_marker( + pytest.mark.xfail( + raises=TypeError, + reason=( + "Still support StringDtype('pyarrow') " + "over ArrowDtype(pa.string())" + ), + ) + ) super().test_construct_from_string(dtype) def test_construct_from_string_another_type_raises(self, dtype): @@ -533,6 +595,8 @@ def test_get_common_dtype(self, dtype, request): and (pa_dtype.unit != "ns" or pa_dtype.tz is not None) ) or (pa.types.is_duration(pa_dtype) and pa_dtype.unit != "ns") + or pa.types.is_string(pa_dtype) + or pa.types.is_binary(pa_dtype) ): request.node.add_marker( pytest.mark.xfail( @@ -592,7 +656,21 @@ def test_EA_types(self, engine, data, request): reason=f"Parameterized types with tz={pa_dtype.tz} not supported.", ) ) - super().test_EA_types(engine, data) + elif pa.types.is_binary(pa_dtype): + request.node.add_marker( + pytest.mark.xfail(reason="CSV parsers don't correctly handle binary") + ) + df = pd.DataFrame({"with_dtype": pd.Series(data, dtype=str(data.dtype))}) + csv_output = df.to_csv(index=False, na_rep=np.nan) + if pa.types.is_binary(pa_dtype): + csv_output = BytesIO(csv_output) + else: + csv_output = StringIO(csv_output) + result = pd.read_csv( + csv_output, dtype={"with_dtype": str(data.dtype)}, engine=engine + ) + expected = df + self.assert_frame_equal(result, expected) class TestBaseUnaryOps(base.BaseUnaryOpsTests): @@ -899,7 +977,11 @@ def test_arith_series_with_scalar( or all_arithmetic_operators in ("__sub__", "__rsub__") and pa.types.is_temporal(pa_dtype) ) - if all_arithmetic_operators in { + if all_arithmetic_operators == "__rmod__" and ( + pa.types.is_string(pa_dtype) or pa.types.is_binary(pa_dtype) + ): + pytest.skip("Skip testing Python string formatting") + elif all_arithmetic_operators in { "__mod__", "__rmod__", }: @@ -965,7 +1047,11 @@ def test_arith_frame_with_scalar( or all_arithmetic_operators in ("__sub__", "__rsub__") and pa.types.is_temporal(pa_dtype) ) - if all_arithmetic_operators in { + if all_arithmetic_operators == "__rmod__" and ( + pa.types.is_string(pa_dtype) or pa.types.is_binary(pa_dtype) + ): + pytest.skip("Skip testing Python string formatting") + elif all_arithmetic_operators in { "__mod__", "__rmod__", }: @@ -1224,7 +1310,11 @@ def test_quantile(data, interpolation, quantile, request): ) def test_mode(data_for_grouping, dropna, take_idx, exp_idx, request): pa_dtype = data_for_grouping.dtype.pyarrow_dtype - if pa.types.is_temporal(pa_dtype): + if ( + pa.types.is_temporal(pa_dtype) + or pa.types.is_string(pa_dtype) + or pa.types.is_binary(pa_dtype) + ): request.node.add_marker( pytest.mark.xfail( raises=pa.ArrowNotImplementedError,