From 8c2104673ca8320a16109b4cc86cfc50499bad9e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 25 Aug 2022 19:29:30 -0700 Subject: [PATCH 1/5] BUG: ArrowExtensionArray._from_* accepts pyarrow arrays --- pandas/core/arrays/arrow/array.py | 20 ++++++++---- pandas/core/tools/times.py | 25 ++++++++------- pandas/tests/extension/test_arrow.py | 48 ++++++++++++++++++++++++++++ 3 files changed, 75 insertions(+), 18 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 1f7939011a1f1..cfae5b4cae681 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -224,11 +224,13 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False): Construct a new ExtensionArray from a sequence of scalars. """ pa_dtype = to_pyarrow_type(dtype) - if isinstance(scalars, cls): - data = scalars._data + is_cls = isinstance(scalars, cls) + if is_cls or isinstance(scalars, (pa.Array, pa.ChunkedArray)): + if is_cls: + scalars = scalars._data if pa_dtype: - data = data.cast(pa_dtype) - return cls(data) + scalars = scalars.cast(pa_dtype) + return cls(scalars) else: return cls( pa.chunked_array(pa.array(scalars, type=pa_dtype, from_pandas=True)) @@ -242,7 +244,10 @@ def _from_sequence_of_strings( Construct a new ExtensionArray from a sequence of strings. """ pa_type = to_pyarrow_type(dtype) - if pa.types.is_timestamp(pa_type): + if pa_type is None: + # Let pyarrow try to infer or raise + scalars = strings + elif pa.types.is_timestamp(pa_type): from pandas.core.tools.datetimes import to_datetime scalars = to_datetime(strings, errors="raise") @@ -272,8 +277,9 @@ def _from_sequence_of_strings( scalars = to_numeric(strings, errors="raise") else: - # Let pyarrow try to infer or raise - scalars = strings + raise NotImplementedError( + f"Converting strings to {pa_type} is not implemented." + ) return cls._from_sequence(scalars, dtype=pa_type, copy=copy) def __getitem__(self, item: PositionalIndexer): diff --git a/pandas/core/tools/times.py b/pandas/core/tools/times.py index 030cee3f678f4..87667921bf75a 100644 --- a/pandas/core/tools/times.py +++ b/pandas/core/tools/times.py @@ -80,17 +80,20 @@ def _convert_listlike(arg, format): format_found = False for element in arg: time_object = None - for time_format in formats: - try: - time_object = datetime.strptime(element, time_format).time() - if not format_found: - # Put the found format in front - fmt = formats.pop(formats.index(time_format)) - formats.insert(0, fmt) - format_found = True - break - except (ValueError, TypeError): - continue + try: + time_object = time.fromisoformat(element) + except (ValueError, TypeError): + for time_format in formats: + try: + time_object = datetime.strptime(element, time_format).time() + if not format_found: + # Put the found format in front + fmt = formats.pop(formats.index(time_format)) + formats.insert(0, fmt) + format_found = True + break + except (ValueError, TypeError): + continue if time_object is not None: times.append(time_object) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 43c52ef8848e2..0d1f77d380cfc 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -35,6 +35,8 @@ pa = pytest.importorskip("pyarrow", minversion="1.0.1") +from pandas.core.arrays.arrow.array import ArrowExtensionArray + from pandas.core.arrays.arrow.dtype import ArrowDtype # isort:skip @@ -222,6 +224,52 @@ def test_from_dtype(self, data, request): ) super().test_from_dtype(data) + def test_from_sequence_pa_array(self, data): + # https://github.com/pandas-dev/pandas/pull/47034#discussion_r955500784 + # data._data = pa.ChunkedArray + result = type(data)._from_sequence(data._data) + tm.assert_extension_array_equal(result, data) + assert isinstance(result._data, pa.ChunkedArray) + + result = type(data)._from_sequence(data._data.combine_chunks()) + tm.assert_extension_array_equal(result, data) + assert isinstance(result._data, pa.ChunkedArray) + + def test_from_sequence_pa_array_notimplemented(self): + with pytest.raises(NotImplementedError, match="Converting strings to"): + ArrowExtensionArray._from_sequence_of_strings( + ["12-1"], dtype=pa.month_day_nano_interval() + ) + + def test_from_sequence_of_strings_pa_array(self, data, request): + pa_dtype = data.dtype.pyarrow_dtype + if pa.types.is_time64(pa_dtype) and pa_dtype.equals("time64[ns]"): + request.node.add_marker( + pytest.mark.xfail( + reason="Nanosecond time parsing not supported.", + ) + ) + elif pa.types.is_duration(pa_dtype): + request.node.add_marker( + pytest.mark.xfail( + raises=pa.ArrowNotImplementedError, + reason=f"pyarrow doesn't support factorizing {pa_dtype}", + ) + ) + elif pa.types.is_boolean(pa_dtype): + request.node.add_marker( + pytest.mark.xfail( + reason="Iterating over ChunkedArray returns PyArrow scalars.", + ) + ) + pa_array = data._data.cast(pa.string()) + result = type(data)._from_sequence_of_strings(pa_array, dtype=data.dtype) + tm.assert_extension_array_equal(result, data) + + pa_array = pa_array.combine_chunks() + result = type(data)._from_sequence_of_strings(pa_array, dtype=data.dtype) + tm.assert_extension_array_equal(result, data) + @pytest.mark.xfail( raises=NotImplementedError, reason="pyarrow.ChunkedArray backing is 1D." From 19dc40456a770e0da527c8ee4a29efe782e42609 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 26 Aug 2022 16:07:33 -0700 Subject: [PATCH 2/5] Add xfails by pa version --- pandas/tests/extension/test_arrow.py | 45 +++++++++++++++++++++++++--- 1 file changed, 41 insertions(+), 4 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 6dced811b6bda..9f2251b73ca93 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -226,9 +226,15 @@ def test_from_dtype(self, data, request): ) super().test_from_dtype(data) - def test_from_sequence_pa_array(self, data): + def test_from_sequence_pa_array(self, data, request): # https://github.com/pandas-dev/pandas/pull/47034#discussion_r955500784 # data._data = pa.ChunkedArray + if pa_version_under3p0: + request.node.add_marker( + pytest.mark.xfail( + reason="ChunkedArray has no attribute combine_chunks", + ) + ) result = type(data)._from_sequence(data._data) tm.assert_extension_array_equal(result, data) assert isinstance(result._data, pa.ChunkedArray) @@ -237,7 +243,14 @@ def test_from_sequence_pa_array(self, data): tm.assert_extension_array_equal(result, data) assert isinstance(result._data, pa.ChunkedArray) - def test_from_sequence_pa_array_notimplemented(self): + def test_from_sequence_pa_array_notimplemented(self, request): + if pa_version_under6p0: + request.node.add_marker( + pytest.mark.xfail( + raises=AttributeError, + reason="month_day_nano_interval not implemented by pyarrow.", + ) + ) with pytest.raises(NotImplementedError, match="Converting strings to"): ArrowExtensionArray._from_sequence_of_strings( ["12-1"], dtype=pa.month_day_nano_interval() @@ -245,7 +258,13 @@ def test_from_sequence_pa_array_notimplemented(self): def test_from_sequence_of_strings_pa_array(self, data, request): pa_dtype = data.dtype.pyarrow_dtype - if pa.types.is_time64(pa_dtype) and pa_dtype.equals("time64[ns]"): + if pa_version_under3p0: + request.node.add_marker( + pytest.mark.xfail( + reason="ChunkedArray has no attribute combine_chunks", + ) + ) + elif pa.types.is_time64(pa_dtype) and pa_dtype.equals("time64[ns]"): request.node.add_marker( pytest.mark.xfail( reason="Nanosecond time parsing not supported.", @@ -261,7 +280,25 @@ def test_from_sequence_of_strings_pa_array(self, data, request): elif pa.types.is_boolean(pa_dtype): request.node.add_marker( pytest.mark.xfail( - reason="Iterating over ChunkedArray returns PyArrow scalars.", + reason="Iterating over ChunkedArray[bool] returns PyArrow scalars.", + ) + ) + elif ( + pa_version_under7p0 + and pa.types.is_timestamp(pa_dtype) + and pa_dtype.tz is not None + ): + request.node.add_marker( + pytest.mark.xfail( + raises=pa.ArrowNotImplementedError, + reason=f"pyarrow doesn't support string cast from {pa_dtype}", + ) + ) + elif pa_version_under6p0 and pa.types.is_temporal(pa_dtype): + request.node.add_marker( + pytest.mark.xfail( + raises=pa.ArrowNotImplementedError, + reason=f"pyarrow doesn't support string cast from {pa_dtype}", ) ) pa_array = data._data.cast(pa.string()) From 50c81127b4ee9929cc523c84db5a9f0a107e9bd0 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 29 Aug 2022 14:15:34 -0700 Subject: [PATCH 3/5] Add future note about tzdata --- pandas/tests/extension/test_arrow.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 9f2251b73ca93..0d81e6926f283 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -274,7 +274,7 @@ def test_from_sequence_of_strings_pa_array(self, data, request): request.node.add_marker( pytest.mark.xfail( raises=pa.ArrowNotImplementedError, - reason=f"pyarrow doesn't support factorizing {pa_dtype}", + reason=f"pyarrow doesn't support parsing {pa_dtype}", ) ) elif pa.types.is_boolean(pa_dtype): @@ -283,6 +283,8 @@ def test_from_sequence_of_strings_pa_array(self, data, request): reason="Iterating over ChunkedArray[bool] returns PyArrow scalars.", ) ) + # TODO: Path to the tzdata needs to be provided once supported + # https://arrow.apache.org/docs/developers/cpp/windows.html?#downloading-the-timezone-database elif ( pa_version_under7p0 and pa.types.is_timestamp(pa_dtype) @@ -290,14 +292,12 @@ def test_from_sequence_of_strings_pa_array(self, data, request): ): request.node.add_marker( pytest.mark.xfail( - raises=pa.ArrowNotImplementedError, reason=f"pyarrow doesn't support string cast from {pa_dtype}", ) ) elif pa_version_under6p0 and pa.types.is_temporal(pa_dtype): request.node.add_marker( pytest.mark.xfail( - raises=pa.ArrowNotImplementedError, reason=f"pyarrow doesn't support string cast from {pa_dtype}", ) ) From 59036d8120fdcfe19d72b41edb1bae01a477c278 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 30 Aug 2022 16:09:42 -0700 Subject: [PATCH 4/5] Try another condition --- pandas/tests/extension/test_arrow.py | 31 ++++++++++++++++++---------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 0d81e6926f283..0385e4482a32b 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -21,6 +21,8 @@ import pytest from pandas.compat import ( + is_ci_environment, + is_platform_windows, pa_version_under2p0, pa_version_under3p0, pa_version_under4p0, @@ -283,21 +285,28 @@ def test_from_sequence_of_strings_pa_array(self, data, request): reason="Iterating over ChunkedArray[bool] returns PyArrow scalars.", ) ) - # TODO: Path to the tzdata needs to be provided once supported - # https://arrow.apache.org/docs/developers/cpp/windows.html?#downloading-the-timezone-database - elif ( - pa_version_under7p0 - and pa.types.is_timestamp(pa_dtype) - and pa_dtype.tz is not None - ): - request.node.add_marker( - pytest.mark.xfail( - reason=f"pyarrow doesn't support string cast from {pa_dtype}", + elif pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is not None: + if pa_version_under7p0: + request.node.add_marker( + pytest.mark.xfail( + raises=pa.ArrowNotImplementedError, + reason=f"pyarrow doesn't support string cast from {pa_dtype}", + ) + ) + elif is_platform_windows() and is_ci_environment(): + request.node.add_marker( + pytest.mark.xfail( + raises=pa.ArrowInvalid, + reason=( + "TODO: Set ARROW_TIMEZONE_DATABASE environment variable " + "on CI to path to the tzdata for pyarrow." + ), + ) ) - ) elif pa_version_under6p0 and pa.types.is_temporal(pa_dtype): request.node.add_marker( pytest.mark.xfail( + raises=pa.ArrowNotImplementedError, reason=f"pyarrow doesn't support string cast from {pa_dtype}", ) ) From 2e747eeac8df9d5e0b1f803f625f223e86d65b1e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 30 Aug 2022 16:11:40 -0700 Subject: [PATCH 5/5] PY311 time isoformat can parse time test --- pandas/tests/tools/test_to_time.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/tests/tools/test_to_time.py b/pandas/tests/tools/test_to_time.py index a8316e0f3970c..c80b1e080a1d1 100644 --- a/pandas/tests/tools/test_to_time.py +++ b/pandas/tests/tools/test_to_time.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas.compat import PY311 + from pandas import Series import pandas._testing as tm from pandas.core.tools.datetimes import to_time as to_time_alias @@ -40,8 +42,9 @@ def test_parsers_time(self, time_string): def test_odd_format(self): new_string = "14.15" msg = r"Cannot convert arg \['14\.15'\] to a time" - with pytest.raises(ValueError, match=msg): - to_time(new_string) + if not PY311: + with pytest.raises(ValueError, match=msg): + to_time(new_string) assert to_time(new_string, format="%H.%M") == time(14, 15) def test_arraylike(self):