Skip to content

Commit 50c119d

Browse files
authored
BUG: ArrowExtensionArray._from_* accepts pyarrow arrays (#48264)
1 parent edf0fce commit 50c119d

File tree

4 files changed

+126
-20
lines changed

4 files changed

+126
-20
lines changed

pandas/core/arrays/arrow/array.py

+13-7
Original file line number | Diff line number | Diff line change
@@ -224,11 +224,13 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False):
224224
Construct a new ExtensionArray from a sequence of scalars.
225225
"""
226226
pa_dtype = to_pyarrow_type(dtype)
227-
if isinstance(scalars, cls):
228-
data = scalars._data
227+
is_cls = isinstance(scalars, cls)
228+
if is_cls or isinstance(scalars, (pa.Array, pa.ChunkedArray)):
229+
if is_cls:
230+
scalars = scalars._data
229231
if pa_dtype:
230-
data = data.cast(pa_dtype)
231-
return cls(data)
232+
scalars = scalars.cast(pa_dtype)
233+
return cls(scalars)
232234
else:
233235
return cls(
234236
pa.chunked_array(pa.array(scalars, type=pa_dtype, from_pandas=True))
@@ -242,7 +244,10 @@ def _from_sequence_of_strings(
242244
Construct a new ExtensionArray from a sequence of strings.
243245
"""
244246
pa_type = to_pyarrow_type(dtype)
245-
if pa.types.is_timestamp(pa_type):
247+
if pa_type is None:
248+
# Let pyarrow try to infer or raise
249+
scalars = strings
250+
elif pa.types.is_timestamp(pa_type):
246251
from pandas.core.tools.datetimes import to_datetime
247252

248253
scalars = to_datetime(strings, errors="raise")
@@ -272,8 +277,9 @@ def _from_sequence_of_strings(
272277

273278
scalars = to_numeric(strings, errors="raise")
274279
else:
275-
# Let pyarrow try to infer or raise
276-
scalars = strings
280+
raise NotImplementedError(
281+
f"Converting strings to {pa_type} is not implemented."
282+
)
277283
return cls._from_sequence(scalars, dtype=pa_type, copy=copy)
278284

279285
def __getitem__(self, item: PositionalIndexer):

pandas/core/tools/times.py

+14-11
Original file line number | Diff line number | Diff line change
@@ -80,17 +80,20 @@ def _convert_listlike(arg, format):
8080
format_found = False
8181
for element in arg:
8282
time_object = None
83-
for time_format in formats:
84-
try:
85-
time_object = datetime.strptime(element, time_format).time()
86-
if not format_found:
87-
# Put the found format in front
88-
fmt = formats.pop(formats.index(time_format))
89-
formats.insert(0, fmt)
90-
format_found = True
91-
break
92-
except (ValueError, TypeError):
93-
continue
83+
try:
84+
time_object = time.fromisoformat(element)
85+
except (ValueError, TypeError):
86+
for time_format in formats:
87+
try:
88+
time_object = datetime.strptime(element, time_format).time()
89+
if not format_found:
90+
# Put the found format in front
91+
fmt = formats.pop(formats.index(time_format))
92+
formats.insert(0, fmt)
93+
format_found = True
94+
break
95+
except (ValueError, TypeError):
96+
continue
9497

9598
if time_object is not None:
9699
times.append(time_object)

pandas/tests/extension/test_arrow.py

+94
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,8 @@
2121
import pytest
2222

2323
from pandas.compat import (
24+
is_ci_environment,
25+
is_platform_windows,
2426
pa_version_under2p0,
2527
pa_version_under3p0,
2628
pa_version_under4p0,
@@ -37,6 +39,8 @@
3739

3840
pa = pytest.importorskip("pyarrow", minversion="1.0.1")
3941

42+
from pandas.core.arrays.arrow.array import ArrowExtensionArray
43+
4044
from pandas.core.arrays.arrow.dtype import ArrowDtype # isort:skip
4145

4246

@@ -224,6 +228,96 @@ def test_from_dtype(self, data, request):
224228
)
225229
super().test_from_dtype(data)
226230

231+
def test_from_sequence_pa_array(self, data, request):
232+
# https://github.com/pandas-dev/pandas/pull/47034#discussion_r955500784
233+
# data._data = pa.ChunkedArray
234+
if pa_version_under3p0:
235+
request.node.add_marker(
236+
pytest.mark.xfail(
237+
reason="ChunkedArray has no attribute combine_chunks",
238+
)
239+
)
240+
result = type(data)._from_sequence(data._data)
241+
tm.assert_extension_array_equal(result, data)
242+
assert isinstance(result._data, pa.ChunkedArray)
243+
244+
result = type(data)._from_sequence(data._data.combine_chunks())
245+
tm.assert_extension_array_equal(result, data)
246+
assert isinstance(result._data, pa.ChunkedArray)
247+
248+
def test_from_sequence_pa_array_notimplemented(self, request):
249+
if pa_version_under6p0:
250+
request.node.add_marker(
251+
pytest.mark.xfail(
252+
raises=AttributeError,
253+
reason="month_day_nano_interval not implemented by pyarrow.",
254+
)
255+
)
256+
with pytest.raises(NotImplementedError, match="Converting strings to"):
257+
ArrowExtensionArray._from_sequence_of_strings(
258+
["12-1"], dtype=pa.month_day_nano_interval()
259+
)
260+
261+
def test_from_sequence_of_strings_pa_array(self, data, request):
262+
pa_dtype = data.dtype.pyarrow_dtype
263+
if pa_version_under3p0:
264+
request.node.add_marker(
265+
pytest.mark.xfail(
266+
reason="ChunkedArray has no attribute combine_chunks",
267+
)
268+
)
269+
elif pa.types.is_time64(pa_dtype) and pa_dtype.equals("time64[ns]"):
270+
request.node.add_marker(
271+
pytest.mark.xfail(
272+
reason="Nanosecond time parsing not supported.",
273+
)
274+
)
275+
elif pa.types.is_duration(pa_dtype):
276+
request.node.add_marker(
277+
pytest.mark.xfail(
278+
raises=pa.ArrowNotImplementedError,
279+
reason=f"pyarrow doesn't support parsing {pa_dtype}",
280+
)
281+
)
282+
elif pa.types.is_boolean(pa_dtype):
283+
request.node.add_marker(
284+
pytest.mark.xfail(
285+
reason="Iterating over ChunkedArray[bool] returns PyArrow scalars.",
286+
)
287+
)
288+
elif pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is not None:
289+
if pa_version_under7p0:
290+
request.node.add_marker(
291+
pytest.mark.xfail(
292+
raises=pa.ArrowNotImplementedError,
293+
reason=f"pyarrow doesn't support string cast from {pa_dtype}",
294+
)
295+
)
296+
elif is_platform_windows() and is_ci_environment():
297+
request.node.add_marker(
298+
pytest.mark.xfail(
299+
raises=pa.ArrowInvalid,
300+
reason=(
301+
"TODO: Set ARROW_TIMEZONE_DATABASE environment variable "
302+
"on CI to path to the tzdata for pyarrow."
303+
),
304+
)
305+
)
306+
elif pa_version_under6p0 and pa.types.is_temporal(pa_dtype):
307+
request.node.add_marker(
308+
pytest.mark.xfail(
309+
raises=pa.ArrowNotImplementedError,
310+
reason=f"pyarrow doesn't support string cast from {pa_dtype}",
311+
)
312+
)
313+
pa_array = data._data.cast(pa.string())
314+
result = type(data)._from_sequence_of_strings(pa_array, dtype=data.dtype)
315+
tm.assert_extension_array_equal(result, data)
316+
317+
pa_array = pa_array.combine_chunks()
318+
result = type(data)._from_sequence_of_strings(pa_array, dtype=data.dtype)
319+
tm.assert_extension_array_equal(result, data)
320+
227321

228322
@pytest.mark.xfail(
229323
raises=NotImplementedError, reason="pyarrow.ChunkedArray backing is 1D."

pandas/tests/tools/test_to_time.py

+5-2
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,8 @@
44
import numpy as np
55
import pytest
66

7+
from pandas.compat import PY311
8+
79
from pandas import Series
810
import pandas._testing as tm
911
from pandas.core.tools.datetimes import to_time as to_time_alias
@@ -40,8 +42,9 @@ def test_parsers_time(self, time_string):
4042
def test_odd_format(self):
4143
new_string = "14.15"
4244
msg = r"Cannot convert arg \['14\.15'\] to a time"
43-
with pytest.raises(ValueError, match=msg):
44-
to_time(new_string)
45+
if not PY311:
46+
with pytest.raises(ValueError, match=msg):
47+
to_time(new_string)
4548
assert to_time(new_string, format="%H.%M") == time(14, 15)
4649

4750
def test_arraylike(self):

0 commit comments

Comments
 (0)