Skip to content

Commit 5d04432

Browse files
authored
REF: ArrowEA _data->_pa_array (#50987)
1 parent 5b9f980 commit 5d04432

File tree

8 files changed

+212
-197
lines changed

8 files changed

+212
-197
lines changed

pandas/_testing/__init__.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -1028,8 +1028,8 @@ def shares_memory(left, right) -> bool:
10281028
left = cast("ArrowExtensionArray", left)
10291029
if isinstance(right, ExtensionArray) and right.dtype == "string[pyarrow]":
10301030
right = cast("ArrowExtensionArray", right)
1031-
left_pa_data = left._data
1032-
right_pa_data = right._data
1031+
left_pa_data = left._pa_array
1032+
right_pa_data = right._pa_array
10331033
left_buf1 = left_pa_data.chunk(0).buffers()[1]
10341034
right_buf1 = right_pa_data.chunk(0).buffers()[1]
10351035
return left_buf1 == right_buf1

pandas/core/arrays/arrow/array.py

+153-153
Large diffs are not rendered by default.

pandas/core/arrays/string_arrow.py

+39-26
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
Callable,
66
Union,
77
)
8+
import warnings
89

910
import numpy as np
1011

@@ -18,6 +19,7 @@
1819
npt,
1920
)
2021
from pandas.compat import pa_version_under7p0
22+
from pandas.util._exceptions import find_stack_level
2123

2224
from pandas.core.dtypes.common import (
2325
is_bool_dtype,
@@ -112,7 +114,7 @@ def __init__(self, values) -> None:
112114
super().__init__(values)
113115
self._dtype = StringDtype(storage="pyarrow")
114116

115-
if not pa.types.is_string(self._data.type):
117+
if not pa.types.is_string(self._pa_array.type):
116118
raise ValueError(
117119
"ArrowStringArray requires a PyArrow (chunked) array of string type"
118120
)
@@ -125,7 +127,7 @@ def __len__(self) -> int:
125127
-------
126128
length : int
127129
"""
128-
return len(self._data)
130+
return len(self._pa_array)
129131

130132
@classmethod
131133
def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False):
@@ -193,7 +195,7 @@ def isin(self, values) -> npt.NDArray[np.bool_]:
193195
if not len(value_set):
194196
return np.zeros(len(self), dtype=bool)
195197

196-
result = pc.is_in(self._data, value_set=pa.array(value_set))
198+
result = pc.is_in(self._pa_array, value_set=pa.array(value_set))
197199
# pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
198200
# to False
199201
return np.array(result, dtype=np.bool_)
@@ -206,13 +208,24 @@ def astype(self, dtype, copy: bool = True):
206208
return self.copy()
207209
return self
208210
elif isinstance(dtype, NumericDtype):
209-
data = self._data.cast(pa.from_numpy_dtype(dtype.numpy_dtype))
211+
data = self._pa_array.cast(pa.from_numpy_dtype(dtype.numpy_dtype))
210212
return dtype.__from_arrow__(data)
211213
elif isinstance(dtype, np.dtype) and np.issubdtype(dtype, np.floating):
212214
return self.to_numpy(dtype=dtype, na_value=np.nan)
213215

214216
return super().astype(dtype, copy=copy)
215217

218+
@property
219+
def _data(self):
220+
# dask accesses ._data directly
221+
warnings.warn(
222+
f"{type(self).__name__}._data is a deprecated and will be removed "
223+
"in a future version, use ._pa_array instead",
224+
FutureWarning,
225+
stacklevel=find_stack_level(),
226+
)
227+
return self._pa_array
228+
216229
# ------------------------------------------------------------------------
217230
# String methods interface
218231

@@ -292,12 +305,12 @@ def _str_contains(
292305
fallback_performancewarning()
293306
return super()._str_contains(pat, case, flags, na, regex)
294307
else:
295-
result = pc.match_substring_regex(self._data, pat)
308+
result = pc.match_substring_regex(self._pa_array, pat)
296309
else:
297310
if case:
298-
result = pc.match_substring(self._data, pat)
311+
result = pc.match_substring(self._pa_array, pat)
299312
else:
300-
result = pc.match_substring(pc.utf8_upper(self._data), pat.upper())
313+
result = pc.match_substring(pc.utf8_upper(self._pa_array), pat.upper())
301314
result = BooleanDtype().__from_arrow__(result)
302315
if not isna(na):
303316
result[isna(result)] = bool(na)
@@ -325,7 +338,7 @@ def _str_replace(
325338
return super()._str_replace(pat, repl, n, case, flags, regex)
326339

327340
func = pc.replace_substring_regex if regex else pc.replace_substring
328-
result = func(self._data, pattern=pat, replacement=repl, max_replacements=n)
341+
result = func(self._pa_array, pattern=pat, replacement=repl, max_replacements=n)
329342
return type(self)(result)
330343

331344
def _str_match(
@@ -343,68 +356,68 @@ def _str_fullmatch(
343356
return self._str_match(pat, case, flags, na)
344357

345358
def _str_isalnum(self):
346-
result = pc.utf8_is_alnum(self._data)
359+
result = pc.utf8_is_alnum(self._pa_array)
347360
return BooleanDtype().__from_arrow__(result)
348361

349362
def _str_isalpha(self):
350-
result = pc.utf8_is_alpha(self._data)
363+
result = pc.utf8_is_alpha(self._pa_array)
351364
return BooleanDtype().__from_arrow__(result)
352365

353366
def _str_isdecimal(self):
354-
result = pc.utf8_is_decimal(self._data)
367+
result = pc.utf8_is_decimal(self._pa_array)
355368
return BooleanDtype().__from_arrow__(result)
356369

357370
def _str_isdigit(self):
358-
result = pc.utf8_is_digit(self._data)
371+
result = pc.utf8_is_digit(self._pa_array)
359372
return BooleanDtype().__from_arrow__(result)
360373

361374
def _str_islower(self):
362-
result = pc.utf8_is_lower(self._data)
375+
result = pc.utf8_is_lower(self._pa_array)
363376
return BooleanDtype().__from_arrow__(result)
364377

365378
def _str_isnumeric(self):
366-
result = pc.utf8_is_numeric(self._data)
379+
result = pc.utf8_is_numeric(self._pa_array)
367380
return BooleanDtype().__from_arrow__(result)
368381

369382
def _str_isspace(self):
370-
result = pc.utf8_is_space(self._data)
383+
result = pc.utf8_is_space(self._pa_array)
371384
return BooleanDtype().__from_arrow__(result)
372385

373386
def _str_istitle(self):
374-
result = pc.utf8_is_title(self._data)
387+
result = pc.utf8_is_title(self._pa_array)
375388
return BooleanDtype().__from_arrow__(result)
376389

377390
def _str_isupper(self):
378-
result = pc.utf8_is_upper(self._data)
391+
result = pc.utf8_is_upper(self._pa_array)
379392
return BooleanDtype().__from_arrow__(result)
380393

381394
def _str_len(self):
382-
result = pc.utf8_length(self._data)
395+
result = pc.utf8_length(self._pa_array)
383396
return Int64Dtype().__from_arrow__(result)
384397

385398
def _str_lower(self):
386-
return type(self)(pc.utf8_lower(self._data))
399+
return type(self)(pc.utf8_lower(self._pa_array))
387400

388401
def _str_upper(self):
389-
return type(self)(pc.utf8_upper(self._data))
402+
return type(self)(pc.utf8_upper(self._pa_array))
390403

391404
def _str_strip(self, to_strip=None):
392405
if to_strip is None:
393-
result = pc.utf8_trim_whitespace(self._data)
406+
result = pc.utf8_trim_whitespace(self._pa_array)
394407
else:
395-
result = pc.utf8_trim(self._data, characters=to_strip)
408+
result = pc.utf8_trim(self._pa_array, characters=to_strip)
396409
return type(self)(result)
397410

398411
def _str_lstrip(self, to_strip=None):
399412
if to_strip is None:
400-
result = pc.utf8_ltrim_whitespace(self._data)
413+
result = pc.utf8_ltrim_whitespace(self._pa_array)
401414
else:
402-
result = pc.utf8_ltrim(self._data, characters=to_strip)
415+
result = pc.utf8_ltrim(self._pa_array, characters=to_strip)
403416
return type(self)(result)
404417

405418
def _str_rstrip(self, to_strip=None):
406419
if to_strip is None:
407-
result = pc.utf8_rtrim_whitespace(self._data)
420+
result = pc.utf8_rtrim_whitespace(self._pa_array)
408421
else:
409-
result = pc.utf8_rtrim(self._data, characters=to_strip)
422+
result = pc.utf8_rtrim(self._pa_array, characters=to_strip)
410423
return type(self)(result)

pandas/core/indexes/accessors.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -218,7 +218,7 @@ def isocalendar(self):
218218
result = (
219219
cast(ArrowExtensionArray, self._parent.array)
220220
._dt_isocalendar()
221-
._data.combine_chunks()
221+
._pa_array.combine_chunks()
222222
)
223223
iso_calendar_df = DataFrame(
224224
{

pandas/tests/copy_view/test_astype.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,9 @@ def test_astype_arrow_timestamp(using_copy_on_write):
192192
result = df.astype("timestamp[ns][pyarrow]")
193193
if using_copy_on_write:
194194
assert not result._mgr._has_no_reference(0)
195-
assert np.shares_memory(get_array(df, "a").asi8, get_array(result, "a")._data)
195+
assert np.shares_memory(
196+
get_array(df, "a").asi8, get_array(result, "a")._pa_array
197+
)
196198

197199

198200
def test_convert_dtypes_infer_objects(using_copy_on_write):

pandas/tests/extension/test_arrow.py

+12-12
Original file line numberDiff line numberDiff line change
@@ -274,14 +274,14 @@ def test_from_dtype(self, data, request):
274274

275275
def test_from_sequence_pa_array(self, data):
276276
# https://github.com/pandas-dev/pandas/pull/47034#discussion_r955500784
277-
# data._data = pa.ChunkedArray
278-
result = type(data)._from_sequence(data._data)
277+
# data._pa_array = pa.ChunkedArray
278+
result = type(data)._from_sequence(data._pa_array)
279279
tm.assert_extension_array_equal(result, data)
280-
assert isinstance(result._data, pa.ChunkedArray)
280+
assert isinstance(result._pa_array, pa.ChunkedArray)
281281

282-
result = type(data)._from_sequence(data._data.combine_chunks())
282+
result = type(data)._from_sequence(data._pa_array.combine_chunks())
283283
tm.assert_extension_array_equal(result, data)
284-
assert isinstance(result._data, pa.ChunkedArray)
284+
assert isinstance(result._pa_array, pa.ChunkedArray)
285285

286286
def test_from_sequence_pa_array_notimplemented(self, request):
287287
with pytest.raises(NotImplementedError, match="Converting strings to"):
@@ -317,7 +317,7 @@ def test_from_sequence_of_strings_pa_array(self, data, request):
317317
),
318318
)
319319
)
320-
pa_array = data._data.cast(pa.string())
320+
pa_array = data._pa_array.cast(pa.string())
321321
result = type(data)._from_sequence_of_strings(pa_array, dtype=data.dtype)
322322
tm.assert_extension_array_equal(result, data)
323323

@@ -1456,7 +1456,7 @@ def test_quantile(data, interpolation, quantile, request):
14561456
or (pa.types.is_decimal(pa_dtype) and not pa_version_under7p0)
14571457
):
14581458
pass
1459-
elif pa.types.is_temporal(data._data.type):
1459+
elif pa.types.is_temporal(data._pa_array.type):
14601460
pass
14611461
else:
14621462
request.node.add_marker(
@@ -1619,7 +1619,7 @@ def test_pickle_roundtrip(data):
16191619

16201620
def test_astype_from_non_pyarrow(data):
16211621
# GH49795
1622-
pd_array = data._data.to_pandas().array
1622+
pd_array = data._pa_array.to_pandas().array
16231623
result = pd_array.astype(data.dtype)
16241624
assert not isinstance(pd_array.dtype, ArrowDtype)
16251625
assert isinstance(result.dtype, ArrowDtype)
@@ -1638,11 +1638,11 @@ def test_to_numpy_with_defaults(data):
16381638
# GH49973
16391639
result = data.to_numpy()
16401640

1641-
pa_type = data._data.type
1641+
pa_type = data._pa_array.type
16421642
if pa.types.is_duration(pa_type) or pa.types.is_timestamp(pa_type):
16431643
expected = np.array(list(data))
16441644
else:
1645-
expected = np.array(data._data)
1645+
expected = np.array(data._pa_array)
16461646

16471647
if data._hasna:
16481648
expected = expected.astype(object)
@@ -1668,7 +1668,7 @@ def test_setitem_null_slice(data):
16681668
result = orig.copy()
16691669
result[:] = data[0]
16701670
expected = ArrowExtensionArray(
1671-
pa.array([data[0]] * len(data), type=data._data.type)
1671+
pa.array([data[0]] * len(data), type=data._pa_array.type)
16721672
)
16731673
tm.assert_extension_array_equal(result, expected)
16741674

@@ -1685,7 +1685,7 @@ def test_setitem_null_slice(data):
16851685

16861686
def test_setitem_invalid_dtype(data):
16871687
# GH50248
1688-
pa_type = data._data.type
1688+
pa_type = data._pa_array.type
16891689
if pa.types.is_string(pa_type) or pa.types.is_binary(pa_type):
16901690
fill_value = 123
16911691
err = TypeError

pandas/tests/extension/test_string.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ def split_array(arr):
3535
def _split_array(arr):
3636
import pyarrow as pa
3737

38-
arrow_array = arr._data
38+
arrow_array = arr._pa_array
3939
split = len(arrow_array) // 2
4040
arrow_array = pa.chunked_array(
4141
[*arrow_array[:split].chunks, *arrow_array[split:].chunks]

pandas/tests/io/excel/test_readers.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -578,7 +578,7 @@ def test_use_nullable_dtypes(self, read_ext, dtype_backend, option):
578578
)
579579
# pyarrow by default infers timestamp resolution as us, not ns
580580
expected["i"] = ArrowExtensionArray(
581-
expected["i"].array._data.cast(pa.timestamp(unit="us"))
581+
expected["i"].array._pa_array.cast(pa.timestamp(unit="us"))
582582
)
583583
# pyarrow supports a null type, so don't have to default to Int64
584584
expected["j"] = ArrowExtensionArray(pa.array([None, None]))

0 commit comments

Comments
 (0)