Skip to content

REF: ArrowEA _data->_pa_array #50987

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 25 commits into from
Mar 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
d57ef73
REF: ArrowEA _data->_pa_array
jbrockmendel Jan 26, 2023
fa480eb
Merge branch 'main' into ref-pa-name
jbrockmendel Feb 2, 2023
369b683
Merge branch 'main' into ref-pa-name
jbrockmendel Feb 2, 2023
9ad7eac
Merge branch 'main' into ref-pa-name
jbrockmendel Feb 7, 2023
2e0a41d
Merge branch 'main' into ref-pa-name
jbrockmendel Feb 9, 2023
89afa58
Merge branch 'main' into ref-pa-name
jbrockmendel Feb 9, 2023
ccad453
fixup accessor
jbrockmendel Feb 9, 2023
c2d99b7
fix copy/view test
jbrockmendel Feb 11, 2023
4e61fe3
Merge branch 'main' into ref-pa-name
jbrockmendel Feb 11, 2023
76adc10
Merge branch 'main' into ref-pa-name
jbrockmendel Feb 14, 2023
5ca07fd
Merge branch 'main' into ref-pa-name
jbrockmendel Feb 15, 2023
8a90775
Merge branch 'main' into ref-pa-name
jbrockmendel Feb 17, 2023
662afac
fix ArrowStringArray
jbrockmendel Feb 17, 2023
2a8ab98
Merge branch 'main' into ref-pa-name
jbrockmendel Feb 17, 2023
86959c9
fix pickle tests
jbrockmendel Feb 18, 2023
3f20485
Merge branch 'main' into ref-pa-name
jbrockmendel Feb 18, 2023
0f48359
update
jbrockmendel Feb 18, 2023
3abe547
Merge branch 'main' into ref-pa-name
jbrockmendel Feb 23, 2023
a0c8ee6
Merge branch 'main' into ref-pa-name
jbrockmendel Feb 25, 2023
f0c878e
Merge branch 'main' into ref-pa-name
jbrockmendel Feb 26, 2023
1f2c172
Merge branch 'main' into ref-pa-name
jbrockmendel Feb 27, 2023
7c4b1e5
Merge branch 'main' into ref-pa-name
jbrockmendel Mar 7, 2023
c4d0959
Merge branch 'main' into ref-pa-name
jbrockmendel Mar 9, 2023
bc53686
Merge branch 'main' into ref-pa-name
jbrockmendel Mar 9, 2023
60c912f
update setstate, _to_pydatetime
jbrockmendel Mar 9, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions pandas/_testing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1028,8 +1028,8 @@ def shares_memory(left, right) -> bool:
left = cast("ArrowExtensionArray", left)
if isinstance(right, ExtensionArray) and right.dtype == "string[pyarrow]":
right = cast("ArrowExtensionArray", right)
left_pa_data = left._data
right_pa_data = right._data
left_pa_data = left._pa_array
right_pa_data = right._pa_array
left_buf1 = left_pa_data.chunk(0).buffers()[1]
right_buf1 = right_pa_data.chunk(0).buffers()[1]
return left_buf1 == right_buf1
Expand Down
306 changes: 153 additions & 153 deletions pandas/core/arrays/arrow/array.py

Large diffs are not rendered by default.

65 changes: 39 additions & 26 deletions pandas/core/arrays/string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
Callable,
Union,
)
import warnings

import numpy as np

Expand All @@ -18,6 +19,7 @@
npt,
)
from pandas.compat import pa_version_under7p0
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.common import (
is_bool_dtype,
Expand Down Expand Up @@ -112,7 +114,7 @@ def __init__(self, values) -> None:
super().__init__(values)
self._dtype = StringDtype(storage="pyarrow")

if not pa.types.is_string(self._data.type):
if not pa.types.is_string(self._pa_array.type):
raise ValueError(
"ArrowStringArray requires a PyArrow (chunked) array of string type"
)
Expand All @@ -125,7 +127,7 @@ def __len__(self) -> int:
-------
length : int
"""
return len(self._data)
return len(self._pa_array)

@classmethod
def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False):
Expand Down Expand Up @@ -193,7 +195,7 @@ def isin(self, values) -> npt.NDArray[np.bool_]:
if not len(value_set):
return np.zeros(len(self), dtype=bool)

result = pc.is_in(self._data, value_set=pa.array(value_set))
result = pc.is_in(self._pa_array, value_set=pa.array(value_set))
# pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
# to False
return np.array(result, dtype=np.bool_)
Expand All @@ -206,13 +208,24 @@ def astype(self, dtype, copy: bool = True):
return self.copy()
return self
elif isinstance(dtype, NumericDtype):
data = self._data.cast(pa.from_numpy_dtype(dtype.numpy_dtype))
data = self._pa_array.cast(pa.from_numpy_dtype(dtype.numpy_dtype))
return dtype.__from_arrow__(data)
elif isinstance(dtype, np.dtype) and np.issubdtype(dtype, np.floating):
return self.to_numpy(dtype=dtype, na_value=np.nan)

return super().astype(dtype, copy=copy)

@property
def _data(self):
# dask accesses ._data directly
warnings.warn(
f"{type(self).__name__}._data is deprecated and will be removed "
"in a future version, use ._pa_array instead",
FutureWarning,
stacklevel=find_stack_level(),
)
return self._pa_array

# ------------------------------------------------------------------------
# String methods interface

Expand Down Expand Up @@ -292,12 +305,12 @@ def _str_contains(
fallback_performancewarning()
return super()._str_contains(pat, case, flags, na, regex)
else:
result = pc.match_substring_regex(self._data, pat)
result = pc.match_substring_regex(self._pa_array, pat)
else:
if case:
result = pc.match_substring(self._data, pat)
result = pc.match_substring(self._pa_array, pat)
else:
result = pc.match_substring(pc.utf8_upper(self._data), pat.upper())
result = pc.match_substring(pc.utf8_upper(self._pa_array), pat.upper())
result = BooleanDtype().__from_arrow__(result)
if not isna(na):
result[isna(result)] = bool(na)
Expand Down Expand Up @@ -325,7 +338,7 @@ def _str_replace(
return super()._str_replace(pat, repl, n, case, flags, regex)

func = pc.replace_substring_regex if regex else pc.replace_substring
result = func(self._data, pattern=pat, replacement=repl, max_replacements=n)
result = func(self._pa_array, pattern=pat, replacement=repl, max_replacements=n)
return type(self)(result)

def _str_match(
Expand All @@ -343,68 +356,68 @@ def _str_fullmatch(
return self._str_match(pat, case, flags, na)

def _str_isalnum(self):
result = pc.utf8_is_alnum(self._data)
result = pc.utf8_is_alnum(self._pa_array)
return BooleanDtype().__from_arrow__(result)

def _str_isalpha(self):
result = pc.utf8_is_alpha(self._data)
result = pc.utf8_is_alpha(self._pa_array)
return BooleanDtype().__from_arrow__(result)

def _str_isdecimal(self):
result = pc.utf8_is_decimal(self._data)
result = pc.utf8_is_decimal(self._pa_array)
return BooleanDtype().__from_arrow__(result)

def _str_isdigit(self):
result = pc.utf8_is_digit(self._data)
result = pc.utf8_is_digit(self._pa_array)
return BooleanDtype().__from_arrow__(result)

def _str_islower(self):
result = pc.utf8_is_lower(self._data)
result = pc.utf8_is_lower(self._pa_array)
return BooleanDtype().__from_arrow__(result)

def _str_isnumeric(self):
result = pc.utf8_is_numeric(self._data)
result = pc.utf8_is_numeric(self._pa_array)
return BooleanDtype().__from_arrow__(result)

def _str_isspace(self):
result = pc.utf8_is_space(self._data)
result = pc.utf8_is_space(self._pa_array)
return BooleanDtype().__from_arrow__(result)

def _str_istitle(self):
result = pc.utf8_is_title(self._data)
result = pc.utf8_is_title(self._pa_array)
return BooleanDtype().__from_arrow__(result)

def _str_isupper(self):
result = pc.utf8_is_upper(self._data)
result = pc.utf8_is_upper(self._pa_array)
return BooleanDtype().__from_arrow__(result)

def _str_len(self):
result = pc.utf8_length(self._data)
result = pc.utf8_length(self._pa_array)
return Int64Dtype().__from_arrow__(result)

def _str_lower(self):
return type(self)(pc.utf8_lower(self._data))
return type(self)(pc.utf8_lower(self._pa_array))

def _str_upper(self):
return type(self)(pc.utf8_upper(self._data))
return type(self)(pc.utf8_upper(self._pa_array))

def _str_strip(self, to_strip=None):
if to_strip is None:
result = pc.utf8_trim_whitespace(self._data)
result = pc.utf8_trim_whitespace(self._pa_array)
else:
result = pc.utf8_trim(self._data, characters=to_strip)
result = pc.utf8_trim(self._pa_array, characters=to_strip)
return type(self)(result)

def _str_lstrip(self, to_strip=None):
if to_strip is None:
result = pc.utf8_ltrim_whitespace(self._data)
result = pc.utf8_ltrim_whitespace(self._pa_array)
else:
result = pc.utf8_ltrim(self._data, characters=to_strip)
result = pc.utf8_ltrim(self._pa_array, characters=to_strip)
return type(self)(result)

def _str_rstrip(self, to_strip=None):
if to_strip is None:
result = pc.utf8_rtrim_whitespace(self._data)
result = pc.utf8_rtrim_whitespace(self._pa_array)
else:
result = pc.utf8_rtrim(self._data, characters=to_strip)
result = pc.utf8_rtrim(self._pa_array, characters=to_strip)
return type(self)(result)
2 changes: 1 addition & 1 deletion pandas/core/indexes/accessors.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ def isocalendar(self):
result = (
cast(ArrowExtensionArray, self._parent.array)
._dt_isocalendar()
._data.combine_chunks()
._pa_array.combine_chunks()
)
iso_calendar_df = DataFrame(
{
Expand Down
4 changes: 3 additions & 1 deletion pandas/tests/copy_view/test_astype.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,9 @@ def test_astype_arrow_timestamp(using_copy_on_write):
result = df.astype("timestamp[ns][pyarrow]")
if using_copy_on_write:
assert not result._mgr._has_no_reference(0)
assert np.shares_memory(get_array(df, "a").asi8, get_array(result, "a")._data)
assert np.shares_memory(
get_array(df, "a").asi8, get_array(result, "a")._pa_array
)


def test_convert_dtypes_infer_objects(using_copy_on_write):
Expand Down
24 changes: 12 additions & 12 deletions pandas/tests/extension/test_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,14 +274,14 @@ def test_from_dtype(self, data, request):

def test_from_sequence_pa_array(self, data):
# https://github.com/pandas-dev/pandas/pull/47034#discussion_r955500784
# data._data = pa.ChunkedArray
result = type(data)._from_sequence(data._data)
# data._pa_array = pa.ChunkedArray
result = type(data)._from_sequence(data._pa_array)
tm.assert_extension_array_equal(result, data)
assert isinstance(result._data, pa.ChunkedArray)
assert isinstance(result._pa_array, pa.ChunkedArray)

result = type(data)._from_sequence(data._data.combine_chunks())
result = type(data)._from_sequence(data._pa_array.combine_chunks())
tm.assert_extension_array_equal(result, data)
assert isinstance(result._data, pa.ChunkedArray)
assert isinstance(result._pa_array, pa.ChunkedArray)

def test_from_sequence_pa_array_notimplemented(self, request):
with pytest.raises(NotImplementedError, match="Converting strings to"):
Expand Down Expand Up @@ -317,7 +317,7 @@ def test_from_sequence_of_strings_pa_array(self, data, request):
),
)
)
pa_array = data._data.cast(pa.string())
pa_array = data._pa_array.cast(pa.string())
result = type(data)._from_sequence_of_strings(pa_array, dtype=data.dtype)
tm.assert_extension_array_equal(result, data)

Expand Down Expand Up @@ -1456,7 +1456,7 @@ def test_quantile(data, interpolation, quantile, request):
or (pa.types.is_decimal(pa_dtype) and not pa_version_under7p0)
):
pass
elif pa.types.is_temporal(data._data.type):
elif pa.types.is_temporal(data._pa_array.type):
pass
else:
request.node.add_marker(
Expand Down Expand Up @@ -1619,7 +1619,7 @@ def test_pickle_roundtrip(data):

def test_astype_from_non_pyarrow(data):
# GH49795
pd_array = data._data.to_pandas().array
pd_array = data._pa_array.to_pandas().array
result = pd_array.astype(data.dtype)
assert not isinstance(pd_array.dtype, ArrowDtype)
assert isinstance(result.dtype, ArrowDtype)
Expand All @@ -1638,11 +1638,11 @@ def test_to_numpy_with_defaults(data):
# GH49973
result = data.to_numpy()

pa_type = data._data.type
pa_type = data._pa_array.type
if pa.types.is_duration(pa_type) or pa.types.is_timestamp(pa_type):
expected = np.array(list(data))
else:
expected = np.array(data._data)
expected = np.array(data._pa_array)

if data._hasna:
expected = expected.astype(object)
Expand All @@ -1668,7 +1668,7 @@ def test_setitem_null_slice(data):
result = orig.copy()
result[:] = data[0]
expected = ArrowExtensionArray(
pa.array([data[0]] * len(data), type=data._data.type)
pa.array([data[0]] * len(data), type=data._pa_array.type)
)
tm.assert_extension_array_equal(result, expected)

Expand All @@ -1685,7 +1685,7 @@ def test_setitem_null_slice(data):

def test_setitem_invalid_dtype(data):
# GH50248
pa_type = data._data.type
pa_type = data._pa_array.type
if pa.types.is_string(pa_type) or pa.types.is_binary(pa_type):
fill_value = 123
err = TypeError
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/extension/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def split_array(arr):
def _split_array(arr):
import pyarrow as pa

arrow_array = arr._data
arrow_array = arr._pa_array
split = len(arrow_array) // 2
arrow_array = pa.chunked_array(
[*arrow_array[:split].chunks, *arrow_array[split:].chunks]
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/io/excel/test_readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -578,7 +578,7 @@ def test_use_nullable_dtypes(self, read_ext, dtype_backend, option):
)
# pyarrow by default infers timestamp resolution as us, not ns
expected["i"] = ArrowExtensionArray(
expected["i"].array._data.cast(pa.timestamp(unit="us"))
expected["i"].array._pa_array.cast(pa.timestamp(unit="us"))
)
# pyarrow supports a null type, so don't have to default to Int64
expected["j"] = ArrowExtensionArray(pa.array([None, None]))
Expand Down