REF: ArrowEA _data->_pa_array (#50987)

jbrockmendel · web-flow · commit 5d04432f2896 · 2023-03-10T11:32:06.000Z
diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py
@@ -1028,8 +1028,8 @@ def shares_memory(left, right) -> bool:
         left = cast("ArrowExtensionArray", left)
         if isinstance(right, ExtensionArray) and right.dtype == "string[pyarrow]":
             right = cast("ArrowExtensionArray", right)
-            left_pa_data = left._data
-            right_pa_data = right._data
+            left_pa_data = left._pa_array
+            right_pa_data = right._pa_array
             left_buf1 = left_pa_data.chunk(0).buffers()[1]
             right_buf1 = right_pa_data.chunk(0).buffers()[1]
             return left_buf1 == right_buf1
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
@@ -5,6 +5,7 @@
     Callable,
     Union,
 )
+import warnings
 
 import numpy as np
 
@@ -18,6 +19,7 @@
     npt,
 )
 from pandas.compat import pa_version_under7p0
+from pandas.util._exceptions import find_stack_level
 
 from pandas.core.dtypes.common import (
     is_bool_dtype,
@@ -112,7 +114,7 @@ def __init__(self, values) -> None:
         super().__init__(values)
         self._dtype = StringDtype(storage="pyarrow")
 
-        if not pa.types.is_string(self._data.type):
+        if not pa.types.is_string(self._pa_array.type):
             raise ValueError(
                 "ArrowStringArray requires a PyArrow (chunked) array of string type"
             )
@@ -125,7 +127,7 @@ def __len__(self) -> int:
         -------
         length : int
         """
-        return len(self._data)
+        return len(self._pa_array)
 
     @classmethod
     def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False):
@@ -193,7 +195,7 @@ def isin(self, values) -> npt.NDArray[np.bool_]:
         if not len(value_set):
             return np.zeros(len(self), dtype=bool)
 
-        result = pc.is_in(self._data, value_set=pa.array(value_set))
+        result = pc.is_in(self._pa_array, value_set=pa.array(value_set))
         # pyarrow 2.0.0 returned nulls, so we explicily specify dtype to convert nulls
         # to False
         return np.array(result, dtype=np.bool_)
@@ -206,13 +208,24 @@ def astype(self, dtype, copy: bool = True):
                 return self.copy()
             return self
         elif isinstance(dtype, NumericDtype):
-            data = self._data.cast(pa.from_numpy_dtype(dtype.numpy_dtype))
+            data = self._pa_array.cast(pa.from_numpy_dtype(dtype.numpy_dtype))
             return dtype.__from_arrow__(data)
         elif isinstance(dtype, np.dtype) and np.issubdtype(dtype, np.floating):
             return self.to_numpy(dtype=dtype, na_value=np.nan)
 
         return super().astype(dtype, copy=copy)
 
+    @property
+    def _data(self):
+        # dask accesses ._data directly
+        warnings.warn(
+            f"{type(self).__name__}._data is a deprecated and will be removed "
+            "in a future version, use ._pa_array instead",
+            FutureWarning,
+            stacklevel=find_stack_level(),
+        )
+        return self._pa_array
+
     # ------------------------------------------------------------------------
     # String methods interface
 
@@ -292,12 +305,12 @@ def _str_contains(
                 fallback_performancewarning()
                 return super()._str_contains(pat, case, flags, na, regex)
             else:
-                result = pc.match_substring_regex(self._data, pat)
+                result = pc.match_substring_regex(self._pa_array, pat)
         else:
             if case:
-                result = pc.match_substring(self._data, pat)
+                result = pc.match_substring(self._pa_array, pat)
             else:
-                result = pc.match_substring(pc.utf8_upper(self._data), pat.upper())
+                result = pc.match_substring(pc.utf8_upper(self._pa_array), pat.upper())
         result = BooleanDtype().__from_arrow__(result)
         if not isna(na):
             result[isna(result)] = bool(na)
@@ -325,7 +338,7 @@ def _str_replace(
             return super()._str_replace(pat, repl, n, case, flags, regex)
 
         func = pc.replace_substring_regex if regex else pc.replace_substring
-        result = func(self._data, pattern=pat, replacement=repl, max_replacements=n)
+        result = func(self._pa_array, pattern=pat, replacement=repl, max_replacements=n)
         return type(self)(result)
 
     def _str_match(
@@ -343,68 +356,68 @@ def _str_fullmatch(
         return self._str_match(pat, case, flags, na)
 
     def _str_isalnum(self):
-        result = pc.utf8_is_alnum(self._data)
+        result = pc.utf8_is_alnum(self._pa_array)
         return BooleanDtype().__from_arrow__(result)
 
     def _str_isalpha(self):
-        result = pc.utf8_is_alpha(self._data)
+        result = pc.utf8_is_alpha(self._pa_array)
         return BooleanDtype().__from_arrow__(result)
 
     def _str_isdecimal(self):
-        result = pc.utf8_is_decimal(self._data)
+        result = pc.utf8_is_decimal(self._pa_array)
         return BooleanDtype().__from_arrow__(result)
 
     def _str_isdigit(self):
-        result = pc.utf8_is_digit(self._data)
+        result = pc.utf8_is_digit(self._pa_array)
         return BooleanDtype().__from_arrow__(result)
 
     def _str_islower(self):
-        result = pc.utf8_is_lower(self._data)
+        result = pc.utf8_is_lower(self._pa_array)
         return BooleanDtype().__from_arrow__(result)
 
     def _str_isnumeric(self):
-        result = pc.utf8_is_numeric(self._data)
+        result = pc.utf8_is_numeric(self._pa_array)
         return BooleanDtype().__from_arrow__(result)
 
     def _str_isspace(self):
-        result = pc.utf8_is_space(self._data)
+        result = pc.utf8_is_space(self._pa_array)
         return BooleanDtype().__from_arrow__(result)
 
     def _str_istitle(self):
-        result = pc.utf8_is_title(self._data)
+        result = pc.utf8_is_title(self._pa_array)
         return BooleanDtype().__from_arrow__(result)
 
     def _str_isupper(self):
-        result = pc.utf8_is_upper(self._data)
+        result = pc.utf8_is_upper(self._pa_array)
         return BooleanDtype().__from_arrow__(result)
 
     def _str_len(self):
-        result = pc.utf8_length(self._data)
+        result = pc.utf8_length(self._pa_array)
         return Int64Dtype().__from_arrow__(result)
 
     def _str_lower(self):
-        return type(self)(pc.utf8_lower(self._data))
+        return type(self)(pc.utf8_lower(self._pa_array))
 
     def _str_upper(self):
-        return type(self)(pc.utf8_upper(self._data))
+        return type(self)(pc.utf8_upper(self._pa_array))
 
     def _str_strip(self, to_strip=None):
         if to_strip is None:
-            result = pc.utf8_trim_whitespace(self._data)
+            result = pc.utf8_trim_whitespace(self._pa_array)
         else:
-            result = pc.utf8_trim(self._data, characters=to_strip)
+            result = pc.utf8_trim(self._pa_array, characters=to_strip)
         return type(self)(result)
 
     def _str_lstrip(self, to_strip=None):
         if to_strip is None:
-            result = pc.utf8_ltrim_whitespace(self._data)
+            result = pc.utf8_ltrim_whitespace(self._pa_array)
         else:
-            result = pc.utf8_ltrim(self._data, characters=to_strip)
+            result = pc.utf8_ltrim(self._pa_array, characters=to_strip)
         return type(self)(result)
 
     def _str_rstrip(self, to_strip=None):
         if to_strip is None:
-            result = pc.utf8_rtrim_whitespace(self._data)
+            result = pc.utf8_rtrim_whitespace(self._pa_array)
         else:
-            result = pc.utf8_rtrim(self._data, characters=to_strip)
+            result = pc.utf8_rtrim(self._pa_array, characters=to_strip)
         return type(self)(result)
diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py
@@ -218,7 +218,7 @@ def isocalendar(self):
         result = (
             cast(ArrowExtensionArray, self._parent.array)
             ._dt_isocalendar()
-            ._data.combine_chunks()
+            ._pa_array.combine_chunks()
         )
         iso_calendar_df = DataFrame(
             {
diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py
@@ -192,7 +192,9 @@ def test_astype_arrow_timestamp(using_copy_on_write):
     result = df.astype("timestamp[ns][pyarrow]")
     if using_copy_on_write:
         assert not result._mgr._has_no_reference(0)
-        assert np.shares_memory(get_array(df, "a").asi8, get_array(result, "a")._data)
+        assert np.shares_memory(
+            get_array(df, "a").asi8, get_array(result, "a")._pa_array
+        )
 
 
 def test_convert_dtypes_infer_objects(using_copy_on_write):
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
@@ -274,14 +274,14 @@ def test_from_dtype(self, data, request):
 
     def test_from_sequence_pa_array(self, data):
         # https://github.com/pandas-dev/pandas/pull/47034#discussion_r955500784
-        # data._data = pa.ChunkedArray
-        result = type(data)._from_sequence(data._data)
+        # data._pa_array = pa.ChunkedArray
+        result = type(data)._from_sequence(data._pa_array)
         tm.assert_extension_array_equal(result, data)
-        assert isinstance(result._data, pa.ChunkedArray)
+        assert isinstance(result._pa_array, pa.ChunkedArray)
 
-        result = type(data)._from_sequence(data._data.combine_chunks())
+        result = type(data)._from_sequence(data._pa_array.combine_chunks())
         tm.assert_extension_array_equal(result, data)
-        assert isinstance(result._data, pa.ChunkedArray)
+        assert isinstance(result._pa_array, pa.ChunkedArray)
 
     def test_from_sequence_pa_array_notimplemented(self, request):
         with pytest.raises(NotImplementedError, match="Converting strings to"):
@@ -317,7 +317,7 @@ def test_from_sequence_of_strings_pa_array(self, data, request):
                         ),
                     )
                 )
-        pa_array = data._data.cast(pa.string())
+        pa_array = data._pa_array.cast(pa.string())
         result = type(data)._from_sequence_of_strings(pa_array, dtype=data.dtype)
         tm.assert_extension_array_equal(result, data)
 
@@ -1456,7 +1456,7 @@ def test_quantile(data, interpolation, quantile, request):
         or (pa.types.is_decimal(pa_dtype) and not pa_version_under7p0)
     ):
         pass
-    elif pa.types.is_temporal(data._data.type):
+    elif pa.types.is_temporal(data._pa_array.type):
         pass
     else:
         request.node.add_marker(
@@ -1619,7 +1619,7 @@ def test_pickle_roundtrip(data):
 
 def test_astype_from_non_pyarrow(data):
     # GH49795
-    pd_array = data._data.to_pandas().array
+    pd_array = data._pa_array.to_pandas().array
     result = pd_array.astype(data.dtype)
     assert not isinstance(pd_array.dtype, ArrowDtype)
     assert isinstance(result.dtype, ArrowDtype)
@@ -1638,11 +1638,11 @@ def test_to_numpy_with_defaults(data):
     # GH49973
     result = data.to_numpy()
 
-    pa_type = data._data.type
+    pa_type = data._pa_array.type
     if pa.types.is_duration(pa_type) or pa.types.is_timestamp(pa_type):
         expected = np.array(list(data))
     else:
-        expected = np.array(data._data)
+        expected = np.array(data._pa_array)
 
     if data._hasna:
         expected = expected.astype(object)
@@ -1668,7 +1668,7 @@ def test_setitem_null_slice(data):
     result = orig.copy()
     result[:] = data[0]
     expected = ArrowExtensionArray(
-        pa.array([data[0]] * len(data), type=data._data.type)
+        pa.array([data[0]] * len(data), type=data._pa_array.type)
     )
     tm.assert_extension_array_equal(result, expected)
 
@@ -1685,7 +1685,7 @@ def test_setitem_null_slice(data):
 
 def test_setitem_invalid_dtype(data):
     # GH50248
-    pa_type = data._data.type
+    pa_type = data._pa_array.type
     if pa.types.is_string(pa_type) or pa.types.is_binary(pa_type):
         fill_value = 123
         err = TypeError
diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
@@ -35,7 +35,7 @@ def split_array(arr):
     def _split_array(arr):
         import pyarrow as pa
 
-        arrow_array = arr._data
+        arrow_array = arr._pa_array
         split = len(arrow_array) // 2
         arrow_array = pa.chunked_array(
             [*arrow_array[:split].chunks, *arrow_array[split:].chunks]
diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
@@ -578,7 +578,7 @@ def test_use_nullable_dtypes(self, read_ext, dtype_backend, option):
             )
             # pyarrow by default infers timestamp resolution as us, not ns
             expected["i"] = ArrowExtensionArray(
-                expected["i"].array._data.cast(pa.timestamp(unit="us"))
+                expected["i"].array._pa_array.cast(pa.timestamp(unit="us"))
             )
             # pyarrow supports a null type, so don't have to default to Int64
             expected["j"] = ArrowExtensionArray(pa.array([None, None]))

Original file line number	Diff line number	Diff line change
`@@ -218,7 +218,7 @@ def isocalendar(self):`
`218`	`218`	`result = (`
`219`	`219`	`cast(ArrowExtensionArray, self._parent.array)`
`220`	`220`	`._dt_isocalendar()`
`221`		`- ._data.combine_chunks()`
	`221`	`+ ._pa_array.combine_chunks()`
`222`	`222`	`)`
`223`	`223`	`iso_calendar_df = DataFrame(`
`224`	`224`	`{`
Original file line number	Diff line number	Diff line change
`@@ -578,7 +578,7 @@ def test_use_nullable_dtypes(self, read_ext, dtype_backend, option):`
`578`	`578`	`)`
`579`	`579`	`# pyarrow by default infers timestamp resolution as us, not ns`
`580`	`580`	`expected["i"] = ArrowExtensionArray(`
`581`		`- expected["i"].array._data.cast(pa.timestamp(unit="us"))`
	`581`	`+ expected["i"].array._pa_array.cast(pa.timestamp(unit="us"))`
`582`	`582`	`)`
`583`	`583`	`# pyarrow supports a null type, so don't have to default to Int64`
`584`	`584`	`expected["j"] = ArrowExtensionArray(pa.array([None, None]))`