diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index f2f0aaffcd6b5..c49dda2763c83 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -1028,8 +1028,8 @@ def shares_memory(left, right) -> bool: left = cast("ArrowExtensionArray", left) if isinstance(right, ExtensionArray) and right.dtype == "string[pyarrow]": right = cast("ArrowExtensionArray", right) - left_pa_data = left._data - right_pa_data = right._data + left_pa_data = left._pa_array + right_pa_data = right._pa_array left_buf1 = left_pa_data.chunk(0).buffers()[1] right_buf1 = right_pa_data.chunk(0).buffers()[1] return left_buf1 == right_buf1 diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 6084634503d97..2eb723ecb8e37 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -227,14 +227,14 @@ def __init__(self, values: pa.Array | pa.ChunkedArray) -> None: msg = "pyarrow>=7.0.0 is required for PyArrow backed ArrowExtensionArray." 
raise ImportError(msg) if isinstance(values, pa.Array): - self._data = pa.chunked_array([values]) + self._pa_array = pa.chunked_array([values]) elif isinstance(values, pa.ChunkedArray): - self._data = values + self._pa_array = values else: raise ValueError( f"Unsupported type '{type(values)}' for ArrowExtensionArray" ) - self._dtype = ArrowDtype(self._data.type) + self._dtype = ArrowDtype(self._pa_array.type) @classmethod def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): @@ -243,7 +243,7 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal """ pa_dtype = to_pyarrow_type(dtype) if isinstance(scalars, cls): - scalars = scalars._data + scalars = scalars._pa_array elif not isinstance(scalars, (pa.Array, pa.ChunkedArray)): if copy and is_array_like(scalars): # pa array should not get updated when numpy array is updated @@ -367,7 +367,7 @@ def __getitem__(self, item: PositionalIndexer): elif is_integer_dtype(item.dtype): return self.take(item) elif is_bool_dtype(item.dtype): - return type(self)(self._data.filter(item)) + return type(self)(self._pa_array.filter(item)) else: raise IndexError( "Only integers, slices and integer or " @@ -389,7 +389,7 @@ def __getitem__(self, item: PositionalIndexer): ) # We are not an array indexer, so maybe e.g. a slice or integer # indexer. We dispatch to pyarrow. - value = self._data[item] + value = self._pa_array[item] if isinstance(value, pa.ChunkedArray): return type(self)(value) else: @@ -404,7 +404,7 @@ def __iter__(self) -> Iterator[Any]: Iterate over elements of the array. 
""" na_value = self._dtype.na_value - for value in self._data: + for value in self._pa_array: val = value.as_py() if val is None: yield na_value @@ -413,44 +413,44 @@ def __iter__(self) -> Iterator[Any]: def __arrow_array__(self, type=None): """Convert myself to a pyarrow ChunkedArray.""" - return self._data + return self._pa_array def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: """Correctly construct numpy arrays when passed to `np.asarray()`.""" return self.to_numpy(dtype=dtype) def __invert__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: - return type(self)(pc.invert(self._data)) + return type(self)(pc.invert(self._pa_array)) def __neg__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: - return type(self)(pc.negate_checked(self._data)) + return type(self)(pc.negate_checked(self._pa_array)) def __pos__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: - return type(self)(self._data) + return type(self)(self._pa_array) def __abs__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: - return type(self)(pc.abs_checked(self._data)) + return type(self)(pc.abs_checked(self._pa_array)) # GH 42600: __getstate__/__setstate__ not necessary once # https://issues.apache.org/jira/browse/ARROW-10739 is addressed def __getstate__(self): state = self.__dict__.copy() - state["_data"] = self._data.combine_chunks() + state["_data"] = self._pa_array.combine_chunks() return state def __setstate__(self, state) -> None: - state["_data"] = pa.chunked_array(state["_data"]) + state["_pa_array"] = pa.chunked_array(state["_data"]) self.__dict__.update(state) def _cmp_method(self, other, op): pc_func = ARROW_CMP_FUNCS[op.__name__] if isinstance(other, ArrowExtensionArray): - result = pc_func(self._data, other._data) + result = pc_func(self._pa_array, other._pa_array) elif isinstance(other, (np.ndarray, list)): - result = pc_func(self._data, other) + result = pc_func(self._pa_array, other) elif is_scalar(other): try: - result = pc_func(self._data, 
pa.scalar(other)) + result = pc_func(self._pa_array, pa.scalar(other)) except (pa.lib.ArrowNotImplementedError, pa.lib.ArrowInvalid): mask = isna(self) | isna(other) valid = ~mask @@ -465,12 +465,12 @@ def _cmp_method(self, other, op): return ArrowExtensionArray(result) def _evaluate_op_method(self, other, op, arrow_funcs): - pa_type = self._data.type + pa_type = self._pa_array.type if (pa.types.is_string(pa_type) or pa.types.is_binary(pa_type)) and op in [ operator.add, roperator.radd, ]: - length = self._data.length() + length = self._pa_array.length() seps: list[str] | list[bytes] if pa.types.is_string(pa_type): @@ -481,27 +481,27 @@ def _evaluate_op_method(self, other, op, arrow_funcs): if is_scalar(other): other = [other] * length elif isinstance(other, type(self)): - other = other._data + other = other._pa_array if op is operator.add: - result = pc.binary_join_element_wise(self._data, other, seps) + result = pc.binary_join_element_wise(self._pa_array, other, seps) else: - result = pc.binary_join_element_wise(other, self._data, seps) + result = pc.binary_join_element_wise(other, self._pa_array, seps) return type(self)(result) pc_func = arrow_funcs[op.__name__] if pc_func is NotImplemented: raise NotImplementedError(f"{op.__name__} not implemented.") if isinstance(other, ArrowExtensionArray): - result = pc_func(self._data, other._data) + result = pc_func(self._pa_array, other._pa_array) elif isinstance(other, (np.ndarray, list)): - result = pc_func(self._data, pa.array(other, from_pandas=True)) + result = pc_func(self._pa_array, pa.array(other, from_pandas=True)) elif is_scalar(other): if isna(other) and op.__name__ in ARROW_LOGICAL_FUNCS: # pyarrow kleene ops require null to be typed - pa_scalar = pa.scalar(None, type=self._data.type) + pa_scalar = pa.scalar(None, type=self._pa_array.type) else: pa_scalar = pa.scalar(other) - result = pc_func(self._data, pa_scalar) + result = pc_func(self._pa_array, pa_scalar) else: raise NotImplementedError( f"{op.__name__} 
not implemented for {type(other)}" @@ -519,7 +519,7 @@ def equals(self, other) -> bool: return False # I'm told that pyarrow makes __eq__ behave like pandas' equals; # TODO: is this documented somewhere? - return self._data == other._data + return self._pa_array == other._pa_array @property def dtype(self) -> ArrowDtype: @@ -533,7 +533,7 @@ def nbytes(self) -> int: """ The number of bytes needed to store this object in memory. """ - return self._data.nbytes + return self._pa_array.nbytes def __len__(self) -> int: """ @@ -543,13 +543,13 @@ def __len__(self) -> int: ------- length : int """ - return len(self._data) + return len(self._pa_array) def __contains__(self, key) -> bool: # https://github.com/pandas-dev/pandas/pull/51307#issuecomment-1426372604 if isna(key) and key is not self.dtype.na_value: if self.dtype.kind == "f" and lib.is_float(key) and isna(key): - return pc.any(pc.is_nan(self._data)).as_py() + return pc.any(pc.is_nan(self._pa_array)).as_py() # e.g. date or timestamp types we do not allow None here to match pd.NA return False @@ -559,7 +559,7 @@ def __contains__(self, key) -> bool: @property def _hasna(self) -> bool: - return self._data.null_count > 0 + return self._pa_array.null_count > 0 def isna(self) -> npt.NDArray[np.bool_]: """ @@ -568,13 +568,13 @@ def isna(self) -> npt.NDArray[np.bool_]: This should return a 1-D array the same length as 'self'. 
""" # GH51630: fast paths - null_count = self._data.null_count + null_count = self._pa_array.null_count if null_count == 0: return np.zeros(len(self), dtype=np.bool_) elif null_count == len(self): return np.ones(len(self), dtype=np.bool_) - return self._data.is_null().to_numpy() + return self._pa_array.is_null().to_numpy() def any(self, *, skipna: bool = True, **kwargs): """ @@ -706,13 +706,13 @@ def argsort( raise ValueError(f"invalid na_position: {na_position}") result = pc.array_sort_indices( - self._data, order=order, null_placement=null_placement + self._pa_array, order=order, null_placement=null_placement ) np_result = result.to_numpy() return np_result.astype(np.intp, copy=False) def _argmin_max(self, skipna: bool, method: str) -> int: - if self._data.length() in (0, self._data.null_count) or ( + if self._pa_array.length() in (0, self._pa_array.null_count) or ( self._hasna and not skipna ): # For empty or all null, pyarrow returns -1 but pandas expects TypeError @@ -720,7 +720,7 @@ def _argmin_max(self, skipna: bool, method: str) -> int: # let ExtensionArray.arg{max|min} raise return getattr(super(), f"arg{method}")(skipna=skipna) - data = self._data + data = self._pa_array if pa.types.is_duration(data.type): data = data.cast(pa.int64()) @@ -743,7 +743,7 @@ def copy(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: ------- type(self) """ - return type(self)(self._data) + return type(self)(self._pa_array) def dropna(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: """ @@ -753,7 +753,7 @@ def dropna(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: ------- ArrowExtensionArray """ - return type(self)(pc.drop_null(self._data)) + return type(self)(pc.drop_null(self._pa_array)) @doc(ExtensionArray.fillna) def fillna( @@ -795,15 +795,15 @@ def convert_fill_value(value, pa_type, dtype): raise TypeError(msg) from err return value - fill_value = convert_fill_value(value, self._data.type, self.dtype) + fill_value = convert_fill_value(value, 
self._pa_array.type, self.dtype) try: if method is None: - return type(self)(pc.fill_null(self._data, fill_value=fill_value)) + return type(self)(pc.fill_null(self._pa_array, fill_value=fill_value)) elif method == "pad": - return type(self)(pc.fill_null_forward(self._data)) + return type(self)(pc.fill_null_forward(self._pa_array)) elif method == "backfill": - return type(self)(pc.fill_null_backward(self._data)) + return type(self)(pc.fill_null_backward(self._pa_array)) except pa.ArrowNotImplementedError: # ArrowNotImplementedError: Function 'coalesce' has no kernel # matching input types (duration[ns], duration[ns]) @@ -818,7 +818,7 @@ def isin(self, values) -> npt.NDArray[np.bool_]: if not len(values): return np.zeros(len(self), dtype=bool) - result = pc.is_in(self._data, value_set=pa.array(values, from_pandas=True)) + result = pc.is_in(self._pa_array, value_set=pa.array(values, from_pandas=True)) # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls # to False return np.array(result, dtype=np.bool_) @@ -837,7 +837,7 @@ def _values_for_factorize(self) -> tuple[np.ndarray, Any]: The values returned by this method are also used in :func:`pandas.util.hash_pandas_object`. """ - values = self._data.to_numpy() + values = self._pa_array.to_numpy() return values, self.dtype.na_value @doc(ExtensionArray.factorize) @@ -847,12 +847,12 @@ def factorize( ) -> tuple[np.ndarray, ExtensionArray]: null_encoding = "mask" if use_na_sentinel else "encode" - pa_type = self._data.type + pa_type = self._pa_array.type if pa.types.is_duration(pa_type): # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323 - data = self._data.cast(pa.int64()) + data = self._pa_array.cast(pa.int64()) else: - data = self._data + data = self._pa_array encoded = data.dictionary_encode(null_encoding=null_encoding) if encoded.length() == 0: @@ -901,7 +901,7 @@ def round( DataFrame.round : Round values of a DataFrame. Series.round : Round values of a Series. 
""" - return type(self)(pc.round(self._data, ndigits=decimals)) + return type(self)(pc.round(self._pa_array, ndigits=decimals)) @doc(ExtensionArray.searchsorted) def searchsorted( @@ -987,18 +987,18 @@ def take( # "Sequence[int]", variable has type "ndarray") indices_array = indices # type: ignore[assignment] - if len(self._data) == 0 and (indices_array >= 0).any(): + if len(self._pa_array) == 0 and (indices_array >= 0).any(): raise IndexError("cannot do a non-empty take") - if indices_array.size > 0 and indices_array.max() >= len(self._data): + if indices_array.size > 0 and indices_array.max() >= len(self._pa_array): raise IndexError("out of bounds value in 'indices'.") if allow_fill: fill_mask = indices_array < 0 if fill_mask.any(): - validate_indices(indices_array, len(self._data)) + validate_indices(indices_array, len(self._pa_array)) # TODO(ARROW-9433): Treat negative indices as NULL indices_array = pa.array(indices_array, mask=fill_mask) - result = self._data.take(indices_array) + result = self._pa_array.take(indices_array) if isna(fill_value): return type(self)(result) # TODO: ArrowNotImplementedError: Function fill_null has no @@ -1009,14 +1009,14 @@ def take( # return type(self)(pc.fill_null(result, pa.scalar(fill_value))) else: # Nothing to fill - return type(self)(self._data.take(indices)) + return type(self)(self._pa_array.take(indices)) else: # allow_fill=False # TODO(ARROW-9432): Treat negative indices as indices from the right. 
if (indices_array < 0).any(): # Don't modify in-place indices_array = np.copy(indices_array) - indices_array[indices_array < 0] += len(self._data) - return type(self)(self._data.take(indices_array)) + indices_array[indices_array < 0] += len(self._pa_array) + return type(self)(self._pa_array.take(indices_array)) @doc(ExtensionArray.to_numpy) def to_numpy( @@ -1030,7 +1030,7 @@ def to_numpy( if na_value is lib.no_default: na_value = self.dtype.na_value - pa_type = self._data.type + pa_type = self._pa_array.type if pa.types.is_temporal(pa_type) and not pa.types.is_date(pa_type): # temporal types with units and/or timezones currently # require pandas/python scalars to pass all tests @@ -1039,13 +1039,13 @@ def to_numpy( elif is_object_dtype(dtype) and self._hasna: result = np.empty(len(self), dtype=object) mask = ~self.isna() - result[mask] = np.asarray(self[mask]._data) + result[mask] = np.asarray(self[mask]._pa_array) elif self._hasna: data = self.copy() data[self.isna()] = na_value - return np.asarray(data._data, dtype=dtype) + return np.asarray(data._pa_array, dtype=dtype) else: - result = np.asarray(self._data, dtype=dtype) + result = np.asarray(self._pa_array, dtype=dtype) if copy: result = result.copy() if self._hasna: @@ -1060,13 +1060,13 @@ def unique(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: ------- ArrowExtensionArray """ - pa_type = self._data.type + pa_type = self._pa_array.type if pa.types.is_duration(pa_type): # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323 - data = self._data.cast(pa.int64()) + data = self._pa_array.cast(pa.int64()) else: - data = self._data + data = self._pa_array pa_result = pc.unique(data) @@ -1092,12 +1092,12 @@ def value_counts(self, dropna: bool = True) -> Series: -------- Series.value_counts """ - pa_type = self._data.type + pa_type = self._pa_array.type if pa.types.is_duration(pa_type): # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323 - data = 
self._data.cast(pa.int64()) + data = self._pa_array.cast(pa.int64()) else: - data = self._data + data = self._pa_array from pandas import ( Index, @@ -1137,7 +1137,7 @@ def _concat_same_type( ------- ArrowExtensionArray """ - chunks = [array for ea in to_concat for array in ea._data.iterchunks()] + chunks = [array for ea in to_concat for array in ea._pa_array.iterchunks()] if to_concat[0].dtype == "string": # StringDtype has no attrivute pyarrow_dtype pa_dtype = pa.string() @@ -1183,7 +1183,7 @@ def _accumulate( if pyarrow_meth is None: return super()._accumulate(name, skipna=skipna, **kwargs) - data_to_accum = self._data + data_to_accum = self._pa_array pa_dtype = data_to_accum.type if pa.types.is_duration(pa_dtype): @@ -1220,9 +1220,9 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): ------ TypeError : subclass does not define reductions """ - pa_type = self._data.type + pa_type = self._pa_array.type - data_to_reduce = self._data + data_to_reduce = self._pa_array if name in ["any", "all"] and ( pa.types.is_integer(pa_type) @@ -1234,28 +1234,28 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): # for other dtypes, matching our non-pyarrow behavior if pa.types.is_duration(pa_type): - data_to_cmp = self._data.cast(pa.int64()) + data_to_cmp = self._pa_array.cast(pa.int64()) else: - data_to_cmp = self._data + data_to_cmp = self._pa_array not_eq = pc.not_equal(data_to_cmp, 0) data_to_reduce = not_eq elif name in ["min", "max", "sum"] and pa.types.is_duration(pa_type): - data_to_reduce = self._data.cast(pa.int64()) + data_to_reduce = self._pa_array.cast(pa.int64()) elif name in ["median", "mean", "std", "sem"] and pa.types.is_temporal(pa_type): nbits = pa_type.bit_width if nbits == 32: - data_to_reduce = self._data.cast(pa.int32()) + data_to_reduce = self._pa_array.cast(pa.int32()) else: - data_to_reduce = self._data.cast(pa.int64()) + data_to_reduce = self._pa_array.cast(pa.int64()) if name == "sem": def pyarrow_meth(data, skip_nulls, 
**kwargs): numerator = pc.stddev(data, skip_nulls=skip_nulls, **kwargs) - denominator = pc.sqrt_checked(pc.count(self._data)) + denominator = pc.sqrt_checked(pc.count(self._pa_array)) return pc.divide_checked(numerator, denominator) else: @@ -1340,7 +1340,7 @@ def __setitem__(self, key, value) -> None: if com.is_null_slice(key): # fast path (GH50248) - data = self._if_else(True, value, self._data) + data = self._if_else(True, value, self._pa_array) elif is_integer(key): # fast path @@ -1357,20 +1357,20 @@ def __setitem__(self, key, value) -> None: elif isinstance(value, pa.Scalar): value = value.as_py() chunks = [ - *self._data[:key].chunks, - pa.array([value], type=self._data.type, from_pandas=True), - *self._data[key + 1 :].chunks, + *self._pa_array[:key].chunks, + pa.array([value], type=self._pa_array.type, from_pandas=True), + *self._pa_array[key + 1 :].chunks, ] data = pa.chunked_array(chunks).combine_chunks() elif is_bool_dtype(key): key = np.asarray(key, dtype=np.bool_) - data = self._replace_with_mask(self._data, key, value) + data = self._replace_with_mask(self._pa_array, key, value) elif is_scalar(value) or isinstance(value, pa.Scalar): mask = np.zeros(len(self), dtype=np.bool_) mask[key] = True - data = self._if_else(mask, value, self._data) + data = self._if_else(mask, value, self._pa_array) else: indices = np.arange(len(self))[key] @@ -1383,11 +1383,11 @@ def __setitem__(self, key, value) -> None: value = value.take(argsort) mask = np.zeros(len(self), dtype=np.bool_) mask[indices] = True - data = self._replace_with_mask(self._data, mask, value) + data = self._replace_with_mask(self._pa_array, mask, value) if isinstance(data, pa.Array): data = pa.chunked_array([data]) - self._data = data + self._pa_array = data def _rank( self, @@ -1417,7 +1417,7 @@ def _rank( result = pa.array(ranked, type=pa_type, from_pandas=True) return type(self)(result) - data = self._data.combine_chunks() + data = self._pa_array.combine_chunks() sort_keys = "ascending" if 
ascending else "descending" null_placement = "at_start" if na_option == "top" else "at_end" tiebreaker = "min" if method == "average" else method @@ -1430,7 +1430,7 @@ def _rank( ) if na_option == "keep": - mask = pc.is_null(self._data) + mask = pc.is_null(self._pa_array) null = pa.scalar(None, type=result.type) result = pc.if_else(mask, null, result) @@ -1471,9 +1471,9 @@ def _quantile( ------- same type as self """ - pa_dtype = self._data.type + pa_dtype = self._pa_array.type - data = self._data + data = self._pa_array if pa.types.is_temporal(pa_dtype): # https://github.com/apache/arrow/issues/33769 in these cases # we can cast to ints and back @@ -1511,17 +1511,17 @@ def _mode(self: ArrowExtensionArrayT, dropna: bool = True) -> ArrowExtensionArra same type as self Sorted, if possible. """ - pa_type = self._data.type + pa_type = self._pa_array.type if pa.types.is_temporal(pa_type): nbits = pa_type.bit_width if nbits == 32: - data = self._data.cast(pa.int32()) + data = self._pa_array.cast(pa.int32()) elif nbits == 64: - data = self._data.cast(pa.int64()) + data = self._pa_array.cast(pa.int64()) else: raise NotImplementedError(pa_type) else: - data = self._data + data = self._pa_array if dropna: data = data.drop_null() @@ -1547,7 +1547,7 @@ def _maybe_convert_setitem_value(self, value): else: pa_box = pa.scalar try: - value = pa_box(value, type=self._data.type, from_pandas=True) + value = pa_box(value, type=self._pa_array.type, from_pandas=True) except pa.ArrowTypeError as err: msg = f"Invalid value '{str(value)}' for dtype {self.dtype}" raise TypeError(msg) from err @@ -1649,7 +1649,7 @@ def _replace_with_mask( def _str_count(self, pat: str, flags: int = 0): if flags: raise NotImplementedError(f"count not implemented with {flags=}") - return type(self)(pc.count_substring_regex(self._data, pat)) + return type(self)(pc.count_substring_regex(self._pa_array, pat)) def _str_pad( self, @@ -1667,7 +1667,7 @@ def _str_pad( raise ValueError( f"Invalid side: 
{side}. Side must be one of 'left', 'right', 'both'" ) - return type(self)(pa_pad(self._data, width=width, padding=fillchar)) + return type(self)(pa_pad(self._pa_array, width=width, padding=fillchar)) def _str_contains( self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True @@ -1679,19 +1679,19 @@ def _str_contains( pa_contains = pc.match_substring_regex else: pa_contains = pc.match_substring - result = pa_contains(self._data, pat, ignore_case=not case) + result = pa_contains(self._pa_array, pat, ignore_case=not case) if not isna(na): result = result.fill_null(na) return type(self)(result) def _str_startswith(self, pat: str, na=None): - result = pc.starts_with(self._data, pattern=pat) + result = pc.starts_with(self._pa_array, pattern=pat) if not isna(na): result = result.fill_null(na) return type(self)(result) def _str_endswith(self, pat: str, na=None): - result = pc.ends_with(self._data, pattern=pat) + result = pc.ends_with(self._pa_array, pattern=pat) if not isna(na): result = result.fill_null(na) return type(self)(result) @@ -1712,7 +1712,7 @@ def _str_replace( ) func = pc.replace_substring_regex if regex else pc.replace_substring - result = func(self._data, pattern=pat, replacement=repl, max_replacements=n) + result = func(self._pa_array, pattern=pat, replacement=repl, max_replacements=n) return type(self)(result) def _str_repeat(self, repeats: int | Sequence[int]): @@ -1723,7 +1723,7 @@ def _str_repeat(self, repeats: int | Sequence[int]): elif pa_version_under7p0: raise NotImplementedError("repeat is not implemented for pyarrow < 7") else: - return type(self)(pc.binary_repeat(self._data, repeats)) + return type(self)(pc.binary_repeat(self._pa_array, repeats)) def _str_match( self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None @@ -1741,13 +1741,13 @@ def _str_fullmatch( def _str_find(self, sub: str, start: int = 0, end: int | None = None): if start != 0 and end is not None: - slices = 
pc.utf8_slice_codeunits(self._data, start, stop=end) + slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end) result = pc.find_substring(slices, sub) not_found = pc.equal(result, -1) offset_result = pc.add(result, end - start) result = pc.if_else(not_found, result, offset_result) elif start == 0 and end is None: - slices = self._data + slices = self._pa_array result = pc.find_substring(slices, sub) else: raise NotImplementedError( @@ -1756,7 +1756,7 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None): return type(self)(result) def _str_get(self, i: int): - lengths = pc.utf8_length(self._data) + lengths = pc.utf8_length(self._pa_array) if i >= 0: out_of_bounds = pc.greater_equal(i, lengths) start = i @@ -1769,14 +1769,14 @@ def _str_get(self, i: int): step = -1 not_out_of_bounds = pc.invert(out_of_bounds.fill_null(True)) selected = pc.utf8_slice_codeunits( - self._data, start=start, stop=stop, step=step + self._pa_array, start=start, stop=stop, step=step ) - result = pa.array([None] * self._data.length(), type=self._data.type) + result = pa.array([None] * self._pa_array.length(), type=self._pa_array.type) result = pc.if_else(not_out_of_bounds, selected, result) return type(self)(result) def _str_join(self, sep: str): - return type(self)(pc.binary_join(self._data, sep)) + return type(self)(pc.binary_join(self._pa_array, sep)) def _str_partition(self, sep: str, expand: bool): raise NotImplementedError( @@ -1796,7 +1796,7 @@ def _str_slice( if step is None: step = 1 return type(self)( - pc.utf8_slice_codeunits(self._data, start=start, stop=stop, step=step) + pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) ) def _str_slice_replace( @@ -1806,72 +1806,72 @@ def _str_slice_replace( repl = "" if start is None: start = 0 - return type(self)(pc.utf8_replace_slice(self._data, start, stop, repl)) + return type(self)(pc.utf8_replace_slice(self._pa_array, start, stop, repl)) def _str_isalnum(self): - return 
type(self)(pc.utf8_is_alnum(self._data)) + return type(self)(pc.utf8_is_alnum(self._pa_array)) def _str_isalpha(self): - return type(self)(pc.utf8_is_alpha(self._data)) + return type(self)(pc.utf8_is_alpha(self._pa_array)) def _str_isdecimal(self): - return type(self)(pc.utf8_is_decimal(self._data)) + return type(self)(pc.utf8_is_decimal(self._pa_array)) def _str_isdigit(self): - return type(self)(pc.utf8_is_digit(self._data)) + return type(self)(pc.utf8_is_digit(self._pa_array)) def _str_islower(self): - return type(self)(pc.utf8_is_lower(self._data)) + return type(self)(pc.utf8_is_lower(self._pa_array)) def _str_isnumeric(self): - return type(self)(pc.utf8_is_numeric(self._data)) + return type(self)(pc.utf8_is_numeric(self._pa_array)) def _str_isspace(self): - return type(self)(pc.utf8_is_space(self._data)) + return type(self)(pc.utf8_is_space(self._pa_array)) def _str_istitle(self): - return type(self)(pc.utf8_is_title(self._data)) + return type(self)(pc.utf8_is_title(self._pa_array)) def _str_capitalize(self): - return type(self)(pc.utf8_capitalize(self._data)) + return type(self)(pc.utf8_capitalize(self._pa_array)) def _str_title(self): - return type(self)(pc.utf8_title(self._data)) + return type(self)(pc.utf8_title(self._pa_array)) def _str_isupper(self): - return type(self)(pc.utf8_is_upper(self._data)) + return type(self)(pc.utf8_is_upper(self._pa_array)) def _str_swapcase(self): - return type(self)(pc.utf8_swapcase(self._data)) + return type(self)(pc.utf8_swapcase(self._pa_array)) def _str_len(self): - return type(self)(pc.utf8_length(self._data)) + return type(self)(pc.utf8_length(self._pa_array)) def _str_lower(self): - return type(self)(pc.utf8_lower(self._data)) + return type(self)(pc.utf8_lower(self._pa_array)) def _str_upper(self): - return type(self)(pc.utf8_upper(self._data)) + return type(self)(pc.utf8_upper(self._pa_array)) def _str_strip(self, to_strip=None): if to_strip is None: - result = pc.utf8_trim_whitespace(self._data) + result = 
pc.utf8_trim_whitespace(self._pa_array) else: - result = pc.utf8_trim(self._data, characters=to_strip) + result = pc.utf8_trim(self._pa_array, characters=to_strip) return type(self)(result) def _str_lstrip(self, to_strip=None): if to_strip is None: - result = pc.utf8_ltrim_whitespace(self._data) + result = pc.utf8_ltrim_whitespace(self._pa_array) else: - result = pc.utf8_ltrim(self._data, characters=to_strip) + result = pc.utf8_ltrim(self._pa_array, characters=to_strip) return type(self)(result) def _str_rstrip(self, to_strip=None): if to_strip is None: - result = pc.utf8_rtrim_whitespace(self._data) + result = pc.utf8_rtrim_whitespace(self._pa_array) else: - result = pc.utf8_rtrim(self._data, characters=to_strip) + result = pc.utf8_rtrim(self._pa_array, characters=to_strip) return type(self)(result) def _str_removeprefix(self, prefix: str): @@ -1879,15 +1879,15 @@ def _str_removeprefix(self, prefix: str): "str.removeprefix not supported with pd.ArrowDtype(pa.string())." ) # TODO: Should work once https://github.com/apache/arrow/issues/14991 is fixed - # starts_with = pc.starts_with(self._data, pattern=prefix) - # removed = pc.utf8_slice_codeunits(self._data, len(prefix)) - # result = pc.if_else(starts_with, removed, self._data) + # starts_with = pc.starts_with(self._pa_array, pattern=prefix) + # removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix)) + # result = pc.if_else(starts_with, removed, self._pa_array) # return type(self)(result) def _str_removesuffix(self, suffix: str): - ends_with = pc.ends_with(self._data, pattern=suffix) - removed = pc.utf8_slice_codeunits(self._data, 0, stop=-len(suffix)) - result = pc.if_else(ends_with, removed, self._data) + ends_with = pc.ends_with(self._pa_array, pattern=suffix) + removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix)) + result = pc.if_else(ends_with, removed, self._pa_array) return type(self)(result) def _str_casefold(self): @@ -1959,59 +1959,59 @@ def _str_wrap(self, width, 
**kwargs): @property def _dt_day(self): - return type(self)(pc.day(self._data)) + return type(self)(pc.day(self._pa_array)) @property def _dt_day_of_week(self): - return type(self)(pc.day_of_week(self._data)) + return type(self)(pc.day_of_week(self._pa_array)) _dt_dayofweek = _dt_day_of_week _dt_weekday = _dt_day_of_week @property def _dt_day_of_year(self): - return type(self)(pc.day_of_year(self._data)) + return type(self)(pc.day_of_year(self._pa_array)) _dt_dayofyear = _dt_day_of_year @property def _dt_hour(self): - return type(self)(pc.hour(self._data)) + return type(self)(pc.hour(self._pa_array)) def _dt_isocalendar(self): - return type(self)(pc.iso_calendar(self._data)) + return type(self)(pc.iso_calendar(self._pa_array)) @property def _dt_is_leap_year(self): - return type(self)(pc.is_leap_year(self._data)) + return type(self)(pc.is_leap_year(self._pa_array)) @property def _dt_microsecond(self): - return type(self)(pc.microsecond(self._data)) + return type(self)(pc.microsecond(self._pa_array)) @property def _dt_minute(self): - return type(self)(pc.minute(self._data)) + return type(self)(pc.minute(self._pa_array)) @property def _dt_month(self): - return type(self)(pc.month(self._data)) + return type(self)(pc.month(self._pa_array)) @property def _dt_nanosecond(self): - return type(self)(pc.nanosecond(self._data)) + return type(self)(pc.nanosecond(self._pa_array)) @property def _dt_quarter(self): - return type(self)(pc.quarter(self._data)) + return type(self)(pc.quarter(self._pa_array)) @property def _dt_second(self): - return type(self)(pc.second(self._data)) + return type(self)(pc.second(self._pa_array)) @property def _dt_date(self): - return type(self)(self._data.cast(pa.date64())) + return type(self)(self._pa_array.cast(pa.date64())) @property def _dt_time(self): @@ -2020,14 +2020,14 @@ def _dt_time(self): if self.dtype.pyarrow_dtype.unit in {"us", "ns"} else "ns" ) - return type(self)(self._data.cast(pa.time64(unit))) + return 
type(self)(self._pa_array.cast(pa.time64(unit))) @property def _dt_tz(self): return self.dtype.pyarrow_dtype.tz def _dt_strftime(self, format: str): - return type(self)(pc.strftime(self._data, format=format)) + return type(self)(pc.strftime(self._pa_array, format=format)) def _round_temporally( self, @@ -2064,7 +2064,7 @@ def _round_temporally( raise ValueError(f"{freq=} is not supported") multiple = offset.n rounding_method = getattr(pc, f"{method}_temporal") - return type(self)(rounding_method(self._data, multiple=multiple, unit=unit)) + return type(self)(rounding_method(self._pa_array, multiple=multiple, unit=unit)) def _dt_ceil( self, @@ -2091,7 +2091,7 @@ def _dt_round( return self._round_temporally("round", freq, ambiguous, nonexistent) def _dt_to_pydatetime(self): - return np.array(self._data.to_pylist(), dtype=object) + return np.array(self._pa_array.to_pylist(), dtype=object) def _dt_tz_localize( self, @@ -2105,8 +2105,8 @@ def _dt_tz_localize( raise NotImplementedError(f"{nonexistent=} is not supported") if tz is None: new_type = pa.timestamp(self.dtype.pyarrow_dtype.unit) - return type(self)(self._data.cast(new_type)) + return type(self)(self._pa_array.cast(new_type)) pa_tz = str(tz) return type(self)( - self._data.cast(pa.timestamp(self.dtype.pyarrow_dtype.unit, pa_tz)) + self._pa_array.cast(pa.timestamp(self.dtype.pyarrow_dtype.unit, pa_tz)) ) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 4d2b39ec61fca..2086a93ea6e14 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -5,6 +5,7 @@ Callable, Union, ) +import warnings import numpy as np @@ -18,6 +19,7 @@ npt, ) from pandas.compat import pa_version_under7p0 +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( is_bool_dtype, @@ -112,7 +114,7 @@ def __init__(self, values) -> None: super().__init__(values) self._dtype = StringDtype(storage="pyarrow") - if not 
pa.types.is_string(self._data.type): + if not pa.types.is_string(self._pa_array.type): raise ValueError( "ArrowStringArray requires a PyArrow (chunked) array of string type" ) @@ -125,7 +127,7 @@ def __len__(self) -> int: ------- length : int """ - return len(self._data) + return len(self._pa_array) @classmethod def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False): @@ -193,7 +195,7 @@ def isin(self, values) -> npt.NDArray[np.bool_]: if not len(value_set): return np.zeros(len(self), dtype=bool) - result = pc.is_in(self._data, value_set=pa.array(value_set)) + result = pc.is_in(self._pa_array, value_set=pa.array(value_set)) # pyarrow 2.0.0 returned nulls, so we explicily specify dtype to convert nulls # to False return np.array(result, dtype=np.bool_) @@ -206,13 +208,24 @@ def astype(self, dtype, copy: bool = True): return self.copy() return self elif isinstance(dtype, NumericDtype): - data = self._data.cast(pa.from_numpy_dtype(dtype.numpy_dtype)) + data = self._pa_array.cast(pa.from_numpy_dtype(dtype.numpy_dtype)) return dtype.__from_arrow__(data) elif isinstance(dtype, np.dtype) and np.issubdtype(dtype, np.floating): return self.to_numpy(dtype=dtype, na_value=np.nan) return super().astype(dtype, copy=copy) + @property + def _data(self): + # dask accesses ._data directly + warnings.warn( + f"{type(self).__name__}._data is deprecated and will be removed " + "in a future version, use ._pa_array instead", + FutureWarning, + stacklevel=find_stack_level(), + ) + return self._pa_array + # ------------------------------------------------------------------------ # String methods interface @@ -292,12 +305,12 @@ def _str_contains( fallback_performancewarning() return super()._str_contains(pat, case, flags, na, regex) else: - result = pc.match_substring_regex(self._data, pat) + result = pc.match_substring_regex(self._pa_array, pat) else: if case: - result = pc.match_substring(self._data, pat) + result = pc.match_substring(self._pa_array, pat) 
else: - result = pc.match_substring(pc.utf8_upper(self._data), pat.upper()) + result = pc.match_substring(pc.utf8_upper(self._pa_array), pat.upper()) result = BooleanDtype().__from_arrow__(result) if not isna(na): result[isna(result)] = bool(na) @@ -325,7 +338,7 @@ def _str_replace( return super()._str_replace(pat, repl, n, case, flags, regex) func = pc.replace_substring_regex if regex else pc.replace_substring - result = func(self._data, pattern=pat, replacement=repl, max_replacements=n) + result = func(self._pa_array, pattern=pat, replacement=repl, max_replacements=n) return type(self)(result) def _str_match( @@ -343,68 +356,68 @@ def _str_fullmatch( return self._str_match(pat, case, flags, na) def _str_isalnum(self): - result = pc.utf8_is_alnum(self._data) + result = pc.utf8_is_alnum(self._pa_array) return BooleanDtype().__from_arrow__(result) def _str_isalpha(self): - result = pc.utf8_is_alpha(self._data) + result = pc.utf8_is_alpha(self._pa_array) return BooleanDtype().__from_arrow__(result) def _str_isdecimal(self): - result = pc.utf8_is_decimal(self._data) + result = pc.utf8_is_decimal(self._pa_array) return BooleanDtype().__from_arrow__(result) def _str_isdigit(self): - result = pc.utf8_is_digit(self._data) + result = pc.utf8_is_digit(self._pa_array) return BooleanDtype().__from_arrow__(result) def _str_islower(self): - result = pc.utf8_is_lower(self._data) + result = pc.utf8_is_lower(self._pa_array) return BooleanDtype().__from_arrow__(result) def _str_isnumeric(self): - result = pc.utf8_is_numeric(self._data) + result = pc.utf8_is_numeric(self._pa_array) return BooleanDtype().__from_arrow__(result) def _str_isspace(self): - result = pc.utf8_is_space(self._data) + result = pc.utf8_is_space(self._pa_array) return BooleanDtype().__from_arrow__(result) def _str_istitle(self): - result = pc.utf8_is_title(self._data) + result = pc.utf8_is_title(self._pa_array) return BooleanDtype().__from_arrow__(result) def _str_isupper(self): - result = 
pc.utf8_is_upper(self._data) + result = pc.utf8_is_upper(self._pa_array) return BooleanDtype().__from_arrow__(result) def _str_len(self): - result = pc.utf8_length(self._data) + result = pc.utf8_length(self._pa_array) return Int64Dtype().__from_arrow__(result) def _str_lower(self): - return type(self)(pc.utf8_lower(self._data)) + return type(self)(pc.utf8_lower(self._pa_array)) def _str_upper(self): - return type(self)(pc.utf8_upper(self._data)) + return type(self)(pc.utf8_upper(self._pa_array)) def _str_strip(self, to_strip=None): if to_strip is None: - result = pc.utf8_trim_whitespace(self._data) + result = pc.utf8_trim_whitespace(self._pa_array) else: - result = pc.utf8_trim(self._data, characters=to_strip) + result = pc.utf8_trim(self._pa_array, characters=to_strip) return type(self)(result) def _str_lstrip(self, to_strip=None): if to_strip is None: - result = pc.utf8_ltrim_whitespace(self._data) + result = pc.utf8_ltrim_whitespace(self._pa_array) else: - result = pc.utf8_ltrim(self._data, characters=to_strip) + result = pc.utf8_ltrim(self._pa_array, characters=to_strip) return type(self)(result) def _str_rstrip(self, to_strip=None): if to_strip is None: - result = pc.utf8_rtrim_whitespace(self._data) + result = pc.utf8_rtrim_whitespace(self._pa_array) else: - result = pc.utf8_rtrim(self._data, characters=to_strip) + result = pc.utf8_rtrim(self._pa_array, characters=to_strip) return type(self)(result) diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index 236449881dc41..85460a04298e6 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -218,7 +218,7 @@ def isocalendar(self): result = ( cast(ArrowExtensionArray, self._parent.array) ._dt_isocalendar() - ._data.combine_chunks() + ._pa_array.combine_chunks() ) iso_calendar_df = DataFrame( { diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py index 73343976e92fb..310e811c0c6d8 100644 --- 
a/pandas/tests/copy_view/test_astype.py +++ b/pandas/tests/copy_view/test_astype.py @@ -192,7 +192,9 @@ def test_astype_arrow_timestamp(using_copy_on_write): result = df.astype("timestamp[ns][pyarrow]") if using_copy_on_write: assert not result._mgr._has_no_reference(0) - assert np.shares_memory(get_array(df, "a").asi8, get_array(result, "a")._data) + assert np.shares_memory( + get_array(df, "a").asi8, get_array(result, "a")._pa_array + ) def test_convert_dtypes_infer_objects(using_copy_on_write): diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 6a3c49c826536..a342b2bc794f4 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -274,14 +274,14 @@ def test_from_dtype(self, data, request): def test_from_sequence_pa_array(self, data): # https://github.com/pandas-dev/pandas/pull/47034#discussion_r955500784 - # data._data = pa.ChunkedArray - result = type(data)._from_sequence(data._data) + # data._pa_array = pa.ChunkedArray + result = type(data)._from_sequence(data._pa_array) tm.assert_extension_array_equal(result, data) - assert isinstance(result._data, pa.ChunkedArray) + assert isinstance(result._pa_array, pa.ChunkedArray) - result = type(data)._from_sequence(data._data.combine_chunks()) + result = type(data)._from_sequence(data._pa_array.combine_chunks()) tm.assert_extension_array_equal(result, data) - assert isinstance(result._data, pa.ChunkedArray) + assert isinstance(result._pa_array, pa.ChunkedArray) def test_from_sequence_pa_array_notimplemented(self, request): with pytest.raises(NotImplementedError, match="Converting strings to"): @@ -317,7 +317,7 @@ def test_from_sequence_of_strings_pa_array(self, data, request): ), ) ) - pa_array = data._data.cast(pa.string()) + pa_array = data._pa_array.cast(pa.string()) result = type(data)._from_sequence_of_strings(pa_array, dtype=data.dtype) tm.assert_extension_array_equal(result, data) @@ -1456,7 +1456,7 @@ def 
test_quantile(data, interpolation, quantile, request): or (pa.types.is_decimal(pa_dtype) and not pa_version_under7p0) ): pass - elif pa.types.is_temporal(data._data.type): + elif pa.types.is_temporal(data._pa_array.type): pass else: request.node.add_marker( @@ -1619,7 +1619,7 @@ def test_pickle_roundtrip(data): def test_astype_from_non_pyarrow(data): # GH49795 - pd_array = data._data.to_pandas().array + pd_array = data._pa_array.to_pandas().array result = pd_array.astype(data.dtype) assert not isinstance(pd_array.dtype, ArrowDtype) assert isinstance(result.dtype, ArrowDtype) @@ -1638,11 +1638,11 @@ def test_to_numpy_with_defaults(data): # GH49973 result = data.to_numpy() - pa_type = data._data.type + pa_type = data._pa_array.type if pa.types.is_duration(pa_type) or pa.types.is_timestamp(pa_type): expected = np.array(list(data)) else: - expected = np.array(data._data) + expected = np.array(data._pa_array) if data._hasna: expected = expected.astype(object) @@ -1668,7 +1668,7 @@ def test_setitem_null_slice(data): result = orig.copy() result[:] = data[0] expected = ArrowExtensionArray( - pa.array([data[0]] * len(data), type=data._data.type) + pa.array([data[0]] * len(data), type=data._pa_array.type) ) tm.assert_extension_array_equal(result, expected) @@ -1685,7 +1685,7 @@ def test_setitem_null_slice(data): def test_setitem_invalid_dtype(data): # GH50248 - pa_type = data._data.type + pa_type = data._pa_array.type if pa.types.is_string(pa_type) or pa.types.is_binary(pa_type): fill_value = 123 err = TypeError diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 11ac6a151324a..cc8bd91ca3cb4 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -35,7 +35,7 @@ def split_array(arr): def _split_array(arr): import pyarrow as pa - arrow_array = arr._data + arrow_array = arr._pa_array split = len(arrow_array) // 2 arrow_array = pa.chunked_array( [*arrow_array[:split].chunks, 
*arrow_array[split:].chunks] diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 66e55fe86d964..ab40d6f26d7fa 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -578,7 +578,7 @@ def test_use_nullable_dtypes(self, read_ext, dtype_backend, option): ) # pyarrow by default infers timestamp resolution as us, not ns expected["i"] = ArrowExtensionArray( - expected["i"].array._data.cast(pa.timestamp(unit="us")) + expected["i"].array._pa_array.cast(pa.timestamp(unit="us")) ) # pyarrow supports a null type, so don't have to default to Int64 expected["j"] = ArrowExtensionArray(pa.array([None, None]))