Skip to content

[ArrowStringArray] API: StringArray -> ObjectStringArray #40962

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions pandas/arrays/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
IntervalArray,
PandasArray,
PeriodArray,
PythonStringArray,
SparseArray,
StringArray,
TimedeltaArray,
)

Expand All @@ -27,6 +27,6 @@
"PandasArray",
"PeriodArray",
"SparseArray",
"StringArray",
"PythonStringArray",
"TimedeltaArray",
]
4 changes: 2 additions & 2 deletions pandas/core/arrays/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
period_array,
)
from pandas.core.arrays.sparse import SparseArray
from pandas.core.arrays.string_ import StringArray
from pandas.core.arrays.string_ import PythonStringArray
from pandas.core.arrays.timedeltas import TimedeltaArray

__all__ = [
Expand All @@ -34,6 +34,6 @@
"PeriodArray",
"period_array",
"SparseArray",
"StringArray",
"PythonStringArray",
"TimedeltaArray",
]
64 changes: 39 additions & 25 deletions pandas/core/arrays/string_.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
from __future__ import annotations

from typing import TYPE_CHECKING
from typing import (
TYPE_CHECKING,
TypeVar,
)

import numpy as np

Expand Down Expand Up @@ -37,6 +40,7 @@
IntegerArray,
PandasArray,
)
from pandas.core.arrays.base import ExtensionArray
from pandas.core.arrays.floating import FloatingDtype
from pandas.core.arrays.integer import _IntegerDtype
from pandas.core.construction import extract_array
Expand Down Expand Up @@ -86,24 +90,24 @@ def type(self) -> type[str]:
return str

@classmethod
def construct_array_type(cls) -> type_t[StringArray]:
def construct_array_type(cls) -> type_t[PythonStringArray]:
"""
Return the array type associated with this dtype.

Returns
-------
type
"""
return StringArray
return PythonStringArray

def __repr__(self) -> str:
return "StringDtype"

def __from_arrow__(
self, array: pyarrow.Array | pyarrow.ChunkedArray
) -> StringArray:
) -> PythonStringArray:
"""
Construct StringArray from pyarrow Array/ChunkedArray.
Construct PythonStringArray from pyarrow Array/ChunkedArray.
"""
import pyarrow

Expand All @@ -116,24 +120,31 @@ def __from_arrow__(
results = []
for arr in chunks:
# using _from_sequence to ensure None is converted to NA
str_arr = StringArray._from_sequence(np.array(arr))
str_arr = PythonStringArray._from_sequence(np.array(arr))
results.append(str_arr)

if results:
return StringArray._concat_same_type(results)
return PythonStringArray._concat_same_type(results)
else:
return StringArray(np.array([], dtype="object"))
return PythonStringArray(np.array([], dtype="object"))


StringArrayT = TypeVar("StringArrayT", bound="StringArray")


class StringArray(ExtensionArray):
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will rename this to `StringArrayBase` in the next commit.

pass

class StringArray(PandasArray):

class PythonStringArray(StringArray, PandasArray):
"""
Extension array for string data.

.. versionadded:: 1.0.0

.. warning::

StringArray is considered experimental. The implementation and
PythonStringArray is considered experimental. The implementation and
parts of the API may change without warning.

Parameters
Expand All @@ -147,7 +158,7 @@ class StringArray(PandasArray):
where the elements are Python strings or :attr:`pandas.NA`.
This may change without warning in the future. Use
:meth:`pandas.array` with ``dtype="string"`` for a stable way of
creating a `StringArray` from any sequence.
creating a `PythonStringArray` from any sequence.

copy : bool, default False
Whether to copy the array of data.
Expand All @@ -163,23 +174,23 @@ class StringArray(PandasArray):
See Also
--------
array
The recommended function for creating a StringArray.
The recommended function for creating a PythonStringArray.
Series.str
The string methods are available on Series backed by
a StringArray.
a PythonStringArray.

Notes
-----
StringArray returns a BooleanArray for comparison methods.
PythonStringArray returns a BooleanArray for comparison methods.

Examples
--------
>>> pd.array(['This is', 'some text', None, 'data.'], dtype="string")
<StringArray>
<PythonStringArray>
['This is', 'some text', <NA>, 'data.']
Length: 4, dtype: string

Unlike arrays instantiated with ``dtype="object"``, ``StringArray``
Unlike arrays instantiated with ``dtype="object"``, ``PythonStringArray``
will convert the values to strings.

>>> pd.array(['1', 1], dtype="object")
Expand All @@ -191,9 +202,10 @@ class StringArray(PandasArray):
['1', '1']
Length: 2, dtype: string

However, instantiating StringArrays directly with non-strings will raise an error.
However, instantiating PythonStringArrays directly with non-strings will raise an
error.

For comparison methods, `StringArray` returns a :class:`pandas.BooleanArray`:
For comparison methods, `PythonStringArray` returns a :class:`pandas.BooleanArray`:

>>> pd.array(["a", None, "c"], dtype="string") == "a"
<BooleanArray>
Expand All @@ -217,10 +229,12 @@ def __init__(self, values, copy=False):
def _validate(self):
"""Validate that we only store NA or strings."""
if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True):
raise ValueError("StringArray requires a sequence of strings or pandas.NA")
raise ValueError(
"PythonStringArray requires a sequence of strings or pandas.NA"
)
if self._ndarray.dtype != "object":
raise ValueError(
"StringArray requires a sequence of strings or pandas.NA. Got "
"PythonStringArray requires a sequence of strings or pandas.NA. Got "
f"'{self._ndarray.dtype}' dtype instead."
)

Expand Down Expand Up @@ -258,7 +272,7 @@ def _from_sequence_of_strings(
return cls._from_sequence(strings, dtype=dtype, copy=copy)

@classmethod
def _empty(cls, shape, dtype) -> StringArray:
def _empty(cls, shape, dtype) -> PythonStringArray:
values = np.empty(shape, dtype=object)
values[:] = libmissing.NA
return cls(values).astype(dtype, copy=False)
Expand Down Expand Up @@ -300,7 +314,7 @@ def __setitem__(self, key, value):
value = StringDtype.na_value
elif not isinstance(value, str):
raise ValueError(
f"Cannot set non-string value '{value}' into a StringArray."
f"Cannot set non-string value '{value}' into a PythonStringArray."
)
else:
if not is_array_like(value):
Expand Down Expand Up @@ -377,7 +391,7 @@ def memory_usage(self, deep: bool = False) -> int:
def _cmp_method(self, other, op):
from pandas.arrays import BooleanArray

if isinstance(other, StringArray):
if isinstance(other, PythonStringArray):
other = other._ndarray

mask = isna(self) | isna(other)
Expand All @@ -397,7 +411,7 @@ def _cmp_method(self, other, op):
result = np.empty_like(self._ndarray, dtype="object")
result[mask] = StringDtype.na_value
result[valid] = op(self._ndarray[valid], other)
return StringArray(result)
return PythonStringArray(result)
else:
# logical
result = np.zeros(len(self._ndarray), dtype="bool")
Expand Down Expand Up @@ -457,7 +471,7 @@ def _str_map(
result = lib.map_infer_mask(
arr, f, mask.view("uint8"), convert=False, na_value=na_value
)
return StringArray(result)
return PythonStringArray(result)
else:
# This is when the result type is object. We reach this when
# -> We know the result type is truly object (e.g. .encode returns bytes
Expand Down
7 changes: 5 additions & 2 deletions pandas/core/arrays/string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,10 @@
from pandas.core.arrays.boolean import BooleanDtype
from pandas.core.arrays.integer import Int64Dtype
from pandas.core.arrays.numeric import NumericDtype
from pandas.core.arrays.string_ import StringDtype
from pandas.core.arrays.string_ import (
StringArray,
StringDtype,
)
from pandas.core.indexers import (
check_array_indexer,
validate_indices,
Expand Down Expand Up @@ -178,7 +181,7 @@ def __eq__(self, other) -> bool:
# fallback for the ones that pyarrow doesn't yet support


class ArrowStringArray(OpsMixin, ExtensionArray, ObjectStringArrayMixin):
class ArrowStringArray(OpsMixin, StringArray, ObjectStringArrayMixin):
"""
Extension array for string data in a ``pyarrow.ChunkedArray``.

Expand Down
8 changes: 4 additions & 4 deletions pandas/core/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ def array(
:class:`datetime.timedelta` :class:`pandas.arrays.TimedeltaArray`
:class:`int` :class:`pandas.arrays.IntegerArray`
:class:`float` :class:`pandas.arrays.FloatingArray`
:class:`str` :class:`pandas.arrays.StringArray`
:class:`str` :class:`pandas.arrays.PythonStringArray`
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is overlap here with the dtype change. We could add the docstring to the base class, so there would be no need to reference the two array types.

:class:`bool` :class:`pandas.arrays.BooleanArray`
============================== =====================================

Expand Down Expand Up @@ -233,7 +233,7 @@ def array(
Length: 2, dtype: Float64

>>> pd.array(["a", None, "c"])
<StringArray>
<PythonStringArray>
['a', <NA>, 'c']
Length: 3, dtype: string

Expand Down Expand Up @@ -290,7 +290,7 @@ def array(
IntervalArray,
PandasArray,
PeriodArray,
StringArray,
PythonStringArray,
TimedeltaArray,
)

Expand Down Expand Up @@ -333,7 +333,7 @@ def array(
return TimedeltaArray._from_sequence(data, copy=copy)

elif inferred_dtype == "string":
return StringArray._from_sequence(data, copy=copy)
return PythonStringArray._from_sequence(data, copy=copy)

elif inferred_dtype == "integer":
return IntegerArray._from_sequence(data, copy=copy)
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/strings/object_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ def scalar_rep(x):

return self._str_map(scalar_rep, dtype=str)
else:
from pandas.core.arrays.string_ import StringArray
from pandas.core.arrays.string_ import PythonStringArray
from pandas.core.arrays.string_arrow import ArrowStringArray

def rep(x, r):
Expand All @@ -186,7 +186,7 @@ def rep(x, r):

repeats = np.asarray(repeats, dtype=object)
result = libops.vec_binop(np.asarray(self), repeats, rep)
if isinstance(self, (StringArray, ArrowStringArray)):
if isinstance(self, (PythonStringArray, ArrowStringArray)):
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This could now use the base class; I will change it in the next commit.

# Not going through map, so we have to do this here.
result = type(self)._from_sequence(result)
return result
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/arrays/categorical/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -725,7 +725,7 @@ def test_interval(self):

def test_categorical_extension_array_nullable(self, nulls_fixture):
# GH:
arr = pd.arrays.StringArray._from_sequence([nulls_fixture] * 2)
arr = pd.arrays.PythonStringArray._from_sequence([nulls_fixture] * 2)
result = Categorical(arr)
expected = Categorical(Series([pd.NA, pd.NA], dtype="object"))
tm.assert_categorical_equal(result, expected)
Expand Down
16 changes: 8 additions & 8 deletions pandas/tests/arrays/string_/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def dtype_object(dtype):

@pytest.fixture(
params=[
pd.arrays.StringArray,
pd.arrays.PythonStringArray,
pytest.param(ArrowStringArray, marks=skip_if_no_pyarrow),
]
)
Expand All @@ -55,7 +55,7 @@ def test_repr(dtype):
expected = f"0 a\n1 <NA>\n2 b\nName: A, dtype: {dtype}"
assert repr(df.A) == expected

arr_name = "ArrowStringArray" if dtype == "arrow_string" else "StringArray"
arr_name = "ArrowStringArray" if dtype == "arrow_string" else "PythonStringArray"
expected = f"<{arr_name}>\n['a', <NA>, 'b']\nLength: 3, dtype: {dtype}"
assert repr(df.A.array) == expected

Expand All @@ -69,14 +69,14 @@ def test_none_to_nan(cls):
def test_setitem_validates(cls):
arr = cls._from_sequence(["a", "b"])

if cls is pd.arrays.StringArray:
msg = "Cannot set non-string value '10' into a StringArray."
if cls is pd.arrays.PythonStringArray:
msg = "Cannot set non-string value '10' into a PythonStringArray."
else:
msg = "Scalar must be NA or str"
with pytest.raises(ValueError, match=msg):
arr[0] = 10

if cls is pd.arrays.StringArray:
if cls is pd.arrays.PythonStringArray:
msg = "Must provide strings."
else:
msg = "Scalar must be NA or str"
Expand Down Expand Up @@ -280,8 +280,8 @@ def test_comparison_methods_array(all_compare_operators, dtype, request):


def test_constructor_raises(cls):
if cls is pd.arrays.StringArray:
msg = "StringArray requires a sequence of strings or pandas.NA"
if cls is pd.arrays.PythonStringArray:
msg = "PythonStringArray requires a sequence of strings or pandas.NA"
else:
msg = "Unsupported type '<class 'numpy.ndarray'>' for ArrowStringArray"

Expand Down Expand Up @@ -431,7 +431,7 @@ def test_fillna_args(dtype, request):
expected = pd.array(["a", "b"], dtype=dtype)
tm.assert_extension_array_equal(res, expected)

msg = "Cannot set non-string value '1' into a StringArray."
msg = "Cannot set non-string value '1' into a PythonStringArray."
with pytest.raises(ValueError, match=msg):
arr.fillna(value=1)

Expand Down
Loading