Skip to content

Commit 2107dad

Browse files
simonjayhawkins, xhochy, TomAugspurger, and jorisvandenbossche
authored and committed
[ArrowStringArray] API: StringDtype parameterized by storage (python or pyarrow) (pandas-dev#39908)
Co-authored-by: Uwe L. Korn <[email protected]> Co-authored-by: Tom Augspurger <[email protected]> Co-authored-by: Joris Van den Bossche <[email protected]>
1 parent 64a72c3 commit 2107dad

29 files changed

+478
-294
lines changed

asv_bench/benchmarks/algorithms.py

+10-13
Original file line number | Diff line number | Diff line change
@@ -23,41 +23,38 @@ class Factorize:
2323
"int",
2424
"uint",
2525
"float",
26-
"string",
26+
"object",
2727
"datetime64[ns]",
2828
"datetime64[ns, tz]",
2929
"Int64",
3030
"boolean",
31-
"string_arrow",
31+
"string[pyarrow]",
3232
],
3333
]
3434
param_names = ["unique", "sort", "dtype"]
3535

3636
def setup(self, unique, sort, dtype):
3737
N = 10 ** 5
3838
string_index = tm.makeStringIndex(N)
39-
try:
40-
from pandas.core.arrays.string_arrow import ArrowStringDtype
41-
42-
string_arrow = pd.array(string_index, dtype=ArrowStringDtype())
43-
except ImportError:
44-
string_arrow = None
45-
46-
if dtype == "string_arrow" and not string_arrow:
47-
raise NotImplementedError
39+
string_arrow = None
40+
if dtype == "string[pyarrow]":
41+
try:
42+
string_arrow = pd.array(string_index, dtype="string[pyarrow]")
43+
except ImportError:
44+
raise NotImplementedError
4845

4946
data = {
5047
"int": pd.Int64Index(np.arange(N)),
5148
"uint": pd.UInt64Index(np.arange(N)),
5249
"float": pd.Float64Index(np.random.randn(N)),
53-
"string": string_index,
50+
"object": string_index,
5451
"datetime64[ns]": pd.date_range("2011-01-01", freq="H", periods=N),
5552
"datetime64[ns, tz]": pd.date_range(
5653
"2011-01-01", freq="H", periods=N, tz="Asia/Tokyo"
5754
),
5855
"Int64": pd.array(np.arange(N), dtype="Int64"),
5956
"boolean": pd.array(np.random.randint(0, 2, N), dtype="boolean"),
60-
"string_arrow": string_arrow,
57+
"string[pyarrow]": string_arrow,
6158
}[dtype]
6259
if not unique:
6360
data = data.repeat(5)

asv_bench/benchmarks/algos/isin.py

+3-5
Original file line number | Diff line number | Diff line change
@@ -25,8 +25,8 @@ class IsIn:
2525
"category[object]",
2626
"category[int]",
2727
"str",
28-
"string",
29-
"arrow_string",
28+
"string[python]",
29+
"string[pyarrow]",
3030
]
3131
param_names = ["dtype"]
3232

@@ -62,9 +62,7 @@ def setup(self, dtype):
6262
self.values = np.random.choice(arr, sample_size)
6363
self.series = Series(arr).astype("category")
6464

65-
elif dtype in ["str", "string", "arrow_string"]:
66-
from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401
67-
65+
elif dtype in ["str", "string[python]", "string[pyarrow]"]:
6866
try:
6967
self.series = Series(tm.makeStringIndex(N), dtype=dtype)
7068
except ImportError:

asv_bench/benchmarks/strings.py

+1-3
Original file line number | Diff line number | Diff line change
@@ -12,12 +12,10 @@
1212

1313

1414
class Dtypes:
15-
params = ["str", "string", "arrow_string"]
15+
params = ["str", "string[python]", "string[pyarrow]"]
1616
param_names = ["dtype"]
1717

1818
def setup(self, dtype):
19-
from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401
20-
2119
try:
2220
self.s = Series(tm.makeStringIndex(10 ** 5), dtype=dtype)
2321
except ImportError:

doc/source/reference/arrays.rst

+1
Original file line number | Diff line number | Diff line change
@@ -480,6 +480,7 @@ we recommend using :class:`StringDtype` (with the alias ``"string"``).
480480
:template: autosummary/class_without_autosummary.rst
481481

482482
arrays.StringArray
483+
arrays.ArrowStringArray
483484

484485
.. autosummary::
485486
:toctree: api/

doc/source/whatsnew/v1.3.0.rst

+52
Original file line number | Diff line number | Diff line change
@@ -171,6 +171,58 @@ a copy will no longer be made (:issue:`32960`)
171171
The default behavior when not passing ``copy`` will remain unchanged, i.e.
172172
a copy will be made.
173173

174+
.. _whatsnew_130.arrow_string:
175+
176+
PyArrow backed string data type
177+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
178+
179+
We've enhanced the :class:`StringDtype`, an extension type dedicated to string data.
180+
(:issue:`39908`)
181+
182+
It is now possible to specify a ``storage`` keyword option to :class:`StringDtype`. Use
183+
pandas options or specify the dtype using ``dtype='string[pyarrow]'`` to allow the
184+
StringArray to be backed by a PyArrow array instead of a NumPy array of Python objects.
185+
186+
The PyArrow backed StringArray requires pyarrow 1.0.0 or greater to be installed.
187+
188+
.. warning::
189+
190+
``string[pyarrow]`` is currently considered experimental. The implementation
191+
and parts of the API may change without warning.
192+
193+
.. ipython:: python
194+
195+
pd.Series(['abc', None, 'def'], dtype=pd.StringDtype(storage="pyarrow"))
196+
197+
You can use the alias ``"string[pyarrow]"`` as well.
198+
199+
.. ipython:: python
200+
201+
s = pd.Series(['abc', None, 'def'], dtype="string[pyarrow]")
202+
s
203+
204+
You can also create a PyArrow backed string array using pandas options.
205+
206+
.. ipython:: python
207+
208+
with pd.option_context("string_storage", "pyarrow"):
209+
s = pd.Series(['abc', None, 'def'], dtype="string")
210+
s
211+
212+
The usual string accessor methods work. Where appropriate, the return type of the Series
213+
or columns of a DataFrame will also have string dtype.
214+
215+
.. ipython:: python
216+
217+
s.str.upper()
218+
s.str.split('b', expand=True).dtypes
219+
220+
String accessor methods returning integers will return a value with :class:`Int64Dtype`.
221+
222+
.. ipython:: python
223+
224+
s.str.count("a")
225+
174226
Centered Datetime-Like Rolling Windows
175227
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
176228

pandas/_testing/asserters.py

+11-2
Original file line number | Diff line number | Diff line change
@@ -48,6 +48,7 @@
4848
TimedeltaArray,
4949
)
5050
from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin
51+
from pandas.core.arrays.string_ import StringDtype
5152

5253
from pandas.io.formats.printing import pprint_thing
5354

@@ -638,12 +639,20 @@ def raise_assert_detail(obj, message, left, right, diff=None, index_values=None)
638639

639640
if isinstance(left, np.ndarray):
640641
left = pprint_thing(left)
641-
elif is_categorical_dtype(left) or isinstance(left, PandasDtype):
642+
elif (
643+
is_categorical_dtype(left)
644+
or isinstance(left, PandasDtype)
645+
or isinstance(left, StringDtype)
646+
):
642647
left = repr(left)
643648

644649
if isinstance(right, np.ndarray):
645650
right = pprint_thing(right)
646-
elif is_categorical_dtype(right) or isinstance(right, PandasDtype):
651+
elif (
652+
is_categorical_dtype(right)
653+
or isinstance(right, PandasDtype)
654+
or isinstance(right, StringDtype)
655+
):
647656
right = repr(right)
648657

649658
msg += f"""

pandas/arrays/__init__.py

+2
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44
See :ref:`extending.extension-types` for more.
55
"""
66
from pandas.core.arrays import (
7+
ArrowStringArray,
78
BooleanArray,
89
Categorical,
910
DatetimeArray,
@@ -18,6 +19,7 @@
1819
)
1920

2021
__all__ = [
22+
"ArrowStringArray",
2123
"BooleanArray",
2224
"Categorical",
2325
"DatetimeArray",

pandas/conftest.py

+27-11
Original file line number | Diff line number | Diff line change
@@ -1120,24 +1120,42 @@ def string_dtype(request):
11201120

11211121
@pytest.fixture(
11221122
params=[
1123-
"string",
1123+
"string[python]",
11241124
pytest.param(
1125-
"arrow_string", marks=td.skip_if_no("pyarrow", min_version="1.0.0")
1125+
"string[pyarrow]", marks=td.skip_if_no("pyarrow", min_version="1.0.0")
11261126
),
11271127
]
11281128
)
11291129
def nullable_string_dtype(request):
11301130
"""
11311131
Parametrized fixture for string dtypes.
11321132
1133-
* 'string'
1134-
* 'arrow_string'
1133+
* 'string[python]'
1134+
* 'string[pyarrow]'
1135+
"""
1136+
return request.param
1137+
1138+
1139+
@pytest.fixture(
1140+
params=[
1141+
"python",
1142+
pytest.param("pyarrow", marks=td.skip_if_no("pyarrow", min_version="1.0.0")),
1143+
]
1144+
)
1145+
def string_storage(request):
11351146
"""
1136-
from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401
1147+
Parametrized fixture for pd.options.mode.string_storage.
11371148
1149+
* 'python'
1150+
* 'pyarrow'
1151+
"""
11381152
return request.param
11391153

11401154

1155+
# Alias so we can test with cartesian product of string_storage
1156+
string_storage2 = string_storage
1157+
1158+
11411159
@pytest.fixture(params=tm.BYTES_DTYPES)
11421160
def bytes_dtype(request):
11431161
"""
@@ -1163,21 +1181,19 @@ def object_dtype(request):
11631181
@pytest.fixture(
11641182
params=[
11651183
"object",
1166-
"string",
1184+
"string[python]",
11671185
pytest.param(
1168-
"arrow_string", marks=td.skip_if_no("pyarrow", min_version="1.0.0")
1186+
"string[pyarrow]", marks=td.skip_if_no("pyarrow", min_version="1.0.0")
11691187
),
11701188
]
11711189
)
11721190
def any_string_dtype(request):
11731191
"""
11741192
Parametrized fixture for string dtypes.
11751193
* 'object'
1176-
* 'string'
1177-
* 'arrow_string'
1194+
* 'string[python]'
1195+
* 'string[pyarrow]'
11781196
"""
1179-
from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401
1180-
11811197
return request.param
11821198

11831199

pandas/core/arrays/__init__.py

+2
Original file line number | Diff line number | Diff line change
@@ -17,12 +17,14 @@
1717
)
1818
from pandas.core.arrays.sparse import SparseArray
1919
from pandas.core.arrays.string_ import StringArray
20+
from pandas.core.arrays.string_arrow import ArrowStringArray
2021
from pandas.core.arrays.timedeltas import TimedeltaArray
2122

2223
__all__ = [
2324
"ExtensionArray",
2425
"ExtensionOpsMixin",
2526
"ExtensionScalarOpsMixin",
27+
"ArrowStringArray",
2628
"BaseMaskedArray",
2729
"BooleanArray",
2830
"Categorical",

0 commit comments

Comments
 (0)