Skip to content

Commit 9cc7a2f

Browse files
Backport PR #54720 on branch 2.1.x (Infer strings as pyarrow_numpy backed strings) (#54735)
Backport PR #54720: Infer strings as pyarrow_numpy backed strings
Co-authored-by: Patrick Hoefler <email address redacted>
1 parent c5adf1a commit 9cc7a2f

File tree

17 files changed

+47
-68
lines changed

17 files changed

+47
-68
lines changed

doc/source/whatsnew/v2.1.0.rst

+4-2
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ PyArrow will become a required dependency with pandas 3.0
2121

2222
`PyArrow <https://arrow.apache.org/docs/python/index.html>`_ will become a required
2323
dependency of pandas starting with pandas 3.0. This decision was made based on
24-
`PDEP 12 <https://pandas.pydata.org/pdeps/0010-required-pyarrow-dependency.html>`_.
24+
`PDEP 10 <https://pandas.pydata.org/pdeps/0010-required-pyarrow-dependency.html>`_.
2525

2626
This will enable more changes that are hugely beneficial to pandas users, including
2727
but not limited to:
@@ -41,7 +41,9 @@ Avoid NumPy object dtype for strings by default
4141

4242
Previously, all strings were stored in columns with NumPy object dtype.
4343
This release introduces an option ``future.infer_string`` that infers all
44-
strings as PyArrow backed strings with dtype ``pd.ArrowDtype(pa.string())`` instead.
44+
strings as PyArrow backed strings with dtype ``"string[pyarrow_numpy]"`` instead.
45+
This is a new string dtype implementation that follows NumPy semantics in comparison
46+
operations and will return ``np.nan`` as the missing value indicator.
4547
This option only works if PyArrow is installed. PyArrow backed strings have a
4648
significantly reduced memory footprint and provide a big performance improvement
4749
compared to NumPy object (:issue:`54430`).

pandas/_libs/lib.pyx

+2-4
Original file line numberDiff line numberDiff line change
@@ -2682,11 +2682,9 @@ def maybe_convert_objects(ndarray[object] objects,
26822682

26832683
elif seen.str_:
26842684
if using_pyarrow_string_dtype() and is_string_array(objects, skipna=True):
2685-
import pyarrow as pa
2685+
from pandas.core.arrays.string_ import StringDtype
26862686

2687-
from pandas.core.dtypes.dtypes import ArrowDtype
2688-
2689-
dtype = ArrowDtype(pa.string())
2687+
dtype = StringDtype(storage="pyarrow_numpy")
26902688
return dtype.construct_array_type()._from_sequence(objects, dtype=dtype)
26912689

26922690
seen.object_ = True

pandas/core/construction.py

+3-6
Original file line numberDiff line numberDiff line change
@@ -51,10 +51,7 @@
5151
is_object_dtype,
5252
pandas_dtype,
5353
)
54-
from pandas.core.dtypes.dtypes import (
55-
ArrowDtype,
56-
NumpyEADtype,
57-
)
54+
from pandas.core.dtypes.dtypes import NumpyEADtype
5855
from pandas.core.dtypes.generic import (
5956
ABCDataFrame,
6057
ABCExtensionArray,
@@ -595,9 +592,9 @@ def sanitize_array(
595592
if data.dtype == object:
596593
subarr = maybe_infer_to_datetimelike(data)
597594
elif data.dtype.kind == "U" and using_pyarrow_string_dtype():
598-
import pyarrow as pa
595+
from pandas.core.arrays.string_ import StringDtype
599596

600-
dtype = ArrowDtype(pa.string())
597+
dtype = StringDtype(storage="pyarrow_numpy")
601598
subarr = dtype.construct_array_type()._from_sequence(data, dtype=dtype)
602599

603600
if subarr is data and copy:

pandas/core/dtypes/cast.py

+2-3
Original file line numberDiff line numberDiff line change
@@ -799,10 +799,9 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]:
799799

800800
dtype = _dtype_obj
801801
if using_pyarrow_string_dtype():
802-
import pyarrow as pa
802+
from pandas.core.arrays.string_ import StringDtype
803803

804-
pa_dtype = pa.string()
805-
dtype = ArrowDtype(pa_dtype)
804+
dtype = StringDtype(storage="pyarrow_numpy")
806805

807806
elif isinstance(val, (np.datetime64, dt.datetime)):
808807
try:

pandas/core/internals/construction.py

+2-6
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,7 @@
3232
is_named_tuple,
3333
is_object_dtype,
3434
)
35-
from pandas.core.dtypes.dtypes import (
36-
ArrowDtype,
37-
ExtensionDtype,
38-
)
35+
from pandas.core.dtypes.dtypes import ExtensionDtype
3936
from pandas.core.dtypes.generic import (
4037
ABCDataFrame,
4138
ABCSeries,
@@ -379,10 +376,9 @@ def ndarray_to_mgr(
379376
nb = new_block_2d(values, placement=bp, refs=refs)
380377
block_values = [nb]
381378
elif dtype is None and values.dtype.kind == "U" and using_pyarrow_string_dtype():
382-
import pyarrow as pa
379+
dtype = StringDtype(storage="pyarrow_numpy")
383380

384381
obj_columns = list(values)
385-
dtype = ArrowDtype(pa.string())
386382
block_values = [
387383
new_block(
388384
dtype.construct_array_type()._from_sequence(data, dtype=dtype),

pandas/io/_util.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -28,4 +28,4 @@ def _arrow_dtype_mapping() -> dict:
2828
def arrow_string_types_mapper() -> Callable:
2929
pa = import_optional_dependency("pyarrow")
3030

31-
return {pa.string(): pd.ArrowDtype(pa.string())}.get
31+
return {pa.string(): pd.StringDtype(storage="pyarrow_numpy")}.get

pandas/io/pytables.py

+3-10
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,6 @@
6868
)
6969
from pandas.core.dtypes.missing import array_equivalent
7070

71-
import pandas as pd
7271
from pandas import (
7372
DataFrame,
7473
DatetimeIndex,
@@ -3224,9 +3223,7 @@ def read(
32243223
values = self.read_array("values", start=start, stop=stop)
32253224
result = Series(values, index=index, name=self.name, copy=False)
32263225
if using_pyarrow_string_dtype() and is_string_array(values, skipna=True):
3227-
import pyarrow as pa
3228-
3229-
result = result.astype(pd.ArrowDtype(pa.string()))
3226+
result = result.astype("string[pyarrow_numpy]")
32303227
return result
32313228

32323229
# error: Signature of "write" incompatible with supertype "Fixed"
@@ -3296,9 +3293,7 @@ def read(
32963293
columns = items[items.get_indexer(blk_items)]
32973294
df = DataFrame(values.T, columns=columns, index=axes[1], copy=False)
32983295
if using_pyarrow_string_dtype() and is_string_array(values, skipna=True):
3299-
import pyarrow as pa
3300-
3301-
df = df.astype(pd.ArrowDtype(pa.string()))
3296+
df = df.astype("string[pyarrow_numpy]")
33023297
dfs.append(df)
33033298

33043299
if len(dfs) > 0:
@@ -4686,9 +4681,7 @@ def read(
46864681
values, # type: ignore[arg-type]
46874682
skipna=True,
46884683
):
4689-
import pyarrow as pa
4690-
4691-
df = df.astype(pd.ArrowDtype(pa.string()))
4684+
df = df.astype("string[pyarrow_numpy]")
46924685
frames.append(df)
46934686

46944687
if len(frames) == 1:

pandas/tests/frame/test_constructors.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -2685,8 +2685,8 @@ def test_construct_with_strings_and_none(self):
26852685

26862686
def test_frame_string_inference(self):
26872687
# GH#54430
2688-
pa = pytest.importorskip("pyarrow")
2689-
dtype = pd.ArrowDtype(pa.string())
2688+
pytest.importorskip("pyarrow")
2689+
dtype = "string[pyarrow_numpy]"
26902690
expected = DataFrame(
26912691
{"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype)
26922692
)
@@ -2720,8 +2720,8 @@ def test_frame_string_inference(self):
27202720

27212721
def test_frame_string_inference_array_string_dtype(self):
27222722
# GH#54496
2723-
pa = pytest.importorskip("pyarrow")
2724-
dtype = pd.ArrowDtype(pa.string())
2723+
pytest.importorskip("pyarrow")
2724+
dtype = "string[pyarrow_numpy]"
27252725
expected = DataFrame(
27262726
{"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype)
27272727
)

pandas/tests/indexes/base_class/test_constructors.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,8 @@ def test_construct_empty_tuples(self, tuple_list):
4646

4747
def test_index_string_inference(self):
4848
# GH#54430
49-
pa = pytest.importorskip("pyarrow")
50-
dtype = pd.ArrowDtype(pa.string())
49+
pytest.importorskip("pyarrow")
50+
dtype = "string[pyarrow_numpy]"
5151
expected = Index(["a", "b"], dtype=dtype)
5252
with pd.option_context("future.infer_string", True):
5353
ser = Index(["a", "b"])

pandas/tests/io/json/test_pandas.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -2099,7 +2099,7 @@ def test_pyarrow_engine_lines_false():
20992099

21002100

21012101
def test_json_roundtrip_string_inference(orient):
2102-
pa = pytest.importorskip("pyarrow")
2102+
pytest.importorskip("pyarrow")
21032103
df = DataFrame(
21042104
[["a", "b"], ["c", "d"]], index=["row 1", "row 2"], columns=["col 1", "col 2"]
21052105
)
@@ -2108,8 +2108,8 @@ def test_json_roundtrip_string_inference(orient):
21082108
result = read_json(StringIO(out))
21092109
expected = DataFrame(
21102110
[["a", "b"], ["c", "d"]],
2111-
dtype=pd.ArrowDtype(pa.string()),
2112-
index=pd.Index(["row 1", "row 2"], dtype=pd.ArrowDtype(pa.string())),
2113-
columns=pd.Index(["col 1", "col 2"], dtype=pd.ArrowDtype(pa.string())),
2111+
dtype="string[pyarrow_numpy]",
2112+
index=pd.Index(["row 1", "row 2"], dtype="string[pyarrow_numpy]"),
2113+
columns=pd.Index(["col 1", "col 2"], dtype="string[pyarrow_numpy]"),
21142114
)
21152115
tm.assert_frame_equal(result, expected)

pandas/tests/io/parser/dtypes/test_dtypes_basic.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -542,8 +542,8 @@ def test_ea_int_avoid_overflow(all_parsers):
542542

543543
def test_string_inference(all_parsers):
544544
# GH#54430
545-
pa = pytest.importorskip("pyarrow")
546-
dtype = pd.ArrowDtype(pa.string())
545+
pytest.importorskip("pyarrow")
546+
dtype = "string[pyarrow_numpy]"
547547

548548
data = """a,b
549549
x,1

pandas/tests/io/pytables/test_read.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -392,15 +392,15 @@ def test_read_py2_hdf_file_in_py3(datapath):
392392

393393
def test_read_infer_string(tmp_path, setup_path):
394394
# GH#54431
395-
pa = pytest.importorskip("pyarrow")
395+
pytest.importorskip("pyarrow")
396396
df = DataFrame({"a": ["a", "b", None]})
397397
path = tmp_path / setup_path
398398
df.to_hdf(path, key="data", format="table")
399399
with pd.option_context("future.infer_string", True):
400400
result = read_hdf(path, key="data", mode="r")
401401
expected = DataFrame(
402402
{"a": ["a", "b", None]},
403-
dtype=pd.ArrowDtype(pa.string()),
404-
columns=Index(["a"], dtype=pd.ArrowDtype(pa.string())),
403+
dtype="string[pyarrow_numpy]",
404+
columns=Index(["a"], dtype="string[pyarrow_numpy]"),
405405
)
406406
tm.assert_frame_equal(result, expected)

pandas/tests/io/test_feather.py

+1-5
Original file line numberDiff line numberDiff line change
@@ -222,14 +222,10 @@ def test_invalid_dtype_backend(self):
222222

223223
def test_string_inference(self, tmp_path):
224224
# GH#54431
225-
import pyarrow as pa
226-
227225
path = tmp_path / "test_string_inference.p"
228226
df = pd.DataFrame(data={"a": ["x", "y"]})
229227
df.to_feather(path)
230228
with pd.option_context("future.infer_string", True):
231229
result = read_feather(path)
232-
expected = pd.DataFrame(
233-
data={"a": ["x", "y"]}, dtype=pd.ArrowDtype(pa.string())
234-
)
230+
expected = pd.DataFrame(data={"a": ["x", "y"]}, dtype="string[pyarrow_numpy]")
235231
tm.assert_frame_equal(result, expected)

pandas/tests/io/test_orc.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -426,7 +426,7 @@ def test_string_inference(tmp_path):
426426
result = read_orc(path)
427427
expected = pd.DataFrame(
428428
data={"a": ["x", "y"]},
429-
dtype=pd.ArrowDtype(pa.string()),
430-
columns=pd.Index(["a"], dtype=pd.ArrowDtype(pa.string())),
429+
dtype="string[pyarrow_numpy]",
430+
columns=pd.Index(["a"], dtype="string[pyarrow_numpy]"),
431431
)
432432
tm.assert_frame_equal(result, expected)

pandas/tests/io/test_parquet.py

+2-4
Original file line numberDiff line numberDiff line change
@@ -1099,17 +1099,15 @@ def test_df_attrs_persistence(self, tmp_path, pa):
10991099

11001100
def test_string_inference(self, tmp_path, pa):
11011101
# GH#54431
1102-
import pyarrow as pa
1103-
11041102
path = tmp_path / "test_string_inference.p"
11051103
df = pd.DataFrame(data={"a": ["x", "y"]}, index=["a", "b"])
11061104
df.to_parquet(path, engine="pyarrow")
11071105
with pd.option_context("future.infer_string", True):
11081106
result = read_parquet(path, engine="pyarrow")
11091107
expected = pd.DataFrame(
11101108
data={"a": ["x", "y"]},
1111-
dtype=pd.ArrowDtype(pa.string()),
1112-
index=pd.Index(["a", "b"], dtype=pd.ArrowDtype(pa.string())),
1109+
dtype="string[pyarrow_numpy]",
1110+
index=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"),
11131111
)
11141112
tm.assert_frame_equal(result, expected)
11151113

pandas/tests/io/test_sql.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -2946,15 +2946,15 @@ def test_read_sql_dtype_backend_table(self, string_storage, func):
29462946

29472947
def test_read_sql_string_inference(self):
29482948
# GH#54430
2949-
pa = pytest.importorskip("pyarrow")
2949+
pytest.importorskip("pyarrow")
29502950
table = "test"
29512951
df = DataFrame({"a": ["x", "y"]})
29522952
df.to_sql(table, con=self.conn, index=False, if_exists="replace")
29532953

29542954
with pd.option_context("future.infer_string", True):
29552955
result = read_sql_table(table, self.conn)
29562956

2957-
dtype = pd.ArrowDtype(pa.string())
2957+
dtype = "string[pyarrow_numpy]"
29582958
expected = DataFrame(
29592959
{"a": ["x", "y"]}, dtype=dtype, columns=Index(["a"], dtype=dtype)
29602960
)

pandas/tests/series/test_constructors.py

+8-8
Original file line numberDiff line numberDiff line change
@@ -2077,8 +2077,8 @@ def test_series_from_index_dtype_equal_does_not_copy(self):
20772077

20782078
def test_series_string_inference(self):
20792079
# GH#54430
2080-
pa = pytest.importorskip("pyarrow")
2081-
dtype = pd.ArrowDtype(pa.string())
2080+
pytest.importorskip("pyarrow")
2081+
dtype = "string[pyarrow_numpy]"
20822082
expected = Series(["a", "b"], dtype=dtype)
20832083
with pd.option_context("future.infer_string", True):
20842084
ser = Series(["a", "b"])
@@ -2092,25 +2092,25 @@ def test_series_string_inference(self):
20922092
@pytest.mark.parametrize("na_value", [None, np.nan, pd.NA])
20932093
def test_series_string_with_na_inference(self, na_value):
20942094
# GH#54430
2095-
pa = pytest.importorskip("pyarrow")
2096-
dtype = pd.ArrowDtype(pa.string())
2095+
pytest.importorskip("pyarrow")
2096+
dtype = "string[pyarrow_numpy]"
20972097
expected = Series(["a", na_value], dtype=dtype)
20982098
with pd.option_context("future.infer_string", True):
20992099
ser = Series(["a", na_value])
21002100
tm.assert_series_equal(ser, expected)
21012101

21022102
def test_series_string_inference_scalar(self):
21032103
# GH#54430
2104-
pa = pytest.importorskip("pyarrow")
2105-
expected = Series("a", index=[1], dtype=pd.ArrowDtype(pa.string()))
2104+
pytest.importorskip("pyarrow")
2105+
expected = Series("a", index=[1], dtype="string[pyarrow_numpy]")
21062106
with pd.option_context("future.infer_string", True):
21072107
ser = Series("a", index=[1])
21082108
tm.assert_series_equal(ser, expected)
21092109

21102110
def test_series_string_inference_array_string_dtype(self):
21112111
# GH#54496
2112-
pa = pytest.importorskip("pyarrow")
2113-
expected = Series(["a", "b"], dtype=pd.ArrowDtype(pa.string()))
2112+
pytest.importorskip("pyarrow")
2113+
expected = Series(["a", "b"], dtype="string[pyarrow_numpy]")
21142114
with pd.option_context("future.infer_string", True):
21152115
ser = Series(np.array(["a", "b"]))
21162116
tm.assert_series_equal(ser, expected)

0 commit comments

Comments
 (0)