From ebe0bd51c2939f10f10164eb169276537fa15c51 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 31 Jul 2023 20:02:14 -0700 Subject: [PATCH 1/7] ENH: allow opt-in to inferring pyarrow strings --- pandas/_libs/lib.pyx | 38 ++++++++++++++++++++++++++++++++++++++ pandas/core/config_init.py | 11 +++++++++++ pandas/core/dtypes/cast.py | 19 +++++++++++++++++++ 3 files changed, 68 insertions(+) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index c3fbd3ee4853e..183a111249710 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1299,6 +1299,7 @@ cdef class Seen: bint datetimetz_ # seen_datetimetz bint period_ # seen_period bint interval_ # seen_interval + bint str_ # seen_str def __cinit__(self, bint coerce_numeric=False): """ @@ -1325,6 +1326,7 @@ cdef class Seen: self.datetimetz_ = False self.period_ = False self.interval_ = False + self.str_ = False self.coerce_numeric = coerce_numeric cdef bint check_uint64_conflict(self) except -1: @@ -2615,6 +2617,13 @@ def maybe_convert_objects(ndarray[object] objects, else: seen.object_ = True break + elif isinstance(val, str): + if convert_non_numeric: + seen.str_ = True + break + else: + seen.object_ = True + break else: seen.object_ = True break @@ -2669,6 +2678,35 @@ def maybe_convert_objects(ndarray[object] objects, return pi._data seen.object_ = True + elif seen.str_: + if is_string_array(objects): + from pandas._config import get_option + opt = get_option("future.infer_string") + if opt is True: + import pyarrow as pa + + from pandas.core.dtypes.dtypes import ArrowDtype + + obj = pa.array(objects) + dtype = ArrowDtype(obj.type) + return dtype.construct_array_type()(obj) + # elif opt is False: + # # explicitly set to keep the old behavior and avoid the warning + # pass + # else: + # from pandas.util._exceptions import find_stack_level + # warnings.warn( + # "Pandas type inference with a sequence of `str` " + # "objects is deprecated. In a future version, this will give " + # "string[pyarrow] dtype, which will require pyarrow to be " + # "installed. To opt in to the new behavior immediately set " + # "`pd.set_option('future.infer_string', True)`. To keep the " + # "old behavior pass `dtype=object`.", + # FutureWarning, + # stacklevel=find_stack_level(), + # ) + + seen.object_ = True elif seen.interval_: if is_interval_array(objects): from pandas import IntervalIndex diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 3f662073f0357..4c02d90827760 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -889,3 +889,14 @@ def register_converter_cb(key) -> None: styler_environment, validator=is_instance_factory([type(None), str]), ) + + +with cf.config_prefix("future"): + cf.register_option( + "future.infer_string", + None, + "Whether to infer sequence of str objects as pyarrow string " + "dtype, which will be the default in pandas 3.0 " + "(at which point this option will be deprecated).", + validator=is_one_of_factory([True, False, None]), + ) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 09105bf49c050..d33d884832c60 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -18,6 +18,8 @@ import numpy as np +from pandas._config import get_option + from pandas._libs import lib from pandas._libs.missing import ( NA, @@ -796,6 +798,23 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]: # coming out as np.str_! dtype = _dtype_obj + opt = get_option("future.infer_string") + if opt is True: + import pyarrow as pa + + pa_dtype = pa.string() + dtype = ArrowDtype(pa_dtype) + # elif opt is None: + # warnings.warn( + # "Pandas type inference with a `str` " + # "object is deprecated. In a future version, this will give " + # "string[pyarrow] dtype, which will require pyarrow to be " + # "installed. To opt in to the new behavior immediately set " + # "`pd.set_option('future.infer_string', True)`. To keep the " + # "old behavior pass `dtype=object`.", + # FutureWarning, + # stacklevel=find_stack_level(), + # ) elif isinstance(val, (np.datetime64, dt.datetime)): try: From 0889028e1b20e087aefedab1560e064e814f01f7 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 5 Aug 2023 19:30:54 +0200 Subject: [PATCH 2/7] Remove comments and add tests --- pandas/_libs/lib.pyx | 15 -------- pandas/core/config_init.py | 6 ++-- pandas/core/dtypes/cast.py | 11 ------ pandas/tests/frame/test_constructors.py | 35 +++++++++++++++++++ .../indexes/base_class/test_constructors.py | 15 ++++++++ .../io/parser/dtypes/test_dtypes_basic.py | 21 +++++++++++ pandas/tests/io/test_sql.py | 17 +++++++++ pandas/tests/series/test_constructors.py | 14 ++++++++ 8 files changed, 105 insertions(+), 29 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 183a111249710..2bd99724b1cad 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2690,21 +2690,6 @@ def maybe_convert_objects(ndarray[object] objects, obj = pa.array(objects) dtype = ArrowDtype(obj.type) return dtype.construct_array_type()(obj) - # elif opt is False: - # # explicitly set to keep the old behavior and avoid the warning - # pass - # else: - # from pandas.util._exceptions import find_stack_level - # warnings.warn( - # "Pandas type inference with a sequence of `str` " - # "objects is deprecated. In a future version, this will give " - # "string[pyarrow] dtype, which will require pyarrow to be " - # "installed. To opt in to the new behavior immediately set " - # "`pd.set_option('future.infer_string', True)`. To keep the " - # "old behavior pass `dtype=object`.", - # FutureWarning, - # stacklevel=find_stack_level(), - # ) seen.object_ = True elif seen.interval_: diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 4c02d90827760..27e9bf8958ab0 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -893,10 +893,10 @@ def register_converter_cb(key) -> None: with cf.config_prefix("future"): cf.register_option( - "future.infer_string", - None, + "infer_string", + False, "Whether to infer sequence of str objects as pyarrow string " "dtype, which will be the default in pandas 3.0 " "(at which point this option will be deprecated).", - validator=is_one_of_factory([True, False, None]), + validator=is_one_of_factory([True, False]), ) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index d33d884832c60..9d2530ddc4e12 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -804,17 +804,6 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]: pa_dtype = pa.string() dtype = ArrowDtype(pa_dtype) - # elif opt is None: - # warnings.warn( - # "Pandas type inference with a `str` " - # "object is deprecated. In a future version, this will give " - # "string[pyarrow] dtype, which will require pyarrow to be " - # "installed. To opt in to the new behavior immediately set " - # "`pd.set_option('future.infer_string', True)`. To keep the " - # "old behavior pass `dtype=object`.", - # FutureWarning, - # stacklevel=find_stack_level(), - # ) elif isinstance(val, (np.datetime64, dt.datetime)): try: diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index c87f04efffcf4..b82dc98cd0210 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2670,6 +2670,41 @@ def test_construct_with_strings_and_none(self): expected = DataFrame({"a": ["1", "2", None]}, dtype="str") tm.assert_frame_equal(df, expected) + def test_frame_string_inference(self): + # GH#54430 + pa = pytest.importorskip("pyarrow") + dtype = pd.ArrowDtype(pa.string()) + expected = DataFrame( + {"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype) + ) + with pd.option_context("future.infer_string", True): + df = DataFrame({"a": ["a", "b"]}) + tm.assert_frame_equal(df, expected) + + expected = DataFrame( + {"a": ["a", "b"]}, + dtype=dtype, + columns=Index(["a"], dtype=dtype), + index=Index(["x", "y"], dtype=dtype), + ) + with pd.option_context("future.infer_string", True): + df = DataFrame({"a": ["a", "b"]}, index=["x", "y"]) + tm.assert_frame_equal(df, expected) + + expected = DataFrame( + {"a": ["a", 1]}, dtype="object", columns=Index(["a"], dtype=dtype) + ) + with pd.option_context("future.infer_string", True): + df = DataFrame({"a": ["a", 1]}) + tm.assert_frame_equal(df, expected) + + expected = DataFrame( + {"a": ["a", "b"]}, dtype="object", columns=Index(["a"], dtype=dtype) + ) + with pd.option_context("future.infer_string", True): + df = DataFrame({"a": ["a", "b"]}, dtype="object") + tm.assert_frame_equal(df, expected) + class TestDataFrameConstructorIndexInference: def test_frame_from_dict_of_series_overlapping_monthly_period_indexes(self): diff --git a/pandas/tests/indexes/base_class/test_constructors.py b/pandas/tests/indexes/base_class/test_constructors.py index cf8b7214f3b91..638124ac20e06 100644 --- a/pandas/tests/indexes/base_class/test_constructors.py +++ b/pandas/tests/indexes/base_class/test_constructors.py @@ -1,6 +1,7 @@ import numpy as np import pytest +import pandas as pd from pandas import ( Index, MultiIndex, @@ -42,3 +43,17 @@ def test_construct_empty_tuples(self, tuple_list): expected = MultiIndex.from_tuples(tuple_list) tm.assert_index_equal(result, expected) + + def test_index_string_inference(self): + # GH#54430 + pa = pytest.importorskip("pyarrow") + dtype = pd.ArrowDtype(pa.string()) + expected = Index(["a", "b"], dtype=dtype) + with pd.option_context("future.infer_string", True): + ser = Index(["a", "b"]) + tm.assert_index_equal(ser, expected) + + expected = Index(["a", 1], dtype="object") + with pd.option_context("future.infer_string", True): + ser = Index(["a", 1]) + tm.assert_index_equal(ser, expected) diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 915cc9a9a1f95..1a613c91880ea 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -538,3 +538,24 @@ def test_ea_int_avoid_overflow(all_parsers): } ) tm.assert_frame_equal(result, expected) + + +def test_string_inference(all_parsers): + # GH#54430 + pa = pytest.importorskip("pyarrow") + dtype = pd.ArrowDtype(pa.string()) + + data = """a,b +x,1 +y,2""" + parser = all_parsers + if parser.engine == "pyarrow": + pytest.skip("TODO: Follow up") + with pd.option_context("future.infer_string", True): + result = parser.read_csv(StringIO(data)) + + expected = DataFrame( + {"a": pd.Series(["x", "y"], dtype=dtype), "b": [1, 2]}, + columns=pd.Index(["a", "b"], dtype=dtype), + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 6800e55396d7b..63ca91cc89ede 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -2920,6 +2920,23 @@ def test_read_sql_dtype_backend_table(self, string_storage, func): # GH#50048 Not supported for sqlite pass + def test_read_sql_string_inference(self): + # GH#54430 + pa = pytest.importorskip("pyarrow") + table = "test" + df = DataFrame({"a": ["x", "y"]}) + df.to_sql(table, self.conn, index=False, if_exists="replace") + + with pd.option_context("future.infer_string", True): + result = read_sql_table(table, self.conn) + + dtype = pd.ArrowDtype(pa.string()) + expected = DataFrame( + {"a": ["x", "y"]}, dtype=dtype, columns=Index(["a"], dtype=dtype) + ) + + tm.assert_frame_equal(result, expected) + @pytest.mark.db class TestMySQLAlchemy(_TestSQLAlchemy): diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 9540d7a014409..e67196edcd444 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -2070,6 +2070,20 @@ def test_series_from_index_dtype_equal_does_not_copy(self): ser.iloc[0] = 100 tm.assert_index_equal(idx, expected) + def test_series_string_inference(self): + # GH#54430 + pa = pytest.importorskip("pyarrow") + dtype = pd.ArrowDtype(pa.string()) + expected = Series(["a", "b"], dtype=dtype) + with pd.option_context("future.infer_string", True): + ser = Series(["a", "b"]) + tm.assert_series_equal(ser, expected) + + expected = Series(["a", 1], dtype="object") + with pd.option_context("future.infer_string", True): + ser = Series(["a", 1]) + tm.assert_series_equal(ser, expected) + class TestSeriesConstructorIndexCoercion: def test_series_constructor_datetimelike_index_coercion(self): From 533a642d78ff8ebc8a769d54f2e2c9e66e4ef695 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 6 Aug 2023 16:43:09 +0200 Subject: [PATCH 3/7] Add json tests --- pandas/tests/io/json/test_pandas.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 563f8005bfa72..ba6854c296841 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -2094,3 +2094,20 @@ def test_pyarrow_engine_lines_false(): out = ser.to_json() with pytest.raises(ValueError, match="currently pyarrow engine only supports"): read_json(out, engine="pyarrow", lines=False) + + +def test_json_roundtrip_string_inference(orient): + pa = pytest.importorskip("pyarrow") + df = DataFrame( + [["a", "b"], ["c", "d"]], index=["row 1", "row 2"], columns=["col 1", "col 2"] + ) + out = df.to_json() + with pd.option_context("future.infer_string", True): + result = read_json(StringIO(out)) + expected = DataFrame( + [["a", "b"], ["c", "d"]], + dtype=pd.ArrowDtype(pa.string()), + index=pd.Index(["row 1", "row 2"], dtype=pd.ArrowDtype(pa.string())), + columns=pd.Index(["col 1", "col 2"], dtype=pd.ArrowDtype(pa.string())), + ) + tm.assert_frame_equal(result, expected) From 066160dd6467f228e776b5d3100dd558992008d8 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 7 Aug 2023 13:09:40 +0200 Subject: [PATCH 4/7] Update --- pandas/_libs/lib.pyx | 4 ++-- pandas/tests/series/test_constructors.py | 10 ++++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 2bd99724b1cad..9c4350c80bd93 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2679,7 +2679,7 @@ def maybe_convert_objects(ndarray[object] objects, seen.object_ = True elif seen.str_: - if is_string_array(objects): + if is_string_array(objects, skipna=True): from pandas._config import get_option opt = get_option("future.infer_string") if opt is True: @@ -2687,7 +2687,7 @@ def maybe_convert_objects(ndarray[object] objects, from pandas.core.dtypes.dtypes import ArrowDtype - obj = pa.array(objects) + obj = pa.array(objects, from_pandas=True) dtype = ArrowDtype(obj.type) return dtype.construct_array_type()(obj) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index e67196edcd444..4d7f0f4bd7e34 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -2084,6 +2084,16 @@ def test_series_string_inference(self): ser = Series(["a", 1]) tm.assert_series_equal(ser, expected) + @pytest.mark.parametrize("na_value", [None, np.nan, pd.NA]) + def test_series_string_with_na_inference(self, na_value): + # GH#54430 + pa = pytest.importorskip("pyarrow") + dtype = pd.ArrowDtype(pa.string()) + expected = Series(["a", na_value], dtype=dtype) + with pd.option_context("future.infer_string", True): + ser = Series(["a", na_value]) + tm.assert_series_equal(ser, expected) + class TestSeriesConstructorIndexCoercion: def test_series_constructor_datetimelike_index_coercion(self): From 364112a70b98f43fc8fc9ba27f70afb1b0e236ea Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 7 Aug 2023 18:54:09 +0200 Subject: [PATCH 5/7] Update pandas/_libs/lib.pyx Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/_libs/lib.pyx | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 9c4350c80bd93..d5139322e1042 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2683,13 +2683,10 @@ def maybe_convert_objects(ndarray[object] objects, from pandas._config import get_option opt = get_option("future.infer_string") if opt is True: - import pyarrow as pa - from pandas.core.dtypes.dtypes import ArrowDtype - obj = pa.array(objects, from_pandas=True) dtype = ArrowDtype(obj.type) - return dtype.construct_array_type()(obj) + return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) seen.object_ = True elif seen.interval_: From 2c36db27ffa82d27d6e63bdd4ad8b023b6ea67e9 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 7 Aug 2023 19:36:55 +0200 Subject: [PATCH 6/7] Update --- pandas/_libs/lib.pyx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index d5139322e1042..924cf360a35cc 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2683,9 +2683,11 @@ def maybe_convert_objects(ndarray[object] objects, from pandas._config import get_option opt = get_option("future.infer_string") if opt is True: + import pyarrow as pa + from pandas.core.dtypes.dtypes import ArrowDtype - dtype = ArrowDtype(obj.type) + dtype = ArrowDtype(pa.string()) return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) seen.object_ = True From 157cb84135f36f2b1deacb69dea1a4119866eeb0 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 9 Aug 2023 14:27:48 +0200 Subject: [PATCH 7/7] Add test --- pandas/tests/series/test_constructors.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 4d7f0f4bd7e34..b50b05faa523e 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -2094,6 +2094,14 @@ def test_series_string_with_na_inference(self, na_value): ser = Series(["a", na_value]) tm.assert_series_equal(ser, expected) + def test_series_string_inference_scalar(self): + # GH#54430 + pa = pytest.importorskip("pyarrow") + expected = Series("a", index=[1], dtype=pd.ArrowDtype(pa.string())) + with pd.option_context("future.infer_string", True): + ser = Series("a", index=[1]) + tm.assert_series_equal(ser, expected) + class TestSeriesConstructorIndexCoercion: def test_series_constructor_datetimelike_index_coercion(self):