From ebe0bd51c2939f10f10164eb169276537fa15c51 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 31 Jul 2023 20:02:14 -0700 Subject: [PATCH] ENH: allow opt-in to inferring pyarrow strings --- pandas/_libs/lib.pyx | 38 ++++++++++++++++++++++++++++++++++++++ pandas/core/config_init.py | 11 +++++++++++ pandas/core/dtypes/cast.py | 19 +++++++++++++++++++ 3 files changed, 68 insertions(+) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index c3fbd3ee4853e..183a111249710 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1299,6 +1299,7 @@ cdef class Seen: bint datetimetz_ # seen_datetimetz bint period_ # seen_period bint interval_ # seen_interval + bint str_ # seen_str def __cinit__(self, bint coerce_numeric=False): """ @@ -1325,6 +1326,7 @@ cdef class Seen: self.datetimetz_ = False self.period_ = False self.interval_ = False + self.str_ = False self.coerce_numeric = coerce_numeric cdef bint check_uint64_conflict(self) except -1: @@ -2615,6 +2617,13 @@ def maybe_convert_objects(ndarray[object] objects, else: seen.object_ = True break + elif isinstance(val, str): + if convert_non_numeric: + seen.str_ = True + break + else: + seen.object_ = True + break else: seen.object_ = True break @@ -2669,6 +2678,35 @@ def maybe_convert_objects(ndarray[object] objects, return pi._data seen.object_ = True + elif seen.str_: + if is_string_array(objects): + from pandas._config import get_option + opt = get_option("future.infer_string") + if opt is True: + import pyarrow as pa + + from pandas.core.dtypes.dtypes import ArrowDtype + + obj = pa.array(objects) + dtype = ArrowDtype(obj.type) + return dtype.construct_array_type()(obj) + # elif opt is False: + # # explicitly set to keep the old behavior and avoid the warning + # pass + # else: + # from pandas.util._exceptions import find_stack_level + # warnings.warn( + # "Pandas type inference with a sequence of `str` " + # "objects is deprecated. 
In a future version, this will give " + # "string[pyarrow] dtype, which will require pyarrow to be " + # "installed. To opt in to the new behavior immediately set " + # "`pd.set_option('future.infer_string', True)`. To keep the " + # "old behavior pass `dtype=object`.", + # FutureWarning, + # stacklevel=find_stack_level(), + # ) + + seen.object_ = True elif seen.interval_: if is_interval_array(objects): from pandas import IntervalIndex diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 3f662073f0357..4c02d90827760 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -889,3 +889,14 @@ def register_converter_cb(key) -> None: styler_environment, validator=is_instance_factory([type(None), str]), ) + + +with cf.config_prefix("future"): + cf.register_option( + "infer_string", + None, + "Whether to infer sequence of str objects as pyarrow string " + "dtype, which will be the default in pandas 3.0 " + "(at which point this option will be deprecated).", + validator=is_one_of_factory([True, False, None]), + ) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 09105bf49c050..d33d884832c60 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -18,6 +18,8 @@ import numpy as np +from pandas._config import get_option + from pandas._libs import lib from pandas._libs.missing import ( NA, @@ -796,6 +798,23 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]: # coming out as np.str_! dtype = _dtype_obj + opt = get_option("future.infer_string") + if opt is True: + import pyarrow as pa + + pa_dtype = pa.string() + dtype = ArrowDtype(pa_dtype) + # elif opt is None: + # warnings.warn( + # "Pandas type inference with a `str` " + # "object is deprecated. In a future version, this will give " + # "string[pyarrow] dtype, which will require pyarrow to be " + # "installed. To opt in to the new behavior immediately set " + # "`pd.set_option('future.infer_string', True)`. 
To keep the " + # "old behavior pass `dtype=object`.", + # FutureWarning, + # stacklevel=find_stack_level(), + # ) elif isinstance(val, (np.datetime64, dt.datetime)): try: