From ebe0bd51c2939f10f10164eb169276537fa15c51 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Mon, 31 Jul 2023 20:02:14 -0700
Subject: [PATCH 1/7] ENH: allow opt-in to inferring pyarrow strings

---
 pandas/_libs/lib.pyx       | 38 ++++++++++++++++++++++++++++++++++++++
 pandas/core/config_init.py | 11 +++++++++++
 pandas/core/dtypes/cast.py | 19 +++++++++++++++++++
 3 files changed, 68 insertions(+)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index c3fbd3ee4853e..183a111249710 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -1299,6 +1299,7 @@ cdef class Seen:
         bint datetimetz_      # seen_datetimetz
         bint period_          # seen_period
         bint interval_        # seen_interval
+        bint str_             # seen_str
 
     def __cinit__(self, bint coerce_numeric=False):
         """
@@ -1325,6 +1326,7 @@ cdef class Seen:
         self.datetimetz_ = False
         self.period_ = False
         self.interval_ = False
+        self.str_ = False
         self.coerce_numeric = coerce_numeric
 
     cdef bint check_uint64_conflict(self) except -1:
@@ -2615,6 +2617,13 @@ def maybe_convert_objects(ndarray[object] objects,
             else:
                 seen.object_ = True
                 break
+        elif isinstance(val, str):
+            if convert_non_numeric:
+                seen.str_ = True
+                break
+            else:
+                seen.object_ = True
+                break
         else:
             seen.object_ = True
             break
@@ -2669,6 +2678,35 @@ def maybe_convert_objects(ndarray[object] objects,
             return pi._data
         seen.object_ = True
 
+    elif seen.str_:
+        if is_string_array(objects):
+            from pandas._config import get_option
+            opt = get_option("future.infer_string")
+            if opt is True:
+                import pyarrow as pa
+
+                from pandas.core.dtypes.dtypes import ArrowDtype
+
+                obj = pa.array(objects)
+                dtype = ArrowDtype(obj.type)
+                return dtype.construct_array_type()(obj)
+            # elif opt is False:
+            #    # explicitly set to keep the old behavior and avoid the warning
+            #    pass
+            # else:
+            #    from pandas.util._exceptions import find_stack_level
+            #    warnings.warn(
+            #        "Pandas type inference with a sequence of `str` "
+            #        "objects is deprecated. In a future version, this will give "
+            #        "string[pyarrow] dtype, which will require pyarrow to be "
+            #        "installed. To opt in to the new behavior immediately set "
+            #        "`pd.set_option('future.infer_string', True)`. To keep the "
+            #        "old behavior pass `dtype=object`.",
+            #        FutureWarning,
+            #        stacklevel=find_stack_level(),
+            #    )
+
+        seen.object_ = True
     elif seen.interval_:
         if is_interval_array(objects):
             from pandas import IntervalIndex
diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py
index 3f662073f0357..4c02d90827760 100644
--- a/pandas/core/config_init.py
+++ b/pandas/core/config_init.py
@@ -889,3 +889,14 @@ def register_converter_cb(key) -> None:
         styler_environment,
         validator=is_instance_factory([type(None), str]),
     )
+
+
+with cf.config_prefix("future"):
+    cf.register_option(
+        "future.infer_string",
+        None,
+        "Whether to infer sequence of str objects as pyarrow string "
+        "dtype, which will be the default in pandas 3.0 "
+        "(at which point this option will be deprecated).",
+        validator=is_one_of_factory([True, False, None]),
+    )
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
index 09105bf49c050..d33d884832c60 100644
--- a/pandas/core/dtypes/cast.py
+++ b/pandas/core/dtypes/cast.py
@@ -18,6 +18,8 @@
 
 import numpy as np
 
+from pandas._config import get_option
+
 from pandas._libs import lib
 from pandas._libs.missing import (
     NA,
@@ -796,6 +798,23 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]:
         # coming out as np.str_!
 
         dtype = _dtype_obj
+        opt = get_option("future.infer_string")
+        if opt is True:
+            import pyarrow as pa
+
+            pa_dtype = pa.string()
+            dtype = ArrowDtype(pa_dtype)
+        # elif opt is None:
+        #    warnings.warn(
+        #        "Pandas type inference with a `str` "
+        #        "object is deprecated. In a future version, this will give "
+        #        "string[pyarrow] dtype, which will require pyarrow to be "
+        #        "installed. To opt in to the new behavior immediately set "
+        #        "`pd.set_option('future.infer_string', True)`. To keep the "
+        #        "old behavior pass `dtype=object`.",
+        #        FutureWarning,
+        #        stacklevel=find_stack_level(),
+        #    )
 
     elif isinstance(val, (np.datetime64, dt.datetime)):
         try:

From 0889028e1b20e087aefedab1560e064e814f01f7 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Sat, 5 Aug 2023 19:30:54 +0200
Subject: [PATCH 2/7] Remove comments and add tests

---
 pandas/_libs/lib.pyx                          | 15 --------
 pandas/core/config_init.py                    |  6 ++--
 pandas/core/dtypes/cast.py                    | 11 ------
 pandas/tests/frame/test_constructors.py       | 35 +++++++++++++++++++
 .../indexes/base_class/test_constructors.py   | 15 ++++++++
 .../io/parser/dtypes/test_dtypes_basic.py     | 21 +++++++++++
 pandas/tests/io/test_sql.py                   | 17 +++++++++
 pandas/tests/series/test_constructors.py      | 14 ++++++++
 8 files changed, 105 insertions(+), 29 deletions(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 183a111249710..2bd99724b1cad 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -2690,21 +2690,6 @@ def maybe_convert_objects(ndarray[object] objects,
                 obj = pa.array(objects)
                 dtype = ArrowDtype(obj.type)
                 return dtype.construct_array_type()(obj)
-            # elif opt is False:
-            #    # explicitly set to keep the old behavior and avoid the warning
-            #    pass
-            # else:
-            #    from pandas.util._exceptions import find_stack_level
-            #    warnings.warn(
-            #        "Pandas type inference with a sequence of `str` "
-            #        "objects is deprecated. In a future version, this will give "
-            #        "string[pyarrow] dtype, which will require pyarrow to be "
-            #        "installed. To opt in to the new behavior immediately set "
-            #        "`pd.set_option('future.infer_string', True)`. To keep the "
-            #        "old behavior pass `dtype=object`.",
-            #        FutureWarning,
-            #        stacklevel=find_stack_level(),
-            #    )
 
         seen.object_ = True
     elif seen.interval_:
diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py
index 4c02d90827760..27e9bf8958ab0 100644
--- a/pandas/core/config_init.py
+++ b/pandas/core/config_init.py
@@ -893,10 +893,10 @@ def register_converter_cb(key) -> None:
 
 with cf.config_prefix("future"):
     cf.register_option(
-        "future.infer_string",
-        None,
+        "infer_string",
+        False,
         "Whether to infer sequence of str objects as pyarrow string "
         "dtype, which will be the default in pandas 3.0 "
         "(at which point this option will be deprecated).",
-        validator=is_one_of_factory([True, False, None]),
+        validator=is_one_of_factory([True, False]),
     )
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
index d33d884832c60..9d2530ddc4e12 100644
--- a/pandas/core/dtypes/cast.py
+++ b/pandas/core/dtypes/cast.py
@@ -804,17 +804,6 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]:
 
             pa_dtype = pa.string()
             dtype = ArrowDtype(pa_dtype)
-        # elif opt is None:
-        #    warnings.warn(
-        #        "Pandas type inference with a `str` "
-        #        "object is deprecated. In a future version, this will give "
-        #        "string[pyarrow] dtype, which will require pyarrow to be "
-        #        "installed. To opt in to the new behavior immediately set "
-        #        "`pd.set_option('future.infer_string', True)`. To keep the "
-        #        "old behavior pass `dtype=object`.",
-        #        FutureWarning,
-        #        stacklevel=find_stack_level(),
-        #    )
 
     elif isinstance(val, (np.datetime64, dt.datetime)):
         try:
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
index c87f04efffcf4..b82dc98cd0210 100644
--- a/pandas/tests/frame/test_constructors.py
+++ b/pandas/tests/frame/test_constructors.py
@@ -2670,6 +2670,41 @@ def test_construct_with_strings_and_none(self):
         expected = DataFrame({"a": ["1", "2", None]}, dtype="str")
         tm.assert_frame_equal(df, expected)
 
+    def test_frame_string_inference(self):
+        # GH#54430 opt-in string inference via the "future.infer_string" option
+        pa = pytest.importorskip("pyarrow")
+        dtype = pd.ArrowDtype(pa.string())
+        expected = DataFrame(  # all-str values and column labels are inferred
+            {"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype)
+        )
+        with pd.option_context("future.infer_string", True):
+            df = DataFrame({"a": ["a", "b"]})
+        tm.assert_frame_equal(df, expected)
+
+        expected = DataFrame(  # str index labels are inferred as well
+            {"a": ["a", "b"]},
+            dtype=dtype,
+            columns=Index(["a"], dtype=dtype),
+            index=Index(["x", "y"], dtype=dtype),
+        )
+        with pd.option_context("future.infer_string", True):
+            df = DataFrame({"a": ["a", "b"]}, index=["x", "y"])
+        tm.assert_frame_equal(df, expected)
+
+        expected = DataFrame(  # mixed str/int values stay object
+            {"a": ["a", 1]}, dtype="object", columns=Index(["a"], dtype=dtype)
+        )
+        with pd.option_context("future.infer_string", True):
+            df = DataFrame({"a": ["a", 1]})
+        tm.assert_frame_equal(df, expected)
+
+        expected = DataFrame(  # an explicit object dtype is respected
+            {"a": ["a", "b"]}, dtype="object", columns=Index(["a"], dtype=dtype)
+        )
+        with pd.option_context("future.infer_string", True):
+            df = DataFrame({"a": ["a", "b"]}, dtype="object")
+        tm.assert_frame_equal(df, expected)
+
 
 class TestDataFrameConstructorIndexInference:
     def test_frame_from_dict_of_series_overlapping_monthly_period_indexes(self):
diff --git a/pandas/tests/indexes/base_class/test_constructors.py b/pandas/tests/indexes/base_class/test_constructors.py
index cf8b7214f3b91..638124ac20e06 100644
--- a/pandas/tests/indexes/base_class/test_constructors.py
+++ b/pandas/tests/indexes/base_class/test_constructors.py
@@ -1,6 +1,7 @@
 import numpy as np
 import pytest
 
+import pandas as pd
 from pandas import (
     Index,
     MultiIndex,
@@ -42,3 +43,17 @@ def test_construct_empty_tuples(self, tuple_list):
         expected = MultiIndex.from_tuples(tuple_list)
 
         tm.assert_index_equal(result, expected)
+
+    def test_index_string_inference(self):
+        # GH#54430 all-str data infers pyarrow strings, mixed data stays object
+        pa = pytest.importorskip("pyarrow")
+        dtype = pd.ArrowDtype(pa.string())
+        expected = Index(["a", "b"], dtype=dtype)
+        with pd.option_context("future.infer_string", True):
+            ser = Index(["a", "b"])
+        tm.assert_index_equal(ser, expected)
+
+        expected = Index(["a", 1], dtype="object")
+        with pd.option_context("future.infer_string", True):
+            ser = Index(["a", 1])
+        tm.assert_index_equal(ser, expected)
diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
index 915cc9a9a1f95..1a613c91880ea 100644
--- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
+++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
@@ -538,3 +538,24 @@ def test_ea_int_avoid_overflow(all_parsers):
         }
     )
     tm.assert_frame_equal(result, expected)
+
+
+def test_string_inference(all_parsers):
+    # GH#54430 the str column and the header infer pyarrow strings, ints do not
+    pa = pytest.importorskip("pyarrow")
+    dtype = pd.ArrowDtype(pa.string())
+
+    data = """a,b
+x,1
+y,2"""
+    parser = all_parsers
+    if parser.engine == "pyarrow":
+        pytest.skip("TODO: Follow up")
+    with pd.option_context("future.infer_string", True):
+        result = parser.read_csv(StringIO(data))
+
+    expected = DataFrame(
+        {"a": pd.Series(["x", "y"], dtype=dtype), "b": [1, 2]},
+        columns=pd.Index(["a", "b"], dtype=dtype),
+    )
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py
index 6800e55396d7b..63ca91cc89ede 100644
--- a/pandas/tests/io/test_sql.py
+++ b/pandas/tests/io/test_sql.py
@@ -2920,6 +2920,23 @@ def test_read_sql_dtype_backend_table(self, string_storage, func):
         # GH#50048 Not supported for sqlite
         pass
 
+    def test_read_sql_string_inference(self):
+        # GH#54430 a str column read via read_sql_table infers pyarrow strings
+        pa = pytest.importorskip("pyarrow")
+        table = "test"
+        df = DataFrame({"a": ["x", "y"]})
+        df.to_sql(table, self.conn, index=False, if_exists="replace")
+
+        with pd.option_context("future.infer_string", True):
+            result = read_sql_table(table, self.conn)
+
+        dtype = pd.ArrowDtype(pa.string())
+        expected = DataFrame(
+            {"a": ["x", "y"]}, dtype=dtype, columns=Index(["a"], dtype=dtype)
+        )
+
+        tm.assert_frame_equal(result, expected)
+
 
 @pytest.mark.db
 class TestMySQLAlchemy(_TestSQLAlchemy):
diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
index 9540d7a014409..e67196edcd444 100644
--- a/pandas/tests/series/test_constructors.py
+++ b/pandas/tests/series/test_constructors.py
@@ -2070,6 +2070,20 @@ def test_series_from_index_dtype_equal_does_not_copy(self):
         ser.iloc[0] = 100
         tm.assert_index_equal(idx, expected)
 
+    def test_series_string_inference(self):
+        # GH#54430 all-str data infers pyarrow strings, mixed data stays object
+        pa = pytest.importorskip("pyarrow")
+        dtype = pd.ArrowDtype(pa.string())
+        expected = Series(["a", "b"], dtype=dtype)
+        with pd.option_context("future.infer_string", True):
+            ser = Series(["a", "b"])
+        tm.assert_series_equal(ser, expected)
+
+        expected = Series(["a", 1], dtype="object")
+        with pd.option_context("future.infer_string", True):
+            ser = Series(["a", 1])
+        tm.assert_series_equal(ser, expected)
+
 
 class TestSeriesConstructorIndexCoercion:
     def test_series_constructor_datetimelike_index_coercion(self):

From 533a642d78ff8ebc8a769d54f2e2c9e66e4ef695 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Sun, 6 Aug 2023 16:43:09 +0200
Subject: [PATCH 3/7] Add json tests

---
 pandas/tests/io/json/test_pandas.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
index 563f8005bfa72..ba6854c296841 100644
--- a/pandas/tests/io/json/test_pandas.py
+++ b/pandas/tests/io/json/test_pandas.py
@@ -2094,3 +2094,20 @@ def test_pyarrow_engine_lines_false():
     out = ser.to_json()
     with pytest.raises(ValueError, match="currently pyarrow engine only supports"):
         read_json(out, engine="pyarrow", lines=False)
+
+
+def test_json_roundtrip_string_inference():
+    # Round-trip through to_json/read_json with "future.infer_string" enabled:
+    # the values, the index and the columns must all come back as pyarrow
+    # strings. Only the default orient is exercised, so no fixture is requested.
+    pa = pytest.importorskip("pyarrow")
+    dtype = pd.ArrowDtype(pa.string())
+    data = [["a", "b"], ["c", "d"]]
+    index = pd.Index(["row 1", "row 2"], dtype=dtype)
+    columns = pd.Index(["col 1", "col 2"], dtype=dtype)
+    expected = DataFrame(data, dtype=dtype, index=index, columns=columns)
+    # Serialize a frame built outside the option context from the same labels.
+    out = DataFrame(data, index=list(index), columns=list(columns)).to_json()
+    with pd.option_context("future.infer_string", True):
+        result = read_json(StringIO(out))
+    tm.assert_frame_equal(result, expected)

From 066160dd6467f228e776b5d3100dd558992008d8 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
Date: Mon, 7 Aug 2023 13:09:40 +0200
Subject: [PATCH 4/7] Infer pyarrow strings for str data containing missing values

---
 pandas/_libs/lib.pyx                     |  4 ++--
 pandas/tests/series/test_constructors.py | 10 ++++++++++
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 2bd99724b1cad..9c4350c80bd93 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -2679,7 +2679,7 @@ def maybe_convert_objects(ndarray[object] objects,
         seen.object_ = True
 
     elif seen.str_:
-        if is_string_array(objects):
+        if is_string_array(objects, skipna=True):
             from pandas._config import get_option
             opt = get_option("future.infer_string")
             if opt is True:
@@ -2687,7 +2687,7 @@ def maybe_convert_objects(ndarray[object] objects,
 
                 from pandas.core.dtypes.dtypes import ArrowDtype
 
-                obj = pa.array(objects)
+                obj = pa.array(objects, from_pandas=True)
                 dtype = ArrowDtype(obj.type)
                 return dtype.construct_array_type()(obj)
 
diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
index e67196edcd444..4d7f0f4bd7e34 100644
--- a/pandas/tests/series/test_constructors.py
+++ b/pandas/tests/series/test_constructors.py
@@ -2084,6 +2084,16 @@ def test_series_string_inference(self):
             ser = Series(["a", 1])
         tm.assert_series_equal(ser, expected)
 
+    @pytest.mark.parametrize("na_value", [None, np.nan, pd.NA])
+    def test_series_string_with_na_inference(self, na_value):
+        # GH#54430 a missing value next to a str still infers pyarrow strings
+        pa = pytest.importorskip("pyarrow")
+        dtype = pd.ArrowDtype(pa.string())
+        expected = Series(["a", na_value], dtype=dtype)
+        with pd.option_context("future.infer_string", True):
+            ser = Series(["a", na_value])
+        tm.assert_series_equal(ser, expected)
+
 
 class TestSeriesConstructorIndexCoercion:
     def test_series_constructor_datetimelike_index_coercion(self):

From 364112a70b98f43fc8fc9ba27f70afb1b0e236ea Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
Date: Mon, 7 Aug 2023 18:54:09 +0200
Subject: [PATCH 5/7] Update pandas/_libs/lib.pyx

Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
---
 pandas/_libs/lib.pyx | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 9c4350c80bd93..d5139322e1042 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -2683,13 +2683,10 @@ def maybe_convert_objects(ndarray[object] objects,
             from pandas._config import get_option
             opt = get_option("future.infer_string")
             if opt is True:
-                import pyarrow as pa
-
                 from pandas.core.dtypes.dtypes import ArrowDtype
 
-                obj = pa.array(objects, from_pandas=True)
                 dtype = ArrowDtype(obj.type)
-                return dtype.construct_array_type()(obj)
+                return dtype.construct_array_type()._from_sequence(objects, dtype=dtype)
 
         seen.object_ = True
     elif seen.interval_:

From 2c36db27ffa82d27d6e63bdd4ad8b023b6ea67e9 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
Date: Mon, 7 Aug 2023 19:36:55 +0200
Subject: [PATCH 6/7] Build ArrowDtype from pa.string() instead of the removed obj

---
 pandas/_libs/lib.pyx | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index d5139322e1042..924cf360a35cc 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -2683,9 +2683,11 @@ def maybe_convert_objects(ndarray[object] objects,
             from pandas._config import get_option
             opt = get_option("future.infer_string")
             if opt is True:
+                import pyarrow as pa
+
                 from pandas.core.dtypes.dtypes import ArrowDtype
 
-                dtype = ArrowDtype(obj.type)
+                dtype = ArrowDtype(pa.string())
                 return dtype.construct_array_type()._from_sequence(objects, dtype=dtype)
 
         seen.object_ = True

From 157cb84135f36f2b1deacb69dea1a4119866eeb0 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Wed, 9 Aug 2023 14:27:48 +0200
Subject: [PATCH 7/7] Add test for Series string inference from a scalar

---
 pandas/tests/series/test_constructors.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
index 4d7f0f4bd7e34..b50b05faa523e 100644
--- a/pandas/tests/series/test_constructors.py
+++ b/pandas/tests/series/test_constructors.py
@@ -2094,6 +2094,14 @@ def test_series_string_with_na_inference(self, na_value):
             ser = Series(["a", na_value])
         tm.assert_series_equal(ser, expected)
 
+    def test_series_string_inference_scalar(self):
+        # GH#54430 a str scalar passed with an index infers pyarrow strings too
+        pa = pytest.importorskip("pyarrow")
+        expected = Series("a", index=[1], dtype=pd.ArrowDtype(pa.string()))
+        with pd.option_context("future.infer_string", True):
+            ser = Series("a", index=[1])
+        tm.assert_series_equal(ser, expected)
+
 
 class TestSeriesConstructorIndexCoercion:
     def test_series_constructor_datetimelike_index_coercion(self):