pandas-dev · mroeschke · Aug 9, 2023 · Aug 1, 2023 · Aug 5, 2023 · Aug 6, 2023
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -1299,6 +1299,7 @@ cdef class Seen:
         bint datetimetz_      # seen_datetimetz
         bint period_          # seen_period
         bint interval_        # seen_interval
+        bint str_             # seen_str
 
     def __cinit__(self, bint coerce_numeric=False):
         """
@@ -1325,6 +1326,7 @@ cdef class Seen:
         self.datetimetz_ = False
         self.period_ = False
         self.interval_ = False
+        self.str_ = False
         self.coerce_numeric = coerce_numeric
 
     cdef bint check_uint64_conflict(self) except -1:
@@ -2615,6 +2617,13 @@ def maybe_convert_objects(ndarray[object] objects,
             else:
                 seen.object_ = True
                 break
+        elif isinstance(val, str):
+            if convert_non_numeric:
+                seen.str_ = True
+                break
+            else:
+                seen.object_ = True
+                break
         else:
             seen.object_ = True
             break
@@ -2669,6 +2678,20 @@ def maybe_convert_objects(ndarray[object] objects,
             return pi._data
         seen.object_ = True
 
+    elif seen.str_:
+        if is_string_array(objects, skipna=True):
+            from pandas._config import get_option
+            opt = get_option("future.infer_string")
+            if opt is True:
+                import pyarrow as pa
+
+                from pandas.core.dtypes.dtypes import ArrowDtype
+
+                obj = pa.array(objects, from_pandas=True)
+                dtype = ArrowDtype(obj.type)
+                return dtype.construct_array_type()(obj)
+
+        seen.object_ = True
     elif seen.interval_:
         if is_interval_array(objects):
             from pandas import IntervalIndex

diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py
@@ -889,3 +889,14 @@ def register_converter_cb(key) -> None:
         styler_environment,
         validator=is_instance_factory([type(None), str]),
     )
+
+
+with cf.config_prefix("future"):
+    cf.register_option(
+        "infer_string",
+        False,
+        "Whether to infer sequence of str objects as pyarrow string "
+        "dtype, which will be the default in pandas 3.0 "
+        "(at which point this option will be deprecated).",
+        validator=is_one_of_factory([True, False]),
+    )
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
@@ -18,6 +18,8 @@
 
 import numpy as np
 
+from pandas._config import get_option
+
 from pandas._libs import lib
 from pandas._libs.missing import (
     NA,
@@ -796,6 +798,12 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]:
         # coming out as np.str_!
 
         dtype = _dtype_obj
+        opt = get_option("future.infer_string")
+        if opt is True:
+            import pyarrow as pa
+
+            pa_dtype = pa.string()
+            dtype = ArrowDtype(pa_dtype)
 
     elif isinstance(val, (np.datetime64, dt.datetime)):
         try:

diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
@@ -2670,6 +2670,41 @@ def test_construct_with_strings_and_none(self):
         expected = DataFrame({"a": ["1", "2", None]}, dtype="str")
         tm.assert_frame_equal(df, expected)
 
+    def test_frame_string_inference(self):
+        # GH#54430
+        pa = pytest.importorskip("pyarrow")
+        dtype = pd.ArrowDtype(pa.string())
+        expected = DataFrame(
+            {"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype)
+        )
+        with pd.option_context("future.infer_string", True):
+            df = DataFrame({"a": ["a", "b"]})
+        tm.assert_frame_equal(df, expected)
+
+        expected = DataFrame(
+            {"a": ["a", "b"]},
+            dtype=dtype,
+            columns=Index(["a"], dtype=dtype),
+            index=Index(["x", "y"], dtype=dtype),
+        )
+        with pd.option_context("future.infer_string", True):
+            df = DataFrame({"a": ["a", "b"]}, index=["x", "y"])
+        tm.assert_frame_equal(df, expected)
+
+        expected = DataFrame(
+            {"a": ["a", 1]}, dtype="object", columns=Index(["a"], dtype=dtype)
+        )
+        with pd.option_context("future.infer_string", True):
+            df = DataFrame({"a": ["a", 1]})
+        tm.assert_frame_equal(df, expected)
+
+        expected = DataFrame(
+            {"a": ["a", "b"]}, dtype="object", columns=Index(["a"], dtype=dtype)
+        )
+        with pd.option_context("future.infer_string", True):
+            df = DataFrame({"a": ["a", "b"]}, dtype="object")
+        tm.assert_frame_equal(df, expected)
+
 
 class TestDataFrameConstructorIndexInference:
     def test_frame_from_dict_of_series_overlapping_monthly_period_indexes(self):

diff --git a/pandas/tests/indexes/base_class/test_constructors.py b/pandas/tests/indexes/base_class/test_constructors.py
@@ -1,6 +1,7 @@
 import numpy as np
 import pytest
 
+import pandas as pd
 from pandas import (
     Index,
     MultiIndex,
@@ -42,3 +43,17 @@ def test_construct_empty_tuples(self, tuple_list):
         expected = MultiIndex.from_tuples(tuple_list)
 
         tm.assert_index_equal(result, expected)
+
+    def test_index_string_inference(self):
+        # GH#54430
+        pa = pytest.importorskip("pyarrow")
+        dtype = pd.ArrowDtype(pa.string())
+        expected = Index(["a", "b"], dtype=dtype)
+        with pd.option_context("future.infer_string", True):
+            ser = Index(["a", "b"])
+        tm.assert_index_equal(ser, expected)
+
+        expected = Index(["a", 1], dtype="object")
+        with pd.option_context("future.infer_string", True):
+            ser = Index(["a", 1])
+        tm.assert_index_equal(ser, expected)
diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
@@ -2094,3 +2094,20 @@ def test_pyarrow_engine_lines_false():
     out = ser.to_json()
     with pytest.raises(ValueError, match="currently pyarrow engine only supports"):
         read_json(out, engine="pyarrow", lines=False)
+
+
+def test_json_roundtrip_string_inference(orient):
+    pa = pytest.importorskip("pyarrow")
+    df = DataFrame(
+        [["a", "b"], ["c", "d"]], index=["row 1", "row 2"], columns=["col 1", "col 2"]
+    )
+    out = df.to_json()
+    with pd.option_context("future.infer_string", True):
+        result = read_json(StringIO(out))
+    expected = DataFrame(
+        [["a", "b"], ["c", "d"]],
+        dtype=pd.ArrowDtype(pa.string()),
+        index=pd.Index(["row 1", "row 2"], dtype=pd.ArrowDtype(pa.string())),
+        columns=pd.Index(["col 1", "col 2"], dtype=pd.ArrowDtype(pa.string())),
+    )
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
@@ -538,3 +538,24 @@ def test_ea_int_avoid_overflow(all_parsers):
         }
     )
     tm.assert_frame_equal(result, expected)
+
+
+def test_string_inference(all_parsers):
+    # GH#54430
+    pa = pytest.importorskip("pyarrow")
+    dtype = pd.ArrowDtype(pa.string())
+
+    data = """a,b
+x,1
+y,2"""
+    parser = all_parsers
+    if parser.engine == "pyarrow":
+        pytest.skip("TODO: Follow up")
+    with pd.option_context("future.infer_string", True):
+        result = parser.read_csv(StringIO(data))
+
+    expected = DataFrame(
+        {"a": pd.Series(["x", "y"], dtype=dtype), "b": [1, 2]},
+        columns=pd.Index(["a", "b"], dtype=dtype),
+    )
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py
@@ -2920,6 +2920,23 @@ def test_read_sql_dtype_backend_table(self, string_storage, func):
         # GH#50048 Not supported for sqlite
         pass
 
+    def test_read_sql_string_inference(self):
+        # GH#54430
+        pa = pytest.importorskip("pyarrow")
+        table = "test"
+        df = DataFrame({"a": ["x", "y"]})
+        df.to_sql(table, self.conn, index=False, if_exists="replace")
+
+        with pd.option_context("future.infer_string", True):
+            result = read_sql_table(table, self.conn)
+
+        dtype = pd.ArrowDtype(pa.string())
+        expected = DataFrame(
+            {"a": ["x", "y"]}, dtype=dtype, columns=Index(["a"], dtype=dtype)
+        )
+
+        tm.assert_frame_equal(result, expected)
+
 
 @pytest.mark.db
 class TestMySQLAlchemy(_TestSQLAlchemy):

diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
@@ -2070,6 +2070,30 @@ def test_series_from_index_dtype_equal_does_not_copy(self):
         ser.iloc[0] = 100
         tm.assert_index_equal(idx, expected)
 
+    def test_series_string_inference(self):
+        # GH#54430
+        pa = pytest.importorskip("pyarrow")
+        dtype = pd.ArrowDtype(pa.string())
+        expected = Series(["a", "b"], dtype=dtype)
+        with pd.option_context("future.infer_string", True):
+            ser = Series(["a", "b"])
+        tm.assert_series_equal(ser, expected)
+
+        expected = Series(["a", 1], dtype="object")
+        with pd.option_context("future.infer_string", True):
+            ser = Series(["a", 1])
+        tm.assert_series_equal(ser, expected)
+
+    @pytest.mark.parametrize("na_value", [None, np.nan, pd.NA])
+    def test_series_string_with_na_inference(self, na_value):
+        # GH#54430
+        pa = pytest.importorskip("pyarrow")
+        dtype = pd.ArrowDtype(pa.string())
+        expected = Series(["a", na_value], dtype=dtype)
+        with pd.option_context("future.infer_string", True):
+            ser = Series(["a", na_value])
+        tm.assert_series_equal(ser, expected)
+
 
 class TestSeriesConstructorIndexCoercion:
     def test_series_constructor_datetimelike_index_coercion(self):