diff --git a/pandas/core/construction.py b/pandas/core/construction.py
index 5757c69bb6ec7..f733ba3b445fd 100644
--- a/pandas/core/construction.py
+++ b/pandas/core/construction.py
@@ -19,6 +19,8 @@
 import numpy as np
 from numpy import ma
 
+from pandas._config import using_pyarrow_string_dtype
+
 from pandas._libs import lib
 from pandas._libs.tslibs import (
     Period,
@@ -49,7 +51,10 @@
     is_object_dtype,
     pandas_dtype,
 )
-from pandas.core.dtypes.dtypes import NumpyEADtype
+from pandas.core.dtypes.dtypes import (
+    ArrowDtype,
+    NumpyEADtype,
+)
 from pandas.core.dtypes.generic import (
     ABCDataFrame,
     ABCExtensionArray,
@@ -589,6 +594,11 @@ def sanitize_array(
             subarr = data
             if data.dtype == object:
                 subarr = maybe_infer_to_datetimelike(data)
+            elif data.dtype.kind == "U" and using_pyarrow_string_dtype():
+                import pyarrow as pa
+
+                dtype = ArrowDtype(pa.string())
+                subarr = dtype.construct_array_type()._from_sequence(data, dtype=dtype)
 
         if subarr is data and copy:
             subarr = subarr.copy()
diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
index 2290cd86f35e6..8879d3318f7ca 100644
--- a/pandas/core/internals/construction.py
+++ b/pandas/core/internals/construction.py
@@ -13,6 +13,8 @@
 import numpy as np
 from numpy import ma
 
+from pandas._config import using_pyarrow_string_dtype
+
 from pandas._libs import lib
 
 from pandas.core.dtypes.astype import astype_is_view
@@ -30,7 +32,10 @@
     is_named_tuple,
     is_object_dtype,
 )
-from pandas.core.dtypes.dtypes import ExtensionDtype
+from pandas.core.dtypes.dtypes import (
+    ArrowDtype,
+    ExtensionDtype,
+)
 from pandas.core.dtypes.generic import (
     ABCDataFrame,
     ABCSeries,
@@ -65,6 +70,7 @@
 from pandas.core.internals.blocks import (
     BlockPlacement,
     ensure_block_shape,
+    new_block,
     new_block_2d,
 )
 from pandas.core.internals.managers import (
@@ -372,6 +378,22 @@ def ndarray_to_mgr(
             bp = BlockPlacement(slice(len(columns)))
             nb = new_block_2d(values, placement=bp, refs=refs)
             block_values = [nb]
+    elif dtype is None and values.dtype.kind == "U" and using_pyarrow_string_dtype():
+        import pyarrow as pa
+
+        obj_columns = list(values)
+        dtype = ArrowDtype(pa.string())
+        # One 1D extension array per column; blocks that live in a 2D
+        # (DataFrame) manager must still report ndim=2, not ndim=1.
+        block_values = [
+            new_block(
+                dtype.construct_array_type()._from_sequence(data, dtype=dtype),
+                BlockPlacement(slice(i, i + 1)),
+                ndim=2,
+            )
+            for i, data in enumerate(obj_columns)
+        ]
+
     else:
         bp = BlockPlacement(slice(len(columns)))
         nb = new_block_2d(values, placement=bp, refs=refs)
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
index c170704150383..63cddb7f192e6 100644
--- a/pandas/tests/frame/test_constructors.py
+++ b/pandas/tests/frame/test_constructors.py
@@ -2718,6 +2718,31 @@ def test_frame_string_inference(self):
             df = DataFrame({"a": ["a", "b"]}, dtype="object")
         tm.assert_frame_equal(df, expected)
 
+    def test_frame_string_inference_array_string_dtype(self):
+        # GH#54496
+        pa = pytest.importorskip("pyarrow")
+        dtype = pd.ArrowDtype(pa.string())
+        expected = DataFrame(
+            {"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype)
+        )
+        with pd.option_context("future.infer_string", True):
+            df = DataFrame({"a": np.array(["a", "b"])})
+        tm.assert_frame_equal(df, expected)
+
+        expected = DataFrame({0: ["a", "b"], 1: ["c", "d"]}, dtype=dtype)
+        with pd.option_context("future.infer_string", True):
+            df = DataFrame(np.array([["a", "c"], ["b", "d"]]))
+        tm.assert_frame_equal(df, expected)
+
+        expected = DataFrame(
+            {"a": ["a", "b"], "b": ["c", "d"]},
+            dtype=dtype,
+            columns=Index(["a", "b"], dtype=dtype),
+        )
+        with pd.option_context("future.infer_string", True):
+            df = DataFrame(np.array([["a", "c"], ["b", "d"]]), columns=["a", "b"])
+        tm.assert_frame_equal(df, expected)
+
 
 class TestDataFrameConstructorIndexInference:
     def test_frame_from_dict_of_series_overlapping_monthly_period_indexes(self):
diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
index 611f4a7f790a6..97bd8633954d8 100644
--- a/pandas/tests/series/test_constructors.py
+++ b/pandas/tests/series/test_constructors.py
@@ -2107,6 +2107,14 @@ def test_series_string_inference_scalar(self):
             ser = Series("a", index=[1])
         tm.assert_series_equal(ser, expected)
 
+    def test_series_string_inference_array_string_dtype(self):
+        # GH#54496
+        pa = pytest.importorskip("pyarrow")
+        expected = Series(["a", "b"], dtype=pd.ArrowDtype(pa.string()))
+        with pd.option_context("future.infer_string", True):
+            ser = Series(np.array(["a", "b"]))
+        tm.assert_series_equal(ser, expected)
+
 
 class TestSeriesConstructorIndexCoercion:
     def test_series_constructor_datetimelike_index_coercion(self):