From 7701587357ef8ba1dff6b4c520a7711a1b85c806 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 11 Aug 2023 14:14:37 +0200 Subject: [PATCH 1/3] Fix inference for fixed with numpy strings with arrow string option --- pandas/core/construction.py | 12 +++++++++++- pandas/core/internals/construction.py | 22 +++++++++++++++++++++- pandas/tests/frame/test_constructors.py | 20 ++++++++++++++++++++ 3 files changed, 52 insertions(+), 2 deletions(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 5757c69bb6ec7..f733ba3b445fd 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -19,6 +19,8 @@ import numpy as np from numpy import ma +from pandas._config import using_pyarrow_string_dtype + from pandas._libs import lib from pandas._libs.tslibs import ( Period, @@ -49,7 +51,10 @@ is_object_dtype, pandas_dtype, ) -from pandas.core.dtypes.dtypes import NumpyEADtype +from pandas.core.dtypes.dtypes import ( + ArrowDtype, + NumpyEADtype, +) from pandas.core.dtypes.generic import ( ABCDataFrame, ABCExtensionArray, @@ -589,6 +594,11 @@ def sanitize_array( subarr = data if data.dtype == object: subarr = maybe_infer_to_datetimelike(data) + elif data.dtype.kind == "U" and using_pyarrow_string_dtype(): + import pyarrow as pa + + dtype = ArrowDtype(pa.string()) + subarr = dtype.construct_array_type()._from_sequence(data, dtype=dtype) if subarr is data and copy: subarr = subarr.copy() diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 2290cd86f35e6..8879d3318f7ca 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -13,6 +13,8 @@ import numpy as np from numpy import ma +from pandas._config import using_pyarrow_string_dtype + from pandas._libs import lib from pandas.core.dtypes.astype import astype_is_view @@ -30,7 +32,10 @@ is_named_tuple, is_object_dtype, ) -from pandas.core.dtypes.dtypes import ExtensionDtype +from pandas.core.dtypes.dtypes import ( + ArrowDtype, + ExtensionDtype, +) from pandas.core.dtypes.generic import ( ABCDataFrame, ABCSeries, @@ -65,6 +70,7 @@ from pandas.core.internals.blocks import ( BlockPlacement, ensure_block_shape, + new_block, new_block_2d, ) from pandas.core.internals.managers import ( @@ -372,6 +378,20 @@ def ndarray_to_mgr( bp = BlockPlacement(slice(len(columns))) nb = new_block_2d(values, placement=bp, refs=refs) block_values = [nb] + elif dtype is None and values.dtype.kind == "U" and using_pyarrow_string_dtype(): + import pyarrow as pa + + obj_columns = list(values) + dtype = ArrowDtype(pa.string()) + block_values = [ + new_block( + dtype.construct_array_type()._from_sequence(data, dtype=dtype), + BlockPlacement(slice(i, i + 1)), + ndim=1, + ) + for i, data in enumerate(obj_columns) + ] + else: bp = BlockPlacement(slice(len(columns))) nb = new_block_2d(values, placement=bp, refs=refs) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index a493084142f7b..ef9aeb29eee0a 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2720,6 +2720,26 @@ def test_frame_string_inference(self): df = DataFrame({"a": ["a", "b"]}, dtype="object") tm.assert_frame_equal(df, expected) + def test_frame_string_inference_array_string_dtype(self): + # GH#54430 + pa = pytest.importorskip("pyarrow") + dtype = pd.ArrowDtype(pa.string()) + expected = DataFrame( + {"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype) + ) + with pd.option_context("future.infer_string", True): + df = DataFrame({"a": np.array(["a", "b"])}) + tm.assert_frame_equal(df, expected) + + expected = DataFrame( + {"a": ["a", "b"], "b": ["c", "d"]}, + dtype=dtype, + columns=Index(["a", "b"], dtype=dtype), + ) + with pd.option_context("future.infer_string", True): + df = DataFrame(np.array([["a", "c"], ["b", "d"]]), columns=["a", "b"]) + tm.assert_frame_equal(df, expected) + class TestDataFrameConstructorIndexInference: def test_frame_from_dict_of_series_overlapping_monthly_period_indexes(self): From 0b9ea36b257fd58316cbb77ed68afe0f47c98255 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 11 Aug 2023 14:19:35 +0200 Subject: [PATCH 2/3] Update gh ref --- pandas/tests/frame/test_constructors.py | 2 +- pandas/tests/series/test_constructors.py | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index ef9aeb29eee0a..d186559c13232 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2721,7 +2721,7 @@ def test_frame_string_inference(self): tm.assert_frame_equal(df, expected) def test_frame_string_inference_array_string_dtype(self): - # GH#54430 + # GH#54496 pa = pytest.importorskip("pyarrow") dtype = pd.ArrowDtype(pa.string()) expected = DataFrame( diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 331afc4345616..b4b226c447c95 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -2107,6 +2107,14 @@ def test_series_string_inference_scalar(self): ser = Series("a", index=[1]) tm.assert_series_equal(ser, expected) + def test_series_string_inference_array_string_dtype(self): + # GH#54496 + pa = pytest.importorskip("pyarrow") + expected = Series(["a", "b"], dtype=pd.ArrowDtype(pa.string())) + with pd.option_context("future.infer_string", True): + ser = Series(np.array(["a", "b"])) + tm.assert_series_equal(ser, expected) + class TestSeriesConstructorIndexCoercion: def test_series_constructor_datetimelike_index_coercion(self): From 128ddff223c66a39b17a336b192e139a9bcb6346 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 21 Aug 2023 20:04:56 +0200 Subject: [PATCH 3/3] Add test --- pandas/tests/frame/test_constructors.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index fdf38ac438f1c..63cddb7f192e6 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2729,6 +2729,11 @@ def test_frame_string_inference_array_string_dtype(self): df = DataFrame({"a": np.array(["a", "b"])}) tm.assert_frame_equal(df, expected) + expected = DataFrame({0: ["a", "b"], 1: ["c", "d"]}, dtype=dtype) + with pd.option_context("future.infer_string", True): + df = DataFrame(np.array([["a", "c"], ["b", "d"]])) + tm.assert_frame_equal(df, expected) + expected = DataFrame( {"a": ["a", "b"], "b": ["c", "d"]}, dtype=dtype,