Skip to content

Commit eb36710

Browse files
Backport PR #54496 on branch 2.1.x (Fix inference for fixed-width numpy strings with arrow string option) (#54672)
Backport PR #54496: Fix inference for fixed-width numpy strings with arrow string option. Co-authored-by: Patrick Hoefler <[email protected]>
1 parent 23cb250 commit eb36710

File tree

4 files changed

+65
-2
lines changed

4 files changed

+65
-2
lines changed

pandas/core/construction.py

+11-1
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,8 @@
1919
import numpy as np
2020
from numpy import ma
2121

22+
from pandas._config import using_pyarrow_string_dtype
23+
2224
from pandas._libs import lib
2325
from pandas._libs.tslibs import (
2426
Period,
@@ -49,7 +51,10 @@
4951
is_object_dtype,
5052
pandas_dtype,
5153
)
52-
from pandas.core.dtypes.dtypes import NumpyEADtype
54+
from pandas.core.dtypes.dtypes import (
55+
ArrowDtype,
56+
NumpyEADtype,
57+
)
5358
from pandas.core.dtypes.generic import (
5459
ABCDataFrame,
5560
ABCExtensionArray,
@@ -589,6 +594,11 @@ def sanitize_array(
589594
subarr = data
590595
if data.dtype == object:
591596
subarr = maybe_infer_to_datetimelike(data)
597+
elif data.dtype.kind == "U" and using_pyarrow_string_dtype():
598+
import pyarrow as pa
599+
600+
dtype = ArrowDtype(pa.string())
601+
subarr = dtype.construct_array_type()._from_sequence(data, dtype=dtype)
592602

593603
if subarr is data and copy:
594604
subarr = subarr.copy()

pandas/core/internals/construction.py

+21-1
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,8 @@
1313
import numpy as np
1414
from numpy import ma
1515

16+
from pandas._config import using_pyarrow_string_dtype
17+
1618
from pandas._libs import lib
1719

1820
from pandas.core.dtypes.astype import astype_is_view
@@ -30,7 +32,10 @@
3032
is_named_tuple,
3133
is_object_dtype,
3234
)
33-
from pandas.core.dtypes.dtypes import ExtensionDtype
35+
from pandas.core.dtypes.dtypes import (
36+
ArrowDtype,
37+
ExtensionDtype,
38+
)
3439
from pandas.core.dtypes.generic import (
3540
ABCDataFrame,
3641
ABCSeries,
@@ -65,6 +70,7 @@
6570
from pandas.core.internals.blocks import (
6671
BlockPlacement,
6772
ensure_block_shape,
73+
new_block,
6874
new_block_2d,
6975
)
7076
from pandas.core.internals.managers import (
@@ -372,6 +378,20 @@ def ndarray_to_mgr(
372378
bp = BlockPlacement(slice(len(columns)))
373379
nb = new_block_2d(values, placement=bp, refs=refs)
374380
block_values = [nb]
381+
elif dtype is None and values.dtype.kind == "U" and using_pyarrow_string_dtype():
382+
import pyarrow as pa
383+
384+
obj_columns = list(values)
385+
dtype = ArrowDtype(pa.string())
386+
block_values = [
387+
new_block(
388+
dtype.construct_array_type()._from_sequence(data, dtype=dtype),
389+
BlockPlacement(slice(i, i + 1)),
390+
ndim=1,
391+
)
392+
for i, data in enumerate(obj_columns)
393+
]
394+
375395
else:
376396
bp = BlockPlacement(slice(len(columns)))
377397
nb = new_block_2d(values, placement=bp, refs=refs)

pandas/tests/frame/test_constructors.py

+25
Original file line number | Diff line number | Diff line change
@@ -2718,6 +2718,31 @@ def test_frame_string_inference(self):
27182718
df = DataFrame({"a": ["a", "b"]}, dtype="object")
27192719
tm.assert_frame_equal(df, expected)
27202720

2721+
def test_frame_string_inference_array_string_dtype(self):
2722+
# GH#54496
2723+
pa = pytest.importorskip("pyarrow")
2724+
dtype = pd.ArrowDtype(pa.string())
2725+
expected = DataFrame(
2726+
{"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype)
2727+
)
2728+
with pd.option_context("future.infer_string", True):
2729+
df = DataFrame({"a": np.array(["a", "b"])})
2730+
tm.assert_frame_equal(df, expected)
2731+
2732+
expected = DataFrame({0: ["a", "b"], 1: ["c", "d"]}, dtype=dtype)
2733+
with pd.option_context("future.infer_string", True):
2734+
df = DataFrame(np.array([["a", "c"], ["b", "d"]]))
2735+
tm.assert_frame_equal(df, expected)
2736+
2737+
expected = DataFrame(
2738+
{"a": ["a", "b"], "b": ["c", "d"]},
2739+
dtype=dtype,
2740+
columns=Index(["a", "b"], dtype=dtype),
2741+
)
2742+
with pd.option_context("future.infer_string", True):
2743+
df = DataFrame(np.array([["a", "c"], ["b", "d"]]), columns=["a", "b"])
2744+
tm.assert_frame_equal(df, expected)
2745+
27212746

27222747
class TestDataFrameConstructorIndexInference:
27232748
def test_frame_from_dict_of_series_overlapping_monthly_period_indexes(self):

pandas/tests/series/test_constructors.py

+8
Original file line number | Diff line number | Diff line change
@@ -2107,6 +2107,14 @@ def test_series_string_inference_scalar(self):
21072107
ser = Series("a", index=[1])
21082108
tm.assert_series_equal(ser, expected)
21092109

2110+
def test_series_string_inference_array_string_dtype(self):
2111+
# GH#54496
2112+
pa = pytest.importorskip("pyarrow")
2113+
expected = Series(["a", "b"], dtype=pd.ArrowDtype(pa.string()))
2114+
with pd.option_context("future.infer_string", True):
2115+
ser = Series(np.array(["a", "b"]))
2116+
tm.assert_series_equal(ser, expected)
2117+
21102118

21112119
class TestSeriesConstructorIndexCoercion:
21122120
def test_series_constructor_datetimelike_index_coercion(self):

0 commit comments

Comments
 (0)