Skip to content

Commit 1fb626d

Browse files
authored
BUG: Handle zero-chunked pyarrow.ChunkedArray in StringArray (pandas-dev#41052)
1 parent f502748 commit 1fb626d

File tree

9 files changed, with 83 additions and 4 deletions.

doc/source/whatsnew/v1.3.0.rst

+1-1
Original file line number | Diff line number | Diff line change
@@ -710,7 +710,7 @@ Conversion
710710
Strings
711711
^^^^^^^
712712

713-
-
713+
- Bug in the conversion from ``pyarrow.ChunkedArray`` to :class:`~arrays.StringArray` when the original had zero chunks (:issue:`41040`)
714714
-
715715

716716
Interval

pandas/core/arrays/boolean.py

+6-1
Original file line number | Diff line number | Diff line change
@@ -140,7 +140,12 @@ def __from_arrow__(
140140
bool_arr = BooleanArray(data, mask)
141141
results.append(bool_arr)
142142

143-
return BooleanArray._concat_same_type(results)
143+
if not results:
144+
return BooleanArray(
145+
np.array([], dtype=np.bool_), np.array([], dtype=np.bool_)
146+
)
147+
else:
148+
return BooleanArray._concat_same_type(results)
144149

145150

146151
def coerce_to_array(

pandas/core/arrays/numeric.py

+5-1
Original file line number | Diff line number | Diff line change
@@ -66,7 +66,11 @@ def __from_arrow__(
6666
num_arr = array_class(data.copy(), ~mask, copy=False)
6767
results.append(num_arr)
6868

69-
if len(results) == 1:
69+
if not results:
70+
return array_class(
71+
np.array([], dtype=self.numpy_dtype), np.array([], dtype=np.bool_)
72+
)
73+
elif len(results) == 1:
7074
# avoid additional copy in _concat_same_type
7175
return results[0]
7276
else:

pandas/core/arrays/string_.py

+4-1
Original file line number | Diff line number | Diff line change
@@ -118,7 +118,10 @@ def __from_arrow__(
118118
str_arr = StringArray._from_sequence(np.array(arr))
119119
results.append(str_arr)
120120

121-
return StringArray._concat_same_type(results)
121+
if results:
122+
return StringArray._concat_same_type(results)
123+
else:
124+
return StringArray(np.array([], dtype="object"))
122125

123126

124127
class StringArray(PandasArray):

pandas/core/dtypes/dtypes.py

+8
Original file line number | Diff line number | Diff line change
@@ -1005,6 +1005,8 @@ def __from_arrow__(
10051005
parr[~mask] = NaT
10061006
results.append(parr)
10071007

1008+
if not results:
1009+
return PeriodArray(np.array([], dtype="int64"), freq=self.freq, copy=False)
10081010
return PeriodArray._concat_same_type(results)
10091011

10101012

@@ -1238,6 +1240,12 @@ def __from_arrow__(
12381240
iarr = IntervalArray.from_arrays(left, right, closed=array.type.closed)
12391241
results.append(iarr)
12401242

1243+
if not results:
1244+
return IntervalArray.from_arrays(
1245+
np.array([], dtype=self.subtype),
1246+
np.array([], dtype=self.subtype),
1247+
closed=array.type.closed,
1248+
)
12411249
return IntervalArray._concat_same_type(results)
12421250

12431251
def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None:

pandas/tests/arrays/interval/test_interval.py

+7
Original file line number | Diff line number | Diff line change
@@ -271,6 +271,13 @@ def test_arrow_table_roundtrip(breaks):
271271
expected = pd.concat([df, df], ignore_index=True)
272272
tm.assert_frame_equal(result, expected)
273273

274+
# GH-41040
275+
table = pa.table(
276+
[pa.chunked_array([], type=table.column(0).type)], schema=table.schema
277+
)
278+
result = table.to_pandas()
279+
tm.assert_frame_equal(result, expected[0:0])
280+
274281

275282
@pyarrow_skip
276283
@pytest.mark.parametrize(

pandas/tests/arrays/masked/test_arrow_compat.py

+16
Original file line number | Diff line number | Diff line change
@@ -41,6 +41,22 @@ def test_arrow_roundtrip(data):
4141
tm.assert_frame_equal(result, df)
4242

4343

44+
@td.skip_if_no("pyarrow", min_version="0.15.1.dev")
45+
def test_arrow_load_from_zero_chunks(data):
46+
# GH-41040
47+
import pyarrow as pa
48+
49+
df = pd.DataFrame({"a": data[0:0]})
50+
table = pa.table(df)
51+
assert table.field("a").type == str(data.dtype.numpy_dtype)
52+
table = pa.table(
53+
[pa.chunked_array([], type=table.field("a").type)], schema=table.schema
54+
)
55+
result = table.to_pandas()
56+
assert result["a"].dtype == data.dtype
57+
tm.assert_frame_equal(result, df)
58+
59+
4460
@td.skip_if_no("pyarrow", min_version="0.16.0")
4561
def test_arrow_from_arrow_uint():
4662
# https://github.com/pandas-dev/pandas/issues/31896

pandas/tests/arrays/period/test_arrow_compat.py

+20
Original file line number | Diff line number | Diff line change
@@ -100,6 +100,26 @@ def test_arrow_table_roundtrip():
100100
tm.assert_frame_equal(result, expected)
101101

102102

103+
@pyarrow_skip
104+
def test_arrow_load_from_zero_chunks():
105+
# GH-41040
106+
import pyarrow as pa
107+
108+
from pandas.core.arrays._arrow_utils import ArrowPeriodType
109+
110+
arr = PeriodArray([], freq="D")
111+
df = pd.DataFrame({"a": arr})
112+
113+
table = pa.table(df)
114+
assert isinstance(table.field("a").type, ArrowPeriodType)
115+
table = pa.table(
116+
[pa.chunked_array([], type=table.column(0).type)], schema=table.schema
117+
)
118+
result = table.to_pandas()
119+
assert isinstance(result["a"].dtype, PeriodDtype)
120+
tm.assert_frame_equal(result, df)
121+
122+
103123
@pyarrow_skip
104124
def test_arrow_table_roundtrip_without_metadata():
105125
import pyarrow as pa

pandas/tests/arrays/string_/test_string.py

+16
Original file line number | Diff line number | Diff line change
@@ -476,6 +476,22 @@ def test_arrow_roundtrip(dtype, dtype_object):
476476
assert result.loc[2, "a"] is pd.NA
477477

478478

479+
@td.skip_if_no("pyarrow", min_version="0.15.1.dev")
480+
def test_arrow_load_from_zero_chunks(dtype, dtype_object):
481+
# GH-41040
482+
import pyarrow as pa
483+
484+
data = pd.array([], dtype=dtype)
485+
df = pd.DataFrame({"a": data})
486+
table = pa.table(df)
487+
assert table.field("a").type == "string"
488+
# Instantiate the same table with no chunks at all
489+
table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema)
490+
result = table.to_pandas()
491+
assert isinstance(result["a"].dtype, dtype_object)
492+
tm.assert_frame_equal(result, df)
493+
494+
479495
def test_value_counts_na(dtype):
480496
arr = pd.array(["a", "b", "a", pd.NA], dtype=dtype)
481497
result = arr.value_counts(dropna=False)

0 commit comments

Comments
 (0)