BUG: Handle zero-chunked pyarrow.ChunkedArray in StringArray (pandas-dev#41052)

xhochy · web-flow · commit 1fb626d0252e · 2021-04-21T21:44:50.000+02:00
diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
@@ -710,7 +710,7 @@ Conversion
 Strings
 ^^^^^^^
 
--
+- Bug in the conversion from ``pyarrow.ChunkedArray`` to :class:`~arrays.StringArray` when the original had zero chunks (:issue:`41040`)
 -
 
 Interval
diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py
@@ -140,7 +140,12 @@ def __from_arrow__(
             bool_arr = BooleanArray(data, mask)
             results.append(bool_arr)
 
-        return BooleanArray._concat_same_type(results)
+        if not results:
+            return BooleanArray(
+                np.array([], dtype=np.bool_), np.array([], dtype=np.bool_)
+            )
+        else:
+            return BooleanArray._concat_same_type(results)
 
 
 def coerce_to_array(
diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py
@@ -66,7 +66,11 @@ def __from_arrow__(
             num_arr = array_class(data.copy(), ~mask, copy=False)
             results.append(num_arr)
 
-        if len(results) == 1:
+        if not results:
+            return array_class(
+                np.array([], dtype=self.numpy_dtype), np.array([], dtype=np.bool_)
+            )
+        elif len(results) == 1:
             # avoid additional copy in _concat_same_type
             return results[0]
         else:
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
@@ -118,7 +118,10 @@ def __from_arrow__(
             str_arr = StringArray._from_sequence(np.array(arr))
             results.append(str_arr)
 
-        return StringArray._concat_same_type(results)
+        if results:
+            return StringArray._concat_same_type(results)
+        else:
+            return StringArray(np.array([], dtype="object"))
 
 
 class StringArray(PandasArray):
diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py
@@ -1005,6 +1005,8 @@ def __from_arrow__(
             parr[~mask] = NaT
             results.append(parr)
 
+        if not results:
+            return PeriodArray(np.array([], dtype="int64"), freq=self.freq, copy=False)
         return PeriodArray._concat_same_type(results)
 
 
@@ -1238,6 +1240,12 @@ def __from_arrow__(
             iarr = IntervalArray.from_arrays(left, right, closed=array.type.closed)
             results.append(iarr)
 
+        if not results:
+            return IntervalArray.from_arrays(
+                np.array([], dtype=self.subtype),
+                np.array([], dtype=self.subtype),
+                closed=array.type.closed,
+            )
         return IntervalArray._concat_same_type(results)
 
     def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None:
diff --git a/pandas/tests/arrays/interval/test_interval.py b/pandas/tests/arrays/interval/test_interval.py
@@ -271,6 +271,13 @@ def test_arrow_table_roundtrip(breaks):
     expected = pd.concat([df, df], ignore_index=True)
     tm.assert_frame_equal(result, expected)
 
+    # GH-41040
+    table = pa.table(
+        [pa.chunked_array([], type=table.column(0).type)], schema=table.schema
+    )
+    result = table.to_pandas()
+    tm.assert_frame_equal(result, expected[0:0])
+
 
 @pyarrow_skip
 @pytest.mark.parametrize(
diff --git a/pandas/tests/arrays/masked/test_arrow_compat.py b/pandas/tests/arrays/masked/test_arrow_compat.py
@@ -41,6 +41,22 @@ def test_arrow_roundtrip(data):
     tm.assert_frame_equal(result, df)
 
 
+@td.skip_if_no("pyarrow", min_version="0.15.1.dev")
+def test_arrow_load_from_zero_chunks(data):
+    # GH-41040
+    import pyarrow as pa
+
+    df = pd.DataFrame({"a": data[0:0]})
+    table = pa.table(df)
+    assert table.field("a").type == str(data.dtype.numpy_dtype)
+    table = pa.table(
+        [pa.chunked_array([], type=table.field("a").type)], schema=table.schema
+    )
+    result = table.to_pandas()
+    assert result["a"].dtype == data.dtype
+    tm.assert_frame_equal(result, df)
+
+
 @td.skip_if_no("pyarrow", min_version="0.16.0")
 def test_arrow_from_arrow_uint():
     # https://github.com/pandas-dev/pandas/issues/31896
diff --git a/pandas/tests/arrays/period/test_arrow_compat.py b/pandas/tests/arrays/period/test_arrow_compat.py
@@ -100,6 +100,26 @@ def test_arrow_table_roundtrip():
     tm.assert_frame_equal(result, expected)
 
 
+@pyarrow_skip
+def test_arrow_load_from_zero_chunks():
+    # GH-41040
+    import pyarrow as pa
+
+    from pandas.core.arrays._arrow_utils import ArrowPeriodType
+
+    arr = PeriodArray([], freq="D")
+    df = pd.DataFrame({"a": arr})
+
+    table = pa.table(df)
+    assert isinstance(table.field("a").type, ArrowPeriodType)
+    table = pa.table(
+        [pa.chunked_array([], type=table.column(0).type)], schema=table.schema
+    )
+    result = table.to_pandas()
+    assert isinstance(result["a"].dtype, PeriodDtype)
+    tm.assert_frame_equal(result, df)
+
+
 @pyarrow_skip
 def test_arrow_table_roundtrip_without_metadata():
     import pyarrow as pa
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
@@ -476,6 +476,22 @@ def test_arrow_roundtrip(dtype, dtype_object):
     assert result.loc[2, "a"] is pd.NA
 
 
+@td.skip_if_no("pyarrow", min_version="0.15.1.dev")
+def test_arrow_load_from_zero_chunks(dtype, dtype_object):
+    # GH-41040
+    import pyarrow as pa
+
+    data = pd.array([], dtype=dtype)
+    df = pd.DataFrame({"a": data})
+    table = pa.table(df)
+    assert table.field("a").type == "string"
+    # Instantiate the same table with no chunks at all
+    table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema)
+    result = table.to_pandas()
+    assert isinstance(result["a"].dtype, dtype_object)
+    tm.assert_frame_equal(result, df)
+
+
 def test_value_counts_na(dtype):
     arr = pd.array(["a", "b", "a", pd.NA], dtype=dtype)
     result = arr.value_counts(dropna=False)

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -710,7 +710,7 @@ Conversion` |
| `710` | `710` | `Strings` |
| `711` | `711` | `^^^^^^^` |
| `712` | `712` |  |
| `713` |  | `--` |
|  | `713` | +- Bug in the conversion from ``pyarrow.ChunkedArray`` to :class:`~arrays.StringArray` when the original had zero chunks (:issue:`41040`) |
| `714` | `714` | `-` |
| `715` | `715` |  |
| `716` | `716` | `Interval` |