Skip to content

Commit 95e83c5

Browse files
Backport PR pandas-dev#31918: BUG: fix parquet roundtrip with unsigned integer dtypes (pandas-dev#31928)
Co-authored-by: Joris Van den Bossche <[email protected]>
1 parent 494c108 commit 95e83c5

File tree

4 files changed

+29
-7
lines changed

4 files changed

+29
-7
lines changed

doc/source/whatsnew/v1.0.2.rst

+2
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@ Bug fixes
3333
**I/O**
3434

3535
- Using ``pd.NA`` with :meth:`DataFrame.to_json` now correctly outputs a null value instead of an empty object (:issue:`31615`)
36+
- Fixed bug in parquet roundtrip with nullable unsigned integer dtypes (:issue:`31896`).
37+
3638

3739

3840
**Experimental dtypes**

pandas/core/arrays/integer.py

+4
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,10 @@ def __from_arrow__(self, array):
9494
import pyarrow
9595
from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask
9696

97+
pyarrow_type = pyarrow.from_numpy_dtype(self.type)
98+
if not array.type.equals(pyarrow_type):
99+
array = array.cast(pyarrow_type)
100+
97101
if isinstance(array, pyarrow.Array):
98102
chunks = [array]
99103
else:

pandas/tests/arrays/test_integer.py

+15-2
Original file line numberDiff line numberDiff line change
@@ -1016,9 +1016,9 @@ def test_arrow_array(data):
10161016
assert arr.equals(expected)
10171017

10181018

1019-
@td.skip_if_no("pyarrow", min_version="0.15.1.dev")
1019+
@td.skip_if_no("pyarrow", min_version="0.16.0")
10201020
def test_arrow_roundtrip(data):
1021-
# roundtrip possible from arrow 1.0.0
1021+
# roundtrip possible from arrow 0.16.0
10221022
import pyarrow as pa
10231023

10241024
df = pd.DataFrame({"a": data})
@@ -1028,6 +1028,19 @@ def test_arrow_roundtrip(data):
10281028
tm.assert_frame_equal(result, df)
10291029

10301030

1031+
@td.skip_if_no("pyarrow", min_version="0.16.0")
1032+
def test_arrow_from_arrow_uint():
1033+
# https://github.com/pandas-dev/pandas/issues/31896
1034+
# the arrow array's type (here int64) may not match the pandas dtype (UInt32); __from_arrow__ should cast it
1035+
import pyarrow as pa
1036+
1037+
dtype = pd.UInt32Dtype()
1038+
result = dtype.__from_arrow__(pa.array([1, 2, 3, 4, None], type="int64"))
1039+
expected = pd.array([1, 2, 3, 4, None], dtype="UInt32")
1040+
1041+
tm.assert_extension_array_equal(result, expected)
1042+
1043+
10311044
@pytest.mark.parametrize(
10321045
"pandasmethname, kwargs",
10331046
[

pandas/tests/io/test_parquet.py

+8-5
Original file line numberDiff line numberDiff line change
@@ -533,25 +533,28 @@ def test_additional_extension_arrays(self, pa):
533533
df = pd.DataFrame(
534534
{
535535
"a": pd.Series([1, 2, 3], dtype="Int64"),
536-
"b": pd.Series(["a", None, "c"], dtype="string"),
536+
"b": pd.Series([1, 2, 3], dtype="UInt32"),
537+
"c": pd.Series(["a", None, "c"], dtype="string"),
537538
}
538539
)
539-
if LooseVersion(pyarrow.__version__) >= LooseVersion("0.15.1.dev"):
540+
if LooseVersion(pyarrow.__version__) >= LooseVersion("0.16.0"):
540541
expected = df
541542
else:
542543
# de-serialized as plain int / object
543-
expected = df.assign(a=df.a.astype("int64"), b=df.b.astype("object"))
544+
expected = df.assign(
545+
a=df.a.astype("int64"), b=df.b.astype("int64"), c=df.c.astype("object")
546+
)
544547
check_round_trip(df, pa, expected=expected)
545548

546549
df = pd.DataFrame({"a": pd.Series([1, 2, 3, None], dtype="Int64")})
547-
if LooseVersion(pyarrow.__version__) >= LooseVersion("0.15.1.dev"):
550+
if LooseVersion(pyarrow.__version__) >= LooseVersion("0.16.0"):
548551
expected = df
549552
else:
550553
# if missing values in integer, currently de-serialized as float
551554
expected = df.assign(a=df.a.astype("float64"))
552555
check_round_trip(df, pa, expected=expected)
553556

554-
@td.skip_if_no("pyarrow", min_version="0.15.1.dev")
557+
@td.skip_if_no("pyarrow", min_version="0.16.0")
555558
def test_additional_extension_types(self, pa):
556559
# test additional ExtensionArrays that are supported through the
557560
# __arrow_array__ protocol + by defining a custom ExtensionType

0 commit comments

Comments
 (0)