Skip to content

Commit 9767da6

Browse files
BUG: fix parquet roundtrip with unsigned integer dtypes (#31918)
1 parent 361a938 commit 9767da6

File tree

4 files changed

+29
-7
lines changed

4 files changed

+29
-7
lines changed

doc/source/whatsnew/v1.0.2.rst

+2
Original file line number | Diff line number | Diff line change
@@ -35,6 +35,8 @@ Bug fixes
3535
**I/O**
3636

3737
- Using ``pd.NA`` with :meth:`DataFrame.to_json` now correctly outputs a null value instead of an empty object (:issue:`31615`)
38+
- Fixed bug in parquet roundtrip with nullable unsigned integer dtypes (:issue:`31896`).
39+
3840

3941
**Experimental dtypes**
4042

pandas/core/arrays/integer.py

+4
Original file line number | Diff line number | Diff line change
@@ -103,6 +103,10 @@ def __from_arrow__(
103103
import pyarrow # noqa: F811
104104
from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask
105105

106+
pyarrow_type = pyarrow.from_numpy_dtype(self.type)
107+
if not array.type.equals(pyarrow_type):
108+
array = array.cast(pyarrow_type)
109+
106110
if isinstance(array, pyarrow.Array):
107111
chunks = [array]
108112
else:

pandas/tests/arrays/test_integer.py

+15-2
Original file line number | Diff line number | Diff line change
@@ -1036,9 +1036,9 @@ def test_arrow_array(data):
10361036
assert arr.equals(expected)
10371037

10381038

1039-
@td.skip_if_no("pyarrow", min_version="0.15.1.dev")
1039+
@td.skip_if_no("pyarrow", min_version="0.16.0")
10401040
def test_arrow_roundtrip(data):
1041-
# roundtrip possible from arrow 1.0.0
1041+
# roundtrip possible from arrow 0.16.0
10421042
import pyarrow as pa
10431043

10441044
df = pd.DataFrame({"a": data})
@@ -1048,6 +1048,19 @@ def test_arrow_roundtrip(data):
10481048
tm.assert_frame_equal(result, df)
10491049

10501050

1051+
@td.skip_if_no("pyarrow", min_version="0.16.0")
1052+
def test_arrow_from_arrow_uint():
1053+
# https://github.com/pandas-dev/pandas/issues/31896
1054+
# possible mismatch in types
1055+
import pyarrow as pa
1056+
1057+
dtype = pd.UInt32Dtype()
1058+
result = dtype.__from_arrow__(pa.array([1, 2, 3, 4, None], type="int64"))
1059+
expected = pd.array([1, 2, 3, 4, None], dtype="UInt32")
1060+
1061+
tm.assert_extension_array_equal(result, expected)
1062+
1063+
10511064
@pytest.mark.parametrize(
10521065
"pandasmethname, kwargs",
10531066
[

pandas/tests/io/test_parquet.py

+8-5
Original file line number | Diff line number | Diff line change
@@ -533,25 +533,28 @@ def test_additional_extension_arrays(self, pa):
533533
df = pd.DataFrame(
534534
{
535535
"a": pd.Series([1, 2, 3], dtype="Int64"),
536-
"b": pd.Series(["a", None, "c"], dtype="string"),
536+
"b": pd.Series([1, 2, 3], dtype="UInt32"),
537+
"c": pd.Series(["a", None, "c"], dtype="string"),
537538
}
538539
)
539-
if LooseVersion(pyarrow.__version__) >= LooseVersion("0.15.1.dev"):
540+
if LooseVersion(pyarrow.__version__) >= LooseVersion("0.16.0"):
540541
expected = df
541542
else:
542543
# de-serialized as plain int / object
543-
expected = df.assign(a=df.a.astype("int64"), b=df.b.astype("object"))
544+
expected = df.assign(
545+
a=df.a.astype("int64"), b=df.b.astype("int64"), c=df.c.astype("object")
546+
)
544547
check_round_trip(df, pa, expected=expected)
545548

546549
df = pd.DataFrame({"a": pd.Series([1, 2, 3, None], dtype="Int64")})
547-
if LooseVersion(pyarrow.__version__) >= LooseVersion("0.15.1.dev"):
550+
if LooseVersion(pyarrow.__version__) >= LooseVersion("0.16.0"):
548551
expected = df
549552
else:
550553
# if missing values in integer, currently de-serialized as float
551554
expected = df.assign(a=df.a.astype("float64"))
552555
check_round_trip(df, pa, expected=expected)
553556

554-
@td.skip_if_no("pyarrow", min_version="0.15.1.dev")
557+
@td.skip_if_no("pyarrow", min_version="0.16.0")
555558
def test_additional_extension_types(self, pa):
556559
# test additional ExtensionArrays that are supported through the
557560
# __arrow_array__ protocol + by defining a custom ExtensionType

0 commit comments

Comments
 (0)