Skip to content

Commit 5aab71e

Browse files
mroeschke authored and phofl committed
BUG: pickling subset of Arrow-backed data would serialize the entire data (pandas-dev#49078)
* BUG: pickling subset of Arrow-backed data would serialize the entire data * Use data
1 parent 8bd5f89 commit 5aab71e

File tree

4 files changed

+47
-0
lines changed

4 files changed

+47
-0
lines changed

doc/source/whatsnew/v2.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -274,6 +274,7 @@ MultiIndex
274274
I/O
275275
^^^
276276
- Bug in :func:`read_sas` caused fragmentation of :class:`DataFrame` and raised :class:`.errors.PerformanceWarning` (:issue:`48595`)
277+
- Bug when pickling a subset of PyArrow-backed data that would serialize the entire data instead of the subset (:issue:`42600`)
277278
- Bug in :func:`read_csv` for a single-line csv with fewer columns than ``names`` raised :class:`.errors.ParserError` with ``engine="c"`` (:issue:`47566`)
278279
-
279280

pandas/core/arrays/arrow/array.py

+11
Original file line numberDiff line numberDiff line change
@@ -351,6 +351,17 @@ def __pos__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
351351
def __abs__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
352352
return type(self)(pc.abs_checked(self._data))
353353

354+
# GH 42600: __getstate__/__setstate__ not necessary once
355+
# https://issues.apache.org/jira/browse/ARROW-10739 is addressed
356+
def __getstate__(self):
357+
state = self.__dict__.copy()
358+
state["_data"] = self._data.combine_chunks()
359+
return state
360+
361+
def __setstate__(self, state) -> None:
362+
state["_data"] = pa.chunked_array(state["_data"])
363+
self.__dict__.update(state)
364+
354365
def _cmp_method(self, other, op):
355366
from pandas.arrays import BooleanArray
356367

pandas/tests/arrays/string_/test_string_arrow.py

+18
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import pickle
12
import re
23

34
import numpy as np
@@ -197,3 +198,20 @@ def test_setitem_invalid_indexer_raises():
197198

198199
with pytest.raises(ValueError, match=None):
199200
arr[[0, 1]] = ["foo", "bar", "baz"]
201+
202+
203+
@skip_if_no_pyarrow
204+
def test_pickle_roundtrip():
205+
# GH 42600
206+
expected = pd.Series(range(10), dtype="string[pyarrow]")
207+
expected_sliced = expected.head(2)
208+
full_pickled = pickle.dumps(expected)
209+
sliced_pickled = pickle.dumps(expected_sliced)
210+
211+
assert len(full_pickled) > len(sliced_pickled)
212+
213+
result = pickle.loads(full_pickled)
214+
tm.assert_series_equal(result, expected)
215+
216+
result_sliced = pickle.loads(sliced_pickled)
217+
tm.assert_series_equal(result_sliced, expected_sliced)

pandas/tests/extension/test_arrow.py

+17
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
BytesIO,
2121
StringIO,
2222
)
23+
import pickle
2324

2425
import numpy as np
2526
import pytest
@@ -1347,3 +1348,19 @@ def test_is_bool_dtype():
13471348
result = s[data]
13481349
expected = s[np.asarray(data)]
13491350
tm.assert_series_equal(result, expected)
1351+
1352+
1353+
def test_pickle_roundtrip(data):
1354+
# GH 42600
1355+
expected = pd.Series(data)
1356+
expected_sliced = expected.head(2)
1357+
full_pickled = pickle.dumps(expected)
1358+
sliced_pickled = pickle.dumps(expected_sliced)
1359+
1360+
assert len(full_pickled) > len(sliced_pickled)
1361+
1362+
result = pickle.loads(full_pickled)
1363+
tm.assert_series_equal(result, expected)
1364+
1365+
result_sliced = pickle.loads(sliced_pickled)
1366+
tm.assert_series_equal(result_sliced, expected_sliced)

0 commit comments

Comments
 (0)