From 33281d5a25422e5b129752b1072117bb9a0ae2f9 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 19 Feb 2020 07:44:39 -0600 Subject: [PATCH 1/3] BUG: Pickle NA objects MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit According to https://docs.python.org/3/library/pickle.html#object.__reduce__, > If a string is returned, the string should be interpreted as the name > of a global variable. It should be the object’s local name relative to > its module; the pickle module searches the module namespace to determine > the object’s module. This behaviour is typically useful for singletons. Closes https://github.com/pandas-dev/pandas/issues/31847 --- doc/source/whatsnew/v1.0.2.rst | 1 + pandas/_libs/missing.pyx | 3 +++ pandas/tests/scalar/test_na_scalar.py | 8 ++++++++ 3 files changed, 12 insertions(+) diff --git a/doc/source/whatsnew/v1.0.2.rst b/doc/source/whatsnew/v1.0.2.rst index c9031ac1ae9fe..0201f29fa87e4 100644 --- a/doc/source/whatsnew/v1.0.2.rst +++ b/doc/source/whatsnew/v1.0.2.rst @@ -37,6 +37,7 @@ Bug fixes **I/O** - Using ``pd.NA`` with :meth:`DataFrame.to_json` now correctly outputs a null value instead of an empty object (:issue:`31615`) +- Fixed pickling of ``pandas.NA``. Previously a new object was returned, which broke computations relying on ``NA`` being a singleton (:issue:`31847`) - Fixed bug in parquet roundtrip with nullable unsigned integer dtypes (:issue:`31896`). diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index 4d17a6f883c1c..c54cb652d7b21 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -364,6 +364,9 @@ class NAType(C_NAType): exponent = 31 if is_32bit else 61 return 2 ** exponent - 1 + def __reduce__(self): + return "NA" + # Binary arithmetic and comparison ops -> propagate __add__ = _create_binary_propagating_op("__add__") diff --git a/pandas/tests/scalar/test_na_scalar.py b/pandas/tests/scalar/test_na_scalar.py index dcb9d66708724..0bb7d3e547c9c 100644 --- a/pandas/tests/scalar/test_na_scalar.py +++ b/pandas/tests/scalar/test_na_scalar.py @@ -1,3 +1,5 @@ +import pickle + import numpy as np import pytest @@ -267,3 +269,9 @@ def test_integer_hash_collision_set(): assert len(result) == 2 assert NA in result assert hash(NA) in result + + +def test_pickle_roundtrip(): + # https://github.com/pandas-dev/pandas/issues/31847 + result = pickle.loads(pickle.dumps(pd.NA)) + assert result is pd.NA From 8827a504284158e2ead0f76be3202959ce643bda Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 19 Feb 2020 08:56:00 -0600 Subject: [PATCH 2/3] more --- pandas/tests/scalar/test_na_scalar.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/pandas/tests/scalar/test_na_scalar.py b/pandas/tests/scalar/test_na_scalar.py index 0bb7d3e547c9c..6dff0eb8ae91f 100644 --- a/pandas/tests/scalar/test_na_scalar.py +++ b/pandas/tests/scalar/test_na_scalar.py @@ -275,3 +275,23 @@ def test_pickle_roundtrip(): # https://github.com/pandas-dev/pandas/issues/31847 result = pickle.loads(pickle.dumps(pd.NA)) assert result is pd.NA + + +def test_pickle_roundtrip_pandas(): + with tm.ensure_clean("data.pkl") as f: + with open(f): + pd.to_pickle(pd.NA, f) + with open(f): + result = pd.read_pickle(f) + assert result is pd.NA + + +def test_pickle_roundtrip_series(): + s = pd.Series(pd.array([1, 2, pd.NA])) + with tm.ensure_clean("data.pkl") as f: + with open(f): + pd.to_pickle(s, f) + with open(f): + result = pd.read_pickle(f) + + tm.assert_series_equal(result, s) From d3c6bce9b3e3310d4d6077b3edb6040a2c5a181a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 2 Mar 2020 11:24:10 -0600 Subject: [PATCH 3/3] fixup --- pandas/tests/scalar/test_na_scalar.py | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/pandas/tests/scalar/test_na_scalar.py b/pandas/tests/scalar/test_na_scalar.py index 6dff0eb8ae91f..07656de2e9062 100644 --- a/pandas/tests/scalar/test_na_scalar.py +++ b/pandas/tests/scalar/test_na_scalar.py @@ -278,20 +278,17 @@ def test_pickle_roundtrip(): def test_pickle_roundtrip_pandas(): - with tm.ensure_clean("data.pkl") as f: - with open(f): - pd.to_pickle(pd.NA, f) - with open(f): - result = pd.read_pickle(f) + result = tm.round_trip_pickle(pd.NA) assert result is pd.NA -def test_pickle_roundtrip_series(): - s = pd.Series(pd.array([1, 2, pd.NA])) - with tm.ensure_clean("data.pkl") as f: - with open(f): - pd.to_pickle(s, f) - with open(f): - result = pd.read_pickle(f) - - tm.assert_series_equal(result, s) +@pytest.mark.parametrize( + "values, dtype", [([1, 2, pd.NA], "Int64"), (["A", "B", pd.NA], "string")] +) +@pytest.mark.parametrize("as_frame", [True, False]) +def test_pickle_roundtrip_containers(as_frame, values, dtype): + s = pd.Series(pd.array(values, dtype=dtype)) + if as_frame: + s = s.to_frame(name="A") + result = tm.round_trip_pickle(s) + tm.assert_equal(result, s)