Skip to content

Commit c6a449c

Browse files
authored
ENH: Allow NEP 42 dtypes to use np.save and np.load (numpy#24142)

Fixes numpy#24110.

First, this makes it so that, by default, NEP 42 dtypes cannot be pickled unless the dtype provides a pickle implementation. Currently numpy will pickle them but cannot unpickle them, because the type code written to disk is invalid. Raising an error is an improvement over writing corrupt files.

Second, if a dtype can be pickled, np.save now saves the array using pickle and records the dtype as object in the header (per @rkern's suggestion). A UserWarning is emitted when this happens. Unfortunately there is no way to indicate in the file that the array is not really an object array, so nothing can be done on the load side to detect this case; the UserWarning at save time will have to suffice. Adding such an indicator would require a revision of the npy file format, which this change deliberately avoids.

Last, this adds a pickle implementation to the scaled-float test dtype, plus a test performing a round-trip save and load with a scaled-float array.
1 parent 5047644 commit c6a449c

File tree

4 files changed

+65
-3
lines changed

4 files changed

+65
-3
lines changed

numpy/core/src/multiarray/descriptor.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2631,6 +2631,13 @@ arraydescr_reduce(PyArray_Descr *self, PyObject *NPY_UNUSED(args))
         obj = (PyObject *)self->typeobj;
         Py_INCREF(obj);
     }
+    else if (!NPY_DT_is_legacy(NPY_DTYPE(self))) {
+        PyErr_SetString(PyExc_RuntimeError,
+                "Custom dtypes cannot use the default pickle implementation "
+                "for NumPy dtypes. Add a custom pickle implementation to the "
+                "DType to avoid this error");
+        return NULL;
+    }
     else {
         elsize = self->elsize;
         if (self->type_num == NPY_UNICODE) {

numpy/core/src/umath/_scaled_float_dtype.c

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,13 +194,21 @@ sfloat_get_scaling(PyArray_SFloatDescr *self, PyObject *NPY_UNUSED(args))
194194
}
195195

196196

197+
static PyObject *
198+
sfloat___reduce__(PyArray_SFloatDescr *self)
199+
{
200+
return Py_BuildValue("(O(d))", Py_TYPE(self), self->scaling);
201+
}
202+
197203
PyMethodDef sfloat_methods[] = {
198204
{"scaled_by",
199205
(PyCFunction)python_sfloat_scaled_copy, METH_O,
200206
"Method to get a dtype copy with different scaling, mainly to "
201207
"avoid having to implement many ways to create new instances."},
202208
{"get_scaling",
203209
(PyCFunction)sfloat_get_scaling, METH_NOARGS, NULL},
210+
{"__reduce__",
211+
(PyCFunction)sfloat___reduce__, METH_NOARGS, NULL},
204212
{NULL, NULL, 0, NULL}
205213
};
206214

numpy/core/tests/test_custom_dtypes.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
+from tempfile import NamedTemporaryFile
+
 import pytest
 
 import numpy as np
@@ -243,6 +245,28 @@ def test_creation_class(self):
         assert np.zeros(3, dtype=SF).dtype == SF(1.)
         assert np.zeros_like(arr1, dtype=SF).dtype == SF(1.)
 
+    def test_np_save_load(self):
+        # this monkeypatch is needed because pickle
+        # uses the repr of a type to reconstruct it
+        np._ScaledFloatTestDType = SF
+
+        arr = np.array([1.0, 2.0, 3.0], dtype=SF(1.0))
+
+        # adapted from RoundtripTest.roundtrip in np.save tests
+        with NamedTemporaryFile("wb", delete=False, suffix=".npz") as f:
+            with pytest.warns(UserWarning) as record:
+                np.savez(f.name, arr)
+
+        assert len(record) == 1
+
+        with np.load(f.name, allow_pickle=True) as data:
+            larr = data["arr_0"]
+        assert_array_equal(arr.view(np.float64), larr.view(np.float64))
+        assert larr.dtype == arr.dtype == SF(1.0)
+
+        del np._ScaledFloatTestDType
+
 
 def test_type_pickle():
     # can't actually unpickle, but we can pickle (if in namespace)

numpy/lib/format.py

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,23 @@ def dtype_to_descr(dtype):
         # fiddled with. This needs to be fixed in the C implementation of
         # dtype().
         return dtype.descr
+    elif not type(dtype)._legacy:
+        # this must be a user-defined dtype since numpy does not yet expose any
+        # non-legacy dtypes in the public API
+        #
+        # non-legacy dtypes don't yet have __array_interface__
+        # support. Instead, as a hack, we use pickle to save the array, and lie
+        # that the dtype is object. When the array is loaded, the descriptor is
+        # unpickled with the array and the object dtype in the header is
+        # discarded.
+        #
+        # a future NEP should define a way to serialize user-defined
+        # descriptors and ideally work out the possible security implications
+        warnings.warn("Custom dtypes are saved as python objects using the "
+                      "pickle protocol. Loading this file requires "
+                      "allow_pickle=True to be set.",
+                      UserWarning, stacklevel=2)
+        return "|O"
     else:
         return dtype.str

@@ -710,12 +727,18 @@ def write_array(fp, array, version=None, allow_pickle=True, pickle_kwargs=None):
     # Set buffer size to 16 MiB to hide the Python loop overhead.
     buffersize = max(16 * 1024 ** 2 // array.itemsize, 1)
 
-    if array.dtype.hasobject:
+    dtype_class = type(array.dtype)
+
+    if array.dtype.hasobject or not dtype_class._legacy:
         # We contain Python objects so we cannot write out the data
         # directly. Instead, we will pickle it out
         if not allow_pickle:
-            raise ValueError("Object arrays cannot be saved when "
-                             "allow_pickle=False")
+            if array.dtype.hasobject:
+                raise ValueError("Object arrays cannot be saved when "
+                                 "allow_pickle=False")
+            if not dtype_class._legacy:
+                raise ValueError("User-defined dtypes cannot be saved "
+                                 "when allow_pickle=False")
         if pickle_kwargs is None:
             pickle_kwargs = {}
         pickle.dump(array, fp, protocol=3, **pickle_kwargs)

0 commit comments

Comments
 (0)