Merge pull request #107 from Quansight-Labs/dtypes_in_ufuncs

ev-br · web-flow · commit 773d77299945 · 2023-04-06T13:54:02.000+03:00
BUG: fix dtype handling in ufuncs
diff --git a/torch_np/_helpers.py b/torch_np/_helpers.py
@@ -3,33 +3,6 @@
 from ._detail import _dtypes_impl, _util
 
 
-def ufunc_preprocess(
-    tensors, out, where, casting, order, dtype, subok, signature, extobj
-):
-    """
-    Notes
-    -----
-    The `out` array broadcasts `tensors`, but not vice versa.
-    """
-    # internal preprocessing or args in ufuncs (cf _unary_ufuncs, _binary_ufuncs)
-    if order != "K" or not where or signature or extobj:
-        raise NotImplementedError
-
-    # dtype of the result: depends on both dtype=... and out=... arguments
-    if dtype is None:
-        out_dtype = None if out is None else out.dtype.torch_dtype
-    else:
-        out_dtype = (
-            dtype
-            if out is None
-            else _dtypes_impl.result_type_impl([dtype, out.dtype.torch_dtype])
-        )
-
-    if out_dtype:
-        tensors = _util.typecast_tensors(tensors, out_dtype, casting)
-    return tensors
-
-
 def ndarrays_to_tensors(*inputs):
     """Convert all ndarrays from `inputs` to tensors. (other things are intact)"""
     from ._ndarray import asarray, ndarray
diff --git a/torch_np/_ufuncs.py b/torch_np/_ufuncs.py
@@ -3,8 +3,29 @@
 import torch
 
 from . import _binary_ufuncs_impl, _helpers, _unary_ufuncs_impl
+from ._detail import _dtypes_impl, _util
 from ._normalizations import ArrayLike, DTypeLike, NDArray, SubokLike, normalizer
 
+
+def _ufunc_preprocess(tensors, where, casting, order, dtype, subok, signature, extobj):
+    if order != "K" or not where or signature or extobj:
+        raise NotImplementedError
+
+    if dtype is None:
+        dtype = _dtypes_impl.result_type_impl([t.dtype for t in tensors])
+
+    tensors = _util.typecast_tensors(tensors, dtype, casting)
+
+    return tensors
+
+
+def _ufunc_postprocess(result, out, casting):
+    if out is not None:
+        (result,) = _util.typecast_tensors((result,), out.dtype.torch_dtype, casting)
+        result = torch.broadcast_to(result, out.shape)
+    return result
+
+
 # ############# Binary ufuncs ######################
 
 _binary = [
@@ -35,16 +56,12 @@ def wrapped(
         signature=None,
         extobj=None,
     ):
-        tensors = _helpers.ufunc_preprocess(
-            (x1, x2), out, where, casting, order, dtype, subok, signature, extobj
+        tensors = _ufunc_preprocess(
+            (x1, x2), where, casting, order, dtype, subok, signature, extobj
         )
-        # now broadcast input tensors against the out=... array
-        if out is not None:
-            # XXX: need to filter out noop broadcasts if t.shape == out.shape?
-            shape = out.shape
-            tensors = tuple(torch.broadcast_to(t, shape) for t in tensors)
-
         result = torch_func(*tensors)
+
+        result = _ufunc_postprocess(result, out, casting)
         return result
 
     wrapped.__qualname__ = torch_func.__name__
@@ -54,8 +71,9 @@ def wrapped(
 
 
 #
-# matmul is special in that its `out=...` array does not broadcast x1 and x2.
-# E.g. consider x1.shape = (5, 2) and x2.shape = (2, 3). Then `out.shape` is (5, 3).
+# matmul's signature is _slightly_ different from other ufuncs:
+# - no where=...
+# - additional axis=..., axes=...
 #
 @normalizer
 def matmul(
@@ -73,17 +91,21 @@ def matmul(
     axes=None,
     axis=None,
 ):
-    tensors = _helpers.ufunc_preprocess(
-        (x1, x2), out, True, casting, order, dtype, subok, signature, extobj
+    tensors = _ufunc_preprocess(
+        (x1, x2), True, casting, order, dtype, subok, signature, extobj
     )
     if axis is not None or axes is not None:
         raise NotImplementedError
 
-    # NB: do not broadcast input tensors against the out=... array
     result = _binary_ufuncs_impl.matmul(*tensors)
+
+    result = _ufunc_postprocess(result, out, casting)
     return result
 
 
+#
+# nin=2, nout=2
+#
 def divmod(
     x1: ArrayLike,
     x2: ArrayLike,
@@ -110,12 +132,14 @@ def divmod(
     if out1.shape != out2.shape or out1.dtype != out2.dtype:
         raise ValueError("out1, out2 must be compatible")
 
-    tensors = _helpers.ufunc_preprocess(
-        (x1, x2), out, True, casting, order, dtype, subok, signature, extobj
+    tensors = _ufunc_preprocess(
+        (x1, x2), True, casting, order, dtype, subok, signature, extobj
     )
 
-    result = _binary_ufuncs_impl.divmod(*tensors)
+    quot, rem = _binary_ufuncs_impl.divmod(*tensors)
 
+    quot = _ufunc_postprocess(quot, out1, casting)
+    rem = _ufunc_postprocess(rem, out2, casting)
     return quot, rem
 
 
@@ -167,15 +191,11 @@ def wrapped(
         signature=None,
         extobj=None,
     ):
-        tensors = _helpers.ufunc_preprocess(
-            (x,), out, where, casting, order, dtype, subok, signature, extobj
+        tensors = _ufunc_preprocess(
+            (x,), where, casting, order, dtype, subok, signature, extobj
         )
-        # now broadcast the input tensor against the out=... array
-        if out is not None:
-            # XXX: need to filter out noop broadcasts if t.shape == out.shape?
-            shape = out.shape
-            tensors = tuple(torch.broadcast_to(t, shape) for t in tensors)
         result = torch_func(*tensors)
+        result = _ufunc_postprocess(result, out, casting)
         return result
 
     wrapped.__qualname__ = torch_func.__name__
diff --git a/torch_np/tests/numpy_tests/core/test_multiarray.py b/torch_np/tests/numpy_tests/core/test_multiarray.py
@@ -2907,7 +2907,7 @@ def test_inplace(self):
         b = np.array([3])
         c = (a * a) / b
 
-        assert_almost_equal(c, 25 / 3)
+        assert_almost_equal(c, 25 / 3, decimal=5)
         assert_equal(a, 5)
         assert_equal(b, 3)
 
@@ -5577,7 +5577,7 @@ def test_empty_out(self):
         out = np.ones((1, 1, 1))
         assert self.matmul(arr, arr).shape == (0, 1, 1)
 
-        with pytest.raises(ValueError, match="Bad size of the out array"):  # match=r"non-broadcastable"):
+        with pytest.raises((RuntimeError, ValueError)):
             self.matmul(arr, arr, out=out)
 
     def test_out_contiguous(self):
diff --git a/torch_np/tests/test_ufuncs_basic.py b/torch_np/tests/test_ufuncs_basic.py
@@ -105,12 +105,6 @@ def test_x_and_out_broadcast(self, ufunc):
     (np.add, operator.__add__, operator.__iadd__),
     (np.subtract, operator.__sub__, operator.__isub__),
     (np.multiply, operator.__mul__, operator.__imul__),
-    (np.divide, operator.__truediv__, operator.__itruediv__),
-    (np.floor_divide, operator.__floordiv__, operator.__ifloordiv__),
-    (np.float_power, operator.__pow__, operator.__ipow__),
-    ##   (np.remainder, operator.__mod__, operator.__imod__),   # does not handle complex
-    # remainder vs fmod?
-    # pow vs power vs float_power
 ]
 
 ufuncs_with_dunders = [ufunc for ufunc, _, _ in ufunc_op_iop_numeric]
@@ -409,13 +403,11 @@ def test_binary_ufunc_dtype_and_out(self):
         assert (r32 == [1, 2]).all()
         assert r32.dtype == np.float32
 
-        # NB: this test differs from numpy: in numpy, r.dtype is float64
-        # but the precision is lost, r == [1, 2].
-        # I *guess* numpy casts inputs to the dtype=... value, performs calculations,
-        # and then casts the result back to out.dtype.
+        # dtype is float32, so computation is in float32: precision loss
+        # the result is then cast to float64
         out64 = np.empty(2, dtype=np.float64)
         r = np.add([1.0, 2.0], 1.0e-15, dtype=np.float32, out=out64)
-        assert (r != [1, 2]).all()
+        assert (r == [1, 2]).all()
         assert r.dtype == np.float64
 
         # Internal computations are in float64, but the final cast to out.dtype