
Commit 5db0d83

Be consistent about second vs alloc in rewrites
1 parent 67519be commit 5db0d83

File tree: 2 files changed, 34 additions & 25 deletions

pytensor/tensor/rewriting/basic.py

Lines changed: 23 additions & 1 deletion
@@ -1,4 +1,26 @@
-""" Tensor optimizations addressing the ops in basic.py."""
+""" Tensor optimizations addressing the ops in basic.py.
+
+Notes
+-----
+There are two ways of broadcasting arrays:
+second(x, y) == alloc(y, broadcast_shapes(x.shape, y.shape))
+
+The second can be more efficient because x doesn't usually need to be computed when we only want its shape.
+It may also allow other rewrites that don't try to modify x when it has multiple clients (for fear of duplicating computation).
+
+However, the first one is easier to reason about.
+Knowing we have such a graph allows us to do certain rewrites, such as "sinking" broadcasting operations below Elemwise.
+The same rewrites with alloc would be more complicated, as we would need to symbolically combine the shapes of each alloc.
+
+As an example, contrast rewriting the following two equivalent graphs:
+
+alloc(x, broadcast_shapes(x.shape, y.shape)) + alloc(y, broadcast_shapes(x.shape, y.shape)) -> x + y
+second(y, x) + second(x, y) -> x + y
+
+Theano developers (mostly) preferred to use the first form during canonicalization and to introduce the second form later,
+via rewrites like `local_fill_to_alloc` and the `alloc_like` helper used inside rewrites.
+Many stabilize and specialize rewrites refuse to apply when a variable has multiple clients, so this is important.
+"""
 
 import logging
 from typing import TYPE_CHECKING, Optional, Union
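The equivalence in the new docstring can be checked numerically. A minimal sketch (illustrative, not from this commit), using `alloc` from `pytensor.tensor` and `second` from `pytensor.tensor.basic`; here the broadcast shape is just `x.shape`, standing in for `broadcast_shapes(x.shape, y.shape)`:

import numpy as np
import pytensor
import pytensor.tensor as pt
from pytensor.tensor.basic import second  # same Op that `fill` aliases

x = pt.dmatrix("x")
y = pt.dvector("y")

# Form 1: second(x, y) takes x's broadcast shape and y's values.
z_second = second(x, y)

# Form 2: an explicit alloc; only x's shape (never x's values) is needed.
z_alloc = pt.alloc(y, x.shape[0], x.shape[1])

f = pytensor.function([x, y], [z_second, z_alloc])
out_second, out_alloc = f(np.zeros((2, 3)), np.arange(3.0))
assert np.array_equal(out_second, out_alloc)  # both are y tiled to (2, 3)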

pytensor/tensor/rewriting/math.py

Lines changed: 11 additions & 24 deletions
@@ -30,7 +30,6 @@
     cast,
     constant,
     extract_constant,
-    fill,
     get_underlying_scalar_constant_value,
     ones_like,
     switch,
@@ -2041,8 +2040,6 @@ def local_zero_div(fgraph, node):
 @register_specialize
 @node_rewriter([at_pow])
 def local_pow_specialize(fgraph, node):
-    # here, we are past the point of canonicalization, so we don't want
-    # to put in un-necessary fills.
     if node.op == at_pow:
         # the idea here is that we have pow(x, y)
         odtype = node.outputs[0].dtype
@@ -2057,7 +2054,7 @@ def local_pow_specialize(fgraph, node):
             if np.all(y == 1):
                 rval = [xsym]
             if np.all(y == 0):
-                rval = [fill(xsym, np.asarray(1, dtype=odtype))]
+                rval = [alloc_like(1, xsym, fgraph)]
             if np.all(y == 0.5):
                 rval = [sqrt(xsym)]
             if np.all(y == -0.5):
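The `y == 0` branch above is what turns `x ** 0` into ones with `x`'s broadcast shape rather than an actual power. A rough way to see the specialization (illustrative, not from this commit; the exact compiled ops depend on the active rewrite set):

import numpy as np
import pytensor
import pytensor.tensor as pt

x = pt.dmatrix("x")
f = pytensor.function([x], x ** 0)

# Inspect the compiled graph: ideally an allocation of ones shaped like x,
# with no Pow node left (depends on which rewrites fired).
pytensor.dprint(f)

out = f(np.full((2, 3), 7.0))
assert np.array_equal(out, np.ones((2, 3)))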
@@ -2158,9 +2155,7 @@ def local_mul_specialize(fgraph, node):
     mul(-1, x, y) -/-> neg(mul(x, y))
 
     """
-    # here, we are past the point of canonicalization, so we don't
-    # want to put in un-necessary fills.
-    #
+
     # at this point [post canonicalize], mul() may have many inputs.
     if node.op == mul:
         # the idea here is that we have pow(x, y)
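The docstring line above records that the constant -1 is only folded into a `neg` when `mul` has one other input. A hedged illustration (not from this commit); the asserts check values only, and the comments describe expected, not guaranteed, compiled graphs:

import numpy as np
import pytensor
import pytensor.tensor as pt

x = pt.dvector("x")
y = pt.dvector("y")

f_single = pytensor.function([x], -1 * x)        # expected to become neg(x)
f_multi = pytensor.function([x, y], -1 * x * y)  # -1 expected to stay inside the mul

v = np.array([1.0, -2.0])
w = np.array([3.0, 4.0])
assert np.array_equal(f_single(v), -v)
assert np.array_equal(f_multi(v, w), -v * w)
pytensor.dprint(f_multi)  # check that no separate neg(mul(x, y)) was introduced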
@@ -2221,16 +2216,7 @@ def local_mul_specialize(fgraph, node):
 
 @register_specialize
 @node_rewriter([add])
-def local_add_specialize(fgraph, node):
-    """Remove zeros from ``add``s.
-
-    TODO: This should be a canonicalization, no?
-    """
-    # here, we are past the point of canonicalization, so we don't want
-    # to put in un-necessary fills.
-    if node.op != add:
-        return False
-
+def local_add_remove_zeros(fgraph, node):
     new_inputs = []
     for inp in node.inputs:
         try:
@@ -2253,12 +2239,12 @@ def local_add_specialize(fgraph, node):
             # Reuse call to constant for cache()
             cst = constant(np.zeros((1,) * ndim, dtype=dtype))
             assert cst.type.broadcastable == (True,) * ndim
-            return [broadcast_arrays(cst, *node.inputs)[0]]
+            return [alloc_like(cst, node_output, fgraph)]
 
         if len(new_inputs) == 1:
-            ret = [broadcast_arrays(new_inputs[0], *node.inputs)[0]]
+            ret = [alloc_like(new_inputs[0], node_output, fgraph)]
         else:
-            ret = [broadcast_arrays(add(*new_inputs), *node.inputs)[0]]
+            ret = [alloc_like(add(*new_inputs), node_output, fgraph)]
 
         # The dtype should not be changed. It can happen if the input
         # that was forcing upcasting was equal to 0.
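The renamed `local_add_remove_zeros` drops zero terms from an `add` and uses `alloc_like` so the surviving term keeps the broadcast shape of the original sum. A small sketch of the observable behaviour (illustrative, not from this commit), assuming the default rewrite set:

import numpy as np
import pytensor
import pytensor.tensor as pt

x = pt.dvector("x")
zeros_col = pt.constant(np.zeros((4, 1)))  # zero term that only contributes shape

f = pytensor.function([x], x + zeros_col)

# The zero term should be rewritten away, leaving x broadcast to shape (4, len(x)).
out = f(np.array([1.0, 2.0, 3.0]))
assert np.array_equal(out, np.tile([1.0, 2.0, 3.0], (4, 1)))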
@@ -2376,7 +2362,7 @@ def local_log1p(fgraph, node):
                 ninp = nonconsts[0]
                 if ninp.dtype != log_arg.type.dtype:
                     ninp = ninp.astype(node.outputs[0].dtype)
-                return [broadcast_arrays(log1p(ninp), *scalar_inputs)[0]]
+                return [alloc_like(log1p(ninp), node.outputs[0], fgraph)]
 
     elif log_arg.owner and log_arg.owner.op == sub:
         one = extract_constant(log_arg.owner.inputs[0], only_process_constants=True)
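The return above broadcasts `log1p(ninp)` to the full output of the `log(1 + x)` node; the rewrite itself exists for numerical accuracy with tiny `x`. A hedged demonstration (not from this commit), assuming the stabilization rewrite fires in the default mode, hence prints rather than asserts:

import numpy as np
import pytensor
import pytensor.tensor as pt

x = pt.dvector("x")
f = pytensor.function([x], pt.log(1 + x))

tiny = np.array([1e-20])
# Naive log(1 + 1e-20) underflows to 0.0; with local_log1p applied the
# compiled graph computes log1p(x) and stays accurate.
print(f(tiny))         # expected ~1e-20 if the rewrite fired
print(np.log1p(tiny))  # reference value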
@@ -3572,10 +3558,11 @@ def local_reciprocal_1_plus_exp(fgraph, node):
             if nonconsts[0].owner and nonconsts[0].owner.op == exp:
                 if scalars_ and np.allclose(np.sum(scalars_), 1):
                     out = [
-                        broadcast_arrays(
+                        alloc_like(
                             sigmoid(neg(nonconsts[0].owner.inputs[0])),
-                            *scalar_inputs,
-                        )[0]
+                            node.outputs[0],
+                            fgraph,
+                        )
                     ]
                     # keep combined stack traces of
                     # exp(x): nonconsts[0],
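Only the broadcasting helper changes here; the rewrite still collapses `reciprocal(1 + exp(x))` into `sigmoid(-x)`. A hedged numeric check (not from this commit); the assert holds whether or not the rewrite fires, and `dprint` shows whether a sigmoid Op replaced the exp/reciprocal:

import numpy as np
import pytensor
import pytensor.tensor as pt

x = pt.dvector("x")
f = pytensor.function([x], 1 / (1 + pt.exp(x)))

vals = np.array([-3.0, 0.0, 3.0])
expected = 1 / (1 + np.exp(vals))   # == sigmoid(-vals)
assert np.allclose(f(vals), expected)
pytensor.dprint(f)  # look for a single sigmoid Op if the rewrite fired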
