
Commit 7bc364b

sigmod authored and gengliangwang committed
[SPARK-35621][SQL] Add rule id pruning to the TypeCoercion rule
### What changes were proposed in this pull request?

- Added `TreeNode.transformUpWithBeforeAndAfterRuleOnChildren(...)`;
- Call `transformUpWithBeforeAndAfterRuleOnChildren` in `TypeCoercionRule`.

### Why are the changes needed?

Reduce the number of tree traversals and hence improve query compilation latency.

### How was this patch tested?

Existing tests. Performance diff:

| | Baseline | Experiment (w/o ruleId) | Experiment (w/o ruleId)/Baseline | Experiment (w/ ruleId) | Experiment (w/ ruleId)/Baseline |
| -- | -- | -- | -- | -- | -- |
| CombinedTypeCoercionRule | 665020354 | 567320034 | 0.85 | 330798240 | 0.50 |

Closes #32761 from sigmod/transform.

Authored-by: Yingyi Bu <[email protected]>
Signed-off-by: Gengliang Wang <[email protected]>
1 parent b5678be · commit 7bc364b
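For intuition about the `CombinedTypeCoercionRule` row in the benchmark above: that rule chains the individual coercion rules so a single pass over the expression tree tries the whole train. Below is a minimal, self-contained Scala sketch of that chaining idea; the `Int` "expressions" and all names (`CombinedRuleTrainSketch`, `combine`, the toy rules) are invented for illustration, not Spark's actual code.

```scala
// Sketch of the rule-train idea behind CombinedTypeCoercionRule: fold several
// PartialFunctions into one, so a single tree traversal can try every rule.
// The Int stand-in for Expression and all names here are illustrative assumptions.
object CombinedRuleTrainSketch {
  type Rule = PartialFunction[Int, Int] // stand-in for PartialFunction[Expression, Expression]

  val promoteNegatives: Rule = { case n if n < 0 => 0 } // toy "coercion" rule
  val capAtTen: Rule = { case n if n > 10 => 10 }       // another toy rule

  // Report a match only if some rule in the train actually changed the value,
  // so callers can tell "a rule applied" from "the whole train was a no-op".
  def combine(rules: Seq[Rule]): Rule = Function.unlift { (e: Int) =>
    val result = rules.foldLeft(e)((cur, r) => r.applyOrElse(cur, identity[Int]))
    if (result != e) Some(result) else None
  }

  def main(args: Array[String]): Unit = {
    val train = combine(Seq(promoteNegatives, capAtTen))
    println(Seq(-5, 3, 42).map(n => train.applyOrElse(n, identity[Int])))
    // List(0, 3, 10) -- one pass, every rule tried per element.
  }
}
```

The no-op detection in `combine` is what rule-id pruning builds on: when the combined rule changes nothing in a subtree, that subtree can be skipped on later invocations.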

File tree: 5 files changed, +91 −15 lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala (+11 −14)
```diff
@@ -184,8 +184,6 @@ abstract class TypeCoercionBase {
         }
       }
     }
-
-    override val ruleName: String = rules.map(_.ruleName).mkString("Combined[", ", ", "]")
   }
 
   /**
@@ -1157,21 +1155,20 @@ trait TypeCoercionRule extends Rule[LogicalPlan] with Logging {
    */
  def apply(plan: LogicalPlan): LogicalPlan = {
    val typeCoercionFn = transform
-    def rewrite(plan: LogicalPlan): LogicalPlan = {
-      val withNewChildren = plan.mapChildren(rewrite)
-      if (!withNewChildren.childrenResolved) {
-        withNewChildren
-      } else {
-        // Only propagate types if the children have changed.
-        val withPropagatedTypes = if (withNewChildren ne plan) {
-          propagateTypes(withNewChildren)
+    plan.transformUpWithBeforeAndAfterRuleOnChildren(!_.analyzed, ruleId) {
+      case (beforeMapChildren, afterMapChildren) =>
+        if (!afterMapChildren.childrenResolved) {
+          afterMapChildren
        } else {
-          plan
+          // Only propagate types if the children have changed.
+          val withPropagatedTypes = if (beforeMapChildren ne afterMapChildren) {
+            propagateTypes(afterMapChildren)
+          } else {
+            beforeMapChildren
+          }
+          withPropagatedTypes.transformExpressionsUp(typeCoercionFn)
        }
-        withPropagatedTypes.transformExpressionsUp(typeCoercionFn)
-      }
    }
-    rewrite(plan)
  }
 
  def transform: PartialFunction[Expression, Expression]
```
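The rewrite above replaces the hand-rolled recursive `rewrite` with the new traversal; the `(beforeMapChildren, afterMapChildren)` pair lets the rule skip type propagation whenever no child changed. A hedged sketch of just that per-node decision, with an invented toy `Plan` type — `propagateTypes` and `coerceExpressions` here are placeholders standing in for Spark's methods:

```scala
// Toy rendering of the rewritten TypeCoercionRule.apply decision at each node.
// Plan, propagateTypes and coerceExpressions are illustrative stand-ins.
final case class Plan(name: String, resolved: Boolean, children: List[Plan] = Nil)

object ApplyControlFlowSketch {
  private def propagateTypes(p: Plan): Plan = p.copy(name = p.name + ":propagated")
  private def coerceExpressions(p: Plan): Plan = p // would run transformExpressionsUp

  // The rule sees both images of the node: before and after its children were rewritten.
  def onNode(beforeMapChildren: Plan, afterMapChildren: Plan): Plan = {
    if (!afterMapChildren.children.forall(_.resolved)) {
      afterMapChildren // children unresolved: nothing to coerce yet
    } else {
      // Only propagate types if the children changed (reference inequality,
      // mirroring `beforeMapChildren ne afterMapChildren` in the diff).
      val withPropagatedTypes =
        if (beforeMapChildren ne afterMapChildren) propagateTypes(afterMapChildren)
        else beforeMapChildren
      coerceExpressions(withPropagatedTypes)
    }
  }

  def main(args: Array[String]): Unit = {
    val child = Plan("child", resolved = true)
    val before = Plan("parent", resolved = true, List(child))
    val after = before.copy(children = List(child.copy(name = "child2")))
    println(onNode(before, after).name)  // parent:propagated -- children changed
    println(onNode(before, before).name) // parent -- unchanged, propagation skipped
  }
}
```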

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleIdCollection.scala (+28 −1)
```diff
@@ -20,6 +20,7 @@ package org.apache.spark.sql.catalyst.rules
 import scala.collection.mutable
 
 import org.apache.spark.sql.errors.QueryExecutionErrors
+import org.apache.spark.util.Utils
 
 // Represent unique rule ids for rules that are invoked multiple times.
 case class RuleId(id: Int) {
@@ -40,7 +41,7 @@ object RuleIdCollection {
   // invoked multiple times by Analyzer/Optimizer/Planner need a rule id to prune unnecessary
   // tree traversals in the transform function family. Note that those rules should not depend on
   // a changing, external state. Rules here are in alphabetical order.
-  private val rulesNeedingIds: Seq[String] = {
+  private var rulesNeedingIds: Seq[String] = {
     // Catalyst Analyzer rules
     "org.apache.spark.sql.catalyst.analysis.Analyzer$AddMetadataColumns" ::
     "org.apache.spark.sql.catalyst.analysis.Analyzer$ExtractGenerator" ::
@@ -88,6 +89,7 @@ object RuleIdCollection {
     "org.apache.spark.sql.catalyst.analysis.ResolveUnion" ::
     "org.apache.spark.sql.catalyst.analysis.SubstituteUnresolvedOrdinals" ::
     "org.apache.spark.sql.catalyst.analysis.TimeWindowing" ::
+    "org.apache.spark.sql.catalyst.analysis.TypeCoercionBase$CombinedTypeCoercionRule" ::
     "org.apache.spark.sql.catalyst.analysis.UpdateOuterReferences" ::
     "org.apache.spark.sql.catalyst.analysis.UpdateAttributeNullability" ::
     // Catalyst Optimizer rules
@@ -152,6 +154,31 @@ object RuleIdCollection {
     "org.apache.spark.sql.catalyst.optimizer.UnwrapCastInBinaryComparison" :: Nil
   }
 
+  if (Utils.isTesting) {
+    rulesNeedingIds = rulesNeedingIds ++ {
+      // In the production code path, the following rules are run in CombinedTypeCoercionRule, and
+      // hence we only need to add them for unit testing.
+      "org.apache.spark.sql.catalyst.analysis.AnsiTypeCoercion$PromoteStringLiterals" ::
+      "org.apache.spark.sql.catalyst.analysis.DecimalPrecision" ::
+      "org.apache.spark.sql.catalyst.analysis.TypeCoercion$BooleanEquality" ::
+      "org.apache.spark.sql.catalyst.analysis.TypeCoercionBase$CaseWhenCoercion" ::
+      "org.apache.spark.sql.catalyst.analysis.TypeCoercionBase$ConcatCoercion" ::
+      "org.apache.spark.sql.catalyst.analysis.TypeCoercionBase$DateTimeOperations" ::
+      "org.apache.spark.sql.catalyst.analysis.TypeCoercionBase$Division" ::
+      "org.apache.spark.sql.catalyst.analysis.TypeCoercionBase$EltCoercion" ::
+      "org.apache.spark.sql.catalyst.analysis.TypeCoercionBase$FunctionArgumentConversion" ::
+      "org.apache.spark.sql.catalyst.analysis.TypeCoercionBase$IfCoercion" ::
+      "org.apache.spark.sql.catalyst.analysis.TypeCoercionBase$ImplicitTypeCasts" ::
+      "org.apache.spark.sql.catalyst.analysis.TypeCoercionBase$InConversion" ::
+      "org.apache.spark.sql.catalyst.analysis.TypeCoercionBase$IntegralDivision" ::
+      "org.apache.spark.sql.catalyst.analysis.TypeCoercionBase$MapZipWithCoercion" ::
+      "org.apache.spark.sql.catalyst.analysis.TypeCoercion$PromoteStrings" ::
+      "org.apache.spark.sql.catalyst.analysis.TypeCoercionBase$StackCoercion" ::
+      "org.apache.spark.sql.catalyst.analysis.TypeCoercionBase$StringLiteralCoercion" ::
+      "org.apache.spark.sql.catalyst.analysis.TypeCoercionBase$WindowFrameCoercion" :: Nil
+    }
+  }
+
   // Maps rule names to ids. Rule ids are continuous natural numbers starting from 0.
   private val ruleToId = new mutable.HashMap[String, RuleId]
```
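For illustration, the registry mechanics this file relies on can be sketched as follows: rule names map to consecutive ids, so per-node "this rule was ineffective here" bits can be indexed by rule id. The names `RuleIdRegistrySketch` and `register` are assumptions made for the sketch, not `RuleIdCollection`'s actual internals:

```scala
import scala.collection.mutable

// Illustrative registry: consecutive ids assigned in registration order, so a
// tree node can record "rule ineffective" in a bitset indexed by rule id.
object RuleIdRegistrySketch {
  final case class RuleId(id: Int)

  private val ruleToId = new mutable.HashMap[String, RuleId]

  private def register(names: Seq[String]): Unit =
    names.foreach(n => ruleToId.getOrElseUpdate(n, RuleId(ruleToId.size)))

  def getRuleId(ruleName: String): RuleId =
    ruleToId.getOrElse(ruleName,
      throw new NoSuchElementException(s"Rule $ruleName has not been registered"))

  def main(args: Array[String]): Unit = {
    register(Seq(
      "org.apache.spark.sql.catalyst.analysis.TimeWindowing",
      "org.apache.spark.sql.catalyst.analysis.TypeCoercionBase$CombinedTypeCoercionRule"))
    println(getRuleId(
      "org.apache.spark.sql.catalyst.analysis.TypeCoercionBase$CombinedTypeCoercionRule"))
    // RuleId(1): ids are continuous natural numbers starting from 0.
  }
}
```

This also shows why the `Utils.isTesting` block above is safe to gate: registering extra names in tests only appends more ids; production ids are unaffected because they are assigned before the test-only block runs.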

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala (+38)
```diff
@@ -549,6 +549,44 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] extends Product with Tre
     }
   }
 
+  /**
+   * Returns a copy of this node where `rule` has been recursively applied first to all of its
+   * children and then itself (post-order). When `rule` does not apply to a given node, it is left
+   * unchanged.
+   *
+   * @param cond a Lambda expression to prune tree traversals. If `cond.apply` returns false
+   *             on a TreeNode T, skips processing T and its subtree; otherwise, processes
+   *             T and its subtree recursively.
+   * @param rule the function used to transform this node and its descendant nodes. The function
+   *             takes a tuple as its input, where the first/second field is the before/after
+   *             image of applying the rule on the node's children.
+   * @param ruleId is a unique Id for `rule` to prune unnecessary tree traversals. When it is
+   *               UnknownRuleId, no pruning happens. Otherwise, if `rule` (with id `ruleId`)
+   *               has been marked as ineffective on a TreeNode T, skips processing T and its
+   *               subtree. Do not pass it if the rule is not purely functional and reads a
+   *               varying initial state for different invocations.
+   */
+  def transformUpWithBeforeAndAfterRuleOnChildren(
+      cond: BaseType => Boolean, ruleId: RuleId = UnknownRuleId)(
+      rule: PartialFunction[(BaseType, BaseType), BaseType]): BaseType = {
+    if (!cond.apply(this) || isRuleIneffective(ruleId)) {
+      return this
+    }
+    val afterRuleOnChildren =
+      mapChildren(_.transformUpWithBeforeAndAfterRuleOnChildren(cond, ruleId)(rule))
+    val newNode = CurrentOrigin.withOrigin(origin) {
+      rule.applyOrElse((this, afterRuleOnChildren), { t: (BaseType, BaseType) => t._2 })
+    }
+    if (this eq newNode) {
+      this.markRuleAsIneffective(ruleId)
+      this
+    } else {
+      // If the transform function replaces this node with a new one, carry over the tags.
+      newNode.copyTagsFrom(this)
+      newNode
+    }
+  }
+
   /**
    * Returns a copy of this node where `f` has been applied to all the nodes in `children`.
    */
```
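To see the new traversal and its pruning end to end, here is a minimal, self-contained re-implementation over an invented `Node` type. It is a sketch under the assumption that the ineffective-rule bits live directly on each node; Spark's real `TreeNode` stores them via `markRuleAsIneffective`/`isRuleIneffective` internals not shown in this diff.

```scala
// Toy re-implementation of transformUpWithBeforeAndAfterRuleOnChildren on an
// invented Node type, including the "mark rule ineffective, skip next time" memo.
object TransformUpSketch {
  final case class RuleId(id: Int)
  val UnknownRuleId: RuleId = RuleId(-1)

  final class Node(val value: Int, val children: List[Node]) {
    private var ineffective: Set[Int] = Set.empty // memo: rules that were no-ops here

    def isRuleIneffective(ruleId: RuleId): Boolean =
      ruleId != UnknownRuleId && ineffective.contains(ruleId.id)

    def markRuleAsIneffective(ruleId: RuleId): Unit =
      if (ruleId != UnknownRuleId) ineffective += ruleId.id

    def mapChildren(f: Node => Node): Node = {
      val newChildren = children.map(f)
      // Reuse this node when no child changed, so `eq` detects "nothing happened".
      if (newChildren.zip(children).forall { case (a, b) => a eq b }) this
      else new Node(value, newChildren)
    }

    def transformUpWithBeforeAndAfter(
        cond: Node => Boolean, ruleId: RuleId = UnknownRuleId)(
        rule: PartialFunction[(Node, Node), Node]): Node = {
      if (!cond(this) || isRuleIneffective(ruleId)) return this // pruned
      val afterChildren = mapChildren(_.transformUpWithBeforeAndAfter(cond, ruleId)(rule))
      // The rule receives the (before, after) images of rewriting the children.
      val newNode = rule.applyOrElse((this, afterChildren), (t: (Node, Node)) => t._2)
      if (this eq newNode) { markRuleAsIneffective(ruleId); this } else newNode
    }

    override def toString: String = s"Node($value, $children)"
  }

  def main(args: Array[String]): Unit = {
    val tree = new Node(1, List(new Node(-2, Nil), new Node(3, List(new Node(-4, Nil)))))
    val negFix = RuleId(0)
    def run(t: Node): Node = t.transformUpWithBeforeAndAfter(_ => true, negFix) {
      case (_, after) if after.value < 0 => new Node(-after.value, after.children)
    }
    val once = run(tree)         // rewrites the two negative nodes
    val twice = run(once)        // no-op: marks negFix ineffective on every node
    println(run(twice) eq twice) // true: the third run is pruned without a traversal
  }
}
```

The key invariant mirrors the diff: a node is marked ineffective only when the rule left its entire subtree untouched (`this eq newNode`), which is exactly when a later invocation of the same rule can safely be skipped.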

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnsiTypeCoercionSuite.scala (+7)
```diff
@@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst.analysis
 
 import java.sql.Timestamp
 
+import org.apache.spark.internal.config.Tests.IS_TESTING
 import org.apache.spark.sql.catalyst.analysis.AnsiTypeCoercion._
 import org.apache.spark.sql.catalyst.dsl.expressions._
 import org.apache.spark.sql.catalyst.dsl.plans._
@@ -27,10 +28,16 @@ import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.rules.{Rule, RuleExecutor}
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types._
+import org.apache.spark.util.Utils
 
 class AnsiTypeCoercionSuite extends AnalysisTest {
   import TypeCoercionSuite._
 
+  // When Utils.isTesting is true, RuleIdCollection adds individual type coercion rules. Otherwise,
+  // RuleIdCollection doesn't add them because they are called in a train inside
+  // CombinedTypeCoercionRule.
+  assert(Utils.isTesting, s"${IS_TESTING.key} is not set to true")
+
   // scalastyle:off line.size.limit
   // The following table shows all implicit data type conversions that are not visible to the user.
   // +----------------------+----------+-----------+-------------+----------+------------+------------+------------+------------+-------------+------------+----------+---------------+------------+----------+-------------+----------+----------------------+---------------------+-------------+--------------+
```
sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionSuite.scala (+7)
```diff
@@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst.analysis
 
 import java.sql.Timestamp
 
+import org.apache.spark.internal.config.Tests.IS_TESTING
 import org.apache.spark.sql.catalyst.analysis.TypeCoercion._
 import org.apache.spark.sql.catalyst.dsl.expressions._
 import org.apache.spark.sql.catalyst.dsl.plans._
@@ -27,10 +28,16 @@ import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.rules.{Rule, RuleExecutor}
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types._
+import org.apache.spark.util.Utils
 
 class TypeCoercionSuite extends AnalysisTest {
   import TypeCoercionSuite._
 
+  // When Utils.isTesting is true, RuleIdCollection adds individual type coercion rules. Otherwise,
+  // RuleIdCollection doesn't add them because they are called in a train inside
+  // CombinedTypeCoercionRule.
+  assert(Utils.isTesting, s"${IS_TESTING.key} is not set to true")
+
   // scalastyle:off line.size.limit
   // The following table shows all implicit data type conversions that are not visible to the user.
   // +----------------------+----------+-----------+-------------+----------+------------+-----------+------------+------------+-------------+------------+----------+---------------+------------+----------+-------------+----------+----------------------+---------------------+-------------+--------------+
```
