
Commit 4686c27
[SPARK-45073][PS][CONNECT] Replace LastNonNull with Last(ignoreNulls=True)
### What changes were proposed in this pull request?

Replace `LastNonNull` with `Last(ignoreNulls=True)`.

### Why are the changes needed?

#36127 introduced a PS-dedicated expression `LastNonNull`, which was not actually needed and can be replaced with the built-in `Last`.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

CI.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #42808 from zhengruifeng/del_last_not_none.

Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
1 parent 66fb225 commit 4686c27
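
A quick way to see the equivalence this commit relies on is to forward-fill with the built-in `last` window function. A minimal PySpark sketch (the sample data and column names are invented for illustration; they are not part of the commit):

from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()

# Invented sample data mirroring the docstring example of the removed
# LastNonNull expression: null, 1, 2, 3, null, 4, 5, null.
df = spark.createDataFrame(
    [(0, None), (1, 1), (2, 2), (3, 3), (4, None), (5, 4), (6, 5), (7, None)],
    ["idx", "value"],
)

# Growing frame: every row from the partition start up to the current row.
w = Window.orderBy("idx").rowsBetween(Window.unboundedPreceding, Window.currentRow)

# last(..., ignorenulls=True) keeps the last non-null value seen so far,
# yielding null, 1, 2, 3, 3, 4, 5, 5 -- the same output LastNonNull produced.
df.withColumn("filled", F.last("value", ignorenulls=True).over(w)).show()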

File tree: 5 files changed (+1, -58 lines)


connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala (-4)

@@ -1905,10 +1905,6 @@ class SparkConnectPlanner(val sessionHolder: SessionHolder) extends Logging {
         val ignoreNA = extractBoolean(children(2), "ignoreNA")
         Some(EWM(children(0), alpha, ignoreNA))
 
-      case "last_non_null" if fun.getArgumentsCount == 1 =>
-        val children = fun.getArgumentsList.asScala.map(transformExpression)
-        Some(LastNonNull(children(0)))
-
       case "null_index" if fun.getArgumentsCount == 1 =>
         val children = fun.getArgumentsList.asScala.map(transformExpression)
         Some(NullIndex(children(0)))

python/pyspark/pandas/series.py (+1, -1)

@@ -2257,7 +2257,7 @@ def _interpolate(
             return self._psdf.copy()._psser_for(self._column_label)
 
         scol = self.spark.column
-        last_non_null = SF.last_non_null(scol)
+        last_non_null = F.last(scol, True)
         null_index = SF.null_index(scol)
 
         Window = get_window_class()
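
For context, the user-facing entry point for this hunk is `Series.interpolate`, which `_interpolate` backs. A short usage sketch (the sample values are invented); since `F.last(scol, True)` computes the same thing `SF.last_non_null(scol)` did, the result should be identical before and after this commit:

import pyspark.pandas as ps

# Invented sample series; interpolate() internally builds the
# last_non_null window column that this hunk now derives from F.last.
psser = ps.Series([None, 1.0, 2.0, 3.0, None, 4.0, 5.0, None])
print(psser.interpolate())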

python/pyspark/pandas/spark/functions.py (-14)

@@ -159,20 +159,6 @@ def ewm(col: Column, alpha: float, ignore_na: bool) -> Column:
     return Column(sc._jvm.PythonSQLUtils.ewm(col._jc, alpha, ignore_na))
 
 
-def last_non_null(col: Column) -> Column:
-    if is_remote():
-        from pyspark.sql.connect.functions import _invoke_function_over_columns
-
-        return _invoke_function_over_columns(  # type: ignore[return-value]
-            "last_non_null",
-            col,  # type: ignore[arg-type]
-        )
-
-    else:
-        sc = SparkContext._active_spark_context
-        return Column(sc._jvm.PythonSQLUtils.lastNonNull(col._jc))
-
-
 def null_index(col: Column) -> Column:
     if is_remote():
         from pyspark.sql.connect.functions import _invoke_function_over_columns

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala (-37)

@@ -1152,43 +1152,6 @@ case class EWM(input: Expression, alpha: Double, ignoreNA: Boolean)
 }
 
 
-/**
- * Keep the last non-null value seen if any. This expression is dedicated only for
- * Pandas API on Spark.
- * For example,
- *  Input: null, 1, 2, 3, null, 4, 5, null
- *  Output: null, 1, 2, 3, 3, 4, 5, 5
- */
-case class LastNonNull(input: Expression)
-  extends AggregateWindowFunction with UnaryLike[Expression] {
-
-  override def dataType: DataType = input.dataType
-
-  private lazy val last = AttributeReference("last", dataType, nullable = true)()
-
-  override def aggBufferAttributes: Seq[AttributeReference] = last :: Nil
-
-  override lazy val initialValues: Seq[Expression] = Seq(Literal.create(null, dataType))
-
-  override lazy val updateExpressions: Seq[Expression] = {
-    Seq(
-      /* last = */ If(IsNull(input), last, input)
-    )
-  }
-
-  override lazy val evaluateExpression: Expression = last
-
-  override def prettyName: String = "last_non_null"
-
-  override def sql: String = s"$prettyName(${input.sql})"
-
-  override def child: Expression = input
-
-  override protected def withNewChildInternal(newChild: Expression): LastNonNull =
-    copy(input = newChild)
-}
-
-
 /**
  * Return the indices for consecutive null values, for non-null values, it returns 0.
  * This expression is dedicated only for Pandas API on Spark.
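
The Input/Output example in the removed docstring can also be reproduced at the SQL level with the built-in `last(expr, isIgnoreNull)` aggregate used as a window function. A sketch (invented inline data, run from PySpark):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# last(value, true) over a growing frame gives null, 1, 2, 3, 3, 4, 5, 5 --
# matching the example from the removed LastNonNull docstring.
spark.sql("""
    SELECT idx, value,
           last(value, true) OVER (
             ORDER BY idx
             ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
           ) AS filled
    FROM VALUES (0, NULL), (1, 1), (2, 2), (3, 3),
                (4, NULL), (5, 4), (6, 5), (7, NULL) AS t(idx, value)
""").show()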

sql/core/src/main/scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala (-2)

@@ -145,8 +145,6 @@ private[sql] object PythonSQLUtils extends Logging {
   def ewm(e: Column, alpha: Double, ignoreNA: Boolean): Column =
     Column(EWM(e.expr, alpha, ignoreNA))
 
-  def lastNonNull(e: Column): Column = Column(LastNonNull(e.expr))
-
   def nullIndex(e: Column): Column = Column(NullIndex(e.expr))
 
   def makeInterval(unit: String, e: Column): Column = {
