[SPARK-41741][SQL] Encode the string using the UTF_8 charset in ParquetFilters

wangyum · wangyum · commit d5fa41efe2b1 · 2023-02-20T19:15:30.000+08:00
### What changes were proposed in this pull request? This PR makes `ParquetFilters` encode the prefix string using the `UTF_8` charset instead of the platform default charset. ### Why are the changes needed? To fix a data issue that occurs when the default charset is not `UTF_8`. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Manual test. Closes #40090 from wangyum/SPARK-41741. Authored-by: Yuming Wang <yumwang@ebay.com> Signed-off-by: Yuming Wang <yumwang@ebay.com>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala
@@ -19,6 +19,7 @@ package org.apache.spark.sql.execution.datasources.parquet
 
 import java.lang.{Boolean => JBoolean, Double => JDouble, Float => JFloat, Long => JLong}
 import java.math.{BigDecimal => JBigDecimal}
+import java.nio.charset.StandardCharsets.UTF_8
 import java.sql.{Date, Timestamp}
 import java.time.{Duration, Instant, LocalDate, Period}
 import java.util.HashSet
@@ -776,7 +777,7 @@ class ParquetFilters(
         Option(prefix).map { v =>
           FilterApi.userDefined(binaryColumn(nameToParquetField(name).fieldNames),
             new UserDefinedPredicate[Binary] with Serializable {
-              private val strToBinary = Binary.fromReusedByteArray(v.getBytes)
+              private val strToBinary = Binary.fromReusedByteArray(v.getBytes(UTF_8))
               private val size = strToBinary.length
 
               override def canDrop(statistics: Statistics[Binary]): Boolean = {