Skip to content

Commit d5fa41e

Browse files
committed
[SPARK-41741][SQL] Encode the string using the UTF_8 charset in ParquetFilters
### What changes were proposed in this pull request?

This PR makes `ParquetFilters` encode the string using the `UTF_8` charset.

### Why are the changes needed?

To fix a data issue that occurs when the default charset is not `UTF_8`.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Manual test.

Closes #40090 from wangyum/SPARK-41741.

Authored-by: Yuming Wang <[email protected]>
Signed-off-by: Yuming Wang <[email protected]>
1 parent 547737b commit d5fa41e

File tree

1 file changed

+2
-1
lines changed

1 file changed

+2
-1
lines changed

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala

+2-1
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@ package org.apache.spark.sql.execution.datasources.parquet
1919

2020
import java.lang.{Boolean => JBoolean, Double => JDouble, Float => JFloat, Long => JLong}
2121
import java.math.{BigDecimal => JBigDecimal}
22+
import java.nio.charset.StandardCharsets.UTF_8
2223
import java.sql.{Date, Timestamp}
2324
import java.time.{Duration, Instant, LocalDate, Period}
2425
import java.util.HashSet
@@ -776,7 +777,7 @@ class ParquetFilters(
776777
Option(prefix).map { v =>
777778
FilterApi.userDefined(binaryColumn(nameToParquetField(name).fieldNames),
778779
new UserDefinedPredicate[Binary] with Serializable {
779-
private val strToBinary = Binary.fromReusedByteArray(v.getBytes)
780+
private val strToBinary = Binary.fromReusedByteArray(v.getBytes(UTF_8))
780781
private val size = strToBinary.length
781782

782783
override def canDrop(statistics: Statistics[Binary]): Boolean = {

0 commit comments

Comments
 (0)